class NoopTokenParserTests(unittest.TestCase):
    """Tests for NoopTokenParser: every token must pass through unchanged."""

    def setUp(self):
        self.tp = NoopTokenParser()

    def test_process_token(self):
        # The no-op parser yields each token verbatim — underscores and
        # camelCase that other parsers would split are left intact.
        for token in ("abcdef", "abcd_ef", "abcDef"):
            self.assertEqual(list(self.tp.process_token(token)), [token])
def __init__(self, token2index=None, token_parser=None):
    """
    :param token2index: The mapping from tokens to bag keys. If None, no mapping \
        is performed.
    :param token_parser: Specify token parser if you want to use a custom one. \
        :class:`NoopTokenParser` is used if it is not specified.
    """
    self._token2index = token2index if token2index is not None else FakeVocabulary()
    self._token_parser = token_parser if token_parser is not None else NoopTokenParser()
def __init__(self, split_stem=False, type="tree", max_distance=DEFAULT_MAX_DISTANCE, **kwargs):
    """
    :param split_stem: If False, identifiers are kept whole via \
        :class:`NoopTokenParser`; if True, the distance class's default parser is used.
    :param type: Name of the distance flavor, resolved through ``self.DistanceType``. \
        (NOTE(review): shadows the ``type`` builtin, but the name is part of the \
        public interface and is kept.)
    :param max_distance: Upper bound passed to the distance calculator.
    :param kwargs: Forwarded to the parent constructor.
    """
    super().__init__(**kwargs)
    distance_cls = self.DistanceType.resolve(type)
    parser = None if split_stem else NoopTokenParser()
    self.uast2id_distance = distance_cls(token_parser=parser, max_distance=max_distance)
class UastTokens2Bag(Uast2BagBase):
    """
    Converts a UAST to a weighed bag of tokens via xpath.
    """
    XPATH = None  # Should be overridden in child class

    def __init__(self, token2index=None, token_parser=None):
        """
        :param token2index: The mapping from tokens to bag keys. If None, no mapping \
            is performed.
        :param token_parser: Specify token parser if you want to use a custom one. \
            :class:`NoopTokenParser` is used if it is not specified.
        """
        self._token2index = token2index if token2index is not None else FakeVocabulary()
        self._token_parser = token_parser if token_parser is not None else NoopTokenParser()

    @property
    def token_parser(self):
        return self._token_parser

    @property
    def token2index(self):
        return self._token2index

    def __call__(self, uast):
        """
        Converts a UAST to a weighed bag-of-words. The weights are words frequencies.
        The tokens are preprocessed by _token_parser.

        :param uast: The UAST root node.
        :return: Mapping from bag keys (values of ``token2index``) to frequencies.
        """
        bag = defaultdict(int)
        for node in bblfsh.filter(uast, self.XPATH):
            for sub in self._token_parser.process_token(node.token):
                # Tokens absent from the vocabulary are silently dropped.
                try:
                    bag[self._token2index[sub]] += 1
                except KeyError:
                    continue
        return bag
def setUp(self):
    """Build the converter and parse the fixture (needs a bblfsh server on :9432)."""
    self.uast2role_id_pairs = Uast2RoleIdPairs(token_parser=NoopTokenParser())
    client = BblfshClient("0.0.0.0:9432")
    self.uast = client.parse(SOURCE_PY, mode=Modes.ANNOTATED).uast
def __init__(self, docfreq_threshold=None, split_stem=True, **kwargs):
    """
    :param docfreq_threshold: Forwarded to the parent constructor.
    :param split_stem: If True (default), the bag converter's own splitting parser \
        is used; if False, identifiers are kept whole via :class:`NoopTokenParser`.
    :param kwargs: Forwarded to the parent constructor.
    """
    super().__init__(docfreq_threshold, **kwargs)
    parser = None if split_stem else NoopTokenParser()
    self.id2bag = UastIds2Bag(None, parser)
def setUp(self):
    """Build the converter and parse the fixture (needs a bblfsh server on :9432)."""
    self.uast2role_id_pairs = Uast2IdLineDistance(
        token_parser=NoopTokenParser(), max_distance=3)
    client = BblfshClient("0.0.0.0:9432")
    self.uast = client.parse(SOURCE_PY).uast
    # Show full diffs on assertion failures — the pair lists are long.
    self.maxDiff = None
def setUp(self):
    """Build the converter and parse the fixture (needs a bblfsh server on :9432)."""
    self.uast2id_sequence = Uast2IdSequence(token_parser=NoopTokenParser())
    client = BblfshClient("0.0.0.0:9432")
    self.uast = client.parse(SOURCE_PY, mode=Modes.ANNOTATED).uast
class RoleIdsExtractor(Extractor):
    """Extractor backed by :class:`Uast2RoleIdPairs` with identifiers kept whole."""
    # Registry name under which this extractor is exposed.
    NAME = "roleids"
    # NoopTokenParser keeps each identifier as a single token (no split/stem).
    ALGORITHM = Uast2RoleIdPairs(token_parser=NoopTokenParser())
def setUp(self):
    """Create a fresh NoopTokenParser for each test."""
    self.tp = NoopTokenParser()
def __init__(self, split_stem=False, **kwargs):
    """
    :param split_stem: If False (default), identifiers are kept whole via \
        :class:`NoopTokenParser`; if True, the sequence converter's default parser is used.
    :param kwargs: Forwarded to the parent constructor.
    """
    super().__init__(**kwargs)
    parser = None if split_stem else NoopTokenParser()
    self.uast2id_sequence = Uast2IdSequence(None, parser)
def setUp(self):
    """Build the converter and parse the fixture (needs a bblfsh server on :9432)."""
    self.uast2role_id_pairs = Uast2IdTreeDistance(
        token_parser=NoopTokenParser(), max_distance=4)
    client = BblfshClient("0.0.0.0:9432")
    self.uast = client.parse(SOURCE_PY, mode=Modes.ANNOTATED).uast
    # Show full diffs on assertion failures — the pair lists are long.
    self.maxDiff = None