예제 #1
0
class NoopTokenParserTests(unittest.TestCase):
    """Checks that NoopTokenParser.process_token hands tokens back unchanged."""

    def setUp(self):
        self.tp = NoopTokenParser()

    def test_process_token(self):
        # Plain, snake_case and camelCase tokens must each come back as a
        # single item equal to the input.
        for token in ("abcdef", "abcd_ef", "abcDef"):
            produced = list(self.tp.process_token(token))
            self.assertEqual(produced, [token])
예제 #2
0
 def __init__(self, token2index=None, token_parser=None):
     """
     Store the token mapping and the token parser, creating defaults when omitted.

     :param token2index: The mapping from tokens to bag keys. If None, a
         :class:`FakeVocabulary` instance is created and no mapping is performed.
     :param token_parser: Specify token parser if you want to use a custom one.
         A :class:`NoopTokenParser` instance is created if it is not specified.
     """
     if token2index is None:
         token2index = FakeVocabulary()
     self._token2index = token2index

     if token_parser is None:
         token_parser = NoopTokenParser()
     self._token_parser = token_parser
예제 #3
0
 def __init__(self,
              split_stem=False,
              type="tree",
              max_distance=DEFAULT_MAX_DISTANCE,
              **kwargs):
     """
     Create the ``uast2id_distance`` object after initializing the parent class.

     :param split_stem: If falsy, a :class:`NoopTokenParser` is passed as the token
         parser; otherwise ``None`` is passed (presumably the callee then picks its
         own default parser - confirm against the distance class).
     :param type: Value resolved through ``self.DistanceType.resolve`` to obtain the
         class that is instantiated.
     :param max_distance: Forwarded unchanged as ``max_distance``.
     :param kwargs: Forwarded to the parent initializer.
     """
     super().__init__(**kwargs)
     distance_cls = self.DistanceType.resolve(type)
     if split_stem:
         parser = None
     else:
         parser = NoopTokenParser()
     self.uast2id_distance = distance_cls(
         token_parser=parser, max_distance=max_distance)
예제 #4
0
class UastTokens2Bag(Uast2BagBase):
    """
    Turns a UAST into a weighted bag of tokens; nodes are selected with an XPath query.
    """

    # Query handed to bblfsh.filter(); it is None here, so child classes should override it.
    XPATH = None

    def __init__(self, token2index=None, token_parser=None):
        """
        Store the token mapping and the token parser, creating defaults when omitted.

        :param token2index: The mapping from tokens to bag keys. If None, a
            :class:`FakeVocabulary` instance is created and no mapping is performed.
        :param token_parser: Specify token parser if you want to use a custom one.
            A :class:`NoopTokenParser` instance is created if it is not specified.
        """
        if token2index is None:
            token2index = FakeVocabulary()
        self._token2index = token2index

        if token_parser is None:
            token_parser = NoopTokenParser()
        self._token_parser = token_parser

    @property
    def token_parser(self):
        """The token parser whose ``process_token`` is applied to node tokens."""
        return self._token_parser

    @property
    def token2index(self):
        """The mapping used to turn processed tokens into bag keys."""
        return self._token2index

    def __call__(self, uast):
        """
        Build a weighted bag-of-words from a UAST. Each weight is the number of
        times the corresponding key was produced. Node tokens are preprocessed by
        the token parser before being looked up in the token mapping.

        :param uast: The UAST root node.
        :return: ``defaultdict(int)`` mapping bag keys to their frequencies.
        """
        bag = defaultdict(int)
        for node in bblfsh.filter(uast, self.XPATH):
            for sub in self._token_parser.process_token(node.token):
                try:
                    key = self._token2index[sub]
                except KeyError:
                    # Tokens absent from the mapping are left out of the bag.
                    continue
                bag[key] += 1
        return bag
예제 #5
0
 def setUp(self):
     """Create the Uast2RoleIdPairs converter and parse SOURCE_PY into ``self.uast``."""
     self.uast2role_id_pairs = Uast2RoleIdPairs(token_parser=NoopTokenParser())
     # The parse request goes to the bblfsh endpoint at 0.0.0.0:9432.
     client = BblfshClient("0.0.0.0:9432")
     response = client.parse(SOURCE_PY, mode=Modes.ANNOTATED)
     self.uast = response.uast
예제 #6
0
파일: identifiers.py 프로젝트: zurk/ml-core
 def __init__(self, docfreq_threshold=None, split_stem=True, **kwargs):
     """
     Initialize the parent class and create the ``id2bag`` converter.

     :param docfreq_threshold: Forwarded to the parent initializer as its first argument.
     :param split_stem: If truthy, ``None`` is passed as the token parser; otherwise a
         :class:`NoopTokenParser` instance is passed.
     :param kwargs: Forwarded to the parent initializer.
     """
     super().__init__(docfreq_threshold, **kwargs)
     parser = None if split_stem else NoopTokenParser()
     self.id2bag = UastIds2Bag(None, parser)
예제 #7
0
 def setUp(self):
     """Create the Uast2IdLineDistance converter and parse SOURCE_PY into ``self.uast``."""
     converter = Uast2IdLineDistance(
         token_parser=NoopTokenParser(), max_distance=3)
     self.uast2role_id_pairs = converter
     # The parse request goes to the bblfsh endpoint at 0.0.0.0:9432.
     client = BblfshClient("0.0.0.0:9432")
     self.uast = client.parse(SOURCE_PY).uast
     # unittest: a maxDiff of None removes the length limit on failure diffs.
     self.maxDiff = None
예제 #8
0
 def setUp(self):
     """Create the Uast2IdSequence converter and parse SOURCE_PY into ``self.uast``."""
     parser = NoopTokenParser()
     self.uast2id_sequence = Uast2IdSequence(token_parser=parser)
     # The parse request goes to the bblfsh endpoint at 0.0.0.0:9432.
     client = BblfshClient("0.0.0.0:9432")
     response = client.parse(SOURCE_PY, mode=Modes.ANNOTATED)
     self.uast = response.uast
예제 #9
0
class RoleIdsExtractor(Extractor):
    """Extractor subclass named "roleids" whose ALGORITHM is a Uast2RoleIdPairs instance."""

    # Name string of this extractor.
    NAME = "roleids"
    # Evaluated once while the class body executes, so this single
    # Uast2RoleIdPairs instance (built with a NoopTokenParser) is shared
    # through the class attribute.
    ALGORITHM = Uast2RoleIdPairs(token_parser=NoopTokenParser())
예제 #10
0
 def setUp(self):
     """Store a new NoopTokenParser instance on ``self.tp``."""
     self.tp = NoopTokenParser()
예제 #11
0
파일: id_sequence.py 프로젝트: zurk/ml-core
 def __init__(self, split_stem=False, **kwargs):
     """
     Initialize the parent class and create the ``uast2id_sequence`` converter.

     :param split_stem: If truthy, ``None`` is passed as the token parser; otherwise a
         :class:`NoopTokenParser` instance is passed.
     :param kwargs: Forwarded to the parent initializer.
     """
     super().__init__(**kwargs)
     parser = None if split_stem else NoopTokenParser()
     self.uast2id_sequence = Uast2IdSequence(None, parser)
예제 #12
0
 def setUp(self):
     """Create the Uast2IdTreeDistance converter and parse SOURCE_PY into ``self.uast``."""
     converter = Uast2IdTreeDistance(
         token_parser=NoopTokenParser(), max_distance=4)
     self.uast2role_id_pairs = converter
     # The parse request goes to the bblfsh endpoint at 0.0.0.0:9432.
     client = BblfshClient("0.0.0.0:9432")
     response = client.parse(SOURCE_PY, mode=Modes.ANNOTATED)
     self.uast = response.uast
     # unittest: a maxDiff of None removes the length limit on failure diffs.
     self.maxDiff = None