예제 #1
0
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """Yield one token per line of tab-separated annotated text.

        Each line of ``value`` is stripped and split on tabs.  A line with
        exactly four fields is read as ``word, lemma, part-of-speech,
        named-entity``; any other line yields a token whose annotations are
        empty strings.  In both cases the token text is the first field of
        the stripped line.

        The same ``Token`` object is updated and yielded for every line, so
        callers that need to keep a token must copy what they need before
        advancing the generator.

        :param value: the text to tokenize; must be a ``text_type`` instance.
        :param positions: if true, set ``t.pos`` to ``start_pos`` plus the
            zero-based line index (blank lines are counted too).
        :param chars: if true, set ``t.startchar``/``t.endchar`` to the span
            of the token text inside ``value``, shifted by ``start_char``.
        :param keeporiginal: if true, copy the token text to ``t.original``.
        :param removestops: forwarded to the ``Token`` constructor.
        :param start_pos: offset added to every token position.
        :param start_char: offset added to every character index.
        :param tokenize: accepted for interface compatibility; not used here.
        :param mode: forwarded to the ``Token`` constructor.
        :param kwargs: forwarded to the ``Token`` constructor.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # Index in ``value`` at which the current line begins.
        line_start = 0
        for i, line in enumerate(value.split('\n')):
            fields = line.strip().split('\t')
            # Compare by value: ``is`` on an int literal only works by
            # accident of small-integer caching.
            if len(fields) == 4:
                _, lemma, pos, ne = fields
            else:
                lemma = pos = ne = ""
            t.text = fields[0]
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                # ``line`` is a plain string, not a regex match, so the
                # offsets are computed by hand: the token text starts right
                # after the line's leading whitespace.
                token_start = line_start + len(line) - len(line.lstrip())
                t.startchar = start_char + token_start
                t.endchar = start_char + token_start + len(t.text)
            # +1 accounts for the '\n' consumed by split().
            line_start += len(line) + 1
            yield t
예제 #2
0
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """Yield one token per line of tab-separated annotated text.

        Each line of ``value`` is stripped and split on tabs.  A line with
        exactly four fields is read as ``word, lemma, part-of-speech,
        named-entity``; any other line yields a token whose annotations are
        empty strings.  In both cases the token text is the first field of
        the stripped line.

        The same ``Token`` object is updated and yielded for every line, so
        callers that need to keep a token must copy what they need before
        advancing the generator.

        :param value: the text to tokenize; must be a ``text_type`` instance.
        :param positions: if true, set ``t.pos`` to ``start_pos`` plus the
            zero-based line index (blank lines are counted too).
        :param chars: if true, set ``t.startchar``/``t.endchar`` to the span
            of the token text inside ``value``, shifted by ``start_char``.
        :param keeporiginal: if true, copy the token text to ``t.original``.
        :param removestops: forwarded to the ``Token`` constructor.
        :param start_pos: offset added to every token position.
        :param start_char: offset added to every character index.
        :param tokenize: accepted for interface compatibility; not used here.
        :param mode: forwarded to the ``Token`` constructor.
        :param kwargs: forwarded to the ``Token`` constructor.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # Index in ``value`` at which the current line begins.
        line_start = 0
        for i, line in enumerate(value.split('\n')):
            fields = line.strip().split('\t')
            # Compare by value: ``is`` on an int literal only works by
            # accident of small-integer caching.
            if len(fields) == 4:
                _, lemma, pos, ne = fields
            else:
                lemma = pos = ne = ""
            t.text = fields[0]
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                # ``line`` is a plain string, not a regex match, so the
                # offsets are computed by hand: the token text starts right
                # after the line's leading whitespace.
                token_start = line_start + len(line) - len(line.lstrip())
                t.startchar = start_char + token_start
                t.endchar = start_char + token_start + len(t.text)
            # +1 accounts for the '\n' consumed by split().
            line_start += len(line) + 1
            yield t