def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize pre-tagged text where each line holds one token as
    tab-separated ``word<TAB>lemma<TAB>part-of-speech<TAB>named-entity``
    fields, yielding one Token per input line.

    :param value: the unicode text to tokenize, one tagged token per line.
    :param positions: whether to record token positions on the token.
    :param chars: whether to record character offsets on the token.
    :param keeporiginal: whether to keep the original text on the token.
    :param removestops: whether to remove stop words (passed to Token).
    :param start_pos: position offset added to each token's position.
    :param start_char: character offset added to each token's offsets.
    :param tokenize: unused here; kept for analyzer-interface compatibility.
    :param mode: the analysis mode (e.g. "index" or "query").
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Running character offset of the current line within ``value``; needed
    # because str.split results (unlike regex matches) carry no offsets.
    line_start = 0
    for i, line in enumerate(value.split('\n')):
        fields = line.strip().split('\t')
        # A well-formed line has exactly four tab-separated fields; any
        # other shape yields empty lemma/POS/NE annotations.  (Use ==, not
        # `is`: identity comparison of ints is an implementation detail.)
        if len(fields) == 4:
            word, lemma, pos, ne = fields
        else:
            word, lemma, pos, ne = fields[0], "", "", ""
        t.text = word
        t.lemma = lemma
        t.part_of_speech = pos
        t.named_entity = ne
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + i
        if chars:
            # The original called match.start()/match.end() on a plain
            # string (AttributeError); compute offsets from line_start.
            t.startchar = start_char + line_start
            t.endchar = start_char + line_start + len(line)
        line_start += len(line) + 1  # +1 for the '\n' separator
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize pre-tagged text where each line holds one token as
    tab-separated ``word<TAB>lemma<TAB>part-of-speech<TAB>named-entity``
    fields, yielding one Token per input line.

    :param value: the unicode text to tokenize, one tagged token per line.
    :param positions: whether to record token positions on the token.
    :param chars: whether to record character offsets on the token.
    :param keeporiginal: whether to keep the original text on the token.
    :param removestops: whether to remove stop words (passed to Token).
    :param start_pos: position offset added to each token's position.
    :param start_char: character offset added to each token's offsets.
    :param tokenize: unused here; kept for analyzer-interface compatibility.
    :param mode: the analysis mode (e.g. "index" or "query").
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Running character offset of the current line within ``value``; needed
    # because str.split results (unlike regex matches) carry no offsets.
    line_start = 0
    for i, line in enumerate(value.split('\n')):
        fields = line.strip().split('\t')
        # A well-formed line has exactly four tab-separated fields; any
        # other shape yields empty lemma/POS/NE annotations.  (Use ==, not
        # `is`: identity comparison of ints is an implementation detail.)
        if len(fields) == 4:
            word, lemma, pos, ne = fields
        else:
            word, lemma, pos, ne = fields[0], "", "", ""
        t.text = word
        t.lemma = lemma
        t.part_of_speech = pos
        t.named_entity = ne
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + i
        if chars:
            # The original called match.start()/match.end() on a plain
            # string (AttributeError); compute offsets from line_start.
            t.startchar = start_char + line_start
            t.endchar = start_char + line_start + len(line)
        line_start += len(line) + 1  # +1 for the '\n' separator
        yield t