def __call__( self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode="", **kwargs ): assert isinstance(value, text_type), "%r is not unicode" % value inlen = len(value) t = Token(positions, chars, removestops=removestops, mode=mode) pos = start_pos if mode == "query": size = min(self.max, inlen) for start in xrange(0, inlen - size + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1 else: for start in xrange(0, inlen - self.min + 1): for size in xrange(self.min, self.max + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1
def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs): assert isinstance(value, text_type), "%r is not unicode" % value inlen = len(value) t = Token(positions, chars, removestops=removestops, mode=mode) pos = start_pos if mode == "query" and self.reduce_for_query: size = min(self.max, inlen) for start in xrange(0, inlen - size + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1 else: for start in xrange(0, inlen - self.min + 1): for size in xrange(self.min, self.max + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1
def __call__(self, source_string, positions=False, chars=False, keeporiginal=False, start_pos=0, removestops=False, start_char=0, tokenize=True, mode='', **kwargs): t = Token(positions, chars, removestops=removestops, mode=mode) all_subtrees = dump(ast.parse(source_string), **self.string_dump_options) full_tree = next(all_subtrees) for pos, subtree in enumerate( itertools.chain((full_tree, ), all_subtrees)): t.text = subtree t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char + full_tree.index(subtree) t.endchar = start_char + t.startchar + len(subtree) yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode='', **kwargs): assert isinstance(value, text_type), "%s is not unicode" % repr(value) t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) # The default: expression matches are used as tokens for i, match in enumerate(value.split('\n')): fields = match.strip().split('\t') word, lemma, pos, ne = fields if len(fields) is 4 else ["", "", "", ""] t.text = match.strip().split('\t')[0] t.lemma = lemma t.part_of_speech = pos t.named_entity = ne t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + i if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs): assert isinstance(value, text_type), "%r is not unicode" % value t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) for pos, match in enumerate(value.split(self.separator)): t.text = match t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode='', **kwargs): assert isinstance(value, text_type), "%s is not unicode" % repr(value) t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) # The default: expression matches are used as tokens for i, match in enumerate(value.split('\n')): fields = match.strip().split('\t') word, lemma, pos, ne = fields if len(fields) is 4 else [ "", "", "", "" ] t.text = match.strip().split('\t')[0] t.lemma = lemma t.part_of_speech = pos t.named_entity = ne t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + i if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode='', **kwargs): """ :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. :param start_pos: The position number of the first token. For example, if you set start_pos=2, the tokens will be numbered 2,3,4,... instead of 0,1,2,... :param start_char: The offset of the first character of the first token. For example, if you set start_char=2, the text "aaa bbb" will have chars (2,5),(6,9) instead (0,3),(4,7). :param tokenize: if True, the text should be tokenized. """ assert isinstance(value, text_type), "%s is not unicode" % repr(value) t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: t.original = t.text = value t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value) yield t elif not self.gaps: # The default: expression matches are used as tokens for pos, match in enumerate(self.expression.finditer(value)): t.text = match.group(0) t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t else: # When gaps=True, iterate through the matches and # yield the text between them. prevend = 0 pos = start_pos for match in self.expression.finditer(value): start = prevend end = match.start() text = value[start:end] if text: t.text = text t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos pos += 1 if chars: t.startchar = start_char + start t.endchar = start_char + end yield t prevend = match.end() # If the last "gap" was before the end of the text, # yield the last bit of text as a final token. if prevend < len(value): t.text = value[prevend:] t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = prevend t.endchar = len(value) yield t
def __call__( self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode="", **kwargs ): """ :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. :param start_pos: The position number of the first token. For example, if you set start_pos=2, the tokens will be numbered 2,3,4,... instead of 0,1,2,... :param start_char: The offset of the first character of the first token. For example, if you set start_char=2, the text "aaa bbb" will have chars (2,5),(6,9) instead (0,3),(4,7). :param tokenize: if True, the text should be tokenized. """ assert isinstance(value, text_type), "%s is not unicode" % repr(value) t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: t.original = t.text = value t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value) yield t elif not self.gaps: # The default: expression matches are used as tokens for pos, match in enumerate(self.expression.finditer(value)): t.text = match.group(0) t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t else: # When gaps=True, iterate through the matches and # yield the text between them. prevend = 0 pos = start_pos for match in self.expression.finditer(value): start = prevend end = match.start() text = value[start:end] if text: t.text = text t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos pos += 1 if chars: t.startchar = start_char + start t.endchar = start_char + end yield t prevend = match.end() # If the last "gap" was before the end of the text, # yield the last bit of text as a final token. if prevend < len(value): t.text = value[prevend:] t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = prevend t.endchar = len(value) yield t