# --- Janome-based Japanese tokenizer ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), '%s is not unicode' % repr(value)
    token = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
    if not tokenize:
        # Emit the whole field value as a single token.
        token.original = token.text = value
        token.boost = 1.0
        if positions:
            token.pos = start_pos
        if chars:
            token.startchar = start_char
            token.endchar = start_char + len(value)
        yield token
    else:
        pos = start_pos
        for janome_token in self.tagger.tokenize(value):
            token.text = janome_token.surface
            token.boost = 1.0
            if keeporiginal:
                token.original = token.text
            token.stopped = False
            if positions:
                token.pos = pos
                pos += 1
            if chars:
                token.startchar = start_char + janome_token.start
                token.endchar = token.startchar + len(janome_token.surface)
            yield token
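
# Usage sketch (assumed, not from the source): wrap the method above in a
# Whoosh Tokenizer subclass whose self.tagger is a Janome tokenizer, then
# compose it with filters. JanomeTokenizer is a hypothetical name.
from janome.tokenizer import Tokenizer as JanomeTagger
from whoosh.analysis import LowercaseFilter, Tokenizer
from whoosh.fields import Schema, TEXT


class JanomeTokenizer(Tokenizer):
    def __init__(self):
        self.tagger = JanomeTagger()

    # __call__ as defined above


analyzer = JanomeTokenizer() | LowercaseFilter()
schema = Schema(body=TEXT(analyzer=analyzer))
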
# --- Generic tokenizer driven by an iter_value() helper ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        # iter_value() yields one (start, stop, text) triple per token.
        for pos, (start, stop, text) in enumerate(self.iter_value(value)):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + stop
            yield t
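
# A hypothetical iter_value() to pair with the method above (the source
# does not show it): it must yield (start, stop, text) triples, here
# produced by a simple word regex.
import re

_WORD = re.compile(r'\w+', re.UNICODE)


def iter_value(self, value):
    for m in _WORD.finditer(value):
        yield m.start(), m.end(), m.group(0)
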
# --- Morphological tokenizer over a tagger.parse() interface (e.g. Igo,
# whose morphemes carry surface/feature/start attributes) ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        for m in self.tagger.parse(value):
            t.text = m.surface
            t.feature = m.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + m.start
                t.endchar = t.startchar + len(m.surface)
            yield t
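
# Sketch for the "use base form" TODO above, assuming IPADIC-style feature
# strings where the base form is the 7th comma-separated field and '*'
# marks a missing entry. Inside the loop one would then set
# t.text = base_form(m).
def base_form(m):
    features = m.feature.split(',')
    if len(features) > 6 and features[6] != '*':
        return features[6]
    return m.surface
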
# --- MeCab-based tokenizer with a configurable dictionary encoding ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    enc = self.encoding
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(toMeCab(value))
        while m:
            # Skip BOS/EOS and other zero-length nodes.
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = fromMeCab(m.surface, enc)
            t.feature = fromMeCab(m.feature, enc)  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # m.rlength - m.length is the byte length of any whitespace
                # preceding the surface; both are byte counts.
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                # Convert byte offsets into unicode character offsets
                # (note: `byte` is utf-8 even though surfaces are decoded
                # with enc).
                t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                t.endchar = t.startchar + len(byte[s:e].decode(enc))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
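
# Hypothetical definitions of the toMeCab/fromMeCab helpers used above (the
# source does not show them). The assumption: the MeCab binding exchanges
# byte strings in the dictionary's encoding, so text is encoded on the way
# in and decoded on the way out.
def toMeCab(value, encoding='utf-8'):
    return value.encode(encoding)


def fromMeCab(text, encoding):
    return text.decode(encoding)
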
# --- Tokenizer delegating to a custom my_tokenize_func() ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    """
    Rewritten call method.

    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token.
    :param start_char: The offset of the first character of the first token.
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        # Matches from my_tokenize_func() are used as tokens (replacing the
        # default regex matching).
        for pos, match in enumerate(my_tokenize_func(value)):
            t.text = match
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(match)
                # Assumes exactly one separator character between tokens.
                start_char = t.endchar + 1
            yield t
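
# my_tokenize_func is a placeholder in the source. A minimal stand-in that
# keeps the "one separator character between tokens" offset assumption
# noted above is a split on single spaces:
def my_tokenize_func(value):
    return value.split(' ')
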
# --- TinySegmenter-based tokenizer ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        # l is the length of the *unstripped* segment, so character offsets
        # stay aligned with the original text even when stripping.
        for s, l in ((strip(s), len(s))
                     for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
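
# Quick sanity check of the segmenter the method above wraps. The packaged
# tinysegmenter API exposes tokenize() on a TinySegmenter instance; the
# code above assumes a module-level tokenize(), so adapt as needed.
import tinysegmenter

segmenter = tinysegmenter.TinySegmenter()
print(segmenter.tokenize(u'私の名前は中野です'))
# -> ['私', 'の', '名前', 'は', '中野', 'です']
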
# --- MeCab-based tokenizer (UTF-8 only) ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            # Skip BOS/EOS and other zero-length nodes.
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # m.rlength - m.length is the byte length of any whitespace
                # preceding the surface; convert byte offsets to unicode
                # character offsets.
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
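
# Hypothetical setup for self.tagger (not shown in the source); this
# snippet is Python 2 style, matching the bytes-based decoding above.
# BOS/EOS nodes returned by parseToNode() have an empty surface, which is
# why the loop above skips zero-length surfaces.
import MeCab

tagger = MeCab.Tagger('')
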
# --- jieba-based Chinese tokenizer (a modified Whoosh RegexTokenizer) ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    # Reject anything that is not a unicode string.
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # Instead of the default regex matching, segment with jieba.
        seglist = jieba.cut(value, cut_all=True)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            t.stopped = False
            if positions:
                # The character offset doubles as the position, since
                # cut_all=True yields overlapping segments.
                t.pos = start_pos + value.find(w)
            if chars:
                # Note: find() locates only the first occurrence of w, so
                # offsets are wrong for repeated words.
                t.startchar = start_char + value.find(w)
                t.endchar = t.startchar + len(w)
            yield t
    else:
        # When gaps=True, iterate through the matches and
        # yield the text between them.
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()
        # If the last "gap" was before the end of the text,
        # yield the last bit of text as a final token.
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                # Offset by start_char, like the other branches.
                t.startchar = start_char + prevend
                t.endchar = start_char + len(value)
            yield t
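
# A hedged alternative to the find()-based offsets above, written as a
# hypothetical helper: jieba.tokenize() reports (word, start, end) per
# segment, so repeated words get correct offsets. Its body is a drop-in
# replacement for the loop in the `elif not self.gaps:` branch.
def _jieba_tokenize_alternative(self, value, t, positions, chars,
                                start_pos, start_char):
    for w, start, end in jieba.tokenize(value, mode='search'):
        t.original = t.text = w
        t.boost = 1.0
        t.stopped = False
        if positions:
            t.pos = start_pos + start
        if chars:
            t.startchar = start_char + start
            t.endchar = start_char + end
        yield t
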
# --- Stanford CoreNLP-based tokenizer ---
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, tokenize=True, mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        try:
            json_result = self.stanford_parser.api_call(
                value, properties=self.additional_properties)
            for sentence in json_result['sentences']:
                for token in sentence['tokens']:
                    if token:
                        t.text = token['word']
                        t.lemma = token['lemma']
                        # Keep the part-of-speech tag under its own name;
                        # the original assigned it to t.pos, which the
                        # position counter below immediately overwrote.
                        t.pos_tag = token['pos']
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = token['originalText']
                        t.stopped = False
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            # Offsets come straight from CoreNLP;
                            # start_char is not applied here.
                            t.startchar = token['characterOffsetBegin']
                            t.endchar = token['characterOffsetEnd']
                        yield t
        except Exception as e:
            # Requires a module-level `import logging`; note that a failed
            # API call is swallowed and yields no tokens.
            logging.critical(str(e))
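
# Hypothetical setup for self.stanford_parser / self.additional_properties
# (not shown in the source), assuming NLTK's CoreNLP client pointed at a
# locally running CoreNLP server.
from nltk.parse.corenlp import CoreNLPParser

stanford_parser = CoreNLPParser(url='http://localhost:9000')
additional_properties = {'annotators': 'tokenize,ssplit,pos,lemma'}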