def __call__(self, value, start_pos=0, positions=False, **kwargs):
    """
    Tokenizer behaviour:

    Input: u"text/x.moin.wiki;charset=utf-8"
    Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki", u"charset=utf-8"

    Input: u"application/pdf"
    Output: u"application/pdf", u"application", u"pdf"

    :param value: String for tokenization
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param positions: Whether to record token positions in the token.
    """
    assert isinstance(value, unicode), "{0!r} is not unicode".format(value)
    if u'/' not in value:
        # add '/' if the user forgot to include it
        value += u'/'
    pos = start_pos
    tk = Token()
    tp = Type(value)
    # we need to yield the complete contenttype in one piece,
    # so we can find it with Term(CONTENTTYPE, contenttype):
    if tp.type is not None and tp.subtype is not None:
        # note: we do not use "value" directly, so Type.__unicode__ can normalize it:
        tk.text = unicode(tp)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    # now yield the pieces:
    tk.text = tp.type
    if positions:
        tk.pos = pos
        pos += 1
    yield tk
    if tp.subtype is not None:
        tk.text = tp.subtype
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    for key, value in tp.parameters.items():
        tk.text = u"{0}={1}".format(key, value)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
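# A minimal usage sketch for the contenttype tokenizer above. The class name
# ContentTypeTokenizer is an assumption (only __call__ is shown); Token and
# Type come from the surrounding whoosh/MoinMoin code.
tokenizer = ContentTypeTokenizer()
for tk in tokenizer(u"text/x.moin.wiki;charset=utf-8", positions=True):
    # expected pieces: the full contenttype, then u"text", u"x.moin.wiki",
    # and u"charset=utf-8", each with an increasing pos
    print(tk.text, tk.pos)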
def __call__(self, value, start_pos=0, positions=False, mode=u'', **kwargs):
    """
    Calls AccessControlList for tokenization

    Analyzer behaviour:

    In index mode:
        Input: u"JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"

        Output: u'JoeDoe:+read', u'JoeDoe:+write', u'JoeDoe:-create', u'JoeDoe:+admin',
                u'JoeDoe:+destroy', u'JaneDoe:+read', u'JaneDoe:+write', u'JaneDoe:-create',
                u'JaneDoe:+admin', u'JaneDoe:+destroy', u'EditorGroup:+write', u'All:+read',
                u'All:-write', u'All:-create', u'All:-admin', u'All:-destroy'

    In query mode:
        Input: u"JoeDoe:+write"

        Output: u"JoeDoe:+write"

    :param value: unicode string
    :param positions: Whether to record token positions in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    """
    assert isinstance(value, unicode)
    pos = start_pos
    tk = Token()
    tk.mode = mode
    if mode == "query":
        tk.text = value
        if positions:
            tk.pos = pos
        yield tk
    else:
        acl = AccessControlList([value], valid=self._acl_rights_contents)
        for name, permissions in acl.acl:
            for permission in permissions:
                sign = "+" if permissions[permission] else "-"
                tk.text = u"{0}:{1}{2}".format(name, sign, permission)
                if positions:
                    tk.pos = pos
                    pos += 1
                yield tk
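# A usage sketch for the ACL tokenizer above, assuming it belongs to a class
# named AclTokenizer here (hypothetical; only __call__ is shown) constructed
# with the list of valid rights that becomes self._acl_rights_contents:
acl_tokenizer = AclTokenizer([u"read", u"write", u"create", u"admin", u"destroy"])
# index mode: one token per user/right pair, with explicit +/- signs
index_terms = [t.text for t in acl_tokenizer(u"All:read", positions=True)]
# query mode: the value is passed through unchanged
query_terms = [t.text for t in acl_tokenizer(u"JoeDoe:+write", mode=u"query")]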
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        for pos, (start, stop, text) in enumerate(self.iter_value(value)):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + stop
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), '%s is not unicode' % repr(value)
    token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        token.original = token.text = value
        token.boost = 1.0
        if positions:
            token.pos = start_pos
        if chars:
            token.startchar = start_char
            token.endchar = start_char + len(value)
        yield token
    else:
        pos = start_pos
        for janome_token in self.tagger.tokenize(value):
            token.text = janome_token.surface
            token.boost = 1.0
            if keeporiginal:
                token.original = token.text
            token.stopped = False
            if positions:
                token.pos = pos
                pos += 1
            if chars:
                token.startchar = start_char + janome_token.start
                token.endchar = token.startchar + len(janome_token.surface)
            yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        for m in self.tagger.parse(value):
            t.text = m.surface
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + m.start
                t.endchar = t.startchar + len(m.surface)
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    enc = self.encoding
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(toMeCab(value))
        while m:
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = fromMeCab(m.surface, enc)
            t.feature = fromMeCab(m.feature, enc)
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                # convert num of byte to num of unicode chars
                t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                t.endchar = t.startchar + len(byte[s:e].decode(enc))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    Rewritten call method

    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token.
    :param start_char: The offset of the first character of the first token.
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        # the strings returned by my_tokenize_func are used as tokens;
        # character offsets assume tokens are separated by a single character
        for pos, match in enumerate(my_tokenize_func(value)):
            t.text = match
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(match)
                start_char = t.endchar + 1
            yield t
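# my_tokenize_func is not defined in the snippet above. A minimal placeholder
# that satisfies the loop's assumptions (it must return the token strings in
# order, separated by exactly one character in the source text) is a plain
# whitespace split; this is an illustrative assumption, not the original
# implementation:
def my_tokenize_func(text):
    return text.split(" ")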
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        # skip single-character tokens that are not made of accepted characters
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
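# A usage sketch for the jieba-based tokenizer above, assuming it is a Whoosh
# Tokenizer subclass named ChineseTokenizer (as in the common jieba/Whoosh
# integration) and that accepted_chars is a module-level compiled regex:
from whoosh.analysis import LowercaseFilter

analyzer = ChineseTokenizer() | LowercaseFilter()
terms = [t.text for t in analyzer(u"我爱Python编程")]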
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        for s, l in ((strip(s), len(s))
                     for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
def __call__(self, value, mode='', positions=False, **kwargs):
    assert isinstance(value, unicode), "%r is not unicode" % value
    token = Token(**kwargs)
    tagger = MeCab.Tagger('mecabrc')
    result = tagger.parse(value.encode("utf8")).decode('utf8')
    cur = 0
    for match in re.compile(r"(\S+)\s+(\S+)\n").finditer(result):
        category = match.group(2).split(",")
        # keep only nouns (名詞), verbs (動詞), adjectives (形容詞) and adverbs (副詞)
        if 0 < len(category) and \
                (category[0] == u'名詞' or category[0] == u'動詞' or
                 category[0] == u'形容詞' or category[0] == u'副詞'):
            token.text = match.group(1)
            token.pos = cur
            yield token
        cur += len(match.group(1))
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
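# The byte-offset arithmetic above relies on two MeCab node attributes
# (assuming the classic mecab-python bindings): node.length is the byte
# length of the surface form, and node.rlength additionally counts the
# whitespace preceding it, so rlength - length skips that gap before the
# token starts. A minimal sketch of walking the node list:
import MeCab

tagger = MeCab.Tagger()
node = tagger.parseToNode(u"すもも もも".encode("utf-8"))
while node:
    if node.surface:
        # leading-gap bytes for this token = node.rlength - node.length
        print(node.surface, node.length, node.rlength)
    node = node.next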
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(value)
    t.pos = start_pos
    # note: this yields the dependency label of each spaCy token as the token
    # text, and every emitted token shares the same position
    for chunk in doc:
        t.text = chunk.dep_
        yield t
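# spacy.load() is expensive, and calling it inside __call__ (as above) reloads
# the model for every field value. A common alternative is to load the model
# once and reuse it; a minimal sketch with an illustrative module-level cache
# (names are assumptions, not part of the original snippet):
import spacy

_NLP = None

def get_nlp():
    global _NLP
    if _NLP is None:
        _NLP = spacy.load('en_core_web_sm')
    return _NLP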
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first token.
        For example, if you set start_char=2, the text "aaa bbb" will have
        chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        try:
            json_result = self.stanford_parser.api_call(
                value, properties=self.additional_properties)
            for sentence in json_result['sentences']:
                for token in sentence['tokens']:
                    if token:
                        t.text = token['word']
                        t.lemma = token['lemma']
                        t.pos = token['pos']
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = token['originalText']
                        t.stopped = False
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = token['characterOffsetBegin']
                            t.endchar = token['characterOffsetEnd']
                        yield t
        except Exception as e:
            logging.critical(str(e))
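# The snippet above calls self.stanford_parser.api_call(...), which matches
# the api_call() method of nltk's CoreNLP client. A hedged setup sketch
# (assumes a CoreNLP server running locally on port 9000; the attribute
# contents are illustrative):
from nltk.parse.corenlp import CoreNLPParser

stanford_parser = CoreNLPParser(url='http://localhost:9000')
additional_properties = {'annotators': 'tokenize,ssplit,pos,lemma'}
json_result = stanford_parser.api_call(u"The quick brown fox jumps.",
                                        properties=additional_properties)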
def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
    """
    This tokenizer is used for both indexing and queries. Queries are simple
    and usually return the input value as is. For indexing, tokens are
    generated for the incoming value plus its various parts, as shown below.
    Special cases create tokens for moinwiki, jpg, and mp3.

    Input: "text/x.moin.wiki;charset=utf-8"
    Output: "text/x.moin.wiki;charset=utf-8", "text", "moinwiki", "x.moin.wiki", "x",
            "moin", "wiki", "charset=utf-8", "charset", "utf-8"

    Input: "application/pdf"
    Output: "application/pdf", "application", "pdf"

    :param value: String for tokenization
    :param mode: 'query' or 'index'
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param positions: Whether to record token positions in the token. These
        are unwanted, but positions=True is passed on indexing and
        positions=False on queries.
    """
    tk = Token()
    tk.pos = 0
    if mode == 'query':
        # 1 term expected, but contenttype:'moin utf-8' is valid
        val = value.split()
        for v in val:
            tk.text = v
            yield tk
    else:
        # mode == 'index'
        tk.text = value  # text/x.moin.wiki;charset=utf-8
        yield tk
        if '/' not in value:
            # unsupported contenttype
            return
        major, minor = value.split('/')  # text, x.moin.wiki;charset=utf-8
        tk.text = major  # text
        yield tk
        if ';' in minor:
            parameters = minor.split(';')  # x.moin.wiki, charset=utf-8
            for par in parameters[1:]:
                tk.text = par  # charset=utf-8
                yield tk
                key, val = par.split('=')  # charset, utf-8
                tk.text = key  # charset
                yield tk
                tk.text = val  # utf-8
                yield tk
            minor = parameters[0]  # x.moin.wiki
        if minor == 'mpeg':
            # 'audio/mpeg': most people expect mp3
            tk.text = 'mp3'
            yield tk
        if minor == 'jpeg':
            # 'image/jpeg': most people expect jpg
            tk.text = 'jpg'
            yield tk
        if minor == 'x.moin.wiki':
            # 'moin' is valid for both moin and creole; use this to get just moin
            tk.text = 'moinwiki'
            yield tk
        tk.text = minor  # x.moin.wiki
        yield tk
        if '.' in minor:
            for m in minor.split('.'):  # x, moin, wiki
                tk.text = m
                yield tk
        if '-' in minor:  # x-markdown
            for m in minor.split('-'):
                tk.text = m
                yield tk
        if '+' in minor:  # svg+xml
            for m in minor.split('+'):
                tk.text = m
                yield tk
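# A usage sketch for the index/query contenttype tokenizer above (the class
# name ContentTypeTokenizer is an assumption; only __call__ is shown):
tokenizer = ContentTypeTokenizer()
# index mode: the full contenttype plus its parts, e.g.
# "text/x.moin.wiki;charset=utf-8", "text", "charset=utf-8", "charset",
# "utf-8", "moinwiki", "x.moin.wiki", "x", "moin", "wiki"
index_terms = [t.text for t in tokenizer("text/x.moin.wiki;charset=utf-8",
                                         positions=True)]
# query mode: each whitespace-separated term is passed through
query_terms = [t.text for t in tokenizer("moin utf-8", mode='query')]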
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first token.
        For example, if you set start_char=2, the text "aaa bbb" will have
        chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    # the value must be a unicode string, otherwise raise
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # The default: matches are used as tokens. Instead of the regex
        # matches used by RegexTokenizer, the words from jieba's full-mode
        # segmentation are yielded here.
        seglist = jieba.cut(value, cut_all=True)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
    else:
        # When gaps=True, iterate through the matches and
        # yield the text between them.
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()
        # If the last "gap" was before the end of the text,
        # yield the last bit of text as a final token.
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = prevend
                t.endchar = len(value)
            yield t
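# A usage sketch showing how an analyzer built around this jieba tokenizer is
# typically plugged into a Whoosh schema. ChineseAnalyzer is an assumed
# factory name here; the standard jieba integration ships one in
# jieba.analyse.analyzer:
from whoosh.fields import Schema, TEXT, ID

schema = Schema(path=ID(stored=True),
                content=TEXT(analyzer=ChineseAnalyzer()))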