def seg_txt_search(txt):
    """Segment ``txt`` into a flat list of UTF-8 encoded search terms.

    Tokens are taken from ``seg_txt(txt)`` and decoded as UTF-8 (undecodable
    bytes are ignored).  Consecutive one-character tokens are collected into
    a run; every longer token first flushes the current run and is then
    handled on its own:

    * a run of two or more characters is joined and expanded with
      ``word_len2``;
    * a run of exactly one character is kept only if it lies between
      U+4E00 and U+9FA5 and is not in ``SMALLCHAR``;
    * a longer token of at most 16 characters is lower-cased, then kept
      whole if ``isalnum()`` holds for its UTF-8 form, kept (UTF-8 encoded)
      if it has at most two characters, and otherwise expanded with
      ``word_len2``;
    * tokens longer than 16 characters contribute nothing.

    Args:
        txt: Text to segment; passed unchanged to ``seg_txt``.
            NOTE(review): ``seg_txt`` is assumed to yield UTF-8 byte
            strings, since ``.decode`` is called on each token -- confirm.

    Returns:
        list: The collected terms in encounter order, with every text
        (unicode) item encoded to UTF-8; other items are passed through.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3: lets the final
    # normalisation pick out decoded text on either interpreter.
    text_type = type(u"")
    result = []
    pending = []  # current run of consecutive one-character tokens

    def flush_pending():
        """Emit the terms for the current run and start a new one."""
        if len(pending) > 1:
            result.extend(word_len2("".join(pending)))
        elif pending:
            char = pending[0]
            # u"" literals keep this a text-vs-text comparison everywhere.
            if u"一" <= char <= u"龥" and char not in SMALLCHAR:
                result.append(char)
        del pending[:]

    for token in seg_txt(txt):
        word = token.decode("utf-8", "ignore")
        if len(word) == 1:
            pending.append(word)
            continue

        flush_pending()
        if len(word) > 16:
            # Over-long tokens are skipped entirely.
            continue

        word = word.lower()
        utf8_word = word.encode("utf-8", "ignore")
        if utf8_word.isalnum():
            result.append(word)
        elif len(word) <= 2:
            result.append(utf8_word)
        else:
            result.extend(word_len2(word))

    flush_pending()

    return [
        item.encode("utf-8", "ignore") if isinstance(item, text_type) else item
        for item in result
    ]
def seg_txt2(txt):
    """Yield the lower-cased tokens of ``txt``, filtering lone characters.

    Every token produced by ``seg_txt(txt)`` is lower-cased.  A token whose
    length (as received) exceeds three is yielded as is.  Any other token is
    decoded as UTF-8 (undecodable bytes are ignored) and yielded in decoded
    form, except that a one-character token is yielded only when it lies
    between U+4E00 and U+9FA5 and is not in ``STOPWORD``.

    Note that, as a consequence, the generator mixes un-decoded long tokens
    with decoded short ones.

    Args:
        txt: Text to segment; passed unchanged to ``seg_txt``.
            NOTE(review): ``seg_txt`` is assumed to yield UTF-8 byte
            strings, since ``.decode`` is called on short tokens -- confirm.

    Yields:
        The tokens that survive the filtering described above.
    """
    for token in seg_txt(txt):
        token = token.lower()
        if len(token) > 3:
            yield token
            continue

        text = token.decode("utf-8", "ignore")
        if len(text) != 1:
            yield text
        # u"" literals keep this a text-vs-text comparison on Python 2 and 3.
        elif u"一" <= text <= u"龥" and text not in STOPWORD:
            yield text
def seg_title_search(txt):
    """Segment a title into a flat list of UTF-8 encoded search terms.

    Tokens are taken from ``seg_txt(txt)`` and decoded as UTF-8 (undecodable
    bytes are ignored).  Consecutive one-character tokens are collected into
    a run.  When a longer token arrives:

    * every character of the pending run is emitted, and a run of two or
      more characters is additionally joined and expanded with
      ``word_len2``;
    * the token itself, if at most 16 characters long, is lower-cased and
      then either kept whole (when ``isalnum()`` holds for its UTF-8 form)
      or emitted character by character followed by its UTF-8 form (at most
      two characters) or its ``word_len2`` expansion (longer);
    * tokens longer than 16 characters contribute nothing.

    A run still pending at the end of input is treated differently from one
    flushed inside the loop: two or more characters yield only the
    ``word_len2`` expansion, and a single character is kept only if it lies
    between U+4E00 and U+9FA5 and is not in ``SMALLCHAR``.

    NOTE(review): this function name is defined more than once in this
    file; the last definition is the one that takes effect.

    Args:
        txt: Title text; passed unchanged to ``seg_txt``.
            NOTE(review): ``seg_txt`` is assumed to yield UTF-8 byte
            strings, since ``.decode`` is called on each token -- confirm.

    Returns:
        list: The collected terms in encounter order, with every text
        (unicode) item encoded to UTF-8; other items are passed through.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3.  The bare name
    # ``unicode`` does not exist on Python 3, so derive the type instead.
    text_type = type(u"")
    result = []
    pending = []  # current run of consecutive one-character tokens

    for token in seg_txt(txt):
        word = token.decode("utf-8", "ignore")
        if len(word) == 1:
            pending.append(word)
            continue

        # Flush the run that this longer token terminates.
        result.extend(pending)
        if len(pending) > 1:
            result.extend(word_len2("".join(pending)))
        pending = []

        if len(word) > 16:
            # Over-long tokens are skipped entirely.
            continue

        word = word.lower()
        utf8_word = word.encode("utf-8", "ignore")
        if utf8_word.isalnum():
            result.append(word)
            continue

        # Extending with a string adds each of its characters individually.
        result.extend(word)
        if len(word) <= 2:
            result.append(utf8_word)
        else:
            result.extend(word_len2(word))

    # Trailing run: no per-character terms here, unlike the in-loop flush.
    if len(pending) > 1:
        result.extend(word_len2("".join(pending)))
    elif pending:
        char = pending[0]
        if u"一" <= char <= u"龥" and char not in SMALLCHAR:
            result.append(char)

    return [
        item.encode("utf-8", "ignore") if isinstance(item, text_type) else item
        for item in result
    ]
def seg_title_search(txt):
    """Segment a title into a flat list of UTF-8 encoded search terms.

    Tokens are taken from ``seg_txt(txt)`` and decoded as UTF-8 (undecodable
    bytes are ignored).  Consecutive one-character tokens are collected into
    a run.  When a longer token arrives:

    * every character of the pending run is emitted, and a run of two or
      more characters is additionally joined and expanded with
      ``word_len2``;
    * the token itself, if at most 16 characters long, is lower-cased and
      then either kept whole (when ``isalnum()`` holds for its UTF-8 form)
      or emitted character by character followed by its UTF-8 form (at most
      two characters) or its ``word_len2`` expansion (longer);
    * tokens longer than 16 characters contribute nothing.

    A run still pending at the end of input is treated differently from one
    flushed inside the loop: two or more characters yield only the
    ``word_len2`` expansion, and a single character is kept only if it lies
    between U+4E00 and U+9FA5 and is not in ``SMALLCHAR``.

    NOTE(review): an identical definition of this function appears earlier
    in this file; this later one is the definition that takes effect.

    Args:
        txt: Title text; passed unchanged to ``seg_txt``.
            NOTE(review): ``seg_txt`` is assumed to yield UTF-8 byte
            strings, since ``.decode`` is called on each token -- confirm.

    Returns:
        list: The collected terms in encounter order, with every text
        (unicode) item encoded to UTF-8; other items are passed through.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3.  The bare name
    # ``unicode`` does not exist on Python 3, so derive the type instead.
    text_type = type(u"")
    terms = []
    run = []  # current run of consecutive one-character tokens

    for raw in seg_txt(txt):
        word = raw.decode("utf-8", "ignore")
        if len(word) == 1:
            run.append(word)
            continue

        # Flush the run that this longer token terminates.
        terms.extend(run)
        if len(run) > 1:
            terms.extend(word_len2("".join(run)))
        run = []

        if len(word) > 16:
            # Over-long tokens are skipped entirely.
            continue

        word = word.lower()
        utf8_word = word.encode("utf-8", "ignore")
        if utf8_word.isalnum():
            terms.append(word)
            continue

        # Extending with a string adds each of its characters individually.
        terms.extend(word)
        if len(word) <= 2:
            terms.append(utf8_word)
        else:
            terms.extend(word_len2(word))

    # Trailing run: no per-character terms here, unlike the in-loop flush.
    if len(run) > 1:
        terms.extend(word_len2("".join(run)))
    elif run:
        char = run[0]
        if u"一" <= char <= u"龥" and char not in SMALLCHAR:
            terms.append(char)

    return [
        term.encode("utf-8", "ignore") if isinstance(term, text_type) else term
        for term in terms
    ]