def main():
    """Walk through the instance-level and module-level tokenizer APIs, printing each result."""
    sample = "我来到北京清华大学"
    tokenizer = Tokenizer()

    # Full mode
    full_mode_tokens = tokenizer.cut(sample, cut_all=True)
    print(type(full_mode_tokens))
    print("Full Mode: " + "/ ".join(full_mode_tokens))

    # Default is accurate mode
    print(", ".join(tokenizer.cut("他来到了网易杭研大厦")))

    # Search-engine mode
    print(", ".join(tokenizer.cut_for_search(
        "小明硕士毕业于中国科学院计算所,后在日本京都大学深造")))

    # Measure how long the explicit module-level initialize() call takes.
    started = datetime.datetime.now()
    initialize()
    finished = datetime.datetime.now()
    print("initialize costs:%s" % (finished - started))

    # Module-level helpers, called on the same sample sentence.
    print(lcut(sample))
    print(list(cut(sample)))
    print(cut(sample, cut_all=True))  # printed as returned, not expanded into a list
    print(lcut_for_search(sample))
    print(list(cut_for_search(sample)))
    print(pseg.lcut(sample))
    print(list(pseg.cut(sample)))

    # Keyword extraction: module-level functions first, then explicit
    # extractor objects built around the tokenizer instance.
    text = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    print(analyse.extract_tags(text))
    print(analyse.textrank(text, withWeight=True))

    ranker = TextRank(tokenizer)
    print(ranker.textrank(text, topK=2, withWeight=True))

    tfidf = TFIDF(tokenizer)
    print(tfidf.extract_tags(text, topK=10))

    # Each tokenize() item is indexed as (word, start, end).
    for token in tokenizer.tokenize('永和服装饰品有限公司'):
        print("word %s\t\t start: %d \t\t end:%d" % (token[0], token[1], token[2]))

    print(tokenize('永和服装饰品有限公司', mode="search"))

    # User dictionaries are passed once as a list and once as a set.
    tokenizer.load_userdict(["卧槽"])
    load_userdict({"卧槽"})
def takes_arg3_as_HMM(self):
    """Call ``jieba.cut`` with three positional arguments.

    Per this method's name, the third positional argument is meant to be
    accepted as ``HMM``; the call simply has to complete without raising.
    """
    positional_args = ("", True, True)
    jieba.cut(*positional_args)
def takes_arg2_as_cut_all(self):
    """Call ``jieba.cut`` with two positional arguments.

    Per this method's name, the second positional argument is meant to be
    accepted as ``cut_all``; the call simply has to complete without raising.
    """
    positional_args = ("", True)
    jieba.cut(*positional_args)
def takes_arg1_as_sentence(self):
    """Call ``jieba.cut`` with a single positional argument.

    Per this method's name, the first positional argument is meant to be
    accepted as the sentence; the call simply has to complete without raising.
    """
    positional_args = ("",)
    jieba.cut(*positional_args)
def returns_iterator(self):
    """Assert that ``jieba.cut`` returns an iterable that is not a sequence.

    Raises:
        AssertionError: if the returned object is not an ``Iterable`` or is
            a ``Sequence`` (e.g. a list).
    """
    # The ABCs live in collections.abc; the aliases that used to be exposed
    # from the top-level collections module were removed in Python 3.10.
    try:
        from collections.abc import Iterable, Sequence
    except ImportError:  # interpreters without collections.abc
        from collections import Iterable, Sequence

    r = jieba.cut("", True, True)
    iterable = isinstance(r, Iterable)
    sequence = isinstance(r, Sequence)
    assert iterable and not sequence
import sys
import os
import random
import datetime

# Benchmark corpus; download it with:
#wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8

if __name__ == "__main__":
    # The tokenizer backend to benchmark is named by the first CLI argument.
    # The timings in the comments below were recorded in the original source
    # (presumably the elapsed time printed by this script for that backend).
    backend = sys.argv[1] if len(sys.argv) > 1 else None
    if backend == "cppjieba_py":
        # 0:00:03.861202
        import cppjieba_py as jieba
    elif backend == "jieba":
        # 0:01:24.703040
        import jieba
    elif backend == "jieba_fast":
        import jieba_fast as jieba
    else:
        # Fail with a usage hint instead of an IndexError (no argument) or a
        # later NameError on `jieba` (unknown backend).
        sys.exit("usage: %s {cppjieba_py|jieba|jieba_fast}" % sys.argv[0])

    # The corpus is expected next to this script. Read it as UTF-8 explicitly
    # and close the file deterministically.
    weicheng = os.path.join(os.path.dirname(__file__), "weicheng.utf8")
    with open(weicheng, encoding="utf-8") as corpus:
        lines = [line.strip() for line in corpus]

    # Segmented output is written into random slots of a small buffer
    # (presumably so the results are actually consumed, not discarded).
    result = [""] * 10

    # One call before the timer starts, so it is excluded from the measurement.
    result[random.randint(0, 9)] = '/'.join(jieba.cut("南京长江大桥"))

    starttime = datetime.datetime.now()
    for _ in range(50):
        for line in lines:
            r = '/'.join(jieba.cut(line))
            result[random.randint(0, 9)] = r
    endtime = datetime.datetime.now()
    print(endtime - starttime)