예제 #1
0
def main():
    """Exercise the segmentation, keyword-extraction and user-dictionary APIs.

    Every result is printed to stdout; nothing is returned.
    """
    sentence = "我来到北京清华大学"
    tokenizer = Tokenizer()

    # Full mode
    full_mode_words = tokenizer.cut(sentence, cut_all=True)
    print(type(full_mode_words))
    print("Full Mode: " + "/ ".join(full_mode_words))

    # Accurate mode is the default
    default_words = tokenizer.cut("他来到了网易杭研大厦")
    print(", ".join(default_words))

    # Search-engine mode
    search_words = tokenizer.cut_for_search(
        "小明硕士毕业于中国科学院计算所,后在日本京都大学深造")
    print(", ".join(search_words))

    # Time the explicit module-level initialization.
    started = datetime.datetime.now()
    initialize()
    elapsed = datetime.datetime.now() - started
    print("initialize costs:%s" % elapsed)

    # Module-level segmentation helpers.
    print(lcut(sentence))
    print(list(cut(sentence)))
    # Printed exactly as returned, without converting it to a list first.
    print(cut(sentence, cut_all=True))
    print(lcut_for_search(sentence))
    print(list(cut_for_search(sentence)))

    # The same sentence through the ``pseg`` module.
    print(pseg.lcut(sentence))
    print(list(pseg.cut(sentence)))

    report = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"

    # Keyword extraction through the module-level ``analyse`` helpers ...
    print(analyse.extract_tags(report))
    print(analyse.textrank(report, withWeight=True))

    # ... and through extractor objects built around the tokenizer instance.
    ranker = TextRank(tokenizer)
    print(ranker.textrank(report, topK=2, withWeight=True))

    extractor = TFIDF(tokenizer)
    print(extractor.extract_tags(report, topK=10))

    # Each token is indexed as (word, start, end).
    company = '永和服装饰品有限公司'
    for token in tokenizer.tokenize(company):
        print("word %s\t\t start: %d \t\t end:%d" % (token[0], token[1], token[2]))

    print(tokenize(company, mode="search"))

    # User dictionaries: a list for the instance, a set for the module.
    tokenizer.load_userdict(["卧槽"])

    load_userdict({"卧槽"})
예제 #2
0
 def takes_arg3_as_HMM(self):
     """Call ``jieba.cut`` with three positional arguments."""
     # Passed positionally on purpose: the third slot is the one under test.
     sentence, cut_all, hmm = "", True, True
     jieba.cut(sentence, cut_all, hmm)
예제 #3
0
 def takes_arg2_as_cut_all(self):
     """Call ``jieba.cut`` with two positional arguments."""
     # Passed positionally on purpose: the second slot is the one under test.
     sentence, cut_all = "", True
     jieba.cut(sentence, cut_all)
예제 #4
0
 def takes_arg1_as_sentence(self):
     """Call ``jieba.cut`` with a single positional argument."""
     sentence = ""
     jieba.cut(sentence)
예제 #5
0
 def returns_iterator(self):
     """Assert that ``jieba.cut`` returns an iterable that is not a sequence."""
     # The ABC aliases in ``collections`` were removed in Python 3.10; the
     # fallback keeps interpreters without ``collections.abc`` working.
     try:
         from collections.abc import Iterable, Sequence
     except ImportError:
         from collections import Iterable, Sequence
     r = jieba.cut("", True, True)
     iterable = isinstance(r, Iterable)
     sequence = isinstance(r, Sequence)
     assert iterable and not sequence
예제 #6
0
import sys
import os
import random
import datetime

#wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8

if __name__ == "__main__":
    # Benchmark: segment every line of weicheng.utf8 (located next to this
    # script) 50 times with the backend named by the first command-line
    # argument, then print the elapsed wall-clock time.
    import io  # io.open accepts ``encoding`` on both Python 2 and Python 3

    backend = sys.argv[1] if len(sys.argv) > 1 else None
    if backend == "cppjieba_py":  # 0:00:03.861202
        import cppjieba_py as jieba
    elif backend == "jieba":  # 0:01:24.703040
        import jieba
    elif backend == "jieba_fast":
        import jieba_fast as jieba
    else:
        # Exit with a usage message instead of an IndexError (no argument)
        # or a later NameError on ``jieba`` (unrecognized argument).
        sys.exit("usage: %s {cppjieba_py|jieba|jieba_fast}" % sys.argv[0])

    weicheng = os.path.join(os.path.dirname(__file__), "weicheng.utf8")
    # Decode explicitly as UTF-8 rather than the locale default, and close
    # the file as soon as the lines are loaded.
    with io.open(weicheng, encoding="utf-8") as corpus:
        lines = [line.strip() for line in corpus]

    # Results are stored into random slots of a small list.
    # NOTE(review): presumably this only keeps the output referenced; confirm.
    result = [""] * 10
    # One segmentation call happens before the clock starts.
    result[random.randint(0, 9)] = '/'.join(jieba.cut("南京长江大桥"))
    starttime = datetime.datetime.now()

    for _ in range(50):
        for line in lines:
            result[random.randint(0, 9)] = '/'.join(jieba.cut(line))
    endtime = datetime.datetime.now()
    print(endtime - starttime)