示例#1
0
def build_model(cache=True):
    """Return a Word2Vec model built from the crawled report pages.

    Each URL from ``crawl_report_list()`` is fetched with ``get``, parsed with
    ``ce.parse``, split into sentences by ``text_util.get_sentences`` and
    tokenized with ``jieba.cut``; the resulting token lists are handed to
    ``Word2Vec``.

    Args:
        cache: When true, a model already stored at
            ``<cache_dir()>/word2vec.model`` is loaded and returned instead of
            training, and a freshly trained model is saved to that path.

    Returns:
        The loaded or newly trained ``Word2Vec`` instance.
    """
    model_path = None
    if cache:
        model_path = "%s/word2vec.model" % cache_dir()
        if os.path.isfile(model_path):
            return Word2Vec.load(model_path)

    corpus = []
    for url in crawl_report_list():
        page = get(url)
        # Only the extracted body text is used; the other fields are ignored.
        _enc, _time, _title, body = ce.parse(url, page)
        corpus.extend(
            list(jieba.cut(sentence))
            for sentence in text_util.get_sentences(body)
        )

    model = Word2Vec(corpus)
    if cache:
        model.save(model_path)
    return model
示例#2
0
文件: cn.py 项目: liuzl/nlp4econ
def tf(cache=True, force=False):
    """Count token frequencies over the crawled report and plan pages.

    Every URL from ``crawl_report_list() + crawl_plan_list()`` is fetched with
    ``get``, parsed with ``ce.parse``, split into sentences by
    ``text_util.get_sentences`` and tokenized with ``jieba.cut``.  With
    ``cache`` enabled the counts are written to ``<cache_dir()>/tf.txt`` as
    UTF-8 ``token<TAB>count`` lines, highest count first.

    Args:
        cache: When true, an existing ``tf.txt`` short-circuits the work
            (unless ``force`` is set) and the computed counts are written to
            that file.  When false nothing is persisted.
        force: When true, recompute even if the cache file already exists.

    Returns:
        Always ``True``; the counts are only exposed through the cache file.
    """
    f = "%s/tf.txt" % cache_dir()
    if cache and not force and os.path.isfile(f):
        return True

    d = defaultdict(int)
    for url in (crawl_report_list() + crawl_plan_list()):
        html = get(url)
        # Only the extracted body text is used; the other fields are ignored.
        _enc, _time, _title, text = ce.parse(url, html)
        for s in text_util.get_sentences(text):
            for w in jieba.cut(s):
                d[w] += 1

    if cache:
        # Sort only when the result is actually written out.
        r = sorted(d.items(), key=lambda x: x[1], reverse=True)
        # The context manager closes the file even if a write or encode fails.
        with open(f, "w") as out:
            for k, v in r:
                out.write(("%s\t%s\n" % (k, v)).encode('utf-8', 'ignore'))
    return True
示例#3
0
#encoding: utf-8
import sys
import content_extract as ce
sys.path.append("../../lib")
import download

if __name__ == "__main__":
    html = download.getPage(sys.argv[1])
    enc, time, title, text = ce.parse(sys.argv[1], html)
    print "标题:" + title.encode('utf-8', 'ignore')
    print "时间:" + time.encode('utf-8', 'ignore')
    print '=' * 10
    print "内容:" + text.encode('utf-8', 'ignore')