print "cpu_num=" + str(cpu_num) print "doc_limit=" + str(doc_limit) if os.path.exists(docpath): from ar import filebyfileHandle filebyfileHandle(docpath, doc_limit, cpu_num, NUM_DOC) # 100字符内的文件抛掉不处理,多进程不指定默认 multiprocess=4 else: mkdir(docpath) t02 = time.time() print "prepare time = ", t02 - t01 t11 = time.time() from dict_stream_train import getDictionary dict = getDictionary(lsipath=lsipath, docpath=docpath) t12 = time.time() print "dict time = ", t12 - t11 t21 = time.time() from corpus_stream_train import getCorpus corpus = getCorpus(lsipath=lsipath, docpath=docpath) t22 = time.time() print "corpus time = ", t22 - t21 #gc del dict, corpus gc.collect() t31 = time.time()
NUM_TOPIC = 300  # number of topics, default 300
NUM_DOC = -1     # number of files to take from the corpus (-1 = all)

# if os.path.exists(docpath):
#     shutil.rmtree(docpath)  # remove the directory
# if os.path.exists(lsipath):
#     shutil.rmtree(lsipath)  # remove the directory

t01 = time.time()
if os.path.exists(docpath):
    from ar import filebyfileHandle
    # Discard files shorter than 100 characters; multiprocess defaults to 4.
    filebyfileHandle(docpath, 100, 4, NUM_DOC)
t02 = time.time()

t11 = time.time()
from dict_stream_train import getDictionary
dict = getDictionary(lsipath=lsipath, docpath=docpath)
t12 = time.time()

t21 = time.time()
from corpus_stream_train import getCorpus
corpus = getCorpus(lsipath=lsipath, docpath=docpath)
t22 = time.time()

t31 = time.time()
from lsi_stream_train import getLsiModel
lsimodel = getLsiModel(lsipath=lsipath, num_topics=NUM_TOPIC)
t32 = time.time()

t41 = time.time()
from index_stream_train import getIndex
getIndex(lsipath, NUM_TOPIC)  # changed by baobao: also pass NUM_TOPIC
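# Once training finishes, lsipath holds the serialized artifacts
# (viva.dict / viva.mm / viva.lsi / viva.index, following the file names
# used in sim_update below). A query sketch using standard gensim APIs;
# tokenize() is a hypothetical placeholder, since the actual tokenization
# lives inside dict_stream_train and is not shown here.
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
lsi = models.LsiModel.load(lsipath + 'viva.lsi')
index = similarities.Similarity.load(lsipath + 'viva.index')

tokens = tokenize(query_text)          # must match the training tokenization
bow = dictionary.doc2bow(tokens)       # bag-of-words in the trained vocabulary
vec_lsi = lsi[bow]                     # map the query into LSI topic space
sims = sorted(enumerate(index[vec_lsi]), key=lambda x: -x[1])
print(sims[:10])                       # ten most similar documents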
def sim_update(results):
    """
    Update the dictionary, corpus, LSI model and similarity index
    with a new batch of documents.
    :param results: list of dicts, each with a 'name' and raw 'text' field
    :return:
    """
    shutil.rmtree(lsitemp, ignore_errors=True)
    mkdir(lsitemp)
    t_total_begin = time.time()

    # print("Checking repeat ...")
    # results_temp = check_repet_new(results)
    results_temp = results
    # print("Check repeat complete!")

    print("Prefix mapping ...")
    results = prefix_map(results_temp)
    print("Prefix map complete!")
    del results_temp

    print("Building LSI model ...")
    # Extended dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    # Load models; the corpus from the .mm file is later mapped into LSI space.
    corpus_raw = corpora.MmCorpus(lsipath + 'viva.mm')
    lsi = lsimodel.LsiModel.load(lsipath + 'viva.lsi')

    mkdir(news_post_add)
    # Preprocess the text and write each new document out to news_post_add.
    for postfile in results:
        deltags = stripTags(postfile['text'])
        text_del = delstopwords("".join(deltags.split()))
        # text_vec = jieba.lcut(text_del)
        with open(news_post_add + postfile['name'], 'w') as fp:
            fp.write(text_del)
    files = os.listdir(news_post_add)
    for i in files:
        shutil.copy(news_post_add + i, docpath)

    from dict_stream_train import getDictionary
    dict2 = getDictionary(lsipath=lsitemp, docpath=news_post_add)
    dict2 = corpora.Dictionary.load(lsitemp + 'viva.dict')
    from corpus_stream_train import getCorpus
    corpus2 = getCorpus(lsipath=lsitemp, docpath=news_post_add)
    corpus2 = corpora.MmCorpus(lsitemp + 'viva.mm')

    # Merge the new vocabulary into the old dictionary; the returned
    # transformation remaps corpus2's word ids into the merged id space.
    dict2_to_dict1 = dictionary.merge_with(dict2)
    # dict2_to_dict1.save(lsipath + 'viva2.dict')
    # dict2_to_dict1 = corpora.Dictionary.load(lsipath + 'viva2.dict')
    merged_corpus = itertools.chain(corpus_raw, dict2_to_dict1[corpus2])
    corpora.MmCorpus.serialize(lsipath + 'viva.mm', [i for i in merged_corpus])
    merged_corpus = corpora.MmCorpus(lsipath + 'viva.mm')

    # Get TF-IDF vectors of the documents
    print("Building tfidf model ...")
    tfidf = tfidfmodel.TfidfModel(merged_corpus)
    print("Building corpus_tfidf model ...")
    corpus_tfidf = tfidf[merged_corpus]

    # Updated LSI model
    # lsi.add_documents(corpus_tfidf, chunksize=chunksize, decay=DECAY_FACTOR)
    # lsi.add_documents(corpus_tfidf, chunksize=chunksize)
    # print("Added documents to lsi model ...")
    # Updated corpus
    # if not os.path.exists(lsipath):
    #     os.mkdir(lsipath)
    # corpus = corpora.MmCorpus.serialize(lsipath + 'viva.mm', itertools.chain(corpus_raw, corpus2))
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPIC,
                            chunksize=chunksize, power_iters=2,
                            onepass=True)  # all other parameters left at their defaults
    lsi.save(lsipath + 'viva.lsi')
    lsi = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')
    index = similarities.docsim.Similarity(lsipath + 'viva.index', lsi[merged_corpus],
                                           num_features=NUM_TOPIC)

    # Save models
    index.save(lsipath + 'viva.index')
    print("LSI model saved!")

    # Print elapsed time
    t2 = time.time()
    print "Total elapsed time is: ", t2 - t_total_begin, "s"
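# Usage sketch for sim_update: each item needs the 'name' and 'text' keys
# that the loop above indexes into. The sample records here are made up
# for illustration only.
new_posts = [
    {'name': 'post_001.txt', 'text': '<p>some raw html body ...</p>'},
    {'name': 'post_002.txt', 'text': '<p>another article ...</p>'},
]
# Merges the new posts into the corpus and retrains the LSI model and
# similarity index under lsipath (viva.dict / viva.mm / viva.lsi / viva.index).
sim_update(new_posts)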