import os
import json
import sqlite3

from gensim import corpora, models, similarities

# segWithFilter, segAllWithFilter, toJson, loadJson and db_path are project
# helpers assumed to be defined elsewhere in this module.


def calAuthorSimByTFIDFSelf():
    print('calAuthorSimByTFIDFSelf')

    def seg(content):
        return segWithFilter(content) + segAllWithFilter(content)

    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    rows = db.execute('SELECT id FROM author WHERE potery_num>=10')
    should_pro_authors = set([row[0] for row in rows])

    rows = db.execute('SELECT poet_id, content FROM potery')
    rows = list(rows)  # [:100]

    # Concatenate all of an author's poems into one document per author.
    author2content = {}
    for poet_id, content in rows:
        # if poet_id in should_pro_authors:
        if poet_id not in author2content:
            author2content[poet_id] = ''
        author2content[poet_id] += content

    poet_ids = [poet_id for poet_id in author2content]
    seg_poteries = [seg(author2content[poet_id]) for poet_id in author2content]

    dictionary = corpora.Dictionary(seg_poteries)  # build the dictionary and BoW corpus
    corpus = [dictionary.doc2bow(seg_potery) for seg_potery in seg_poteries]

    print('start training')
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    model_path = './data_process/model/author2vec/tfidf/tf-idf.model'
    tfidf_model.save(model_path)

    sim_path = './data_process/model/author2vec/tfidf/Similarity-tfidf-index'
    similarity = similarities.Similarity(sim_path, corpus_tfidf,
                                         num_features=len(dictionary))
    similarity.num_best = 30
    print('training done')

    for index, poet_id in enumerate(poet_ids):
        if poet_id not in should_pro_authors:
            continue
        if index % 1000 == 0:
            print(index, len(should_pro_authors))
            conn.commit()
        doc_bow = corpus[index]  # dictionary.doc2bow(seg_poteries[index])
        vec_tfidf = tfidf_model[doc_bow]
        sims = similarity[vec_tfidf]
        sims.sort(key=lambda item: -item[1])
        # print(sims)
        sims = [[poet_ids[int(sim[0])], float(sim[1])] for sim in sims]
        db.execute('UPDATE author SET sims_tfidf_self = ? WHERE id=?',
                   (toJson(sims), poet_id))
    conn.commit()
    conn.close()
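
# A minimal, self-contained sketch (illustrative only, not part of the
# pipeline) of the gensim query pattern used above: Dictionary -> doc2bow ->
# TfidfModel -> Similarity with num_best. The toy texts and the /tmp index
# prefix are made up for the example.
def _demo_tfidf_query():
    texts = [['春', '風'], ['秋', '月'], ['春', '月']]
    demo_dict = corpora.Dictionary(texts)
    demo_corpus = [demo_dict.doc2bow(t) for t in texts]
    demo_tfidf = models.TfidfModel(demo_corpus)
    demo_index = similarities.Similarity('/tmp/demo-tfidf-index',
                                         demo_tfidf[demo_corpus],
                                         num_features=len(demo_dict))
    demo_index.num_best = 2
    # Querying returns at most num_best (doc_index, score) pairs.
    print(demo_index[demo_tfidf[demo_dict.doc2bow(texts[0])]])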
def calSentenceTfIdfModel():
    print('calSentenceTfIdfModel')

    def seg(content):
        return segWithFilter(content) + segAllWithFilter(content)

    model_path = './data_process/model/sentence2vec/tfidf/tf-idf.model'
    sim_path = './data_process/model/sentence2vec/tfidf/Similarity-tfidf-index'
    dict_path = './data_process/model/sentence2vec/tfidf/dict'

    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    rows = list(db.execute('SELECT id FROM sentence'))
    ids = [row[0] for row in rows]

    # One-off training step, kept for reference:
    # seg_sentences = [seg(row[1]) for row in rows]
    # print('segmentation done')
    # dictionary = corpora.Dictionary(seg_sentences)  # build the dictionary and BoW corpus
    # dictionary.save(dict_path)
    # corpus = [dictionary.doc2bow(seg_sentence) for seg_sentence in seg_sentences]
    # tfidf_model = models.TfidfModel(corpus)
    # tfidf_model.save(model_path)
    # corpus_tfidf = tfidf_model[corpus]
    # similarity = similarities.Similarity(sim_path, corpus_tfidf, num_features=len(dictionary))
    # similarity.save()

    tfidf_model = models.TfidfModel.load(model_path)
    similarity = similarities.Similarity.load(sim_path)
    dictionary = corpora.Dictionary.load(dict_path)

    need_process_ids = list(
        db.execute('SELECT id, content FROM sentence WHERE sim IS NULL'))
    similarity.num_best = 6
    for index, elm in enumerate(need_process_ids):
        s_id, s_content = elm
        if index % 10 == 0:
            print(index, len(need_process_ids))
            conn.commit()
        words = seg(s_content)
        if len(words) < 5:
            # Very short sentences get no similarity list.
            sims = []
        else:
            doc_bow = dictionary.doc2bow(words)
            vec_tfidf = tfidf_model[doc_bow]
            sims = similarity[vec_tfidf]
            sims.sort(key=lambda item: -item[1])
            # print(sims)
            sims = [[ids[int(sim[0])], float(sim[1])] for sim in sims]
        db.execute('UPDATE sentence SET sim = ? WHERE id=?',
                   (toJson(sims), s_id))
    conn.commit()  # commit the final batch before closing
    print('saved')
    conn.close()
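
# Sketch of the save/load round trip relied on above (illustrative; the /tmp
# paths are placeholders). gensim objects persist with .save(path) and come
# back with the matching classmethod .load(path).
def _demo_persistence(dictionary, tfidf_model, similarity):
    dictionary.save('/tmp/demo-dict')
    tfidf_model.save('/tmp/demo-tfidf.model')
    similarity.save('/tmp/demo-sim-index')
    return (corpora.Dictionary.load('/tmp/demo-dict'),
            models.TfidfModel.load('/tmp/demo-tfidf.model'),
            similarities.Similarity.load('/tmp/demo-sim-index'))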
def saveWord2db():
    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    word2count = loadJson('./data_process/data/word2count.json')
    for index, word in enumerate(word2count):
        if index % 1000 == 0:
            conn.commit()
        sql_comment = "INSERT INTO word VALUES (?,?,null,null, ?, null)"
        db.execute(sql_comment, (word, word2count[word], len(word)))
    conn.commit()

    word_dir_path = './data_process/data/dict'
    word2info = {}
    files = os.listdir(word_dir_path)  # list every file and directory under the folder
    for file in files:
        path = os.path.join(word_dir_path, file)
        if os.path.isfile(path) and '.json' in file:
            word_dict = loadJson(path)  # renamed from `file` to avoid shadowing the loop variable
            for word in word_dict:
                word2info[word] = word_dict[word]
    for index, word in enumerate(word2info):
        if index % 1000 == 0:
            conn.commit()
        info = word2info[word]
        sql_comment = 'UPDATE word SET info = ? WHERE word=?'
        db.execute(sql_comment, (toJson(info), word))

    common_words = open('./data_process/data/停用词表.csv', 'r',
                        encoding='utf-8').read().strip('\n').split('\n')
    for word in common_words:
        # print(word)
        sql_comment = 'UPDATE word SET is_common = 1 WHERE word=?'
        db.execute(sql_comment, (word, ))
    conn.commit()
    conn.close()


# saveWord2db()
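
# Toy version (illustrative; in-memory DB and fake rows) of the parameterized
# INSERT/UPDATE pattern above, with the same periodic-commit idea.
def _demo_word_table():
    demo_conn = sqlite3.connect(':memory:')
    demo_db = demo_conn.cursor()
    demo_db.execute(
        'CREATE TABLE word (word TEXT, count INTEGER, info TEXT, is_common INTEGER)')
    for index, (word, count) in enumerate([('月', 3), ('風', 2)]):
        if index % 1000 == 0:
            demo_conn.commit()  # keep transactions from growing unboundedly
        demo_db.execute('INSERT INTO word VALUES (?, ?, null, null)',
                        (word, count))
    demo_db.execute('UPDATE word SET is_common = 1 WHERE word=?', ('月', ))
    demo_conn.commit()
    print(list(demo_db.execute('SELECT * FROM word')))
    demo_conn.close()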
def calAuthorSimByPotery():
    print('calAuthorSimByPotery')
    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    rows = db.execute('SELECT id, poet_id FROM potery')
    potery2author = {row[0]: row[1] for row in rows}

    author2sim_num = {}
    rows = db.execute(
        'SELECT poet_id, sim_potery_ids_tfidf FROM potery WHERE sim_potery_ids_tfidf IS NOT NULL'
    )
    rows = list(rows)  # [:100]
    for row in rows:
        author_id = row[0]
        sims = json.loads(row[1])
        for sim_id, sim in sims:
            sim_author_id = potery2author[sim_id]  # map the similar poem to its author
            if author_id not in author2sim_num:
                author2sim_num[author_id] = {}
            if sim_author_id not in author2sim_num:
                author2sim_num[sim_author_id] = {}
            if sim_author_id not in author2sim_num[author_id]:
                author2sim_num[author_id][sim_author_id] = 0
            if author_id not in author2sim_num[sim_author_id]:
                author2sim_num[sim_author_id][author_id] = 0
            # count the pair symmetrically
            author2sim_num[author_id][sim_author_id] += 1
            author2sim_num[sim_author_id][author_id] += 1

    for a_id in author2sim_num:
        sims = author2sim_num[a_id]
        sims = [[sim_id, sims[sim_id]] for sim_id in sims]
        sims.sort(key=lambda item: -item[1])
        sims = sims[0:30]
        db.execute('UPDATE author SET sims_tfidf_potery=? WHERE id=?',
                   (toJson(sims), a_id))
    conn.commit()
    conn.close()
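
# Toy version (illustrative data) of the aggregation above: count how often two
# authors' poems appear in each other's similarity lists, then keep the top 30.
def _demo_topk_pairs():
    pair_counts = {'a': {'b': 4, 'c': 1}, 'b': {'a': 4}, 'c': {'a': 1}}
    for author, counts in pair_counts.items():
        top = sorted(([other, n] for other, n in counts.items()),
                     key=lambda item: -item[1])[:30]
        print(author, top)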
def calPoteriesSimByTfIDF():
    print('calPoteriesSimByTfIDF')

    def seg(content):
        return segWithFilter(content) + segAllWithFilter(content)

    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    rows = db.execute('SELECT id, poet_id, content FROM potery')
    rows = list(rows)  # [:100]
    ids = [row[0] for row in rows]

    # One-off fix-up pass, kept for reference:
    # print('start fixing')
    # temp_rows = db.execute('SELECT id, sim_potery_ids_tfidf FROM potery WHERE sim_potery_ids_tfidf IS NOT NULL')
    # temp_rows = list(temp_rows)  # [:100]
    # for row in temp_rows:
    #     # print(row, row[1])
    #     sims = json.loads(row[1])
    #     sims = [[ids[int(sim[0])], sim[1]] for sim in sims]
    #     db.execute('UPDATE potery SET sim_potery_ids_tfidf=? WHERE id=?', (toJson(sims), row[0]))
    # conn.commit()
    # print('fixing done')

    re_poets = [row[1] for row in rows]
    seg_poteries = [seg(row[2]) for row in rows]
    dictionary = corpora.Dictionary(seg_poteries)  # build the dictionary and BoW corpus
    corpus = [dictionary.doc2bow(seg_potery) for seg_potery in seg_poteries]
    tfidf_model = models.TfidfModel(corpus)
    model_path = './data_process/model/potery2vec/tfidf/tf-idf.model'
    tfidf_model.save(model_path)
    corpus_tfidf = tfidf_model[corpus]
    sim_path = './data_process/model/potery2vec/tfidf/Similarity-tfidf-index'
    similarity = similarities.Similarity(sim_path, corpus_tfidf,
                                         num_features=len(dictionary))
    # similarity = similarities.Similarity.load(sim_path)
    print('saved')

    # Resume support: skip poems whose similarity column is already filled,
    # and only process poems by authors with at least 10 poems.
    rows = db.execute(
        'SELECT id FROM potery WHERE sim_potery_ids_tfidf IS NOT NULL')
    rows = list(rows)  # [:100]
    fin_ids = set([row[0] for row in rows])
    rows = db.execute('SELECT id FROM author WHERE potery_num>=10')
    should_pro_authors = set([row[0] for row in rows])
    should_pro_ids = [
        id for index, id in enumerate(ids)
        if re_poets[index] in should_pro_authors and id not in fin_ids
    ]
    should_pro_ids = set(should_pro_ids)

    similarity.num_best = 30
    for index, p_id in enumerate(ids):
        if p_id not in should_pro_ids:
            continue
        if index % 1000 == 0:
            print(index, len(should_pro_ids))
            conn.commit()
        doc_bow = corpus[index]  # dictionary.doc2bow(seg_poteries[index])
        vec_tfidf = tfidf_model[doc_bow]
        sims = similarity[vec_tfidf]
        sims.sort(key=lambda item: -item[1])
        # print(sims)
        sims = [[ids[int(sim[0])], float(sim[1])] for sim in sims]
        db.execute('UPDATE potery SET sim_potery_ids_tfidf = ? WHERE id=?',
                   (toJson(sims), p_id))
    conn.commit()
    conn.close()
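
# Sketch (illustrative ids) of the resume filter above: process only poems that
# have no stored result yet and whose author clears the potery_num threshold.
def _demo_resume_filter():
    ids = ['p1', 'p2', 'p3', 'p4']
    poem_author = {'p1': 'a', 'p2': 'a', 'p3': 'b', 'p4': 'c'}
    should_pro_authors = {'a', 'c'}
    fin_ids = {'p2'}
    todo = {i for i in ids
            if poem_author[i] in should_pro_authors and i not in fin_ids}
    print(sorted(todo))  # ['p1', 'p4']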