def proc(self, request):
    """
    param request : JSON string carrying project_id and parameter_list
    """
    params = json.loads(request)
    project_id = params['project_id']
    print(">>>>>>>>>>>>>>>>>>>>>project_id:{0}".format(project_id))
    target_text = params['parameter_list']
    print(target_text)
    data_preprocessing.log_info("Generating word cloud...........................")
    self.__generate_wordcloud(project_id, target_text)
    data_preprocessing.log_info("Calculating similarity...........................")
    _calculate_sim(self.wv_model_local, self.wb_collection,
                   self.w_collection, project_id, target_text, source_id=1)
    _calculate_sim(self.wv_model_local, self.jd_collection,
                   self.w_collection, project_id, target_text, source_id=2)
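# A minimal sketch of the request payload proc() expects, inferred from the
# keys accessed above; the project id and text values are hypothetical.
# sample_request = json.dumps({
#     "project_id": "p_0001",
#     "parameter_list": "project description text used for the word cloud"
# })
# handler.proc(sample_request)  # handler: an instance of the owning class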
def __init__(self):
    self.log = {}
    data_preprocessing.log_info("Loading word embedding model......")
    self.wv_model_local = Word2Vec.load("./package_nlp/wv/word2vec_wx")
    self.jd_collection = data_preprocessing.connect_mongo(
        host_mongo, db_mongo_r, 'words_jd_10', port_mongo)
    data_preprocessing.log_info("self.jd_collection")
    print(self.jd_collection)
    self.wb_collection = data_preprocessing.connect_mongo(
        host_mongo, db_mongo_r, 'words_wb_10', port_mongo)
    data_preprocessing.log_info("self.wb_collection")
    print(self.wb_collection)
    self.w_collection = data_preprocessing.connect_mongo(
        host_mongo, db_mongo_w, sim_mongo_w, port_mongo)
    data_preprocessing.log_info("self.w_collection")
    print(self.w_collection)
def __init__(self):
    self.log = {}
    self.jd_collection = data_preprocessing.connect_mongo(
        host_mongo, db_mongo_r, 'seg_words_spark_jd', port_mongo)
    data_preprocessing.log_info("self.jd_collection")
    print(self.jd_collection)
    self.wb_collection = data_preprocessing.connect_mongo(
        host_mongo, db_mongo_r, 'seg_words_spark_wb', port_mongo)
    data_preprocessing.log_info("self.wb_collection")
    print(self.wb_collection)
    self.data_temp = sentence_sim.read_mongo_weight(self.wb_collection)
def show_sim(project_id, input_vector, theme_sentence_vector, w_collection, source_id):
    """
    param project_id : project id
    param input_vector : vector of the input sentence
    param theme_sentence_vector : dict mapping (doc_id, uid) tuples to sentence vectors
    """
    time_temp = int(time.time())
    timeArray = time.localtime(time_temp)
    stamp_temp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    # An all-zero input vector means the embedding step produced nothing usable.
    if np.all(np.array(input_vector) == 0):
        print("invalid input vector")
        print(input_vector)
    else:
        print("writing similarity records")
        w_collection.remove({"project_id": project_id, "source_id": source_id})
        print("current record count: {0}".format(
            w_collection.find({
                "project_id": project_id,
                "source_id": source_id
            }).count()))
        data_preprocessing.log_info("Writing records to the database......")
        index = 0
        for vec_idx in theme_sentence_vector.keys():
            id2sim_temp = dict()
            dot_temp = cos_sim(np.array(input_vector),
                               np.array(theme_sentence_vector[vec_idx]))
            id2sim_temp["doc_id"] = vec_idx[0]
            id2sim_temp["project_id"] = project_id
            id2sim_temp["uid"] = vec_idx[1]
            id2sim_temp["source_id"] = source_id
            id2sim_temp["similarity"] = dot_temp
            id2sim_temp["update_time"] = stamp_temp
            w_collection.insert(id2sim_temp)
            index += 1
            if index % 1000 == 0:
                print(">>>>>>>>>>> {0} records written".format(index))
        total_number = w_collection.find({}).count()
        print("record count after insert: {0}".format(total_number))
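# Sketch of the inputs show_sim() consumes, inferred from the tuple indexing
# above: theme_sentence_vector maps (doc_id, uid) tuples to sentence vectors.
# All ids and vector values here are hypothetical.
# example_theme_vectors = {
#     ("doc_001", "uid_42"): [0.12, -0.07, 0.33],
#     ("doc_002", "uid_43"): [0.05, 0.21, -0.18],
# }
# show_sim("p_0001", [0.10, -0.02, 0.25], example_theme_vectors,
#          w_collection, "1")  # w_collection: a writable pymongo collection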
def __generate_wordcloud(self, project_id, target_text):
    """
    param : project_id project ID
    param : target_text word-cloud text
    return:
    """
    sql_conn = words_cloud.connect_mysql(
        host_sql, user_sql, passwd_sql, db_name_sql, port_sql)
    data_preprocessing.log_info("self.sql_conn")
    print(sql_conn)
    top_k = 60
    top_n = 80
    iterations = 30
    num_topics = 5
    data_preprocessing.log_info("Extracting topic-word vector for the input text......")
    ss = words_cloud.single_sentence(
        top_k, top_n, iterations, num_topics, stop_words_path)
    lda_model, doc_corpus = ss.sentence_transmit(target_sentence=target_text)
    target_words = ss.sentence_to_lda_words(lda_model, doc_corpus)
    # Scale the float topic weights to integers for the word-cloud front end.
    target_words_dict = dict()
    for item in target_words:
        target_words_dict[item[0]] = int(item[1] * 1000)
    data_preprocessing.log_info("Writing topic words to the database......")
    words_cloud.insert_mysql(
        sql_conn, table_name_sql,
        json.dumps(target_words_dict, encoding="utf8", ensure_ascii=False),
        project_id)
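# Worked example of the weight scaling above: sentence_to_lda_words yields
# (word, weight) pairs with float weights, and int(weight * 1000) turns them
# into integer sizes for the word cloud. Words and weights are hypothetical.
example_target_words = [(u"data", 0.3127), (u"model", 0.1548)]
example_dict = {word: int(weight * 1000) for word, weight in example_target_words}
# -> {u"data": 312, u"model": 154}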
def _calculate_sim(wv_model_local, r_collection, w_collection,
                   project_id, target_text, source_id=1):
    """
    param : project_id project ID
    param : target_text input text
    return:
    """
    data_preprocessing.log_info("Starting similarity calculation>>>>>>>>>>>>>>>>>>")
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    tid = libc.syscall(SYS_gettid)
    print("current thread id: {0}".format(tid))
    top_k = 6
    top_n = 5
    iterations = 30
    num_topics = 5
    data_preprocessing.log_info("Fetching topic words......")
    id2words = sentence_sim.read_mongo_weight(r_collection)
    data_preprocessing.log_info("Building topic-word vectors......")
    theme_sentence_vector = dict()
    for key in id2words:
        # Keep at most the top 6 topic words per document.
        if len(id2words[key]) > 6:
            id2words_temp = id2words[key][:6]
        else:
            id2words_temp = id2words[key]
        array_temp = sentence_sim.sentence_embedding_weight(
            id2words_temp, wv_model_local)
        theme_sentence_vector[key] = array_temp
    del id2words
    data_preprocessing.log_info("Extracting topic-word vector for the input text......")
    ss = words_cloud.single_sentence(
        top_k, top_n, iterations, num_topics, stop_words_path)
    lda_model, doc_corpus = ss.sentence_transmit(target_sentence=target_text)
    print(lda_model)
    input_words = ss.sentence_to_lda_words(lda_model, doc_corpus)
    print("target words: {0}".format(input_words))
    data_preprocessing.log_info("Computing sentence embedding......")
    input_vector = sentence_sim.sentence_embedding_weight(
        target_words=input_words, wv_model=wv_model_local)
    print(input_vector)
    sentence_sim.show_sim(
        project_id, input_vector, theme_sentence_vector,
        w_collection, str(source_id))
    data_preprocessing.log_info(
        "Thread {0}: text similarity calculation finished.".format(tid))
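# sentence_sim.sentence_embedding_weight is not shown in this file; below is
# a minimal sketch of a weighted-average sentence embedding consistent with
# how it is called above. The (word, weight) input format and the gensim
# model.wv lookup are assumptions, not the project's actual implementation.
def sentence_embedding_weight_sketch(target_words, wv_model):
    vec_sum = None
    weight_sum = 0.0
    for word, weight in target_words:
        if word not in wv_model.wv:
            continue  # skip out-of-vocabulary words
        wv = np.array(wv_model.wv[word]) * weight
        vec_sum = wv if vec_sum is None else vec_sum + wv
        weight_sum += weight
    if vec_sum is None or weight_sum == 0.0:
        # All-zero output, which show_sim() detects and skips.
        return np.zeros(wv_model.vector_size)
    return vec_sum / weight_sum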
import numpy as np
from numpy import linalg as la
from pyspark.sql import SparkSession, SQLContext

db_name_sql = sc["db_name_sql"]
table_name_sql = sc["table_name_sql"]
host_mongo = sc["host_mongo"]
db_mongo_r = sc["db_mongo_r"]
port_mongo = sc["port_mongo"]
db_mongo_w = sc["db_mongo_w"]
sim_mongo_w = sc["sim_mongo_w"]

spark_temp = SparkSession.builder.master("local").config(
    "spark.some.config.option", "some-value").appName("minProject").getOrCreate()
print(spark_temp)

data_preprocessing.log_info("Loading word embedding model......")
sqlContext = SQLContext(spark_temp)
lookup = sqlContext.read.parquet("./data_temp/w2c.model/data").alias("lookup")
# Broadcast the word -> vector map so executors can resolve lookups locally.
lookup_bd = spark_temp.sparkContext.broadcast(lookup.rdd.collectAsMap())


def cos_sim(vec_a, vec_b):
    """
    param vec_a : type->array
    param vec_b : type->array
    return similarity
    """
    mat_a = np.mat(vec_a)
    mat_b = np.mat(vec_b)
    num = float(mat_a * mat_b.T)
    denom = la.norm(mat_a) * la.norm(mat_b)
    # Standard cosine similarity; callers screen out all-zero vectors upstream.
    return num / denom
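# Quick sanity check for cos_sim: identical vectors score 1.0 and orthogonal
# vectors score 0.0.
assert abs(cos_sim(np.array([1.0, 0.0]), np.array([1.0, 0.0])) - 1.0) < 1e-9
assert abs(cos_sim(np.array([1.0, 0.0]), np.array([0.0, 1.0]))) < 1e-9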