Example #1
    def proc(self, request):
        """
        param request : JSON string carrying 'project_id' and 'parameter_list'
        """
        params = json.loads(request)

        project_id = params['project_id']
        print(">>> project_id: {0}".format(project_id))
        target_text = params['parameter_list']
        print(target_text)

        data_preprocessing.log_info("Generating word cloud...")
        self.__generate_wordcloud(project_id, target_text)

        # Score the request text against both source collections.
        data_preprocessing.log_info("Computing similarity...")
        _calculate_sim(self.wv_model_local, self.wb_collection, self.w_collection,
                       project_id, target_text, source_id=1)

        _calculate_sim(self.wv_model_local, self.jd_collection, self.w_collection,
                       project_id, target_text, source_id=2)
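
A minimal driver sketch for proc() above, assuming a handler class (here called NLPServer, a made-up name) that exposes it; the payload keys mirror the ones the method reads.

import json

# Hypothetical payload; proc() expects 'project_id' and 'parameter_list'.
request = json.dumps({
    "project_id": 1001,
    "parameter_list": "sample project description text"
})
# server = NLPServer()  # assumed handler class exposing proc()
# server.proc(request)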
Example #2
def show_sim(project_id, input_vector, theme_sentence_vector, w_collection,
             source_id):
    """
    param project_id : 项目id
    param input_vector : 输入sentence的vec
    param theme_sentence_vector :
    """
    time_temp = int(time.time())
    timeArray = time.localtime(time_temp)
    stamp_temp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

    if len(set(input_vector)) == 1 and np.sum(set(input_vector)) == 0:
        print("输入错误")
        print(input_vector)
    else:
        w_collection.remove({"project_id": project_id, "source_id": source_id})

        data_preprocessing.log_info("数据入库操作,执行中......")
        index = 0
        for vec_idx in theme_sentence_vector.keys():

            id2sim_temp = dict()
            dot_temp = cos_sim(np.array(input_vector),
                               np.array(theme_sentence_vector[vec_idx]))

            id2sim_temp["doc_id"] = vec_idx[0]
            id2sim_temp["project_id"] = project_id
            id2sim_temp["uid"] = vec_idx[1]
            id2sim_temp["source_id"] = source_id
            id2sim_temp["similarity"] = dot_temp
            id2sim_temp["update_time"] = stamp_temp
            w_collection.insert(id2sim_temp)
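
A toy call to show_sim(), assuming w_collection is a writable PyMongo collection such as the one built in Example #3; the vector map is keyed by (doc_id, uid) tuples, as the insert loop implies.

theme_vecs = {
    ("doc_1", "uid_1"): [0.2, 0.1, 0.7],
    ("doc_2", "uid_2"): [0.9, 0.0, 0.1],
}
# show_sim("proj_1", [0.3, 0.2, 0.5], theme_vecs, w_collection, source_id="1")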
Example #3
    def __init__(self):
        self.log = {}

        data_preprocessing.log_info("加载wordembedding模型......")
        self.wv_model_local = Word2Vec.load("./package_nlp/wv/word2vec_wx")

        self.jd_collection = data_preprocessing.connect_mongo(
            host_mongo,
            db_mongo_r,
            'words_jd_10',
            port_mongo)
        data_preprocessing.log_info("self.jd_collection")
        print self.jd_collection

        self.wb_collection = data_preprocessing.connect_mongo(
            host_mongo,
            db_mongo_r,
            'words_wb_10',
            port_mongo)
        data_preprocessing.log_info("self.wb_collection")
        print self.wb_collection

        self.w_collection = data_preprocessing.connect_mongo(
            host_mongo,
            db_mongo_w,
            sim_mongo_w,
            port_mongo)
        data_preprocessing.log_info("self.w_collection")
        print self.w_collection
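
For reference, a plausible shape for data_preprocessing.connect_mongo, sketched from how its return value is used above; the project's real helper may differ.

from pymongo import MongoClient

def connect_mongo(host, db_name, collection_name, port):
    # Return a collection handle; callers insert/remove/find on it directly.
    client = MongoClient(host, port)
    return client[db_name][collection_name]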
Example #4
    def __init__(self):
        self.log = {}

        self.jd_collection = data_preprocessing.connect_mongo(
            host_mongo, db_mongo_r, 'seg_words_spark_jd', port_mongo)
        data_preprocessing.log_info("self.jd_collection")
        print(self.jd_collection)

        self.wb_collection = data_preprocessing.connect_mongo(
            host_mongo, db_mongo_r, 'seg_words_spark_wb', port_mongo)
        data_preprocessing.log_info("self.wb_collection")
        print(self.wb_collection)

        # Preload per-document weighted words from the wb collection.
        self.data_temp = sentence_sim.read_mongo_weight(self.wb_collection)
Example #5
def show_sim(project_id, input_vector, theme_sentence_vector, w_collection,
             source_id):
    """
    param project_id : 项目id
    param input_vector : 输入sentence的vec
    param theme_sentence_vector :
    """
    time_temp = int(time.time())
    timeArray = time.localtime(time_temp)
    stamp_temp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

    if len(set(input_vector)) == 1 and np.sum(set(input_vector)) == 0:
        print("输入错误")
        print(input_vector)
    else:
        print("输出相似度信息")
        w_collection.remove({"project_id": project_id, "source_id": source_id})

        print("当前数据总量:{0}".format(
            w_collection.find({
                "project_id": project_id,
                "source_id": source_id
            }).count()))

        data_preprocessing.log_info("数据入库操作,执行中......")
        index = 0
        for vec_idx in theme_sentence_vector.keys():

            id2sim_temp = dict()
            dot_temp = cos_sim(np.array(input_vector),
                               np.array(theme_sentence_vector[vec_idx]))

            id2sim_temp["doc_id"] = vec_idx[0]
            id2sim_temp["project_id"] = project_id
            id2sim_temp["uid"] = vec_idx[1]
            id2sim_temp["source_id"] = source_id
            id2sim_temp["similarity"] = dot_temp
            id2sim_temp["update_time"] = stamp_temp

            w_collection.insert(id2sim_temp)
            index += 1
            if index % 1000 == 0:
                print("》》》》》》》》》》》数据已经更新{0}条".format(index))

        total_number = w_collection.find({}).count()
        print("插入数据后,数据总量:{0}".format(total_number))
Example #6
    def __generate_wordcloud(self, project_id, target_text):
        """
        param : project_id   项目ID
        param : target_text  词云文本
        return:
        """
        sql_conn = words_cloud.connect_mysql(
            host_sql,
            user_sql,
            passwd_sql,
            db_name_sql,
            port_sql)
        data_preprocessing.log_info("self.sql_conn")
        print sql_conn

        top_k = 60
        top_n = 80
        iterations = 30
        num_topics = 5

        data_preprocessing.log_info("获取输入信息主题词向量......")
        ss = words_cloud.single_sentence(
            top_k, top_n, iterations, num_topics, stop_words_path)
        lda_model, doc_corpus = ss.sentence_transmit(
            target_sentence=target_text)
        target_words = ss.sentence_to_lda_words(lda_model, doc_corpus)

        target_words_dict = dict()
        for item in target_words:
            target_words_dict[item[0]] = int(item[1] * 1000)

        data_preprocessing.log_info("文本主题词数据入库......")
        words_cloud.insert_mysql(sql_conn, table_name_sql, json.dumps(
            target_words_dict, encoding="utf8", ensure_ascii=False), project_id)
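
Illustrative shape only: sentence_to_lda_words is assumed here to yield (word, weight) pairs with float weights, which is what the scaling loop above implies.

target_words = [(u"python", 0.42), (u"spark", 0.17)]   # assumed output shape
target_words_dict = {w: int(weight * 1000) for w, weight in target_words}
# -> {u"python": 420, u"spark": 170}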
Example #7
def _calculate_sim(wv_model_local, r_collection, w_collection, project_id, target_text, source_id=1):
    """
    param : project_id   项目ID
    param : target_text  词云文本
    return:
    """
    data_preprocessing.log_info("开始执行similarity的计算>>>>>>>>>>>>>>>>>>")
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    tid = libc.syscall(SYS_gettid)
    print("当前线程id{0}".format(tid))

    top_k = 6
    top_n = 5
    iterations = 30
    num_topics = 5

    data_preprocessing.log_info("获取主题词......")
    id2words = sentence_sim.read_mongo_weight(r_collection)

    data_preprocessing.log_info("获取主题词向量......")
    theme_sentence_vector = dict()
    for key in id2words:
        if len(id2words[key]) > 6:
            id2words_temp = id2words[key][:6]
        else:
            id2words_temp = id2words[key]
        array_temp = sentence_sim.sentence_embedding_weight(
            id2words_temp, wv_model_local)
        theme_sentence_vector[key] = array_temp
    del id2words

    data_preprocessing.log_info("获取输入信息主题词向量......")
    ss = words_cloud.single_sentence(
        top_k, top_n, iterations, num_topics, stop_words_path)
    lda_model, doc_corpus = ss.sentence_transmit(
        target_sentence=target_text)
    print(lda_model)

    input_words = ss.sentence_to_lda_words(lda_model, doc_corpus)
    print("目标词汇:{0}".format(input_words))

    data_preprocessing.log_info("计算sentenceembedding......")

    input_vector = sentence_sim.sentence_embedding_weight(
        target_words=input_words, wv_model=wv_model_local)
    print input_vector

    sentence_sim.show_sim(
        project_id,
        input_vector,
        theme_sentence_vector,
        w_collection,
        str(source_id)
    )
    data_preprocessing.log_info("线程{0}文本similarity是计算结束。".format(tid))
Example #8
import numpy as np
from numpy import linalg as la
from pyspark.sql import SparkSession, SQLContext

# Service configuration; `sc` is assumed to be a config dict loaded elsewhere.
db_name_sql = sc["db_name_sql"]
table_name_sql = sc["table_name_sql"]

host_mongo = sc["host_mongo"]
db_mongo_r = sc["db_mongo_r"]
port_mongo = sc["port_mongo"]

db_mongo_w = sc["db_mongo_w"]
sim_mongo_w = sc["sim_mongo_w"]

# Local Spark session used to read the word2vec lookup table.
spark_temp = SparkSession.builder.master("local").config(
    "spark.some.config.option",
    "some-value").appName("minProject").getOrCreate()
print(spark_temp)

data_preprocessing.log_info("Loading word embedding model...")
sqlContext = SQLContext(spark_temp)
lookup = sqlContext.read.parquet("./data_temp/w2c.model/data").alias("lookup")
# Broadcast the word -> vector map so executors can do local lookups.
lookup_bd = spark_temp.sparkContext.broadcast(lookup.rdd.collectAsMap())


def cos_sim(vec_a, vec_b):
    """
    param vec_a : array-like vector
    param vec_b : array-like vector
    return cosine similarity of the two vectors
    """
    mat_a = np.mat(vec_a)
    mat_b = np.mat(vec_b)
    num = float(mat_a * mat_b.T)
    denom = la.norm(mat_a) * la.norm(mat_b)
    # Guard against zero-length vectors to avoid division by zero.
    return num / denom if denom != 0 else 0.0
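
A quick sanity check of cos_sim() with toy vectors; the expected values follow from the cosine formula above.

a = [1.0, 0.0]
b = [0.0, 1.0]
c = [2.0, 0.0]
print(cos_sim(a, b))  # 0.0 -- orthogonal vectors
print(cos_sim(a, c))  # 1.0 -- same direction; magnitude is ignored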