Example #1
def recommend_cal(text):
    short_engine_wrapper = InferenceEngineWrapper(
        '/root/Familia/model/webpage', 'lda.conf', 'webpage_twe_lda.model')
    doc_seg_short = short_engine_wrapper.tokenize(text)

    long_engine_wrapper = InferenceEngineWrapper('/root/Familia/model/webpage',
                                                 'lda.conf')
    doc_seg_long = long_engine_wrapper.tokenize(text)

    top_200_list = short_long_cal(short_engine_wrapper, doc_seg_short)

    top_3_jobs = long_long_cal(long_engine_wrapper, doc_seg_long, top_200_list)
    return top_3_jobs
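Note that recommend_cal reloads both models on every call, since each InferenceEngineWrapper is constructed inside the function. Below is a minimal sketch that caches the wrappers at module scope instead; the paths come from the example above, while recommend_cal_cached and the module-level names are illustrative assumptions (short_long_cal and long_long_cal are assumed to be defined elsewhere, as in the original).

# Sketch: build the wrappers once at import time so repeated calls reuse them.
_SHORT_WRAPPER = InferenceEngineWrapper(
    '/root/Familia/model/webpage', 'lda.conf', 'webpage_twe_lda.model')
_LONG_WRAPPER = InferenceEngineWrapper('/root/Familia/model/webpage',
                                       'lda.conf')

def recommend_cal_cached(text):
    doc_seg_short = _SHORT_WRAPPER.tokenize(text)
    doc_seg_long = _LONG_WRAPPER.tokenize(text)
    top_200_list = short_long_cal(_SHORT_WRAPPER, doc_seg_short)
    return long_long_cal(_LONG_WRAPPER, doc_seg_long, top_200_list)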
Example #2
def doc_distance():
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        f1 = request.files['text1']
        f2 = request.files['text2']
        # Guard against a failed save; otherwise f_text1/f_text2 would be
        # unbound below and raise a NameError.
        if not (save_file(f1) and save_file(f2)):
            return json.dumps({"error": "failed to save uploaded files"},
                              ensure_ascii=False)
        f_text1 = read_file(f1)
        f_text2 = read_file(f2)
    else:
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()

    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_doc_distance(doc1_seg, doc2_seg)

    return json.dumps(
        {
            "Jensen-Shannon Divergence": distances[0],
            "Hellinger Distance": distances[1]
        },
        ensure_ascii=False)
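doc_distance reads request.form and request.files, so it is evidently a Flask view function. A minimal sketch of wiring it into an application follows; the '/doc_distance' route, the POST-only restriction, and the curl invocation are assumptions for illustration, not part of the original.

from flask import Flask

app = Flask(__name__)
# Register the handler defined above under a hypothetical route.
app.add_url_rule('/doc_distance', 'doc_distance', doc_distance,
                 methods=['POST'])

# Example request in text mode (any 'type' other than 'doc' takes the
# form-field branch):
#   curl -X POST -F category=webpage -F type=text \
#        -F text1='first text' -F text2='second text' \
#        http://localhost:5000/doc_distance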
Example #3
def query_doc_sim():
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        f1 = request.files['text1']
        f2 = request.files['text2']
        # Guard against a failed save; otherwise f_text1/f_text2 would be
        # unbound below and raise a NameError.
        if not (save_file(f1) and save_file(f2)):
            return json.dumps({"error": "failed to save uploaded files"},
                              ensure_ascii=False)
        f_text1 = read_file(f1)
        f_text2 = read_file(f2)
    else:
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()

    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf(),
                                                      get_emb_file(category))
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_query_doc_similarity(
        doc1_seg, doc2_seg)

    return json.dumps(
        {
            "LDA Similarity": distances[0],
            "TWE Similarity": distances[1]
        },
        ensure_ascii=False)
Example #4
def lda_infer():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    topic_dist = inference_engine_wrapper.lda_infer(seg_list)

    return json_format(topic_dist)
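Judging by Example #8, which iterates dict(topic_dist).items() and casts the keys to int, lda_infer appears to return (topic_id, probability) pairs. A small sketch for pretty-printing such a distribution under that assumption; the sort order and formatting are illustrative choices.

def print_topic_dist(topic_dist):
    # Sort topics by probability, highest first, and print one per line.
    for topic_id, prob in sorted(topic_dist, key=lambda p: p[1], reverse=True):
        print("topic {}: {:.4f}".format(topic_id, prob))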
Example #5
def doc_keywords():
    category = request.form['category']
    word = request.form['word'].encode('utf-8').strip()
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    items = inference_engine_wrapper.cal_keywords_similarity(
        word, ' '.join(seg_list))

    return json_format(items)
Example #6
def slda_infer():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_slda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    sentences = []
    length = len(seg_list)
    for index in range(0, length, 5):
        sentences.append(seg_list[index:index + 5])
    topic_dist = inference_engine_wrapper.slda_infer(sentences)

    return json_format(topic_dist)
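The loop above groups the token stream into fixed five-word pseudo-sentences, presumably because SentenceLDA inference works over sentence-level structure. The same chunking, extracted into a reusable helper (the helper name is an assumption; the window size of 5 simply mirrors the example and is not required by slda_infer):

def chunk_into_sentences(seg_list, window=5):
    # Group consecutive tokens into fixed-size pseudo-sentences.
    return [seg_list[i:i + window] for i in range(0, len(seg_list), window)]

With this helper, the chunking reduces to sentences = chunk_into_sentences(seg_list).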
Example #7
def doc_keywords_plus():
    category = request.form['category']
    #word = request.form['word'].encode('utf-8').strip()
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    items = {}
    for tag, weight in jieba.analyse.extract_tags(f_text, withWeight=True):
        # The TF-IDF weight from jieba is unused; only the extracted tag is
        # scored against the tokenized document.
        result = inference_engine_wrapper.cal_keywords_similarity(
            tag.encode('utf-8').strip(), ' '.join(seg_list))
        items.update(result)

    return json_format(items)
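doc_keywords_plus mines candidate keywords with jieba's TF-IDF extractor and then scores each one against the tokenized document. For reference, a standalone sketch of the extraction step that caps the number of candidates; topK=10 is an illustrative choice, not taken from the original.

import jieba.analyse

def top_keywords(text, k=10):
    # Returns (keyword, tfidf_weight) pairs, highest weight first.
    return jieba.analyse.extract_tags(text, topK=k, withWeight=True)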
Example #8
def doc_topic_word_lda():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    topic_dist = inference_engine_wrapper.lda_infer(seg_list)

    result = {}
    # Build the TWE wrapper once instead of reloading it on every iteration;
    # its constructor arguments do not depend on the topic.
    twe_wrapper = TopicalWordEmbeddingsWrapper(get_model_dir(category),
                                               get_emb_file(category))
    for key, value in dict(topic_dist).items():
        result_dict = dict(
            twe_wrapper.nearest_words_around_topic(int(key), get_count()))
        result[value] = result_dict

    return json.dumps(result)
Example #9
    result.append(ent)
    return result


if __name__ == '__main__':
    # Validate arguments before reading the input file
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    path = '/media/iiip/数据/duanduan/data/validation.csv'
    documents = read_whole_file(path)
    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    topic_result = {}
    for key in documents:
        print(key)
        seg_list = inference_engine_wrapper.tokenize(documents[key])
        # Run inference
        topic_dist = inference_engine_wrapper.lda_infer(seg_list)
        topic_result[key] = cal_topic(topic_dist)
    with open(path.replace(".csv", "_topic.csv"), 'w') as out_file:
        writer = csv.writer(out_file)
        for each in topic_result:
            writer.writerow([each, topic_result[each][0],
                             topic_result[each][1]])
    # return topic_result

Example #10
import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file,
                                                      emb_file)
    while True:
        # Read a short text (query) and a long text (document)
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        query_seg = inference_engine_wrapper.tokenize(query)
        doc_seg = inference_engine_wrapper.tokenize(doc)
        distances = inference_engine_wrapper.cal_query_doc_similarity(
            query_seg, doc_seg)
        # Print the results
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE Similarity = {}".format(distances[1]))
Example #11
# found in the LICENSE file.

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read the keywords and the document
        words = input("Enter Keywords: ").strip()
        doc = input("Enter Document: ").strip()
        seg_list = inference_engine_wrapper.tokenize(doc)
        items = inference_engine_wrapper.cal_keywords_similarity(
            words, ' '.join(seg_list))
        # Print the results
        print('----------------------------')
        for item in items:
            print(item[0] + '\t' + str(item[1]))
Example #12
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Author: [email protected]

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        input_text = input("Enter Document: ")
        # Tokenize the input
        seg_list = inference_engine_wrapper.tokenize(input_text)
        # Run inference
        topic_dist = inference_engine_wrapper.lda_infer(seg_list)
        # Print the results
        print("Document Topic Distribution:")
        print(topic_dist)
Example #13
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read two long documents
        doc1 = input("Enter Document1: ").strip()
        doc2 = input("Enter Document2: ").strip()
        doc1_seg = inference_engine_wrapper.tokenize(doc1)
        doc2_seg = inference_engine_wrapper.tokenize(doc2)
        distances = inference_engine_wrapper.cal_doc_distance(doc1_seg, doc2_seg)
        # Print the results
        print("Jensen-Shannon Divergence = {}".format(distances[0]))
        print("Hellinger Distance = {}".format(distances[1]))
Example #14
import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper instance
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        input_text = input("Enter Document: ")
        # Tokenize the input
        seg_list = inference_engine_wrapper.tokenize(input_text.strip())
        # Build pseudo-sentences: five words per sentence
        sentences = []
        length = len(seg_list)
        for index in range(0, length, 5):
            sentences.append(seg_list[index:index + 5])
        # Run inference
        topic_dist = inference_engine_wrapper.slda_infer(sentences)
        # Print the results
        print("Document Topic Distribution:")
        print(topic_dist)