def doc_distance():
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        # The two documents arrive as file uploads.
        f1 = request.files['text1']
        f2 = request.files['text2']
        if save_file(f1) and save_file(f2):
            f_text1 = read_file(f1)
            f_text2 = read_file(f2)
    else:
        # Otherwise the documents arrive as plain form fields.
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_doc_distance(doc1_seg, doc2_seg)
    return json.dumps(
        {
            "Jensen-Shannon Divergence": distances[0],
            "Hellinger Distance": distances[1]
        },
        ensure_ascii=False)
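# save_file() and read_file() are used by the handlers in this module but are
# not part of this excerpt. A minimal sketch of what they could look like,
# assuming Flask/werkzeug upload objects and an UPLOAD_DIR of our own choosing
# (both are assumptions, not the project's actual code):
import os

from werkzeug.utils import secure_filename

UPLOAD_DIR = '/tmp/familia_uploads'  # hypothetical location


def save_file(f):
    """Persist an uploaded file to UPLOAD_DIR; return True on success."""
    if f is None or f.filename == '':
        return False
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    f.save(os.path.join(UPLOAD_DIR, secure_filename(f.filename)))
    return True


def read_file(f):
    """Read a previously saved upload back as stripped bytes."""
    path = os.path.join(UPLOAD_DIR, secure_filename(f.filename))
    with open(path, 'rb') as fp:
        return fp.read().strip()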
def query_doc_sim():
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        f1 = request.files['text1']
        f2 = request.files['text2']
        if save_file(f1) and save_file(f2):
            f_text1 = read_file(f1)
            f_text2 = read_file(f2)
    else:
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()
    # The embedding file is needed here because TWE similarity is computed
    # in addition to the LDA similarity.
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf(),
                                                      get_emb_file(category))
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_query_doc_similarity(
        doc1_seg, doc2_seg)
    return json.dumps(
        {
            "LDA Similarity": distances[0],
            "TWE Similarity": distances[1]
        },
        ensure_ascii=False)
def lda_infer():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    topic_dist = inference_engine_wrapper.lda_infer(seg_list)
    return json_format(topic_dist)
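# input_doc_str() is shared by several handlers here but its definition is
# not part of this excerpt. A plausible sketch, mirroring the inline
# upload-vs-form branching of doc_distance() above; the 'text' field name and
# the empty-bytes fallback are our assumptions, not the project's code:
def input_doc_str(in_type):
    """Return the request document either from an upload or a form field."""
    if in_type == 'doc':
        f = request.files['text']
        if save_file(f):
            return read_file(f)
        return b''
    return request.form['text'].encode('utf-8').strip()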
def doc_keywords():
    category = request.form['category']
    word = request.form['word'].encode('utf-8').strip()
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    items = inference_engine_wrapper.cal_keywords_similarity(
        word, ' '.join(seg_list))
    return json_format(items)
def slda_infer():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_slda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    # SentenceLDA expects sentence structure; treat every 5 words as one
    # pseudo-sentence.
    sentences = []
    length = len(seg_list)
    for index in range(0, length, 5):
        sentences.append(seg_list[index:index + 5])
    topic_dist = inference_engine_wrapper.slda_infer(sentences)
    return json_format(topic_dist)
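# The same 5-word chunking recurs in the SentenceLDA demos below. A small
# reusable version of that chunking (the helper name is ours, not the
# project's; behavior matches the inline loop above):
def chunk_words(seg_list, window=5):
    """Group a flat token list into fixed-size pseudo-sentences."""
    return [seg_list[i:i + window] for i in range(0, len(seg_list), window)]

# e.g. chunk_words(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3)
# -> [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]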
def doc_keywords_plus():
    category = request.form['category']
    #word = request.form['word'].encode('utf-8').strip()
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    items = {}
    # Let jieba propose candidate keywords, then score each candidate against
    # the document with the topic model.
    for x, w in jieba.analyse.extract_tags(f_text, withWeight=True):
        result = inference_engine_wrapper.cal_keywords_similarity(
            x.encode('utf-8').strip(), ' '.join(seg_list))
        items.update(result)
    return json_format(items)
def doc_topic_word_lda():
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    topic_dist = inference_engine_wrapper.lda_infer(seg_list)
    # Load the topical word embeddings once, rather than once per topic.
    twe_wrapper = TopicalWordEmbeddingsWrapper(get_model_dir(category),
                                               get_emb_file(category))
    result = {}
    for key, value in dict(topic_dist).items():
        result_dict = dict(
            twe_wrapper.nearest_words_around_topic(int(key), get_count()))
        result[value] = result_dict
    return json.dumps(result)
def recommend_cal(text):
    # A short-text engine (with topical word embeddings) selects a candidate
    # set of 200, and a long-text engine re-ranks it down to the top 3 jobs.
    short_engine_wrapper = InferenceEngineWrapper(
        '/root/Familia/model/webpage', 'lda.conf', 'webpage_twe_lda.model')
    doc_seg_short = short_engine_wrapper.tokenize(text)
    long_engine_wrapper = InferenceEngineWrapper('/root/Familia/model/webpage',
                                                 'lda.conf')
    doc_seg_long = long_engine_wrapper.tokenize(text)
    top_200_list = short_long_cal(short_engine_wrapper, doc_seg_short)
    top_3_jobs = long_long_cal(long_engine_wrapper, doc_seg_long, top_200_list)
    return top_3_jobs
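# Hypothetical invocation of recommend_cal(); the input text is made up, and
# short_long_cal()/long_long_cal() must be defined elsewhere in the project
# for this to run:
#
# top_jobs = recommend_cal('自然语言处理 工程师 机器学习')
# print(top_jobs)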
# found in the LICENSE file.
#
# Author: [email protected]

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir,
                                                      conf_file,
                                                      emb_file)
    while True:
        # Read a short query text and a long document text.
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        distances = inference_engine_wrapper.cal_query_doc_similarity(query,
                                                                      doc)
        # Print the results.
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE Similarity = {}".format(distances[1]))
import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir,
                                                      conf_file,
                                                      emb_file)
    while True:
        # Read a short query text and a long document text.
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        query_seg = inference_engine_wrapper.tokenize(query)
        doc_seg = inference_engine_wrapper.tokenize(doc)
        distances = inference_engine_wrapper.cal_query_doc_similarity(
            query_seg, doc_seg)
        # Print the results.
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE Similarity = {}".format(distances[1]))
        result.append(ent)
    return result


if __name__ == '__main__':
    path = '/media/iiip/数据/duanduan/data/validation.csv'
    documents = read_whole_file(path)
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    topic_result = {}
    for key in documents:
        print(key)
        seg_list = inference_engine_wrapper.tokenize(documents[key])
        # Run LDA inference.
        topic_dist = inference_engine_wrapper.lda_infer(seg_list)
        topic_result[key] = cal_topic(topic_dist)

    # Write one row per document: id plus its two topics ('out_file' avoids
    # shadowing the Python 2 'file' builtin).
    out_file = open(path.replace(".csv", "_topic.csv"), 'w')
    writer = csv.writer(out_file)
    for each in topic_result:
        writer.writerow([each, topic_result[each][0], topic_result[each][1]])
    out_file.close()
    # return topic_result
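# read_whole_file() and cal_topic() are defined above this excerpt. Judging
# from the two columns written per row, cal_topic() appears to return the two
# most probable topics; a sketch under that assumption (not the actual code):
def cal_topic(topic_dist):
    """Return the ids of the two highest-probability topics."""
    top = sorted(topic_dist, key=lambda kv: kv[1], reverse=True)[:2]
    return [topic_id for topic_id, prob in top]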
app = Sanic("Familia", strict_slashes=True)
app.blueprint(swagger_blueprint)
app.config.API_TITLE = 'Familia API'
app.config.API_DESCRIPTION = 'A Toolkit for Industrial Topic Modeling'
app.config.API_PRODUCES_CONTENT_TYPES = ['application/json']

RE_BACKSPACES = re.compile("\b+")

model_name = os.environ.get("MODEL_NAME", 'news').lower()
n_workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count()))
model_dir = f"/familia/model/{model_name}"
emb_file = f"{model_name}_twe_lda.model"

inference_engine_lda = InferenceEngineWrapper(model_dir, 'lda.conf', emb_file)
inference_engine_slda = InferenceEngineWrapper(model_dir, 'slda.conf')
twe = TopicalWordEmbeddingsWrapper(model_dir, emb_file)


def read_topic_words_from_file(topic_words_file_name='topic_words.lda.txt'):
    logger.info(f"reading topic_words from file: {topic_words_file_name}")
    topic_words = defaultdict(list)
    file_path = os.path.join(model_dir, topic_words_file_name)
    if not os.path.exists(file_path):
        logger.warning(f"topic_words file not found: {file_path}")
        return topic_words
    with open(file_path, 'r') as f:
        line = f.readline()
        while line:
            pos = line.find('=')
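# A hedged sketch of how this Sanic app would typically be started; the host
# and port values are assumptions, not taken from this excerpt:
#
# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=8000, workers=n_workers)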
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Author: [email protected]

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read two long documents.
        doc1 = input("Enter Document1: ").strip()
        doc2 = input("Enter Document2: ").strip()
        distances = inference_engine_wrapper.cal_doc_distance(doc1, doc2)
        # Print the results.
        print("Jensen-Shannon Divergence = {}".format(distances[0]))
        print("Hellinger Distance = {}".format(distances[1]))
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Author: [email protected]

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        input_text = input("Enter Document: ")
        # Tokenize the document.
        seg_list = inference_engine_wrapper.tokenize(input_text)
        # Run LDA inference.
        topic_dist = inference_engine_wrapper.lda_infer(seg_list)
        # Print the results.
        print("Document Topic Distribution:")
        print(topic_dist)
# found in the LICENSE file.

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read the keywords and the document.
        words = input("Enter Keywords: ").strip()
        doc = input("Enter Document: ").strip()
        seg_list = inference_engine_wrapper.tokenize(doc)
        items = inference_engine_wrapper.cal_keywords_similarity(
            words, ' '.join(seg_list))
        # Print the results.
        print('----------------------------')
        for item in items:
            print(item[0] + '\t' + str(item[1]))
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read two long documents.
        doc1 = input("Enter Document1: ").strip()
        doc2 = input("Enter Document2: ").strip()
        doc1_seg = inference_engine_wrapper.tokenize(doc1)
        doc2_seg = inference_engine_wrapper.tokenize(doc2)
        distances = inference_engine_wrapper.cal_doc_distance(doc1_seg,
                                                              doc2_seg)
        # Print the results.
        print("Jensen-Shannon Divergence = {}".format(distances[0]))
        print("Hellinger Distance = {}".format(distances[1]))
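# Example shell invocation, assuming a model directory laid out like the
# Familia release (the script name and paths below are illustrative, not
# taken from this excerpt):
#
#   python doc_distance_demo.py model/news lda.conf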
# Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read the keywords and the document.
        words = input("Enter Keywords: ").strip()
        doc = input("Enter Document: ").strip()
        items = inference_engine_wrapper.cal_keywords_similarity(words, doc)
        # Print the results.
        print('----------------------------')
        for item in items:
            print(item[0] + '\t' + str(item[1]))
import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        input_text = input("Enter Document: ")
        # Tokenize the document.
        seg_list = inference_engine_wrapper.tokenize(input_text.strip())
        # Build the sentence structure: every 5 words form one sentence.
        sentences = []
        length = len(seg_list)
        for index in range(0, length, 5):
            sentences.append(seg_list[index:index + 5])
        # Run SentenceLDA inference.
        topic_dist = inference_engine_wrapper.slda_infer(sentences)
        # Print the results.
        print("Document Topic Distribution:")
        print(topic_dist)
# Author: [email protected]

import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir,
                                                      conf_file,
                                                      emb_file)
    while True:
        # Read a short query text and a long document text.
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        distances = inference_engine_wrapper.cal_query_doc_similarity(
            query, doc)
        # Print the results.
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE Similarity = {}".format(distances[1]))
import sys

from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Read command-line arguments.
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]

    # Create the InferenceEngineWrapper object.
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        input_text = input("Enter Document: ")
        # Tokenize the document.
        seg_list = inference_engine_wrapper.tokenize(input_text.strip())
        # Build the sentence structure: every 5 words form one sentence.
        sentences = []
        length = len(seg_list)
        for index in range(0, length, 5):
            sentences.append(seg_list[index:index + 5])
        # Run SentenceLDA inference.
        topic_dist = inference_engine_wrapper.slda_infer(sentences)
        # Print the results.
        print("Document Topic Distribution:")
        print(topic_dist)