def run(self):
    """Load the models, then extract keywords for queued documents until stopped.

    Creates the ``word_emb_elmo.WordEmbeddings``, ``sent_emb_sif.SentEmbeddings``
    and ``thulac.thulac`` objects and stores them on ``self.ELMO``, ``self.SIF``
    and ``self.zh_model``. Then, while ``self.stop_sign.value`` is 0, it takes
    items from ``self.recv_queue`` (index 0 is used as the document id, index 1
    as the text), truncates the text to 4000 characters, calls
    ``extract_keyword`` and puts ``[docid, keywords]`` on ``self.push_queue``.

    Exceptions raised by model construction or by ``extract_keyword`` are not
    caught here and propagate to the caller.
    """
    import queue  # local import: only needed for the Empty timeout signal

    max_text_len = 4000

    self.ELMO = word_emb_elmo.WordEmbeddings(self.model_file, cuda_device=self.gpu_id)
    self.SIF = sent_emb_sif.SentEmbeddings(self.ELMO, lamda=1.0)

    # The explicit comparison with True is kept on purpose: a truthy value
    # that is not equal to True must still take the user-dictionary branch.
    if self.cut_dict == True:  # noqa: E712
        self.zh_model = thulac.thulac(
            model_path=r'./auxiliary_data/thulac.models/',
            seg_only=self.seg_only,
        )
    else:
        self.zh_model = thulac.thulac(
            model_path=r'./auxiliary_data/thulac.models/',
            user_dict=self.user_dict_file,
            seg_only=self.seg_only,
        )

    while self.stop_sign.value == 0:
        try:
            # Block for up to one second instead of polling empty(): an idle
            # worker now sleeps inside get() rather than spinning, and the
            # stop flag is still re-checked at least once per second.
            data = self.recv_queue.get(True, 1)
        except queue.Empty:
            # Nothing arrived within the timeout; go back and check stop_sign.
            continue
        except Exception:
            # Keep the best-effort behaviour (skip and retry) but leave a
            # trace instead of discarding the error silently.
            self.logger.exception("worker_recv_fail[%d]", self.worker_id)
            continue

        docid = data[0]
        # Slicing is a no-op for texts that are already short enough.
        text = data[1][:max_text_len]

        self.logger.info("worker_process[%d] %s, len:%d", self.worker_id, docid, len(text))
        keywords = extract_keyword(
            text,
            self.SIF,
            self.zh_model,
            self.elmo_layers_weight,
            plus=self.plus,
            topk=20,
            kwdict=self.user_dict,
            kw_info=self.kw_info,
            cut_dict=self.cut_dict,
            seg_only=self.seg_only,
        )
        self.logger.info("worker_succ[%d] %s", self.worker_id, docid)
        self.logger.info("worker_succ[%d] %s %s", self.worker_id, docid, keywords)
        self.push_queue.put([docid, keywords])

    self.logger.info("stop worker[%d]", self.worker_id)
import jieba.analyse import os import csv import logging logging.basicConfig(level=logging.DEBUG, format="[%(levelname).1s %(asctime)s] %(message)s", datefmt="%Y-%m-%d_%H:%M:%S") logger = logging.getLogger() logger.setLevel(logging.DEBUG) user_dict_file = r'./auxiliary_data/keyword_vocab_final' #user_dict_file=r'/search/odin/liruihong/keyword-project/data/keywords_vocab/keyword_clean' #user_dict_file=r'./auxiliary_data/user_dict.txt' #user_dict_file=None model_file = r'./auxiliary_data/zhs.model/' ELMO = word_emb_elmo.WordEmbeddings(model_file, cuda_device=6) SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0) zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', user_dict=user_dict_file, seg_only=False) elmo_layers_weight = [1.0, 0.0, 0.0] def load_cut_dict(user_dict_file): trie_dict = dict() with open(user_dict_file, "r", encoding="utf-8") as fp: for line in fp: cut_parts = line.strip().split(' ') num = len(cut_parts) tmp_dict = trie_dict for i in range(num):
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Sponge_sy"
# Date: 2020/2/21
#
# Demo script: runs SIFRank and SIFRank_plus on one sample text and prints
# both keyphrase results.

from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank, SIFRank_plus
import thulac
import jieba.analyse

# download from https://github.com/HIT-SCIR/ELMoForManyLangs
model_file = r'../auxiliary_data/zhs.model/'
ELMO = word_emb_elmo.WordEmbeddings(model_file)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)

# download from http://thulac.thunlp.org/
zh_model = thulac.thulac(
    model_path=r'../auxiliary_data/thulac.models/',
    user_dict=r'../auxiliary_data/user_dict.txt',
)
elmo_layers_weight = [0.0, 1.0, 0.0]

# Sample document fed to both rankers.
text = "计算机科学与技术(Computer Science and Technology)是国家一级学科,下设信息安全、软件工程、计算机软件与理论、计算机系统结构、计算机应用技术、计算机技术等专业。 [1]主修大数据技术导论、数据采集与处理实践(Python)、Web前/后端开发、统计与数据分析、机器学习、高级数据库系统、数据可视化、云计算技术、人工智能、自然语言处理、媒体大数据案例分析、网络空间安全、计算机网络、数据结构、软件工程、操作系统等课程,以及大数据方向系列实验,并完成程序设计、数据分析、机器学习、数据可视化、大数据综合应用实践、专业实训和毕业设计等多种实践环节。"

# Both rankers are called with the same keyword arguments.
_rank_options = {"N": 15, "elmo_layers_weight": elmo_layers_weight}
keyphrases = SIFRank(text, SIF, zh_model, **_rank_options)
keyphrases_ = SIFRank_plus(text, SIF, zh_model, **_rank_options)

# Report, printed one item per line in the original order.
_separator = "------------------------------------------"
_report = (
    _separator,
    "原文:" + text,
    _separator,
    "SIFRank_zh结果:",
    keyphrases,
    "SIFRank+_zh结果:",
    keyphrases_,
    _separator,
    "jieba分词TFIDF算法结果:",
)
for _line in _report:
    print(_line)
elmo_layers_weight = [0.0, 1.0, 0.0] elif (database == "Duc2001"): data, labels = fileIO.get_duc2001_data() lamda = 1.0 elmo_layers_weight = [1.0, 0.0, 0.0] else: data, labels = fileIO.get_semeval2017_data() lamda = 0.6 elmo_layers_weight = [1.0, 0.0, 0.0] #download from https://allennlp.org/elmo options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json" weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" porter = nltk.PorterStemmer() #please download nltk ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0) SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database) en_model = StanfordCoreNLP( r'E:\Python_Files\stanford-corenlp-full-2018-02-27', quiet=True) #download from https://stanfordnlp.github.io/CoreNLP/ try: for key, data in data.items(): lables = labels[key] lables_stemed = [] for lable in lables: tokens = lable.split() lables_stemed.append(' '.join(porter.stem(t) for t in tokens))