def run(self):
        self.ELMO = word_emb_elmo.WordEmbeddings(self.model_file, cuda_device=self.gpu_id)
        self.SIF = sent_emb_sif.SentEmbeddings(self.ELMO, lamda=1.0)
        if self.cut_dict:
            self.zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', seg_only=self.seg_only)
        else:
            self.zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', user_dict=self.user_dict_file, seg_only=self.seg_only)
        while self.stop_sign.value == 0:
            # Block on the queue with a 1s timeout instead of polling empty(),
            # so an idle worker sleeps inside get() rather than busy-looping.
            try:
                data = self.recv_queue.get(True, 1)
            except Exception:  # timed out, nothing to consume yet
                continue
            docid, text = data
            if len(text) > 4000:
                text = text[:4000]  # cap very long documents before embedding
            # [title, content] = text.split('\t')
            self.logger.info("worker_process[%d] %s, len:%d" % (self.worker_id, docid, len(text)))
            keywords = extract_keyword(text, self.SIF, self.zh_model, self.elmo_layers_weight, plus=self.plus,
                                       topk=20, kwdict=self.user_dict, kw_info=self.kw_info, cut_dict=self.cut_dict, seg_only=self.seg_only)

            self.logger.info("worker_succ[%d] %s %s" % (self.worker_id, docid, keywords))
            #self.push_queue.put([docid, title_kw, content_kw])
            self.push_queue.put([docid, keywords])

        self.logger.info("stop worker[%d]" %(self.worker_id))
Example #2
import jieba.analyse
import os
import csv
import logging
import thulac
from embeddings import sent_emb_sif, word_emb_elmo
logging.basicConfig(level=logging.DEBUG,
                    format="[%(levelname).1s %(asctime)s] %(message)s",
                    datefmt="%Y-%m-%d_%H:%M:%S")
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

user_dict_file = r'./auxiliary_data/keyword_vocab_final'
#user_dict_file=r'/search/odin/liruihong/keyword-project/data/keywords_vocab/keyword_clean'
#user_dict_file=r'./auxiliary_data/user_dict.txt'
#user_dict_file=None
model_file = r'./auxiliary_data/zhs.model/'
ELMO = word_emb_elmo.WordEmbeddings(model_file, cuda_device=6)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)
zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/',
                         user_dict=user_dict_file,
                         seg_only=False)
elmo_layers_weight = [1.0, 0.0, 0.0]


def load_cut_dict(user_dict_file):
    trie_dict = dict()
    with open(user_dict_file, "r", encoding="utf-8") as fp:
        for line in fp:
            cut_parts = line.strip().split(' ')
            num = len(cut_parts)
            tmp_dict = trie_dict
            for i in range(num):
                # The source snippet is truncated here; the loop body below is a
                # plausible reconstruction of the trie insertion implied by the
                # setup above (nested dicts keyed by the space-separated cut parts).
                part = cut_parts[i]
                if part not in tmp_dict:
                    tmp_dict[part] = dict()
                tmp_dict = tmp_dict[part]
    return trie_dict
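A quick usage sketch for load_cut_dict; the one-cut-per-line, space-separated file format is inferred from the parsing above, and the file name and contents are made up:

# Hypothetical usage of load_cut_dict; demo_dict.txt and its entries are
# assumptions inferred from line.strip().split(' ') in the function above.
with open("demo_dict.txt", "w", encoding="utf-8") as fp:
    fp.write("机器 学习\n")   # one pre-segmented entry per line
    fp.write("机器 翻译\n")

trie = load_cut_dict("demo_dict.txt")
print(trie)  # {'机器': {'学习': {}, '翻译': {}}}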
Example #3
File: test.py Project: zxlzr/SIFRank_zh
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Sponge_sy"
# Date: 2020/2/21

from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank, SIFRank_plus
import thulac
import jieba.analyse

#download from https://github.com/HIT-SCIR/ELMoForManyLangs
model_file = r'../auxiliary_data/zhs.model/'

ELMO = word_emb_elmo.WordEmbeddings(model_file)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)
#download from http://thulac.thunlp.org/
zh_model = thulac.thulac(model_path=r'../auxiliary_data/thulac.models/',
                         user_dict=r'../auxiliary_data/user_dict.txt')
elmo_layers_weight = [0.0, 1.0, 0.0]

text = "计算机科学与技术(Computer Science and Technology)是国家一级学科,下设信息安全、软件工程、计算机软件与理论、计算机系统结构、计算机应用技术、计算机技术等专业。 [1]主修大数据技术导论、数据采集与处理实践(Python)、Web前/后端开发、统计与数据分析、机器学习、高级数据库系统、数据可视化、云计算技术、人工智能、自然语言处理、媒体大数据案例分析、网络空间安全、计算机网络、数据结构、软件工程、操作系统等课程,以及大数据方向系列实验,并完成程序设计、数据分析、机器学习、数据可视化、大数据综合应用实践、专业实训和毕业设计等多种实践环节。"
keyphrases = SIFRank(text, SIF, zh_model, N=15,elmo_layers_weight=elmo_layers_weight)
keyphrases_ = SIFRank_plus(text, SIF, zh_model, N=15, elmo_layers_weight=elmo_layers_weight)
print("------------------------------------------")
print("原文:"+text)
print("------------------------------------------")
print("SIFRank_zh结果:")
print(keyphrases)
print("SIFRank+_zh结果:")
print(keyphrases_)
print("------------------------------------------")
print("jieba分词TFIDF算法结果:")
Example #4
# NOTE: the snippet is truncated above this point; the imports and the opening
# "Inspec" branch below are reconstructed so the if/elif chain parses. Module
# paths and branch values are assumptions based on the surrounding code.
import nltk
from embeddings import sent_emb_sif, word_emb_elmo
from stanfordcorenlp import StanfordCoreNLP
from util import fileIO  # assumed location of the get_*_data() loaders

database = "Inspec"  # or "Duc2001" / "Semeval2017", per the branches below
if (database == "Inspec"):
    data, labels = fileIO.get_inspec_data()  # assumed loader, mirroring the other branches
    lamda = 0.6                              # assumed value for this branch
    elmo_layers_weight = [0.0, 1.0, 0.0]
elif (database == "Duc2001"):
    data, labels = fileIO.get_duc2001_data()
    lamda = 1.0
    elmo_layers_weight = [1.0, 0.0, 0.0]
else:
    data, labels = fileIO.get_semeval2017_data()
    lamda = 0.6
    elmo_layers_weight = [1.0, 0.0, 0.0]

#download from https://allennlp.org/elmo
options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

porter = nltk.PorterStemmer()  # requires nltk (pip install nltk)
ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database)
en_model = StanfordCoreNLP(
    r'E:\Python_Files\stanford-corenlp-full-2018-02-27',
    quiet=True)  #download from https://stanfordnlp.github.io/CoreNLP/

try:
    # NOTE: items() is evaluated once, so rebinding `data` inside the loop is
    # safe here, though it shadows the dataset dict.
    for key, data in data.items():

        gold_labels = labels[key]
        labels_stemmed = []

        for label in gold_labels:
            tokens = label.split()
            labels_stemmed.append(' '.join(porter.stem(t) for t in tokens))
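The snippet breaks off inside this loop. The evaluation step that presumably follows, matching stemmed predictions against labels_stemmed, might look like the self-contained sketch below; the helper name and SIFRank's return format are assumptions, and the call signature mirrors Example #3:

# Hypothetical matching step for the truncated evaluation loop above.
def count_matches(predicted_phrases, stemmed_gold, stemmer):
    """Count predictions whose stemmed form appears in the stemmed gold set."""
    correct = 0
    for phrase in predicted_phrases:
        stemmed = ' '.join(stemmer.stem(t) for t in phrase.split())
        if stemmed in stemmed_gold:
            correct += 1
    return correct

# e.g., inside the loop (assumes SIFRank returns a list of phrase strings):
# keyphrases = SIFRank(data, SIF, en_model, N=15, elmo_layers_weight=elmo_layers_weight)
# num_correct += count_matches(keyphrases, labels_stemmed, porter)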