Example #1
def getkeywords():

    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  #file_name为自定义语料库的路径
    #    analyse.set_stop_words("stop_words.txt") #file_name为自定义语料库的路径。

    string = pd.read_csv('100w.csv', header=None)
    data = string.ix[:, :1].dropna()
    clfs = data[0].unique()

    k = len(clfs) / 12
    for i in range(k):
        print "..."
        t1 = time.time()
        Processes = []
        if i == k - 1:
            for clf in clfs[i * 12:]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)
        else:
            for clf in clfs[i * 12:(i + 1) * 12]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)

        for p in Processes:
            p.start()

        for p in Processes:
            p.join()

        t2 = time.time()
        print "%d seconds" % (t2 - t1)
Example #2
 def __init__(
         self,
         opt,
         poem_file: str,
         song_file: str,
         keywords: List[str],
         out_file: str,
         base_dir_for_save=r'E:\PycharmProjects\FirstDayOnMS2\Data\Poem_Song',
         save_dir_for_songVec="songVec.pkl",
         save_dir_for_poemVec="poemVec.pkl",
         idf_path=None,
         additional_key_words_path=None,
         out_txt_dir=None,
         seg_point_field_name="seg_point"):
     super(MatchSeggedPoemSong,
           self).__init__(opt, poem_file, song_file, keywords,
                          base_dir_for_save, save_dir_for_songVec,
                          save_dir_for_poemVec)
     self.base_dir_for_save = base_dir_for_save
     print("PoemMatchSong.__init__(): keywords = ", self.keywords)
     self.out_file = out_file
     self.idf_path = idf_path
     self.seg_point_field_name = seg_point_field_name
     if self.idf_path is not None:
         analyse.set_idf_path(self.idf_path)
     self.additional_key_words = dict()
     if additional_key_words_path is not None:
         self.additional_key_words = self.load_additional_key_words(
             additional_key_words_path)
     self.modelSearchSong = SearchSong()
     self.out_txt_dir = out_txt_dir  # written to a txt file so it can be consumed via yaml
Example #3
    def __init__(
            self,
            opt,
            poem_file: str,
            song_file: str,
            keywords: List[str],
            base_dir_for_save=r'E:\PycharmProjects\FirstDayOnMS2\Data\Poem_Song',
            save_dir_for_songVec="songVec.pkl",
            save_dir_for_poemVec="poemVec.pkl",
            idf_path=None):
        self.poem_file = poem_file
        self.song_file = song_file

        self.opt = opt

        self.base_dir_for_save = base_dir_for_save
        self.save_dir_for_songVec = save_dir_for_songVec
        self.save_dir_for_poemVec = save_dir_for_poemVec
        self.keywords = keywords
        self.poems = []
        self.sub_poems = []
        self.songs = []
        self.sub_songs = []
        self.songVecs = []
        self.poemVecs = []
        self.model = g.Doc2Vec.load(gensim_weight_path)
        self.idf_path = idf_path
        if idf_path is not None:
            analyse.set_idf_path(idf_path)
Example #4
 def __init__(self, prose_path, idf_path=None, top_n=10, save_path=None):
     super(StaticPoem, self).__init__()
     self.idf_path = idf_path
     self.top_n = top_n
     if idf_path is not None:
         analyse.set_idf_path(idf_path)
     self.poems = json.load(open(prose_path, "r", encoding="utf-8"))
     self.save_path = save_path
Example #5
    def tf_if_parse(content, keywords=None, topK=50):
        """ keywords must be included (note: the keywords argument is not used in this snippet)
        """
        import jieba.analyse as analyse
        import jieba

        tfidf_path = os.path.join(resource_dir,'resources','text','tf-idf.txt')

        user_dict_path = os.path.join(resource_dir,'resources','text','user_dict.txt')
        stopwords_path = os.path.join(resource_dir,'resources','text','stopwords-zh.txt')

        jieba.load_userdict(user_dict_path)
        analyse.set_stop_words(stopwords_path)
        analyse.set_idf_path(tfidf_path)
        tags = analyse.extract_tags(content, topK=topK)
        return tags
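A hedged usage sketch for tf_if_parse, assuming resource_dir is defined at module level, the three resource files exist as laid out above, and the method is exposed as a plain function or staticmethod (the sample sentence is illustrative):

sample = "自然语言处理是计算机科学与人工智能交叉的一个重要研究方向"
top_keywords = tf_if_parse(sample, topK=10)
print(top_keywords)  # the 10 highest-scoring TF-IDF keywords under the custom IDF file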
Example #6
    def __init__(self,
                 seed_word_path,
                 prose_path,
                 proseSelcted=None,
                 top_n=10,
                 max_iter_num=3,
                 threshold=0.001,
                 ExtractMode="tfidf",
                 idf_path=""):
        '''
        :param seed_word_path: path to the seed-word file
        :param sheet_name: sheet name in the seed-word Excel file
        :param prose_path: path to the prose file
        :param top_n: keep the top_n seed words
        :param max_iter_num: maximum number of iterations
        '''
        if seed_word_path is None or not os.path.exists(seed_word_path):
            raise Exception('an initial seed-word file must be provided')

        self.seed_word_path = seed_word_path
        self.prose_path = prose_path
        self.proseSelcted = proseSelcted
        self.top_n = top_n
        self.max_iter_num = max_iter_num
        self.threshold = threshold
        self.ExtractMode = ExtractMode
        if ExtractMode == "tfidf" and idf_path:
            analyse.set_idf_path(idf_path)
        self.seed_words = set()
        self.seed_words |= self.loadSeedWord(seed_word_path)

        self.prosePastSelected = set()
        if proseSelcted is not None:
            self.prosePastSelected |= self.loadProseNumber(path=proseSelcted)

        #self.poems = self.getProse(prose_path)
        self.poems = pickle.load(open("poemsIdf.pkl", "rb"))
        print("散文加载成功")

        self.proseCurrentSelected = set()  # articles retrieved with the current keywords
        self.wordCurrentSelected = set()  # keywords extracted from the current articles
        self.wordCurrentSelectedDict = dict()

        self.ILLEGAL_CHARACTERS_RE = re.compile(
            r'[\000-\010]|[\013-\014]|[\016-\037]')
        self.PoemID2Index()
        print("初始化结束")
Example #7
    def test_extract_tags(self):
        """
        3. 关键词抽取
        """
        topic = '关键词抽取'
        split_line = self.get_split_line(topic=topic)
        self.logger.info(split_line)

        term = '我们时人中国的可是is of super man'
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_standard extraction: {term} -> {msg}'.format(
            topic=topic, term=term, msg=res))

        #  TODO: there is a problem when running with the custom corpus
        user_idf_path = os.path.dirname(__file__) + '/jieba_dict/idf.txt.big'
        analyse.set_idf_path(user_idf_path)
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_custom inverse document frequency: {term} -> {msg}'.format(
            topic=topic, term=term, msg=res))
Example #8
def getkeywords2():

    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  #file_name为自定义语料库的路径
    #    analyse.set_stop_words("stop_words.txt") #file_name为自定义语料库的路径。

    print "connecting"
    mgclient = MongoClient(host='127.0.0.1')
    db_auth = mgclient.npl
    db_auth.authenticate("npl", "npl")
    db = mgclient.npl
    sale_info = db.npl_sale_info
    print "connected to npl_sale_info"

    cursor = sale_info.find({}, {
        "_id": 0,
        "sec_cate": 1,
        "detail": 1
    },
                            limit=10000000)
    results = DataFrame(list(cursor), columns=["sec_cate", "detail"])
    data = results[:].dropna()  # after dropna the index may no longer be contiguous
    data.columns = [0, 1]
    print data[:10]
    clfs = pd.read_csv('recalc.txt', header=None)

    t1 = time.time()
    Processes = []
    for clf in clfs[0]:
        pc = multiprocessing.Process(target=extract, args=(clf, data))
        Processes.append(pc)

    for p in Processes:
        p.start()

    for p in Processes:
        p.join()

    t2 = time.time()
    print "%d seconds" % (t2 - t1)
Example #9
    def __init__(self,
                 idf_path: str = None,
                 user_dict_path: str = os.path.join(curr_dir, 'userdict.txt'),
                 stop_words_path: str = os.path.join(curr_dir,
                                                     'stop_words.txt'),
                 default_method: str = 'jieba.extract_tags'):
        """
        Methods:

        tfidf: customized TFIDF
        jieba.textrank: jieba's TextRank
        jieba.extract_tags: jieba's TF-IDF (an alias of jieba.tfidf)
        jieba.tfidf: jieba's TF-IDF
        """
        if user_dict_path:
            jieba.load_userdict(user_dict_path)
        if idf_path:
            analyse.set_idf_path(idf_path)
        if stop_words_path:
            analyse.set_stop_words(stop_words_path)

        self.default_method = default_method
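The constructor above only stores default_method. A hypothetical sketch of how such a dispatcher might route the chosen method to jieba (the extract_keywords helper below is not part of the original snippet; in jieba.analyse, extract_tags and tfidf are aliases of the same TF-IDF extractor):

    def extract_keywords(self, text: str, top_k: int = 20):
        # Hypothetical dispatcher, not in the original snippet; the project's
        # own "customized TFIDF" branch is not shown here.
        if self.default_method in ('jieba.extract_tags', 'jieba.tfidf'):
            return analyse.extract_tags(text, topK=top_k, withWeight=True)
        if self.default_method == 'jieba.textrank':
            return analyse.textrank(text, topK=top_k, withWeight=True)
        raise ValueError('unsupported method: %s' % self.default_method)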
Example #10
 def ExtractKeyWord(self):
     if self.idf_path is not None:
         analyse.set_idf_path(self.idf_path)
     for i, poem_dict in enumerate(self.poems):
         new_paras = []
         for j, para_dict in enumerate(poem_dict['paras']):
             all_content = '\n'.join(para_dict['para_content'])
             key_words = analyse.textrank(
                 all_content,
                 topK=10,
                 withWeight=True,
                 allowPOS=('ns', 'n', 'vn', 'v'))  # ns: place names, n: nouns, vn: verbal nouns (e.g. 思索), v: verbs
             idf_key_words = analyse.extract_tags(
                 all_content,
                 topK=10,
                 withWeight=True,
                 allowPOS=('ns', 'n', 'vn', 'v'))  # same POS filter as above
             para_dict['key_words'] = key_words  # a list of (word, weight) tuples
             para_dict['idf_key_words'] = idf_key_words
             new_paras.append(para_dict)
         self.poems[i]['paras'] = new_paras
Example #11
sent = '好丑的证件照片'
print('/ '.join(jieba.cut(sent, HMM=False)))

jieba.suggest_freq('证件照片', True)
print('/ '.join(jieba.cut(sent, HMM=False)))

import jieba.analyse as aly

content = '''
自然语言处理(NLP)是计算机科学,人工智能,语言学关注计算机和人类(自然)语言之间的相互作用的领域。
因此,自然语言处理是与人机交互的领域有关的。在自然语言处理面临很多挑战,包括自然语言理解,因此,自然语言处理涉及人机交互的面积。
在NLP诸多挑战涉及自然语言理解,即计算机源于人为或自然语言输入的意思,和其他涉及到自然语言生成。
'''

# load a custom IDF dictionary
aly.set_idf_path('./data/idf.txt.big')
# load a custom stop-word list
aly.set_stop_words('./data/stop_words.utf8')

# first argument: the text to extract keywords from
# second argument: how many keywords to return, ordered by importance from high to low
# third argument: whether to also return each keyword's weight
# fourth argument: POS filter; empty means no filtering, otherwise only keywords with the listed POS tags are returned
keywords = aly.extract_tags(content, topK=10, withWeight=True, allowPOS=())

for item in keywords:
    # the keyword and its corresponding weight
    print(item[0], item[1])
Example #12
import sys
import os
from gensim.models import word2vec
import copy
import jieba
import math
from jieba import analyse
from math import sqrt
import jieba.posseg as pseg

reload(sys)
sys.setdefaultencoding('utf-8')
# model = word2vec.Word2Vec.load('Top250Word')
real_dir = os.path.split(os.path.realpath(__file__))[0]

analyse.set_idf_path(real_dir+"/idf_value.txt")
analyse.set_stop_words(real_dir+"/stop.txt")


class KeywordHandler:
	textrank = analyse.textrank
	tf_idf = analyse.extract_tags
	POS = ['ns','nt','nz','n','vn','an','a']  # POS filter: place names, organisations, other proper nouns, nouns, verbal nouns, adjectival nouns, adjectives
	filter_set = set()
	allow_set = set()
	with open(real_dir+'/douban_tag.txt') as f:
		lines = f.readlines()
		for word in lines:
			allow_set.add(word[:-1].decode('utf-8'))

	print "Loading Idf Value"
Example #13
def extract_keywords_by_tfidf(content, idf_path=None):
    if idf_path:
        analyse.set_idf_path(idf_path)
    keywords = "  ".join(analyse.extract_tags(content, topK=30, withWeight=False, allowPOS=()))
    return keywords
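A hedged usage sketch for the helper above. The IDF file jieba expects has one entry per line, a word followed by its IDF weight separated by a space; my_idf.txt and the values below are assumptions for illustration only:

# Assumed contents of my_idf.txt (one "word idf_value" pair per line, values made up):
#   自然语言处理 11.0
#   人工智能 9.5
text = "自然语言处理是人工智能的一个重要方向"
print(extract_keywords_by_tfidf(text, idf_path="my_idf.txt"))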
Example #14
File: demo.py  Project: denisyq/code
jieba.load_userdict('userdict.txt')  # user dict defines specific words

# part-of-speech tagging
import jieba.posseg as pseg 
words=pseg.cut(f.read().strip())
for word in words:
    print word.word, word.flag  # the POS tag (adjective, etc.)

#jieba TF-IDF
import jieba
from jieba import analyse
tf_idf = analyse.extract_tags
tags = jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())  # text to analyse, number of top keywords to return, whether to also return weights
# for this feature jieba falls back to its own built-in stop-word list and IDF corpus, so it cannot be used as-is where high accuracy is required
analyse.set_stop_words("stop_words.txt")  # set your own stop words
analyse.set_idf_path(file_name)
keywords = tf_idf(text)

def stopWordsList(filepath):
    stopwords=[ line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords

def seg_sentence(line):
    sentence_seg = jieba.cut_for_search(line.strip())
    stopwords = stopWordsList('/home/luyq/nlp/stopWords_ch.txt')
    outStr=""
    for word in sentence_seg:
        if word not in stopwords:
            if word != '\t':
                outStr += word
                outStr += " "
Example #15
#-*- coding:utf-8 -*-
from __future__ import print_function
from __future__ import division
import os
import sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

import jieba.analyse as analyzer
JIEBA_ANALYZER_IDF = os.path.join(curdir, os.path.pardir, os.path.pardir, "resources", "similarity.vocab.idf")
JIEBA_ANALYZER_STOPWORDS = os.path.join(curdir, os.path.pardir, os.path.pardir, "resources", "jieba_ext", "stop_words.txt")
analyzer.set_idf_path(JIEBA_ANALYZER_IDF)
analyzer.set_stop_words(JIEBA_ANALYZER_STOPWORDS)


def keywords(content, topK=10, vendor="tfidf", title=None):
    words = []
    scores = []
    if vendor == 'tfidf':
        for x,y in analyzer.extract_tags(content, topK=topK, withWeight=True):
            words.append(x)
            scores.append(y)
    else:
        raise ValueError("Invalid vendor")
    return words, scores

 def self_define(self):
     jb_analyse.set_stop_words("../data/stopwords")
     jb_analyse.set_idf_path("../data/sk_tfidf.txt")