def getkeywords():
    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  # path to a custom IDF corpus file
    # analyse.set_stop_words("stop_words.txt")  # path to a custom stop-word file
    string = pd.read_csv('100w.csv', header=None)
    data = string.ix[:, :1].dropna()
    clfs = data[0].unique()
    k = len(clfs) / 12
    for i in range(k):
        print "..."
        t1 = time.time()
        Processes = []
        if i == k - 1:
            for clf in clfs[i * 12:]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)
        else:
            for clf in clfs[i * 12:(i + 1) * 12]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)
        for p in Processes:
            p.start()
        for p in Processes:
            p.join()
        t2 = time.time()
        print "%d seconds" % (t2 - t1)
def __init__(
        self,
        opt,
        poem_file: str,
        song_file: str,
        keywords: List[str],
        out_file: str,
        base_dir_for_save=r'E:\\PycharmProjects\\FirstDayOnMS2\\Data\\Poem_Song',
        save_dir_for_songVec="songVec.pkl",
        save_dir_for_poemVec="poemVec.pkl",
        idf_path=None,
        additional_key_words_path=None,
        out_txt_dir=None,
        seg_point_field_name="seg_point"):
    super(MatchSeggedPoemSong, self).__init__(opt, poem_file, song_file,
                                              keywords, base_dir_for_save,
                                              save_dir_for_songVec,
                                              save_dir_for_poemVec)
    self.base_dir_for_save = base_dir_for_save
    print("PoemMatchSong.__init__(): keywords = ", self.keywords)
    self.out_file = out_file
    self.idf_path = idf_path
    self.seg_point_field_name = seg_point_field_name
    if self.idf_path is not None:
        analyse.set_idf_path(self.idf_path)
    self.additional_key_words = dict()
    if additional_key_words_path is not None:
        self.additional_key_words = self.load_additional_key_words(
            additional_key_words_path)
    self.modelSearchSong = SearchSong()
    self.out_txt_dir = out_txt_dir  # also written to a txt file for the yaml pipeline to consume
def __init__(
        self,
        opt,
        poem_file: str,
        song_file: str,
        keywords: List[str],
        base_dir_for_save=r'E:\\PycharmProjects\\FirstDayOnMS2\\Data\\Poem_Song',
        save_dir_for_songVec="songVec.pkl",
        save_dir_for_poemVec="poemVec.pkl",
        idf_path=None):
    self.poem_file = poem_file
    self.song_file = song_file
    self.opt = opt
    self.base_dir_for_save = base_dir_for_save
    self.save_dir_for_songVec = save_dir_for_songVec
    self.save_dir_for_poemVec = save_dir_for_poemVec
    self.keywords = keywords
    self.poems = []
    self.sub_poems = []
    self.songs = []
    self.sub_songs = []
    self.songVecs = []
    self.poemVecs = []
    self.model = g.Doc2Vec.load(gensim_weight_path)
    self.idf_path = idf_path
    if idf_path is not None:
        analyse.set_idf_path(idf_path)
def __init__(self, prose_path, idf_path=None, top_n=10, save_path=None):
    super(StaticPoem, self).__init__()
    self.idf_path = idf_path
    self.top_n = top_n
    if idf_path is not None:
        analyse.set_idf_path(idf_path)
    self.poems = json.load(open(prose_path, "r", encoding="utf-8"))
    self.save_path = save_path
def tf_if_parse(content, keywords=None, topK=50):
    """keywords, if provided, must be included in the result (the parameter is currently unused)"""
    import jieba.analyse as analyse
    import jieba
    tfidf_path = os.path.join(resource_dir, 'resources', 'text', 'tf-idf.txt')
    user_dict_path = os.path.join(resource_dir, 'resources', 'text', 'user_dict.txt')
    stopwords_path = os.path.join(resource_dir, 'resources', 'text', 'stopwords-zh.txt')
    jieba.load_userdict(user_dict_path)
    analyse.set_stop_words(stopwords_path)
    analyse.set_idf_path(tfidf_path)
    tags = analyse.extract_tags(content, topK=topK)
    return tags
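# A minimal usage sketch for tf_if_parse, assuming resource_dir points at a
# directory containing the tf-idf, user-dict and stop-word files above; the
# sample sentence is purely illustrative.
sample = "自然语言处理让计算机理解人类语言"
print(tf_if_parse(sample, topK=10))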
def __init__(self,
             seed_word_path,
             prose_path,
             proseSelcted=None,
             top_n=10,
             max_iter_num=3,
             threshold=0.001,
             ExtractMode="tfidf",
             idf_path=""):
    '''
    :param seed_word_path: path to the seed-word file
    :param sheet_name: sheet name inside the seed-word Excel file
    :param prose_path: path to the prose file
    :param top_n: keep the top_n seed words
    :param max_iter_num: maximum number of iterations
    '''
    if seed_word_path is None or not os.path.exists(seed_word_path):
        raise Exception('An initial seed-word file must be provided')
    self.seed_word_path = seed_word_path
    self.prose_path = prose_path
    self.proseSelcted = proseSelcted
    self.top_n = top_n
    self.max_iter_num = max_iter_num
    self.threshold = threshold
    self.ExtractMode = ExtractMode
    if ExtractMode == "tfidf" and idf_path:
        analyse.set_idf_path(idf_path)
    self.seed_words = set()
    self.seed_words |= self.loadSeedWord(seed_word_path)
    self.prosePastSelected = set()
    if proseSelcted is not None:
        self.prosePastSelected |= self.loadProseNumber(path=proseSelcted)
    # self.poems = self.getProse(prose_path)
    self.poems = pickle.load(open("poemsIdf.pkl", "rb"))
    print("Prose corpus loaded")
    self.proseCurrentSelected = set()  # prose retrieved with the current keywords
    self.wordCurrentSelected = set()   # keywords retrieved from the current prose
    self.wordCurrentSelectedDict = dict()
    self.ILLEGAL_CHARACTERS_RE = re.compile(
        r'[\000-\010]|[\013-\014]|[\016-\037]')
    self.PoemID2Index()
    print("Initialization finished")
def test_extract_tags(self):
    """ 3. keyword extraction """
    topic = '关键词抽取'
    split_line = self.get_split_line(topic=topic)
    self.logger.info(split_line)
    term = '我们时人中国的可是is of super man'
    res = analyse.extract_tags(term)
    self.logger.info('{topic}_标准抽取: {term} -> {msg}'.format(topic=topic,
                                                            term=term,
                                                            msg=res))
    # TODO: extraction with the custom corpus still has issues
    user_idf_path = os.path.dirname(__file__) + '/jieba_dict/idf.txt.big'
    analyse.set_idf_path(user_idf_path)
    res = analyse.extract_tags(term)
    self.logger.info('{topic}_自定义逆向文件频率: {term} -> {msg}'.format(
        topic=topic, term=term, msg=res))
def getkeywords2():
    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  # path to a custom IDF corpus file
    # analyse.set_stop_words("stop_words.txt")  # path to a custom stop-word file
    print "connecting"
    mgclient = MongoClient(host='127.0.0.1')
    db_auth = mgclient.npl
    db_auth.authenticate("npl", "npl")
    db = mgclient.npl
    sale_info = db.npl_sale_info
    print "connected to npl_sale_info"
    cursor = sale_info.find({}, {
        "_id": 0,
        "sec_cate": 1,
        "detail": 1
    }, limit=10000000)
    results = DataFrame(list(cursor), columns=["sec_cate", "detail"])
    data = results[:].dropna()  # the index may be non-contiguous after dropna
    data.columns = [0, 1]
    print data[:10]
    clfs = pd.read_csv('recalc.txt', header=None)
    t1 = time.time()
    Processes = []
    for clf in clfs[0]:
        pc = multiprocessing.Process(target=extract, args=(clf, data))
        Processes.append(pc)
    for p in Processes:
        p.start()
    for p in Processes:
        p.join()
    t2 = time.time()
    print "%d seconds" % (t2 - t1)
def __init__(self,
             idf_path: str = None,
             user_dict_path: str = os.path.join(curr_dir, 'userdict.txt'),
             stop_words_path: str = os.path.join(curr_dir, 'stop_words.txt'),
             default_method: str = 'jieba.extract_tags'):
    """
    Methods:
        tfidf: customized TF-IDF
        jieba.textrank: jieba's TextRank
        jieba.extract_tags: jieba's TF-IDF
        jieba.tfidf: jieba's TF-IDF
    """
    if user_dict_path:
        jieba.load_userdict(user_dict_path)
    if idf_path:
        analyse.set_idf_path(idf_path)
    if stop_words_path:
        analyse.set_stop_words(stop_words_path)
    self.default_method = default_method
def ExtractKeyWord(self):
    if self.idf_path is not None:
        analyse.set_idf_path(self.idf_path)
    for i, poem_dict in enumerate(self.poems):
        new_paras = []
        for j, para_dict in enumerate(poem_dict['paras']):
            all_content = '\n'.join(para_dict['para_content'])
            # allowPOS: ns = place name, n = noun, vn = verbal noun (e.g. 思索), v = verb
            key_words = analyse.textrank(all_content,
                                         topK=10,
                                         withWeight=True,
                                         allowPOS=('ns', 'n', 'vn', 'v'))
            idf_key_words = analyse.extract_tags(all_content,
                                                 topK=10,
                                                 withWeight=True,
                                                 allowPOS=('ns', 'n', 'vn', 'v'))
            para_dict['key_words'] = key_words  # a list of (word, weight) tuples
            para_dict['idf_key_words'] = idf_key_words
            new_paras.append(para_dict)
        self.poems[i]['paras'] = new_paras
sent = '好丑的证件照片'
print('/ '.join(jieba.cut(sent, HMM=False)))
jieba.suggest_freq('证件照片', True)
print('/ '.join(jieba.cut(sent, HMM=False)))

import jieba.analyse as aly

content = '''
自然语言处理(NLP)是计算机科学,人工智能,语言学关注计算机和人类(自然)语言之间的相互作用的领域。
因此,自然语言处理是与人机交互的领域有关的。在自然语言处理面临很多挑战,包括自然语言理解,因此,自然语言处理涉及人机交互的面积。
在NLP诸多挑战涉及自然语言理解,即计算机源于人为或自然语言输入的意思,和其他涉及到自然语言生成。
'''

# load a custom IDF dictionary
aly.set_idf_path('./data/idf.txt.big')
# load a custom stop-word dictionary
aly.set_stop_words('./data/stop_words.utf8')

# 1st argument: the text to extract keywords from
# 2nd argument: number of keywords to return, sorted by weight in descending order
# 3rd argument: whether to also return each keyword's weight
# 4th argument: POS filter; an empty tuple means no filtering, otherwise only
#               keywords with the listed parts of speech are returned
keywords = aly.extract_tags(content, topK=10, withWeight=True, allowPOS=())
for item in keywords:
    # the keyword and its weight
    print(item[0], item[1])
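# For comparison, a sketch of jieba's TextRank extractor on the same content;
# textrank defaults to allowPOS=('ns', 'n', 'vn', 'v'), so nouns and verbs dominate.
tr_keywords = aly.textrank(content, topK=10, withWeight=True)
for word, weight in tr_keywords:
    print(word, weight)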
import sys
import os
from gensim.models import word2vec
import copy
import jieba
import math
from jieba import analyse
from math import sqrt
import jieba.posseg as pseg

reload(sys)
sys.setdefaultencoding('utf-8')

# model = word2vec.Word2Vec.load('Top250Word')
real_dir = os.path.split(os.path.realpath(__file__))[0]
analyse.set_idf_path(real_dir + "/idf_value.txt")
analyse.set_stop_words(real_dir + "/stop.txt")


class KeywordHandler:
    textrank = analyse.textrank
    tf_idf = analyse.extract_tags
    # restrict POS to place names, organization/proper nouns, nouns, verbal nouns and adjectives
    POS = ['ns', 'nt', 'nz', 'n', 'vn', 'an', 'a']
    filter_set = set()
    allow_set = set()
    with open(real_dir + '/douban_tag.txt') as f:
        lines = f.readlines()
        for word in lines:
            allow_set.add(word[:-1].decode('utf-8'))
    print "Loading Idf Value"
def extract_keywords_by_tfidf(content, idf_path=None):
    if idf_path:
        analyse.set_idf_path(idf_path)
    keywords = " ".join(
        analyse.extract_tags(content, topK=30, withWeight=False, allowPOS=()))
    return keywords
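# A minimal usage sketch for extract_keywords_by_tfidf; the sample text and the
# "my_idf.txt" path are hypothetical.
text = "自然语言处理是计算机科学与人工智能交叉的一个重要方向"
print(extract_keywords_by_tfidf(text))                          # jieba's built-in IDF table
print(extract_keywords_by_tfidf(text, idf_path="my_idf.txt"))   # custom IDF table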
jieba.load_userdict('userdict.txt')  # the user dict defines domain-specific words

# part-of-speech tagging
import jieba.posseg as pseg
words = pseg.cut(f.read().strip())
for word in words:
    print word.word, word.flag  # the word and its POS tag (adjective, noun, ...)

# jieba TF-IDF
import jieba
from jieba import analyse
tf_idf = analyse.extract_tags
# sentence: text to extract from; topK: number of keywords to return; withWeight: whether to return weights
tags = jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
# By default jieba uses its own stop-word list and its own IDF corpus for this feature,
# so it is not precise enough for high-accuracy scenarios; override both as follows.
analyse.set_stop_words("stop_words.txt")  # set your own stop words
analyse.set_idf_path(file_name)           # set your own IDF corpus
keywords = tf_idf(text)


def stopWordsList(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


def seg_sentence(line):
    sentence_seg = jieba.cut_for_search(line.strip())
    stopwords = stopWordsList('/home/luyq/nlp/stopWords_ch.txt')
    outStr = ""
    for word in sentence_seg:
        if word not in stopwords:
            if word != '\t':
                outStr += word
                outStr += " "
    return outStr
#-*- coding:utf-8 -*-
from __future__ import print_function
from __future__ import division
import os
import sys

curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

import jieba.analyse as analyzer

JIEBA_ANALYZER_IDF = os.path.join(curdir, os.path.pardir, os.path.pardir,
                                  "resources", "similarity.vocab.idf")
JIEBA_ANALYZER_STOPWORDS = os.path.join(curdir, os.path.pardir, os.path.pardir,
                                        "resources", "jieba_ext", "stop_words.txt")
analyzer.set_idf_path(JIEBA_ANALYZER_IDF)
analyzer.set_stop_words(JIEBA_ANALYZER_STOPWORDS)


def keywords(content, topK=10, vendor="tfidf", title=None):
    words = []
    scores = []
    if vendor == 'tfidf':
        for x, y in analyzer.extract_tags(content, topK=topK, withWeight=True):
            words.append(x)
            scores.append(y)
    else:
        raise BaseException("Invalid vendor")
    return words, scores
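# A minimal usage sketch for the keywords() helper above, assuming the IDF and
# stop-word resources exist at the paths configured at import time; the sample
# sentence is illustrative.
words, scores = keywords("自然语言处理是人工智能的重要研究方向", topK=5)
for w, s in zip(words, scores):
    print(w, s)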
def self_define(self):
    jb_analyse.set_stop_words("../data/stopwords")
    jb_analyse.set_idf_path("../data/sk_tfidf.txt")