def load_doc_list():
    pynlpir.open()
    doc_list = os.listdir(SOURCE_DOC_DIR_PATH)
    segment_list = []
    for doc in doc_list:
        fr = codecs.open(SOURCE_DOC_DIR_PATH + doc, 'r', 'utf-8')
        line_list = fr.read()
        fr.close()
        '''
        line_list = line_list.split(NEW_LINE)
        line_list.pop()
        # seg_str = ''
        for i in range(len(line_list)):
            segment = pynlpir.segment(line_list[i], pos_tagging=False)
            seg_str = ''
            for seg in segment:
                seg_str += seg + ' '
            line_list[i] = seg_str.strip()
        # segment_list.append(' '.join(line_list))
        temp_str = ' '.join(line_list)
        '''
        key_word_list = pynlpir.get_key_words(line_list, max_words=10, weighted=True)
        for key_word in key_word_list:
            print(key_word[0], '\t', key_word[1])
    pynlpir.close()
    exit(0)
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        # whether to update the hot_word table
        update_hot_word = request.form.get('update_hot_word', type=str, default=UPDATE_HOT_WORD_DEFAULT)
        try:
            max_words = request.form.get('max_words', type=str, default=MAX_WORDS_DEFAULT)
            if max_words != '':  # a max_words parameter was supplied (may be the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except:  # failed to parse max_words; fall back to the default
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words', type=str, default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    # get key words
    if s == '':  # the document is empty, nothing to analyse
        return 'null'
    else:  # extract key words
        pynlpir.open()
        key_word_list = pynlpir.get_key_words(s, max_words=max_words, weighted=False)
        pynlpir.close()
        if update_hot_word == 'True':  # update the database in a new thread
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables,
                                 args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
def separateWordFromFile(fileName):
    pynlpir.open()
    file = open(fileName, 'r')
    lines = file.readlines()
    i = 0
    allSegmentResult = []
    # print type(s)
    label = []
    for line in lines:
        i = i + 1
        textsegment = line
        if textsegment == "\n":
            print "skip"
            continue
        # note:
        '''
        Converting gbk to utf-8 goes gbk --> unicode --> utf-8, i.e. two steps:
        1. gbk --> unicode: your_string.decode("gbk")
        2. unicode --> utf-8: your_string.decode("gbk").encode("utf-8")
        '''
        segmentResult = pynlpir.segment(textsegment, pos_tagging=True)
        newSegmentResult = removePunctuation(segmentResult)
        allSegmentResult.append(newSegmentResult)
    print len(allSegmentResult)
    file.close()
    pynlpir.close()
    # print label
    return allSegmentResult
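# Standalone sketch (not part of the original file) of the two-step gbk -> utf-8
# conversion described in the note above; the sample string is illustrative only.
raw = u'编码示例'.encode('gbk')        # bytes encoded as gbk
as_unicode = raw.decode('gbk')         # step 1: gbk -> unicode
as_utf8 = as_unicode.encode('utf-8')   # step 2: unicode -> utf-8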
def segment(self, sentence):
    # word segmentation
    pynlpir.open(license_code=")VhTW_9s02tDm")
    seg_list = pynlpir.segment(sentence)
    wordList = []
    for res in seg_list:
        wordList.append(res[0])
    return wordList
def wordSegmenter(sentence='', pathOfStopWords=''):
    """
    Segment the given sentence and remove stop words.
    :param sentence: the input sentence
    :param pathOfStopWords: path to the stop-word list
    :return: a space-separated string of the remaining tokens
    """
    # start the segmenter
    pynlpir.open()
    # segment
    seg_list = []
    for seg in pynlpir.segment(sentence):
        seg_list.append(seg[0])
    # remove stop words
    resultWords = []
    if pathOfStopWords == '':
        # no stop-word list given, fall back to the default one
        pathOfStopWords = path.join(ROOT, STOP_WORDS)
    f_stop = open(pathOfStopWords, 'rt', encoding='utf-8')
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()
    f_stop_words = f_stop_text.split("\n")
    for seg in seg_list:
        seg = seg.strip()
        if re.match(r'[a-zA-Z0-9]+', seg):
            # drop English words and digits
            continue
        if len(seg) > 0 and (seg not in f_stop_words):
            resultWords.append(seg)
    return " ".join(resultWords)
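# Hypothetical usage of wordSegmenter() above; the sentence and the stop-word
# file path are made up for illustration and are not part of the original project.
if __name__ == '__main__':
    print(wordSegmenter(u'今天天气很好，我们去公园散步', pathOfStopWords='stopwords.txt'))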
def GetKeyWorld(filePath):
    # extract key words with pynlpir's get_key_words
    # filePath = '/home/yuanzhu/Desktop/NewsData/20190603/20190603419.json'
    try:
        pr.open()
        # filePath = '/home/yuanzhu/Desktop/NewsData/20190501/20190501181.json'
        dicNews = GetDictFromJsonFile(filePath)
        content = dicNews['content']
        # segs = pr.segment(content)
        # for seg in segs:
        #     print(seg)
        # TF-IDF based key-word extraction (works reasonably well)
        tupkeywords = pr.get_key_words(content, weighted=True)
        keywords = []
        for i, w in enumerate(tupkeywords):
            keywords.append(w[0])
            if i == 9:  # keep at most the top 10 key words
                break
    except Exception as e:
        strLogErr = 'Get {} keyworld error :{}'.format(filePath, e)
        print(strLogErr)
        return None
    print("FilePath=", filePath)
    print('Hot key words:', keywords)
    return keywords
def __init__(self):
    jieba.initialize()
    self.ltpseg = pyltp.Segmentor()
    self.ltpseg.load('model/ltp_data_v3.4.0/cws.model')
    jiagu.init()
    self.thu1 = thulac.thulac(seg_only=True)
    pynlpir.open()
def train(self):
    # df_table = {"valid": {"science": 35, "physics": 34, "robot": 57}, "invalid": {"fat": 30, "large": 34, "cheap": 55}}
    # The number of articles containing "science", "physics" or "robot"
    # prior_table = {"valid": 183, "invalid": 244}
    pynlpir.open()
    prior_table = {ele: 0 for ele in self.category_list}
    posterior_table = {ele: dict() for ele in self.category_list}
    i = 0
    for sample in self.training_set_material:
        buffer = sample.split("\t")
        text = buffer[0]
        seg_words = pynlpir.segment(text, pos_tagging=False)
        words_set = set(seg_words)
        try:
            label = buffer[1]
        except:
            print("Line " + str(i) + " in training set corrupted")
            continue
        prior_table[label] += 1
        for word in words_set:  # all words in the text
            if word in posterior_table[label].keys():
                # posterior count +1 when this word already exists in posterior
                posterior_table[label][word] += 1
            else:
                # posterior count set to 1 when this word does not exist in posterior yet
                posterior_table[label][word] = 1
        i += 1
    return prior_table, posterior_table
def preprocess(filename):
    f_save = open('data/char_test.txt', 'w', encoding='utf-8')
    pynlpir.open()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            lst = line.rstrip().split(' ')
            for item in lst:
                c, t = item.split('/')
                if t == 'o':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        f_save.write(x + ' ' + 'O' + '\n')
                elif t == 'ns':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-LOC' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-LOC' + '\n')
                elif t == 'nt':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-ORG' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-ORG' + '\n')
                elif t == 'nr':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-PER' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-PER' + '\n')
            f_save.write('\n')
    f_save.close()
def splitFile(docName, encodingType):
    '''
    expected encoding of docName: encodingType
    function: segment the Chinese text of docName and return the tokens
    '''
    # everything is kept in memory -- ok? maybe write to files instead
    f = file(docName, 'r')
    pynlpir.open(encoding='utf-8')
    contest = []
    line = f.readline()
    cou = 0
    while line:
        line = line.strip()
        cou += 1
        try:
            line = line.decode(encodingType)
            if line.find(testChar) != -1:
                # skip the file header
                line = f.readline()
                continue
            temp = pynlpir.segment(line, pos_tagging=False)
            contest += temp
            line = f.readline()
        except:
            line = f.readline()
            # print '.'
            # print "err %s, %d" % (docName, cou)
    f.close()
    pynlpir.close()
    return contest
def word_segment():
    in_text = codecs.open('data/xuezhong.txt', 'r', encoding='UTF-8').read()
    pynlpir.open()
    # add user-defined dictionary entries
    nlpir.AddUserWord(c_char_p("徐骁".encode()))
    nlpir.AddUserWord(c_char_p("老怪物".encode()))
    nlpir.AddUserWord(c_char_p("徐渭熊".encode()))
    nlpir.AddUserWord(c_char_p("徐北枳".encode()))
    nlpir.AddUserWord(c_char_p("白狐儿脸".encode()))
    nlpir.AddUserWord(c_char_p("轩辕青锋".encode()))
    nlpir.AddUserWord(c_char_p("姜泥".encode()))
    nlpir.AddUserWord(c_char_p("大官子".encode()))
    nlpir.AddUserWord(c_char_p("北凉".encode()))
    nlpir.AddUserWord(c_char_p("小和尚".encode()))
    # segment the whole file
    nlpir.FileProcess('data/xuezhong.txt'.encode("utf-8"),
                      'data/xuezhong_seg_1.txt'.encode("utf-8"), False)
    key_words = pynlpir.get_key_words(in_text, max_words=100, weighted=True)
    pynlpir.close()
    print(key_words)
    print("segment finished")
def setUp(self):
    try:
        pynlpir.cli.update_license_file(DATA_DIR)
    except URLError:
        pass
    pynlpir.open()
def init(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
    with open(filename, encoding='GB18030') as file:
        filereader = csv.reader(file, dialect='excel-tab', quoting=csv.QUOTE_NONE)
        if not IsSegment:
            for item in filereader:
                self.userlist.append(item)
        else:
            pynlpir.open()
            if IsTraining:
                infoflag = 4
            else:
                infoflag = 1
            # count_test = 0
            for userquery in filereader:
                userdict = {}
                self.userinfo.append(userquery[:infoflag])
                for item in userquery[infoflag:]:
                    for word in pynlpir.segment(item, pos_tagging=False):
                        if word not in self.dict.keys():
                            self.dict[word] = 0
                        if word in userdict.keys():
                            userdict[word] += 1
                        else:
                            userdict[word] = 1
                self.userlist.append(userdict)
                # count_test += 1
                # if count_test > 100:
                #     break
            pynlpir.close()
    self.IsTraining = IsTraining
    self.IsSegment = IsSegment
    self.IsDF = False
def drive_start(tag_flag=True):
    targ_name = u'汇总词典.dic'
    all_dicts = {u'药物': 'MED', u'疾病': 'DIS', u'症状': 'SYM', u'手术检查': 'TRE'}
    ner_mm_rule.map_dict('./tag/dictionary', all_dicts, targ_name)
    pynlpir.open()
    if tag_flag:
        tag_ner()
def main(input_file, output_file):
    pynlpir.open()
    fw = open(output_file, 'w+', encoding='utf-8')
    pos2id = get_pos_map()
    data = read_corpus(input_file)
    for _sent, _tags in data:
        sent = ''.join(_sent)
        result = pynlpir.segment(sent, pos_tagging=True, pos_names='parent', pos_english=True)
        # print(result)
        i = 0
        for _word, _speech in result:
            for j in range(len(_word)):
                char = _word[j]
                speech = ''
                if _speech is None or _speech not in reserve_pos_list:
                    speech = 'O'
                else:
                    speech = '-'.join(_speech.split(' '))
                    if j == 0:
                        speech = 'B-' + speech
                    else:
                        speech = 'I-' + speech
                if i >= len(_tags):
                    print(i, len(_sent), _sent)
                fw.write(char + ' ' + _tags[i] + ' ' + speech + '\n')
                i += 1
        fw.write('\n')
    fw.close()
    pynlpir.close()
def get_tokenised_parts(self):
    pynlpir.open()
    for s in self.sentences:
        sen_parts = re.split('[?!.,。,?!]', s)
        for sen_part in sen_parts:
            tokens = pynlpir.segment(sen_part)
            yield tokens
def fenci(content):
    dict = {}
    # pr.open()
    # dicConf = GetDicConfig()
    # FilePath = dicConf['Testfilepath']
    # DicNews = GetDictFromJsonFile(FilePath)
    # content = DicNews['content']
    pr.open()
    segs = pr.segment(content, pos_english=False, pos_names='child')
    AllList = []
    NamedList = []
    OtherList = []
    for w, c in segs:
        if len(w) < 2:
            continue
        else:
            AllList.append(w)
            if c == '地名' or c == '人名':
                NamedList.append(w)
            else:
                OtherList.append(w)
    # print("NameList=", NamedList)
    # print('OtherList=', OtherList)
    # print('Alllist=', AllList)
    dict.update({'NameList': NamedList})
    dict.update({'OtherList': OtherList})
    dict.update({'AllList': AllList})
    pr.close()
    return dict
def seg2(curpus_path, seg_path, seg_test_path):
    pynlpir.open(encoding='gbk')
    check_dir_exist(seg_path)
    check_dir_exist(seg_test_path)
    cat_folders = os.listdir(curpus_path)
    i = 0
    for folder in cat_folders[::-1]:
        folds_path = os.path.join(curpus_path, folder)
        folds = os.listdir(folds_path)
        for fold in folds:
            files_path = os.path.join(folds_path, fold)
            files = os.listdir(files_path)
            for file in files:
                i += 1
                from_file = os.path.join(files_path, file)
                if i < 55000:
                    to_file = os.path.join(seg_path, str(i) + '.txt')
                elif i < 125000:
                    to_file = os.path.join(seg_test_path, str(i - 55000) + '.txt')
                else:
                    pynlpir.close()
                    return
                nlpir.FileProcess(from_file.encode('UTF-8'), to_file.encode('UTF-8'), True)
                content = readfile(to_file, encoding='gbk')
                pat = re.compile(u'\s+([\u4e00-\u9fa5]+)/n')
                result = pat.findall(str(content))
                write(to_file, ' '.join(result))
def open(conf=CONF_PATH):
    global CONF_PARSER
    if not CONF_PARSER:
        if not os.path.isfile(conf):
            raise IOError('Config file does not exist.')
        CONF_PARSER = ConfigParser()
        CONF_PARSER.read(conf)
    pynlpir.open()
def __init__(self):
    # load stopwords
    f = open('../data/stopwords.txt', 'r')
    lines = f.readlines(-1)
    for line in lines:
        self.stopwords.add(line.strip().decode('utf-8'))
    pynlpir.open()
def cut(data_list):
    """ Segment each string in data_list. """
    pynlpir.open()
    data_list = [pynlpir.segment(x) for x in data_list]
    pynlpir.close()
    return data_list
def wordSegment(file):
    pynlpir.open()
    des_filename = '/home/cc/Desktop/sentences.txt'
    with open(des_filename, 'w') as fout:
        for line in file:
            r = pynlpir.nlpir.ParagraphProcess(line, False)
            fout.write(r + '\n')
    pynlpir.close()
def main():
    # directory where results are saved
    result_dir = r'D:\semantic analysis\新结果\共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # centre words
    k_list = ['美好']
    # path of the jieba dictionary
    # jieba.set_dictionary("D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    # jieba.initialize()
    pynlpir.open()
    for key in k_list:
        print(key)
        pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))
    for key in k_list:
        print(key)
        # list of input files
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # create the output directories
        mk_dir(result_dir + key)
        # mk_dir(result_dir + key + '//w')
        mk_dir(result_dir + key + '//p')
        for n_file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + n_file)
            # drop duplicate sentences to avoid counting them twice
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            # build the co-occurrence network over all sentences
            # ps_list, mn, pps_list, pmn = create_matrix(s_list, key)
            pps_list, pmn = create_matrix(s_list, key)
            pkl_name = n_file[:-4] + '.pkl'
            # for w_list in ps_list:
            #     # build the network for the whole sentence
            #     mn.add_edges(w_list)
            # util.save_nw(mn.get_network(), result_dir + key + '//w//' + pkl_name)
            for w_list in pps_list:
                # pmn.add_edges(w_list)
                pmn.add_gram_edges(w_list)
            util.save_nw(pmn.get_network(), result_dir + key + '//p//' + pkl_name)
            print(n_file)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            with open(result_dir + key + '//record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
    pynlpir.close()
def segment(self):
    """
    fni:  str; input file name with path
    fno:  str; output file name with path
    lang: str; language code
    pos:  bool; POS tags included
    n:    int; no. of lines processed
    """
    import copy
    from PyQt5.QtWidgets import QApplication
    from opencc import OpenCC
    openCC = OpenCC('t2s')  # convert from Traditional to Simplified
    pynlpir.open(encoding="utf-8")
    print("Finished initializing ICTCLAS/NLPIR")
    count = lineCount(self.fni)
    fit = open(self.fni, "r", encoding="UTF-8")
    fot = open(self.fno, "w", encoding="UTF-8", newline="\n")
    sep = " "  # separator of Chinese tokens (space by default)
    n = 0
    for linet in fit:
        n += 1
        if linet.strip() == '':  # empty string
            fot.write("\n")
            continue
        lines = openCC.convert(linet.strip())
        # segment with optional POS tagging
        lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None)
        # The following segments the zht text according to the
        # segmentation patterns obtained from NLPIR above
        tokens = []    # list to hold 'words' of the segmented zht line
        pos_tags = []  # list to hold POS tags of the segmented words
        while len(lines_seg) > 0:        # loop until nothing is left in lines_seg
            t, p = lines_seg.pop(0)      # remove the leftmost zhs token and save it to t
            m = len(t)                   # no. of characters in the token
            tokens.append(linet[0:m])    # add the corresponding zht token to tokens[]
            pos_tags.append(p)
            linet = linet[m:]            # delete the token from the zht line (from the beginning of the string)
        # fot.write(sep.join(tokens) + "\n")  # write zht-seg output
        tok_pos = ["{}_{}".format(x, y) for x, y in zip(tokens, pos_tags)]  # list of tok_pos pairs
        fot.write(sep.join(tok_pos) + "\n")
        # if (n == 1): break
        if n % 50 == 0:
            self.window.ui.progressBar.setValue(round(100 * n / self.fi_linecount, 0))
            self.window.ui.progressBar.repaint()
            QApplication.processEvents()
    self.window.ui.progressBar.setValue(100)
    self.window.ui.progressBar.repaint()
    fit.close()
    fot.close()
    pynlpir.close()
    self.numLineProcessed = n
    return n
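# Standalone sketch (not part of the original class) of the alignment idea used above:
# segment the Simplified conversion, then slice the original Traditional string by each
# token's character length so the output keeps Traditional characters. Illustrative only.
import pynlpir
from opencc import OpenCC

zht = u'漢語自動分詞範例'
zhs = OpenCC('t2s').convert(zht)
pynlpir.open(encoding='utf-8')
aligned = []
for t, p in pynlpir.segment(zhs, pos_tagging=True, pos_names=None):
    aligned.append(zht[:len(t)])   # take the Traditional characters covering this token
    zht = zht[len(t):]             # consume them from the Traditional line
pynlpir.close()
print(' '.join(aligned))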
def get_words(sentence, query_id):
    # input: a sentence string; output: the list of words in that sentence
    pynlpir.open()
    print 'sentence : ' + str(sentence)
    try:
        sentence_words_list = pynlpir.segment(sentence, pos_tagging=False)
        return sentence_words_list
    except BaseException:
        return ['ERROR', str(query_id)]
def run(self):
    starttime = time.time()
    pynlpir.open()
    print('PyNLPIR:')
    print("".join([(x + '/ ') for x in pynlpir.segment(self.test_text, pos_tagging=False)
                   if x not in self.stopwords]))
    endtime = time.time()
    print('time cost:' + str(round((endtime - starttime), 4)) + ' seconds.\n')
def build_word_vectors(infile_name, outfile_name):
    print('building word vectors...')
    pynlpir.open()
    jieba.initialize()
    df = pd.read_json(infile_name)
    with open(outfile_name, 'w') as f:
        for content in tqdm(df.article_content):
            f.write(' '.join(word_tokenize(content)))
def nlpir_seg_txt(text):
    # segment with NLPIR and remove stop words
    pynlpir.open()
    # segment without POS tagging
    segments = pynlpir.segment(text, pos_tagging=False, pos_english=False)
    # drop stop words and single-character tokens
    remove_segments = [word for word in segments if word not in stopwords and len(word) > 1]
    pynlpir.close()
    return remove_segments
def extract_text_type(user, text, user_dict, entity_dict, keyword_dict, topic_word_list, all_topic_list):
    pynlpir.open()
    user_id = user_dict[user]
    entity_list = find_raw_entity(text, entity_dict)
    keyword_list = find_keyword(text, keyword_dict)
    topic_id = find_topic(text, topic_word_list, all_topic_list)
    pynlpir.close()
    return user_id, entity_list, keyword_list, topic_id
def segment(path='F:/Data/Chinese/chinese.json', json_path='F:/Data/Chinese/chinese_token.json'):
    """
    NLPIR segmentation + POS-based filtering + dropping items whose question or answers are empty
    :param path: path of the source data
    :param json_path: path where the result is saved
    :return:
    """
    # start the segmenter
    pynlpir.open()
    # keep only the text, segment it, and filter by POS:
    # keep words with the following POS tags, then strip the tags
    # see https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py for tag meanings
    word_filter = {
        'noun', 'time word', 'locative word', 'noun of locality', 'verb',
        'adjective', 'distinguishing word', 'status word', 'numeral'
    }
    # drop items whose segmentation is known to fail
    question_id_filter = {294118450, 300106271, 291834409}
    # read, process and write line by line to keep memory usage low
    count = 0
    with open(path, 'r') as f_in, open(json_path, 'w') as f_out:
        for line in f_in:
            q = json.loads(line)
            if q['question_id'] in question_id_filter:
                continue
            # remove embedded newlines, lowercase the text
            if '\n' in q['question']:
                print 'question:'
                print q['question']
                q['question'] = q['question'].replace('\n', ' ')
            q['question'] = [
                w[0] for w in pynlpir.segment(q['question'].lower())
                if w[1] in word_filter and w[0] != u''
            ]
            for a in q['answers']:
                # remove embedded newlines
                if '\n' in a['answer']:
                    print 'answer:'
                    print a['answer']
                    a['answer'] = a['answer'].replace('\n', ' ')
                a['answer'] = [
                    w[0] for w in pynlpir.segment(a['answer'].lower())
                    if w[1] in word_filter and w[0] != u''
                ]
            # drop empty answers
            q['answers'] = [a for a in q['answers'] if len(a['answer']) > 0]
            count = count + 1
            if count % 1000 == 0:
                print count
            # drop items whose question or answer list is empty
            if len(q['question']) > 0 and len(q['answers']) > 0:
                f_out.write(json.dumps(q))
                f_out.write('\n')
    pynlpir.close()
def word_segment(text):
    pynlpir.open()
    segments = nlpir.segment_pos(text)
    segment_result = []
    pos_result = []
    for segment in segments:
        segment_result.append(segment[0])
        pos_result.append(segment[1])
    pynlpir.close()
    return segment_result, pos_result
def nlpir_keywords(text, n):
    pynlpir.open()
    # print 'key-word test:\n'
    key_words = list(pynlpir.get_key_words(text, n, weighted=False))
    # for key_word in key_words:
    #     print key_word[0], '\t', key_word[1]
    pynlpir.close()
    print key_words
    return key_words
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    if len(keywords) == 0:
        pynlpir.close()
        return result
    for i in range(len(keywords)):
        keyword = keywords[i][0]
        result.append(keyword)
    pynlpir.close()
    return result
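# Standalone sketch (not part of the original module): pynlpir.get_key_words with
# weighted=True returns (word, weight) pairs, with weighted=False plain strings.
# The sample text is illustrative only.
import pynlpir

pynlpir.open()
text = u'自然语言处理是人工智能的一个重要方向'
print(pynlpir.get_key_words(text, max_words=5, weighted=True))   # e.g. [(word, weight), ...]
print(pynlpir.get_key_words(text, max_words=5, weighted=False))  # e.g. [word, ...]
pynlpir.close()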
def words_cixing(question, pos=1):
    # pos=1: include POS tags; otherwise segment without tagging
    pynlpir.open()
    if pos:
        pos1 = ['{}/{}'.format(k, v) for k, v in pynlpir.segment(question, pos_names=None, pos_tagging=pos)]
    else:
        pos0 = pynlpir.segment(question)
    pynlpir.close()
    if pos:
        return pos1
    else:
        return pos0
def query2words(self, query):
    words = []
    segs = query.split(' ')
    for s in segs:
        s = s.strip()  # may need further normalisation
        if s in self.vocab:
            words.append(s)  # already in the word2vec vocabulary
        else:
            pynlpir.open()
            # words.extend(pynlpir.get_key_words(query, max_words=3))
            word_segs = pynlpir.segment(query, pos_tagging=False)
            for word in word_segs:
                if word not in self.stop_list:
                    words.append(word)
    print(words)
    return words
def _part_document(self):
    pynlpir.open()
    docs = {}
    for dirname, dirnames, filenames in os.walk('dependence/new_data'):
        for filename in filenames:
            path = os.path.join(dirname, filename)
            text = ''
            with io.open(path, 'r', encoding='utf-8') as f:
                text = f.readline()
            words = pynlpir.segment(text, pos_tagging=False)
            clean_words = [w for w in words if w not in self.stop_list and len(w) > 1]
            index = filename[:6]
            docs[index] = clean_words
    dictionary = corpora.Dictionary(docs.values())
    corporas = {index: dictionary.doc2bow(docs[index]) for index in docs}
    return docs, dictionary, corporas
def test_license_auto_update(self):
    """Tests that the auto-update of the license works."""
    try:
        # switch the old license to the new one
        os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                  os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)))
        os.rename(os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)),
                  os.path.join(DATA_DIR, LICENSE_NAME))
        pynlpir.open()
        pynlpir.close()
    finally:
        # switch the license back
        os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                  os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)))
        os.rename(os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)),
                  os.path.join(DATA_DIR, LICENSE_NAME))
def predict(self, text):
    # words = [word1, word2, word3, ...]
    pynlpir.open()
    seg_words = pynlpir.segment(text, pos_tagging=False)
    words_set = set(seg_words)
    result = dict()
    for category in self.category_list:
        prob = self.comp_prop(category, words_set)
        result[category] = prob
    """
    buffer = [result[my_key] for my_key in result.keys()]
    score_sum = sum(buffer)
    # result = {my_key: result[my_key]/score_sum for my_key in result.keys()}
    """
    buffer = list(result.items())
    buffer.sort(key=lambda x: x[1], reverse=True)
    top_category = buffer[0][0]
    return top_category
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
    pynlpir.open()
    pynlpir.open(encoding='utf-8')
    seglist = pynlpir.segment(value)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield one token per segmented word via the generator
def document2sentences(self, document):
    pynlpir.open()
    words = pynlpir.segment(document, pos_tagging=False)
    sign = ['。', ';', '.', ';']
    pause_position = []
    for i in range(len(words)):
        if words[i] in sign:
            pause_position.append(i)
    setences = []
    if len(pause_position) == 0:
        clean_d = [s.strip() for s in words if s not in self.stop_list]
        setences.append(' '.join(clean_d) + '\n')
    else:
        for i in range(len(pause_position)):
            setence = []
            if i == 0:
                setence = words[:pause_position[i]]
            elif i == len(pause_position) - 1 and i != 0:
                break
            else:
                setence = words[pause_position[i]:pause_position[i + 1]]
            clean_s = [s.strip() for s in setence if s not in self.stop_list]
            setences.append(' '.join(clean_s) + '\n')
    return setences
def read_lexical_datas(file, compose_func=None):
    pynlpir.open()
    f = open(file, 'r', encoding='utf-8')
    tokens_list = [pynlpir.segment(line.rstrip('\n').replace('幺', '一'), pos_tagging=False)
                   for line in f]
    if compose_func is None:
        word_idx = {}
        for tokens in tokens_list:
            for token in tokens:
                if token not in word_idx:
                    word_idx[token] = len(word_idx)
        array = numpy.zeros([len(tokens_list), len(word_idx)])
        for i, tokens in enumerate(tokens_list):
            for token in tokens:
                array[i][word_idx[token]] = 1.0
    else:
        print('reading word vectors')
        word_vecs = word_vec.read_word_vec(r'../data/vectors_cbow')
        print('reading complete')
        array = numpy.asarray([compose_func(tokens, word_vecs) for tokens in tokens_list])
    return array
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
def extract_news_kws(hot_news):
    pynlpir.open()
    s = hot_news
    kw_list = pynlpir.segment(s, pos_tagging=True, pos_names=None)
    kws = ""
    for kw in kw_list:
        pos = kw[0]
        tagging = kw[1]
        try:
            if tagging:  # tagging is None when the token is a space character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    kws = kws.strip(u" ")
    return kws
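# Hypothetical call of extract_news_kws() above; the headline is illustrative only.
# The function keeps multi-character tokens whose raw NLPIR tag starts with 'n' (nouns).
print(extract_news_kws(u'气象台发布北京地区暴雨蓝色预警'))  # space-separated noun key words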
def part_sentence(stop_list):
    pynlpir.open()
    for dirname, dirnames, filenames in os.walk('dependence/ch_corporas/wiki/lost'):
        for filename in filenames:
            lines = []
            read_path = os.path.join(dirname, filename)
            rf = open(read_path, 'rb')
            print(filename)
            for line in rf:
                # detector.feed(byte)
                encoding = chardet.detect(line)['encoding']
                if encoding is None:
                    encoding = 'utf-8'
                new_line = line.decode(encoding, 'ignore')
                words = pynlpir.segment(new_line, pos_tagging=False)
                clean_words = [w.strip() for w in words if w not in stop_list]
                str_line = ' '.join(clean_words)
                if str_line:
                    lines.append(str_line + '\n')
            rf.close()
            write_path = os.path.join('dependence/ch_corporas/wiki_clean', filename)
            wf = open(write_path, 'w')
            wf.writelines(lines)
            wf.close()
def __init__(self, content, norm="l1_norm"):
    self.norm = norm
    pynlpir.open()
    words = pynlpir.segment(content, pos_tagging=True, pos_names=None)
    kws = ""
    for word in words:
        pos = word[0]
        tagging = word[1]
        try:
            if tagging:  # tagging is None when the token is a space character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    result = kws.split(" ")
    self.PoS = result
def tokenize(file):
    words = []
    pynlpir.open()
    directory = '\\resources\\original files\\htl_del_4000\\'
    posWords = codecs.open(directory + file + 'Words.txt', 'w+', 'utf-8')
    with codecs.open(directory + file + '.txt', 'r', 'utf-8') as posFile:
        for s in posFile.readlines():
            # print posFile.readline()
            a = pynlpir.segment(s, pos_tagging=False)
            # print a
            for i in range(len(a)):
                # print a[i]
                if i != (len(a) - 1):
                    # print 'i=' + str(i)
                    # print 'a=' + str(len(a))
                    posWords.write(a[i] + ' ')
                else:
                    posWords.write(a[i] + '\r')
            # for i in a:
            #     posWords.write(i + ';')
            # posWords.write('\0')
    posWords.close()
def post(self, request):
    obj_id = request.POST['obj_id']
    school = MySchool.objects.get(id=int(obj_id))
    feeds = []
    # weibo
    # App Key:802677147
    # App Secret:f75be23800d779cc9dbbf6b467b7ff61
    # Redirect url: https://api.weibo.com/oauth2/default.html
    # code: 4ccb7879bf204466b80e02c106d09727
    # read baidu
    params = {'keyword': school.name}
    # send a 3rd party service request
    baidu_consumer.delay(params)
    # read saved feeds
    feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
    content = loader.get_template(self.template_name)
    tieba_html = content.render(Context({
        'obj': school,
        'feeds': feeds,
    }))
    # hot topics
    pynlpir.open()  # must have this line!
    topics = feeds[:50]
    content = loader.get_template(self.newsticker_template_name)
    newsticker_html = content.render(Context({
        'objs': topics,
        'keywords': pynlpir.get_key_words(''.join([f.name + f.description for f in feeds]),
                                          max_words=50, weighted=True)
    }))
    pynlpir.close()
    return HttpResponse(json.dumps({'bd_html': tieba_html, 'news_html': newsticker_html}),
                        content_type='application/javascript')
import csv
import sys

import pynlpir

csv.field_size_limit(sys.maxsize)
pynlpir.open()

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
print "num imported: " + str(n)
n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
print "num imported: " + str(n)
def __init__(self, file):
    with open(file, 'r', encoding="utf-8") as f:
        content = f.read()
    pynlpir.open()
    result = pynlpir.segment(content, pos_tagging=False)
    self.PoS = result
def main(argv):
    csv.field_size_limit(sys.maxsize)
    pynlpir.open()

    # load sogou dictionaries
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/4.txt")
    print "num imported: " + str(n)

    # write out parsed words
    # files = ["week" + str(i) + ".csv" for i in range(4, 53)]
    # for fi in files:
    fi = "week" + str(argv[0]) + ".csv"
    print fi
    infile = "./../" + fi
    a = fi.split('.')
    outfile = "./../parsed/" + a[0] + "parsed.csv"
    f = open(infile, 'rb')
    of = open(outfile, 'wb')
    reader = csv.reader(f, delimiter=",")
    writer = csv.writer(of, delimiter=",", quotechar='|', quoting=csv.QUOTE_MINIMAL)
    count = 0
    total = sum(1 for row in reader)
    print total
    f.seek(0)
    errors = 0
    unbounderrors = 0
    for row in reader:
        mid = row[0]
        message = row[6]
        censored = None
        try:
            segmented = pynlpir.segment(message)
        except UnicodeDecodeError:
            errors += 1
            continue
        except UnboundLocalError:
            unbounderrors += 1
            print "what??"
            continue
        except:
            print "core dump...?"
            continue
        mString = ""
        for segment in segmented:
            mString += segment[0]
            mString += " "
        if row[10] != "":
            censored = 1
        else:
            censored = 0
        writer.writerow([mid, censored, mString.encode("utf-8")])
        # progress
        if count % 1000 == 0:
            print str(count) + "/" + str(total) + "\r",
            sys.stdout.flush()
        count += 1
    print "count: " + str(count)
    print "errors: " + str(errors)
    print "unbounderrors: " + str(unbounderrors)
def main():
    csv.field_size_limit(sys.maxsize)
    pynlpir.open()

    # load sogou dictionaries
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
    print "num imported: " + str(n)
    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/4.txt")
    print "num imported: " + str(n)

    # write out parsed words
    files = ["week" + str(i) + ".csv" for i in range(1, 53)]

    print "loading user data into map"
    userdict = dict()
    userdata = "./../userdata.csv"
    with open(userdata, "rb") as users:
        reader = csv.reader(users, delimiter=",")
        for row in reader:
            userdict[row[0]] = [row[1], row[2], row[3]]

    outfile = "full_uncensored_sample.csv"
    addedCount = 0
    for fi in files:
        # fi = "week" + str(argv[0]) + ".csv"
        print fi
        infile = "./../" + fi
        a = fi.split(".")
        f = open(infile, "rb")
        of = open(outfile, "a")
        reader = csv.reader(f, delimiter=",")
        writer = csv.writer(of, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        count = 0
        total = sum(1 for row in reader)
        print total
        f.seek(0)
        errors = 0
        unbounderrors = 0
        for row in reader:
            if count % 2421 == 0 and row[10] == "":
                mid = row[0]
                message = row[6]
                censored = 0
                try:
                    segmented = pynlpir.segment(message)
                except UnicodeDecodeError:
                    errors += 1
                    continue
                except UnboundLocalError:
                    unbounderrors += 1
                    print "what??"
                    continue
                except:
                    print "core dump...?"
                    continue
                mString = ""
                for segment in segmented:
                    mString += segment[0]
                    mString += " "
                try:
                    d = userdict[row[2]]
                except KeyError:
                    # print "no key for userid " + row[2]
                    continue
                except ValueError:
                    print "ValueError"
                    continue
                writer.writerow(
                    [mid, censored, mString.encode("utf-8"), row[1], row[2], row[5],
                     row[8], row[9], d[0], d[1], d[2]]
                )
                addedCount += 1
            # progress
            if count % 1000 == 0:
                print str(count) + "/" + str(total) + "\r",
                sys.stdout.flush()
            count += 1
        print "addedCount: " + str(addedCount)
        print "count: " + str(count)
        print "errors: " + str(errors)
        print "unbounderrors: " + str(unbounderrors)
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    doc_map1 = {}  # doc map (facilitates the calculation in PLSA)
    doc_map2 = {}  # not used
    user_set = set()  # all users
    doc_set = set()  # all documents
    doc_click_count = {}  # clicks on every document
    user_doc_click_count = {}  # clicks on a specific document from a specific user
    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map1 = open('data//doc_map1.csv', 'w')
        fp_doc_map2 = open('data//doc_map2.csv', 'w')
        fp_doc_click_count = open('data//doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
    cnt = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if doc_map1.has_key(word[1]) == False:
            doc_map1[word[1]] = cnt
            doc_map2[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                # title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
                fp_doc_set.write('%s\t%s' % (word[1], content_split_result))
    # doc_map = sorted(doc_map1.items(), key=lambda d: d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map1.items():
            fp_doc_map1.write('%s %d\n' % (d, dtag))
        for dtag, d in doc_map2.items():
            fp_doc_map2.write('%d %s\n' % (dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' % (d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' % (u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks
    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
    fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)
    print 'createDocMap end'
    return user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count
def import_userdict(file_dir):
    pynlpir.open()
    nlpir.import_userdict(file_dir)
    pynlpir.close()
def setUp(self):
    try:
        pynlpir.open()
    except pynlpir.LicenseError:
        pynlpir.cli.update_license_file(DATA_DIR)
        pynlpir.open()