def wordToVocabulary(self, originFile, vocabFile, segementFile):
    # stopwords = [i.strip() for i in open(self.stopwordsFile).readlines()]
    # print(stopwords)
    # exit()
    vocabulary = []
    sege = open(segementFile, "w")
    with open(originFile, 'r') as en:
        for sent in en.readlines():
            # strip punctuation
            if "enc" in segementFile:
                sentence = sent.strip()
                words = jieba.lcut(sentence)
                print(words)
            else:
                words = jieba.lcut(sent.strip())
            vocabulary.extend(words)
            for word in words:
                sege.write(word + " ")
            sege.write("\n")
    sege.close()

    # deduplicate and write into the vocabulary file
    vocab_file = open(vocabFile, "w")
    _vocabulary = list(set(vocabulary))
    _vocabulary.sort(key=vocabulary.index)
    _vocabulary = self.vocab + _vocabulary
    for index, word in enumerate(_vocabulary):
        vocab_file.write(word + "\n")
    vocab_file.close()
def search(request):
    if request.user.is_authenticated():
        if request.GET:
            input_word = request.GET["search"]
            seg_list = jieba.lcut(input_word, cut_all=True)
            result = []
            for art in Art.objects.all():
                art_name_seg = jieba.lcut(art.name, cut_all=True)
                for key in seg_list:
                    if any(key == name for name in art_name_seg):
                        result.append(art)
                    elif any(key == style for style in art.tag_set.all()[0].style):
                        result.append(art)
                    elif any(key == word for word in jieba.lcut(art.tag_set.all()[0].description, cut_all=True)):
                        result.append(art)
        if request.POST:
            name_collection = request.POST['getcollection']
            Collection.objects.get_or_create(collecter=request.user, defaults={'collecter': request.user})
            Art.objects.update_or_create(name=name_collection,
                                         defaults={'collecter': Collection.objects.filter(collecter=request.user)})
        return render_to_response('search.html', RequestContext(request, locals()))
    else:
        return HttpResponseRedirect("/login/")
def __init__(self):
    # load the jieba user.f_dict
    root_path = os.path.dirname(os.path.abspath(__file__))
    jieba.load_userdict(os.path.join(root_path, "f_dict/user.dict"))

    # get the positive f_corpus and length
    self.pos_doc_list = []
    with open(os.path.join(root_path, "f_corpus/waimai/positive_corpus_v1.txt"), encoding="utf-8") as pos_f:
        for line in pos_f:
            # self.pos_doc_list.append(list(set(jieba.lcut(line.strip()))))
            self.pos_doc_list.append(jieba.lcut(line.strip()))
    self.pos_doc_length = len(self.pos_doc_list)

    # get the negative f_corpus and length
    self.neg_doc_list = []
    with open(os.path.join(root_path, "f_corpus/waimai/negative_corpus_v1.txt"), encoding="utf-8") as pos_f:
        for line in pos_f:
            # self.neg_doc_list.append(list(set(jieba.lcut(line.strip()))))
            self.neg_doc_list.append(jieba.lcut(line.strip()))
    self.neg_doc_length = len(self.neg_doc_list)

    # define the variables about train/test numbers
    self.pos_train_num = 0
    self.neg_train_num = 0
    self.pos_test_num = 0
    self.neg_test_num = 0

    runout_content = "You are using the waimai f_corpus version 1.0.\n"
    runout_content += "There are total %d positive and %d negative f_corpus." % \
        (self.pos_doc_length, self.neg_doc_length)
    print(runout_content)
def preprocess():
    # Collect segmentation results for the positive class: the list pos_word_doc holds the words of each
    # positive document; the set pos_word_set holds the words of all positive documents.
    dir1 = os.curdir + "\\data\\Y-cut"
    files1 = os.listdir(dir1)
    pos_word_doc = list()
    pos_word_set = set()
    for name in files1:
        if name.endswith(".txt"):
            filename = dir1 + "\\" + name
            file = open(filename, "r")
            content = file.readlines()
            word_list = list()
            for line in content:
                line.decode("utf-8")
                seg_list = jieba.lcut(line, cut_all=False)
                word_list.extend(seg_list)
            word_set = set(word_list)
            pos_word_doc.extend(list(word_set))
            pos_word_set = pos_word_set | word_set
            file.close()
    # Collect segmentation results for the negative class: the list neg_word_doc holds the words of each
    # negative document; the set neg_word_set holds the words of all negative documents.
    dir2 = os.curdir + "\\data\\N-cut"
    files2 = os.listdir(dir2)
    neg_word_doc = list()
    neg_word_set = set()
    for name in files2:
        if name.endswith(".txt"):
            filename = dir2 + "\\" + name
            file = open(filename, "r")
            content = file.readlines()
            word_list = list()
            for line in content:
                line.decode("utf-8")
                seg_list = jieba.lcut(line, cut_all=False)
                word_list.extend(seg_list)
            word_set = set(word_list)
            neg_word_doc.extend(list(word_set))
            neg_word_set = neg_word_set | word_set
            file.close()
    # union of the words of all positive and negative documents
    all_word_set = pos_word_set | neg_word_set
    word_dict = dict()
    print(len(all_word_set))
    m = 0
    # Count how many positive / negative documents each word appears in, store the counts in
    # word_dict, and write them to dict.txt.
    for word in all_word_set:
        n1 = pos_word_doc.count(word)
        n2 = neg_word_doc.count(word)
        word_dict[word] = (n1, n2)
        m += 1
        if not (m % 100):
            print(m)
    out = open(os.curdir + "\\dict.txt", "w")
    for k in word_dict:
        out.write(k.encode("utf-8") + "\t" + str(word_dict[k][0]) + "\t" + str(word_dict[k][1]) + "\n")
    out.close()
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print check_neg(input_words)
    # if len(jieba.lcut(test_words[0])) < 2:
    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer, stop_words=stopwords,
                              n_features=100000, non_negative=True)
        test_data = v.fit_transform(test_words)
        print test_data
        return test_data, check_neg(input_words)
def onlinelearning(self, input_strs, target_strs):
    input_seg = jieba.lcut(input_strs)
    target_seg = jieba.lcut(target_strs)

    input_vec = []
    for word in input_seg:
        if word not in self.enc_vocab.keys():
            vec = self.add_voc(word, "enc")
        else:
            vec = self.enc_vocab.get(word)
        input_vec.append(vec)

    target_vec = []
    for word in target_seg:
        if word not in self.dec_vocab.keys():
            vec = self.add_voc(word, "dec")
        else:
            vec = self.dec_vocab.get(word)
        target_vec.append(vec)

    with tf.Session() as sess:
        # initialize variables or restore the latest checkpoint
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        if ckpt is not None:
            print(ckpt.model_checkpoint_path)
            self.model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        fd = self.data_iter([input_vec], [target_vec], 1, 1)

        for i in range(100):
            _, loss, _, _ = sess.run([self.model.train_op, self.model.loss,
                                      self.model.gradient_norms, self.model.updates], fd)

            checkpoint_path = self.model_path + "nlp_chat.ckpt"
            # save the model
            self.model.saver.save(sess, checkpoint_path, global_step=self.model.global_step)

            for i, (e_in, dt_pred) in enumerate(zip(
                    fd[self.model.decoder_targets].T,
                    sess.run(self.model.decoder_prediction_train, fd).T
            )):
                print(' sample {}:'.format(i + 1))
                print(' dec targets > {}'.format(e_in))
                print(' dec predict > {}'.format(dt_pred))
                if i >= 0:
                    break
def Participle(path):
    try:
        fp = open(path, "r")
        ad = fp.readline().strip('\n')
        na = fp.readline().strip('\n')
        ti = fp.readline().strip('\n')  # time
        si = fp.readline().strip('\n')
        cont = na + fp.read()
        fp.close()
    except IOError:
        return 0
    try:
        insi = {}
        insi['time'] = ti
        print(ti)
        insi['url'] = ad
        insi['title'] = na
        insi['site'] = si  # decode("gb2312").encode("utf-8")
        global fnum
        global segcont
        global doc
        seg_list = jieba.lcut(cont, cut_all=False)
        stline = ""
        for word in seg_list:
            if ((word in d) is False) and word != '\n':
                stline = stline + " " + word
        segcont.append(stline)
        print(str(fnum) + " 分词")
        doc[fnum] = insi
        fnum = fnum + 1
    except UnicodeError:
        return 0
def segment(sentence, cut_type='word', pos=False, None_flag='O'):
    """
    Tokenize a sentence.
    :param sentence: input text
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: enable POS tagging
    :param None_flag: the 'O' tag of the BIO scheme
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
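A minimal usage sketch for the segment() helper above; the sample sentence and variable names are illustrative assumptions, not taken from the source:

import jieba
import jieba.posseg as posseg

# word-level segmentation, no POS tags
words = segment("今天天气不错", cut_type='word', pos=False)
# character-level segmentation with one POS flag per character
chars, tags = segment("今天天气不错", cut_type='char', pos=True)
print(words)
print(list(zip(chars, tags)))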
def cut_to_lists(filein, filein_name):
    """
    Segment each line and write the token list to a file.
    Duplicates are removed after segmentation.
    """
    t = re.compile('\t')
    time_sep = re.compile('-')
    temp = []
    fileout = open(filein_name[:-4] + 'cut.txt', 'w', encoding='utf-8')
    for line in filein:
        # write number of users as demand in num_user
        '''(uid, mid, time, forward_count, comment_count, like_count, content) = t.split(line)
        forward_count = int(forward_count)
        comment_count = int(comment_count)
        like_count = int(like_count)'''
        (uid, mid, time, content) = t.split(line)  # for predict data
        cut_list = jieba.lcut(content)
        content = ' '.join(cut_list)
        ''' remove duplicates
        cut_list_no_dup = []  # remove duplicates
        for i in cut_list:
            if i not in cut_list_no_dup:
                cut_list_no_dup.append(i)
        '''
        fileout.write(json.dumps([uid, mid, time, cut_list]) + '\n')
def findinFiles(filename, samples):
    # use TF-IDF
    dictionary = filetoDict(filename, dictpath)
    for file in filename:
        lines = cutlines(file)
        words = cutwords(file)
        corpus = [dictionary.doc2bow(word) for word in words]
        # convert to TF-IDF
        # class gensim.models.tfidfmodel.TfidfModel(corpus=None, id2word=None, dictionary=None,
        #     wlocal=<function identity>, wglobal=<function df2idf>, normalize=True)
        # build the model
        tfidf = models.TfidfModel(corpus)
        # apply the TfidfModel to the corpus
        corpus_tfidf = tfidf[corpus]
        # inspect the result
        # for doc in corpus_tfidf:
        #     print doc
        # similarity query
        index = similarities.MatrixSimilarity(corpus_tfidf)
        # convert the query document to TF-IDF
        sample_tfidf = tfidf[dictionary.doc2bow(jieba.lcut(sample, cut_all=False))]
        sims = index[sample_tfidf]
        # print list(enumerate(sims))
        # sort and output; the query document is most similar to documents 0, 7, 2 and 3
        sims = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
        # print sims
        # print sims[0]
        (x, y) = sims[0]
        # print file
        return lines[x]
def result_by_BM25(self, sentence):
    seg_list = jieba.lcut(sentence, cut_all=False)
    n, cleaned_dict = self.clean_list(seg_list)
    BM25_scores = {}
    for term in cleaned_dict.keys():
        r = self.fetch_from_db(term)
        if r is None:
            continue
        df = r[1]
        w = math.log2((self.N - df + 0.5) / (df + 0.5))
        docs = r[2].split('\n')
        for doc in docs:
            docid, date_time, tf, ld = doc.split('\t')
            docid = int(docid)
            tf = int(tf)
            ld = int(ld)
            s = (self.K1 * tf * w) / (tf + self.K1 * (1 - self.B + self.B * ld / self.AVG_L))
            if docid in BM25_scores:
                BM25_scores[docid] = BM25_scores[docid] + s
            else:
                BM25_scores[docid] = s
    BM25_scores = sorted(BM25_scores.items(), key=operator.itemgetter(1))
    BM25_scores.reverse()
    if len(BM25_scores) == 0:
        return 0, []
    else:
        return 1, BM25_scores
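For reference, the per-term contribution accumulated in result_by_BM25 above follows the standard BM25 form; a small standalone sketch of the same formula (the function name and the default k1/b values are assumptions for illustration):

import math

def bm25_term_score(tf, df, ld, N, avg_l, k1=1.5, b=0.75):
    # IDF weight, as computed above: log2((N - df + 0.5) / (df + 0.5))
    w = math.log2((N - df + 0.5) / (df + 0.5))
    # term-frequency saturation with document-length normalization
    return (k1 * tf * w) / (tf + k1 * (1 - b + b * ld / avg_l))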
def __clean_content(raw):
    """
    Clean the text.
    :param raw:
    :return:
    """
    # filter whitespace characters
    # print raw
    blank_re = re.compile(r"\s+")
    temp_raw = map(lambda x: x[1:], raw)
    # print temp_raw
    texts_raw = [map(lambda x: blank_re.split(x), ele) for ele in temp_raw]
    # return texts_raw
    # segment
    texts_splited = [[reduce(lambda x, y: x + jieba.lcut(y), text, []) for text in temp_list]
                     for temp_list in texts_raw]
    # filter stop words
    texts_filtered = [[[word for word in text if word not in data.stopwords] for text in temp_filt]
                      for temp_filt in texts_splited]
    # filter low-frequency words
    # all_words = sum(texts_filtered, [])
    # words_once = {word for word in set(all_words) if all_words.count(word) == 1}
    # texts = [[word for word in text if word not in words_once] for text in texts_filtered]
    return texts_filtered
def format_text(args, stop_words):
    for line in args:
        temp = map(etl, jieba.lcut(line[2].lower()))
        yield filter(lambda word: (len(word)) > 0 and (word not in stop_words), temp)
    cursor.close()
def get_data_and_write(filein, fileout):
    """Read the data line by line with readline, unpack it,
    and write it out with the appropriate parameters.
    """
    t = re.compile('\t')
    time_sep = re.compile('-')
    temp = []
    for line in filein:
        # write number of users as demand in num_user
        (uid, mid, time, forward_count, comment_count, like_count, content) = t.split(line)
        # yyyy, mm, dd = time_sep.split(time)
        # print(yyyy, mm, dd)
        forward_count = int(forward_count)
        comment_count = int(comment_count)
        like_count = int(like_count)
        cut_list = jieba.lcut(content)
        # print(cut_list)
        if forward_count == 0 and comment_count == 0 and like_count == 0:
            fileout000.write('000 ' + ' '.join(cut_list))
        else:
            if forward_count != 0:
                fileout100.write('100 ' + ' '.join(cut_list))
            if comment_count != 0:
                fileout010.write('010 ' + ' '.join(cut_list))
            if like_count != 0:
                fileout001.write('001 ' + ' '.join(cut_list))
def checking_sentiment(_text):
    """
    Extract sentiment words from the input string and judge its polarity.
    Returns a tuple of (positive words, number of positive words,
    negative words, number of negative words, polarity verdict).
    :param _text:
    """
    def good_or_bad(lg, lb, score_ok):
        gi = len(lg)
        bi = len(lb)
        if gi == 0 and bi == 0:
            return 0
        elif gi == 0 and bi != 0:
            return -1
        elif gi != 0 and bi == 0:
            return 1
        elif abs(gi - bi) <= score_ok:
            return 0
        elif gi > bi - score_ok:
            return 1
        else:
            return -1

    g = list()  # positive
    b = list()  # negative
    fenchi = jieba.lcut(_text)
    for c in fenchi:
        chi = c.encode('utf8')
        if _sentiment_dict.get(chi) == '1':
            g.append(chi)
        elif _sentiment_dict.get(chi) == '-1':
            b.append(chi)
    return ' '.join(g), len(g), ' '.join(b), len(b), good_or_bad(g, b, 1)
def result_by_hot(self, sentence):
    seg_list = jieba.lcut(sentence, cut_all=False)
    n, cleaned_dict = self.clean_list(seg_list)
    hot_scores = {}
    for term in cleaned_dict.keys():
        r = self.fetch_from_db(term)
        if r is None:
            continue
        df = r[1]
        w = math.log2((self.N - df + 0.5) / (df + 0.5))
        docs = r[2].split('\n')
        for doc in docs:
            docid, date_time, tf, ld = doc.split('\t')
            docid = int(docid)
            tf = int(tf)
            ld = int(ld)
            news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
            now_datetime = datetime.now()
            td = now_datetime - news_datetime
            BM25_score = (self.K1 * tf * w) / (tf + self.K1 * (1 - self.B + self.B * ld / self.AVG_L))
            td = (timedelta.total_seconds(td) / 3600)  # hours
            hot_score = math.log(BM25_score) + 1 / td
            if docid in hot_scores:
                hot_scores[docid] = hot_scores[docid] + hot_score
            else:
                hot_scores[docid] = hot_score
    hot_scores = sorted(hot_scores.items(), key=operator.itemgetter(1))
    hot_scores.reverse()
    if len(hot_scores) == 0:
        return 0, []
    else:
        return 1, hot_scores
def cutwords(filename):
    lines = cutlines(filename)
    print lines
    words = []
    for line in lines:
        words.append(jieba.lcut(line, cut_all=False))
    return words
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"),
                          "".decode("utf8"), sentence)
        yield list(jieba.lcut(sentence))
def construct_postings_lists(self):
    config = configparser.ConfigParser()
    config.read(self.config_path, self.config_encoding)
    files = listdir(config['DEFAULT']['doc_dir_path'])
    AVG_L = 0
    for i in files:
        root = ET.parse(config['DEFAULT']['doc_dir_path'] + i).getroot()
        title = root.find('title').text
        body = root.find('body').text
        docid = int(root.find('id').text)
        date_time = root.find('datetime').text
        seg_list = jieba.lcut(title + '。' + body, cut_all=False)
        ld, cleaned_dict = self.clean_list(seg_list)
        AVG_L = AVG_L + ld
        for key, value in cleaned_dict.items():
            d = Doc(docid, date_time, value, ld)
            if key in self.postings_lists:
                self.postings_lists[key][0] = self.postings_lists[key][0] + 1  # df++
                self.postings_lists[key][1].append(d)
            else:
                self.postings_lists[key] = [1, [d]]  # [df, [Doc]]
    AVG_L = AVG_L / len(files)
    config.set('DEFAULT', 'N', str(len(files)))
    config.set('DEFAULT', 'avg_l', str(AVG_L))
    with open(self.config_path, 'w', encoding=self.config_encoding) as configfile:
        config.write(configfile)
    self.write_postings_to_db(config['DEFAULT']['db_path'])
def gen_idf_file(self):
    news_data = []
    with open(self.doc_dir_path, encoding='utf-8') as f:
        for line in f:
            news_data.append(json.loads(line))
    n = float(len(news_data))
    idf = {}
    for i in range(0, len(news_data)):
        keyword = ' '.join(news_data[i]['keyword'])
        title = news_data[i]['title']
        body = news_data[i]['content']
        seg_list = jieba.lcut(title + '。' + keyword + body, cut_all=False)
        seg_list = set(seg_list) - self.stop_words  # remove stop words
        for word in seg_list:
            word = word.strip().lower()
            if word == '' or self.is_number(word):
                continue
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] = idf[word] + 1
    idf_file = open(self.idf_path, 'w', encoding='utf-8')
    for word, df in idf.items():
        idf_file.write('%s %.9f\n' % (word, math.log(n / df)))
    idf_file.close()
def preprocess(in_filename, out_filename):
    in_file = open(in_filename, 'r')
    out_file = open(out_filename, 'a')
    while True:
        line = in_file.readline()
        if not line:
            break
        line = line.strip()
        if not line:
            continue
        # split by punctuation
        # sentences = re.split('《', line)
        # sentences = re.split('。|?|!|\.|-|:| |(|)', line)
        sentence = line
        # for sentence in sentences:
        #     if not sentence:
        #         continue
        cut_res = jieba.lcut(sentence, cut_all=True)
        remove_stopwords(cut_res)
        if not cut_res:
            continue
        new_line = ' '.join(cut_res) + '\n'
        out_file.write(new_line)
    in_file.close()
    out_file.close()
def spritz(language="chinese"):
    if language == 'english':
        f = open('./app/static/file/article_eg.txt', 'r')
        article = f.read()
        f.close()
        paragraghs = article.split("\n")
        words = []
        for para in paragraghs:
            words.extend(para.split(' '))
        return render_template('spritz.html', paragraghs=paragraghs, words=words,
                               length=len(words), version='en')
    if language == 'chinese':
        f = open('./app/static/file/article_ch.txt', 'r')
        article = f.read()
        f.close()
        article = article.decode('utf-8')
        paragraghs = article.split("\n")
        re_attach_article = ""
        for p in paragraghs:
            re_attach_article += p
        words = jieba.lcut(re_attach_article)
        return render_template('spritz.html', paragraghs=paragraghs, words=words,
                               length=len(words), version='ch')
    if language == 'chinese_led':
        f = open('./app/static/file/article_ch.txt', 'r')
        article = f.read()
        f.close()
        clean_article = article.replace(' ', '')
        clean_article = clean_article.replace('\n', '')
        clean_article = clean_article.decode('utf-8')
        article = article.decode('utf-8')
        paragraghs = article.split("\n")
        words = list(clean_article)
        return render_template('spritz.html', paragraghs=paragraghs, words=words,
                               length=len(words), version='led')
def filter_word_by_jieba(text):
    """Use word segmentation to filter out common words such as modal particles."""
    tmp_segs = jieba.lcut(text, cut_all=False, HMM=False)
    ssegs, tmp = [], []
    for seg in tmp_segs:
        if seg == ".":
            ssegs.append(tmp)
            tmp = []
        else:
            tmp.append(seg)
    data = []

    def part_by_part(segs):
        tmp = ""
        for idx, seg in enumerate(segs):
            if len(seg) == 1:
                tmp += seg
                if (idx == len(segs) - 1 or len(segs[idx + 1]) > 1) and len(tmp) > 1:
                    data.append(tmp)
                    tmp = ""

    for ss in ssegs:
        part_by_part(ss)
    return data
def rewrite2wordlist(inputFileName, outputFileName, haslabel=True, maxLine=-1):
    fin = codecs.open(inputFileName, 'r', 'utf-8')
    fout = codecs.open(outputFileName, 'w', 'utf-8')
    if haslabel:
        foutlb = codecs.open(outputFileName + u'.lb', 'w', 'utf-8')
    import jieba
    lcount = 0
    while True:
        line = fin.readline()
        if not line:
            break
        line = line.strip()
        sentance = line
        if haslabel:
            lb = line.split(u' ')[0]
            sentance = u' '.join(line.split(u' ')[1:])
        while u'  ' in sentance:
            sentance = sentance.replace(u'  ', u' ')
        if sentance.endswith(u'None'):
            continue
        if len(sentance) < 4:
            continue
        fout.write(lb + u' ' + u' '.join(jieba.lcut(sentance)) + u'\n')
        if haslabel:
            foutlb.write(lb + u'\n')
        lcount += 1
        if lcount % 5000 == 0:
            print lcount
        if maxLine != -1:
            if lcount > maxLine:
                break
    fin.close()
    fout.close()
    if haslabel:
        foutlb.close()
def load_data_and_multilabels():
    """ 5 kinds of company name """
    # Load data from files
    positive_examples = list(open("normal.csv").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("rubbish.csv").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    invest_examples = list(open("invest.csv").readlines())
    invest_examples = [s.strip() for s in invest_examples]
    sci_examples = list(open("science.csv").readlines())
    sci_examples = [s.strip() for s in sci_examples]
    serv_examples = list(open("service.csv").readlines())
    serv_examples = [s.strip() for s in serv_examples]
    # Split by words
    x_text = positive_examples + negative_examples + invest_examples + sci_examples + serv_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [jieba.lcut(s) for s in x_text]
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    invest_labels = [2 for _ in invest_examples]
    sci_labels = [3 for _ in sci_examples]
    serv_labels = [4 for _ in serv_examples]
    y = np.concatenate([positive_labels, negative_labels, invest_labels, sci_labels, serv_labels], 0)
    return [x_text, y]
def tokenizer(text):
    '''
    Simple Parser converting each document to lower-case, then removing
    the breaks for new lines and finally splitting on the whitespace
    '''
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text
def jieba_example():
    raw = "我爱北京S5天安门(,123,三,四"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
def get_phrase_answers(self, answers):
    """Get phrase answers and append them to the overall list of answers"""
    if self.query["phrase"]:
        for phrase in self.query["phrase"]:
            phlist = jieba.lcut(phrase, cut_all=False)
            for term in phlist:
                answers.append(term)
    return answers
def String_make_corpus(text):
    corpus = ""
    if isinstance(text, basestring):
        words_seg = jieba.lcut(text)
        for i in range(len(words_seg)):
            words_seg[i] = words_seg[i].encode('utf-8')
        corpus = ' '.join(words_seg)
    return corpus
def get_boolean_answers(self, answers):
    """Get boolean answers and append them to the overall list of answers"""
    if self.query["bool"]:
        for sentence in self.query["bool"]:
            boolist = jieba.lcut(sentence, cut_all=False)
            for term in boolist:
                answers.append(term)
    return answers
    v = np.zeros(200)
    for word in words:
        v += model[word]
    v /= len(words)
    return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))


# sentence1 = "抢劫"
sentence1 = "渔船用5厘米尺寸的渔网捕鱼"
sentence2 = "使用炸鱼、毒鱼、电鱼等破坏渔业资源方法进行捕捞"
## sentence2 = sentence2.replace(',', '').replace('。', '').replace('?', '').replace('!', '') \
##     .replace('“', '').replace('”', '').replace(':', '').replace('…', '').replace('(', '').replace(')', '') \
##     .replace('—', '').replace('《', '').replace('》', '').replace('、', '').replace('‘', '') \
##     .replace('’', '').replace(' ', '')  # strip punctuation
sentence3 = "抢夺公私财物"
sentence4 = "使用小于最小网目尺寸的网具进行捕捞"

words = jieba.lcut(sentence3, cut_all=False)
# print(words)
##
## words2 = [word for word in words if word not in "、|,"]
## print(words2)

print(vector_similarity(sentence1, sentence2))
print(vector_similarity(sentence1, sentence3))
print(vector_similarity(sentence1, sentence4))
# Example 6-1: segment the sentence "2018年世界杯小组赛抽签在莫斯科克里姆林宫举行".
import jieba
print(jieba.lcut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行"))
print(jieba.lcut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行", cut_all=True))
print(jieba.lcut_for_search("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行"))

# Example 6-2: show the part-of-speech tag of each word from Example 6-1.
import jieba.posseg as pseg
words = pseg.cut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行")
for word, tag in words:
    print('word:{}, tag:{}'.format(word, tag))
# GovRptWordCloudv2T1.py
import matplotlib.pyplot as plt
from PIL import Image
import jieba
import wordcloud
# from scipy.misc import imread
import imageio

mask = imageio.imread("fivestart.png")
excludes = {}
f = open("新时代中国特色社会主义.txt", "r", encoding="UTF-8")
t = f.read()
f.close()
ls = jieba.lcut(t)
txt = " ".join(ls)
w = wordcloud.WordCloud(width=1000, height=700,
                        background_color="white",
                        font_path="Hiragino Sans GB.ttc",
                        mask=mask)
w.generate(txt)
w.to_file("grwordcloud.png")
plt.im = Image.open("grwordcloud.png")
plt.im.show()
def predict_one(s):
    s = clean_text(s)
    s = np.array(doc2num(jieba.lcut(s), MAX_SEQUENCE_LENGTH))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s, verbose=0)[0][0]
def fenci(x):
    words = jieba.lcut(x)
    return " ".join(words)
import jieba
s = "世界冠军运动员的乒乓球拍卖完了"
ls = jieba.lcut(s, True)
print(ls)
def clean_cut(s):
    # Clean the text, segment it, and join with spaces so it can be turned into a TF-IDF vector.
    # During cleaning some texts were found to still contain a literal "\\n", which is removed as well.
    return ' '.join(jieba.lcut(re.sub('[\r\n\u3000]', '', s).replace('\\n', '')))
def test_process_hits(self, sess, data, args):
    with open(os.path.join(args.datapath, 'test_distractors.json'), 'r', encoding='utf8') as f:
        test_distractors = json.load(f)

    data.restart("test", batch_size=1, shuffle=False)
    batched_data = data.get_next_batch("test")

    loss_record = []
    cnt = 0
    while batched_data != None:
        for key in batched_data:
            if isinstance(batched_data[key], np.ndarray):
                batched_data[key] = batched_data[key].tolist()

        batched_data['resp_length'] = [len(batched_data['resp'][0])]
        for each_resp in test_distractors[cnt]:
            batched_data['resp'].append(
                [data.go_id] + data.convert_tokens_to_ids(jieba.lcut(each_resp)) + [data.eos_id])
            batched_data['resp_length'].append(len(batched_data['resp'][-1]))
        max_length = max(batched_data['resp_length'])
        resp = np.zeros((len(batched_data['resp']), max_length), dtype=int)
        for i, each_resp in enumerate(batched_data['resp']):
            resp[i, :len(each_resp)] = each_resp
        batched_data['resp'] = resp

        post = []
        post_length = []
        prev_length = []
        kg = []
        kg_h_length = []
        kg_hr_length = []
        kg_hrt_length = []
        kg_index = []
        for _ in range(len(resp)):
            post += batched_data['post']
            post_length += batched_data['post_length']
            prev_length += batched_data['prev_length']
            kg += batched_data['kg']
            kg_h_length += batched_data['kg_h_length']
            kg_hr_length += batched_data['kg_hr_length']
            kg_hrt_length += batched_data['kg_hrt_length']
            kg_index += batched_data['kg_index']
        batched_data['post'] = post
        batched_data['post_length'] = post_length
        batched_data['prev_length'] = prev_length
        batched_data['kg'] = kg
        batched_data['kg_h_length'] = kg_h_length
        batched_data['kg_hr_length'] = kg_hr_length
        batched_data['kg_hrt_length'] = kg_hrt_length
        batched_data['kg_index'] = kg_index

        _, _, loss, _, _ = self.inference(sess, batched_data, lamb=args.lamb)
        loss_record.append(loss)
        cnt += 1
        batched_data = data.get_next_batch("test")

    assert cnt == len(test_distractors)

    loss = np.array(loss_record)
    loss_rank = np.argsort(loss, axis=1)
    hits1 = float(np.mean(loss_rank[:, 0] == 0))
    hits3 = float(np.mean(np.min(loss_rank[:, :3], axis=1) == 0))
    hits5 = float(np.mean(np.min(loss_rank[:, :5], axis=1) == 0))
    return {'hits@1': hits1, 'hits@3': hits3, 'hits@5': hits5}
def cut_sentences(self, text):
    """Split the text into sentences, then segment each sentence."""
    sentences = re.findall(".*?[。?!]", text)
    cut_sentences = [jieba.lcut(sent) for sent in sentences]
    return cut_sentences
import jieba
from pyecharts import options as opts
from pyecharts.charts import WordCloud

story = '''
实打实打算阿斯顿阿三爱上倒是都i阿松嗲送i的厚爱收到安抚和速度放缓阿斯顿法海速度回复哎u收到饭后阿萨的哈佛i啊u是和覅u送达和覅u阿斯顿哈佛i撒旦和覅说的话弗兰克世界大会风口浪尖撒旦和李开复盛大开放哈桑了艰苦奋斗
'''
words = jieba.lcut(story)
counts = {}
for w in words:
    if len(w) == 1:
        continue
    else:
        counts[w] = counts.get(w, 0) + 1
items = list(counts.items())
words = [(k, v) for (k, v) in items]

(
    WordCloud()
    .add(series_name="MyLove", data_pair=words, word_size_range=[6, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="MyLove",
            title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    .render("MyLove.html")
)
def tokenize(sequence):
    word_list = jieba.lcut(sequence)
    return word_list
'''
wordcloud
pip3 install wordcloud

w = wordcloud.WordCloud()
w = wordcloud.WordCloud(width=400)
w = wordcloud.WordCloud(height=200)
w = wordcloud.WordCloud(min_font_size=4)
w = wordcloud.WordCloud(max_font_size)
w = wordcloud.WordCloud(font_size=1)
w = wordcloud.WordCloud(font_path=None)
w = wordcloud.WordCloud(max_words=200)
w = wordcloud.WordCloud(stop_words={""})

mask
from scipy.misc import imread
mk = imread("pic.png")
w = wordcloud.WordCloud(mask=mk)

w = wordcloud.WordCloud(background_color="black")
w.generate(txt)
w.to_file(filename)  # .png .jpg
'''
import jieba
import wordcloud

txt = "程序设计语言是计算机能够理解和识别用户操作意图的一种交互体系,它按照特定规则组织计算机指令,使计算机能够自动进行各种运算处理"
w = wordcloud.WordCloud(width=1000, height=700, font_path="msyh.ttf")
w.generate(' '.join(jieba.lcut(txt)))
w.to_file("pywordcloud.png")
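As a follow-up to the parameter notes in the block above, a hedged sketch of the mask usage they mention; the file names are placeholders, and imageio.imread stands in for scipy.misc.imread (which recent SciPy releases no longer provide), as in the GovRptWordCloudv2T1.py example earlier:

import jieba
import wordcloud
import imageio

mk = imageio.imread("pic.png")  # placeholder mask image
txt = ' '.join(jieba.lcut("程序设计语言是计算机能够理解和识别用户操作意图的一种交互体系"))
w = wordcloud.WordCloud(mask=mk, background_color="white",
                        font_path="msyh.ttf", max_words=200)
w.generate(txt)
w.to_file("pywordcloud_mask.png")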
# with open("6第六周\沉默的羔羊.txt", "r", encoding="utf-8") as f:
#     txt = f.read()
#     return txt
# words = jieba.lcut(function())
# counts = {}
# for x in words:
#     if len(x) > 2:
#         counts[x] = counts.get(x, 0) + 1
# items = list(counts.items())
# items.sort(key=lambda x: x[1], reverse=True)
# for i in range(1):
#     word, count = items[i]
#     print("{}".format(word))

import jieba

f = open("6第六周\沉默的羔羊.txt", "r", encoding="utf-8")
ls = jieba.lcut(f.read())
# ls = f.read().split()
d = {}
for w in ls:
    d[w] = d.get(w, 0) + 1
maxc = 0
maxw = ""
for k in d:
    if d[k] > maxc and len(k) > 2:
        maxc = d[k]
        maxw = k
    if d[k] == maxc and len(k) > 2 and k > maxw:
        maxw = k
print(maxw)
f.close()
# -*- coding: utf-8 -*-
"""
Project: PyLib
Author: Jarod
Create time: 2020-04-15 12:02
IDE: PyCharm
Introduction:
"""
import jieba
import wordcloud

'''
txt = "life is short, you need python"
w = wordcloud.WordCloud(background_color="white")
w.generate(txt)
w.to_file("pywordcloud.png")
'''

txt = "程序设计语言是计算机能顾理解和识别用户操作意图的一种交互体系,它按照特定规则组织计算机指令,是计算机能够自动运行各种运算处理。"
w = wordcloud.WordCloud(width=1000, height=700, font_path="msyh.ttc")
w.generate(" ".join(jieba.lcut(txt)))
w.to_file("pywordcloud.png")
_context, _trigger, _object, _subject, _time, _location = \
    item[1].replace("-", "-").replace("~", "~"), item[2], item[3].replace("-", "-"), item[4].replace("-", "-"), \
    item[5].replace("-", "-"), item[6].replace("-", "-")
# Special-case handling (only the training set has this: replace every full-width digit variant 0~9)
for i in range(10):
    r_c = chr(65296 + i)
    _context = _context.replace(r_c, "%d" % i)
    _trigger = _trigger.replace(r_c, "%d" % i)
    _object = _object.replace(r_c, "%d" % i)
    _subject = _subject.replace(r_c, "%d" % i)
    _time = _time.replace(r_c, "%d" % i)
    _location = _location.replace(r_c, "%d" % i)

trigger_index = len(obj)
# process the triggers first
x = list(jieba.tokenize(_context))  # tokens with character offsets
y = jieba.lcut(_context)            # plain token sequence
assert len(x) == len(y)

overlap_flag = False
__context = ""
overlap_index = -1
for i in range(trigger_index):
    if obj[trigger_index - 1 - i]["trigger"] == _trigger or _trigger in obj[trigger_index - 1 - i]["trigger"]:
        overlap_flag = True
        x = list(jieba.tokenize(_context[obj[trigger_index - 1 - i]["end"] + 1:]))
        y = jieba.lcut(_context[obj[trigger_index - 1 - i]["end"] + 1:])
def tokenizer(text):
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text
    sleep_num1 = random.randint(90, 120)
    time.sleep(sleep_num1)
    try:
        print("正在获取请求......")
        response = requests.get(url=url, headers=headers, params=params)
        print("获取请求成功!")
        response.encoding = 'utf-8'
        page_text = response.text
        r_ex = '<div class="c-abstract">(.*?)</div>'
        content_list = re.findall(r_ex, page_text, re.S)
        print('content_list:', end=' ')
        print(content_list)
        for content in content_list:
            content_fenci = jieba.lcut(content, cut_all=True)
            print('content_fenci:', end=' ')
            print(content_fenci)
            for fenci in content_fenci:
                if fenci == kw:
                    kw_cnt = kw_cnt + 1
        ent_kw_cnt[loop_cnt] = kw_cnt
        print(kw + "出现频次:", end=' ')
        print(kw_cnt)
    except requests.exceptions.ConnectionError:
        print("------这一条爬取失败,休息3分钟-----")
        time.sleep(180)
        continue
    loop_cnt = loop_cnt + 1

if ent_kw_cnt[0] != "卫星石化":
# wordCloudthreeKingdoms.py
import wordcloud
import jieba

# read the txt file
threeKingdomsTxt = open("E:\\21_git\\pythonMOOC\\week6\\threekingdoms.txt",
                        "rt", encoding='utf-8').read()
# segment with the jieba library
words = jieba.lcut(threeKingdomsTxt)
# drop single-character tokens (the original `del word` loop had no effect)
words = [word for word in words if len(word) > 1]
# join into a space-separated string
txt = " ".join(words)
# generate the word cloud; a Chinese font must be loaded
wc = wordcloud.WordCloud(width=1920, height=1080, font_path="msyh.ttc",
                         background_color='white')
wc.generate(txt)
wc.to_file('wordCloudthreeKingdoms.png')
# load the custom user dictionary into jieba
root = os.path.dirname(os.path.realpath(__file__))
my_dict = root + '/dict/mydict.txt'
template = root + '/dict/template.txt'

# list of template dicts
ltemplate = []
fdatas = open(template, 'r', encoding='utf_8').readlines()
for line in fdatas:
    sdict = {}
    sline = line.split("|")
    sdict["sname"] = sline[0]
    sdict["stag"] = sline[1]
    sdict["sentence"] = sline[2]
    # also store the segmented sentence in the dict
    sn = jieba.lcut(sline[2], cut_all=False)
    sdict["sn"] = sn
    ltemplate.append(sdict)


def get_catalog(sentence):
    # first split the sentence content on punctuation
    asentence = re.split("。|,|?|\.|\?|,", sentence)
    max = 0
    catalog = []
    for s in asentence:
        for t1 in ltemplate:
            sn = t1["sn"]
            diff = count_distance(s, sn)
            if diff > max:
                max = diff
def tokenize(line, is_zh):
    tokens = jieba.lcut(line) if is_zh else nltk.word_tokenize(line)
    return ' '.join(tokens), len(tokens)
def tokenize(sent):
    # with open(stop_words_path, "r", encoding="utf-8") as sw:
    #     stop_words_ = sw.readlines()
    # stop_words = [w.replace("\n", "") for w in stop_words_]
    return ' '.join([w for w in jieba.lcut(clean(sent.strip()))])
def cut(string):
    return jieba.lcut(string)
def sentiment_score_list(seg_sentence):
    # seg_sentence = dataset.split('。')
    words = []
    count1 = []
    count2 = []
    senti_score_words_result = {"count2": count2, "words": words}
    for index, sen in enumerate(seg_sentence):  # iterate over every review
        # print index
        if not sen:
            continue
        segtmp = jieba.lcut(sen, cut_all=False)  # segment the sentence, returned as a list
        # collect all tokens into the overall list
        words.extend(segtmp)
        i = 0  # position of the token being scanned
        a = 0  # position just after the last sentiment word
        poscount = 0   # initial score of a positive word
        poscount2 = 0  # positive score after negation flips
        poscount3 = 0  # final positive score (including exclamation marks)
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            word = word.encode("utf-8")
            if word in posdict:  # is the token a positive sentiment word?
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan degree adverbs before the sentiment word
                    w = w.encode("utf-8")
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount *= 0.5
                    elif w in deny_word:
                        c += 1
                if judgeodd(c) == 'odd':  # odd number of negations before the sentiment word
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # move past the sentiment word
            elif word in negdict:  # negative sentiment, handled symmetrically
                negcount += 1
                d = 0
                for w in segtmp[a:i]:
                    w = w.encode("utf-8")
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount *= 0.5
                    elif w in degree_word:
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!' or word == '!':  # does the sentence contain an exclamation mark?
                for w2 in segtmp[::-1]:  # scan backwards for a sentiment word; add 2 to the weight and stop
                    if w2 in posdict or negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scanning position
        # the following guards against negative totals
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3
        count1.append([pos_count, neg_count])
        count2.append(count1)
        count1 = []
    return senti_score_words_result
def train(self, datas, labels, **kwargs):
    if 'retrain' in kwargs and kwargs['retrain'] is True:
        Utils.remove_models(self.model_dir)
    self.datas = datas
    self.labels = labels
    # load the previously saved model and epoch counter
    self.model, self.start_epoch = Utils.load_previous_model(self.model_dir)
    lang = Lang()
    all_labels = []
    for idx in range(len(datas)):
        data = datas[idx]
        if type(data) == str:
            data_list = jieba.lcut(data)
            lang.index_words(data)
        if labels[idx] not in all_labels:
            all_labels.append(labels[idx])
    if self.kwargs['word2vec']:
        if self.kwargs['word2vec_model'] == None:
            raise ValueError('请填写word2vec模型存储路径')
        self.dataset = Dataset(datas, labels, lang, all_labels,
                               batch_size=self.kwargs['batch_size'],
                               word2vec=True,
                               word2vec_model=self.kwargs['word2vec_model'])
        if self.model == None:
            self.kwargs['word_embedding'] = self.dataset.word_embedding
            self.kwargs['embed_size'] = self.dataset.embed_size
    else:
        self.dataset = Dataset(datas, labels, lang, all_labels,
                               batch_size=self.kwargs['batch_size'],
                               word2vec=False)
    param = self.kwargs
    param['dataset'] = self.dataset
    Utils.save_param(param, self.model_dir)
    if self.model == None:
        self.kwargs['input_size'] = lang.n_words
        self.kwargs['output_size'] = len(all_labels)
        self.model = BiGRUNet(self.kwargs)
    self.criterion = torch.nn.NLLLoss()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.kwargs['lr_rate'])
    self.n_epochs = len(labels) * self.kwargs['epoch']
    self.progress = ProgressBar(count=self.start_epoch, total=self.n_epochs + 1)
    self.train_iter()
import jieba
from wordcloud import WordCloud

# exclusion list to drop meaningless tokens
excludes = {
    "div", "h5", "class", "h4", "h3", "h2", "h1", "conlun2_box_text", "p", "id"
}

# open the freshly crawled txt document read-only
f = open(
    '/home/mark/Documents/Code-insider/Unnamed/Udemy/Python/Grawl_analyse_report/reports/2014_gov_report.txt',
    'r', encoding='utf-8')
txt = f.read()
f.close()

words = jieba.lcut(txt)   # segment with jieba's accurate mode
newtxt = ''.join(words)   # join the segmented words
# configure the word cloud parameters and generate it from newtxt
wordcloud = WordCloud(
    background_color="white",  # background color of the image
    width=800,                 # image width
    height=600,                # image height
    # full path of a font file; Chinese may not render without it
    font_path="/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
    max_words=100,             # maximum number of words in the cloud
    max_font_size=80,          # maximum font size
    stopwords=excludes,        # excluded words
).generate(newtxt)
wordcloud.to_file('Wordcloud_Analysis.png')
def cut_word():
    import jieba
    jieba.load_userdict(os.path.join(Data_PATH, "word_dict.txt"))
    jieba.add_word('_e11_')
    jieba.add_word('_e12_')
    jieba.add_word('_e21_')
    jieba.add_word('_e22_')
    for en_word in ['disease', 'reason', 'symptom', 'test', 'test_value', 'drug', 'frequency', 'amount',
                    'method', 'treatment', 'operation', 'sideeff', 'anatomy', 'level', 'duration']:
        jieba.add_word(en_word)

    for file_name in ['sample_negative.txt', 'sample_positive.txt']:
        with open(os.path.join(Data_PATH, file_name.replace('.txt', '_cut.txt')), 'w') as fout:
            with open(os.path.join(Data_PATH, file_name)) as f:
                for line in f:
                    r_name, sent, fn, rid = line.rstrip().split('\t')
                    if any(a in sent for a in ['_e11_', '_e12_', '_e21_', '_e22_']):
                        print('contains new annotation !!!')
                    # if not (fn == '131_5' and rid == 'R21'):
                    #     continue
                    # e1_s = sent.index('<e1>')
                    # e1_e = sent.index('</e1>')
                    # e2_s = sent.index('<e2>')
                    # e2_e = sent.index('</e2>')
                    sent_simp = sent.replace('<e1>', '_e11_').replace('</e1>', '_e12_') \
                                    .replace('<e2>', '_e21_').replace('</e2>', '_e22_')
                    sent_cut = jieba.lcut(sent_simp, HMM=False)
                    # i = 0
                    # offset = []
                    # for w in sent_cut:
                    #     offset.append([i, i+len(w), w])
                    #     i += len(w)
                    #
                    # if e1_s < e2_s:
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e1_s == 0:
                    #             offset.insert(0, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:1] + [[si+4, ei+4, w] for si, ei, w in offset[1:]]
                    #             break
                    #         if e_idx == e1_s:
                    #             offset.insert(idx+1, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_e:
                    #             offset.insert(idx+1, [e1_e, e1_e + 5, '</e1>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_s:
                    #             offset.insert(idx+1, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_e:
                    #             offset.insert(idx+1, [e2_e, e2_e + 5, '</e2>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    # else:
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e2_s == 0:
                    #             offset.insert(0, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:1] + [[si+4, ei+4, w] for si, ei, w in offset[1:]]
                    #             break
                    #         if e_idx == e2_s:
                    #             offset.insert(idx+1, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_e:
                    #             offset.insert(idx+1, [e2_e, e2_e + 5, '</e2>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_s:
                    #             offset.insert(idx+1, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_e:
                    #             offset.insert(idx+1, [e1_e, e1_e + 5, '</e1>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    # sent_cut_upt = [w for _, _, w in offset]
                    fout.write('{}\t{}\t{}\t{}\n'.format(r_name, ' '.join(sent_cut), fn, rid))
# print(df)
# data cleaning
# data processing
# 1. gender of the viewers
gender = df['gender'].value_counts().sort_index(ascending=1)
print(gender)
gender_title = '观看人群性别比'
pie(gender, gender_title)

# 2. score distribution
score = df['score'].value_counts().sort_index(ascending=1)
print(score)
score_title = '评分占比情况'
pie(score, score_title)

# 3. review analysis
df['comment'].to_csv('zhuxian.txt', index=False)
f = open('zhuxian.txt', 'r', encoding='utf-8')
txt = f.read()
f.close()
words = jieba.lcut(txt)
# print(words)
nettxt = ' '.join(words)
# print(nettxt)
wordcloud = WordCloud(background_color='white', width=800, height=600,
                      font_path='msyh.ttc', max_words=200,
                      max_font_size=80).generate(nettxt)
wordcloud.to_file('诛仙评论词云.png')
def tokenize(sents):
    res = []
    for sent in sents:
        res.append(jieba.lcut(sent[0]))
    return res
import jieba
s = "中国特色社会主义进入新时代,我国社会主要矛盾已经转化为人民日益增长的美好生活需要和不平衡不充分的发展之间的矛盾。"
n = len(s)
m = len(jieba.lcut(s))
print("中文字符数为{},中文词语数为{}。".format(n, m))
def predictByBeamSearch(self, inputSeq, beamWidth=10, maxAnswerLength=32, alpha=0.7,
                        isRandomChoose=False, allRandomChoose=False, improve=True, showInfo=False):
    outputSize = len(self.id2word)
    inputSeq = filter_sent(inputSeq)
    inputSeq = [w for w in jieba.lcut(inputSeq) if w in self.word2id.keys()]

    X = seq2id(self.word2id, inputSeq)
    XLens = torch.tensor([len(X) + 1], dtype=torch.int, device=self.device)
    X = X + [eosToken]
    X = torch.tensor([X], dtype=torch.long, device=self.device)

    d = int(self.encoderRNN.bidirectional) + 1
    hidden = torch.zeros((d * self.encoderRNN.numLayers, 1, self.hiddenSize),
                         dtype=torch.float32, device=self.device)
    encoderOutput, hidden = self.encoderRNN(X, XLens, hidden)
    hidden = hidden[-d * self.decoderRNN.numLayers::2].contiguous()

    Y = np.ones([beamWidth, maxAnswerLength], dtype='int32') * eosToken
    # prob: beamWidth × 1
    prob = np.zeros([beamWidth, 1], dtype='float32')
    decoderInput = torch.tensor([[sosToken]], dtype=torch.long, device=self.device)
    # decoderOutput: 1 × 1 × outputSize; hidden: numLayers × 1 × hiddenSize
    decoderOutput, hidden, decoderAttentionWeight = self.decoderRNN(decoderInput, hidden, encoderOutput)
    # topv: 1 × 1 × beamWidth; topi: 1 × 1 × beamWidth
    topv, topi = decoderOutput.topk(beamWidth)
    # decoderInput: beamWidth × 1
    decoderInput = topi.view(beamWidth, 1)
    for i in range(beamWidth):
        Y[i, 0] = decoderInput[i].item()
    Y_ = Y.copy()
    prob += topv.view(beamWidth, 1).data.cpu().numpy()
    prob_ = prob.copy()

    # hidden: numLayers × beamWidth × hiddenSize
    hidden = hidden.expand(-1, beamWidth, -1).contiguous()
    localRestId = np.array([i for i in range(beamWidth)], dtype='int32')
    encoderOutput = encoderOutput.expand(beamWidth, -1, -1)  # => beamWidth × 1 × hiddenSize
    for i in range(1, maxAnswerLength):
        # decoderOutput: beamWidth × 1 × outputSize; hidden: numLayers × beamWidth × hiddenSize;
        # decoderAttentionWeight: beamWidth × 1 × XSeqLen
        decoderOutput, hidden, decoderAttentionWeight = self.decoderRNN(decoderInput, hidden, encoderOutput)
        # topv: beamWidth × 1; topi: beamWidth × 1
        if improve:
            decoderOutput = decoderOutput.view(-1, 1)
            if allRandomChoose:
                topv, topi = self._random_pick_k_by_prob(decoderOutput, k=beamWidth)
            else:
                topv, topi = decoderOutput.topk(beamWidth, dim=0)
        else:
            topv, topi = (torch.tensor(prob[localRestId], dtype=torch.float32,
                                       device=self.device).unsqueeze(2) + decoderOutput).view(-1, 1).topk(beamWidth, dim=0)
        # decoderInput: beamWidth × 1
        decoderInput = topi % outputSize
        idFrom = topi.cpu().view(-1).numpy() // outputSize
        Y[localRestId, :i + 1] = np.hstack([Y[localRestId[idFrom], :i], decoderInput.cpu().numpy()])
        prob[localRestId] = prob[localRestId[idFrom]] + topv.data.cpu().numpy()
        hidden = hidden[:, idFrom, :]

        restId = (decoderInput != eosToken).cpu().view(-1)
        localRestId = localRestId[restId.numpy().astype('bool')]
        decoderInput = decoderInput[restId]
        hidden = hidden[:, restId, :]
        encoderOutput = encoderOutput[restId]
        beamWidth = len(localRestId)
        if beamWidth < 1:
            break

    lens = [i.index(eosToken) if eosToken in i else maxAnswerLength for i in Y.tolist()]
    ans = [''.join(id2seq(self.id2word, i[:l])) for i, l in zip(Y, lens)]
    prob = [prob[i, 0] / np.power(lens[i], alpha) for i in range(len(ans))]
    if isRandomChoose or allRandomChoose:
        prob = [np.exp(p) for p in prob]
        prob = [p / sum(prob) for p in prob]
        if showInfo:
            for i in range(len(ans)):
                print((ans[i], prob[i]))
        return random_pick(ans, prob)
    else:
        ansAndProb = list(zip(ans, prob))
        ansAndProb.sort(key=lambda x: x[1], reverse=True)
        if showInfo:
            for i in ansAndProb:
                print(i)
        return ansAndProb[0][0]