def test_b(self):
    """ Changing the b parameter should give consistent results """
    corpus = common_texts
    index = 0
    doc = corpus[index]
    first_b = 1.0
    second_b = 2.0

    first_bm25 = BM25(corpus, b=first_b)
    second_bm25 = BM25(corpus, b=second_b)
    first_score = first_bm25.get_score(doc, index)
    second_score = second_bm25.get_score(doc, index)
    self.assertLess(first_score, second_score)

    first_iter = iter_bm25_bow(corpus, b=first_b)
    second_iter = iter_bm25_bow(corpus, b=second_b)
    first_score = dict(next(iter(first_iter)))[index]
    second_score = dict(next(iter(second_iter)))[index]
    self.assertLess(first_score, second_score)

    first_weights = get_bm25_weights(corpus, b=first_b)
    second_weights = get_bm25_weights(corpus, b=second_b)
    # Compare the scalar self-match scores, not the whole weight rows.
    first_score = first_weights[index][index]
    second_score = second_weights[index][index]
    self.assertLess(first_score, second_score)
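# A minimal illustrative sketch (not gensim's implementation) of how the b
# parameter enters the classic Okapi BM25 per-term weight,
#     idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl)),
# which explains the assertLess above: common_texts[0] is slightly shorter
# than average, so raising b shrinks the length penalty and raises the score.
# The helper name and the k1 default are assumptions, not library API.
def bm25_term_weight(tf, idf, doc_len, avg_doc_len, k1=1.5, b=0.75):
    """Weight contributed by a single term occurrence under Okapi BM25."""
    length_norm = 1 - b + b * (doc_len / avg_doc_len)
    return idf * tf * (k1 + 1) / (tf + k1 * length_norm)

# For a document shorter than average, a larger b yields a larger weight:
for b in (1.0, 2.0):
    print(b, bm25_term_weight(tf=1, idf=1.0, doc_len=3, avg_doc_len=3.2, b=b))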
def test_epsilon(self):
    """ Changing the epsilon parameter should give consistent results """
    corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
    first_epsilon = 1.0
    second_epsilon = 2.0
    bm25 = BM25(corpus)
    words_with_negative_idfs = set(
        word for word, idf in bm25.idf.items() if idf < 0)
    index, doc = [
        (index, document) for index, document in enumerate(corpus)
        if words_with_negative_idfs & set(document)][0]

    first_bm25 = BM25(corpus, epsilon=first_epsilon)
    second_bm25 = BM25(corpus, epsilon=second_epsilon)
    first_score = first_bm25.get_score(doc, index)
    second_score = second_bm25.get_score(doc, index)
    self.assertGreater(first_score, second_score)

    first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
    second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
    first_score = dict(next(iter(first_iter)))[index]
    second_score = dict(next(iter(second_iter)))[index]
    self.assertGreater(first_score, second_score)

    first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
    second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
    # Compare the scalar self-match scores, as in test_b above.
    first_score = first_weights[index][index]
    second_score = second_weights[index][index]
    self.assertGreater(first_score, second_score)
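# Hedged sketch of the epsilon mechanism this test probes. gensim's BM25
# computes the Robertson idf and floors negative values at
# epsilon * average_idf; treat the exact constants as an assumption drawn
# from the gensim.summarization.bm25 implementation the test targets.
import math

def robertson_idf(df, n_docs):
    # Negative whenever a word occurs in more than half of the documents.
    return math.log(n_docs - df + 0.5) - math.log(df + 0.5)

corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
vocab = {word for doc in corpus for word in doc}
df = {word: sum(word in doc for doc in corpus) for word in vocab}
idf = {word: robertson_idf(freq, len(corpus)) for word, freq in df.items()}
average_idf = sum(idf.values()) / len(idf)  # negative for this corpus
for epsilon in (1.0, 2.0):
    floored = {w: epsilon * average_idf if v < 0 else v for w, v in idf.items()}
    print(epsilon, {w: round(v, 3) for w, v in floored.items()})
# With a negative average_idf, a larger epsilon drives the floored idfs
# further below zero, hence assertGreater(first_score, second_score) above.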
def test_multiprocessing(self):
    """ Result should be the same using different processes """
    weights1 = get_bm25_weights(common_texts)
    weights2 = get_bm25_weights(common_texts, n_jobs=2)
    weights3 = get_bm25_weights(common_texts, n_jobs=-1)
    self.assertAlmostEqual(weights1, weights2)
    self.assertAlmostEqual(weights1, weights3)
    self.assertAlmostEqual(weights2, weights3)
def test_max_match_with_itself(self):
    """ Document should show maximum matching with itself """
    weights = get_bm25_weights(common_texts)
    for index, doc_weights in enumerate(weights):
        expected = max(doc_weights)
        predicted = doc_weights[index]
        self.assertAlmostEqual(expected, predicted)
def test_with_generator(self):
    """ Check the above behaviour when the input corpus is a generator """
    text_gen = (i for i in common_texts)
    weights = get_bm25_weights(text_gen)
    for index, doc_weights in enumerate(weights):
        expected = max(doc_weights)
        predicted = doc_weights[index]
        self.assertAlmostEqual(expected, predicted)
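# Minimal usage sketch for the API the tests above exercise (the corpus is
# illustrative): get_bm25_weights returns a dense square matrix where row i
# holds the BM25 scores of every document against the tokens of document i,
# which is why the diagonal carries each document's maximal self-match.
from gensim.summarization.bm25 import get_bm25_weights

corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
weights = get_bm25_weights(corpus)
assert len(weights) == len(corpus) and len(weights[0]) == len(corpus)
print(weights[0][0], weights[0][1])  # self-match vs. cross-match for doc 0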
def main():
    documents = get_documents()
    wakachi_documents = [x['wakachi'] for x in documents]
    results = get_bm25_weights(wakachi_documents, n_jobs=1)
    for i, result in enumerate(results):
        documents[i]['sum'] = sum(result)
    output_csv(documents)
def writeOkapiFile(txtArrAfter, outputName, path):
    arr = []
    for i in range(len(txtArrAfter)):
        item = txtArrAfter[i].split(" ")
        arr.append(item)
    result = get_bm25_weights(arr, n_jobs=-1)
    with open(path + "/" + outputName + ".txt", 'w') as f:
        f.write(str(result))
import numpy as np
import networkx as nx


# Normalizer and _to_text are project-level helpers; get_bm25_weights comes
# from gensim.summarization.bm25.
def summarize(text, P=5):
    sents = Normalizer(text).sent_tokenize()
    words = [Normalizer(sent).clean_up() for sent in sents]
    sim_mat = get_bm25_weights(words, n_jobs=-1)
    sim_mat_np = np.array(sim_mat)
    graph = nx.from_numpy_array(sim_mat_np)
    scores = nx.pagerank(graph)
    weighted = sorted(set((scores[i], s) for i, s in enumerate(sents)),
                      reverse=True)[:P]
    return _to_text([tup[1] for tup in weighted]), len(words)
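# summarize() above depends on project helpers (Normalizer, _to_text). A
# self-contained sketch of the same TextRank-over-BM25 idea, with plain
# whitespace tokenisation standing in for those helpers (an assumption):
import networkx as nx
import numpy as np
from gensim.summarization.bm25 import get_bm25_weights

def rank_sentences(sentences, top_n=2):
    tokenised = [s.lower().split() for s in sentences]
    sim_mat = np.array(get_bm25_weights(tokenised))     # BM25 as pairwise similarity
    scores = nx.pagerank(nx.from_numpy_array(sim_mat))  # centrality on the graph
    order = sorted(range(len(sentences)), key=scores.get, reverse=True)
    return [sentences[i] for i in order[:top_n]]

print(rank_sentences([
    "BM25 ranks documents by term frequency and document length.",
    "PageRank scores nodes by the structure of a weighted graph.",
    "Combining the two gives a simple extractive summariser.",
]))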
def writeOkapiFile(txtArrAfter, outputName, path):
    arr = []
    for i in range(len(txtArrAfter)):
        item = txtArrAfter[i].split(" ")
        arr.append(item)
    result = get_bm25_weights(arr, n_jobs=-1)
    with open(path + '/' + outputName + '.txt', 'w') as f:
        for row in result:
            # Pad every rounded weight to a fixed width so the columns
            # stay aligned regardless of the number's printed length.
            strItem = "".join(str(round(okapi, 3)).ljust(7) for okapi in row)
            f.write(strItem + "\n")
# pickle.dump(clf, open(filename, 'wb'))
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy TF-IDF:", metrics.accuracy_score(y_test, predicted))
b = datetime.datetime.now()
c = b - a
print(c)

#%% Do the same but with BM25 weights. Takes a long time for all children, but is probably more accurate.
from gensim.summarization.bm25 import get_bm25_weights

children_capped_themes.loc[:, 'splitted_content'] = \
    children_capped_themes.loc[:, 'content'].str.split(' ')

#%%
a = datetime.datetime.now()
print('busy with BM25')
small_test = children_capped_themes.sample(n=5000)
bm25_weights = get_bm25_weights(small_test.loc[:, 'splitted_content'])

#%%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    bm25_weights, small_test['encoded_label'], test_size=0.2, random_state=123)

#%%
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Model generation using multinomial naive Bayes.
clf = MultinomialNB().fit(X_train, y_train)
filename = 'MultiNB_BM25.sav'
pickle.dump(clf, open(filename, 'wb'))
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy BM25:", metrics.accuracy_score(y_test, predicted))
        # filtered_word_list = [word.lower() for word in line
        #                       if (len(word) >= 3 and word[0].isalpha()
        #                           and word.lower() not in stop_words)]
        query_list.extend(filtered_word_list)
    docs_list.append(query_list)

    for candidatesfile in candidates_list:
        with open(root_path + task_id + '/candidates/' + candidatesfile,
                  mode="r", encoding="utf-8") as file:
            doc = []
            for line in file.readlines():
                line = line.rstrip()
                # line = unicodeToAscii(line)
                filtered_word_list = line.split()
                # filtered_word_list = [word.lower() for word in line
                #                       if (len(word) >= 3 and word[0].isalpha()
                #                           and word.lower() not in stop_words)]
                doc.extend(filtered_word_list)
            docs_list.append(doc)

    scores = get_bm25_weights(docs_list)
    # Drop the query's self-match (position 0) before ranking the candidates.
    scores[0] = scores[0][1:]
    best_docs = sorted(range(len(scores[0])), key=scores[0].__getitem__)
    best_docs.reverse()
    for rank, id1 in enumerate(best_docs[:100]):
        context = (str(task_id) + "\t" + str(id1 + 1) + "\t"
                   + str(rank + 1) + "\t" + str(scores[0][id1]) + "\n")
        print(context)
        bm25_rank_file.write(context)
    '''
    best_docs = sorted(range(len(scores[0])), key=scores[0].__getitem__)
    best_docs.reverse()
    best_docs.remove(0)
    bm25_rank_file.write(task_id + ",")
    for rank, id1 in enumerate(best_docs[:100]):
        bm25_rank_file.write(str(id1) + " ")
    bm25_rank_file.write("\n")
    '''
def test_same_match_with_same_document(self):
    """
    A document should always get the same weight when matched
    with a particular document.
    """
    corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
    weights = get_bm25_weights(corpus)
    self.assertAlmostEqual(weights[0][1], weights[0][2])
# CosSim method
if choose_relevance == '1':
    file_CosSim = open(path_output + '/' + str_method + '_CosSim.txt',
                       "w", encoding="utf8")
    for item1 in list_result_after_convert_vector:
        str_result = ""
        for item2 in list_result_after_convert_vector:
            result = 1 - spatial.distance.cosine(item1, item2)
            result = round(result, 5)
            str_result += str(result).ljust(10)
        file_CosSim.write(str_result + '\n')
    file_CosSim.close()
# Okapi BM25 method
elif choose_relevance == '2':
    result_bm25 = get_bm25_weights(list_word, n_jobs=-1)
    file_BM25 = open(path_output + '/' + str_method + '_OkapiBM25.txt',
                     "w", encoding="utf8")
    for item in result_bm25:
        str_result = ' '.join(str(round(e, 3)).ljust(10) for e in item)
        file_BM25.write(str_result + '\n')
    file_BM25.close()
def test_nonnegative_weights(self):
    """ All the weights for a particular document should be non-negative """
    weights = get_bm25_weights(common_texts)
    for doc_weights in weights:
        for weight in doc_weights:
            self.assertTrue(weight >= 0.)
manager = multiprocessing.Manager()
output = manager.dict()
processes = []
for rank in range(num_cores):
    if rank + 1 == num_cores:
        file_chunk = files[rank * chunk_size:]
    else:
        file_chunk = files[rank * chunk_size:(rank + 1) * chunk_size]
    print(f"Reading chunk {rank}...")
    json_contents_list = [read_json(x, as_list=True) for x in file_chunk]
    p = multiprocessing.Process(target=process_corpus,
                                args=(json_contents_list, output, rank))
    p.start()
    processes.append(p)

for k, p in enumerate(processes):
    p.join()
    print(f"{k} has finished")

main_corpus = []
# Iterate (rank, chunk) pairs in rank order; iterating the dict directly
# would yield bare keys and fail to unpack.
for k, v in sorted(output.items()):
    main_corpus.extend(v)

bm25_weights = get_bm25_weights(main_corpus, n_jobs=num_cores)
pickle.dump(bm25_weights, open("bm25_weights.p", "wb"))
def test_disjoint_docs_if_weight_zero(self):
    """ Two disjoint documents should have zero matching """
    corpus = [['cat', 'dog', 'lion'], ['goat', 'fish', 'tiger']]
    weights = get_bm25_weights(corpus)
    self.assertAlmostEqual(weights[0][1], 0)
    self.assertAlmostEqual(weights[1][0], 0)
import json
import pickle

from nltk import word_tokenize
from nltk.stem import PorterStemmer


def stemm_paragraph(paragraph):
    return [ps.stem(word) for word in word_tokenize(paragraph.lower())]


ps = PorterStemmer()
index = {}
corpus = []
for k, filename in enumerate(get_filename_list()):
    print(k, filename)
    json_file = json.loads(open(filename).read())
    paragraphs = [stemm_paragraph(x['text']) for x in json_file['body_text']]
    corpus.extend(paragraphs)

# Persist the corpus in ten chunks; the last chunk keeps the remainder.
chunk = len(corpus) // 10
for i in range(10):
    end = (i + 1) * chunk if i < 9 else len(corpus)
    pickle.dump(corpus[i * chunk:end],
                open(f"bm25_corpus_{i}.p", "wb"), pickle.HIGHEST_PROTOCOL)

bm25_weights = get_bm25_weights(corpus, n_jobs=10)
pickle.dump(bm25_weights, open("bm25_weights.p", "wb"))
def bm25(data):
    vectors = np.array(get_bm25_weights(data, n_jobs=-1))
    # vectors /= vectors.sum(axis=1, keepdims=True)  # L1
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # L2
    vectors[np.isnan(vectors)] = 0
    return vectors
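# Usage sketch for bm25() above (the toy corpus is illustrative; assumes
# numpy is imported as np and get_bm25_weights is in scope): after the L2
# normalisation, the cosine similarity between any two rows is just their
# dot product.
toy_corpus = [['cat', 'dog'], ['cat', 'lion'], ['graph', 'trees']]
vectors = bm25(toy_corpus)
print((vectors @ vectors.T).round(3))  # pairwise cosine similarities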
def _txt2vec(texts, config, clf_h=None, txt_vctrz=None, char_vctrz=None,
             use_tfidf=True, ftdecomp=None, ftmdl=None, n_components=128,
             saved_path='.', prefix='corpus', **kwargs):
    extra_outputs = ()
    logging.info('Converting text to vectors with parameters: %s; %s' % (str(
        dict(txt_vctrz=txt_vctrz, char_vctrz=char_vctrz, use_tfidf=use_tfidf,
             ftdecomp=ftdecomp, ftmdl=ftmdl, n_components=n_components,
             prefix=prefix, saved_path=saved_path)), str(kwargs)))
    from scipy.sparse import csr_matrix, hstack, issparse

    # Optional sent2vec sentence embeddings.
    if config.sentvec_path and os.path.isfile(config.sentvec_path):
        import sent2vec
        sentvec_model = sent2vec.Sent2vecModel()
        sentvec_model.load_model(config.sentvec_path)
        sentvec = sentvec_model.embed_sentences(texts)
        logging.info('Sentence vector dimension of dataset %s: %i' %
                     (prefix, sentvec.shape[1]))
        clf_h = hstack((csr_matrix(clf_h), sentvec)) if clf_h is not None else sentvec

    # Word-level TF-IDF (or count) features, cached on disk.
    if config.do_tfidf:
        tfidf_cache_fpath = os.path.join(saved_path, '%s_tfidf.pkl' % prefix)
        if os.path.exists(tfidf_cache_fpath):
            with open(tfidf_cache_fpath, 'rb') as fd:
                txt_X, txt_vctrz = pickle.load(fd)
        else:
            if txt_vctrz is None:
                binary = (ftdecomp == 'svd' or kwargs.setdefault('binary', False))
                txt_vctrz = AdvancedTfidfVectorizer(
                    stop_words=kwargs.setdefault('stop_words', 'english'),
                    ngram_range=kwargs.setdefault('ngram', (1, 1)),
                    binary=binary, dtype='float32',
                    use_idf=kwargs.setdefault('use_idf', True),
                    sublinear_tf=kwargs.setdefault('sublinear_tf', False),
                    lemma=kwargs.setdefault('lemma', False),
                    stem=kwargs.setdefault('stem', False),
                    synonym=kwargs.setdefault('synonym', False),
                    w2v_fpath=kwargs.setdefault('w2v_fpath', None),
                    w2v_topk=kwargs.setdefault('w2v_topk', 10),
                    phraser_fpath=kwargs.setdefault('phraser', None),
                    keep_orig=kwargs.setdefault('keep_orig', False)
                ) if use_tfidf else AdvancedCountVectorizer(
                    stop_words=kwargs.setdefault('stop_words', 'english'),
                    ngram_range=kwargs.setdefault('ngram', (1, 1)),
                    binary=binary, dtype='int8' if binary else 'int32',
                    lemma=kwargs.setdefault('lemma', False),
                    stem=kwargs.setdefault('stem', False),
                    synonym=kwargs.setdefault('synonym', False),
                    w2v_fpath=kwargs.setdefault('w2v_fpath', None),
                    w2v_topk=kwargs.setdefault('w2v_topk', 10),
                    phraser_fpath=kwargs.setdefault('phraser', None),
                    keep_orig=kwargs.setdefault('keep_orig', False))
                txt_X = txt_vctrz.fit_transform(texts)
                # Optionally weight each n-gram order separately.
                if len(kwargs.setdefault('ngram_weights', {})) == \
                        txt_vctrz.get_params()['ngram_range'][1] - \
                        txt_vctrz.get_params()['ngram_range'][0] + 1:
                    ngram_types = np.array(list(map(
                        lambda x: x.count(' ') + 1,
                        txt_vctrz.get_feature_names())))
                    ngram_idx = dict((tp, np.where(ngram_types == tp)[0])
                                     for tp in np.unique(ngram_types))
                    if all([k in kwargs['ngram_weights'] for k in ngram_idx.keys()]):
                        norm_weights = imath.normalize(
                            list(kwargs['ngram_weights'].values()))
                        for i, k in enumerate(kwargs['ngram_weights'].keys()):
                            ngram_idx[k] = (ngram_idx[k], norm_weights[i])
                        extra_outputs += (ngram_idx, )
            else:
                logging.info('Eval mode of TFIDF:')
                txt_X = txt_vctrz.transform(texts)
            with open(tfidf_cache_fpath, 'wb') as fd:
                pickle.dump((txt_X, txt_vctrz), fd)
        logging.info('TFIDF dimension of dataset %s: %i' % (prefix, txt_X.shape[1]))
        clf_h = hstack((csr_matrix(clf_h), txt_X)) if clf_h is not None else txt_X

    # Character-level TF-IDF (or count) features, cached on disk.
    if config.do_chartfidf:
        chartfidf_cache_fpath = os.path.join(saved_path, '%s_chartfidf.pkl' % prefix)
        if os.path.exists(chartfidf_cache_fpath):
            with open(chartfidf_cache_fpath, 'rb') as fd:
                char_X, char_vctrz = pickle.load(fd)
        else:
            if char_vctrz is None:
                binary = (ftdecomp == 'svd' or kwargs.setdefault('binary', False))
                char_vctrz = AdvancedTfidfVectorizer(
                    analyzer=kwargs.setdefault('char_analyzer', 'char_wb'),
                    stop_words=kwargs.setdefault('stop_words', 'english'),
                    ngram_range=kwargs.setdefault('char_ngram', (4, 6)),
                    binary=binary, dtype='float32',
                    use_idf=kwargs.setdefault('use_idf', True),
                    sublinear_tf=kwargs.setdefault('sublinear_tf', False)
                ) if use_tfidf else AdvancedCountVectorizer(
                    analyzer=kwargs.setdefault('char_analyzer', 'char_wb'),
                    stop_words=kwargs.setdefault('stop_words', 'english'),
                    ngram_range=kwargs.setdefault('char_ngram', (4, 6)),
                    binary=binary, dtype='int8' if binary else 'int32')
                char_X = char_vctrz.fit_transform(texts)
                if len(kwargs.setdefault('ngram_weights', {})) == \
                        char_vctrz.get_params()['ngram_range'][1] - \
                        char_vctrz.get_params()['ngram_range'][0] + 1:
                    ngram_types = np.array(list(map(
                        lambda x: x.count(' '),
                        char_vctrz.get_feature_names())))
                    ngram_idx = dict((tp, np.where(ngram_types == tp)[0])
                                     for tp in np.unique(ngram_types))
                    if all([k in kwargs['ngram_weights'] for k in ngram_idx.keys()]):
                        norm_weights = imath.normalize(
                            list(kwargs['ngram_weights'].values()))
                        for i, k in enumerate(kwargs['ngram_weights'].keys()):
                            ngram_idx[k] = (ngram_idx[k], norm_weights[i])
                        extra_outputs += (ngram_idx, )
            else:
                logging.info('Eval mode of Char TFIDF:')
                char_X = char_vctrz.transform(texts)
            with open(chartfidf_cache_fpath, 'wb') as fd:
                pickle.dump((char_X, char_vctrz), fd)
        logging.info('Char TFIDF dimension of dataset %s: %i' % (prefix, char_X.shape[1]))
        clf_h = hstack((csr_matrix(clf_h), char_X)) if clf_h is not None else char_X

    # Dense BM25 document-by-document weights, cached on disk.
    if config.do_bm25:
        bm25_cache_fpath = os.path.join(saved_path, '%s_bm25.pkl' % prefix)
        if os.path.exists(bm25_cache_fpath):
            with open(bm25_cache_fpath, 'rb') as fd:
                txt_bm25_X = pickle.load(fd)
        else:
            from gensim.summarization.bm25 import get_bm25_weights
            txt_bm25_X = np.array(get_bm25_weights(texts, n_jobs=config.np))
            with open(bm25_cache_fpath, 'wb') as fd:
                pickle.dump(txt_bm25_X, fd)
        logging.info('BM25 dimension of dataset %s: %i' % (prefix, txt_bm25_X.shape[1]))
        clf_h = hstack((csr_matrix(clf_h), txt_bm25_X)) if clf_h is not None else txt_bm25_X

    if type(ftdecomp) is str:
        ftdecomp = ftdecomp.lower()
    if issparse(clf_h) and ftdecomp != 'svd':
        clf_h = clf_h.toarray()
    # Feature reduction
    if ftdecomp is None or (type(ftdecomp) is str and ftdecomp == 'none') \
            or n_components >= clf_h.shape[1]:
        return clf_h, txt_vctrz, char_vctrz, None
    if ftmdl is None:
        if ftdecomp == 'pca':
            from sklearn.decomposition import PCA
            ftmdl = PCA(n_components=min(n_components, clf_h.shape[0]))
        elif ftdecomp == 'svd':
            from sklearn.decomposition import TruncatedSVD
            ftmdl = TruncatedSVD(n_components=n_components)
        logging.info('Using %s feature reduction...' % ftdecomp.upper())
        clf_h = ftmdl.fit_transform(clf_h).astype('float32')
    else:
        logging.info('Eval mode of feature reduction:')
        clf_h = ftmdl.transform(clf_h).astype('float32')
    return (clf_h, txt_vctrz, char_vctrz, ftmdl) + extra_outputs
def bm25_query_k_con_4_dia(train_dia, train_con, val_dia, val_con,
                           infer_dia, infer_con, k_set, out_path):
    """
    Use the BM25 algorithm to retrieve k candidate concepts for each
    given diagnosis text.
    :param train_dia:
    :param train_con:
    :param val_dia:
    :param val_con:
    :param infer_dia:
    :param infer_con:
    :param k_set:
    :param out_path:
    :return:
    """
    start_time = time.time()
    train_dia_f = open(train_dia, 'r', encoding='utf-8').readlines()
    # Do not use the case-record text; validation showed adding it hurts.
    train_dia_f = [i.split('\t')[0] for i in train_dia_f]
    val_dia_f = open(val_dia, 'r', encoding='utf-8').readlines()
    val_dia_f = [i.split('\t')[0] for i in val_dia_f]
    infer_dia_f = open(infer_dia, 'r', encoding='utf-8').readlines()
    infer_dia_f = [i.split('\t')[0] for i in infer_dia_f]

    train_con_f = open(train_con, 'r', encoding='utf-8').readlines()
    train_con_f = [i.rstrip('\n') for i in train_con_f]
    val_con_f = open(val_con, 'r', encoding='utf-8').readlines()
    val_con_f = [i.rstrip('\n') for i in val_con_f]
    infer_con_f = open(infer_con, 'r', encoding='utf-8').readlines()
    infer_con_f = [i.rstrip('\n') for i in infer_con_f]

    all_dia = train_dia_f + val_dia_f + infer_dia_f
    # all_dia = list(set(all_dia))
    infer_dia_f_num = len(infer_dia_f)
    all_dia_num = len(all_dia)
    all_con = train_con_f + val_con_f + infer_con_f
    all_con = list(set(all_con))
    all_con_num = len(all_con)
    all_con_dict_idx2txt = {}
    for idx, txt in enumerate(all_con):
        all_con_dict_idx2txt[idx] = txt

    corpus = all_dia + all_con
    corpus = [i.split(' ') for i in corpus]
    corpus_dict = {}
    for idx, line in enumerate(corpus):
        corpus_dict[idx] = line
    # bm25_model = bm25.BM25(corpus)
    bm25_model_w = get_bm25_weights(corpus, n_jobs=2)
    # bm25_model = BM25(corpus)

    result_log = open(out_path + 'result-log.txt', 'w', encoding='utf-8')
    all_con_weight = bm25_model_w[-all_con_num:]
    assert len(all_con_weight) == all_con_num, \
        'Number of concept weight rows ({}) does not match the number of concepts ({})!'.format(
            len(all_con_weight), all_con_num)
    all_con_w_kdtree = KDTree(normalize(np.mat(all_con_weight)), metric='euclidean')
    infer_dia_weight = bm25_model_w[-(infer_dia_f_num + all_con_num):-all_con_num]
    assert len(infer_dia_weight) == infer_dia_f_num, \
        'Number of inference-diagnosis weight rows ({}) does not match the number of inference diagnoses ({})!'.format(
            len(infer_dia_weight), infer_dia_f_num)
    infer_dia_weight = normalize(infer_dia_weight)

    for k in k_set:
        output_pro_txt = open(out_path + 'k=' + str(k) + '.txt', 'w', encoding='utf-8')
        y_pred = [0 for _ in range(len(infer_con_f))]
        y_true = [1 for _ in range(len(infer_con_f))]
        for idx, query_dia in enumerate(infer_dia_weight):
            query_dia = np.mat(query_dia)
            query_pro, query_idx = all_con_w_kdtree.query(query_dia.reshape(1, -1), k)
            candidate_pro = [round(i, 5) for i in query_pro[0]]
            candidate_txt = [all_con_dict_idx2txt[i] for i in query_idx[0]]
            assert len(candidate_pro) == len(candidate_txt), \
                'Candidate probabilities and texts differ in length!'
            if infer_con_f[idx] in candidate_txt:
                y_pred[idx] = 1
        acc = round(accuracy_score(y_true, y_pred), 3)
        end_time = time.time()
        use_time = round(end_time - start_time, 3)
        each_time = round(use_time / len(infer_con_f), 3)
        result_log.write('For k={}, Cov is {}, total time {}s, time per item {}s.\n'.format(
            k, acc, use_time, each_time))
        print('For k={}, Cov is {}, total time {}s, time per item {}s.'.format(
            k, acc, use_time, each_time))
        output_pro_txt.close()
    result_log.close()
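# A reduced, self-contained sketch of the retrieval core above (toy data and
# names are illustrative): L2-normalise the BM25 rows, index the "concept"
# rows in a KDTree, and query the k nearest neighbours for a "diagnosis" row.
# On unit vectors, Euclidean distance is a monotone function of cosine
# similarity, which is why the normalize() calls above matter.
import numpy as np
from sklearn.neighbors import KDTree
from sklearn.preprocessing import normalize
from gensim.summarization.bm25 import get_bm25_weights

corpus = [['fever', 'cough'], ['cough'], ['fever'], ['fracture', 'arm']]
weights = normalize(np.array(get_bm25_weights(corpus)))
tree = KDTree(weights[1:], metric='euclidean')  # rows 1..3 play the concepts
dist, idx = tree.query(weights[:1], k=2)
print(idx[0], dist[0])  # the two concepts nearest to the "diagnosis" row 0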