def bm25(self, query, categorized_qa):
    """Rank the answer base against ``query`` with BM25.

    When ``args.categorize_question`` is set, the BM25 model is rebuilt on
    every call: from ``categorized_qa['cut_answers']`` when that list is
    non-empty, otherwise from ``self.cut_answers``. When the flag is not set,
    the model already stored on ``self.bm25_model`` is used unchanged and
    ``categorized_qa`` is never inspected (the original comment states it is
    ``None`` in that case).

    :param query: tokenized query passed to ``get_scores``.
    :param categorized_qa: mapping with ``'cut_answers'`` and
        ``'uncut_answers'`` entries; only read when categorization is on.
    :return: ``(sorted_scores, max_pos, answers)`` where ``sorted_scores``
        are the scores in descending order divided by ``len(query) + 1``,
        ``max_pos`` are the corpus indices ordered by descending score, and
        ``answers`` is whatever ``__max_pos2answers`` returns for those
        indices.
    """
    # The categorized subset is only usable when categorization is enabled
    # AND the subset is non-empty; otherwise fall back to the full corpus.
    use_categorized = (
        args.categorize_question
        and len(categorized_qa['cut_answers']) != 0
    )

    # The model is only instantiated here in the categorized mode; in every
    # other case it was already built in __init__().
    if args.categorize_question:
        if use_categorized:
            self.bm25_model = BM25(categorized_qa['cut_answers'])
        else:
            self.bm25_model = BM25(self.cut_answers)

    raw_scores = self.bm25_model.get_scores(query)

    # Scores from high to low, each divided by (query length + 1).
    length_norm = len(query) + 1
    sorted_scores = [
        score / length_norm for score in sorted(raw_scores, reverse=True)
    ]

    # Indices (not values) ordered from highest to lowest score.
    max_pos = np.argsort(raw_scores)[::-1]

    # Pull the real answers from the answer base that matches the corpus
    # the model was built on.
    if use_categorized:
        answer_base = categorized_qa['uncut_answers']
    else:
        answer_base = self.uncut_answers
    answers = self.__max_pos2answers(max_pos, answer_base)

    return sorted_scores, max_pos, answers
def test_epsilon(self):
    """Changing the epsilon parameter should give consistent results.

    The test picks the first document that contains at least one word whose
    idf is negative in a default-constructed model, then checks through all
    three entry points (``BM25.get_score``, ``iter_bm25_bow`` and
    ``get_bm25_weights``) that ``epsilon=1.0`` scores that document against
    itself higher than ``epsilon=2.0`` does.
    """
    corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
    first_epsilon = 1.0
    second_epsilon = 2.0

    bm25 = BM25(corpus)
    words_with_negative_idfs = {
        word for word, idf in bm25.idf.items() if idf < 0
    }
    # First (index, document) pair that shares a word with the negative-idf set.
    index, doc = next(
        (index, document)
        for index, document in enumerate(corpus)
        if words_with_negative_idfs & set(document)
    )

    # 1) Direct scoring through BM25 objects.
    first_bm25 = BM25(corpus, epsilon=first_epsilon)
    second_bm25 = BM25(corpus, epsilon=second_epsilon)
    first_score = first_bm25.get_score(doc, index)
    second_score = second_bm25.get_score(doc, index)
    self.assertGreater(first_score, second_score)

    # 2) Bag-of-words iterator: the first yielded item is read as a mapping
    #    and looked up at ``index``.
    first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
    second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
    first_score = dict(next(iter(first_iter)))[index]
    second_score = dict(next(iter(second_iter)))[index]
    self.assertGreater(first_score, second_score)

    # 3) Weight matrix helper: the entry at ``index`` is compared.
    first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
    second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
    first_score = first_weights[index]
    second_score = second_weights[index]
    self.assertGreater(first_score, second_score)
def bm25_sim_matrix(self, src_indices, tgt_indices, index_source=True, monitor_progress=True):
    """Build a max-normalised BM25 score matrix between source and target lines.

    Lines are fetched with ``self.get_src_line`` / ``self.get_tgt_line`` and
    passed through ``clean_text``. The BM25 model is built on the source lines
    when ``index_source`` is true (target lines act as queries) and on the
    target lines otherwise (source lines act as queries).

    :param src_indices: identifiers accepted by ``self.get_src_line``.
    :param tgt_indices: identifiers accepted by ``self.get_tgt_line``.
    :param index_source: which side is indexed by BM25.
    :param monitor_progress: show a tqdm bar that advances once per source row.
    :return: array of shape ``(len(src_indices), len(tgt_indices))`` divided
        by its maximum. If the matrix is empty or its maximum is 0 the
        unnormalised matrix is returned instead of dividing by zero.
    """
    base_matrix = np.zeros((len(src_indices), len(tgt_indices)))
    print(base_matrix.shape)

    src_lines = [clean_text(self.get_src_line(s)) for s in src_indices]
    tgt_lines = [clean_text(self.get_tgt_line(t)) for t in tgt_indices]

    bm25 = BM25(src_lines if index_source else tgt_lines)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    # The bar is advanced once per source row, so its total must be the
    # number of source rows regardless of which side is indexed.
    pbar = tqdm(total=len(src_lines), desc="BM25 matrix") if monitor_progress else None
    try:
        for src_id, src_line in enumerate(src_lines):
            for tgt_id, tgt_line in enumerate(tgt_lines):
                if index_source:
                    # Query = target line, scored against indexed source doc.
                    bm25_res = bm25.get_score(tgt_line, src_id, average_idf)
                else:
                    # Query = source line, scored against indexed target doc.
                    bm25_res = bm25.get_score(src_line, tgt_id, average_idf)
                base_matrix[src_id, tgt_id] = bm25_res
            if pbar is not None:
                pbar.update()
    finally:
        if pbar is not None:
            pbar.close()

    # np.max raises on an empty array, and a zero maximum would turn the
    # whole matrix into NaN/inf; return the raw matrix in both cases.
    if base_matrix.size == 0:
        return base_matrix
    peak = np.max(base_matrix)
    if peak == 0:
        return base_matrix
    return base_matrix / peak
def test_b(self):
    """Changing the b parameter should give consistent results.

    Document 0 of ``common_texts`` is scored against itself with ``b=1.0``
    and ``b=2.0`` through ``BM25.get_score``, ``iter_bm25_bow`` and
    ``get_bm25_weights``; each time the smaller ``b`` must yield the smaller
    score.
    """
    index = 0
    corpus = common_texts
    doc = corpus[index]
    low_b, high_b = 1.0, 2.0

    # Direct scoring through BM25 objects.
    low_model = BM25(corpus, b=low_b)
    high_model = BM25(corpus, b=high_b)
    low_score = low_model.get_score(doc, index)
    high_score = high_model.get_score(doc, index)
    self.assertLess(low_score, high_score)

    # First item produced by the bag-of-words iterator.
    low_bow = iter_bm25_bow(corpus, b=low_b)
    high_bow = iter_bm25_bow(corpus, b=high_b)
    low_score = dict(next(iter(low_bow)))[index]
    high_score = dict(next(iter(high_bow)))[index]
    self.assertLess(low_score, high_score)

    # Entry ``index`` of the weights helper.
    low_weights = get_bm25_weights(corpus, b=low_b)
    high_weights = get_bm25_weights(corpus, b=high_b)
    low_score = low_weights[index]
    high_score = high_weights[index]
    self.assertLess(low_score, high_score)
def __init__(self, context_filters: Filters, author_filters: Filters, num_filters: int, embed_size:int, num_layers: int, path_to_weights: PathOrStr, data: BaseData, evaluate: bool = True, show_attention: bool = False):
    """Load a trained ``NeuralCitationNetwork`` and build the BM25 title corpus.

    :param context_filters: forwarded to ``NeuralCitationNetwork``.
    :param author_filters: forwarded to ``NeuralCitationNetwork``.
    :param num_filters: forwarded as both ``num_filters`` and ``hidden_size``.
    :param embed_size: forwarded to ``NeuralCitationNetwork``.
    :param num_layers: forwarded to ``NeuralCitationNetwork``.
    :param path_to_weights: file given to ``torch.load`` for the state dict.
    :param data: container providing ``cntxt``, ``ttl``, ``aut`` fields and
        ``train`` / ``test`` splits with ``examples``.
    :param evaluate: if true, the corpus comes from the test examples and the
        context/title index mapping is computed; otherwise the corpus comes
        from the train examples and title lookup tables are unpickled from
        ``assets/``.
    :param show_attention: forwarded to the model and stored on the instance.
    """
    self.data = data
    self.context = self.data.cntxt
    self.title = self.data.ttl
    self.authors = self.data.aut
    self.pad = self.title.vocab.stoi['<pad>']
    self.criterion = nn.CrossEntropyLoss(ignore_index = self.pad, reduction="none")

    self.model = NeuralCitationNetwork(
        context_filters=context_filters,
        author_filters=author_filters,
        context_vocab_size=len(self.context.vocab),
        title_vocab_size=len(self.title.vocab),
        author_vocab_size=len(self.authors.vocab),
        pad_idx=self.pad,
        num_filters=num_filters,
        authors=True,
        embed_size=embed_size,
        num_layers=num_layers,
        hidden_size=num_filters,
        dropout_p=0.2,
        show_attention=show_attention,
    )
    self.model.to(DEVICE)
    # strict=False: keys missing from / unexpected in the checkpoint are tolerated.
    state_dict = torch.load(path_to_weights, map_location=DEVICE)
    self.model.load_state_dict(state_dict, strict=False)
    self.model.eval()
    logger.info(self.model.settings)

    self.eval = evaluate
    self.show_attention = show_attention

    # instantiate examples, corpus and bm25 depending on mode
    logger.info(f"Creating corpus in eval={self.eval} mode.")
    if self.eval:
        self.examples = data.test.examples
    else:
        # NOTE(review): the train examples are concatenated with themselves
        # three times. The corpus below is de-duplicated, so this only
        # inflates the logged sample count — confirm whether other splits
        # were intended here.
        self.examples = data.train.examples + data.train.examples+ data.train.examples

    logger.info(f"Number of samples in BM25 corpus: {len(self.examples)}")
    # Unique cited titles, as tuples so they are hashable.
    self.corpus = list(set(tuple(example.title_cited) for example in self.examples))
    self.bm25 = BM25(self.corpus)

    if self.eval:
        self.context_cited_indices = self._get_context_title_indices(self.examples)
    else:
        # load mapping to give proper recommendations
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # these assets must come from a trusted source.
        with open("assets/title_tokenized_to_full.pkl", "rb") as fp:
            self.title_to_full = pickle.load(fp)
        with open("assets/title_to_aut_cited.pkl", "rb") as fp:
            self.title_aut_cited = pickle.load(fp)
def bm25(k, queries, collection, idf_base):
    """
    Computes BM25 scores for each query against the documents in the
    collection and returns the indices of the best-scoring documents.

    As a side effect, the model's idf table is pickled to
    ``public_data/vocab/bm25_word2weight_<idf_base>.pkl`` if that file does
    not exist yet.

    :param k: number of top-ranked document indices kept per query
    :param queries: all query vectors
    :param collection: all collection vectors
    :param idf_base: identifier used in the idf file name
        ("above_threshold", "below_threshold", "all")
    :return: numpy array built from one list of document indices per query,
        ordered by descending score
    """
    model = BM25(collection)

    idf_path = "public_data/vocab/bm25_word2weight_%s.pkl" % idf_base
    if not os.path.exists(idf_path):
        with open(idf_path, 'wb') as out:
            pickle.dump(model.idf, out, protocol=4)

    idf_values = model.idf.values()
    avg_idf = sum(idf_values) / len(idf_values)

    ranked = []
    for query in queries:
        scores = model.get_scores(query, avg_idf)
        # Stable descending sort of document positions by their score.
        order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        ranked.append(order[:k])

    return np.array(ranked)
def getContext(self, sentences, question):
    """Return the sentences most relevant to ``question`` as one context string.

    Every sentence is tokenized with ``self.tokenize``, scored against the
    tokenized question with BM25, and the ``self.numberOfResults`` best
    sentences (fewer if not enough sentences exist) are joined in ranking
    order.

    :param sentences: iterable of raw sentences.
    :param question: raw question text.
    :return: the selected sentences, each rebuilt by joining its tokens with
        single spaces and separated from the next sentence by a single space.
        An empty string is returned when ``sentences`` is empty.
    """
    documents = [self.tokenize(sent) for sent in sentences]
    if not documents:
        # Nothing to rank; avoids building a model on an empty corpus.
        return ""

    scores = BM25(documents).get_scores(self.tokenize(question))

    # Stable sort: sentences with equal scores keep their original order.
    ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
    selected = ranked[:self.numberOfResults]

    # Join sentences with a space so the last token of one sentence is not
    # fused with the first token of the next.
    return " ".join(" ".join(documents[idx]) for idx in selected)
def calculate_scores(question, candidate_set):
    """Score ``question`` against every document in ``candidate_set`` with BM25.

    :param question: tokenized query passed to ``BM25.get_score``.
    :param candidate_set: corpus the BM25 model is built on.
    :return: list with one score per candidate, in corpus order.
    """
    model = BM25(candidate_set)
    idf_values = model.idf.values()
    average_idf = sum(float(value) for value in idf_values) / len(model.idf)
    return [
        model.get_score(question, position, average_idf)
        for position, _candidate in enumerate(candidate_set)
    ]
def get_other_results(queries, qml_rankings, num_ranks=None):
    """Compute BM25, GloVe and RM3 rankings for each query.

    Documents are loaded through ``read_cache`` and ordered by the ids stored
    in ``./document_title_to_id.json``. For every (tokenized query,
    ``qml_ranking``) pair one ranking is produced by each of
    ``_get_bm25_ranking``, ``_get_glove_ranking`` and ``_get_rm3_ranking``.

    :param queries: raw queries, tokenized with the same ``Tokenizer`` as the
        documents.
    :param qml_rankings: one ranking per query, forwarded to the helpers.
    :param num_ranks: accepted but not used by this function.
    :return: ``(bm25_rankings, glove_rankings, rm3_rankings)``, three lists
        aligned with ``queries``.
    """
    document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    # The fallback only prints; the id map is expected to be cached already.
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    document_id_to_title = _.invert(document_title_to_id)

    # Documents in id order 0..N-1.
    documents = []
    for doc_id in range(len(document_id_to_title)):
        documents.append(document_lookup[document_id_to_title[doc_id]])

    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json', lambda: tokenizer.process_all(documents))
    tokenized_queries = tokenizer.process_all(queries)

    bm25 = BM25(tokenized_documents)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    docs_lms = _calc_docs_lms(bm25.df, bm25.f)

    bm25_rankings, glove_rankings, rm3_rankings = [], [], []
    for query_tokens, qml_ranking in progressbar(zip(tokenized_queries, qml_rankings)):
        bm25_rankings.append(
            _get_bm25_ranking(bm25, qml_ranking, query_tokens,
                              average_idf=average_idf))
        glove_rankings.append(
            _get_glove_ranking(glove, tokenized_documents, qml_ranking,
                               query_tokens))
        rm3_rankings.append(
            _get_rm3_ranking(docs_lms, bm25.f, qml_ranking, query_tokens))

    return bm25_rankings, glove_rankings, rm3_rankings
def build_index():
    """Build the BM25 index plus positional, sentiment and attribute-idf data.

    Reads ``reviews`` and ``self`` from the enclosing scope. For every review
    id it records the id in ``self.review_ids``, tokenizes the review text
    (``'text'`` if present, else ``'review'``), stores the VADER compound
    score under ``reviews[rid]['sentiment']``, fills
    ``self.position_index[rid]`` with token -> list of positions, and counts
    extraction attributes in ``self.idf``. Afterwards each count is replaced
    by ``log2(total / count)``.

    :return: the ``BM25`` model built on the tokenized reviews.
    """
    # build bm25 index
    corpus = []
    total = 0.0
    analyzer = SentimentIntensityAnalyzer()

    for rid in reviews:
        self.review_ids.append(rid)

        sent = reviews[rid]['text'] if 'text' in reviews[rid] else reviews[rid]['review']
        tokens = gensim.utils.simple_preprocess(sent.lower())
        reviews[rid]['sentiment'] = analyzer.polarity_scores(sent)['compound']
        corpus.append(tokens)

        # token -> every position at which it occurs in this review
        positions = self.position_index[rid] = {}
        for pos, token in enumerate(tokens):
            if token in positions:
                positions[token].append(pos)
            else:
                positions[token] = [pos]

        for ext in reviews[rid]['extractions']:
            attr = ext['attribute']
            if attr in self.idf:
                self.idf[attr] += 1
            else:
                self.idf[attr] = 1
            # NOTE(review): assumes `total` counts extractions (one increment
            # per extraction), not reviews — the nesting is not recoverable
            # from the flattened source; confirm against the original file.
            total += 1

    bm25 = BM25(corpus)

    # Turn raw attribute counts into idf weights.
    for attr in self.idf:
        self.idf[attr] = math.log2(total / self.idf[attr])
    return bm25
def __init__(self, raw_corpus: list, processed_corpus: list = None):
    """Set up the spaCy pipeline and build a BM25 model over the corpus.

    :param raw_corpus: the original documents; stored as a numpy array once
        preprocessing is done.
    :param processed_corpus: already-preprocessed documents. When ``None``,
        the corpus is produced by ``__pre_process_corpus`` (called while
        ``self.raw_corpus`` still holds the plain list).
    """
    self.__pipeline = spacy.load('en_core_web_sm',
                                 disable=["parser", "tagger", "ner"])
    self.raw_corpus = raw_corpus

    if processed_corpus is None:
        self.processed_corpus = self.__pre_process_corpus()
    else:
        self.processed_corpus = processed_corpus

    # Converted only after preprocessing has had access to the raw list.
    self.raw_corpus = np.array(self.raw_corpus)
    self.model = BM25(self.processed_corpus)
def load_bm25(self):
    """
    Tokenize each candidate's ``'description'`` with ``text2tokens`` to form
    ``self.corpus``, build ``self.bm25`` from it (used for keyword ranking),
    and store the mean of the model's idf values in ``self.average_idf``.
    """
    self.corpus = [
        text2tokens(candidate['description']) for candidate in self.candidates
    ]
    self.bm25 = BM25(self.corpus)

    idf_values = self.bm25.idf.values()
    self.average_idf = sum(float(val) for val in idf_values) / len(self.bm25.idf)
def main():
    """Interactive BM25 search over the documents in ``../test_files``.

    Builds the model once, then repeatedly prompts for a query, prints the
    (index, score) pairs for every document, and prints the output of
    ``get_ranked_docs``. The loop ends cleanly on end-of-input (Ctrl-D) or
    Ctrl-C instead of terminating with a traceback.
    """
    docs = get_list_of_docs('../test_files')
    corpus, doc_titles = get_words(docs)
    bm = BM25(corpus)

    while True:
        try:
            query = input('Query:')
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line and leave the loop.
            print()
            break
        scores = bm.get_scores(query.split(' '))
        print(list(enumerate(scores)))
        print(get_ranked_docs(doc_titles, scores))
def train_bm25Model(Dataset: List[Sample], Stopwords: set) -> BM25:
    """Build a BM25 model from the source and target texts of ``Dataset``.

    Each sample contributes two documents, its ``source_text`` followed by its
    ``target_text``, both split on single spaces with stopwords removed.

    :param Dataset: samples exposing ``source_text`` and ``target_text``.
    :param Stopwords: words to drop from every document.
    :return: the fitted ``BM25`` model.
    """
    def _without_stopwords(text):
        # Remove stopwords from a space-separated text.
        return [word for word in text.split(" ") if word not in Stopwords]

    corpus = []
    for cur_sample in Dataset:
        for text in (cur_sample.source_text, cur_sample.target_text):
            corpus.append(_without_stopwords(text))

    # Feed the model; corpus is a list of token lists.
    return BM25(corpus)
def __init__(self, texts):
    """Build a BM25 model over ``texts`` and cache its average idf.

    :param texts: non-empty list whose first element is a list (checked with
        an assertion); presumably a list of token lists.

    Sets ``self.model``, ``self.doc_num`` (number of texts) and
    ``self.avg_idf`` (mean of the model's idf values), and logs the build time.
    """
    assert isinstance(texts, list) and len(texts) > 0 and isinstance(texts[0], list)
    started_at = time.time()

    self.model = BM25(texts)
    self.doc_num = len(texts)

    idf_table = self.model.idf
    idf_sum = sum(float(idf_table[term]) for term in idf_table.keys())
    self.avg_idf = idf_sum / len(idf_table.keys())

    logger.warning("Build BM25 model use time %s", time.time()-started_at)
def __init__(self, corpus=None):
    """Optionally fit the TF-IDF and BM25 models on ``corpus``.

    :param corpus: collection of whitespace-separated strings, or ``None``.
        When ``None`` only ``self.corpus`` is set and no model is built.

    When a corpus is given this sets ``self.tfidf_vectorizer``,
    ``self.corpus_vec`` (the corpus transformed by that vectorizer),
    ``self.bm25_model`` (built on the whitespace-split documents) and
    ``self.average_idf`` (mean of the BM25 idf values).
    """
    self.corpus = corpus
    # Identity check: `!= None` is evaluated element-wise by array-like
    # containers and cannot be used as a plain truth value.
    if self.corpus is None:
        return

    self.tfidf_vectorizer = self.get_tfidf_vectorizer(self.corpus)
    self.corpus_vec = self.tfidf_vectorizer.transform(self.corpus)
    self.bm25_model = BM25([s.split() for s in corpus])

    idf_values = self.bm25_model.idf.values()
    self.average_idf = sum(float(val) for val in idf_values) / len(self.bm25_model.idf)
def __init__(self, docs):
    """Clean and tokenize ``docs``, build BM25, and derive an average idf.

    :param docs: sized collection of raw documents.

    Each document goes through ``clean_claim_description(doc, True)`` and is
    split on whitespace into ``self.docs``; ``self.bm25_model`` is built on
    those token lists. ``self.avg_idf`` is the mean of the idf vector of a
    ``TfidfVectorizer`` (English stop words) fitted on the re-joined
    documents, not of the BM25 model's own idf table.
    """
    self.docs = []
    progress = tqdm(docs, desc='LSI process docs', total=len(docs))
    for doc in progress:
        cleaned = clean_claim_description(doc, True)
        self.docs.append(cleaned.split())

    self.bm25_model = BM25(self.docs)

    joined_docs = [' '.join(tokens) for tokens in self.docs]
    vectorizer = TfidfVectorizer(stop_words='english', )
    vectorizer.fit(joined_docs)
    self.avg_idf = vectorizer.idf_.mean()
def process(self, message, **kwargs):
    """Process an incoming message.

    Finds the stored question that best matches ``message.text`` among the
    questions of the message's intent and attaches the result to ``message``.

    Steps, as implemented below:

    1. Vectorise the message text with ``self.query`` and compare it by
       cosine similarity with the stored question vectors (``self.q_a_v``)
       whose entry in ``self.intents_vec`` equals the message intent.
    2. Keep the (up to) 10 most similar questions; merge their indices with
       any ``"full_index"`` already present on the message.
    3. Re-score the merged candidates with both cosine similarity and BM25,
       min-max scale the two features, and combine them with weights
       0.8 (cosine) and 0.2 (BM25).
    4. Store ``"top_similar_value"``, ``"best_match"`` and ``"response"`` on
       the message (``"full_index"`` is stored as well, without
       ``add_to_output``).

    The component relies on ``message.data["intent"]["name"]`` having been
    set by an earlier component; an intent missing from
    ``self.base_intents`` makes ``list.index`` raise ``ValueError``.
    """
    sentence = message.text
    # One vector per tokenized sentence (here: just the message text).
    question_vectors = [
        self.query(sentence)
        for sentence in self.get_tokenized_qa_words([sentence])
    ]
    intent = message.data["intent"]["name"]
    intent_num = self.base_intents.index(intent)
    # Positions of all stored questions that belong to this intent.
    intent_index = np.argwhere(self.intents_vec == intent_num).reshape(-1)
    intent_question = self.q_a_v[intent_index]
    sim_mat = cosine_similarity(question_vectors, intent_question)
    # Last 10 of the ascending argsort, reversed -> most similar first.
    top_posi = np.argsort(sim_mat, axis=1)[0][-10:][::-1]
    full_index = intent_index[top_posi]
    sim_value = sim_mat[0, top_posi]
    if "full_index" not in message.data:
        message.set("full_index", full_index.tolist())
    else:
        # Union with candidates already on the message; going through a set
        # drops duplicates and does not preserve the similarity ordering.
        exist = message.get("full_index")
        full_index = list(set(exist + full_index.tolist()))
        message.set("full_index", full_index)
    # NOTE(review): sim_value covers only this call's top candidates, not the
    # merged full_index — confirm consumers expect that.
    sim_value = json.dumps(sim_value.tolist(), ensure_ascii=False)
    message.set("top_similar_value", sim_value, add_to_output=True)
    # ========================================
    # Second stage: re-rank the merged candidate set.
    sim_mat_part = cosine_similarity(question_vectors, self.q_a_v[full_index])[0]
    candicate_sentences = self.questions[full_index].tolist()
    # Put the query first so it is tokenized together with the candidates.
    candicate_sentences.insert(0, sentence)
    sentence_tokens = self.get_tokenized_qa_words(candicate_sentences)
    # BM25 corpus = candidates only; element 0 (the query) is scored against it.
    bm25_object = BM25(sentence_tokens[1:])
    bm25_score = np.array(bm25_object.get_scores(sentence_tokens[0]))
    # Rows = candidates, columns = (cosine similarity, BM25 score).
    feature_score = np.array([sim_mat_part, bm25_score]).T
    scaler = MinMaxScaler()
    scaler.fit(feature_score)
    feature_score = scaler.transform(feature_score)
    weight = np.array([0.8, 0.2])
    final_score = (feature_score * weight).sum(axis=1)
    # final_score = sim_mat_part + bm25_score
    best_match_question = self.questions[full_index[np.argmax(final_score)]]
    message.set("best_match", best_match_question, add_to_output=True)
    responses = self.qa_map.get(best_match_question)
    message.set("response", responses, add_to_output=True)
def main():
    """Compute BM25 idf values for the cached documents and dump them as JSON.

    Documents are read through ``read_cache`` from ``./doc_lookup.json``,
    ordered by the ids produced by ``create_id_lookup``, tokenized (cached in
    ``tok_docs.json``), and the resulting ``BM25.idf`` table is written to
    ``./doc_word_idf.json``.
    """
    document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    document_title_to_id = create_id_lookup(document_lookup.keys())
    document_id_to_title = _.invert(document_title_to_id)

    # Documents in id order 0..N-1.
    documents = []
    for doc_id in range(len(document_id_to_title)):
        title = document_id_to_title[doc_id]
        documents.append(document_lookup[title])

    tokenizer = Tokenizer()
    tokenized_documents = read_cache(
        'tok_docs.json', lambda: tokenizer.process_all(documents))

    model = BM25(tokenized_documents)
    with open('./doc_word_idf.json', 'w+') as fh:
        json.dump(model.idf, fh)
def wrapper(query):
    """Retrieve the top resolutions for ``query`` and re-rank them.

    The query is preprocessed and scored against the two module-level BM25
    models ``bm251`` and ``bm252``; the combined score per row of ``df1`` is
    ``(0.7 * score1 + score2) / 2``. The 10 best rows are taken (best first),
    then each retrieved resolution is scored against the other retrieved
    resolutions with a fresh BM25 model, and the final order is by the mean of
    that cross score and the retrieval confidence.

    :param query: raw query text.
    :return: ``(confidences, final_resolutions)``. ``confidences`` are the
        combined retrieval scores in retrieval order (best first);
        ``final_resolutions`` are the same resolutions in re-ranked order, so
        the two lists are not aligned element by element. Every retrieved
        resolution appears exactly once, even when several share a score.
    """
    #print("reading generated csv")
    print("pre processing query")
    query = pre_processing(query)
    query_tokens = query.split()

    print("bm25 generation1")
    average_idf1 = sum(float(val) for val in bm251.idf.values()) / len(bm251.idf)
    bm25_scores1 = bm251.get_scores(query_tokens, average_idf1)

    print("bm25 generation2")
    average_idf2 = sum(float(val) for val in bm252.idf.values()) / len(bm252.idf)
    bm25_scores2 = bm252.get_scores(query_tokens, average_idf2)

    arr = np.array([(0.7 * bm25_scores1[i] + bm25_scores2[i]) / 2
                    for i in range(df1.shape[0])])
    # Ten largest combined scores, best first.
    top = arr.argsort(axis=0)[-10:][::-1]

    print("getting confidences")
    confidences = [arr[i] for i in top]
    resolutions = [df1.Resolution[i] for i in top]

    print("Using page rank algorithm to swap the retrieved resolutions")
    print("\n\n")
    score = []
    for i, resolution in enumerate(resolutions):
        # Score each resolution against all the *other* retrieved ones.
        corpus = [other.split() for j, other in enumerate(resolutions) if j != i]
        if not corpus:
            # Single retrieved resolution: there is nothing to compare with.
            score.append(0.0)
            continue
        bm25 = BM25(corpus)
        average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
        bm25_scores = bm25.get_scores(resolution.split(), average_idf)
        score.append(sum(bm25_scores))

    score = [(cross + conf) / 2 for cross, conf in zip(score, confidences)]

    # Rank by position rather than through a dict keyed by score, so that
    # resolutions with identical scores are not overwritten and lost.
    order = sorted(range(len(resolutions)), key=lambda i: score[i], reverse=True)
    final_resolutions = [resolutions[i] for i in order]
    return (confidences, final_resolutions)
def _init_bm25(self):
    """Build a BM25 model from the ``wid`` column of ``../data/question_id.csv``.

    :return: ``(bm25_model, bm25_dim, bm25_avg_idf)`` — the fitted model, the
        number of rows in the CSV, and the mean of the model's idf values.
    """
    from gensim.summarization.bm25 import get_bm25_weights, BM25

    questions = pd.read_csv('../data/question_id.csv')
    raw_questions = questions.wid.tolist()
    bm25_dim = len(raw_questions)

    tokenized = [question.split() for question in raw_questions]
    bm25_model = BM25(tokenized)

    idf_table = bm25_model.idf
    bm25_avg_idf = sum(float(idf_table[term]) for term in idf_table.keys()) / len(idf_table.keys())
    return bm25_model, bm25_dim, bm25_avg_idf
def __fill_bm25_scores(self):
    """
    Calculates BM25 scores of each document in corpus for a query.

    The corpus is ``self.data[self.column]`` and the query is ``self.query``,
    both tokenized through ``Input(...).tokens``. The scores are written to
    the ``CONST.COL_BM25`` column of ``self.data``.
    """
    documents = self.data[self.column]
    corpus = [Input(doc).tokens for doc in documents]

    model = BM25(corpus)
    idf_total = sum(float(val) for val in model.idf.values())
    average_idf = idf_total / float(len(model.idf))

    query_tokens = Input(self.query).tokens
    self.data[CONST.COL_BM25] = model.get_scores(query_tokens, average_idf)
def generate_conv_data(in_path, subreddit):
    """Build train/valid/test ranking instances for one subreddit.

    Rows of the CSV at ``in_path`` are filtered to ``subreddit``. A BM25 model
    is built over the space-split ``relevant_response`` column, and for every
    row the ``negative_samples`` (module-level setting) highest-scoring
    responses for its query are used as negative candidates. If the row's own
    response is among them it is replaced by a randomly drawn document.

    :param in_path: path of the conversations CSV (columns ``subreddit``,
        ``query``, ``relevant_response``).
    :param subreddit: value of the ``subreddit`` column to keep.
    :return: ``(train, valid, test, indexes)``. The first three are DataFrames
        with columns ``query``, ``relevant_doc``, ``non_relevant_1..N`` split
        80/10/10 in file order; ``indexes`` holds the original row label and
        subreddit of every instance.
    """
    df_conv = pd.read_csv(in_path, lineterminator= "\n")
    # .copy() so the column assignments below act on an independent frame
    # instead of on a filtered view of the full table.
    df_conv = df_conv[df_conv['subreddit'] == subreddit].copy()
    df_conv['query'] = df_conv['query'].astype(str)
    df_conv['relevant_response'] = df_conv['relevant_response'].astype(str)

    documents = np.array(df_conv['relevant_response'])
    corpus = [context.split(" ") for context in documents]
    bm25 = BM25(corpus)

    cache = {}  # query text -> top-scoring document positions
    instances = []
    index_subreddit = []
    # `pos` is the row's position in `documents`; `idx` is its DataFrame label.
    # After filtering the two differ, and BM25 results are positional.
    for pos, (idx, r) in enumerate(tqdm(df_conv.iterrows(), total=len(df_conv))):
        query = r['query']
        if query not in cache:
            scores = np.array(bm25.get_scores(str(query).split(" ")))
            cache[query] = heapq.nlargest(negative_samples, range(len(scores)), scores.take)
        # Work on a copy: the replacement below is specific to this row and
        # must not leak into other rows that share the same query.
        max_positions = list(cache[query])

        # Swap the row's own response out of the negatives. With a single
        # document no replacement exists, so the loop is skipped.
        while len(documents) > 1 and pos in max_positions:
            new_doc = random.sample(range(len(documents)), 1)[0]
            max_positions[max_positions.index(pos)] = new_doc

        candidates = documents[max_positions]
        instances.append([r['query'], r['relevant_response']] + list(candidates))
        index_subreddit.append([idx, r['subreddit']])

    # random.shuffle(instances) #<-- this shouldnt be here. instances from same dialogue will be spread over different data splits
    train_end = int(0.8*len(instances))
    valid_end = int(0.9*len(instances))
    train = instances[0:train_end]
    valid = instances[train_end:valid_end]
    test = instances[valid_end:]

    cols = ["query", "relevant_doc"] + \
        ["non_relevant_"+str(i+1) for i in range(negative_samples)]
    train, valid, test = (pd.DataFrame(train, columns=cols),
                          pd.DataFrame(valid, columns=cols),
                          pd.DataFrame(test, columns=cols))
    indexes = pd.DataFrame(index_subreddit, columns = ['index', 'subreddit'])
    return train, valid, test, indexes
def _min_max_normalise(series):
    """Scale ``series`` linearly to [0, 1]; a constant series maps to all zeros."""
    low = series.min()
    span = series.max() - low
    if span == 0:
        # All values identical: avoid 0/0, which would yield NaN everywhere.
        return pd.Series(0.0, index=series.index)
    return (series - low) / span


def query_to_document(query):
    """ Takes string question and returns the name of the document which the question is likely to be present in.

    The top 50 rows of the BM25, TFIDF and Doc2Vec result frames are
    outer-merged on ``Document`` (missing scores become 0), each score column
    is min-max normalised, and ``model_perceptron`` is asked for a prediction
    per document from the three normalised scores. The document whose
    prediction is largest (first one on ties) is returned.

    :param query: the question text.
    :return: the ``Document`` value of the selected row.
    """
    bm25_df = BM25(query).head(n=50)  # gets the dataframe of BM25 with scores and ranks of documents
    tfidf_df = TFIDF(query).head(n=50)  # gets the dataframe of TFIDF with scores and ranks of documents
    doc2vec_df = Doc2Vec(query).head(n=50)  # gets the dataframe of Doc2Vec with scores and ranks of documents

    # combining all the dataframes
    final_df = pd.merge(pd.merge(bm25_df, tfidf_df, on=['Document'], how='outer'),
                        doc2vec_df, on=['Document'], how='outer')
    final_df.fillna(0, inplace=True)

    # Normalising the scores between 0 and 1
    bm25_normalised = _min_max_normalise(final_df.Score_BM25)
    tfidf_normalised = _min_max_normalise(final_df.Score_TFIDF)
    doc2vec_normalised = _min_max_normalise(final_df.Score_Doc2Vec)

    # Getting the total score based on the previous overall accuracy.
    # It is stored on the frame but not used for the prediction below.
    final_df['total_score'] = (0.01243557 * bm25_normalised
                               + 0.29682442 * tfidf_normalised
                               - 0.01673123 * doc2vec_normalised)
    final_df['bm25_normalised'] = bm25_normalised
    final_df['tfidf_normalised'] = tfidf_normalised
    final_df['doc2vec_normalised'] = doc2vec_normalised

    final_document_list = final_df.Document.values[:]
    final_scores = np.array(
        final_df.loc[:, ['bm25_normalised', 'tfidf_normalised', 'doc2vec_normalised']])

    # One prediction per document, each from a (1, 3) feature row.
    prediction_scores = []
    for scores in final_scores:
        features = np.array(scores).reshape(1, 3)
        prediction_scores.append(model_perceptron.predict(features))

    return final_document_list[np.array(prediction_scores).argmax()]
def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
    """Collect all bugs, shuffle them, and build a BM25 model over their text.

    :param cleanup_urls: forwarded to the parent constructor.
    :param nltk_tokenizer: forwarded to the parent constructor.

    ``self.corpus`` holds the preprocessed text of every bug returned by
    ``bugzilla.get_bugs()`` and ``self.bug_ids`` the matching ids; both are
    reordered with the same random permutation before ``self.model`` is built.
    """
    super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

    corpus = self.corpus = []
    bug_ids = self.bug_ids = []
    for bug in bugzilla.get_bugs():
        corpus.append(self.text_preprocess(self.get_text(bug)))
        bug_ids.append(bug["id"])

    # One shared permutation keeps texts and ids aligned.
    order = list(range(len(corpus)))
    random.shuffle(order)
    self.corpus = [corpus[position] for position in order]
    self.bug_ids = [bug_ids[position] for position in order]

    self.model = BM25(self.corpus)
def __init__(self, tokenizer, corpus, idxs2id):
    """
    Create the search engine around a BM25 model built on ``corpus``.

    Parameters
    ----------
    tokenizer : gensim.corpora.Dictionary
        Word tokenizer.
    corpus : gensim.corpora.mmcorpus.MmCorpus
        Bag-of-words formatted corpus of documents.
    idxs2id
        Stored as-is on the instance; presumably maps corpus positions to
        external document ids (confirm against callers).
    """
    self.tokenizer = tokenizer
    self.idxs2id = idxs2id
    self.preprocessor = TextPreprocessor()

    self.corpus = corpus
    self.internal_engine = BM25(self.corpus)
    print("BM25 engine loaded")
def get_g_bm25_model(self, text_pool):
    """
    Build a BM25 model after dropping tokens that occur only once overall.

    :param text_pool: 2d list, one list of tokens per document
    :return: BM25 model fitted on the filtered documents (documents are kept
        even if filtering leaves them empty)
    """
    # text_pool = [line.split() for line in raw_text]
    counts = defaultdict(int)
    for tokens in text_pool:
        for token in tokens:
            counts[token] += 1

    filtered_pool = []
    for tokens in text_pool:
        kept = [token for token in tokens if counts[token] > 1]
        filtered_pool.append(kept)

    return BM25(filtered_pool)
def start(self, model, data, sess, valid_only=False):
    """Score the validation split and every test split with BM25.

    All answers and questions in ``data.archive`` are Porter-stemmed; the
    stemmed tokens are stored under ``metadata['tokens_stemmed']`` of each
    item and form the BM25 corpus. ``self.score`` is then called for
    ``data.archive.valid`` and for each element of ``data.archive.test``.

    :param model: not used by this method.
    :param data: object exposing ``archive`` with ``answers``, ``questions``,
        ``valid`` and ``test``.
    :param sess: not used by this method.
    :param valid_only: not used by this method.
    """
    stemmer = PorterStemmer()

    corpus = []
    for item in data.archive.answers + data.archive.questions:
        stemmed = [stemmer.stem(token.text) for token in item.tokens]
        item.metadata['tokens_stemmed'] = stemmed
        corpus.append(stemmed)

    bm25 = BM25(corpus)
    average_idf = sum(bm25.idf.values()) / float(len(bm25.idf))

    answers = data.archive.answers
    self.score(data.archive.valid, bm25, average_idf, answers)
    for test_split in data.archive.test:
        self.score(test_split, bm25, average_idf, answers)
def get_bm25_rankings(question_corpora, query_doc, question_part):
    """Score ``query_doc`` against one text field of every question with BM25.

    :param question_corpora: sequence of questions; ``question[question_part]``
        is cleaned with ``strip_text`` and split on whitespace.
    :param query_doc: query text, split on whitespace.
    :param question_part: key selecting which field of each question to use.
    :return: the result of ``BM25.get_scores`` — one score per question, in
        input order.
    """
    parsed_query_doc = query_doc.split()
    # Each question is cleaned and split once; the result feeds both the
    # dictionary and the bag-of-words conversion.
    parsed_qcorpora = [
        strip_text(question[question_part]).split()
        for question in question_corpora
    ]

    dictionary = corpora.Dictionary(parsed_qcorpora)
    tokenized_qcorpora = [dictionary.doc2bow(tokens) for tokens in parsed_qcorpora]
    tokenized_query_doc = dictionary.doc2bow(parsed_query_doc)

    # NOTE(review): BM25 receives the doc2bow output, so its "terms" are the
    # bag-of-words entries rather than the words themselves — confirm this
    # matching behaviour is intended.
    bm25 = BM25(tokenized_qcorpora)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    return bm25.get_scores(tokenized_query_doc, average_idf)
def apply_fun(df):
    """Compute per-group BM25 scores of each ``doc`` against the group's ``key``.

    The frame's columns are renamed in place to ``d_id``, ``key``, ``doc``.
    Rows are grouped by ``d_id``; within a group the documents form the BM25
    corpus and the first row's ``key`` is the query.

    :param df: three-column DataFrame (modified: its columns are renamed).
    :return: flat list of scores, concatenated group by group in ``groupby``
        order.
    """
    df.columns = ['d_id', 'key', 'doc']

    bm_list = []
    for _d_id, group in tqdm(df.groupby(['d_id'])):
        documents = [sentence.strip().split()
                     for sentence in group['doc'].values.tolist()]
        query = group['key'].values[0].strip().split()

        model = BM25(documents)
        idf_table = model.idf
        average_idf = sum(float(idf_table[term]) for term in idf_table.keys()) / len(idf_table.keys())

        bm_list.extend(model.get_scores(query, average_idf))
    return bm_list