def test_vectorizer_query(self):
    vectorizer = BM25Vectorizer(self.index_path, min_df=5)
    result = vectorizer.get_query_vector('this is a query to test query vector')
    self.assertEqual(result[0, 2703], 2)
    self.assertEqual(result[0, 3078], 1)
    self.assertEqual(result[0, 3204], 1)
def test_bm25_vectorizer_train(self):
    vectorizer = BM25Vectorizer(self.index_path, min_df=5)
    train_docs = ['CACM-0239', 'CACM-0440', 'CACM-3168', 'CACM-3169']
    train_labels = [1, 1, 0, 0]
    test_docs = ['CACM-0634', 'CACM-3134']
    train_vectors = vectorizer.get_vectors(train_docs)
    test_vectors = vectorizer.get_vectors(test_docs)
    clf = LogisticRegression()
    clf.fit(train_vectors, train_labels)
    pred = clf.predict_proba(test_vectors)
    self.assertAlmostEqual(0.4629749, pred[0][0], places=8)
    self.assertAlmostEqual(0.5370251, pred[0][1], places=8)
    self.assertAlmostEqual(0.48288416, pred[1][0], places=8)
    self.assertAlmostEqual(0.51711584, pred[1][1], places=8)
def rank(new_qrels: str, base: str, tmp_base: str, qrels_path: str, lucene_index_path: str,
         R: List[int], score_path: str, alpha: float, clf_type: ClassifierType,
         vec_type: VectorizerType, tag: str):
    # build output path
    base_str = base.split('/')[-1]
    R_str = ''.join([str(i) for i in R])
    curdir = os.getcwd()
    if curdir.endswith('integrations'):
        output_path = f'{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    else:
        output_path = f'integrations/{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    print(f'Output -> {output_path}')
    # make sure the output directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # pick the document vectorizer
    vectorizer = None
    if vec_type == VectorizerType.TFIDF:
        vectorizer = TfidfVectorizer(lucene_index_path, min_df=5)
    elif vec_type == VectorizerType.SPECTER:
        base += '.specter'
        qrels_path += '.specter'
        vectorizer = SpecterVectorizer()
    elif vec_type == VectorizerType.BM25:
        vectorizer = BM25Vectorizer(lucene_index_path, min_df=5)
    else:
        print('invalid vectorizer')
        exit()

    f = open(output_path, 'w+')
    skipped_topics = set()
    topics = get_topics_from_qrun(base)

    for topic in topics:
        # gather judged training documents for this topic from the qrels
        train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R)
        if len(train_docs) == 0:
            print(f'[topic][{topic}] skipped')
            skipped_topics.add(topic)
            continue

        print(f'[topic][{topic}] eligible train docs {len(train_docs)}')

        # pick the classifier
        clf = None
        if clf_type == ClassifierType.NB:
            clf = MultinomialNB()
        elif clf_type == ClassifierType.LR:
            clf = LogisticRegression()
        elif clf_type == ClassifierType.SVM:
            clf = SVC(kernel='linear', probability=True)
        else:
            print('ClassifierType not supported')
            exit()

        # train on the judged documents, then score the documents from the base run
        train_vectors = vectorizer.get_vectors(train_docs)
        clf.fit(train_vectors, train_labels)

        test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic)
        print(f'[topic][{topic}] eligible test docs {len(test_docs)}')
        test_vectors = vectorizer.get_vectors(test_docs)

        rank_scores = clf.predict_proba(test_vectors)
        rank_scores = [row[1] for row in rank_scores]

        # interpolate normalized classifier scores with normalized base-run scores
        rank_scores = normalize(rank_scores)
        base_scores = normalize(base_scores)
        preds = [a * alpha + b * (1 - alpha) for a, b in zip(rank_scores, base_scores)]

        # write the re-ranked results in TREC run format
        preds, docs = sort_dual_list(preds, test_docs)
        for index, (score, doc_id) in enumerate(zip(preds, docs)):
            rank = index + 1
            f.write(f'{topic} Q0 {doc_id} {rank} {score} {tag}\n')

    # topics without training data are copied over unchanged from the base run
    for topic in sort_str_topics_list(list(skipped_topics)):
        lines = get_lines_by_topic(base, topic, tag)
        print(f'Copying over skipped topic {topic} with {len(lines)} lines')
        for line in lines:
            f.write(f'{line}\n')

    f.close()

    map_score, ndcg_score = evaluate(new_qrels, output_path)
    with open(score_path, 'w') as outfile:
        json.dump({'map': map_score, 'ndcg': ndcg_score}, outfile)
def test_bm25_vectorizer(self):
    vectorizer = BM25Vectorizer(self.index_path, min_df=5)
    result = vectorizer.get_vectors(['CACM-0239', 'CACM-0440'], norm=None)
    self.assertAlmostEqual(result[0, 190], 1.7513844966888428, places=8)
    self.assertAlmostEqual(result[1, 391], 0.03765463829040527, places=8)