def main(): qp = QueryParser(filename="../text/domain_keywords_20150617.txt") cp = CorpusParser(filename=candidate_text_path) qp.parse() queries = qp.get_queries() cp.parse() corpus = cp.get_corpus() proc = QueryProcessor(queries, corpus) results = proc.run() qid = 0 topn_idx = [] for result in results: sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1)) sorted_x.reverse() index = 0 for i in sorted_x[:]: tmp = (qid, i[0], i[1], texts[int(i[0]) - 1]) # print '{:>1}\t{:>4}\t{:>12}\t{}'.format(*tmp) index += 1 topn_idx.append(i[0]) qid += 1 labels = read_candidate_label() precision(topn_idx, labels, topn=10) precision(topn_idx, labels, topn=20) precision(topn_idx, labels, topn=30) precision(topn_idx, labels, topn=40) precision(topn_idx, labels, topn=50) precision(topn_idx, labels, topn=60) precision(topn_idx, labels, topn=70) precision(topn_idx, labels, topn=80) precision(topn_idx, labels, topn=90) precision(topn_idx, labels, topn=100)
def main():
    qp = QueryParser(filename=r'..\text\query.txt')
    cp = CorpusParser(filename=r'..\text\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    # Grid search over the BM25 parameters k1, k2 and b.
    for k1_value in range(12, 18):
        k1 = k1_value / 10.0
        for k2 in range(100, 600, 100):
            for b_value in range(3, 8):
                b = b_value / 10.0
                print 'k1=' + str(k1) + ' k2=' + str(k2) + ' b=' + str(b)
                result_path = ('E:\\GridSearchBM25Component\\' + str(k1) + ' ' +
                               str(k2) + ' ' + str(b) + ' ' + 'Result.txt')
                out = open(result_path, 'w')
                results = proc.run(k1, k2, b)
                qid = 0
                for result in results:
                    sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
                    sorted_x.reverse()
                    index = 0
                    # The top-K cutoff can be set in the slice below.
                    for i in sorted_x[:]:
                        out.write(str(qid) + '\t' + str(index) + '\t' + i[0] +
                                  '\t' + str(i[1]) + '\n')
                        out.flush()
                        index += 1
                    qid += 1
                out.close()
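# The triple-nested parameter sweep above can also be written with
# itertools.product. A minimal sketch (the grid_search_bm25 name is an
# assumption), reusing the same QueryProcessor.run(k1, k2, b) interface:
import itertools

def grid_search_bm25(proc):
    for k1, k2, b in itertools.product(
            [x / 10.0 for x in range(12, 18)],   # k1: 1.2 .. 1.7
            range(100, 600, 100),                # k2: 100 .. 500
            [x / 10.0 for x in range(3, 8)]):    # b:  0.3 .. 0.7
        results = proc.run(k1, k2, b)
        # ... write one result file per (k1, k2, b) as in the example above ...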
def main():
    qp = QueryParser(
        filename='C:\\Users\\jrlimingyang\\PycharmProjects\\chatbot-version2\\cache\\query.txt')
    cp = CorpusParser(
        filename='C:\\Users\\jrlimingyang\\PycharmProjects\\chatbot-version2\\cache\\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        for i in sorted_x[:10]:
            tmp = (qid, qid, i[0], index, i[1])
            print '{:>1}\tQ{:>1}\t{:>4}\t{:>2}\t{:>12}\tL-BM25'.format(*tmp)
            index += 1
        qid += 1
def main():
    #qp = QueryParser(filename='../text/queries.txt')
    qp = QueryParser(filename='../text/query_documents2.txt')
    #cp = CorpusParser(filename='../text/corpus.txt')
    #cp = CorpusParser(filename='../text/candidate_methodbody_documents_only.txt')
    cp = CorpusParser(
        filename='../text/candidate_methodbody_apitext_documents_only.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    print len(results)
    for result in results:
        #print result
        #sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
        #sorted_x.reverse()
        index = 0
        print sorted_x
        print len(sorted_x)
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print '{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp)
            index += 1
        qid += 1
def main():
    qp = QueryParser(filename='../text/quer.txt')
    qp.parse()
    cp = CorpusParser(filename='../text/corps.txt')
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results, originalResults = proc.run()
    qid = 0
    total = 0  # count of scored queries (was initialised to 10, which skewed the accuracy)
    correct = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        #print sorted_x[0]
        try:
            actualOutput = sorted_x[0][0]
            expected = originalResults[qid]
            actualOutput = actualOutput.split("-")[0].strip()
            expected = expected.split("-")[0].strip()
            #print actualOutput, expected
            if actualOutput == expected:
                correct += 1
            total += 1
        except Exception as e:
            pass
        qid += 1
    print("Accuracy"),
    print((correct / float(total)) * 100)
def main():
    qp = QueryParser(filename=r'..\text\query.txt')
    cp = CorpusParser(filename=r'..\text\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    k1 = 1.2
    k2 = 100
    b = 0.5
    result_path = r'.\Result.txt'
    out = open(result_path, 'w')
    results = proc.run(k1, k2, b)
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        # The top-K cutoff can be set in the slice below.
        for i in sorted_x[:]:
            out.write(str(qid) + '\t' + str(index) + '\t' + i[0] + '\t' +
                      str(i[1]) + '\n')
            out.flush()
            index += 1
        qid += 1
    print qid
    out.close()
def main():
    # Context managers ensure both pickle files are closed (the original never
    # closed pickle_out, which can truncate the dump).
    with open('../pickles/dev_question_candidates.pkl', 'rb') as pickle_in:
        question_candidates = pickle.load(pickle_in)
    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    qp = QueryParser(filename='../text/queries-fiqa.txt')
    cp = CorpusParser(filename='../text/corpus-fiqa.txt')
    qp.parse(isCustomFormat=True)
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus, dev_candidates=question_candidates)
    results = proc.run(isCustomFormat=True)
    # qid = 0
    result_dict = dict()
    for result, qid in results:
        sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
        # sorted_x.reverse()
        index = 0
        result_dict[qid] = []
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print('{:>1}\tQ0\t{:>6}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            index += 1
            result_dict[qid].append(i[0])
        # qid += 1
    print(result_dict)
    with open('dev_candidate_after_bm25', 'wb') as pickle_out:
        pickle.dump(result_dict, pickle_out)
def main(queryPath, corpusPath, resultPath, k1Value, k2Value, bValue):
    qp = QueryParser(queryPath)
    cp = CorpusParser(corpusPath)
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    k1 = k1Value
    k2 = k2Value
    b = bValue
    result_path = resultPath
    out = open(result_path, 'w')
    results = proc.run(k1, k2, b)
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        # The top-K cutoff can be set in the slice below.
        for i in sorted_x[:]:
            out.write(str(qid) + '\t' + str(index) + '\t' + i[0] + '\t' +
                      str(i[1]) + '\n')
            out.flush()
            index += 1
        qid += 1
    print qid
    out.close()
def bm25_sort(queries, corpus):
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
    # NOTE: the return sits outside the loop, so only the ranking of the
    # last query in `results` is returned.
    return dict(sorted_x)
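# A minimal usage sketch for bm25_sort (the demo function and file paths are
# placeholders), reusing the QueryParser/CorpusParser interface from the
# surrounding examples:
def demo_bm25_sort():
    qp = QueryParser(filename='../text/queries.txt')
    qp.parse()
    cp = CorpusParser(filename='../text/corpus.txt')
    cp.parse()
    ranking = bm25_sort(qp.get_queries(), cp.get_corpus())
    for doc_id, score in ranking.items():
        print(doc_id, score)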
def main(passage_path, query_path, output_path, stop_words_path, w2v_path):
    # stop_words_path: stop-word list; w2v_path: word-vector model.
    # Read the query file and build the query table.
    qp = QueryParser(query_path, w2v_path, stop_words_path)
    qp.parse()
    queries = qp.get_queries()  # qid -> query
    print("Read queries end.")
    # Read the passage file.
    cp = CorpusParser(passage_path, stop_words_path)
    cp.parse()
    corpus = cp.get_corpus()  # qid -> {pid -> passage}
    print("Build passage pools end.")
    # Score each query against its own candidate pool with BM25.
    results = {}
    i = 1
    for qid in queries:
        print("No. %s query has been scored..." % i)
        proc = QueryProcessor(queries[qid], corpus[qid])
        results[qid] = proc.run_query()  # yields a pid -> score dict
        i += 1
    # Sort each query's results by score and write them out.
    res_string = ""
    for qid in results:
        sorted_res = sorted(results[qid].items(), key=lambda kv: (kv[1], kv[0]))
        sorted_res.reverse()
        rank = 1
        last_score = 0  # score of the last written passage
        for i in sorted_res:  # (pid, score)
            res_string += std_out(qid, i[0], rank, i[1])
            rank += 1
        # Iterate over all pids; any passage not written above is appended at
        # the end, each one 0.1 lower than the previous.
        if len(sorted_res) > 0:  # guard against a pool so small (e.g. 5) that no passage matched
            last_score = sorted_res[len(sorted_res) - 1][1]
            pids = np.array(sorted_res)[:, 0]
            for pid in corpus[qid]:
                if pid not in pids:
                    last_score -= 0.1
                    res_string += std_out(qid, pid, rank, last_score - 0.1)
                    rank += 1
        else:
            for pid in corpus[qid]:
                last_score -= 0.1
                res_string += std_out(qid, pid, rank, last_score - 0.1)
                rank += 1
    with open(os.path.join(output_path), "w") as f:
        f.write(res_string)
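# std_out is not defined in this excerpt; a plausible sketch (an assumption,
# not the project's actual helper) that renders one TREC-style result line per
# (qid, pid, rank, score) tuple:
def std_out(qid, pid, rank, score):
    return '{}\tQ0\t{}\t{}\t{:.4f}\tBM25\n'.format(qid, pid, rank, score)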
def start_bm25():
    # print "i was HERE!!!"
    thresh = open('/home/deep/TwitterTrend/text/Threshold.txt')
    th = []
    for each in thresh:
        th.append(each.strip())
    prof = open('/home/deep/TwitterTrend/text/ProfileName.txt')
    pro = []
    for each in prof:
        pro.append(each.strip())
    qp = QueryParser(filename='text/query.txt')
    cp = CorpusParser(filename='corpus111.txt')
    qp.parse()
    queries = qp.get_queries()
    #print "Q is ", queries
    cp.parse()
    corpus = cp.get_corpus()
    #print "c is ", corpus
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    print results
    qid = 1
    bm25_output_list = []
    for result in results:
        threshold = th[qid - 1]
        profile_name = pro[qid - 1]
        print threshold
        print profile_name
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        for i in sorted_x[:100]:
            tweet_text = cp.get_text(i[0])
            tmp = (qid, i[0], index, i[1])
            # print str(tmp)
            #print "yaay"
            text = '{:>1}, {:>4}, {:>2}, {:>12}'.format(*tmp) + ", " + tweet_text
            flag = threshold_check(i[1], threshold)
            if flag:
                push.push(profile_name, i[1], tweet_text)
            bm25_output_list.append(text)
            index += 1
        qid += 1
    with open('ss_bm25_output.txt', 'a') as f:
        for item in bm25_output_list:
            f.write(item + ' \n')
    rparser.seperate_store_result()
def main():
    qp = QueryParser(filename='../text/queries.txt')
    cp = CorpusParser(filename='../text/corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print '{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp)
            index += 1
        qid += 1
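# Each line printed above follows the TREC run-file convention understood by
# trec_eval: <query-id> Q0 <doc-id> <rank> <score> <run-tag>. For example, the
# top hit of query 0 might print as (doc id and score are illustrative):
#   0    Q0    doc42    0    12.8316441    NH-BM25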
def main():
    qp = QueryParser(filename='./data/queries.txt')
    cp = CorpusParser(filename='./data/corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        for i in sorted_x[:10]:
            tmp = (qid, i[0], index, i[1])
            print '{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tL-BM25'.format(*tmp)
            index += 1
        qid += 1
def main():
    qp = QueryParser('../text/queries.txt')
    cp = CorpusParser('../text/ptwiki-v2.trec.xml', '../text/stopwords.txt')
    queries = qp.queries
    corpus = cp.corpus
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        print("Query: {}".format(' '.join(queries[qid])))
        for i in sorted_x[:10]:
            doc_num = i[0]
            doc_bm25_score = i[1]
            tmp = (index, doc_num, doc_bm25_score)
            print('{:>4}\t{:>2}\t{:>12}'.format(*tmp))
            index += 1
        qid += 1
def main():
    course_index_chinese_dict = chinese_course_dict('../text/course.csv')
    # print(course_index_chinese_dict)
    qp = QueryParser(filename='../text/job_phrase.txt')
    # qp = QueryParser(filename='../text/mini_job_phrase.txt')
    qp.parse()
    queries = qp.get_queries()
    print(len(queries))
    # print(queries)
    cp = CorpusParser(filename='../text/course_phrase.txt')
    cp.parse()
    corpus = cp.get_corpus()
    # print(corpus)
    proc = QueryProcessor(queries, corpus)
    # print(proc.index.index)
    # print(proc.dlt.table.keys())
    results = proc.run()
    # print(results[0][725])
    print(len(results))
    # # top 60 = 1
    # with open('bm25_weaksupervision.csv', 'w', encoding='utf-8') as writer:
    #     for result in results:
    #         sorted_x = sorted(result.items(), key=operator.itemgetter(1))
    #         sorted_x.reverse()
    #         for i in sorted_x[:60]:
    #             writer.write(str(i[0]) + ',')
    #         writer.write('\n')
    # evaluate
    hr_20, ndcg_20, hr_10, ndcg_10, hr_5, ndcg_5, f_map, f_mrr = evaluate_ranking_output(
        test_filename='../text/job_course_99neg_1pos.txt',
        ranking_results=results)
    print('hr5 = %.4f, ndcg5 = %.4f, hr20 = %.4f, ndcg20 = %.4f, '
          'hr10 = %.4f, ndcg10 = %.4f, map = %.4f, mrr = %.4f' %
          (hr_5, ndcg_5, hr_20, ndcg_20, hr_10, ndcg_10, f_map, f_mrr))
def rankpapers():
    qp = QueryParser(filename='../data/queriesProtonbeam.txt')
    cp = CorpusParser(filename='../data/CorpusProtonbeam.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        # maxScore = sorted_x[0][1]
        for i in sorted_x:
            tmp = (qid, i[0], index, i[1])
            Orpapers[i[0]] = i[1]  # Orpapers is a module-level dict defined elsewhere
            print '{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp)
            index += 1
        qid += 1
def eval(index_file, query_text, qrels, n):
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = list(queries.keys())
    query_ids.sort()
    query_ids_ints = []
    for k in range(0, len(query_ids)):
        query_ids_ints.append(int(query_ids[k]))
    # generating n random queries
    set1 = set()
    while len(set1) != n:
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)
    docs = set()
    qrels = {}  # NOTE: shadows the qrels parameter; relevance is re-read from qrels.text below
    f = open("qrels.text", "r")
    # parsing relevant queries (qrels.text)
    l = f.readline()
    while l:
        j = l.split(" ")
        if query_ids_ints[int(j[0]) - 1] in qrels.keys():
            qrels[query_ids_ints[int(j[0]) - 1]].append(int(j[1]))
        else:
            qrels[query_ids_ints[int(j[0]) - 1]] = [int(j[1])]
        l = f.readline()
    cranqryobj = cranqry.loadCranQry(query_text)
    dict_query = {}
    for q in cranqryobj:
        dict_query[int(q)] = cranqryobj[q].text  # matching queries in query.text and qrels.text
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        # fetching the first 10 documents for a query using the vector model
        result_list = QPobj.vectorQuery(10)
        # fetching documents for a query using booleanQuery
        boolean_result_list = QPobj.booleanQuery()
        print("Boolean query result : ", boolean_result_list)
        ndcg_boolean = 0
        truth_list = qrels[q]
        boolean_output_list = []
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        print("Relevant documents for this query : ", truth_list)
        print("Vector model result : ", rank_doc_list)
        vector_score_list = []
        # calculating the predicted scores for the boolean model
        for id in boolean_result_list:
            if int(id) in truth_list:
                boolean_output_list.append(1)
            else:
                boolean_output_list.append(0)
        boolean_score_list = []
        if len(boolean_score_list) < 10:
            boolean_score_list = boolean_output_list
            while len(boolean_score_list) != 10:
                boolean_score_list.append(0)
        elif len(boolean_score_list) > 10:
            for i in range(0, 10):
                boolean_score_list[i] = boolean_output_list[i]
        # calculating the predicted scores for the vector model
        for id in rank_doc_list:
            if id in truth_list:
                vector_score_list.append(1)
            else:
                vector_score_list.append(0)
        vector_score_dict[q] = vector_score_list
        # calculating the ground-truth scores for the vector model
        truth_score_list = []
        for i in range(0, len(vector_score_list)):
            truth_score_list.append(vector_score_list[i])
        truth_score_list.sort(reverse=True)
        # calculating the ground-truth scores for the boolean model
        boolean_truth_score_list = []
        for i in range(0, len(boolean_score_list)):
            boolean_truth_score_list.append(boolean_score_list[i])
        boolean_truth_score_list.sort(reverse=True)
        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n", boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list),
                       np.array(vector_score_list))
        ]
    # compute ndcg scores for the boolean and vector models over all the
    # randomly generated queries
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])
    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
    print("ndcg scores list for boolean model for all the queries:\n",
          boolean_list)
    print("ndcg scores list for vector model for all the queries:\n",
          vector_list)
    # calculating p values using the wilcoxon test and the t-test for the
    # boolean and vector models
    p_value_wilcoxon = stats.wilcoxon(np.array(boolean_list),
                                      np.array(vector_list))
    p_value_ttest = stats.ttest_ind(np.array(boolean_list),
                                    np.array(vector_list), equal_var=False)
    print("wilcoxon test p value is:", p_value_wilcoxon[1])
    print("ttest p value is :", p_value_ttest[1])
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# load documents
inputdocument = cran.CranFile("cran.all")
# load the index file saved in part 1
index = InvertedIndex().load("index_file")
# load query processed files
queries = loadCranQry("query.text")

qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)
qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
def main():
    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    # cp = CorpusParser(filename='../text/comments.txt')
    # kw = KeywordParser(filename='default_db_name')
    # ar = ArticleParser(filename='../text/articles.txt')
    # kwt = KeywordTypeParser(filename='../text/hatetype.txt')
    run_results_file = '../results/run_results.txt'  # the file used to write master activity
    # qp = QueryParser(filename='../text/queries.txt')
    qp = QueryParser(db_name=default_db_name)
    cp = CorpusParser(db_name=default_db_name)
    kw = KeywordParser(db_name=default_db_name)
    ar = ArticleParser(db_name=default_db_name)
    kwt = KeywordTypeParser(db_name=default_db_name)
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    kw.parse()
    keywords = kw.get_keywords()
    # print('keywords retrieved successfully')
    # print('printing keywords')
    # for key, value in keywords.items():
    #     print(key, value)
    kwt.parse()
    keyword_types = kwt.get_keywords()
    ar.parse()
    articles = ar.get_articles()
    run_date = datetime.datetime.now()
    proc = QueryProcessor(queries, corpus, keywords, keyword_types, run_date,
                          run_results_file, articles, default_db_name)
    results = proc.run()
    qid = 0
    data = {}
    for result in results:
        sorted_x = sorted(result.items(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        j = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            # todo: add lookup to the original article and add to output
            # print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            j += 1
            score = i[1]
            docid = i[0]
            ## title = articles[int(i[0])]['title'].rstrip()
            ## pub_url = articles[int(i[0])]['pub_url'].rstrip()
            ## pub_date = articles[int(i[0])]['pub_date'].rstrip()
            ## source = articles[int(i[0])]['source'].rstrip()
            ## data.update({'docId': i[0], 'rank_score': j, 'Score': score,
            ##              'source': source, 'title': title, 'pub_date': pub_date})
            ## out_string = (docid + ', ' + str(j) + ', ' + str(round(score, 4)) +
            ##               ', "' + title + '", "' + source + '", "' + pub_date +
            ##               '", "' + pub_url)
            # print(out_string)
            # with open('../results/rankings.csv', 'a') as f:
            #     f.write(out_string)
            index += 1
        qid += 1
def eval():
    # Algorithm:
    #   Pick N random samples from query.txt
    #   Get top 10 results from bool query for each rnd query
    #   Get top 10 results from vector query for each rnd query
    #   Compute NDCG btn bool query results and qrels.txt
    #   Compute NDCG btn vector query results and qrels.txt
    #   Get p-value btn bool and vector

    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Get the query
        if 0 < int(query_id) < 10:
            query_id = '00' + str(int(query_id))
        elif 9 < int(query_id) < 100:
            query_id = '0' + str(int(query_id))
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = []
        vector_scores = []
        for v in vector_result:
            vector_docs.append(v[0])
            vector_scores.append(v[1])
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector, vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average out score lists
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
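# ndcg_score is imported from elsewhere in this project; a minimal reference
# implementation consistent with the calls above (binary relevance list, model
# score list, cutoff k) is sketched below, assuming the standard definition
# DCG@k = sum_i rel_i / log2(i + 1) over the score-ranked list. The name
# ndcg_score_sketch is deliberately distinct so it is not mistaken for the
# project's actual function.
import math

def ndcg_score_sketch(relevance, scores, k=10):
    # Rank relevance labels by descending model score.
    ranked = [rel for _, rel in sorted(zip(scores, relevance),
                                       key=lambda p: p[0], reverse=True)][:k]
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ranked))
    # The ideal ordering puts all relevant documents first.
    ideal = sorted(relevance, reverse=True)[:k]
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0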
def eval(testOn):
    k = 10  # the number of top (docID, similarity) pairs to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  # e.g. "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile = "src/Data/tempFile"
    #queryText = 'src/CranfieldDataset/query.text'
    #qrelsText = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries = 50
    numberOfTimeToLoop = 5

    # Load files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)

    for i in range(numberOfTimeToLoop):
        # Get random queries
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery) == numberOfQueries, "Error getting random queries"
        # Return all queries
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error getting random queries"

        # Get the list of query results from qrels.text
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping, dictOfQuery)
        if testOn:
            assert len(dictQrelsText) == numberOfQueries, "Error: number of queries too large"

        start = timer()
        # This is an extremely expensive step
        queryProcessor = QueryProcessor("", indexFile, docCollection.docs)
        end = timer()
        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        start = timer()
        dictQ_ID = []
        for qid, queryText in dictOfQuery.items():
            countDoc += 1
            dictQ_ID.append(qid)
            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])

            start = timer()
            # e.g. [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            docIDs = queryProcessor.booleanQuery()
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)

            start = timer()
            # e.g. for k=3: [[625, 0.8737], [401, 0.8698], [943, 0.8425]]
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(k)
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)

            # Boolean part
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            NDCGScoreBool.append(0 if math.isnan(score) else score)
            end = timer()
            if testOn:
                print("Time for Boolean ndcg:", end - start)

            # Vector part
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            NDCGScoreVector.append(0 if math.isnan(score) else score)
            end = timer()
            if testOn:
                print("Time for Vector ndcg:", end - start)
        print("\nRunning Queries iteration:(", str(i + 1), ")\n", dictQ_ID)

    if testOn:
        for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool, NDCGScoreVector):
            print("QID", QID, "Boolean Model:", boolScore, "Vector Model", vectorScore)

    print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool), "==", len(NDCGScoreVector))
    print('\nThe Avg NDCG Score')
    vectorAvg = avg(NDCGScoreVector)
    BoolAvg = avg(NDCGScoreBool)
    print("Avg NDCG Score for Bool:", BoolAvg, "\nAvg NDCG Score for Vector:", vectorAvg)
    end = timer()
    if testOn:
        print("\n\nTime for running ", countDoc, " queries:", end - start)
    print('\nThe P-Value')
    p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
    p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
    print("T-Test P-value: ", p_va_ttest)
    print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')
def main():
    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    # cp = CorpusParser(filename='../text/comments.txt')
    # kw = KeywordParser(filename='../text/' + sys.argv[1])
    # ar = ArticleParser(filename='../text/articles.txt')
    # kwt = KeywordTypeParser(filename='../text/hatetype.txt')
    run_results_file = '../results/run_results.txt'  # the file used to write master activity
    qp = QueryParser(filename='../text/queries.txt')
    cp = CorpusParser(filename='../text/comments.txt')
    kw = KeywordParser(filename='../text/weights-chen.txt')
    ar = ArticleParser(filename='../text/articles.txt')
    kwt = KeywordTypeParser(filename='../text/hatetype.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    kw.parse()
    keywords = kw.get_keywords()
    kwt.parse()
    keyword_types = kwt.get_keywords()
    ar.parse()
    articles = ar.get_articles()
    run_date = datetime.datetime.now()
    proc = QueryProcessor(queries, corpus, keywords, keyword_types, run_date,
                          run_results_file, articles)
    results = proc.run()
    qid = 0
    data = {}
    for result in results:
        sorted_x = sorted(result.items(), key=operator.itemgetter(1))
        sorted_x.reverse()
        index = 0
        j = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            # todo: add lookup to the original article and add to output
            # print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            j += 1
            score = i[1]
            docid = i[0]
            title = articles[int(i[0])]['title'].rstrip()
            pub_url = articles[int(i[0])]['pub_url'].rstrip()
            pub_date = articles[int(i[0])]['pub_date'].rstrip()
            source = articles[int(i[0])]['source'].rstrip()
            data.update({
                'docId': i[0],
                'rank_score': j,
                'Score': score,
                'source': source,
                'title': title,
                'pub_date': pub_date
            })
            out_string = (docid + ', ' + str(j) + ', ' + str(round(score, 4)) +
                          ', "' + title + '", "' + source + '", "' + pub_date +
                          '", "' + pub_url)
            print(out_string)
            # with open('../results/rankings.csv', 'a') as f:
            #     f.write(out_string)
            index += 1
        qid += 1
    print(
        '\n**The application has finished: you may view the results and supporting files in '
        'the ../results directory for this run.\nEach query in the /text/query.txt file '
        'will generate one directory with the format YYYYMMDDHHMMSSQ# with # being the query #.\n'
        'Inside the directory you will find the following files:\n\n'
        'xxxxx.category - one record per document showing the most prevalent category, the number\n'
        '\t\tof occurrences, whether the document contained terms considered threatening, and all '
        'additional categories and their counts\n'
        'xxxxx.details - all documents, each term found, its count and the weight applied '
        'to the specific term; this supports the analysis of how a document was ranked\n'
        'xxxxx.query - the terms used for this query\n'
        'xxxxx.rank - a list of all documents that were ranked, including their score, source, title and date\n'
        'xxxxx.weights - all of the terms found and the weights in effect at the time of the run.')