import sys

import cranqry
import index
# QueryProcessor is assumed to be defined or imported earlier in this module.


def query():
    """The main query processing program, using QueryProcessor."""
    II = index.InvertedIndex()
    index_file = II.load(sys.argv[1])
    proc_alg = sys.argv[2]
    q_text = sys.argv[3]
    qid = sys.argv[4]
    qrys = cranqry.loadCranQry(q_text)  # qrys is a dict keyed by query id
    # qrys = cranqry.loadCranQry('../CranfieldDataset/query.text')  # can also be hard-coded
    # qid = '069'  # example of hard-coding a query id
    # qid and index_file are passed in by the user
    qp = QueryProcessor(qrys[qid].text, index_file, 'cran.all')
    if proc_alg == '0':
        print(qp.booleanQuery())
    elif proc_alg == '1':
        print(qp.vectorQuery(3))  # top 3 ranked results for the vector model
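# A minimal __main__ guard for the argv-driven version above; the script
# name and the paths in the comment are illustrative assumptions.
if __name__ == '__main__':
    # Hypothetical invocation:
    #   python query.py index_file.json 1 ../CranfieldDataset/query.text 069
    if len(sys.argv) != 5:
        print("Usage: query.py <index_file> <0|1> <query_file> <query_id>")
        sys.exit(1)
    query()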
def query(index_file, processing_algorithm, query_file, query_id):
    """The main query processing program, using QueryProcessor."""
    cranqryobj = cranqry.loadCranQry(query_file)
    dict_query = {}
    for q in cranqryobj:
        dict_query[q] = cranqryobj[q].text
    query_txt = dict_query[query_id]
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    QPobj = QueryProcessor(query_txt, items, index_file)
    QPobj.preprocessing()
    doc_ids = []
    if processing_algorithm == "0":  # boolean query
        doc_ids = QPobj.booleanQuery()
    elif processing_algorithm == "1":  # vector query
        doc_ids = QPobj.vectorQuery(3)  # first 3 documents based on ranking
    else:
        print("Invalid processing algorithm")
    print(doc_ids)
    return doc_ids
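# Because this parameterized version returns the result list, it can also be
# exercised programmatically. A hypothetical smoke test (the file names and
# query id below are assumptions for illustration):
def test_query_smoke():
    docs_bool = query("index_file.json", "0",
                      "../CranfieldDataset/query.text", "069")
    docs_vec = query("index_file.json", "1",
                     "../CranfieldDataset/query.text", "069")
    assert isinstance(docs_bool, list)
    assert len(docs_vec) <= 3  # vectorQuery(3) asks for at most the top 3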
queries on your search engine. To make everything run as smoothly as possible,
make sure the engine works on pre-computed indices; do not let the main file
build the index from scratch. When the user executes the file, they should be
able to choose:

+ search_engine: a parameter the user sets to choose which search engine to
  run. As the homework requests, it can be 1, 2 or 3.
+ Any other parameters you would like.
"""
import utils
import index
import index_utils

inverted_index = index.InvertedIndex(index.idx)


def menu():
    choice = 0
    print("1 - Search without score\n"
          "2 - Search with score\n"
          "3 - Search with 'new' score!")
    while choice not in (1, 2, 3):
        choice = int(input("Number (1, 2, 3): "))
    result = 'None'
    search_term = input("Enter term(s) to search: ")
    if choice == 1:
        result = inverted_index.lookup_conjunctive_query(
            index_utils.format_text(search_term))
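# lookup_conjunctive_query is not shown in this snippet; by name it should
# intersect the posting lists of every query term (AND semantics). A minimal
# sketch under that assumption -- `postings` (term -> doc ids) and the
# function name are hypothetical, not the module's real API:
def conjunctive_lookup(postings, terms):
    """Return ids of documents that contain every term (AND semantics)."""
    if not terms:
        return set()
    result = set(postings.get(terms[0], ()))
    for term in terms[1:]:
        result &= set(postings.get(term, ()))
    return result

# Example: only docs 2 and 5 contain both 'flow' and 'heat'.
# conjunctive_lookup({'flow': [1, 2, 5], 'heat': [2, 5, 9]}, ['flow', 'heat'])
# -> {2, 5}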
# Assumes module-level imports of random, pandas as pd, cranqry, index,
# metrics (local ndcg helper), and the QueryProcessor class.
def to_ndcg(qrels, q_text, idx_file, tk=10, n=2):
    # Column names for building a dataframe, for easier data manipulation.
    column_names = ['qid', 'docid', 'bool_rel', 'vec_rel']
    df_qrels = pd.read_csv('../CranfieldDataset/qrels.sample',
                           names=column_names, sep=' ')  # hard-coded for testing
    # df_qrels = pd.read_csv(qrels, names=column_names, sep=' ')
    # print(df_qrels)

    # Pick n random query ids.
    unique_qids = list(set(df_qrels.qid.values))
    random.shuffle(unique_qids)
    random_qids = unique_qids[0:n]

    qrys = cranqry.loadCranQry('../CranfieldDataset/query.text')  # qrys is a dict---hard-coded for testing
    # qrys = cranqry.loadCranQry(q_text)
    qrys_ids = list(qrys.keys())

    II = index.InvertedIndex()
    index_file = II.load("index_file.json")  # hard-coded for testing
    # index_file = II.load(idx_file)

    # Aggregate ndcg scores for the two models.
    vec_agg_ndcg, bool_agg_ndcg = [], []
    for qid in random_qids:
        print(qid)
        df_qid = df_qrels[df_qrels["qid"] == qid]  # dataframe for one query id
        # Doc ids judged relevant to this query id in qrels---the ground
        # truth to be used for ndcg_score.
        qid_docids = list(df_qid['docid'])
        print(qid_docids)
        # Important: zero-pad the integer qid to match the three-character
        # string ids used in the Cranfield dataset (e.g. 69 -> '069').
        st_qid = str(qid).zfill(3)
        if st_qid in qrys_ids:
            qp = QueryProcessor(qrys[st_qid].text, index_file, 'cran.all')
            bool_array = qp.booleanQuery()
            vec_array = qp.vectorQuery(tk)  # tk defaults to 10
            print(bool_array)
            bool_array = [int(v) for v in bool_array]
            print(bool_array)

            # ndcg for the boolean model: mark each returned doc as relevant
            # (1) or not (0), with a constant predicted score of 1.
            bool_list = [(0, 0)] * tk
            idx = 0
            for doc_id in bool_array:
                # Check whether each docid returned by the boolean model is
                # present in qrels for this query.
                if doc_id in qid_docids:
                    bool_list[idx] = (1, 1)
                else:
                    bool_list[idx] = (0, 1)
                idx += 1
                if idx == tk:
                    break
            y_true = [int(bool_id[0]) for bool_id in bool_list]
            y_score = [int(bool_id[1]) for bool_id in bool_list]
            print("bool", y_true)
            print("bool", y_score)
            bool_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, tk))

            # ndcg for the vector model: vec_array is a list of
            # (docid, similarity_score) tuples, so y_score is the list of
            # cosine similarity scores.
            print(vec_array)
            y_score = [vec_id[1] for vec_id in vec_array]
            vec_ids = [int(vec_id[0]) for vec_id in vec_array]
            y_true = [0] * tk
            idx = 0
            for doc_id in vec_ids:
                # Check whether each docid returned by the vector model is
                # present in qrels for this query.
                if doc_id in qid_docids:
                    y_true[idx] = 1
                idx += 1
            print("vec", y_true)
            print("vec", y_score)
            vec_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, tk))
            del qp  # release the processor before the next query
    return bool_agg_ndcg, vec_agg_ndcg
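# For reference, a self-contained NDCG@k sketch under the common linear-gain
# formulation (gain = relevance, discount = 1 / log2(rank + 1)). The local
# metrics.ndcg_score helper used above may implement a different gain
# variant, so treat this only as an illustration of the computation.
import math

def ndcg_at_k(y_true, y_score, k):
    """NDCG@k for binary relevance: rank docs by predicted score, sum the
    discounted gains, and normalize by the ideal (sorted) ordering."""
    order = sorted(range(len(y_score)), key=lambda i: y_score[i], reverse=True)
    gains = [y_true[i] for i in order][:k]
    dcg = sum(g / math.log2(rank + 2) for rank, g in enumerate(gains))
    ideal = sorted(y_true, reverse=True)[:k]
    idcg = sum(g / math.log2(rank + 2) for rank, g in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0

# Example: relevant docs at ranks 1 and 3 of 3 returned:
# ndcg_at_k([1, 0, 1], [0.9, 0.5, 0.4], 3) -> (1 + 0.5) / (1 + 0.631) ~= 0.92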
# Assumes module-level imports of random, numpy as np, cranqry, index,
# ndcg_score, scipy.stats as stats, and the QueryProcessor class.
def eval(index_file, query_text, qrels, n):
    # NOTE: the name `eval` shadows Python's builtin of the same name.
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = sorted(queries.keys())
    query_ids_ints = [int(qid) for qid in query_ids]

    # Select n random query ids.
    set1 = set()
    while len(set1) != n:
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)

    # Parse the relevance judgements (qrels.text) into a dict:
    # query id -> list of relevant doc ids.
    qrels_dict = {}
    with open(qrels, "r") as f:
        for line in f:
            j = line.split(" ")
            key = query_ids_ints[int(j[0]) - 1]
            if key in qrels_dict:
                qrels_dict[key].append(int(j[1]))
            else:
                qrels_dict[key] = [int(j[1])]

    # Match queries in query.text with those in qrels.text.
    cranqryobj = cranqry.loadCranQry(query_text)
    dict_query = {}
    for q in cranqryobj:
        dict_query[int(q)] = cranqryobj[q].text

    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)

    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        # Fetch the first 10 documents for the query using the vector model.
        result_list = QPobj.vectorQuery(10)
        # Fetch the documents matching the query using the boolean model.
        boolean_result_list = QPobj.booleanQuery()
        print("Boolean query result : ", boolean_result_list)
        truth_list = qrels_dict[q]
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        print("Relevant documents for this query : ", truth_list)
        print("Vector model result : ", rank_doc_list)

        # Predicted scores for the boolean model: 1 if a returned doc is
        # relevant, else 0, padded or truncated to exactly 10 entries.
        boolean_output_list = []
        for doc_id in boolean_result_list:
            if int(doc_id) in truth_list:
                boolean_output_list.append(1)
            else:
                boolean_output_list.append(0)
        boolean_score_list = boolean_output_list[:10]
        while len(boolean_score_list) != 10:
            boolean_score_list.append(0)

        # Predicted scores for the vector model.
        vector_score_list = []
        for doc_id in rank_doc_list:
            if doc_id in truth_list:
                vector_score_list.append(1)
            else:
                vector_score_list.append(0)
        vector_score_dict[q] = vector_score_list

        # Ground-truth scores are the same relevance marks sorted in
        # descending order (the ideal ranking).
        truth_score_list = sorted(vector_score_list, reverse=True)
        boolean_truth_score_list = sorted(boolean_score_list, reverse=True)

        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n", boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list),
                       np.array(vector_score_list))
        ]

    # Collect the ndcg scores of both models for all the random queries.
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])
    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
queries:\n", vector_ndcg_score) print("ndcg scores list for boolean model for all the queries:\n", boolean_list) print("ndcg scores list for vector model for all the queries:\n", vector_list) p_value_wilcoxon = stats.wilcoxon( np.array(boolean_list), np.array(vector_list) ) # calculating p value using wilcoxon test and ttest for boolean and vector models p_value_ttest=stats.ttest_ind(np.array(boolean_list),np.array(vector_list), equal_var = False) print("wilcoxon test p value is:", p_value_wilcoxon[1]) print("ttest p value is :", p_value_ttest[1])