def query():
    """The main query processing program, using QueryProcessor.

    Reads module-level settings (query_doc, query_id, index_file,
    process_alg), looks up the requested query's text, restores the
    JSON-serialized index, and runs the selected retrieval model:
    '0' for booleanQuery, '1' for vectorQuery (top 3 documents).
    """
    collection = Collection()
    queries = loadCranQry(query_doc)
    # Pick out the text of the query whose qid matches the requested id.
    for key in queries:
        if queries[key].qid == query_id:
            query_text = queries[key].text
    # Restore the inverted index that was serialized to JSON.
    with open(index_file, "r") as fh:
        index_items = json.load(fh).items()
    processor = QueryProcessor(query_text, index_items, collection.docs)
    tokens = processor.preprocessing()
    print("process alg:", process_alg)
    if process_alg == '0':
        hits = processor.booleanQuery(tokens)
        print("Query results", hits)
    elif process_alg == '1':
        hits = processor.vectorQuery(tokens)
    else:
        print("enter 0 for boolean query and 1 for vector query")
def query():
    '''The main query processing program, using QueryProcessor.

    Command-line usage:
        python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>

    processing-algorithm: 0 for booleanQuery and 1 for vectorQuery.
    For booleanQuery the matching document IDs are printed; for vectorQuery
    the top 3 most similar documents are printed.
    '''
    # Ensure args are valid.  FIX: the original compared with "is not 5" and
    # "is 0"/"is 1" — identity comparison against int literals is fragile
    # (relies on CPython small-int caching; SyntaxWarning on 3.8+).
    if len(argv) != 5:
        print(
            "Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>"
        )
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Normalize the query id to the zero-padded 3-digit form used as dict key.
    if 0 < int(query_id) < 10:
        query_id = '00' + str(int(query_id))
    elif 9 < int(query_id) < 100:
        query_id = '0' + str(int(query_id))
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    if int(processing_algo) == 0:
        result = qp.booleanQuery()
        if result:
            # FIX: reuse `result` instead of re-running the expensive query.
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif int(processing_algo) == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")
def query():
    '''The main query processing program, using QueryProcessor.

    sys.argv: [1] index file to load, [2] processing algorithm
    ('0' boolean, '1' vector), [3] query-file path, [4] query id.
    Prints the chosen model's result.
    '''
    II = index.InvertedIndex()
    index_file = II.load(sys.argv[1])   # restore the saved inverted index
    proc_alg = sys.argv[2]              # FIX: dropped no-op "x = x" self-assignments
    q_text = sys.argv[3]
    qid = sys.argv[4]
    qrys = cranqry.loadCranQry(q_text)  # qrys is a dict keyed by query id
    #qid = '069'  # example of hard-coding a query id
    qp = QueryProcessor(qrys[qid].text, index_file, 'cran.all')
    # FIX: run the selected model exactly once and print its result; the
    # original executed each (expensive) query twice — once discarded.
    # print(x) with a single argument is valid in both Python 2 and 3.
    if proc_alg == '0':
        result = qp.booleanQuery()
        print(result)
    elif proc_alg == '1':
        result = qp.vectorQuery(3)  # top 3 ranked results for the vector model
        print(result)
def query(indexfilename, processingalgorithm, queryfilename, queryid,
          numresults=3):
    '''The main query processing program, using QueryProcessor.

    Parameters:
        indexfilename: path of the saved inverted-index file.
        processingalgorithm: '0' for booleanQuery, '1' for vectorQuery.
        queryfilename: path of the Cranfield query file.
        queryid: id of the query to run.
        numresults: result count passed through to QueryProcessor (default 3).

    Returns:
        The selected model's result list, or None for an unrecognized
        algorithm code (FIX: the original left `results` unbound and raised
        NameError in that case).
    '''
    qrys = loadCranQry(queryfilename)
    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load(indexfilename)
    cf = CranFile('cran.all')
    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)
    results = None  # stays None when the algorithm code is unrecognized
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    elif processingalgorithm == '1':  # FIX: elif — the branches are exclusive
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    return results
def query():
    """The main query processing program, using QueryProcessor.

    sys.argv: [1] index name (".p" pickle suffix is appended here),
    [2] algorithm code ('0' boolean, '1' vector), [3] query-file path,
    [4] query id.
    """
    # Read command-line arguments.
    index_name = sys.argv[1] + ".p"   # the index was pickled with a .p suffix
    algorithm = sys.argv[2]
    query_path = sys.argv[3]
    selected_qid = sys.argv[4]
    top_k = 6                         # number of results for the vector model
    # Load query.text and hand the chosen query's text to the processor.
    queries = cranqry.loadCranQry(query_path)
    processor = QueryProcessor(queries[selected_qid].text, index_name)
    processor.preprocessing()
    if algorithm == '0':
        processor.booleanQuery()
    if algorithm == '1':
        processor.vectorQuery(top_k)
def query(index_file, algorithm, query_file, query_id):
    '''The main query processing program, using QueryProcessor.

    algorithm '0' runs the boolean model, '1' runs the vector model
    (top 3); the chosen model's result is printed.
    '''
    queries = cranqry.loadCranQry(query_file)           # parsed query file
    inverted_index = InvertedIndex().load(index_file)   # restore saved index
    collection = cran.CranFile('cran.all')              # document collection
    processor = QueryProcessor(queries, inverted_index, collection.docs)
    processor.preprocessing()
    results = None
    if algorithm == '0':        # boolean model
        results = processor.booleanQuery(query_id)
    elif algorithm == '1':      # vector model
        results = processor.vectorQuery(3, query_id)
    print(results)
def query(index_file, processing_algorithm, query_file, query_id):
    """The main query processing program, using QueryProcessor.

    Returns the selected model's document ids ('0' boolean, '1' vector
    top-3); an empty list is returned for an unknown algorithm code.
    """
    # Map every query id to its raw text, then select the requested one.
    loaded_queries = cranqry.loadCranQry(query_file)
    id_to_text = {qid: loaded_queries[qid].text for qid in loaded_queries}
    selected_text = id_to_text[query_id]
    # Restore the inverted index saved on disk.
    loaded_index = index.InvertedIndex().load(index_file)
    processor = QueryProcessor(selected_text, loaded_index, index_file)
    processor.preprocessing()
    doc_ids = []
    if processing_algorithm == "0":       # boolean query
        doc_ids = processor.booleanQuery()
    elif processing_algorithm == "1":     # vector query
        doc_ids = processor.vectorQuery(3)  # first 3 documents by ranking
    else:
        print("Invalid Processing algorithm")
    print(doc_ids)
    return doc_ids
def query(index_file, model_type, query_file, query_id):
    '''The main query processing program, using QueryProcessor.

    model_type 0 runs the boolean model, 1 the vector model (top 3
    printed), and 2 the batch evaluation.
    '''
    corpus = cran.CranFile("cran.all")              # load the documents
    saved_index = InvertedIndex().load(index_file)  # index built in part 1
    processed_queries = loadCranQry(query_file)     # load the query file
    processor = QueryProcessor(processed_queries, saved_index, corpus,
                               query_id)
    if model_type == 0:
        print(processor.booleanQuery())
    if model_type == 1:
        print(processor.vectorQuery(3))
    if model_type == 2:
        processor.BatchEvaluation()
def eval(testOn):
    """Compare the boolean and vector retrieval models with NDCG.

    Repeatedly samples random queries, runs both models on each, scores the
    rankings against the qrels ground truth, then reports average NDCG plus
    t-test and Wilcoxon significance values.

    testOn: when truthy, prints per-step timings/debug output and enables
    sanity assertions.

    Reads sys.argv: [1] index file, [2] query file, [3] qrels file,
    [4] number of random queries per iteration.

    NOTE(review): this function was reconstructed from whitespace-mangled
    source; the nesting of the summary/statistics section relative to the
    outer iteration loop is a best-effort reading — verify against the
    original repository.
    """
    k = 10  # k the number of top k pairs of (docID, similarity) to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  #v "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile = "src/Data/tempFile"
    #queryText = 'src/CranfieldDataset/query.text'
    #qrelsText = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries = 50
    numberOfTimeToLoop = 5
    #Loads Files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)
    #Data Need
    for i in range(numberOfTimeToLoop):
        #Get random Queiry
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery) == numberOfQueries, "Error are getting random query"
        # Return all query
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error are getting random query"
        #get list of Query result from qrel.txt
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping,
                                                 dictOfQuery)
        if testOn:
            assert len(dictQrelsText) == numberOfQueries, "Error number Of Queries to large"
        start = timer()
        queryProcessor = QueryProcessor(
            "", indexFile,
            docCollection.docs)  # This is an extremely expensive process\
        end = timer()
        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        start = timer()
        dictQ_ID = []
        # Note: the loop variable `queryText` shadows the qrels path read
        # from sys.argv above — harmless here, but easy to trip over.
        for qid, queryText in dictOfQuery.items():
            countDoc += 1
            dictQ_ID.append(qid)
            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])
            start = timer()
            docIDs = queryProcessor.booleanQuery(
            )  # data would need to be like this [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)
            start = timer()
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(
                k
            )  # data need to look like k=3 [[625,0.8737006126353902],[401,0.8697643788341478],[943,0.8424991316663082]]
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)
            #For Boolean part
            # Boolean results carry no ranking weight, so every returned doc
            # gets score 1; relevance (yTrue) comes from the qrels lookup.
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)  # ideal ordering for the NDCG reference
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreBool.append(0)  # no relevant docs -> count as 0
            else:
                NDCGScoreBool.append(score)
            end = timer()
            if testOn:
                print("Time for Boolean ndcg:", end - start)
            #For Vector part
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            # Vector results are (docID, similarity) pairs: the similarity is
            # the predicted score, relevance again comes from qrels.
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreVector.append(0)
            else:
                NDCGScoreVector.append(score)
            end = timer()
            if testOn:
                print("Time for Vector ndcg:", end - start)
        print("\nRunning Querys iteration:(", str(i + 1), ")\n", dictQ_ID)
        if testOn:
            for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool,
                                                   NDCGScoreVector):
                print("QID", QID, "Boolean Model:", boolScore, "Vector Model",
                      vectorScore)
        print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool),
              "==", len(NDCGScoreVector))
        print('\nThe Avg NDCG Score')
        vectorAvg = avg(NDCGScoreVector)
        BoolAvg = avg(NDCGScoreBool)
        print("Avg NDCG Score for Bool:", BoolAvg,
              "\nAvg NDCG Score for Vector:", vectorAvg)
        end = timer()
        if testOn:
            print("\n\nTime for running ", countDoc, " queries:", end - start)
        # Significance tests over the NDCG scores accumulated so far.
        print('\nThe P-Value')
        p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
        p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
        print("T-Test P-value: ", p_va_ttest)
        print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')
def query():
    '''The main query processing program, using QueryProcessor.

    sys.argv: [1] index file, [2] model selection ('0' boolean, '1' vector,
    '2' timing benchmark), [3] query file, [4] query id — which is reused as
    the number of random queries when model '2' is selected.

    NOTE(review): reconstructed from whitespace-mangled source; block
    nesting is a best-effort reading.
    '''
    #ndexFile = "src/Data/tempFile"
    #model_selection = "0"
    #queryText = 'src/CranfieldDataset/query.text'
    #query_id = "226"
    docCollection = CranFile('CranfieldDataset/cran.all')
    indexFile = sys.argv[1]
    model_selection = sys.argv[2]
    queryText = sys.argv[3]
    query_id = sys.argv[4]
    query_id = str(query_id).zfill(3)  # need for number 001 or 050
    queryTest = ""
    queryFile = loadCranQry(queryText)  #Data Need
    # Models 0/1 need the actual query text; model 2 only needs the count.
    if not model_selection == '2':
        queryTuple = queryFile[query_id]
        if query_id == queryTuple.qid:
            queryTest = queryTuple.text
    queryProcessor = QueryProcessor(queryTest, indexFile, docCollection.docs)
    if model_selection == "0":
        docIDs = queryProcessor.booleanQuery()
        print("Boolean")
        print("Total number of documents is:",
              str(len(docIDs)) + "\nTheir DocIDs our:" + str(docIDs))
    elif model_selection == "1":
        print("Vector")
        print(queryProcessor.vectorQuery(3))
    elif model_selection == "2":
        # Benchmark mode: time both models over several batches of random
        # queries and print a per-run timing table.
        numberOfTimeToLoop = 5
        numberOfQueries = int(query_id)  # query_id doubles as the batch size
        k = 10
        bresults = []  # per-run wall-clock time of the boolean model
        vresults = []  # per-run wall-clock time of the vector model
        #Data Need
        for _ in range(numberOfTimeToLoop):
            #get list of Query result from qrel.txt
            dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
            queryProcessor = QueryProcessor(
                "", indexFile,
                docCollection.docs)  # This is an extremely expensive process\
            start = timer()
            for __, queryText in dictOfQuery.items():
                queryProcessor.loadQuery(queryText)
                #docIDs = queryProcessor.booleanQuery()
                queryProcessor.booleanQuery()
            end = timer()
            # print("Run:",i+1, "\nTime for boolean model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            bresults.append(end - start)
            start = timer()
            for __, queryText in dictOfQuery.items():
                #listOfDocIDAndSimilarity = queryProcessor.vectorQuery(k)
                queryProcessor.vectorQuery(k)
            end = timer()
            # print("Run:",i+1, "\nTime for Vector model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            vresults.append(end - start)
        print("Model\t\tRun:" +
              '\t\t\tRun:'.join(map(str, range(numberOfTimeToLoop + 1)[1:])))
        print()
        print("Boolean Model: \t" + '\t'.join(map(str, bresults)))
        print()
        print("Vector Model: \t" + '\t'.join(map(str, vresults)))
        print()
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    """NDCG evaluation of the boolean and vector models on random queries.

    Samples `numberofrandomqueries` query ids, runs both models through
    query(), converts each result list into a binary relevance vector using
    the relevance file, scores with ndcg_score, and finally prints a
    Wilcoxon p-value comparing the two models.

    NOTE(review): reconstructed from whitespace-mangled source; indentation
    is a best-effort reading.
    """
    # ToDo
    actual = []
    #
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')
    qrys = loadCranQry("query.text")
    validqueries = []
    querycounter = 0
    for q in qrys:
        validqueries.append(int(q))
    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load("index_file.pickle")
    # print("index loaded")
    cf = CranFile('cran.all')
    #QueryProcessor.numberofresult =10
    #qp = QueryProcessor(qrys,loadiindex,cf.docs,10)
    # Build queryRelevence: zero-padded query id -> list of relevant doc ids,
    # read line-by-line from the relevance (qrels) file.
    queryRelevence = dict()
    for line in open(queryrefilename):
        fields = line.split(" ")
        fields[0] = '%0*d' % (3, int(fields[0]))
        if fields[0] in queryRelevence:
            # and let's extract the data:
            queryRelevence[fields[0]].append(fields[1])
        else:
            # create a new array in this slot
            queryRelevence[fields[0]] = [fields[1]]
    # Re-key the relevance map onto the valid query ids, padded to 3 digits.
    replacecounter = 0
    queryRelevenceUpdated = {}
    for k in queryRelevence:
        queryRelevenceUpdated['%0*d' % (3, int(
            validqueries[replacecounter]))] = queryRelevence.get(k)
        replacecounter = replacecounter + 1
    # relevent = list(queryRelevence.keys())
    # relevent = list(map(int, relevent))
    #samplespace = np.intersect1d(relevent, validqueries)
    list_of_random_items = random.sample(validqueries, numberofrandomqueries)
    tempcounter2 = 0
    booleanndcg = []
    vectorndcg = []
    while tempcounter2 < numberofrandomqueries:
        list_of_random_items[tempcounter2] = '%0*d' % (
            3, int(list_of_random_items[tempcounter2]))
        print('query for which ndcg is calculated ' +
              str(list_of_random_items[tempcounter2]))
        y = str(list_of_random_items[tempcounter2])
        # Vector model: top-10 result, then turn each doc id into 1/0
        # depending on whether it appears in the relevance list.
        vectorresult = query(indexfilename, '1', queryfilename,
                             str(list_of_random_items[tempcounter2]), 10)
        # vectorresult = ['573', '51', '944', '878', '12', '486', '875', '879', '746', '665']
        # print(vectorresult)
        tempcounter = 0
        for z in vectorresult:
            if z in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                vectorresult[tempcounter] = 1
            else:
                vectorresult[tempcounter] = 0
            tempcounter = tempcounter + 1
        #print(vectorresult)
        # Ideal ranking = the same relevance labels sorted descending.
        idealvectorresult = vectorresult.copy()
        idealvectorresult.sort(reverse=True)
        #print(idealvectorresult)
        if sum(idealvectorresult) == 0:
            ndcgscore = 0  # no relevant docs retrieved -> NDCG defined as 0
        else:
            ndcgscore = ndcg_score(idealvectorresult, vectorresult)
        # print(ndcgscore)
        vectorndcg.append(ndcgscore)
        tempcounter3 = 0
        # Boolean model: same relevance labelling over its result list.
        booleanqueryresult = query(indexfilename, '0', queryfilename,
                                   str(list_of_random_items[tempcounter2]), 10)
        #booleanqueryresult = ['462','462','462','462','462','462','462','462','462']
        booleanquery = booleanqueryresult.copy()
        for g in booleanquery:
            if g in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                booleanquery[tempcounter3] = 1
            else:
                booleanquery[tempcounter3] = 0
            tempcounter3 = tempcounter3 + 1
        #print(booleanquery)
        # Pad the boolean labels out to 10 entries with zeros.
        tempcounter4 = len(booleanquery)
        while tempcounter4 < 10:
            booleanquery.append(0)
            tempcounter4 = tempcounter4 + 1
        # Ideal boolean ranking: as many 1s as there are relevant docs, capped at 10.
        idealbooleanresult = []
        for i in range(0, 10):
            if i < len(queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]):
                idealbooleanresult.append(1)
            else:
                idealbooleanresult.append(0)
        idealbooleanresult.sort(reverse=True)
        if sum(booleanquery) == 0:
            ndcgscoreboolean = 0
        else:
            ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult)
        booleanndcg.append(ndcgscoreboolean)
        tempcounter2 = tempcounter2 + 1
    print('P value for all the queries processed is:')
    print(
        scipy.stats.wilcoxon(vectorndcg,
                             booleanndcg,
                             zero_method='wilcox',
                             correction=False))
    print('Done')
# NOTE(review): the next statements appear to be the tail of a query()
# function whose header was lost when this file was assembled; kept verbatim.
qp.preprocessing()
if (processing_algorithm == 0):
    qp.booleanQuery()
else:
    qp.vectorQuery(3)


def getDoc(qrys):
    """Collect the raw text of every query in qrys (mapping id -> query object)."""
    myDoc = []
    for doc in qrys:
        myDoc.append(qrys[doc].text)
    return myDoc


if __name__ == '__main__':
    # query.text is retrieved from loadCranQry
    qrys = loadCranQry('query.text')  #loadCranQry('query.text')
    invertedInd = InvertedIndex()
    index = invertedInd.load(
        "index_file.pickle")  # sys.argv[1] # arg 1 in command line is pickle file
    # qr = QueryProcessor(query, index)
    coll = getDoc(qrys)
    #qr = QueryProcessor(qrys, index, coll)
    qr = QueryProcessor(qrys, index)
    qr.preprocessing('009')
    alg = '0'  # sys.argv[2] # arg 2 is 0 for bool, 1 for vector
    if (alg == '0'):
        qr.booleanQuery()
    else:
        # NOTE(review): the body of this else-branch is missing in the
        # original source — the file is truncated here.
def eval(index_file, query_file, qrels_File, number_of_queries):
    """Evaluate boolean vs. vector retrieval with NDCG and write a CSV report.

    Runs 5 iterations; each iteration samples `number_of_queries` random
    query ids, scores both models against the qrels ground truth, prints the
    per-iteration averages and Wilcoxon p-value, and appends a row to
    Evaluation_search.csv.

    NOTE(review): reconstructed from whitespace-mangled source; indentation
    is a best-effort reading.
    """
    #read queryfile,indexfile
    # ToDo
    queries = loadCranQry(query_file)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    #print(queries_id_list)
    #read querls.txt
    qrels_dict = process_querls_file(qrels_File, queries_id_list)
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved at from part 1
    index = InvertedIndex().load(index_file)
    qp = QueryProcessor(queries, index, inputdocument, number_of_queries)
    queries_id_list_int = [int(x) for x in qrels_dict.keys()]
    queries_id_ls = [int(x) for x in queries.keys()]
    #IdeaVectorsforQuery_ids={}
    sumbooleanNADC = []
    sumvectorNADC = []
    with open('Evaluation_search.csv', 'w') as f:
        f.write("%s,%s,%s,%s\n" %
                ("Iteration", "AverageNDCG-booleanModel",
                 "AverageNDCG-vectorModel", "P-value"))
        for i in range(0, 5):
            vectorNADC = []
            booleanNADC = []
            intersection_queries = list(
                set(queries_id_list_int) & set(queries_id_ls))
            random_query_id_list = random.sample(queries_id_list_int,
                                                 number_of_queries)
            #random_query_id_list=[153, 18]
            #print(random_query_id_list)
            for q_id in random_query_id_list:
                print("Processing for Query ID ::", q_id)
                qp.querynumber = q_id
                #boolean_res=qp.booleanQuery()
                # Vector model: top-5 (docid, similarity) pairs.
                vector_top3 = qp.vectorQuery(5)
                #vector_top3=[('12',0.34),('746',0.33),('875',0.24)]
                #print(boolean_res)
                print("Output for Vector Model Result::", vector_top3)
                if (vector_top3.__len__() < 1):
                    vectorNADC.append(0)  # empty result -> NDCG 0
                else:
                    vector_label = [x[0] for x in vector_top3]
                    score = [x[1] for x in vector_top3]
                    print("DocumentIDs of Vector Model Result:: ",
                          vector_label)
                    print("Scores of Vector Model Result::", score)
                    # Translate returned doc ids into 1/0 relevance labels
                    # via the qrels lookup.
                    true_label = vector_label.copy()
                    query_id = str(q_id)
                    for x in vector_label:
                        #str_x="{0:0=3d}".format(x)
                        ind = vector_label.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual Vector:: ", true_label)
                    print("Predicted Vector:: ", score)
                    if sum(true_label) == 0:
                        vectorNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Vector::", ndcg)
                        vectorNADC.append(ndcg)
                # Boolean model: unranked, so every hit gets predicted score 1.
                boolean_res = qp.booleanQuery()
                print("output of boolean_res:: ", boolean_res)
                if boolean_res.__len__() < 1:
                    booleanNADC.append(0)
                else:
                    score = [1] * len(boolean_res)
                    if (score.__len__() < 5):
                        leng = 5 - (score.__len__())
                        score.extend([0] * leng)
                    true_label = boolean_res.copy()
                    query_id = str(q_id)
                    for x in boolean_res:
                        ind = boolean_res.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual boolean:: ", true_label)
                    print("Predicted boolean:: ", score)
                    if sum(true_label) == 0:
                        booleanNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Boolean::", ndcg)
                        booleanNADC.append(ndcg)
            # Per-iteration averages and Wilcoxon significance test.
            print("Calculated NADC sum for all queries", vectorNADC)
            avergae_vectorNADC = float(sum(vectorNADC) / number_of_queries)
            print("Calculated NADC sum for all queries", booleanNADC)
            avergae_booleanNADC = float(sum(booleanNADC) / number_of_queries)
            print("Avergae NADC Vector::", avergae_vectorNADC)
            print("Avergae NADC boolean::", avergae_booleanNADC)
            p_value = scipy.stats.wilcoxon(vectorNADC,
                                           booleanNADC,
                                           zero_method='wilcox',
                                           correction=False)
            print(i, str(avergae_booleanNADC), str(avergae_vectorNADC),
                  str(p_value[1]))
            p = "%.20f" % float(str(p_value[1]))
            print('P value for all the queries processed is:', p)
            f.write("%s,%s,%s,%s\n" % (i + 1, str(avergae_booleanNADC),
                                       str(avergae_vectorNADC), str(p)))
    print('Done')
def eval():
    """Evaluate boolean vs. vector retrieval with NDCG over N random queries.

    Algorithm:
      - Pick N random samples from query.txt
      - Get top 10 results from bool query for each rnd query
      - Get top 10 results from vector query for each rnd query
      - Compute NDCG btn bool query results and qrels.txt
      - Compute NDCG btn vector query results and qrels.txt
      - Get p-value btn bool and vector

    Relies on module-level `query_path`, `index_file`, `qrels_path`, `n`.
    """
    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc ids.
    # FIX: setdefault replaces the original's if/else insert-or-append.
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        qrel_dict.setdefault(int(qrel_split[0]),
                             []).append(int(qrel_split[1]))

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID, zero-padded to the 3-digit key format.
        query_id = choice(poss_queries)
        if 0 < int(query_id) < 10:
            query_id = '00' + str(int(query_id))
        elif 9 < int(query_id) < 100:
            query_id = '0' + str(int(query_id))
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = [v[0] for v in vector_result]
        vector_scores = [v[1] for v in vector_result]
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector, vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average out score lists.
    # FIX: sum()/len() replaces manual accumulation loops whose loop
    # variables shadowed the builtin `bool`.
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
def to_ndcg(qrels, q_text, idx_file, tk=10, n=2):
    """Return (bool_agg_ndcg, vec_agg_ndcg): NDCG lists for both models over
    n randomly chosen query ids.

    Python 2 code (print statements, dict.iteritems).  Several inputs are
    currently hard-coded for testing; the parameter-driven lines are kept
    commented out below.

    NOTE(review): reconstructed from whitespace-mangled source; the exact
    indentation of the `idx += 1` in the vector loop is a best-effort
    reading.
    """
    column_names = ['qid', 'docid', 'bool_rel', 'vec_rel'
                    ]  #for creating a dataframe for easier data manupilation
    #df_qrels = pd.read_csv('../CranfieldDataset/qrels.text', names=column_names, sep=' ') #can test by hard-coding
    df_qrels = pd.read_csv('../CranfieldDataset/qrels.sample',
                           names=column_names,
                           sep=' ')  #can test by hard-coding
    #df_qrels = pd.read_csv(qrels, names=column_names, sep=' ')
    #print df_qrels
    # Sample n distinct query ids at random.
    unique_qids = list(set(list(df_qrels.qid.values)))
    random.shuffle(unique_qids)
    random_qids = unique_qids[0:n]
    qrys = cranqry.loadCranQry('../CranfieldDataset/query.text'
                               )  #qrys is a dict---for hard-coded testing
    #qrys = cranqry.loadCranQry(q_text) #qrys is a dict
    qrys_ids = [key for key, val in qrys.iteritems()]
    II = index.InvertedIndex()
    index_file = II.load("index_file.json")  #for hard-coded testing
    #index_file = II.load(idx_file)
    vec_agg_ndcg, bool_agg_ndcg = list(), list(
    )  #for storing aggregate ndcg scores
    for qid in random_qids:
        print qid
        df_qid = df_qrels[
            df_qrels["qid"] ==
            qid]  #dataframe for one query id---comparison of an integer qid in a string qid
        qid_docids = list(
            df_qid['docid']
        )  #list of doc ids for a randomly chosen query id from qrels.text---to be used for ndcg_score
        print qid_docids
        st_qid = str(
            qid
        )  #very important----the decimal number in random_qids should be matched the octal numbers in the cranfield dataset
        if len(st_qid) == 1:  #for handing decimal to octal qid conversion
            st_qid = "00" + st_qid
        elif len(st_qid) == 2:
            st_qid = "0" + st_qid
        else:
            st_qid = st_qid
        if st_qid in qrys_ids:
            qp = QueryProcessor(qrys[st_qid].text, index_file, 'cran.all')
            bool_array = qp.booleanQuery()
            vec_array = qp.vectorQuery(10)  #change back to 'tk'
            print bool_array
            bool_array = [int(v) for v in bool_array]
            print bool_array
            #ndcg for boolean model
            # bool_list holds (relevance, predicted) pairs; the boolean model
            # is unranked so every retrieved doc gets predicted score 1.
            bool_list = [(0, 0)] * 10  #change back to tk
            idx = 0
            for doc_id in bool_array:
                if doc_id in qid_docids:  #iteratively check if a docid returned by the vector model is present in qrels.text for the specific query(qid)
                    #y_true[idx] = 1
                    bool_list[idx] = (1, 1)
                    idx += 1
                else:
                    bool_list[idx] = (0, 1)
                if idx == 10:
                    break
            #print bool_list
            y_true = [int(bool_id[0]) for bool_id in bool_list]
            y_score = [int(bool_id[1]) for bool_id in bool_list]
            print "bool", y_true
            print "bool", y_score
            bool_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))
            #ndcg for vector model
            print vec_array
            y_score = [
                vec_id[1] for vec_id in vec_array
            ]  #y_score--to be passed to ndcg_score is the list of cosine similarity scores
            vec_ids = [
                int(vec_id[0]) for vec_id in vec_array
            ]  #list of docids from the list of tuples of the form (docid, similarity_score)
            #print vec_ids
            y_true = [0] * 10  ##added on 0317---change back to tk
            idx = 0
            for doc_id in vec_ids:
                if doc_id in qid_docids:  #iteratively check if a docid returned by the vector model is present in qrels.text for the specific query(qid)
                    y_true[idx] = 1
                idx += 1
            print "vec", y_true
            print "vec", y_score
            vec_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))
            del qp  ##garbage collection
    return bool_agg_ndcg, vec_agg_ndcg
# NOTE(review): orphaned if/else tail — the opening of this conditional was
# lost when this file was assembled; kept verbatim.
qp.booleanQuery()
else:
    qp.vectorQuery(3)


if __name__ == '__main__':
    # Hard-coded arguments; the sys.argv versions are kept commented out.
    #index_file = str(sys.argv[1]) #index_file.pickle
    #algo = int(sys.argv[2]) # 0
    #query_text = str(sys.argv[3]) #query.text
    #queryId = str(sys.argv[4]) # '009'
    index_file = "index_file.pickle"
    algo = 0
    query_text = "query.text"
    queryId = '009'
    qrys = loadCranQry(query_text)
    invertedInd = InvertedIndex()
    #loading the indexed doucment file
    index = invertedInd.load(index_file)
    #no need to use below one
    #coll = getDoc(qrys)
    #query(alogo, qrys, index, queryId)
    qr = QueryProcessor(qrys, index)
    qr.preprocessing(queryId)
    #There are two types of queries
    # 1. is booleanQuery, 2. vectoryQuery
def eval():
    """Stub evaluation routine: loads the Cranfield query file from the
    module-level `queryfile` path, then reports completion."""
    loaded_queries = cranqry.loadCranQry(queryfile)
    print('Done')
def test(index_loc, cran_loc, qrels_loc):
    '''Thorough self-test of the index and query processor.

    Args:
        index_loc: path to the saved inverted-index file.
        cran_loc: path to the Cranfield collection (cran.all).
        qrels_loc: path to the relevance-judgements file.

    Prints a pass/fail line for each check; returns nothing.
    '''
    ##### SETUP ITEMS #####
    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc ids
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct:
    # pick a word and check that term frequency equals the number of
    # stored positions for every posting
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct: recompute log10(N / df) by hand and compare
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    # it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####
    # Here, very specific boolean queries are used to ensure that a
    # limited, predictable set of documents is returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches a very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly:
    # "gravel" and "stagnation" have completely distinct postings lists,
    # so OR should simply merge them
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly:
    # the posting list for "diameter" is a subset of "slipstream" postings
    # (oddly enough), so "slipstream and not diameter" should yield
    # slipstream's postings minus those of diameter
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
          == slip_not_diam)

    # Ensure AND/OR operand order doesn't matter (commutativity)
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for
    # AND-AND and OR-OR queries (associativity)
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
          QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
          QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items:
    # tested by doing the query "manually" by anding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii, cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
          QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####
    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:", qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text.
    # As long as one-fifth of top-10 are in gt_result, call it a pass.
    # Note that queries with larger answer sets were chosen to
    # ensure there were enough to get to one-fifth of ten.
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)
def _ndcg_at_k(results, qrels_dict, q_id, k=5):
    """Score one ranked result list against the relevance judgements.

    Args:
        results: list of (doc_id, score) tuples from vectorQuery.
        qrels_dict: mapping of qid (str) -> list of relevant doc ids.
        q_id: query number (int).
        k: NDCG cutoff; the relevance vector is padded with 0s up to k.

    Returns:
        NDCG@k as a float; 0.0 when there are no results or no relevant hits.
    """
    if len(results) < 1:
        return 0.0
    doc_ids = [x[0] for x in results]
    scores = [x[1] for x in results]
    print("DocumentIDs of Vector Model Result:: ", doc_ids)
    print("Scores of Vector Model Result::", scores)
    relevant = qrels_dict.get(str(q_id))
    # Positional relevance: 1 where the doc at that rank is judged relevant.
    # (enumerate, not list.index, so duplicate doc ids don't collapse.)
    true_label = [1 if d in relevant else 0 for d in doc_ids]
    if len(true_label) < k:
        # Pad to k entries (was previously padded to 10, mismatching k=5).
        true_label.extend([0] * (k - len(true_label)))
    print("Actual Vector:: ", true_label)
    print("Predicted Vector:: ", scores)
    if sum(true_label) == 0:
        return 0.0
    ndcg = metrics.ndcg_score(true_label, scores, k)
    print("Calculated ndcg for Vector::", ndcg)
    return ndcg


def VectorCompare():
    """Compare two vector-model scoring variants with NDCG@5.

    Runs five fixed queries through ``vectorQuery(5)`` and
    ``vectorQuery(5, True)``, computes NDCG@5 for each against qrels.text,
    prints the per-query and average scores, and reports a Wilcoxon
    signed-rank p-value between the two score lists.
    """
    queries = loadCranQry("query.text")
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved at from part 1
    index = InvertedIndex().load("index_file")
    qp = QueryProcessor(queries, index, inputdocument, 10)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    # Ground truth parsed from qrels.text: qid -> relevant doc ids.
    qrels_dict = process_querls_file("qrels.text", queries_id_list)

    selected_qids = [4, 29, 53, 58, 100]
    vectorNADC1 = []
    vectorNADC2 = []
    for q_id in selected_qids:
        qp.querynumber = q_id
        vector_top3 = qp.vectorQuery(5)
        vector2_top3 = qp.vectorQuery(5, True)  # second scoring variant
        print("Output for Vector Model Result::", vector_top3)
        vectorNADC1.append(_ndcg_at_k(vector_top3, qrels_dict, q_id, 5))
        vectorNADC2.append(_ndcg_at_k(vector2_top3, qrels_dict, q_id, 5))

    print("Calculated NADC sum for all queries", vectorNADC1)
    avergae_vectorNADC = float(sum(vectorNADC1) / 5)
    print("Calculated NADC sum for all queries", vectorNADC2)
    avergae_vectorNADC2 = float(sum(vectorNADC2) / 5)
    print("Avergae NADC Vector::", avergae_vectorNADC)
    print("Avergae NADC boolean::", avergae_vectorNADC2)
    print(vectorNADC1)
    print(vectorNADC2)
    # Paired significance test between the two per-query NDCG lists.
    p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2,
                                   zero_method='wilcox', correction=False)
    p = "%.20f" % float(str(p_value[1]))
    print('P value for all the queries processed is:', p)
import cran import query from cranqry import loadCranQry from index import InvertedIndex, test from query import QueryProcessor print("***************Test Cases Running for Index File****************") invertedobj = InvertedIndex() test(invertedobj) print("***************Test Cases Running for Query File****************") # load documents inputdocument = cran.CranFile("cran.all") # load the index file saved at from part 1 index = InvertedIndex().load("index_file") # load query processed files queries = loadCranQry("query.text") qp = QueryProcessor(queries, index, inputdocument, 29) query.test(qp) qp = QueryProcessor(queries, index, inputdocument, 29) qp.vectorQuery(3)
def eval(index_file, query_text, qrels, n):
    """Evaluate boolean vs. vector retrieval with NDCG on n random queries.

    Args:
        index_file: path to the saved inverted-index file.
        query_text: path to the Cranfield query file.
        qrels: path to the relevance-judgements file (previously ignored
            and hard-coded to "qrels.text").
        n: number of distinct query ids to sample.

    Prints per-query diagnostics, the NDCG score lists for both models, and
    the Wilcoxon / t-test p-values comparing them.
    """
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = list(queries.keys())
    query_ids.sort()
    query_ids_ints = []
    for k in range(0, len(query_ids)):
        query_ids_ints.append(int(query_ids[k]))

    # Sample n distinct query ids at random.
    set1 = set()
    while len(set1) != n:
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)

    # Parse the relevance judgements: qid -> [relevant doc ids].
    # The file handle is now closed deterministically via `with`.
    qrels_map = {}
    with open(qrels, "r") as f:
        for line in f:
            j = line.split(" ")
            key = query_ids_ints[int(j[0]) - 1]
            if key in qrels_map:
                qrels_map[key].append(int(j[1]))
            else:
                qrels_map[key] = [int(j[1])]

    # Match queries in query.text with the integer qids used in qrels.text.
    # (Reuses `qrys` instead of re-loading the same file a second time.)
    dict_query = {}
    for q in qrys:
        dict_query[int(q)] = qrys[q].text

    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)

    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        # Fetch the first 10 documents for the query using the vector model.
        result_list = QPobj.vectorQuery(10)
        # Fetch documents for the query using the boolean model.
        boolean_result_list = QPobj.booleanQuery()
        print("Boolean query result : ", boolean_result_list)

        truth_list = qrels_map[q]
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        print("Relavant documents for this query : ", truth_list)
        print("Vector model result : ", rank_doc_list)

        # Predicted relevance for the boolean model (1 = in qrels).
        boolean_output_list = []
        for id in boolean_result_list:
            if int(id) in truth_list:
                boolean_output_list.append(1)
            else:
                boolean_output_list.append(0)
        # Normalize to exactly 10 entries: truncate long lists, pad short
        # ones with 0s. (The old `while len != 10: append(0)` never
        # terminated when more than 10 documents were returned.)
        boolean_score_list = boolean_output_list[:10]
        boolean_score_list.extend([0] * (10 - len(boolean_score_list)))

        # Predicted relevance for the vector model, in rank order.
        vector_score_list = []
        for id in rank_doc_list:
            if id in truth_list:
                vector_score_list.append(1)
            else:
                vector_score_list.append(0)
        vector_score_dict[q] = vector_score_list

        # Ground-truth (ideal) orderings: the same relevance values
        # sorted best-first.
        truth_score_list = sorted(vector_score_list, reverse=True)
        boolean_truth_score_list = sorted(boolean_score_list, reverse=True)

        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n", boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list),
                       np.array(vector_score_list))
        ]

    # Collect NDCG scores for both models across all sampled queries.
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])
    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
    print("ndcg scores list for boolean model for all the queries:\n",
          boolean_list)
    print("ndcg scores list for vector model for all the queries:\n",
          vector_list)

    # Significance tests (Wilcoxon signed-rank and Welch's t-test)
    # between the boolean and vector NDCG score lists.
    p_value_wilcoxon = stats.wilcoxon(np.array(boolean_list),
                                      np.array(vector_list))
    p_value_ttest = stats.ttest_ind(np.array(boolean_list),
                                    np.array(vector_list),
                                    equal_var=False)
    print("wilcoxon test p value is:", p_value_wilcoxon[1])
    print("ttest p value is :", p_value_ttest[1])