def psuedoRFSearch():
    """Run the pseudo-relevance-feedback retrieval for the command-line query.

    Reads the raw query string from ``sys.argv[1]``, expands it into query
    objects via ``TransformQuery.getQuries``, retrieves results for each of
    them, and prints one table with the columns ``Doc_no``, ``rank``,
    ``Product_title`` and ``Result_score``. ``rank`` restarts at 1 for every
    query.

    Raises:
        IndexError: if no query argument was supplied on the command line.
    """
    raw_query = sys.argv[1]
    columns = ['Doc_no', 'rank', 'Product_title', 'Result_score']

    index = MyIndexReader.MyIndexReader()
    pesudo_search = PseudoRFRetrievalModel.PseudoRFRetreivalModel(index)
    extractor = TransformQuery.TransformQuery()

    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for parsed_query in extractor.getQuries(raw_query):
        print(parsed_query.queryId, "\t", parsed_query.queryContent)
        # NOTE(review): 20 / 100 / 0.4 are passed through unchanged;
        # presumably top-N, number of feedback documents and a mixing
        # weight -- confirm against PseudoRFRetreivalModel.retrieveQuery.
        results = pesudo_search.retrieveQuery(parsed_query, 20, 100, 0.4)
        for rank, result in enumerate(results, start=1):
            rows.append({
                'Doc_no': result.getDocNo(),
                'rank': rank,
                'Product_title': result.getDocTitle(),
                'Result_score': result.getScore(),
            })

    df_psuedoRF = pd.DataFrame(rows, columns=columns)
    print(df_psuedoRF)
def indexRead(term):
    """Look up ``term`` in the default index and walk its posting list.

    Nothing is printed or returned; the function only issues the index
    look-ups (document frequency, collection frequency and, when the term
    occurs in at least one document, the document number of every posting).
    """
    reader = MyIndexReader.MyIndexReader()

    doc_freq = reader.DocFreq(term)
    # The collection frequency is fetched but its value is not used here.
    reader.CollectionFreq(term)

    if not doc_freq > 0:
        return

    postings = reader.getPostingList(term)
    for doc_id in postings:
        # The document number is resolved but deliberately not reported.
        reader.getDocNo(doc_id)
def ReadIndex(type, token):
    """Print frequency statistics and the posting list of ``token``.

    Opens an index reader for ``type``, prints how many documents contain
    ``token`` and how often it occurs in total, then prints one
    tab-separated line per posting: document number, document id and the
    value stored for that id in the posting list.
    """
    # Open the index reader for the requested index type.
    reader = MyIndexReader.MyIndexReader(type)

    # Gather the statistics for the token.
    doc_freq = reader.DocFreq(token)
    coll_freq = reader.CollectionFreq(token)
    summary = "".join([
        " >> the token \"",
        token,
        "\" appeared in ",
        str(doc_freq),
        " documents and ",
        str(coll_freq),
        " times in total",
    ])
    print(summary)

    if not doc_freq > 0:
        return

    postings = reader.getPostingList(token)
    for doc_id in postings:
        doc_no = reader.getDocNo(doc_id)
        print("\t".join([doc_no, str(doc_id), str(postings[doc_id])]))
def qrmSearch():
    """Run the query-retrieval model for the command-line query.

    Reads the raw query string from ``sys.argv[1]``, expands it into query
    objects via ``TransformQuery.getQuries``, retrieves results for each of
    them, and collects them into a table with the columns ``Doc_no``,
    ``rank``, ``Product_title`` and ``Result_score``. ``rank`` restarts at 1
    for every query.

    Raises:
        IndexError: if no query argument was supplied on the command line.
    """
    raw_query = sys.argv[1]
    columns = ['Doc_no', 'rank', 'Product_title', 'Result_score']

    index = MyIndexReader.MyIndexReader()
    search = QueryRetreivalModel.QueryRetrievalModel(index)
    extractor = TransformQuery.TransformQuery()

    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for parsed_query in extractor.getQuries(raw_query):
        # NOTE(review): 20 is passed through unchanged; presumably the
        # number of results to retrieve -- confirm against
        # QueryRetrievalModel.retrieveQuery.
        results = search.retrieveQuery(parsed_query, 20)
        for rank, result in enumerate(results, start=1):
            rows.append({
                'Doc_no': result.getDocNo(),
                'rank': rank,
                'Product_title': result.getDocTitle(),
                'Result_score': result.getScore(),
            })

    # NOTE(review): the table is built but neither printed nor returned,
    # exactly as before; the return value stays None so existing callers
    # are unaffected. Confirm whether it should be exposed.
    df_qrm = pd.DataFrame(rows, columns=columns)
processed_file = Path.ResultHM1 + 'result_' + user + '.txt' names_file = Path.FilesDictDir + 'dict_' + user + '.txt' dict_file = Path.IndexDir + "dictionary_" + user postings_file = Path.IndexDir + "postings_" + user dr = open(names_file, "r", encoding='cp437') while True: l = dr.readline() if not l: break l2 = l.split(":") doc_to_title_dict[int(l2[0])] = l2[1] pass index = MyIndexReader.MyIndexReader(user) search = QueryRetreivalModel.QueryRetrievalModel(index, user) #preprocessing and indexing ends extractor = ExtractQuery.ExtractQuery(Query_string) #query execution starts query = extractor.getQuries() results = search.retrieveQuery(query, 4, miu, doc_to_title_dict, user) rank = 1 final_json = [] if len(results) == 0: print("[]") else: for result in results: