class SearchEngine: # DO NOT MODIFY THIS SIGNATURE # You can change the internal implementation, but you must have a parser and an indexer. def __init__(self, config=None): self._config = config self._parser = Parse() self._indexer = Indexer(config) self._model = None # DO NOT MODIFY THIS SIGNATURE # You can change the internal implementation as you see fit. def build_index_from_parquet(self, fn): """ Reads parquet file and passes it to the parser, then indexer. Input: fn - path to parquet file Output: No output, just modifies the internal _indexer object. """ df = pd.read_parquet(fn, engine="pyarrow") documents_list = df.values.tolist() # Iterate over every document in the file number_of_documents = 0 for idx, document in enumerate(documents_list): # parse the document parsed_document = self._parser.parse_doc(document) number_of_documents += 1 # index the document data self._indexer.add_new_doc(parsed_document) self._indexer.add_square_Wij() to_Save=(self._indexer.inverted_idx, self._indexer.postingDict, self._indexer.num_of_docs, self._indexer.avg_Size_doc,self._indexer.doc_info) utils.save_obj(to_Save, "index_4") print('Finished parsing and indexing.') #print(sorted( self._indexer.inverted_idx,key=lambda x: self._indexer.inverted_idx[x])) # DO NOT MODIFY THIS SIGNATURE # You can change the internal implementation as you see fit. def load_index(self, fn): """ Loads a pre-computed index (or indices) so we can answer queries. Input: fn - file name of pickled index. """ obj = utils.load_obj(fn) self._indexer.inverted_idx = obj[0] self._indexer.postingDict = obj[1] self._indexer.num_of_docs = obj[2] self._indexer.avg_Size_doc = obj[3] self._indexer.doc_info = obj[4] # DO NOT MODIFY THIS SIGNATURE # You can change the internal implementation as you see fit. def load_precomputed_model(self,model_dir=None): """ Loads a pre-computed model (or models) so we can answer queries. This is where you would load models like word2vec, LSI, LDA, etc. and assign to self._model, which is passed on to the searcher at query time. """ pass # DO NOT MODIFY THIS SIGNATURE # You can change the internal implementation as you see fit. def search(self, query,k=2000): """ Executes a query over an existing index and returns the number of relevant docs and an ordered list of search results. Input: query - string. Output: A tuple containing the number of relevant search results, and a list of tweet_ids where the first element is the most relavant and the last is the least relevant result. """ searcher = Searcher(self._parser, self._indexer) return searcher.search(query,k) def main(self,output_path,stemming,query_to_check,num_docs_to_retrieve): self.build_index_from_parquet("data/benchmark_data_train.snappy.parquet") if isinstance(query_to_check, list): queries = query_to_check elif isinstance(query_to_check, str): if query_to_check.endswith(".txt"): try: with open(query_to_check, "r",encoding="utf-8") as queries: queries = queries.readlines() query2 = [] for q in queries: if (q != "\n"): query2.append(q) queries=query2 except FileNotFoundError as e: print(e) else: queries = [query_to_check] else: return if (stemming): output_path = output_path + "/WithStem" else: output_path = output_path + "/WithoutStem" query_num = 1 queries = pd.read_csv(os.path.join('data', 'queries_train.tsv'), sep='\t') for i, row in queries.iterrows(): q_id = row['query_id'] q_keywords = row['keywords'] start = time.time() mylist = self.search(q_keywords, num_docs_to_retrieve) answer_to_run = mylist[1] for doc_tuple in answer_to_run: print('tweet id: {}'.format(doc_tuple)) query_num += 1 print("time that toke to retrieve :" + str(time.time() - start))