def main():
    ''' The main loop for the program '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path = r.get_all_path_of_parquet()[0][0] + r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()
    # se.load_existing_index()  # load if it exists, otherwise return an empty list
    while True:
        event, values = g.window.read()
        if event is None:
            break
        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to the output element
            index = 0
            for tweet_id in tweets_id:
                if index < 25:
                    print("%s. tweet id: %s" % (index + 1, tweet_id))
                index += 1
            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None): """ This function build the inverted index over the corpus. send each tweet to parsing and indexing. if the stemming is True the parsing will use the stemmer on the tokens. :param glove_dict: Glove file including all word vectors :param corpus_path: root folder containing the raw tweet files :param output_path for the inverted index, posting files and tweets dictionary :param stemming if True use stemmer on terms """ config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path) r = ReadFile(corpus_path=config.get_corpusPath()) p = Parse(stemming) indexer = Indexer(config) all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet") all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths] start_time = time.time() file_counter = 0 for file_name in all_files_names: file_start_time = time.time() # print("start file :", file_counter) documents_list = [document for document in r.read_file(file_name=file_name)] # Iterate over every document in the file for idx, document in enumerate(documents_list): parsed_document = p.parse_doc(document) indexer.add_new_doc(parsed_document, glove_dict) # print("end file number ", file_counter, " in: ", time.time() - file_start_time) file_counter += 1 total_time = time.time() - start_time indexer.finish_indexing()
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Parses and indexes every file in the corpus, then saves the inverted index.
    """
    number_of_documents = 0
    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in each file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of docs in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing leftovers to disk')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End waiting')
    print('Finished writing leftovers to disk')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def __init__(self, config=None):
    if not config:
        self._config = ConfigClass()
    else:
        self._config = config
    self._parser = Parse()
    self._indexer = Indexer(self._config)
    self._model = None
    self._reader = ReadFile(self._config.get__corpusPath())
def test_add_new_doc(self):
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
    # Parse and index the first document, then check the index picked it up
    # (assumes the inverted_idx attribute used by the other indexers in this collection).
    parsed_document = p.parse_doc(documents_list[0])
    indexer.add_new_doc(parsed_document)
    assert indexer.inverted_idx
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()
    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                # tweet_ids is expected to be a module-level collection of ids to export
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing.
    :param corpus_path: root folder of the corpus
    :param output_path: destination folder for the index files
    :param stemming: boolean that says whether stemming should be applied
    :return: total number of tweets parsed
    """
    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
def main():
    config = ConfigClass()
    se = SearchEngine(config)
    se.build_index_from_parquet(
        r'C:\Users\Owner\Desktop\SearchEngine\Part C\data\benchmark_data_train.snappy.parquet')
    n_res, res, docs = se.search('vaccines move freely')
    df = pd.read_parquet(
        r'C:\Users\Owner\Desktop\SearchEngine\Part C\data\benchmark_data_train.snappy.parquet',
        engine="pyarrow")
    to_return = pd.DataFrame(columns=["query", "tweet_id"])
    for r in res:
        to_return = to_return.append({"query": 5, "tweet_id": r}, ignore_index=True)
        print(r)
        print([w for w in df[df.tweet_id == r].full_text.tolist()])
    to_return.to_csv("results6.csv", index=False)
    print(n_res)
def run_engine(corpus_path, output_path, stemming=False):
    """
    :param corpus_path: path to the parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean, whether to use stemming or not
    """
    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)
    if corpus_path.endswith('parquet'):
        # A single parquet file was given
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        # Read the directory batch by batch until it is exhausted
        documents_list = r.read_dir()
        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()
    indexer.merge_posting_files()
    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    config = ConfigClass(corpus_path, output_path, stemming)
    indexer = Indexer(config)
    run_engine(config, indexer)
    if isinstance(queries, list):
        queries_list = queries
    else:
        # queries_list = []
        # queries_file = open(queries, encoding='utf8')
        # lines = [l for l in queries_file.readlines() if l is not '\n']
        # for line in lines:
        #     queries_list.append(line[line.index('.') + 1: -1])
        queries_df = pd.read_table(queries)
        queries_list = list(queries_df['information_need'].values)
    lst_to_csv = [['Query_num', 'Tweet_id', 'Score']]
    for num, query in enumerate(queries_list):
        stime = time.time()
        print(f'query {num + 1}')
        n_relevant, ranked_doc_ids = search_and_rank_query(query=query, parser=Parse(config),
                                                           indexer=indexer, k=num_docs_to_retrieve)
        for tweet_id, score in ranked_doc_ids:
            print(f'Tweet id: {tweet_id}, Score: {score}')
            lst_to_csv.append([num + 1, tweet_id, score])
        print(f'time for query no. {num + 1} is {time.time() - stime}')
    with open('data\\analysis_data.csv', 'w', newline='') as file:
        file.truncate()
        writer = csv.writer(file)
        writer.writerows(lst_to_csv)
def main(): # print("start: ", time.asctime(time.localtime(time.time()))) config = ConfigClass() Engine = SearchEngine(config) # print("start: ", time.asctime(time.localtime(time.time()))) corpus_path = "C:\\Users\\ASUS\\Desktop\\data_part_c\\data\\benchmark_data_train.snappy.parquet" # corpus_path = "C:\\Users\\ASUS\\Desktop\\Data\\Data\\date=07-19-2020\\covid19_07-19.snappy.parquet" # Engine.build_index_from_parquet( corpus_path) # Engine._indexer.save_index("inverted_idx") # print("finish: ", time.asctime(time.localtime(time.time()))) Engine.load_index("inverted_idx") Engine.load_precomputed_model() queries = read_queries("full_queries2.txt") df = pd.read_parquet(corpus_path, engine="pyarrow") documents_list = df.values.tolist() i = 0 for query in queries: n_relevant, ranked_doc_ids = Engine.search(query) for doc_tuple in ranked_doc_ids: for doc in documents_list: if doc[0] == doc_tuple[0]: i += 1 print('tweet id: {}, similarity: {}'.format( doc_tuple[0], doc_tuple[1])) print(doc[0], ":", doc[2])
def run_engine(corpus_path='', output_path='', stemming=False):
    """
    :return:
    """
    # Create the PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)
    # Collect all parquet files under the corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))
    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents on a worker pool; close/join before __exit__ so the
        # pool finishes gracefully instead of being terminated
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()
        p.entities.clear()
    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
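The imap_unordered call above works because tweet order does not matter for indexing: an unordered iterator hands each parsed document back as soon as any worker finishes, instead of blocking on the slowest one. A standalone sketch of the same pattern, with a stand-in parse_doc (anything given to a pool must be picklable, so it lives at module level):

from multiprocessing import Pool

def parse_doc(document):
    # Stand-in for the real parser: lower-case and tokenize
    return document.lower().split()

if __name__ == '__main__':
    documents = ['First tweet', 'Second tweet', 'Third tweet']
    with Pool(4) as pool:
        for parsed in pool.imap_unordered(parse_doc, documents):
            print(parsed)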
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    config = ConfigClass(corpus_path)
    word2vec = Word2vec()
    num_of_writes = run_engine(corpus_path, output_path, stemming, queries,
                               num_docs_to_retrieve, word2vec)
    union_posting_files(num_of_writes, stemming, config, output_path)
    # print("finish union posting files: ", time.asctime(time.localtime(time.time())))
    if not isinstance(queries, list):
        queries = read_queries(queries)
    inverted_index = utils.load_inverted_index()
    # temp1 = dict(sorted(inverted_index.items(), key=lambda item: item[1].isdigit(), reverse=False))
    # temp2 = dict(sorted(inverted_index.items(), reverse=True))
    rank_query = search_and_rank_query(corpus_path, queries, inverted_index, num_docs_to_retrieve,
                                       stemming, word2vec, output_path)
    path = os.path.join(output_path, 'results.csv')
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Query_num", "Tweet_id", "Rank"])
    for i in rank_query:
        for doc_tuple in rank_query[i]:
            print('tweet id: {}, similarity: {}'.format(doc_tuple[0], doc_tuple[1]))
            with open(path, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([i + 1, doc_tuple[0], doc_tuple[1]])
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve, word2vec):
    """
    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')
    # TODO - handle all ~50 files (can be done with multiprocessing.pool.ThreadPool)
    # Iterate over every document in each file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # retweet (RT), skipped by the parser
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                # Flush the in-memory buffer to disk every 500,000 documents
                write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
    # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    # Drop terms that appear only once in the corpus
    indexer.inverted_idx = {key: val for key, val in indexer.inverted_idx.items() if val != 1}
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))
    return num_of_writes
def __init__(self):
    # self.model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, encoding='utf-8')
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        ConfigClass().google_news_vectors_negative300_path, binary=True, encoding='utf-8')
    self.terms_dict = {}
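A short usage sketch for the wrapper above, relying only on the public gensim KeyedVectors API; the vocabulary words are illustrative and assumed to exist in the GoogleNews vectors.

w2v = Word2vec()
vector = w2v.model['virus']                           # 300-dimensional vector for a single term
neighbours = w2v.model.most_similar('mask', topn=5)   # nearest terms by cosine similarity
score = w2v.model.similarity('virus', 'vaccine')      # pairwise cosine similarity
print(neighbours, score)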
def __init__(self, config=None):
    if not config:
        self._config = ConfigClass()
    else:
        self._config = config
    self._parser = Parse()
    self._indexer = Indexer(self._config)
    self._model = None
def run_engine(corpus_path_, output_path_, stemming_):
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        # Each entry is a (directory, file name) pair
        documents_list = r.get_documents(pathes[i][0], pathes[i][1])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                # Flush a full batch of parsed documents into posting files
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list = list()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                # Last document of the last file: flush whatever is left
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list = list()
    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()
    number_of_documents = 0
def __init__(self):
    self.config = ConfigClass()
    self.local_address = r"C:\Users\benro\OneDrive\Desktop\glove.twitter.27B.25d.txt"
    self.server_address = self.config.glove_twitter_27B_25d_path
    self.input_file = self.local_address
    self.output_file = 'glove.twitter.27B.25d.txt.word2vec'
    # Convert the GloVe text format to word2vec format, then load the vectors
    glove2word2vec(self.input_file, self.output_file)
    self.model = KeyedVectors.load_word2vec_format(self.output_file, binary=False)
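The glove2word2vec conversion above only needs to run once per vector file, so a variant of that step might skip it when the converted file already exists; a sketch under that assumption, using the same gensim APIs:

import os
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

def load_glove_as_word2vec(glove_path, converted_path='glove.twitter.27B.25d.txt.word2vec'):
    # Convert only if the word2vec-format copy is not already on disk
    if not os.path.exists(converted_path):
        glove2word2vec(glove_path, converted_path)
    return KeyedVectors.load_word2vec_format(converted_path, binary=False)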
def __init__(self, parser, indexer, model=None):
    # self._model = model
    self.parser = parser
    self.ranker = Ranker(indexer.tweet_info)
    self.inverted_index = indexer.inverted_idx
    self.firstUnion = True
    self.posting_dir = ConfigClass.get_output()
    self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
    self.scoreLowerBoundFactor = 0.5
def main(corpus_path, output_path, stemming, queries, num_doc_to_retrieve):
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.savedFileMainFolder = output_path
    config.toStem = stemming
    run_engine(config)
    inverted_index = load_index()
    queries_file = open(queries, encoding="utf8")
    tuple_answers = []
    query_num = 1
    for query in queries_file:
        for doc_tuple in search_and_rank_query(query[:-1], inverted_index, num_doc_to_retrieve, config):
            print('tweet id: {} Score: {}'.format(doc_tuple[0], doc_tuple[1]))
            doc_tuple = doc_tuple + (query_num,)
            tuple_answers.append(doc_tuple)
        query_num += 1
    queries_file.close()
def main(corpus_path='', output_path='', stemming=False, queries=None, num_docs_to_retrieve=10):
    ConfigClass.set_path(output_path)
    run_engine(corpus_path, output_path, stemming)
    docs = load_index(output_path)
    if not isinstance(queries, list):
        # queries points to a text file with one query per line
        with open(queries, 'r', encoding="utf8") as file1:
            queries = file1.readlines()
    for idx, query in enumerate(queries):
        query = query.replace('\n', '')
        for doc_tuple in search_and_rank_query(query, docs, num_docs_to_retrieve, stemming, output_path):
            print('Tweet id: ' + str(doc_tuple[0]) + ' Score: ' + str(doc_tuple[1]))
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming
    corpus_list = r.read_corpus()
    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []
    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)
    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()
    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))
    utils.save_obj(indexer.inverted_idx, "inverted_index")
    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()
def main(corpus_path='', output_path='.', stemming=False, queries='', num_docs_to_retrive=10):
    """
    The main function for the search engine. It manages parsing the data, indexing it,
    and running queries on the data.
    :param corpus_path: string that points to the corpus path, where the input files lie
    :param output_path: string that points to where the query results should be written
    :param stemming: boolean that decides whether the engine applies stemming
    :param queries: list of queries, or a string that points to a file with queries
    :param num_docs_to_retrive: maximum number of tweets to retrieve per query
    :return: -
    """
    start = dt.datetime.now()
    # Entry point to the parsing and indexing phase
    run_engine(corpus_path, output_path, stemming)
    end = dt.datetime.now()
    total_parse_and_ind_time = (end - start).total_seconds() / 60.0
    # print("Total parsing and building index and posting time was: {}".format(total_parse_and_ind_time))
    start = dt.datetime.now()
    k = num_docs_to_retrive
    config = ConfigClass(corpus_path, stemming, output_path)
    inverted_index = load_index(config)
    # Handle both cases of queries input: list and file name
    if type(queries) is list:
        queries_list = queries
    else:
        queries_list = parse_queries_from_file(queries)
    output_set = []
    for i in range(len(queries_list)):
        query = queries_list[i]
        # print(query)
        # querying phase
        doc_tuples = search_and_rank_query(query, inverted_index, k, config)
        for j in range(len(doc_tuples)):
            doc_tuple = doc_tuples[j]
            output_set.append((i + 1, doc_tuple[0], doc_tuple[1]))
            print('query number:{} tweet id: {}, score (TF-IDF cosine similarity): {}'
                  .format(i + 1, doc_tuple[0], doc_tuple[1]))
    results_set = pd.DataFrame(output_set, columns=['query_num', 'tweet_id', 'tf_score'])
    # Write results to output
    outfile = output_path + '/results.csv'
    results_set.to_csv(outfile)
    end = dt.datetime.now()
    total_query_time = (end - start).total_seconds()
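A hedged example of invoking the main function above; the paths and query strings are placeholders, not values from the original project.

if __name__ == '__main__':
    main(corpus_path='Data',
         output_path='.',
         stemming=False,
         queries=['covid vaccine side effects', 'herd immunity'],
         num_docs_to_retrive=10)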
def __init__(self, config=None):
    if config is None:
        config = ConfigClass()
    self._config = config
    if config.toStem:
        self._parser = Parse_stem()
    else:
        self._parser = Parse()
    self._indexer = Indexer(config)
    self._model = None
def __init__(self, config=None, run_config=None):
    if not config:
        config = ConfigClass()
    if not run_config:
        run_config = RunConfigClass()
    self._run_config = run_config
    self._config = config
    self._parser = Parse(run_config)
    self._indexer = Indexer(run_config)
    self._model = None
    self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)
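A minimal end-to-end sketch for the engine above, assuming the build_index_from_parquet/search interface that the other engines in this collection expose and the (n_relevant, ranked ids) return shape seen in their callers; the file name and query are illustrative.

engine = SearchEngine()
engine.build_index_from_parquet('benchmark_data_train.snappy.parquet')
n_relevant, ranked_tweet_ids = engine.search('herd immunity')
print('About {} relevant tweets'.format(n_relevant))
for tweet_id in list(ranked_tweet_ids)[:10]:
    print(tweet_id)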
def run_engine():
    """
    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    # p = Parse(with_stemmer=True)
    indexer = Indexer(config)
    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
                 if fname.endswith('.parquet')]
        for file in files:
            tweets = r.read_file(file_name=file)
            start_time = time.perf_counter()
            # Parse on a pool of worker processes; the context manager releases the workers
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, '
                  f'average per tweet: {avg_time_per_tweet:0.8f} seconds')
            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds')
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    if queries is not None:
        config = ConfigClass(corpus_path, output_path, stemming)
        run_engine(config)
        query_list = utils.load_queries_list(queries)
        inverted_index = load_index(output_path)
        for idx in range(1, len(query_list) + 1):
            print("query {}:".format(idx))
            for doc_tuple in search_and_rank_query(query_list[idx - 1], inverted_index,
                                                   k=num_docs_to_retrieve, config=config):
                print('\ttweet id: {} | score : {} '.format(doc_tuple[0], doc_tuple[1]))
def main(corpus_path=None, output_path='', stemming=False, queries=None, num_docs_to_retrieve=1):
    if queries is not None:
        config = ConfigClass(corpus_path, output_path, stemming)
        run_engine(config)
        query_list = handle_queries(queries)
        inverted_index, document_dict, num_of_docs, avg_length_per_doc = load_index(output_path)
        # tweet_url = 'http://twitter.com/anyuser/status/'
        # num_of_docs = 10000000
        # avg_length_per_doc = 21.5
        for idx, query in enumerate(query_list):
            docs_list = search_and_rank_query(query, inverted_index, document_dict,
                                              num_docs_to_retrieve, num_of_docs,
                                              avg_length_per_doc, config)
            for doc_tuple in docs_list:
                print('tweet id: {}, score: {}'.format(str(doc_tuple[1]), doc_tuple[0]))
def run_engine():
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
def main():
    config = ConfigClass()
    corpus_path = config.get__corpusPath()
    Search_Engine = SearchEngine(config)
    Search_Engine.build_index_from_parquet(corpus_path)
    # Search_Engine.load_index('idx_bench.pkl')
    print(datetime.now())
    final_tweets = Search_Engine.search('Herd immunity has been reached.')
    print(datetime.now())
    print("num of relevant:", final_tweets[0])
    # Print the top five tweet ids; num is the 1-based rank of each result, not a score
    num = 1
    for tweet_id in final_tweets[1].keys():
        if num <= 5:
            print("Tweet id: " + "{" + tweet_id + "}" + " Rank: " + "{" + str(num) + "}")
        num += 1