def main():
    """Build the benchmark index (with or without stemming), append corpus
    statistics to a log file, and run the results-analysis report.

    Relies on module-level imports: ``os``, ``pd`` (pandas),
    ``search_engine_best`` and ``report_analysis``.
    """
    bench_data_path = os.path.join('data', 'benchmark_data_train.snappy.parquet')
    bench_lbls_path = os.path.join('data', 'benchmark_lbls_train.csv')
    queries_path = os.path.join('data', 'queries_train.tsv')
    # FIX: use os.path.join instead of hard-coded Windows "\\" concatenation
    # so the script also runs on POSIX systems (same path on Windows).
    output_path = os.path.join(os.getcwd(), "results analysis")

    bench_lbls = pd.read_csv(bench_lbls_path,
                             dtype={'query': int, 'tweet': str, 'y_true': int})
    # Number of relevant tweets per query id (kept for parity with the
    # original; not referenced again in this function).
    q2n_relevant = bench_lbls.groupby('query')['y_true'].sum().to_dict()
    queries = pd.read_csv(queries_path, sep='\t')

    stemming = False
    engine = search_engine_best.SearchEngine(stemming=stemming)
    # FIX: subdirectory name joined via os.path.join rather than a trailing
    # literal backslash ("WithStem\\"), again for portability.
    stemmed = "WithStem" if stemming else "WithoutStem"
    engine.build_index_from_parquet(
        bench_data_path, toSave=True,
        save_path=os.path.join(output_path, stemmed, "idx_bench"))

    # Append corpus statistics; "a" keeps earlier runs' lines in the log.
    with open(os.path.join(output_path, "log.txt"), "a") as log:
        log.write("Corpus size {ifStem}: {num}\n".format(
            num=engine._indexer.number_of_documents,
            ifStem="with stemming" if stemming else "without stemming"))
        log.write("Avrgdl size {ifStem}: {num}\n".format(
            num=engine._indexer.average_document_length,
            ifStem="with stemming" if stemming else "without stemming"))

    report_analysis(output_path, stemming, engine)
def main():
    """The main loop for the program.

    Builds the search engine from the project config, loads the prebuilt
    'idx_bench' index, then serves queries typed into the GUI until the
    window is closed.
    """
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    se.load_index('idx_bench')
    g = GUI()

    while True:
        event, values = g.window.read()
        if event is None:  # window closed
            break
        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # Print at most the first 25 results. FIX: break out of the loop
            # once the cap is reached instead of iterating the whole result
            # list with a no-op body (the original kept counting past 25).
            for index, tweet_id in enumerate(tweets_id):
                if index >= 25:
                    break
                print("%s. tweet id: %s" % (index + 1, tweet_id))
            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
# NOTE(review): this chunk begins mid-dict — the opening `v_options = {` and the
# definitions of `generate_var_options`, `methods_opt`, `test`, `run_configs`,
# `search_engine_best`, `timeit`, `tqdm` and `bench_data_path` are outside this
# view, so the code below is left byte-identical. It appears to enumerate
# configuration option combinations, build one engine per combination, time
# index construction, and run `test()` for every retrieval-method list.
# CAUTION(review): the timed statement is resolved via `globals=globals()`,
# which only works if `opt_engine` and `bench_data_path` are module-level
# names — verify this chunk is not inside a function. The progress bar is
# advanced by 1/len(methods_opt) per inner iteration, so one full unit
# corresponds to one option set (presumably intentional — confirm).
'entity': [True, False], 'less_more': [True, False], 'hashtag': [True, False], 'url': [True, False], 'tag': [True, False], 'capitals': [True, False], 'cos_sym': [True, False], 'min_length': [i for i in range(1, 4, 2)], 'min_relevant': [i for i in range(1, 6, 2)], 'the_count': [i for i in range(2, 5, 2)], 'wordnet_count': [i for i in range(2, 5, 2)], 'min_occurrence': [i for i in range(1, 6, 2)], 'ext_val': [i / 10 for i in range(1, 10, 2)] } var_options_list = generate_var_options(v_options) progressbar = tqdm(total=len(var_options_list)) for opt in var_options_list: config = run_configs.RunConfigClass(**opt) opt_engine = search_engine_best.SearchEngine(config) opt['build_idx_time'] = timeit.timeit( "opt_engine.build_index_from_parquet(bench_data_path)", globals=globals(), number=1) if opt['build_idx_time'] > 60: print(f'Build time exceeded: {opt}') for methods_list in methods_opt: o = opt.copy() o['methods'] = methods_list test(opt_engine, o) progressbar.update(1 / len(methods_opt))
import os

import run_configs
import search_engine_best

if __name__ == '__main__':
    # Smoke test: build an index from the training parquet and run one query.
    engine = search_engine_best.SearchEngine(run_configs.RunConfigClass())
    # FIX: os.path.join keeps the path portable; the original used a raw
    # Windows backslash path (r'data\benchmark_data_train.snappy.parquet'),
    # which is a literal filename with a backslash on POSIX systems.
    path = os.path.join('data', 'benchmark_data_train.snappy.parquet')
    engine.build_index_from_parquet(path)
    print(engine.search('Herd immunity has been reached'))
import os
import time

from flask import Flask, request
from flask_cors import CORS

import search_engine_best

# BUG FIX: the original called Flask(_name_) — `_name_` is an undefined name
# and raises NameError at import time. Flask's constructor expects the
# importing module's __name__.
app = Flask(__name__)
CORS(app)  # allow cross-origin requests from the front-end

# Build the inverted index once at startup, before serving requests.
print("start building inverted index")
se = search_engine_best.SearchEngine()
bench_data_path = os.path.join('data', 'benchmark_data_train.snappy.parquet')
se.build_index_from_parquet(bench_data_path)
print("finished")


@app.route('/', methods=['POST'])
def index():
    """Search endpoint.

    Expects a JSON body with a 'query' key; returns the top 100 tweet ids
    as {'res': [...]}.
    """
    query = request.json['query']
    print(query)
    numOfResults, results = se.search(query)
    print(results[:5])  # debug: peek at the first few hits
    return {'res': results[:100]}