Exemplo n.º 1
0
def main():
    """Build the benchmark index, log corpus statistics and run the analysis.

    Indexes ``data/benchmark_data_train.snappy.parquet`` with stemming
    disabled, asks the engine to save the index under
    ``<cwd>/results analysis/WithoutStem/idx_bench``, appends the number of
    documents and the average document length to ``log.txt`` in the output
    directory, and finally calls ``report_analysis``.
    """
    bench_data_path = os.path.join('data',
                                   'benchmark_data_train.snappy.parquet')
    # Paths are assembled with os.path.join so the script is not tied to the
    # Windows path separator.
    output_path = os.path.join(os.getcwd(), "results analysis")

    stemming = False
    stem_dir = "WithStem" if stemming else "WithoutStem"
    stem_label = "with stemming" if stemming else "without stemming"
    index_dir = os.path.join(output_path, stem_dir)
    # Opening log.txt in append mode fails when the output directory is
    # missing, so create the whole tree up front.
    os.makedirs(index_dir, exist_ok=True)

    engine = search_engine_best.SearchEngine(stemming=stemming)
    engine.build_index_from_parquet(bench_data_path,
                                    toSave=True,
                                    save_path=os.path.join(
                                        index_dir, "idx_bench"))

    with open(os.path.join(output_path, "log.txt"), "a",
              encoding="utf-8") as log:
        log.write("Corpus size {ifStem}: {num}\n".format(
            num=engine._indexer.number_of_documents,
            ifStem=stem_label))

        log.write("Avrgdl size {ifStem}: {num}\n".format(
            num=engine._indexer.average_document_length,
            ifStem=stem_label))

    report_analysis(output_path, stemming, engine)
Exemplo n.º 2
0
def main():
    """Run the interactive search GUI until its event loop ends.

    Loads the 'idx_bench' index, then for every '_SEARCH_' event runs the
    query taken from the 'TERM' field, prints at most the first 25 tweet ids
    and a summary line with the result count and the elapsed seconds.
    """
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    # NOTE(review): `r` is never used below; kept in case the ReadFile
    # constructor has side effects -- confirm and drop if it does not.
    r = ReadFile(corpus_path=config.get__corpusPath())
    # The index is loaded from 'idx_bench' rather than rebuilt from the corpus.
    se.load_index('idx_bench')
    g = GUI()

    max_shown = 25

    while True:
        event, values = g.window.read()

        # A None event ends the loop (presumably the window was closed).
        if event is None:
            break

        if event != '_SEARCH_':
            continue

        g.clear()
        query = values['TERM']
        start = datetime.now()
        relevant, tweets_id = se.search(query)
        end = datetime.now()
        total_time = (end - start).total_seconds()

        # Print only the top results; stop early instead of walking the
        # whole result collection.
        for rank, tweet_id in enumerate(tweets_id, start=1):
            if rank > max_shown:
                break
            print("%s. tweet id: %s" % (rank, tweet_id))

        print()
        print("About %s tweets (%s seconds)" % (relevant, total_time))
Exemplo n.º 3
0
        'entity': [True, False],
        'less_more': [True, False],
        'hashtag': [True, False],
        'url': [True, False],
        'tag': [True, False],
        'capitals': [True, False],
        'cos_sym': [True, False],
        'min_length': [i for i in range(1, 4, 2)],
        'min_relevant': [i for i in range(1, 6, 2)],
        'the_count': [i for i in range(2, 5, 2)],
        'wordnet_count': [i for i in range(2, 5, 2)],
        'min_occurrence': [i for i in range(1, 6, 2)],
        'ext_val': [i / 10 for i in range(1, 10, 2)]
    }
    # Expand the option grid into the individual configurations to evaluate.
    # Each entry is used below as a dict of keyword arguments.
    var_options_list = generate_var_options(v_options)
    # One progress-bar unit per configuration.
    progressbar = tqdm(total=len(var_options_list))
    for opt in var_options_list:
        config = run_configs.RunConfigClass(**opt)
        opt_engine = search_engine_best.SearchEngine(config)
        # Time a single index build and store the seconds next to the options.
        # NOTE(review): the statement string is evaluated against globals(),
        # so `opt_engine` and `bench_data_path` must be module-level names.
        # If this code runs inside a function, `opt_engine` is a local and
        # the timed statement would raise NameError -- confirm, or pass a
        # callable to timeit instead of a string.
        opt['build_idx_time'] = timeit.timeit(
            "opt_engine.build_index_from_parquet(bench_data_path)",
            globals=globals(),
            number=1)
        # Slow builds (over 60 seconds) are only reported; the configuration
        # is still tested below.
        if opt['build_idx_time'] > 60:
            print(f'Build time exceeded: {opt}')
        # Evaluate every methods list on a copy, so the per-run 'methods' key
        # never leaks back into `opt`.
        for methods_list in methods_opt:
            o = opt.copy()
            o['methods'] = methods_list
            test(opt_engine, o)
            # Fractional step: the inner loop adds up to one unit per
            # configuration.
            progressbar.update(1 / len(methods_opt))
Exemplo n.º 4
0
import run_configs
import search_engine_best

if __name__ == '__main__':
    import os

    # Smoke test: index the benchmark corpus with the default run
    # configuration and print the engine's answer to one sample query.
    engine = search_engine_best.SearchEngine(run_configs.RunConfigClass())
    # os.path.join keeps the path valid on non-Windows systems too.
    path = os.path.join('data', 'benchmark_data_train.snappy.parquet')
    engine.build_index_from_parquet(path)
    print(engine.search('Herd immunity has been reached'))
Exemplo n.º 5
0
import os
import time

from flask import Flask, request
from flask_cors import CORS

import search_engine_best

# Flask needs the importing module's name; `__name__` (double underscores)
# is the built-in module attribute, a bare `_name_` is undefined.
app = Flask(__name__)
CORS(app)

# Build the index once at import time so every request reuses the same engine.
print("start building inverted index")
se = search_engine_best.SearchEngine()
bench_data_path = os.path.join('data', 'benchmark_data_train.snappy.parquet')
se.build_index_from_parquet(bench_data_path)
print("finished")


@app.route('/', methods=['POST'])
def index():
    """Handle a search request posted as JSON.

    Expects a body of the form ``{"query": ...}`` and returns
    ``{"res": [...]}`` holding at most the first 100 results from the engine.
    Responds with HTTP 400 when the body is not a JSON object or has no
    'query' field.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so malformed requests get a clear 400 rather than an unhandled error.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'query' not in payload:
        return {'error': "expected a JSON body with a 'query' field"}, 400

    query = payload['query']
    print(query)

    # The engine also returns a result count, which is not needed here.
    _, results = se.search(query)

    print(results[:5])
    return {'res': results[:100]}