Example #1
import multiprocessing
import os
import util  # project-local helpers (tokenization, stopword list, model (de)serialization)


def read_collection(coll_main_folder,
                    output_model_path,
                    stemming,
                    stoplist=None):
    """Tokenize every document in coll_main_folder and cache the result on disk."""
    if not os.path.isfile(output_model_path):
        if stoplist is None:
            stoplist = util.load_indri_stopwords()
        text_by_name = {}
        print('reading files in folder')
        fnames_list = os.listdir(coll_main_folder)
        doc_paths_list = [
            os.path.join(coll_main_folder, filename)
            for filename in fnames_list
        ]
        # read each document once, closing file handles properly
        docs_text = []
        for fp in doc_paths_list:
            with open(fp, 'r') as f:
                docs_text.append(' '.join(f.readlines()))

        print('processing collection')
        # tokenize documents in parallel; the pool is released when the with-block exits
        with multiprocessing.Pool(8) as pool:
            tokenized_docs = pool.starmap(
                util.tokenize,
                [(text, stemming, stoplist) for text in docs_text])

        # map each file name (without extension) to its tokenized text
        for fname, tokens in zip(fnames_list, tokenized_docs):
            text_by_name[fname.split('.')[0]] = tokens

        print('saving model')
        util.save_model(text_by_name, output_model_path)
    else:
        print('loading model: %s' % output_model_path)
        text_by_name = util.load_model(output_model_path)
    return text_by_name
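
A minimal usage sketch, assuming a folder of plain-text documents and that the project's util module is importable; the corpus folder is hypothetical, while the cache path follows Example #4:

# hypothetical corpus folder; the first call tokenizes and caches, later calls reload the cache
docs_folder = 'data/robust/corpus'
cache_path = 'data/robust/stemmed_coll_model'
text_by_name = read_collection(docs_folder, cache_path, stemming=True)
print('%d documents tokenized' % len(text_by_name))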
Example #2
import os
from tqdm import tqdm
import util  # project-local helpers (tokenization, stopword list, model (de)serialization)


def compute_inverted_index(coll_folder, stemming, output_file_path_ii):
    """Build an inverted index mapping each term to a list of (doc_id, term frequency) pairs."""
    if not os.path.isfile(output_file_path_ii):
        print('computing inverted index')
        inverted_idx = {}
        sw = util.load_indri_stopwords()
        doc_n = 0
        for filename in tqdm(os.listdir(coll_folder)):
            fp = os.path.join(coll_folder, filename)
            doc_id = filename.split('.')[0]
            if os.path.isfile(fp):
                doc_n += 1
                with open(fp, 'r') as f:
                    doc_text = ' '.join(f.readlines())
                d = util.tokenize(doc_text, stemming, stoplist=sw)
                # record the term frequency of every distinct word in the document
                for w in set(d):
                    tf = d.count(w)
                    if w in inverted_idx:
                        inverted_idx[w].append((doc_id, tf))
                    else:
                        inverted_idx[w] = [(doc_id, tf)]

        util.save_model(inverted_idx, output_file_path_ii)
    else:
        inverted_idx = util.load_model(output_file_path_ii)
    return inverted_idx
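
A small usage sketch under the same assumptions; the paths are hypothetical and the index is rebuilt only when the cache file is missing:

# hypothetical paths; each posting is a (doc_id, term frequency) pair
ii = compute_inverted_index('data/robust/corpus', True, 'data/robust/ii_robust')
postings = ii.get('retriev', [])  # look up a (stemmed) term
print('document frequency:', len(postings))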
Example #3
import os
from tqdm import tqdm
import util  # project-local helpers (tokenization, stopword list, model (de)serialization)


def encode_queries(queries_main_folder, wi, stemming):
    """Encode each query file as a list of word indices using the word-to-index map wi."""
    sw = util.load_indri_stopwords()
    encoded_qbn = {}
    for filename in tqdm(os.listdir(queries_main_folder)):
        fp = os.path.join(queries_main_folder, filename)
        if os.path.isfile(fp):
            with open(fp, 'r') as f:
                query_text = ' '.join(f.readlines())
            tokenized_query = util.tokenize(query_text, stemming=stemming, stoplist=sw)
            qn = filename.split('.')[0]
            # keep only terms that appear in the word-to-index vocabulary
            encoded_qbn[qn] = [wi[w] for w in tokenized_query if w in wi]
    return encoded_qbn
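
A brief usage sketch; the word-to-index map below is a hypothetical stand-in for the wi produced by compute_input_data in Example #4, and the topics folder is likewise hypothetical:

wi = {'exampl': 0, 'queri': 1}  # hypothetical (stemmed) vocabulary
encoded_qbn = encode_queries('data/robust/topics', wi, stemming=True)
# encoded_qbn maps each topic file name to a list of integer word ids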
Example #4
import os
import util  # project-local helpers (model (de)serialization, stopwords, tokenization)
# du (which provides compute_idf) and compute_input_data are further project-local helpers,
# assumed to be defined or imported in the same module as the functions from Examples #1-#3.


def compute_data():
    """Wire the previous steps together; each intermediate model is cached on disk and
    recomputed only if its cache file is missing."""
    ftext_model_path = '../data/fasttext_models/wiki.en.bin'
    output_path_wi_model = '../data/fasttext_models/wi_robust'
    output_path_ii_model = '../data/fasttext_models/ii_robust'
    output_path_idf_model = '../data/fasttext_models/idf_robust'
    output_path_encoded_d_model = '../data/fasttext_models/encoded_dbn'
    output_path_encoded_q_model = '../data/fasttext_models/encoded_qbn'
    output_path_we_matrix_model = '../data/fasttext_models/word_embeddings_matrix_robust'
    coll_path = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/corpus'
    queries_main_folder = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/topics'
    output_model_path = 'data/robust/stemmed_coll_model'
    encoded_out_folder_docs = 'data/robust/stemmed_encoded_docs_ft'

    stemming = True

    if not os.path.isfile(output_path_ii_model):
        print('computing inverted index')
        ii = compute_inverted_index(coll_path, stemming, output_path_ii_model)
        # note: compute_inverted_index already saves the index to output_path_ii_model
        util.save_model(ii, output_path_ii_model)
    else:
        print('loading inverted index')
        ii = util.load_model(output_path_ii_model)

    if not os.path.isfile(output_path_encoded_d_model):
        text_dbn = read_collection(coll_path,
                                   output_model_path,
                                   stemming=stemming,
                                   stoplist=util.load_indri_stopwords())

        encoded_dbn, wi, we_matrix = compute_input_data(
            text_dbn, ftext_model_path, encoded_out_folder_docs)

        util.save_model(encoded_dbn, output_path_encoded_d_model)
        util.save_model(wi, output_path_wi_model)
        util.save_model(we_matrix, output_path_we_matrix_model)
    else:
        encoded_dbn = util.load_model(output_path_encoded_d_model)
        wi = util.load_model(output_path_wi_model)
        we_matrix = util.load_model(output_path_we_matrix_model)

    if not os.path.isfile(output_path_encoded_q_model):
        encoded_qbn = encode_queries(queries_main_folder, wi, stemming)
        util.save_model(encoded_qbn, output_path_encoded_q_model)
    else:
        encoded_qbn = util.load_model(output_path_encoded_q_model)

    idf_scores = du.compute_idf(coll_path, stemming, output_path_ii_model,
                                output_path_idf_model)

    return encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores
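
A final usage sketch; because every intermediate result is cached, rerunning only recomputes what is missing (the hard-coded paths above are machine-specific and would need to be adjusted):

encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores = compute_data()
print('vocabulary size:', len(wi), '- encoded queries:', len(encoded_qbn))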