Example #1
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)
    parquetPaths = []
    for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()):
        for fileName in fileNames:
            parquetPaths.append(dirPath + '\\' + fileName)
    for i in range(len(parquetPaths)):
        parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:]
        if ".DS_Store" in parquetPaths[i]:
            continue
        parquet = r.read_file(file_name=parquetPaths[i])
        for document in parquet:
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)
    # if there's more postings to flush, do it.
    if len(m_Indexer.postingDictionary) > 0:
        utils.save_obj(m_Indexer.postingDictionary,
                       m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))
    # Clear single terms and entities, then write the updated inverted index to disk.
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path,
                        m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents,
                   output_path + '/PostingFiles/num_of_docs_in_corpus')
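For reference, the parquet discovery above can be written portably with os.path.join and os.path.relpath instead of hard-coded backslashes; a minimal sketch using only the standard library (none of the project classes are involved):

import os

def collect_parquet_paths(corpus_path):
    """Return corpus-relative paths of all files under corpus_path, skipping .DS_Store."""
    paths = []
    for dir_path, _dir_names, file_names in os.walk(corpus_path):
        for file_name in file_names:
            if file_name == '.DS_Store':
                continue
            full_path = os.path.join(dir_path, file_name)
            # keep the path relative to the corpus root, like the slicing above
            paths.append(os.path.relpath(full_path, corpus_path))
    return paths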
Example #2
def run_engine(corpus_path='', output_path='', stemming=False):
    """

    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)
    # Get all parquet files from corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents in parallel using a pool of worker processes
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()

    p.entities.clear()
    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
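The parallel parsing above relies on the mapped callable (p.parse_doc) being picklable so worker processes can receive it. A self-contained sketch of the same Pool/imap_unordered idiom, with a stand-in parse_doc instead of the project's parser:

from multiprocessing import Pool, cpu_count

def parse_doc(document):
    # stand-in for Parse.parse_doc: lower-case and tokenize the text
    return document.lower().split()

if __name__ == '__main__':
    documents = ["First tweet", "Second tweet", "Third tweet"]
    with Pool(cpu_count()) as pool:
        # imap_unordered yields results as workers finish, in arbitrary order
        for parsed in pool.imap_unordered(parse_doc, documents):
            print(parsed)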
Example #3
def run_engine(corpus_path, output_path, stemming=False):
    """

    :param corpus_path: path for parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean to use stemming or not
    :return:
    """

    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)

    if corpus_path.endswith('parquet'):
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        documents_list = r.read_dir()

        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()

    documents_list.clear()
    indexer.merge_posting_files()

    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()
Example #4
def run_engine(config):
    """

    :param config:
    :return:
    """
    number_of_documents = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config)
    parquet_list = r.read_all_parquet()
    for doc_list in parquet_list:
        #for i in tqdm(range(0,len(doc_list))): # for every doc
        for i in range(0, len(doc_list)):  # for every doc
            # parse the document
            parsed_document = p.parse_doc(doc_list[i])
            if parsed_document is None:
                continue
            number_of_documents += 1

            # index the document data
            indexer.add_new_doc(parsed_document)

    #print('Finished parsing and indexing. Starting to export files')

    indexer.save_postings()  # saves the remaining posting file .
    PostingsMerge(indexer).chunks_merging()
    utils.save_dict_as_pickle(indexer.inverted_idx, "inverted_idx",
                              config.get_out_path())
Example #5
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    This function build the inverted index over the corpus.
    send each tweet to parsing and indexing.
    if the stemming is True the parsing will use the stemmer on the tokens.
    :param glove_dict: Glove file including all word vectors
    :param corpus_path: root folder containing the raw tweet files
    :param output_path for the inverted index, posting files and tweets dictionary
    :param stemming if True use stemmer on terms
    """

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path)
    r = ReadFile(corpus_path=config.get_corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config)
    all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet")
    all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths]
    start_time = time.time()
    file_counter = 0
    for file_name in all_files_names:
        file_start_time = time.time()
        # print("start file :", file_counter)
        documents_list = [document for document in r.read_file(file_name=file_name)]
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)
        # print("end file number ", file_counter, " in: ", time.time() - file_start_time)
        file_counter += 1
    total_time = time.time() - start_time
    indexer.finish_indexing()
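The glob pattern above assumes Windows path separators; locating the same *.snappy.parquet files portably can be sketched with pathlib alone (no project classes involved):

from pathlib import Path

def find_snappy_parquet_files(corpus_path):
    """Yield corpus-relative paths of every *.snappy.parquet file under corpus_path."""
    root = Path(corpus_path)
    for file_path in sorted(root.rglob("*.snappy.parquet")):
        yield file_path.relative_to(root)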
Example #6
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    total_start_time = time.time()
    # Iterate over every file in the corpus
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - total_start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - total_start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Example #7
    def test_add_new_doc(self):
        config = ConfigClass()
        r = ReadFile(corpus_path=config.get__corpusPath())
        p = Parse()
        indexer = Indexer(config)
        documents_list = r.read_file(file_name='sample3.parquet')
        # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
        # parse the first document and make sure it can be indexed
        parsed_document = p.parse_doc(documents_list[0])
        assert indexer.add_new_doc(parsed_document)

        text = 'i was born in 2019'
Example #8
def test_reader():
    global num_test_failed, results_summary
    num_test_failed = 0
    r = ReadFile(corpus_path)
    correct_answers = [x['len'] for x in reader_inputs]
    student_answers = [
        len(r.read_file(x['file'])) for x in reader_inputs
    ]
    test_part(correct_answers, student_answers, error_str="read")
    if num_test_failed == 0:
        results_summary.append('All Reader tests passed')
Example #9
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()

    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])
Example #10
    def __init__(self, config=None):
        """
        Init engine with the relevant model - Thesaurus_Searcher.
        :param config:
        """
        self._config = config
        try:
            self._reader = ReadFile(corpus_path=config.get__corpusPath())
        except Exception:
            self._reader = ReadFile("")
        self._parser = Parse()
        self._parser.STEMMER = config.toStem
        self._indexer = Indexer(config)
        self._model = Thesaurus_Searcher(self._indexer)
        self.last_parquet = False
Example #11
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)
Example #12
def main():
    ''' The main loop for the program '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path =r.get_all_path_of_parquet()[0][0]+r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()

    # s.load_existing_index()  # load if exists, otherwise return empty list

    while True:
        event, values = g.window.read()

        if event is None:
            break

        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to output element
            index = 0
            for tweet_id in tweets_id:
                if index < 25:
                    print("%s. tweet id: %s" % (index + 1, tweet_id))
                index += 1

            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
Example #13
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """

    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')  # TODO - handel all files ~50 (can do with from multiprocessing.pool import ThreadPool)

    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1

            indexer.add_new_doc(parsed_document,
                                num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming,
                                       config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
        # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config,
                           output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {
        key: val
        for key, val in indexer.inverted_idx.items() if val != 1
    }
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))

    return num_of_writes
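The 500,000-document threshold above is a flush-every-N buffering pattern; a generic sketch of that pattern with hypothetical add_doc and flush callbacks (indexer.add_new_doc and write_and_clean_buffer play those roles in the snippet):

def index_in_batches(documents, add_doc, flush, batch_size=500_000):
    """Feed documents to add_doc, calling flush() after every batch_size documents."""
    pending = 0
    for doc in documents:
        add_doc(doc)
        pending += 1
        if pending >= batch_size:
            flush()      # write the current posting buffer to disk
            pending = 0
    if pending:
        flush()          # write whatever is left after the last full batch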
Example #14
def run_engine(config, indexer):
    """
    :return:
    """
    number_of_documents = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)

    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')
Example #15
def run_engine(corpus_path_, output_path_, stemming_):
    """

    :return:
    """

    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    paths = r.get_all_path_of_parquet()
    length_of_array = len(paths)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        documents_list = r.get_documents(paths[i][0], paths[i][0])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
                indexer.merge_posting_file()
                indexer.merge_two_last_posting_file()
                indexer.split_posting_file_and_create_inverted_index()
                indexer.write_inverted_index_to_txt_file()
                number_of_documents = 0
Example #16
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())
Example #17
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()

    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx],
                                     read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()
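Several objects are pickled back-to-back into the same "docs_dict_and_extras" file above; they are read back in the same order with repeated pickle.load calls. A minimal sketch of that round trip (the values are placeholders):

import pickle

# write several objects sequentially into one file
with open("docs_dict_and_extras", "wb") as f:
    pickle.dump({"doc1": [1, 2, 3]}, f)   # docs dictionary
    pickle.dump(42, f)                    # number of documents
    pickle.dump("postings/dump", f)       # dump path

# read them back in the order they were written
with open("docs_dict_and_extras", "rb") as f:
    docs_dict = pickle.load(f)
    number_of_documents = pickle.load(f)
    dump_path = pickle.load(f)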
Example #18
def run_engine(config):
    """

    :return:
    """

    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    details = {'number_of_documents': number_of_documents,
               'avg_length_per_doc': sum_of_doc_lengths / number_of_documents}

    utils.save_dict(details, 'details', config.get_out_path())
Example #19
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        config = self._config
        indexer = self._indexer
        number_of_documents = 0

        if config.getoneFile():
            df = pd.read_parquet(fn, engine="pyarrow")
            documents_list = df.values.tolist()
            # Iterate over every document in the file
            for idx, document in enumerate(documents_list):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                number_of_documents += 1
                # index the document data
                self._indexer.add_new_doc(parsed_document)
            self._indexer.calculationSummerize()
        else:
            r = ReadFile(corpus_path=config.get__corpusPath())
            for root, dirs, files in os.walk(config.get__corpusPath(),
                                             topdown=True):
                for name in files:
                    ext = name.split('.')[-1]
                    if ext == 'parquet':
                        documents_list = r.read_folder(root, file_name=name)
                        # Iterate over every document in the file
                        for idx, document in enumerate(documents_list):
                            # parse the document
                            parsed_document = self._parser.parse_doc(document)
                            number_of_documents += 1
                            # index the document data
                            indexer.add_new_doc(parsed_document)
                        # indexer.update_posting_files()
                        # indexer.reset_cach()

        self._indexer.save_index('inverted_idx')
        print('Finished parsing and indexing.')
Example #20
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        df = r.read_file(fn)
        documents_list = df
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
Example #21
def run_engine(corpus_path=None,
               output_path=None,
               stemming=False,
               lemma=False,
               queries=None,
               num_docs_to_retrieve=None):
    """
    :return:
    """
    global config, number_of_documents

    number_of_documents = 0

    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    # Finished parsing and indexing all files - need to clean all the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)
Example #22
    def run_engine(self):
        """
        :return:
        """
        r = ReadFile(corpus_path=self._config.get__corpusPath())
        number_of_files = 0

        for i, file in enumerate(r.read_corpus()):
            # Iterate over every document in the file
            number_of_files += 1
            for idx, document in enumerate(file):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                self._indexer.add_new_doc(parsed_document)

        self._indexer.entities_and_small_big()
        self._indexer.calculate_idf(self._parser.number_of_documents)
        # avg_doc_len = self._parser.total_len_docs / self._parser.number_of_documents
        # self._indexer.save_index("inverted_idx")
        # TODO - check the name of inverted_idx
        self._indexer.save_index("idx_bench.pkl")
Example #23
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        number_of_documents = 0

        r = ReadFile(corpus_path=self._config.get__corpusPath())

        doc = r.read_file(fn)
        for document in doc:
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        capital_letters = self._parser.caps_dict
        self._indexer.change_inverted_by_caps(capital_letters)
        self._indexer.save_index('idx_bench')
        print('Finished parsing and indexing.')
Example #24
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()  #p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [
            os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
            if fname.endswith('.parquet')
        ]
        for file in files:
            tweets = r.read_file(file_name=file)
            start_time = time.perf_counter()
            # parse the tweets in a pool of worker processes; the with-block
            # ensures the pool is closed after each file
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(
                f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds'
            )

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(
                f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds'
            )
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
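The repeated perf_counter bookkeeping above can be wrapped in a small timing helper; a minimal sketch using contextlib (the label text and the sleep stand-in are illustrative, not part of the project):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print the elapsed wall-clock time of the enclosed block."""
    start = time.perf_counter()
    yield
    print(f'{label}: {time.perf_counter() - start:0.4f} seconds')

# usage:
with timed('Parsed 10 tweets'):
    time.sleep(0.01)  # stand-in for the parsing and indexing work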
Example #25
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing
    :param corpus_path:
    :param output_path:
    :param stemming: boolean that says if stemming should be applied
    :return: total number of tweets parsed
    """

    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)

    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
Example #26
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        rd = ReadFile(fn)
        documents_list = rd.read_file()

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.thresh_hold = 100000
        self._indexer.thresh_hold_handler()
        self._indexer.save_index("inverted_idx")
Example #27
def run_engine():
    """

    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Example #28
def run_engine(config):
    """
    :return:
    """
    parser = Parse(config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)
    number_of_files = 0

    for i, file in enumerate(r.read_corpus()):
        # Iterate over every document in the file
        number_of_files += 1
        for idx, document in enumerate(file):
            # parse the document
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document)
    indexer.check_last()
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)
    avg_doc_len = parser.total_len_docs / parser.number_of_documents
    utils.save_obj(avg_doc_len, config.get_savedFileMainFolder() + "\\data")

    utils.save_obj(indexer.inverted_idx, config.get_savedFileMainFolder() + "\\inverted_idx")
    utils.save_obj(indexer.docs_inverted, config.get_savedFileMainFolder() + "\\docs_inverted")
Example #29
def create_table(stemming, corpus):
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["mydatabase"]
    mycol = mydb["global"]
    mycol.drop()
    r = ReadFile(corpus)
    p = Parse(stemming)
    counter = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = [t.text.lower() for t in p.parse_doc(document) if '$' not in t.text]

            for word_1 in parsed_list:
                query = {'term': word_1}
                row = mycol.find_one(query)
                if not row:
                    mycol.insert_one({**query, 'terms': {}})
                    row = mycol.find_one(query)
                for word_2 in parsed_list:
                    if word_2 not in row['terms'].keys():
                        row['terms'][word_2] = 0
                    row['terms'][word_2] += 1
                try:
                    mycol.update_one(query, {"$set": {'terms': row['terms']}})
                except Exception:
                    print(row['terms'])

            r.progressbar.update(step)
            counter += 1
    global_table = {}
    for word_1 in mycol.find():
        top = []
        for word_2 in word_1['terms'].keys():
            s = word_1['terms'][word_2] / (
                    word_1['terms'][word_1['term']] + mycol.find_one({'term': word_2})['terms'][word_1['term']] -
                    word_1['terms'][word_2])
            if len(top) < 10:
                top.append((word_2, s))
                top.sort(key=lambda score: score[1])
            elif s > top[0][1]:
                top[0] = (word_2, s)
                top.sort(key=lambda score: score[1])
        global_table[word_1['term']] = top
    utils.save_obj(global_table, f'global_table_{stemming}')
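The score computed above is a Jaccard-style co-occurrence similarity, s(w1, w2) = c(w1, w2) / (c(w1, w1) + c(w2, w2) - c(w1, w2)). A minimal in-memory sketch of the same table without MongoDB, counting each term once per document (the snippet above also counts repeated tokens):

from collections import defaultdict
from itertools import product

def build_global_table(parsed_docs, top_k=10):
    """Co-occurrence counts and the top_k most similar terms for every term."""
    counts = defaultdict(lambda: defaultdict(int))
    for terms in parsed_docs:
        for w1, w2 in product(set(terms), repeat=2):
            counts[w1][w2] += 1
    table = {}
    for w1, row in counts.items():
        scored = []
        for w2, c12 in row.items():
            score = c12 / (counts[w1][w1] + counts[w2][w2] - c12)
            scored.append((w2, score))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        table[w1] = scored[:top_k]
    return table

# usage: each inner list is one parsed, lower-cased document
print(build_global_table([["covid", "mask", "vaccine"], ["covid", "mask"]]))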
Example #30
def run_engine(corpus_path, stemming, output_path):
    """
    :return:
    """
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(os.path.join(output_path, PostingFile))
    indexer = Indexer()
    max_posting_size = 100000

    if os.path.exists(os.path.join(output_path, PostingFile)):
        os.remove(os.path.join(output_path, PostingFile))
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    # Iterate over every document in the file
    idx = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)

            # index the document data
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1

            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)

    r.progressbar.close()
    m.Save(p.word_dict)

    global_table = utils.load_obj(f'global_table_{stemming}')

    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)