Example #1
    def reset(self):
        # Clear the global dictionaries and reset all helper modules.
        global cache_dictionary
        global final_dictionary
        global documents_dictionary
        cache_dictionary = None
        final_dictionary = None
        documents_dictionary = None
        Indexer.reset()
        Writer.reset()
        Stemmer.reset()
        Reader.reset()
Example #2
def handle_files(file_list_ref, documents_dictionary_ref):
    terms_dictionary = {}

    if stem_mode:
        # Take each document's text from the list, then parse and stem it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_stemming = Stemmer.stemWithCache(Parser.start(value[1]))
            # Update the document's parameters
            __update_and_merge_dictionaries(doc_id, file_name,
                                            terms_dictionary,
                                            documents_dictionary_ref,
                                            after_stemming)
            # Merge the per-document dictionaries into one dictionary for the whole part
    else:
        # Take each document's text from the list and only parse it (no stemming)
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_parse = Parser.start(value[1])
            # Update the document's parameters
            __update_and_merge_dictionaries(doc_id, file_name,
                                            terms_dictionary,
                                            documents_dictionary_ref,
                                            after_parse)
            # Merge the per-document dictionaries into one dictionary for the whole part

    # Create a new temporary posting file for this part
    Indexer.create_temp_posting_file(terms_dictionary)
Example #3
    def results_relevant_documents_to_one_query(self, query):
        print("results_relevant_documents_to_one_query")
        if self.final_dic:
            dictionary_parser = Parse.parse_text(query)
            dictionary_stemm = Stemmer.stemming(dictionary_parser,
                                                self.stemm_mode)

            final_dictionary_query = {}
            for term, details in dictionary_stemm.items():
                freq = details[0]
                final_dictionary_query[term] = freq

            result_rank = self.ranker.rank(self.stemm_mode, self.all_document,
                                           self.final_dic, self.path_folder,
                                           final_dictionary_query,
                                           self.semantic_mode)

        else:
            return []

        print("finish")
        self.ranker.reset_rank()
        return list(result_rank.keys())
Example #4
    def __parse_stem(self, text):
        if self.stem_mode:  # parse and stem the text
            after_dictionary = Stemmer.stemWithCache(Parser.start(text))
        else:  # only parse the text
            after_dictionary = Parser.start(text)
        return after_dictionary
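For readers unfamiliar with the project's Parser and Stemmer classes, the parse-then-stem pattern shown above can be sketched with standard tooling. This is only an illustration: the whitespace tokenizer and NLTK's PorterStemmer below are stand-in assumptions, not the classes used in these examples.

from collections import Counter

from nltk.stem import PorterStemmer  # assumed stand-in for the Stemmer class


def parse_and_stem(text, stem_mode=True):
    # "Parsing" is reduced here to lowercasing and whitespace tokenization.
    tokens = text.lower().split()
    if stem_mode:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    # Return a term -> frequency dictionary, like the examples above.
    return dict(Counter(tokens))


# parse_and_stem("running runners run") -> {'run': 2, 'runner': 1}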
Example #5
    def results_relevant_documents(self, queries_dictionary):
        print("results_relevant_documents")
        results = []
        if self.final_dic:
            for number, value in queries_dictionary.items():
                # this is the dictionary of all terms in this query
                query_dictionary_description = {}
                query_title = value[0]
                query_description = value[1]
                query_narrative = value[2]

                # title
                parse_title = Parse.parse_text(query_title)
                stemm_title = Stemmer.stemming(parse_title, self.stemm_mode)
                # narrative: break into sentences and remove irrelevant ones
                query_narrative = Parse.parse_query_narrative(query_narrative)
                # description + narrative
                query_description_narrative = query_narrative + ' ' + query_description
                parse_description_narrative = Parse.parse_text(
                    query_description_narrative)
                stemm_description_narrative = Stemmer.stemming(
                    parse_description_narrative, self.stemm_mode)

                # Weight each term's frequency with its idf
                for term, details in stemm_description_narrative.items():
                    freq = details[0]
                    if term in self.final_dic:
                        idf = self.final_dic[term][1]
                        # new freq with idf
                        query_dictionary_description[term] = (
                            self.weight_idf * idf) + (self.weight_df * freq)

                number_of_term_in_query = len(query_dictionary_description)
                normalized_number_of_results = int(
                    number_of_term_in_query / self.denominator)
                # Dictionary with the normalized_number_of_results most common terms
                query_dictionary_description_most_common = dict(
                    Counter(query_dictionary_description).most_common(
                        normalized_number_of_results))
                query_dictionary_description.clear()

                # Build the query dictionary from the title terms
                final_dictionary_query = {}
                for term, details in stemm_title.items():
                    freq = details[0]
                    final_dictionary_query[term] = freq

                # Merge query_dictionary_description_most_common into final_dictionary_query
                for term, val in query_dictionary_description_most_common.items():
                    if term in final_dictionary_query:
                        final_dictionary_query[term] += val
                    else:
                        final_dictionary_query[term] = val

                # Send the query term weights to the ranker
                # The ranker returns a dictionary with [term] = [(d1,tf1),(d2,tf2)...]
                result_rank = self.ranker.rank(self.stemm_mode,
                                               self.all_document,
                                               self.final_dic,
                                               self.path_folder,
                                               final_dictionary_query,
                                               self.semantic_mode)
                # [ query1 , {term1: [(d1,tf1),(d2,tf2)...] , term2: [(d1,tf1),(d2,tf2)...]} ]
                results.append((number, list(result_rank.keys())))

        print("finish")
        self.ranker.reset_rank()
        return results
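The truncate-and-merge step in the middle of this example relies only on collections.Counter. A minimal standalone sketch of that step, with made-up term weights rather than the class's real data, could look like this:

from collections import Counter

# Hypothetical weighted description terms and title terms (values are made up).
description_terms = {'economy': 3.2, 'growth': 2.1, 'policy': 1.4, 'the': 0.2}
title_terms = {'economy': 1, 'inflation': 1}

# Keep only the k most heavily weighted description terms.
k = 2
most_common = dict(Counter(description_terms).most_common(k))

# Merge the truncated description terms into the title terms.
final_query = dict(title_terms)
for term, weight in most_common.items():
    final_query[term] = final_query.get(term, 0) + weight

print(final_query)  # {'economy': 4.2, 'inflation': 1, 'growth': 2.1}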
Example #6
    def stem_vocabulary(self):
        # Stem every term in the vocabulary and save the stemmed result.
        stemmer = Stemmer()
        for key in self.vocabulary:
            stemmer._stem(key)
        stemmer.save_stemmed_vocabulary(self.path)
Example #7
    def run(self):
        global cache_dictionary
        global final_dictionary
        global documents_dictionary
        start_time = time.time()
        cache_dictionary = {}
        final_dictionary = {}
        documents_dictionary = {}

        # Create a list of all sub-directory paths in the corpus; pop removes the corpus root itself
        sub_dirs = [x[0] for x in os.walk(corpus_path)]
        sub_dirs.pop(0)

        files_list = []  # This list holds the files of the current part
        file_index = 1  # Index of the current file
        iterate_over_parts = 1  # Index of the current part

        # The last file index of the first part
        next_part = int(fileNum / parts) * iterate_over_parts
        if thread_mode == 'on':  # Use a ThreadPool
            # Initialize the ThreadPool with the number of threads from the config file
            executor = concurrent.futures.ThreadPoolExecutor(
                max_workers=number_of_threads)
            for subdir in sub_dirs:
                textList = Reader.separate(subdir)
                files_list.extend(textList)
                if file_index == next_part:
                    executor.submit(handle_files, files_list,
                                    documents_dictionary)
                    files_list = []  # clear the files list
                    if iterate_over_parts + 1 != parts:
                        iterate_over_parts += 1
                        # update the last index of the next part
                        next_part = (int(fileNum / parts) * iterate_over_parts)
                if file_index == fileNum:  # The last index of the last part
                    executor.submit(handle_files, files_list,
                                    documents_dictionary)
                    break  # stop after the last file has been handled
                file_index += 1
            # Shut down the ThreadPool, waiting for all threads to finish
            executor.shutdown(wait=True)
        else:
            for subdir in sub_dirs:
                textList = Reader.separate(subdir)
                files_list.extend(textList)
                if file_index == next_part:
                    handle_files(files_list, documents_dictionary)
                    files_list = []  # clear the files list
                    if iterate_over_parts + 1 != parts:
                        iterate_over_parts += 1
                        # update the last index of the next part
                        next_part = (int(fileNum / parts) * iterate_over_parts)
                if file_index == fileNum:  # The last index of the last part
                    handle_files(files_list, documents_dictionary)
                    break  # stop after the last file has been handled
                file_index += 1

        sub_dirs = None
        files_list = None
        Stemmer.clean_cache()
        # Merge the temp files and remove them
        final_dictionary, cache_dictionary, posting_file_size = Indexer.merge_files(
            documents_dictionary)

        end_time = time.time()
        total_time = end_time - start_time

        # Stemmer.write_cache()
        print("Number of documents: " + str(len(documents_dictionary)))
        print("Number of terms: " + str(len(final_dictionary)))
        print("Time: " + str("{:.2f}".format(total_time)) + " seconds")
        print("Time: " + str("{:.2f}".format(total_time / 60)) + " minutes")

        final_dictionary_file_size = sys.getsizeof(final_dictionary)
        cache_file_size = sys.getsizeof(cache_dictionary)

        print("Posting file size: " + str(posting_file_size) + " Bytes")
        print("Dictionary file size: " + str(final_dictionary_file_size) +
              " Bytes")
        print("Cache file size: " + str(cache_file_size) + " Bytes")
        Writer.remove_temp_file()

        # Announce to the GUI that indexing has concluded.
        global stem_mode
        self.view.finished_indexing(str(len(documents_dictionary)),
                                    str(final_dictionary_file_size),
                                    str(cache_file_size), str(int(total_time)),
                                    str(len(final_dictionary)),
                                    str(posting_file_size), stem_mode)
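The batching-plus-thread-pool pattern in run() can be condensed to a short sketch. The process_part function below is a stand-in for handle_files, and the chunk size and worker count are arbitrary assumptions.

import concurrent.futures


def process_part(part):
    # Stand-in for handle_files(): the per-part indexing work would go here.
    print("processing", len(part), "files")


def run_in_parts(files, parts=4, workers=2):
    # Split the file list into roughly equal consecutive parts.
    chunk = max(1, len(files) // parts)
    batches = [files[i:i + chunk] for i in range(0, len(files), chunk)]
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=workers)
    for batch in batches:
        executor.submit(process_part, batch)
    # Like the example above, wait for all submitted parts to finish.
    executor.shutdown(wait=True)


run_in_parts(["doc%d" % i for i in range(10)], parts=3)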
Example #8
    def find(self, query, stem_mode):

        total_value = sum(range(5))  # = 0+1+2+3+4 = 10, used as the total weight

        sum_of_df = 0
        wiki_wiki = wikipediaapi.Wikipedia('en')  # The language to search in
        page_py = wiki_wiki.page(query)  # Fetch the Wikipedia page for the query

        query_dictionary = {}  # The dictionary we will return to the user
        if page_py.exists():
            line = page_py.summary  # The summary of the wiki page
            # If wiki did not return a specific page, fall back to the sections
            if len(line) < 300:
                line = print_sections(page_py.sections)
            if stem_mode:
                stop_set = {'disambigu'}  # Popular words we want to avoid
                query_after = Stemmer.stemWithCache(Parser.start(query))
                terms_dictionary = Stemmer.stemWithCache(Parser.start(line))
            else:
                stop_set = {'Disambiguation'}  # Popular words we want to avoid
                query_after = Parser.start(query)
                terms_dictionary = Parser.start(line)

            concept = {}
            links = page_py.links  # The links from the wiki page
            for title in sorted(links.keys()):
                if stem_mode:
                    term = Stemmer.stemWithCache(
                        Parser.start(links[title].title))
                else:
                    term = Parser.start(links[title].title)

                # For each term in the link's dictionary, check it against the summary terms
                for t, value in term.items():
                    if links[title].ns == 0 and t in terms_dictionary and \
                            t not in query_after and t not in stop_set:
                        if t not in concept:
                            concept[t] = value
                        else:
                            concept[t] += value  # add the df value to the dictionary

            # Keep only the most common query results
            query_dictionary = dict(
                Counter(concept).most_common(number_of_results))
            for term, value in query_dictionary.items():
                sum_of_df += value

            for term, value in query_dictionary.items():
                positive_value = int(total_value * value / sum_of_df) + 1
                if positive_value == 0:
                    positive_value = 1
                query_dictionary[term] = positive_value
            if len(query_after) != 0:
                query = list(query_after.keys())[0]
        else:
            print("Invalid query")

        query_dictionary[query] = number_of_results
        return query_dictionary
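For reference, the wikipedia-api calls used above follow this basic shape. The page title is arbitrary, and note that recent releases of the library also expect a user_agent argument, while the constructor call shown here matches the older style used in the example.

import wikipediaapi  # pip install wikipedia-api

wiki = wikipediaapi.Wikipedia('en')          # language to search in
page = wiki.page('Information retrieval')    # arbitrary page title

if page.exists():
    print(page.summary[:200])                # the page summary text
    for title in sorted(page.links.keys())[:5]:
        print(title, page.links[title].ns)   # linked page titles and namespaces
else:
    print("Invalid query")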
Example #9
def merge_all_posting(stemming_mode, posting_id, number_doc_in_corpus,
                      the_final_terms_dictionary, cach_dictionary, all_city,
                      max_doc_city):
    #check_uppercase()
    path_folder_posting, path_folder_abc_posting, stemming_mode, city_path = init_path(
        stemming_mode)
    print("merge_all_posting")
    finish = False
    number_of_line_in_abc_posting = {}
    all_final_posting_path = create_final_posting(
        path_folder_abc_posting, number_of_line_in_abc_posting, city_path)
    term_first_line_postings = {}
    freq_sum_doc_first_line_postings = {}
    the_open_posting_file = {}
    stemm_dictionary_values = []
    if stemming_mode == 'yes':
        stemm_dictionary = Stemmer.get_dictionary()  # all stemming_term
        stemm_dictionary_values = Stemmer.get_dictionary_value()
    elif stemming_mode == 'no':
        stemm_dictionary = Stemmer.get_dictionary_without_stemming()  # all stemming_term
    cach_dictionary.clear()
    terms_to_updated = {}  # The terms are in lower case letters

    close_file = {}
    # save the first line of each temp posting
    for index_file_of_posting in range(1, posting_id + 1):
        file_path = path_folder_posting + "\\TempPostings" + str(
            index_file_of_posting) + '.txt'
        curr_posting_file = open(file_path, "r")
        the_open_posting_file[index_file_of_posting] = curr_posting_file
        close_file[index_file_of_posting] = False
        find_first_line(curr_posting_file, index_file_of_posting,
                        term_first_line_postings,
                        freq_sum_doc_first_line_postings, close_file)

    while not finish:
        #min_temp_posting = min(term_first_line_postings.keys(), key=(lambda index_post: term_first_line_postings[index_post]))
        min_term = min(term_first_line_postings.values())
        all_posting_file_with_equal_term = []
        list_doc = {}
        sum_tf = 0
        df = 0
        for index, term in term_first_line_postings.items():
            if min_term == term:
                all_posting_file_with_equal_term.append(index)
                sum_tf += int(freq_sum_doc_first_line_postings[index][0])
                df += int(freq_sum_doc_first_line_postings[index][1])
                list_doc.update(freq_sum_doc_first_line_postings[index][2])
        # Handle capitalization
        if min_term[0].isupper():  # e.g. 'Party', 'The'
            lowercase_term = min_term.lower()  # e.g. 'party', 'the'
            if lowercase_term in stemm_dictionary:
                if stemming_mode == 'yes':
                    # e.g. 'parti', 'the'
                    lowercase_term_after_stemm = stemm_dictionary[lowercase_term]
                else:
                    lowercase_term_after_stemm = lowercase_term
                if lowercase_term_after_stemm in terms_to_updated:
                    sum_tf += terms_to_updated[lowercase_term_after_stemm][0]
                    list_doc.update(terms_to_updated[lowercase_term_after_stemm][1])
                terms_to_updated[lowercase_term_after_stemm] = (sum_tf, list_doc)
            elif stemming_mode == 'yes' and lowercase_term in stemm_dictionary_values:
                if lowercase_term in terms_to_updated:
                    sum_tf += terms_to_updated[lowercase_term][0]
                    list_doc.update(terms_to_updated[lowercase_term][1])
                terms_to_updated[lowercase_term] = (sum_tf, list_doc)
            else:
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
        else:
            if min_term in terms_to_updated:  # e.g. 'parti', 'the'
                sum_tf += terms_to_updated[min_term][0]
                cach_dictionary[min_term] = sum_tf
                list_doc.update(terms_to_updated[min_term][1])
                #print("final posting: " + min_term)
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
            else:
                #print("final posting: " + min_term)
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)

        for i in all_posting_file_with_equal_term:
            find_first_line(the_open_posting_file[i], i,
                            term_first_line_postings,
                            freq_sum_doc_first_line_postings, close_file)

        finish = check_if_finish(close_file)

    # Out of the while loop
    close_all_files(all_final_posting_path)
    Stemmer.reset()
    reset_temp_posting()
    return sum_numbers
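At its core, merge_all_posting performs a k-way merge over sorted temporary posting files. The sketch below shows the same idea with heapq.merge; the one-term-per-line "term<TAB>tf" file format is an assumption for illustration and not necessarily the project's actual posting format.

import heapq


def merge_postings(paths):
    # Each temp posting file is assumed to hold lines of the form "term\ttf",
    # already sorted by term, just like the temp postings merged above.
    files = [open(path, "r") for path in paths]
    merged = {}
    try:
        # heapq.merge lazily merges the already-sorted line streams by term.
        for line in heapq.merge(*files, key=lambda ln: ln.split("\t", 1)[0]):
            term, tf = line.rstrip("\n").split("\t")
            merged[term] = merged.get(term, 0) + int(tf)
    finally:
        for f in files:
            f.close()
    return merged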