Exemplo n.º 1
0
    def search(self, query):
        """Search the indexed files for the terms of a query.

        The query is parsed with ``self.parse_words``, duplicate terms are
        dropped, the remaining terms are scored with ``self.get_scores`` and
        the scores are ordered with ``self.rank``. Every resulting filename is
        printed in that order and the same filenames are returned.

        Args:
            query (str): query input, "user input goes here"
        Returns:
            list: a list of files in descending order of relevancy
        """
        # parse_words takes a list of strings, so wrap the raw query
        filtered_query = self.parse_words([query])

        # remove duplicate words by storing each one as a hash table key
        word_table = HashTable()
        for word in filtered_query:
            word_table.put(word, word)

        # indexing the table yields an entry whose first item is the word
        # (presumably a (key, value) tuple -- confirm against HashTable)
        parsed_query_terms = [word_table[key][0] for key in word_table.keys()]

        # score the unique terms, then let rank() order the (file, score) tuples
        results = self.rank(self.get_scores(parsed_query_terms))

        # display the ranked filenames and collect them so the caller gets the
        # list the docstring promises (previously nothing was returned)
        filenames = []
        for a_tuple in results:
            print(a_tuple[0])
            filenames.append(a_tuple[0])
        return filenames
Exemplo n.º 2
0
    def get_scores(self, terms):
        """Compute a relevancy score for every file matching the query terms.

        For each term, the weighted frequency (``self.get_wf``) of that term in
        each file is added to a running total for the file. Each total is then
        divided by the file's entry in ``self.doc_length``.

        Args:
            terms (list): a list of strings, the terms of the user query
        Returns:
            list: one entry per scored file, taken from the score table
                (expected to be a (filename, score) tuple); files whose
                weighted frequency is 0 for every term are left out
        """
        totals = HashTable()  # filename -> summed weighted frequency

        for term in terms:
            # per-file frequency table stored for this term
            # NOTE(review): a term absent from self.term_freqs is not guarded
            # against here -- confirm how HashTable handles a missing key
            per_file = self.term_freqs[term][1]

            for filename in per_file.keys():
                wf = self.get_wf(per_file[filename][1])
                if wf == 0:
                    # zero-weight matches contribute nothing; skip them
                    continue

                if totals.contains(filename):
                    # file already matched an earlier term: accumulate
                    totals.put(filename, wf + totals[filename][1])
                else:
                    # first term to match this file
                    totals.put(filename, wf)

        # normalize each accumulated total by the length of its document
        scored = []
        for filename in totals.keys():
            totals[filename] = totals[filename][1] / self.doc_length[filename][1]
            scored.append(totals[filename])

        return scored