Example #1
def search_and_rank_query(query, inverted_index, k, output_path, vectorDict,
                          stemming):
    p = Parse(stemming)
    # parse query.
    query_as_dict = p.parse_sentence(query, term_dict={})
    if len(query_as_dict.keys()) == 0:
        return []
    searcher = Searcher(inverted_index, output_path)
    # search for relevant docs given the query. min threshold is 100 docs.
    relevant_docs = searcher.relevant_docs_from_posting(
        query_as_dict, 100, output_path)
    # rank those docs and get the top 100 of them.
    ranked_docs, sorted_keys = searcher.ranker.rank_relevant_doc(
        relevant_docs, query_as_dict, inverted_index, output_path,
        vectorDict)  # ranked_docs maps doc_id -> score, e.g. {doc_1: 4, doc_2: 10}
    top_100_keys = searcher.ranker.retrieve_top_k(sorted_keys, 100)
    # build association matrix and expand the query.
    expanded_query = local_method.build_association_matrix(
        inverted_index, query_as_dict, top_100_keys, vectorDict)
    # search again, with the expanded query.
    relevant_docs = searcher.relevant_docs_from_posting(
        expanded_query, k, output_path)
    # rank again and return the top K (given input) ranked.
    ranked_docs, sorted_keys = searcher.ranker.rank_relevant_doc(
        relevant_docs, expanded_query, inverted_index, output_path,
        vectorDict)  # ranked_docs maps doc_id -> score, e.g. {doc_1: 4, doc_2: 10}
    top_k_keys = searcher.ranker.retrieve_top_k(sorted_keys, k)
    top_K = []
    for doc_id in top_k_keys:
        top_K.append(ranked_docs[doc_id])
    return top_K
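The functions in these examples all delegate the final cut-off to a Ranker.retrieve_top_k helper that is not shown. A minimal sketch of what such a helper might look like, assuming ranked_docs maps a document id to its score (the names are illustrative, not the project's actual API):

def retrieve_top_k(ranked_docs, k):
    # Sort document ids by score, highest first, and keep only the first k.
    sorted_ids = sorted(ranked_docs, key=ranked_docs.get, reverse=True)
    return sorted_ids[:k]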
Example #2
def search_and_rank_query(query, inverted_index, k):
    p = Parse()
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index)
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
Example #3
def search_and_rank_query(query, inverted_index, k, config):
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config)
    relevant_docs, documents_dict = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, documents_dict, query_as_list)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
Example #4
def search_and_rank_query(config, query, inverted_index, inverted_docs, k, avg_doc_len):
    p = Parse(config)
    query_as_list = p.parse_sentence(query)[0]
    searcher = Searcher(config, inverted_index, inverted_docs)
    query_dict = searcher.get_query_dict(query_as_list)
    relevant_docs, query_vector = searcher.relevant_docs_from_posting(query_dict)
    ranked_docs = searcher.ranker.rank_relevant_docs(relevant_docs, query_vector, avg_doc_len)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
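Example #4 hands the ranker a query_vector and the average document length, which suggests a vector-space scoring step inside rank_relevant_docs. A self-contained cosine-similarity sketch of that idea; the data shapes and names below are assumptions, not the project's actual Ranker:

import math

def rank_by_cosine(doc_vectors, query_vector):
    # doc_vectors: {doc_id: {term: weight}}, query_vector: {term: weight}
    def cosine(doc_vec, query_vec):
        dot = sum(doc_vec.get(term, 0.0) * weight for term, weight in query_vec.items())
        norm = (math.sqrt(sum(v * v for v in doc_vec.values())) *
                math.sqrt(sum(v * v for v in query_vec.values())))
        return dot / norm if norm else 0.0
    return {doc_id: cosine(vec, query_vector) for doc_id, vec in doc_vectors.items()}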
Example #5
def search_and_rank_query(query, inverted_index, document_dict, k, num_of_docs, avg_length_per_doc, glove_dict, config):
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, document_dict, num_of_docs, avg_length_per_doc, glove_dict, config)
    relevant_docs, query_glove_vec, query_vec = searcher.relevant_docs_from_posting(query_as_list[0])
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, query_glove_vec, query_vec)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
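Example #5 (like Example #13 below) scores documents with GloVe embeddings; the query_glove_vec returned by the searcher is presumably an average over the query terms' vectors. A minimal sketch of that averaging step, assuming glove_dict maps a lowercase word to a numpy vector (illustrative only):

import numpy as np

def query_glove_vector(query_terms, glove_dict, dim=25):
    # Average the embeddings of the query terms that exist in the GloVe dictionary.
    vectors = [glove_dict[term.lower()] for term in query_terms if term.lower() in glove_dict]
    if not vectors:
        return np.zeros(dim, dtype="float32")
    return np.mean(vectors, axis=0)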
Example #6
def search_and_rank_query(query, inverted_index, k, tweet_dict):
    p = Parse()
    to_return = []
    for q in query:
        query_as_list = p.parse_sentence(q)
        searcher = Searcher(inverted_index)
        relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
        ranked_docs = searcher.ranker.rank_relevant_doc(
            relevant_docs, tweet_dict)
        ans = searcher.ranker.retrieve_top_k(ranked_docs, k)
        to_return.extend(ans)
    return to_return
Example #7
def search_and_rank_query(query, inverted_index, k, docs_data=None):
    global config, number_of_documents

    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config, docs_data)
    relevant_docs, query_weight = searcher.relevant_docs_from_posting(
        query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs,
                                                    query_weight,
                                                    number_of_documents)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
Example #8
def search_and_rank_query(query, inverted_index, k, stemming, output_path):
    p = Parse(stemming)

    query_as_list = [term.text.lower() for term in p.parse_sentence(query)]

    searcher = Searcher(inverted_index, os.path.join(output_path, PostingFile))

    w_of_term_in_query = searcher.CalculateW(query_as_list)

    relevant_docs = searcher.relevant_docs_from_posting(list(w_of_term_in_query.keys()))

    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, w_of_term_in_query)
    output = searcher.ranker.retrieve_top_k(ranked_docs, k)
    return output
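CalculateW above produces a weight per query term before retrieval, but its implementation is not shown. A hypothetical tf-idf style sketch of such a helper, assuming the inverted index maps a term to its document frequency (both the helper name and the index layout are assumptions):

import math

def calculate_query_weights(query_terms, inverted_index, corpus_size):
    weights = {}
    for term in query_terms:
        tf = query_terms.count(term) / len(query_terms)  # term frequency within the query
        df = inverted_index.get(term, 0)                 # document frequency from the index
        idf = math.log(corpus_size / df) if df else 0.0
        weights[term] = tf * idf
    return weights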
Example #9
def search_and_rank_query(query, inverted_index, k, config=None):
    """
    This function search for relevant docs according to the query and rank them
    :param query:
    :param inverted_index:
    :param k:
    :param config:
    :return:
    """
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config)
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
Example #10
def search_and_rank_query(queries, inverted_index, k, lda):
    # config is assumed to be a module-level ConfigClass instance
    indexer = Indexer(config)
    to_stem = config.get__toStem()
    queries_list = []
    if type(queries) is list:  # if queries is a list
        for query in queries:
            queries_list.append(query)
    if type(queries) is str:  # if queries is a text file
        with open(queries, encoding='utf-8') as f:
            for line in f:
                if line != "\n":
                    queries_list.append(line)
    all_results = []
    query_num = 1
    tweet_id_num = 1
    for query in queries_list:
        p = Parse(config)
        # parse LDA query
        tokenized_query = p.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find long terms and upper case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                if not to_stem:
                    tokenized_query.append(word)
                else:
                    stem_word = Stemmer().stem_term(word)
                    tokenized_query.append(stem_word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]',
                    word) and word[0].isupper():  # upper first char
                term = word
                if counter + 1 < len(original_query_list):
                    index = counter + 1  # start right after the current word
                    while index < len(original_query_list):  # find all term
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # Donald Trump
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        tokenized_query.append(term)
            counter += len_term
        # WordNet query
        wn = WordNet_ranker(tokenized_query)
        WordNet_query = wn.extend_query()
        #print("WordNet_query", WordNet_query)
        searcher = Searcher(inverted_index)
        #print("inverted_index", len(inverted_index))
        # find relevant_docs
        relevant_docs = searcher.relevant_docs_from_posting(WordNet_query)
        #print("relevant", len(relevant_docs))
        # find LDA relevant
        cosine_dict = lda.prob(tokenized_query)
        #print("cosine dict", len(cosine_dict))

        dict_of_cosine_tweets = {}
        #list out keys and values separately
        key_list = list(indexer.tweet_line_dict.keys())
        val_list = list(indexer.tweet_line_dict.values())
        for index in cosine_dict.keys():  # find the tweet id
            dict_of_cosine_tweets[key_list[val_list.index(
                index)]] = cosine_dict[index]
        #print("finish_topic relevant", len(dict_of_cosine_tweets))

        final_dict = {}
        for tweet_id in dict_of_cosine_tweets.keys():
            if k > len(final_dict):
                if tweet_id in relevant_docs:
                    final_dict[tweet_id] = 0
                    final_dict[tweet_id] += (relevant_docs[tweet_id] +
                                             dict_of_cosine_tweets[tweet_id])

        sorted_cosine_tweets = {
            tweet_id: score
            for tweet_id, score in sorted(
                final_dict.items(), key=lambda item: item[1], reverse=True)
        }
        final_tweets = list(sorted_cosine_tweets.keys())
        #print("final before add K", len(final_tweets))
        if k > len(final_tweets):
            for key in relevant_docs.keys():
                if key not in final_dict:
                    if k > len(final_tweets):
                        final_tweets.append(key)
                    if k == len(final_tweets):
                        break
        #print("final after K", len(final_tweets))
        #print("relevant", relevant_docs)

        #print("sorted_cosine_tweets", sorted_cosine_tweets)
        """for tweet in relevant_docs.keys():
            if tweet in list_of_cosine_tweets:
                if len(final_tweets) < k:
                    final_tweets.append(tweet)

        if len(final_tweets) < k:
            sorted_cosine_tweets = {k: v for k, v in
                                    sorted(list_of_cosine_tweets.items(), key=lambda item: item[1], reverse=True)}
            for key in sorted_cosine_tweets:
                if k > len(final_tweets) and key not in final_tweets:
                    final_tweets.append(key)
                else:
                    break"""

        # write the results into csv file
        tweet_id_num = 1
        with open('results.csv', 'a', encoding='utf-8') as fp:
            for tweet_id in final_tweets:
                line = ("Tweet id: " + "{" + tweet_id + "}" + " Score: " + "{" +
                        str(tweet_id_num) + "}" + "\n")
                tweet_id_num += 1
                fp.write(line)
        query_num += 1
        all_results.append(final_tweets)
    #print("end:", datetime.now())

    # return top K of final_tweets
    return all_results
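Example #10 relies on WordNet_ranker.extend_query to expand the parsed query with synonyms before retrieval; the class itself is not shown. A rough sketch of synonym-based expansion with NLTK's WordNet in the same spirit (not the project's actual implementation):

from nltk.corpus import wordnet

def extend_query(query_terms, max_synonyms_per_term=2):
    expanded = list(query_terms)
    for term in query_terms:
        added = 0
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                name = lemma.name().replace("_", " ")
                # skip the term itself and anything already in the expanded query
                if name.lower() != term.lower() and name not in expanded:
                    expanded.append(name)
                    added += 1
                if added >= max_synonyms_per_term:
                    break
            if added >= max_synonyms_per_term:
                break
    return expanded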
Example #11
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        df = r.read_file(fn)
        documents_list = df
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index('idx_bench.pkl')
        # self._indexer.save_index('inverted_idx.pkl')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        query_as_list = self._parser.parse_sentence(query)
        add_to_query = {}
        for q in query_as_list:
            for syn in wordnet.synsets(q):
                for lemma in syn.lemmas():
                    if lemma.name() == q.lower():
                        continue
                    score = wordnet.synsets(q)[0].wup_similarity(syn)
                    if score is not None and score > 0.8:
                        add_to_query[lemma.name()] = score

        if len(add_to_query) > 3:
            add_to_query = sorted(add_to_query.items(), key=lambda item: item[1], reverse=True)
            query_as_list.extend([add_to_query[0][0], add_to_query[1][0], add_to_query[2][0]])
        else:
            query_as_list.extend(add_to_query)

        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)

        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
Example #12
class SearchEngine:

    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find long terms and upper case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                query_as_list.append(word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]',
                    word) and word[0].isupper():  # upper first char
                term = word
                if counter + 1 < len(original_query_list):
                    index = counter + 1  # start right after the current word
                    while index < len(original_query_list):  # find all term
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]',
                                                                             original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # Donald Trump
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        query_as_list.append(term)
            counter += len_term
        wordNet = WordNet_ranker(query_as_list)
        new_query = wordNet.extend_query()
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(new_query)  # TODO: add K results
Example #13
class SearchEngine:
    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    GLOVE_PATH_LOCAL = './model/model.txt'

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                glove_dict[word] = vector
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1',
                                self._config.get_out_path())

        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
Example #14
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file

        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)

            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_tuple = self._parser.parse_sentence(query)
        query_as_list = query_as_tuple[0] + query_as_tuple[1]
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query_as_list, k)
Example #15
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        df = r.read_file(fn)
        documents_list = df
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self._indexer.save_index('inverted_idx.pkl')
        # self._indexer.save_index('idx_bench.pkl')

        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        # spell checker
        query_as_list = self._parser.parse_sentence(query)
        inverted_idx = self.indexer.inverted_idx
        spell = SpellChecker()
        misspelled = spell.unknown(query_as_list)
        assist = [x.lower()
                  for x in query_as_list]  # all the query terms in lower case

        for word in misspelled:
            if word.upper() in inverted_idx or word.lower() in inverted_idx or ' ' in word:
                continue  # the word is already in the inverted index, so no correction is needed

            word_idx = assist.index(word)
            # all suggested corrections within edit distance 1
            corrections = spell.edit_distance_1(word)
            corrections_dict = {}
            # check if the suggested corrections is in inverted index and collect the frequency of each correction
            for correction in corrections:
                if correction.upper() in inverted_idx.keys():
                    corrections_dict[correction] = inverted_idx[
                        correction.upper()]

                if correction.lower() in inverted_idx.keys():
                    corrections_dict[correction] = inverted_idx[
                        correction.lower()]

            if corrections_dict:
                query_as_list[word_idx] = max(
                    corrections_dict, key=corrections_dict.get
                )  # choose the most common correction
            else:
                query_as_list[word_idx] = spell.correction(word)

        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
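Example #15 corrects unknown query terms with pyspellchecker before searching. A minimal standalone illustration of the library calls used above (the sample words are arbitrary):

from spellchecker import SpellChecker

spell = SpellChecker()
# unknown() returns the subset of words the dictionary does not recognize
misspelled = spell.unknown(["coronavirus", "vacine", "trials"])
for word in misspelled:
    # correction() picks the most probable fix; candidates() lists all options
    print(word, "->", spell.correction(word), spell.candidates(word))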
Example #16
class Searcher:
    def __init__(self, inverted_index, corpus_size, average_length,
                 output_path):
        """
        :param inverted_index: dictionary of inverted index
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.corpus_size = corpus_size
        self.average_length = average_length
        self.output_path = output_path

    def calculate_doc_scores(self, term, relevant_docs, posting_pointer,
                             posting_file):
        """
        Retrieves term's posting file and calculates score for each relevant document.
        Adds the relevant documents to relevant_docs dictionary
        :param term: query term for retrieval
        :param relevant_docs: dictionary of relevant documents
        :param posting_pointer: pointer (name) of relevant posting file
        :param posting_file: relevant posting file
        :return: returns a tuple of the current relevant posting pointer and posting file
        """
        # retrieve term's posting file
        if posting_pointer is None or term[0].lower(
        ) != posting_pointer or posting_file is None:
            posting_pointer = self.inverted_index[term][POSTING_POINTER_INDEX]
            posting_file = utils.load_obj(self.output_path +
                                          str(posting_pointer))

        inverted_document_frequency = log(self.corpus_size /
                                          self.inverted_index[term][DF_INDEX])

        documents = posting_file[term]
        for document in documents:

            # calculate score
            document_id = document[DOCUMENT_ID_INDEX]
            doc_weight = document[FREQUENCY_INDEX]
            normalized_length = document[LENGTH_INDEX] / self.average_length

            if document_id not in relevant_docs:
                relevant_docs[document_id] = 0

            # calculate score according to BM25+ weighting formula
            relevant_docs[document_id] += inverted_document_frequency * (float(
                (doc_weight *
                 (K1 + 1))) / (doc_weight + K1 *
                               (1 - B + B * normalized_length)) + DELTA)

        return posting_pointer, posting_file

    def relevant_docs_from_posting(self, query):
        """
        Search and retrieve relevant documents for the query. Calculate the similarity score for each document.
        :param query: query
        :return: dictionary of relevant documents and their scores
        """

        # parse query according to the same parsing rules of the corpus
        entities = dict()
        term_dict = dict()
        parsed_query = self.parser.parse_sentence(query, entities)
        self.parser.parse_capital_letters(parsed_query, term_dict)

        # perform spell correction
        spell_checker = SpellChecker()
        corrected_terms = []
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:

            # only correct terms that aren't in the inverted dictionary
            # terms in the dictionary are considered correct for retrieval
            if term not in self.inverted_index:
                candidates = spell_checker.candidates(term)
                if term in candidates:  # remove duplicate originally correct terms
                    candidates.remove(term)
                corrected_terms.extend(candidates)

        # sort the parsed query alphabetically for optimal posting files retrieval
        # always hold at most one posting file in memory
        sorted_query = [*term_dict.keys()] + [*entities.keys()
                                              ] + corrected_terms
        sorted_query.sort()

        # dictionary for holding all relevant documents (at least one query term appeared in the document)
        # format: {document_id: score}
        relevant_docs = dict()
        posting_file = None  # currently used posting file from disk
        posting_pointer = None  # current posting's pointer
        for term in sorted_query:

            # check if term exists in inverted dictionary in either lower or upper form
            if term in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term, relevant_docs, posting_pointer, posting_file)
            elif term.islower() and term.upper() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.upper(), relevant_docs, posting_pointer, posting_file)
            elif term.isupper() and term.lower() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.lower(), relevant_docs, posting_pointer, posting_file)

        return relevant_docs
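The score accumulated in calculate_doc_scores follows the BM25+ weighting scheme. For reference, a standalone version of the same formula; the K1, B and DELTA values below are common defaults that merely stand in for the project's constants:

from math import log

K1, B, DELTA = 1.2, 0.75, 1.0

def bm25_plus(term_freq, doc_length, avg_doc_length, doc_freq, corpus_size):
    # idf * (tf * (K1 + 1) / (tf + K1 * (1 - B + B * |d| / avgdl)) + DELTA)
    idf = log(corpus_size / doc_freq)
    normalized_length = doc_length / avg_doc_length
    return idf * (term_freq * (K1 + 1) /
                  (term_freq + K1 * (1 - B + B * normalized_length)) + DELTA)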
Example #17
class SearchEngine:
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine2',
                                self._config.get_out_path())

        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine2.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        list_copy = list(query_as_list[0])
        tagged_words = pos_tag(list_copy)
        for word in tagged_words:
            wn_tag = Wordnet.get_wordnet_pos(word[1])
            synonym = Wordnet.get_closest_term(word[0], wn_tag)
            if synonym is not None:
                list_copy.append(synonym)
        l_res = searcher.search(list_copy)
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
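Example #17 expands the query through Wordnet.get_wordnet_pos and Wordnet.get_closest_term, which are not shown. A sketch of the POS mapping such a helper typically performs, converting pos_tag's Penn Treebank tags to WordNet POS constants (an assumption about the project's Wordnet class):

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map Penn Treebank tags (e.g. 'NNP', 'VBZ', 'JJ', 'RB') to WordNet POS constants.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None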
Example #18
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self.invertedIndex = self._indexer.inverted_idx
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        # r = ReadFile(ConfigClass.corpusPath)
        # documents_list = r.readAllCorpus() #change if we need to read more then 1 parquet

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()

        # needed to pass Boris's tests; sometimes inverted_idx fails to save in the testing system
        utils.save_obj({}, "inverted_idx")

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            if parsed_document.doc_length != 0:  # skip empty tweets; there is no need to index them
                # index the document data
                self._indexer.add_new_doc(parsed_document)
        # Inserting entities to the indexer and posting files
        self._indexer.addEntities(self._parser.suspectedEntityDict)
        # Sort the posting files
        self._indexer.update_idfWij(idx)
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """

        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        self._parser.suspectedEntityDict = {}

        query_as_list = self._parser.parse_sentence(query)

        # add entities to the query - entities are not added to query_as_list in parse_sentence
        # suspectedEntityDict holds only entities from the original query
        for entity in self._parser.suspectedEntityDict:
            query_as_list.append(entity)

        # Clear query from Entities parts
        query_as_list = self.clearEntitiesParts(query_as_list)

        # WordNet expansion
        extendedQ = copy.deepcopy(query_as_list)
        for term in query_as_list:
            synset = wordnet.synsets(term)
            try:
                # take the first lemma of up to two synsets as candidate synonyms
                for i in range(2):
                    synonym = synset[i].lemmas()[0].name()
                    if term.lower() != synonym.lower() and synonym + "~" not in extendedQ:
                        extendedQ.append(synonym + "~")
            except IndexError:  # the term has fewer than two synsets (or no lemmas)
                continue
        query_as_list = extendedQ

        numberOFresults, relevantDocIdList = searcher.search(
            query_as_list
        )  # returns tuple (number of results,relevantDocIdList)
        return numberOFresults, relevantDocIdList

    def clearEntitiesParts(self, query):
        modifiedQuery_l = copy.deepcopy(query)
        termsToRemoveFromQuery = []
        # at this point, if the query holds an entity, it holds both the terms that build the entity
        # and the entity itself as one term, e.g. ['BILL', 'Gates', 'blabla', 'bla', 'Bill Gates'];
        # if "Bill Gates" is a known entity, the cleanup below leaves us with ['blabla', 'bla', 'Bill Gates']
        for term in query:  # remove parts of entities from the query if the entity exists in the inverted index
            if " " in term:
                if term in self.invertedIndex:  # entity and in inverted Index
                    # modifiedQuery_l.append(term)
                    entity_l = term.split(" ")
                    for word in entity_l:
                        termsToRemoveFromQuery.append(word.upper())
                else:  # unknown entity
                    modifiedQuery_l.remove(term)

        for word in termsToRemoveFromQuery:  # clear every appearance of the token from modifiedQuery
            modifiedQuery_l[:] = [x for x in modifiedQuery_l if x != word]
        query = modifiedQuery_l
        return query