# Example #1
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        tokens = query.split(" ")
        tokens = self._parser.text_operation(tokens)
        # Query expansion: replace misspelled terms with corrections that
        # actually exist in the index (see q_spell_check).
        tokens = self.q_spell_check(tokens)
        # Remove stop words / finish normalization.
        query_as_list = self._parser.parse_sentence(tokens)

        # All candidate documents for the query terms.
        relevant_docs = self._relevant_docs_from_posting(query_as_list)

        # Keep only the 2000 strongest candidates (by matched-term count)
        # to bound the cost of the full cosine ranking below.
        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(), 2000))

        relevant_docs_sort = self._ranker.dot_prodact_and_cos(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            # BUG FIX: the original referenced the non-existent attributes
            # `self.ranker` and `self.k`, raising AttributeError whenever a
            # top-k cutoff was requested. The ranker lives in `self._ranker`
            # and the cutoff is the `k` parameter.
            relevant_docs_sort = self._ranker.retrieve_top_k(
                relevant_docs_sort, k)
        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Load the posting list of every query term and accumulate, per
        document, how many query terms it matches and their tf-idf scores.

        :param query_as_list: parsed query tokens
        :return: dict mapping doc_id -> [matched_term_count, [tfidf, ...]]
        """
        relevant_docs = {}
        for word in query_as_list:
            # All the tweets containing this term.
            posting_list = self._indexer.get_term_posting_list(word)
            # The inverted-index weight is per-term, so fetch it once per
            # word instead of once per posting entry (loop-invariant hoist).
            term_weight = self._indexer.get_term_inverted_idx(word)[2]
            for doc in posting_list:
                doc_id = doc[0]
                tfidf = doc[4] * term_weight
                if doc_id in relevant_docs:
                    relevant_docs[doc_id][0] += 1
                    relevant_docs[doc_id][1].append(tfidf)
                else:
                    relevant_docs[doc_id] = [1, [tfidf]]
        return relevant_docs

    def q_spell_check(self, query):
        """
        Repair the query with a spell checker: each misspelled token is
        replaced by its correction, but only when the corrected word
        actually exists in the index (otherwise the original token is kept).

        :param query: list of query tokens
        :return: list of tokens with in-index corrections applied
        """
        spell = SpellChecker()
        corrected = list(query)
        for i, word in enumerate(query):
            suggestion = spell.correction(word)
            # Only substitute when the suggestion is a real index term;
            # otherwise the "correction" would just lose the original token.
            if suggestion != word and self._indexer._is_term_exist_in_idx(suggestion):
                corrected[i] = suggestion
        return corrected
# Example #2
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        tokens = query.split(" ")
        # Query expansion: append thesaurus synonyms of the raw terms.
        tokens = self.teasarous_(tokens)
        query_as_list = self._parser.text_operation(tokens)
        query_as_list = self._parser.parse_sentence(query_as_list)

        # All candidate documents for the query terms.
        relevant_docs = self._relevant_docs_from_posting(query_as_list)

        # Keep only the 2000 strongest candidates (by matched-term count)
        # to bound the cost of the full cosine ranking below.
        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(), 2000))

        relevant_docs_sort = self._ranker.dot_prodact_and_cos(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            # BUG FIX: the original referenced the non-existent attribute
            # `self.ranker`, raising AttributeError whenever a top-k cutoff
            # was requested. The ranker instance lives in `self._ranker`.
            relevant_docs_sort = self._ranker.retrieve_top_k(
                relevant_docs_sort, k)

        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Load the posting list of every query term and accumulate, per
        document, how many query terms it matches and their tf-idf scores.

        :param query_as_list: parsed query tokens
        :return: dict mapping doc_id -> [matched_term_count, [tfidf, ...]]
        """
        relevant_docs = {}
        for word in query_as_list:
            # All the tweets containing this term.
            posting_list = self._indexer.get_term_posting_list(word)
            # The inverted-index weight is per-term, so fetch it once per
            # word instead of once per posting entry (loop-invariant hoist).
            term_weight = self._indexer.get_term_inverted_idx(word)[2]
            for doc in posting_list:
                doc_id = doc[0]
                tfidf = doc[4] * term_weight
                if doc_id in relevant_docs:
                    relevant_docs[doc_id][0] += 1
                    relevant_docs[doc_id][1].append(tfidf)
                else:
                    relevant_docs[doc_id] = [1, [tfidf]]
        return relevant_docs

    def teasarous_(self, query):
        """
        Expand the query using a thesaurus: for every term, append its first
        noun synonym and first verb synonym when available.

        :param query: list of raw query tokens
        :return: the original tokens followed by any synonyms found
        """
        expanded = list(query)
        for word in query:
            # BUG FIX: the original wrapped the WHOLE loop in a bare
            # `except:`, so one failed lookup silently aborted expansion of
            # every remaining word. Catch per word, and only Exception.
            try:
                noun_syns = list(thesaurus.synonyms(word, fileid="simN.lsp"))
                verb_syns = list(thesaurus.synonyms(word, fileid="simV.lsp"))
            except Exception:
                continue  # no thesaurus entry for this word
            if noun_syns:
                expanded.append(noun_syns[0])
            if verb_syns:
                expanded.append(verb_syns[0])
        return expanded