# Example #1
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        tokens = query.split(" ")
        tokens = self._parser.text_operation(tokens)
        # Query expansion: replace misspelled terms with corrections that
        # actually exist in the index (see q_spell_check).
        tokens = self.q_spell_check(tokens)
        # Remove stop words / finish normalization.
        query_as_list = self._parser.parse_sentence(tokens)

        # All candidate documents for the query terms.
        relevant_docs = self._relevant_docs_from_posting(query_as_list)

        # Keep only the 2000 strongest candidates (by matched-term count)
        # to bound the cost of the full cosine ranking below.
        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(), 2000))

        relevant_docs_sort = self._ranker.dot_prodact_and_cos(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            # BUG FIX: the original referenced the non-existent attributes
            # `self.ranker` and `self.k`, raising AttributeError whenever a
            # top-k cutoff was requested. The ranker lives in `self._ranker`
            # and the cutoff is the `k` parameter.
            relevant_docs_sort = self._ranker.retrieve_top_k(
                relevant_docs_sort, k)
        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Load the posting list of every query term and accumulate, per
        document, how many query terms it matches and their tf-idf scores.

        :param query_as_list: parsed query tokens
        :return: dict mapping doc_id -> [matched_term_count, [tfidf, ...]]
        """
        relevant_docs = {}
        for word in query_as_list:
            # All the tweets containing this term.
            posting_list = self._indexer.get_term_posting_list(word)
            # The inverted-index weight is per-term, so fetch it once per
            # word instead of once per posting entry (loop-invariant hoist).
            term_weight = self._indexer.get_term_inverted_idx(word)[2]
            for doc in posting_list:
                doc_id = doc[0]
                tfidf = doc[4] * term_weight
                if doc_id in relevant_docs:
                    relevant_docs[doc_id][0] += 1
                    relevant_docs[doc_id][1].append(tfidf)
                else:
                    relevant_docs[doc_id] = [1, [tfidf]]
        return relevant_docs

    def q_spell_check(self, query):
        """
        Repair the query with a spell checker: each misspelled token is
        replaced by its correction, but only when the corrected word
        actually exists in the index (otherwise the original token is kept).

        :param query: list of query tokens
        :return: list of tokens with in-index corrections applied
        """
        spell = SpellChecker()
        corrected = list(query)
        for i, word in enumerate(query):
            suggestion = spell.correction(word)
            # Only substitute when the suggestion is a real index term;
            # otherwise the "correction" would just lose the original token.
            if suggestion != word and self._indexer._is_term_exist_in_idx(suggestion):
                corrected[i] = suggestion
        return corrected
# Example #2
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        tokens = query.split(" ")
        # Query expansion: append thesaurus synonyms of the raw terms.
        tokens = self.teasarous_(tokens)
        query_as_list = self._parser.text_operation(tokens)
        query_as_list = self._parser.parse_sentence(query_as_list)

        # All candidate documents for the query terms.
        relevant_docs = self._relevant_docs_from_posting(query_as_list)

        # Keep only the 2000 strongest candidates (by matched-term count)
        # to bound the cost of the full cosine ranking below.
        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(), 2000))

        relevant_docs_sort = self._ranker.dot_prodact_and_cos(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            # BUG FIX: the original referenced the non-existent attribute
            # `self.ranker`, raising AttributeError whenever a top-k cutoff
            # was requested. The ranker instance lives in `self._ranker`.
            relevant_docs_sort = self._ranker.retrieve_top_k(
                relevant_docs_sort, k)

        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Load the posting list of every query term and accumulate, per
        document, how many query terms it matches and their tf-idf scores.

        :param query_as_list: parsed query tokens
        :return: dict mapping doc_id -> [matched_term_count, [tfidf, ...]]
        """
        relevant_docs = {}
        for word in query_as_list:
            # All the tweets containing this term.
            posting_list = self._indexer.get_term_posting_list(word)
            # The inverted-index weight is per-term, so fetch it once per
            # word instead of once per posting entry (loop-invariant hoist).
            term_weight = self._indexer.get_term_inverted_idx(word)[2]
            for doc in posting_list:
                doc_id = doc[0]
                tfidf = doc[4] * term_weight
                if doc_id in relevant_docs:
                    relevant_docs[doc_id][0] += 1
                    relevant_docs[doc_id][1].append(tfidf)
                else:
                    relevant_docs[doc_id] = [1, [tfidf]]
        return relevant_docs

    def teasarous_(self, query):
        """
        Expand the query using a thesaurus: for every term, append its first
        noun synonym and first verb synonym when available.

        :param query: list of raw query tokens
        :return: the original tokens followed by any synonyms found
        """
        expanded = list(query)
        for word in query:
            # BUG FIX: the original wrapped the WHOLE loop in a bare
            # `except:`, so one failed lookup silently aborted expansion of
            # every remaining word. Catch per word, and only Exception.
            try:
                noun_syns = list(thesaurus.synonyms(word, fileid="simN.lsp"))
                verb_syns = list(thesaurus.synonyms(word, fileid="simV.lsp"))
            except Exception:
                continue  # no thesaurus entry for this word
            if noun_syns:
                expanded.append(noun_syns[0])
            if verb_syns:
                expanded.append(verb_syns[0])
        return expanded