Example #1
    def bm25(self, query, categorized_qa):
        # Instantiate the model here only when questions are categorized; otherwise it was already instantiated in __init__()
        if args.categorize_question:
            if len(categorized_qa['cut_answers']) != 0:
                # only use this as the BM25 corpus when it is non-empty
                self.bm25_model = BM25(categorized_qa['cut_answers'])
                # print(categorized_qa['classes'])
            else:
                # if it is empty, fall back to the original corpus for BM25
                self.bm25_model = BM25(self.cut_answers)
                # print('categorized question not used')

        bm25_weights = self.bm25_model.get_scores(query)

        sorted_scores = sorted(bm25_weights, reverse=True)  # sort scores in descending order
        sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # normalize each score by query length
        max_pos = np.argsort(bm25_weights)[::-1]  # indices of scores sorted high to low (not the values themselves)

        # use max_pos to pull the actual answers out of the answer base
        if args.categorize_question:
            # when the answer source is categorized
            if len(categorized_qa['cut_answers']) != 0:
                # only use this as the answer base when it is non-empty
                answers = self.__max_pos2answers(max_pos, categorized_qa['uncut_answers'])
            else:
                # if it is empty, fall back to the original self.uncut_answers as the answer base
                answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        else:
            # when the answer source is not categorized, categorized_qa is None
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        return sorted_scores, max_pos, answers
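All of these examples target gensim's gensim.summarization.bm25.BM25, a module that was removed in gensim 4.x. As a reference point for the snippets below, here is a minimal self-contained sketch of the core pattern Example #1 builds on, assuming a late gensim 3.x release where get_scores takes only the tokenized query:

# Minimal sketch of the core pattern (assumption: gensim 3.x, where
# gensim.summarization.bm25 exists and get_scores needs no average_idf).
import numpy as np
from gensim.summarization.bm25 import BM25

cut_answers = [["paris", "is", "the", "capital", "of", "france"],
               ["berlin", "is", "a", "large", "city"]]
model = BM25(cut_answers)

query = ["capital", "of", "france"]
weights = model.get_scores(query)    # one score per corpus document
max_pos = np.argsort(weights)[::-1]  # document indices, best match first
print([cut_answers[i] for i in max_pos[:1]])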
Example #2
    def test_epsilon(self):
        """ Changing the b parameter should give consistent results """
        corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
        first_epsilon = 1.0
        second_epsilon = 2.0
        bm25 = BM25(corpus)
        words_with_negative_idfs = set(
            [word for word, idf in bm25.idf.items() if idf < 0])
        index, doc = [(index, document)
                      for index, document in enumerate(corpus)
                      if words_with_negative_idfs & set(document)][0]

        first_bm25 = BM25(corpus, epsilon=first_epsilon)
        second_bm25 = BM25(corpus, epsilon=second_epsilon)
        first_score = first_bm25.get_score(doc, index)
        second_score = second_bm25.get_score(doc, index)
        self.assertGreater(first_score, second_score)

        first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
        second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
        first_score = dict(next(iter(first_iter)))[index]
        second_score = dict(next(iter(second_iter)))[index]
        self.assertGreater(first_score, second_score)

        first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
        second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
        first_score = first_weights[index]
        second_score = second_weights[index]
        self.assertGreater(first_score, second_score)
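For context on what this test exercises: in gensim 3.x, idf = log((N - df + 0.5) / (df + 0.5)), so words occurring in more than half the documents get a negative idf, and BM25 replaces those values with epsilon * average_idf. A small worked check on the corpus above (a sketch of the formula, not gensim's code):

# Worked check: which words in the test corpus get a negative IDF
# (sketch of the gensim 3.x idf formula, not its actual code).
import math

corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
N = len(corpus)
df = {}
for doc in corpus:
    for w in set(doc):
        df[w] = df.get(w, 0) + 1

for w, n in sorted(df.items()):
    idf = math.log((N - n + 0.5) / (n + 0.5))
    print(w, round(idf, 3))  # 'cat' and 'lion' come out negative

Since the average idf of this toy corpus is itself negative, a larger epsilon pushes the replaced weights further down, which is what the assertGreater checks verify.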
Example #3
    def bm25_sim_matrix(self,
                        src_indices,
                        tgt_indices,
                        index_source=True,
                        monitor_progress=True):
        base_matrix = np.zeros((len(src_indices), len(tgt_indices)))
        if monitor_progress:
            pbar = tqdm(total=len(src_indices)
                        if index_source is True else len(tgt_indices),
                        desc="BM25 matrix")

        print(base_matrix.shape)

        src_lines = [clean_text(self.get_src_line(s)) for s in src_indices]
        tgt_lines = [clean_text(self.get_tgt_line(t)) for t in tgt_indices]
        if index_source:
            bm25 = BM25(src_lines)
        else:
            bm25 = BM25(tgt_lines)
        average_idf = sum(float(val)
                          for val in bm25.idf.values()) / len(bm25.idf)

        for src_id, src_line in enumerate(src_lines):
            for tgt_id, tgt_line in enumerate(tgt_lines):
                if index_source:
                    bm25_res = bm25.get_score(tgt_line, src_id, average_idf)
                else:
                    bm25_res = bm25.get_score(src_line, tgt_id, average_idf)
                base_matrix[src_id, tgt_id] = bm25_res
            if monitor_progress:
                pbar.update()
        return base_matrix / np.max(base_matrix)
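A possible simplification for Example #3, assuming the same older gensim API the snippet already uses, where get_scores(document, average_idf) scores one query against every indexed document at once: the inner loop can collapse into one call per column (shown here for index_source=True, reusing the example's own names):

# Sketch: fill a whole column per get_scores call instead of one
# get_score call per pair (assumes index_source=True, so bm25 indexes
# src_lines and each tgt line is a query over all of them).
for tgt_id, tgt_line in enumerate(tgt_lines):
    base_matrix[:, tgt_id] = bm25.get_scores(tgt_line, average_idf)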
Example #4
    def test_b(self):
        """ Changing the b parameter should give consistent results """
        corpus = common_texts
        index = 0
        doc = corpus[index]
        first_b = 1.0
        second_b = 2.0

        first_bm25 = BM25(corpus, b=first_b)
        second_bm25 = BM25(corpus, b=second_b)
        first_score = first_bm25.get_score(doc, index)
        second_score = second_bm25.get_score(doc, index)
        self.assertLess(first_score, second_score)

        first_iter = iter_bm25_bow(corpus, b=first_b)
        second_iter = iter_bm25_bow(corpus, b=second_b)
        first_score = dict(next(iter(first_iter)))[index]
        second_score = dict(next(iter(second_iter)))[index]
        self.assertLess(first_score, second_score)

        first_weights = get_bm25_weights(corpus, b=first_b)
        second_weights = get_bm25_weights(corpus, b=second_b)
        first_score = first_weights[index]
        second_score = second_weights[index]
        self.assertLess(first_score, second_score)
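For reference on what b controls: in the textbook BM25 term weight, b in [0, 1] sets how strongly a document's score is normalized by its length relative to the average document length (the test's b=2.0 is deliberately out of range). A sketch of the standard formula, not gensim's exact code:

# Sketch of the classic BM25 term weight (textbook formula, not gensim's code).
def bm25_term_score(tf, idf, doc_len, avg_doc_len, k1=1.5, b=0.75):
    norm = 1 - b + b * (doc_len / avg_doc_len)
    return idf * tf * (k1 + 1) / (tf + k1 * norm)

# For a document twice the average length, raising b shrinks the score:
print(bm25_term_score(tf=3, idf=1.0, doc_len=20, avg_doc_len=10, b=0.0))  # ~1.667
print(bm25_term_score(tf=3, idf=1.0, doc_len=20, avg_doc_len=10, b=1.0))  # 1.25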
Example #5
    def __init__(self, context_filters: Filters, author_filters: Filters,
                 num_filters: int, embed_size: int, num_layers: int,
                 path_to_weights: PathOrStr, data: BaseData,
                 evaluate: bool = True, show_attention: bool = False):
        self.data = data
        self.context, self.title, self.authors = self.data.cntxt, self.data.ttl, self.data.aut
        self.pad = self.title.vocab.stoi['<pad>']
        self.criterion = nn.CrossEntropyLoss(ignore_index = self.pad, reduction="none")

        self.model = NeuralCitationNetwork(context_filters=context_filters,
                                            author_filters=author_filters,
                                            context_vocab_size=len(self.context.vocab),
                                            title_vocab_size=len(self.title.vocab),
                                            author_vocab_size=len(self.authors.vocab),
                                            pad_idx=self.pad,
                                            num_filters=num_filters,
                                            authors=True, 
                                            embed_size=embed_size,
                                            num_layers=num_layers,
                                            hidden_size=num_filters,
                                            dropout_p=0.2,
                                            show_attention=show_attention)
        self.model.to(DEVICE)
        self.model.load_state_dict(torch.load(path_to_weights, map_location=DEVICE), strict=False)
        self.model.eval()
        logger.info(self.model.settings)

        self.eval = evaluate
        self.show_attention = show_attention

        # instantiate examples, corpus and bm25 depending on mode
        logger.info(f"Creating corpus in eval={self.eval} mode.")
        if self.eval:
            self.examples = data.test.examples
            logger.info(f"Number of samples in BM25 corpus: {len(self.examples)}")
            self.corpus = list(set([tuple(example.title_cited) for example in self.examples]))
            self.bm25 = BM25(self.corpus)
            self.context_cited_indices = self._get_context_title_indices(self.examples)
        else:
            self.examples = data.train.examples + data.train.examples + data.train.examples
            logger.info(f"Number of samples in BM25 corpus: {len(self.examples)}")
            self.corpus = list(set([tuple(example.title_cited) for example in self.examples]))
            self.bm25 = BM25(self.corpus)
            
            # load mapping to give proper recommendations
            with open("assets/title_tokenized_to_full.pkl", "rb") as fp:
                self.title_to_full = pickle.load(fp)

        
        with open("assets/title_to_aut_cited.pkl", "rb") as fp:
            self.title_aut_cited = pickle.load(fp)
Example #6
def bm25(k, queries, collection, idf_base):
    """
    Computes BM25 scores for each query against the documents in the collection
    :param k: number of top-ranked documents to return per query
    :param queries: all query vectors
    :param collection: all collection vectors
    :param idf_base: idf store identifier ("above_threshold", "below_threshold", or "all")
    :return: array of the top-k document ids for each query
    """

    bm25 = BM25(collection)

    if not os.path.exists(
            "public_data/vocab/bm25_word2weight_%s.pkl" % idf_base):
        with open("public_data/vocab/bm25_word2weight_%s.pkl" % idf_base,
                  'wb') as out:
            pickle.dump(bm25.idf, out, protocol=4)

    avg_idf = sum(bm25.idf.values()) / len(bm25.idf.values())
    top_ids = []
    for query in queries:
        scores = bm25.get_scores(query, avg_idf)
        ids = sorted(range(len(scores)), key=lambda j: scores[j],
                     reverse=True)[:k]
        top_ids.append(ids)
    top_ids = np.array(top_ids)

    return top_ids
Example #7
    def getContext(self, sentences, question):
        documents = []
        for sent in sentences:
            documents.append(self.tokenize(sent))

        bm25 = BM25(documents)

        scores = bm25.get_scores(self.tokenize(question))
        # rank sentence indices by BM25 score, highest first
        ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        final_results = ranked_indices[:self.numberOfResults]

        # join the top-ranked sentences, separated by spaces
        questionContext = " ".join(" ".join(documents[i]) for i in final_results)
        return questionContext
Example #8
def calculate_scores(question, candidate_set):
    model = BM25(candidate_set)
    average_idf = sum(float(val) for val in model.idf.values()) / len(model.idf)
    scores = []
    for idx in range(len(candidate_set)):
        scores.append(model.get_score(question, idx, average_idf))
    return scores
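The examples in this collection mix two generations of the gensim API: older releases require an average_idf argument to get_score/get_scores (as here), later 3.x releases dropped it, and the whole gensim.summarization.bm25 module was removed in 4.x. A version-tolerant sketch (the helper name is hypothetical):

# Hypothetical helper that works across gensim 3.x variants by trying the
# newer two-argument signature first and falling back to the older one.
def bm25_score_compat(model, document, index):
    try:
        return model.get_score(document, index)  # later gensim 3.x
    except TypeError:
        average_idf = sum(model.idf.values()) / len(model.idf)
        return model.get_score(document, index, average_idf)  # older gensim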
Example #9
def get_other_results(queries, qml_rankings, num_ranks=None):
    document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache('tok_docs.json',
                                     lambda: tokenizer.process_all(documents))
    tokenized_queries = tokenizer.process_all(queries)
    bm25 = BM25(tokenized_documents)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    bm25_rankings = []
    glove_rankings = []
    rm3_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    docs_lms = _calc_docs_lms(bm25.df, bm25.f)
    for q, qml_ranking in progressbar(zip(tokenized_queries, qml_rankings)):
        bm25_rankings.append(
            _get_bm25_ranking(bm25, qml_ranking, q, average_idf=average_idf))
        glove_rankings.append(
            _get_glove_ranking(glove, tokenized_documents, qml_ranking, q))
        rm3_rankings.append(_get_rm3_ranking(docs_lms, bm25.f, qml_ranking, q))
    return bm25_rankings, glove_rankings, rm3_rankings
Example #10
        def build_index():
            # build bm25 index
            corpus = []
            total = 0.0
            SIA = SentimentIntensityAnalyzer()
            for rid in reviews:
                self.review_ids.append(rid)
                if 'text' in reviews[rid]:
                    sent = reviews[rid]['text']
                else:
                    sent = reviews[rid]['review']

                tokens = gensim.utils.simple_preprocess(sent.lower())
                reviews[rid]['sentiment'] = SIA.polarity_scores(sent)['compound']

                corpus.append(tokens)
                self.position_index[rid] = {}
                for (pos, token) in enumerate(tokens):
                    if token not in self.position_index[rid]:
                        self.position_index[rid][token] = [pos]
                    else:
                        self.position_index[rid][token].append(pos)

                for ext in reviews[rid]['extractions']:
                    attr = ext['attribute']
                    if attr not in self.idf:
                        self.idf[attr] = 1
                    else:
                        self.idf[attr] += 1
                    total += 1
            bm25 = BM25(corpus)
            for attr in self.idf:
                self.idf[attr] = math.log2(total / self.idf[attr])
            return bm25
Example #11
 def __init__(self, raw_corpus: list, processed_corpus: list = None):
     self.__pipeline = spacy.load('en_core_web_sm',
                                  disable=["parser", "tagger", "ner"])
     self.raw_corpus = raw_corpus
     self.processed_corpus = (self.__pre_process_corpus()
                              if processed_corpus is None else processed_corpus)
     self.raw_corpus = np.array(self.raw_corpus)
     self.model = BM25(self.processed_corpus)
Example #12
 def load_bm25(self):
     """ Convert the candidates' descriptions into a corpus and load it into a BM25 object, which is used for keyword ranking.
     """
     self.corpus = []
     for candidate in self.candidates:
         self.corpus.append(text2tokens(candidate['description']))
     self.bm25 = BM25(self.corpus)
     self.average_idf = sum(
         float(val) for val in self.bm25.idf.values()) / len(self.bm25.idf)
Example #13
def main():
    docs = get_list_of_docs('../test_files')
    corpus, doc_titles = get_words(docs)
    bm = BM25(corpus)
    while True:
        query = input('Query:')
        scores = bm.get_scores(query.split(' '))
        print(list(enumerate(scores)))
        print(get_ranked_docs(doc_titles, scores))
Example #14
def train_bm25Model(Dataset: List[Sample], Stopwords: set) -> BM25:
    corpus = []
    # remove stopwords
    for cur_sample in Dataset:
        corpus.append([word for word in cur_sample.source_text.split(" ") if word not in Stopwords])
        corpus.append([word for word in cur_sample.target_text.split(" ") if word not in Stopwords])
    # feed into the model; corpus is a list of token lists
    bm25Model = BM25(corpus)
    return bm25Model
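A hedged usage sketch for the helper above, assuming Sample exposes whitespace-tokenized source_text/target_text strings and a gensim version where get_scores takes only the query:

# Usage sketch (assumptions: a populated 'dataset' of Sample objects is
# hypothetical here, and get_scores follows the gensim 3.x signature).
stopwords = {"the", "a", "of"}
bm25Model = train_bm25Model(dataset, stopwords)
query = [w for w in "what is bm25".split(" ") if w not in stopwords]
scores = bm25Model.get_scores(query)  # one score per corpus entry (two per Sample)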
Example #15
 def __init__(self, texts):
     assert isinstance(texts, list) and len(texts) > 0 and isinstance(texts[0], list)
     time_init = time.time()
     # self.dct = corpora.Dictionary(texts)
     # self.corpus_ = [self.dct.doc2bow(text) for text in texts]
     self.model = BM25(texts)
     self.doc_num = len(texts)
     self.avg_idf = sum(map(lambda k: float(self.model.idf[k]), self.model.idf.keys())) / len(self.model.idf.keys())
     logger.warning("Build BM25 model use time %s", time.time()-time_init)
Example #16
 def __init__(self, corpus=None):
     self.corpus = corpus
     if self.corpus is not None:
         self.tfidf_vectorizer = self.get_tfidf_vectorizer(self.corpus)
         self.corpus_vec = self.tfidf_vectorizer.transform(self.corpus)
         self.bm25_model = BM25([s.split() for s in corpus])
         self.average_idf = sum(
             map(lambda k: float(self.bm25_model.idf[k]),
                 self.bm25_model.idf.keys())) / len(
                     self.bm25_model.idf.keys())
Example #17
    def __init__(self, docs):
        self.docs = []
        for doc in tqdm(docs, desc='LSI process docs', total=len(docs)):
            self.docs.append(clean_claim_description(doc, True).split())
        self.bm25_model = BM25(self.docs)

        docs_str = [' '.join(d) for d in self.docs]
        tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
        tf_idf_vectorizer.fit(docs_str)
        self.avg_idf = tf_idf_vectorizer.idf_.mean()
Example #18
    def process(self, message, **kwargs):
        """Process an incoming message.

        This is the components chance to process an incoming
        message. The component can rely on
        any context attribute to be present, that gets created
        by a call to :meth:`components.Component.pipeline_init`
        of ANY component and
        on any context attributes created by a call to
        :meth:`components.Component.process`
        of components previous to this one."""
        sentence = message.text
        question_vectors = [
            self.query(sentence)
            for sentence in self.get_tokenized_qa_words([sentence])
        ]
        intent = message.data["intent"]["name"]
        intent_num = self.base_intents.index(intent)
        intent_index = np.argwhere(self.intents_vec == intent_num).reshape(-1)
        intent_question = self.q_a_v[intent_index]
        sim_mat = cosine_similarity(question_vectors, intent_question)
        top_posi = np.argsort(sim_mat, axis=1)[0][-10:][::-1]
        full_index = intent_index[top_posi]
        sim_value = sim_mat[0, top_posi]
        if "full_index" not in message.data:
            message.set("full_index", full_index.tolist())
        else:
            exist = message.get("full_index")
            full_index = list(set(exist + full_index.tolist()))
            message.set("full_index", full_index)
        sim_value = json.dumps(sim_value.tolist(), ensure_ascii=False)
        message.set("top_similar_value", sim_value, add_to_output=True)

        # ========================================
        sim_mat_part = cosine_similarity(question_vectors,
                                         self.q_a_v[full_index])[0]

        candidate_sentences = self.questions[full_index].tolist()
        candidate_sentences.insert(0, sentence)
        sentence_tokens = self.get_tokenized_qa_words(candidate_sentences)
        bm25_object = BM25(sentence_tokens[1:])
        bm25_score = np.array(bm25_object.get_scores(sentence_tokens[0]))
        feature_score = np.array([sim_mat_part, bm25_score]).T
        scaler = MinMaxScaler()
        scaler.fit(feature_score)
        feature_score = scaler.transform(feature_score)
        weight = np.array([0.8, 0.2])
        final_score = (feature_score * weight).sum(axis=1)
        # final_score = sim_mat_part + bm25_score

        best_match_question = self.questions[full_index[np.argmax(final_score)]]
        message.set("best_match", best_match_question, add_to_output=True)
        responses = self.qa_map.get(best_match_question)
        message.set("response", responses, add_to_output=True)
Example #19
def main():
  document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
  document_title_to_id = create_id_lookup(document_lookup.keys())
  document_id_to_title = _.invert(document_title_to_id)
  doc_ids = range(len(document_id_to_title))
  documents = [document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids]
  tokenizer = Tokenizer()
  tokenized_documents = read_cache('tok_docs.json',
                                   lambda: tokenizer.process_all(documents))
  bm25 = BM25(tokenized_documents)
  with open('./doc_word_idf.json', 'w+') as fh:
    json.dump(bm25.idf, fh)
Example #20
def wrapper(query):
    #print("reading generated csv")

    print("pre processing query")
    query = pre_processing(query)

    print("bm25 generation1")
    average_idf1 = sum(float(val)
                       for val in bm251.idf.values()) / len(bm251.idf)
    bm25_scores1 = bm251.get_scores(query.split(), average_idf1)

    print("bm25 generation2")

    average_idf2 = sum(float(val)
                       for val in bm252.idf.values()) / len(bm252.idf)
    bm25_scores2 = bm252.get_scores(query.split(), average_idf2)

    arr = np.array([(0.7 * bm25_scores1[i] + bm25_scores2[i]) / 2
                    for i in range(0, df1.shape[0])])
    top = arr.argsort(axis=0)[-10:][::-1]  # indices of the 10 highest scores, best first
    print("getting confidences")
    confidences = [arr[i] for i in top]
    resolutions = [df1.Resolution[i] for i in top]
    score = []
    print("Using page rank algorithm to swap the retrieved resolutions")
    print("\n\n")
    dict_d = {}
    for i in range(len(resolutions)):
        corpus = [
            resolutions[j].split() for j in range(len(resolutions)) if j != i
        ]
        bm25 = BM25(corpus)
        average_idf = sum(float(val)
                          for val in bm25.idf.values()) / len(bm25.idf)
        bm25_scores = bm25.get_scores(resolutions[i].split(), average_idf)
        score.append(sum(bm25_scores))

    score = [(score[i] + confidences[i]) / 2 for i in range(len(score))]

    for i in range(len(resolutions)):
        dict_d[score[i]] = resolutions[i]

    final_confidences = sorted(dict_d.keys(), reverse=True)
    final_resolutions = []
    for i in final_confidences:
        final_resolutions.append(dict_d[i])

    return (confidences, final_resolutions)
Example #21
    def _init_bm25(self):
        questions = pd.read_csv('../data/question_id.csv')
        corpus = questions.wid.tolist()
        bm25_dim = len(corpus)
        corpus = [s.split() for s in corpus]
        from gensim.summarization.bm25 import get_bm25_weights, BM25
        bm25_model = BM25(corpus)

        bm25_avg_idf = sum(
            map(lambda k: float(bm25_model.idf[k]),
                bm25_model.idf.keys())) / len(bm25_model.idf.keys())

        return bm25_model, bm25_dim, bm25_avg_idf
Example #22
    def __fill_bm25_scores(self):
        """
        Calculates BM25 scores of each document in corpus for a query
        """
        corpus = [Input(doc).tokens for doc in self.data[self.column]]

        bm25 = BM25(corpus)
        average_idf = sum(float(val)
                          for val in bm25.idf.values()) / float(len(bm25.idf))
        query = Input(self.query).tokens

        scores = bm25.get_scores(query, average_idf)
        self.data[CONST.COL_BM25] = scores
Example #23
def generate_conv_data(in_path, subreddit):
    df_conv = pd.read_csv(in_path, lineterminator="\n")
    df_conv = df_conv[df_conv['subreddit'] == subreddit]
    df_conv['query'] = df_conv['query'].astype(str)
    df_conv['relevant_response'] = df_conv['relevant_response'].astype(str)

    documents = np.array(df_conv['relevant_response'])
    corpus = [context.split(" ") for context in documents]
    bm25 = BM25(corpus)

    cache = {}
    instances = []
    index_subreddit = []

    for idx, r in tqdm(list(df_conv.iterrows())):
        if r['query'] in cache:
            max_positions = cache[r['query']]
        else:
            scores = np.array(bm25.get_scores(str(r['query']).split(" ")))
            max_positions = heapq.nlargest(negative_samples,
                                           range(len(scores)),
                                           scores.take)
            cache[r['query']] = max_positions

        while idx in max_positions:
            new_doc = random.sample(range(len(documents)), 1)[0]
            max_positions[max_positions.index(idx)] = new_doc

        candidates = documents[max_positions]

        instances.append([
            r['query'],
            r['relevant_response']
        ] + list(candidates))
        index_subreddit.append([idx, r['subreddit']])

    # random.shuffle(instances)  # <-- this shouldn't be here: instances from the same dialogue would be spread over different data splits
    train, valid, test = (instances[0: int(0.8*len(instances))],
                        instances[int(0.8*len(instances)) : int(0.9*len(instances))],
                        instances[int(0.9*len(instances)):])

    cols = ["query", "relevant_doc"] + \
           ["non_relevant_"+str(i+1) for i in range(negative_samples)]

    train, valid, test = (pd.DataFrame(train, columns=cols),
                          pd.DataFrame(valid, columns=cols),
                          pd.DataFrame(test, columns=cols))

    indexes = pd.DataFrame(index_subreddit, columns = ['index', 'subreddit'])

    return train, valid, test, indexes
Example #24
def query_to_document(query):
    """ Takes string question and returns the name of the document which the question is likely to be present in"""

    # dataframes of per-document scores and ranks from each retrieval model
    bm25_df = BM25(query).head(n=50)
    tfidf_df = TFIDF(query).head(n=50)
    doc2vec_df = Doc2Vec(query).head(n=50)

    # combining all the dataframes
    final_df = pd.merge(pd.merge(bm25_df,
                                 tfidf_df,
                                 on=['Document'],
                                 how='outer'),
                        doc2vec_df,
                        on=['Document'],
                        how='outer')
    final_df.fillna(0, inplace=True)

    # Normalising the scores between 0 and 1
    bm25_normalised = (final_df.Score_BM25 - final_df.Score_BM25.min()) / (
        final_df.Score_BM25.max() - final_df.Score_BM25.min())
    tfidf_normalised = (final_df.Score_TFIDF - final_df.Score_TFIDF.min()) / (
        final_df.Score_TFIDF.max() - final_df.Score_TFIDF.min())
    doc2vec_normalised = (
        final_df.Score_Doc2Vec - final_df.Score_Doc2Vec.min()) / (
            final_df.Score_Doc2Vec.max() - final_df.Score_Doc2Vec.min())

    # Getting the total score based on the previous overall accuracy
    final_df['total_score'] = (0.01243557 * bm25_normalised
                               + 0.29682442 * tfidf_normalised
                               - 0.01673123 * doc2vec_normalised)

    final_df['bm25_normalised'] = bm25_normalised
    final_df['tfidf_normalised'] = tfidf_normalised
    final_df['doc2vec_normalised'] = doc2vec_normalised

    final_document_list = final_df.Document.values[:]
    final_scores = np.array(
        final_df.loc[:, ['bm25_normalised', 'tfidf_normalised', 'doc2vec_normalised']])

    prediction_scores = []
    for document, scores in zip(final_document_list, final_scores):
        scores = np.array(scores).reshape(1, 3)
        prediction = model_perceptron.predict(scores)
        prediction_scores.append(prediction)
    return final_document_list[np.array(prediction_scores).argmax()]
Example #25
    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
        self.corpus = []
        self.bug_ids = []

        for bug in bugzilla.get_bugs():
            self.corpus.append(self.text_preprocess(self.get_text(bug)))
            self.bug_ids.append(bug["id"])

        indexes = list(range(len(self.corpus)))
        random.shuffle(indexes)
        self.corpus = [self.corpus[idx] for idx in indexes]
        self.bug_ids = [self.bug_ids[idx] for idx in indexes]

        self.model = BM25(self.corpus)
Example #26
 def __init__(self, tokenizer, corpus, idxs2id):
     """
     Parameters
     ----------
     tokenizer : gensim.corpora.Dictionary
         Word tokenizer.
     corpus : gensim.corpora.mmcorpus.MmCorpus
         Bag-of-words formatted corpus of documents.
     idxs2id : dict
         Mapping from corpus indexes to document identifiers.
     """
     self.preprocessor = TextPreprocessor()
     self.tokenizer = tokenizer
     self.corpus = corpus
     self.internal_engine = BM25(self.corpus)
     self.idxs2id = idxs2id
     print("BM25 engine loaded")
Example #27
 def get_g_bm25_model(self, text_pool):
     """
     2d list
     :param text_pool:
     :return:
     """
     # text_pool = [line.split() for line in raw_text]
     word_freq = defaultdict(int)
     for line in text_pool:
         for word in line:
             word_freq[word] += 1
     text_pool = [[token for token in line if word_freq[token] > 1]
                  for line in text_pool]
     g_bm25_model = BM25(text_pool)
     return g_bm25_model
Example #28
    def start(self, model, data, sess, valid_only=False):
        stemmer = PorterStemmer()

        texts = []
        for q_a in data.archive.answers + data.archive.questions:
            q_a.metadata['tokens_stemmed'] = [stemmer.stem(t.text) for t in q_a.tokens]
            texts.append(q_a.metadata['tokens_stemmed'])

        bm25 = BM25(texts)
        average_idf = sum(bm25.idf.values()) / float(len(bm25.idf))
        # average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / float(len(bm25.idf.keys()))

        self.score(data.archive.valid, bm25, average_idf, data.archive.answers)
        for test in data.archive.test:
            self.score(test, bm25, average_idf, data.archive.answers)
Example #29
def get_bm25_rankings(question_corpora, query_doc, question_part):
    parsed_query_doc = query_doc.split()
    parsed_qcorpora = [strip_text(question[question_part]).split()
                       for question in question_corpora]
    dictionary = corpora.Dictionary(parsed_qcorpora)
    tokenized_qcorpora = [dictionary.doc2bow(doc) for doc in parsed_qcorpora]
    tokenized_query_doc = dictionary.doc2bow(parsed_query_doc)
    bm25 = BM25(tokenized_qcorpora)
    average_idf = sum(map(lambda k: float(bm25.idf[k]),
                          bm25.idf.keys())) / len(bm25.idf.keys())
    return bm25.get_scores(tokenized_query_doc, average_idf)
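Note that, unlike the other examples, this one hands doc2bow output to BM25, so each (token_id, count) tuple is treated as a single opaque token. A sketch of the more common pattern, scoring plain token lists directly and reusing the example's strip_text helper, under the same older-API assumption:

# Sketch of the more common pattern: build BM25 over plain token lists
# (same older-API assumption, where get_scores takes average_idf).
def get_bm25_rankings_plain(question_corpora, query_doc, question_part):
    parsed_qcorpora = [strip_text(q[question_part]).split() for q in question_corpora]
    bm25 = BM25(parsed_qcorpora)
    average_idf = sum(bm25.idf.values()) / len(bm25.idf)
    return bm25.get_scores(query_doc.split(), average_idf)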
Example #30
    def apply_fun(df):
        df.columns = ['d_id', 'key', 'doc']
        query_id_group = df.groupby(['d_id'])
        bm_list = []
        for name, group in tqdm(query_id_group):
            corpus = group['doc'].values.tolist()
            corpus = [sentence.strip().split() for sentence in corpus]
            query = group['key'].values[0].strip().split()
            bm25Model = BM25(corpus)
            average_idf = sum(
                map(lambda k: float(bm25Model.idf[k]),
                    bm25Model.idf.keys())) / len(bm25Model.idf.keys())
            bmscore = bm25Model.get_scores(query, average_idf)
            bm_list.extend(bmscore)

        return bm_list