class LocalPrfKldWeightedBM25ExpanderRanker(Ranker):
    """BM25 ranker with local pseudo-relevance-feedback query expansion.

    A first ranking pass is run with the plain query.  The token
    distribution of the ``k`` best documents is then compared against the
    token distribution of the whole collection using the per-term
    Kullback-Leibler contribution, and the ``top_w`` highest scoring terms
    are appended (with their score as weight) to the query before the
    internal weighted BM25 ranker is run a second time.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, conf: Configuration,
                 internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame,
                 counter: Dict[Text, int],
                 probs_t_dc: Dict[Text, float],
                 k: Optional[int] = 10,
                 top_w: Optional[int] = 10):
        """Store the collaborators and the expansion parameters.

        :param conf: configuration used for preprocessing and text access
        :param internal_ranker: weighted BM25 ranker used for both passes
        :param dataset: data frame holding a ``fileindex`` column
        :param counter: collection-wide token counts
        :param probs_t_dc: collection-wide token probabilities
        :param k: number of top documents used as feedback set
        :param top_w: number of expansion terms appended to the query
        """
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        self.top_w = top_w

    def _expand_query(self, query: Text,
                      rank_cutoff: Optional[int]) -> List[Tuple[Text, float]]:
        """Preprocess ``query`` and return it extended by the expansion terms."""
        logging.info('LocPrfKldWeightedRanker: Ranking for "{}"'.format(query))
        preproc_query = LocalPrfKldWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Original query terms all carry weight 1.
        preproc_query_w = [(token, 1) for token in preproc_query]
        ranking = self.ranker.rank(preproc_query_w, rank_cutoff=rank_cutoff)
        probs_t_dr = self.get_probs_from_top([index for index, _ in ranking])
        all_cands = self.get_top_expansion_cands(probs_t_dr,
                                                 rm_tokens=preproc_query)
        preproc_query_w.extend(all_cands[:self.top_w])
        return preproc_query_w

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank documents for ``query`` after expanding it.

        ``rank_threshold`` is accepted for interface compatibility but is
        not used by this implementation.
        """
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank(expanded_query, rank_cutoff=rank_cutoff)

    def rank_gs(self, query: Text, goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Like :meth:`rank`, but the second pass uses ``rank_gs`` of the
        internal ranker together with ``goldstandard``.
        """
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank_gs(expanded_query, goldstandard,
                                   rank_cutoff=rank_cutoff)

    def get_top_expansion_cands(
            self, probs_t_dr: Dict[Text, float],
            rm_tokens: Optional[List[Text]] = None
    ) -> List[Tuple[Text, float]]:
        """Score every feedback term and return them best-first.

        :param probs_t_dr: term probabilities within the feedback documents
        :param rm_tokens: terms to leave out (typically the query terms);
            ``None`` means nothing is excluded
        :return: ``(term, score)`` pairs sorted by descending score
        """
        # ``rm_tokens`` defaults to None; a membership test on None would
        # raise TypeError, so normalise it to an (empty) set first.
        excluded = set(rm_tokens) if rm_tokens else set()
        scores_kld = {
            word: LocalPrfKldWeightedBM25ExpanderRanker.kullback_leibler_divergence(
                prob_t_dr=prob_t_dr, prob_t_dc=self.probs_t_dc[word])
            for word, prob_t_dr in probs_t_dr.items()
            if word not in excluded
        }
        return sorted(scores_kld.items(), key=lambda item: item[1],
                      reverse=True)

    def get_probs_from_top(self, indexes: List[int]):
        """Return the relative token frequencies of the first ``k`` documents.

        :param indexes: ranked ``fileindex`` values, best first
        :return: mapping ``token -> count / total number of tokens``
        """
        top_indexes = indexes[:self.k]
        results = self.dataset.loc[
            self.dataset['fileindex'].isin(top_indexes), ]
        words = []
        for _, data in results.iterrows():
            words.extend(Ranker.get_text(self.conf, data))
        total = len(words)
        # An empty ``words`` list yields an empty Counter, so no division
        # by zero can happen here.
        return {word: count / total for word, count in Counter(words).items()}

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        """Return ``prob_t_dr * log(prob_t_dr / prob_t_dc)``."""
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        """Do nothing; this ranker writes no state of its own."""
        pass

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True
             ) -> "LocalPrfKldWeightedBM25ExpanderRanker":
        """Build the ranker from the dataset described by ``conf``.

        ``persist`` is forwarded to ``BM25OkapiWeightedRanker.load``;
        ``force`` is accepted but not used here.
        """
        dataset = LocalPrfKldWeightedBM25ExpanderRanker.extractor.load_dataset(
            conf=conf)
        # File names look like "<number>.<ext>"; the numeric stem is the id.
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(
            lambda x: int(x.split('.')[0]))
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = LocalPrfKldWeightedBM25ExpanderRanker.create_counter(
            conf, dataset)
        return LocalPrfKldWeightedBM25ExpanderRanker(
            conf=conf, internal_ranker=internal_ranker, dataset=dataset,
            counter=counter, probs_t_dc=probs)

    @staticmethod
    def create_counter(conf: Configuration, dataset: pd.DataFrame) -> \
            Tuple[Dict[Text, int], Dict[Text, float]]:
        """Count all tokens of the collection.

        :return: ``(counts, probabilities)`` where the probability of a
            token is its count divided by the total number of tokens
        """
        counter = {}
        total = 0
        for _, data in dataset.iterrows():
            tokens = Ranker.get_text(conf, data)
            total += len(tokens)
            for token in tokens:
                counter[token] = counter.get(token, 0) + 1
        probs = {token: count / total for token, count in counter.items()}
        return counter, probs

    def get_name(self):
        """Return the identifier of this ranker."""
        return Ranker.R_PRF_KLD_WEIGHTED_BM25
class LocalPrfKldCatWeightedBM25ExpanderRanker(Ranker):
    """BM25 ranker with per-category pseudo-relevance-feedback expansion.

    Works like the non-categorised KLD expander, but token statistics are
    kept separately for every text category (data-frame column) in
    ``cats`` and a configurable number of expansion terms is taken from
    each category.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, conf: Configuration,
                 internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame,
                 counter: Dict[Text, Dict[Text, int]],
                 probs_t_dc: Dict[Text, Dict[Text, float]],
                 k: Optional[int] = 10,
                 top_w_per_cat: Optional[Dict[Text, int]] = None,
                 cats: Optional[List[Text]] = None):
        """Store the collaborators and the expansion parameters.

        :param conf: configuration used for preprocessing
        :param internal_ranker: weighted BM25 ranker used for both passes
        :param dataset: data frame with a ``fileindex`` column and one
            token-list column per category
        :param counter: per-category collection-wide token counts
        :param probs_t_dc: per-category collection-wide token probabilities
        :param k: number of top documents used as feedback set
        :param top_w_per_cat: number of expansion terms per category;
            defaults to 2 for each ``Extractor.DATA_*`` segment
        :param cats: category (column) names; a fixed default list is used
            when omitted
        """
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        if not top_w_per_cat:
            top_w_per_cat = {
                Extractor.DATA_ACTIVITY_NAME: 2,
                Extractor.DATA_TEXT_VISIBLE: 2,
                Extractor.DATA_TEXT_INVISIBLE: 2,
                Extractor.DATA_RES_IDS_VISIBLE: 2,
                Extractor.DATA_RES_IDS_INVISIBLE: 2,
                Extractor.DATA_ICON_IDS: 2
            }
        self.top_w_per_cat = top_w_per_cat
        if not cats:
            # NOTE(review): these literals must be keys of ``top_w_per_cat``;
            # whether they equal the Extractor.DATA_* constants cannot be
            # verified from this file -- confirm.
            cats = [
                'text_activity_name', 'text_visible', 'text_invisible',
                'text_res_ids', 'text_icon_ids'
            ]
        self.cats = cats

    def _expand_query(self, query: Text,
                      rank_cutoff: Optional[int]) -> List[Tuple[Text, float]]:
        """Preprocess ``query`` and return it extended by the expansion terms."""
        logging.info(
            'LocPrfKldCatWeightedRanker: Ranking for "{}"'.format(query))
        preproc_query = LocalPrfKldCatWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Original query terms all carry weight 1.
        preproc_query_w = [(token, 1) for token in preproc_query]
        ranking = self.ranker.rank(preproc_query_w, rank_cutoff=rank_cutoff)
        probs_t_dr = self.get_probs_from_top([index for index, _ in ranking])
        top_cands = self.get_top_expansion_cands_per_cat(
            probs_t_dr, self.probs_t_dc, rm_tokens=preproc_query,
            ensure_unique=True)
        preproc_query_w.extend(top_cands)
        return preproc_query_w

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank documents for ``query`` after expanding it.

        ``rank_threshold`` is accepted for interface compatibility but is
        not used by this implementation.
        """
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank(expanded_query, rank_cutoff=rank_cutoff)

    def rank_gs(self, query: Text, goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Like :meth:`rank`, but the second pass uses ``rank_gs`` of the
        internal ranker together with ``goldstandard``.
        """
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank_gs(expanded_query, goldstandard,
                                   rank_cutoff=rank_cutoff)

    def get_top_expansion_cands_per_cat(
            self,
            probs_t_dr: Dict[Text, Dict[Text, float]],
            probs_t_dc: Dict[Text, Dict[Text, float]],
            rm_tokens: Optional[List[Text]] = None,
            ensure_unique: Optional[bool] = True
    ) -> List[Tuple[Text, float]]:
        """Collect the best expansion terms of every category.

        :param probs_t_dr: per-category term probabilities of the feedback set
        :param probs_t_dc: per-category term probabilities of the collection
        :param rm_tokens: terms to leave out (typically the query terms)
        :param ensure_unique: when true every term is returned at most
            once, carrying the highest score it obtained in any category
        :return: ``(term, score)`` pairs in category order
        :raises ValueError: if both dictionaries do not share the same keys
        """
        if set(probs_t_dr.keys()) != set(probs_t_dc.keys()):
            raise ValueError(
                'ProbR and ProbC dictionaries do not contain the same categories'
            )
        top_words = []
        for name in self.cats:
            top_words_cat = self.get_top_expansion_cands(
                probs_t_dr=probs_t_dr[name],
                probs_t_dc=probs_t_dc[name],
                rm_tokens=rm_tokens)
            top_words.extend(top_words_cat[:self.top_w_per_cat[name]])
        if ensure_unique:
            # De-duplicate on the term itself.  A set of (term, score)
            # tuples would keep a term twice whenever two categories score
            # it differently and would also scramble the order.
            best_scores = {}
            for token, score in top_words:
                if token not in best_scores or score > best_scores[token]:
                    best_scores[token] = score
            top_words = list(best_scores.items())
        return top_words

    @staticmethod
    def get_top_expansion_cands(
            probs_t_dr: Dict[Text, float],
            probs_t_dc: Dict[Text, float],
            rm_tokens: Optional[List[Text]] = None
    ) -> List[Tuple[Text, float]]:
        """Score every feedback term of one category, best first.

        :param probs_t_dr: term probabilities within the feedback documents
        :param probs_t_dc: term probabilities within the collection
        :param rm_tokens: terms to leave out; ``None`` excludes nothing
        :return: ``(term, score)`` pairs sorted by descending score
        """
        # ``rm_tokens`` defaults to None; a membership test on None would
        # raise TypeError, so normalise it to an (empty) set first.
        excluded = set(rm_tokens) if rm_tokens else set()
        scores_kld = {
            word: LocalPrfKldCatWeightedBM25ExpanderRanker.kullback_leibler_divergence(
                prob_t_dr=prob_t_dr, prob_t_dc=probs_t_dc[word])
            for word, prob_t_dr in probs_t_dr.items()
            if word not in excluded
        }
        return sorted(scores_kld.items(), key=lambda item: item[1],
                      reverse=True)

    @staticmethod
    def _count_segments(
            frame: pd.DataFrame, segments: List[Text]
    ) -> Tuple[Dict[Text, Dict[Text, int]], Dict[Text, Dict[Text, float]]]:
        """Return per-segment token counts and relative frequencies of ``frame``."""
        all_counts = {name: {} for name in segments}
        all_sums = {name: 0 for name in segments}
        for _, data in frame.iterrows():
            for name in segments:
                text = data[name]
                counts = all_counts[name]
                for token in text:
                    counts[token] = counts.get(token, 0) + 1
                all_sums[name] += len(text)
        # A segment without tokens has an empty count dict, so the division
        # below is never evaluated with a zero denominator.
        all_probs = {
            name: {
                token: (count / all_sums[name])
                for token, count in counts.items()
            }
            for name, counts in all_counts.items()
        }
        return all_counts, all_probs

    def get_probs_from_top(
            self, indexes: List[int]) -> Dict[Text, Dict[Text, float]]:
        """Return per-category token frequencies of the first ``k`` documents.

        :param indexes: ranked ``fileindex`` values, best first
        """
        top_indexes = indexes[:self.k]
        results = self.dataset.loc[
            self.dataset['fileindex'].isin(top_indexes), ]
        _, all_probs = LocalPrfKldCatWeightedBM25ExpanderRanker._count_segments(
            results, self.cats)
        return all_probs

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        """Return ``prob_t_dr * log(prob_t_dr / prob_t_dc)``."""
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        """Do nothing; this ranker writes no state of its own."""
        pass

    @staticmethod
    def load(
            conf: Configuration,
            force: Optional[bool] = False,
            persist: Optional[bool] = True
    ) -> "LocalPrfKldCatWeightedBM25ExpanderRanker":
        """Build the ranker from the dataset described by ``conf``.

        ``persist`` is forwarded to ``BM25OkapiWeightedRanker.load``;
        ``force`` is accepted but not used here.
        """
        dataset = LocalPrfKldCatWeightedBM25ExpanderRanker.extractor.load_dataset(
            conf=conf)
        # File names look like "<number>.<ext>"; the numeric stem is the id.
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(
            lambda x: int(x.split('.')[0]))
        if len(conf.text_segments_used) == 1 \
                and conf.text_segments_used[0] == Extractor.DATA_ALL:
            # "All" is expanded into the explicit list of segments.
            text_segments = [
                Extractor.DATA_ACTIVITY_NAME, Extractor.DATA_TEXT_VISIBLE,
                Extractor.DATA_TEXT_INVISIBLE, Extractor.DATA_RES_IDS_VISIBLE,
                Extractor.DATA_RES_IDS_INVISIBLE, Extractor.DATA_ICON_IDS
            ]
        else:
            text_segments = conf.text_segments_used
        logging.info(
            'LocPrfKldCatWeightedRanker: Using Text Segments: {}'.format(
                text_segments))
        segment_top_ws = {
            Extractor.DATA_ACTIVITY_NAME: 2,
            Extractor.DATA_TEXT_VISIBLE: 2,
            Extractor.DATA_TEXT_INVISIBLE: 2,
            Extractor.DATA_RES_IDS_VISIBLE: 2,
            Extractor.DATA_RES_IDS_INVISIBLE: 2,
            Extractor.DATA_ICON_IDS: 2
        }
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = LocalPrfKldCatWeightedBM25ExpanderRanker.create_counter(
            dataset, text_segments=text_segments)
        return LocalPrfKldCatWeightedBM25ExpanderRanker(
            conf=conf,
            internal_ranker=internal_ranker,
            dataset=dataset,
            counter=counter,
            probs_t_dc=probs,
            cats=text_segments,
            top_w_per_cat=segment_top_ws)

    @staticmethod
    def create_counter(dataset: pd.DataFrame, text_segments: List[Text]) -> \
            Tuple[Dict[Text, Dict[Text, int]], Dict[Text, Dict[Text, float]]]:
        """Count the tokens of every segment over the whole collection.

        :return: ``(counts, probabilities)``, both keyed by segment name
        """
        return LocalPrfKldCatWeightedBM25ExpanderRanker._count_segments(
            dataset, text_segments)

    def get_name(self):
        """Return the identifier of this ranker."""
        return Ranker.R_PRF_KLD_CAT_WEIGHTED_BM25
class BoolIWCSRanker(Ranker):
    """Boolean retrieval followed by embedding-based re-ranking.

    Candidate documents are selected with a boolean OR over an inverted
    index.  Query and documents are embedded as the TF-IDF weighted sum of
    pretrained word2vec vectors, and the candidates are ordered by the
    cosine similarity between their embedding and the query embedding.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    # Location of the pretrained word2vec vectors used by ``load``.
    _WORD2VEC_PATH = '../resources/embeddings/GoogleNews-vectors-negative300.bin'

    def __init__(self, inverted_index: Dict[Text, Set[str]],
                 bool_dictionary: List[str], conf: Configuration,
                 dictionary: corpora.Dictionary,
                 bow_corpus: corpora.MmCorpus, model: TfidfModel,
                 index: SparseMatrixSimilarity,
                 index_mapping: Dict[int, int],
                 inverse_index_mapping: Dict[int, int],
                 doc_embedding: np.array, model_embedding: KeyedVectors):
        """Store all prebuilt models and lookup tables.

        :param inverted_index: token -> set of file names containing it
        :param bool_dictionary: tokens known to the inverted index
        :param conf: configuration used for preprocessing
        :param dictionary: gensim dictionary of the corpus
        :param bow_corpus: bag-of-words corpus
        :param model: TF-IDF model
        :param index: TF-IDF similarity index
        :param index_mapping: corpus position -> numeric file id
        :param inverse_index_mapping: numeric file id -> corpus position
        :param doc_embedding: one embedding row per corpus document
        :param model_embedding: word embedding model
        """
        # NOTE: on instances this attribute shadows the static method of
        # the same name; the static method stays reachable via the class.
        self.inverted_index = inverted_index
        self.bool_dictionary = bool_dictionary
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.inverse_index_mapping = inverse_index_mapping
        self.conf = conf
        self.doc_embedding = doc_embedding
        self.model_embedding = model_embedding

    @staticmethod
    def inverted_index(conf: Configuration,
                       dataset: pd.DataFrame) -> Dict[Text, Set[str]]:
        """Build the mapping token -> set of file names containing it."""
        inv_index = {}
        for _, data in dataset.iterrows():
            file_name = data['filename']
            for token in Ranker.get_text(conf, data):
                inv_index.setdefault(token, set()).add(file_name)
        return inv_index

    def _rank_bool_matches(self, query: Text) -> List[Tuple[int, float]]:
        """Return ``(file id, similarity)`` of all boolean matches, best first."""
        logging.info('nBOWRanker: Ranking for "{}"'.format(query))
        preproc_query = self.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        query_tokens = [
            word for word in preproc_query if word in self.bool_dictionary
        ]
        result = self.get_bool_or_doc_matches(query_tokens)
        # File names look like "<number>.<ext>"; map them to corpus rows.
        result_indexes = [int(index.split('.')[0]) for index in result]
        result_inv_indexes = [
            self.inverse_index_mapping[index] for index in result_indexes
        ]
        # TFIDF model
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        tfidf_query_bow = self.model[preproc_query_bow]
        tfidf_query_embedded, _ = BoolIWCSRanker.get_embedding(
            tfidf_query_bow, self.model_embedding, self.dictionary)
        rankings = self.get_ranking_over_doc_matches(result_inv_indexes,
                                                     tfidf_query_embedded)
        return [(self.index_mapping[index], sim) for index, sim in rankings]

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return at most ``rank_cutoff`` matching documents, best first.

        A query without any token known to the inverted index yields an
        empty list.  ``rank_threshold`` is accepted but not used.
        """
        rankings = self._rank_bool_matches(query)
        return rankings[:min(len(rankings), rank_cutoff)]

    def rank_gs(self, query: Text, goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank only the documents contained in ``goldstandard``.

        Gold-standard documents found by the boolean retrieval come first
        in similarity order; those not retrieved are appended afterwards
        with the value 1.  ``rank_threshold`` and ``rank_cutoff`` are
        accepted but not used.
        """
        rankings = self._rank_bool_matches(query)
        init_result = [(index, conf) for index, conf in rankings
                       if index in goldstandard]
        # Compare against the ranked ids; ``init_result`` holds tuples, so a
        # direct membership test on it would never match and every
        # gold-standard id would be appended a second time.
        ranked_ids = {index for index, _ in init_result}
        remains = [(elem, 1) for elem in goldstandard
                   if elem not in ranked_ids]
        return init_result + remains

    def get_ranking_over_doc_matches(self, doc_indexes: List[int],
                                     query_embedded: np.array) \
            -> List[Tuple[int, float]]:
        """Order ``doc_indexes`` by cosine similarity to the query embedding.

        :param doc_indexes: row numbers into ``self.doc_embedding``
        :param query_embedded: one-dimensional query embedding
        :return: ``(row number, similarity)`` sorted by descending similarity
        """
        if not doc_indexes:
            # Nothing matched; avoid calling cdist on an empty matrix.
            return []
        query_embedded_reshaped = query_embedded.reshape(1, -1)
        relevant_corpus = self.doc_embedding[doc_indexes, :]
        distances = spatial.distance.cdist(relevant_corpus,
                                           query_embedded_reshaped, 'cosine')
        similarities = 1 - distances
        results = [(index, similarities[enum_index][0])
                   for enum_index, index in enumerate(doc_indexes)]
        return sorted(results, key=lambda item: item[1], reverse=True)

    def get_bool_or_doc_matches(self, query_tokens: List[Text]) -> Set[Text]:
        """Return the union of the posting sets of all ``query_tokens``.

        An empty token list (or tokens missing from the index) yields an
        empty set rather than ``None`` so callers can always iterate.
        """
        matches = set()
        for word in query_tokens:
            matches |= self.inverted_index.get(word, set())
        return matches

    @staticmethod
    def get_embedding(
            tfidf_words: List[Tuple[int, float]], embedding: KeyedVectors,
            id2word: corpora.Dictionary) -> Tuple[np.array, List[Text]]:
        """Return the TF-IDF weighted sum of the word vectors.

        :param tfidf_words: ``(token id, weight)`` pairs
        :param embedding: word embedding model
        :param id2word: maps token ids to tokens
        :return: ``(embedding vector, tokens missing from the embedding)``
        """
        missing_words = []
        doc_embed = np.zeros(embedding.vector_size)
        for word_index, weight in tfidf_words:
            word = id2word[word_index]
            if word in embedding.vocab:
                doc_embed += embedding[word] * weight
            else:
                missing_words.append(word)
        return doc_embed, missing_words

    @staticmethod
    def embed_corpus(tfidf_corpus: List[List[Tuple[int, float]]],
                     embedding: KeyedVectors,
                     id2word: corpora.Dictionary) -> np.array:
        """Embed every document of ``tfidf_corpus``; one row per document."""
        return np.array([
            BoolIWCSRanker.get_embedding(doc, embedding, id2word)[0]
            for doc in tfidf_corpus
        ])

    def persist(self, path: Optional[Text]) -> None:
        """Write all models and lookup tables below the prefix ``path``."""
        with open(path + 'dictionary.txt', mode='w') as file:
            for word in self.bool_dictionary:
                file.write(word + '\n')
        with open(path + 'inverted_index.pickle', mode='wb') as file:
            pickle.dump(self.inverted_index, file)
        self.dictionary.save(path + 'dict.dictionary')
        # Materialise first: the corpus may be an MmCorpus streaming from
        # the very file that is about to be rewritten.
        corpora.MmCorpus.serialize(path + 'corpus.mm', list(self.bow_corpus))
        self.model.save(path + 'tfidf.model')
        self.index.save(path + 'tfidf.index')
        with open(path + 'index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.index_mapping, file)
        with open(path + 'inverse_index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.inverse_index_mapping, file)
        np.save(path + 'doc_embedding.npy', self.doc_embedding)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BoolIWCSRanker":
        """Load the ranker from disk, building and storing it if necessary.

        The models are rebuilt when ``force`` is set or when one of the
        checked model files is missing.  ``persist`` is accepted but not
        consulted: a freshly built ranker is always written to disk.
        """
        model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
        if force or (not os.path.exists(model_path)) or \
                (not os.path.isfile(model_path + 'inverted_index.pickle')) \
                or (not os.path.isfile(model_path + 'corpus.mm')) \
                or (not os.path.isfile(model_path + 'tfidf.model')):
            utils.mk_dir_if_not_exists(model_path)
            # Create the TFIDF model and dictionary
            dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
            dictionary = corpora.Dictionary([
                Ranker.get_text(conf, data)
                for (index, data) in dataset.iterrows()
            ])
            bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)),
                           data['filename'])
                          for (index, data) in dataset.iterrows()]
            bow_corpus, names = map(list, zip(*bow_corpus))
            index_mapping = BoolIWCSRanker.build_index_mapping(names)
            inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(
                names)
            corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel(mm_corpus, )
            tfidf_index = SparseMatrixSimilarity(
                tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
            logging.info('nBOWRanker : TFIDF initialized')
            logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model))
            logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index))
            # Create boolean index
            inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
            bool_dictionary = inverted_index.keys()
            # Load word2vec embedding and embed the corpus
            word2vec = KeyedVectors.load_word2vec_format(
                BoolIWCSRanker._WORD2VEC_PATH, binary=True)
            tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
            doc_embedding = BoolIWCSRanker.embed_corpus(
                tfidf_corpus, word2vec, dictionary)
            logging.info('nBOWRanker : Embedded docs shape : {}'.format(
                doc_embedding.shape))
            ranker = BoolIWCSRanker(inverted_index,
                                    bool_dictionary,
                                    conf,
                                    dictionary,
                                    bow_corpus,
                                    tfidf_model,
                                    tfidf_index,
                                    index_mapping,
                                    inverse_index_mapping,
                                    doc_embedding=doc_embedding,
                                    model_embedding=word2vec)
            ranker.persist(model_path)
            return ranker
        else:
            # NOTE: the pickles below are this ranker's own cache files;
            # pickle must not be used on files from untrusted sources.
            dictionary = corpora.Dictionary.load(model_path +
                                                 'dict.dictionary')
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
            tfidf_index = SparseMatrixSimilarity.load(model_path +
                                                      'tfidf.index')
            with open(model_path + 'index_mapping.pickle',
                      mode='rb') as file:
                index_mapping = pickle.load(file)
                logging.info('nBOWRanker : TFIDF indexmap initialized')
            with open(model_path + 'inverse_index_mapping.pickle',
                      mode='rb') as file:
                inverse_index_mapping = pickle.load(file)
                logging.info('nBOWRanker : TFIDF invindexmap initialized')
            with open(model_path + 'inverted_index.pickle',
                      mode='rb') as file:
                inverted_index = pickle.load(file)
                bool_dictionary = inverted_index.keys()
            doc_embedding = np.load(model_path + 'doc_embedding.npy')
            logging.info('nBOWRanker : Doc embeddings loaded')
            word2vec = KeyedVectors.load_word2vec_format(
                BoolIWCSRanker._WORD2VEC_PATH, binary=True)
            logging.info('nBOWRanker : Embedding model loaded')
            return BoolIWCSRanker(inverted_index,
                                  bool_dictionary,
                                  conf,
                                  dictionary,
                                  mm_corpus,
                                  tfidf_model,
                                  tfidf_index,
                                  index_mapping,
                                  inverse_index_mapping,
                                  doc_embedding=doc_embedding,
                                  model_embedding=word2vec)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map corpus position -> numeric stem of the file name."""
        return {
            index: int(name.split('.')[0])
            for index, name in enumerate(names)
        }

    @staticmethod
    def build_inverse_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map numeric stem of the file name -> corpus position."""
        return {
            int(name.split('.')[0]): index
            for index, name in enumerate(names)
        }

    def get_name(self):
        """Return the identifier of this ranker."""
        return Ranker.R_IWCS
class BM25OkapiRanker(Ranker):
    """Ranker that scores documents with a ``BM25Okapi`` model.

    Scores are reported relative to the best score of the query, i.e. the
    top document has the value 1 whenever any document scores non-zero.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, model: BM25Okapi, index_mapping: Dict[int, int],
                 conf: Configuration):
        """Store the model, the position -> file id mapping and the config."""
        self.model = model
        self.index_mapping = index_mapping
        self.conf = conf

    def _preprocess(self, query: Text) -> List[Text]:
        """Log the query and return its preprocessed tokens."""
        logging.info('BM25OkapiRanker: Ranking for "{}"'.format(query))
        return BM25OkapiRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return the ``rank_cutoff`` best ``(file id, score)`` pairs.

        ``rank_threshold`` is accepted but not used.
        """
        top_n = self.get_top_n(self._preprocess(query), n=rank_cutoff)
        return [(self.index_mapping[index], conf) for index, conf in top_n]

    def rank_gs(self, query: Text, goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank the whole corpus and keep only ``goldstandard`` documents.

        ``rank_threshold`` and ``rank_cutoff`` are accepted but not used.
        """
        top_n = self.get_top_n(self._preprocess(query),
                               n=self.model.corpus_size)
        ranked = ((self.index_mapping[index], conf) for index, conf in top_n)
        return [(file_id, conf) for file_id, conf in ranked
                if file_id in goldstandard]

    def get_top_n(self, query: List[Text], n: Optional[int] = 5):
        """Return the ``n`` best ``(corpus position, normalised score)`` pairs.

        Scores are divided by the best score.  When the best score is 0
        (no query term occurs in the corpus) the raw scores are returned
        instead of dividing by zero; an empty corpus yields ``[]``.
        """
        scores = np.asarray(self.model.get_scores(query))
        if scores.size == 0:
            return []
        # Sort once and reuse the order for positions and values alike.
        order = np.argsort(scores)[::-1]
        max_sim = scores[order[0]]
        top_n_args = order[:n]
        top_n_sims = scores[top_n_args]
        if max_sim != 0:
            top_n_sims = top_n_sims / max_sim
        return list(zip(top_n_args, top_n_sims))

    def persist(self, path: Optional[Text]) -> None:
        """Pickle the model and the index mapping below the prefix ``path``."""
        with open(path + 'bm25okapi.pickle', mode='wb') as file:
            pickle.dump(self.model, file)
        with open(path + 'bm25okapi_index_mapping.pickle',
                  mode='wb') as file:
            pickle.dump(self.index_mapping, file)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BM25OkapiRanker":
        """Load the ranker from disk, building and storing it if necessary.

        The model is rebuilt when ``force`` is set or a pickle file is
        missing.  ``persist`` is accepted but not consulted: a freshly
        built ranker is always written to disk.
        """
        model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
        if force or (not os.path.exists(model_path)) \
                or (not os.path.isfile(model_path + 'bm25okapi.pickle')) \
                or (not os.path.isfile(
                    model_path + 'bm25okapi_index_mapping.pickle')):
            utils.mk_dir_if_not_exists(model_path)
            dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
            bow_corpus = [(Ranker.get_text(conf, data), data['filename'])
                          for (index, data) in dataset.iterrows()]
            bow_corpus, names = map(list, zip(*bow_corpus))
            index_mapping = BM25OkapiRanker.build_index_mapping(names)
            bm25 = BM25Okapi(bow_corpus)
            logging.info('BM25OkapiRanker : initialized')
            bm25_ranker = BM25OkapiRanker(model=bm25,
                                          index_mapping=index_mapping,
                                          conf=conf)
            bm25_ranker.persist(model_path)
            return bm25_ranker
        else:
            # NOTE: the pickles below are this ranker's own cache files;
            # pickle must not be used on files from untrusted sources.
            with open(model_path + 'bm25okapi.pickle', mode='rb') as file:
                bm25 = pickle.load(file)
                logging.info(
                    'BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(
                        model_path))
            with open(model_path + 'bm25okapi_index_mapping.pickle',
                      mode='rb') as file:
                index_mapping = pickle.load(file)
                logging.info(
                    'BM25OkapiRanker : loading bm25_index_mapping.pickle from {}'.format(
                        model_path))
            logging.info('BM25OkapiRanker : initialized')
            return BM25OkapiRanker(model=bm25, index_mapping=index_mapping,
                                   conf=conf)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map corpus position -> numeric stem of the file name."""
        return {
            index: int(name.split('.')[0])
            for index, name in enumerate(names)
        }

    def get_name(self):
        """Return the identifier of this ranker."""
        return Ranker.R_BM25OKAPI
class TFIDFRanker(Ranker):
    """Vector-space ranker based on a gensim TF-IDF model.

    Documents are ordered by the similarity that the
    ``SparseMatrixSimilarity`` index reports for the TF-IDF transformed
    query.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, dictionary: corpora.Dictionary,
                 bow_corpus: corpora.MmCorpus, model: TfidfModel,
                 index: SparseMatrixSimilarity,
                 index_mapping: Dict[int, int], conf: Configuration):
        """Store the prebuilt models, the position -> file id mapping and
        the configuration.
        """
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.conf = conf

    def _similarities(self, query: Text):
        """Return the similarity of ``query`` to every corpus document."""
        logging.info('TFIDFRanker: Ranking for "{}"'.format(query))
        preproc_query = TFIDFRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        return self.index[self.model[preproc_query_bow]]

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return the ``rank_cutoff`` best ``(file id, similarity)`` pairs.

        ``rank_threshold`` is accepted but not used.
        """
        similarities = self._similarities(query)
        scored = [(self.index_mapping[index], sim)
                  for index, sim in enumerate(similarities)]
        return sorted(scored, key=lambda x: x[1], reverse=True)[:rank_cutoff]

    def rank_gs(self, query: Text, goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return all ``goldstandard`` documents ordered by similarity.

        ``rank_threshold`` and ``rank_cutoff`` are accepted but not used.
        """
        similarities = self._similarities(query)
        scored = [(self.index_mapping[index], sim)
                  for index, sim in enumerate(similarities)
                  if self.index_mapping[index] in goldstandard]
        return sorted(scored, key=lambda x: x[1], reverse=True)

    def persist(self, path: Optional[Text]) -> None:
        """Write dictionary, corpus, model, index and mapping below ``path``."""
        self.dictionary.save(path + 'dict.dictionary')
        # Materialise first: ``load`` hands over an MmCorpus that streams
        # from this very file, and serialising it lazily onto itself would
        # read from a file that has just been truncated.
        corpora.MmCorpus.serialize(path + 'corpus.mm', list(self.bow_corpus))
        self.model.save(path + 'tfidf.model')
        self.index.save(path + 'tfidf.index')
        with open(path + 'index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.index_mapping, file)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "TFIDFRanker":
        """Load the ranker from disk, building and storing it if necessary.

        The models are rebuilt when ``force`` is set or when one of the
        checked model files is missing.  ``persist`` is accepted but not
        consulted: a freshly built ranker is always written to disk.
        """
        model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
        if force or (not os.path.exists(model_path)) \
                or (not os.path.isfile(model_path + 'corpus.mm')) \
                or (not os.path.isfile(model_path + 'tfidf.model')):
            utils.mk_dir_if_not_exists(model_path)
            dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
            dictionary = corpora.Dictionary([
                Ranker.get_text(conf, data)
                for (index, data) in dataset.iterrows()
            ])
            bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)),
                           data['filename'])
                          for (index, data) in dataset.iterrows()]
            bow_corpus, names = map(list, zip(*bow_corpus))
            index_mapping = TFIDFRanker.build_index_mapping(names)
            corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel(mm_corpus, )
            tfidf_index = SparseMatrixSimilarity(
                tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
            ranker = TFIDFRanker(dictionary=dictionary,
                                 bow_corpus=mm_corpus,
                                 model=tfidf_model,
                                 index=tfidf_index,
                                 index_mapping=index_mapping,
                                 conf=conf)
            ranker.persist(model_path)
            logging.info('TFIDFRanker : initialized')
            logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
            logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
            return ranker
        else:
            # NOTE: the pickle below is this ranker's own cache file;
            # pickle must not be used on files from untrusted sources.
            dictionary = corpora.Dictionary.load(model_path +
                                                 'dict.dictionary')
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
            tfidf_index = SparseMatrixSimilarity.load(model_path +
                                                      'tfidf.index')
            with open(model_path + 'index_mapping.pickle',
                      mode='rb') as file:
                index_mapping = pickle.load(file)
            logging.info('TFIDFRanker : initialized')
            return TFIDFRanker(dictionary=dictionary,
                               bow_corpus=mm_corpus,
                               model=tfidf_model,
                               index=tfidf_index,
                               index_mapping=index_mapping,
                               conf=conf)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map corpus position -> numeric stem of the file name."""
        return {
            index: int(name.split('.')[0])
            for index, name in enumerate(names)
        }

    def get_name(self):
        """Return the identifier of this ranker."""
        return Ranker.R_TFIDF