Python Extractor примеры использования

Язык программирования: Python

Пространство имен/Пакет: gui2r.preprocessing.extraction

Класс/Тип: Extractor

Примеров на hotexamples.com: 5

Python Extractor - 5 примеров найдено. Это лучшие примеры Python кода для gui2r.preprocessing.extraction.Extractor, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

Extractor (5)

Основные методы

Extractor (5)

Пример #1

Показать файл

Файл: loc_prf_kld_bm25_weighted_expander.py Проект: RaWi-Protoyping/RaWi

class LocalPrfKldWeightedBM25ExpanderRanker(Ranker):
    """Ranker that expands the query with local pseudo-relevance feedback.

    The query is ranked once with the wrapped weighted BM25 ranker. The token
    distribution of the ``k`` best documents is then compared with the token
    distribution of the whole collection using a per-term Kullback-Leibler
    score; the ``top_w`` best scoring terms (with their scores as weights) are
    appended to the query, and the expanded query is ranked again.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, conf: Configuration, internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame, counter: Dict[Text, int],
                 probs_t_dc: Dict[Text, float], k: Optional[int] = 10, top_w: Optional[int] = 10):
        """Store the collaborators and the expansion parameters.

        :param conf: configuration read for the preprocessing flags.
        :param internal_ranker: ranker used for both the first and second pass.
        :param dataset: data frame; must carry a ``fileindex`` column.
        :param counter: token -> absolute frequency in the collection.
        :param probs_t_dc: token -> relative frequency in the collection.
        :param k: number of first-pass documents used as feedback.
        :param top_w: number of expansion terms added to the query.
        """
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        self.top_w = top_w

    def _expand_query(self, query: Text, rank_cutoff: Optional[int]) -> List[Tuple[Text, float]]:
        """Return the weighted query tokens followed by the expansion terms."""
        preproc_query = LocalPrfKldWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query, tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Original query tokens all get weight 1.
        weighted_query = [(token, 1) for token in preproc_query]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        probs_t_dr = self.get_probs_from_top([index for index, _ in first_pass])
        candidates = self.get_top_expansion_cands(probs_t_dr, rm_tokens=preproc_query)
        weighted_query.extend(candidates[:self.top_w])
        return weighted_query

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank with the expanded query; ``rank_threshold`` is not used here."""
        logging.info('LocPrfKldWeightedRanker: Ranking for "{}"'.format(query))
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank(expanded_query, rank_cutoff=rank_cutoff)

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Like :meth:`rank`, but the second pass calls ``rank_gs`` of the wrapped ranker."""
        logging.info('LocPrfKldWeightedRanker: Ranking for "{}"'.format(query))
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank_gs(expanded_query, goldstandard, rank_cutoff=rank_cutoff)

    def get_top_expansion_cands(self, probs_t_dr: Dict[Text, float],
                                rm_tokens: Optional[List[Text]] = None) -> List[Tuple[Text, float]]:
        """Score every feedback term and return them best-first.

        :param probs_t_dr: token -> relative frequency in the feedback documents.
        :param rm_tokens: tokens to exclude (typically the query tokens);
            ``None`` excludes nothing.
        :return: ``(token, score)`` pairs sorted by descending score.
        :raises KeyError: if a feedback token is missing from ``self.probs_t_dc``.
        """
        # A set gives O(1) membership tests and makes the None default usable.
        excluded = set(rm_tokens) if rm_tokens else set()
        scores_kld = {}
        for word, prob_t_dr in probs_t_dr.items():
            if word in excluded:
                continue
            scores_kld[word] = LocalPrfKldWeightedBM25ExpanderRanker.kullback_leibler_divergence(
                prob_t_dr=prob_t_dr, prob_t_dc=self.probs_t_dc[word])
        return sorted(scores_kld.items(), key=lambda item: item[1], reverse=True)

    def get_probs_from_top(self, indexes: List[int]):
        """Return token -> relative frequency over the first ``self.k`` documents of ``indexes``."""
        top_indexes = indexes[:self.k]
        results = self.dataset.loc[self.dataset['fileindex'].isin(top_indexes), ]
        words = []
        for _, data in results.iterrows():
            words.extend(Ranker.get_text(self.conf, data))
        total = len(words)
        # With no words the comprehension is empty, so no division happens.
        return {word: (count / total) for word, count in Counter(words).items()}

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        """Return the single-term contribution ``p_r * log(p_r / p_c)``."""
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        """Do nothing; this ranker writes no state of its own."""
        pass

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "LocalPrfKldWeightedBM25ExpanderRanker":
        """Build the ranker from the extracted dataset.

        ``force`` is accepted but not used; ``persist`` is forwarded to the
        wrapped ranker's ``load``.
        """
        dataset = LocalPrfKldWeightedBM25ExpanderRanker.extractor.load_dataset(conf=conf)
        # File names look like "<int>.<ext>"; the integer part is the document id.
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(lambda x: int(x.split('.')[0]))
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = LocalPrfKldWeightedBM25ExpanderRanker.create_counter(conf, dataset)
        return LocalPrfKldWeightedBM25ExpanderRanker(conf=conf, internal_ranker=internal_ranker,
                                                     dataset=dataset, counter=counter, probs_t_dc=probs)

    @staticmethod
    def create_counter(conf: Configuration, dataset: pd.DataFrame) -> Tuple[Dict[Text, int], Dict[Text, float]]:
        """Count all tokens of the collection.

        :return: ``(counter, probs)`` where ``counter`` maps token -> count and
            ``probs`` maps token -> count divided by the total token count.
        """
        counter = {}
        total = 0
        for _, data in dataset.iterrows():
            tokens = Ranker.get_text(conf, data)
            total += len(tokens)
            for token in tokens:
                counter[token] = counter.get(token, 0) + 1
        probs = {token: (count / total) for token, count in counter.items()}
        return counter, probs

    def get_name(self):
        """Return the identifier constant of this ranker."""
        return Ranker.R_PRF_KLD_WEIGHTED_BM25

Пример #2

Показать файл

class LocalPrfKldCatWeightedBM25ExpanderRanker(Ranker):
    """Pseudo-relevance-feedback ranker that picks expansion terms per category.

    The query is ranked once with the wrapped weighted BM25 ranker. For every
    configured text category (a column of the dataset) the token distribution
    of the ``k`` best documents is compared with the distribution of the whole
    collection using a per-term Kullback-Leibler score. The best terms of each
    category are appended to the query, which is then ranked again.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self,
                 conf: Configuration,
                 internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame,
                 counter: Dict[Text, Dict[Text, int]],
                 probs_t_dc: Dict[Text, Dict[Text, float]],
                 k: Optional[int] = 10,
                 top_w_per_cat: Optional[Dict[Text, int]] = None,
                 cats: Optional[List[Text]] = None):
        """Store the collaborators and the expansion parameters.

        :param conf: configuration read for the preprocessing flags.
        :param internal_ranker: ranker used for both passes.
        :param dataset: data frame with a ``fileindex`` column and one column
            per category.
        :param counter: category -> token -> absolute collection frequency.
        :param probs_t_dc: category -> token -> relative collection frequency.
        :param k: number of first-pass documents used as feedback.
        :param top_w_per_cat: category -> number of expansion terms to take;
            defaults to two per ``Extractor.DATA_*`` category.
        :param cats: categories to draw expansion terms from.
        """
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        if not top_w_per_cat:
            top_w_per_cat = {
                Extractor.DATA_ACTIVITY_NAME: 2,
                Extractor.DATA_TEXT_VISIBLE: 2,
                Extractor.DATA_TEXT_INVISIBLE: 2,
                Extractor.DATA_RES_IDS_VISIBLE: 2,
                Extractor.DATA_RES_IDS_INVISIBLE: 2,
                Extractor.DATA_ICON_IDS: 2
            }
        self.top_w_per_cat = top_w_per_cat
        if not cats:
            # NOTE(review): these literal names must be keys of top_w_per_cat
            # and of the probability dictionaries; whether they match the
            # Extractor.DATA_* values cannot be told from here - confirm.
            cats = [
                'text_activity_name', 'text_visible', 'text_invisible',
                'text_res_ids', 'text_icon_ids'
            ]
        self.cats = cats

    def _expand_query(self, query: Text,
                      rank_cutoff: Optional[int]) -> List[Tuple[Text, float]]:
        """Return the weighted query tokens followed by the expansion terms."""
        preproc_query = LocalPrfKldCatWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Original query tokens all get weight 1.
        weighted_query = [(token, 1) for token in preproc_query]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        probs_t_dr = self.get_probs_from_top([index for index, _ in first_pass])
        top_cands = self.get_top_expansion_cands_per_cat(
            probs_t_dr,
            self.probs_t_dc,
            rm_tokens=preproc_query,
            ensure_unique=True)
        weighted_query.extend(top_cands)
        return weighted_query

    def rank(self,
             query: Text,
             rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank with the expanded query; ``rank_threshold`` is not used here."""
        logging.info(
            'LocPrfKldCatWeightedRanker: Ranking for "{}"'.format(query))
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank(expanded_query, rank_cutoff=rank_cutoff)

    def rank_gs(self,
                query: Text,
                goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Like :meth:`rank`, but the second pass calls ``rank_gs`` of the wrapped ranker."""
        logging.info(
            'LocPrfKldCatWeightedRanker: Ranking for "{}"'.format(query))
        expanded_query = self._expand_query(query, rank_cutoff)
        return self.ranker.rank_gs(expanded_query,
                                   goldstandard,
                                   rank_cutoff=rank_cutoff)

    def get_top_expansion_cands_per_cat(
            self,
            probs_t_dr: Dict[Text, Dict[Text, float]],
            probs_t_dc: Dict[Text, Dict[Text, float]],
            rm_tokens: Optional[List[Text]] = None,
            ensure_unique: Optional[bool] = True) -> List[Tuple[Text, float]]:
        """Collect the best expansion terms of every category in ``self.cats``.

        :param probs_t_dr: category -> token -> feedback relative frequency.
        :param probs_t_dc: category -> token -> collection relative frequency.
        :param rm_tokens: tokens to exclude; ``None`` excludes nothing.
        :param ensure_unique: drop repeated ``(token, score)`` pairs.
        :return: ``(token, score)`` pairs, category by category.
        :raises ValueError: if both dictionaries do not have the same categories.
        """
        if set(probs_t_dr.keys()) != set(probs_t_dc.keys()):
            raise ValueError(
                'ProbR and ProbC dictionaries do not contain the same categories'
            )
        top_words = []
        for name in self.cats:
            ranked = self.get_top_expansion_cands(
                probs_t_dr=probs_t_dr[name],
                probs_t_dc=probs_t_dc[name],
                rm_tokens=rm_tokens)
            top_words.extend(ranked[:self.top_w_per_cat[name]])
        if ensure_unique:
            # dict.fromkeys removes exact duplicates like set() would, but
            # keeps first-seen order so the expanded query is reproducible.
            # A token selected in two categories with different scores is
            # still kept twice, as before.
            top_words = list(dict.fromkeys(top_words))
        return top_words

    @staticmethod
    def get_top_expansion_cands(
            probs_t_dr: Dict[Text, float],
            probs_t_dc: Dict[Text, float],
            rm_tokens: Optional[List[Text]] = None
    ) -> List[Tuple[Text, float]]:
        """Score every feedback term of one category and return them best-first.

        :raises KeyError: if a feedback token is missing from ``probs_t_dc``.
        """
        # A set gives O(1) membership tests and makes the None default usable.
        excluded = set(rm_tokens) if rm_tokens else set()
        scores_kld = {}
        for word, prob_t_dr in probs_t_dr.items():
            if word in excluded:
                continue
            scores_kld[word] = LocalPrfKldCatWeightedBM25ExpanderRanker.kullback_leibler_divergence(
                prob_t_dr=prob_t_dr, prob_t_dc=probs_t_dc[word])
        return sorted(scores_kld.items(),
                      key=lambda item: item[1],
                      reverse=True)

    @staticmethod
    def _token_stats(rows: pd.DataFrame, segments: List[Text]) -> \
            Tuple[Dict[Text, Dict[Text, int]], Dict[Text, Dict[Text, float]]]:
        """Count tokens per segment column and derive relative frequencies."""
        counts = {name: {} for name in segments}
        totals = {name: 0 for name in segments}
        for _, data in rows.iterrows():
            for name in segments:
                tokens = data[name]
                segment_counts = counts[name]
                for token in tokens:
                    segment_counts[token] = segment_counts.get(token, 0) + 1
                totals[name] += len(tokens)
        # A segment without tokens yields an empty dict, so no division by zero.
        probs = {
            name: {
                token: (count / totals[name])
                for token, count in segment_counts.items()
            }
            for name, segment_counts in counts.items()
        }
        return counts, probs

    def get_probs_from_top(
            self, indexes: List[int]) -> Dict[Text, Dict[Text, float]]:
        """Return category -> token -> relative frequency over the first ``self.k`` documents."""
        top_indexes = indexes[:self.k]
        results = self.dataset.loc[
            self.dataset['fileindex'].isin(top_indexes), ]
        _, all_probs = LocalPrfKldCatWeightedBM25ExpanderRanker._token_stats(
            results, self.cats)
        return all_probs

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        """Return the single-term contribution ``p_r * log(p_r / p_c)``."""
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        """Do nothing; this ranker writes no state of its own."""
        pass

    @staticmethod
    def load(
        conf: Configuration,
        force: Optional[bool] = False,
        persist: Optional[bool] = True
    ) -> "LocalPrfKldCatWeightedBM25ExpanderRanker":
        """Build the ranker from the extracted dataset.

        ``force`` is accepted but not used; ``persist`` is forwarded to the
        wrapped ranker's ``load``.
        """
        dataset = LocalPrfKldCatWeightedBM25ExpanderRanker.extractor.load_dataset(
            conf=conf)
        # File names look like "<int>.<ext>"; the integer part is the document id.
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(
            lambda x: int(x.split('.')[0]))
        uses_all_segments = (len(conf.text_segments_used) == 1
                             and conf.text_segments_used[0] == Extractor.DATA_ALL)
        if uses_all_segments:
            # DATA_ALL is expanded into the individual segment columns.
            text_segments = [
                Extractor.DATA_ACTIVITY_NAME, Extractor.DATA_TEXT_VISIBLE,
                Extractor.DATA_TEXT_INVISIBLE, Extractor.DATA_RES_IDS_VISIBLE,
                Extractor.DATA_RES_IDS_INVISIBLE, Extractor.DATA_ICON_IDS
            ]
        else:
            text_segments = conf.text_segments_used
        logging.info(
            'LocPrfKldCatWeightedRanker: Using Text Segments: {}'.format(
                text_segments))
        segment_top_ws = {
            Extractor.DATA_ACTIVITY_NAME: 2,
            Extractor.DATA_TEXT_VISIBLE: 2,
            Extractor.DATA_TEXT_INVISIBLE: 2,
            Extractor.DATA_RES_IDS_VISIBLE: 2,
            Extractor.DATA_RES_IDS_INVISIBLE: 2,
            Extractor.DATA_ICON_IDS: 2
        }
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = LocalPrfKldCatWeightedBM25ExpanderRanker.create_counter(
            dataset, text_segments=text_segments)
        return LocalPrfKldCatWeightedBM25ExpanderRanker(
            conf=conf,
            internal_ranker=internal_ranker,
            dataset=dataset,
            counter=counter,
            probs_t_dc=probs,
            cats=text_segments,
            top_w_per_cat=segment_top_ws)

    @staticmethod
    def create_counter(dataset: pd.DataFrame, text_segments: List[Text]) -> \
            Tuple[Dict[Text, Dict[Text, int]], Dict[Text, Dict[Text, float]]]:
        """Return per-segment token counts and relative frequencies of the collection."""
        return LocalPrfKldCatWeightedBM25ExpanderRanker._token_stats(
            dataset, text_segments)

    def get_name(self):
        """Return the identifier constant of this ranker."""
        return Ranker.R_PRF_KLD_CAT_WEIGHTED_BM25

Пример #3

Показать файл

class BoolIWCSRanker(Ranker):
    """Boolean-OR retrieval followed by embedding based re-ranking.

    Documents containing at least one query token are looked up in an inverted
    index. These matches are then ordered by the cosine similarity between the
    query embedding and the document embedding, where an embedding is the
    TF-IDF weighted sum of the pretrained word2vec vectors of its tokens.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    # Pretrained word2vec vectors, resolved relative to the working directory.
    WORD2VEC_PATH = '../resources/embeddings/GoogleNews-vectors-negative300.bin'
    # Every file the cached branch of load() reads; all must exist to use it.
    _MODEL_FILES = ('inverted_index.pickle', 'dict.dictionary', 'corpus.mm',
                    'tfidf.model', 'tfidf.index', 'index_mapping.pickle',
                    'inverse_index_mapping.pickle', 'doc_embedding.npy')

    def __init__(self, inverted_index: Dict[Text, Set[str]],
                 bool_dictionary: List[str], conf: Configuration,
                 dictionary: corpora.Dictionary, bow_corpus: corpora.MmCorpus,
                 model: TfidfModel, index: SparseMatrixSimilarity,
                 index_mapping: Dict[int, int],
                 inverse_index_mapping: Dict[int, int],
                 doc_embedding: np.array, model_embedding: KeyedVectors):
        """Store all model parts.

        :param inverted_index: token -> set of file names containing it.
        :param bool_dictionary: tokens known to the inverted index.
        :param conf: configuration read for the preprocessing flags.
        :param dictionary: gensim dictionary used to build bag-of-words vectors.
        :param bow_corpus: bag-of-words corpus (only stored and persisted here).
        :param model: TF-IDF model applied to the query.
        :param index: TF-IDF similarity index (only stored and persisted here).
        :param index_mapping: corpus position -> document id.
        :param inverse_index_mapping: document id -> corpus position.
        :param doc_embedding: one embedding row per corpus position.
        :param model_embedding: word vectors used to embed the query.
        """
        # NOTE: on instances this attribute hides the static method of the
        # same name; the builder stays reachable via the class.
        self.inverted_index = inverted_index
        self.bool_dictionary = bool_dictionary
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.inverse_index_mapping = inverse_index_mapping
        self.conf = conf
        self.doc_embedding = doc_embedding
        self.model_embedding = model_embedding

    @staticmethod
    def inverted_index(conf: Configuration,
                       dataset: pd.DataFrame) -> Dict[Text, Set[str]]:
        """Build the mapping token -> set of file names whose text contains it."""
        inv_index = {}
        for _, data in dataset.iterrows():
            file_name = data['filename']
            for token in Ranker.get_text(conf, data):
                inv_index.setdefault(token, set()).add(file_name)
        return inv_index

    def _rank_matching_docs(self, query: Text) -> List[Tuple[int, float]]:
        """Return ``(document id, similarity)`` for all boolean matches, best first."""
        preproc_query = self.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        query_tokens = [
            word for word in preproc_query if word in self.bool_dictionary
        ]
        matches = self.get_bool_or_doc_matches(query_tokens)
        # File names look like "<int>.<ext>"; map the id to its corpus position.
        match_positions = [
            self.inverse_index_mapping[int(name.split('.')[0])]
            for name in matches
        ]
        # TFIDF model
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        tfidf_query_bow = self.model[preproc_query_bow]
        tfidf_query_embedded, _ = BoolIWCSRanker.get_embedding(
            tfidf_query_bow, self.model_embedding, self.dictionary)
        rankings = self.get_ranking_over_doc_matches(match_positions,
                                                     tfidf_query_embedded)
        return [(self.index_mapping[position], sim)
                for position, sim in rankings]

    def rank(self,
             query: Text,
             rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return at most ``rank_cutoff`` matches; ``rank_threshold`` is not used.

        A query without any known token yields an empty list.
        """
        logging.info('nBOWRanker: Ranking for "{}"'.format(query))
        return self._rank_matching_docs(query)[:rank_cutoff]

    def rank_gs(self,
                query: Text,
                goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank only the gold-standard documents.

        Ranked matches that belong to ``goldstandard`` come first; gold-standard
        documents that were not matched are appended with the value ``1``.
        ``rank_threshold`` and ``rank_cutoff`` are not used.
        """
        logging.info('nBOWRanker: Ranking for "{}"'.format(query))
        init_result = [(index, conf)
                       for index, conf in self._rank_matching_docs(query)
                       if index in goldstandard]
        # Compare ids with ids: testing an id against the (id, score) tuples
        # never matched and re-appended every gold-standard document.
        ranked_ids = {index for index, _ in init_result}
        remains = [(elem, 1) for elem in goldstandard
                   if elem not in ranked_ids]
        return init_result + remains

    def get_ranking_over_doc_matches(self, doc_indexes: List[int], query_embedded: np.array) \
            -> List[Tuple[int, float]]:
        """Order the given corpus positions by cosine similarity to the query.

        :return: ``(corpus position, similarity)`` pairs, best first; empty if
            ``doc_indexes`` is empty.
        """
        if not doc_indexes:
            return []
        # cdist expects 2-D inputs, so the query becomes a single-row matrix.
        query_embedded_reshaped = query_embedded.reshape(1, -1)
        relevant_corpus = self.doc_embedding[doc_indexes, :]
        distances = spatial.distance.cdist(relevant_corpus,
                                           query_embedded_reshaped, 'cosine')
        similarities = 1 - distances
        results = [(index, similarities[row][0])
                   for row, index in enumerate(doc_indexes)]
        return sorted(results, key=lambda item: item[1], reverse=True)

    def get_bool_or_doc_matches(self, query_tokens: List[Text]) -> Set[Text]:
        """Return the file names containing at least one of ``query_tokens``.

        Always returns a new set; it is empty when there are no tokens.
        """
        matches = set()
        for word in query_tokens:
            matches.update(self.inverted_index.get(word, ()))
        return matches

    @staticmethod
    def get_embedding(
            tfidf_words: List[Tuple[int, float]], embedding: KeyedVectors,
            id2word: corpora.Dictionary) -> Tuple[np.array, List[Text]]:
        """Sum the word vectors of a document, each scaled by its TF-IDF weight.

        :return: ``(vector, missing_words)`` where ``missing_words`` lists the
            words without a vector; the vector is all zeros if none is known.
        """
        missing_words = []
        doc_embed = np.zeros(embedding.vector_size)
        for word_index, weight in tfidf_words:
            word = id2word[word_index]
            # `in` works on gensim 3 and 4; the `.vocab` attribute is gone in 4.
            if word in embedding:
                doc_embed += embedding[word] * weight
            else:
                missing_words.append(word)
        return doc_embed, missing_words

    @staticmethod
    def embed_corpus(tfidf_corpus: List[List[Tuple[int, float]]],
                     embedding: KeyedVectors,
                     id2word: corpora.Dictionary) -> np.array:
        """Return one embedding row per document of ``tfidf_corpus``."""
        return np.array([
            BoolIWCSRanker.get_embedding(doc, embedding, id2word)[0]
            for doc in tfidf_corpus
        ])

    def persist(self, path: Optional[Text]) -> None:
        """Write all model parts below ``path`` (used as a plain string prefix)."""
        with open(path + 'dictionary.txt', mode='w') as file:
            for word in self.bool_dictionary:
                file.write(word + '\n')
        with open(path + 'inverted_index.pickle', mode='wb') as file:
            pickle.dump(self.inverted_index, file)
        self.dictionary.save(path + 'dict.dictionary')
        corpora.MmCorpus.serialize(path + 'corpus.mm', self.bow_corpus)
        self.model.save(path + 'tfidf.model')
        self.index.save(path + 'tfidf.index')
        with open(path + 'index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.index_mapping, file)
        with open(path + 'inverse_index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.inverse_index_mapping, file)
        np.save(path + 'doc_embedding.npy', self.doc_embedding)

    @staticmethod
    def load(conf: Configuration,
             force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BoolIWCSRanker":
        """Load the cached model, or build and persist it.

        The model is rebuilt when ``force`` is set or any cached file is
        missing. ``persist`` is accepted but not consulted: a freshly built
        model is always written to disk.
        """
        model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
        cache_incomplete = any(
            not os.path.isfile(model_path + name)
            for name in BoolIWCSRanker._MODEL_FILES)
        if force or cache_incomplete:
            utils.mk_dir_if_not_exists(model_path)
            # Create the TFIDF model and dictionary
            dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
            # Extract every text once and reuse it for dictionary and corpus.
            rows = [(Ranker.get_text(conf, data), data['filename'])
                    for (_, data) in dataset.iterrows()]
            texts = [text for text, _ in rows]
            names = [name for _, name in rows]
            dictionary = corpora.Dictionary(texts)
            bow_corpus = [dictionary.doc2bow(text) for text in texts]
            index_mapping = BoolIWCSRanker.build_index_mapping(names)
            inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(
                names)
            corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel(mm_corpus, )
            tfidf_index = SparseMatrixSimilarity(
                tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
            logging.info('nBOWRanker : TFIDF initialized')
            logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model))
            logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index))
            # Create boolean index
            inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
            bool_dictionary = inverted_index.keys()
            # Load word2vec embedding and embed the corpus
            word2vec = KeyedVectors.load_word2vec_format(
                BoolIWCSRanker.WORD2VEC_PATH, binary=True)
            tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
            doc_embedding = BoolIWCSRanker.embed_corpus(
                tfidf_corpus, word2vec, dictionary)
            logging.info('nBOWRanker : Embedded docs shape : {}'.format(
                doc_embedding.shape))
            ranker = BoolIWCSRanker(inverted_index,
                                    bool_dictionary,
                                    conf,
                                    dictionary,
                                    bow_corpus,
                                    tfidf_model,
                                    tfidf_index,
                                    index_mapping,
                                    inverse_index_mapping,
                                    doc_embedding=doc_embedding,
                                    model_embedding=word2vec)
            ranker.persist(model_path)
            return ranker

        # SECURITY: pickle.load executes code from the file; the model
        # directory must only contain files written by persist().
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('nBOWRanker : TFIDF indexmap initialized')
        with open(model_path + 'inverse_index_mapping.pickle',
                  mode='rb') as file:
            inverse_index_mapping = pickle.load(file)
            logging.info('nBOWRanker : TFIDF invindexmap initialized')
        with open(model_path + 'inverted_index.pickle', mode='rb') as file:
            inverted_index = pickle.load(file)
            bool_dictionary = inverted_index.keys()
        doc_embedding = np.load(model_path + 'doc_embedding.npy')
        logging.info('nBOWRanker : Doc embeddings loaded')
        word2vec = KeyedVectors.load_word2vec_format(
            BoolIWCSRanker.WORD2VEC_PATH, binary=True)
        logging.info('nBOWRanker : Embedding model loaded')
        return BoolIWCSRanker(inverted_index,
                              bool_dictionary,
                              conf,
                              dictionary,
                              mm_corpus,
                              tfidf_model,
                              tfidf_index,
                              index_mapping,
                              inverse_index_mapping,
                              doc_embedding=doc_embedding,
                              model_embedding=word2vec)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map corpus position -> integer prefix of the file name at that position."""
        return {
            position: int(name.split('.')[0])
            for position, name in enumerate(names)
        }

    @staticmethod
    def build_inverse_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map integer prefix of a file name -> its corpus position."""
        return {
            int(name.split('.')[0]): position
            for position, name in enumerate(names)
        }

    def get_name(self):
        """Return the identifier constant of this ranker."""
        return Ranker.R_IWCS

Пример #4

Показать файл

Файл: bm25okapi_ranker.py Проект: RaWi-Protoyping/RaWi

class BM25OkapiRanker(Ranker):
    """Ranker that scores documents with a ``BM25Okapi`` model.

    Scores are divided by the best score of the query, so the top document of
    a query with at least one non-zero score has the value 1.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, model: BM25Okapi, index_mapping: Dict[int, int],
                 conf: Configuration):
        """Store the model, the mapping corpus position -> document id, and the configuration."""
        self.model = model
        self.index_mapping = index_mapping
        self.conf = conf

    def _preprocess_query(self, query: Text) -> List[Text]:
        """Tokenise the query with the preprocessing flags of the configuration."""
        return BM25OkapiRanker.preprocessor.preprocess_text(
            query, tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Return the ``rank_cutoff`` best documents; ``rank_threshold`` is not used."""
        logging.info('BM25OkapiRanker: Ranking for "{}"'.format(query))
        top_n = self.get_top_n(self._preprocess_query(query), n=rank_cutoff)
        return [(self.index_mapping[index], conf) for index, conf in top_n]

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank the whole corpus and keep only documents in ``goldstandard``.

        ``rank_threshold`` and ``rank_cutoff`` are not used.
        """
        logging.info('BM25OkapiRanker: Ranking for "{}"'.format(query))
        top_n = self.get_top_n(self._preprocess_query(query), n=self.model.corpus_size)
        ranked = ((self.index_mapping[index], conf) for index, conf in top_n)
        return [(doc_id, conf) for doc_id, conf in ranked if doc_id in goldstandard]

    def get_top_n(self, query: List[Text], n: Optional[int] = 5):
        """Return the ``n`` best ``(corpus position, normalised score)`` pairs.

        Scores are divided by the maximum score. If that maximum is zero the
        raw scores are returned instead of dividing by zero, and an empty
        score vector yields an empty list.
        """
        scores = np.asarray(self.model.get_scores(query))
        if scores.size == 0:
            return []
        # One argsort gives both the order and, through indexing, the values.
        order = np.argsort(scores)[::-1]
        top_n_args = order[:n]
        top_n_sims = scores[top_n_args]
        max_sim = scores[order[0]]
        if max_sim == 0:
            return list(zip(top_n_args, top_n_sims))
        return [(arg, (sim / max_sim)) for (arg, sim) in zip(top_n_args, top_n_sims)]

    def persist(self, path: Optional[Text]) -> None:
        """Pickle the model and the index mapping below ``path`` (a string prefix)."""
        with open(path + 'bm25okapi.pickle', mode='wb') as file:
            pickle.dump(self.model, file)
        with open(path + 'bm25okapi_index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.index_mapping, file)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BM25OkapiRanker":
        """Load the cached model, or build and persist it.

        The model is rebuilt when ``force`` is set or a cached file is missing.
        ``persist`` is accepted but not consulted: a freshly built model is
        always written to disk.
        """
        model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
        cache_files = ('bm25okapi.pickle', 'bm25okapi_index_mapping.pickle')
        cache_incomplete = any(not os.path.isfile(model_path + name) for name in cache_files)
        if force or cache_incomplete:
            utils.mk_dir_if_not_exists(model_path)
            dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
            rows = [(Ranker.get_text(conf, data), data['filename']) for (_, data) in dataset.iterrows()]
            bow_corpus = [text for text, _ in rows]
            names = [name for _, name in rows]
            index_mapping = BM25OkapiRanker.build_index_mapping(names)
            bm25 = BM25Okapi(bow_corpus)
            logging.info('BM25OkapiRanker : initialized')
            bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
            bm25_ranker.persist(model_path)
            return bm25_ranker

        # SECURITY: pickle.load executes code from the file; the model
        # directory must only contain files written by persist().
        with open(model_path + 'bm25okapi.pickle', mode='rb') as file:
            bm25 = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path))
        with open(model_path + 'bm25okapi_index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25_index_mapping.pickle from {}'.format(model_path))
        logging.info('BM25OkapiRanker : initialized')
        return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map corpus position -> integer prefix of the file name at that position."""
        return {position: int(name.split('.')[0]) for position, name in enumerate(names)}

    def get_name(self):
        """Return the identifier constant of this ranker."""
        return Ranker.R_BM25OKAPI

Пример #5

Показать файл

Файл: vsm_tfidf_ranker.py Проект: RaWi-Protoyping/RaWi

class TFIDFRanker(Ranker):
    """Ranker that scores documents by TF-IDF similarity to a query.

    A query is preprocessed, converted to a bag-of-words vector with the
    stored dictionary, weighted by the TF-IDF model and compared against the
    similarity index. Index positions are translated to document ids through
    ``index_mapping``.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    # File names of the persisted artefacts inside the model directory.
    _DICTIONARY_FILE = 'dict.dictionary'
    _CORPUS_FILE = 'corpus.mm'
    _MODEL_FILE = 'tfidf.model'
    _INDEX_FILE = 'tfidf.index'
    _MAPPING_FILE = 'index_mapping.pickle'

    def __init__(self, dictionary: corpora.Dictionary,
                 bow_corpus: corpora.MmCorpus,
                 model: TfidfModel, index: SparseMatrixSimilarity,
                 index_mapping: Dict[int, int],
                 conf: Configuration):
        """Store the prepared model components.

        Args:
            dictionary: Token-to-id dictionary used to vectorise queries.
            bow_corpus: Bag-of-words corpus the model was built from.
            model: TF-IDF model applied to query vectors.
            index: Similarity index queried with the weighted vector.
            index_mapping: Maps a position in the index to a document id.
            conf: Configuration holding the preprocessing settings.
        """
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.conf = conf

    def _query_similarities(self, query: Text):
        """Return the similarity of ``query`` to every indexed document."""
        logging.info('TFIDFRanker: Ranking for "{}"'.format(query))
        preproc_query = TFIDFRanker.preprocessor.preprocess_text(
            query, tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        return self.index[self.model[preproc_query_bow]]

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank all indexed documents for ``query``.

        Args:
            query: Raw query text.
            rank_threshold: Accepted for signature compatibility; not applied
                by this implementation.
            rank_cutoff: Maximum number of results to return.

        Returns:
            ``(document id, similarity)`` pairs, best match first.
        """
        similarities = self._query_similarities(query)
        ranking = [(self.index_mapping[index], sim)
                   for index, sim in enumerate(similarities)]
        ranking.sort(key=lambda x: x[1], reverse=True)
        return ranking[:rank_cutoff]

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank only the documents whose id is contained in ``goldstandard``.

        Args:
            query: Raw query text.
            goldstandard: Document ids that are allowed to appear in the result.
            rank_threshold: Accepted for signature compatibility; not applied
                by this implementation.
            rank_cutoff: Accepted for signature compatibility; not applied by
                this implementation.

        Returns:
            ``(document id, similarity)`` pairs, best match first.
        """
        similarities = self._query_similarities(query)
        ranking = []
        for index, sim in enumerate(similarities):
            doc_id = self.index_mapping[index]
            if doc_id in goldstandard:
                ranking.append((doc_id, sim))
        ranking.sort(key=lambda x: x[1], reverse=True)
        return ranking

    def persist(self, path: Optional[Text]) -> None:
        """Write all model artefacts into the directory prefix ``path``.

        Args:
            path: Directory prefix (including the trailing separator) that
                the artefact file names are appended to.
        """
        self.dictionary.save(path + TFIDFRanker._DICTIONARY_FILE)
        # The corpus may stream from the very file that is about to be
        # written (``load`` passes an MmCorpus backed by ``<path>corpus.mm``).
        # Read it completely first so that writing cannot interfere with
        # reading.
        documents = list(self.bow_corpus)
        corpora.MmCorpus.serialize(path + TFIDFRanker._CORPUS_FILE, documents)
        self.model.save(path + TFIDFRanker._MODEL_FILE)
        self.index.save(path + TFIDFRanker._INDEX_FILE)
        with open(path + TFIDFRanker._MAPPING_FILE, mode='wb') as file:
            pickle.dump(self.index_mapping, file)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "TFIDFRanker":
        """Return a TF-IDF ranker, read from the cache or built anew.

        The cache directory is
        ``<conf.path_models>vsm_tfidf/<conf.get_desc()>/``. It is used only
        when every artefact the ranker needs is present; otherwise the model
        is rebuilt from the dataset and written to that directory.

        Args:
            conf: Configuration providing the model root path, the cache
                descriptor and the settings used to load the dataset.
            force: Rebuild the model even when a complete cache exists.
            persist: Not consulted. A rebuilt model is always written because
                the corpus is file-backed and the artefacts have to stay in
                sync with it.  NOTE(review): confirm whether an in-memory
                mode is wanted.

        Returns:
            A ranker wrapping the dictionary, corpus, model, index and
            index mapping.
        """
        model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
        artefacts = (TFIDFRanker._DICTIONARY_FILE, TFIDFRanker._CORPUS_FILE,
                     TFIDFRanker._MODEL_FILE, TFIDFRanker._INDEX_FILE,
                     TFIDFRanker._MAPPING_FILE)
        # Every file read below must exist; a partial cache triggers a
        # rebuild instead of failing halfway through loading.
        cache_complete = all(os.path.isfile(model_path + name) for name in artefacts)

        if not force and cache_complete:
            dictionary = corpora.Dictionary.load(model_path + TFIDFRanker._DICTIONARY_FILE)
            mm_corpus = corpora.MmCorpus(model_path + TFIDFRanker._CORPUS_FILE)
            tfidf_model = TfidfModel.load(model_path + TFIDFRanker._MODEL_FILE)
            tfidf_index = SparseMatrixSimilarity.load(model_path + TFIDFRanker._INDEX_FILE)
            # NOTE: pickle can execute arbitrary code while loading; the cache
            # directory must only contain files written by this application.
            with open(model_path + TFIDFRanker._MAPPING_FILE, mode='rb') as file:
                index_mapping = pickle.load(file)
            logging.info('TFIDFRanker : initialized')
            return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                               model=tfidf_model, index=tfidf_index,
                               index_mapping=index_mapping, conf=conf)

        utils.mk_dir_if_not_exists(model_path)
        dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
        # Extract each document's text once and reuse it for both the
        # dictionary and the bag-of-words vectors.
        texts = []
        names = []
        for _, data in dataset.iterrows():
            texts.append(Ranker.get_text(conf, data))
            names.append(data['filename'])
        dictionary = corpora.Dictionary(texts)
        bow_corpus = [dictionary.doc2bow(text) for text in texts]
        index_mapping = TFIDFRanker.build_index_mapping(names)
        corpora.MmCorpus.serialize(model_path + TFIDFRanker._CORPUS_FILE, bow_corpus)
        mm_corpus = corpora.MmCorpus(model_path + TFIDFRanker._CORPUS_FILE)
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                             num_features=mm_corpus.num_terms)
        ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                             model=tfidf_model, index=tfidf_index,
                             index_mapping=index_mapping, conf=conf)
        ranker.persist(model_path)
        logging.info('TFIDFRanker : initialized')
        logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
        logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
        return ranker

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map each position in ``names`` to the integer prefix of that name.

        The prefix is the text before the first ``.`` of the name, converted
        with ``int`` (for example ``'42.json'`` yields ``42``).
        """
        return {index: int(name.split('.')[0]) for index, name in enumerate(names)}

    def get_name(self):
        """Return the ``Ranker.R_TFIDF`` constant naming this ranker type."""
        return Ranker.R_TFIDF