Exemplos de Preprocessor.Preprocessor em Python, exemplos de gui2r.preprocessing.preprocess.Preprocessor.Preprocessor em Python

Exemplo n.º 1

0

Exibir arquivo

 def __init__(self,
              conf: Configuration,
              ranker: Optional[Dict[Text, Ranker]] = None):
     """Store the configuration and set up rankers, preprocessor and expanders.

     Args:
         conf: Configuration kept on the instance and handed to each
             ranker's ``load`` when the default rankers are built.
         ranker: Optional mapping from ranker name to ranker instance. When
             it is ``None`` or empty, the default rankers are loaded.
     """
     self.conf = conf
     if not ranker:
         # Nothing supplied: load every default ranker, in this order.
         default_rankers = (
             (Ranker.R_BM25OKAPI, BM25OkapiRanker),
             (Ranker.R_IWCS, BoolIWCSRanker),
             (Ranker.R_TFIDF, TFIDFRanker),
             (Ranker.R_PRF_KLD_WEIGHTED_BM25,
              LocalPrfKldWeightedBM25ExpanderRanker),
             (Ranker.R_PRF_KLD_CAT_WEIGHTED_BM25,
              LocalPrfKldCatWeightedBM25ExpanderRanker),
             (Ranker.R_PRF_KLD_CAT_BM25,
              LocalPrfKldCatBM25ExpanderRanker),
             (Ranker.R_PRF_KLD_BM25, LocalPrfKldBM25ExpanderRanker),
         )
         ranker = {
             name: ranker_cls.load(conf)
             for name, ranker_cls in default_rankers
         }
     self.ranker = ranker
     self.preprocessor = Preprocessor()
     self.expander = {}

Exemplo n.º 2

0

Exibir arquivo

 def __init__(self, preprocessor: Optional[Preprocessor] = None):
     """Store the preprocessor and reset all statistics counters to zero.

     Args:
         preprocessor: Preprocessor to use. When ``None`` (the default), a
             new ``Preprocessor`` is created for this instance.
     """
     # Build the default lazily: a call expression in the signature is
     # evaluated once at definition time and would be shared by every instance.
     self.preprocessor = Preprocessor() if preprocessor is None else preprocessor
     self.xml_parse_errors = 0
     self.file_count = 0
     self.filter_count = 0
     self.filter_count_cat = 0
     self.filter_lang_count = 0
     self.filter_ads_count = 0

Exemplo n.º 3

0

Exibir arquivo

Arquivo: loc_prf_kld_bm25_weighted_expander.py Projeto: RaWi-Protoyping/RaWi

class LocalPrfKldWeightedBM25ExpanderRanker(Ranker):

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, conf: Configuration, internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame, counter: Dict[Text, int],
                 probs_t_dc: Dict[Text, float], k: Optional[int] = 10, top_w: Optional[int] = 10):
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        self.top_w = top_w

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank once, expand the query from the top results, then rank again.

        Args:
            query: Raw query text.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Forwarded to the internal ranker for both passes.

        Returns:
            The internal ranker's result for the expanded query.
        """
        logging.info('LocPrfKldWeightedRanker: Ranking for "{}"'.format(query))
        query_tokens = LocalPrfKldWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Every original query token gets weight 1.
        weighted_query = [(token, 1) for token in query_tokens]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        feedback_probs = self.get_probs_from_top([doc_index for doc_index, _ in first_pass])
        candidates = self.get_top_expansion_cands(feedback_probs, rm_tokens=query_tokens)
        weighted_query.extend(candidates[:self.top_w])
        return self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Expand the query like ``rank`` and re-rank against a gold standard.

        Args:
            query: Raw query text.
            goldstandard: Document ids forwarded to the internal ranker's ``rank_gs``.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Forwarded to the internal ranker for both passes.

        Returns:
            The internal ranker's ``rank_gs`` result for the expanded query.
        """
        logging.info('LocPrfKldWeightedRanker: Ranking for "{}"'.format(query))
        query_tokens = LocalPrfKldWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Every original query token gets weight 1.
        weighted_query = [(token, 1) for token in query_tokens]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        feedback_probs = self.get_probs_from_top([doc_index for doc_index, _ in first_pass])
        candidates = self.get_top_expansion_cands(feedback_probs, rm_tokens=query_tokens)
        weighted_query.extend(candidates[:self.top_w])
        return self.ranker.rank_gs(weighted_query, goldstandard, rank_cutoff=rank_cutoff)

    def get_top_expansion_cands(self, probs_t_dr: Dict[Text, float],
                                rm_tokens: Optional[List[Text]] = None) -> List[Tuple[Text, float]]:
        scores_kld = {}
        for word, prob_t_dr in probs_t_dr.items():
            if word not in rm_tokens:
                scores_kld[word] = LocalPrfKldWeightedBM25ExpanderRanker.\
                    kullback_leibler_divergence(prob_t_dr=prob_t_dr, prob_t_dc=self.probs_t_dc[word])
        top_words = [(k,v) for k, v in sorted(scores_kld.items(), key=lambda item: item[1], reverse=True)]
        return top_words

    def get_probs_from_top(self, indexes: List[int]):
        """Return relative token frequencies over the first ``self.k`` documents.

        Args:
            indexes: Ranked document ids, matched against the ``fileindex`` column.

        Returns:
            Dict mapping each token to its count divided by the total number
            of tokens in the selected documents.
        """
        feedback_ids = indexes[:self.k]
        in_feedback = self.dataset['fileindex'].isin(feedback_ids)
        feedback_docs = self.dataset.loc[in_feedback, ]
        tokens = []
        for _, row in feedback_docs.iterrows():
            tokens.extend(Ranker.get_text(self.conf, row))
        total = len(tokens)
        return {word: (count / total) for word, count in Counter(tokens).items()}

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        pass

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "LocalPrfKldWeightedBM25ExpanderRanker":
        """Build a ranker from the dataset described by ``conf``.

        Args:
            conf: Configuration passed to the extractor, the internal ranker
                and the counter construction.
            force: Accepted but not used by this method.
            persist: Forwarded to ``BM25OkapiWeightedRanker.load``.

        Returns:
            A ranker using the default ``k`` and ``top_w``.
        """
        cls = LocalPrfKldWeightedBM25ExpanderRanker

        def file_index(filename):
            # The text before the first dot of the file name, parsed as an int.
            return int(filename.split('.')[0])

        dataset = cls.extractor.load_dataset(conf=conf)
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(file_index)
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = cls.create_counter(conf, dataset)
        return cls(conf=conf,
                   internal_ranker=internal_ranker,
                   dataset=dataset,
                   counter=counter,
                   probs_t_dc=probs)

    @staticmethod
    def create_counter(conf: Configuration, dataset: pd.DataFrame) -> Tuple[Dict[Text, int], Dict[Text, float]]:
        """Count every token in the dataset and derive relative frequencies.

        Args:
            conf: Configuration passed to ``Ranker.get_text``.
            dataset: Data frame whose rows are turned into token lists.

        Returns:
            ``(counter, probs)``: token counts, and each count divided by the
            total number of tokens.
        """
        documents = [Ranker.get_text(conf, row) for _, row in dataset.iterrows()]
        counter = {}
        total = 0
        for tokens in documents:
            total += len(tokens)
            for token in tokens:
                counter[token] = counter.get(token, 0) + 1
        probs = {token: (count / total) for token, count in counter.items()}
        return counter, probs

    def get_name(self):
        """Return the ``Ranker`` name constant that identifies this ranker."""
        name = Ranker.R_PRF_KLD_WEIGHTED_BM25
        return name

Exemplo n.º 4

0

Exibir arquivo

class LocalPrfKldCatWeightedBM25ExpanderRanker(Ranker):

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self,
                 conf: Configuration,
                 internal_ranker: BM25OkapiWeightedRanker,
                 dataset: pd.DataFrame,
                 counter: Dict[Text, Dict[Text, int]],
                 probs_t_dc: Dict[Text, Dict[Text, float]],
                 k: Optional[int] = 10,
                 top_w_per_cat: Optional[Dict[Text, int]] = None,
                 cats: Optional[List[Text]] = None):
        self.conf = conf
        self.ranker = internal_ranker
        self.dataset = dataset
        self.counter = counter
        self.probs_t_dc = probs_t_dc
        self.k = k
        if not top_w_per_cat:
            top_w_per_cat = {
                Extractor.DATA_ACTIVITY_NAME: 2,
                Extractor.DATA_TEXT_VISIBLE: 2,
                Extractor.DATA_TEXT_INVISIBLE: 2,
                Extractor.DATA_RES_IDS_VISIBLE: 2,
                Extractor.DATA_RES_IDS_INVISIBLE: 2,
                Extractor.DATA_ICON_IDS: 2
            }
        self.top_w_per_cat = top_w_per_cat
        if not cats:
            cats = [
                'text_activity_name', 'text_visible', 'text_invisible',
                'text_res_ids', 'text_icon_ids'
            ]
        self.cats = cats

    def rank(self,
             query: Text,
             rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank once, expand the query per category, then rank again.

        Args:
            query: Raw query text.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Forwarded to the internal ranker for both passes.

        Returns:
            The internal ranker's result for the expanded query.
        """
        logging.info(
            'LocPrfKldCatWeightedRanker: Ranking for "{}"'.format(query))
        query_tokens = LocalPrfKldCatWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Every original query token gets weight 1.
        weighted_query = [(token, 1) for token in query_tokens]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        feedback_ids = [doc_index for doc_index, _ in first_pass]
        feedback_probs = self.get_probs_from_top(feedback_ids)
        expansion = self.get_top_expansion_cands_per_cat(
            feedback_probs,
            self.probs_t_dc,
            rm_tokens=query_tokens,
            ensure_unique=True)
        weighted_query.extend(expansion)
        return self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)

    def rank_gs(self,
                query: Text,
                goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Expand the query like ``rank`` and re-rank against a gold standard.

        Args:
            query: Raw query text.
            goldstandard: Document ids forwarded to the internal ranker's ``rank_gs``.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Forwarded to the internal ranker for both passes.

        Returns:
            The internal ranker's ``rank_gs`` result for the expanded query.
        """
        logging.info(
            'LocPrfKldCatWeightedRanker: Ranking for "{}"'.format(query))
        query_tokens = LocalPrfKldCatWeightedBM25ExpanderRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed)
        # Every original query token gets weight 1.
        weighted_query = [(token, 1) for token in query_tokens]
        first_pass = self.ranker.rank(weighted_query, rank_cutoff=rank_cutoff)
        feedback_ids = [doc_index for doc_index, _ in first_pass]
        feedback_probs = self.get_probs_from_top(feedback_ids)
        expansion = self.get_top_expansion_cands_per_cat(
            feedback_probs,
            self.probs_t_dc,
            rm_tokens=query_tokens,
            ensure_unique=True)
        weighted_query.extend(expansion)
        return self.ranker.rank_gs(weighted_query,
                                   goldstandard,
                                   rank_cutoff=rank_cutoff)

    def get_top_expansion_cands_per_cat(
            self,
            probs_t_dr: Dict[Text, Dict[Text, float]],
            probs_t_dc: Dict[Text, Dict[Text, float]],
            rm_tokens: Optional[List[Text]] = None,
            ensure_unique: Optional[bool] = True) -> List[Tuple[Text, float]]:
        if set(probs_t_dr.keys()) != set(probs_t_dc.keys()):
            raise ValueError(
                'ProbR and ProbC dictionaries do not contain the same categories'
            )
        top_words = []
        for index, name in enumerate(self.cats):
            top_words_cat = self.get_top_expansion_cands(
                probs_t_dr=probs_t_dr[name],
                probs_t_dc=probs_t_dc[name],
                rm_tokens=rm_tokens)
            top_words_cat = top_words_cat[:self.top_w_per_cat[name]]
            top_words.extend(top_words_cat)
        if ensure_unique:
            top_words = list(set(top_words))
        return top_words

    @staticmethod
    def get_top_expansion_cands(
            probs_t_dr: Dict[Text, float],
            probs_t_dc: Dict[Text, float],
            rm_tokens: Optional[List[Text]] = None
    ) -> List[Tuple[Text, float]]:
        scores_kld = {}
        for word, prob_t_dr in probs_t_dr.items():
            if word not in rm_tokens:
                scores_kld[word] = LocalPrfKldCatWeightedBM25ExpanderRanker.\
                    kullback_leibler_divergence(prob_t_dr=prob_t_dr, prob_t_dc=probs_t_dc[word])
        top_words = [(k, v) for k, v in sorted(
            scores_kld.items(), key=lambda item: item[1], reverse=True)]
        return top_words

    def get_probs_from_top(
            self, indexes: List[int]) -> Dict[Text, Dict[Text, float]]:
        top_indexes = indexes[:self.k]
        results = self.dataset.loc[
            self.dataset['fileindex'].isin(top_indexes), ]
        all_counts = {name: {} for name in self.cats}
        all_probs = {name: {} for name in self.cats}
        all_sums = {name: 0 for name in self.cats}
        for index, data in results.iterrows():
            for name in self.cats:
                text = data[name]
                for token in text:
                    if all_counts[name].get(token):
                        all_counts[name][token] += 1
                    else:
                        all_counts[name][token] = 1
                all_sums[name] += len(text)
        for name, counts in all_counts.items():
            all_probs[name] = {
                token: (count / all_sums[name])
                for token, count in counts.items()
            }
        return all_probs

    @staticmethod
    def kullback_leibler_divergence(prob_t_dr: float, prob_t_dc: float):
        return prob_t_dr * (np.log((prob_t_dr / prob_t_dc)))

    def persist(self, path: Optional[Text]) -> None:
        pass

    @staticmethod
    def load(
        conf: Configuration,
        force: Optional[bool] = False,
        persist: Optional[bool] = True
    ) -> "LocalPrfKldCatWeightedBM25ExpanderRanker":
        """Build a ranker from the dataset described by ``conf``.

        Args:
            conf: Configuration passed to the extractor and the internal
                ranker; its ``text_segments_used`` selects the categories.
            force: Accepted but not used by this method.
            persist: Forwarded to ``BM25OkapiWeightedRanker.load``.

        Returns:
            A ranker limited to two expansion terms per text segment.
        """
        cls = LocalPrfKldCatWeightedBM25ExpanderRanker

        def file_index(filename):
            # The text before the first dot of the file name, parsed as an int.
            return int(filename.split('.')[0])

        dataset = cls.extractor.load_dataset(conf=conf)
        dataset.loc[:, 'fileindex'] = dataset.loc[:, 'filename'].apply(file_index)

        all_segments = [
            Extractor.DATA_ACTIVITY_NAME, Extractor.DATA_TEXT_VISIBLE,
            Extractor.DATA_TEXT_INVISIBLE, Extractor.DATA_RES_IDS_VISIBLE,
            Extractor.DATA_RES_IDS_INVISIBLE, Extractor.DATA_ICON_IDS
        ]
        configured = conf.text_segments_used
        # A single DATA_ALL entry stands for every individual segment.
        uses_all = len(configured) == 1 and configured[0] == Extractor.DATA_ALL
        text_segments = all_segments if uses_all else conf.text_segments_used
        logging.info(
            'LocPrfKldCatWeightedRanker: Using Text Segments: {}'.format(
                text_segments))
        segment_top_ws = {segment: 2 for segment in all_segments}
        internal_ranker = BM25OkapiWeightedRanker.load(conf, persist=persist)
        counter, probs = cls.create_counter(dataset, text_segments=text_segments)
        return cls(conf=conf,
                   internal_ranker=internal_ranker,
                   dataset=dataset,
                   counter=counter,
                   probs_t_dc=probs,
                   cats=text_segments,
                   top_w_per_cat=segment_top_ws)

    @staticmethod
    def create_counter(dataset: pd.DataFrame, text_segments: List[Text]) -> \
            Tuple[Dict[Text, Dict[Text, int]], Dict[Text, Dict[Text, float]]]:
        all_counts = {name: {} for name in text_segments}
        all_probs = {}
        all_sums = {name: 0 for name in text_segments}
        for (index, data) in dataset.iterrows():
            for name in text_segments:
                text = data[name]
                for token in text:
                    if all_counts[name].get(token):
                        all_counts[name][token] += 1
                    else:
                        all_counts[name][token] = 1
                all_sums[name] += len(text)
        for name, counts in all_counts.items():
            all_probs[name] = {
                token: (count / all_sums[name])
                for token, count in counts.items()
            }
        return all_counts, all_probs

    def get_name(self):
        """Return the ``Ranker`` name constant that identifies this ranker."""
        name = Ranker.R_PRF_KLD_CAT_WEIGHTED_BM25
        return name

Exemplo n.º 5

0

Exibir arquivo

Arquivo: bm25okapi_ranker.py Projeto: RaWi-Protoyping/RaWi

class BM25OkapiRanker(Ranker):

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, model: BM25Okapi, index_mapping: Dict[int, int],
                 conf: Configuration):
        self.model = model
        self.index_mapping = index_mapping
        self.conf = conf

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Preprocess the query and return the best-scoring documents.

        Args:
            query: Raw query text.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Maximum number of results.

        Returns:
            ``(document id, normalised score)`` tuples, best first.
        """
        logging.info('BM25OkapiRanker: Ranking for "{}"'.format(query))
        query_tokens = BM25OkapiRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        ranked = []
        for position, score in self.get_top_n(query_tokens, n=rank_cutoff):
            # Translate the corpus position into the document id.
            ranked.append((self.index_mapping[position], score))
        return ranked

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank the whole corpus and keep only gold-standard documents.

        Args:
            query: Raw query text.
            goldstandard: Document ids to keep in the result.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Accepted but not used; the full corpus is scored.

        Returns:
            ``(document id, normalised score)`` tuples, best first,
            restricted to ids in ``goldstandard``.
        """
        logging.info('BM25OkapiRanker: Ranking for "{}"'.format(query))
        query_tokens = BM25OkapiRanker.preprocessor.preprocess_text(
            query,
            tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        ranked = []
        for position, score in self.get_top_n(query_tokens, n=self.model.corpus_size):
            doc_id = self.index_mapping[position]
            if doc_id in goldstandard:
                ranked.append((doc_id, score))
        return ranked

    def get_top_n(self, query: List[Text], n: Optional[int] = 5):
        scores = self.model.get_scores(query)
        top_n_args = np.argsort(scores)[::-1][:n]
        top_n_sims = np.sort(scores)[::-1][:n]
        max_sim = np.sort(scores)[::-1][0]
        return [(arg, (sim / max_sim)) for (arg, sim) in zip(top_n_args, top_n_sims)]

    def persist(self, path: Optional[Text]) -> None:
        """Pickle the model and the index mapping into files under ``path``.

        Args:
            path: Prefix prepended verbatim to each file name.
        """
        artefacts = (
            ('bm25okapi.pickle', self.model),
            ('bm25okapi_index_mapping.pickle', self.index_mapping),
        )
        for file_name, payload in artefacts:
            with open(path + file_name, mode='wb') as file:
                pickle.dump(payload, file)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BM25OkapiRanker":
        """Load the pickled ranker for ``conf`` or build and persist a new one.

        Args:
            conf: Configuration providing the model directory, the dataset
                and the preprocessing options.
            force: When true, rebuild even if the pickled files exist.
            persist: Accepted but not used; a newly built ranker is always persisted.

        Returns:
            A ready-to-use ranker.
        """
        model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
        model_file = model_path + 'bm25okapi.pickle'
        mapping_file = model_path + 'bm25okapi_index_mapping.pickle'
        cache_usable = (not force
                        and os.path.exists(model_path)
                        and os.path.isfile(model_file)
                        and os.path.isfile(mapping_file))
        if cache_usable:
            with open(model_file, mode='rb') as file:
                bm25 = pickle.load(file)
                logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path))
            with open(mapping_file, mode='rb') as file:
                index_mapping = pickle.load(file)
                logging.info('BM25OkapiRanker : loading bm25_index_mapping.pickle from {}'.format(model_path))
            logging.info('BM25OkapiRanker : initialized')
            return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)

        # Nothing usable on disk (or a rebuild was forced): build from the dataset.
        utils.mk_dir_if_not_exists(model_path)
        dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
        text_and_name = [(Ranker.get_text(conf, row), row['filename'])
                         for _, row in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*text_and_name))
        index_mapping = BM25OkapiRanker.build_index_mapping(names)
        bm25 = BM25Okapi(bow_corpus)
        logging.info('BM25OkapiRanker : initialized')
        bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
        bm25_ranker.persist(model_path)
        return bm25_ranker

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        mapping ={}
        for index, name in enumerate(names):
            mapping[index] = int(name.split('.')[0])
        return mapping

    def get_name(self):
        """Return the ``Ranker`` name constant that identifies this ranker."""
        name = Ranker.R_BM25OKAPI
        return name

Exemplo n.º 6

0

Exibir arquivo

class BoolIWCSRanker(Ranker):
    """This is the implementation of the neural Bag-Of-Words (nBOW) model using pretrained word2vec embeddings and
       and TF-IDF to compute a weighted mean embedding for the documents and queries
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    def __init__(self, inverted_index: Dict[Text, Set[str]],
                 bool_dictionary: List[str], conf: Configuration,
                 dictionary: corpora.Dictionary, bow_corpus: corpora.MmCorpus,
                 model: TfidfModel, index: SparseMatrixSimilarity,
                 index_mapping: Dict[int,
                                     int], inverse_index_mapping: Dict[int,
                                                                       int],
                 doc_embedding: np.array, model_embedding: KeyedVectors):
        self.inverted_index = inverted_index
        self.bool_dictionary = bool_dictionary
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.inverse_index_mapping = inverse_index_mapping
        self.conf = conf
        self.doc_embedding = doc_embedding
        self.model_embedding = model_embedding

    @staticmethod
    def inverted_index(conf: Configuration,
                       dataset: pd.DataFrame) -> Dict[Text, Set[str]]:
        """Build a token-to-file-names index over the dataset.

        Args:
            conf: Configuration passed to ``Ranker.get_text``.
            dataset: Data frame with a ``filename`` column.

        Returns:
            Dict mapping each token to the set of file names whose text
            contains it.
        """
        inv_index = {}
        for _, data in dataset.iterrows():
            file_name = data['filename']
            for token in Ranker.get_text(conf, data):
                inv_index.setdefault(token, set()).add(file_name)
        return inv_index

    def rank(self,
             query: Text,
             rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank the documents that share a token with the query by cosine similarity.

        Candidates come from the inverted index; they are ordered by the
        similarity between their embedding and the TF-IDF weighted query embedding.

        Args:
            query: Raw query text.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Maximum number of results.

        Returns:
            ``(document id, similarity)`` tuples, best first. Empty when no
            query token occurs in the inverted index.
        """
        logging.info('nBOWRanker: Ranking for "{}"'.format(query))
        preproc_query = self.preprocessor.\
            preprocess_text(query, tokenized=True, remove_stopwords=self.conf.preprocesing_rm_stopwords,
                            stemmed=self.conf.preprocesing_stemmed,
                            stemming=self.conf.preprocessing_stemmer)
        query_tokens = [
            word for word in preproc_query if word in self.bool_dictionary
        ]
        result = self.get_bool_or_doc_matches(query_tokens)
        if not result:
            # No indexed token in the query: there is nothing to rank, and
            # the matcher may hand back None in this case.
            return []
        result_indexes = [int(index.split('.')[0]) for index in result]
        result_inv_indexes = [
            self.inverse_index_mapping[index] for index in result_indexes
        ]
        # TFIDF model
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        tfidf_query_bow = self.model[preproc_query_bow]
        tfidf_query_embedded, _ = BoolIWCSRanker.get_embedding(
            tfidf_query_bow, self.model_embedding, self.dictionary)
        rankings = self.get_ranking_over_doc_matches(result_inv_indexes,
                                                     tfidf_query_embedded)
        rankings_inv_indexes = [(self.index_mapping[index], sim)
                                for index, sim in rankings]
        return rankings_inv_indexes[:min(len(result_indexes), rank_cutoff)]

    def rank_gs(self,
                query: Text,
                goldstandard: Set[int],
                rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank the gold-standard documents for the query.

        Gold-standard documents matched by the query are ordered by
        similarity; the others are appended once each with the value ``1``.

        Args:
            query: Raw query text.
            goldstandard: Document ids to rank.
            rank_threshold: Accepted but not used by this method.
            rank_cutoff: Accepted but not used by this method.

        Returns:
            ``(document id, value)`` tuples containing every gold-standard id
            exactly once.
        """
        logging.info('nBOWRanker: Ranking for "{}"'.format(query))
        preproc_query = self.preprocessor. \
            preprocess_text(query, tokenized=True, remove_stopwords=self.conf.preprocesing_rm_stopwords,
                            stemmed=self.conf.preprocesing_stemmed,
                            stemming=self.conf.preprocessing_stemmer)
        query_tokens = [
            word for word in preproc_query if word in self.bool_dictionary
        ]
        # The matcher may hand back None when no query token is indexed.
        result = self.get_bool_or_doc_matches(query_tokens) or set()
        result_indexes = [int(index.split('.')[0]) for index in result]
        result_inv_indexes = [
            self.inverse_index_mapping[index] for index in result_indexes
        ]
        if result_inv_indexes:
            # TFIDF model
            preproc_query_bow = self.dictionary.doc2bow(preproc_query)
            tfidf_query_bow = self.model[preproc_query_bow]
            tfidf_query_embedded, _ = BoolIWCSRanker.get_embedding(
                tfidf_query_bow, self.model_embedding, self.dictionary)
            rankings = self.get_ranking_over_doc_matches(result_inv_indexes,
                                                         tfidf_query_embedded)
        else:
            rankings = []
        rankings_inv_indexes = [(self.index_mapping[index], sim)
                                for index, sim in rankings]
        init_result = [(index, conf) for index, conf in rankings_inv_indexes
                       if index in goldstandard]
        # Compare ids with ids: testing an id against the (id, score) tuples
        # never matches and would append every gold-standard id a second time.
        ranked_ids = {index for index, _ in init_result}
        remains = [(elem, 1) for elem in goldstandard
                   if elem not in ranked_ids]
        return init_result + remains
    def get_ranking_over_doc_matches(self, doc_indexes: List[int], query_embedded: np.array) \
            -> List[Tuple[int, float]]:
        query_embedded_reshaped = query_embedded.reshape(1, -1)
        relevant_corpus = self.doc_embedding[doc_indexes, :]
        distances = spatial.distance.cdist(relevant_corpus,
                                           query_embedded_reshaped, 'cosine')
        similarities = 1 - distances
        results = [(index, similarities[enum_index][0])
                   for enum_index, index in enumerate(doc_indexes)]
        results_sorted = [
            (k, v)
            for k, v in sorted(results, key=lambda item: item[1], reverse=True)
        ]
        return results_sorted

    def get_bool_or_doc_matches(self, query_tokens: List[Text]) -> Set[Text]:
        result = None
        for word in query_tokens:
            if result is None:
                result = self.inverted_index.get(word)
            else:
                intermediate_results = self.inverted_index.get(word)
                result = result.union(intermediate_results)
        return result

    @staticmethod
    def get_embedding(
            tfidf_words: List[Tuple[int, float]], embedding: KeyedVectors,
            id2word: corpora.Dictionary) -> Tuple[np.array, List[Text]]:
        missing_words = []
        embeds = []
        doc_embed = np.zeros(embedding.vector_size)
        for word_index, weight in tfidf_words:
            word = id2word[word_index]
            if word in embedding.vocab:
                word_embed = embedding[word]
                embeds.append(word_embed)
                doc_embed += word_embed * weight
            else:
                missing_words.append(word)
        return doc_embed, missing_words

    @staticmethod
    def embed_corpus(tfidf_corpus: List[List[Tuple[int, float]]],
                     embedding: KeyedVectors,
                     id2word: corpora.Dictionary) -> np.array:
        return np.array([
            BoolIWCSRanker.get_embedding(doc, embedding, id2word)[0]
            for doc in tfidf_corpus
        ])

    def persist(self, path: Optional[Text]) -> None:
        """Write every artefact of this ranker to files under ``path``.

        Args:
            path: Prefix prepended verbatim to each file name.
        """
        with open(path + 'dictionary.txt', mode='w') as file:
            # One token per line.
            file.writelines(word + '\n' for word in self.bool_dictionary)
        with open(path + 'inverted_index.pickle', mode='wb') as file:
            pickle.dump(self.inverted_index, file)
        self.dictionary.save(path + 'dict.dictionary')
        corpora.MmCorpus.serialize(path + 'corpus.mm', self.bow_corpus)
        self.model.save(path + 'tfidf.model')
        self.index.save(path + 'tfidf.index')
        mappings = (
            ('index_mapping.pickle', self.index_mapping),
            ('inverse_index_mapping.pickle', self.inverse_index_mapping),
        )
        for file_name, mapping in mappings:
            with open(path + file_name, mode='wb') as file:
                pickle.dump(mapping, file)
        np.save(path + 'doc_embedding.npy', self.doc_embedding)

    @staticmethod
    def load(conf: Configuration,
             force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "BoolIWCSRanker":
        """Load the persisted ranker for ``conf`` or build and persist a new one.

        The persisted artefacts are reused when the model directory exists
        and contains ``inverted_index.pickle``, ``corpus.mm`` and
        ``tfidf.model``; the other files are not checked beforehand.

        Args:
            conf: Configuration providing the model directory, the dataset
                and the preprocessing options.
            force: When true, rebuild even if the persisted files exist.
            persist: Accepted but not used; a newly built ranker is always persisted.

        Returns:
            A ready-to-use ranker.
        """
        model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
        embedding_file = '../resources/embeddings/GoogleNews-vectors-negative300.bin'
        cache_usable = (not force
                        and os.path.exists(model_path)
                        and os.path.isfile(model_path + 'inverted_index.pickle')
                        and os.path.isfile(model_path + 'corpus.mm')
                        and os.path.isfile(model_path + 'tfidf.model'))

        if cache_usable:
            dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
            tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
            with open(model_path + 'index_mapping.pickle', mode='rb') as file:
                index_mapping = pickle.load(file)
                logging.info('nBOWRanker : TFIDF indexmap initialized')
            with open(model_path + 'inverse_index_mapping.pickle', mode='rb') as file:
                inverse_index_mapping = pickle.load(file)
                logging.info('nBOWRanker : TFIDF invindexmap initialized')
            with open(model_path + 'inverted_index.pickle', mode='rb') as file:
                inverted_index = pickle.load(file)
                bool_dictionary = inverted_index.keys()
            doc_embedding = np.load(model_path + 'doc_embedding.npy')
            logging.info('nBOWRanker : Doc embeddings loaded')
            word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
            logging.info('nBOWRanker : Embedding model loaded')
            return BoolIWCSRanker(inverted_index,
                                  bool_dictionary,
                                  conf,
                                  dictionary,
                                  mm_corpus,
                                  tfidf_model,
                                  tfidf_index,
                                  index_mapping,
                                  inverse_index_mapping,
                                  doc_embedding=doc_embedding,
                                  model_embedding=word2vec)

        # Nothing usable on disk (or a rebuild was forced): build from the dataset.
        utils.mk_dir_if_not_exists(model_path)
        # Create the TFIDF model and dictionary
        dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
        dictionary = corpora.Dictionary(
            [Ranker.get_text(conf, row) for _, row in dataset.iterrows()])
        bow_and_name = [(dictionary.doc2bow(Ranker.get_text(conf, row)), row['filename'])
                        for _, row in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_and_name))
        index_mapping = BoolIWCSRanker.build_index_mapping(names)
        inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(names)
        corpus_file = model_path + 'corpus.mm'
        corpora.MmCorpus.serialize(corpus_file, bow_corpus)
        mm_corpus = corpora.MmCorpus(corpus_file)
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(
            tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
        logging.info('nBOWRanker : TFIDF initialized')
        logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model))
        logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index))
        # Create boolean index
        inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
        bool_dictionary = inverted_index.keys()
        # Load word2vec embedding and embed the corpus
        word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
        tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
        doc_embedding = BoolIWCSRanker.embed_corpus(tfidf_corpus, word2vec, dictionary)
        logging.info('nBOWRanker : Embedded docs shape : {}'.format(
            doc_embedding.shape))
        ranker = BoolIWCSRanker(inverted_index,
                                bool_dictionary,
                                conf,
                                dictionary,
                                bow_corpus,
                                tfidf_model,
                                tfidf_index,
                                index_mapping,
                                inverse_index_mapping,
                                doc_embedding=doc_embedding,
                                model_embedding=word2vec)
        ranker.persist(model_path)
        return ranker

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        mapping = {}
        for index, name in enumerate(names):
            mapping[index] = int(name.split('.')[0])
        return mapping

    @staticmethod
    def build_inverse_index_mapping(names: List[Text]) -> Dict[int, int]:
        mapping = {}
        for index, name in enumerate(names):
            mapping[int(name.split('.')[0])] = index
        return mapping

    def get_name(self):
        """Return the ``Ranker`` name constant that identifies this ranker."""
        name = Ranker.R_IWCS
        return name

Exemplo n.º 7

0

Exibir arquivo

Arquivo: vsm_tfidf_ranker.py Projeto: RaWi-Protoyping/RaWi

class TFIDFRanker(Ranker):
    """Ranker that scores documents by TF-IDF similarity to a query.

    A query is preprocessed, converted to a bag-of-words vector with the
    ranker's dictionary, weighted by the TF-IDF model and compared against
    the similarity index. Index positions are translated back to document
    ids through ``index_mapping``.
    """

    preprocessor = Preprocessor()
    extractor = Extractor(preprocessor)

    # Every artifact load() reads from the model directory. All of them must
    # be present for a persisted model to be reused instead of rebuilt.
    _MODEL_FILES = ('dict.dictionary', 'corpus.mm', 'tfidf.model',
                    'tfidf.index', 'index_mapping.pickle')

    def __init__(self, dictionary: corpora.Dictionary,
                 bow_corpus: corpora.MmCorpus,
                 model: TfidfModel, index: SparseMatrixSimilarity,
                 index_mapping: Dict[int, int],
                 conf: Configuration):
        """Store the prebuilt model components.

        :param dictionary: token/id dictionary used to vectorize queries.
        :param bow_corpus: bag-of-words corpus the model was built from.
        :param model: TF-IDF model applied to query vectors.
        :param index: similarity index queried with the weighted vector.
        :param index_mapping: maps index positions to document ids.
        :param conf: configuration providing the preprocessing settings.
        """
        self.dictionary = dictionary
        self.bow_corpus = bow_corpus
        self.model = model
        self.index = index
        self.index_mapping = index_mapping
        self.conf = conf

    def _query_similarities(self, query: Text):
        """Preprocess ``query`` and return its similarity to every indexed document."""
        logging.info('TFIDFRanker: Ranking for "{}"'.format(query))
        preproc_query = TFIDFRanker.preprocessor.preprocess_text(
            query, tokenized=True,
            remove_stopwords=self.conf.preprocesing_rm_stopwords,
            stemmed=self.conf.preprocesing_stemmed,
            stemming=self.conf.preprocessing_stemmer)
        preproc_query_bow = self.dictionary.doc2bow(preproc_query)
        return self.index[self.model[preproc_query_bow]]

    def rank(self, query: Text, rank_threshold: Optional[float] = 0.0,
             rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank all indexed documents against ``query``.

        :param query: raw query text.
        :param rank_threshold: accepted for interface compatibility; not
            applied to the results.
        :param rank_cutoff: maximum number of results returned.
        :return: ``(document id, similarity)`` pairs sorted by descending
            similarity, truncated to ``rank_cutoff`` entries.
        """
        similarities = self._query_similarities(query)
        scored = [(self.index_mapping[index], sim)
                  for index, sim in enumerate(similarities)]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:rank_cutoff]

    def rank_gs(self, query: Text, goldstandard: Set[int], rank_threshold: Optional[float] = 0.0,
                rank_cutoff: Optional[int] = 100) -> List[Tuple[int, float]]:
        """Rank only the documents contained in ``goldstandard``.

        :param query: raw query text.
        :param goldstandard: document ids to keep in the result.
        :param rank_threshold: accepted for interface compatibility; not
            applied to the results.
        :param rank_cutoff: accepted for interface compatibility; the result
            is not truncated.
        :return: ``(document id, similarity)`` pairs for the gold-standard
            documents, sorted by descending similarity.
        """
        similarities = self._query_similarities(query)
        scored = []
        for index, sim in enumerate(similarities):
            doc_id = self.index_mapping[index]
            if doc_id in goldstandard:
                scored.append((doc_id, sim))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored

    def _persist_model_files(self, path: Text) -> None:
        """Write every artifact except the bag-of-words corpus below ``path``."""
        self.dictionary.save(path + 'dict.dictionary')
        self.model.save(path + 'tfidf.model')
        self.index.save(path + 'tfidf.index')
        with open(path + 'index_mapping.pickle', mode='wb') as file:
            pickle.dump(self.index_mapping, file)

    def persist(self, path: Optional[Text]) -> None:
        """Write all model artifacts below ``path``.

        ``path`` is used as a plain string prefix, so it has to end with a
        path separator.

        NOTE(review): if ``self.bow_corpus`` is an ``MmCorpus`` backed by
        ``path + 'corpus.mm'``, this serializes the corpus onto the file it
        is being read from - confirm that this is safe before relying on it.
        """
        corpora.MmCorpus.serialize(path + 'corpus.mm', self.bow_corpus)
        self._persist_model_files(path)

    @staticmethod
    def load(conf: Configuration, force: Optional[bool] = False,
             persist: Optional[bool] = True) -> "TFIDFRanker":
        """Load a persisted ranker, or build one from the dataset.

        The model is rebuilt when ``force`` is set or when any of the
        persisted artifacts is missing from the model directory.

        :param conf: configuration providing the model path and description.
        :param force: rebuild even if a persisted model exists.
        :param persist: store the freshly built model for later reuse. The
            bag-of-words corpus is written in any case because the model is
            built from the serialized corpus.
        :return: a ready-to-use ranker.
        """
        model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
        is_complete = all(os.path.isfile(model_path + name)
                          for name in TFIDFRanker._MODEL_FILES)
        if force or not is_complete:
            utils.mk_dir_if_not_exists(model_path)
            dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
            # Extract the text of every row once and reuse it for both the
            # dictionary and the bag-of-words vectors.
            texts, names = [], []
            for _, data in dataset.iterrows():
                texts.append(list(Ranker.get_text(conf, data)))
                names.append(data['filename'])
            dictionary = corpora.Dictionary(texts)
            bow_corpus = [dictionary.doc2bow(text) for text in texts]
            index_mapping = TFIDFRanker.build_index_mapping(names)
            corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
            mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
            tfidf_model = TfidfModel(mm_corpus)
            tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                                 num_features=mm_corpus.num_terms)
            ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                                 model=tfidf_model, index=tfidf_index,
                                 index_mapping=index_mapping, conf=conf)
            if persist:
                # corpus.mm was written above and mm_corpus streams from it,
                # so only the remaining artifacts are stored here.
                ranker._persist_model_files(model_path)
            logging.info('TFIDFRanker : initialized')
            logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
            logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
            return ranker

        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        # SECURITY: pickle executes arbitrary code while loading; only point
        # conf.path_models at directories whose contents are trusted.
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
        logging.info('TFIDFRanker : initialized')
        return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                           model=tfidf_model, index=tfidf_index,
                           index_mapping=index_mapping, conf=conf)

    @staticmethod
    def build_index_mapping(names: List[Text]) -> Dict[int, int]:
        """Map each position in ``names`` to the integer before the first ``'.'`` of its name.

        :raises ValueError: if a name's prefix is not a valid integer.
        """
        return {index: int(name.split('.')[0])
                for index, name in enumerate(names)}

    def get_name(self):
        """Return the identifier constant of this ranker (``Ranker.R_TFIDF``)."""
        return Ranker.R_TFIDF