Example #1
File: preprocess.py Project: yakzan/ktext
from typing import List, Union
from collections import Counter
from itertools import chain
import logging

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing.sequence import pad_sequences

# Note: processor_base and the helper functions apply_parallel, build_corpus,
# count_len, get_time, and time_diff are defined elsewhere in the ktext project
# and are assumed to be importable here.


class processor(processor_base):
    """
    Pre-process text in memory.

    Includes utilities for cleaning, tokenization, and vectorization in parallel.
    """
    def __init__(self,
                 hueristic_pct_padding: float = .90,
                 append_indicators: bool = False,
                 keep_n: int = 150000,
                 padding: str = 'pre',
                 padding_maxlen: Union[int, None] = None,
                 truncating: str = 'post'):
        """
        Parameters:
        ----------
        hueristic_pct_padding: float
            This parameter is only used if `padding_maxlen` = None.  A histogram
            of document lengths is calculated, and the maxlen is set to the length
            at the `hueristic_pct_padding` percentile.
        append_indicators: bool
            If True, will append the tokens '_start_' and '_end_' to the beginning
            and end of your tokenized documents.  This can be useful when training
            seq2seq models.
        keep_n: int = 150000
            This is the maximum size of your vocabulary (unique number of words
            allowed).  Consider limiting this to a reasonable size based upon
            your corpus.
        padding : str
            'pre' or 'post', pad either before or after each sequence.
        padding_maxlen : int or None
            Maximum sequence length; longer sequences are truncated and shorter
            sequences are padded with zeros (before or after, according to
            `padding`).  Note that if this is specified, `hueristic_pct_padding`
            is ignored.
        truncating : str
            'pre' or 'post', remove values from sequences larger than padding_maxlen
            either in the beginning or in the end of the sequence.

        See https://keras.io/preprocessing/sequence/

        Attributes:
        -----------
        vocabulary : gensim.corpora.dictionary.Dictionary
            This is a gensim object that is built after parsing all the tokens
            in your corpus.
        n_tokens : int
            The total number of tokens in the corpus.  Will be less than or
            equal to keep_n
        id2token : dict
            dict with {int: str} ex: {2: 'the', 3: 'cat'}
            this is used for decoding predictions back to tokens.
        token2id : dict
            dict with {str: int} ex: {'the': 2, 'cat': 3}
            this is used for converting tokens to integers.
        document_length_stats : pandas.DataFrame
            histogram of document lengths.  Can be used to decide padding_maxlen.
        """
        super().__init__()
        self.hueristic_pct = hueristic_pct_padding
        self.append_indicators = append_indicators
        self.keep_n = keep_n
        self.padding = padding
        self.padding_maxlen = padding_maxlen
        self.truncating = truncating

        # These are placeholders for data that will be collected or calculated
        self.vocabulary = Dictionary()
        self.n_tokens = None
        self.id2token = None
        self.token2id = None
        self.document_length_histogram = Counter()
        self.document_length_stats = None
        self.doc_length_huerestic = None

        # These values are 'hardcoded' for now
        self.padding_value = 0.0
        self.padding_dtype = 'int32'
        self.start_tok = '_start_'
        self.end_tok = '_end_'
        self.keep_tokens = [self.start_tok, self.end_tok]

    def process_text(self, text: List[str]) -> List[List[str]]:
        """Combine the cleaner and tokenizer."""
        return self.__apply_tokenizer(self.__apply_cleaner(text))

    def __apply_cleaner(self, data: List[str]) -> List[str]:
        """Apply the cleaner over a list."""
        return [self.cleaner(doc) for doc in data]

    def __apply_tokenizer(self, data: List[str]) -> List[List[str]]:
        """Apply the tokenizer over a list."""
        if self.append_indicators:
            tmp = [[self.start_tok] + self.tokenizer(doc) + [self.end_tok]
                   for doc in data]
            return tmp
        else:
            return [self.tokenizer(doc) for doc in data]

    def parallel_process_text(self, data: List[str]) -> List[List[str]]:
        """Apply cleaner -> tokenizer."""
        return apply_parallel(data, self.process_text)

    def generate_doc_length_stats(self):
        """Analyze document length statistics for padding strategy"""
        hueristic = self.hueristic_pct
        histdf = (pd.DataFrame(
            [(a, b) for a, b in self.document_length_histogram.items()],
            columns=['bin', 'doc_count']).sort_values(by='bin'))
        histdf['cumsum_pct'] = histdf.doc_count.cumsum(
        ) / histdf.doc_count.sum()

        self.document_length_stats = histdf
        self.doc_length_huerestic = histdf.query(
            f'cumsum_pct >= {hueristic}').bin.head(1).values[0]
        logging.warning(' '.join([
            "Setting maximum document length to",
            f'{self.doc_length_huerestic} based upon',
            f'hueristic of {hueristic} percentile.\n',
            'See full histogram by inspecting the',
            "`document_length_stats` attribute."
        ]))
        self.padding_maxlen = self.doc_length_huerestic

    def fit(self,
            data: List[str],
            return_tokenized_data: bool = False,
            no_below: int = 100,
            no_above: float = .9) -> Union[None, List[List[str]]]:
        """
        TODO: update docs

        Apply cleaner and tokenizer to raw data and build vocabulary.

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            ["The quick brown fox", "jumps over the lazy dog"]
        return_tokenized_data : bool
            Return the tokenized strings.  This is primarily used for debugging
            purposes.
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When building the vocabulary, filter tokens according to these rules:
        1. discard tokens that appear in fewer than `no_below` documents (absolute number),
        2. discard tokens that appear in more than `no_above` of the documents (fraction of total corpus size, not an absolute number), and
        3. after (1) and (2), keep only the `keep_n` most frequent remaining tokens.

        Returns
        -------
        None or List[List[str]]
            If return_tokenized_data=True, the tokenized documents are returned;
            otherwise, nothing is returned.

        This method heavily leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        now = get_time()
        logging.warning(f'....tokenizing data')
        tokenized_data = list(
            chain.from_iterable(self.parallel_process_text(data)))

        if not self.padding_maxlen:
            document_len_counters = apply_parallel(tokenized_data, count_len)

            for doc_counter in document_len_counters:
                self.document_length_histogram.update(doc_counter)
            self.generate_doc_length_stats()

        # chunk the data manually for corpus build and pass to the build_corpus method
        logging.warning(f'(1/3) done. {time_diff(now)} sec')
        logging.warning(f'....building corpus')
        now = get_time()
        corpus = build_corpus(tokenized_data)

        # Merge the corpora from each thread together; this is like a "reduce" step
        logging.warning(f'(2/3) done. {time_diff(now)} sec')
        logging.warning(f'....consolidating corpus')
        now = get_time()
        self.vocabulary.merge_with(corpus)

        # Get rid of rare tokens from the corpus so that they all map to the same (unknown) id
        self.vocabulary.filter_extremes(no_below,
                                        no_above,
                                        self.keep_n,
                                        keep_tokens=self.keep_tokens)

        # compactify the ids for each word
        self.vocabulary.compactify()

        # Build the token <-> id mappings, shifting ids by 2: 0 is reserved for padding and 1 for unknown and rare words
        self.token2id = dict([(k, v + 2)
                              for k, v in self.vocabulary.token2id.items()])
        self.id2token = dict([(v, k) for k, v in self.token2id.items()])
        self.n_tokens = len(self.id2token.keys())

        # logging
        logging.warning(f'(3/3) done. {time_diff(now)} sec')
        logging.warning(
            f'Finished parsing {self.vocabulary.num_docs:,} documents.')

        if return_tokenized_data:
            return tokenized_data

    def token_count_pandas(self):
        """ See token counts as pandas dataframe"""
        freq_df = pd.DataFrame(
            [b for a, b in self.vocabulary.dfs.items()],
            index=[a for a, b in self.vocabulary.dfs.items()],
            columns=['count'])

        id2tokens = [(b, a) for a, b in self.vocabulary.token2id.items()]

        token_df = pd.DataFrame([b for a, b in id2tokens],
                                index=[a for a, b in id2tokens],
                                columns=['token'])

        return freq_df.join(token_df).sort_values('count', ascending=False)

    def fit_transform(self,
                      data: List[str],
                      no_below: int = 25,
                      no_above: float = 0.8) -> List[List[int]]:
        """
        Apply cleaner and tokenizer to raw data, build the vocabulary, and return
        the transformed dataset as a List[List[int]].  This uses process-based
        threading on all available cores.

        ex:
        >>> data = ["The quick brown fox", "jumps over the lazy dog"]
        >>> pp = processor(padding_maxlen=5)
        >>> pp.fit_transform(data, no_below=0, no_above=1.0)
        # 0 padding is applied
        [[0, 2, 3, 4, 5], [6, 7, 2, 8, 9]]

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            ["The quick brown fox", "jumps over the lazy dog"]
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When building the vocabulary, filter tokens according to these rules:
        1. discard tokens that appear in fewer than `no_below` documents (absolute number),
        2. discard tokens that appear in more than `no_above` of the documents (fraction of total corpus size, not an absolute number), and
        3. after (1) and (2), keep only the `keep_n` most frequent remaining tokens.

        Returns
        -------
        numpy.array with shape (number of documents, padding_maxlen)


        This method leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        tokdata = self.fit(data,
                           return_tokenized_data=True,
                           no_below=no_below,
                           no_above=no_above)

        logging.warning(f'...fit is finished, beginning transform')
        now = get_time()
        vec_data = self.vectorize_parallel(tokdata)
        logging.warning(f'done. {time_diff(now)} sec')
        return vec_data

    def transform(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]]
        If transforming a large number of documents consider using the method
        `transform_parallel` instead.

        ex:
        >>> pp = processor()
        >>> pp.fit(docs)
        >>> new_docs = ["The quick brown fox", "jumps over the lazy dog"]
        >>> pp.transform(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return self.vectorize(self.process_text(data))

    def transform_parallel(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]].  Uses process based
        threading on all available cores.  If only processing a small number of
        documents ( < 10k ) then consider using the method `transform` instead.

        ex:
        >>> pp = processor()
        >>> pp.fit(docs)
        >>> new_docs = ["The quick brown fox", "jumps over the lazy dog"]
        >>> pp.transform_parallel(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return np.vstack(apply_parallel(data, self.transform))

    def get_idx(self, token: str) -> int:
        """Get integer index from token."""
        # return the integer index for the token; if it is not found, return the out-of-vocabulary index, which is 1
        return self.token2id.get(token, 1)

    def __vec_one_doc(self, doc: List[str]) -> List[int]:
        """
        Vectorize a single tokenized document.
        ex: ['hello', 'world']
        """
        return [self.get_idx(tok) for tok in doc]

    def vectorize(self, docs: List[List[str]]) -> List[List[int]]:
        """
        Vectorize and apply padding on a set of tokenized documents
        ex: [['hello', 'world'], ['goodbye', 'now']]

        """
        # First apply indexing on all the rows, then pad_sequences (I found this
        # faster than trying to do these steps on each row)
        return pad_sequences(list(map(self.__vec_one_doc, docs)),
                             maxlen=self.padding_maxlen,
                             dtype=self.padding_dtype,
                             padding=self.padding,
                             truncating=self.truncating,
                             value=self.padding_value)

    def vectorize_parallel(self, data: List[List[str]]) -> np.array:
        """
        Apply idx-> token mappings in parallel and apply padding.

        Arguments:
        data: List of List of strings
        """
        indexed_data = apply_parallel(data, self.vectorize)
        # concatenate list of arrays vertically
        return np.vstack(indexed_data)
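
For orientation, here is a minimal usage sketch of the processor class above. The documents, parameter values, and comments are illustrative assumptions, not part of the original example:

# Hypothetical end-to-end usage of the processor class defined above.
train_docs = ["The quick brown fox", "jumps over the lazy dog"] * 50
pp = processor(keep_n=1000, padding_maxlen=8)

# fit_transform cleans, tokenizes, builds the vocabulary, and returns a
# padded integer matrix with shape (number of documents, padding_maxlen).
train_vecs = pp.fit_transform(train_docs, no_below=1, no_above=1.0)

# transform reuses the fitted vocabulary on new documents; for large batches
# (> ~10k documents) transform_parallel is the suggested alternative.
new_vecs = pp.transform(["the lazy brown dog"])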
Example #2
            output_file_dist = os.path.join(curr_dir, 'distance_js')

        logging.info('creating the dictionary for ' + str(curr_iter) + '...')

        if curr_iter == 1:
            dict_file = os.path.join(curr_dir, 'dict.model')
            if os.path.exists(dict_file):
                logging.info(f'loading dictionary file from: {dict_file}')
                dictionary = Dictionary.load(dict_file)
            else:
                dictionary = Dictionary(dark_text)
                dictionary.add_documents(clean_text)

                word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
                _dict = Dictionary([[word] for word in word_dict.keys()])
                dictionary.merge_with(_dict)

                dictionary = filter_dict(args.vocab_size, dictionary, chain(word_dict.values(), word_dict.keys()))
                dictionary.save(dict_file)

        else:
            dict_file_prev = os.path.join(prev_dir, 'dict.model')
            dict_file = os.path.join(curr_dir, 'dict.model')
            mrr_file = os.path.join(prev_dir, 'ranking_list.csv')

            dictionary = Dictionary.load(dict_file_prev)
            word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
            words = get_dark_words_prev(mrr_file)

            dictionary.filter_extremes(no_below=len(dark_text)+1, keep_tokens=chain(words, word_dict.values(), word_dict.keys())) # Do this primarily because docID may be useful
            dictionary.save(dict_file)
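
The fragment above relies on a handful of gensim Dictionary operations (add_documents, merge_with, filter_extremes, save/load). Below is a minimal, self-contained sketch of those calls; the toy corpora and file name are invented, and the project-specific helpers (filter_dict, get_dark_words_prev) are not reproduced:

from gensim.corpora.dictionary import Dictionary

# Build a dictionary from one tokenized corpus and fold in a second one.
dark_text = [['_login', 'password', 'dump'], ['_onion', 'market']]
clean_text = [['weather', 'forecast'], ['market', 'news']]
dictionary = Dictionary(dark_text)
dictionary.add_documents(clean_text)

# Merge in another dictionary; ids of shared tokens are reconciled.
extra = Dictionary([['password'], ['news']])
dictionary.merge_with(extra)

# Drop rare/common tokens while always keeping a protected token set.
dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=10,
                           keep_tokens=['_login', '_onion'])

# Persist and reload, mirroring the dict.model handling above.
dictionary.save('dict.model')
reloaded = Dictionary.load('dict.model')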