Example #1
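# Assumed context for this example: the method below belongs to a vectorizer
# class (not shown here) and relies on cuDF's GPU string operations.
import cudf
from cudf import Series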
    def get_char_ngrams(self, ngram_size, str_series, doc_id_sr):
        """
        Handles ngram generation for character analyzers.

        When analyzer is 'char_wb', we generate ngrams within word boundaries,
        meaning we need to first tokenize and pad each token with a delimiter.
        """
        if self.analyzer == 'char_wb' and ngram_size != 1:
            token_count = str_series.str.token_count(self.delimiter)
            tokens = str_series.str.tokenize(self.delimiter)
            del str_series

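            # Wrap every token in the delimiter (append it as a suffix, then
            # prepend it as a prefix) so that the character ngrams generated
            # below never span a word boundary.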
            padding = Series(self.delimiter).repeat(len(tokens))
            tokens = tokens.str.cat(padding)
            padding = padding.reset_index(drop=True)
            tokens = padding.str.cat(tokens)
            tokens = tokens.reset_index(drop=True)

            ngram_sr = tokens.str.character_ngrams(n=ngram_size)

            doc_id_df = cudf.DataFrame({
                'doc_id':
                doc_id_sr.repeat(token_count).reset_index(drop=True),
                # formula to count ngrams given number of letters per token:
                'ngram_count':
                tokens.str.len() - (ngram_size - 1)
            })
            del tokens
            ngram_count = doc_id_df.groupby('doc_id',
                                            sort=True).sum()['ngram_count']
            return ngram_sr, ngram_count, token_count

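        # Unigrams (either analyzer) and the plain 'char' analyzer operate on
        # each document string as a whole: a document of L characters yields
        # L - (ngram_size - 1) ngrams.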
        if ngram_size == 1:
            token_count = str_series.str.len()
            ngram_sr = str_series.str.character_tokenize()
            del str_series
        elif self.analyzer == 'char':
            token_count = str_series.str.len()
            ngram_sr = str_series.str.character_ngrams(n=ngram_size)
            del str_series

        ngram_count = token_count - (ngram_size - 1)

        return ngram_sr, ngram_count, token_count
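
The per-document count in the 'char_wb' branch relies on the padded-token
formula len(padded_token) - (ngram_size - 1). A minimal standalone sketch,
assuming illustrative inputs and only cuDF's public string API, shows that
formula agreeing with the ngrams actually generated for one already-padded
document:

import cudf

ngram_size = 3
# Two tokens of a single document, already wrapped in the ' ' delimiter.
padded = cudf.Series([' hello ', ' world '])
ngrams = padded.str.character_ngrams(n=ngram_size)   # flattened trigrams
per_token = padded.str.len() - (ngram_size - 1)      # [5, 5]
print(len(ngrams), int(per_token.sum()))             # 10 10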