Example #1
    def test_train(self):
        import re
        import sklearn.datasets
        import nlpaug.model.word_stats as nmw
        import nlpaug.augmenter.word as naw

        def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
            token_pattern = re.compile(token_pattern)
            return token_pattern.findall(text)

        # Load sample data
        train_data = sklearn.datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
        train_x = train_data.data

        # Tokenize input
        train_x_tokens = [_tokenizer(x) for x in train_x]

        # Train TF-IDF statistics and save them to the current directory
        tfidf_model = nmw.TfIdf()
        tfidf_model.train(train_x_tokens)
        tfidf_model.save('.')

        # Load the TF-IDF augmenter from the saved model
        aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)

        texts = [
            'The quick brown fox jumps over the lazy dog',
            'asdasd test apple dog asd asd'
        ]

        # Every augmented sentence should differ from its source
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)

        self.assertLess(0, len(texts))
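The test above accepts TfIdfAug's default behaviour. A minimal sketch of a more explicit configuration follows, assuming the model saved above sits in the current directory; the `action` and `aug_p` parameters belong to nlpaug's word-augmenter interface, but the concrete values here are illustrative assumptions only.

import nlpaug.augmenter.word as naw

# Replace up to roughly 30% of the tokens, guided by the trained TF-IDF statistics
substitute_aug = naw.TfIdfAug(model_path='.', action='substitute', aug_p=0.3)

# Insert new tokens instead of replacing existing ones
insert_aug = naw.TfIdfAug(model_path='.', action='insert', aug_p=0.3)

print(substitute_aug.augment('The quick brown fox jumps over the lazy dog'))
print(insert_aug.augment('The quick brown fox jumps over the lazy dog'))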
Example #2
import nlpaug.model.word_stats as nmws

TFIDF_MODEL = None


def init_tfidf_model(model_path, force_reload=False):
    # Load the model once at runtime and cache it in the module-level global
    global TFIDF_MODEL
    if TFIDF_MODEL and not force_reload:
        return TFIDF_MODEL

    tfidf_model = nmws.TfIdf()
    tfidf_model.read(model_path)
    TFIDF_MODEL = tfidf_model

    return tfidf_model
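A short usage sketch of the caching behaviour; the './tfidf_model' path is an assumption for illustration and should point at a directory produced by TfIdf.save().

# Hypothetical path to a previously saved TF-IDF model
model_a = init_tfidf_model('./tfidf_model')
model_b = init_tfidf_model('./tfidf_model')                      # served from the cache
model_c = init_tfidf_model('./tfidf_model', force_reload=True)   # read from disk again

assert model_a is model_b
assert model_c is not model_b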
Example #3
    def _train_tfidf(self):
        import os
        import re
        import sklearn.datasets
        import nlpaug.model.word_stats as nmw

        def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
            token_pattern = re.compile(token_pattern)
            return token_pattern.findall(text)

        # Load sample data
        train_data = sklearn.datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
        train_x = train_data.data

        # Tokenize input
        train_x_tokens = [_tokenizer(x) for x in train_x]

        # Train TF-IDF model
        if not os.path.exists(self.tfidf_model_path):
            os.makedirs(self.tfidf_model_path)

        tfidf_model = nmw.TfIdf()
        tfidf_model.train(train_x_tokens)
        tfidf_model.save(self.tfidf_model_path)
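A minimal sketch of how a helper like `_train_tfidf` might be wired into a test case. The class name and the './tfidf_model' value are assumptions for illustration; only the `tfidf_model_path` attribute name is taken from the code above, and `_train_tfidf` is assumed to be defined on the same class.

import unittest

import nlpaug.augmenter.word as naw


class TfIdfAugTestCase(unittest.TestCase):
    # Assumed location for the saved TF-IDF statistics; _train_tfidf reads this attribute
    tfidf_model_path = './tfidf_model'

    def setUp(self):
        # _train_tfidf is the helper shown above; it creates the directory and saves the model
        self._train_tfidf()

    def test_substitute(self):
        aug = naw.TfIdfAug(model_path=self.tfidf_model_path)
        text = 'The quick brown fox jumps over the lazy dog'
        self.assertNotEqual(text, aug.augment(text))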
Example #4

import re

import sklearn.datasets

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw


def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)


# Load sample data
train_data = sklearn.datasets.fetch_20newsgroups(subset='train',
                                                 remove=('headers', 'footers',
                                                         'quotes'))
train_x = train_data.data

# Tokenize input
train_x_tokens = [_tokenizer(x) for x in train_x]

# Train TF-IDF model
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')

# Load TF-IDF augmenter
aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)

texts = [
    'The quick brown fox jumps over the lazy dog',
    'asdasd test apple dog asd asd'
]

for text in texts:
    augmented_text = aug.augment(text)

    print('-' * 20)
    print('Original: {}'.format(text))
    print('Augmented: {}'.format(augmented_text))
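A typical next step is to feed the augmented copies back into the training set. The sketch below only assumes the `aug`, `train_x`, and `train_data` objects created in the script above; the slice of 100 documents is an arbitrary illustrative choice, since augmenting the whole corpus would be slow.

augmented_x = []
augmented_y = []
for text, label in zip(train_x[:100], train_data.target[:100]):
    result = aug.augment(text)
    # Depending on the nlpaug version, augment() returns either a string or a one-element list
    augmented_x.append(result[0] if isinstance(result, list) else result)
    augmented_y.append(label)

# Original and augmented documents share the same labels
train_x_expanded = list(train_x[:100]) + augmented_x
train_y_expanded = list(train_data.target[:100]) + augmented_y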