def test_empty_input_substitute(self):
    """A whitespace-only input should come back as the empty string."""
    blank = ' '
    for augmenter in [naw.SpellingAug()]:
        result = augmenter.augment(blank)
        self.assertEqual('', result)
def spell_attack(text, n):
    """Spelling Augmenter: substitute words using a spelling-mistake dictionary.

    Generates `n` attacked variants of `text` and prints them.
    """
    augmenter = naw.SpellingAug('models/spelling_en.txt')
    attacked = augmenter.augment(text, n=n)
    print("Attacked Texts:")
    print(attacked)
def test_substitute_stopwords(self):
    """Stopword tokens longer than aug_n must survive augmentation unchanged."""
    texts = ['The quick brown fox jumps over the lazy dog']
    stopwords = [w.lower() for w in texts[0].split(' ')[:3]]
    aug_n = 3
    aug = naw.SpellingAug(
        dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt',
        stopwords=stopwords)
    for text in texts:
        self.assertLess(0, len(text))
        augmented_text = aug.augment(text)
        augmented_tokens = aug.tokenizer(augmented_text)
        original_tokens = aug.tokenizer(text)
        changed_count = 0
        for original, candidate in zip(original_tokens, augmented_tokens):
            if original.lower() in stopwords and len(original) > aug_n:
                # Protected token: it must pass through untouched.
                self.assertEqual(original.lower(), candidate)
            else:
                changed_count += 1
        self.assertGreater(changed_count, 0)
    self.assertLess(0, len(texts))
def test_empty_input_substitute(self):
    """Every substitute-action augmenter must map empty/blank input to None or blank."""
    texts = ['', ' ']
    self.word2vec_model.action = 'substitute'
    self.context_word_embs_model.action = 'substitute'
    augmenters = [
        naw.SpellingAug(),
        naw.AntonymAug(),
        naw.RandomWordAug(action='substitute'),
        naw.SynonymAug(aug_src='wordnet'),
        naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute"),
        self.word2vec_model,
        self.context_word_embs_model,
    ]
    for augmenter in augmenters:
        # Single-string API.
        for text in texts:
            single_result = augmenter.augment(text)
            self.assertTrue(single_result is None or single_result.strip() == '')
        # Batch API.
        for batch_result in augmenter.augment(texts):
            self.assertTrue(batch_result is None or batch_result.strip() == '')
def test_case(self):
    """Verify that word augmenters preserve and transfer letter case correctly
    across swap, insert, substitute and delete actions (Doc-based API)."""
    # Swap: swapping two tokens keeps each token's original casing.
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))

    # change_case(doc, i, j, 1) swaps tokens i and j and transfers the
    # sentence-initial capitalization: 'I'/'He' lowercases, the word moved
    # to front capitalizes. 'McDonalds' (mixed case) is never altered.
    data = 'I love McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)
    # Argument order (0, 1) must behave symmetrically to (1, 0).
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

    data = 'He loves McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
    # Swap of non-initial tokens (2, 1): sentence-initial 'He' keeps its case.
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 2, 1, 1).get_augmented_tokens()
    self.assertEqual(['He', 'McDonalds', 'loves'], augmented_tokens)

    # Insert: a word inserted at position 0 should be capitalized; the
    # augmenter is stochastic, so retry up to 10 times for the case to occur.
    aug = naw.TfIdfAug(model_path=self.tfidf_model_path, action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute: replacement at sentence start takes over the capitalization.
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug()
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case: after deleting the first word, the new first word is
    # capitalized; stochastic, so retry.
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def test_read_default_dict(self):
    """SpellingAug constructed without a dict_path must load its bundled
    default dictionary and be able to augment text.

    The original test ended with `self.assertTrue(True)`, a no-op assertion,
    and discarded the augmentation result; assert on the result instead.
    """
    text = 'abcdef'
    aug = naw.SpellingAug()
    # The model must expose the resolved path of the default dictionary.
    self.assertTrue(aug.model.dict_path)
    # Augmenting must run without raising and yield a (possibly unchanged) result.
    augmented_text = aug.augment(text)
    self.assertIsNotNone(augmented_text)
def test_oov(self):
    """An out-of-vocabulary token has no spelling candidates, so the input
    must be returned unchanged."""
    oov_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
    augmenter = naw.SpellingAug(
        dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    self.assertEqual(oov_token, augmenter.augment(oov_token))
def test_oov(self):
    """An out-of-vocabulary token has no spelling candidates, so the input
    must be returned unchanged."""
    oov_token = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
    augmenter = naw.SpellingAug(
        dict_path=os.path.join(self.model_dir, 'spelling_en.txt'))
    self.assertEqual(oov_token, augmenter.augment(oov_token))
def test_empty_input_substitute(self):
    """Whitespace-only input should be augmented to an empty string."""
    blank = ' '
    augmenters = [
        naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    ]
    for augmenter in augmenters:
        result = augmenter.augment(blank)
        self.assertEqual('', result)
def test_empty_input(self):
    """Whitespace-only input is returned as-is (no tokens to augment)."""
    texts = [' ']
    augmenter = naw.SpellingAug(
        dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    for text in texts:
        self.assertEqual(text, augmenter.augment(text))
    self.assertEqual(1, len(texts))
def spelling_aug(corpus):
    """Augment every training sentence with spelling mistakes (3 variants each).

    Returns a new Corpus whose train split holds the augmented sentences for
    ALL training sentences; dev and test splits pass through unchanged.

    Bug fixed: the original rebuilt `corpus` inside the loop on every
    iteration, so only the LAST sentence's augmentations survived (and the
    loop iterated a corpus it was simultaneously replacing). Accumulate all
    augmented texts first, then build the Corpus once.
    """
    aug = naw.SpellingAug()
    augmented_sentences = []
    for sentence in corpus.train:
        # augment(..., n=3) yields up to 3 variants per sentence.
        augmented_sentences.extend(aug.augment(sentence, n=3))
    return Corpus(train=SentenceDataset(augmented_sentences),
                  dev=corpus.dev,
                  test=corpus.test)
def augment_(isear_train):
    """Triple the training frame by appending two spelling-augmented copies.

    Column 0 (the label) is kept; column 1 (the text) is replaced by an
    augmented variant. Column names are preserved.
    """
    import nlpaug.augmenter.word as naw
    aug = naw.SpellingAug()

    def _augmented_copy(frame):
        # One augmented clone of `frame` with identical columns.
        copy = frame.apply(
            lambda row: pd.Series([row[0], aug.augment(row[1])]), axis=1)
        copy.columns = frame.columns
        return copy

    first_copy = _augmented_copy(isear_train)
    second_copy = _augmented_copy(isear_train)
    return pd.concat([isear_train, first_copy, second_copy],
                     ignore_index=True)
def nlpaug(word):
    """Apply a random subset of character/word noising augmenters to `word`."""
    pipeline = naf.Sometimes([
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ])
    return pipeline.augment(word)
def test_substitute(self):
    """Spelling substitution must change at least one token of normal text."""
    texts = ['The quick brown fox jumps over the lazy dog']
    augmenter = naw.SpellingAug(
        dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    for text in texts:
        self.assertLess(0, len(text))
        self.assertNotEqual(text, augmenter.augment(text))
    self.assertLess(0, len(texts))
def test_substitute(self):
    """Spelling substitution must change at least one token of normal text."""
    texts = ['The quick brown fox jumps over the lazy dog']
    augmenter = naw.SpellingAug(
        dict_path=os.path.join(self.model_dir, 'spelling_en.txt'))
    for text in texts:
        self.assertLess(0, len(text))
        self.assertNotEqual(text, augmenter.augment(text))
    self.assertLess(0, len(texts))
def test_case(self):
    """Verify that word augmenters preserve and transfer letter case correctly
    across swap, insert, substitute and delete actions (list-based API)."""
    # Swap: swapping two tokens keeps each token's original casing.
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))
    # change_case(tokens, i, j) swaps tokens i and j and transfers the
    # sentence-initial capitalization: 'I'/'He' lowercases, the word moved to
    # front capitalizes. 'McDonalds' (mixed case) is never altered. Argument
    # order (i, j) vs (j, i) must behave symmetrically.
    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 0, 1))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
    # Swap of non-initial tokens (2, 1): sentence-initial 'He' keeps its case.
    self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

    # Insert: a word inserted at position 0 should be capitalized; the
    # augmenter is stochastic, so retry up to 10 times for the case to occur.
    aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute: replacement at sentence start takes over the capitalization.
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case: after deleting the first word, the new first word is
    # capitalized; stochastic, so retry.
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def spelling_aug(text):
    """Return a single spelling-mistake variant of `text`."""
    augmenter = naw.SpellingAug()
    return augmenter.augment(text, n=1)