Example No. 1
    def test_empty_input_substitute(self):
        texts = ['', '           ']

        self.word2vec_model.action = 'substitute'
        self.context_word_embs_model.action = 'substitute'

        augs = [
            naw.SpellingAug(),
            naw.AntonymAug(),
            naw.RandomWordAug(action='substitute'),
            naw.SynonymAug(aug_src='wordnet'),
            naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute"),
            self.word2vec_model,
            self.context_word_embs_model
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')
Example No. 2
    def test_case(self):
        # Swap
        aug = naw.RandomWordAug(action='swap')
        self.assertEqual('bB aA', aug.augment('aA bB'))

        data = 'I love McDonalds'
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
        self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
        self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

        data = 'He loves McDonalds'
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
        self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
        self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 2, 1, 1).get_augmented_tokens()
        self.assertEqual(['He', 'McDonalds', 'loves'], augmented_tokens)

        # Insert
        aug = naw.TfIdfAug(model_path=self.tfidf_model_path, action='insert')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('Good')
            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
                expected = True
                break
        self.assertTrue(expected)

        # Substitute
        aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Abc love':
                expected = True
                break
        self.assertTrue(expected)

        aug = naw.AntonymAug()
        self.assertEqual('Unhappy', aug.augment('Happy'))

        # Do not change if target word is non-lower
        aug = naw.SpellingAug()
        self.assertEqual('RE', aug.augment('Re'))

        # Delete case
        aug = naw.RandomWordAug(action='delete')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Love':
                expected = True
                break
        self.assertTrue(expected)
Example No. 3
import nlpaug.augmenter.word as naw

def antonym_subsi(text):
    # Antonym Augmenter: substitute words with their antonyms
    aug = naw.AntonymAug()
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)
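
A minimal usage sketch for the helper above; the sample sentence is illustrative:

if __name__ == '__main__':
    # AntonymAug replaces eligible words with their WordNet antonyms.
    antonym_subsi('The quick brown fox jumps over the lazy dog')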
Example No. 4
    def test_non_strip_input(self):
        text = ' Good boy '

        augs = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
        ]

        for aug in augs:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)
Example No. 5
    def test_skip_punctuation(self):
        text = '. . . . ! ? # @'

        augs = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
        ]

        for aug in augs:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example No. 6
    def test_skip_punctuation(self):
        text = '. . . . ! ? # @'

        augs = [
            # naw.ContextualWordEmbsAug(action='insert'),  # After convert_tokens_to_ids and decode are applied, the original format cannot be preserved.
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute")
        ]

        for aug in augs:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example No. 7
    def test_excessive_space(self):
        # https://github.com/makcedward/nlpaug/issues/48
        text = 'The  quick brown fox        jumps over the lazy dog . 1  2 '
        expected_result = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '1', '2']

        augs = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
        ]

        for aug in augs:
            tokenized_text = aug._tokenizer(text)
            self.assertEqual(tokenized_text, expected_result)
Example No. 8
    def test_case(self):
        # Swap
        aug = naw.RandomWordAug(action='swap')
        self.assertEqual('bB aA', aug.augment('aA bB'))
        self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
        self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 0, 1))
        self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 1, 0))
        self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
        self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

        # Insert
        aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('Good')
            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
                expected = True
                break
        self.assertTrue(expected)

        # Substitute
        aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Abc love':
                expected = True
                break
        self.assertTrue(expected)

        aug = naw.AntonymAug()
        self.assertEqual('Unhappy', aug.augment('Happy'))

        # Do not change if target word is non-lower
        aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
        self.assertEqual('RE', aug.augment('Re'))

        # Delete case
        aug = naw.RandomWordAug(action='delete')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Love':
                expected = True
                break
        self.assertTrue(expected)
Example No. 9
    def __init__(self):
        antAug = naw.AntonymAug()
        synAug = naw.SynonymAug(aug_src='wordnet')
        embAug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                           action="substitute")

        self.model_dict = {
            0: antAug,
            1: synAug,
            2: embAug
        }

        self.output_data = {
            'Sentence1': [],
            'Sentence2': [],
            'Label': []
        }
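
A hedged sketch of how this class might be used; augment_pair is a hypothetical method (not in the original source) that picks one of the three augmenters at random and records the augmented sentence pair:

    def augment_pair(self, sentence1, sentence2, label):
        # Hypothetical helper: choose an augmenter at random and append
        # the augmented pair and its label to output_data.
        import random  # local import keeps the sketch self-contained
        aug = self.model_dict[random.randint(0, 2)]
        self.output_data['Sentence1'].append(aug.augment(sentence1))
        self.output_data['Sentence2'].append(aug.augment(sentence2))
        self.output_data['Label'].append(label)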
Example No. 10
import nlpaug.augmenter.word as naw

def prepare_aug():
    # Contextual Word Embeddings Augmenter: insert or substitute words
    # using contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base',
                                  action="substitute"))

    # Synonym Augmenter, Substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'))

    # Antonym Augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random Word Augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')
    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug
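
A usage sketch under the snippet's own assumptions; note the PPDB model path above is machine-specific, so the SynonymAug(aug_src='ppdb') entry will fail without that local file:

if __name__ == '__main__':
    sample = 'The quick brown fox jumps over the lazy dog'
    for a in prepare_aug():
        # Each augmenter returns an augmented variant of the input text.
        print(type(a).__name__, '->', a.augment(sample))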
Example No. 11
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.augs = [naw.AntonymAug()]
Example No. 12
import os
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

os.environ["MODEL_DIR"] = '../model'
model_dir = os.environ.get("MODEL_DIR")

aug = naw.AntonymAug()
_text = 'The quick brown fox jumps over the lazy dog'
augmented_text = aug.augment(_text)
print("Original:")
print(_text)
print("Antonym Text:")
print(augmented_text)

aug = naw.SynonymAug(aug_src='wordnet')
text = 'The quick brown fox jumps over the lazy dog.'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Synonym Text:")
print(augmented_text)

aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                action="substitute")
text = 'The quick brown fox jumps over the lazy dog.'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("BeRT Embed Text:")
print(augmented_text)
Example No. 13
import random

import torch
import nlpaug.augmenter.word as naw

from util import *

if __name__ == '__main__':
    random_seed(random.randint(0, 100000))

    # create configuration file
    config = Config()

    # create text embedder
    embedder = Embedder(config.embedding_length)

    augmenter = None
    if config.augment:
        augmenter = naw.AntonymAug()

    # preprocess data and create wos2class.text.json and wos2class.train.json
    if not config.use_existing_data:
        data_manager = DataManager(config, augmenter)
        data_manager.preprocess_data()
        data_manager.create_train_test_jsonfile()
        data_manager.count_labels()

    # create dataset and dataloaders
    train_dataset = WOSDataset(config, embedder, is_train=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = iter(train_dataloader)

    test_dataset = WOSDataset(config, embedder, is_train=False)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=config.batch_size, shuffle=True)
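
A hypothetical continuation (the original snippet ends here): draw one batch from each loader to confirm the pipeline produces data.

    # Hypothetical sanity check, not in the original source.
    train_batch = next(train_iter)
    test_batch = next(iter(test_dataloader))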