Example #1
    def test_case(self):
        # Swap
        aug = naw.RandomWordAug(action='swap')
        self.assertEqual('bB aA', aug.augment('aA bB'))

        data = 'I love McDonalds'
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
        self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
        self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

        data = 'He loves McDonalds'
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
        self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
        self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
        doc = Doc(data, aug.tokenizer(data))
        augmented_tokens = aug.change_case(doc, 2, 1, 1).get_augmented_tokens()
        self.assertEqual(['He', 'McDonalds', 'loves'], augmented_tokens)

        # Insert
        aug = naw.TfIdfAug(model_path=self.tfidf_model_path, action='insert')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('Good')
            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
                expected = True
                break
        self.assertTrue(expected)

        # Substitute
        aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Abc love':
                expected = True
                break
        self.assertTrue(expected)

        aug = naw.AntonymAug()
        self.assertEqual('Unhappy', aug.augment('Happy'))

        # Do not change if target word is non-lower
        aug = naw.SpellingAug()
        self.assertEqual('RE', aug.augment('Re'))

        # Delete case
        aug = naw.RandomWordAug(action='delete')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Love':
                expected = True
                break
        self.assertTrue(expected)
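The snippets in this listing omit their imports. A minimal standalone sketch of the swap action, assuming the conventional nlpaug alias, would look like this:

import nlpaug.augmenter.word as naw

aug = naw.RandomWordAug(action='swap')
print(aug.augment('The quick brown fox jumps over the lazy dog'))
# Depending on the nlpaug version, augment() returns either a string or a
# single-element list of strings.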
Example #2
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path=
            '/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
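A hedged usage sketch for the helper above. It assumes import nlpaug.augmenter.word as naw and import nlpaug.flow as naf at module level, and that the aug_p/pipeline_p arguments of naf.Sometimes match the nlpaug release this project targeted; the word2vec path is only needed when insert=True.

# Swap/delete only, so the word2vec binary is not loaded for this call.
result = augmentation('The quick brown fox jumps over the lazy dog',
                      insert=False, substitute=False, swap=True, delete=True)
print(result)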
Example #3
def data_augment(corpus, label):
    syn_aug = naw.SynonymAug(aug_src="wordnet")
    rand_aug = naw.RandomWordAug(action="swap")
    data_struc = {'emotion_label': [], 'emotion_text': []}
    aug_dataframe = pd.DataFrame(data_struc)
    print('Augmenting data')
    for label, sentence in zip(label, corpus):
        if sentence.find("\n") > 0:
            sentence = sentence.replace("\n", "")

            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': sentence
                },
                ignore_index=True)

            augmented_sent = syn_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent
                },
                ignore_index=True)

            augmented_sent1 = rand_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent1
                },
                ignore_index=True)
        else:
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': sentence
                },
                ignore_index=True)
            augmented_sent = syn_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent
                },
                ignore_index=True)
            aug1 = naw.RandomWordAug(action="swap")
            augmented_sent1 = rand_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent1
                },
                ignore_index=True)
    print('Augmentation Completed')
    return aug_dataframe['emotion_text'], aug_dataframe['emotion_label']
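pandas.DataFrame.append, used above, was deprecated in pandas 1.4 and removed in 2.0. A hedged sketch of the same loop that collects rows in a plain list instead (data_augment_concat and as_text are illustrative names, not part of the original project):

import pandas as pd
import nlpaug.augmenter.word as naw

def data_augment_concat(corpus, labels):
    syn_aug = naw.SynonymAug(aug_src="wordnet")
    rand_aug = naw.RandomWordAug(action="swap")

    def as_text(out):
        # Newer nlpaug releases return a list from augment(); older ones return a string.
        return out[0] if isinstance(out, list) else out

    rows = []
    for label, sentence in zip(labels, corpus):
        sentence = sentence.replace("\n", "")
        for text in (sentence,
                     as_text(syn_aug.augment(sentence)),
                     as_text(rand_aug.augment(sentence))):
            rows.append({'emotion_label': label, 'emotion_text': text})

    aug_dataframe = pd.DataFrame(rows)
    return aug_dataframe['emotion_text'], aug_dataframe['emotion_label']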
Example #4
    def test_empty_input_for_delete(self):
        text = ' '
        augs = [
            naw.RandomWordAug(action="delete"),
            naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
        ]

        for aug in augs:
            augmented_text = aug.augment(text)
            # FIXME: standardize return
            is_equal = augmented_text == '' or augmented_text == ' '
            self.assertTrue(is_equal)
Example #5
def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')
    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug("swap"),
        naw.RandomWordAug("delete"),
        nac.KeyboardAug()
    ])

    train_after_exp = []
    dev_after_exp = []

    for idx, row in dataset.iterrows():
        logging.info("[{}/{}] {}".format(idx, len(dataset), row["question"]))
        new_text = [new for new in flow.augment(row["question"], n=expansion)]
        train_after_exp.append({"label": row["question_id"], "text": row["question"]})
        th = int(len(new_text) * 0.8)
        for text in new_text[:th]:
            train_after_exp.append({"label": row["question_id"], "text": text})
        for text in new_text[th:]:
            dev_after_exp.append({"label": row["question_id"], "text": text})

    train = train_after_exp
    dev = dev_after_exp

    train = pd.DataFrame(train).sample(frac=1.0)
    dev = pd.DataFrame(dev).sample(frac=1.0)

    return train, dev
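A hedged usage sketch for the expansion routine above. The DataFrame rows are illustrative; only the question and question_id columns are required, and the module-level imports (pandas as pd, nltk, naf, naw, nac, logging) are assumed from the source file.

import pandas as pd

faq = pd.DataFrame({
    'question_id': [0, 1],
    'question': ['How do I reset my password?',
                 'Where can I download the invoice?']
})
train_df, dev_df = train_eval_dataset(faq, lang='eng', expansion=10)
print(len(train_df), len(dev_df))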
Example #6
File: utils.py  Project: nirraviv/nlp_rep
def main(config):
    infile = Path(config.infile)
    if not infile.is_file():
        raise FileNotFoundError
    pattern = re.compile('(train|test|val|dev).txt')
    phase = pattern.findall(infile.name)[0]

    fin = codecs.open(infile, 'r', 'utf-8')
    txt = fin.read()
    fin.close()

    outdir = Path(config.outdir)
    if not outdir.is_dir():
        outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / f'randswap_aug_{phase}.txt'
    fout = codecs.open(outfile, 'w', 'utf-8')

    aug = naw.RandomWordAug(action='swap')
    lines = []
    for line in txt.split('\n'):
        lines.append(line)
        for _ in range(config.num_swaps):
            augmented_text = aug.augment(line)
            lines.append(augmented_text)
    fout.writelines(f"{line}\n" for line in lines)
    fout.close()
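A hedged usage sketch for the entry point above; config only needs infile, outdir and num_swaps attributes, and the paths here are hypothetical. The input file name must match the (train|test|val|dev).txt pattern.

from types import SimpleNamespace

config = SimpleNamespace(infile='data/train.txt',
                         outdir='data/augmented',
                         num_swaps=2)
main(config)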
Example #7
    def test_swap(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = naw.RandomWordAug(action="swap")

        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)
Example #8
    def test_empty_input_substitute(self):
        texts = ['', '           ']

        self.word2vec_model.action = 'substitute'
        self.context_word_embs_model.action = 'substitute'

        augs = [
            naw.SpellingAug(),
            naw.AntonymAug(),
            naw.RandomWordAug(action='substitute'),
            naw.SynonymAug(aug_src='wordnet'),
            naw.TfIdfAug(model_path=self.tfidf_model_path,
                         action="substitute"), self.word2vec_model,
            self.context_word_embs_model
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')
Example #9
    def test_swap(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]
        aug = naw.RandomWordAug(action="swap")

        for text in texts:
            tokens = text.lower().split(' ')
            orig_token_freq = {}
            for w in tokens:
                orig_token_freq[w] = tokens.count(w)

            augmented_text = text

            # https://github.com/makcedward/nlpaug/issues/77
            for i in range(10):
                augmented_text = aug.augment(augmented_text)

            aug_tokens = augmented_text.lower().split(' ')
            aug_token_freq = {}
            for w in tokens:
                aug_token_freq[w] = aug_tokens.count(w)

            for orig_token, orig_freq in orig_token_freq.items():
                self.assertTrue(orig_token in aug_token_freq)
                self.assertTrue(aug_token_freq[orig_token] == orig_freq)

            self.assertNotEqual(text, augmented_text)
Example #10
def random_word_swap(text):
    # Random Word Augmenter
    # Swap word randomly
    aug = naw.RandomWordAug(action="swap")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)
Example #11
def random_deletion(text, p=ALPHA):
    """
    Randomly remove each word in the sentence with probability p (ALPHA by default).
    """
    aug = naw.RandomWordAug(action='delete', aug_p=p)
    augmented_text = aug.augment(text)
    return augmented_text
Example #12
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6)
            ])
        ]

        for flow in flows:
            for text in texts:
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                self.assertLess(0, len(text))

            self.assertLess(0, len(texts))

        self.assertLess(0, len(flows))
Example #13
def augment_by_class(df, max_n):
    word_index = {}

    for phrase in df['Body']:
        words = phrase.split(' ')
        for word in words:
            word_index[word] = word_index.get(word, 0) + 1

    index_df = pd.DataFrame([{
        'token': i,
        'count': word_index[i]
    } for i in word_index])
    index_df = index_df[index_df['count'] >= 10]

    aug = naw.SynonymAug(stopwords=index_df['token'])
    aug2 = naw.RandomWordAug()
    factor = (max_n // len(df)) + 2

    result = set()
    for phrase in df['Body']:
        result.add(phrase)
        print(f'Augmenting for {phrase}')
        for item in aug.augment(phrase, n=factor):
            result.add(item)
        for item in aug2.augment(phrase, n=2):
            result.add(item)

    return list(result)
Example #14
def random_swap_helper(text):
    """
    Randomly choose two words in the sentence and swap their positions.
    """
    aug = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1)
    augmented_text = aug.augment(text)
    return augmented_text
Example #15
def augment(lines, params):
    """
    Contextual WordEmbs augmentation with nlpaug for a list of strings
    Args:
        lines: (List of Strings)
        params: (Dictionary) aug_max arguments

    Returns: (List of Strings) new strings

    """
    # Contextual WordEmbs augmentation pipeline
    aug = naf.Sequential(
        [
            ContextualWordEmbsAug(action=Action.INSERT, aug_max=params['contextual_max']),
            ContextualWordEmbsAug(action=Action.SUBSTITUTE, aug_max=params['contextual_max']),
            naw.RandomWordAug(aug_max=params['ramdom_max'])
        ]
    )

    augmented = []
    total_batchs = len(lines) // 100 + 1

    for i in range(total_batchs):
        if i % 100 == 0:
            logger.info("Augmenting the {} th batch, {}%".format(i, round(i / total_batchs * 100)))
        if i == total_batchs - 1:
            sub_line = lines[100 * i:]
        else:
            sub_line = lines[100 * i: 100 * (i + 1)]
        sub_aug = aug.augment(sub_line, num_thread=mp.cpu_count() - 1)
        augmented = augmented + sub_aug
    return augmented
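A hedged usage sketch for the batched augmenter above. It assumes from nlpaug.augmenter.word import ContextualWordEmbsAug, from nlpaug.util import Action, import multiprocessing as mp and a configured logger, matching the source file; the 'ramdom_max' key keeps the example's original spelling.

params = {'contextual_max': 2, 'ramdom_max': 2}
sentences = ['The quick brown fox jumps over the lazy dog'] * 3
augmented = augment(sentences, params)
print(len(augmented))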
Example #16
    def test_delete_one_token(self):
        texts = ['The']
        aug = naw.RandomWordAug(action='delete')

        for text in texts:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example #17
    def test_case(self):
        # Swap
        aug = naw.RandomWordAug(action='swap')
        self.assertEqual('bB aA', aug.augment('aA bB'))
        self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
        self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 0, 1))
        self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 1, 0))
        self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
        self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

        # Insert
        aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('Good')
            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
                expected = True
                break
        self.assertTrue(expected)

        # Substitute
        aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Abc love':
                expected = True
                break
        self.assertTrue(expected)

        aug = naw.AntonymAug()
        self.assertEqual('Unhappy', aug.augment('Happy'))

        # Do not change if target word is non-lower
        aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
        self.assertEqual('RE', aug.augment('Re'))

        # Delete case
        aug = naw.RandomWordAug(action='delete')
        expected = False
        for i in range(10):
            augmented_text = aug.augment('I love')
            if augmented_text == 'Love':
                expected = True
                break
        self.assertTrue(expected)
Example #18
    def test_empty(self):
        texts = ['', None, []]

        aug = naw.RandomWordAug()

        for text in texts:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example #19
    def test_empty_input_for_delete(self):
        texts = ['', '           ', None]
        augs = [
            naw.RandomWordAug(action="delete"),
            naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')
Example #20
    def test_empty_input_for_swap(self):
        texts = [' ']
        aug = naw.RandomWordAug(action=Action.SWAP)
        for text in texts:
            augmented_text = aug.augment(text)

            self.assertEqual(text, augmented_text)

        self.assertEqual(1, len(texts))

        tokens = [None]
        aug = naw.RandomWordAug(action=Action.SWAP)
        for t in tokens:
            augmented_text = aug.augment(t)
            self.assertEqual(augmented_text, None)

        self.assertEqual(len(tokens), 1)
Example #21
    def augmentation(self,
                     text,
                     insert=False,
                     substitute=False,
                     swap=True,
                     delete=True):

        augs = []

        if insert:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="insert", device='cuda')
            # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
            aug = naw.WordEmbsAug(
                model_type='word2vec',
                model_path=
                '/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
                action="insert")
            augs.append(aug)

        if substitute:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="substitute", device='cuda')
            # aug = naw.WordEmbsAug(
            #     model_type='word2vec', model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
            #     action="substitute")
            aug_sub = naw.SynonymAug(aug_src='wordnet')
            augs.append(aug_sub)
            # text = aug.augment(text)

        if swap:
            aug_swap = naw.RandomWordAug(action="swap")
            augs.append(aug_swap)
            # text = aug.augment(text)

        if delete:
            aug_del = naw.RandomWordAug()
            augs.append(aug_del)
            # text = aug.augment(text)

        aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
        # print("before aug:", text)
        text = aug.augment(text, n=1)
        # print("after aug:", text)

        return text
Example #22
    def test_substitute_without_target_word(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = naw.RandomWordAug(action='substitute')

        for text in texts:
            augmented_text = aug.augment(text)

            self.assertIn('_', augmented_text)
            self.assertNotEqual(text, augmented_text)
Example #23
    def __init__(self):
        aug0 = naw.RandomWordAug()
        aug1 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="substitute")
        aug2 = naw.SynonymAug(aug_src='wordnet')
        aug3 = naw.SplitAug()
        aug4 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="insert")

        self.augs = [aug0, aug1, aug2, aug3, aug4]
Example #24
    def augment(self, example):
        augs = [self.augs.augment(example) for _ in range(self.num_of_samples)]
        if self.swap:
            swap_aug = naw.RandomWordAug(action="swap")
            augs_ = list(augs)
            for i in augs_:
                for _ in range(self.num_of_samples):
                    swapped = swap_aug.augment(i)
                    augs.append(swapped)
        return augs
Example #25
    def test_crop(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = naw.RandomWordAug(action='crop')

        for text in texts:
            orig_tokens = text.split(' ')
            augmented_text = aug.augment(text)
            aug_tokens = augmented_text.split(' ')

            self.assertGreater(len(orig_tokens), len(aug_tokens))
Example #26
    def test_empty_input_for_delete(self):
        texts = ['']
        aug = naw.RandomWordAug(action=Action.DELETE)
        for text in texts:
            augmented_text = aug.augment(text)

            self.assertEqual(text, augmented_text)

        self.assertEqual(1, len(texts))
        self.assertEqual(0, len(texts[0]))
Example #27
def prepare_aug():
    # Contextual Word Embeddings Augmenter, Substitute word by contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base',
                                  action="substitute"))

    # Synonym Augmenter, Substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path=
            '/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'
        ))

    # Antonym Augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random Word Augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')
    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug
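A hedged usage sketch for the augmenter list returned by prepare_aug(). Note that the contextual and ppdb augmenters load sizeable models, so this is illustrative rather than something to run casually.

augmenters = prepare_aug()
sample = 'The quick brown fox jumps over the lazy dog'
for augmenter in augmenters:
    print(type(augmenter).__name__, augmenter.augment(sample))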
Example #28
    def test_random_word_delete(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = naw.RandomWordAug()

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            self.assertNotEqual(text, augmented_text)

        self.assertLess(0, len(texts))
Example #29
    def test_swap(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = naw.RandomWordAug(action=Action.SWAP)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            self.assertNotEqual(text, augmented_text)

        self.assertLess(0, len(texts))
Example #30
    def test_empty_input_for_swap(self):
        texts = ['', '           ', None]
        aug = naw.RandomWordAug(action="swap")
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertTrue(augmented_text is None
                            or augmented_text.strip() == '')

        augmented_texts = aug.augment(texts)
        for augmented_text in augmented_texts:
            self.assertTrue(augmented_text is None
                            or augmented_text.strip() == '')