Exemplo n.º 1
0
    def test_n_output_audio(self):
        """Request three augmented copies of an audio clip from each flow.

        Every flow must hand back more than one result, and no result may be
        identical to the source waveform.
        """
        audio, sampling_rate = AudioLoader.load_audio(self.sample_wav_file)

        def crop_and_loudness():
            # Fresh augmenter instances for every pipeline; nothing is shared.
            return [
                naa.CropAug(sampling_rate=sampling_rate),
                naa.LoudnessAug(),
            ]

        flat_sequential = naf.Sequential(crop_and_loudness())
        flat_sometimes = naf.Sometimes(crop_and_loudness(), pipeline_p=0.9)
        nested = naf.Sequential([
            naf.Sequential(crop_and_loudness()),
            naf.Sometimes(crop_and_loudness(), pipeline_p=0.9),
        ])
        flows = [flat_sequential, flat_sometimes, nested]

        for pipeline in flows:
            outputs = pipeline.augment(audio, n=3)
            self.assertGreater(len(outputs), 1)
            for output in outputs:
                self.assertFalse(np.array_equal(audio, output))

        self.assertLess(0, len(flows))
Exemplo n.º 2
0
    def test_n_output_spectrogram(self):
        """Request three augmented copies of a mel spectrogram from each flow.

        Each flow must produce more than one output, and none of the outputs
        may equal the original spectrogram.
        """
        mel_spectrogram = AudioLoader.load_mel_spectrogram(
            self.sample_wav_file, n_mels=128)

        def masking_pair():
            # New frequency/time masking augmenters for every pipeline.
            return [nas.FrequencyMaskingAug(), nas.TimeMaskingAug()]

        flat_sequential = naf.Sequential(masking_pair())
        flat_sometimes = naf.Sometimes(masking_pair(), pipeline_p=0.9)
        nested = naf.Sequential([
            naf.Sequential(masking_pair()),
            naf.Sometimes(masking_pair(), pipeline_p=0.9),
        ])
        flows = [flat_sequential, flat_sometimes, nested]

        for pipeline in flows:
            outputs = pipeline.augment(mel_spectrogram, n=3)
            self.assertGreater(len(outputs), 1)
            for output in outputs:
                self.assertFalse(np.array_equal(mel_spectrogram, output))

        self.assertLess(0, len(flows))
Exemplo n.º 3
0
    def test_multiple_actions(self):
        """Check that every nested character-level flow alters the input.

        The flows contain ``Sometimes`` stages with ``pipeline_p=0.5``, so a
        single pass may leave a text untouched. Each flow therefore gets up
        to five passes over the sample texts and must change at least one
        text during those passes.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.5)
        ]

        # The pipeline probability may be low enough that a pass performs no
        # augmentation at all, so retry up to 5 times per flow.
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    if text != augmented_text:
                        at_least_one_not_equal = True

                if at_least_one_not_equal:
                    break

            # Assert once per flow: the flag is reset for each flow, so a
            # single check after the outer loop would only validate the last
            # flow in the list.
            self.assertTrue(at_least_one_not_equal)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Exemplo n.º 4
0
    def test_multiple_actions(self):
        """Verify that each nested flow modifies each sample text.

        The flows are probabilistic, so a single call may return the text
        unchanged; every (flow, text) pair is tried up to five times and must
        yield a differing result at least once.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        def insert_then_delete():
            return [
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete"),
            ]

        def substitute_chars():
            return nac.RandomCharAug(action="substitute",
                                     aug_char_min=1,
                                     aug_char_p=0.6,
                                     aug_word_p=0.6)

        sequential_flow = naf.Sequential([
            naf.Sometimes(insert_then_delete(), pipeline_p=0.9),
            naf.Sequential([substitute_chars()], name='Sub_Seq'),
        ])
        sometimes_flow = naf.Sometimes(
            [
                naf.Sometimes(insert_then_delete()),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    substitute_chars(),
                ]),
            ],
            pipeline_p=0.9)
        flows = [sequential_flow, sometimes_flow]

        for flow in flows:
            for text in texts:
                # any() stops at the first differing result, so at most five
                # augmentation calls are made per text.
                changed = any(
                    text != flow.augment(text, n=1) for _ in range(5))
                self.assertTrue(changed)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Exemplo n.º 5
0
def augmentation(
        text,
        insert=False,
        substitute=False,
        swap=True,
        delete=True,
        word2vec_path='/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin'):
    """Augment ``text`` with a randomly applied mix of word-level augmenters.

    The enabled augmenters are wrapped in a ``naf.Sometimes`` flow built with
    ``aug_p=0.5`` and ``pipeline_p=0.5``, and the flow is asked for a single
    output (``n=1``).

    Args:
        text: Input handed to the flow's ``augment`` method.
        insert: Add a word2vec ``WordEmbsAug`` with ``action="insert"``.
        substitute: Add a WordNet-backed ``SynonymAug``.
        swap: Add a ``RandomWordAug`` with ``action="swap"``.
        delete: Add a ``RandomWordAug`` built with its default arguments
            (presumably its default action is deletion -- confirm against
            the installed nlpaug version).
        word2vec_path: Location of the word2vec binary; only read when
            ``insert`` is true. The default keeps the original hard-coded
            location so existing callers behave as before.

    Returns:
        Whatever ``naf.Sometimes.augment`` returns for ``n=1``.
    """
    augmenters = []

    if insert:
        augmenters.append(
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=word2vec_path,
                            action="insert"))

    if substitute:
        augmenters.append(naw.SynonymAug(aug_src='wordnet'))

    if swap:
        augmenters.append(naw.RandomWordAug(action="swap"))

    if delete:
        augmenters.append(naw.RandomWordAug())

    flow = naf.Sometimes(augmenters, aug_p=0.5, pipeline_p=0.5)
    return flow.augment(text, n=1)
Exemplo n.º 6
0
    def test_n_output_without_augmentation(self):
        """Expect neither flow to alter the all-'A' sample text.

        Each flow is asked for three outputs up to five times; the test fails
        as soon as any returned output differs from the input.
        """
        texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']

        double_ocr = naf.Sequential([nac.OcrAug(), nac.OcrAug()])
        rarely_random = naf.Sometimes(
            [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001)
        flows = [double_ocr, rarely_random]

        for flow in flows:
            for text in texts:
                for _ in range(5):
                    # True when at least one of the returned outputs differs
                    # from the original text.
                    changed = any(
                        candidate != text
                        for candidate in flow.augment(text, n=3))
                    if changed:
                        break

                self.assertFalse(changed)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Exemplo n.º 7
0
def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    """Expand a question dataset with augmented text and split it train/dev.

    For every row, ``expansion`` augmented variants of ``row["question"]``
    are requested from a ``naf.Sometimes`` flow (synonym, word swap, word
    delete and keyboard augmenters). The original question always goes to
    the training set; the first 80% of the returned variants join it and the
    remaining variants go to the dev set.

    Args:
        dataset: Frame providing ``"question"`` and ``"question_id"`` columns.
        lang: Language code passed to ``naw.SynonymAug``.
        expansion: Number of augmented variants requested per question.

    Returns:
        ``(train, dev)`` data frames with ``"label"`` and ``"text"`` columns,
        each reordered via ``sample(frac=1.0)``.
    """
    # NLTK resources are fetched up front, before the augmenters are built.
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')
    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug("swap"),
        naw.RandomWordAug("delete"),
        nac.KeyboardAug(),
    ])

    train_rows = []
    dev_rows = []
    total = len(dataset)

    # iterrows() yields index *labels*, which need not be positions (for
    # example after filtering), so count positions separately for the
    # progress message.
    for position, (_, row) in enumerate(dataset.iterrows()):
        question = row["question"]
        label = row["question_id"]
        logging.info("[%s/%s] %s", position, total, question)

        variants = list(flow.augment(question, n=expansion))
        train_rows.append({"label": label, "text": question})

        # First 80% of the variants go to train, the rest to dev.
        split_at = int(len(variants) * 0.8)
        train_rows.extend(
            {"label": label, "text": variant}
            for variant in variants[:split_at])
        dev_rows.extend(
            {"label": label, "text": variant}
            for variant in variants[split_at:])

    train = pd.DataFrame(train_rows).sample(frac=1.0)
    dev = pd.DataFrame(dev_rows).sample(frac=1.0)

    return train, dev
Exemplo n.º 8
0
    def test_augment_detail(self):
        """Check the change log returned by flows built with include_detail.

        Each flow must return an ``(augmented_text, details)`` pair where the
        text differs from the input, the detail list is non-empty, and every
        detail entry carries valid positions, a positive ``change_seq`` and a
        known action.
        """
        text = 'The quick brown fox jumps over the lazy dog'

        def insert_then_delete():
            return [
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete"),
            ]

        def substitute_chars():
            return nac.RandomCharAug(action="substitute",
                                     aug_char_min=1,
                                     aug_char_p=0.6,
                                     aug_word_p=0.6)

        sequential_flow = naf.Sequential(
            [
                naf.Sometimes(insert_then_delete(), pipeline_p=0.5),
                naf.Sequential([substitute_chars()], name='Sub_Seq'),
            ],
            include_detail=True)
        sometimes_flow = naf.Sometimes(
            [
                naf.Sometimes(insert_then_delete()),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    substitute_chars(),
                ]),
            ],
            pipeline_p=1,
            include_detail=True)

        for flow in (sequential_flow, sometimes_flow):
            augmented_text, details = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(details), 0)
            for detail in details:
                # Both position fields must be non-negative.
                for position_key in ('orig_start_pos', 'new_start_pos'):
                    self.assertGreater(detail[position_key], -1)
                self.assertGreater(detail['change_seq'], 0)
                self.assertIn(detail['action'], Action.getall())
Exemplo n.º 9
0
    def test_multi_thread(self):
        """Run mixed char/word flows with 1 and 3 threads.

        For every thread count and flow, the number of returned outputs must
        equal the requested ``n``. The word2vec model is located under the
        directory named by the ``MODEL_DIR`` environment variable.
        """
        text = 'The quick brown fox jumps over the lazy dog'
        expected_count = 3

        w2v_model_path = os.path.join(
            os.environ["MODEL_DIR"], 'word', 'word_embs',
            'GoogleNews-vectors-negative300.bin')

        def word2vec_aug():
            return naw.WordEmbsAug(model_type='word2vec',
                                   model_path=w2v_model_path)

        def xlnet_substitute_aug():
            return naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                             action="substitute",
                                             temperature=0.7,
                                             device='cpu')

        sequential_flow = naf.Sequential([
            naf.Sequential([nac.OcrAug(), word2vec_aug()]),
            naf.Sequential([nac.RandomCharAug()]),
            xlnet_substitute_aug(),
        ])
        sometimes_flow = naf.Sometimes(
            [
                naf.Sequential([nac.OcrAug(), nac.RandomCharAug()]),
                naf.Sometimes([word2vec_aug()], pipeline_p=0.999),
                xlnet_substitute_aug(),
            ],
            pipeline_p=0.9999)
        flows = [sequential_flow, sometimes_flow]

        for num_thread in (1, 3):
            for flow in flows:
                outputs = flow.augment(
                    text, n=expected_count, num_thread=num_thread)
                self.assertEqual(len(outputs), expected_count)
Exemplo n.º 10
0
 def parse(config: dict) -> "naf.Sometimes":
     """Build a ``naf.Sometimes`` flow from an augmentation config mapping.

     Each key of ``config`` must name an entry of ``AUGMENTATIONS``. The
     matching value is either a dict of keyword arguments for that entry or
     ``None``, in which case the entry is called without arguments.

     Args:
         config: Mapping of augmentation name to keyword arguments (or None).

     Returns:
         A ``naf.Sometimes`` flow wrapping the constructed augmenters, in the
         iteration order of ``config``.

     Raises:
         KeyError: If a key has no usable entry in ``AUGMENTATIONS``.
     """
     augmentations = []
     for key, value in config.items():
         factory = AUGMENTATIONS.get(key)
         if factory is None:
             # list(...) yields a readable name list instead of the raw
             # dict_keys(...) repr.
             raise KeyError(f"No augmentation named: {key}\n"
                            f"Available augmentations: {list(AUGMENTATIONS)}")
         augmentations.append(factory(**value) if value is not None else factory())
     return naf.Sometimes(augmentations)
Exemplo n.º 11
0
def nlpaug(word):
    """Pass ``word`` through a ``naf.Sometimes`` flow of noise augmenters.

    The flow holds OCR, keyboard, four random-character (insert, substitute,
    swap, delete) and spelling augmenters; the result of the flow's
    ``augment`` call is returned as-is.
    """
    augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ]
    return naf.Sometimes(augmenters).augment(word)
Exemplo n.º 12
0
    def test_n_output(self):
        """Request three augmented copies of each text from each flow.

        Every call must return more than one output and no output may equal
        the original text.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584',
            'AAAAAAAAAAA AAAAAAAAAAAAAA'
        ]

        def insert_aug():
            return nac.RandomCharAug(action=Action.INSERT)

        def delete_aug():
            return nac.RandomCharAug(action=Action.DELETE)

        flows = [
            naf.Sequential([insert_aug(), naw.RandomWordAug()]),
            naf.Sometimes([insert_aug(), delete_aug()], pipeline_p=0.9),
            naf.Sequential([
                naf.Sequential([insert_aug(), naw.RandomWordAug()]),
                naf.Sometimes([insert_aug(), delete_aug()], pipeline_p=0.9),
            ]),
        ]

        for pipeline in flows:
            for original in texts:
                outputs = pipeline.augment(original, n=3)
                self.assertGreater(len(outputs), 1)
                for output in outputs:
                    self.assertNotEqual(output, original)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Exemplo n.º 13
0
    def augmentation(
            self,
            text,
            insert=False,
            substitute=False,
            swap=True,
            delete=True,
            word2vec_path='/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin'):
        """Augment ``text`` with a randomly applied mix of word augmenters.

        The enabled augmenters are wrapped in a ``naf.Sometimes`` flow built
        with ``aug_p=0.5`` and ``pipeline_p=0.5``, and the flow is asked for
        a single output (``n=1``).

        Args:
            text: Input handed to the flow's ``augment`` method.
            insert: Add a word2vec ``WordEmbsAug`` with ``action="insert"``.
            substitute: Add a WordNet-backed ``SynonymAug``.
            swap: Add a ``RandomWordAug`` with ``action="swap"``.
            delete: Add a ``RandomWordAug`` built with its default arguments
                (presumably its default action is deletion -- confirm
                against the installed nlpaug version).
            word2vec_path: Location of the word2vec binary; only read when
                ``insert`` is true. The default keeps the original
                hard-coded location so existing callers behave as before.

        Returns:
            Whatever ``naf.Sometimes.augment`` returns for ``n=1``.
        """
        augmenters = []

        if insert:
            # The GoogleNews vectors can be fetched with:
            # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
            augmenters.append(
                naw.WordEmbsAug(model_type='word2vec',
                                model_path=word2vec_path,
                                action="insert"))

        if substitute:
            augmenters.append(naw.SynonymAug(aug_src='wordnet'))

        if swap:
            augmenters.append(naw.RandomWordAug(action="swap"))

        if delete:
            augmenters.append(naw.RandomWordAug())

        flow = naf.Sometimes(augmenters, aug_p=0.5, pipeline_p=0.5)
        return flow.augment(text, n=1)
Exemplo n.º 14
0
    def test_n_output_without_augmentation(self):
        """Expect each flow to return the lone, unmodified text at least once.

        Each flow is asked for three outputs up to five times; one of those
        calls must return exactly one item equal to the input text.
        """
        texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']

        double_ocr = naf.Sequential([nac.OcrAug(), nac.OcrAug()])
        rarely_random = naf.Sometimes(
            [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001)
        flows = [double_ocr, rarely_random]

        def is_untouched(outputs, original):
            # A single result that equals the input means nothing was changed.
            return len(outputs) == 1 and outputs[0] == original

        for flow in flows:
            for text in texts:
                # any() stops on the first untouched result, so at most five
                # augmentation calls are made.
                found_untouched = any(
                    is_untouched(flow.augment(text, n=3), text)
                    for _ in range(5))
                self.assertTrue(found_untouched)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Exemplo n.º 15
0
    def test_single_action(self):
        """Check that a single-insert ``Sometimes`` flow eventually edits text.

        The flow runs with ``pipeline_p=0.6``, so a pass may leave every text
        untouched. Up to five passes are made, each with a newly built flow,
        and at least one text must come back changed.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        changed = False
        attempts = 0
        # Keep retrying until a pass changes something or five passes are used.
        while attempts < 5 and not changed:
            flow = naf.Sometimes(
                [nac.RandomCharAug(action=Action.INSERT)], pipeline_p=0.6)
            for text in texts:
                if text != flow.augment(text):
                    changed = True

                self.assertLess(0, len(text))
            attempts += 1

        self.assertTrue(changed)
        self.assertLess(0, len(texts))
Exemplo n.º 16
0
 def test_dry_run(self):
     """An argument-less ``naf.Sometimes`` given an empty list returns nothing."""
     empty_flow = naf.Sometimes()
     self.assertEqual(0, len(empty_flow.augment([])))