Пример #1
0
 def test_multiple_actions(self):
     """Check that nested flows of character augmenters change each text.

     A single ``augment`` call may return the text unchanged, so every
     flow/text pair is given up to five attempts and only one of them has
     to produce a different result.
     """
     samples = [
         'The quick brown fox jumps over the lazy dog',
         'Zology raku123456 fasdasd asd4123414 1234584',
     ]

     # Flow 1: a Sequential holding a Sometimes (insert/delete) followed by
     # a named Sequential with a single substitute augmenter.
     insert_or_delete = naf.Sometimes(
         [nac.RandomCharAug(action="insert"),
          nac.RandomCharAug(action="delete")],
         aug_p=0.9)
     substitute_only = naf.Sequential(
         [nac.RandomCharAug(action="substitute", aug_char_min=1,
                            aug_char_p=0.6, aug_word_p=0.6)],
         name='Sub_Seq')
     sequential_flow = naf.Sequential([insert_or_delete, substitute_only])

     # Flow 2: a Sometimes holding another Sometimes (insert/delete) and a
     # Sequential of OCR, keyboard and substitute augmenters.
     inner_sometimes = naf.Sometimes(
         [nac.RandomCharAug(action="insert"),
          nac.RandomCharAug(action="delete")])
     ocr_keyboard_substitute = naf.Sequential([
         nac.OcrAug(),
         nac.KeyboardAug(aug_char_min=1),
         nac.RandomCharAug(action="substitute", aug_char_min=1,
                           aug_char_p=0.6, aug_word_p=0.6),
     ])
     sometimes_flow = naf.Sometimes(
         [inner_sometimes, ocr_keyboard_substitute], aug_p=0.9)

     pipelines = [sequential_flow, sometimes_flow]

     for pipeline in pipelines:
         for sample in samples:
             # any() stops at the first attempt whose output differs.
             changed = any(
                 sample != pipeline.augment(sample, n=1) for _ in range(5))
             self.assertTrue(changed)

     # Guard against the loops above silently running zero times.
     self.assertLess(0, len(pipelines))
     self.assertLess(0, len(samples))
Пример #2
0
    def test_multi_thread(self):
        """Check that each flow returns ``n`` outputs for every thread count.

        The word2vec model file is looked up in the directory named by the
        ``MODEL_DIR`` environment variable; a ``KeyError`` is raised if the
        variable is not set.
        """
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3

        # os.path.join only inserts a separator when MODEL_DIR lacks one, so
        # the path is valid whether or not the variable ends with a slash
        # (plain string concatenation required the trailing slash).
        word2vec_path = os.path.join(
            os.environ["MODEL_DIR"], 'GoogleNews-vectors-negative300.bin')

        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(
                        model_type='word2vec',
                        model_path=word2vec_path)
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(
                    model_path='xlnet-base-cased', action="substitute",
                    skip_unknown_word=True, temperature=0.7, device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=word2vec_path)
                ], pipeline_p=0.999),
                naw.ContextualWordEmbsAug(
                    model_path='xlnet-base-cased', action="substitute",
                    skip_unknown_word=True, temperature=0.7, device='cpu')
            ], pipeline_p=0.9999)
        ]

        # Run every flow with one worker and with three workers.
        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Пример #3
0
    def test_single_action(self):
        """Check that a one-step insert flow changes every sample text."""
        samples = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#',
        ]

        insert_aug = nac.RandomCharAug(action=Action.INSERT, min_char=1)
        pipeline = naf.Sequential([insert_aug])

        for sample in samples:
            result = pipeline.augment(sample)

            self.assertNotEqual(sample, result)
            self.assertLess(0, len(sample))

        # Guard against the loop above silently running zero times.
        self.assertLess(0, len(samples))
Пример #4
0
    def test_n_output_textual(self):
        """Check that ``augment(text, n=3)`` yields several changed texts.

        For each flow/text pair the call must return more than one output
        and none of the outputs may equal the input text.
        """
        samples = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584',
            'AAAAAAAAAAA AAAAAAAAAAAAAA',
        ]

        def char_then_word():
            # Fresh Sequential: character insert followed by RandomWordAug.
            return naf.Sequential(
                [nac.RandomCharAug(action="insert"), naw.RandomWordAug()])

        def insert_or_delete():
            # Fresh Sometimes over character insert and character delete.
            return naf.Sometimes(
                [nac.RandomCharAug(action="insert"),
                 nac.RandomCharAug(action="delete")],
                pipeline_p=0.9)

        # The third flow nests new instances of the first two shapes.
        pipelines = [
            char_then_word(),
            insert_or_delete(),
            naf.Sequential([char_then_word(), insert_or_delete()]),
        ]

        for pipeline in pipelines:
            for sample in samples:
                outputs = pipeline.augment(sample, n=3)
                self.assertGreater(len(outputs), 1)
                for output in outputs:
                    self.assertNotEqual(output, sample)

        # Guard against the loops above silently running zero times.
        self.assertLess(0, len(pipelines))
        self.assertLess(0, len(samples))
Пример #5
0
    def test_n_output_spectrogram(self):
        """Check that ``augment(spectrogram, n=3)`` yields changed outputs.

        The mel spectrogram is loaded from ``self.sample_wav_file``. Each
        flow must return more than one output and no output may be
        array-equal to the input spectrogram.
        """
        # The raw audio is not needed here: only the mel spectrogram is
        # augmented, so the file is not loaded a second time via librosa.
        mel_spectrogram = LoadUtil.load_mel_spectrogram(self.sample_wav_file,
                                                        n_mels=128)

        flows = [
            naf.Sequential([
                nas.FrequencyMaskingAug(mask_factor=80),
                nas.TimeMaskingAug(mask_factor=80)
            ]),
            naf.Sometimes([
                nas.FrequencyMaskingAug(mask_factor=80),
                nas.TimeMaskingAug(mask_factor=80)
            ],
                          pipeline_p=0.9),
            # Nested variant: a Sequential wrapping the two shapes above.
            naf.Sequential([
                naf.Sequential([
                    nas.FrequencyMaskingAug(mask_factor=80),
                    nas.TimeMaskingAug(mask_factor=80)
                ]),
                naf.Sometimes([
                    nas.FrequencyMaskingAug(mask_factor=80),
                    nas.TimeMaskingAug(mask_factor=80)
                ],
                              pipeline_p=0.9)
            ])
        ]

        for flow in flows:
            augmented_mel_spectrograms = flow.augment(mel_spectrogram, n=3)
            self.assertGreater(len(augmented_mel_spectrograms), 1)
            for augmented_mel_spectrogram in augmented_mel_spectrograms:
                self.assertFalse(
                    np.array_equal(mel_spectrogram, augmented_mel_spectrogram))

        # Guard against the loop above silently running zero times.
        self.assertLess(0, len(flows))
Пример #6
0
    def test_audio(self):
        """Check that a four-step audio flow changes the loaded sample.

        The sample WAV is resolved relative to this test file
        (``../../data``) and loaded with ``librosa.load``.
        """
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', 'data', 'Yamaha-V50-Rock-Beat-120bpm.wav'))

        audio, sampling_rate = librosa.load(sample_wav_file)

        flow = naf.Sequential([
            naa.NoiseAug(),
            naa.PitchAug(sampling_rate=sampling_rate, factor=(0.2, 1.5)),
            naa.ShiftAug(sampling_rate=sampling_rate, duration=2),
            naa.SpeedAug(factor=(1.5, 3))
        ])

        augmented_audio = flow.augment(audio)

        self.assertFalse(np.array_equal(audio, augmented_audio))
        # The previous ``assertTrue(len(audio), len(augmented_audio))`` used
        # the second length as the failure *message*, so it only verified
        # that the input was non-empty. State that check explicitly and
        # apply it to the output as well.
        # NOTE(review): lengths are deliberately not compared for equality;
        # the flow contains a speed augmenter, so the output length is
        # presumably allowed to differ - confirm.
        self.assertGreater(len(audio), 0)
        self.assertGreater(len(augmented_audio), 0)
Пример #7
0
    def test_audio(self):
        """Check that a four-step audio flow changes the loaded sample.

        The sample WAV is looked up under ``res/audio`` inside the directory
        named by the ``TEST_DIR`` environment variable and loaded with
        ``AudioLoader.load_audio``.
        """
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.join(os.environ.get("TEST_DIR"), 'res',
                                       'audio',
                                       'Yamaha-V50-Rock-Beat-120bpm.wav')

        audio, sampling_rate = AudioLoader.load_audio(sample_wav_file)

        flow = naf.Sequential([
            naa.NoiseAug(),
            naa.PitchAug(sampling_rate=sampling_rate, factor=(0.2, 1.5)),
            naa.ShiftAug(sampling_rate=sampling_rate, duration=2),
            naa.SpeedAug(factor=(1.5, 3))
        ])

        augmented_audio = flow.augment(audio)

        self.assertFalse(np.array_equal(audio, augmented_audio))
        # The previous ``assertTrue(len(audio), len(augmented_audio))`` used
        # the second length as the failure *message*, so it only verified
        # that the input was non-empty. State that check explicitly and
        # apply it to the output as well.
        # NOTE(review): lengths are deliberately not compared for equality;
        # the flow contains a speed augmenter, so the output length is
        # presumably allowed to differ - confirm.
        self.assertGreater(len(audio), 0)
        self.assertGreater(len(augmented_audio), 0)
Пример #8
0
    def test_n_output_without_augmentation(self):
        """Check that flows which leave the text alone return it once.

        With ``n=3`` requested, at least one of five attempts per flow/text
        pair must return exactly one output that equals the input.
        """
        samples = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
        pipelines = [
            naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
            naf.Sometimes(
                [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001),
        ]

        def returns_input_once(pipeline, sample):
            # True when the call yields a single output identical to sample.
            outputs = pipeline.augment(sample, n=3)
            return len(outputs) == 1 and outputs[0] == sample

        for pipeline in pipelines:
            for sample in samples:
                # any() stops at the first attempt that satisfies the check.
                unchanged_seen = any(
                    returns_input_once(pipeline, sample) for _ in range(5))
                self.assertTrue(unchanged_seen)

        # Guard against the loops above silently running zero times.
        self.assertLess(0, len(pipelines))
        self.assertLess(0, len(samples))
Пример #9
0
    def test_spectrogram(self):
        """Check that masking augmenters zero out the slice they report.

        For every augmenter in the flow, the row at ``aug.model.f0``
        (frequency masking) or the column at ``aug.model.t0`` (time masking)
        must be fully non-zero in the input spectrogram and fully zero in
        the augmented one.
        """
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', 'data',
                         'Yamaha-V50-Rock-Beat-120bpm.wav'))

        mel_spectrogram = LoadUtil.load_mel_spectrogram(sample_wav_file,
                                                        n_mels=128)

        flow = naf.Sequential([
            nas.FrequencyMaskingAug(mask_factor=50),
            nas.TimeMaskingAug(mask_factor=20),
            nas.TimeMaskingAug(mask_factor=30)
        ])

        augmented_mel_spectrogram = flow.augment(mel_spectrogram)

        for aug in flow:
            if aug.name == 'FrequencyMasking_Aug':
                # Row f0: every value set before, none set after.
                self.assertEqual(
                    len(mel_spectrogram[aug.model.f0]),
                    np.count_nonzero(mel_spectrogram[aug.model.f0]))
                self.assertEqual(
                    0,
                    np.count_nonzero(augmented_mel_spectrogram[aug.model.f0]))
            elif aug.name == 'TimeMasking_Aug':
                # Column t0: every value set before, none set after.
                self.assertEqual(
                    len(mel_spectrogram[:, aug.model.t0]),
                    np.count_nonzero(mel_spectrogram[:, aug.model.t0]))
                self.assertEqual(
                    0,
                    np.count_nonzero(augmented_mel_spectrogram[:,
                                                               aug.model.t0]))
            else:
                # Fail with the offending name instead of a bare
                # "True is not false".
                self.fail('Unexpected augmenter in flow: %s' % aug.name)

        # Guard against the loop above silently running zero times.
        self.assertGreater(len(flow), 0)
Пример #10
0
def augment_n(data, N=1):
    """Return ``data`` with augmented rows stacked in front of it.

    Each row is passed to the module's ``augment`` helper as
    ``augment(row[1], row[2], aug, N)``. The helper's two return values are
    indexed in parallel to build new rows of the form
    ``[row[0] + str(j), first[j], second[j], row[3]]``.

    Args:
        data: 2-D array-like; iterated row by row and must expose
            ``shape[0]``. Columns 0-3 of every row are read.
            NOTE(review): ``row[0]`` is concatenated with ``str(j)``, so it
            presumably holds a string id - confirm against the caller.
        N: forwarded unchanged to ``augment``.

    Returns:
        ``np.concatenate`` (axis 0) of one array per row that produced
        augmentations, followed by the original ``data``.
    """
    pbar = tqdm(desc='Augmenting Data N={}'.format(N),
                total=data.shape[0],
                leave=False)

    # try/finally so the progress bar is cleared and closed even when model
    # construction or augmentation raises part-way through.
    try:
        # Flow: contextual-embedding insertion followed by synonym
        # replacement. (A ContextualWordEmbsAug step with aug_max=5 and no
        # explicit action, and a SynonymAug-only setup, were tried earlier
        # and left disabled.)
        aug = naf.Sequential([
            naw.ContextualWordEmbsAug('bert-base-uncased',
                                      aug_max=3,
                                      stopwords=stop_words(),
                                      device='cuda',
                                      optimize=True,
                                      action='insert'),
            naw.SynonymAug(aug_max=4, stopwords=stop_words())
        ])

        results = []
        for row in data:
            first_values, second_values = augment(row[1], row[2], aug, N)

            # Distinct loop names: the loop variable no longer shadows the
            # sequence being enumerated.
            new_rows = [
                [row[0] + str(j), first_value, second_values[j], row[3]]
                for j, first_value in enumerate(first_values)
            ]
            if new_rows:
                results.append(np.array(new_rows))
            pbar.update()
    finally:
        pbar.clear()
        pbar.close()

    # The untouched input goes last so it follows all augmented rows.
    results.append(data)
    return np.concatenate(results, axis=0)
Пример #11
0
    def test_n_output_without_augmentation(self):
        """Check that flows which leave the text alone never change it.

        Each flow/text pair is augmented with ``n=3`` up to five times; the
        test fails as soon as any returned text differs from the input.
        """
        samples = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
        pipelines = [
            naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
            naf.Sometimes(
                [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001),
        ]

        def any_output_differs(pipeline, sample):
            # True when at least one returned text is not the input text.
            return any(
                output != sample for output in pipeline.augment(sample, n=3))

        for pipeline in pipelines:
            for sample in samples:
                # Stops retrying at the first attempt that changed the text.
                changed = any(
                    any_output_differs(pipeline, sample) for _ in range(5))
                self.assertFalse(changed)

        # Guard against the loops above silently running zero times.
        self.assertLess(0, len(pipelines))
        self.assertLess(0, len(samples))
Пример #12
0
 def test_dry_run(self):
     """Check that an empty Sequential flow returns nothing for ``[]``."""
     outputs = naf.Sequential().augment([])
     self.assertEqual(0, len(outputs))
Пример #13
0
 def build_augments(self):
     """Create the text-augmentation flow and store it on ``self.augs``."""
     # Only WordNet synonym replacement is active. A BERT contextual
     # "insert" augmenter and a random word "swap" augmenter were left
     # disabled.
     synonym_aug = naw.SynonymAug(aug_src='wordnet')
     self.augs = naf.Sequential([synonym_aug])
 def __call__(self, data):
     """Augment ``data`` with a newly built frequency-mask/time-mask flow.

     Both masks are requested from ``self`` on every call (frequency mask
     first, then time mask) and wrapped in a ``naf.Sequential``.
     """
     freq_mask = self.get_random_freq_mask()
     time_mask = self.get_random_time_mask()
     pipeline = naf.Sequential([freq_mask, time_mask])
     return pipeline.augment(data)
Пример #15
0
            try:
                os.makedirs(path)
            except OSError:
                print("Creation of the directory %s failed" % path)
            else:
                print("Creation of the directory %s success" % path)
            if os.path.exists(csv_file_path):
                input_directory = Path(csv_file_path)
                for my_filename in input_directory.glob("*_AUDIO_*.wav"):
                    audio, sampling_rate = librosa.load(my_filename)
                    VisualWave.visual('Original', audio, sampling_rate)

                    flow = naf.Sequential([
                        naa.NoiseAug(),
                        naa.PitchAug(sampling_rate=sampling_rate,
                                     pitch_factor=1.5),
                        naa.ShiftAug(sampling_rate=sampling_rate, shift_max=2),
                        naa.SpeedAug(speed_factor=1.5)
                    ])
                    augmented_audio = flow.augment(audio)
                    VisualWave.visual('augment', augmented_audio,
                                      sampling_rate)
                    my_filename = my_filename.stem
                    librosa.output.write_wav(path + my_filename + '.wav',
                                             augmented_audio,
                                             sampling_rate,
                                             norm=False)
        except ValueError:
            print("Skipping the following line: ", row[0])
csvFile.close()