def test_multiple_actions(self):
    """Nested Sequential/Sometimes pipelines should alter at least one text.

    Because every augmenter in the pipeline is probabilistic (and
    ``pipeline_p`` may skip steps entirely), a single pass can leave the
    input untouched; the test therefore retries up to 5 times per pipeline
    before requiring that some output differed from its input.
    """
    sample_texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]
    pipelines = [
        naf.Sequential([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE),
            ], pipeline_p=0.5),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(
                    action=Action.SUBSTITUTE, aug_min=1,
                    aug_char_p=0.6, aug_word_p=0.6),
            ], name='Sub_Seq'),
        ]),
        naf.Sometimes([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE),
            ]),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(
                    action=Action.SUBSTITUTE, aug_min=1,
                    aug_char_p=0.6, aug_word_p=0.6),
            ]),
        ], pipeline_p=0.5),
    ]

    for pipeline in pipelines:
        changed = False
        # Retry loop: low augmentation probability may yield no change.
        for _ in range(0, 5):
            for sample in sample_texts:
                self.assertLess(0, len(sample))
                result = pipeline.augment(sample)
                if sample != result:
                    changed = True
                    self.assertLess(0, len(sample))
            if changed:
                break
        self.assertTrue(changed)

    self.assertLess(0, len(pipelines))
    self.assertLess(0, len(sample_texts))
def test_multiple_actions(self):
    """Chained character/word augmenters must modify every sample text.

    Unlike the probabilistic-flow variant, these plain ``Sequential``
    pipelines are expected to always change their input, so a strict
    inequality assertion is used on each augmented result.
    """
    sample_texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]
    pipelines = [
        naf.Sequential([
            nac.RandomCharAug(action=Action.INSERT),
            naw.RandomWordAug(),
        ]),
        naf.Sequential([
            nac.OcrAug(),
            nac.QwertyAug(aug_min=1),
            nac.RandomCharAug(
                action=Action.SUBSTITUTE, aug_min=1,
                aug_char_p=0.6, aug_word_p=0.6),
        ]),
    ]

    for pipeline in pipelines:
        for sample in sample_texts:
            result = pipeline.augment(sample)
            self.assertNotEqual(sample, result)
            self.assertLess(0, len(sample))

    self.assertLess(0, len(sample_texts))
    self.assertLess(0, len(pipelines))
def test_empty(self):
    """Empty-string and ``None`` inputs must pass through augmenters unchanged."""
    augmenters = [
        nac.OcrAug(),
        nac.QwertyAug(),
    ]
    for original in ['', None]:
        for aug in augmenters:
            # Nothing to augment, so the input should be returned as-is.
            self.assertEqual(original, aug.augment(original))
def test_tokenizer(self):
    """A tokenizer supplied at construction is stored and used by each augmenter.

    Exercises ``split_sentence`` on a plain sentence and on one containing
    punctuation surrounded by spaces, checking the exact token split
    (including the leading-space tokens the splitter produces).
    """
    augmenters = [
        nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
        nac.QwertyAug(tokenizer=text_tokenizer.split_sentence),
        nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
    ]

    cases = [
        (
            'The quick brown fox, jumps over lazy dog.',
            ['The', ' quick', ' brown', ' fox', ', ', 'jumps',
             ' over', ' lazy', ' dog', '.'],
        ),
        (
            'The quick !brown fox, jumps # over lazy dog .',
            ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps',
             ' # ', 'over', ' lazy', ' dog', ' .'],
        ),
    ]

    for sentence, expected_tokens in cases:
        for aug in augmenters:
            self.assertEqual(aug.tokenizer(sentence), expected_tokens)