def test_multiple_actions(self):
    """Check that each ``naf.Sequential`` flow changes every sample text."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    # First flow: a character insertion followed by naw.RandomWordAug.
    insert_then_word = naf.Sequential([
        nac.RandomCharAug(action=Action.INSERT),
        naw.RandomWordAug()
    ])
    # Second flow: three character-level augmenters applied back to back.
    char_chain = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_min=1),
        nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                          aug_char_p=0.6, aug_word_p=0.6)
    ])
    flows = [insert_then_word, char_chain]

    for pipeline in flows:
        for sample in texts:
            result = pipeline.augment(sample)

            self.assertNotEqual(sample, result)
            self.assertLess(0, len(sample))

        self.assertLess(0, len(texts))

    self.assertLess(0, len(flows))
def test_n_output_without_augmentation(self):
    """Assert that none of the ``n=3`` outputs differs from the input.

    Each flow/text pair is tried up to five times; the first round in which
    any output differs stops the retries and makes the assertion fail.
    """
    texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
    flows = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes([nac.RandomCharAug(), nac.RandomCharAug()],
                      pipeline_p=0.00001)
    ]

    for pipeline in flows:
        for sample in texts:
            for _ in range(5):
                outputs = pipeline.augment(sample, n=3)
                # True as soon as one output no longer matches the input.
                found_difference = any(
                    candidate != sample for candidate in outputs)
                if found_difference:
                    break

            self.assertFalse(found_difference)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def nlpaug(word):
    """Return ``word`` after passing it through a ``naf.Sometimes`` pipeline.

    The pipeline holds OCR, keyboard, four ``RandomCharAug`` variants
    (insert, substitute, swap, delete) and a spelling augmenter, in that
    order. A new pipeline is built on every call.
    """
    noise_makers = [nac.OcrAug(), nac.KeyboardAug()]
    noise_makers.extend(
        nac.RandomCharAug(action=kind)
        for kind in ("insert", "substitute", "swap", "delete")
    )
    noise_makers.append(naw.SpellingAug())
    return naf.Sometimes(noise_makers).augment(word)
def test_multi_thread(self):
    """Check that nested flows return ``n`` outputs with 1 and 3 threads.

    The word2vec model file is looked up under the ``MODEL_DIR``
    environment variable.
    """
    text = 'The quick brown fox jumps over the lazy dog'
    n = 3

    w2v_model_path = os.path.join(
        os.environ["MODEL_DIR"], 'word', 'word_embs',
        'GoogleNews-vectors-negative300.bin')

    # Keyword sets shared by the repeated augmenter constructions below.
    w2v_kwargs = dict(model_type='word2vec', model_path=w2v_model_path)
    xlnet_kwargs = dict(model_path='xlnet-base-cased', action="substitute",
                        temperature=0.7, device='cpu')

    flows = [
        naf.Sequential([
            naf.Sequential([
                nac.OcrAug(),
                naw.WordEmbsAug(**w2v_kwargs)
            ]),
            naf.Sequential([
                nac.RandomCharAug(),
            ]),
            naw.ContextualWordEmbsAug(**xlnet_kwargs)
        ]),
        naf.Sometimes([
            naf.Sequential([
                nac.OcrAug(),
                nac.RandomCharAug(),
            ]),
            naf.Sometimes([
                naw.WordEmbsAug(**w2v_kwargs)
            ], pipeline_p=0.999),
            naw.ContextualWordEmbsAug(**xlnet_kwargs)
        ], pipeline_p=0.9999)
    ]

    for thread_count in [1, 3]:
        for pipeline in flows:
            outputs = pipeline.augment(text, n=n, num_thread=thread_count)
            self.assertEqual(len(outputs), n)
def setUpClass(cls):
    """Load the ``.env`` file two directories up and build shared augmenters."""
    here = os.path.dirname(__file__)
    env_config_path = os.path.abspath(os.path.join(here, '..', '..', '.env'))
    load_dotenv(env_config_path)

    # One character-, one word- and one sentence-level augmenter.
    char_aug = nac.RandomCharAug()
    word_aug = naw.ContextualWordEmbsAug()
    sentence_aug = nas.ContextualWordEmbsForSentenceAug()
    cls.augs = [char_aug, word_aug, sentence_aug]
def test_n_output_without_augmentation(self):
    """Expect a single, unchanged output within five attempts per flow/text.

    An attempt counts as a hit when ``augment(text, n=3)`` yields exactly one
    element and that element equals the input.
    """
    texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
    flows = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes([nac.RandomCharAug(), nac.RandomCharAug()],
                      pipeline_p=0.00001)
    ]

    for pipeline in flows:
        for sample in texts:
            at_least_one_equal = False
            attempts_left = 5
            # Stop retrying as soon as one attempt returns the input as-is.
            while attempts_left and not at_least_one_equal:
                attempts_left -= 1
                outputs = pipeline.augment(sample, n=3)
                at_least_one_equal = (
                    len(outputs) == 1 and outputs[0] == sample)

            self.assertTrue(at_least_one_equal)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def test_multi_thread(self):
    """Check that character augmenters return ``n`` outputs with 1 and 3 threads."""
    text = 'The quick brown fox jumps over the lazy dog.'
    n = 3

    augs = [
        aug_cls(tokenizer=text_tokenizer.split_sentence)
        for aug_cls in (nac.KeyboardAug, nac.RandomCharAug)
    ]

    for thread_count in [1, 3]:
        for aug in augs:
            outputs = aug.augment(text, n=n, num_thread=thread_count)
            self.assertEqual(len(outputs), n)
def test_multiple_actions(self):
    """Check that every nested flow changes at least one text within 5 tries.

    The flows contain ``naf.Sometimes`` stages, so a single pass may leave
    the input untouched; each flow is therefore retried up to five times.
    The change flag is reset for every flow and asserted for every flow, so
    a flow that never alters its input cannot hide behind a later one.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    flows = [
        naf.Sequential([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ], pipeline_p=0.5),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ], name='Sub_Seq')
        ]),
        naf.Sometimes([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ]),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ])
        ], pipeline_p=0.5)
    ]

    # The pipeline probability may be low enough that no augmentation is
    # performed on a given pass, so retry up to 5 times per flow.
    for flow in flows:
        at_least_one_not_equal = False
        for _ in range(5):
            for text in texts:
                self.assertLess(0, len(text))
                if text != flow.augment(text):
                    at_least_one_not_equal = True

            if at_least_one_not_equal:
                break

        # Checked per flow because the flag is reset per flow.
        self.assertTrue(at_least_one_not_equal)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def test_empty_input_for_insert(self):
    """Check that blank inputs stay blank (or become ``None``) under insert."""
    texts = ['', ' ']
    augs = [nac.RandomCharAug(action='insert')]

    def assert_blank(value):
        # Accept either no result at all or a whitespace-only string.
        self.assertTrue(value is None or value.strip() == '')

    for aug in augs:
        # One text at a time.
        for text in texts:
            assert_blank(aug.augment(text))

        # The whole list in a single call.
        for augmented_text in aug.augment(texts):
            assert_blank(augmented_text)
def test_stopwords(self):
    """Check that at least one non-stopword is altered on every run.

    Every word except 'quick', 'over' and 'lazy' is listed as a stopword, so
    at least one of those three must be missing from each augmented text.
    """
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']
    eligible_words = ('quick', 'over', 'lazy')

    augs = [
        aug_cls(stopwords=stopwords)
        for aug_cls in (nac.RandomCharAug, nac.KeyboardAug, nac.OcrAug)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            self.assertTrue(
                any(word not in augmented_text for word in eligible_words))
def test_stopwords_regex(self):
    """Check that at least one of 'quick', 'over', 'lazy' is altered per run.

    The augmenters are configured with a ``stopwords_regex`` pattern instead
    of an explicit stopword list.
    """
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "
    eligible_words = ('quick', 'over', 'lazy')

    augs = [
        nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
        nac.KeyboardAug(stopwords_regex=stopwords_regex),
        nac.OcrAug(stopwords_regex=stopwords_regex)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            self.assertTrue(
                any(word not in augmented_text for word in eligible_words))
def test_single_action(self):
    """Check that a one-step insert flow changes every sample text."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584 s@#'
    ]

    inserter = nac.RandomCharAug(action=Action.INSERT, min_char=1)
    flow = naf.Sequential([inserter])

    for sample in texts:
        result = flow.augment(sample)

        self.assertNotEqual(sample, result)
        self.assertLess(0, len(sample))

    self.assertLess(0, len(texts))
def test_empty_input_for_substitute(self):
    """Check that blank inputs stay blank (or become ``None``) under substitution."""
    texts = ['', ' ']
    augs = [
        nac.RandomCharAug(action='substitute'),
        nac.KeyboardAug(),
        nac.OcrAug()
    ]

    def assert_blank(value):
        # Accept either no result at all or a whitespace-only string.
        self.assertTrue(value is None or value.strip() == '')

    for aug in augs:
        # One text at a time.
        for text in texts:
            assert_blank(aug.augment(text))

        # The whole list in a single call.
        for augmented_text in aug.augment(texts):
            assert_blank(augmented_text)
def test_multiple_actions(self):
    """Check that each flow changes each text within five attempts."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    flows = [
        naf.Sequential([
            naf.Sometimes([
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete")
            ], pipeline_p=0.9),
            naf.Sequential([
                # nac.OcrAug(), nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ], name='Sub_Seq')
        ]),
        naf.Sometimes([
            naf.Sometimes([
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete")
            ]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ])
        ], pipeline_p=0.9)
    ]

    # A pass may perform no augmentation when the pipeline probability does
    # not trigger, so allow up to 5 attempts; any() stops at the first change.
    for pipeline in flows:
        for sample in texts:
            changed = any(
                sample != pipeline.augment(sample, n=1) for _ in range(5))
            self.assertTrue(changed)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def test_min_char(self):
    """Check that 'apple' gets altered within ten attempts when ``min_char=5``."""
    text = 'He eats apple'
    augs = [
        aug_cls(min_char=5)
        for aug_cls in (nac.RandomCharAug, nac.KeyboardAug, nac.OcrAug)
    ]

    for aug in augs:
        # any() stops augmenting as soon as 'apple' is no longer present.
        augmented = any(
            'apple' not in aug.augment(text) for _ in range(10))
        self.assertTrue(augmented)
def apply_noise(datestr: str, format_dict: Dict[str, str],
                noise_dict: Dict[str, Any]) -> str:
    """Return ``datestr`` with the noise described by ``noise_dict`` applied.

    Steps, in order: optional day suffix on the first component, an
    occasional (30%) single-character typo in the second component, noisy
    separators when re-joining, casing, and optional embedding in a sentence.

    Args:
        datestr: Date string whose components are separated by the
            configured separator.
        format_dict: Reads ``"separator"`` (only its first character is used
            when it is longer than one character) and ``"month"`` (a typo is
            only considered when this value is longer than two characters —
            presumably a textual month format; confirm against the caller).
        noise_dict: Reads ``"append_day_suffix"``, ``"day_suffix"``,
            ``"aug_char_action"``, ``"noisy_separator"``, ``"casing"``
            (``"uppercase"``/``"lowercase"``; anything else leaves casing
            alone), ``"place_in_sentence"`` and ``"sentence"``.

    Returns:
        The noised date string.
    """
    sep = format_dict["separator"]
    sep = sep[0] if len(sep) > 1 else sep
    date_parts = datestr.split(sep)

    if noise_dict["append_day_suffix"]:
        date_parts[0] = date_parts[0] + noise_dict["day_suffix"]

    # Add spelling mistake to month name. The random draw stays ahead of the
    # length guard so the random stream is consumed exactly as before; the
    # guard avoids an IndexError when the date has no second component.
    if (len(format_dict["month"]) > 2 and np.random.random() <= 0.3
            and len(date_parts) > 1):
        aug = nac.RandomCharAug(
            action=noise_dict["aug_char_action"],
            aug_char_min=1,
            aug_char_max=1,
        )
        noisy_month = aug.augment(date_parts[1])
        # Newer nlpaug releases return a list of outputs rather than a
        # string; unwrap it so the list repr never leaks into the result.
        if isinstance(noisy_month, (list, tuple)):
            noisy_month = noisy_month[0] if noisy_month else date_parts[1]
        date_parts[1] = noisy_month

    noisy_separator = noise_dict["noisy_separator"]
    pieces = []
    for idx, date_part in enumerate(date_parts):
        part_sep = sep
        # Drawn for every component (including the first, whose separator is
        # never emitted) to keep seeded runs reproducible.
        rand_val = np.random.random()
        if noisy_separator:
            if rand_val <= 0.15:
                # Pad the separator with a space on both sides.
                part_sep = " " + part_sep + " "
            elif rand_val <= 0.5:
                part_sep += "".join(
                    np.random.choice(ADDITIONAL_PUNCTUATION, size=2))

        if idx > 0:
            pieces.append(part_sep)
        pieces.append(date_part)
    out = "".join(pieces)

    if noise_dict["casing"] == "uppercase":
        out = out.upper()
    elif noise_dict["casing"] == "lowercase":
        out = out.lower()

    if noise_dict["place_in_sentence"]:
        out = put_datestr_in_sentence(out, noise_dict["sentence"])

    return out
def test_tokenizer(self):
    """Check ``split_sentence`` tokenisation through each augmenter's tokenizer."""
    augs = [
        aug_cls(tokenizer=text_tokenizer.split_sentence)
        for aug_cls in (nac.OcrAug, nac.KeyboardAug, nac.RandomCharAug)
    ]

    # (input text, expected token list) pairs, checked in this order.
    cases = [
        (
            'The quick brown fox, jumps over lazy dog.',
            ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over',
             ' lazy', ' dog', '.'],
        ),
        (
            'The quick !brown fox, jumps # over lazy dog .',
            ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ',
             'over', ' lazy', ' dog', ' .'],
        ),
    ]

    for text, expected_tokens in cases:
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)
def setUpClass(cls):
    """Load ``.env``, the sample audio clip, and the shared augmenters.

    The sample wav file is looked up inside the directory named by the
    ``DATA_DIR`` environment variable.

    Raises:
        KeyError: If ``DATA_DIR`` is not set once the ``.env`` file has been
            loaded.
    """
    env_config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', '.env'))
    load_dotenv(env_config_path)

    # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
    # os.path.join keeps the path valid whether or not DATA_DIR ends with a
    # separator, and indexing os.environ names the missing variable instead
    # of failing on ``None + str``.
    cls.sample_wav_file = os.path.join(
        os.environ["DATA_DIR"], 'Yamaha-V50-Rock-Beat-120bpm.wav')
    cls.audio, cls.sampling_rate = librosa.load(cls.sample_wav_file)

    cls.textual_augs = [
        nac.RandomCharAug(),
        naw.ContextualWordEmbsAug(),
        nas.ContextualWordEmbsForSentenceAug()
    ]

    cls.audio_augs = [
        naa.CropAug(sampling_rate=cls.sampling_rate),
        naa.SpeedAug(),
    ]
def test_multi_inputs(self):
    """Check that a list input yields one output per text with 2 and 1 threads."""
    texts = [
        'The quick brown fox jumps over the lazy dog.',
        'The quick brown fox jumps over the lazy dog.',
        'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )',
        'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )'
    ]

    augs = [
        aug_cls(tokenizer=text_tokenizer.split_sentence)
        for aug_cls in (nac.KeyboardAug, nac.RandomCharAug)
    ]

    # Multi-threaded pass first, then the single-threaded one.
    for num_thread in (2, 1):
        for aug in augs:
            augmented_data = aug.augment(texts, num_thread=num_thread)
            self.assertEqual(len(augmented_data), len(texts))
def test_augment_detail(self):
    """Check the change log returned by augmenters built with ``include_detail=True``.

    Each augmenter must change the text and report at least one change
    record with sane positions, sequence number and action. The records are
    then replayed backwards to rebuild the original text.
    """
    text = 'The quick brown fox jumps over the lazy dog'
    augs = [
        nac.KeyboardAug(min_char=1, include_detail=True),
        nac.OcrAug(min_char=1, include_detail=True),
        nac.RandomCharAug(min_char=2, include_detail=True)
    ]

    for aug in augs:
        # With include_detail=True the result unpacks into (text, change records).
        augmented_text, augment_details = aug.augment(text)

        self.assertNotEqual(text, augmented_text)
        self.assertGreater(len(augment_details), 0)
        for augment_detail in augment_details:
            self.assertTrue(augment_detail['orig_token'] in text)
            self.assertGreater(augment_detail['orig_start_pos'], -1)
            self.assertGreater(augment_detail['new_start_pos'], -1)
            self.assertGreater(augment_detail['change_seq'], 0)
            self.assertIn(augment_detail['action'], Action.getall())

        # Get back original input by re-engineering
        reengineering_text = augmented_text
        # Undo changes from the highest original position downwards.
        for change_obj in sorted(augment_details, key=lambda item: item['orig_start_pos'], reverse=True):
            if change_obj['action'] == Action.DELETE:
                # Re-insert the removed token (plus a space) at its recorded position.
                text_prefix = reengineering_text[:change_obj['new_start_pos']]
                text_core = change_obj['orig_token'] + ' '
                text_suffix = reengineering_text[change_obj['new_start_pos']:]
            elif change_obj['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                # Swap the first occurrence of the new token, searching from
                # its recorded position, back to the original token.
                text_prefix = reengineering_text[:change_obj['new_start_pos']]
                text_core = reengineering_text[change_obj['new_start_pos']:].replace(
                    change_obj['new_token'], change_obj['orig_token'], 1)
                text_suffix = ''
            # TODO
            # elif change_obj['action'] in Action.SWAP:
            # NOTE(review): for any other action the three text_* variables
            # keep their values from the previous iteration, or are unbound
            # on the first one.

            reengineering_text = text_prefix + text_core + text_suffix
            reengineering_text = reengineering_text.strip()

        self.assertEqual(text, reengineering_text)
def test_augment_detail(self):
    """Check the change log returned by flows built with ``include_detail=True``."""
    text = 'The quick brown fox jumps over the lazy dog'

    def insert_delete_pair():
        # Fresh insert + delete augmenters for a Sometimes stage.
        return [
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete")
        ]

    def substitute_aug():
        # Fresh substitute augmenter with the settings shared by both flows.
        return nac.RandomCharAug(action="substitute", aug_char_min=1,
                                 aug_char_p=0.6, aug_word_p=0.6)

    flows = [
        naf.Sequential([
            naf.Sometimes(insert_delete_pair(), pipeline_p=0.5),
            naf.Sequential([substitute_aug()], name='Sub_Seq')
        ], include_detail=True),
        naf.Sometimes([
            naf.Sometimes(insert_delete_pair()),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                substitute_aug()
            ])
        ], pipeline_p=1, include_detail=True)
    ]

    for pipeline in flows:
        augmented_text, change_log = pipeline.augment(text)

        self.assertNotEqual(text, augmented_text)
        self.assertGreater(len(change_log), 0)
        for entry in change_log:
            # Both recorded positions must be non-negative.
            for position_key in ('orig_start_pos', 'new_start_pos'):
                self.assertGreater(entry[position_key], -1)
            self.assertGreater(entry['change_seq'], 0)
            self.assertIn(entry['action'], Action.getall())
def test_single_action(self):
    """Check that a ``Sometimes`` insert flow changes some text within 5 tries."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584 s@#'
    ]

    # With pipeline_p=0.6 a pass may perform no augmentation, so a new flow
    # is built and tried up to 5 times.
    changed = False
    attempts_left = 5
    while attempts_left and not changed:
        attempts_left -= 1
        flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)],
                             pipeline_p=0.6)

        # Every text is augmented on each attempt, even after a change is seen.
        for sample in texts:
            result = flow.augment(sample)
            if sample != result:
                changed = True

            self.assertLess(0, len(sample))

    self.assertTrue(changed)
    self.assertLess(0, len(texts))
def random_delete_aug(corpus):
    """Return a corpus whose train split holds character-deleted sentences.

    Every sentence in ``corpus.train`` is augmented with ``n=3`` by a
    ``RandomCharAug`` in "delete" mode. Each augmented text is split on
    single spaces and paired with the original tokens so that every new
    token receives the original token's ``annotation_layers``. The ``dev``
    and ``test`` splits are passed through unchanged.
    """
    deleter = nac.RandomCharAug(tokenizer=whitespace_tokenizer, action="delete")

    def rebuild(noisy_text, source_sentence):
        # Build a Sentence from the noisy text, copying annotations across
        # position by position; zip stops at the shorter of the two.
        rebuilt = Sentence()
        for noisy_token_text, source_token in zip(noisy_text.split(" "),
                                                  source_sentence):
            noisy_token = Token(noisy_token_text)
            noisy_token.annotation_layers = source_token.annotation_layers
            rebuilt.add_token(noisy_token)
        return rebuilt

    # Only the train split is augmented.
    augmented_sentences = [
        rebuild(noisy_text, sentence)
        for sentence in corpus.train
        for noisy_text in deleter.augment(sentence.to_tokenized_string(), n=3)
    ]

    return Corpus(train=SentenceDataset(augmented_sentences),
                  dev=corpus.dev, test=corpus.test)
def test_n_output(self):
    """Check that ``n=3`` yields more than one output, each unlike the input."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
        'AAAAAAAAAAA AAAAAAAAAAAAAA'
    ]

    def insert_then_word():
        # Character insertion followed by naw.RandomWordAug.
        return naf.Sequential([
            nac.RandomCharAug(action=Action.INSERT),
            naw.RandomWordAug()
        ])

    def insert_or_delete():
        # Insert and delete augmenters inside a Sometimes stage.
        return naf.Sometimes([
            nac.RandomCharAug(action=Action.INSERT),
            nac.RandomCharAug(action=Action.DELETE)
        ], pipeline_p=0.9)

    flows = [
        insert_then_word(),
        insert_or_delete(),
        naf.Sequential([insert_then_word(), insert_or_delete()])
    ]

    for pipeline in flows:
        for sample in texts:
            outputs = pipeline.augment(sample, n=3)
            self.assertGreater(len(outputs), 1)
            for candidate in outputs:
                self.assertNotEqual(candidate, sample)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def random_char_swap(text):
    """Print ``text`` after a ``RandomCharAug`` pass with action "swap"."""
    swapper = nac.RandomCharAug(action="swap")
    attacked_text = swapper.augment(text)
    # Header and result on separate lines, as two print calls would produce.
    print("Attacked Text:", attacked_text, sep="\n")
def random_char_del(text):
    """Print ``text`` after a ``RandomCharAug`` pass with action "delete"."""
    deleter = nac.RandomCharAug(action="delete")
    attacked_text = deleter.augment(text)
    # Header and result on separate lines, as two print calls would produce.
    print("Attacked Text:", attacked_text, sep="\n")
rows = [] for row in csv_reader: rows.append(row) for row in list(rows): # Substitute character by keyboard distance if row[1] != "flag" and row[1] !='R' and row[1]!="": counter += 1 if counter != 2: row[2] = str(keyboard_dis.augment(row[2])) if counter == 3: counter = 0 csv_writer.writerow(row) with open("input_classification_test_data.csv","r") as input: with open("/Users/wenyaxie/Downloads/negative_data_character_insertion.csv","w") as output: random_insert = nac.RandomCharAug(action="insert") csv_reader = csv.reader(input) csv_writer = csv.writer(output) counter = 0 rows = [] for row in csv_reader: rows.append(row) for row in list(rows): # Insert character randomly if row[1] != "flag" and row[1] !='R' and row[1]!="": counter += 1 if counter != 2: row[2] = str(random_insert.augment(row[2])) if counter == 3:
def random_char_subsi(text):
    """Print ``text`` after a ``RandomCharAug`` pass with action "substitute"."""
    substituter = nac.RandomCharAug(action="substitute")
    attacked_text = substituter.augment(text)
    # Header and result on separate lines, as two print calls would produce.
    print("Attacked Text:", attacked_text, sep="\n")
import warnings

import numpy as np
import pandas as pd
from numpy.random import choice
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype

import nlpaug.augmenter.char as nac

warnings.filterwarnings('ignore')

# Module-level character augmenters.
swapRandom = nac.RandomCharAug(action="swap")
replaceTwoCharsBasedOnKeyboard = nac.KeyboardAug()
deleteRandomChar = nac.RandomCharAug(action="delete")

# The seed is set globally for numpy.
# TODO: seed locally for the class (e.g. r = np.random.RandomState(0))
# without impacting the global numpy seed.
np.random.seed(0)


class DataCorruptor:
    def __init__(self, data, feature_cols, feature_stats=None, log=False):
        """Store per-feature statistics, deriving them from ``data`` if absent.

        When ``feature_stats`` is ``None`` the 'mean', 'std', 'max' and 'min'
        columns of ``data.describe().T`` are used instead.
        """
        # np.random.seed(0)
        # TODO: take the cardinal statistics (like most common value) into
        # account while corrupting data.
        self.feature_stats = (
            data.describe().T[['mean', 'std', 'max', 'min']]
            if feature_stats is None
            else feature_stats
        )
def random_char_insert(text):
    """Print ``text`` after a ``RandomCharAug`` pass with action "insert"."""
    inserter = nac.RandomCharAug(action="insert")
    attacked_text = inserter.augment(text)
    # Header and result on separate lines, as two print calls would produce.
    print("Attacked Text:", attacked_text, sep="\n")