def test_multilingual(self):
    # import nltk
    # nltk.download('omw')

    # French
    text = 'chien'
    expected_texts = [
        'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc', 'canis familiaris',
        'achille', 'toutou', 'cliquet', 'clébard', 'talon', 'chienchien',
        'quignon', 'chien de chasse'
    ]
    aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
    augmented_text = aug.augment(text)
    self.assertTrue(augmented_text in expected_texts)

    expected_texts = [
        'toutou', 'maître chien', 'clébard', 'dog', 'chienne', 'chiens',
        'chiot', 'cynophiles', 'clebs'
    ]
    model_path = os.path.join(os.environ.get("MODEL_DIR"), 'word', 'ppdb',
                              'ppdb-1.0-s-lexical-french')
    aug = naw.SynonymAug(aug_src='ppdb', model_path=model_path)
    augmented_text = aug.augment(text)
    self.assertTrue(augmented_text in expected_texts)

    # Spanish
    text = 'Un rápido zorro marrón salta sobre el perro perezoso'
    aug = naw.SynonymAug(aug_src='wordnet', lang='spa')
    for _ in range(10):
        augmented_text = aug.augment(text)
        if augmented_text != text:
            break
    self.assertNotEqual(augmented_text, text)

def test_reload(self):
    text = 'The quick brown fox jumps over the lazy dog'

    aug = naw.SynonymAug(aug_src='wordnet')
    self.assertNotEqual(text, aug.augment(text))

    model_path = os.path.join(os.environ.get("MODEL_DIR"), 'word', 'ppdb',
                              'ppdb-2.0-s-all')
    aug2 = naw.SynonymAug(aug_src='ppdb', model_path=model_path)
    self.assertNotEqual(text, aug2.augment(text))

@classmethod
def setUpClass(cls):
    env_config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
    load_dotenv(env_config_path)

    cls.augs = [
        naw.SynonymAug(aug_src='wordnet'),
        naw.SynonymAug(aug_src='ppdb',
                       model_path=os.environ.get("MODEL_DIR") + 'ppdb-2.0-s-all.txt')
    ]

class QuestionGenerator:
    """Class contains logic for augmenting text"""
    aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="substitute")
    aug_single = naw.SynonymAug(aug_src="wordnet")

    @staticmethod
    def augment(text):
        """
        Checks whether to apply synonym or contextual augmentation.

        :param text: text to augment
        :return: list of augmented variations
        """
        tokens = split_sentence(re.sub("[^a-zA-Z0-9 ]+", "", text))
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        else:
            return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        Generates a list of variations for a given sentence/question.

        :param texts: list of texts
        :return: list of variations
        """
        if isinstance(texts, str):
            texts = [texts]
        result = [QuestionGenerator.augment(text) for text in texts]
        return sum(result, [])

def test_empty_input_substitute(self):
    texts = ['', ' ']
    self.word2vec_model.action = 'substitute'
    self.context_word_embs_model.action = 'substitute'

    augs = [
        naw.SpellingAug(),
        naw.AntonymAug(),
        naw.RandomWordAug(action='substitute'),
        naw.SynonymAug(aug_src='wordnet'),
        naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute"),
        self.word2vec_model,
        self.context_word_embs_model
    ]

    for aug in augs:
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')

        augmented_texts = aug.augment(texts)
        for augmented_text in augmented_texts:
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')

class QuestionGenerator:
    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
    aug_single = naw.SynonymAug(aug_src='wordnet')

    @staticmethod
    def augment(text):
        tokens = split_sentence(re.sub('[^a-zA-Z0-9 ]+', '', text))
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        else:
            return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        This function generates a list of variations for a given sentence/question.
        E.g. await QuestionGenerator.generateQuestions('your question') will return
        the list of variations for that particular question.
        """
        if isinstance(texts, str):
            texts = [texts]
        result = [QuestionGenerator.augment(text) for text in texts]
        return sum(result, [])

def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)

    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text

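# Usage sketch for augmentation() above (my addition, not from the source):
# with the default flags only swap and delete run, so no word2vec binary is
# needed. Assumes the source module's imports, i.e.
# `import nlpaug.augmenter.word as naw` and `import nlpaug.flow as naf`.
print(augmentation('The quick brown fox jumps over the lazy dog'))
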
def augment_dataset(csv, model_dir):
    """
    Conduct two processes of augmentation:
    1. Synonym augmentation
    2. Word embedding augmentation
    """
    original = pd.read_csv(csv)

    # synonym augmenter (simple version)
    syn_df = original.copy()
    syn_aug = naw.SynonymAug(aug_src='wordnet')
    for i, query in enumerate(syn_df.src):
        synonym = syn_aug.augment(query)
        syn_df.at[i, 'src'] = synonym

    # word embedding augmenter
    word_df = original.copy()
    embed_aug = naw.WordEmbsAug(model_type='fasttext',
                                model_path=model_dir + '/wiki-news-300d-1M.vec',
                                action="insert")
    for i, query in enumerate(word_df.src):
        insertion = embed_aug.augment(query)
        word_df.at[i, 'src'] = insertion

    a1 = pd.concat([original, syn_df])
    a2 = pd.concat([a1, word_df])
    a2.to_csv(os.path.join(model_dir, 'augmented.csv'), index=False)
    return a2

def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')

    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug("swap"),
        naw.RandomWordAug("delete"),
        nac.KeyboardAug()
    ])

    train_after_exp = []
    dev_after_exp = []
    for idx, row in dataset.iterrows():
        logging.info("[{}/{}] {}".format(idx, len(dataset), row["question"]))
        new_text = [new for new in flow.augment(row["question"], n=expansion)]
        train_after_exp.append({"label": row["question_id"], "text": row["question"]})
        # 80/20 split of the augmented variants between train and dev
        th = int(len(new_text) * 0.8)
        for text in new_text[:th]:
            train_after_exp.append({"label": row["question_id"], "text": text})
        for text in new_text[th:]:
            dev_after_exp.append({"label": row["question_id"], "text": text})

    train = pd.DataFrame(train_after_exp).sample(frac=1.0)
    dev = pd.DataFrame(dev_after_exp).sample(frac=1.0)
    return train, dev

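# Hypothetical call of train_eval_dataset with a toy DataFrame (my addition);
# column names follow the ones read inside the function ('question_id',
# 'question'). Assumes pandas, nltk, nlpaug flows are imported as above.
import pandas as pd

toy = pd.DataFrame([
    {"question_id": 0, "question": "come posso resettare la password?"},
    {"question_id": 1, "question": "dove trovo la mia fattura?"},
])
train, dev = train_eval_dataset(toy, lang="ita", expansion=4)
print(len(train), len(dev))
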
def augment_by_class(df, max_n):
    # build a token frequency index over all phrases
    word_index = {}
    for phrase in df['Body']:
        words = phrase.split(' ')
        for word in words:
            word_index[word] = word_index.get(word, 0) + 1

    index_df = pd.DataFrame([{
        'token': i,
        'count': word_index[i]
    } for i in word_index])
    index_df = index_df[index_df['count'] >= 10]

    # freeze frequent tokens so the synonym augmenter leaves them untouched
    aug = naw.SynonymAug(stopwords=index_df['token'])
    aug2 = naw.RandomWordAug()

    factor = (max_n // len(df)) + 2
    result = set()
    for phrase in df['Body']:
        result.add(phrase)
        print(f'Augmenting for {phrase}')
        for item in aug.augment(phrase, n=factor):
            result.add(item)
        for item in aug2.augment(phrase, n=2):
            result.add(item)
    return list(result)

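# Toy usage of augment_by_class (illustrative only): df needs a 'Body' column
# of strings. With a corpus this small no token reaches the count >= 10
# threshold, so nothing is frozen as a stopword.
import pandas as pd

df = pd.DataFrame({'Body': ['the quick brown fox', 'lazy dogs sleep all day']})
print(augment_by_class(df, max_n=10))
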
def synonym_wordnet(text):
    # Synonym Augmenter: substitute words by WordNet's synonyms
    aug = naw.SynonymAug(aug_src='wordnet')
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)

def word_substitution(text, aug_src='wordnet'):
    # import nlpaug.flow as naf
    import nlpaug.augmenter.word as naw
    aug = naw.SynonymAug(aug_src=aug_src)
    augmented_text = aug.augment(text)
    return augmented_text

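# Usage sketch (my addition): output varies between runs because SynonymAug
# samples replacements at random; requires nltk's wordnet corpus to be
# downloaded beforehand.
print(word_substitution('The quick brown fox jumps over the lazy dog'))
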
def test_multilingual(self):
    # French
    text = 'chien'
    expected_texts = [
        'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc', 'canis familiaris',
        'achille', 'toutou', 'cliquet', 'clébard', 'talon', 'chienchien',
        'quignon', 'chien de chasse'
    ]
    aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
    augmented_text = aug.augment(text)
    self.assertTrue(augmented_text in expected_texts)

    # Spanish
    text = 'Un rápido zorro marrón salta sobre el perro perezoso'
    aug = naw.SynonymAug(aug_src='wordnet', lang='spa')
    augmented_text = aug.augment(text)
    self.assertNotEqual(augmented_text, text)

def data_augment(corpus, labels):
    syn_aug = naw.SynonymAug(aug_src="wordnet")
    rand_aug = naw.RandomWordAug(action="swap")

    print('Augmenting data')
    rows = []
    for label, sentence in zip(labels, corpus):
        # both branches of the original if/else did identical work apart from
        # the newline strip, so normalize the sentence once and augment it
        sentence = sentence.replace("\n", "")

        # keep the original sentence plus one synonym and one swap variant
        rows.append({'emotion_label': label, 'emotion_text': sentence})
        rows.append({'emotion_label': label,
                     'emotion_text': syn_aug.augment(sentence)})
        rows.append({'emotion_label': label,
                     'emotion_text': rand_aug.augment(sentence)})

    aug_dataframe = pd.DataFrame(rows)
    print('Augmentation Completed')
    return aug_dataframe['emotion_text'], aug_dataframe['emotion_label']

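# Illustrative call (my addition, not from the source): corpus and labels are
# parallel sequences; the function returns augmented text/label columns.
texts, labels = data_augment(
    corpus=['i am happy today\n', 'this makes me sad'],
    labels=['joy', 'sadness'])
print(list(texts))
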
def __init__(self):
    aug0 = naw.RandomWordAug()
    aug1 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
    aug2 = naw.SynonymAug(aug_src='wordnet')
    aug3 = naw.SplitAug()
    aug4 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
    self.augs = [aug0, aug1, aug2, aug3, aug4]

def prepare_aug():
    # Contextual word embeddings augmenter: substitute/insert words by contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute"))

    # Synonym augmenter: substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'))

    # Antonym augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random word augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')

    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug

def synonym_replacement(text, n=N):
    """
    Randomly choose n words from the sentence that are not stop words.
    Replace each of these words with one of its synonyms chosen at random.
    """
    aug = naw.SynonymAug(aug_src='wordnet',
                         aug_min=n,
                         aug_max=n,
                         stopwords=english_stopwords)
    augmented_text = aug.augment(text)
    return augmented_text

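# Usage sketch (my addition). N and english_stopwords are module-level globals
# in the source (english_stopwords is presumably nltk's English stopword
# list); passing n explicitly sidesteps the N default here.
print(synonym_replacement('The quick brown fox jumps over the lazy dog', n=2))
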
def augment_data(
        num_new_class_0,
        num_new_class_1,
        clear_old_augmented_data=False,
        write_to_path='data/synonym_augmented_reddit_submissions.csv'):
    """
    Generates augmented data by producing new samples for class 0 and/or class 1, the two
    classes that are underrepresented in our dataset, and writing them to a designated new
    file (by default 'data/synonym_augmented_reddit_submissions.csv').

    Takes in:
    - num_new_class_0: Integer representing how many new samples of class 0 to generate
    - num_new_class_1: Integer representing how many new samples of class 1 to generate
    - clear_old_augmented_data: Boolean; if set to True, will overwrite the old augmented
                                data rather than append to it
    - write_to_path: The path of the file to write or append the new samples to.

    This function makes use of the nlpaug library's word augmenter.
    """
    # We experimented with a couple other nlpaug models, but we ended up choosing SynonymAug
    # because it gave us the most natural-sounding and least noisy samples.
    # Other models we tried were:
    #   naw.WordEmbsAug           this one uses word2vec to find similar words for augmentation;
    #                             it ended up giving us very noisy data that made the performance
    #                             of all models decrease.
    #   naw.ContextualWordEmbsAug this one uses BERT to do the same as the above; it was slightly
    #                             better, but still pretty noisy.
    aug = naw.SynonymAug(aug_src='wordnet')

    new_rows = []
    with open('data/reddit_submissions.csv') as f:
        reader = csv.reader(f)
        # Skip the first row that just has column names
        rows = list(reader)[1:]
        print('unfiltered rows: {}'.format(len(rows)))

        seed_rows_with_class_0 = list(filter(lambda r: CLASSES[r[0]] == 0, rows))
        seed_rows_with_class_1 = list(filter(lambda r: CLASSES[r[0]] == 1, rows))
        print('filtered rows: {}'.format(
            len(seed_rows_with_class_0) + len(seed_rows_with_class_1)))

        print('generating new data with class 0')
        create_new_rows(seed_rows_with_class_0, num_new_class_0, new_rows, aug)
        print('generating new data with class 1')
        create_new_rows(seed_rows_with_class_1, num_new_class_1, new_rows, aug)

    file_open_mode = 'w' if clear_old_augmented_data else 'a'
    with open(write_to_path, file_open_mode) as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
        print('writing new rows')
        writer.writerows(new_rows)

def test_language(self):
    text = 'chien'
    expected_texts = [
        'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc', 'canis familiaris',
        'achille', 'toutou', 'cliquet', 'clébard', 'talon', 'chienchien',
        'quignon', 'chien de chasse'
    ]
    aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
    augmented_text = aug.augment(text)
    self.assertTrue(augmented_text in expected_texts)

def fn_synonym_replacement(self):
    product_choices = list(product(self.database_choice, self.aug_p_choices))
    for item in product_choices:
        aug = naw.SynonymAug(aug_src=item[0][1],
                             aug_p=item[1],
                             stopwords=self.stopwords)
        print("\nmodelname-action-words augmented: {}-{}-{}\n".format(
            item[0][0], "substitute", item[1]))
        # NOTE: `text` is expected to be defined at module scope in the source file
        augmented_text = aug.augment(text, n=self.n_words)
        print(augmented_text, "\n")
        self.write_excel(augmented_text, item, item[1])
    self.workbook.close()

def __init__(self, aug_min=1, aug_max=10, aug_p=0.3, tokenizer=None,
             always_apply=False, p=0.5):
    super().__init__(always_apply, p)
    self.aug = naw.SynonymAug(
        aug_min=aug_min,
        aug_max=aug_max,
        aug_p=aug_p,
        tokenizer=tokenizer,
    )

def augmentation(self, text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        # aug = naw.ContextualWordEmbsAug(
        #     model_path=self.model_type, action="insert", device='cuda')
        # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        # aug = naw.ContextualWordEmbsAug(
        #     model_path=self.model_type, action="substitute", device='cuda')
        # aug = naw.WordEmbsAug(
        #     model_type='word2vec',
        #     model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
        #     action="substitute")
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)
        # text = aug.augment(text)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)
        # text = aug.augment(text)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)
        # text = aug.augment(text)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)

    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text

def __init__(self):
    antAug = naw.AntonymAug()
    synAug = naw.SynonymAug(aug_src='wordnet')
    embAug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

    self.model_dict = {
        0: antAug,
        1: synAug,
        2: embAug
    }
    self.output_data = {
        'Sentence1': [],
        'Sentence2': [],
        'Label': []
    }

def augment_dataset(dataset, prob=0.3, augment_type='wordnet'):
    '''Augment data using wordnet'''
    data_targets = list(zip(dataset.data, dataset.target))
    sample_size = int(np.ceil(len(dataset.data) * prob))
    LOGGER.info(f'Augmenting {sample_size} datasets')
    sample_data = random.choices(data_targets, k=sample_size)

    if augment_type == 'wordnet':
        aug = naw.SynonymAug(aug_src='wordnet')
        aug_data, aug_targets = list(
            zip(*[(aug.augment(text), target) for text, target in sample_data]))
        dataset.data = [*dataset.data, *aug_data]
        dataset.target = [*dataset.target, *aug_targets]

    LOGGER.info(f'Total dataset size after augmentation: {len(dataset.target)}')
    return dataset

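# Illustrative call (my addition), assuming a scikit-learn style bunch with
# .data/.target (e.g. fetch_20newsgroups); LOGGER, np and random are
# module-level in the source.
from sklearn.datasets import fetch_20newsgroups

bunch = fetch_20newsgroups(subset='train')
bunch = augment_dataset(bunch, prob=0.01, augment_type='wordnet')
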
def main():
    print("Loading ppdb dataset...")
    aug = naw.SynonymAug(aug_p=0.5, aug_src="ppdb",
                         model_path="../data/ppdb-2.0-tldr", aug_max=100)
    print("Augmentor initialized")

    dataset = load_dataset('cnn_dailymail', "3.0.0")
    train_data = dataset['train']
    inputs = train_data['article']
    targets = train_data['highlights']
    ids = train_data['id']
    print(len(train_data))

    # articles = [nltk.sent_tokenize(inp) for inp in tqdm(inputs)]
    # pickle.dump(articles, open('articles_sentences.json', 'wb'))
    articles = pickle.load(open('articles_sentences.json', 'rb'))[:80000]

    d = {}
    for i, art in enumerate(tqdm(articles)):
        augmented = aug.augment(art)
        # d[ids[i]] = ' '.join(augmented)
        d[ids[i]] = augmented

    # num_splits = 10
    # split = len(articles) // num_splits
    # pool = mp.Pool(processes=num_splits)
    # results = []
    # for i in range(num_splits):
    #     data = articles[i*split:(i+1)*split] if i < num_splits-1 else articles[i*split:]
    #     ids = ids[i*split:(i+1)*split] if i < num_splits-1 else ids[i*split:]
    #     results.append(pool.apply_async(paraphrase, args=(aug, data, ids)))
    # outputs = [p.get() for p in results]
    # for x in outputs:
    #     d = {**d, **x}

    pickle.dump(d, open('ppdb_paraphrase.pkl', 'wb'))

def augment_text(self, data):
    op = random.choice(self.all_transform)
    # use specified operation magnitude if available
    if isinstance(op, tuple):
        op, scale = op
    else:
        scale = random.uniform(0, self.max_strength)

    if op == "identity":
        return data
    elif op == "syn_replacement":
        op = naw.SynonymAug(aug_src="wordnet", aug_p=scale, aug_max=None)
    elif op == "random_swap":
        op = naw.RandomWordAug(action="swap", aug_p=scale, aug_max=None)
    elif op == "random_delete":
        op = naw.RandomWordAug(action="delete", aug_p=scale, aug_max=None)
    elif op == "insert_punc":
        op = InsertPunctuation()  # scale will be randomized inside the function
    else:
        raise NotImplementedError

    return op.augment(data)

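# A minimal sketch (assumption, not from the source) of what self.all_transform
# might hold, inferred from the branches above: bare op names get a random
# magnitude in [0, max_strength], while (op, magnitude) tuples pin a fixed one.
example_transforms = [
    "identity",
    "syn_replacement",
    ("random_swap", 0.1),    # always swap ~10% of words
    ("random_delete", 0.2),  # always delete ~20% of words
    "insert_punc",
]
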
def augment_n(data, N=1):
    pbar = tqdm(desc='Augmenting Data N={}'.format(N), total=data.shape[0], leave=False)

    # random synonym replacement
    # aug = naw.SynonymAug(aug_max=4, stopwords=stop_words())
    aug = naf.Sequential([
        # naw.ContextualWordEmbsAug(
        #     'bert-base-uncased',
        #     aug_max=5,
        #     stopwords=stop_words(),
        #     device='cuda',
        #     optimize=True
        # ),
        naw.ContextualWordEmbsAug('bert-base-uncased',
                                  aug_max=3,
                                  stopwords=stop_words(),
                                  device='cuda',
                                  optimize=True,
                                  action='insert'),
        naw.SynonymAug(aug_max=4, stopwords=stop_words())
    ])

    results = []
    for row in data:
        t, s = augment(row[1], row[2], aug, N)
        augs = []
        # renamed the loop variable so it no longer shadows the list `t`
        for j, t_aug in enumerate(t):
            augs.append([row[0] + str(j), t_aug, s[j], row[3]])
        if len(augs) > 0:
            results.append(np.array(augs))
        pbar.update()

    results.append(data)
    pbar.clear()
    pbar.close()
    return np.concatenate(results, axis=0)

def get_augmenter(method: str,
                  stopwords: List[str] = None
                  ) -> Union[naw.SynonymAug, naw.ContextualWordEmbsAug]:
    """
    Initialize an augmenter depending on the given method.

    Parameters
    ----------
    method : str
        supported methods: wordnet_synonym and aug_sub_bert
    stopwords : list
        list of words to freeze throughout the augmentation

    Returns
    -------
    Initialized nlpaug augmenter
    """
    if method == 'wordnet_synonym':
        return naw.SynonymAug(aug_src='wordnet', stopwords=stopwords)
    if method == 'aug_sub_bert':
        return naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="substitute",
                                         stopwords=stopwords)
    raise UnavailableAugmenter(
        'The given augmenter is not supported. You must choose one '
        'of the following: wordnet_synonym or aug_sub_bert')

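# Usage sketch for get_augmenter (my addition; the stopword list is
# illustrative): frozen words are left untouched by the augmenter.
aug = get_augmenter('wordnet_synonym', stopwords=['fox', 'dog'])
print(aug.augment('The quick brown fox jumps over the lazy dog'))
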
def word_substitution(text, aug_src='wordnet'):
    aug = naw.SynonymAug(aug_src=aug_src)
    augmented_text = aug.augment(text)
    return augmented_text

def augment(dataset_path, factor, balance_aware=False, alpha=0.7, beta=1, verbose=False):
    """Augments a dataset in place using nlpaug.

    Args:
        dataset_path (string): Path to training set.
        factor (int): Factor by which to augment training set size.
        balance_aware (bool): Whether to use balance-aware data augmentation
            (not fully tested yet).
        alpha (float): Alpha parameter in balance-aware data augmentation.
        beta (float): Beta parameter in balance-aware data augmentation.
        verbose (bool, optional): Verbose output. Defaults to False.
    """
    logging.basicConfig(level=logging.DEBUG, format="[%(asctime)s:%(name)s] %(message)s")
    logger = logging.getLogger("augment")

    if not os.path.exists(dataset_path):
        if verbose:
            logger.info(f'Skipping training set augmentation, for {dataset_path} not found.')
        return
    if factor < 2:
        if verbose:
            logger.info(f'Skipping training set augmentation, for factor is {factor} < 2.')
        return

    if verbose:
        logger.info(f"Begin training set augmentation.")

    aug = naw.SynonymAug(aug_src='wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')

    golden = []
    with open(dataset_path) as fin:
        for line in fin:
            data = json.loads(line)
            golden.append(data)
    label_count = get_label_count(golden)

    if balance_aware:
        min_rareness, max_rareness, spread_factor = analyze_for_balance_awareness(
            golden, alpha, beta)
        if verbose:
            logger.info(f"Analyzing papers in {dataset_path} for balance-aware data augmentation.")
            logger.info(f"Minimum rareness score is {min_rareness}.")
            logger.info(f"Maximum rareness score is {max_rareness}.")
            logger.info(f"This allows a spread factor of {spread_factor}.")

    if verbose:
        logger.info(f"Augmenting dataset at {dataset_path} with {len(golden)} examples.")

    with open(dataset_path, 'w') as fout:
        for epoch in (tqdm(range(factor), desc='Full augmentation progress')
                      if verbose else range(factor)):
            for js in (tqdm(golden, desc='Per epoch progress', leave=False)
                       if verbose else golden):
                if epoch == 0:
                    fout.write(json.dumps(js) + '\n')
                else:
                    if balance_aware:
                        relative_rareness = floor(
                            rareness_score(js, label_count, alpha, beta) / min_rareness)
                        if relative_rareness <= epoch:
                            continue
                    title = ' '.join(js['title'])
                    aug_title = aug.augment(title)
                    abstract = ' '.join(js['abstract'])
                    aug_abstract = aug.augment(abstract)
                    aug_js = copy.deepcopy(js)
                    aug_js['title'] = aug_title.split()
                    aug_js['abstract'] = aug_abstract.split()
                    fout.write(json.dumps(aug_js) + '\n')

    if verbose:
        logger.info(f"Finish training set augmentation.")

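# Hypothetical invocation (my addition; path and factor are illustrative): the
# training set is a JSON-lines file whose records carry 'title' and 'abstract'
# token lists, matching what augment() reads and rewrites in place above.
augment('data/train.jsonl', factor=3, verbose=True)
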