def execute_by_device(self, device):
    """Run the augmenter checks of this test class for every model on ``device``.

    For each path in ``self.model_paths`` one insert and one substitute
    ``ContextualWordEmbsAug`` are created and passed to the check helpers
    (``insert``, ``substitute``, ``top_k`` ...), which are defined outside
    this block.

    :param device: device identifier forwarded to both augmenters
    """
    for model_path in self.model_paths:
        insert_aug = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True, device=device)
        # Forward ``device`` here too: the substitute augmenter used to be
        # built without it, so it never received the requested device.
        substitute_aug = naw.ContextualWordEmbsAug(
            model_path=model_path, action="substitute", device=device)

        # Checks that take the input run for both entries of the list below.
        for data in [self.text, self.texts]:
            self.insert(insert_aug, data)
            self.substitute(substitute_aug, data)
            self.substitute_stopwords(substitute_aug, data)
            self.top_k([insert_aug, substitute_aug], data)
            self.top_p([insert_aug, substitute_aug], data)
            self.top_k_top_p([insert_aug, substitute_aug], data)
            self.no_top_k_top_p([insert_aug, substitute_aug], data)

        # NOTE(review): nesting was reconstructed from a flattened source;
        # the checks below do not take ``data`` and are run once per model.
        # Confirm against the original layout.
        self.decode_by_tokenizer([insert_aug, substitute_aug])
        self.no_candidiate([insert_aug, substitute_aug])
        self.subword([insert_aug, substitute_aug])
        self.max_length([insert_aug, substitute_aug])
        self.empty_replacement(substitute_aug)
        self.skip_short_token(substitute_aug)

    # Fail instead of passing vacuously when no model path is configured.
    self.assertLess(0, len(self.model_paths))
def test_batch_size(self):
    """Check that the number of augmented items equals the number of inputs.

    Four batch-size scenarios are exercised against
    ``distilbert-base-uncased``; see the inline comments.
    """
    input_count = len(self.texts)
    scenarios = [
        (1, self.texts),                # 1 per batch
        (input_count, self.texts),      # batch size = input size
        (input_count + 1, self.texts),  # batch size > input size
        (2, self.texts * 2),            # input size > batch size
    ]

    for batch_size, inputs in scenarios:
        aug = naw.ContextualWordEmbsAug(
            model_path='distilbert-base-uncased', model_type='bert', batch_size=batch_size)
        aug_data = aug.augment(inputs)
        self.assertEqual(len(aug_data), len(inputs))
def execute_by_device(self, device):
    """Run the augmenter checks of this test class for every model on ``device``.

    Both augmenters are created with ``force_reload=True`` and the given
    device. The check helpers are defined outside this block and are called
    in the order listed here.

    :param device: device identifier forwarded to both augmenters
    """
    for model_path in self.model_paths:
        # Generator unpacking keeps the build order: insert, then substitute.
        insert_aug, substitute_aug = (
            naw.ContextualWordEmbsAug(
                model_path=model_path, action=action, force_reload=True, device=device)
            for action in ("insert", "substitute")
        )

        self.oov([insert_aug, substitute_aug])
        self.insert(insert_aug)
        self.substitute(substitute_aug)
        self.substitute_stopwords(substitute_aug)
        self.subword([insert_aug, substitute_aug])
        self.not_substitute_unknown_word(substitute_aug)
        self.top_k([insert_aug, substitute_aug])
        self.top_p([insert_aug, substitute_aug])
        self.top_k_top_p([insert_aug, substitute_aug])
        self.no_top_k_top_p([insert_aug, substitute_aug])
        self.max_length([insert_aug, substitute_aug])
        self.empty_replacement(substitute_aug)

    # Fail instead of passing vacuously when no model path is configured.
    self.assertLess(0, len(self.model_paths))
def execute_by_device(self, device):
    """Run the augmenter checks of this test class for every model on ``device``.

    Besides calling the check helpers (defined outside this block), this
    verifies that the model of each augmenter reports the requested device.
    Progress lines are printed when ``self.debug`` is truthy.

    :param device: device identifier such as ``'cpu'`` or a ``'cuda'`` string
    """
    for model_path in self.model_paths:
        if self.debug:
            print('=============:', model_path)

        insert_aug = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True, device=device)
        substitute_aug = naw.ContextualWordEmbsAug(
            model_path=model_path, action="substitute", device=device)

        # CPU must match exactly; for CUDA only the 'cuda' substring is checked.
        for aug in (insert_aug, substitute_aug):
            if device == 'cpu':
                self.assertTrue(device == aug.model.get_device())
            elif 'cuda' in device:
                self.assertTrue('cuda' in aug.model.get_device())

        for data in [self.text, self.texts]:
            self.insert(insert_aug, data)
            self.substitute(substitute_aug, data)
            if self.debug:
                print('=============data:', data)
            self.substitute_stopwords(substitute_aug, data)

        # NOTE(review): nesting was reconstructed from a flattened source;
        # the checks below do not take ``data`` and are run once per model.
        # Confirm against the original layout.
        self.decode_by_tokenizer([insert_aug, substitute_aug])
        self.subword([insert_aug, substitute_aug])
        self.max_length([insert_aug, substitute_aug])
        self.empty_replacement(substitute_aug)
        self.skip_short_token(substitute_aug)

    # Fail instead of passing vacuously when no model path is configured.
    self.assertLess(0, len(self.model_paths))
def test_stopword_for_preprocess(self):
    """Round-trip stopwords through the reserved-word replacement helpers.

    For every case the input is passed to
    ``replace_stopword_by_reserved_word`` and the result is passed back
    through ``replace_reserve_word_by_stopword``; both intermediate values
    and the final text are compared with the expectations below.
    """
    stopwords = ["[id]", "[year]"]

    # Each case: (input text, expected replaced text,
    #             expected reserved tokens, expected restored text)
    cases = [
        (   # with stopwords as last word
            "My id is [id], and I born in [year]",
            'My id is [UNK], and I born in [UNK]',
            ['[year]', '[id]'],
            'My id is [id], and I born in [year]',
        ),
        (   # with stopwords as first word
            "[id] id is [id], and I born in [year]",
            '[UNK] id is [UNK], and I born in [UNK]',
            ['[year]', '[id]', '[id]'],
            '[id] id is [id], and I born in [year]',
        ),
        (   # continuous stopwords
            "[id] [id] Id is [year] [id]",
            '[UNK] [UNK] Id is [UNK] [UNK]',
            ['[id]', '[year]', '[id]', '[id]'],
            '[id] [id] Id is [year] [id]',
        ),
        (   # continuous stopwords with space
            "[id] [id] Id is [year] [id]",
            '[UNK] [UNK] Id is [UNK] [UNK]',
            ['[id]', '[year]', '[id]', '[id]'],
            '[id] [id] Id is [year] [id]',
        ),
        (   # with similar stopwords
            "My id is [id], and I [id] born in [year] a[year] [year]b aa[year]",
            'My id is [UNK], and I [UNK] born in [UNK] a[year] [year]b aa[year]',
            ['[year]', '[id]', '[id]'],
            'My id is [id], and I [id] born in [year] a[year] [year]b aa[year]',
        ),
        (   # already have reserved word. NOT handling now
            "My id is [id], and I born [UNK] [year]",
            "My id is [UNK], and I born [UNK] [UNK]",
            ['[year]', '[id]'],
            'My id is [UNK], and I born [id] [year]',
        ),
    ]

    # Both augmenters are built up front: insert first, then substitute.
    augs = [
        naw.ContextualWordEmbsAug(
            model_path='bert-base-uncased', action=action, stopwords=stopwords)
        for action in ("insert", "substitute")
    ]

    for aug in augs:
        # Fall back to the model-level constant when the getter is falsy.
        unknown_token = aug.model.get_unknown_token() or aug.model.UNKNOWN_TOKEN

        for text, want_replaced, want_reserved, want_restored in cases:
            replaced_text, reserved_stopwords = aug.replace_stopword_by_reserved_word(
                text, aug.stopword_reg, unknown_token)
            assert want_replaced == replaced_text
            assert want_reserved == reserved_stopwords

            reversed_text = aug.replace_reserve_word_by_stopword(
                replaced_text, aug.reserve_word_reg, reserved_stopwords)
            assert want_restored == reversed_text
def __init__(self):
    """Create the five word augmenters and store them in ``self.augs``."""
    bert_model = 'bert-base-uncased'
    # List order (and therefore construction order) matches the original.
    self.augs = [
        naw.RandomWordAug(),
        naw.ContextualWordEmbsAug(model_path=bert_model, action="substitute"),
        naw.SynonymAug(aug_src='wordnet'),
        naw.SplitAug(),
        naw.ContextualWordEmbsAug(model_path=bert_model, action="insert"),
    ]
def test_reset_model(self):
    """Check that an augmenter rebuilt with ``top_k + 1`` reports that value."""
    for model_path in self.model_paths:
        baseline = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True)
        expected_top_k = baseline.model.top_k + 1

        rebuilt = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True,
            top_k=expected_top_k)

        self.assertEqual(expected_top_k, rebuilt.model.top_k)
class QuestionGenerator:
    """Class contains logic for augmenting text"""

    # Both augmenters are created once, when the class body is executed,
    # and shared by every call.
    aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="substitute")
    aug_single = naw.SynonymAug(aug_src="wordnet")

    @staticmethod
    def augment(text):
        """
        checks whether to apply synonym or contextual augmentation

        Text with more than one token goes to the contextual augmenter,
        a single token goes to the synonym augmenter.

        :param text: sentence or single word to augment
        :return: result of the selected augmenter, or an empty list when no
            token is left after removing non-alphanumeric characters
        """
        tokens = split_sentence(re.sub("[^a-zA-Z0-9 ]+", "", text))
        if len(tokens) == 0:
            # Nothing to augment (e.g. punctuation-only input); indexing
            # tokens[0] below would raise IndexError.
            return []
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        generates a list of variations for a given sentence/question

        :param texts: a single text or a list of text
        :return: flat list of variations
        """
        # isinstance also accepts str subclasses, unlike ``type(x) == str``.
        if isinstance(texts, str):
            texts = [texts]
        # Flatten in one pass; ``sum(lists, [])`` copies the accumulator on
        # every addition and is quadratic in the total number of variations.
        return [
            variation
            for text in texts
            for variation in QuestionGenerator.augment(text)
        ]
class QuestionGenerator:
    """Generate variations of questions with contextual or synonym augmentation."""

    # Both augmenters are created once, when the class body is executed,
    # and shared by every call.
    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
    aug_single = naw.SynonymAug(aug_src='wordnet')

    @staticmethod
    def augment(text):
        """
        Augment ``text`` with the contextual augmenter when it has more than
        one token, otherwise with the synonym augmenter.

        Returns an empty list when no token is left after removing
        non-alphanumeric characters.
        """
        tokens = split_sentence(re.sub('[^a-zA-Z0-9 ]+', '', text))
        if len(tokens) == 0:
            # Nothing to augment (e.g. punctuation-only input); indexing
            # tokens[0] below would raise IndexError.
            return []
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        This function generates a list of variations for a given sentence/question.
        E.g. await QuestionGenerator.generateQuestions('your question') will return
        the list of variations for that particular question.
        A list of questions is accepted as well; the variations of all
        questions are returned as one flat list.
        """
        # isinstance also accepts str subclasses, unlike ``type(x) == str``.
        if isinstance(texts, str):
            texts = [texts]
        # Flatten in one pass; ``sum(lists, [])`` copies the accumulator on
        # every addition and is quadratic in the total number of variations.
        return [
            variation
            for text in texts
            for variation in QuestionGenerator.augment(text)
        ]
def bert_augment(text, aug_p=.2):
    """Augment ``text`` with a ``bert-base-uncased`` contextual word augmenter.

    :param text: input handed to the augmenter's ``augment`` method
    :param aug_p: value forwarded as ``aug_p`` to ``ContextualWordEmbsAug``
    :return: whatever ``augment`` returns for ``text``
    """
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_p=aug_p)
    return augmenter.augment(text)
def test_quicktest(self):
    """Smoke test: the default augmenter must not return the input unchanged."""
    sample = 'The quick brown fox jumps over the lazaaaaaaaaay dog'
    for model_path in self.model_paths:
        result = naw.ContextualWordEmbsAug(model_path=model_path).augment(sample)
        self.assertNotEqual(sample, result)
def execute_by_device(self, device):
    """Run the basic augmenter checks of this test class for every model.

    Both augmenters are created with ``force_reload=True`` on ``device``;
    the check helpers are defined outside this block.

    :param device: device identifier forwarded to both augmenters
    """
    for model_path in self.model_paths:
        # Generator unpacking keeps the build order: insert, then substitute.
        insert_aug, substitute_aug = (
            naw.ContextualWordEmbsAug(
                model_path=model_path, action=action, force_reload=True, device=device)
            for action in ("insert", "substitute")
        )

        self.oov([insert_aug, substitute_aug])
        self.insert(insert_aug)
        self.substitute(substitute_aug)
        self.substitute_stopwords(substitute_aug)

    # Fail instead of passing vacuously when no model path is configured.
    self.assertLess(0, len(self.model_paths))
def test_model_type(self):
    """Smoke test: augment with an explicitly given ``model_type``."""
    sentence = "Мозг — это машина которая пытается снизить ошибку в прогнозе."
    augmenter = naw.ContextualWordEmbsAug(
        model_path="blinoff/roberta-base-russian-v0",
        model_type='roberta',
        force_reload=True,
    )
    augmenter.augment(sentence)

    # Reaching this line without an exception is the pass criterion.
    self.assertTrue(True)
def test_multi_thread(self):
    """Check that nested flows return ``n`` results with 1 and with 3 threads."""
    text = 'The quick brown fox jumps over the lazy dog'
    n = 3
    w2v_model_path = os.path.join(
        os.environ["MODEL_DIR"], 'word', 'word_embs',
        'GoogleNews-vectors-negative300.bin')

    # Local factories for the two augmenters that appear in both flows.
    def word2vec_aug():
        return naw.WordEmbsAug(model_type='word2vec', model_path=w2v_model_path)

    def xlnet_aug():
        return naw.ContextualWordEmbsAug(
            model_path='xlnet-base-cased', action="substitute",
            temperature=0.7, device='cpu')

    sequential_flow = naf.Sequential([
        naf.Sequential([nac.OcrAug(), word2vec_aug()]),
        naf.Sequential([nac.RandomCharAug()]),
        xlnet_aug(),
    ])
    sometimes_flow = naf.Sometimes([
        naf.Sequential([nac.OcrAug(), nac.RandomCharAug()]),
        naf.Sometimes([word2vec_aug()], pipeline_p=0.999),
        xlnet_aug(),
    ], pipeline_p=0.9999)

    for num_thread in (1, 3):
        for flow in (sequential_flow, sometimes_flow):
            augmented_data = flow.augment(text, n=n, num_thread=num_thread)
            self.assertEqual(len(augmented_data), n)
def get_aug_type_1(top_k: int = 10,
                   aug_max: int = 5,
                   model_path: str = 'distilroberta-base',
                   action: str = 'substitute'):
    """Build a ``ContextualWordEmbsAug`` that uses ``STOP_WORDS`` as stopwords.

    :param top_k: forwarded as ``top_k``
    :param aug_max: forwarded as ``aug_max``
    :param model_path: forwarded as ``model_path``
    :param action: forwarded as ``action``
    :return: the configured augmenter
    """
    settings = {
        'model_path': model_path,
        'action': action,
        'top_k': top_k,
        # Hand over a list copy of the module-level STOP_WORDS collection.
        'stopwords': list(STOP_WORDS),
        'aug_max': aug_max,
    }
    return naw.ContextualWordEmbsAug(**settings)
def test_reset_model(self):
    """Check that a rebuilt augmenter reports changed sampling settings.

    ``temperature``, ``top_k`` and ``top_p`` of a first augmenter are read,
    each is increased by one for a second augmenter, and the model of the
    second augmenter must report the increased values.
    """
    for model_path in self.model_paths:
        baseline = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True, top_p=0.5)
        expected_temperature = baseline.model.temperature + 1
        expected_top_k = baseline.model.top_k + 1
        expected_top_p = baseline.model.top_p + 1

        rebuilt = naw.ContextualWordEmbsAug(
            model_path=model_path, action="insert", force_reload=True,
            temperature=expected_temperature,
            top_k=expected_top_k,
            top_p=expected_top_p)
        new_temperature = rebuilt.model.temperature
        new_top_k = rebuilt.model.top_k
        new_top_p = rebuilt.model.top_p

        self.assertEqual(expected_temperature, new_temperature)
        self.assertEqual(expected_top_k, new_top_k)
        self.assertEqual(expected_top_p, new_top_p)
def test_multiprocess_gpu(self):
    """Augment on ``'cuda'`` with ``num_thread`` equal to ``n``.

    More than one result is expected and none may equal the input.
    """
    source = 'The quick brown fox jumps over the lazy dog'
    n = 3

    augmenter = naw.ContextualWordEmbsAug(force_reload=True, device='cuda')
    results = augmenter.augment(source, n=n, num_thread=n)

    self.assertGreater(len(results), 1)
    for candidate in results:
        self.assertNotEqual(candidate, source)
def setUpClass(cls):
    """Load the ``.env`` file and create the augmenters shared by the tests.

    The ``.env`` file is looked up two directories above this file.
    """
    here = os.path.dirname(__file__)
    load_dotenv(os.path.abspath(os.path.join(here, '..', '..', '.env')))

    char_aug = nac.RandomCharAug()
    word_aug = naw.ContextualWordEmbsAug()
    sentence_aug = nas.ContextualWordEmbsForSentenceAug()
    cls.augs = [char_aug, word_aug, sentence_aug]
def create_context_aug(params):
    """Build a ``distilbert-base-uncased`` contextual augmenter on ``'cuda'``.

    :param params: mapping that provides ``'aug_p'``, ``'top_k'``,
        ``'top_p'`` and ``'temperature'``
    :return: the configured ``ContextualWordEmbsAug``
    """
    settings = {
        'model_path': 'distilbert-base-uncased',
        'aug_p': params['aug_p'],
        'top_k': params['top_k'],
        'top_p': params['top_p'],
        'aug_max': None,
        'device': 'cuda',
        'temperature': params['temperature'],
        'stopwords': nltk_stopwords,
        'stopwords_regex': r'".*"',
    }
    return naw.ContextualWordEmbsAug(**settings)
def prepare_aug():
    """Create every augmenter used by this module and return them as one list.

    :return: list ordered as contextual, synonym, antonym, random augmenters
    """
    # Contextual Word Embeddings Augmenter, insert / substitute word by
    # contextual word embeddings
    contextual = [
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert"),
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"),
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute"),
        naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute"),
    ]

    # Synonym Augmenter, Substitute word by WordNet's synonym
    synonym = [
        naw.SynonymAug(aug_src='wordnet'),
        naw.SynonymAug(
            aug_src='ppdb',
            model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'),
    ]

    # Antonym Augmenter
    antonym = [naw.AntonymAug()]

    # Random Word Augmenter
    random_word = [naw.RandomWordAug(action="swap"), naw.RandomWordAug()]

    print('augmenter initialization finished ...')

    return contextual + synonym + antonym + random_word
def test_multilingual(self):
    """Augment French, Japanese and Spanish text with the multilingual model.

    The augmented result must differ from the input for every sample.
    """
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased')

    samples = [
        {
            'lang': 'fra',
            'text': "Bonjour, J'aimerais une attestation de l'employeur certifiant que je suis en CDI.",
        },
        {
            'lang': 'jap',
            'text': '速い茶色の狐が怠惰なな犬を飛び越えます',
        },
        {
            'lang': 'spa',
            'text': 'un rapido lobo marron salta sobre el perro perezoso',
        },
    ]

    for sample in samples:
        original = sample['text']
        self.assertNotEqual(original, augmenter.augment(original))
def run_core():
    """Print a timestamp before import, init and augment, and when done."""

    def stamp(label):
        # Emit the current time followed by the stage label.
        print(datetime.datetime.now(), label)

    stamp('before import')
    # Imported here on purpose so the import happens between two timestamps.
    import nlpaug.augmenter.word as naw

    stamp('before init')
    aug = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased', model_type="bert", use_custom_api=True)
    text = 'The quick brown fox jumps over the lazy dog.'

    stamp('before augment')
    aug.augment([text] * 2)

    stamp('done')
def __init__(self, template, output_file, augmentation_factor=5):
    """Store the configuration and create the character and word augmenters.

    :param template: stored as ``self.base_file``
    :param output_file: stored as ``self.output_file``
    :param augmentation_factor: must be at least 2 (checked with ``assert``)
    """
    assert augmentation_factor >= 2

    self.augmentation_factor = augmentation_factor
    self.base_file, self.output_file = template, output_file

    # Both containers start empty.
    self.dataset = {}
    self.intents = {}

    self.character_augmenter = nac.OcrAug()
    self.word_augmenter = naw.ContextualWordEmbsAug()
def execute_by_device(self, device):
    """Run the augmenter checks of this test class for every model on ``device``.

    Both augmenters are created with ``force_reload=True``; the check
    helpers are defined outside this block and their order matters (see the
    inline comment).

    :param device: device identifier forwarded to both augmenters
    """
    for model_path in self.model_paths:
        # Generator unpacking keeps the build order: insert, then substitute.
        insert_aug, substitute_aug = (
            naw.ContextualWordEmbsAug(
                model_path=model_path, action=action, force_reload=True, device=device)
            for action in ("insert", "substitute")
        )

        self.oov([insert_aug, substitute_aug])
        self.insert(insert_aug)
        self.substitute(substitute_aug)
        self.substitute_stopwords(substitute_aug)
        self.subword([insert_aug, substitute_aug])
        # Must be last one as it changed properties.
        self.not_substitute_unknown_word(substitute_aug)

    # Fail instead of passing vacuously when no model path is configured.
    self.assertLess(0, len(self.model_paths))
def test_non_strip_input(self):
    """Input with surrounding spaces must still be changed by each augmenter."""
    padded = ' Good boy '
    tfidf_dir = os.environ.get("MODEL_DIR")

    augmenters = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=tfidf_dir, action="substitute"),
    ]

    for augmenter in augmenters:
        self.assertNotEqual(padded, augmenter.augment(padded))
def test_skip_punctuation(self):
    """Punctuation-only input must come back unchanged from each augmenter."""
    punctuation = '. . . . ! ? # @'
    tfidf_dir = os.environ.get("MODEL_DIR")

    augmenters = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=tfidf_dir, action="substitute"),
    ]

    for augmenter in augmenters:
        self.assertEqual(punctuation, augmenter.augment(punctuation))
def test_empty_input_for_insert(self):
    """Blank input must yield either an empty string or a single space."""
    blank = ' '
    tfidf_dir = os.environ.get("MODEL_DIR")

    augmenters = [
        naw.ContextualWordEmbsAug(action="insert"),
        naw.TfIdfAug(model_path=tfidf_dir, action="substitute"),
    ]

    for augmenter in augmenters:
        result = augmenter.augment(blank)
        # FIXME: standardize return
        if result == '':
            acceptable = True
        else:
            acceptable = result == ' '
        self.assertTrue(acceptable)
def main(args):
    """Read ``args.input_csv``, augment it and write ``args.output_csv``.

    The data frame is passed to ``augment_dataframe`` (defined elsewhere)
    together with an insert-mode contextual augmenter and ``args.text_key``.

    :param args: object providing ``input_csv``, ``output_csv``,
        ``bert_path``, ``stopwords``, ``device`` and ``text_key``
    """
    frame = pd.read_csv(args.input_csv, index_col=0)

    augmenter_options = dict(
        top_k=10,
        action='insert',
        model_path=args.bert_path,
        aug_min=2,
        aug_max=4,
        stopwords=get_stopwords(args.stopwords),
    )
    augmenter = naw.ContextualWordEmbsAug(device=args.device, **augmenter_options)

    result = augment_dataframe(frame, augmenter, args.text_key, batch_size=2)
    result.to_csv(args.output_csv)
def main():
    """Augment a two-item batch with each enabled model; the result is discarded."""
    # Only one checkpoint is enabled. Disabled alternatives:
    # distilbert-base-uncased, bert-base-cased, xlnet-base-cased,
    # roberta-base, distilroberta-base.
    model_paths = ['bert-base-uncased']
    text = 'The quick brown fox jumps over the lazaaaaaaaaay dog'

    for model_path in model_paths:
        print('-----------------:', model_path)
        augmenter = naw.ContextualWordEmbsAug(model_path=model_path)
        augmenter.augment([text] * 2)
def test_excessive_space(self):
    """Check the token list each augmenter's ``_tokenizer`` returns.

    See https://github.com/makcedward/nlpaug/issues/48
    """
    text = 'The quick brown fox jumps over the lazy dog . 1 2 '
    expected_tokens = 'The quick brown fox jumps over the lazy dog . 1 2'.split()
    tfidf_dir = os.environ.get("MODEL_DIR")

    augmenters = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=tfidf_dir, action="substitute"),
    ]

    for augmenter in augmenters:
        self.assertEqual(augmenter._tokenizer(text), expected_tokens)