def test_reset_top_k(self):
    """Check that a ``top_k`` passed to the constructor ends up on the model."""
    # Build one augmenter without top_k to discover the value used by default.
    baseline = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=self.word2vec_model_path)
    expected_top_k = baseline.model.top_k + 1

    # A second augmenter asks for one more candidate than the default.
    bumped = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=self.word2vec_model_path,
        top_k=expected_top_k)

    self.assertEqual(expected_top_k, bumped.model.top_k)
def test_reset_top_k(self):
    """Check that a ``top_k`` passed to the constructor ends up on the model."""
    # MODEL_DIR is concatenated directly with the file name (no separator is
    # inserted here).
    model_path = (os.environ.get("MODEL_DIR")
                  + 'GoogleNews-vectors-negative300.bin')

    baseline = naw.WordEmbsAug(model_type='word2vec', model_path=model_path)
    expected_top_k = baseline.model.top_k + 1

    bumped = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=model_path,
        top_k=expected_top_k)

    self.assertEqual(expected_top_k, bumped.model.top_k)
def test_incorrect_model_type(self):
    """An unsupported ``model_type`` must raise ``ValueError`` with the expected text."""
    with self.assertRaises(ValueError) as error:
        model_path = (os.environ.get("MODEL_DIR")
                      + 'GoogleNews-vectors-negative300.bin')
        naw.WordEmbsAug(model_type='test_model_type', model_path=model_path)

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never run.
    message = str(error.exception)
    self.assertTrue('Model type value is unexpected.' in message)
def test_incorrect_model_type(self):
    """An unsupported ``model_type`` must raise ``ValueError`` with the expected text."""
    bad_arguments = {
        'model_type': 'test_model_type',
        'model_path': self.word2vec_model_path,
    }
    with self.assertRaises(ValueError) as error:
        naw.WordEmbsAug(**bad_arguments)

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never run.
    message = str(error.exception)
    self.assertTrue('Model type value is unexpected.' in message)
def augment_dataset(csv, model_dir):
    """Augment the ``src`` column of a CSV dataset and save the combined result.

    Two augmented copies of the input are produced:

    1. Synonym augmentation (``naw.SynonymAug`` with the WordNet source).
    2. Word-embedding augmentation (``naw.WordEmbsAug`` with a fastText
       model, ``insert`` action).

    The original rows and both augmented copies are concatenated, written to
    ``augmented.csv`` inside ``model_dir`` and returned.

    Args:
        csv: Anything accepted by ``pandas.read_csv``; the data must contain
            a ``src`` column.
        model_dir: Directory that holds ``wiki-news-300d-1M.vec`` and that
            receives the output file.

    Returns:
        The concatenated DataFrame (original rows, then the synonym copy,
        then the word-embedding copy). Row labels are not renumbered.
    """
    original = pd.read_csv(csv)

    # Synonym augmenter (simple version).
    # ``.at[i, 'src']`` is label based; it lines up with ``enumerate`` because
    # ``read_csv`` is called without ``index_col`` and so yields a 0..n-1 index.
    syn_df = original.copy()
    syn_aug = naw.SynonymAug(aug_src='wordnet')
    for i, query in enumerate(syn_df.src):
        syn_df.at[i, 'src'] = syn_aug.augment(query)

    # Word-embedding augmenter.
    word_df = original.copy()
    embed_aug = naw.WordEmbsAug(
        model_type='fasttext',
        model_path=model_dir + '/wiki-news-300d-1M.vec',
        action="insert")
    for i, query in enumerate(word_df.src):
        word_df.at[i, 'src'] = embed_aug.augment(query)

    # Fix: the original called the non-existent ``pd.catcat`` and raised
    # AttributeError before anything was saved.
    augmented = pd.concat([original, syn_df, word_df])
    augmented.to_csv(os.path.join(model_dir, 'augmented.csv'), index=False)
    return augmented
def embeddings(df, model, context_action):
    """Create a word-embedding augmenter and hand it to ``text_augmentation``.

    Args:
        df: Passed through unchanged to ``text_augmentation``.
        model: Value forwarded as ``model_type`` to ``naw.WordEmbsAug``.
        context_action: Value forwarded as ``action`` to ``naw.WordEmbsAug``.
    """
    augmenter_options = {
        'model_type': model,
        # Placeholder path: must be replaced with the real model location.
        'model_path': r"PATH TO WORD2VEC MODEL",
        'action': context_action,
    }
    aug = naw.WordEmbsAug(**augmenter_options)

    print("STARTING EMBEDDINGS: ", context_action)
    text_augmentation(aug, df)
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    """Run ``text`` through a ``naf.Sometimes`` pipeline of the enabled augmenters.

    Args:
        text: Input handed to the pipeline's ``augment`` call.
        insert: Enable word2vec-based word insertion.
        substitute: Enable WordNet synonym substitution.
        swap: Enable random word swapping.
        delete: Enable ``naw.RandomWordAug`` with its default action.

    Returns:
        Whatever ``naf.Sometimes.augment(text, n=1)`` returns.
    """
    def _word2vec_insert():
        return naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")

    # (flag, factory) pairs, in the order the augmenters join the pipeline.
    # Factories keep disabled augmenters from being constructed at all.
    candidates = (
        (insert, _word2vec_insert),
        (substitute, lambda: naw.SynonymAug(aug_src='wordnet')),
        (swap, lambda: naw.RandomWordAug(action="swap")),
        (delete, lambda: naw.RandomWordAug()),
    )
    augs = [build() for enabled, build in candidates if enabled]

    pipeline = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    return pipeline.augment(text, n=1)
def test_multi_thread(self):
    """Nested flows must return ``n`` results for each tested thread count."""
    text = 'The quick brown fox jumps over the lazy dog'
    n = 3

    w2v_model_path = os.path.join(
        os.environ["MODEL_DIR"], 'word', 'word_embs',
        'GoogleNews-vectors-negative300.bin')

    def make_word2vec():
        return naw.WordEmbsAug(model_type='word2vec',
                               model_path=w2v_model_path)

    def make_xlnet():
        return naw.ContextualWordEmbsAug(
            model_path='xlnet-base-cased', action="substitute",
            temperature=0.7, device='cpu')

    sequential_flow = naf.Sequential([
        naf.Sequential([nac.OcrAug(), make_word2vec()]),
        naf.Sequential([nac.RandomCharAug()]),
        make_xlnet(),
    ])
    sometimes_flow = naf.Sometimes([
        naf.Sequential([nac.OcrAug(), nac.RandomCharAug()]),
        naf.Sometimes([make_word2vec()], pipeline_p=0.999),
        make_xlnet(),
    ], pipeline_p=0.9999)

    for num_thread in [1, 3]:
        for flow in (sequential_flow, sometimes_flow):
            augmented_data = flow.augment(text, n=n, num_thread=num_thread)
            self.assertEqual(len(augmented_data), n)
def setUpClass(cls):
    """Load the ``.env`` file and build insert/substitute augmenters per model type."""
    project_root = os.path.join(os.path.dirname(__file__), '..', '..', '..')
    load_dotenv(os.path.abspath(os.path.join(project_root, '.env')))

    # (model_type, file name appended directly to MODEL_DIR)
    embedding_files = (
        ('word2vec', 'GoogleNews-vectors-negative300.bin'),
        ('glove', 'glove.6B.50d.txt'),
        ('fasttext', 'wiki-news-300d-1M.vec'),
    )

    def build_augmenters(action):
        return [
            naw.WordEmbsAug(
                model_type=model_type,
                model_path=os.environ.get("MODEL_DIR") + file_name,
                action=action)
            for model_type, file_name in embedding_files
        ]

    cls.insert_augmenters = build_augmenters(Action.INSERT)
    cls.substitute_augmenters = build_augmenters(Action.SUBSTITUTE)
def test_case_insensitive(self):
    """Augmenting 'Good' must never give back the same word in a different case."""
    retry_cnt = 10
    text = 'Good'

    aug = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=self.word2vec_model_path,
        top_k=2)

    # Augmentation is repeated retry_cnt times and checked on every attempt.
    for _ in range(retry_cnt):
        candidate = aug.augment(text)
        self.assertNotEqual(text.lower(), candidate.lower())

    self.assertLess(0, retry_cnt)
def test_case_insensitive(self):
    """Augmenting 'Good' must never give back the same word in a different case."""
    retry_cnt = 10
    text = 'Good'

    # MODEL_DIR is concatenated directly with the file name.
    model_path = (os.environ.get("MODEL_DIR")
                  + 'GoogleNews-vectors-negative300.bin')
    aug = naw.WordEmbsAug(model_type='word2vec', model_path=model_path,
                          top_k=2)

    # Augmentation is repeated retry_cnt times and checked on every attempt.
    for _ in range(retry_cnt):
        candidate = aug.augment(text)
        self.assertNotEqual(text.lower(), candidate.lower())

    self.assertLess(0, retry_cnt)
def test_stopwords(self):
    """With most words listed as stopwords, a non-stopword must get changed."""
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']
    # The words of the sentence that are NOT in the stopword list.
    unprotected = ('quick', 'over', 'lazy')

    augs = [
        naw.RandomWordAug(action="delete", stopwords=stopwords),
        naw.ContextualWordEmbsAug(stopwords=stopwords),
        naw.WordEmbsAug(model_type='word2vec',
                        model_path=self.word2vec_model_path,
                        stopwords=stopwords),
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # At least one unprotected word has to be missing afterwards.
            self.assertTrue(
                any(word not in augmented_text for word in unprotected))
def setUpClass(cls):
    """Load the ``.env`` file, create the shared augmenters and train TF-IDF."""
    project_root = os.path.join(os.path.dirname(__file__), '..', '..', '..')
    load_dotenv(os.path.abspath(os.path.join(project_root, '.env')))

    # Word2vec model shared by the tests of this class.
    cls.word2vec_model_path = os.path.join(
        os.environ.get("MODEL_DIR"),
        'word', 'word_embs', 'GoogleNews-vectors-negative300.bin')
    cls.word2vec_model = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=cls.word2vec_model_path)

    cls.context_word_embs_model = naw.ContextualWordEmbsAug()

    cls.tfidf_model_path = os.path.join(
        os.environ.get("MODEL_DIR"), 'word', 'tfidf')
    # The class object is passed explicitly as the helper's argument.
    cls._train_tfidf(cls)
def test_stopwords_regex(self):
    """With a stopword regex supplied, one of the remaining words must get changed."""
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "
    # Words the assertion expects the augmenters to be able to touch.
    unprotected = ('quick', 'over', 'lazy')

    augs = [
        naw.RandomWordAug(action="delete", stopwords_regex=stopwords_regex),
        naw.ContextualWordEmbsAug(stopwords_regex=stopwords_regex),
        naw.WordEmbsAug(model_type='word2vec',
                        model_path=self.word2vec_model_path,
                        stopwords_regex=stopwords_regex),
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # At least one unprotected word has to be missing afterwards.
            self.assertTrue(
                any(word not in augmented_text for word in unprotected))
def test_stopwords(self):
    """With most words listed as stopwords, a non-stopword must get changed."""
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']
    # The words of the sentence that are NOT in the stopword list.
    unprotected = ('quick', 'over', 'lazy')

    # Built in the same order as the list below; the word2vec path is
    # MODEL_DIR concatenated directly with the file name.
    random_aug = naw.RandomWordAug(action="delete", stopwords=stopwords)
    contextual_aug = naw.ContextualWordEmbsAug(stopwords=stopwords)
    word2vec_aug = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin',
        stopwords=stopwords)
    augs = [random_aug, contextual_aug, word2vec_aug]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # At least one unprotected word has to be missing afterwards.
            self.assertTrue(
                any(word not in augmented_text for word in unprotected))
def test_multi_thread(self):
    """Check the return shape of ``augment`` for one and for three threads."""
    text = 'The quick brown fox jumps over the lazy dog.'

    random_aug = naw.RandomWordAug()
    word2vec_aug = naw.WordEmbsAug(
        model_type='word2vec', model_path=self.word2vec_model_path)
    xlnet_aug = naw.ContextualWordEmbsAug(
        model_path='xlnet-base-cased', action="substitute", device='cpu')
    augs = [random_aug, word2vec_aug, xlnet_aug]

    for num_thread in [1, 3]:
        for aug in augs:
            # The thread count doubles as the requested number of outputs.
            augmented_data = aug.augment(
                text, n=num_thread, num_thread=num_thread)

            if num_thread != 1:
                self.assertEqual(len(augmented_data), num_thread)
            else:
                # A single requested output is expected to be a plain string.
                self.assertTrue(isinstance(augmented_data, str))
def augmentation(self, text, insert=False, substitute=False, swap=True,
                 delete=True):
    """Run ``text`` through a ``naf.Sometimes`` pipeline of the enabled augmenters.

    Args:
        text: Input handed to the pipeline's ``augment`` call.
        insert: Enable word2vec-based word insertion.
        substitute: Enable WordNet synonym substitution.
        swap: Enable random word swapping.
        delete: Enable ``naw.RandomWordAug`` with its default action.

    Returns:
        Whatever ``naf.Sometimes.augment(text, n=1)`` returns.
    """
    def _word2vec_insert():
        # Model download:
        # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
        return naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")

    # (flag, factory) pairs, in the order the augmenters join the pipeline.
    # Factories keep disabled augmenters from being constructed at all.
    candidates = (
        (insert, _word2vec_insert),
        (substitute, lambda: naw.SynonymAug(aug_src='wordnet')),
        (swap, lambda: naw.RandomWordAug(action="swap")),
        (delete, lambda: naw.RandomWordAug()),
    )
    augs = [build() for enabled, build in candidates if enabled]

    pipeline = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    return pipeline.augment(text, n=1)
def fn_word_emb(self):
    """Augment with every database/action/aug_p combination and log to Excel.

    For each combination a ``naw.WordEmbsAug`` is built, the augmented output
    is printed and passed to ``self.write_excel``; the workbook is closed once
    all combinations are done.
    """
    combinations = list(
        product(self.database_choice, self.action_choices,
                self.aug_p_choices))

    for item in combinations:
        database, action, aug_p = item
        # ``database`` is indexed as [0] label, [1] model type, [2] model path.
        aug = naw.WordEmbsAug(
            model_type=database[1],
            model_path=database[2],
            action=action,
            aug_p=aug_p,
            stopwords=self.stopwords)

        print("\nmodelname-action-words augmented: {}-{}-{}\n".format(
            database[0], action, aug_p))

        # NOTE(review): ``text`` is not defined in this method; presumably a
        # module-level variable - confirm.
        augmented_text = aug.augment(text, n=self.n_words)
        print(augmented_text, "\n")
        self.write_excel(augmented_text, item, action, aug_p)

    self.workbook.close()
def test_multi_thread(self):
    """Every augmenter must return ``n`` results for each tested thread count."""
    text = 'The quick brown fox jumps over the lazy dog.'
    n = 3

    random_aug = naw.RandomWordAug()
    # MODEL_DIR is concatenated directly with the file name.
    word2vec_aug = naw.WordEmbsAug(
        model_type='word2vec',
        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
    xlnet_aug = naw.ContextualWordEmbsAug(
        model_path='xlnet-base-cased',
        action="substitute",
        skip_unknown_word=True,
        temperature=0.7,
        device='cpu')
    augs = [random_aug, word2vec_aug, xlnet_aug]

    for num_thread in [1, 3]:
        for aug in augs:
            augmented_data = aug.augment(text, n=n, num_thread=num_thread)
            self.assertEqual(len(augmented_data), n)
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
import swifter

# Read the training data, dropping the first column of the CSV.
df = pd.read_csv("../data/train.csv").iloc[:, 1:]

# Word2vec-based augmenter that inserts words.
aug = naw.WordEmbsAug(
    model_type="word2vec",
    model_path="../data/aug/GoogleNews-vectors-negative300",
    action="insert",
)

# Augment every description on a copy so ``df`` itself stays untouched.
df_copy = df.copy()
df_copy["description"] = df_copy["description"].swifter.apply(aug.augment)
print(0)

df_copy.to_csv("../data/train_augmented.csv", index=False)
def test_incorrect_model_type(self):
    """An unsupported ``model_type`` must raise ``ValueError``."""
    with self.assertRaises(ValueError):
        # The path is built inside the block, as part of the guarded call.
        model_path = (os.environ.get("MODEL_DIR")
                      + 'GoogleNews-vectors-negative300.bin')
        naw.WordEmbsAug(
            model_type='test_model_type',
            model_path=model_path,
            action=Action.INSERT)
def tokenize(self):
    """Clean, optionally augment, and tokenize patent files with Stanford CoreNLP.

    In ``test`` mode the input files are cleaned with ``clean_text`` in a
    process pool and tokenized into ``<temp_path>/test/<input>``.

    Otherwise, for every corpus directory in ``self.data_path`` the method:

    1. creates an empty ``.SUM`` file in each patent directory lacking one;
    2. cleans the ``SUM`` files and every part in ``self.parts_of_interest``,
       and (train corpus only, when ``args.data_augmentation`` is not
       ``"None"``) writes augmented copies into sibling directories whose
       name has a ``b`` appended;
    3. rewrites the summaries into ``SUMTRIZ`` / ``SUMTRIZ2`` files using
       ``summary_preparation``;
    4. tokenizes each part with CoreNLP into
       ``<temp_path>/<corpus_type>/<part>``.

    CoreNLP is started through ``subprocess.call`` with a ``java`` command
    and a temporary ``mapping_for_corenlp.txt`` file list written to the
    current working directory.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; nesting of a few statements is a best guess and is flagged
    inline - confirm against the original file.
    """
    # This method rewrites the TRIZ summary in accordance with the will of the
    # user (only one part of the contradiction in the summary or both parts).
    # It also rewrites every part involved to add spaces between sentences and
    # avoid problems during tokenization.
    # Finally it tokenizes text+summary using Stanford Core NLP. The results
    # are saved.
    if self.args.mode == 'test':
        print("\n\n\nTokenization in progress...")

        # Explicit file list wins; otherwise glob below data_path at the
        # configured directory depth.
        if self.args.input_files is not None:
            files = self.args.input_files
        else:
            addon = '/*' * (self.args.depth_directory + 1)
            files = sorted(
                glob.glob(self.data_path + addon + self.args.input))

        count = 0
        pool = Pool(self.args.n_cpus)
        for _ in pool.imap_unordered(clean_text, files):
            # Print % processed files as an in-place progress bar.
            sys.stdout.write('\r')
            # the exact output you're looking for:
            j = (count + 1) / len(files)
            sys.stdout.write("[%-20s] %d%%" % ('=' * int(20 * j), 100 * j))
            sys.stdout.flush()
            count += 1
        pool.close()
        pool.join()

        extracted_patents_dir = os.path.abspath(self.data_path)
        tokenized_patents_dir = os.path.abspath(self.temp_path + '/test/' +
                                                self.args.input)
        print("Preparing to tokenize %s to %s..." %
              (extracted_patents_dir, tokenized_patents_dir))

        # make IO list file
        print("Making list of files to tokenize...")
        with open("mapping_for_corenlp.txt", "w") as f:
            for s in files:
                f.write("%s\n" % (s))
        command = [
            'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize,ssplit',
            '-ssplit.newlineIsSentenceBreak', 'always', '-filelist',
            'mapping_for_corenlp.txt', '-outputFormat', 'json',
            '-outputDirectory', tokenized_patents_dir
        ]
        print("Tokenizing %i files in %s and saving in %s..."
              % (len(files), extracted_patents_dir, tokenized_patents_dir))
        # NOTE(review): the return code of the java process is not checked.
        subprocess.call(command)
        os.remove("mapping_for_corenlp.txt")
        print("Stanford CoreNLP Tokenizer has finished.")
        print("Successfully finished tokenizing %s to %s.\n" %
              (extracted_patents_dir, tokenized_patents_dir))
    else:
        pool = Pool(self.args.n_cpus)
        for path in self.data_path:
            # Last path component names the corpus ('train' is compared
            # against below).
            corpus_type = path.split('/')[-1]

            # Make sure every patent directory has a .SUM file (files whose
            # name contains 'SUMMARY' do not count as one).
            patents_directories = sorted(glob.glob(path + '/*'))
            for patent_directory in patents_directories:
                found_summary = False
                patent_files = sorted(glob.glob(patent_directory + '/*'))
                for patent_file in patent_files:
                    if patent_file.find('.SUM') >= 0 and patent_file.find(
                            'SUMMARY') < 0:
                        found_summary = True
                if not found_summary:
                    with open(
                            patent_directory + '/' +
                            os.path.basename(patent_directory) + '.SUM',
                            'w') as f:
                        f.write('')

            # ------------------------------------------------------------
            # Data augmentation with double translation
            # Add spaces between sentences in all used texts including
            # summaries to avoid problems during tokenization
            # ------------------------------------------------------------
            # Indices (into the sorted file list) of SUM files whose first
            # parameter is not 'empty'.
            to_augment = []
            # NOTE(review): count_augment is incremented but never read in
            # this function.
            count_augment = 0
            for tipe in ["SUM"] + self.parts_of_interest:
                files = sorted(glob.glob(path + '/*/*.'
                                         + tipe))
                count = 0
                if tipe != "SUM":
                    print()
                logging.info("Cleaning data...")
                count = 0
                for _ in pool.imap_unordered(clean_text, files):
                    # Print % processed files as an in-place progress bar.
                    sys.stdout.write('\r')
                    # the exact output you're looking for:
                    j = (count + 1) / len(files)
                    sys.stdout.write("[%-20s] %d%%" %
                                     ('=' * int(20 * j), 100 * j))
                    sys.stdout.flush()
                    count += 1

                if self.args.data_augmentation != "None" and corpus_type == 'train':
                    print("\n" + corpus_type +
                          " set: Data augmentation in progress for " + tipe +
                          " parts...")
                else:
                    print("\n" + corpus_type +
                          " set: Sentences verification for SUM parts...")

                for num_file, file in enumerate(files):
                    # NOTE(review): ``data`` is only (re)read for SUM files;
                    # for other parts the augmentation below appears to reuse
                    # the value left over from the last SUM file - confirm
                    # the intended nesting.
                    if tipe == "SUM":
                        with open(file, "r", encoding='utf-8') as f:
                            # print(file)
                            data = ''
                            for sentence in f:
                                # Lines mentioning STATE_OF_THE_ART are dropped.
                                if sentence.find("STATE_OF_THE_ART") >= 0:
                                    continue
                                else:
                                    data += sentence
                        # '///' separates the two parameters, '//' the
                        # sentences within one parameter.
                        param_sents = data.split("///")
                        if param_sents[0] != 'empty':
                            to_augment.append(num_file)
                        first_param_sents = param_sents[0].replace(
                            "\n", "").split("//")
                        if len(param_sents) > 1:
                            second_param_sents = param_sents[
                                1].replace("\n", "").split("//")
                        else:
                            second_param_sents = []

                    # Print % processed files as an in-place progress bar.
                    sys.stdout.write('\r')
                    # the exact output you're looking for:
                    j = (count + 1) / len(files)
                    sys.stdout.write("[%-20s] %d%%" %
                                     ('=' * int(20 * j), 100 * j))
                    sys.stdout.flush()
                    count += 1

                    # Data augmentation with double translation
                    if self.args.data_augmentation != "None" and corpus_type == 'train' and num_file in to_augment:
                        count_augment += 1
                        # NOTE(review): ``aug`` stays unbound when
                        # data_augmentation is "transformation" but
                        # transformation_type matches none of the branches.
                        if self.args.data_augmentation == "transformation" and self.args.transformation_type == "bert_embeddings":
                            aug = naw.ContextualWordEmbsAug(
                                model_path='bert-base-uncased',
                                action="substitute")
                        elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "word2vec_embeddings":
                            aug = naw.WordEmbsAug(
                                model_type='word2vec',
                                model_path=
                                './word2vec/GoogleNews-vectors-negative300.bin'
                            )
                        elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "synonyms":
                            aug = naw.SynonymAug()

                        # Target: same file name stem, in a sibling directory
                        # whose name has 'b' appended.
                        path_augmented_text = file.split('/')
                        path_augmented_text[-2] += 'b'
                        path_augmented_text[-1] = '.'.join([
                            path_augmented_text[-2],
                            path_augmented_text[-1].split('.')[-1]
                        ])
                        path_new_directory = '/'.join(
                            path_augmented_text[:-1])
                        path_augmented_text = '/'.join(path_augmented_text)

                        # Skip files already augmented.
                        # NOTE(review): ``file.find('b') > 0`` is true for any
                        # 'b' after the first character of the whole path,
                        # not only for the 'b' directory suffix - confirm.
                        if os.path.isfile(
                                path_augmented_text) or file.find('b') > 0:
                            continue

                        if self.args.data_augmentation == "translation":
                            # Not to exceed google translations quotas
                            time.sleep(1.25)

                        augmented_text = ''
                        # NOTE(review): ``.replace("'", "'")`` below replaces
                        # an apostrophe with itself (a no-op as written);
                        # possibly an HTML entity was intended - confirm.
                        # NOTE(review): ``translate_client`` is not defined in
                        # this function; presumably module-level.
                        if tipe != "SUM":
                            for sentence in data.split('.'):
                                if self.args.data_augmentation == "translation":
                                    # Round trip: to the configured language,
                                    # then back to English.
                                    augmented_text += translate_client.translate(
                                        translate_client.translate(
                                            sentence + '.',
                                            target_language=self.args.
                                            translation_language)
                                        ['translatedText'].replace(
                                            "'", "'").replace(
                                                ".", " ") + '.',
                                        target_language='en'
                                    )['translatedText'].replace(
                                        "'", "'").replace(".", " ") + '.'
                                elif self.args.data_augmentation == "transformation":
                                    augmented_text += aug.augment(
                                        sentence + '.').replace(".", " ") + '.'
                            # print("ok2")
                        elif first_param_sents[0] != 'empty':
                            # SUM file: augment both parameters sentence by
                            # sentence, keeping the '//' and '///' separators.
                            for sentence in first_param_sents:
                                if self.args.data_augmentation == "translation":
                                    augmented_text += translate_client.translate(
                                        translate_client.translate(
                                            sentence + '.',
                                            target_language=self.args.
                                            translation_language)
                                        ['translatedText'].replace(
                                            "'", "'").replace(
                                                ".", " ") + '.',
                                        target_language='en'
                                    )['translatedText'].replace(
                                        "'", "'").replace(".", " ") + '. //'
                                elif self.args.data_augmentation == "transformation":
                                    augmented_text += aug.augment(
                                        sentence + '.').replace(
                                            ".", " ") + '. //'
                            # print("ok3")
                            # Turns the trailing '//' into the '///' separator.
                            augmented_text += '/'
                            for sentence in second_param_sents:
                                if self.args.data_augmentation == "translation":
                                    augmented_text += translate_client.translate(
                                        translate_client.translate(
                                            sentence + '.',
                                            target_language=self.args.
                                            translation_language)
                                        ['translatedText'].replace(
                                            "'", "'").replace(
                                                ".", " ") + '.',
                                        target_language='en'
                                    )['translatedText'].replace(
                                        "'", "'").replace(".", " ") + '. //'
                                elif self.args.data_augmentation == "transformation":
                                    augmented_text += aug.augment(
                                        sentence + '.').replace(
                                            ".", " ") + '. //'
                            # print("ok4")
                            # NOTE(review): nesting of this line (inside the
                            # SUM branch) is reconstructed - confirm.
                            augmented_text = augmented_text[:-3]

                        # Normalize spacing after periods and collapse runs
                        # of whitespace.
                        augmented_text = augmented_text.replace(".", ". ")
                        augmented_text = ' '.join(augmented_text.split())

                        # Write translation
                        # NOTE(review): bare ``except`` hides every mkdir
                        # failure, not only "directory already exists".
                        try:
                            os.mkdir(path_new_directory)
                        except:
                            pass
                        with open(path_augmented_text, 'w') as f:
                            f.write(augmented_text[:-1])

            # ------------------------------------------------------------
            # Rewriting of summaries with chosen sentences/parameters (one
            # side of the contradiction or both)
            # ------------------------------------------------------------
            data_analyzer = summary_preparation(path + '/')
            (path_state_of_the_art,
             summary) = data_analyzer.get_data('both')
            for num in range(0, len(summary[0])):
                summary_patent_first = ''
                summary_patent_second = ''
                for x in range(0, len(summary[0][num])):
                    summary_patent_first += (summary[0][num][x] + ' ')
                # The last 16 characters of the path are replaced by the new
                # suffix.
                with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ',
                          "w") as file:
                    file.write(summary_patent_first)
                for x in range(0, len(summary[1][num])):
                    summary_patent_second += (summary[1][num][x] + ' ')
                with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ2',
                          "w") as file:
                    file.write(summary_patent_second)
            # except:
            #     print("No summaries provided for "+corpus_type+' files.')
            #     time.sleep(1)

            # ------------------------------------------------------------
            # Tokenization using Stanford Core NLP
            # ------------------------------------------------------------
            add_on = ["SUMTRIZ", "SUMTRIZ2"]
            for tipe in self.parts_of_interest + add_on:
                print("\n\n\nTokenization in progress...")
                files = sorted(glob.glob(path + '/*/*.' + tipe))
                print(
                    str(len(files)) + " " + tipe + " found for " +
                    corpus_type + " set.")
                extracted_patents_dir = os.path.abspath(path)
                tokenized_patents_dir = os.path.abspath(self.temp_path + '/' +
                                                        corpus_type + '/' +
                                                        tipe)
                print("Preparing to tokenize %s to %s..." %
                      (extracted_patents_dir, tokenized_patents_dir))
                stories = os.listdir(extracted_patents_dir)

                # make IO list file
                print("Making list of files to tokenize...")
                with open("mapping_for_corenlp.txt", "w") as f:
                    for s in files:
                        f.write("%s\n" % (s))
                command = [
                    'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                    '-annotators', 'tokenize,ssplit',
                    '-ssplit.newlineIsSentenceBreak', 'always', '-filelist',
                    'mapping_for_corenlp.txt', '-outputFormat', 'json',
                    '-outputDirectory', tokenized_patents_dir
                ]
                # NOTE(review): the count printed here is the number of
                # entries in the corpus directory (``stories``), not
                # ``len(files)`` that are actually tokenized.
                print("Tokenizing %i files in %s and saving in %s..." %
                      (len(stories), extracted_patents_dir,
                       tokenized_patents_dir))
                subprocess.call(command)
                os.remove("mapping_for_corenlp.txt")

                # NOTE(review): nesting of these two prints (inside the
                # per-part loop) is reconstructed - confirm.
                print("Stanford CoreNLP Tokenizer has finished.")
                print("Successfully finished tokenizing %s to %s.\n" %
                      (extracted_patents_dir, tokenized_patents_dir))
        pool.close()
        pool.join()
def word2vec_aug(text):
    """Return ``text`` augmented by word2vec-based word substitution.

    A new ``naw.WordEmbsAug`` is created on every call.
    """
    augmenter_options = {
        'model_type': 'word2vec',
        'model_path': '/spell/leftout/GoogleNews-vectors-negative300.bin',
        'action': "substitute",
    }
    return naw.WordEmbsAug(**augmenter_options).augment(text)
# Dispatch on the menu entry picked by the user, then prompt again.
choice = answers.get('choice')

if choice == 'Plot embedding space - TensorboardX':
    # Export the pretrained embeddings, labelled with the vocabulary tokens.
    writer = SummaryWriter('tensorboard/embeddings')
    writer.add_embedding(
        pretrained_embeddings,
        metadata=dataset.TEXT.vocab.itos,
        tag='Embedding')
    writer.close()
    print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

elif choice == 'More info about dataset':
    dataset.dataset_info()
    dataset.print_dataset_details()

elif choice == 'Evaluate and plot PR curves - TensorboardX':
    # Evaluate the saved checkpoint on the test iterator.
    model.load_state_dict(torch.load('ezmath-model_83.pt'))
    test_loss, test_acc = train.evaluate_with_pr_plotting(
        model, test_iterator, criterion, dataset.LABEL.vocab.itos)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')
    print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

elif choice == 'Test Textual Augmenter by word2vec similarity':
    print('Loading Word2Vec model...')
    aug = naw.WordEmbsAug(
        model_type='word2vec',
        model_path='vector_cache/word2vec_CoNLL17/model.bin',
        action="substitute")
    text = input("Please insert the exercise text to augment: ")
    augmented_text = aug.augment(text)
    print("Original:")
    print(text)
    print("Augmented Text:")
    print(augmented_text)

answers = inquirer.prompt(questions)
def setUpClass(cls):
    """Load the ``.env`` file and build one augmenter per embedding file."""
    project_root = os.path.join(os.path.dirname(__file__), '..', '..', '..')
    load_dotenv(os.path.abspath(os.path.join(project_root, '.env')))

    model_dir = os.environ.get("MODEL_DIR")
    # Hard-coded switch: the additional embedding files below are only
    # loaded when this is set to True.
    full_test_case = False

    def load(model_type, file_name):
        # model_dir is concatenated directly with the file name.
        return naw.WordEmbsAug(model_type=model_type,
                               model_path=model_dir + file_name)

    cls.augmenters = [
        load('word2vec', 'GoogleNews-vectors-negative300.bin'),
        load('glove', 'glove.6B.50d.txt'),
        load('fasttext', 'wiki-news-300d-1M.vec'),
    ]

    if full_test_case:
        additional_models = (
            ('glove', 'glove.42B.300d.txt'),
            ('glove', 'glove.840B.300d.txt'),
            ('glove', 'glove.twitter.27B.25d.txt'),
            ('glove', 'glove.twitter.27B.50d.txt'),
            ('glove', 'glove.twitter.27B.100d.txt'),
            ('glove', 'glove.twitter.27B.200d.txt'),
            ('fasttext', 'wiki-news-300d-1M-subword.vec'),
            ('fasttext', 'crawl-300d-2M.vec'),
            ('fasttext', 'crawl-300d-2M-subword.vec'),
        )
        cls.augmenters.extend([
            load(model_type, file_name)
            for model_type, file_name in additional_models
        ])
from tqdm import tqdm SAMPLE_NUM = 1000 model_dir = 'model/' des_dir = 'sentence/' data_need_dir = des_dir + 'need_sentence.csv' data_novel_dir = des_dir + 'novel_sentence.xlsx' data_need_aug_dir = des_dir + 'need_aug_sentence.csv' data_need_all_dir = des_dir + 'need_all_sentence.csv' data_need = pd.read_csv(data_need_dir, index_col=0) augs = [ # Substitute word by word2vec similarity naw.WordEmbsAug(model_type='word2vec', model_path=model_dir + 'GoogleNews-vectors-negative300.bin', action="substitute"), # Substitute word by contextual word embeddings (BERT) naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"), # Substitute word by WordNet's synonym naw.SynonymAug(aug_src='wordnet'), # Substitute word by PPDB's synonym naw.SynonymAug(aug_src='ppdb', model_path=model_dir + 'ppdb-2.0-s-all') ] trans = BackTranslation( url=[ 'translate.google.com', #'translate.google.co.kr', 'translate.google.cn',