def test_language_modeling(model_type, model_name):
    with open("train.txt", "w") as f:
        for i in range(100):
            f.write("Hello world with Simple Transformers! \n")

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": 1,
        "no_save": True,
        "vocab_size": 100,
        "generator_config": {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 1,
        },
        "discriminator_config": {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 2,
        },
    }

    model = LanguageModelingModel(
        "electra",
        None,
        args=model_args,
        train_files="train.txt",
        use_cuda=False,
    )

    # Train the model
    model.train_model("train.txt")
def get_gpt2_perplexity_for_every_sentence(data_as_pd_dataframe, output_file,
                                           cuda_device=-1, lowercase_all_text=True):
    use_cuda = cuda_device >= 0
    model = LanguageModelingModel('gpt2', 'gpt2-large',
                                  use_cuda=use_cuda,
                                  cuda_device=cuda_device,
                                  args={'do_lower_case': lowercase_all_text,
                                        'mlm': False})
    if use_cuda:
        # Only move the model when a GPU device was actually requested
        model.model = model.model.to(torch.device(f"cuda:{cuda_device}"))

    disposable_output_dir = 'scratch_output_dir/'
    was_originally_dir = os.path.isdir(disposable_output_dir)
    if not was_originally_dir:
        os.makedirs(disposable_output_dir)

    perplexities_to_return = []
    with open(output_file, 'w') as f:
        f.write('perplexity\tsentence\n')
        for i, row in tqdm(data_as_pd_dataframe.iterrows(),
                           total=data_as_pd_dataframe.shape[0]):
            # single_example_dataset = pd.DataFrame(pd.Series([row['text']]), columns=['text'], index=[0])
            single_example_dataset = SingleItemDataset(model.tokenizer, row['text'], cuda_device)
            results = model.evaluate(single_example_dataset, disposable_output_dir,
                                     multi_label=False, verbose=False, silent=True)
            instance_perplexity = float(results['perplexity'])
            perplexities_to_return.append(instance_perplexity)

            text = row['text']
            if '\n' in text or '\t' in text:
                if '"' in text:
                    f.write('\t'.join([str(instance_perplexity), '""' + text + '""']) + '\n')
                else:
                    f.write('\t'.join([str(instance_perplexity), '"' + text + '"']) + '\n')
            else:
                f.write('\t'.join([str(instance_perplexity), text]) + '\n')

    if not was_originally_dir:
        shutil.rmtree(disposable_output_dir)
    return perplexities_to_return
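# The function above relies on a SingleItemDataset helper that is not defined in
# this snippet and is not part of simpletransformers. Below is a minimal sketch of
# what such a class could look like: the constructor signature is taken from the
# call site above, while the implementation itself is an assumption, not the
# original author's code.
from torch.utils.data import Dataset


class SingleItemDataset(Dataset):
    """Hypothetical one-sentence dataset so model.evaluate() can score a single text."""

    def __init__(self, tokenizer, text, cuda_device=-1):
        # Tokenize the single sentence; keep the tensor on CPU and let the
        # evaluation loop move batches to the configured device.
        encoded = tokenizer(text, truncation=True, return_tensors="pt")
        self.example = encoded["input_ids"].squeeze(0)

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        return self.example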
def fine_tune():
    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "do_lower_case": True,
        "use_multiprocessing": False,
        "mlm": False,
        "num_train_epochs": args.num_train_epochs,
        "learning_rate": args.learning_rate,
        "max_seq_length": 300,
        "evaluate_during_training": True,
        "output_dir": args.model_output_dir,
    }
    print(args.use_cuda)
    model = LanguageModelingModel('gpt2', args.model_name,
                                  use_cuda=args.use_cuda, args=train_args)
    # Evaluate before fine-tuning, fine-tune, then evaluate again
    model.eval_model(args.test_path)
    model.train_model(args.train_path, eval_file=args.validation_path)
    model.eval_model(args.test_path)
def build_language_model(handle):
    train_args = {
        "output_dir": f"gpt2_outputs/{handle}/",
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "fp16": False,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "num_train_epochs": 3,
        "tensorboard_dir": 'gpt2_tweet_runs/',
        'mlm': False,
        'use_multiprocessing': False,
    }
    model = LanguageModelingModel('gpt2', 'gpt2', args=train_args, use_cuda=False)
    st.info('Training model. This may take a few mins - you may want to check back later.')
    model.train_model(f"./tweets_cache/{handle}_train.txt",
                      eval_file=f"./tweets_cache/{handle}_valid.txt")
    return
def train(orig_bert, outdir_name, train_filename, eval_filename):
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model_args = LanguageModelingArgs()
    model_args.reprocess_input_data = True
    model_args.output_dir = outdir_name
    model_args.best_model_dir = outdir_name + "/best_model"
    model_args.tensorboard_dir = outdir_name + "/runs"
    model_args.overwrite_output_dir = True
    model_args.save_steps = 0
    model_args.num_train_epochs = 10
    model_args.dataset_type = "simple"
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_verbose = True
    model_args.evaluate_during_training_steps = 0
    model_args.silent = True
    model_args.do_lower_case = ("uncased" in orig_bert)

    model = LanguageModelingModel("bert", orig_bert, args=model_args)
    model.train_model(train_filename, eval_file=eval_filename)
def fine_tuning(model_name, epoch, batch_size):
    model_base = model_name.split("/")[1].split("-")[0]
    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
    }
    filename = "train-v0.1.json"
    with open('train_data.txt', 'w') as f:
        for item in data_list:
            f.write("%s\n" % item)

    # for some models passing a seeded generator may cause an error
    try:
        lang_model = LanguageModelingModel(model_base, model_name,
                                           torch.Generator().manual_seed(42),
                                           use_cuda=True, args=train_args)
    except Exception:
        lang_model = LanguageModelingModel(model_base, model_name,
                                           use_cuda=True, args=train_args)

    lang_output_file_name = "lang_output"
    lang_model.train_model("train_data.txt", output_dir=lang_output_file_name)

    # Keep only the files needed to reload the fine-tuned model
    necessary_files_for_pretrained_model = ['pytorch_model.bin', 'config.json', 'vocab.txt']
    lang_model_output_files = glob(lang_output_file_name + "/*")
    files_to_be_removed = []
    for f in lang_model_output_files:
        if f.split("/")[1] not in necessary_files_for_pretrained_model:
            files_to_be_removed.append(f)
    for f in files_to_be_removed:
        if os.path.isfile(f):
            os.remove(f)
        elif os.path.isdir(f):
            shutil.rmtree(f)
    shutil.rmtree("cache_dir", ignore_errors=True)
    shutil.rmtree("runs", ignore_errors=True)

    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": epoch,
        "train_batch_size": batch_size,
    }
    qa_model = QuestionAnsweringModel(model_base, "lang_output",
                                      use_cuda=True, args=train_args)
    return qa_model
def train_model(train_data, test_data, train_args):
    """
    Train the language model and save it to the outputs folder.

    :param train_data: path to the training text file
    :param test_data: path to the evaluation text file
    :param train_args: dict - arguments to be passed into the train function
    :return:
    """
    model = LanguageModelingModel('gpt2', MODEL, args=train_args, use_cuda=True)
    model.train_model(train_data, eval_file=test_data)
    model.eval_model(test_data)
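# Hypothetical call to the train_model() helper above. MODEL is assumed elsewhere
# in the script to name a GPT-2 checkpoint (e.g. "gpt2"); the paths and argument
# values here are placeholders, not the original configuration.
example_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "mlm": False,  # GPT-2 is a causal LM, so masked-LM mode is disabled
}
train_model("data/train.txt", "data/test.txt", example_args)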
"n_gpu": args.gpu, "use_multiprocessing": True, "use_early_stopping": True, "early_stopping_patience": 3, "early_stopping_delta": 0, "early_stopping_metric": "eval_loss", "early_stopping_metric_minimize": True, "overwrite_output_dir": True, "manual_seed": None, "encoding": None, "dataset_type": "simple", "tokenizer_name": args.tokenizer, "evaluate_during_training_verbose": True, "use_cached_eval_features": True, "sliding_window": True, "vocab_size": 52000 } model = LanguageModelingModel("electra", None, args=train_args, use_cuda=False, train_files=args.tok_data) model.train_model( args.train_data, eval_file=args.test_data, ) model.eval_model(args.test_data)
def run_exps():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    now = datetime.now()
    time = now.strftime("%H:%M:%S")
    date_time = now.strftime("%m%d%Y_%H%M%S")

    if sched_params_opt == "plateau":
        sched_params = {}
        sched_params['sched_name'] = 'plateau'
        sched_params['factor'] = 0.1
        sched_params['patience'] = sched_patience
        sched_params['verbose'] = True
        sched_params['threshold'] = 0.001
        sched_params['min_lr'] = min_lr
    else:
        sched_params = None

    if sched_params is not None:
        run_name = f'{subset[0]}_slen{seq_len}_wd{weight_decay}_lr{max_lr}-{min_lr}_msk{masks}_p{mask_prob}_dl{disc_hid_layers}_sz{disc_hid_size}_gl{gen_hid_layers}_sz{gen_hid_size}_rgen{random_generator}_drop{disc_drop}_w{rtd_loss_weight}_{rmd_loss_weight}_{mlm_loss_weight}_replace{replace_tokens}_mlmr{mlm_lr_ratio}_{date_time}_cont{contamination}'
    else:
        run_name = f'{subset[0]}_slen{seq_len}_wd{weight_decay}_mlr{max_lr}_minlr{min_lr}_msk{masks}_p{mask_prob}_dl{disc_hid_layers}_sz{disc_hid_size}_gl{gen_hid_layers}_sz{gen_hid_size}_rgen{random_generator}_drop{disc_drop}_w{rtd_loss_weight}_{rmd_loss_weight}_{mlm_loss_weight}_replace{replace_tokens}_mlmr{mlm_lr_ratio}_{date_time}_cont{contamination}'

    print(f'RUN: {run_name}')

    train_args = {
        "fp16": False,
        "use_multiprocessing": False,
        "reprocess_input_data": False,
        "overwrite_output_dir": True,
        "num_train_epochs": 20,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "learning_rate": max_lr,
        "warmup_steps": warmup,
        "train_batch_size": train_batch_size,  # was 32
        "eval_batch_size": eval_batch_size,  # was 32
        "gradient_accumulation_steps": 1,
        "block_size": seq_len + 2,
        "max_seq_length": seq_len + 2,
        "dataset_type": "simple",
        "logging_steps": 500,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 500,  # was 500
        "evaluate_during_training_steps_anomaly": eval_anomaly,  # was 500
        "anomaly_batch_size": anomaly_batch_size,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "sliding_window": True,
        "vocab_size": 52000,
        "eval_anomalies": True,
        "random_generator": random_generator,
        "use_rtd_loss": True,
        "rtd_loss_weight": rtd_loss_weight,
        "rmd_loss_weight": rmd_loss_weight,
        "mlm_loss_weight": mlm_loss_weight,
        "dump_histogram": dump_histogram,
        "eval_anomaly_after": eval_anomaly_after,
        "train_just_generator": train_just_generator,
        "replace_tokens": replace_tokens,
        "extract_scores": 1,
        "subset_name": subset[0],
        "extract_repr": 0,
        # "vanilla_electra": {
        #     "no_masks": masks,
        # },
        # "vanilla_electra": False,
        "train_document": True,
        "tokenizer_name": "bert-base-uncased",
        "tensorboard_dir": f'{tensorboard_dir}/{exp_prefix}/{run_name}',
        "extract_reps": extract_reps,
        "weight_decay": weight_decay,
        "optimizer": optimizer,
        "scores_export_path": f"./token_scores/{run_name}/",
        "generator_config": {
            "embedding_size": 128,
            "hidden_size": gen_hid_size,
            "num_hidden_layers": gen_hid_layers,
        },
        "discriminator_config": {
            "hidden_dropout_prob": disc_drop,
            "attention_probs_dropout_prob": disc_drop,
            "embedding_size": 128,
            "hidden_size": disc_hid_size,
            "num_hidden_layers": disc_hid_layers,
        },
        "mlm_lr_ratio": mlm_lr_ratio,
    }

    for subset_r in tqdm(subset):
        print('-' * 10, '\n', f'SUBSET: {subset_r}', '-' * 10)

        now = datetime.now()
        time = now.strftime("%H:%M:%S")
        date_time = now.strftime("%m%d%Y_%H%M%S")

        if preprocessed:
            train_file = f"./datasets/ag_od/train/{subset_r}.txt"
            test_file = f"./datasets/ag_od/test/{subset_r}.txt"
            outlier_file = f"./datasets/ag_od/test/{subset_r}-outliers.txt"
            if contamination != 0:
                train_file = f"./datasets/ag_od/train/{subset_r}-contaminated/{subset_r}_c{int(contamination)}.txt"

        model = LanguageModelingModel("electra",
                                      None,
                                      masks=masks_,
                                      args=train_args,
                                      train_files=train_file,
                                      use_cuda=True)

        model.train_model_anomaly(train_file,
                                  eval_file=test_file,
                                  eval_file_outlier=outlier_file,
                                  sched_params=sched_params)
task = sys.argv[1]
vocsize = int(sys.argv[2])

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# integer division so the {:d} format spec gets an int, not a float
outdir_name = task + "-constr-uncased-{:d}k".format(vocsize // 1000)

model_args = LanguageModelingArgs()
model_args.reprocess_input_data = True
model_args.output_dir = outdir_name
model_args.best_model_dir = outdir_name + "/best_model"
model_args.tensorboard_dir = outdir_name + "/runs"
model_args.overwrite_output_dir = True
model_args.train_batch_size = 32
model_args.save_steps = 5000
model_args.max_steps = 50000
model_args.dataset_type = "simple"
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = 5000
model_args.silent = True
model_args.do_lower_case = True
model_args.tokenizer_name = None
model_args.vocab_size = vocsize

model = LanguageModelingModel("bert", None, args=model_args,
                              train_files=task + "/train_textonly.txt")
model.train_model(task + "/train_textonly.txt",
                  eval_file=task + "/dev_textonly.txt")
print(
    "----------------------------------------------------------------------"
)

from simpletransformers.language_modeling import LanguageModelingModel
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.INFO)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "train_batch_size": 16,
    "num_train_epochs": 3,
    "mlm": False,
}

model = LanguageModelingModel('gpt2', 'gpt2', args=train_args, use_cuda=False)
model.train_model(
    "/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_train.txt",
    eval_file="/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_test.txt"
)
model.eval_model(
    "/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_test.txt"
)
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# integer division so the {:d} format spec gets an int, not a float
outdir_name = "ch-swisscrawl-{}-{:d}k".format(casing, vocsize // 1000)

model_args = LanguageModelingArgs()
model_args.reprocess_input_data = True
model_args.output_dir = outdir_name
model_args.best_model_dir = outdir_name + "/best_model"
model_args.tensorboard_dir = outdir_name + "/runs"
model_args.overwrite_output_dir = True
model_args.train_batch_size = 32
model_args.save_steps = 5000
model_args.max_steps = 50000
model_args.dataset_type = "simple"
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = 5000
model_args.silent = True
model_args.do_lower_case = (casing == "uncased")
model_args.tokenizer_name = None
model_args.vocab_size = vocsize

model = LanguageModelingModel("bert", None, args=model_args,
                              train_files="swisscrawl_reformatted.txt")
model.train_model("swisscrawl_reformatted.txt",
                  eval_file="vardial_train_reformatted.txt")
import logging
from simpletransformers.language_modeling import LanguageModelingModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "block_size": 512,
    "max_seq_length": 512,
    "learning_rate": 5e-6,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "mlm": False,
    "output_dir": "outputs/fine-tuned/",
}

model = LanguageModelingModel("gpt2", "gpt2", args=train_args, use_cuda=False)
# model.train_model("data/train.txt", eval_file="data/test.txt")
model.train_model("datatest.txt")
# model.eval_model("data/test.txt")
# model.eval_model("evaltest.txt")
"mlm": False, "dataset_type": "simple", "logging_steps": 100, "evaluate_during_training": True, "evaluate_during_training_steps": 3000, "evaluate_during_training_verbose": True, "use_cached_eval_features": True, "sliding_window": True, "use_multiprocessing": False, "vocab_size": 10000, "output_dir": f"outputs/from_scratch_", "best_model_dir": f"outputs/from_scratch/best_model", } train_file = f"data/train.txt" test_file = f"data/test.txt" model = LanguageModelingModel( "gpt2", None, args=train_args, train_files=train_file, ) model.train_model( train_file, eval_file=test_file, ) model.eval_model(test_file)
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    train_args = {
        "reprocess_input_data": False,
        "overwrite_output_dir": True,
        "num_train_epochs": 50,
        "save_eval_checkpoints": True,
        "save_model_every_epoch": False,
        "learning_rate": 1e-3,
        "warmup_steps": 10000,
        "train_batch_size": 64,
        "eval_batch_size": 128,
        "gradient_accumulation_steps": 2,
        "block_size": 128,
        "max_seq_length": 128,
        "dataset_type": "simple",
        "wandb_project": "Esperanto - ConvBert",
        "wandb_kwargs": {"name": "ConvBert-SMALL"},
        "logging_steps": 100,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 3000,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "sliding_window": False,
        "tokenizer_name": "bert-base-chinese",
        "use_multiprocessing": True,
        "process_count": 8,
        "vocab_size": 21128,
        "generator_config": {
            "attention_probs_dropout_prob": 0.1,
            "directionality": "bidi",
            "embedding_size": 128,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 64,
            "initializer_range": 0.02,
            "intermediate_size": 256,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "convbert",
            "num_attention_heads": 1,
            "num_hidden_layers": 12,
            "pad_token_id": 0,
            "summary_activation": "gelu",
            "summary_last_dropout": 0.1,
            "summary_type": "first",
            "summary_use_proj": True,
            "type_vocab_size": 2,
            "vocab_size": 21128,
        },
        "discriminator_config": {
            "attention_probs_dropout_prob": 0.1,
            "embedding_size": 128,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 256,
            "initializer_range": 0.02,
            "intermediate_size": 1024,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "convbert",
            "num_attention_heads": 4,
            "num_hidden_layers": 12,
            "output_past": True,
            "pad_token_id": 0,
            "summary_activation": "gelu",
            "summary_last_dropout": 0.1,
            "summary_type": "first",
            "summary_use_proj": True,
            "type_vocab_size": 2,
            "vocab_size": 21128,
        },
    }

    train_file = "data/train.txt"
    test_file = "data/test.txt"

    model = LanguageModelingModel(
        "convbert",
        None,
        args=train_args,
        train_files=train_file,
        cuda_device=1,
    )

    model.train_model(
        train_file,
        eval_file=test_file,
    )

    model.eval_model(test_file)
def save_best_model():
    model = LanguageModelingModel('convbert', 'outputs/best_model',
                                  args={"output_dir": "discriminator_trained"})
    model.save_discriminator()
issues = [json.loads(line) for line in f.readlines()]

test = []
train = []
for issue in issues:
    body = issue['title'] + ' ' + issue['body'].replace('\n', ' ').replace('\r', ' ')
    if random.random() > 0.8:
        test.append(body)
    else:
        train.append(body)

with open(train_token_path, 'w') as p:
    p.write('\n'.join(train))
with open(test_token_path, 'w') as p:
    p.write('\n'.join(test))

print("training model")
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "save_steps": 10000
}
model = LanguageModelingModel("bert", "bert-base-uncased", args=train_args)
model.train_model(
    "issues.train.tokens",
    eval_file="issues.test.tokens",
    output_dir="finetuned",
)
model.eval_model("issues.test.tokens", "finetune-eval")
"vocab_size": 21128, }, "discriminator_config": { "embedding_size": 128, "hidden_size": 256, "vocab_size": 21128, }, "use_multiprocessing": False, "wandb_project": False, "fp16": False, "save_steps": 20000, "tokenizer_name": 'outputs', "model_name": 'outputs/checkpoint-120000' } train_file = r"train.txt" test_file = r"test.txt" model = LanguageModelingModel("electra", args=train_args, train_files=train_file, use_cuda=False, model_name="outputs/checkpoint-120000") model.train_model( train_file, eval_file=test_file, ) model.eval_model(test_file)
import logging
from simpletransformers.language_modeling import LanguageModelingModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "block_size": 512,
    "max_seq_length": 512,
    "learning_rate": 5e-6,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "mlm": False,
    "output_dir": "outputs/fine-tuned/",
}

model = LanguageModelingModel("gpt2", "gpt2", args=train_args)
model.train_model("data/train.txt", eval_file="data/test.txt")
model.eval_model("data/test.txt")
def finetunePretrainedBertOnMLM(self, saveMultipleFilesThatWereMergedIntoSingleTrainFile=True):
    ############################################################################################
    # This method finetunes a pre-trained BERT model (base-uncased) on MLM using SEC data as follows:
    #   1. Builds training and eval files.
    #   2. Uses Simple Transformers "LanguageModelingModel" class to batch process the
    #      above training and eval files.
    #   3. Evaluates the finetuned model using the above class.
    ############################################################################################
    try:
        if self.corpusFolder is None or self.corpusFolder == "" or os.path.exists(self.corpusFolder) is False:
            log.error(f"Text corpus folder '{self.corpusFolder}' DOES NOT exist!")
            return

        # Check if CUDA is available for doing training on a GPU system
        if torch.cuda.is_available() is False:
            log.error(
                f"CUDA libs not found. A new language model can be trained from scratch only on a GPU system with "
                f"CUDA libs!")
            return

        startTime = time.time()

        #################################################################
        # 1. Build text corpus files for train and eval datasets.
        #################################################################
        trainOk, evalOk = self.__buildTrainingAndEvalTextFiles(saveMultipleFilesThatWereMergedIntoSingleTrainFile)

        # Check if the train and eval files are built as single text corpus file
        if trainOk is False:
            log.error(f"Error building training files to finetune pre-trained BERT on MLM objective!")
            return
        if evalOk is False:
            log.error(f"Error building evaluation file to finetune pre-trained BERT on MLM objective!")
            return

        #####################################################################
        # 2. Uses Simple Transformers "LanguageModelingModel" class to train
        #####################################################################
        log.debug(f"Building config params for SimpleTransformer..")
        transformers_logger = log.getLogger("transformers")
        transformers_logger.setLevel(log.WARNING)
        modelArgs = {"reprocess_input_data": True,
                     "overwrite_output_dir": True,
                     "num_train_epochs": 2,
                     "dataset_type": "simple",
                     "encoding": "utf-8",
                     "cache_dir": self.modelCacheDir,
                     "do_lower_case": True,
                     "learning_rate": self.learningRate,
                     "max_seq_length": self.maxSeqLength,
                     "sliding_window": True,
                     "stride": 0.7,
                     "handle_chinese_chars": False, }
        log.debug(f"Finished building config params for SimpleTransformer.")

        log.debug(f"Initializing SimpleTransformer's LanguageModelingModel class..")
        model = LanguageModelingModel(model_type=self.modelType,
                                      model_name=self.modelNameOrPath,
                                      args=modelArgs)
        log.debug(f"Finished initializing SimpleTransformer's LanguageModelingModel class.")

        log.info(f"Started finetuning pre-trained BERT (large-uncased) on same MLM objective with SEC data..")
        model.train_model(train_file=self.singleCorpusTrainFile,
                          eval_file=self.singleCorpusEvalFile,
                          output_dir=self.modelOutputDir,
                          show_running_loss=True, )
        log.info(f"Finished finetuning and saving a pre-trained BERT (large-uncased) model on MLM with SEC data. "
                 f"Check the folder '{self.modelOutputDir}' for finetuned weights.")
        log.info(f"It took {round((time.time()-startTime)/3600, 1)} hours to finetune a pre-trained BERT model on "
                 f"MLM with SEC data from the corpus '{self.corpusFolder}'")

        # Evaluation
        log.info(f"Started evaluating the finetuned BERT (large-uncased) model on: a) perplexity, and b) eval_loss.")
        model.eval_model(eval_file=self.singleCorpusEvalFile,
                         output_dir=self.modelOutputDir,
                         verbose=True,
                         silent=False)
        log.info(f"Finished evaluation of the finetuned BERT (large-uncased) model on MLM with SEC data. Check "
                 f"the evaluation results in the folder '{self.modelOutputDir}'.")
        log.info(f"It took {round((time.time()-startTime)/3600, 1)} hours to evaluate the finetuned BERT model on MLM.")
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = f"** ERROR ** occurred while finetuning a pre-trained BERT model and evaluating it. Error is: {exc_type}; {exc_value}."
        raise Exception(err)
"vocab_size": 52000, "generator_config": { "embedding_size": 128, "hidden_size": 256, "num_hidden_layers": 3, }, "discriminator_config": { "embedding_size": 128, "hidden_size": 256, }, } train_file = "../data/train.txt" test_file = "../data/test.txt" model = LanguageModelingModel( "electra", None, args=train_args, train_files=train_file, ) model.train_model(train_file, eval_file=test_file,) model.eval_model(test_file) model.save_discriminator() model.save_generator()
import pandas as pd

# Finetune m-BERT model (1ST PART)
finetune_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    'fp16': False,
    "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>", '</br>'],
    "evaluate_during_training": True,
    'save_model_every_epoch': False,
    'num_train_epochs': 20,
    'train_batch_size': 25,
}

# Highly recommended to set use_cuda = True to utilize GPU (if available) for training
model = LanguageModelingModel('bert', 'bert-base-multilingual-cased',
                              args=finetune_args, use_cuda=False)
model.train_model(curr_path + "/raw-poetry-train.txt",
                  eval_file=curr_path + '/raw-poetry-test.txt')

# ## Train an emotion classifier with the fine-tuned model from above (2ND PART)

# def f1_evaluate(true, pred):
#     for p in pred:
#         for i in range(len(p)):
#             if p[i] >= 0.5:
#                 p[i] = 1
#             else:
#                 p[i] = 0
#     score = f1_score(true, pred, average='macro')
#     label = f1_score(true, pred, average=None)
}
model_args.vocab_size = 20000
model_args.output_dir = "{}/from_scratch_".format(output_dir_name)
model_args.best_model_dir = "{}/from_scratch/best_model".format(output_dir_name)
model_args.num_train_epochs = 5  # 3
model_args.save_eval_checkpoints = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True  # needed to save the best model!
model_args.evaluate_during_training_verbose = True
# model_args.evaluate_during_training_steps = 3000

# model = LanguageModelingModel("bert", None, args=model_args, train_files=train_file, use_cuda=False)
# model = LanguageModelingModel("electra", None, args=model_args, train_files=train_file, use_cuda=False)
model = LanguageModelingModel("albert", None, args=model_args,
                              train_files=train_file, use_cuda=False)

# ---------------------------------
"""
model_args = LanguageModelingArgs()
model_args.config = {
    "hidden_size": 768,  # 768
    "num_hidden_layers": 3,  # 12
    "num_attention_heads": 3,
}
model_args.vocab_size = 20000
model_args.output_dir = "{}/from_scratch_".format(output_dir_name)
model_args.best_model_dir = "{}/from_scratch/best_model".format(output_dir_name)
model_args.num_train_epochs = 3
"fp16_opt_level": "O1", # misc. settings "manual_seed": 3435, # output settings "overwrite_output_dir": True, "best_model_dir": "outputs/pretrain/electra_base_paper_final/best_model", "cache_dir": "cache_dir/pretrain", "output_dir": "outputs/pretrain/electra_base_paper_final/", } train_file = "inputs/pretrain/train-wikidump-books.en" # test_file = "inputs/pretrain/test.txt" model = LanguageModelingModel("electra", None, args=train_args) model.load_and_cache_examples(train_file) # model.load_and_cache_examples(test_file) # def globalizer(fn=None, kwargs=None): # if fn: # fn(**kwargs) # if __name__ == "__main__": # if train_args['n_gpu'] > 1: # fn = model.train_model # kwargs = { # 'train_file': train_file, # 'args': train_args, # 'eval_file': test_file
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the source and target files for the task.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )

    # Other parameters
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite on the existing output dir")
    parser.add_argument(
        "--output_dir",
        default='output_dir/',
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=None,
        type=int,
        help="Max input seq length",
    )
    parser.add_argument(
        "--train_batch_size",
        default=16,
        type=int,
        help="Size of each train batch",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=1,
        type=int,
        help="gradient accumulation steps",
    )
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "block_size": 128,
        "max_seq_length": args.max_seq_length,
        "learning_rate": 5e-6,
        "train_batch_size": args.train_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "num_train_epochs": 100,
        "mlm": False,
        "fp16": False,
        "output_dir": args.output_dir,
        "dataset_type": "line_by_line",
    }

    model = LanguageModelingModel(model_type="gpt2",
                                  model_name=args.model_name_or_path,
                                  args=train_args)
    model.train_model(args.data_dir + "train.txt",
                      eval_file=args.data_dir + "test.txt")
    model.eval_model(args.data_dir + "test.txt")