def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    DistilBert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    Experts = [DistilBertQA(DistilBertModel.from_pretrained('distilbert-base-uncased')).to(device)
               for _ in range(args.num_experts)]
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    gate_model = GateNetwork(384, 3, 3, DistilBert.config).to(device)
    print(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = device
        trainer = train.Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(Experts, gate_model, train_loader, val_loader, val_dict, args.num_experts)

    if args.do_eval:
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = train.Trainer(args, log)
        # load model
        restore_model("", args.num_experts, Experts, gate_model)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=1,
                                 sampler=SequentialSampler(eval_dataset))
        args.device = device
        eval_preds, eval_scores = trainer.evaluate(Experts, gate_model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

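# The eval branch above calls restore_model("", args.num_experts, Experts, gate_model), whose
# definition is not shown here. The function below is a hypothetical sketch of such a helper,
# assuming each expert and the gate network were saved with torch.save() under
# <save_dir>/expert_<i>.pt and <save_dir>/gate.pt; the real project may use different paths or
# a different serialization scheme.
import os
import torch

def restore_model_sketch(save_dir, num_experts, experts, gate_model):
    """Hypothetical restore helper: load expert and gate state dicts from save_dir."""
    for i in range(num_experts):
        expert_path = os.path.join(save_dir, f'expert_{i}.pt')  # assumed naming scheme
        experts[i].load_state_dict(torch.load(expert_path, map_location='cpu'))
    gate_path = os.path.join(save_dir, 'gate.pt')  # assumed naming scheme
    gate_model.load_state_dict(torch.load(gate_path, map_location='cpu'))
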
def main():
    torch.autograd.set_detect_anomaly(True)
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.model_dir == 'save/xxx':
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
    else:
        args.save_dir = args.model_dir

    if args.train_adv:
        trainer = AdvTrainer(args)
    else:
        trainer = Trainer(args)

    print("Preparing Training Data...")
    train_datasets = []
    for dataset_idx, train_dataset_name in enumerate(args.train_datasets.split(",")):
        train_dataset, _ = get_dataset(args, train_dataset_name, args.train_dir, tokenizer, 'train', dataset_idx)
        train_datasets.append(train_dataset)
    train_loader = DataLoader(ConcatDataset(train_datasets),
                              batch_size=args.batch_size,
                              shuffle=True)

    print("Preparing ind Validation Data...")
    ind_val_dataset, ind_val_dict = get_dataset(args, args.train_datasets.split(","), args.ind_val_dir,
                                                tokenizer, 'ind_val')
    ind_val_loader = DataLoader(ind_val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False)

    print("Preparing ood Validation Data...")
    ood_val_loaders = []
    ood_val_dicts = []
    ood_val_names = []
    for ood_val_dataset_name in args.ood_val_datasets.split(","):
        ood_val_dataset, ood_val_dict = get_dataset(args, ood_val_dataset_name, args.ood_val_dir,
                                                    tokenizer, 'ood_val')
        ood_val_loader = DataLoader(ood_val_dataset,
                                    batch_size=args.batch_size,
                                    shuffle=False)
        ood_val_loaders.append(ood_val_loader)
        ood_val_dicts.append(ood_val_dict)
        ood_val_names.append(ood_val_dataset_name)

    trainer.train(train_loader, ind_val_loader, ind_val_dict,
                  ood_val_loaders, ood_val_dicts, ood_val_names, args.resume_iters)

def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    if args.mixture_of_experts and args.do_eval:
        model = MoE(load_gate=True)
        experts = True
        model.gate.eval()
    elif args.mixture_of_experts and args.do_train:
        model = MoE(load_gate=False)
        experts = True
    else:
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        experts = False

    if args.reinit > 0:
        transformer_temp = getattr(model, 'distilbert')
        for layer in transformer_temp.transformer.layer[-args.reinit:]:
            for module in layer.modules():
                print(type(module))
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=transformer_temp.config.initializer_range)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict, experts)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        if args.mixture_of_experts is False:
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name, MoE=True)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

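# The args.reinit branch above re-initializes the top transformer blocks of DistilBERT before
# fine-tuning, a common trick for improving transfer. The snippet below is a minimal,
# self-contained sketch of the same idea against a plain Hugging Face DistilBERT backbone;
# num_reinit_layers is a hypothetical stand-in for args.reinit.
import torch.nn as nn
from transformers import DistilBertModel

def reinit_top_layers(backbone: DistilBertModel, num_reinit_layers: int) -> None:
    """Re-initialize the last `num_reinit_layers` transformer blocks in place."""
    std = backbone.config.initializer_range
    for layer in backbone.transformer.layer[-num_reinit_layers:]:
        for module in layer.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=std)
            elif isinstance(module, nn.LayerNorm):
                module.weight.data.fill_(1.0)
                module.bias.data.zero_()
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

# Example usage (sketch):
# backbone = DistilBertModel.from_pretrained('distilbert-base-uncased')
# reinit_top_layers(backbone, num_reinit_layers=2)
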
def main():
    # Get command-line args and set seed
    args = get_train_test_args()
    util_adversarial.set_seed(args.seed)

    # Load model
    model = AdversarialModel(args)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        # Make /save directory
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util_adversarial.get_save_dir(args.save_dir, args.run_name)

        # Get logger
        log = util_adversarial.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')

        # Set the device to cuda if a GPU is available
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Load training data
        log.info("Preparing Training Data...")
        train_dataset, train_dict = data_utils.get_dataset(args, args.train_datasets, args.train_dir,
                                                           tokenizer, 'train')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,  # batches the examples into groups of 16
                                  sampler=RandomSampler(train_dataset))

        # Load validation data
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = data_utils.get_dataset(args, args.train_datasets, args.val_dir,
                                                       tokenizer, 'val')
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))

        # Train!!!
        trainer = Trainer(args, log)
        trainer.train(model, train_loader, val_loader, val_dict)

    if args.continue_to_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')

        # Load model
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(args, args.eval_datasets, args.eval_dir,
                                                         tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval in continue_to_eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        checkpoint_path_qa_output = os.path.join(args.save_dir, 'qa_output_state')

        # Load model
        model = AdversarialModel(args, load_path=args.saved_model_filename)
        # model.load(checkpoint_path)
        # model.load_qa_output_model(checkpoint_path_qa_output)
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(args, args.eval_datasets, args.eval_dir,
                                                         tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Set the device before the resume branch so model.to(args.device) below is valid
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        if args.resume_training:
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
            model.to(args.device)
        else:
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train',
                                       args.outdomain_data_repeat)
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val',
                                            args.outdomain_data_repeat)
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        discriminator_input_size = 768
        if args.full_adv:
            discriminator_input_size = 384 * 768
        discriminator = DomainDiscriminator(input_size=discriminator_input_size)
        # discriminator.load_state_dict(torch.load(checkpoint_path + '/discriminator'))
        model.to(args.device)
        discriminator.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name,
                                              args.outdomain_data_repeat)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, discriminator, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

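# DomainDiscriminator is defined elsewhere in this project. As a rough, hypothetical sketch of
# the usual domain-adversarial discriminator it stands in for: a small MLP that takes a hidden
# representation (768 features for DistilBERT's [CLS] vector, or 384 * 768 when the whole
# flattened sequence is used via full_adv) and predicts which source dataset an example came
# from. The layer sizes, dropout, and number of domains below are assumptions, not the
# project's actual hyperparameters.
import torch.nn as nn
import torch.nn.functional as F

class DomainDiscriminatorSketch(nn.Module):
    def __init__(self, input_size=768, hidden_size=768, num_domains=3, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_domains),
        )

    def forward(self, hidden_states):
        # Log-probabilities over source domains for each example in the batch.
        return F.log_softmax(self.net(hidden_states), dim=-1)
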
def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    model = DomainQA(args.num_classes, args.hidden_size, args.num_layers,
                     args.dropout, args.dis_lambda, args.concat, args.anneal)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        if args.load_weights != '':
            args.load_weights = os.path.join(args.load_weights, 'checkpoint', model.WEIGHTS_NAME)
            model.load_state_dict(torch.load(args.load_weights))
        if args.load_distilbert_weights != '':
            # args.load_distilbert_weights = os.path.join(args.load_distilbert_weights, 'checkpoint', model.WEIGHTS_NAME)
            args.load_distilbert_weights = os.path.join(args.load_distilbert_weights, 'checkpoint', 'pytorch_model.bin')
            model.distilbert.load_state_dict(torch.load(args.load_distilbert_weights))
            print('loaded pretrained distilbert weights from', args.load_distilbert_weights)

        trainer = Trainer(args, log, model)
        # get_train_dataset signature: target_data_dir, target_dataset, tokenizer, split_name,
        #                              source_data_dir=None, source_dataset=None
        train_dataset, _ = get_train_dataset(args,
                                             args.target_train_dir,
                                             args.target_train_datasets,
                                             tokenizer, 'train',
                                             source_data_dir=args.source_train_dir,
                                             source_dataset=args.source_train_datasets)
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args,
                                            args.eval_datasets,
                                            args.eval_dir,
                                            tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log, model)
        config_path = os.path.join(args.save_dir, 'checkpoint', 'config.json')
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint', model.WEIGHTS_NAME)
        model.load_state_dict(torch.load(checkpoint_path))
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    '''###'''
    # if args.reinit_pooler:
    #     encoder_temp = getattr(model, "distilbert")  # Equivalent to model.distilbert
    #     encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
    #     encoder_temp.pooler.dense.bias.data.zero_()  # The change of encoder_temp would affect the model
    #     for p in encoder_temp.pooler.parameters():
    #         p.requires_grad = True

    if args.reinit_layers > 0:
        import torch.nn as nn
        from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN
        # model_distilbert = getattr(model, "distilbert")  # model.distilbert; changes to model_distilbert affect model!
        # Re-initialize the last few transformer layers
        for layer in model.distilbert.transformer.layer[-args.reinit_layers:]:
            for module in layer.modules():
                # print(module)
                model.distilbert._init_weights(module)  # Equivalent to the manual approach commented out below
                # if isinstance(module, nn.modules.linear.Linear):  # Original form for nn.Linear
                #     # model.config.initializer_range == model.distilbert.config.initializer_range => True
                #     module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #     if module.bias is not None:
                #         module.bias.data.zero_()
                # elif isinstance(module, nn.modules.normalization.LayerNorm):
                #     module.weight.data.fill_(1.0)
                #     module.bias.data.zero_()
                # elif isinstance(module, FFN):
                #     for param in [module.lin1, module.lin2]:
                #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #         if param.bias is not None:
                #             param.bias.data.zero_()
                # elif isinstance(module, MultiHeadSelfAttention):
                #     for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]:
                #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #         if param.bias is not None:
                #             param.bias.data.zero_()

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))  # For SQuAD: 50537 examples / 16 ≈ 3159 batches
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)  # Trained model
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir,
                                              tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######
    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    #### Change End #######

    #### Change Made By Xuran Wang: Add custom lines #######
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    finetuned_model_path = 'save/baseline-01/'
    #### Change End #######

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        #### Change Made By Xuran Wang: Add custom lines #######
        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        #### Change End #######

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)
        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######
        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir,
                                                   tokenizer, 'train', train_fraction)
        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)

    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)

        if args.do_train:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
            log = util.get_logger(args.save_dir, 'log_train')
            log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
            log.info("Preparing Training Data...")
            args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            trainer = Trainer(args, log)
            train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
            log.info("Preparing Validation Data...")
            val_dataset, val_dict = get_dataset(args, args.val_datasets, args.val_dir, tokenizer, 'val')
            train_loader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      sampler=RandomSampler(train_dataset))
            val_loader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    sampler=SequentialSampler(val_dataset))
            best_scores = trainer.train(model, train_loader, val_loader, val_dict)

            model_artifact = wandb.Artifact(args.run_name, type="model")
            model_artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
            run.log_artifact(model_artifact)

        if args.do_eval:
            args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util.get_logger(args.save_dir, f'log_{split_name}')
            trainer = Trainer(args, log)
            if args.checkpoint_path != "":
                model = DistilBertForQuestionAnswering.from_pretrained(args.checkpoint_path)
            else:
                checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
                model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
            model.to(args.device)
            eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))
            eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                       return_preds=True, split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
            log.info(f'Eval {results_str}')
            # Write submission file
            sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])

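# The wandb Artifact logged above can later be pulled back down for evaluation, e.g. when
# args.checkpoint_path is empty on another machine. A minimal sketch, assuming the artifact
# was logged under the run name used above and the "latest" alias; the project name and alias
# here are assumptions rather than values recorded by the training run.
import wandb

def download_checkpoint_from_wandb(run_name: str, project: str = "qa-system") -> str:
    """Download the logged model artifact and return the local checkpoint directory."""
    with wandb.init(project=project, job_type="eval") as run:
        artifact = run.use_artifact(f"{run_name}:latest", type="model")
        checkpoint_dir = artifact.download()  # local path containing the saved checkpoint files
    return checkpoint_dir

# Example usage (sketch):
# args.checkpoint_path = download_checkpoint_from_wandb(args.run_name)
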
def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    if args.do_finetune:
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        return

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

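# The do_finetune branch above freezes DistilBERT's embeddings and the first args.freeze_layer
# transformer blocks by matching parameter-name prefixes. Below is a minimal standalone sketch
# of the same idea, assuming a Hugging Face DistilBertForQuestionAnswering model; freeze_layers
# is a hypothetical stand-in for args.freeze_layer.
from transformers import DistilBertForQuestionAnswering

def freeze_bottom_layers(qa_model: DistilBertForQuestionAnswering, freeze_layers: int) -> None:
    """Freeze the embedding layer and the first `freeze_layers` transformer blocks."""
    frozen_prefixes = ["distilbert.embeddings."]
    frozen_prefixes += [f"distilbert.transformer.layer.{i}." for i in range(freeze_layers)]
    for name, param in qa_model.named_parameters():
        if any(name.startswith(prefix) for prefix in frozen_prefixes):
            param.requires_grad = False

# Example usage (sketch):
# qa_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
# freeze_bottom_layers(qa_model, freeze_layers=3)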