base_lr * 10) params_list = group_weight(params_list, model.aux_layer, BatchNorm2d, base_lr * 10) # config lr policy total_iteration = config.nepochs * config.niters_per_epoch lr_policy = PolyLR(base_lr, config.lr_power, total_iteration) optimizer = torch.optim.SGD(params_list, lr=base_lr, momentum=config.momentum, weight_decay=config.weight_decay) if engine.distributed: if torch.cuda.is_available(): model.cuda() model = DistributedDataParallel(model) else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # model = DataParallelModel(model, engine.devices) model.to(device) engine.register_state(dataloader=train_loader, model=model, optimizer=optimizer) if engine.continue_state_object: engine.restore_checkpoint() optimizer.zero_grad() model.train() for epoch in range(engine.state.epoch, config.nepochs):
def main():
    """Train a model on CIFAR-100 and evaluate it after every epoch.

    The function parses the command line, optionally initialises NCCL
    distributed training, builds the model / optimizer / LR scheduler,
    optionally resumes from a checkpoint, and then alternates
    ``train_epoch`` and ``validate`` for every scheduled epoch.

    On the process with ``args.local_rank == 0`` it also creates the output
    directory, dumps the arguments and the model description to
    ``args.yaml`` and keeps two sets of checkpoints: ``Top`` (ranked by the
    evaluation metric) and ``Last`` (ranked by epoch number).

    A ``KeyboardInterrupt`` during training is swallowed on purpose so the
    final summary is still logged.
    """
    # ``alpha`` is a module-level value recomputed at the start of each epoch.
    global alpha

    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            logging.warning('Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
            args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
                     % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on %d GPUs.' % args.num_gpu)

    # Offset the seed by the global rank so every process draws different numbers.
    torch.manual_seed(args.seed + args.rank)

    model = create_model(
        args.model,
        pretrained=args.pretrained,
        num_classes=args.num_classes,
        drop_rate=args.drop,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
        bn_eps=args.bn_eps,
        checkpoint_path=args.initial_checkpoint)

    if args.binarizable:
        Model_binary_patch(model)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0)

    if args.num_gpu > 1:
        if args.amp:
            logging.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)

    use_amp = False
    if has_apex and args.amp:
        print('Using amp.')
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    else:
        print('Do NOT use amp.')
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed', 'on' if use_amp else 'off'))

    # optionally resume from a checkpoint
    resume_state = {}
    resume_epoch = None
    if args.resume:
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            amp.load_state_dict(resume_state['amp'])
    # Drop the reference so the loaded checkpoint can be garbage collected.
    resume_state = None

    if args.freeze_binary:
        Model_freeze_binary(model)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info('Converted model to use Synchronized BatchNorm.')
            except Exception:
                # Best effort: training continues with regular BatchNorm.
                logging.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info("Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
            model = DDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)

    # Always give start_epoch a value so the code below cannot hit a NameError
    # when neither --start-epoch nor a resume epoch is available.
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    # The scheduler is treated as optional everywhere else in this function,
    # so apply the same None check before touching its attributes.
    if args.reset_lr_scheduler is not None and lr_scheduler is not None:
        lr_scheduler.base_values = len(lr_scheduler.base_values) * [args.reset_lr_scheduler]
        lr_scheduler.step(start_epoch)

    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    # Using pruner to get sparse weights
    if args.prune:
        pruner = Pruner(model, 0, 100, 0.75)
    else:
        pruner = None

    dataset_train = torchvision.datasets.CIFAR100(root='~/Downloads/CIFAR100', train=True, download=True)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes)

    loader_train = create_loader_CIFAR100(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        rand_erase_count=args.recount,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        interpolation='random',
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        is_clean_data=args.clean_train,
    )

    dataset_eval = torchvision.datasets.CIFAR100(root='~/Downloads/CIFAR100', train=False, download=True)

    loader_eval = create_loader_CIFAR100(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy(multiplier=args.softmax_multiplier).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    saver_last_10_epochs = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        os.makedirs(output_dir + '/Top')
        os.makedirs(output_dir + '/Last')
        # Save the results of the top 10 epochs
        saver = CheckpointSaver(checkpoint_dir=output_dir + '/Top', decreasing=decreasing, max_history=10)
        # Save the results of the last 10 epochs. This saver is fed the epoch
        # number as its metric, so "better" must always mean "larger";
        # inheriting `decreasing` from a loss metric would keep the *first*
        # epochs instead.
        # NOTE(review): assumes CheckpointSaver ranks checkpoints by `metric`
        # according to `decreasing` - confirm against its implementation.
        saver_last_10_epochs = CheckpointSaver(checkpoint_dir=output_dir + '/Last', decreasing=False, max_history=10)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)
            f.write('==============================')
            f.write(model.__str__())

    # Created on every rank because train_epoch() and validate() below receive
    # the writer unconditionally. Off rank 0 `output_dir` is the empty string.
    tensorboard_writer = SummaryWriter(output_dir)

    try:
        for epoch in range(start_epoch, num_epochs):
            alpha = get_alpha(epoch, args)
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            if pruner:
                pruner.on_epoch_begin(epoch)  # pruning

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, train_loss_fn, args,
                lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                use_amp=use_amp, tensorboard_writer=tensorboard_writer,
                pruner=pruner)

            if pruner:
                pruner.print_statistics()

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args,
                                    tensorboard_writer=tensorboard_writer, epoch=epoch)

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(
                epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
                write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    model, optimizer, args,
                    epoch=epoch, metric=save_metric, use_amp=use_amp)
            if saver_last_10_epochs is not None:
                # keep the checkpoints of the most recent epochs (ranked by epoch number)
                saver_last_10_epochs.save_checkpoint(
                    model, optimizer, args,
                    epoch=epoch, metric=epoch, use_amp=use_amp)

    except KeyboardInterrupt:
        # Allow a manual stop; still report what has been saved so far.
        pass

    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
    # The saver only exists on rank 0 and has entries only after at least one
    # epoch finished; skip the report otherwise instead of crashing at exit.
    if saver_last_10_epochs is not None and saver_last_10_epochs.checkpoint_files:
        logging.info('The checkpoint of the last epoch is: \n')
        logging.info(saver_last_10_epochs.checkpoint_files[0][0])
def main():
    """Fine-tune and/or evaluate a BERT sequence classifier on the "compq" task.

    Command-line driven. With ``--do_train`` the model is fine-tuned and the
    weights, config and vocabulary are written to ``--output_dir`` (only by
    the single process, or by distributed rank 0). With ``--do_eval`` the dev
    set is scored, per-example softmax probabilities are written to
    ``predictions.txt`` and the metrics to ``eval_results.txt``, both inside
    ``--output_dir``.

    Raises:
        ValueError: for an invalid ``--gradient_accumulation_steps``, when
            neither ``--do_train`` nor ``--do_eval`` is given, when training
            into a non-empty output directory, for an unknown task name, or
            when the dev set yields no batches.
        ImportError: when apex is required (distributed or fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--load_finetuned_model', action='store_true', default=False,
                        help="Load finetuned model.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "compq": COMPQProcessor,
    }

    output_modes = {
        "compq": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch; each forward pass sees a slice of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    # Honour an explicit --cache_dir (the flag used to be parsed but never
    # forwarded); without it the library's default cache location is kept.
    cache_dir = args.cache_dir if args.cache_dir else None
    if args.load_finetuned_model:
        print("Loading finetuned model....")
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                              cache_dir=cache_dir,
                                                              num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # Parameters whose name contains one of these get no weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        # Per-batch outputs are collected and concatenated once after the loop,
        # instead of re-copying the growing array with np.append every batch.
        logit_chunks = []
        softmax_chunks = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            softmax_chunks.append(Softmax(1)(logits).detach().cpu().numpy())

        if nb_eval_steps == 0:
            # Fail with a clear message instead of an opaque ZeroDivisionError.
            raise ValueError("No evaluation examples found in {}".format(args.data_dir))

        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(logit_chunks, axis=0)
        softmax_preds = np.concatenate(softmax_chunks, axis=0)

        output_prediction_file = os.path.join(args.output_dir, "predictions.txt")
        with open(output_prediction_file, 'w') as writer:
            # One line per example: tab-separated class probabilities followed
            # by the example text. Works for any number of classes; for two
            # classes the output is identical to the previous fixed layout.
            for example, probs in zip(eval_examples, softmax_preds):
                writer.write('\t'.join(str(p) for p in probs) + '\t' + example.text_a + '\n')

        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        # nb_tr_steps stays 0 when training ran for zero epochs/steps.
        loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps > 0 else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) # Propagate Training Split print("***** Running evaluation *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) pooled_output_mul_list, pooled_output_sum_list, pooled_output_t_list, pooled_output_v_list = list( ), list(), list(), list() targets_list = list() model.eval()
def main():
    """Fine-tune and/or evaluate a BERT sequence classifier (CoLA, MNLI, MRPC).

    Command-line driven. With ``--do_train`` the model is fine-tuned and its
    weights are written to ``<output_dir>/pytorch_model.bin``. The weights are
    then always re-loaded from that file, so an eval-only run expects the
    file to exist already. With ``--do_eval`` (single process or distributed
    rank 0) the dev set is scored and the results are written to
    ``<output_dir>/eval_results.txt``.

    Raises:
        ValueError: for an invalid ``--gradient_accumulation_steps``, when
            neither ``--do_train`` nor ``--do_eval`` is given, when training
            into a non-empty output directory, or for an unknown task name.
        ImportError: when apex is required (distributed or fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch; each forward pass sees a slice of it.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    # num_train_steps is None for eval-only runs; dividing it would raise a
    # TypeError before evaluation could start in distributed mode.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses.
                        # Only needed for the fp16 (FusedAdam) path: BertAdam
                        # is constructed with warmup/t_total and applies the
                        # schedule itself, so scaling the lr here as well
                        # would apply the warmup twice.
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                # Called with labels the model returns the loss, without
                # labels it returns the logits.
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        # nb_tr_steps stays 0 when training ran for zero epochs/steps.
        loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps > 0 else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Train, evaluate and run inference for the weighted BERT classifier.

    Parses the command line, configures the device / distributed backend and
    logging, builds the tokenizer and model, then:

    * with ``--do_train``: runs ``--train_steps`` loop iterations over
      ``train.csv``. Every ``eval_steps * gradient_accumulation_steps``
      iterations the running loss is logged and, with ``--do_eval``,
      ``dev.csv`` is scored; the weights with the best dev F1 are saved to
      ``<output_dir>/pytorch_model.bin``.
    * with ``--do_test``: reloads that checkpoint and writes the per-example
      logits as a ``reward`` column into copies of ``train.csv``, ``dev.csv``
      and ``test.csv`` under ``--wf_data_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--wf_data_dir", default=None, type=str, required=True,
                        help="The output data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--meta_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    # The help text used to say "Whether to run training.", which described --do_train.
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run inference with the saved model.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int,
                        help="")
    parser.add_argument("--lstm_layers", default=2, type=int,
                        help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float,
                        help="")

    parser.add_argument("--train_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--report_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int,
                        help="text split")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Report zero GPUs when CUDA is disabled; otherwise a CPU run on a
        # multi-GPU host would still be wrapped in DataParallel below.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging: only the main process logs at INFO level.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # An already existing output directory is fine; any other failure
    # (e.g. missing permissions) now surfaces here instead of being swallowed.
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    config.output_hidden_states = True

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, args, config=config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train:
        # Prepare data loader
        train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        # One stack instead of growing the tensor with torch.cat per feature.
        all_weight = torch.stack([f.weight for f in train_features])
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_weight)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_f1 = 0
        stop_gate = 0  # evaluations in a row that fell slightly below the best F1
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)
        # Loop-invariant: how many loop iterations separate two reports/evaluations.
        report_interval = args.eval_steps * args.gradient_accumulation_steps

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, weight_ids = batch
            loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                         labels=label_ids, weight_ids=weight_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # NOTE(review): this function defines no --loss_scale option, and
            # `warmup_linear` / `args.warmup_proportion` used further down are not
            # defined here either - the --fp16 path needs them supplied elsewhere; confirm.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_steps += 1

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # NOTE(review): nb_tr_steps was already incremented above, so with
            # accumulation > 1 this fires one micro-batch early - confirm intent.
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                # Update the weights first, then advance the schedule; calling the
                # scheduler first skips the initial learning-rate value.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % report_interval == 0:
                tr_loss = 0
                nb_tr_steps = 0
                logger.info("***** Report result *****")
                logger.info(" %s = %s", 'global_step', str(global_step))
                logger.info(" %s = %s", 'train loss', str(train_loss))

            if args.do_eval and (step + 1) % report_interval == 0:
                for file in ['dev.csv']:
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length, args.split_num, False)
                    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
                    all_weight = torch.stack([f.weight for f in eval_features])
                    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_weight)

                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(eval_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss = 0
                    nb_eval_steps = 0
                    for input_ids, input_mask, segment_ids, label_ids, weight_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        weight_ids = weight_ids.to(device)

                        with torch.no_grad():
                            # Called with labels the model yields the loss, without them the logits.
                            tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                                  attention_mask=input_mask, labels=label_ids,
                                                  weight_ids=weight_ids)
                            logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                           attention_mask=input_mask, weight_ids=weight_ids)

                        gold_labels.append(label_ids.to('cpu').numpy())
                        inference_logits.append(logits.detach().cpu().numpy())
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_f1 = f1(inference_logits, gold_labels)
                    eval_acc = acc(inference_logits, gold_labels)
                    eval_recall = recall(inference_logits, gold_labels)
                    eval_precision = precision(inference_logits, gold_labels)

                    result = {'eval_loss': eval_loss,
                              'eval_F1': eval_f1,
                              'eval_acc': eval_acc,
                              'eval_recall': eval_recall,
                              'eval_precision': eval_precision,
                              'global_step': global_step,
                              'loss': train_loss}

                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info(" %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')

                    # Count evaluations that land just below (within 0.05 of) the best F1.
                    if abs(eval_f1 - best_f1) < 0.05 and eval_f1 < best_f1 and 'dev' in file:
                        stop_gate += 1

                    if eval_f1 > best_f1 and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_f1)
                        print("Accuracy", eval_acc)
                        print("Saving Model......")
                        best_f1 = eval_f1
                        stop_gate = 0
                        # Save a trained model
                        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)

                # Checked outside the per-file loop so that `break` leaves the
                # training loop itself.
                if stop_gate >= 10 and global_step > num_train_optimization_steps * 0.6:
                    print("Early Stopping. Bye~")
                    break

    if args.do_test:
        # Drop the training-time model before loading the saved checkpoint.
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError as err:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                ) from err
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        os.makedirs(args.wf_data_dir, exist_ok=True)

        # NOTE(review): 'test.csv' is tagged 'dev', so the `flag == 'test'` branch
        # below (which writes sub.csv) never runs - confirm whether that is intended.
        for file, flag in [('train.csv', 'train'), ('dev.csv', 'dev'), ('test.csv', 'dev')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            all_title_len = torch.tensor([f.title_len for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_title_len)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            for input_ids, input_mask, segment_ids, label_ids, title_lens in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                title_lens = title_lens.to(device)

                with torch.no_grad():
                    logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask, title_lens=title_lens).detach().cpu().numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids.to('cpu').numpy())

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            # One list of logits per example, stored as the 'reward' column.
            rewards = logits.tolist()

            df = pd.read_csv(os.path.join(args.data_dir, file))
            if len(rewards) != len(df):
                print("Error in length of rewards")
                print(len(rewards), len(df), gold_labels.shape)
            df['reward'] = rewards
            df[['id', 'new_content', 'title', 'label', 'weight', 'reward']].to_csv(
                os.path.join(args.wf_data_dir, file), index=False)
            print("write to ", os.path.join(args.wf_data_dir, file))

            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1', 'label_2']].to_csv(
                    os.path.join(args.output_dir, "sub.csv"), index=False)
def main():
    """Parse the command line and launch FixMatch training of the dual model.

    Sets up the device / distributed backend, logging, the labeled,
    unlabeled and test data loaders, the model with one SGD optimizer and
    cosine schedule per sub-model (``left_model`` / ``right_model``), the
    optional EMA wrapper, restores state from ``--resume`` when given, and
    finally hands everything to ``train``.
    """
    global best_acc

    parser = argparse.ArgumentParser(description='PyTorch FixMatch Training')
    parser.add_argument('--gpu-id', default=0, type=int,
                        help='id(s) for CUDA_VISIBLE_DEVICES')
    parser.add_argument('--num-workers', type=int, default=4,
                        help='number of workers')
    parser.add_argument('--dataset', default='cifar10', type=str,
                        choices=['cifar10', 'cifar100'],
                        help='dataset name')
    parser.add_argument('--num-labeled', type=int, default=4000,
                        help='number of labeled data')
    parser.add_argument("--expand-labels", action="store_true",
                        help="expand labels to fit eval steps")
    # The help text used to read 'dataset name', copied from --dataset.
    parser.add_argument('--arch', default='wideresnet', type=str,
                        choices=['wideresnet', 'resnext'],
                        help='model architecture name')
    parser.add_argument('--total-steps', default=2**20, type=int,
                        help='number of total steps to run')
    parser.add_argument('--eval-step', default=1024, type=int,
                        help='number of eval steps to run')
    parser.add_argument('--start-epoch', default=0, type=int,
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('--batch-size', default=64, type=int,
                        help='train batchsize')
    parser.add_argument('--lr', '--learning-rate', default=0.03, type=float,
                        help='initial learning rate')
    parser.add_argument('--warmup', default=0, type=float,
                        help='warmup epochs (unlabeled data based)')
    parser.add_argument('--wdecay', default=5e-4, type=float,
                        help='weight decay')
    parser.add_argument('--nesterov', action='store_true', default=True,
                        help='use nesterov momentum')
    parser.add_argument('--use-ema', action='store_true', default=True,
                        help='use EMA model')
    parser.add_argument('--ema-decay', default=0.999, type=float,
                        help='EMA decay rate')
    parser.add_argument('--mu', default=7, type=int,
                        help='coefficient of unlabeled batch size')
    parser.add_argument('--lambda-u', default=1, type=float,
                        help='coefficient of unlabeled loss')
    parser.add_argument('--T', default=1, type=float,
                        help='pseudo label temperature')
    parser.add_argument('--threshold', default=0.95, type=float,
                        help='pseudo label threshold')
    parser.add_argument('--out', default='result',
                        help='directory to output the result')
    parser.add_argument('--resume', default='', type=str,
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--seed', default=None, type=int,
                        help="random seed")
    parser.add_argument("--amp", action="store_true",
                        help="use 16-bit (mixed) precision through NVIDIA apex AMP")
    parser.add_argument("--opt_level", type=str, default="O0",
                        help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--no-progress', action='store_true',
                        help="don't use progress bar")

    args = parser.parse_args()

    def create_model(args):
        """Build the network selected by ``args.arch`` and log its parameter count."""
        if args.arch == 'wideresnet':
            import models.wideresnet as models
            model = models.build_dualwideresnet(depth=args.model_depth,
                                                widen_factor=args.model_width,
                                                dropout=0,
                                                num_classes=args.num_classes)
        elif args.arch == 'resnext':
            import models.resnext as models
            model = models.build_resnext(cardinality=args.model_cardinality,
                                         depth=args.model_depth,
                                         width=args.model_width,
                                         num_classes=args.num_classes)
        logger.info("Total params: {:.2f}M".format(
            sum(p.numel() for p in model.parameters()) / 1e6))
        return model

    if args.local_rank == -1:
        device = torch.device('cuda', args.gpu_id)
        args.world_size = 1
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.world_size = torch.distributed.get_world_size()
        args.n_gpu = 1

    args.device = device

    # Only the main process logs at INFO level.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.warning(
        f"Process rank: {args.local_rank}, "
        f"device: {args.device}, "
        f"n_gpu: {args.n_gpu}, "
        f"distributed training: {bool(args.local_rank != -1)}, "
        f"16-bits training: {args.amp}",
    )

    logger.info(dict(args._get_kwargs()))

    if args.seed is not None:
        set_seed(args)

    # Only the main process creates the output directory and the summary writer.
    if args.local_rank in [-1, 0]:
        os.makedirs(args.out, exist_ok=True)
        writer = SummaryWriter(args.out)
    else:
        writer = None

    # Per-dataset class count and per-architecture model hyper-parameters.
    if args.dataset == 'cifar10':
        args.num_classes = 10
        if args.arch == 'wideresnet':
            args.model_depth = 28
            args.model_width = 2
        elif args.arch == 'resnext':
            args.model_cardinality = 4
            args.model_depth = 28
            args.model_width = 4
    elif args.dataset == 'cifar100':
        args.num_classes = 100
        if args.arch == 'wideresnet':
            args.model_depth = 28
            args.model_width = 8
        elif args.arch == 'resnext':
            args.model_cardinality = 8
            args.model_depth = 29
            args.model_width = 64

    labeled_dataset, unlabeled_dataset, test_dataset = DATASET_GETTERS[
        args.dataset](args, os.path.join(root_dir, 'data'))

    train_sampler = RandomSampler if args.local_rank == -1 else DistributedSampler

    labeled_trainloader = DataLoader(labeled_dataset,
                                     sampler=train_sampler(labeled_dataset),
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers,
                                     drop_last=True)

    # The unlabeled batch is `mu` times larger than the labeled one.
    unlabeled_trainloader = DataLoader(unlabeled_dataset,
                                       sampler=train_sampler(unlabeled_dataset),
                                       batch_size=args.batch_size * args.mu,
                                       num_workers=args.num_workers,
                                       drop_last=True)

    test_loader = DataLoader(test_dataset,
                             sampler=SequentialSampler(test_dataset),
                             batch_size=args.batch_size,
                             num_workers=args.num_workers)

    # Non-zero ranks wait here until rank 0 has built its model and
    # reached the matching barrier below.
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    model = create_model(args)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(args.device)

    # Parameters whose name contains 'bias' or 'bn' are excluded from weight decay.
    no_decay = ['bias', 'bn']
    grouped_parameters = [
        {'params': [p for n, p in model.left_model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.wdecay},
        {'params': [p for n, p in model.left_model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = optim.SGD(grouped_parameters, lr=args.lr,
                          momentum=0.9, nesterov=args.nesterov)

    grouped_parameters_right = [
        {'params': [p for n, p in model.right_model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.wdecay},
        {'params': [p for n, p in model.right_model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer_right = optim.SGD(grouped_parameters_right, lr=args.lr,
                                momentum=0.9, nesterov=args.nesterov)

    args.epochs = math.ceil(args.total_steps / args.eval_step)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, args.warmup, args.total_steps)
    scheduler_right = get_cosine_schedule_with_warmup(
        optimizer_right, args.warmup, args.total_steps)

    # Always bind the name so the train() call below cannot hit a NameError
    # when EMA is disabled.
    ema_model = None
    if args.use_ema:
        from models.ema import ModelEMA
        ema_model = ModelEMA(args, model, args.ema_decay)

    args.start_epoch = 0

    if args.resume:
        logger.info("==> Resuming from checkpoint..")
        assert os.path.isfile(
            args.resume), "Error: no checkpoint directory found!"
        args.out = os.path.dirname(args.resume)
        # Load onto the CPU first; load_state_dict copies the values to the
        # device each module / optimizer already lives on, so the checkpoint is
        # not materialised on whichever GPU it was saved from.
        checkpoint = torch.load(args.resume, map_location='cpu')
        best_acc = checkpoint['best_acc']
        args.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        if args.use_ema:
            ema_model.ema.load_state_dict(checkpoint['ema_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        optimizer_right.load_state_dict(checkpoint['optimizer_right'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # The right-hand schedule was never restored, so it restarted from
        # step 0 after a resume. Restore it when the checkpoint carries it.
        if 'scheduler_right' in checkpoint:
            scheduler_right.load_state_dict(checkpoint['scheduler_right'])
        else:
            logger.warning(
                "Checkpoint has no 'scheduler_right' entry; "
                "the right-model LR schedule restarts from step 0.")

    # if args.amp:
    #     from apex import amp
    #     model, optimizer = amp.initialize(
    #         model, optimizer, opt_level=args.opt_level)

    if args.local_rank != -1:
        model = DistributedDataParallel(model)
        # model = torch.nn.parallel.DistributedDataParallel(
        #     model, device_ids=[args.local_rank],
        #     output_device=args.local_rank, find_unused_parameters=False)

    logger.info("***** Running training *****")
    logger.info(f" Task = {args.dataset}@{args.num_labeled}")
    logger.info(f" Num Epochs = {args.epochs}")
    logger.info(f" Batch size per GPU = {args.batch_size}")
    logger.info(
        f" Total train batch size = {args.batch_size*args.world_size}")
    logger.info(f" Total optimization steps = {args.total_steps}")

    model.zero_grad()
    train(args, labeled_trainloader, unlabeled_trainloader, test_loader,
          model, optimizer, optimizer_right, ema_model, scheduler,
          scheduler_right, writer)
def main():
    """Run training process.

    Parses the command line, sets up the device, optional distributed
    training and logging, merges the YAML config with the command-line
    arguments and saves the result to ``<outdir>/config.yml``, builds the
    datasets, data loaders, generator / discriminator models, criteria,
    optimizers and schedulers, then runs ``Trainer``. On ``KeyboardInterrupt``
    a checkpoint is saved before returning.

    Raises:
        ValueError: If the train (or dev) data is specified both, or neither,
            via dumpdir and feats scp, or if the config ``format`` is neither
            ``hdf5`` nor ``npy`` when it is needed.
        ImportError: If distributed training is requested without apex.
    """
    parser = argparse.ArgumentParser(
        description="Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py).")
    parser.add_argument("--train-wav-scp", default=None, type=str,
                        help="kaldi-style wav.scp file for training. "
                             "you need to specify either train-*-scp or train-dumpdir.")
    parser.add_argument("--train-feats-scp", default=None, type=str,
                        help="kaldi-style feats.scp file for training. "
                             "you need to specify either train-*-scp or train-dumpdir.")
    parser.add_argument("--train-segments", default=None, type=str,
                        help="kaldi-style segments file for training.")
    parser.add_argument("--train-dumpdir", default=None, type=str,
                        help="directory including training data. "
                             "you need to specify either train-*-scp or train-dumpdir.")
    parser.add_argument("--dev-wav-scp", default=None, type=str,
                        help="kaldi-style wav.scp file for validation. "
                             "you need to specify either dev-*-scp or dev-dumpdir.")
    parser.add_argument("--dev-feats-scp", default=None, type=str,
                        help="kaldi-style feats.scp file for vaidation. "
                             "you need to specify either dev-*-scp or dev-dumpdir.")
    parser.add_argument("--dev-segments", default=None, type=str,
                        help="kaldi-style segments file for validation.")
    parser.add_argument("--dev-dumpdir", default=None, type=str,
                        help="directory including development data. "
                             "you need to specify either dev-*-scp or dev-dumpdir.")
    parser.add_argument("--outdir", type=str, required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--pretrain", default="", type=str, nargs="?",
                        help="checkpoint file path to load pretrained params. (default=\"\")")
    parser.add_argument("--resume", default="", type=str, nargs="?",
                        help="checkpoint file path to resume training. (default=\"\")")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")
    parser.add_argument("--rank", "--local_rank", default=0, type=int,
                        help="rank for distributed training. no need to explictly specify.")
    args = parser.parse_args()

    args.distributed = False
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")
        # effective when using fixed size inputs
        # see https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        torch.backends.cudnn.benchmark = True
        torch.cuda.set_device(args.rank)
        # setup for distributed training
        # see example: https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed
        if "WORLD_SIZE" in os.environ:
            args.world_size = int(os.environ["WORLD_SIZE"])
            args.distributed = args.world_size > 1
        if args.distributed:
            torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # suppress logging for distributed training: every rank except 0 writes
    # its stdout (and therefore the log stream below) to the null device
    if args.rank != 0:
        sys.stdout = open(os.devnull, "w")

    # set logger: the three verbosity levels differ only in the log level
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    if log_level == logging.WARN:
        logging.warning("Skip DEBUG/INFO messages")

    # make sure the output directory exists; exist_ok avoids the
    # check-then-create race between ranks starting at the same time
    os.makedirs(args.outdir, exist_ok=True)

    # check arguments: exactly one of feats scp / dumpdir per data split
    if (args.train_feats_scp is not None and args.train_dumpdir is not None) or \
            (args.train_feats_scp is None and args.train_dumpdir is None):
        raise ValueError(
            "Please specify either --train-dumpdir or --train-*-scp.")
    if (args.dev_feats_scp is not None and args.dev_dumpdir is not None) or \
            (args.dev_feats_scp is None and args.dev_dumpdir is None):
        raise ValueError("Please specify either --dev-dumpdir or --dev-*-scp.")

    # load and save config
    # NOTE: yaml.Loader can construct arbitrary Python objects, so only
    # trusted configuration files should be passed via --config.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))  # command-line values override the file
    config["version"] = parallel_wavegan.__version__  # add version info
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] + \
            2 * config["generator_params"].get("aux_context_window", 0)
    else:
        mel_length_threshold = None
    if args.train_wav_scp is None or args.dev_wav_scp is None:
        # at least one split is read from a dump directory, so the file
        # queries and loaders for the dumped format are needed
        if config["format"] == "hdf5":
            audio_query, mel_query = "*.h5", "*.h5"
            audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            audio_query, mel_query = "*-wave.npy", "*-feats.npy"
            audio_load_fn = np.load
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
    if args.train_dumpdir is not None:
        train_dataset = AudioMelDataset(
            root_dir=args.train_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        )
    else:
        train_dataset = AudioMelSCPDataset(
            wav_scp=args.train_wav_scp,
            feats_scp=args.train_feats_scp,
            segments=args.train_segments,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        )
    logging.info(f"The number of training files = {len(train_dataset)}.")
    if args.dev_dumpdir is not None:
        dev_dataset = AudioMelDataset(
            root_dir=args.dev_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        )
    else:
        dev_dataset = AudioMelSCPDataset(
            wav_scp=args.dev_wav_scp,
            feats_scp=args.dev_feats_scp,
            segments=args.dev_segments,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        )
    logging.info(f"The number of development files = {len(dev_dataset)}.")
    dataset = {
        "train": train_dataset,
        "dev": dev_dataset,
    }

    # get data loader
    collater = Collater(
        batch_max_steps=config["batch_max_steps"],
        hop_size=config["hop_size"],
        # keep compatibility
        aux_context_window=config["generator_params"].get("aux_context_window", 0),
        # keep compatibility
        use_noise_input=config.get(
            "generator_type", "ParallelWaveGANGenerator") != "MelGANGenerator",
    )
    train_sampler, dev_sampler = None, None
    if args.distributed:
        # setup sampler for distributed training
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(
            dataset=dataset["train"],
            num_replicas=args.world_size,
            rank=args.rank,
            shuffle=True,
        )
        dev_sampler = DistributedSampler(
            dataset=dataset["dev"],
            num_replicas=args.world_size,
            rank=args.rank,
            shuffle=False,
        )
    data_loader = {
        "train": DataLoader(
            dataset=dataset["train"],
            # the sampler handles ordering in distributed mode
            shuffle=False if args.distributed else True,
            collate_fn=collater,
            batch_size=config["batch_size"],
            num_workers=config["num_workers"],
            sampler=train_sampler,
            pin_memory=config["pin_memory"],
        ),
        "dev": DataLoader(
            dataset=dataset["dev"],
            shuffle=False if args.distributed else True,
            collate_fn=collater,
            batch_size=config["batch_size"],
            num_workers=config["num_workers"],
            sampler=dev_sampler,
            pin_memory=config["pin_memory"],
        ),
    }

    # define models and optimizers
    generator_class = getattr(
        parallel_wavegan.models,
        # keep compatibility
        config.get("generator_type", "ParallelWaveGANGenerator"),
    )
    discriminator_class = getattr(
        parallel_wavegan.models,
        # keep compatibility
        config.get("discriminator_type", "ParallelWaveGANDiscriminator"),
    )
    model = {
        "generator": generator_class(
            **config["generator_params"]).to(device),
        "discriminator": discriminator_class(
            **config["discriminator_params"]).to(device),
    }
    criterion = {
        "stft": MultiResolutionSTFTLoss(
            **config["stft_loss_params"]).to(device),
        "mse": torch.nn.MSELoss().to(device),
    }
    if config.get("use_feat_match_loss", False):  # keep compatibility
        criterion["l1"] = torch.nn.L1Loss().to(device)
    optimizer = {
        "generator": RAdam(
            model["generator"].parameters(),
            **config["generator_optimizer_params"]),
        "discriminator": RAdam(
            model["discriminator"].parameters(),
            **config["discriminator_optimizer_params"]),
    }
    scheduler = {
        "generator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["generator"],
            **config["generator_scheduler_params"]),
        "discriminator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["discriminator"],
            **config["discriminator_scheduler_params"]),
    }
    if args.distributed:
        # wrap model for distributed training
        try:
            from apex.parallel import DistributedDataParallel
        except ImportError as err:
            raise ImportError(
                "apex is not installed. please check https://github.com/NVIDIA/apex."
            ) from err
        model["generator"] = DistributedDataParallel(model["generator"])
        model["discriminator"] = DistributedDataParallel(model["discriminator"])
    logging.info(model["generator"])
    logging.info(model["discriminator"])

    # define trainer
    trainer = Trainer(
        steps=0,
        epochs=0,
        data_loader=data_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config,
        device=device,
    )

    # load pretrained parameters from checkpoint
    # (nargs="?" yields None when the flag is given without a value, so test
    # truthiness instead of calling len() on it)
    if args.pretrain:
        trainer.load_checkpoint(args.pretrain, load_only_params=True)
        logging.info(f"Successfully load parameters from {args.pretrain}.")

    # resume from checkpoint
    if args.resume:
        trainer.load_checkpoint(args.resume)
        logging.info(f"Successfully resumed from {args.resume}.")

    # run training loop; an interrupt still leaves a usable checkpoint behind
    try:
        trainer.run()
    except KeyboardInterrupt:
        trainer.save_checkpoint(
            os.path.join(config["outdir"], f"checkpoint-{trainer.steps}steps.pkl"))
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
def _build_mm_model(args, num_labels, **load_kwargs):
    """Instantiate the multimodal classifier selected by ``args.mm_model``.

    ``load_kwargs`` is forwarded unchanged to ``from_pretrained`` (the caller
    passes ``cache_dir=...`` for the initial load and ``state_dict=...`` when
    reloading the fine-tuned weights). Any unrecognised ``args.mm_model``
    value falls back to TomBert.
    """
    if args.mm_model == 'ResBert':
        return ResBertForMMSequenceClassification.from_pretrained(
            args.bert_model,
            num_labels=num_labels,
            bert_layer=args.bertlayer,
            tfn=args.tfn,
            **load_kwargs)
    if args.mm_model == 'MBert':
        return MBertForMMSequenceClassification.from_pretrained(
            args.bert_model,
            num_labels=num_labels,
            pooling=args.pooling,
            **load_kwargs)
    if args.mm_model == 'MBertNoPooling':
        return MBertNoPoolingForMMSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels, **load_kwargs)
    if args.mm_model == 'TomBertNoPooling':
        return TomBertNoPoolingForMMSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels, **load_kwargs)
    # TomBert by default
    return TomBertForMMSequenceClassification.from_pretrained(
        args.bert_model,
        num_labels=num_labels,
        pooling=args.pooling,
        **load_kwargs)


def _features_to_dataset(features):
    """Pack converted multimodal features into a ``TensorDataset``.

    Tensor order matches the unpacking order used by the training and
    evaluation loops: input_ids, input_mask, added_input_mask, segment_ids,
    s2_input_ids, s2_input_mask, s2_segment_ids, img_feats, label_ids.
    """

    def long_tensor(field):
        return torch.tensor([getattr(f, field) for f in features],
                            dtype=torch.long)

    return TensorDataset(
        long_tensor('input_ids'),
        long_tensor('input_mask'),
        long_tensor('added_input_mask'),
        long_tensor('segment_ids'),
        long_tensor('s2_input_ids'),
        long_tensor('s2_input_mask'),
        long_tensor('s2_segment_ids'),
        torch.stack([f.img_feat for f in features]),
        long_tensor('label_id'))


def _evaluate_mm(model, encoder, eval_dataloader, device):
    """Run one full evaluation pass over ``eval_dataloader``.

    Puts ``model`` and ``encoder`` in eval mode, and for every batch calls the
    model once with labels (used as the loss) and once without (used as the
    logits), exactly as the original inline loops did.

    Returns:
        ``(eval_loss, eval_accuracy, true_label, pred_outputs)`` where
        ``eval_loss`` is averaged over batches, ``eval_accuracy`` is the
        accumulated ``accuracy(...)`` value divided by the number of examples,
        and the last two are the concatenated label ids and logits.
    """
    model.eval()
    encoder.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    true_label_list = []
    pred_label_list = []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        (input_ids, input_mask, added_input_mask, segment_ids, s2_input_ids,
         s2_input_mask, s2_segment_ids, img_feats,
         label_ids) = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            # Only the third encoder output is consumed by the classifier.
            _, _, img_att = encoder(img_feats)
            tmp_eval_loss = model(input_ids, s2_input_ids, img_att,
                                  segment_ids, s2_segment_ids, input_mask,
                                  s2_input_mask, added_input_mask, label_ids)
            logits = model(input_ids, s2_input_ids, img_att, segment_ids,
                           s2_segment_ids, input_mask, s2_input_mask,
                           added_input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        true_label_list.append(label_ids)
        pred_label_list.append(logits)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += accuracy(logits, label_ids)
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    true_label = np.concatenate(true_label_list)
    pred_outputs = np.concatenate(pred_label_list)
    return eval_loss, eval_accuracy, true_label, pred_outputs


def main():
    """Train and/or evaluate a multimodal BERT sentiment classifier.

    Parses command-line arguments, builds the selected text model plus a
    ResNet-152 image encoder, and then:

    * with ``--do_train``: trains for ``num_train_epochs`` epochs, evaluates on
      the dev set after every epoch and saves model/encoder weights whenever
      dev accuracy is at least the best seen so far;
    * always: reloads ``pytorch_model.bin`` / ``pytorch_encoder.bin`` from
      ``output_dir``;
    * with ``--do_eval`` (rank 0 only when distributed): evaluates on the test
      set and writes ``pred.txt``, ``true.txt`` and ``eval_results.txt``.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when neither
            ``--do_train`` nor ``--do_eval`` is given, when training into a
            non-empty output directory, or for an unknown task name.
        ImportError: when apex is required (distributed or fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='../absa_data/twitter',
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default='twitter',
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_entity_length",
        default=16,
        type=int,
        help="The maximum entity input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=8.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--fine_tune_cnn',
                        action='store_true',
                        help='fine tune pre-trained CNN if True')
    parser.add_argument('--resnet_root',
                        default='./resnet',
                        help='path the pre-trained cnn models')
    parser.add_argument('--crop_size',
                        type=int,
                        default=224,
                        help='crop size of image')
    parser.add_argument(
        '--path_image',
        default='../pytorch-pretrained-BERT/twitter_subimages/',
        help='path to images')
    # TomBert, TomBertNoPooling, MBert, MBertNoPooling, ResBert
    parser.add_argument('--mm_model', default='TomBert', help='model name')
    # first, cls, concat
    parser.add_argument('--pooling', default='first', help='pooling method')
    parser.add_argument('--bertlayer',
                        action='store_true',
                        help='whether to add another bert layer')
    parser.add_argument('--tfn',
                        action='store_true',
                        help='whether to use TFN')
    args = parser.parse_args()

    print("**************current model: " + args.mm_model +
          "******************")
    if args.mm_model == "ResBert" and args.bertlayer:
        print("add another bert layer")
    if args.mm_model == "ResBert" and args.tfn:
        print("add another tfn layer")
    elif args.mm_model == "TomBert" or args.mm_model == "MBert":
        print("pooling method: " + args.pooling)
    print("*" * 50)

    # The image directory is derived from the task name and overrides
    # whatever was passed via --path_image.
    if args.task_name == "twitter":  # this refers to twitter-2017 dataset
        args.path_image = "../pytorch-pretrained-BERT/twitter_subimages/"
    elif args.task_name == "twitter2015":  # this refers to twitter-2015 dataset
        args.path_image = "../pytorch-pretrained-BERT/twitter15_images/"
    else:
        print("The task name is not right!")

    processors = {
        "twitter2015": AbmsaProcessor,  # our twitter-2015 dataset
        "twitter": AbmsaProcessor  # our twitter-2017 dataset
    }

    num_labels_task = {
        "twitter2015": 3,  # our twitter-2015 dataset
        "twitter": 3  # our twitter-2017 dataset
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass batch size.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = _build_mm_model(
        args,
        num_labels,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))
    net = getattr(resnet, 'resnet152')()
    net.load_state_dict(
        torch.load(os.path.join(args.resnet_root, 'resnet152.pth')))
    encoder = myResnet(net, args.fine_tune_cnn, device)
    if args.fp16:
        model.half()
        encoder.half()
    model.to(device)
    encoder.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
        encoder = DDP(encoder)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
        encoder = torch.nn.DataParallel(encoder)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    # num_train_steps is None in an eval-only run; skip the per-process
    # division there instead of failing with `None // int`.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    output_encoder_file = os.path.join(args.output_dir, "pytorch_encoder.bin")
    if args.do_train:
        train_features = convert_mm_examples_to_features(
            train_examples, label_list, args.max_seq_length,
            args.max_entity_length, tokenizer, args.crop_size,
            args.path_image)
        train_data = _features_to_dataset(train_features)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # Dev set, evaluated after every epoch for model selection.
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_mm_examples_to_features(
            eval_examples, label_list, args.max_seq_length,
            args.max_entity_length, tokenizer, args.crop_size,
            args.path_image)
        eval_data = _features_to_dataset(eval_features)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        max_acc = 0.0
        logger.info("*************** Running training ***************")
        for train_idx in trange(int(args.num_train_epochs), desc="Epoch"):

            logger.info("********** Epoch: " + str(train_idx) + " **********")
            logger.info(" Num examples = %d", len(train_examples))
            logger.info(" Batch size = %d", args.train_batch_size)
            logger.info(" Num steps = %d", num_train_steps)
            model.train()
            encoder.train()
            encoder.zero_grad()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                (input_ids, input_mask, added_input_mask, segment_ids,
                 s2_input_ids, s2_input_mask, s2_segment_ids, img_feats,
                 label_ids) = tuple(t.to(device) for t in batch)
                # The image encoder runs without gradient tracking here.
                with torch.no_grad():
                    _, _, img_att = encoder(img_feats)
                model_inputs = (input_ids, s2_input_ids, img_att, segment_ids,
                                s2_segment_ids, input_mask, s2_input_mask,
                                added_input_mask, label_ids)
                if train_idx == 0 and step == 0:
                    # The very first step passes an extra positional True
                    # (NOTE(review): meaning is defined by the model's
                    # forward(); confirm there).
                    loss = model(*model_inputs, True)
                else:
                    loss = model(*model_inputs)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            logger.info("***** Running evaluation on Dev Set*****")
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)
            eval_loss, eval_accuracy, true_label, pred_outputs = _evaluate_mm(
                model, encoder, eval_dataloader, device)
            # Guard against an epoch that produced no training step.
            loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps else None
            precision, recall, F_score = macro_f1(true_label, pred_outputs)
            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'f_score': F_score,
                'global_step': global_step,
                'loss': loss
            }

            logger.info("***** Dev Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))

            if eval_accuracy >= max_acc:
                # Save a trained model
                # Only save the model it-self (unwrap DDP / DataParallel)
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                encoder_to_save = encoder.module if hasattr(
                    encoder, 'module') else encoder
                if args.do_train:
                    torch.save(model_to_save.state_dict(), output_model_file)
                    torch.save(encoder_to_save.state_dict(),
                               output_encoder_file)
                max_acc = eval_accuracy

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = _build_mm_model(args, num_labels, state_dict=model_state_dict)
    model.to(device)
    encoder_state_dict = torch.load(output_encoder_file)
    encoder.load_state_dict(encoder_state_dict)
    encoder.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_mm_examples_to_features(
            eval_examples, label_list, args.max_seq_length,
            args.max_entity_length, tokenizer, args.crop_size,
            args.path_image)
        logger.info("***** Running evaluation on Test Set*****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_data = _features_to_dataset(eval_features)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        eval_loss, eval_accuracy, true_label, pred_outputs = _evaluate_mm(
            model, encoder, eval_dataloader, device)
        # tr_loss / nb_tr_steps are those of the last training epoch; guard
        # against zero steps (e.g. num_train_epochs < 1).
        loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps else None
        precision, recall, F_score = macro_f1(true_label, pred_outputs)
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'precision': precision,
            'recall': recall,
            'f_score': F_score,
            'global_step': global_step,
            'loss': loss
        }

        pred_label = np.argmax(pred_outputs, axis=-1)
        # Context managers guarantee the files are closed even if a write
        # fails part-way through.
        with open(os.path.join(args.output_dir, "pred.txt"), 'w') as fout_p:
            for label in pred_label:
                fout_p.write(str(label) + '\n')
        with open(os.path.join(args.output_dir, "true.txt"), 'w') as fout_t:
            for label in true_label:
                fout_t.write(str(label) + '\n')

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Test Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# compensating the averaging over processes done by DDP # in order to produce mathematically equivalent result # https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368 for param in bn.parameters(): param.grad = param.grad / args.world_size bn_opt = optim.SGD(bn.parameters(), lr=1.0) sbn = apex.parallel.SyncBatchNorm(feature_size).cuda() sbn.momentum = 1.0 sbn.weight.data = weight_t.clone() sbn.bias.data = bias_t.clone() if args.fp16: sbn.half() if args.fp64: sbn.double() sbn = DDP(sbn) sbn_opt = optim.SGD(sbn.parameters(), lr=1.0) inp_sbn = inp_t.clone().requires_grad_() grad_sbn = grad_output_t.clone().detach() out_sbn = sbn(inp_sbn[start:finish]) out_sbn.backward(grad_sbn[start:finish]) count = [ space_size**2 * ((i + 1) * batch_size // args.world_size - i * batch_size // args.world_size) for i in range(0, args.world_size) ] count = torch.cuda.IntTensor(count) print("--- count : ", count)
def main():
    """Train the model selected by ``args.model_name`` (Tacotron 2 entry point).

    Reads rank/world size from the ``LOCAL_RANK`` / ``WORLD_SIZE`` environment
    variables when both are set, otherwise from the parsed arguments. Rank 0
    owns the TensorBoard writer and the DLLogger backends. The function then
    builds the model, optimizer, criterion and data loader, optionally restores
    a checkpoint, and runs the epoch loop with validation and periodic
    checkpointing.

    Raises:
        Exception: if the (reduced) training loss becomes NaN.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size

    # Only rank 0 writes TensorBoard summaries.
    if local_rank == 0:
        writer = SummaryWriter(args.tensorboard_path)
    else:
        writer = None

    distributed_run = world_size > 1

    if local_rank == 0:
        log_file = os.path.join(args.output, args.log_file)
        DLLogger.init(backends=[
            JSONStreamBackend(Verbosity.DEFAULT, log_file),
            StdOutBackend(Verbosity.VERBOSE)
        ])
    else:
        DLLogger.init(backends=[])

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Second parsing pass: the model-specific options depend on model_name.
    model_name = args.model_name
    parser = models.model_parser(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(
        model_name,
        model_config,
        cpu_run=False,
        uniform_initialize_bn_weight=not args.
        disable_uniform_initialize_bn_weight)

    if args.freeze_encoder:
        for param in model.encoder.parameters():
            param.requires_grad = False

    # Without AMP the model is wrapped before the optimizer is created; with
    # AMP it is wrapped after amp.initialize() below.
    if not args.amp and distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if distributed_run:
            model = DDP(model)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    # One-element list so load_checkpoint can hand the epoch back in place.
    start_epoch = [0]

    # Compare by value: `is not ""` tests object identity, which is
    # implementation-dependent for strings and a SyntaxWarning on Python 3.8+.
    if args.checkpoint_path != "":
        load_checkpoint(model, optimizer, start_epoch, model_config, args.amp,
                        args.checkpoint_path, local_rank)
    elif args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(
            args.output, model_name)
        load_checkpoint(model, optimizer, start_epoch, model_config, args.amp,
                        args.checkpoint_path, local_rank)

    start_epoch = start_epoch[0]

    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(model_name,
                                                     n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    if distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    train_epoch_items_per_sec = 0.0
    val_loss = 0.0
    # Initialised so the final summary below does not hit an unbound name
    # when the epoch loop does not run (start_epoch >= args.epochs).
    val_items_per_sec = 0.0
    num_iters = 0

    model.train()

    for epoch in range(start_epoch, args.epochs):
        torch.cuda.synchronize()
        epoch_start_time = time.perf_counter()
        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        train_epoch_items_per_sec = 0.0

        num_iters = 0
        reduced_loss = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            torch.cuda.synchronize()
            iter_start_time = time.perf_counter()
            DLLogger.log(step=(epoch, i),
                         data={
                             'glob_iter/iters_per_epoch':
                             str(iteration) + "/" + str(len(train_loader))
                         })

            adjust_learning_rate(iteration, epoch, optimizer,
                                 args.learning_rate, args.anneal_steps,
                                 args.anneal_factor, local_rank)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            y_pred = model(x)
            loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            DLLogger.log(step=(epoch, i), data={'train_loss': reduced_loss})
            if writer:
                writer.add_scalar("train/loss", reduced_loss, iteration)

            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            # Gradient clipping runs on the AMP master params when AMP is on,
            # on the model params otherwise.
            if args.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.grad_clip_thresh)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.grad_clip_thresh)

            optimizer.step()

            torch.cuda.synchronize()
            iter_stop_time = time.perf_counter()
            iter_time = iter_stop_time - iter_start_time
            items_per_sec = reduced_num_items / iter_time
            train_epoch_items_per_sec += items_per_sec

            DLLogger.log(step=(epoch, i),
                         data={'train_items_per_sec': items_per_sec})
            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
            iteration += 1

        torch.cuda.synchronize()
        epoch_stop_time = time.perf_counter()
        epoch_time = epoch_stop_time - epoch_start_time

        DLLogger.log(step=(epoch, ),
                     data={
                         'train_items_per_sec':
                         (train_epoch_items_per_sec /
                          num_iters if num_iters > 0 else 0.0)
                     })
        # reduced_loss here is the loss of the last iteration of the epoch.
        DLLogger.log(step=(epoch, ), data={'train_loss': reduced_loss})
        DLLogger.log(step=(epoch, ), data={'train_epoch_time': epoch_time})

        val_loss, val_items_per_sec = validate(model,
                                               criterion,
                                               valset,
                                               epoch,
                                               iteration,
                                               args.batch_size,
                                               world_size,
                                               collate_fn,
                                               distributed_run,
                                               local_rank,
                                               batch_to_gpu,
                                               summary_writer=writer)
        if writer:
            writer.add_scalar("val/loss", val_loss, iteration)

        if (epoch % args.epochs_per_checkpoint
                == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, epoch, model_config, args.amp,
                            args.output, args.model_name, local_rank,
                            world_size)
        if local_rank == 0:
            DLLogger.flush()

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    DLLogger.log(step=tuple(), data={'run_time': run_time})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    if writer:
        writer.add_scalar("val/loss", val_loss, iteration)
    DLLogger.log(step=tuple(),
                 data={
                     'train_items_per_sec':
                     (train_epoch_items_per_sec /
                      num_iters if num_iters > 0 else 0.0)
                 })
    DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})

    if local_rank == 0:
        DLLogger.flush()
def main():
    """Train or evaluate an ImageNet classifier, optionally in fp16 / distributed.

    Uses the module-level ``args`` namespace and updates the module-level
    ``best_prec1``, ``model_params`` and ``master_params``. Distributed mode is
    enabled when the ``WORLD_SIZE`` environment variable is greater than 1.
    With ``args.evaluate`` the function runs a single validation pass and
    returns; otherwise it trains from ``args.start_epoch`` to ``args.epochs``
    and rank 0 saves a checkpoint after every epoch.
    """
    global best_prec1, args
    global model_params, master_params

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print(
                "Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
                "Resetting static_loss_scale to 1.0")
            args.static_loss_scale = 1.0

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)

    # In fp16 mode the optimizer works on separate master parameters.
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Scale learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.
    optimizer = torch.optim.SGD(master_params,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            # A `global` declaration in main() does not extend to nested
            # functions; without this line the assignment below would create
            # a local and the restored best accuracy would be lost.
            global best_prec1
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                if args.fp16:
                    saved_master_params = checkpoint['master_params']
                    for master, saved in zip(master_params,
                                             saved_master_params):
                        master.data.copy_(saved.data)
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if (args.arch == "inception_v3"):
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(),
            # transforms.ToTensor(), Too slow
            # normalize,
        ]))
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    # shuffle and sampler are mutually exclusive, hence the conditional.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=fast_collate)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             collate_fn=fast_collate)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        # Profiling runs stop after the first training epoch.
        if args.prof:
            break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)

            # Use local scope to avoid dangling references
            def create_and_save_checkpoint():
                checkpoint_dict = {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }
                if args.fp16:
                    checkpoint_dict['master_params'] = master_params
                save_checkpoint(checkpoint_dict, is_best)

            create_and_save_checkpoint()
def _distort_train_list(full_train_list, imbalance_ratio, noisy_ratio,
                        num_classes=1000):
    """Return a class-imbalanced and/or label-noised version of a train list.

    Args:
        full_train_list: list of ``[path, label]`` entries as read from the
            list file (labels are parsed with ``int``).
        imbalance_ratio: if != 1, a randomly chosen half of the classes keeps
            only about ``1 / imbalance_ratio`` of its samples.
        noisy_ratio: if != 0.0, this fraction of the remaining entries gets a
            label drawn uniformly from ``0 .. num_classes - 1``.
        num_classes: number of classes in the dataset.

    Returns:
        The distorted list. When ``imbalance_ratio == 1`` this is
        ``full_train_list`` itself, and the noisy labels are written into its
        entries in place.
    """
    # randomly sample half of classes which we will modify
    reduced_classes = set(random.sample(range(num_classes), num_classes >> 1))

    # class imbalance
    if imbalance_ratio != 1:
        # Group entries by label once instead of rescanning the whole list
        # for every class.
        by_class = {}
        for entry in full_train_list:
            by_class.setdefault(int(entry[1]), []).append(entry)

        distorted_list = []
        for c in range(num_classes):
            c_list = by_class.get(c, [])
            A = len(c_list)
            # select indices we will evict from the list to distort dataset
            evicted = set()
            if c in reduced_classes:
                evicted = set(
                    random.sample(
                        range(A),
                        round(A * (imbalance_ratio - 1) / imbalance_ratio)))
            distorted_list.extend(
                x for j, x in enumerate(c_list) if j not in evicted)
            print(c, A, len(evicted), len(distorted_list))
    else:
        distorted_list = full_train_list
    print('Imbalance =', len(distorted_list), 'selected from original',
          len(full_train_list))

    # noisy labels
    if noisy_ratio != 0.0:
        P = len(distorted_list)
        K = int(P * noisy_ratio)
        print('Noisy =', K, ' out of', P)
        # Visit the sampled indices in ascending order: this is the order the
        # previous full scan used, so a seeded run draws the same labels.
        for j in sorted(random.sample(range(P), K)):
            distorted_list[j][1] = random.randint(0, num_classes - 1)
    return distorted_list


def main():
    """Run reference evaluation and unsupervised pre-training on ImageNet.

    Parses the command line, prepares the run folder and the (optionally
    distorted) training list, builds the DALI train/val iterators, stores a
    checkpoint of the randomly initialised reference model and then trains the
    unsupervised model, keeping the checkpoint with the best top-1 accuracy.

    Raises:
        Exception: if no dataset path was supplied.
    """
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument(
        'data',
        metavar='DIR',
        nargs='*',
        help='path(s) to dataset (if one path is provided, it is assumed\n' +
        'to have subdirectories named "train" and "val"; alternatively,\n' +
        'train and val paths can be specified directly by providing both paths as arguments)'
    )
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                        ' (default: resnet18)')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--epochs', default=15, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument(
        '-bs', '--batch-size', default=128, type=int, metavar='N',
        help='batch size for descriptor generation (default: 128)')
    parser.add_argument('-lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--fp16', action='store_true',
                        help='Run model fp16 mode.')
    parser.add_argument('--dali_cpu', action='store_true',
                        help='Runs CPU based version of DALI pipeline.')
    parser.add_argument(
        '--static-loss-scale', type=float, default=1,
        help='Static loss scale, positive power of 2 values can improve fp16 convergence.'
    )
    parser.add_argument(
        '--dynamic-loss-scale', action='store_true',
        help='Use dynamic loss scaling. If supplied, this argument supersedes ' +
        '--static-loss-scale.')
    parser.add_argument('--prof', dest='prof', action='store_true',
                        help='Only run 10 iterations for profiling.')
    parser.add_argument('-t', '--test', action='store_true',
                        help='Launch test mode with preset arguments')
    parser.add_argument("--local_rank", default=0, type=int)
    # added
    parser.add_argument(
        '-ir', '--imbalance-ratio', type=int, default=1, metavar='N',
        help='ratio of 0..499 to 500..999 labels in the training dataset drawn from uniform distribution'
    )
    parser.add_argument(
        '-nr', '--noisy-ratio', type=float, default=0.0, metavar='N',
        help='ratio of noisy(random) labels in the training dataset drawn from uniform distribution'
    )
    parser.add_argument(
        '-ens', '--ensemble-size', type=int, default=1, metavar='E',
        help='defines size of ensemble or, by default, no ensemble if = 1')
    parser.add_argument('-e', '--ensemble-index', type=int, default=0,
                        metavar='E', help='defines index of ensemble')
    parser.add_argument('--save-folder', default='../local_data/ImageNet',
                        type=str, help='dir to save data')
    parser.add_argument('-r', '--run-folder', default='run99', type=str,
                        help='dir to save run')
    args = parser.parse_args()

    cudnn.benchmark = True

    # test mode, use default args for sanity test
    if args.test:
        args.fp16 = False
        args.epochs = 1
        args.start_epoch = 0
        args.arch = 'resnet18'
        args.batch_size = 256
        args.prof = True
        args.data = [
            '/data/imagenet/train-jpeg/',
            '/data/imagenet/val-jpeg/',
        ]

    if not args.data:
        raise Exception("error: too few data arguments")

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0 and not args.fp16:
        print(
            "Warning: if --fp16 is not used, static_loss_scale will be ignored."
        )

    # Data loading code
    if len(args.data) == 1:
        train_dir = os.path.join(args.data[0], 'train')
        val_dir = os.path.join(args.data[0], 'val')
    else:
        train_dir = args.data[0]
        val_dir = args.data[1]

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    # make a separate folder for experiment; makedirs(exist_ok=True) creates
    # missing parents, completes a partially created run folder and does not
    # fail when another process creates the same directory first
    os.makedirs(args.save_folder, exist_ok=True)
    run_folder = '{}/{}'.format(args.save_folder, args.run_folder)
    for sub_folder in ('data', 'checkpoint', 'descr'):
        os.makedirs(run_folder + '/' + sub_folder, exist_ok=True)

    # lists for full datasets
    orig_train_list_file = '{}/{}'.format(args.save_folder,
                                          'processed/train_list.txt')
    val_list_file = '{}/{}'.format(args.save_folder, 'processed/val_list.txt')

    if (args.imbalance_ratio == 1) and (args.noisy_ratio == 0.0):
        # use original training dataset
        full_train_list_file = orig_train_list_file
    else:
        distorted_train_list_file = '{}/{}/full_train_list_ir_{}_nr_{}.txt'.format(
            args.save_folder, args.run_folder, args.imbalance_ratio,
            args.noisy_ratio)
        full_train_list_file = distorted_train_list_file
        # The distorted list is generated once and reused by later runs.
        if not os.path.isfile(distorted_train_list_file):
            with open(orig_train_list_file) as f:
                full_train_list = [x.strip().split() for x in f]
            distorted_list = _distort_train_list(
                full_train_list, args.imbalance_ratio, args.noisy_ratio,
                num_classes=1000)
            with open(distorted_train_list_file, "w") as f:
                for item in distorted_list:
                    f.write("%s %s\n" % (item[0], item[1]))

    # initially we use unsupervised pretraining
    unsup_prefix = 'unsup_'
    refer_prefix = ''
    unsup_postfix = '{}batch_0_ir_{}_nr_{}_sub_{}_aug_{}'.format(
        unsup_prefix, args.imbalance_ratio, args.noisy_ratio, 'none', 'none')
    refer_postfix = '{}batch_0_ir_{}_nr_{}_sub_{}_aug_{}'.format(
        refer_prefix, args.imbalance_ratio, args.noisy_ratio, 'none', 'none')

    train_list_file = '{}/{}/train_list_{}.txt'.format(
        args.save_folder, args.run_folder, unsup_postfix)
    index_list_file = '{}/{}/index_list_{}.npy'.format(
        args.save_folder, args.run_folder, unsup_postfix)

    if os.path.isfile(train_list_file) and os.path.isfile(index_list_file):
        print('Train list exists =', train_list_file)
    else:
        with open(full_train_list_file) as f:
            train_list = [line.strip() for line in f]
        # every sample of the full list is used, so the index list is 0..N-1
        np.save(index_list_file, np.arange(len(train_list)))
        with open(train_list_file, "w") as f:
            f.write("\n".join(train_list))
        print('Train list files created =', index_list_file, train_list_file)

    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.workers,
                           device_id=args.local_rank,
                           data_dir=train_dir,
                           file_list=train_list_file,
                           crop=crop_size,
                           local_rank=args.local_rank,
                           world_size=args.world_size,
                           dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    pipe = HybridValPipe(batch_size=args.batch_size,
                         num_threads=args.workers,
                         device_id=args.local_rank,
                         data_dir=val_dir,
                         file_list=val_list_file,
                         crop=crop_size,
                         local_rank=args.local_rank,
                         world_size=args.world_size,
                         size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    model_folder = '{}/{}/checkpoint'.format(args.save_folder,
                                             args.run_folder)
    if args.ensemble_size > 1:
        checkpoint_refer_file = '{}/init_{}_E_{}.pt'.format(
            model_folder, refer_postfix, args.ensemble_index)
        checkpoint_unsup_file = '{}/init_{}_E_{}.pt'.format(
            model_folder, unsup_postfix, args.ensemble_index)
    else:
        checkpoint_refer_file = '{}/init_{}.pt'.format(model_folder,
                                                       refer_postfix)
        checkpoint_unsup_file = '{}/init_{}.pt'.format(model_folder,
                                                       unsup_postfix)

    # save reference checkpoint (randomly initialized)
    if os.path.isfile(checkpoint_refer_file):
        print('Model {} is already trained!'.format(checkpoint_refer_file))
    else:
        print("=> creating reference model '{}'".format(args.arch))
        modelRefer = models.__dict__[args.arch](UNSUP=False)
        modelRefer = modelRefer.cuda()
        if args.fp16:
            modelRefer = network_to_half(modelRefer)
        if args.distributed:
            # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
            # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
            modelRefer = DDP(modelRefer, delay_allreduce=True)

        # evaluate on validation set
        criterion = nn.CrossEntropyLoss().cuda()
        [refer_prec1, refer_prec5] = validate(args, val_loader, modelRefer,
                                              criterion, unsup=False)
        val_loader.reset()

        print('Saving reference checkpoint at epoch {} with accuracy {}'.
              format(0, refer_prec1))
        save_checkpoint(
            {
                'epoch': 0,
                'arch': args.arch,
                'state_dict': modelRefer.state_dict(),
                'acc': refer_prec1,
            }, checkpoint_refer_file)
        del modelRefer, criterion

    # train unsupervised model
    if os.path.isfile(checkpoint_unsup_file):
        print('Model {} is already trained!'.format(checkpoint_unsup_file))
    else:
        print("=> creating unsupervised model '{}'".format(args.arch))
        modelUnsup = models.__dict__[args.arch](UNSUP=True)
        modelUnsup = modelUnsup.cuda()
        if args.fp16:
            modelUnsup = network_to_half(modelUnsup)
        if args.distributed:
            # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
            # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
            modelUnsup = DDP(modelUnsup, delay_allreduce=True)

        # define loss function (criterion) and optimizer
        criterion = nn.CrossEntropyLoss().cuda()
        optimizer = torch.optim.SGD(modelUnsup.parameters(),
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        if args.fp16:
            optimizer = FP16_Optimizer(
                optimizer,
                static_loss_scale=args.static_loss_scale,
                dynamic_loss_scale=args.dynamic_loss_scale)

        # evaluate on validation set
        [best_prec1, best_prec5] = validate(args, val_loader, modelUnsup,
                                            criterion, unsup=True)
        val_loader.reset()

        for epoch in range(args.start_epoch, args.epochs):
            # train for one epoch
            train(args, train_loader, modelUnsup, criterion, optimizer, epoch,
                  unsup=True)

            # evaluate on validation set
            [prec1, prec5] = validate(args, val_loader, modelUnsup, criterion,
                                      unsup=True)

            # remember best prec@1 and save checkpoint
            if args.local_rank == 0:
                if prec1 > best_prec1:
                    best_prec1 = prec1
                    print(
                        'Saving best unsupervised checkpoint at epoch {} with accuracy {}'
                        .format(epoch + 1, best_prec1))
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': modelUnsup.state_dict(),
                            'acc': best_prec1,
                            #'optimizer': optimizer.state_dict(),
                        },
                        checkpoint_unsup_file)
            else:
                print('Local rank is not zero')

            # Report the final metrics after the last epoch. The former check
            # (epochs == start_epoch - 1) could only hold when this loop never
            # ran, i.e. when prec1/prec5 did not exist, and it formatted the
            # return value of print() instead of the message.
            if epoch == args.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}'.format(prec1, prec5))

            # reset DALI iterators
            train_loader.reset()
            val_loader.reset()
def main():
    """Fine-tune an ERNIE sequence classifier on FewRel.

    Parses the command line, sets up the device (single process or one
    process per GPU), builds the model and optimizer, trains for
    ``--num_train_epochs`` epochs while logging every step loss to
    ``<output_dir>/loss``, writes a checkpoint after each epoch and the final
    weights to ``<output_dir>/pytorch_model.bin``.

    Raises:
        ValueError: if ``--gradient_accumulation_steps`` is < 1, if
            ``--do_train`` is not given, or if the output directory already
            exists and is not empty.
        ImportError: if apex is required (distributed or fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--ernie_model", default=None, type=str,
                        required=True, help="Ernie pre-trained model")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case", default=False, action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16', default=False, action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--threshold', type=float, default=.3)
    args = parser.parse_args()

    processors = FewrelProcessor
    num_labels_task = 80

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the total batch per optimizer update; each
    # forward/backward pass handles one accumulation slice of it.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        # Only training is implemented below, so --do_eval alone is not
        # enough; the message now states what is actually checked.
        raise ValueError(
            "`do_train` must be True: this script only runs training.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    processor = processors()
    num_labels = num_labels_task

    tokenizer = BertTokenizer.from_pretrained(
        args.ernie_model, do_lower_case=args.do_lower_case)

    train_examples, label_list = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model, _ = BertForSequenceClassification.from_pretrained(
        args.ernie_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # parameters whose name contains one of these are not optimized at all
    no_grad = [
        'bert.encoder.layer.11.output.dense_ent',
        'bert.encoder.layer.11.output.LayerNorm_ent'
    ]
    param_optimizer = [(n, p) for n, p in param_optimizer
                       if not any(nd in n for nd in no_grad)]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer,
            args.threshold)

        # Row 0 is an all-zero vector; entity ids are shifted by one when
        # looked up below so that id -1 maps to it.
        vecs = [[0] * 100]
        with open("kg_embed/entity2vec.vec", 'r') as fin:
            for line in fin:
                vecs.append([float(x) for x in line.strip().split('\t')])
        embed = torch.FloatTensor(vecs)
        embed = torch.nn.Embedding.from_pretrained(embed)
        # embed = torch.nn.Embedding(5041175, 100)
        logger.info("Shape of entity embedding: " + str(embed.weight.size()))
        del vecs

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_ent = torch.tensor([f.input_ent for f in train_features],
                               dtype=torch.long)
        all_ent_masks = torch.tensor([f.ent_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_ent, all_ent_masks,
                                   all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        output_loss_file = os.path.join(args.output_dir, "loss")
        model.train()
        # The context manager closes (and flushes) the loss log even when
        # training is interrupted by an exception.
        with open(output_loss_file, 'w') as loss_fout:
            for _ in trange(int(args.num_train_epochs), desc="Epoch"):
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(
                        tqdm(train_dataloader, desc="Iteration")):
                    # index 3 (entity ids) is not moved to the device here:
                    # it is looked up in `embed` first, and only the
                    # resulting vectors are moved
                    batch = tuple(
                        t.to(device) if i != 3 else t
                        for i, t in enumerate(batch))
                    input_ids, input_mask, segment_ids, input_ent, ent_mask, label_ids = batch
                    input_ent = embed(input_ent + 1).to(device)  # -1 -> 0
                    if args.fp16:
                        # Cast to half only when the model itself was
                        # converted; a float32 model needs float32 inputs.
                        input_ent = input_ent.half()
                    loss = model(input_ids, segment_ids, input_mask,
                                 input_ent, ent_mask, label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    step_loss = loss.item()
                    loss_fout.write("{}\n".format(step_loss))
                    tr_loss += step_loss
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        # modify learning rate with special warm up BERT uses
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                # checkpoint after every epoch, tagged with the step count
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                output_model_file = os.path.join(
                    args.output_dir,
                    "pytorch_model.bin_{}".format(global_step))
                torch.save(model_to_save.state_dict(), output_model_file)

        # Save a trained model
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)
class BaseTrainer(object):
    """Training/evaluation driver for a re-identification model.

    Holds the model, data loaders, loss function, optimizer and LR scheduler,
    and exposes per-batch (`step`, `handle_new_batch`) and per-epoch
    (`handle_new_epoch`) hooks plus `evaluate` and `save`.
    """

    def __init__(self, cfg, model, train_dl, val_dl, loss_func, num_query,
                 num_gpus):
        """Move the model to GPU and build optimizer and scheduler.

        Args:
            cfg: configuration node; the ``SOLVER``, ``MODEL`` and
                ``OUTPUT_DIR`` entries are read here.
            model: network to train.
            train_dl: training data loader.
            val_dl: validation data loader (query samples first, then
                gallery, see `evaluate`).
            loss_func: callable invoked as ``loss_func(score, feat, target)``.
            num_query: number of query samples at the start of ``val_dl``.
            num_gpus: number of GPUs; > 1 wraps the model in
                ``nn.DataParallel`` and converts it with ``convert_model``.

        Raises:
            NotImplementedError: for multi-GPU training with
                ``cfg.SOLVER.FP16`` enabled.
        """
        self.cfg = cfg
        self.model = model
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.loss_func = loss_func
        self.num_query = num_query

        self.loss_avg = AvgerageMeter()
        self.acc_avg = AvgerageMeter()
        self.train_epoch = 1
        self.batch_cnt = 0

        self.logger = logging.getLogger('reid_baseline.train')
        self.log_period = cfg.SOLVER.LOG_PERIOD
        self.checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
        self.eval_period = cfg.SOLVER.EVAL_PERIOD
        self.output_dir = cfg.OUTPUT_DIR
        self.device = cfg.MODEL.DEVICE
        self.epochs = cfg.SOLVER.MAX_EPOCHS

        if num_gpus > 1:
            # convert to use sync_bn
            self.logger.info(
                'More than one gpu used, convert model to use SyncBN.')
            if cfg.SOLVER.FP16:
                # TODO: Multi-GPU model with FP16 (apex SyncBN + DDP).
                raise NotImplementedError(
                    'Multi-GPU training with FP16 is not implemented')
            # Multi-GPU model without FP16
            self.model = nn.DataParallel(self.model)
            self.model = convert_model(self.model)
            self.model.cuda()
            self.logger.info('Using pytorch SyncBN implementation')
        else:
            # Single GPU model
            self.model.cuda()

        # Optimizer and scheduler are built after the model has been wrapped
        # and moved to the GPU.
        self.optim = make_optimizer(cfg, self.model, num_gpus)
        self.scheduler = WarmupMultiStepLR(self.optim, cfg.SOLVER.STEPS,
                                           cfg.SOLVER.GAMMA,
                                           cfg.SOLVER.WARMUP_FACTOR,
                                           cfg.SOLVER.WARMUP_ITERS,
                                           cfg.SOLVER.WARMUP_METHOD)
        self.scheduler.step()
        self.mix_precision = False

        if cfg.SOLVER.FP16:
            # Only reachable for a single GPU: the multi-GPU FP16 case
            # raised above.
            self.model, self.optim = amp.initialize(self.model,
                                                    self.optim,
                                                    opt_level='O1')
            self.mix_precision = True
            self.logger.info('Using fp16 training')
        self.logger.info('Trainer Built')

    def handle_new_batch(self):
        """Count one processed batch and log running stats every LOG_PERIOD."""
        self.batch_cnt += 1
        if self.batch_cnt % self.cfg.SOLVER.LOG_PERIOD == 0:
            self.logger.info('Epoch[{}] Iteration[{}/{}] Loss: {:.3f},'
                             'Acc: {:.3f}, Base Lr: {:.2e}'.format(
                                 self.train_epoch, self.batch_cnt,
                                 len(self.train_dl), self.loss_avg.avg,
                                 self.acc_avg.avg,
                                 self.scheduler.get_lr()[0]))

    def handle_new_epoch(self):
        """Finish an epoch: step the scheduler, checkpoint/evaluate if due."""
        # Reset to 0 (the same start value __init__ uses): handle_new_batch
        # increments before logging, so a reset to 1 would number the batches
        # of every later epoch 2..N+1 instead of 1..N.
        self.batch_cnt = 0
        self.scheduler.step()
        self.logger.info('Epoch {} done'.format(self.train_epoch))
        self.logger.info('-' * 20)
        if self.train_epoch % self.checkpoint_period == 0:
            self.save()
        if self.train_epoch % self.eval_period == 0:
            self.evaluate()
        self.train_epoch += 1

    def step(self, batch):
        """Run one optimization step on ``batch``.

        Args:
            batch: pair ``(img, target)`` of tensors.

        Returns:
            Tuple ``(running average loss, running average accuracy)``.
        """
        self.model.train()
        self.optim.zero_grad()
        img, target = batch
        img, target = img.cuda(), target.cuda()
        score, feat = self.model(img)
        loss = self.loss_func(score, feat, target)
        if self.mix_precision:
            # apex scales the loss so that fp16 gradients do not underflow
            with amp.scale_loss(loss, self.optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optim.step()

        # accuracy = fraction of samples whose arg-max score equals the target
        acc = (score.max(1)[1] == target).float().mean()

        self.loss_avg.update(loss.cpu().item())
        self.acc_avg.update(acc.cpu().item())

        return self.loss_avg.avg, self.acc_avg.avg

    def evaluate(self):
        """Compute and log CMC and mAP on the validation loader.

        The first ``self.num_query`` samples yielded by ``self.val_dl`` are
        used as queries, all remaining ones as the gallery.
        """
        self.model.eval()
        num_query = self.num_query
        feats, pids, camids = [], [], []
        with torch.no_grad():
            for batch in tqdm(self.val_dl, total=len(self.val_dl),
                              leave=False):
                data, pid, camid, _ = batch
                data = data.cuda()
                feat = self.model(data).detach().cpu()
                feats.append(feat)
                pids.append(pid)
                camids.append(camid)
        feats = torch.cat(feats, dim=0)
        pids = torch.cat(pids, dim=0)
        camids = torch.cat(camids, dim=0)

        query_feat = feats[:num_query]
        query_pid = pids[:num_query]
        query_camid = camids[:num_query]

        gallery_feat = feats[num_query:]
        gallery_pid = pids[num_query:]
        gallery_camid = camids[num_query:]

        distmat = euclidean_dist(query_feat, gallery_feat)

        cmc, mAP, _ = eval_func(distmat.numpy(),
                                query_pid.numpy(),
                                gallery_pid.numpy(),
                                query_camid.numpy(),
                                gallery_camid.numpy(),
                                use_cython=self.cfg.SOLVER.CYTHON)
        self.logger.info('Validation Result:')
        for r in self.cfg.TEST.CMC:
            # ranks in cfg.TEST.CMC are 1-based
            self.logger.info('CMC Rank-{}: {:.2%}'.format(r, cmc[r - 1]))
        self.logger.info('mAP: {:.2%}'.format(mAP))
        self.logger.info('-' * 20)

    def save(self):
        """Write model and optimizer state dicts for the current epoch.

        Files are ``<MODEL.NAME>_epoch<N>.pth`` and
        ``<MODEL.NAME>_epoch<N>_optim.pth`` inside ``self.output_dir``.
        """
        torch.save(
            self.model.state_dict(),
            osp.join(
                self.output_dir, self.cfg.MODEL.NAME + '_epoch' +
                str(self.train_epoch) + '.pth'))
        torch.save(
            self.optim.state_dict(),
            osp.join(
                self.output_dir, self.cfg.MODEL.NAME + '_epoch' +
                str(self.train_epoch) + '_optim.pth'))
if args.random: logger.info("USING RANDOM INITIALISATION FOR FINETUNING") model.apply(model.init_bert_weights) if args.fp16: model.half() model.to(device) if is_distributed: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':
def __init__(self, cfg, model, train_dl, val_dl, loss_func, num_query,
             num_gpus):
    """Move the model to GPU and build optimizer and scheduler.

    Args:
        cfg: configuration node; the ``SOLVER``, ``MODEL`` and ``OUTPUT_DIR``
            entries are read here.
        model: network to train.
        train_dl: training data loader.
        val_dl: validation data loader.
        loss_func: loss callable stored for the training step.
        num_query: number of query samples, stored for evaluation.
        num_gpus: number of GPUs; > 1 wraps the model in ``nn.DataParallel``
            and converts it with ``convert_model``.

    Raises:
        NotImplementedError: for multi-GPU training with ``cfg.SOLVER.FP16``
            enabled.
    """
    self.cfg = cfg
    self.model = model
    self.train_dl = train_dl
    self.val_dl = val_dl
    self.loss_func = loss_func
    self.num_query = num_query

    self.loss_avg = AvgerageMeter()
    self.acc_avg = AvgerageMeter()
    self.train_epoch = 1
    self.batch_cnt = 0

    self.logger = logging.getLogger('reid_baseline.train')
    self.log_period = cfg.SOLVER.LOG_PERIOD
    self.checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    self.eval_period = cfg.SOLVER.EVAL_PERIOD
    self.output_dir = cfg.OUTPUT_DIR
    self.device = cfg.MODEL.DEVICE
    self.epochs = cfg.SOLVER.MAX_EPOCHS

    if num_gpus > 1:
        # convert to use sync_bn
        self.logger.info(
            'More than one gpu used, convert model to use SyncBN.')
        if cfg.SOLVER.FP16:
            # TODO: Multi-GPU model with FP16 (apex SyncBN + DDP).
            raise NotImplementedError(
                'Multi-GPU training with FP16 is not implemented')
        # Multi-GPU model without FP16
        self.model = nn.DataParallel(self.model)
        self.model = convert_model(self.model)
        self.model.cuda()
        self.logger.info('Using pytorch SyncBN implementation')
    else:
        # Single GPU model
        self.model.cuda()

    # Optimizer and scheduler are built after the model has been wrapped and
    # moved to the GPU.
    self.optim = make_optimizer(cfg, self.model, num_gpus)
    self.scheduler = WarmupMultiStepLR(self.optim, cfg.SOLVER.STEPS,
                                       cfg.SOLVER.GAMMA,
                                       cfg.SOLVER.WARMUP_FACTOR,
                                       cfg.SOLVER.WARMUP_ITERS,
                                       cfg.SOLVER.WARMUP_METHOD)
    self.scheduler.step()
    self.mix_precision = False

    if cfg.SOLVER.FP16:
        # Only reachable for a single GPU: the multi-GPU FP16 case raised
        # above.
        self.model, self.optim = amp.initialize(self.model,
                                                self.optim,
                                                opt_level='O1')
        self.mix_precision = True
        self.logger.info('Using fp16 training')
    self.logger.info('Trainer Built')
def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--negative_weight", default=1., type=float)
    parser.add_argument("--neutral_words_file", default='data/identity.csv')
    # if true, use test data instead of val data
    parser.add_argument("--test", action='store_true')

    # Explanation specific arguments below
    # whether run explanation algorithms
    parser.add_argument("--explain", action='store_true',
                        help='if true, explain test set predictions')
    parser.add_argument("--debug", action='store_true')
    # which algorithm to run
    parser.add_argument("--algo", choices=['soc'])
    # the output filename without postfix
    parser.add_argument("--output_filename", default='temp.tmp')
    # see utils/config.py
    parser.add_argument("--use_padding_variant", action='store_true')
    parser.add_argument("--mask_outside_nb", action='store_true')
    parser.add_argument("--nb_range", type=int)
    parser.add_argument("--sample_n", type=int)

    # whether use explanation regularization
    parser.add_argument("--reg_explanations", action='store_true')
    parser.add_argument("--reg_strength", type=float)
    parser.add_argument("--reg_mse", action='store_true')
    # whether discard other neutral words during regularization. default: False
    parser.add_argument("--discard_other_nw", action='store_false',
                        dest='keep_other_nw')
    # whether remove neutral words when loading datasets
    parser.add_argument("--remove_nw", action='store_true')

    # if true, generate hierarchical explanations instead of word level outputs.
    # Only useful when the --explain flag is also added.
    parser.add_argument("--hiex", action='store_true')
    parser.add_argument("--hiex_tree_height", default=5, type=int)
    # whether add the sentence itself to the sample set in SOC
    parser.add_argument("--hiex_add_itself", action='store_true')
    # the directory where the lm is stored
    parser.add_argument("--lm_dir", default='runs/lm')
    # if configured, only generate explanations for instances with given line numbers
    parser.add_argument("--hiex_idxs", default=None)
    # if true, use absolute values of explanations for hierarchical clustering
    parser.add_argument("--hiex_abs", action='store_true')
    # if either of the two is true, only generate explanations for positive / negative instances
    parser.add_argument("--only_positive", action='store_true')
    parser.add_argument("--only_negative", action='store_true')
    # stop after generating x explanation
    parser.add_argument("--stop", default=100000000, type=int)
    # early stopping with decreasing learning rate. 0: direct exit when validation F1 decreases
    parser.add_argument("--early_stop", default=5, type=int)

    # other external arguments originally here in pytorch_transformers
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
             "Sequences longer than this will be truncated, and sequences shorter \n"
             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=32, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--validate_steps", default=200, type=int,
                        help="validate once for how many steps")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
             "0 (default value): dynamic loss scaling.\n"
             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--running_offline', action='store_true')
    return parser


def main():
    """Train and/or evaluate a BERT sequence classifier, optionally explaining it.

    Steps, in order: parse the CLI and merge it into the module-level
    ``configs`` object, select the device / distributed backend, seed the
    RNGs, dump the effective arguments to ``<output_dir>/args.json``, build
    tokenizer, data processor, model and optimizer, run the training loop
    (with periodic validation, learning-rate halving and early stopping)
    when ``--do_train`` is given, and finally run either ``validate`` or
    ``explain`` on the test split.

    Raises:
        ValueError: For ``gradient_accumulation_steps < 1``, when neither
            ``--do_train`` nor ``--do_eval`` is given, or for an unknown
            task name.
        ImportError: When apex is required (distributed or fp16) but is not
            installed.
    """
    cli_args = _build_arg_parser().parse_args()
    # From here on the shared `configs` object is used as `args`;
    # presumably combine_args copies the parsed CLI values onto it.
    combine_args(configs, cli_args)
    args = configs

    if args.running_offline:
        # The environment variable is what the Hugging Face libraries look
        # at; a plain local assignment has no effect. NOTE(review): it is
        # only honoured if set before `transformers` is first imported.
        os.environ['TRANSFORMERS_OFFLINE'] = '1'
        print('running offline')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        'gab': GabProcessor,
        'ws': WSProcessor,
        'nyt': NytProcessor,
    }
    output_modes = {
        'gab': 'classification',
        'ws': 'classification',
        'nyt': 'classification',
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch size; each forward pass sees
    # only one accumulation slice of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    # save configs
    with open(os.path.join(args.output_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, indent=4)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    # hateBERT is loaded through the `transformers` Auto* classes; every
    # other checkpoint uses the Bert* classes already imported by this file.
    use_hatebert = args.bert_model == 'GroNLP/hateBERT'
    if use_hatebert:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        tokenizer_cls = AutoTokenizer
        model_cls = AutoModelForSequenceClassification
    else:
        tokenizer_cls = BertTokenizer
        model_cls = BertForSequenceClassification
    tokenizer = tokenizer_cls.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    processor = processors[task_name](configs, tokenizer=tokenizer)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps //
                torch.distributed.get_world_size())

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE),
        'distributed_{}'.format(args.local_rank))
    if args.do_train:
        model = model_cls.from_pretrained(args.bert_model,
                                          cache_dir=cache_dir,
                                          num_labels=num_labels)
    else:
        # Evaluation only: reload the weights previously saved in output_dir.
        model = model_cls.from_pretrained(args.output_dir,
                                          num_labels=num_labels)
    model.to(device)

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err
        model = DDP(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)
    elif args.do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    tr_loss = 0
    epoch = -1
    val_best_f1 = -1
    early_stop_countdown = args.early_stop

    if args.reg_explanations:
        train_lm_dataloader = processor.get_dataloader(
            'train', configs.train_batch_size)
        dev_lm_dataloader = processor.get_dataloader(
            'dev', configs.train_batch_size)
        explainer = SamplingAndOcclusionExplain(
            model,
            configs,
            tokenizer,
            device=device,
            vocab=tokenizer.vocab,
            train_dataloader=train_lm_dataloader,
            dev_dataloader=dev_lm_dataloader,
            lm_dir=args.lm_dir,
            output_path=os.path.join(configs.output_dir,
                                     configs.output_filename),
        )
    else:
        explainer = None

    if args.do_train:
        epoch = 0
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer,
            output_mode, configs)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor(
                [f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor(
                [f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        class_weight = torch.FloatTensor([args.negative_weight,
                                          1]).to(device)
        # The loss module does not depend on the batch, so build it once.
        if output_mode == "classification":
            loss_fct = CrossEntropyLoss(class_weight)
        elif output_mode == "regression":
            loss_fct = MSELoss()

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # The Auto* model returns an output object; the Bert* model
                # is used as returning the logits directly.
                if use_hatebert:
                    logits = model(input_ids, segment_ids, input_mask,
                                   labels=None).logits
                else:
                    logits = model(input_ids, segment_ids, input_mask,
                                   labels=None)

                if output_mode == "classification":
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                tr_loss += loss.item()
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # regularize explanations
                # NOTE: backward performed inside this function to prevent OOM
                if args.reg_explanations:
                    explainer.compute_explanation_loss(input_ids,
                                                       input_mask,
                                                       segment_ids,
                                                       label_ids,
                                                       do_backprop=True)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    # Checked only right after an optimizer update, so one
                    # value of global_step triggers at most one validation.
                    if global_step % args.validate_steps == 0:
                        val_result = validate(args, model, processor,
                                              tokenizer, output_mode,
                                              label_list, device, num_labels,
                                              task_name, tr_loss, global_step,
                                              epoch, explainer)
                        val_f1 = val_result['f1']
                        if val_f1 > val_best_f1:
                            val_best_f1 = val_f1
                            # Only one process writes the checkpoint.
                            if (args.local_rank == -1
                                    or torch.distributed.get_rank() == 0):
                                save_model(args, model, tokenizer, num_labels)
                        else:
                            # halve the learning rate
                            for param_group in optimizer.param_groups:
                                param_group['lr'] *= 0.5
                            early_stop_countdown -= 1
                            logger.info(
                                "Reducing learning rate... Early stop countdown %d"
                                % early_stop_countdown)
                        if early_stop_countdown < 0:
                            break
            if early_stop_countdown < 0:
                break
            epoch += 1

    # Training finished. NOTE: this final stage is not gated on --do_eval or
    # on the process rank; it always runs, on the test split.
    if not args.explain:
        args.test = True
        print('--Test_args.test: %s' % str(args.test))
        validate(args, model, processor, tokenizer, output_mode, label_list,
                 device, num_labels, task_name, tr_loss,
                 global_step=888, epoch=-1, explainer=explainer)
        args.test = False
    else:
        print('--Test_args.test: %s' % str(args.test))
        args.test = True
        explain(args, model, processor, tokenizer, output_mode, label_list,
                device)
        args.test = False
class Distiller: def __init__(self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module): logger.info("Initializing Distiller") self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu self.fp16 = params.fp16 self.student = student self.teacher = teacher self.student_config = student.config self.vocab_size = student.config.vocab_size if params.n_gpu <= 1: sampler = RandomSampler(dataset) else: sampler = DistributedSampler(dataset) if params.group_by_size: groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size) sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size) else: sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False) self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences) self.temperature = params.temperature assert self.temperature > 0.0 self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm self.alpha_clm = params.alpha_clm self.alpha_mse = params.alpha_mse self.alpha_cos = params.alpha_cos self.mlm = params.mlm if self.mlm: logger.info(f"Using MLM loss for LM step.") self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor( [params.word_mask, params.word_keep, params.word_rand]) self.pred_probs = self.pred_probs.to( f"cuda:{params.local_rank}" ) if params.n_gpu > 0 else self.pred_probs self.token_probs = token_probs.to( f"cuda:{params.local_rank}" ) if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() else: logger.info(f"Using CLM loss for LM step.") self.epoch = 0 self.n_iter = 0 self.n_total_iter = 0 self.n_sequences_epoch = 0 self.total_loss_epoch = 0 self.last_loss = 0 self.last_loss_ce = 0 
self.last_loss_mlm = 0 self.last_loss_clm = 0 if self.alpha_mse > 0.0: self.last_loss_mse = 0 if self.alpha_cos > 0.0: self.last_loss_cos = 0 self.last_log = 0 self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) if self.alpha_mse > 0.0: self.mse_loss_fct = nn.MSELoss(reduction="sum") if self.alpha_cos > 0.0: self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean") logger.info("--- Initializing model optimizer") assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = len(self.dataloader) num_train_optimization_steps = ( int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad ], "weight_decay": params.weight_decay, }, { "params": [ p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], "weight_decay": 0.0, }, ] logger.info( "------ Number of trainable parameters (student): %i" % sum([ p.numel() for p in self.student.parameters() if p.requires_grad ])) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) self.optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) if self.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) logger.info( f"Using fp16 training: {self.params.fp16_opt_level} level") self.student, self.optimizer = amp.initialize( self.student, self.optimizer, opt_level=self.params.fp16_opt_level) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel logger.info( "Using apex.parallel.DistributedDataParallel for distributed training." ) self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel logger.info( "Using nn.parallel.DistributedDataParallel for distributed training." ) self.student = DistributedDataParallel( self.student, device_ids=[params.local_rank], output_device=params.local_rank, find_unused_parameters=True, ) self.is_master = params.is_master if self.is_master: logger.info("--- Initializing Tensorboard") self.tensorboard = SummaryWriter( log_dir=os.path.join(self.dump_path, "log", "train")) self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0) self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0) def prepare_batch_mlm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM. Input: ------ batch: `Tuple` token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded. lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch. Output: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict. 
""" token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] bs, max_seq_len = token_ids.size() mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids) x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) pred_mask = torch.zeros( bs * max_seq_len, dtype=torch.bool, device=token_ids.device ) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0 # mask a number of words == 0 [8] (faster with fp16) if self.fp16: n1 = pred_mask.sum().item() if n1 > 8: pred_mask = pred_mask.view(-1) n2 = max(n1 % 8, 8 * (n1 // 8)) if n2 != n1: pred_mask[torch.nonzero(pred_mask).view(-1)[:n1 - n2]] = 0 pred_mask = pred_mask.view(bs, max_seq_len) assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item() _token_ids_real = token_ids[pred_mask] _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size) _token_ids_mask = _token_ids_real.clone().fill_( self.params.special_tok_ids["mask_token"]) probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True) _token_ids = (_token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()) token_ids = token_ids.masked_scatter(pred_mask, _token_ids) mlm_labels[ ~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, mlm_labels def prepare_batch_clm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the 
labels for CLM. Input: ------ batch: `Tuple` token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded. lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch. Output: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict. """ token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) clm_labels[ ~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, clm_labels def round_batch(self, x: torch.tensor, lengths: torch.tensor): """ For float16 only. Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8. Input: ------ x: `torch.tensor(bs, seq_length)` - The token ids. lengths: `torch.tensor(bs, seq_length)` - The lengths of each of the sequence in the batch. Output: ------- x: `torch.tensor(new_bs, new_seq_length)` - The updated token ids. lengths: `torch.tensor(new_bs, new_seq_length)` - The updated lengths. 
""" if not self.fp16 or len(lengths) < 8: return x, lengths # number of sentences == 0 [8] bs1 = len(lengths) bs2 = 8 * (bs1 // 8) assert bs2 > 0 and bs2 % 8 == 0 if bs1 != bs2: idx = torch.randperm(bs1)[:bs2] lengths = lengths[idx] slen = lengths.max().item() x = x[idx, :slen] else: idx = None # sequence length == 0 [8] ml1 = x.size(1) if ml1 % 8 != 0: pad = 8 - (ml1 % 8) ml2 = ml1 + pad if self.mlm: pad_id = self.params.special_tok_ids["pad_token"] else: pad_id = self.params.special_tok_ids["unk_token"] padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id) x = torch.cat([x, padding_tensor], 1) assert x.size() == (bs2, ml2) assert x.size(0) % 8 == 0 assert x.size(1) % 8 == 0 return x, lengths def train(self): """ The real training loop. """ if self.is_master: logger.info("Starting training") self.last_log = time.time() self.student.train() self.teacher.eval() for _ in range(self.params.n_epoch): if self.is_master: logger.info( f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}") if self.multi_gpu: torch.distributed.barrier() iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0]) for batch in iter_bar: if self.params.n_gpu > 0: batch = tuple( t.to(f"cuda:{self.params.local_rank}") for t in batch) if self.mlm: token_ids, attn_mask, lm_labels = self.prepare_batch_mlm( batch=batch) else: token_ids, attn_mask, lm_labels = self.prepare_batch_clm( batch=batch) self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels) iter_bar.update() iter_bar.set_postfix({ "Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}" }) iter_bar.close() if self.is_master: logger.info( f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}") self.end_epoch() if self.is_master: logger.info(f"Save very last checkpoint as `pytorch_model.bin`.") self.save_checkpoint(checkpoint_name=f"pytorch_model.bin") logger.info("Training is finished") def step(self, 
input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor): """ One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation), and possibly a parameter update (depending on the gradient accumulation). Input: ------ input_ids: `torch.tensor(bs, seq_length)` - The token ids. attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention. lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM). """ if self.mlm: s_logits, s_hidden_states = self.student( input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) with torch.no_grad(): t_logits, t_hidden_states = self.teacher( input_ids=input_ids, attention_mask=attention_mask ) # (bs, seq_length, voc_size) else: s_logits, _, s_hidden_states = self.student( input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) with torch.no_grad(): t_logits, _, t_hidden_states = self.teacher( input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) assert s_logits.size() == t_logits.size() # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 if self.params.restrict_ce_to_mask: mask = (lm_labels > -1).unsqueeze(-1).expand_as( s_logits) # (bs, seq_lenth, voc_size) else: mask = attention_mask.unsqueeze(-1).expand_as( s_logits) # (bs, seq_lenth, voc_size) s_logits_slct = torch.masked_select( s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask s_logits_slct = s_logits_slct.view(-1, s_logits.size( -1)) # (bs * seq_length, voc_size) modulo the 1s in mask t_logits_slct = torch.masked_select( t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask t_logits_slct = t_logits_slct.view(-1, s_logits.size( -1)) # (bs * seq_length, voc_size) modulo the 1s in mask assert t_logits_slct.size() == s_logits_slct.size() 
loss_ce = (self.ce_loss_fct( F.log_softmax(s_logits_slct / self.temperature, dim=-1), F.softmax(t_logits_slct / self.temperature, dim=-1), ) * (self.temperature)**2) loss = self.alpha_ce * loss_ce if self.alpha_mlm > 0.0: loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1)) loss += self.alpha_mlm * loss_mlm if self.alpha_clm > 0.0: shift_logits = s_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_clm = self.lm_loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss += self.alpha_clm * loss_clm if self.alpha_mse > 0.0: loss_mse = self.mse_loss_fct( s_logits_slct, t_logits_slct) / s_logits_slct.size( 0) # Reproducing batchmean reduction loss += self.alpha_mse * loss_mse if self.alpha_cos > 0.0: s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) mask = attention_mask.unsqueeze(-1).expand_as( s_hidden_states) # (bs, seq_length, dim) assert s_hidden_states.size() == t_hidden_states.size() dim = s_hidden_states.size(-1) s_hidden_states_slct = torch.masked_select( s_hidden_states, mask) # (bs * seq_length * dim) s_hidden_states_slct = s_hidden_states_slct.view( -1, dim) # (bs * seq_length, dim) t_hidden_states_slct = torch.masked_select( t_hidden_states, mask) # (bs * seq_length * dim) t_hidden_states_slct = t_hidden_states_slct.view( -1, dim) # (bs * seq_length, dim) target = s_hidden_states_slct.new( s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target) loss += self.alpha_cos * loss_cos self.total_loss_epoch += loss.item() self.last_loss = loss.item() self.last_loss_ce = loss_ce.item() if self.alpha_mlm > 0.0: self.last_loss_mlm = loss_mlm.item() if self.alpha_clm > 0.0: self.last_loss_clm = loss_clm.item() if self.alpha_mse > 0.0: self.last_loss_mse = loss_mse.item() if self.alpha_cos > 0.0: self.last_loss_cos = 
loss_cos.item() self.optimize(loss) self.n_sequences_epoch += input_ids.size(0) def optimize(self, loss): """ Normalization on the loss (gradient accumulation or distributed training), followed by backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation). Also update the metrics for tensorboard. """ # Check for NaN if (loss != loss).data.any(): logger.error("NaN detected") exit() if self.multi_gpu: loss = loss.mean() if self.params.gradient_accumulation_steps > 1: loss = loss / self.params.gradient_accumulation_steps if self.fp16: from apex import amp with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.iter() if self.n_iter % self.params.gradient_accumulation_steps == 0: if self.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.params.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm) self.optimizer.step() self.optimizer.zero_grad() self.scheduler.step() def iter(self): """ Update global counts, write to tensorboard and save checkpoint. """ self.n_iter += 1 self.n_total_iter += 1 if self.n_total_iter % self.params.log_interval == 0: self.log_tensorboard() self.last_log = time.time() if self.n_total_iter % self.params.checkpoint_interval == 0: self.save_checkpoint() def log_tensorboard(self): """ Log into tensorboard. Only by the master process. 
""" if not self.is_master: return for param_name, param in self.student.named_parameters(): self.tensorboard.add_scalar(tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter) if param.grad is None: continue self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter) self.tensorboard.add_scalar( tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.n_total_iter, ) self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter) if self.alpha_mlm > 0.0: self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter) if self.alpha_clm > 0.0: self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter) if self.alpha_mse > 0.0: self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) if self.alpha_cos > 0.0: self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) self.tensorboard.add_scalar( tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000, global_step=self.n_total_iter, ) self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter) def end_epoch(self): """ Finally arrived at 
the end of epoch (full pass on dataset). Do some tensorboard logging and checkpoint saving. """ logger.info( f"{self.n_sequences_epoch} sequences have been trained during this epoch." ) if self.is_master: self.save_checkpoint( checkpoint_name=f"model_epoch_{self.epoch}.pth") self.tensorboard.add_scalar(tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch) self.epoch += 1 self.n_sequences_epoch = 0 self.n_iter = 0 self.total_loss_epoch = 0 def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"): """ Save the current state. Only by the master process. """ if not self.is_master: return mdl_to_save = self.student.module if hasattr( self.student, "module") else self.student mdl_to_save.config.save_pretrained(self.dump_path) state_dict = mdl_to_save.state_dict() torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
def main():
    """Command-line entry point: continue BERT LM pre-training on a corpus.

    Parses the arguments, sets up CUDA (and NCCL when `--local_rank` is
    given), builds the dataset, model and optimizer, runs the training loop
    with gradient accumulation and finally writes `pytorch_model.bin` to
    `--output_dir`.

    Raises:
        ValueError: if `--gradient_accumulation_steps` is < 1, or if
            `--do_train` was not requested.
        ImportError: if apex is required (distributed / fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        # Allocating a CUDA tensor up front touches the GPU before anything
        # else does; the tensor itself is not used.
        # NOTE(review): `--no_cuda` still ends up on the GPU because the rest
        # of this function calls `.cuda()` unconditionally.
        torch.cuda.FloatTensor(1)
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # The requested batch size is the effective one; each forward pass only
    # sees its share of it.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # This parser registers no `--do_eval` flag, so read it defensively:
    # a missing attribute must produce the ValueError below, not an
    # AttributeError.
    if not args.do_train and not getattr(args, "do_eval", False):
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    num_train_steps = None
    if args.do_train:
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.cuda()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on file.__next__
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Only update the weights once a full effective batch has
                # been accumulated.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logger.info("** ** * Saving fine - tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
def __init__(self, params: dict, dataset: LmSeqsDataset,
             token_probs: torch.tensor, student: nn.Module,
             teacher: nn.Module):
    """Set up the distillation run: data loading, loss terms, optimizer,
    LR schedule, optional fp16 / distributed wrapping and tensorboard.

    Args:
        params: run configuration; read through attribute access
            (`params.dump_path`, `params.batch_size`, ...).
        dataset: training sequences; must expose `lengths` (when
            `params.group_by_size` is set) and `batch_sequences`.
        token_probs: per-token probabilities; only stored when
            `params.mlm` is set.
        student: the model being trained; must expose `.config`.
        teacher: the reference model.

    Raises:
        AssertionError: on an invalid temperature, masking proportion,
            masking-probability split or gradient accumulation setting.
        ImportError: if fp16 is requested but apex is not installed.
    """
    logger.info("Initializing Distiller")
    self.params = params
    self.dump_path = params.dump_path
    self.multi_gpu = params.multi_gpu
    self.fp16 = params.fp16

    self.student = student
    self.teacher = teacher

    self.student_config = student.config
    self.vocab_size = student.config.vocab_size

    # Pick the index sampler first, then wrap it into a batch sampler.
    if params.n_gpu <= 1:
        sampler = RandomSampler(dataset)
    else:
        sampler = DistributedSampler(dataset)

    if params.group_by_size:
        groups = create_lengths_groups(lengths=dataset.lengths,
                                       k=params.max_model_input_size)
        sampler = GroupedBatchSampler(sampler=sampler,
                                      group_ids=groups,
                                      batch_size=params.batch_size)
    else:
        sampler = BatchSampler(sampler=sampler,
                               batch_size=params.batch_size,
                               drop_last=False)

    self.dataloader = DataLoader(dataset=dataset,
                                 batch_sampler=sampler,
                                 collate_fn=dataset.batch_sequences)

    self.temperature = params.temperature
    assert self.temperature > 0.0

    self.alpha_ce = params.alpha_ce
    self.alpha_mlm = params.alpha_mlm
    self.alpha_clm = params.alpha_clm
    self.alpha_mse = params.alpha_mse
    self.alpha_cos = params.alpha_cos

    self.mlm = params.mlm
    if self.mlm:
        logger.info("Using MLM loss for LM step.")
        self.mlm_mask_prop = params.mlm_mask_prop
        assert 0.0 <= self.mlm_mask_prop <= 1.0
        # The three probabilities must sum to one. Compare with a tolerance:
        # an exact `== 1.0` rejects valid splits such as 0.7/0.2/0.1, whose
        # floating-point sum is 0.9999999999999999.
        assert math.isclose(
            params.word_mask + params.word_keep + params.word_rand, 1.0)
        self.pred_probs = torch.FloatTensor(
            [params.word_mask, params.word_keep, params.word_rand])
        self.pred_probs = self.pred_probs.to(
            f"cuda:{params.local_rank}"
        ) if params.n_gpu > 0 else self.pred_probs
        self.token_probs = token_probs.to(
            f"cuda:{params.local_rank}"
        ) if params.n_gpu > 0 else token_probs
        if self.fp16:
            self.pred_probs = self.pred_probs.half()
            self.token_probs = self.token_probs.half()
    else:
        logger.info("Using CLM loss for LM step.")

    # Running counters and the most recent value of each loss term.
    self.epoch = 0
    self.n_iter = 0
    self.n_total_iter = 0
    self.n_sequences_epoch = 0
    self.total_loss_epoch = 0
    self.last_loss = 0
    self.last_loss_ce = 0
    self.last_loss_mlm = 0
    self.last_loss_clm = 0
    if self.alpha_mse > 0.0:
        self.last_loss_mse = 0
    if self.alpha_cos > 0.0:
        self.last_loss_cos = 0
    self.last_log = 0

    self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
    if self.alpha_mse > 0.0:
        self.mse_loss_fct = nn.MSELoss(reduction="sum")
    if self.alpha_cos > 0.0:
        self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

    logger.info("--- Initializing model optimizer")
    assert params.gradient_accumulation_steps >= 1
    self.num_steps_epoch = len(self.dataloader)
    num_train_optimization_steps = (
        int(self.num_steps_epoch / params.gradient_accumulation_steps *
            params.n_epoch) + 1)

    # Parameters whose name contains one of these fragments get no weight
    # decay; frozen parameters are left out of the optimizer entirely.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in student.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": params.weight_decay,
        },
        {
            "params": [
                p for n, p in student.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
    ]
    logger.info(
        "------ Number of trainable parameters (student): %i" % sum([
            p.numel() for p in self.student.parameters() if p.requires_grad
        ]))
    logger.info("------ Number of parameters (student): %i" %
                sum([p.numel() for p in self.student.parameters()]))
    self.optimizer = AdamW(optimizer_grouped_parameters,
                           lr=params.learning_rate,
                           eps=params.adam_epsilon,
                           betas=(0.9, 0.98))

    warmup_steps = math.ceil(num_train_optimization_steps *
                             params.warmup_prop)
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps)

    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        logger.info(
            f"Using fp16 training: {self.params.fp16_opt_level} level")
        self.student, self.optimizer = amp.initialize(
            self.student,
            self.optimizer,
            opt_level=self.params.fp16_opt_level)
        self.teacher = self.teacher.half()

    if self.multi_gpu:
        if self.fp16:
            from apex.parallel import DistributedDataParallel

            logger.info(
                "Using apex.parallel.DistributedDataParallel for distributed training."
            )
            self.student = DistributedDataParallel(self.student)
        else:
            from torch.nn.parallel import DistributedDataParallel

            logger.info(
                "Using nn.parallel.DistributedDataParallel for distributed training."
            )
            self.student = DistributedDataParallel(
                self.student,
                device_ids=[params.local_rank],
                output_device=params.local_rank,
                find_unused_parameters=True,
            )

    self.is_master = params.is_master
    if self.is_master:
        logger.info("--- Initializing Tensorboard")
        self.tensorboard = SummaryWriter(
            log_dir=os.path.join(self.dump_path, "log", "train"))
        self.tensorboard.add_text(tag="config/training",
                                  text_string=str(self.params),
                                  global_step=0)
        self.tensorboard.add_text(tag="config/student",
                                  text_string=str(self.student_config),
                                  global_step=0)
def test_apex(mixed_precision, cuda_parallel, batch_size, opt_level):
    """Train EfficientNet-b0 on fold 0 under one precision/parallelism setup.

    `mixed_precision` switches on apex amp at `opt_level`; `cuda_parallel`
    wraps the model in DistributedDataParallel (with amp) or nn.DataParallel
    (without). With both flags set, `--local_rank` is read from the command
    line and the NCCL process group is initialised first.
    """
    # not support in nb
    if mixed_precision and cuda_parallel:
        cli = argparse.ArgumentParser()
        cli.add_argument("--local_rank", type=int)
        args = cli.parse_args()
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')

    # load data
    pickle_paths = [
        "../data/processed_data/size_224/train_data_0.pickle",
        # "../data/processed_data/size_224/train_data_1.pickle",
        # "../data/processed_data/size_224/train_data_2.pickle",
        # "../data/processed_data/size_224/train_data_3.pickle",
    ]
    image_data, name_data, label_data = load_data(pickle_paths)

    num_workers = 6
    pin_memory = True
    n_epoch = 120
    n_splits = 5
    random_seed = 2020

    train_folds, valid_folds = generate_stratified_k_fold_index(
        image_data, label_data, n_splits, random_seed)
    train_idx, valid_idx = train_folds[0], valid_folds[0]

    # create loss function
    criterion = Loss_combine()

    # create data_transforms: both phases use the same pipeline, each with
    # its own Compose instance
    data_transforms = {
        phase: transforms.Compose([
            transforms.ToPILImage(),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
        ])
        for phase in ('train', 'val')
    }

    # create model
    eff_b0 = EfficientNet_0()

    # create optimizer
    optimizer_ft = optim.Adam(eff_b0.parameters())

    # create data loader
    data_loaders = create_dataloaders(image_data, name_data, label_data,
                                      train_idx, valid_idx, data_transforms,
                                      batch_size, num_workers, pin_memory)

    # Every configuration starts from the model on the GPU.
    eff_b0.to("cuda")
    if mixed_precision:
        eff_b0, optimizer_ft = amp.initialize(eff_b0,
                                              optimizer_ft,
                                              opt_level=opt_level)
        if cuda_parallel:
            eff_b0 = DistributedDataParallel(eff_b0)
    elif cuda_parallel:
        eff_b0 = nn.DataParallel(eff_b0)

    callbacks = train_model(eff_b0,
                            criterion,
                            optimizer_ft,
                            data_loaders,
                            mixed_precision,
                            {},
                            num_epochs=n_epoch,
                            epoch_scheduler=None,
                            save_dir=None)
def main():
    """Command-line entry point: fine-tune and/or evaluate BertForSDP.

    With `--do_train` the model is trained on (a percentage of) the training
    examples and the weights, config and `model_config.json` are written to
    `--output_dir`. Without it, previously saved weights are loaded from
    `--output_dir`. With `--do_eval` (on the main process only) the dev set
    is scored, the metrics are printed and `predictions.txt` is written.

    Raises:
        ValueError: for an invalid `--gradient_accumulation_steps`, when
            neither `--do_train` nor `--do_eval` is given, when training into
            a non-empty output directory, or for an unknown task name.
        ImportError: if apex is required (distributed / fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--percent",
        default=100,
        type=int,
        help="The percentage of examples used in the training data.\n")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--pretrain',
        action='store_true',
        help="Whether to load a pre-trained model for continuing training")
    parser.add_argument('--pretrained_model_file',
                        type=str,
                        help="The path of the pretrained_model_file")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"sdp": SDPProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The requested batch size is the effective one; each forward pass only
    # sees its share of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Refuse to train into a directory that already holds files.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # Keep only the leading `--percent` share of the training examples.
        train_examples = train_examples[:int(
            len(train_examples) * args.percent / 100)]
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSDP.from_pretrained(args.bert_model,
                                       cache_dir=cache_dir,
                                       num_labels=num_labels)

    if args.pretrain:
        # Load a pre-trained model
        print('load a pre-trained model from ' + args.pretrained_model_file)
        # Deserialize onto the CPU so a checkpoint written on a GPU machine
        # also loads on a CPU-only host; `load_state_dict` copies the values
        # into the model's own parameters.
        pretrained_state_dict = torch.load(args.pretrained_model_file,
                                           map_location="cpu")
        model_state_dict = model.state_dict()
        print('pretrained_state_dict', pretrained_state_dict.keys())
        print('model_state_dict', model_state_dict.keys())
        # Only take over tensors that exist in this model with the same size.
        pretrained_state = {
            k: v
            for k, v in pretrained_state_dict.items()
            if k in model_state_dict and v.size() == model_state_dict[k].size()
        }
        model_state_dict.update(pretrained_state)
        print('updated_state_dict', model_state_dict.keys())
        model.load_state_dict(model_state_dict)
        model.to(device)
        # NOTE(review): the BERT encoder is frozen here on the assumption
        # that freezing belongs to the `--pretrain` branch — confirm against
        # the original indentation.
        for param in model.bert.parameters():
            param.requires_grad = False

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_dependencies = torch.tensor(
            [f.dependencies for f in train_features], dtype=torch.long)
        all_dependency_labels = torch.tensor(
            [f.dependency_labels for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_dependencies,
                                   all_dependency_labels)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, dependencies, dependency_labels = batch
                loss = model(input_ids, segment_ids, input_mask, dependencies,
                             dependency_labels)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Only update the weights once a full effective batch has
                # been accumulated.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # NOTE(review): reported once per epoch, since `tr_loss` is reset
            # at the top of every epoch — confirm against the original
            # indentation.
            print('train loss', tr_loss)

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        label_map = {i: label for i, label in enumerate(label_list, 0)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        # Use a context manager so the file is flushed and closed
        # deterministically.
        with open(os.path.join(args.output_dir, "model_config.json"),
                  "w") as config_file:
            json.dump(model_config, config_file)
    else:
        # Load a trained model and config that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        # Read the checkpoint once, onto the CPU, and reuse it for both the
        # construction and the explicit load below.
        model_state_dict = torch.load(output_model_file, map_location="cpu")
        model = BertForSDP.from_pretrained(args.bert_model,
                                           state_dict=model_state_dict,
                                           num_labels=num_labels)
        model.load_state_dict(model_state_dict)

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_dependencies = torch.tensor(
            [f.dependencies for f in eval_features], dtype=torch.long)
        all_dependency_labels = torch.tensor(
            [f.dependency_labels for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_dependencies,
                                  all_dependency_labels)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        # Per-example (heads, labels, mask) triples: gold values as numpy
        # arrays, predictions as the model's raw per-example outputs.
        y_true = []
        y_pred = []
        for input_ids, input_mask, segment_ids, dependencies, dependency_labels in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            dependencies = dependencies.to(device)
            dependency_labels = dependency_labels.to(device)

            with torch.no_grad():
                dep_logits, label_logits = model(input_ids, segment_ids,
                                                 input_mask)

            dependencies = dependencies.to('cpu').numpy()
            dependency_labels = dependency_labels.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i in range(len(input_mask)):
                y_true.append(
                    (dependencies[i], dependency_labels[i], input_mask[i]))
                y_pred.append((dep_logits[i], label_logits[i], input_mask[i]))

        gold_relations = []
        pred_relations = []
        for gold, pred in zip(y_true, y_pred):
            gold_relations.append(get_tuples(gold[0], gold[1], gold[2]))
            pred_relations.append(get_tuples(pred[0], pred[1], pred[2]))

        precision, recall, F1, label_precision, label_recall, label_F1 = get_results(
            gold_relations, pred_relations)
        print('precision', precision)
        print('recall', recall)
        print('F1', F1)
        print('label precision', label_precision)
        print('label recall', label_recall)
        print('label F1', label_F1)
        prediction_file = os.path.join(args.output_dir, 'predictions.txt')
        write_predictions(eval_features, gold_relations, pred_relations,
                          prediction_file)
def main():
    """Run the teacher -> pseudo-label -> student training pipeline.

    Steps performed when ``--do_train`` is set:

    0. Convert the training examples to features.
    1. Train the teacher model on the labelled training set.
    2. Let the teacher predict class scores for the dev set.
    3. For every class keep the ``top_k`` highest-scoring dev examples.
    4. Train the student on those examples using the teacher's labels.
    5. Continue training the student on the true training data.

    When ``--do_eval`` is set (and this is the main process) the teacher is
    evaluated once more at the end.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when neither
            ``--do_train`` nor ``--do_eval`` is given, when training into a
            non-empty output directory, or for an unknown task name.
        ImportError: when apex is required (distributed / fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    # ---- required parameters ----
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--task_name", default=None, type=str, required=True,
        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    # ---- optional parameters ----
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
             "Sequences longer than this will be truncated, and sequences shorter \n"
             "than this will be padded.")
    parser.add_argument(
        "--do_train", action='store_true',
        help="Whether to run training.")
    parser.add_argument(
        "--do_eval", action='store_true',
        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument(
        "--train_batch_size", default=32, type=int,
        help="Total batch size for training.")
    parser.add_argument(
        "--eval_batch_size", default=8, type=int,
        help="Total batch size for eval.")
    parser.add_argument(
        "--learning_rate", default=5e-5, type=float,
        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float,
        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
             "E.g., 0.1 = 10%% of training.")
    parser.add_argument(
        "--no_cuda", action='store_true',
        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--local_rank", type=int, default=-1,
        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--seed', type=int, default=42,
        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
             "0 (default value): dynamic loss scaling.\n"
             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--server_ip', type=str, default='',
        help="Can be used for distant debugging.")
    parser.add_argument(
        '--server_port', type=str, default='',
        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Remote debugging, see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Task name -> processor class.
    processors = {
        "aus": OOCLAUSProcessor,
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "dbpedia": DBpediaProcessor,
        "yelp": YelpProcessor,
        "trec": TrecProcessor,
    }

    # Task name -> number of target classes.
    num_labels_task = {
        "aus": 33,
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "dbpedia": len(DBpediaProcessor().get_labels()),
        "trec": len(TrecProcessor().get_labels()),
        "yelp": len(YelpProcessor().get_labels()),
    }

    use_distributed = args.local_rank != -1 and not args.no_cuda
    if use_distributed:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # The distributed backend takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    else:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not (args.do_train or args.do_eval):
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps
        ) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # ---- models: teacher first, then student (both start from the same checkpoint) ----
    cache_dir = args.cache_dir
    model_teacher = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    model_student = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model_teacher.half()
        model_student.half()
    model_teacher.to(device)
    model_student.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model_teacher = DDP(model_teacher)
        model_student = DDP(model_student)
    elif n_gpu > 1:
        model_teacher = torch.nn.DataParallel(model_teacher)
        model_student = torch.nn.DataParallel(model_student)

    # ---- optimizers ----
    def _grouped_parameters(model):
        """Split parameters into a weight-decayed group and a no-decay group."""
        named_params = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        with_decay = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
        without_decay = [p for n, p in named_params if any(nd in n for nd in no_decay)]
        return [
            {'params': with_decay, 'weight_decay': 0.01},
            {'params': without_decay, 'weight_decay': 0.0},
        ]

    optimizer_grouped_parameters_t = _grouped_parameters(model_teacher)
    optimizer_grouped_parameters_s = _grouped_parameters(model_student)

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer_t = FusedAdam(optimizer_grouped_parameters_t,
                                lr=args.learning_rate,
                                bias_correction=False,
                                max_grad_norm=1.0)
        optimizer_s = FusedAdam(optimizer_grouped_parameters_s,
                                lr=args.learning_rate,
                                bias_correction=False,
                                max_grad_norm=1.0)
        # loss_scale == 0 selects dynamic scaling, anything else is a static scale.
        if args.loss_scale == 0:
            scale_kwargs = {'dynamic_loss_scale': True}
        else:
            scale_kwargs = {'static_loss_scale': args.loss_scale}
        optimizer_t = FP16_Optimizer(optimizer_t, **scale_kwargs)
        optimizer_s = FP16_Optimizer(optimizer_s, **scale_kwargs)
    else:
        optimizer_t = BertAdam(optimizer_grouped_parameters_t,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               t_total=num_train_optimization_steps)
        optimizer_s = BertAdam(optimizer_grouped_parameters_s,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               t_total=num_train_optimization_steps)

    # NOTE(review): `global_step` is read below but never assigned in this
    # function; presumably it is a module-level global -- confirm.
    banner = "*" * 10

    if args.do_train:
        # step 0: load train examples
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        input_ids_train = np.array([f.input_ids for f in train_features])
        input_mask_train = np.array([f.input_mask for f in train_features])
        segment_ids_train = np.array([f.segment_ids for f in train_features])
        label_ids_train = np.array([f.label_id for f in train_features])

        # step 1: train the teacher model
        print(banner + "train teacher model" + banner)
        print("input ids shape", input_ids_train.shape)
        print("label ids shape", label_ids_train.shape)
        train_dataloader = load_train_data(
            args, input_ids_train, input_mask_train, segment_ids_train, label_ids_train)
        num_train_epochs = 10
        model_teacher = train(model_teacher, args, n_gpu, optimizer_t,
                              num_train_optimization_steps, num_labels,
                              train_dataloader, device, num_train_epochs)
        model_teacher.to(device)
        print()
        print("teacher model accuracy:")
        evaluate_for_dbpedia(model_teacher, args, processor, device, global_step,
                             task_name, label_list, tokenizer)

        # step 2: predict the dev set with the teacher
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)

        # numpy copies are used later for fancy-indexing the selected rows
        input_ids_val_all = np.array([f.input_ids for f in eval_features])
        input_mask_val_all = np.array([f.input_mask for f in eval_features])
        segment_ids_val_all = np.array([f.segment_ids for f in eval_features])
        label_ids_val_all = np.array([f.label_id for f in eval_features])

        input_ids_val = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        input_mask_val = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        segment_ids_val = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        label_ids_val = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(input_ids_val, input_mask_val, segment_ids_val, label_ids_val)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        print("start predict")
        start_time = time.time()
        probas_val = predict_for_dbpedia(model_teacher, args, eval_dataloader, device)
        end_time = time.time()
        label_ids_predict = np.array(probas_val)
        print("predict cost time", end_time - start_time)
        print(label_ids_predict.shape)

        # step 3: choose the top-k dev examples per class and build the student set
        top_k = 200
        type_len = label_ids_predict.shape[1]
        chosen = set()
        for class_idx in range(type_len):
            ranked = sorted(range(len(probas_val)),
                            key=lambda k: probas_val[k][class_idx],
                            reverse=True)
            chosen.update(ranked[:top_k])
        # ascending example indices, each selected example exactly once
        permutation = np.array(sorted(chosen))

        input_ids_stu = input_ids_val_all[permutation]
        input_mask_stu = input_mask_val_all[permutation]
        segment_ids_stu = segment_ids_val_all[permutation]
        label_ids_stu = label_ids_predict[permutation]
        label_ids_true = label_ids_val_all[permutation]
        # teacher's hard label = index of the highest score in each row
        label_ids_stu = np.argmax(label_ids_stu, axis=1)
        print("student label distribution", collections.Counter(label_ids_stu))

        # step 4: train the student model with teacher-labelled data
        print(banner + "train student model" + banner)
        print("train set:", input_ids_stu.shape)
        print("input_mask_stu", input_mask_stu.shape)
        print("label_ids_stu", label_ids_stu.shape)
        print("predict", label_ids_stu[100:150])
        print("true", label_ids_true[100:150])  # for debug
        train_dataloader_stu = load_train_data(
            args, input_ids_stu, input_mask_stu, segment_ids_stu, label_ids_stu)
        model_student.train()
        model_student = train(model_student, args, n_gpu, optimizer_s,
                              num_train_optimization_steps, num_labels,
                              train_dataloader_stu, device, 2)
        model_student.to(device)
        print()
        print("student before ft:")
        model_student.eval()
        evaluate_for_dbpedia(model_student, args, processor, device, global_step,
                             task_name, label_list, tokenizer)

        # step 5: refine the student model with the true training data
        print(banner + "refine student model" + banner)
        print("train set:", input_ids_train.shape)
        model_student.train()
        model_student = train(model_student, args, n_gpu, optimizer_s,
                              num_train_optimization_steps, num_labels,
                              train_dataloader, device, args.num_train_epochs)
        model_student.to(device)
        print()
        print("student after ft:")
        model_student.eval()
        evaluate_for_dbpedia(model_student, args, processor, device, global_step,
                             task_name, label_list, tokenizer)

    # final evaluation of the teacher, main process only
    is_main_process = args.local_rank == -1 or torch.distributed.get_rank() == 0
    if args.do_eval and is_main_process:
        logger.info("***** Running teacher evaluation *****")
        evaluate_for_dbpedia(model_teacher, args, processor, device, global_step,
                             task_name, label_list, tokenizer)
def main():
    """Train and/or evaluate an image classifier described by the global ``args``.

    Sets up optional distributed and fp16 execution, builds the model, loss
    and SGD optimizer, optionally resumes from a checkpoint, creates the
    ImageFolder train/val loaders and then either runs a single validation
    pass (``args.evaluate``) or the epoch loop with checkpointing on rank 0.
    """
    global best_prec1, args
    global model_params, master_params

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # ---- model ----
    model_factory = models.__dict__[args.arch]
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = model_factory(pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = model_factory()

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared_param turns off bucketing in DDP; for lower-latency runs
        # this can improve perf
        model = DDP(model, shared_param=True)

    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # ---- loss function (criterion) and optimizer ----
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(master_params, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # ---- optionally resume from a checkpoint ----
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ---- data loading ----
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    # The inception val_size was chosen arbitrarily and can be adjusted.
    crop_size, val_size = (299, 320) if args.arch == "inception_v3" else (224, 256)

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        # transforms.ToTensor() is too slow here; normalize is skipped as well
    ])
    train_dataset = datasets.ImageFolder(traindir, train_transform)

    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(train_dataset)
        if args.distributed else None
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler,
        collate_fn=fast_collate)

    val_transform = transforms.Compose([
        transforms.Resize(val_size),
        transforms.CenterCrop(crop_size),
    ])
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, val_transform),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        collate_fn=fast_collate)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint (rank 0 only)
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }
            save_checkpoint(state, is_best)
def main():
    """Fine-tune and/or run BERT question answering on SQuAD-format data.

    Parses the command line, sets up the device (single process, multi-GPU
    or distributed), and then:

    * with ``--do_train``: reads ``--train_file`` (optionally only the first
      ``--percent`` percent of examples), converts it to features, trains the
      model and saves the weights to ``<output_dir>/pytorch_model.bin``;
    * reloads the weights from that file;
    * with ``--do_predict`` (main process only): runs the model over
      ``--predict_file`` and writes ``predictions.json`` and
      ``nbest_predictions.json`` into ``--output_dir``.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when neither
            ``--do_train`` nor ``--do_predict`` is given, when the matching
            input file is missing, or when training into a non-empty output
            directory.
        ImportError: when apex is required (distributed / fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; an unescaped "%" makes --help raise.
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--percent", default=100, type=float,
                        help="The percentage of examples used in the training data.\n")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass batch size.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        # Report which directory is the problem.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True)
        # Keep only the first `percent` percent of the training examples.
        train_examples = train_examples[:int(len(train_examples) * args.percent / 100)]
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    t_total = num_train_steps
    # num_train_steps is None in predict-only runs; only divide a real count.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        if t_total is None:
            t_total = 0
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            # The cache lookup is deliberately pointed at a placeholder path so
            # that features are regenerated; the real lookup would be:
            # with open(cached_train_features_file, "rb") as reader:
            # NOTE(review): pickle.load must only ever see trusted files.
            with open('not load features', "rb") as reader:
                train_features = pickle.load(reader)
                print('not load stored features, but generate itself')
        except Exception:
            # Any failure to read the cache falls back to regenerating features.
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        logger.info("***** Running training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            total_loss = 0.0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                # Accumulate a plain float so the running total does not keep
                # every step's loss tensor (and its autograd history) alive.
                total_loss += loss.item()

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
                                                                      args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            print('loss', total_loss)

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(eval_examples))
        logger.info(" Num split examples = %d", len(eval_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        # Position of each feature in eval_features, carried through the loader
        # so results can be matched back to their feature.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
def _evaluate_split(args, model, device, examples, label_list, tokenizer,
                    split, train_loss, global_step):
    """Score ``model`` on ``examples`` and write the metrics to disk.

    Args:
        args: Parsed command-line namespace; ``max_seq_length``,
            ``eval_batch_size`` and ``output_dir`` are read.
        model: Classifier called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; the call result is unpacked into three values
            (loss, logits, one ignored extra).
        device: Device the input tensors are moved to.
        examples: Examples to featurize and score.
        label_list: Labels forwarded to ``convert_examples_to_features``.
        tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
        split: ``"test"`` selects the test banners, metric names and
            ``test_results.txt``; any other value is treated as ``"eval"``.
        train_loss: Value reported under the ``loss`` key (may be ``None``).
        global_step: Value reported under the ``global_step`` key.
    """
    if split == "test":
        run_banner = "***** Running testing *****"
        progress_desc = "Testing"
        result_banner = "***** Test results *****"
    else:
        split = "eval"
        run_banner = "***** Running evaluation *****"
        progress_desc = "Evaluating"
        result_banner = "***** Eval results *****"

    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)
    logger.info(run_banner)
    logger.info(" Num examples = %d", len(examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data, in file order.
    dataloader = DataLoader(data, sampler=SequentialSampler(data),
                            batch_size=args.eval_batch_size)

    model.eval()
    total_loss, total_accuracy = 0, 0
    nb_steps, nb_examples = 0, 0
    # The segment ids are part of the dataset but are not fed to the model.
    for input_ids, input_mask, _segment_ids, label_ids in tqdm(dataloader, desc=progress_desc):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            batch_loss, logits, _ = model(input_ids=input_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        # NOTE(review): `accuracy` is assumed to return a per-batch count of
        # correct predictions, since the sum is divided by the example count.
        total_accuracy += accuracy(logits, label_ids)
        total_loss += batch_loss.mean().item()
        nb_examples += input_ids.size(0)
        nb_steps += 1

    result = {
        '%s_loss' % split: total_loss / nb_steps,
        '%s_accuracy' % split: total_accuracy / nb_examples,
        'global_step': global_step,
        'loss': train_loss,
    }

    output_file = os.path.join(args.output_dir, "%s_results.txt" % split)
    with open(output_file, "w") as writer:
        logger.info(result_banner)
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))


def main():
    """Fine-tune and/or evaluate a BERT sequence classifier.

    Parses the command line via ``setup_parser``, picks the data processor for
    ``--task_name``, optionally trains (``--do_train``), then optionally scores
    the dev set (``--do_eval``) and the test set (``--do_test``).  Metrics are
    written to ``eval_results.txt`` / ``test_results.txt`` in ``--output_dir``.

    Raises:
        ValueError: If ``--gradient_accumulation_steps`` is below 1, if neither
            ``--do_train`` nor ``--do_eval`` is given, or if the task name is
            unknown.
        ImportError: If distributed or fp16 training is requested without apex.
    """
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model in ('biobert', 'clinical_bert'):
        args.bert_model = args.model_loc

    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "goc": GOCProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "mednli": 3,
        "goc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch; each forward pass sees a slice.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Checkpoints and result files are written here, so make sure it exists.
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Preview up to three examples per split.  Slicing (rather than indexing
    # 0..2) keeps this from failing on splits with fewer than three examples.
    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(ex.text_a, ex.text_b, ex.label) for ex in train[:3]])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(ex.text_a, ex.text_b, ex.label) for ex in dev[:3]])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(ex.text_a, ex.text_b, ex.label) for ex in test[:3]])

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # Reuse the examples loaded for the preview instead of re-reading them.
        train_examples = train
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps // torch.distributed.get_world_size())

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer.  Only needed for training: the step count is None in
    # eval-only runs, so building the warmup schedule there would fail.
    optimizer = None
    scheduler = None
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            # No scheduler here: the fp16 path sets the learning rate by hand
            # in the training loop.
        else:
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              correct_bias=False)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=float(num_train_optimization_steps) * args.warmup_proportion,
                num_training_steps=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            # Running loss is per epoch; the value of the last epoch is what
            # ends up in the result files.
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                # The segment ids are part of the batch but are not fed to the model.
                input_ids, input_mask, _segment_ids, label_ids = batch
                loss, _, _ = model(input_ids=input_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                tr_loss += loss.item()
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses;
                        # the non-fp16 path relies on the scheduler instead
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    if scheduler is not None:
                        scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Saving checkpoint
            save_checkpoint(model, args.output_dir,
                            "epoch_%d_checkpoint.pth" % epoch_num)

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model)
    model.to(device)

    # Average training loss of the last epoch, reported next to the metrics.
    train_loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps else None

    # Only the main process evaluates.
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        _evaluate_split(args, model, device, dev, label_list, tokenizer,
                        "eval", train_loss, global_step)

    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        _evaluate_split(args, model, device, test, label_list, tokenizer,
                        "test", train_loss, global_step)
def _repair_eos_tokens(input_ids, more_than_one_2, less_than_one_2):
    """Force every row of ``input_ids`` to contain exactly one token id 2.

    Rows with several 2s have the leading ones overwritten with 0 until a
    single one is left; rows without any get a 2 written into their last
    position.  ``input_ids`` is modified in place.

    Args:
        input_ids: 2-D tensor of token ids (one row per sequence).
        more_than_one_2: Running count of surplus 2s removed so far.
        less_than_one_2: Running count of rows that had no 2 so far.

    Returns:
        The updated ``(more_than_one_2, less_than_one_2)`` counters.
    """
    for row_idx, input_id in enumerate(input_ids):
        print(input_id[input_id == 2])
        print(len(input_id[input_id == 2]))
        if len(input_id[input_id == 2]) > 1:
            for col_idx, token in enumerate(input_id):
                if token == 2:
                    print("Befor:", input_id)
                    input_ids[row_idx][col_idx] = 0
                    more_than_one_2 += 1
                    print("more_than_one_2:", more_than_one_2)
                    print("After:", input_id)
                    # Stop once exactly one 2 is left.  (The count must be
                    # compared to 1; taking len() of the comparison result
                    # is truthy whenever any 2 remains.)
                    if len(input_id[input_id == 2]) == 1:
                        break
        elif len(input_id[input_id == 2]) < 1:
            print("Error!! Have no id=2 </s>")
            less_than_one_2 += 1
            print("less_than_one_2:", less_than_one_2)
            print(input_id)
            input_ids[row_idx][-1] = 2
        else:
            print("ids_2 == 1")
    return more_than_one_2, less_than_one_2


def main():
    """Pre-train a knowledge-enhanced BERT model from an indexed dataset.

    Parses the command line, builds the batch iterator (when ``--do_train`` is
    set), loads ``BertForPreTraining`` from ``--bert_model``, runs the training
    loop while logging per-step losses to a ``loss.<timestamp>`` file inside
    ``--output_dir``, and finally saves the weights as ``pytorch_model.bin``.

    Raises:
        ValueError: If ``--gradient_accumulation_steps`` is below 1, if neither
            ``--do_train`` nor ``--do_eval`` is given, or if the output
            directory already exists and is not empty.
        ImportError: If distributed or fp16 training is requested without apex.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    ## Knowledge-graph attention parameters
    parser.add_argument("--K_V_dim",
                        type=int,
                        default=100,
                        help="Key and Value dim == KG representation dim")
    parser.add_argument("--Q_dim",
                        type=int,
                        default=768,
                        help="Query dim == Bert six output layer representation dim")
    parser.add_argument('--graphsage',
                        default=False,
                        action='store_true',
                        help="Whether to use Attention GraphSage instead of GAT")
    # NOTE(review): store_true with default=True means this flag is always True.
    parser.add_argument('--self_att',
                        default=True,
                        action='store_true',
                        help="Whether to use GAT")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch; each forward pass sees a slice.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    train_data = None
    num_train_steps = None
    if args.do_train:
        # TODO
        import indexed_dataset
        from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, BatchSampler
        import iterators
        train_data = indexed_dataset.IndexedDataset(args.data_dir, fix_lua_indexing=True)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        # Third positional argument is drop_last=True.
        train_sampler = BatchSampler(train_sampler, args.train_batch_size, True)

        def collate_fn(x):
            """Split flat sample rows into model inputs and entity targets.

            Each row is sliced in ``max_seq_length``-wide segments: the first
            three are returned unchanged, segment 3 holds entity indices,
            segment 4 the entity mask, and everything from segment 5 onward
            is returned as one tensor.
            """
            seq_len = args.max_seq_length
            x = torch.LongTensor(list(x))

            entity_idx = x[:, 3 * seq_len:4 * seq_len]

            # Candidate set: every distinct entity index in the batch, shifted
            # by one, and repeated once per GPU.
            uniq_idx = np.unique(entity_idx.numpy())
            ent_candidate = torch.LongTensor(uniq_idx + 1)
            ent_candidate = ent_candidate.repeat([n_gpu, 1])

            # Entity labels: position of each entity inside the candidate
            # set; -1 maps to itself.
            d = {idx: i for i, idx in enumerate(uniq_idx)}
            d[-1] = -1
            ent_labels = entity_idx.clone()
            ent_labels = ent_labels.apply_(lambda v: d[v])

            mask = x[:, 4 * seq_len:5 * seq_len]
            mask[:, 0] = 1
            entity_idx = entity_idx * mask
            entity_idx[entity_idx == 0] = -1
            ent_emb = entity_idx + 1

            return (x[:, :seq_len],
                    x[:, seq_len:2 * seq_len],
                    x[:, 2 * seq_len:3 * seq_len],
                    ent_emb,
                    mask,
                    x[:, 5 * seq_len:],
                    ent_candidate,
                    ent_labels)

        train_iterator = iterators.EpochBatchIterator(train_data, collate_fn, train_sampler)
        num_train_steps = int(
            len(train_data) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model, missing_keys = BertForPreTraining.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        args=args)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these fragments are left out of
    # the optimizer; the layer index 2 in the templates is rewritten to 11.
    no_linear = ['layer.2.output.dense_ent',
                 'layer.2.intermediate.dense_1',
                 'bert.encoder.layer.2.intermediate.dense_1_ent',
                 'layer.2.output.LayerNorm_ent']
    no_linear = [x.replace('2', '11') for x in no_linear]
    param_optimizer = [(n, p) for n, p in param_optimizer if not any(nl in n for nl in no_linear)]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'LayerNorm_ent.bias', 'LayerNorm_ent.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_data))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        model.train()
        import datetime
        loss_log_path = os.path.join(args.output_dir, "loss.{}".format(datetime.datetime.now()))
        more_than_one_2 = 0
        less_than_one_2 = 0
        # The context manager closes the loss log even if training raises.
        with open(loss_log_path, 'w') as fout:
            for _ in trange(int(args.num_train_epochs), desc="Epoch"):
                # Only the main process shows a progress bar.
                if args.local_rank == 0 or args.local_rank == -1:
                    iters = tqdm(train_iterator.next_epoch_itr(), desc="Iteration")
                else:
                    iters = train_iterator.next_epoch_itr()

                for step, batch in enumerate(iters):
                    batch = tuple(t.to(device) for t in batch)
                    (input_ids, input_mask, masked_lm_labels, input_ent, ent_mask,
                     next_sentence_label, ent_candidate, ent_labels) = batch

                    # Repair the batch when the total number of 2-tokens does
                    # not match one per sequence.
                    if len(input_ids[input_ids == 2]) != args.train_batch_size:
                        more_than_one_2, less_than_one_2 = _repair_eos_tokens(
                            input_ids, more_than_one_2, less_than_one_2)

                    loaded = load_k_v_queryR_small(input_ent, ent_candidate)
                    kv_tensors = list(loaded[:8])
                    cand_pos_tensor = loaded[8]
                    if args.fp16:
                        kv_tensors = [t.half() for t in kv_tensors]

                    # Token type ids are passed as None.
                    loss, original_loss = model(
                        input_ids, None, input_mask, masked_lm_labels, input_ent,
                        ent_mask, next_sentence_label, ent_candidate, ent_labels,
                        *kv_tensors, cand_pos_tensor)

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                        original_loss = original_loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    # Log the un-scaled loss next to the original loss.
                    fout.write("{} {}\n".format(loss.item() * args.gradient_accumulation_steps,
                                                original_loss.item()))

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        # modify learning rate with special warm up BERT uses
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                        if global_step % 100000 == 0:
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                            output_model_file = os.path.join(
                                args.output_dir, "pytorch_model.bin_{}".format(global_step))
                            torch.save(model_to_save.state_dict(), output_model_file)

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
def main():
    """Train (or only evaluate) an ImageNet classifier fed by DALI pipelines.

    Reads its configuration from the module-level ``args`` and updates the
    module-level ``best_prec1``.  Sets up the optional distributed backend,
    builds the model, loss and SGD optimizer, optionally resumes from
    ``args.resume``, then either validates once (``args.evaluate``) or runs the
    epoch loop.  Logging and checkpointing happen on local rank 0 only.
    """
    global best_prec1, args

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
        # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)

    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    resumed = False
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
            resumed = True
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Only rank 0 owns a logger.  A missing resume file falls back to a fresh
    # log; otherwise the later logger.append would hit an undefined name.
    logger = None
    if args.local_rank == 0:
        log_path = os.path.join(args.checkpoint, 'log.txt')
        if resumed:
            logger = Logger(log_path, title=title, resume=True)
        else:
            logger = Logger(log_path, title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Valid Top5.'])

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    # Each process iterates over its share of the reader's epoch.
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers,
                         device_id=args.local_rank, data_dir=valdir,
                         crop=crop_size, size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    if args.evaluate:
        validate(val_loader, model, criterion)
        if logger is not None:
            logger.close()
        return

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)

        if args.local_rank == 0:
            print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr']))

        [train_loss, train_acc, avg_train_time] = train(train_loader, model, criterion, optimizer, epoch)
        total_time.update(avg_train_time)

        # evaluate on validation set
        [test_loss, prec1, prec5] = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            # append logger file
            logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5])

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            # if need to save each epoch checkpoint, add: filename="epoch"+str(epoch+1)+"checkpoint.pth.tar"
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best, checkpoint=args.checkpoint)

            if epoch == args.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}\n'
                      '##Perf  {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg))

        # reset DALI iterators
        train_loader.reset()
        val_loader.reset()

    if logger is not None:
        logger.close()
def main():
    """Evaluate a MobileNetV3 model on the validation set.

    All configuration is read from the module-level ``args`` namespace
    (``model``, ``num_classes``, ``batch_size``, ``workers``, ``trans_mode``,
    ``val_root``, ``val_list``, ``weights``, ``ema``, ``local_rank``,
    ``dali_cpu``).  The function mutates ``args`` by setting ``distributed``,
    ``gpu`` and ``world_size``.  Top-1 / top-5 accuracy returned by
    ``validate`` is printed on local rank 0.

    Raises:
        ValueError: if ``args.model`` or ``args.trans_mode`` is not one of the
            supported values.
    """
    cudnn.enabled = True
    cudnn.benchmark = True

    # Distributed mode is inferred from the WORLD_SIZE environment variable.
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    # create model
    if args.model == 'MobileNetV3_Large':
        model = MobileNetV3_Large(args.num_classes, 0.0, False)
    elif args.model == 'MobileNetV3_Small':
        model = MobileNetV3_Small(args.num_classes, 0.0, False)
    else:
        raise ValueError('invalid type of model')

    # `memory_format` is a free name resolved outside this function;
    # when it is None the model keeps its default layout.
    model = model.cuda()
    if memory_format is not None:
        model = model.to(memory_format=memory_format)

    # For distributed training, wrap the model with
    # apex.parallel.DistributedDataParallel.  If AMP is in use this must be
    # done AFTER the call to amp.initialize (no such call is made here).
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps
        # communication with computation in the backward pass.
        # delay_allreduce delays all communication to the end of the
        # backward pass.
        model = DDP(model, delay_allreduce=True)
    else:
        model = nn.DataParallel(model)

    # define transform and initialize dataloader
    # The global batch size and worker count are split across processes.
    batch_size = args.batch_size // args.world_size
    workers = args.workers // args.world_size

    if args.trans_mode == 'tv':
        val_transform = get_val_transform()
        val_dataset = ImageList(root=args.val_root,
                                list_path=args.val_list,
                                transform=val_transform)
        val_sampler = None
        if args.distributed:
            val_sampler = torch.utils.data.distributed.DistributedSampler(
                val_dataset, shuffle=False)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 num_workers=workers,
                                                 pin_memory=True,
                                                 sampler=val_sampler,
                                                 shuffle=False)
    elif args.trans_mode == 'dali':
        pipe = HybridValPipe(batch_size=batch_size,
                             num_threads=workers,
                             device_id=args.local_rank,
                             root=args.val_root,
                             list_path=args.val_list,
                             size=256,
                             crop=224,
                             shard_id=args.local_rank,
                             num_shards=args.world_size,
                             dali_cpu=args.dali_cpu)
        pipe.build()
        val_loader = DALIClassificationIterator(
            pipe, size=int(pipe.epoch_size("Reader") / args.world_size))
    else:
        raise ValueError('invalid image transformation mode')

    # restart from weights
    if args.weights and os.path.isfile(args.weights):
        if args.local_rank == 0:
            print('loading weights from {}'.format(args.weights))
        checkpoint = torch.load(
            args.weights,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        # The checkpoint stores the EMA weights under 'ema' and the raw
        # weights under 'model'.
        if args.ema:
            model.load_state_dict(checkpoint['ema'])
        else:
            model.load_state_dict(checkpoint['model'])
    elif args.weights and args.local_rank == 0:
        # A weights path was requested but is not a file: say so instead of
        # silently reporting the accuracy of an uninitialised model.
        print("=> no weights found at '{}', evaluating the model as "
              "initialised".format(args.weights))

    val_acc_top1, val_acc_top5 = validate(val_loader, model)
    if args.local_rank == 0:
        print('Val_acc_top1: {:.2f}'.format(val_acc_top1))
        print('Val_acc_top5: {:.2f}'.format(val_acc_top5))
def main():
    """Train (or only evaluate) an ImageNet classifier using DALI loaders.

    Configuration comes from the module-level ``args`` namespace.  The
    function sets ``args.distributed`` and ``args.gpu``, may overwrite
    ``args.start_epoch`` when resuming, and updates the module-level
    ``best_prec1``, ``model_params`` and ``master_params``.  On rank 0 a
    checkpoint is saved after every validated epoch.
    """
    global best_prec1, args, model_params, master_params

    # ---- process / device setup -------------------------------------
    args.distributed = args.world_size > 1
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    else:
        args.gpu = 0

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # ---- model ------------------------------------------------------
    use_pretrained = args.pretrained
    if use_pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
    else:
        print("=> creating model '{}'".format(args.arch))
    model_factory = models.__dict__[args.arch]
    if use_pretrained:
        model = model_factory(pretrained=True, num_classes=args.num_classes)
    else:
        model = model_factory(num_classes=args.num_classes)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)

    # In fp16 mode the optimizer steps on the master parameter list
    # returned by prep_param_lists; otherwise on the model's own parameters.
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # ---- loss and optimizer -----------------------------------------
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(master_params,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # ---- optional resume --------------------------------------------
    if args.resume:
        if not os.path.isfile(args.resume):
            print("=> no checkpoint found at '{}'".format(args.resume))
        else:
            print("=> loading checkpoint '{}'".format(args.resume))

            def _onto_local_gpu(storage, loc):
                # Map every stored tensor onto this process's GPU.
                return storage.cuda(args.gpu)

            checkpoint = torch.load(args.resume, map_location=_onto_local_gpu)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))

    # ---- data loading -----------------------------------------------
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    def _dali_loader(data_dir, total_samples):
        # Build a pipeline over `data_dir`, run it once, and wrap it in an
        # iterator that yields this process's share of `total_samples`.
        # NOTE(review): device_id is the global rank, not args.gpu — confirm
        # this is intended for multi-node runs.
        dali_pipe = HybridPipe(batch_size=args.batch_size,
                               num_threads=args.workers,
                               device_id=args.rank,
                               data_dir=data_dir)
        dali_pipe.build()
        dali_pipe.run()
        from nvidia.dali.plugin.pytorch import DALIClassificationIterator
        per_process = int(total_samples / args.world_size)
        return DALIClassificationIterator(dali_pipe, size=per_process)

    # Sample counts are hard-coded (1281167 train / 50000 val).
    train_loader = _dali_loader(traindir, 1281167)
    val_loader = _dali_loader(valdir, 50000)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    # ---- epoch loop -------------------------------------------------
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            # Profiling runs stop after a single training epoch.
            break

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # Only rank 0 tracks the best prec@1 and writes checkpoints.
        if args.rank != 0:
            continue
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        state = {
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }
        save_checkpoint(state, is_best)