def __init__(self, model, mask_prob: float = 0.15, clip: int = 1, optimizer=None):
    self.model = model
    self.clip = clip
    self.optimizer = optimizer
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = self.model.to(self.device)
    self.mask_prob = mask_prob
    self.criterion = nn.NLLLoss(ignore_index=model.text_processor.pad_token_id())

    num_gpu = torch.cuda.device_count()
    if num_gpu > 1:
        print("Let's use", num_gpu, "GPUs!")
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)

    self.best_dev_loss = float("inf")
    self.best_train_loss = float("inf")
    self.last_train_loss = float("inf")
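# --- Hedged usage sketch (not from the original source). The encoding-style
# DataParallelModel returns a list of per-replica outputs, and
# DataParallelCriterion scatters the target and reduces the loss across
# replicas, so one training step with the trainer above might look like this.
# `trainer`, `batch`, and `targets` are placeholder names.
def train_step(trainer, batch, targets):
    trainer.optimizer.zero_grad()
    outputs = trainer.model(batch)              # list of outputs, one per GPU
    loss = trainer.criterion(outputs, targets)  # parallel loss, reduced to a scalar
    loss.backward()
    torch.nn.utils.clip_grad_norm_(trainer.model.parameters(), trainer.clip)
    trainer.optimizer.step()
    return loss.item()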
def createModels(args, userNum, itemNum, adj=None):
    # `adj` is only needed by SPUIGAGPCF; it was referenced but never defined
    # in the original snippet, so it is surfaced here as an argument.
    if args.model == 'SPUIGACF':
        model = SPUIGACF(userNum, itemNum,
                         embedSize=args.embedSize,
                         layers=args.layers,
                         droprate=args.droprate).cuda()
    elif args.model == 'SPUIMultiGACF':
        model = SPUIMultiGACF(userNum, itemNum,
                              embedSize=args.embedSize,
                              layers=args.layers,
                              droprate=args.droprate).cuda()
    elif args.model == 'SPUIGAGPCF':
        model = SPUIGAGPCF(userNum, itemNum, adj,
                           embedSize=args.embedSize,
                           layers=args.layers,
                           droprate=args.droprate).cuda()

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)        # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function

    optim = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    return model, lossfn, optim
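# Hedged sketch (an assumption, not from the original snippet): one
# NegSampling update with the objects returned above, where the
# DataParallelModel yields per-GPU logits that the wrapped BCE criterion
# consumes directly. The model's (users, items) call signature and the
# `users`/`items`/`ratings` batch tensors are hypothetical.
def neg_sampling_step(model, lossfn, optim, users, items, ratings):
    optim.zero_grad()
    preds = model(users.cuda(), items.cuda())  # list of per-GPU predictions
    loss = lossfn(preds, ratings.cuda())       # parallel loss, reduced to a scalar
    loss.backward()
    optim.step()
    return loss.item()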
def __init__(self, cfg: Namespace, data: Dataset):
    """
    Args:
        cfg: configuration
        data: train dataset
    """
    self.cfg = cfg
    self.train, self.valid = data.split(0.8)
    RATING_FIELD.build_vocab(self.train)

    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')    # pylint: disable=no-member
    self.batch_size = cfg.batch_size
    if torch.cuda.is_available():
        self.batch_size *= torch.cuda.device_count()

    self.trn_itr = BucketIterator(self.train, device=self.device,
                                  batch_size=self.batch_size, shuffle=True,
                                  train=True, sort_within_batch=True,
                                  sort_key=lambda exam: -len(exam.comment_text))
    self.vld_itr = BucketIterator(self.valid, device=self.device,
                                  batch_size=self.batch_size, shuffle=False,
                                  train=False, sort_within_batch=True,
                                  sort_key=lambda exam: -len(exam.comment_text))
    self.log_step = 1000
    if len(self.vld_itr) < 100:
        self.log_step = 10
    elif len(self.vld_itr) < 1000:
        self.log_step = 100

    bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased'
    self.model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=2)
    # Weight the positive class by the negative/positive example ratio.
    pos_weight = (len([exam for exam in self.train.examples if exam.target < 0.5])
                  / len([exam for exam in self.train.examples if exam.target >= 0.5]))
    pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device)    # pylint: disable=not-callable
    self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor)
    if torch.cuda.is_available():
        self.model = DataParallelModel(self.model.cuda())
        self.criterion = DataParallelCriterion(self.criterion)
    self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)
def __init__(self, model, caption_model, mask_prob: float = 0.3, clip: int = 1,
             optimizer=None, beam_width: int = 5, max_len_a: float = 1.1,
             max_len_b: int = 5, len_penalty_ratio: float = 0.8,
             nll_loss: bool = False, fp16: bool = False, mm_mode="mixed"):
    super().__init__(model, mask_prob, clip, optimizer, beam_width, max_len_a,
                     max_len_b, len_penalty_ratio, nll_loss, fp16, mm_mode)
    self.caption_model = caption_model
    self.caption_model.eval()
    self.caption_model = self.caption_model.to(self.device)
    if self.num_gpu == 1 and fp16:
        self.caption_model = amp.initialize(self.caption_model, opt_level="O2")
    if self.num_gpu > 1:
        print("Let's use", self.num_gpu, "GPUs!")
        self.caption_model = DataParallelModel(self.caption_model)
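# Hedged sketch (an assumption, not from the original trainer): once wrapped
# in DataParallelModel, the caption model returns one output per GPU, so
# callers typically concatenate the list before using it. `images` is a
# placeholder batch tensor.
def run_caption_model(trainer, images):
    with torch.no_grad():
        outputs = trainer.caption_model(images)
        if isinstance(outputs, list):  # multi-GPU case: one tensor per replica
            outputs = torch.cat([o.to(trainer.device) for o in outputs], dim=0)
    return outputs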
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)

    # info = pd.read_csv("./flower_data/train.csv")[["image", "label"]]
    # class_weights = torch.tensor(1.0 / info.groupby(["label"]).count().values.astype(np.float32))
    # del info

    models_ensemble = [
        # {"name": "vgg", "model": models.vgg16_bn(pretrained=True)},
        {"name": "resnet", "model": models.resnet50(pretrained=True)},
        # {"name": "densenet", "model": models.densenet121(pretrained=True)},
        {"name": "resnet", "model": models.resnet101(pretrained=True)},
    ]
    # model = Ensemble(models_ensemble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")
    ft, cl = model.get_parameters()

    # model = nn.DataParallel(model)
    model = DataParallelModel(model)
    model = model.to(device)

    # `weight_train` provides the per-class weights; it is assumed to be
    # defined at module level in the original script.
    weight = torch.from_numpy(weight_train[0]).to(device)
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)

    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]

    # Decay the LR by a multiplicative factor every epoch.
    exp_lr_schedulers = [
        lr_scheduler.StepLR(optimizers[0], step_size=1, gamma=0.995),
        lr_scheduler.StepLR(optimizers[1], step_size=1, gamma=0.992),
    ]
    model = train_model(model, criterion, optimizers, exp_lr_schedulers, device,
                        loaders, num_epochs=100)
def build_model(options):
    model = Seq2Seq.load(ImageCaptioning, options.model_path,
                         tok_dir=options.tokenizer_path, use_obj=options.obj)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    num_gpu = torch.cuda.device_count()
    generator = BeamDecoder(model, beam_width=options.beam_width,
                            max_len_a=options.max_len_a,
                            max_len_b=options.max_len_b,
                            len_penalty_ratio=options.len_penalty_ratio)
    if options.fp16:
        generator = amp.initialize(generator, opt_level="O2")
    if num_gpu > 1:
        generator = DataParallelModel(generator)
    return generator, model.text_processor
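# Hedged usage sketch (not from the original): with more than one GPU the
# wrapped BeamDecoder returns a list with one batch of hypotheses per
# replica, which the caller flattens; on a single GPU it returns the batch
# directly. `batch` is a placeholder input.
def generate(generator, batch):
    with torch.no_grad():
        outputs = generator(batch)
        if isinstance(outputs, list):  # DataParallelModel case
            outputs = [hyp for per_gpu in outputs for hyp in per_gpu]
    return outputs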
def main():
    parser = setup_parser()
    args = parser.parse_args()
    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }
    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }
    bert_types = {
        'discharge': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    # ---------------- Setup data, device, and model ----------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps // torch.distributed.get_world_size())

    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # ---------------- Optimizer ----------------
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        if args.discriminative_finetuning:
            # Discriminative fine-tuning: lower layers get smaller learning
            # rates, scaled down by a factor of 2.6 per layer group.
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5',
                         'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group1)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group2)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group3)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group4)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group5)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group6)],
                 'weight_decay': 0.01, 'lr': args.learning_rate},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group1)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group2)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group3)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group4)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group5)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group6)],
                 'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex "
                    "to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # ---------------- Train ----------------
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        all_pids = np.array([f.pid for f in eval_features])
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size, drop_last=True)

    model.train()
    epoch_metric = {}
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            # Compute loss for both output modes; the parallel model returns
            # a list of per-GPU logits, so the criterion is wrapped as well.
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            if output_mode == "classification":
                loss_fct = DataParallelCriterion(CrossEntropyLoss())
                logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                loss = loss_fct(logits, label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = DataParallelCriterion(MSELoss())
                logits = [logits[i].view(-1) for i in range(len(logits))]
                loss = loss_fct(logits, label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # average on multi-GPU
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # Modify the learning rate with the special warm-up BERT uses;
                    # when args.fp16 is False, BertAdam handles this automatically.
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        with torch.no_grad():
            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                if output_mode == 'classification':
                    loss_fct = DataParallelCriterion(CrossEntropyLoss())
                    logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                    tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == 'regression':
                    loss_fct = DataParallelCriterion(MSELoss())
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                # Gather the per-GPU logit shards onto one device before
                # accumulating predictions.
                logits = parallel.gather(logits, target_device='cuda:0')
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            if output_mode == 'classification':
                preds = np.argmax(preds, axis=1)
            elif output_mode == 'regression':
                preds = np.squeeze(preds)
            all_label_ids = all_label_ids[:preds.shape[0]]
            all_pids = all_pids[:preds.shape[0]]
            errors = generate_errors(preds, all_label_ids.numpy(), all_pids)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / global_step if args.do_train else None
            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            logger.info('***** Eval Results *****')
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
            epoch_metric[epoch] = (result['pearson']
                                   if output_mode == 'regression'
                                   else result['acc'])
    output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
    with open(output_eval_file, 'w') as writer:
        logger.info('***** Eval Results *****')
        for key in sorted(epoch_metric.keys()):
            writer.write("{}\t{}\t{}\t{}\n".format(key, str(epoch_metric[key]),
                                                   args.learning_rate,
                                                   args.train_batch_size))
    errors.to_csv('errors.txt', sep='\t', index=False)

    # ---------------- Save & reload ----------------
    if args.do_train:
        # Save the trained model, config, and tokenizer.
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)
def main():
    epoches = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    resume = None

    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True,
                            num_workers=1, pin_memory=True, last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=1, pin_memory=True)
    num_words = dataset.words_count

    # Set up the logger.
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words, test_max_len=val_dataset.max_len)
    if resume is not None:
        net.collect_params().load(resume, allow_missing=True, ignore_extra=True)
        logger.info("Resumed from checkpoint {}.".format(resume))
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is not None:
            continue
        if "bias" in key or "mean" in key or "beta" in key:
            params[key].initialize(init=mx.init.Zero())
            logging.info("initialized {} using Zero.".format(key))
        elif "weight" in key:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))
        elif "var" in key or "gamma" in key:
            params[key].initialize(init=mx.init.One())
            logging.info("initialized {} using One.".format(key))
        else:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))
    net.collect_params().reset_ctx(ctx=ctx_list)

    trainer = mx.gluon.Trainer(
        net.collect_params(), 'adam',
        {'learning_rate': 4e-4, 'clip_gradient': 5, 'multi_precision': True},
    )
    if trainer_resume is not None:
        trainer.load_states(trainer_resume)
        logger.info("Loaded trainer states from checkpoint {}.".format(trainer_resume))

    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    # net.hybridize(static_alloc=True, static_shape=True)
    net_parallel = DataParallelModel(net, ctx_list=ctx_list, sync=True)
    for nepoch in range(start_epoch, epoches):
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.learning_rate))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            # Split each array in the batch across the contexts, then regroup
            # so that each context receives its own (image, label, label_len) slice.
            batch = [mx.gluon.utils.split_and_load(x, ctx_list) for x in batch]
            inputs = [[x[n] for x in batch] for n, _ in enumerate(ctx_list)]
            losses = []
            with ag.record():
                net_parallel.sync = nbatch > 1
                outputs = net_parallel(*inputs)
                for s_batch, s_outputs in zip(inputs, outputs):
                    image, label, label_len = s_batch
                    predictions, alphas = s_outputs
                    ctc_loss = criterion(predictions, label, label_len)
                    loss2 = 1.0 * ((1. - alphas.sum(axis=1)) ** 2).mean()
                    losses.extend([ctc_loss, loss2])
            ag.backward(losses)
            trainer.step(batch_size=batch_size, ignore_stale_grad=True)
            for n, l in enumerate(label_len):
                l = int(l.asscalar())
                la = label[n, 1:l]
                pred = predictions[n, :(l - 1)]
                accu_top3_metric.update(la, pred)
                accu_top1_metric.update(la, pred)
                epoch_bleu.update(la, predictions[n, :])
                batch_bleu.update(la, predictions[n, :])
            ctc_loss_metric.update(None, preds=nd.sum(ctc_loss) / image.shape[0])
            alpha_metric.update(None, preds=loss2)
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                        nepoch, nbatch,
                        log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()
        bleu, acc_top1 = validate(net, gpu_id=gpu_id, val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        net.collect_params().save(save_path)
        trainer.save_states(fname=save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
def main(args):
    init(args)
    # Args setup:
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    save_dir_local = "checkpoints_local"
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir_local, exist_ok=True)
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    # Text encoder
    if args.use_offline_gpt2:
        text_encoder = GPT2Tokenizer.from_pretrained('./gpt2model')
    elif args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')
    text_encoder.add_special_tokens({
        'bos_token': '_start_',
        'cls_token': '_classify_',
        'eos_token': '_end_',
        'additional_special_tokens': ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })
    vocab = len(text_encoder)

    print("Loading dataset...")
    if args.use_model == "base":
        train_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
            text_encoder, num_workers=3, shuffle=True, gen_len=gen_len,
            n_ctx=n_ctx, include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat, max_size=args.max_ex,
            include_kw=not args.exclude_kw, dim=args.n_embd,
            debug_mode=args.debug_mode)
        val_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
            text_encoder, num_workers=0, shuffle=False, gen_len=gen_len,
            n_ctx=n_ctx, include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat, max_size=args.num_val_examples,
            include_kw=not args.exclude_kw, dim=args.n_embd,
            debug_mode=args.debug_mode)
        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = GPT2BaseModel(args, vocab=vocab, n_ctx=n_ctx, gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat,
                                  use_offline_gpt2=args.use_offline_gpt2)
    elif args.use_model == "plotmachines":
        train_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
            text_encoder, num_workers=3, shuffle=True, gen_len=gen_len,
            n_ctx=n_ctx, include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat, max_size=args.max_ex,
            include_kw=not args.exclude_kw, memsize=args.memstatesize,
            dim=args.n_embd, use_kwmem=True, debug_mode=args.debug_mode)
        val_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
            text_encoder, num_workers=0, shuffle=False, gen_len=gen_len,
            n_ctx=n_ctx, include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat, max_size=args.num_val_examples,
            include_kw=not args.exclude_kw, memsize=args.memstatesize,
            dim=args.n_embd, use_kwmem=True, debug_mode=args.debug_mode)
        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = PlotMachinesModel(args, vocab=vocab, n_ctx=n_ctx, gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat,
                                      use_offline_gpt2=args.use_offline_gpt2)

    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs)
    if args.debug_mode:
        print_model_params(log_dir, doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = AdamW(filter(lambda p: p.requires_grad, doc_model.parameters()),
                      lr=args.lr, betas=(args.b1, args.b2), eps=args.e)
    lm_loss = ParagraphLoss(criterion, n_ctx=n_ctx, gen_len=gen_len)

    print("Loading Model")
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)
        lm_loss = DataParallelCriterion(lm_loss)
    print("Parallelized")

    bestloss = -1
    start_iter, running_loss = 1, 0
    prevloss = 1000
    start_iter, running_loss = load_checkpoint(args.checkpoint, doc_model, model_opt)

    for i in range(args.num_epochs):
        start_iter, running_loss, bestloss, updates, val_loss1 = run_epoch(
            bestloss, start_iter, running_loss, doc_model, lm_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, p, decoding_strategy, accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs),
            save_dir, logger, text_encoder, show_progress=args.show_progress,
            my_local_dir=save_dir_local)
        print("VAL LOSS: ", str(val_loss1))
        if val_loss1 > prevloss or math.isnan(val_loss1):
            break
        prevloss = val_loss1

    print('Done training...')
    print('Evaluating on validation with best checkpoint...')
    bestcheck = os.path.join(save_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if (state_dict.get('module.pos_emb_mask') is None
            and doc_model.state_dict().get('module.pos_emb_mask') is not None):
        state_dict['module.pos_emb_mask'] = doc_model.state_dict().get('module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)
    evaluate_doc_model(doc_model, val_loader, text_encoder, device, beam, gen_len,
                       k, p, args.decoding_strategy,
                       os.path.join(save_dir, 'valeval.log'), 'gen', 'tgt',
                       gen_len, [], args)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning',
                        action='store_true',
                        help='Whether to use discriminative fine-tuning')
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus, tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps // torch.distributed.get_world_size())

    # Prepare model
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        if args.discriminative_finetuning:
            # Discriminative fine-tuning: lower layers get smaller learning
            # rates, scaled down by a factor of 2.6 per layer group.
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5',
                         'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group1)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group2)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group3)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group4)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group5)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group6)],
                 'weight_decay': 0.01, 'lr': args.learning_rate},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group1)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group2)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group3)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group4)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group5)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)
                            and any(nd in n for nd in group6)],
                 'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex "
                    "to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with the current data generator from disk
            # that relies on next(file) (it doesn't return items back by index).
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                logits = model(input_ids, segment_ids, input_mask)
                loss_fct = DataParallelCriterion(CrossEntropyLoss(ignore_index=-1))
                logits = [logits[i].view(-1, model.module.config.vocab_size)
                          for i in range(len(logits))]
                loss = loss_fct(logits, lm_label_ids.view(-1))
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-GPU
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Modify the learning rate with the special warm-up BERT uses;
                        # if args.fp16 is False, BertAdam handles this automatically.
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save the fine-tuned model.
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
def main(args):
    init(args)
    # Args setup:
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    data_dir = args.data_dir

    # Text encoder
    if args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')
    text_encoder.add_special_tokens({
        'bos_token': '_start_',
        'cls_token': '_classify_',
        'eos_token': '_end_',
        'additional_special_tokens': ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })
    vocab = len(text_encoder)

    datafile = (os.path.join(data_dir, "test_encoded.jsonl") if args.testset
                else os.path.join(data_dir, "val_encoded.jsonl"))
    print("Loading dataset...")
    val_loader = get_fullstory_loader(datafile, args.n_batch, text_encoder,
                                      num_workers=0, shuffle=False,
                                      gen_len=gen_len, n_ctx=n_ctx,
                                      include_kw=not args.exclude_kw,
                                      max_size=args.max_ex)
    print(len(val_loader))

    if args.use_model == "plotmachines":
        doc_model = PlotMachinesModel(args, vocab=vocab, n_ctx=n_ctx,
                                      gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat)
    else:
        doc_model = GPT2BaseModel(args, vocab=vocab, n_ctx=n_ctx,
                                  gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat)
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)

    if args.debug_mode:
        gptclf = GPT2Model.from_pretrained('gpt2')
        gptclf.eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        gptclf.to(device)
        # gpttok = gptTokenizer.from_pretrained('openai-gpt')
        gpttok = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        gptclf = GPT2Model.from_pretrained('gpt2-medium')
        gptclf.eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        gptclf.to(device)
        # gpttok = gptTokenizer.from_pretrained('openai-gpt')
        gpttok = GPT2Tokenizer.from_pretrained('gpt2-medium')

    prevloss = []
    upd = []
    start_iter, running_loss = 1, 0
    load_dir = args.load_dir
    bestcheck = os.path.join(load_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if n_gpu == 1:
        if (state_dict.get('module.pos_emb_mask') is None
                and doc_model.state_dict().get('pos_emb_mask') is not None):
            state_dict['module.pos_emb_mask'] = doc_model.state_dict().get('pos_emb_mask')
        # Strip the 'module.' prefix. The loop variable is named `key` so the
        # top-k sampling parameter `k` above is not clobbered (the original
        # reused `k` here, which shadowed it).
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
    else:
        if (state_dict.get('module.pos_emb_mask') is None
                and doc_model.state_dict().get('module.pos_emb_mask') is not None):
            state_dict['module.pos_emb_mask'] = doc_model.state_dict().get('module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)
    print("Parallelized")

    tagset = ['_i_'] + args.bodynum * ['_b_'] + ['_c_']
    vort = 'test' if args.testset else 'val'
    generatedocs(doc_model, gptclf, gpttok, val_loader, text_encoder, device,
                 beam, gen_len, k, p, args.decoding_strategy,
                 os.path.join(args.save_dir, vort + '.gens.tsv'), 'gen', 'tgt',
                 gen_len, [], args, tags=tagset, dim=args.n_embd,
                 save_dir=args.save_dir,
                 localfile=os.path.join('/tmp', vort + '.gens.tsv'))
    print('done decoding....')
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = t_total // 100

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # Multi-GPU training (should be after apex fp16 initialization).
    if args.n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    args.logging_steps = len(train_dataloader) // 1
    args.save_steps = args.logging_steps
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for _ in train_iterator:
        args.current_epoch = _
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM and RoBERTa don't use segment_ids.
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                # 'labels': batch[3]
            }
            outputs = model(**inputs)
            outputs = [outputs[i][0] for i in range(len(outputs))]
            loss_fct = DataParallelCriterion(CrossEntropyLoss())
            loss = loss_fct(outputs, batch[3])

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-GPU parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics. Only evaluate on a single GPU,
                    # otherwise metrics may not average well.
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss',
                                         (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
lr_rate = 0.03
milestones = [5, 7, 8, 10, 12, 14, 16, 17, 18]
img_size = 384
gamma = 0.5
# use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

segm_model = ResNetLinkModel(input_channels=1, pretrained=True, num_classes=3)
if torch.cuda.device_count() > 1:
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    # segm_model = nn.DataParallel(segm_model)
    # segm_model = encoding.parallel.DataParallelModel(segm_model, device_ids=[0,1,2,3,4,5,6,7])
    segm_model = DataParallelModel(segm_model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")
segm_model.to(device)

mul_transf = [transforms.Resize(size=(img_size, img_size)), transforms.ToTensor()]

# optimizer = optim.SGD(segm_model.parameters(), lr=lr_rate, momentum=momentum)
optimizer = optim.Adam(segm_model.parameters(), lr=0.0001)
# criterion = nn.BCEWithLogitsLoss().cuda() if use_cuda else nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss()
criterion = DataParallelCriterion(criterion)
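# Hedged sketch of one training epoch with the wrappers configured above;
# `train_loader` and the (image, mask) batch layout are assumptions, not
# part of the original snippet.
def train_one_epoch(train_loader):
    segm_model.train()
    for images, masks in train_loader:
        optimizer.zero_grad()
        outputs = segm_model(images.to(device))      # list of per-GPU logits
        loss = criterion(outputs, masks.to(device))  # parallel BCE, scalar loss
        loss.backward()
        optimizer.step()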
class BERTTrainer:
    """
    BERTTrainer pretrains a BERT model with two LM training methods:

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence Prediction : 3.3.2 Task #2: Next Sentence Prediction
    """

    def __init__(self, model, vocab_size, train_dataloader, test_dataloader=None,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01,
                 warmup_steps=10000, with_cuda: bool = True, cuda_devices=None,
                 log_freq: int = 10, include_next=False, include_vision=True,
                 total_epochs=1):
        """
        :param model: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """
        # Setup cuda device for BERT training; argument -c, --cuda should be true.
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")
        n_gpu = torch.cuda.device_count()
        print("device", self.device, "n_gpu", n_gpu)

        # Initialize the BERT Language Model with the given BERT model.
        self.model = model.to(self.device)
        self.bert = self.model.bert
        self.padding_idx = 0
        self.include_next = include_next
        self.include_vision = include_vision

        # Distributed GPU training if CUDA can detect more than one GPU.
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            # self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.model = DataParallelModel(self.model,
                                           device_ids=range(torch.cuda.device_count()))

        # Setting the train and test data loaders.
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adamax optimizer with hyper-parameters.
        self.optim = optim.Adamax(self.model.parameters(), lr=lr, betas=betas,
                                  weight_decay=weight_decay)
        if self.model.__class__.__name__ in ['DataParallel', 'DataParallelModel']:
            self.optim_schedule = ScheduledOptim(
                self.optim, self.model.module.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)
        else:
            self.optim_schedule = ScheduledOptim(
                self.optim, self.model.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss for predicting the masked token.
        self.criterion = nn.NLLLoss(ignore_index=0)
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            # The parallel criterion consumes the scattered model outputs directly.
            self.criterion = DataParallelCriterion(
                self.criterion, device_ids=range(torch.cuda.device_count()))

        self.log_freq = log_freq
        self.total_iters = total_epochs * len(train_dataloader)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        Loop over the data_loader for training or testing.
        In train status the backward operation is activated, and the model is
        auto-saved every epoch.

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar.
        data_iter = tqdm.tqdm(enumerate(data_loader),
desc="EP_%s:%d" % (str_code, epoch), total=len(data_loader), bar_format="{l_bar}{r_bar}", disable=True) avg_loss = 0.0 total_correct = 0 total_element = 0 for i, data in data_iter: # 0. prepare the text sequence tensor #data = {key: value.to(self.device) for key, value in data.items()} seq_tensor = data['masked_text_seq'] labels = data['masked_text_label'] seq_lengths = np.argmax(seq_tensor == self.padding_idx, axis=1) seq_lengths[seq_lengths == 0] = seq_tensor.shape[1] # Full length # Sort sequences by lengths seq_lengths, perm_idx = seq_lengths.sort(0, True) sorted_tensor = seq_tensor[perm_idx] mask = (sorted_tensor == padding_idx)[:, :seq_lengths[0]] f_t_all = data['feature_all'] isnext = data["isnext"] f_t_all = f_t_all[perm_idx] isnext = isnext[perm_idx] labels = labels[perm_idx] # 1. forward the next_sentence_prediction and masked_lm model if self.include_vision: #next_sent_output, mask_lm_output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),seq_lengths.cuda(),f_t_all.cuda()) output = self.model.forward(sorted_tensor.cuda(), mask.cuda(), seq_lengths.cuda(), f_t_all.cuda()) length_output = len(output) print("You got %d outputs" % (length_output)) next_sent_output, mask_lm_output = zip(*output) print("vision test shape is %d " % (next_sent_output[1].shape)) print("lm test shape is %d " % (mask_lm_output[1].shape)) else: #next_sent_output, mask_lm_output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),seq_lengths.cuda(),None) output = self.model.forward(sorted_tensor.cuda(), mask.cuda(), seq_lengths.cuda(), None) length_output = len(output) print("You got %d outputs" % (length_output)) next_sent_output, mask_lm_output = zip(*output) # 2-1. NLL(negative log likelihood) loss of is_next classification result next_loss = 0 if self.include_vision and self.include_next: next_loss = self.criterion(next_sent_output, isnext.cuda()) # 2-2. NLLLoss of predicting masked token word mask_loss = self.criterion(mask_lm_output.transpose(1, 2), labels[:, :seq_lengths[0]].cuda()) # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure #loss = next_loss + mask_loss # 3. 
backward and optimization only in train loss = loss.mean() if train: self.optim_schedule.zero_grad() loss.backward() self.optim_schedule.step_and_update_lr() # next vision prediction accuracy if self.include_next: correct = next_sent_output.argmax(dim=-1).eq( isnext.cuda()).sum().item() total_correct += correct total_element += data["isnext"].nelement() avg_loss += loss.item() if self.include_next: post_fix = { "epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "avg_acc": total_correct / total_element * 100, "loss": loss.item() } else: post_fix = { "epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "loss": loss.item() } #if i % self.log_freq == 0: # data_iter.write(str(post_fix)) if i % 100 == 0: #print("PROGRESS: {}%".format(round((myidx) * 100 / n_iters, 4))) print("\n") print("PROGRESS: {}%".format( round((epoch * len(data_loader) + i) * 100 / self.total_iters, 4))) print("EVALERR: {}%".format(avg_loss / (i + 1))) #print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter)) def save(self, epoch, file_path="pretrained_models/addbert_trained.model"): """ Saving the current BERT model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ output_path = file_path + ".ep%d" % epoch torch.save(self.bert.cpu(), output_path) self.bert.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) return output_path
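# A minimal sketch of the contract the trainers in this file rely on, assuming
# the parallel.py utilities imported elsewhere in this collection:
# DataParallelModel returns a list with one output per GPU, and
# DataParallelCriterion scatters the targets, computes the loss per replica,
# and reduces it. All names below (toy_net, toy_criterion, xs, ys) are
# hypothetical, and CUDA is assumed available, as in the snippets above.
import torch
import torch.nn as nn
from parallel import DataParallelModel, DataParallelCriterion

toy_net = nn.Linear(16, 4).cuda()
toy_criterion = nn.CrossEntropyLoss().cuda()
if torch.cuda.device_count() > 1:
    toy_net = DataParallelModel(toy_net)
    toy_criterion = DataParallelCriterion(toy_criterion)

xs = torch.randn(8, 16).cuda()
ys = torch.randint(0, 4, (8,)).cuda()
outputs = toy_net(xs)              # a list of per-GPU tensors when wrapped
loss = toy_criterion(outputs, ys)  # targets scattered, loss gathered
loss.mean().backward()             # .mean() is a no-op on an already-reduced scalar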
for (key, value) in vars(args).items(): print("{0:16} | {1}".format(key, value)) # check if processed data file exists or not data_mean = [0.485, 0.456, 0.406] data_std = [0.229, 0.224, 0.225] # load the model model = BiSalNet() model.eval() if args.onGPU and torch.cuda.device_count() > 1: # model = torch.nn.DataParallel(model) model = DataParallelModel(model) if args.onGPU: model = model.cuda() # compose the data with transforms val_transforms = transforms.Compose([ transforms.Resize((args.inHeight, args.inWidth)), transforms.ToTensor(), transforms.Normalize(data_mean, data_std) ]) if os.path.isfile(join(args.savedir, "checkpoint.pth")): print("=> loading checkpoint '{}'".format( join(args.savedir, "checkpoint.pth"))) checkpoint = torch.load(join(args.savedir, "checkpoint.pth"))["state_dict"] if list(checkpoint.keys())[0][:7] == "module." and not isinstance(
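# The snippet above is cut off mid-condition. The usual purpose of that
# "module." check, sketched here under that assumption (not the original
# code), is to reconcile a checkpoint saved from a DataParallel-wrapped
# model with a bare model by stripping the key prefix:
from collections import OrderedDict

def strip_module_prefix(state_dict):
    # nn.DataParallel / DataParallelModel prefix every parameter key with "module."
    return OrderedDict((k[7:] if k.startswith("module.") else k, v)
                       for k, v in state_dict.items())

# model.load_state_dict(strip_module_prefix(checkpoint))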
class ImageMTTrainer: def __init__(self, model, mask_prob: float = 0.3, clip: int = 1, optimizer=None, beam_width: int = 5, max_len_a: float = 1.1, max_len_b: int = 5, len_penalty_ratio: float = 0.8, nll_loss: bool = False, fp16: bool = False, mm_mode="mixed", rank: int = -1): self.model = model self.clip = clip self.optimizer = optimizer self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.num_gpu = torch.cuda.device_count() self.mask_prob = mask_prob if nll_loss: self.criterion = nn.NLLLoss( ignore_index=model.text_processor.pad_token_id()) else: self.criterion = SmoothedNLLLoss( ignore_index=model.text_processor.pad_token_id()) self.num_gpu = torch.cuda.device_count() self.fp16 = False self.rank = rank if rank >= 0: self.device = torch.device('cuda', rank) torch.cuda.set_device(self.device) self.model = self.model.to(self.device) if fp16: self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O2") self.fp16 = True self.generator = BeamDecoder(self.model, beam_width=beam_width, max_len_a=max_len_a, max_len_b=max_len_b, len_penalty_ratio=len_penalty_ratio) if rank >= 0: self.model = DistributedDataParallel(self.model, device_ids=[self.rank], output_device=self.rank, find_unused_parameters=True) self.generator = DistributedDataParallel( self.generator, device_ids=[self.rank], output_device=self.rank, find_unused_parameters=True) elif self.num_gpu > 1: print("Let's use", self.num_gpu, "GPUs!") self.model = DataParallelModel(self.model) self.criterion = DataParallelCriterion(self.criterion) self.generator = DataParallelModel(self.generator) self.reference = None self.best_bleu = -1.0 self.mm_mode = mm_mode def train_epoch(self, img_data_iter: List[data_utils.DataLoader], step: int, saving_path: str = None, mass_data_iter: List[data_utils.DataLoader] = None, mt_dev_iter: List[data_utils.DataLoader] = None, mt_train_iter: List[data_utils.DataLoader] = None, max_step: int = 300000, accum=1, beam_width=1, fine_tune: bool = False, lang_directions: dict = False, lex_dict=None, save_opt: bool = False, **kwargs): "Standard Training and Logging Function" start = time.time() total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0 cur_loss = 0 batch_zip, shortest = self.get_batch_zip(img_data_iter, mass_data_iter, mt_train_iter) model = (self.model.module if hasattr(self.model, "module") else self.model) self.optimizer.zero_grad() for i, batches in enumerate(batch_zip): for batch in batches: is_img_batch = isinstance(batch, list) and "captions" in batch[0] is_mass_batch = not is_img_batch and "dst_texts" not in batch is_contrastive = False try: if fine_tune and (is_img_batch or is_mass_batch): id2lid = lambda r: model.text_processor.languages[ model.text_processor.id2token(lang_directions[int( r)])] if is_mass_batch: src_inputs = batch["src_texts"].squeeze(0) src_pad_mask = src_inputs != model.text_processor.pad_token_id( ) pad_indices = batch["pad_idx"].squeeze(0) proposal = batch["proposal"].squeeze( 0) if lex_dict is not None else None target_langs = torch.LongTensor([ lang_directions[int(l)] for l in src_inputs[:, 0] ]) dst_langs = torch.LongTensor( [id2lid(l) for l in src_inputs[:, 0]]) else: src_inputs = [b["captions"] for b in batch] src_pad_mask = [b["caption_mask"] for b in batch] pad_indices = [b["pad_idx"] for b in batch] proposal = [ b["proposal"] if lex_dict is not None else None for b in batch ] target_langs = [ torch.LongTensor([ lang_directions[int(l)] for l in src[:, 0] ]) for src in src_inputs ] dst_langs = [ torch.LongTensor( 
[id2lid(l) for l in src[:, 0]]) for src in src_inputs ] if len(src_inputs) < self.num_gpu: continue if is_mass_batch: langs = batch["langs"].squeeze(0) else: langs = [b["langs"] for b in batch] model.eval() with torch.no_grad(): # We do not backpropagate the data generator following the MASS paper. images = None if is_img_batch: images = [b["images"] for b in batch] outputs = self.generator( src_inputs=src_inputs, src_sizes=pad_indices, first_tokens=target_langs, src_langs=langs, tgt_langs=dst_langs, pad_idx=model.text_processor.pad_token_id(), src_mask=src_pad_mask, unpad_output=False, beam_width=beam_width, images=images, proposals=proposal) if self.num_gpu > 1 and self.rank < 0: if is_mass_batch: new_outputs = [] for output in outputs: new_outputs += output outputs = new_outputs if is_mass_batch or self.num_gpu <= 1: translations = pad_sequence( outputs, batch_first=True, padding_value=model.text_processor. pad_token_id()) translation_proposals = None if lex_dict is not None: translation_proposals = list( map( lambda o: dataset. get_lex_suggestions( lex_dict, o, model.text_processor. pad_token_id()), outputs)) translation_proposals = pad_sequence( translation_proposals, batch_first=True, padding_value=model.text_processor. pad_token_id()) translation_pad_mask = ( translations != model.text_processor.pad_token_id()) else: translation_proposals = None if lex_dict is not None: translation_proposals = [ pad_sequence( list( map( lambda o: dataset. get_lex_suggestions( lex_dict, o, model.text_processor. pad_token_id()), output)), batch_first=True, padding_value=model.text_processor. pad_token_id()) for output in outputs ] translations = [ pad_sequence(output, batch_first=True, padding_value=model. text_processor.pad_token_id()) for output in outputs ] translation_pad_mask = [ t != model.text_processor.pad_token_id() for t in translations ] model.train() if is_mass_batch: langs = batch["langs"].squeeze(0) else: langs = torch.cat([b["langs"] for b in batch]) # Now use it for back-translation loss. 
predictions = model( src_inputs=translations, tgt_inputs=src_inputs, src_pads=translation_pad_mask, pad_idx=model.text_processor.pad_token_id(), src_langs=dst_langs, tgt_langs=langs, proposals=translation_proposals, log_softmax=True) if is_mass_batch: src_targets = src_inputs[:, 1:].contiguous().view(-1) src_mask_flat = src_pad_mask[:, 1:].contiguous().view( -1) else: src_targets = torch.cat( list(map(lambda s: s[:, 1:], src_inputs))) src_mask_flat = torch.cat( list(map(lambda s: s[:, 1:], src_pad_mask))) targets = src_targets[src_mask_flat] ntokens = targets.size(0) elif is_img_batch: src_inputs = [b["captions"] for b in batch] src_pad_mask = [b["caption_mask"] for b in batch] proposals = [b["proposal"] for b in batch ] if lex_dict is not None else None langs = [b["langs"] for b in batch] if (self.mm_mode == "mixed" and random.random() <= .5 ) or self.mm_mode == "masked": pad_indices = [b["pad_idx"] for b in batch] if len(batch) < self.num_gpu: continue # For image masking, we are allowed to mask more than mask_prob mask_prob = random.uniform(self.mask_prob, 1.0) masked_info = list( map( lambda pi, si: mass_mask( mask_prob, pi, si, model.text_processor ), pad_indices, src_inputs)) predictions = self.model( src_inputs=list( map(lambda m: m["src_text"], masked_info)), tgt_inputs=list( map(lambda m: m["to_recover"], masked_info)), tgt_positions=list( map(lambda m: m["positions"], masked_info)), src_pads=src_pad_mask, pad_idx=model.text_processor.pad_token_id(), src_langs=langs, batch=batch, proposals=proposals, log_softmax=True) targets = torch.cat( list(map(lambda m: m["targets"], masked_info))) ntokens = targets.size(0) else: neg_samples = [b["neg"] for b in batch] neg_mask = [b["neg_mask"] for b in batch] loss = self.model( src_inputs=src_inputs, src_pads=src_pad_mask, neg_samples=neg_samples, neg_mask=neg_mask, pad_idx=model.text_processor.pad_token_id(), src_langs=langs, batch=batch, proposals=proposals, log_softmax=True) is_contrastive = True elif not is_mass_batch: # MT data src_inputs = batch["src_texts"].squeeze(0) src_mask = batch["src_pad_mask"].squeeze(0) tgt_inputs = batch["dst_texts"].squeeze(0) tgt_mask = batch["dst_pad_mask"].squeeze(0) src_langs = batch["src_langs"].squeeze(0) dst_langs = batch["dst_langs"].squeeze(0) proposals = batch["proposal"].squeeze( 0) if lex_dict is not None else None if src_inputs.size(0) < self.num_gpu: continue predictions = self.model( src_inputs=src_inputs, tgt_inputs=tgt_inputs, src_pads=src_mask, tgt_mask=tgt_mask, src_langs=src_langs, tgt_langs=dst_langs, proposals=proposals, pad_idx=model.text_processor.pad_token_id(), log_softmax=True) targets = tgt_inputs[:, 1:].contiguous().view(-1) tgt_mask_flat = tgt_mask[:, 1:].contiguous().view(-1) targets = targets[tgt_mask_flat] ntokens = targets.size(0) else: # MASS data src_inputs = batch["src_texts"].squeeze(0) pad_indices = batch["pad_idx"].squeeze(0) proposals = batch["proposal"].squeeze( 0) if lex_dict is not None else None if src_inputs.size(0) < self.num_gpu: continue masked_info = mass_mask(self.mask_prob, pad_indices, src_inputs, model.text_processor) predictions = self.model( src_inputs=masked_info["src_text"], tgt_inputs=masked_info["to_recover"], tgt_positions=masked_info["positions"], pad_idx=model.text_processor.pad_token_id(), src_langs=batch["langs"].squeeze(0), proposals=proposals, log_softmax=True) targets = masked_info["targets"] ntokens = targets.size(0) if is_contrastive: # Nothing to predict! 
backward(loss, self.optimizer, self.fp16) loss = loss.data elif ntokens > 0: if self.num_gpu == 1: targets = targets.to(predictions.device) if self.rank >= 0: targets = targets.to(self.device) loss = self.criterion(predictions, targets).mean() backward(loss, self.optimizer, self.fp16) loss = float(loss.data) * ntokens tokens += ntokens total_tokens += ntokens total_loss += loss cur_loss += loss torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) step += 1 if step % accum == 0: self.optimizer.step() self.optimizer.zero_grad() if is_mass_batch and not fine_tune: mass_unmask(masked_info["src_text"], masked_info["src_mask"], masked_info["mask_idx"]) if not is_contrastive and is_img_batch and not fine_tune: map( lambda m: mass_unmask(m["src_text"], m["src_mask"], m["mask_idx"]), masked_info) if step % 50 == 0 and tokens > 0: elapsed = time.time() - start print( self.rank, "->", datetime.datetime.now(), "Epoch Step: %d Loss: %f Tokens per Sec: %f " % (step, cur_loss / tokens, tokens / elapsed)) if mt_dev_iter is not None and step % 5000 == 0 and self.rank <= 0: bleu = self.eval_bleu(mt_dev_iter, saving_path) print("BLEU:", bleu) if step % 10000 == 0: if self.rank <= 0: if self.rank < 0: model.cpu().save(saving_path + ".latest") elif self.rank == 0: model.save(saving_path + ".latest") if save_opt: with open( os.path.join( saving_path + ".latest", "optim"), "wb") as fp: pickle.dump(self.optimizer, fp) if self.rank < 0: model = model.to(self.device) start, tokens, cur_loss = time.time(), 0, 0 except RuntimeError as err: print(repr(err)) print("Error processing", is_img_batch) if (isinstance(model, ImageMassSeq2Seq)) and is_img_batch: for b in batch: print("->", len(b["images"]), b["captions"].size()) torch.cuda.empty_cache() if i == shortest - 1: break if step >= max_step: break try: if self.rank <= 0: print("Total loss in this epoch: %f" % (total_loss / total_tokens)) if self.rank < 0: model.cpu().save(saving_path + ".latest") model = model.to(self.device) elif self.rank == 0: model.save(saving_path + ".latest") if mt_dev_iter is not None: bleu = self.eval_bleu(mt_dev_iter, saving_path) print("BLEU:", bleu) except RuntimeError as err: print(repr(err)) return step def get_batch_zip(self, img_data_iter, mass_data_iter, mt_train_iter): # if img_data_iter is not None and mt_train_iter is not None: # img_data_iter *= 5 # if mass_data_iter is not None and mt_train_iter is not None: # mass_data_iter *= 5 iters = list( chain(*filter(lambda x: x != None, [img_data_iter, mass_data_iter, mt_train_iter]))) shortest = min(len(l) for l in iters) return zip(*iters), shortest def eval_bleu(self, dev_data_iter, saving_path, save_opt: bool = False): mt_output = [] src_text = [] model = (self.model.module if hasattr(self.model, "module") else self.model) model.eval() with torch.no_grad(): for iter in dev_data_iter: for batch in iter: src_inputs = batch["src_texts"].squeeze(0) src_mask = batch["src_pad_mask"].squeeze(0) tgt_inputs = batch["dst_texts"].squeeze(0) src_langs = batch["src_langs"].squeeze(0) dst_langs = batch["dst_langs"].squeeze(0) src_pad_idx = batch["pad_idx"].squeeze(0) proposal = batch["proposal"].squeeze( 0) if batch["proposal"] is not None else None src_ids = get_outputs_until_eos( model.text_processor.sep_token_id(), src_inputs, remove_first_token=True) src_text += list( map( lambda src: model.text_processor.tokenizer.decode( src.numpy()), src_ids)) outputs = self.generator( src_inputs=src_inputs, src_sizes=src_pad_idx, first_tokens=tgt_inputs[:, 0], src_mask=src_mask, 
src_langs=src_langs, tgt_langs=dst_langs, pad_idx=model.text_processor.pad_token_id(), proposals=proposal) if self.num_gpu > 1 and self.rank < 0: new_outputs = [] for output in outputs: new_outputs += output outputs = new_outputs mt_output += list( map( lambda x: model.text_processor.tokenizer.decode(x[ 1:].numpy()), outputs)) model.train() bleu = sacrebleu.corpus_bleu(mt_output, [self.reference[:len(mt_output)]], lowercase=True, tokenize="intl") with open(os.path.join(saving_path, "bleu.output"), "w") as writer: writer.write("\n".join([ src + "\n" + ref + "\n" + o + "\n\n***************\n" for src, ref, o in zip(src_text, mt_output, self.reference[:len(mt_output)]) ])) if bleu.score > self.best_bleu: self.best_bleu = bleu.score print("Saving best BLEU", self.best_bleu) with open(os.path.join(saving_path, "bleu.best.output"), "w") as writer: writer.write("\n".join([ src + "\n" + ref + "\n" + o + "\n\n***************\n" for src, ref, o in zip(src_text, mt_output, self.reference[:len(mt_output)]) ])) if self.rank < 0: model.cpu().save(saving_path) model = model.to(self.device) elif self.rank == 0: model.save(saving_path) if save_opt: with open(os.path.join(saving_path, "optim"), "wb") as fp: pickle.dump(self.optimizer, fp) return bleu.score @staticmethod def train(options): lex_dict = None if options.dict_path is not None: lex_dict = get_lex_dict(options.dict_path) if options.local_rank <= 0 and not os.path.exists(options.model_path): os.makedirs(options.model_path) text_processor = TextProcessor(options.tokenizer_path) assert text_processor.pad_token_id() == 0 num_processors = max(torch.cuda.device_count(), 1) if options.local_rank < 0 else 1 if options.pretrained_path is not None: mt_model = Seq2Seq.load(ImageMassSeq2Seq, options.pretrained_path, tok_dir=options.tokenizer_path) else: mt_model = ImageMassSeq2Seq( use_proposals=lex_dict is not None, tie_embed=options.tie_embed, text_processor=text_processor, resnet_depth=options.resnet_depth, lang_dec=options.lang_decoder, enc_layer=options.encoder_layer, dec_layer=options.decoder_layer, embed_dim=options.embed_dim, intermediate_dim=options.intermediate_layer_dim) if options.lm_path is not None: lm = LM(text_processor=text_processor, enc_layer=options.encoder_layer, embed_dim=options.embed_dim, intermediate_dim=options.intermediate_layer_dim) mt_model.init_from_lm(lm) print("Model initialization done!") # We assume that the collator function returns a list with the size of number of gpus (in case of cpus, collator = dataset.ImageTextCollator() num_batches = max(1, torch.cuda.device_count()) if options.continue_train: with open(os.path.join(options.pretrained_path, "optim"), "rb") as fp: optimizer = pickle.load(fp) else: optimizer = build_optimizer(mt_model, options.learning_rate, warump_steps=options.warmup) trainer = ImageMTTrainer(model=mt_model, mask_prob=options.mask_prob, optimizer=optimizer, clip=options.clip, beam_width=options.beam_width, max_len_a=options.max_len_a, max_len_b=options.max_len_b, len_penalty_ratio=options.len_penalty_ratio, fp16=options.fp16, mm_mode=options.mm_mode, rank=options.local_rank) pin_memory = torch.cuda.is_available() img_train_loader = ImageMTTrainer.get_img_loader( collator, dataset.ImageCaptionDataset, options.train_path, mt_model, num_batches, options, pin_memory, lex_dict=lex_dict) mass_train_data, mass_train_loader, finetune_loader, mt_dev_loader = None, None, None, None if options.mass_train_path is not None: mass_train_paths = options.mass_train_path.strip().split(",") if options.step > 0: 
mass_train_data, mass_train_loader = ImageMTTrainer.get_mass_loader( mass_train_paths, mt_model, num_processors, options, pin_memory, keep_examples=options.finetune_step > 0, lex_dict=lex_dict) if options.finetune_step > 0: finetune_loader, finetune_data = ImageMTTrainer.get_mass_finetune_data( mass_train_data, mass_train_paths, mt_model, num_processors, options, pin_memory, lex_dict=lex_dict) mt_train_loader = None if options.mt_train_path is not None: mt_train_loader = ImageMTTrainer.get_mt_train_data( mt_model, num_processors, options, pin_memory, lex_dict=lex_dict) mt_dev_loader = None if options.mt_dev_path is not None: mt_dev_loader = ImageMTTrainer.get_mt_dev_data(mt_model, options, pin_memory, text_processor, trainer, lex_dict=lex_dict) step, train_epoch = 0, 1 while options.step > 0 and step < options.step: print("train epoch", train_epoch) step = trainer.train_epoch(img_data_iter=img_train_loader, mass_data_iter=mass_train_loader, mt_train_iter=mt_train_loader, max_step=options.step, lex_dict=lex_dict, mt_dev_iter=mt_dev_loader, saving_path=options.model_path, step=step, save_opt=options.save_opt, accum=options.accum) train_epoch += 1 finetune_epoch = 0 # Resetting the optimizer for the purpose of finetuning. trainer.optimizer.reset() lang_directions = ImageMTTrainer.get_lang_dirs(options.bt_langs, text_processor) print(options.local_rank, "lang dirs", lang_directions) print(options.local_rank, "Reloading image train data with new batch size...") if options.finetune_step > 0 and img_train_loader is not None: img_train_loader = ImageMTTrainer.get_img_loader( collator, dataset.ImageCaptionDataset, options.train_path, mt_model, num_batches, options, pin_memory, denom=2, lex_dict=lex_dict) if options.ignore_mt_mass: mt_train_loader = None print(options.local_rank, "Reloading image train data with new batch size done!") while options.finetune_step > 0 and step <= options.finetune_step + options.step: print(options.local_rank, "finetune epoch", finetune_epoch) step = trainer.train_epoch(img_data_iter=img_train_loader, mass_data_iter=finetune_loader, mt_train_iter=mt_train_loader, max_step=options.finetune_step + options.step, mt_dev_iter=mt_dev_loader, saving_path=options.model_path, step=step, fine_tune=True, lang_directions=lang_directions, lex_dict=lex_dict, save_opt=options.save_opt, accum=options.accum, beam_width=options.bt_beam_width) finetune_epoch += 1 @staticmethod def get_lang_dirs(bt_langs, text_processor: TextProcessor): langs = ["<" + l + ">" for l in bt_langs.strip().split(",")] langs = set([text_processor.token_id(l) for l in langs]) if len(langs) < 2: return None assert len(langs) <= 2 lang_directions = {} for lang1 in langs: for lang2 in langs: if lang1 != lang2: # Assuming that we only have two languages! 
lang_directions[lang1] = lang2 return lang_directions @staticmethod def get_mt_dev_data(mt_model, options, pin_memory, text_processor, trainer, lex_dict=None): mt_dev_loader = [] dev_paths = options.mt_dev_path.split(",") trainer.reference = [] for dev_path in dev_paths: mt_dev_data = dataset.MTDataset( batch_pickle_dir=dev_path, max_batch_capacity=options.total_capacity, keep_pad_idx=True, max_batch=int(options.batch / (options.beam_width * 2)), pad_idx=mt_model.text_processor.pad_token_id(), lex_dict=lex_dict) dl = data_utils.DataLoader(mt_dev_data, batch_size=1, shuffle=False, pin_memory=pin_memory) mt_dev_loader.append(dl) print(options.local_rank, "creating reference") generator = (trainer.generator.module if hasattr( trainer.generator, "module") else trainer.generator) for batch in dl: tgt_inputs = batch["dst_texts"].squeeze() refs = get_outputs_until_eos(text_processor.sep_token_id(), tgt_inputs, remove_first_token=True) ref = [ generator.seq2seq_model.text_processor.tokenizer.decode( ref.numpy()) for ref in refs ] trainer.reference += ref return mt_dev_loader @staticmethod def get_mt_train_data(mt_model, num_processors, options, pin_memory, lex_dict=None): mt_train_loader = [] train_paths = options.mt_train_path.split(",") for train_path in train_paths: mt_train_data = dataset.MTDataset( batch_pickle_dir=train_path, max_batch_capacity=int(num_processors * options.total_capacity / 2), max_batch=int(num_processors * options.batch / 2), pad_idx=mt_model.text_processor.pad_token_id(), lex_dict=lex_dict, keep_pad_idx=False) mtl = data_utils.DataLoader( mt_train_data, sampler=None if options.local_rank < 0 else DistributedSampler( mt_train_data, rank=options.local_rank), batch_size=1, shuffle=(options.local_rank < 0), pin_memory=pin_memory) mt_train_loader.append(mtl) return mt_train_loader @staticmethod def get_mass_finetune_data(mass_train_data, mass_train_paths, mt_model, num_processors, options, pin_memory, lex_dict=None): finetune_data, finetune_loader = [], [] for i, mass_train_path in enumerate(mass_train_paths): fd = dataset.MassDataset( batch_pickle_dir=mass_train_path, max_batch_capacity=int(num_processors * options.total_capacity / max(2, options.bt_beam_width)), max_batch=int(num_processors * options.batch / max(2, options.bt_beam_width)), pad_idx=mt_model.text_processor.pad_token_id(), max_seq_len=options.max_seq_len, keep_examples=False, example_list=None if mass_train_data is None else mass_train_data[i].examples_list, lex_dict=lex_dict) finetune_data.append(fd) fl = data_utils.DataLoader( fd, sampler=None if options.local_rank < 0 else DistributedSampler( fd, rank=options.local_rank), batch_size=1, shuffle=(options.local_rank < 0), pin_memory=pin_memory) finetune_loader.append(fl) if mass_train_data is not None: mass_train_data[i].examples_list = [] return finetune_loader, finetune_data @staticmethod def get_mass_loader(mass_train_paths, mt_model, num_processors, options, pin_memory, keep_examples, lex_dict=None): mass_train_data, mass_train_loader = [], [] for i, mass_train_path in enumerate(mass_train_paths): td = dataset.MassDataset( batch_pickle_dir=mass_train_path, max_batch_capacity=num_processors * options.total_capacity, max_batch=num_processors * options.batch, pad_idx=mt_model.text_processor.pad_token_id(), max_seq_len=options.max_seq_len, keep_examples=keep_examples, lex_dict=lex_dict) mass_train_data.append(td) dl = data_utils.DataLoader( td, sampler=None if options.local_rank < 0 else DistributedSampler( td, rank=options.local_rank), batch_size=1, 
shuffle=(options.local_rank < 0), pin_memory=pin_memory) mass_train_loader.append(dl) return mass_train_data, mass_train_loader @staticmethod def get_img_loader(collator, dataset_class, paths, mt_model, num_batches, options, pin_memory, denom=1, lex_dict=None, shuffle=True): if paths is not None: img_loader = [] for pth in paths.strip().split(","): data = dataset_class( root_img_dir=options.image_dir, data_bin_file=pth, max_capacity=int(options.img_capacity / denom), text_processor=mt_model.text_processor, max_img_per_batch=options.max_image / denom, lex_dict=lex_dict) print(options.local_rank, pth, "Length of training data", len(data)) tl = data_utils.DataLoader( data, sampler=None if options.local_rank < 0 else DistributedSampler(data, rank=options.local_rank), batch_size=num_batches, shuffle=shuffle, pin_memory=pin_memory, collate_fn=collator) img_loader.append(tl) return img_loader return None
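# ImageMTTrainer switches between single-process multi-GPU (rank < 0, wrapping
# with DataParallelModel / DataParallelCriterion) and one-process-per-GPU
# DistributedDataParallel (rank >= 0). A sketch of the process-group setup the
# rank >= 0 path presumes; the helper name is hypothetical and mirrors the
# options.local_rank argument used above.
import torch
import torch.distributed as dist

def init_distributed(local_rank: int):
    if local_rank >= 0:
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")

# Typically launched with one process per GPU, e.g.:
#   python -m torch.distributed.launch --nproc_per_node=4 train.py ...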
class Trainer: """ trainer class """ def __init__(self, cfg: Namespace, data: Dataset): """ Args: cfg: configuration data: train dataset """ self.cfg = cfg self.train, self.valid = data.split(0.8) RATING_FIELD.build_vocab(self.train) self.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # pylint: disable=no-member self.batch_size = cfg.batch_size if torch.cuda.is_available(): self.batch_size *= torch.cuda.device_count() self.trn_itr = BucketIterator( self.train, device=self.device, batch_size=self.batch_size, shuffle=True, train=True, sort_within_batch=True, sort_key=lambda exam: -len(exam.comment_text)) self.vld_itr = BucketIterator( self.valid, device=self.device, batch_size=self.batch_size, shuffle=False, train=False, sort_within_batch=True, sort_key=lambda exam: -len(exam.comment_text)) self.log_step = 1000 if len(self.vld_itr) < 100: self.log_step = 10 elif len(self.vld_itr) < 1000: self.log_step = 100 bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased' self.model = BertForSequenceClassification.from_pretrained( bert_path, num_labels=2) pos_weight = ( len([exam for exam in self.train.examples if exam.target < 0.5]) / len([exam for exam in self.train.examples if exam.target >= 0.5])) pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device) # pylint: disable=not-callable self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor) if torch.cuda.is_available(): self.model = DataParallelModel(self.model.cuda()) self.criterion = DataParallelCriterion(self.criterion) self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate) def run(self): """ do train """ max_f_score = -9e10 max_epoch = -1 for epoch in range(self.cfg.epoch): train_loss = self._train_epoch(epoch) metrics = self._evaluate(epoch) max_f_score_str = f' < {max_f_score:.2f}' if metrics['f_score'] > max_f_score: max_f_score_str = ' is max' max_f_score = metrics['f_score'] max_epoch = epoch torch.save(self.model.state_dict(), self.cfg.model_path) logging.info('EPOCH[%d]: train loss: %.6f, valid loss: %.6f, acc: %.2f,' \ ' F: %.2f%s', epoch, train_loss, metrics['loss'], metrics['accuracy'], metrics['f_score'], max_f_score_str) if (epoch - max_epoch) >= self.cfg.patience: logging.info('early stopping...') break logging.info('epoch: %d, f-score: %.2f', max_epoch, max_f_score) def _train_epoch(self, epoch: int) -> float: """ train single epoch Args: epoch: epoch number Returns: average loss """ self.model.train() progress = tqdm(self.trn_itr, f'EPOCH[{epoch}]', mininterval=1, ncols=100) losses = [] for step, batch in enumerate(progress, start=1): outputs = self.model(batch.comment_text) # output of model wrapped with DataParallelModel is a list of outputs from each GPU # make input of DataParallelCriterion as a list of tuples if isinstance(self.model, DataParallelModel): loss = self.criterion([(output, ) for output in outputs], batch.target) else: loss = self.criterion(outputs, batch.target) losses.append(loss.item()) if step % self.log_step == 0: avg_loss = sum(losses) / len(losses) progress.set_description(f'EPOCH[{epoch}] ({avg_loss:.6f})') loss.backward() self.optimizer.step() self.optimizer.zero_grad() return sum(losses) / len(losses) def _evaluate(self, epoch: int) -> Dict[str, float]: """ evaluate on validation data Args: epoch: epoch number Returns: metrics """ self.model.eval() progress = tqdm(self.vld_itr, f' EVAL[{epoch}]', mininterval=1, ncols=100) losses = [] preds = [] golds = [] for step, batch in enumerate(progress, start=1): with torch.no_grad(): 
outputs = self.model(batch.comment_text) if isinstance(self.model, DataParallelModel): loss = self.criterion([(output, ) for output in outputs], batch.target) for output in outputs: preds.extend([(0 if o[0] < o[1] else 1) for o in output]) else: loss = self.criterion(outputs, batch.target) preds.extend([(0 if output[0] < output[1] else 1) for output in outputs]) losses.append(loss.item()) golds.extend([gold.item() for gold in batch.target]) if step % self.log_step == 0: avg_loss = sum(losses) / len(losses) progress.set_description( f' EVAL[{epoch}] ({avg_loss:.6f})') metrics = self._get_metrics(preds, golds) metrics['loss'] = sum(losses) / len(losses) return metrics @classmethod def _get_metrics(cls, preds: List[float], golds: List[float]) -> Dict[str, float]: """ get metric values Args: preds: predictions golds: gold standards Returns: metric """ assert len(preds) == len(golds) true_pos = 0 false_pos = 0 false_neg = 0 true_neg = 0 for pred, gold in zip(preds, golds): if pred >= 0.5: if gold >= 0.5: true_pos += 1 else: false_pos += 1 else: if gold >= 0.5: false_neg += 1 else: true_neg += 1 accuracy = (true_pos + true_neg) / (true_pos + false_pos + false_neg + true_neg) precision = 0.0 if (true_pos + false_pos) > 0: precision = true_pos / (true_pos + false_pos) recall = 0.0 if (true_pos + false_neg) > 0: recall = true_pos / (true_pos + false_neg) f_score = 0.0 if (precision + recall) > 0.0: f_score = 2.0 * precision * recall / (precision + recall) return { 'accuracy': 100.0 * accuracy, 'precision': 100.0 * precision, 'recall': 100.0 * recall, 'f_score': 100.0 * f_score, }
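# Why _train_epoch and _evaluate wrap outputs as [(output, ) for output in
# outputs]: with a parallel_apply-style DataParallelCriterion (an assumption
# about the parallel.py used here), each replica's outputs are splatted into
# the loss as positional arguments, so every per-GPU output must be packed as
# a tuple. Toy shapes, hypothetical names:
import torch

per_gpu_logits = [torch.randn(4, 2), torch.randn(4, 2)]  # stand-ins for replica outputs
packed = [(logits,) for logits in per_gpu_logits]         # one args-tuple per replica
# loss = self.criterion(packed, batch.target)             # as in _train_epoch above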
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)
    n_special = 3  # XD: useless for language modeling task
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args, vocab, n_ctx, return_probs=True,
                       doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx,
                                 n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # strip the "module." prefix left by DataParallel training
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)

    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
                           encoder, num_workers=1, shuffle=True,
                           max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print('Hypothesis: {}'.format(hyps[i]))
        print("Reference: {}".format(refs[i]))
def train(config): net = BertForMaskedLM.from_pretrained(config.model) lossFunc = KLDivLoss(config) if torch.cuda.is_available(): net = net.cuda() lossFunc = lossFunc.cuda() if config.dataParallel: net = DataParallelModel(net) lossFunc = DataParallelCriterion(lossFunc) options = optionsLoader(LOG, config.optionFrames, disp=False) Tokenizer = BertTokenizer.from_pretrained(config.model) prepareFunc = prepare_data trainSet = Dataset('train', config.batch_size, lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer, options['dataset'], LOG, 'train') validSet = Dataset('valid', config.batch_size, lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer, options['dataset'], LOG, 'valid') print(trainSet.__len__()) Q = [] best_vloss = 1e99 counter = 0 lRate = config.lRate prob_src = config.prob_src prob_tgt = config.prob_tgt num_train_optimization_steps = trainSet.__len__( ) * options['training']['stopConditions']['max_epoch'] param_optimizer = list(net.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=lRate, e=1e-9, t_total=num_train_optimization_steps, warmup=0.0) for epoch_idx in range(options['training']['stopConditions']['max_epoch']): total_seen = 0 total_similar = 0 total_unseen = 0 total_source = 0 trainSet.setConfig(config, prob_src, prob_tgt) trainLoader = data.DataLoader(dataset=trainSet, batch_size=1, shuffle=True, num_workers=config.dataLoader_workers, pin_memory=True) validSet.setConfig(config, 0.0, prob_tgt) validLoader = data.DataLoader(dataset=validSet, batch_size=1, shuffle=False, num_workers=config.dataLoader_workers, pin_memory=True) for batch_idx, batch_data in enumerate(trainLoader): if (batch_idx + 1) % 10000 == 0: gc.collect() start_time = time.time() net.train() inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data inputs = inputs[0].cuda() positions = positions[0].cuda() token_types = token_types[0].cuda() labels = labels[0].cuda() masks = masks[0].cuda() total_seen += batch_seen total_similar += batch_similar total_unseen += batch_unseen total_source += batch_source n_token = int((labels.data != 0).data.sum()) predicts = net(inputs, positions, token_types, masks) loss = lossFunc(predicts, labels, n_token).sum() Q.append(float(loss)) if len(Q) > 200: Q.pop(0) loss_avg = sum(Q) / len(Q) optimizer.zero_grad() loss.backward() optimizer.step() LOG.log( 'Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f' % (epoch_idx + 1, batch_idx + 1, loss, loss_avg, time.time() - start_time)) # Checkpoints idx = epoch_idx * trainSet.__len__() + batch_idx + 1 if (idx >= options['training']['checkingPoints']['checkMin']) and ( idx % options['training']['checkingPoints']['checkFreq'] == 0): if config.do_eval: vloss = 0 total_tokens = 0 for bid, batch_data in enumerate(validLoader): inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data inputs = inputs[0].cuda() positions = positions[0].cuda() token_types = token_types[0].cuda() labels = labels[0].cuda() masks = masks[0].cuda() n_token = int((labels.data != config.PAD).data.sum()) with torch.no_grad(): net.eval() predicts = net(inputs, positions, token_types, masks) vloss += 
float(lossFunc(predicts, labels).sum()) total_tokens += n_token vloss /= total_tokens is_best = vloss < best_vloss best_vloss = min(vloss, best_vloss) LOG.log( 'CheckPoint: Validation Loss %11.8f, Best Loss %11.8f' % (vloss, best_vloss)) if is_best: LOG.log('Best Model Updated') save_check_point( { 'epoch': epoch_idx + 1, 'batch': batch_idx + 1, 'options': options, 'config': config, 'state_dict': net.state_dict(), 'best_vloss': best_vloss }, is_best, path=config.save_path, fileName='latest.pth.tar') counter = 0 else: counter += options['training']['checkingPoints'][ 'checkFreq'] if counter >= options['training']['stopConditions'][ 'rateReduce_bound']: counter = 0 for param_group in optimizer.param_groups: lr_ = param_group['lr'] param_group['lr'] *= 0.55 _lr = param_group['lr'] LOG.log( 'Reduce Learning Rate from %11.8f to %11.8f' % (lr_, _lr)) LOG.log('Current Counter = %d' % (counter)) else: save_check_point( { 'epoch': epoch_idx + 1, 'batch': batch_idx + 1, 'options': options, 'config': config, 'state_dict': net.state_dict(), 'best_vloss': 1e99 }, False, path=config.save_path, fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '_Batch' + str(batch_idx + 1) + '.pth.tar') LOG.log('CheckPoint Saved!') if options['training']['checkingPoints']['everyEpoch']: save_check_point( { 'epoch': epoch_idx + 1, 'batch': batch_idx + 1, 'options': options, 'config': config, 'state_dict': net.state_dict(), 'best_vloss': 1e99 }, False, path=config.save_path, fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar') LOG.log('Epoch Finished.') LOG.log( 'Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.' % (total_seen, total_unseen, total_similar, total_source)) gc.collect()
def test(config): Best_Model = torch.load(config.test_model) Tokenizer = BertTokenizer.from_pretrained(config.model) f_in = open(config.inputFile, 'r') net = BertForMaskedLM.from_pretrained(config.model) # When loading from a model not trained from DataParallel #net.load_state_dict(Best_Model['state_dict']) #net.eval() if torch.cuda.is_available(): net = net.cuda(0) if config.dataParallel: net = DataParallelModel(net) # When loading from a model trained from DataParallel net.load_state_dict(Best_Model['state_dict']) net.eval() mySearcher = Searcher(net, config) f_top1 = open('summary' + config.suffix + '.txt', 'w', encoding='utf-8') f_topK = open('summary' + config.suffix + '.txt.' + str(config.answer_size), 'w', encoding='utf-8') ed = '\n------------------------\n' for idx, line in enumerate(f_in): source_ = line.strip().split() source = Tokenizer.tokenize(line.strip()) mapping = mapping_tokenize(source_, source) source = Tokenizer.convert_tokens_to_ids(source) print(idx) print(detokenize(translate(source, Tokenizer), mapping), end=ed) l_pred = mySearcher.length_Predict(source) Answers = mySearcher.search(source) baseline = sum(Answers[0][0]) if config.reranking_method == 'none': Answers = sorted(Answers, key=lambda x: sum(x[0])) elif config.reranking_method == 'length_norm': Answers = sorted(Answers, key=lambda x: length_norm(x[0])) elif config.reranking_method == 'bounded_word_reward': Answers = sorted( Answers, key=lambda x: bounded_word_reward(x[0], config.reward, l_pred)) elif config.reranking_method == 'bounded_adaptive_reward': Answers = sorted( Answers, key=lambda x: bounded_adaptive_reward(x[0], x[2], l_pred)) texts = [ detokenize(translate(Answers[k][1], Tokenizer), mapping) for k in range(len(Answers)) ] if baseline != sum(Answers[0][0]): print('Reranked!') print(texts[0], end=ed) print(texts[0], file=f_top1) print(len(texts), file=f_topK) for i in range(len(texts)): print(Answers[i][0], file=f_topK) print(texts[i], file=f_topK) f_top1.close() f_topK.close()
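# The rerankers above sort candidates by a transformed score, where x[0] holds
# the candidate's per-token losses (lower is better). A sketch of a GNMT-style
# length_norm under that assumption; the real implementations of length_norm,
# bounded_word_reward, and bounded_adaptive_reward live elsewhere in this repo
# and may differ:
def length_norm(token_scores, alpha: float = 0.6):
    penalty = ((5.0 + len(token_scores)) / 6.0) ** alpha
    return sum(token_scores) / penalty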
        # target dimension[0] / 2
        # tar = target.contiguous().view(-1)
        # out = output.contiguous().view(target.size(0), -1)
        target = tar.contiguous().view(-1)
        output = out[:tar.size(0)]
        normalize = output.size(0) * output.size(1)
        output = output.contiguous().view(target.size(0), -1)
        loss = self.NLL(output, target) / normalize
        return loss


if not eval_model:
    criterion = NLLLoss(ignore_index=PAD)
    parallel_model = DataParallelModel(model)         # Encapsulate the model
    parallel_loss = DataParallelCriterion(criterion)  # Encapsulate the loss


# ---------------------------
# def merge_res(res):
#     ((inds1, log_probs1, enc_out1), (inds2, log_probs2, enc_out2)) = res
#     inds = T.cat([inds1, inds2], dim=0).cpu()
#     enc_out = T.cat([enc_out1, enc_out2], dim=0).cpu()
#     if type(log_probs1) != list:
#         log_probs = T.cat([log_probs1, log_probs2], dim=0)
#         return inds, log_probs, enc_out
#     else:
#         return inds, _, enc_out
if config.model_type=='LSTM': model = LSTMLM(input_size=len(vocab), embedding_size=config.embedding_size, hidden_size=config.hidden_size, output_size=len(vocab), n_layers=config.n_layers, dropout_p=config.dropout_p) elif config.model_type=='BiLSTM': model = BiLSTMLM(input_size=len(vocab), embedding_size=config.embedding_size, hidden_size=config.hidden_size, output_size=len(vocab), n_layers=config.n_layers, dropout_p=config.dropout_p) loss_fn = nn.NLLLoss(ignore_index=vocab.stoi[vocab.pad_token]) optimizer = optim.Adam(model.parameters()) if config.cuda: if config.multi_gpu: from parallel import DataParallelModel, DataParallelCriterion model = DataParallelModel(model).cuda() loss_fn = DataParallelCriterion(loss_fn).cuda() else: model = model.cuda() loss_fn = loss_fn.cuda() print('=========MODEL=========\n',model) # Train for epoch in range(1, config.epochs+1): train()
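# A sketch of the per-batch step that the train() call above presumes
# (hypothetical; the real loop lives elsewhere in this repo). With
# config.multi_gpu set, model(x) returns one log-prob tensor per GPU and
# loss_fn (DataParallelCriterion) scatters y and reduces the NLL; log-probs
# are assumed already reshaped to (N, vocab) against targets of shape (N,).
def train_step(model, loss_fn, optimizer, x, y):
    optimizer.zero_grad()
    log_probs = model(x)          # a list of per-GPU outputs when parallelized
    loss = loss_fn(log_probs, y)  # DataParallelCriterion handles the gather
    loss.backward()
    optimizer.step()
    return float(loss)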
def parallelize(self): self.parallel = True self.model = DataParallelModel(self.model) self.criterion = DataParallelCriterion(self.criterion)
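# Hypothetical usage of parallelize(): wrap only when more than one GPU is
# visible, before entering the training loop. The construction below is a
# sketch; any class in this file that defines the method would do.
import torch

net = TransferNetworkImg(device=torch.device('cuda'))  # hypothetical args
if torch.cuda.device_count() > 1:
    net.parallelize()  # replaces net.model / net.criterion with parallel wrappers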
def main(): args = setup_parser() args.final_eval = False if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup CUDA, GPU & distributed training device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] label_list = processor.get_labels(args.data_dir) num_labels = len(label_list) args.num_labels = num_labels # Load pretrained model and tokenizer args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.model_name_or_path, config=config) model.to(args.device) # logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train: # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) if args.n_gpu > 1: model = DataParallelModel(model) # Evaluation results = {} if args.do_eval: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( '-')[-1] if len(checkpoints) > 1 else "" model = 
model_class.from_pretrained(checkpoint) model.to(args.device) if args.n_gpu > 1: model = DataParallelModel(model) args.final_eval = True result = evaluate(args, model, tokenizer, prefix=global_step) result = dict( (k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) if args.save_embeddings: save_embeddings(args, model, tokenizer) return results
class TransferNetworkImg(Network): def __init__(self, model_name='DenseNet', model_type='cv_transfer', lr=0.02, criterion=nn.CrossEntropyLoss(), optimizer_name='Adam', dropout_p=0.45, pretrained=True, device=None, best_accuracy=0., best_validation_loss=None, best_model_file='best_model.pth', head={ 'num_outputs': 10, 'layers': [], 'model_type': 'classifier' }, class_names=[], num_classes=None, add_extra=True, set_params=True, set_head=True): super().__init__(device=device) self.set_transfer_model(model_name, pretrained=pretrained, add_extra=add_extra, dropout_p=dropout_p) if set_head: self.set_model_head(model_name=model_name, head=head, dropout_p=dropout_p, criterion=criterion, device=device) if set_params: self.set_model_params(criterion=criterion, optimizer_name=optimizer_name, lr=lr, dropout_p=dropout_p, model_name=model_name, model_type=model_type, best_accuracy=best_accuracy, best_validation_loss=best_validation_loss, best_model_file=best_model_file, class_names=class_names, num_classes=num_classes) self.model = self.model.to(device) def set_model_params(self, criterion=nn.CrossEntropyLoss(), optimizer_name='Adam', lr=0.1, dropout_p=0.45, model_name='DenseNet', model_type='cv_transfer', best_accuracy=0., best_validation_loss=None, best_model_file='best_model_file.pth', class_names=[], num_classes=None): print('Transfer Learning: current best accuracy = {:.3f}'.format( best_accuracy)) super(TransferNetworkImg, self).set_model_params(criterion=criterion, optimizer_name=optimizer_name, lr=lr, dropout_p=dropout_p, model_name=model_name, model_type=model_type, best_accuracy=best_accuracy, best_validation_loss=best_validation_loss, best_model_file=best_model_file) self.class_names = class_names self.num_classes = num_classes if len(class_names) == 0: self.class_names = { k: str(v) for k, v in enumerate(list(range(self.head['num_outputs']))) } def forward(self, x): return self.model(x) def freeze(self, train_classifier=True): super(TransferNetworkImg, self).freeze() if train_classifier: for param in self.model.fc.parameters(): param.requires_grad = True def parallelize(self): self.parallel = True self.model = DataParallelModel(self.model) self.criterion = DataParallelCriterion(self.criterion) def set_transfer_model(self, mname, pretrained=True, add_extra=True, dropout_p=0.45): self.model = None models_dict = { 'densenet': { 'model': models.densenet121(pretrained=pretrained), 'conv_channels': 1024 }, 'resnet34': { 'model': models.resnet34(pretrained=pretrained), 'conv_channels': 512 }, 'resnet50': { 'model': models.resnet50(pretrained=pretrained), 'conv_channels': 2048 } } meta = models_dict[mname.lower()] try: model = meta['model'] for param in model.parameters(): param.requires_grad = False self.model = model print( 'Setting transfer learning model: self.model set to {}'.format( mname)) except: print( 'Setting transfer learning model: model name {} not supported'. 
format(mname)) # creating and adding extra layers to the model dream_model = None if add_extra: channels = meta['conv_channels'] dream_model = nn.Sequential( nn.Conv2d(channels, channels, 3, 1, 1), # Printer(), nn.BatchNorm2d(channels), nn.ReLU(True), nn.Dropout2d(dropout_p), nn.Conv2d(channels, channels, 3, 1, 1), nn.BatchNorm2d(channels), nn.ReLU(True), nn.Dropout2d(dropout_p), nn.Conv2d(channels, channels, 3, 1, 1), nn.BatchNorm2d(channels), nn.ReLU(True), nn.Dropout2d(dropout_p)) self.dream_model = dream_model def set_model_head( self, model_name='DenseNet', head={ 'num_outputs': 10, 'layers': [], 'class_names': None, 'model_type': 'classifier' }, criterion=nn.NLLLoss(), adaptive=True, dropout_p=0.45, device=None): models_meta = { 'resnet34': { 'conv_channels': 512, 'head_id': -2, 'adaptive_head': [DAI_AvgPool], 'normal_head': [nn.AvgPool2d(7, 1)] }, 'resnet50': { 'conv_channels': 2048, 'head_id': -2, 'adaptive_head': [DAI_AvgPool], 'normal_head': [nn.AvgPool2d(7, 1)] }, 'densenet': { 'conv_channels': 1024, 'head_id': -1, 'adaptive_head': [nn.ReLU(inplace=True), DAI_AvgPool], 'normal_head': [nn.ReLU(inplace=True), nn.AvgPool2d(7, 1)] } } name = model_name.lower() meta = models_meta[name] modules = list(self.model.children()) l = modules[:meta['head_id']] if self.dream_model: l += self.dream_model if type(head).__name__ != 'dict': model = nn.Sequential(*l) for layer in head.children(): if (type(layer).__name__) == 'StdConv': conv_module = layer break conv_layer = conv_module.conv temp_args = [ conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride, conv_layer.padding ] temp_args.insert(0, meta['conv_channels']) conv_layer = nn.Conv2d(*temp_args) conv_module.conv = conv_layer model.add_module('custom_head', head) else: head['criterion'] = criterion if head['model_type'].lower() == 'classifier': head['output_non_linearity'] = None self.num_outputs = head['num_outputs'] fc = modules[-1] try: in_features = fc.in_features except: in_features = fc.model.out.in_features fc = FC(num_inputs=in_features, num_outputs=head['num_outputs'], layers=head['layers'], model_type=head['model_type'], output_non_linearity=head['output_non_linearity'], dropout_p=dropout_p, criterion=head['criterion'], optimizer_name=None, device=device) if adaptive: l += meta['adaptive_head'] else: l += meta['normal_head'] model = nn.Sequential(*l) model.add_module('fc', fc) self.model = model self.head = head if type(head).__name__ == 'dict': print('Model: {}, Setting head: inputs: {} hidden:{} outputs: {}'. format(model_name, in_features, head['layers'], head['num_outputs'])) else: print('Model: {}, Setting head: {}'.format(model_name, type(head).__name__)) def _get_dropout(self): return self.dropout_p def _set_dropout(self, p=0.45): if self.model.classifier is not None: print('{}: setting head (FC) dropout prob to {:.3f}'.format( self.model_name, p)) self.model.fc._set_dropout(p=p) def get_model_params(self): params = super(TransferNetworkImg, self).get_model_params() params['class_names'] = self.class_names params['num_classes'] = self.num_classes params['head'] = self.head return params
def createModels(args, userNum, itemNum, adj): if args.model == 'NCF': model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda() if args.model == 'NMF': model = NMF(args.model, userNum, itemNum, 3, args.embedSize, args.droprate).cuda() elif args.model == 'NGCFMF': model = NGCFMF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMLP': model = NGCFMLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMFMLP': model = NGCFMFMLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMF_concat_MF': model = NGCFMF_concat_MF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMF_concat_MLP': model = NGCFMF_concat_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMLP_concat_MF': model = NGCFMLP_concat_MF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMLP_concat_MLP': model = NGCFMLP_concat_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMF_concat_MF_MLP': model = NGCFMF_concat_MF_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'NGCFMLP_concat_MF_MLP': model = NGCFMLP_concat_MF_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda() elif args.model == 'GACFV1': model = GACFV1(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFV2': model = GACFV2(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFMask': model = GACFMask(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'SPGA': model = SPGACF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFV3': model = GACFV3(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFV4': model = GACFV4(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFV5': model = GACFV5(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() elif args.model == 'GACFV6': model = GACFV6(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda() if args.train_mode == 'PairSampling': lossfn = BPRLoss() if args.parallel == True: model = DataParallelModel(model) lossfn = DataParallelCriterion2(lossfn) elif args.train_mode == 'NegSampling': lossfn = BCEWithLogitsLoss() if args.parallel == True: model = DataParallelModel(model) # 并行化model lossfn = DataParallelCriterion(lossfn) # 并行化损失函数 optim = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) return model, lossfn, optim
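# Hypothetical call site for createModels, mirroring the argparse fields used
# above (a sketch, not code from the repo; the model's forward signature is
# assumed to score batches of (user, item) indices, and train_loader is a
# stand-in for whatever sampler NegSampling mode uses):
model, lossfn, optim = createModels(args, userNum, itemNum, adj)
for users, items, labels in train_loader:
    optim.zero_grad()
    scores = model(users.cuda(), items.cuda())    # a list of per-GPU scores if parallel
    loss = lossfn(scores, labels.float().cuda())  # DataParallelCriterion gathers
    loss.backward()
    optim.step()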
class FoodIngredients(Network):
    def __init__(self,
                 model_name='DenseNet',
                 model_type='food',
                 lr=0.02,
                 optimizer_name='Adam',
                 criterion1=nn.CrossEntropyLoss(),
                 criterion2=nn.BCEWithLogitsLoss(),
                 dropout_p=0.45,
                 pretrained=True,
                 device=None,
                 best_accuracy=0.,
                 best_validation_loss=None,
                 best_model_file='best_model.pth',
                 head1={
                     'num_outputs': 10,
                     'layers': [],
                     'model_type': 'classifier'
                 },
                 head2={
                     'num_outputs': 10,
                     'layers': [],
                     'model_type': 'multi_label_classifier'
                 },
                 class_names=[],
                 num_classes=None,
                 ingredient_names=[],
                 num_ingredients=None,
                 add_extra=True,
                 set_params=True,
                 set_head=True):
        super().__init__(device=device)
        self.set_transfer_model(model_name,
                                pretrained=pretrained,
                                add_extra=add_extra,
                                dropout_p=dropout_p)
        if set_head:
            self.set_model_head(model_name=model_name,
                                head1=head1,
                                head2=head2,
                                dropout_p=dropout_p,
                                criterion1=criterion1,
                                criterion2=criterion2,
                                device=device)
        if set_params:
            self.set_model_params(criterion1=criterion1,
                                  criterion2=criterion2,
                                  optimizer_name=optimizer_name,
                                  lr=lr,
                                  dropout_p=dropout_p,
                                  model_name=model_name,
                                  model_type=model_type,
                                  best_accuracy=best_accuracy,
                                  best_validation_loss=best_validation_loss,
                                  best_model_file=best_model_file,
                                  class_names=class_names,
                                  num_classes=num_classes,
                                  ingredient_names=ingredient_names,
                                  num_ingredients=num_ingredients)
        self.model = self.model.to(device)

    def set_model_params(self,
                         criterion1=nn.CrossEntropyLoss(),
                         criterion2=nn.BCEWithLogitsLoss(),
                         optimizer_name='Adam',
                         lr=0.1,
                         dropout_p=0.45,
                         model_name='DenseNet',
                         model_type='cv_transfer',
                         best_accuracy=0.,
                         best_validation_loss=None,
                         best_model_file='best_model_file.pth',
                         head1={
                             'num_outputs': 10,
                             'layers': [],
                             'model_type': 'classifier'
                         },
                         head2={
                             'num_outputs': 10,
                             'layers': [],
                             'model_type': 'multi_label_classifier'
                         },
                         class_names=[],
                         num_classes=None,
                         ingredient_names=[],
                         num_ingredients=None):
        print('Food Ingredients: current best accuracy = {:.3f}'.format(
            best_accuracy))
        if best_validation_loss is not None:
            print('Food Ingredients: current best loss = {:.3f}'.format(
                best_validation_loss))
        super(FoodIngredients,
              self).set_model_params(optimizer_name=optimizer_name,
                                     lr=lr,
                                     dropout_p=dropout_p,
                                     model_name=model_name,
                                     model_type=model_type,
                                     best_accuracy=best_accuracy,
                                     best_validation_loss=best_validation_loss,
                                     best_model_file=best_model_file)
        self.class_names = class_names
        self.num_classes = num_classes
        self.ingredient_names = ingredient_names
        self.num_ingredients = num_ingredients
        self.criterion1 = criterion1
        self.criterion2 = criterion2

    def forward(self, x):
        l = list(self.model.children())
        for m in l[:-2]:
            x = m(x)
        food = l[-2](x)
        ingredients = l[-1](x)
        return (food, ingredients)

    def compute_loss(self, outputs, labels, w1=1., w2=1.):
        out1, out2 = outputs
        label1, label2 = labels
        loss1 = self.criterion1(out1, label1)
        loss2 = self.criterion2(out2, label2)
        return [(loss1 * w1) + (loss2 * w2)]

    def freeze(self, train_classifier=True):
        super(FoodIngredients, self).freeze()
        if train_classifier:
            for param in self.model.fc1.parameters():
                param.requires_grad = True
            for param in self.model.fc2.parameters():
                param.requires_grad = True

    def parallelize(self):
        self.parallel = True
        self.model = DataParallelModel(self.model)
        # wrap both heads' criteria (this class has no single self.criterion)
        self.criterion1 = DataParallelCriterion(self.criterion1)
        self.criterion2 = DataParallelCriterion(self.criterion2)

    def set_transfer_model(self,
                           mname,
                           pretrained=True,
                           add_extra=True,
                           dropout_p=0.45):
        self.model = None
        models_dict = {
            'densenet': {
                'model': models.densenet121(pretrained=pretrained),
                'conv_channels': 1024
            },
            'resnet34': {
                'model': models.resnet34(pretrained=pretrained),
                'conv_channels': 512
            },
            'resnet50': {
                'model': models.resnet50(pretrained=pretrained),
                'conv_channels': 2048
            }
        }
        try:
            meta = models_dict[mname.lower()]
            model = meta['model']
            for param in model.parameters():
                param.requires_grad = False
            self.model = model
            print('Setting transfer learning model: self.model set to {}'.
                  format(mname))
        except KeyError:
            print('Setting transfer learning model: model name {} not supported'.
                  format(mname))

        # creating and adding extra layers to the model
        dream_model = None
        if add_extra and self.model is not None:
            channels = meta['conv_channels']
            dream_model = nn.Sequential(
                nn.Conv2d(channels, channels, 3, 1, 1),
                # Printer(),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p))
        self.dream_model = dream_model

    def set_model_head(self,
                       model_name='DenseNet',
                       head1={
                           'num_outputs': 10,
                           'layers': [],
                           'class_names': None,
                           'model_type': 'classifier'
                       },
                       head2={
                           'num_outputs': 10,
                           'layers': [],
                           'class_names': None,
                           'model_type': 'multi_label_classifier'
                       },
                       criterion1=nn.CrossEntropyLoss(),
                       criterion2=nn.BCEWithLogitsLoss(),
                       adaptive=True,
                       dropout_p=0.45,
                       device=None):
        models_meta = {
            'resnet34': {
                'conv_channels': 512,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'resnet50': {
                'conv_channels': 2048,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'densenet': {
                'conv_channels': 1024,
                'head_id': -1,
                'adaptive_head': [nn.ReLU(inplace=True), DAI_AvgPool],
                'normal_head': [nn.ReLU(inplace=True), nn.AvgPool2d(7, 1)]
            }
        }
        name = model_name.lower()
        meta = models_meta[name]
        modules = list(self.model.children())
        l = modules[:meta['head_id']]
        if self.dream_model:
            l += self.dream_model
        heads = [head1, head2]
        crits = [criterion1, criterion2]
        fcs = []
        for head, criterion in zip(heads, crits):
            head['criterion'] = criterion
            if head['model_type'].lower() == 'classifier':
                head['output_non_linearity'] = None
            fc = modules[-1]
            try:
                in_features = fc.in_features
            except AttributeError:
                in_features = fc.model.out.in_features
            fc = FC(num_inputs=in_features,
                    num_outputs=head['num_outputs'],
                    layers=head['layers'],
                    model_type=head['model_type'],
                    output_non_linearity=head.get('output_non_linearity'),
                    dropout_p=dropout_p,
                    criterion=head['criterion'],
                    optimizer_name=None,
                    device=device)
            fcs.append(fc)
        if adaptive:
            l += meta['adaptive_head']
        else:
            l += meta['normal_head']
        model = nn.Sequential(*l)
        model.add_module('fc1', fcs[0])
        model.add_module('fc2', fcs[1])
        self.model = model
        self.head1 = head1
        self.head2 = head2
        print('Multi-head set up complete.')

    def train_(self, e, trainloader, optimizer, print_every):
        epoch, epochs = e
        self.train()
        t0 = time.time()
        t1 = time.time()
        batches = 0
        running_loss = 0.
        for data_batch in trainloader:
            inputs, label1, label2 = data_batch[0], data_batch[1], data_batch[2]
            batches += 1
            inputs = inputs.to(self.device)
            label1 = label1.to(self.device)
            label2 = label2.to(self.device)
            labels = (label1, label2)
            optimizer.zero_grad()
            outputs = self.forward(inputs)
            loss = self.compute_loss(outputs, labels)[0]
            if self.parallel:
                loss = loss.sum()  # gather per-GPU losses into one scalar
            loss.backward()
            optimizer.step()
            loss = loss.item()
            running_loss += loss
            if batches % print_every == 0:
                elapsed = time.time() - t1
                if elapsed > 60:
                    elapsed /= 60.
                    measure = 'min'
                else:
                    measure = 'sec'
                batch_time = time.time() - t0
                if batch_time > 60:
                    batch_time /= 60.
                    measure2 = 'min'
                else:
                    measure2 = 'sec'
                print(
                    '+----------------------------------------------------------------------+\n'
                    f"{time.asctime().split()[-2]}\n"
                    f"Time elapsed: {elapsed:.3f} {measure}\n"
                    f"Epoch: {epoch+1}/{epochs}\n"
                    f"Batch: {batches}/{len(trainloader)}\n"
                    f"Batch training time: {batch_time:.3f} {measure2}\n"
                    f"Batch training loss: {loss:.3f}\n"
                    f"Average training loss: {running_loss/batches:.3f}\n"
                    '+----------------------------------------------------------------------+\n'
                )
                t0 = time.time()
        return running_loss / len(trainloader)

    def evaluate(self, dataloader, metric='accuracy'):
        running_loss = 0.
        classifier = None
        if self.model_type == 'classifier':  # or self.num_classes is not None:
            classifier = Classifier(self.class_names)
        y_pred = []
        y_true = []
        self.eval()
        rmse_ = 0.
        with torch.no_grad():
            for data_batch in dataloader:
                inputs, label1, label2 = data_batch[0], data_batch[1], \
                    data_batch[2]
                inputs = inputs.to(self.device)
                label1 = label1.to(self.device)
                label2 = label2.to(self.device)
                labels = (label1, label2)
                outputs = self.forward(inputs)
                loss = self.compute_loss(outputs, labels)[0]
                if self.parallel:
                    running_loss += loss.sum().item()
                    outputs = parallel.gather(outputs, self.device)
                else:
                    running_loss += loss.item()
                # accuracy metrics are computed on the food (first) head only
                food_out = outputs[0]
                if classifier is not None and metric == 'accuracy':
                    classifier.update_accuracies(food_out, label1)
                    y_true.extend(list(label1.cpu().numpy()))
                    _, preds = torch.max(torch.exp(food_out), 1)
                    y_pred.extend(list(preds.cpu().numpy()))
                elif metric == 'rmse':
                    rmse_ += rmse(food_out, label1).cpu().numpy()
        self.train()
        ret = {}
        # print('Running_loss: {:.3f}'.format(running_loss))
        if metric == 'rmse':
            print('Total rmse: {:.3f}'.format(rmse_))
            ret['final_rmse'] = rmse_ / len(dataloader)
        ret['final_loss'] = running_loss / len(dataloader)
        if classifier is not None:
            ret['accuracy'], ret['class_accuracies'] = \
                classifier.get_final_accuracies()
            ret['report'] = classification_report(
                y_true, y_pred, target_names=self.class_names)
            ret['confusion_matrix'] = confusion_matrix(y_true, y_pred)
            try:
                ret['roc_auc_score'] = roc_auc_score(y_true, y_pred)
            except ValueError:
                pass
        return ret

    def evaluate_food(self, dataloader, metric='accuracy'):
        running_loss = 0.
        classifier = Classifier(self.class_names)
        y_pred = []
        y_true = []
        self.eval()
        rmse_ = 0.
        with torch.no_grad():
            for data_batch in dataloader:
                inputs, labels = data_batch[0], data_batch[1]
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                outputs = self.forward(inputs)[0]
                if classifier is not None and metric == 'accuracy':
                    try:
                        classifier.update_accuracies(outputs, labels)
                        y_true.extend(list(labels.squeeze(0).cpu().numpy()))
                        _, preds = torch.max(torch.exp(outputs), 1)
                        y_pred.extend(list(preds.cpu().numpy()))
                    except Exception:
                        pass
                elif metric == 'rmse':
                    rmse_ += rmse(outputs, labels).cpu().numpy()
        self.train()
        ret = {}
        # print('Running_loss: {:.3f}'.format(running_loss))
        if metric == 'rmse':
            print('Total rmse: {:.3f}'.format(rmse_))
            ret['final_rmse'] = rmse_ / len(dataloader)
        ret['final_loss'] = running_loss / len(dataloader)
        if classifier is not None:
            ret['accuracy'], ret['class_accuracies'] = \
                classifier.get_final_accuracies()
            ret['report'] = classification_report(
                y_true, y_pred, target_names=self.class_names)
            ret['confusion_matrix'] = confusion_matrix(y_true, y_pred)
            try:
                ret['roc_auc_score'] = roc_auc_score(y_true, y_pred)
            except ValueError:
                pass
        return ret

    def find_lr(self,
                trn_loader,
                init_value=1e-8,
                final_value=10.,
                beta=0.98,
                plot=False):
        print('\nFinding the ideal learning rate.')
        model_state = copy.deepcopy(self.model.state_dict())
        optim_state = copy.deepcopy(self.optimizer.state_dict())
        optimizer = self.optimizer
        num = len(trn_loader) - 1
        mult = (final_value / init_value)**(1 / num)
        lr = init_value
        optimizer.param_groups[0]['lr'] = lr
        avg_loss = 0.
        best_loss = 0.
        batch_num = 0
        losses = []
        log_lrs = []
        for data_batch in trn_loader:
            batch_num += 1
            inputs, label1, label2 = data_batch[0], data_batch[1], \
                data_batch[2]
            inputs = inputs.to(self.device)
            label1 = label1.to(self.device)
            label2 = label2.to(self.device)
            labels = (label1, label2)
            optimizer.zero_grad()
            outputs = self.forward(inputs)
            loss = self.compute_loss(outputs, labels)[0]
            # Compute the smoothed loss
            if self.parallel:
                avg_loss = beta * avg_loss + (1 - beta) * loss.sum().item()
            else:
                avg_loss = beta * avg_loss + (1 - beta) * loss.item()
            smoothed_loss = avg_loss / (1 - beta**batch_num)
            # Stop if the loss is exploding
            if batch_num > 1 and smoothed_loss > 4 * best_loss:
                self.log_lrs, self.find_lr_losses = log_lrs, losses
                self.model.load_state_dict(model_state)
                self.optimizer.load_state_dict(optim_state)
                if plot:
                    self.plot_find_lr()
                temp_lr = self.log_lrs[np.argmin(self.find_lr_losses) -
                                       (len(self.log_lrs) // 8)]
                self.lr = (10**temp_lr)
                print('Found it: {}\n'.format(self.lr))
                return self.lr
            # Record the best loss
            if smoothed_loss < best_loss or batch_num == 1:
                best_loss = smoothed_loss
            # Store the values
            losses.append(smoothed_loss)
            log_lrs.append(math.log10(lr))
            # Do the SGD step
            if self.parallel:
                loss.sum().backward()
            else:
                loss.backward()
            optimizer.step()
            # Update the lr for the next step
            lr *= mult
            optimizer.param_groups[0]['lr'] = lr
        self.log_lrs, self.find_lr_losses = log_lrs, losses
        self.model.load_state_dict(model_state)
        self.optimizer.load_state_dict(optim_state)
        if plot:
            self.plot_find_lr()
        temp_lr = self.log_lrs[np.argmin(self.find_lr_losses) -
                               (len(self.log_lrs) // 10)]
        self.lr = (10**temp_lr)
        print('Found it: {}\n'.format(self.lr))
        return self.lr

    def plot_find_lr(self):
        plt.ylabel("Loss")
        plt.xlabel("Learning Rate (log scale)")
        plt.plot(self.log_lrs, self.find_lr_losses)
        plt.show()

    def classify(self, inputs, thresh=0.4):  # ,show=False,mean=None,std=None):
        outputs = self.predict(inputs)
        food, ing = outputs
        try:
            _, preds = torch.max(torch.exp(food), 1)
        except (IndexError, RuntimeError):
            _, preds = torch.max(torch.exp(food.unsqueeze(0)), 1)
        ing_outs = ing.sigmoid()
        ings = (ing_outs >= thresh)
        class_preds = [str(self.class_names[p]) for p in preds]
        ing_preds = [[
            self.ingredient_names[i]
            for i in p.nonzero().squeeze(1).cpu().tolist()
        ] for p in ings]
        return class_preds, ing_preds

    def _get_dropout(self):
        return self.dropout_p

    def get_model_params(self):
        params = super(FoodIngredients, self).get_model_params()
        params['class_names'] = self.class_names
        params['num_classes'] = self.num_classes
        params['ingredient_names'] = self.ingredient_names
        params['num_ingredients'] = self.num_ingredients
        params['head1'] = self.head1
        params['head2'] = self.head2
        return params
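# --- Illustrative sketch (not part of the original class) ---
# The two-head pattern above in miniature: one shared trunk, a softmax-style
# head for the food class and a multi-label head for ingredients, trained
# with the same weighted CrossEntropy + BCEWithLogits sum that compute_loss
# uses. The layer sizes and the dummy batch are arbitrary assumptions.
import torch
import torch.nn as nn

trunk = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 32), nn.ReLU())
food_head = nn.Linear(32, 5)           # 5 hypothetical food classes
ingredient_head = nn.Linear(32, 12)    # 12 hypothetical ingredients

criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.BCEWithLogitsLoss()

x = torch.randn(4, 3, 8, 8)
label1 = torch.randint(0, 5, (4,))             # one class per sample
label2 = torch.randint(0, 2, (4, 12)).float()  # multi-hot ingredient targets

feats = trunk(x)
food, ingredients = food_head(feats), ingredient_head(feats)
loss = 1. * criterion1(food, label1) + 1. * criterion2(ingredients, label2)
loss.backward()
print(float(loss))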
class LMTrainer:
    def __init__(self, model, mask_prob: float = 0.15, clip: int = 1,
                 optimizer=None):
        self.model = model
        self.clip = clip
        self.optimizer = optimizer
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.mask_prob = mask_prob
        self.criterion = nn.NLLLoss(
            ignore_index=model.text_processor.pad_token_id())
        num_gpu = torch.cuda.device_count()
        if num_gpu > 1:
            print("Let's use", num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)
        self.best_dev_loss = float("inf")
        self.best_train_loss = float("inf")
        self.last_train_loss = float("inf")

    def train_epoch(self, data_iter: data_utils.DataLoader,
                    dev_data_iter: data_utils.DataLoader, saving_path: str,
                    step: int):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
        model = self.model.module if hasattr(self.model,
                                             "module") else self.model
        for i, batch in enumerate(data_iter):
            if self.optimizer is not None:
                self.optimizer.zero_grad()
            mask, target, texts = mask_text(self.mask_prob, batch["pad_mask"],
                                            batch["texts"],
                                            model.text_processor)
            try:
                predictions = self.model(mask=mask, texts=texts,
                                         pads=batch["pad_mask"],
                                         langs=batch["langs"])
                ntokens = target.size(0)
                if ntokens == 0:  # Nothing to predict!
                    continue
                loss = self.criterion(predictions, target).mean()
                loss.backward()
                unmask_text(mask, target, texts)
                if self.optimizer is not None:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                    self.optimizer.step()
                    step += 1
                loss = float(loss.data) * ntokens
                total_loss += loss
                cur_loss += loss
                total_tokens += ntokens
                tokens += ntokens
                if step % 50 == 0:
                    elapsed = time.time() - start
                    print(datetime.datetime.now(),
                          "Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                          (step, cur_loss / tokens, tokens / elapsed))
                    if step % 500 == 0:
                        self.validate_and_save(saving_path, dev_data_iter)
                    start, tokens, cur_loss = time.time(), 0, 0
            except RuntimeError:
                print("Problem with batch item", texts.size())
                torch.cuda.empty_cache()
        current_loss = total_loss / total_tokens
        print("Total loss in this epoch: %f" % current_loss)
        if current_loss < self.best_train_loss:
            self.best_train_loss = current_loss
            model_to_save = (self.model.module
                             if hasattr(self.model, "module") else self.model)
            model_to_save.save(saving_path + ".latest")
            with open(os.path.join(saving_path + ".latest", "optim"),
                      "wb") as fp:
                pickle.dump(self.optimizer, fp)
        self.last_train_loss = current_loss
        self.validate_and_save(saving_path, dev_data_iter)
        return step

    def validate_and_save(self, saving_path, dev_data_iter):
        with torch.no_grad():
            model = self.model.module if hasattr(self.model,
                                                 "module") else self.model
            model.eval()
            total_dev_loss, total_dev_tokens = 0, 0
            for batch in dev_data_iter:
                mask, target, texts = mask_text(self.mask_prob,
                                                batch["pad_mask"],
                                                batch["texts"].clone(),
                                                model.text_processor)
                predictions = self.model(mask=mask, texts=texts,
                                         pads=batch["pad_mask"],
                                         langs=batch["langs"])
                ntokens = target.size(0)
                if ntokens == 0:  # Nothing to predict!
                    continue
                loss = self.criterion(predictions,
                                      target).mean().data * ntokens
                total_dev_loss += float(loss)
                total_dev_tokens += ntokens
            dev_loss = total_dev_loss / total_dev_tokens
            print("Current dev loss", dev_loss)
            if self.best_dev_loss > float(dev_loss):
                self.best_dev_loss = float(dev_loss)
                print("saving best dev loss", self.best_dev_loss)
                model_to_save = (self.model.module
                                 if hasattr(self.model, "module") else
                                 self.model)
                model_to_save.save(saving_path)
                with open(os.path.join(saving_path, "optim"), "wb") as fp:
                    pickle.dump(self.optimizer, fp)
            model.train()

    @staticmethod
    def config_dropout(model, dropout):
        model.encoder.config.hidden_dropout_prob = dropout
        model.encoder.config.attention_probs_dropout_prob = dropout

    @staticmethod
    def train(options):
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)
        text_processor = TextProcessor(options.tokenizer_path)
        lm_class = ReformerLM if options.reformer else LM
        if options.pretrained_path is None:
            lm = lm_class(text_processor=text_processor,
                          size=options.model_size)
        else:
            lm = lm_class.load(options.pretrained_path)
        if options.reformer:
            lm.config.hidden_dropout_prob = options.dropout
            lm.config.local_attention_probs_dropout_prob = options.dropout
            lm.config.lsh_attention_probs_dropout_prob = options.dropout
        else:
            LMTrainer.config_dropout(lm, options.dropout)
        train_data = dataset.TextDataset(save_cache_dir=options.train_path,
                                         max_cache_size=options.cache_size)
        dev_data = dataset.TextDataset(save_cache_dir=options.dev_path,
                                       max_cache_size=options.cache_size,
                                       load_all=True)
        if options.continue_train:
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(lm, options.learning_rate,
                                        options.warmup)
        trainer = LMTrainer(model=lm, mask_prob=options.mask_prob,
                            optimizer=optimizer, clip=options.clip)
        collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
        train_sampler, dev_sampler = None, None
        pin_memory = torch.cuda.is_available()
        loader = data_utils.DataLoader(train_data,
                                       batch_size=options.batch,
                                       shuffle=False,
                                       pin_memory=pin_memory,
                                       collate_fn=collator,
                                       sampler=train_sampler)
        dev_loader = data_utils.DataLoader(dev_data,
                                           batch_size=options.batch,
                                           shuffle=False,
                                           pin_memory=pin_memory,
                                           collate_fn=collator,
                                           sampler=dev_sampler)
        step, train_epoch = 0, 1
        while step <= options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(data_iter=loader,
                                       dev_data_iter=dev_loader,
                                       saving_path=options.model_path,
                                       step=step)
            train_epoch += 1
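# --- Illustrative sketch (not from the original repo) ---
# The core of the masked-LM loop above, reduced to plain torch: hide a random
# mask_prob fraction of non-pad tokens, predict them, and score with NLLLoss.
# The toy vocabulary, the embedding/linear stand-in for the LM, and the use of
# pad_id as the mask token are all assumptions; the real code relies on
# mask_text/unmask_text and the project's LM classes.
import torch
import torch.nn as nn

pad_id, vocab, mask_prob = 0, 100, 0.15
texts = torch.randint(1, vocab, (4, 12))        # (batch, seq) token ids
pad_mask = texts != pad_id
mask = (torch.rand(texts.shape) < mask_prob) & pad_mask
target = texts[mask]                            # tokens the model must recover
masked_texts = texts.masked_fill(mask, pad_id)  # hide them from the model

# stand-in "LM": embedding + linear projection to vocab log-probs
embed = nn.Embedding(vocab, 32)
proj = nn.Linear(32, vocab)
log_probs = nn.functional.log_softmax(proj(embed(masked_texts)), dim=-1)
predictions = log_probs[mask]                   # (n_masked, vocab)

loss = nn.NLLLoss(ignore_index=pad_id)(predictions, target)
loss.backward()
print(float(loss))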