def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--meta_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--classifier', default='guoday', type=str, required=True,
                        help='classifier type, guoday or MLP or GRU_MLP or ...')
    parser.add_argument('--optimizer', default='RAdam', type=str, required=True,
                        help='optimizer we use, RAdam or ...')
    parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True,
                        help="Whether to do label smoothing. yes or no.")
    parser.add_argument('--draw_loss_steps', default=1, type=int, required=True,
                        help='training steps to draw loss')
    parser.add_argument('--label_name', default='label', type=str, required=True,
                        help='label name in original train set index')

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", default='yes', type=str, required=True,
                        help="Whether to run training. yes or no.")
    parser.add_argument("--do_test", default='yes', type=str, required=True,
                        help="Whether to run testing. yes or no.")
    parser.add_argument("--do_eval", default='yes', type=str, required=True,
                        help="Whether to run eval on the dev set. yes or no.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--eval_steps", default=200, type=int,
                        help="Report/evaluate every X update steps.")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--dropout", default=0.5, type=float,
                        help="Hidden dropout probability.")
    parser.add_argument("--train_steps", default=-1, type=int,
                        help="Total number of training steps.")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int,
                        help="Number of segments to split the input text into.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # tensorboard_log_dir = args.output_dir # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now') # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean') # loss_now_variable = loss_now # loss_mean_variable = loss_mean # train_loss = tf.summary.scalar('train_loss', loss_now_variable) # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable) # merged = tf.summary.merge([train_loss, dev_loss_mean]) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) config.hidden_dropout_prob = args.dropout # Prepare model if args.do_train == 'yes': model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
                )
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train == 'yes':
        print('________________________now training______________________________')

        # Prepare data loader
        train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'),
                                       is_training=True, label_name=args.label_name)
        train_features = convert_examples_to_features(train_examples, tokenizer,
                                                      args.max_seq_length, args.split_num, True)
        # print('train_feature_size=', train_features.__sizeof__())
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # print('train_data=', train_data[0])
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # (it would produce None grads that break apex)
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        if args.optimizer == 'RAdam':
            optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate)
        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                              eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        loss_batch = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        # with tf.Session() as sess:
        #     summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph)
        #     sess.run(tf.global_variables_initializer())
        list_loss_mean = []
        bx = []
        eval_F1 = []
        ax = []

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                         attention_mask=input_mask, labels=label_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss_batch += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # optimizer.backward(loss) loss.backward() else: loss.backward() # draw loss every n docs if (step + 1) % int(args.draw_loss_steps / (args.train_batch_size / args.gradient_accumulation_steps)) == 0: list_loss_mean.append(round(loss_batch, 4)) bx.append(step + 1) plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1, color='b', marker='o', markerfacecolor='green', markersize=2) plt.savefig(args.output_dir + '/labeled.jpg') loss_batch = 0 # paras update every batch data. if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 # report results every 200 real batch. if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # do evaluation totally 10 times during training stage. if args.do_eval == 'yes' and (step + 1) % int( num_train_optimization_steps / 10) == 0 and step > 450: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += 
tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_labels = np.concatenate(inference_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() ############################################### num_gold_0 = np.sum(gold_labels == 0) num_gold_1 = np.sum(gold_labels == 1) num_gold_2 = np.sum(gold_labels == 2) right_0 = 0 right_1 = 0 right_2 = 0 error_0 = 0 error_1 = 0 error_2 = 0 for gold_label, inference_label in zip( gold_labels, inference_labels): if gold_label == inference_label: if gold_label == 0: right_0 += 1 elif gold_label == 1: right_1 += 1 else: right_2 += 1 elif inference_label == 0: error_0 += 1 elif inference_label == 1: error_1 += 1 else: error_2 += 1 recall_0 = right_0 / (num_gold_0 + 1e-5) recall_1 = right_1 / (num_gold_1 + 1e-5) recall_2 = right_2 / (num_gold_2 + 1e-5) precision_0 = right_0 / (error_0 + right_0 + 1e-5) precision_1 = right_1 / (error_1 + right_1 + 1e-5) precision_2 = right_2 / (error_2 + right_2 + 1e-5) f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5) f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5) f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5) output_dev_result_file = os.path.join( args.output_dir, "dev_results.txt") with open(output_dev_result_file, 'a', encoding='utf-8') as f: f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n') f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n') f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n') eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # draw loss. eval_F1.append(round(eval_accuracy, 4)) ax.append(step) plt.plot(ax, eval_F1, label='eval_F1', linewidth=1, color='r', marker='o', markerfacecolor='blue', markersize=2) for a, b in zip(ax, eval_F1): plt.text(a, b, b, ha='center', va='bottom', fontsize=8) plt.savefig(args.output_dir + '/labeled.jpg') result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("more accurate model arises, now best F1 = ", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model, only save the model it-self model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if (step+1) / int(num_train_optimization_steps/10) > 9.5: print("=" * 80) print("End of training. 
Saving Model......") # Save a trained model, only save the model it-self model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if args.do_test == 'yes': start_time = time.time() print( '___________________now testing for best eval f1 model_________________________' ) try: del model except: pass gc.collect() args.do_train = 'no' model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) model.half() for layer in model.modules(): if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm): layer.float() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from " "https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() # print('test_logits=', logits) label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracy(logits, gold_labels)) elif flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) else: raise ValueError('flag not in [dev, test]') print('inference time usd = {}s'.format(time.time() - start_time)) '''
def run(): d = { 'image_id': os.listdir(config.TRAIN_IMAGE_PATH), 'mask_id': os.listdir(config.TRAIN_MASK_PATH) } df = pd.DataFrame(data=d) folds = df.copy() kf = KFold(n_splits=config.N_FOLDS, shuffle=True, random_state=42) for fold, (train_idx, valid_idx) in enumerate(kf.split(folds)): print(f'FOLD: {fold+1}/{config.N_FOLDS}') train_test = folds.iloc[train_idx] valid_test = folds.iloc[valid_idx] train_test.reset_index(drop=True, inplace=True) valid_test.reset_index(drop=True, inplace=True) train_dataset = dataset.HuBMAPDataset( train_test, transforms=transforms.transforms_train) train_loader = DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, shuffle=True, num_workers=config.NUM_WORKERS) valid_dataset = dataset.HuBMAPDataset( valid_test, transforms=transforms.transforms_valid) valid_loader = DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False, num_workers=config.NUM_WORKERS) loss_history = {"train": [], "valid": []} dice_history = {"train": [], "valid": []} jaccard_history = {"train": [], "valid": []} dice_max = 0.0 kernel_type = 'unext50' best_file = f'../drive/MyDrive/{kernel_type}_best_fold{fold}_strong_aug_70_epochs.bin' device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = UneXt50().to(device) optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.LR), alpha=0.5, k=5) # base_opt = optim.Adam(model.parameters(), lr=3e-4) # optimizer = SWA(base_opt) model, optimizer = amp.initialize(model, optimizer, opt_level="O1") scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, config.N_EPOCHS) # scheduler = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=config.WARMUP_EPO, after_scheduler=scheduler_cosine) loss_fn = metrics.symmetric_lovasz for epoch in range(config.N_EPOCHS): scheduler.step(epoch) avg_train_loss, train_dice_scores, train_jaccard_scores = engine.train_loop_fn( model, train_loader, optimizer, loss_fn, metrics.dice_coef_metric, metrics.jaccard_coef_metric, device) # if epoch > 10 and epoch % 5 == 0: # optimizer.update_swa() loss_history["train"].append(avg_train_loss) dice_history["train"].append(train_dice_scores) jaccard_history["train"].append(train_jaccard_scores) avg_val_loss, val_dice_scores, val_jaccard_scores = engine.val_loop_fn( model, valid_loader, optimizer, loss_fn, metrics.dice_coef_metric, metrics.jaccard_coef_metric, device) loss_history["valid"].append(avg_val_loss) dice_history["valid"].append(val_dice_scores) jaccard_history["valid"].append(val_jaccard_scores) print( f"Epoch: {epoch+1} | lr: {optimizer.param_groups[0]['lr']:.7f} | train loss: {avg_train_loss:.4f} | val loss: {avg_val_loss:.4f}" ) print( f"train dice: {train_dice_scores:.4f} | val dice: {val_dice_scores:.4f} | train jaccard: {train_jaccard_scores:.4f} | val jaccard: {val_jaccard_scores:.4f}" ) if val_dice_scores > dice_max: print('score2 ({:.6f} --> {:.6f}). Saving model ...'.format( dice_max, val_dice_scores)) torch.save(model.state_dict(), best_file) dice_max = val_dice_scores
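# ---------------------------------------------------------------------------
# run() above relies on metrics.dice_coef_metric / metrics.jaccard_coef_metric,
# which are not shown here. A minimal sketch of a thresholded Dice coefficient
# (the usual definition for binary segmentation) is given below; the project's
# own metric may differ in its thresholding and smoothing details.
import torch


def dice_coef_metric(probabilities: torch.Tensor,
                     truth: torch.Tensor,
                     threshold: float = 0.5,
                     eps: float = 1e-9) -> float:
    """Dice = 2*|X & Y| / (|X| + |Y|) computed on binarized predictions."""
    preds = (probabilities >= threshold).float()
    truth = truth.float()
    intersection = (preds * truth).sum()
    return ((2.0 * intersection + eps) / (preds.sum() + truth.sum() + eps)).item()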
if args.optimizer.lower()=='sgd': optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) if args.optimizer.lower()=='sgdwm': optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower()=='adam': optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'rmsprop': optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adagrad': optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars':#no tensorboardX from lars import LARS optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lamb': from lamb import Lamb optimizer = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': from novograd import NovoGrad optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'dyna': from dyna import Dyna optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
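# ---------------------------------------------------------------------------
# The if/elif chain above appears in several of these scripts. A sketch of the
# same selection wrapped in a small helper is shown below for illustration; it
# assumes the same radam module is importable and mirrors the branches above,
# with unrecognized names falling back to SGD with momentum.
import torch
import torch.optim as optim


def build_optimizer(name, params, args):
    name = name.lower()
    if name == 'sgd':
        return optim.SGD(params, lr=args.lr, weight_decay=args.weight_decay)
    if name == 'adam':
        return torch.optim.Adam(params, lr=args.lr, weight_decay=args.weight_decay)
    if name == 'rmsprop':
        return optim.RMSprop(params, lr=args.lr, momentum=args.momentum,
                             weight_decay=args.weight_decay)
    if name == 'adagrad':
        return optim.Adagrad(params, lr=args.lr, weight_decay=args.weight_decay)
    if name == 'radam':
        from radam import RAdam
        return RAdam(params, lr=args.lr, weight_decay=args.weight_decay)
    # 'sgdwm' and any unknown name: SGD with momentum, as in the final else branch.
    return optim.SGD(params, lr=args.lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)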
def train(validate, n_gpus, rank, output_directory, epochs, optim_algo, learning_rate, weight_decay, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path, ignore_layers, include_layers, finetune_layers, warmstart_checkpoint_path, with_tensorboard, grad_clip_val, fp16_run): fp16_run = bool(fp16_run) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpus > 1: init_distributed(rank, n_gpus, **dist_config) criterion = FlowtronLoss(sigma, bool(model_config['n_components']), bool(model_config['use_gate_layer'])) model = Flowtron(**model_config).cuda() if len(finetune_layers): for name, param in model.named_parameters(): if name in finetune_layers: param.requires_grad = True else: param.requires_grad = False print("Initializing %s optimizer" % (optim_algo)) if optim_algo == 'Adam': optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) elif optim_algo == 'RAdam': optimizer = RAdam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) else: print("Unrecognized optimizer %s!" % (optim_algo)) exit(1) # Load checkpoint if one exists iteration = 0 if warmstart_checkpoint_path != "": model = warmstart(warmstart_checkpoint_path, model) if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, ignore_layers) iteration += 1 # next iteration is iteration + 1 if n_gpus > 1: model = apply_gradient_allreduce(model) print(model) scaler = amp.GradScaler(enabled=fp16_run) train_loader, valset, collate_fn = prepare_dataloaders( data_config, n_gpus, batch_size) # Get shared output_directory ready if rank == 0 and not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("Output directory", output_directory) if with_tensorboard and rank == 0: tboard_out_path = os.path.join(output_directory, 'logs') print("Setting up Tensorboard log in %s" % (tboard_out_path)) logger = FlowtronLogger(tboard_out_path) # force set the learning rate to what is specified for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for batch in train_loader: try: model.zero_grad() mel, embeds, text, in_lens, out_lens, gate_target, attn_prior = batch mel, embeds, text = mel.cuda(), embeds.cuda(), text.cuda() in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda() attn_prior = attn_prior.cuda() if valset.use_attn_prior else None with amp.autocast(enabled=fp16_run): z, log_s_list, gate_pred, attn, mean, log_var, prob = model( mel, embeds, text, in_lens, out_lens, attn_prior) loss_nll, loss_gate = criterion( (z, log_s_list, gate_pred, mean, log_var, prob), gate_target, out_lens) loss = loss_nll + loss_gate if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item() reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item() else: reduced_loss = loss.item() reduced_gate_loss = loss_gate.item() reduced_nll_loss = loss_nll.item() scaler.scale(loss).backward() if grad_clip_val > 0: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) scaler.step(optimizer) scaler.update() if rank == 0: print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('training_loss_gate', reduced_gate_loss, iteration) logger.add_scalar('training_loss_nll', reduced_nll_loss, iteration) logger.add_scalar('learning_rate', learning_rate, iteration) if iteration % iters_per_checkpoint == 0: if validate: val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss( model, criterion, valset, collate_fn, batch_size, n_gpus) if rank == 0: print("Validation loss {}: {:9f} ".format(iteration, val_loss)) if with_tensorboard: logger.log_validation( val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target, iteration) checkpoint_path = "{}/model_{}.pt".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 except: print("SOMETHING BAD HAPPENED")
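# ---------------------------------------------------------------------------
# reduce_tensor() above (used to average the loss across ranks) is not shown
# in this excerpt. A minimal sketch of the conventional implementation is
# given below; the helper actually used by the codebase may differ slightly.
import torch
import torch.distributed as dist


def reduce_tensor(tensor, n_gpus):
    """Sum a tensor across all ranks and divide by the number of GPUs."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt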
optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9) elif args.optim == '1cycle': optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9) scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=args.max_lr, steps_per_epoch=len(train_data), epochs=args.max_epochs) elif args.optim == 'radam': optimizer = RAdam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9) elif args.optim == 'schedule': optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9) scheduler = optim.lr_scheduler.ReduceLROnPlateau( optimizer, factor=args.schedule_factor, patience=args.schedule_patience) # **************** TRAINING ****************** print('Training starts...') alignment = None
def main(): n_envs = len(os.sched_getaffinity(0)) factory = FallingEnvFactory() # factory = HalfCheetahEnvFactory() # factory = HumanoidFallingEnvFactory() env: Env = factory.make_env() envs: VectorEnv = AsyncVectorEnv([factory.make_env for _ in range(n_envs)]) env_container = EnvContainer(env, envs) state_dim, = env.observation_space.shape action_dim, = env.action_space.shape relu = nn.ReLU() tanh = nn.Tanh() identity = nn.Identity() actor = ProbMLPConstantLogStd(state_dim, action_dim, [256, 256], relu, tanh, -1.0) critic = MultiLayerPerceptron(state_dim, 1, [256, 256], relu, identity) scaler_ = StandardScaler() print("Fit scaler") env.reset() state_seq = [] for _ in tqdm(range(512)): action = env.action_space.sample() state, _, done, _ = env.step(action) state_seq.append(state) if done: env.reset() state_seq = np.stack(state_seq) scaler_.fit(state_seq) scaler = ScalerNet(scaler_) module_dict = ModuleDict() module_dict.set(ModuleKey.actor, actor) module_dict.set(ModuleKey.scaler, scaler) module_dict.set(ModuleKey.critic, critic) action_getter: ActionGetter = ActionGetterModule(actor, scaler) sample_collector: SampleCollector = SampleCollectorV0(env_container, action_getter, 2048, 1) mse_loss = nn.MSELoss() critic_tensor_inserter: TensorInserter = \ TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \ TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \ TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \ TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \ TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic, TensorKey.cumulative_reward_predictions_tensor) critic_loss_calculator: LossCalculator = \ LossCalculatorInputTarget(TensorKey.cumulative_reward_predictions_tensor, TensorKey.cumulative_rewards_tensor, mse_loss) actor_tensor_inserter: TensorInserter = \ TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \ TensorInserterTensorize(ArrayKey.actions, TensorKey.actions_tensor) + \ TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \ TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \ TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \ TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic, TensorKey.cumulative_reward_predictions_tensor) + \ TensorInserterLambda([TensorKey.cumulative_rewards_tensor, TensorKey.cumulative_reward_predictions_tensor], lambda x, y: x - y, TensorKey.advantages_tensor) + \ TensorInserterModuleLambda(ModuleKey.actor, [TensorKey.states_tensor, TensorKey.actions_tensor], lambda actor, state, action: actor.get_log_prob(state, action), TensorKey.new_log_probs_tensor) + \ TensorInserterLambda([TensorKey.new_log_probs_tensor, TensorKey.log_probs_tensor, TensorKey.advantages_tensor], get_ppo_surrogate_tensor, TensorKey.ppo_surrogates_tensor) actor_loss_calculator: LossCalculator = \ LossCalculatorLambda([TensorKey.ppo_surrogates_tensor], lambda x: -torch.mean(x)) actor_optimizer = RAdam(params=actor.parameters(), lr=3e-4) actor_updater: ModuleUpdater = ModuleUpdaterOptimizer(actor_optimizer) critic_optimizer = RAdam(params=critic.parameters(), lr=3e-4) critic_updater: ModuleUpdater = ModuleUpdaterOptimizer(critic_optimizer) actor_trainee = Trainee([actor], actor_updater, actor_tensor_inserter, actor_loss_calculator, 10) critic_trainee = Trainee([critic], critic_updater, critic_tensor_inserter, 
critic_loss_calculator, 10) trainer = RLTrainer(sample_collector, [critic_trainee, actor_trainee], 100000, 128) trainer.train(module_dict)
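# ---------------------------------------------------------------------------
# get_ppo_surrogate_tensor() above is not shown in this excerpt. Its name
# suggests the standard PPO clipped surrogate; the sketch below is written
# under that assumption (the clip ratio of 0.2 is chosen for illustration).
import torch


def get_ppo_surrogate_tensor(new_log_probs: torch.Tensor,
                             old_log_probs: torch.Tensor,
                             advantages: torch.Tensor,
                             clip_eps: float = 0.2) -> torch.Tensor:
    """min(r * A, clip(r, 1-eps, 1+eps) * A) with r = exp(new_logp - old_logp)."""
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return torch.min(ratio * advantages, clipped * advantages)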
def main(lr=0.1): global best_acc args.lr = lr device = 'cuda' if torch.cuda.is_available() else 'cpu' best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data print('==> Preparing data..') transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=True, download=True, transform=transform_train) trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=False, download=True, transform=transform_test) testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') # Model print('==> Building model..') # net = VGG('VGG19') # net = ResNet18() # net = PreActResNet18() # net = GoogLeNet() # net = DenseNet121() # net = ResNeXt29_2x64d() # net = MobileNet() # net = MobileNetV2() # net = DPN92() # net = ShuffleNetG2() # net = SENet18() # net = ShuffleNetV2(1) # net = EfficientNetB0() # net = RegNetX_200MF() net = ResNet50() net = net.to(device) if device == 'cuda': net = torch.nn.DataParallel(net) cudnn.benchmark = True ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth' if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isdir( 'checkpoint'), 'Error: no checkpoint directory found!' 
checkpoint = torch.load(ckpt) net.load_state_dict(checkpoint['net']) best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] criterion = nn.CrossEntropyLoss() if args.optimizer.lower() == 'sgd': optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) if args.optimizer.lower() == 'sgdwm': optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adam': optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'rmsprop': optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adagrad': optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars': #no tensorboardX from lars import LARS optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lamb': from lamb import Lamb optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': from novograd import NovoGrad optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay) # lr_scheduler = LambdaLR(optimizer,lrs) # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1) train_acc = [] valid_acc = [] # Training def train(epoch): print('\nEpoch: %d' % epoch) net.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(trainloader): print(batch_idx) inputs, targets = inputs.to(device), targets.to(device) optimizer.zero_grad() outputs = net(inputs) loss = criterion(outputs, targets) loss.backward() optimizer.step() # lr_scheduler.step() train_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() print(100. * correct / total) train_acc.append(correct / total) def test(epoch): global best_acc net.eval() test_loss = 0 correct = 0 total = 0 print('test') with torch.no_grad(): for batch_idx, (inputs, targets) in enumerate(testloader): print(batch_idx) inputs, targets = inputs.to(device), targets.to(device) outputs = net(inputs) loss = criterion(outputs, targets) test_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() # Save checkpoint. acc = 100. 
* correct / total print(acc) valid_acc.append(correct / total) if acc > best_acc: print('Saving..') state = { 'net': net.state_dict(), 'acc': acc, 'epoch': epoch, } if not os.path.isdir('checkpoint'): os.mkdir('checkpoint') torch.save(state, ckpt) best_acc = acc for epoch in range(200): if epoch in args.lr_decay: checkpoint = torch.load(ckpt) net.load_state_dict(checkpoint['net']) best_acc = checkpoint['acc'] args.lr *= 0.1 if args.optimizer.lower() == 'sgd': optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) if args.optimizer.lower() == 'sgdwm': optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adam': optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'rmsprop': optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adagrad': optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars': # no tensorboardX optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, dampening=args.damping) elif args.optimizer.lower() == 'lamb': optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) train(epoch) test(epoch) file = open(args.optimizer + str(lr) + 'log.json', 'w+') json.dump([train_acc, valid_acc], file) return best_acc
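# ---------------------------------------------------------------------------
# main() above takes the learning rate as an argument and returns the best
# test accuracy, so it lends itself to a simple sweep. The loop below is only
# an illustrative usage sketch: the grid of learning rates is arbitrary and
# the original script may already have its own entry point.
if __name__ == '__main__':
    results = {}
    for lr in (0.1, 0.01, 0.001):
        results[lr] = main(lr=lr)
    print(results)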
    return batch_1D, idxs, batch_truths_1D, idxs_truths, steps


print('Using device:', args.device)

model = ModifiedUNet(args, in_channels=1, out_channels=1,
                     bottleneck_out=None, init_features=32).to(args.device)

now = datetime.datetime.now()
dir_name = now.strftime("%B_%d_at_%H_%M_%p")
save_dir = './save/' + dir_name
imgs_dir = './imgs/' + dir_name

if args.optimizer == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
else:
    optimizer = RAdam(model.parameters(), args.lr)

if args.loss == 'mse':
    loss_fn = torch.nn.MSELoss()
else:
    loss_fn = torch.nn.BCELoss()

# loading checkpoint
if args.load_path:
    files = os.listdir(args.load_path)
    files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))
    last_path = os.path.join(args.load_path, files[-1])
    checkpoint = torch.load(last_path)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
optimizer = torch.optim.Adam(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'rmsprop': optimizer = optim.RMSprop(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars': #no tensorboardX from lars import LARS optimizer = LARS(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lamb': from lamb import Lamb optimizer = Lamb(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': from novograd import NovoGrad optimizer = NovoGrad(model.parameters(),
def main(): parser = argparse.ArgumentParser() parser.add_argument('--bs', metavar='bs', type=int, default=2) parser.add_argument('--path', type=str, default='../../data') parser.add_argument('--results', type=str, default='../../results/model') parser.add_argument('--nw', type=int, default=0) parser.add_argument('--max_images', type=int, default=None) parser.add_argument('--val_size', type=int, default=None) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--lr', type=float, default=0.003) parser.add_argument('--lr_decay', type=float, default=0.99997) parser.add_argument('--kernel_lvl', type=float, default=1) parser.add_argument('--noise_lvl', type=float, default=1) parser.add_argument('--motion_blur', type=bool, default=False) parser.add_argument('--homo_align', type=bool, default=False) parser.add_argument('--resume', type=bool, default=False) args = parser.parse_args() print() print(args) print() if not os.path.isdir(args.results): os.makedirs(args.results) PATH = args.results if not args.resume: f = open(PATH + "/param.txt", "a+") f.write(str(args)) f.close() writer = SummaryWriter(PATH + '/runs') # CUDA for PyTorch use_cuda = torch.cuda.is_available() device = torch.device('cuda:0' if use_cuda else "cpu") # Parameters params = {'batch_size': args.bs, 'shuffle': True, 'num_workers': args.nw} # Generators print('Initializing training set') training_set = Dataset(args.path + '/train/', args.max_images, args.kernel_lvl, args.noise_lvl, args.motion_blur, args.homo_align) training_generator = data.DataLoader(training_set, **params) print('Initializing validation set') validation_set = Dataset(args.path + '/test/', args.val_size, args.kernel_lvl, args.noise_lvl, args.motion_blur, args.homo_align) validation_generator = data.DataLoader(validation_set, **params) # Model model = UNet(in_channel=3, out_channel=3) if args.resume: models_path = get_newest_model(PATH) print('loading model from ', models_path) model.load_state_dict(torch.load(models_path)) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) model.to(device) # Loss + optimizer criterion = BurstLoss() optimizer = RAdam(model.parameters(), lr=args.lr) scheduler = StepLR(optimizer, step_size=8 // args.bs, gamma=args.lr_decay) if args.resume: n_iter = np.loadtxt(PATH + '/train.txt', delimiter=',')[:, 0][-1] else: n_iter = 0 # Loop over epochs for epoch in range(args.epochs): train_loss = 0.0 # Training model.train() for i, (X_batch, y_labels) in enumerate(training_generator): # Alter the burst length for each mini batch burst_length = np.random.randint(2, 9) X_batch = X_batch[:, :burst_length, :, :, :] # Transfer to GPU X_batch, y_labels = X_batch.to(device).type( torch.float), y_labels.to(device).type(torch.float) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize pred = model(X_batch) loss = criterion(pred, y_labels) loss.backward() optimizer.step() scheduler.step() train_loss += loss.detach().cpu().numpy() writer.add_scalar('training_loss', loss.item(), n_iter) if i % 100 == 0 and i > 0: loss_printable = str(np.round(train_loss, 2)) f = open(PATH + "/train.txt", "a+") f.write(str(n_iter) + "," + loss_printable + "\n") f.close() print("training loss ", loss_printable) train_loss = 0.0 if i % 1000 == 0: if torch.cuda.device_count() > 1: torch.save( model.module.state_dict(), os.path.join(PATH, 'model_' + str(int(n_iter)) + '.pt')) else: torch.save( model.state_dict(), os.path.join(PATH, 'model_' + 
str(int(n_iter)) + '.pt')) if i % 1000 == 0: # Validation val_loss = 0.0 with torch.set_grad_enabled(False): model.eval() for v, (X_batch, y_labels) in enumerate(validation_generator): # Alter the burst length for each mini batch burst_length = np.random.randint(2, 9) X_batch = X_batch[:, :burst_length, :, :, :] # Transfer to GPU X_batch, y_labels = X_batch.to(device).type( torch.float), y_labels.to(device).type(torch.float) # forward + backward + optimize pred = model(X_batch) loss = criterion(pred, y_labels) val_loss += loss.detach().cpu().numpy() if v < 5: im = make_im(pred, X_batch, y_labels) writer.add_image('image_' + str(v), im, n_iter) writer.add_scalar('validation_loss', val_loss, n_iter) loss_printable = str(np.round(val_loss, 2)) print('validation loss ', loss_printable) f = open(PATH + "/eval.txt", "a+") f.write(str(n_iter) + "," + loss_printable + "\n") f.close() n_iter += args.bs
def train(args): print('start training...') model, model_file = create_model(args) train_loader, val_loader = get_train_val_loaders( batch_size=args.batch_size, val_batch_size=args.val_batch_size) train_loader = get_frame_train_loader(batch_size=args.batch_size) #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) if args.optim == 'Adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0001) elif args.optim == 'RAdam': optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001) else: optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0001) if args.lrs == 'plateau': lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=args.factor, patience=args.patience, min_lr=args.min_lr) else: lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr) model = model.cuda() if torch.cuda.device_count() > 1: model_name = model.name model = DataParallel(model) model.name = model_name #model=model.train() best_f2 = 99999. best_key = 'loss' print( 'epoch | lr | % | loss | avg | loss | 0.01 | 0.20 | 0.50 | best | time | save |' ) if not args.no_first_val: val_metrics = validate(args, model, val_loader) print( 'val | | | | | {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} | | |' .format(val_metrics['loss'], val_metrics['f2_th_0.01'], val_metrics['f2_th_0.20'], val_metrics['f2_th_0.50'], val_metrics[best_key])) best_f2 = val_metrics[best_key] if args.val: return model.train() if args.lrs == 'plateau': lr_scheduler.step(best_f2) else: lr_scheduler.step() train_iter = 0 for epoch in range(args.start_epoch, args.num_epochs): #train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size, val_num=args.val_num) train_loss = 0 current_lr = get_lrs(optimizer) bg = time.time() for batch_idx, data in enumerate(train_loader): train_iter += 1 if train_loader.seg: rgb, audio, labels = [x.cuda() for x in data] else: rgb, audio, labels = data[0].cuda(), data[2].cuda( ), data[4].cuda() output = model(rgb, audio) loss = criterion(output, labels) batch_size = rgb.size(0) loss.backward() optimizer.step() optimizer.zero_grad() #with amp.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() train_loss += loss.item() print('\r {:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format( epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1), train_loader.num, loss.item(), train_loss / (batch_idx + 1)), end='') if train_iter > 0 and train_iter % args.iter_val == 0: if isinstance(model, DataParallel): torch.save(model.module.state_dict(), model_file + '_latest') else: torch.save(model.state_dict(), model_file + '_latest') val_metrics = validate(args, model, val_loader) _save_ckp = '' if args.always_save or val_metrics[best_key] < best_f2: best_f2 = val_metrics[best_key] if isinstance(model, DataParallel): torch.save(model.module.state_dict(), model_file) else: torch.save(model.state_dict(), model_file) _save_ckp = '*' print( ' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |' .format(val_metrics['loss'], val_metrics['f2_th_0.01'], val_metrics['f2_th_0.20'], val_metrics['f2_th_0.50'], best_f2, (time.time() - bg) / 60, _save_ckp)) model.train() if args.lrs == 'plateau': lr_scheduler.step(best_f2) else: lr_scheduler.step() current_lr = get_lrs(optimizer)
def get_reward_model_optimizer(params, *, model_lr, model_weight_decay):
    return RAdam(params, lr=model_lr, weight_decay=model_weight_decay)
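# ---------------------------------------------------------------------------
# Illustrative call of the factory above; the tiny placeholder model and the
# hyperparameter values are made up for the example, not taken from the
# original code.
import torch.nn as nn

reward_model = nn.Linear(16, 1)
optimizer = get_reward_model_optimizer(reward_model.parameters(),
                                       model_lr=3e-4,
                                       model_weight_decay=1e-4)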
print("Load duration : {}".format(time.time()-st)) print("[!] load data end") # torch.cuda.set_device(4) # torch.distributed.init_process_group(backend='nccl', # init_method='env://') model = FCAE() model.cuda() # criterion = nn.MSELoss() critierion = CosineDistanceLoss() # criterion.cuda() # optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate) optimizer = RAdam(model.parameters(), lr= learning_rate) optimizer = Lookahead(optimizer, alpha=0.5,k=5) opt_level = 'O1' # assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." model,optimizer = amp.initialize(model,optimizer,opt_level = opt_level) scheduler = StepLR(optimizer,step_size=50,gamma = 0.1) model = nn.DataParallel(model) print("[*] training ...") log = open(os.path.join(save_path,'log.csv'), 'w', encoding='utf-8', newline='') log_writer = csv.writer(log) best_val_loss = 100
def train(config, num_classes=1108): model = model_whale(num_classes=num_classes, inchannels=6, model_name=config.train.model_name, pretrained=config.train.pretrained).cuda() if config.train.freeze: model.freeze() base_opt = RAdam(model.parameters(), lr=config.train.lr) optimizer = Lookahead(base_opt) # optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr, betas=(0.9, 0.99), weight_decay=0.0002) resultDir = config.train.result_dir checkPoint = join(resultDir, 'checkpoint') # if not config.train.in_colab: # os.makedirs(checkPoint, exist_ok=True) train_dataset = CustomDataset(config.train.csv_file, config.train.img_dir, transforms=transforms['train']) dataset_size = len(train_dataset) indices = list(range(dataset_size)) split = int(np.floor(config.train.validation_split * dataset_size)) if config.train.shuffle_dataset: np.random.seed(config.train.random_seed) np.random.shuffle(indices) train_indices, val_indices = indices[split:], indices[:split] # Creating PT data samplers and loaders: train_sampler = SubsetRandomSampler(train_indices) valid_sampler = SubsetRandomSampler(val_indices) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.train.batch_size, sampler=train_sampler, num_workers=config.train.num_workers) validation_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.train.batch_size, sampler=valid_sampler, num_workers=config.train.num_workers) train_loss = 0. # load from cpk: if config.train.load_cpk: model.load_pretrain(os.path.join( checkPoint, '%08d_model.pth' % (config.train.start_epoch)), skip=[]) cpk = torch.load( os.path.join(checkPoint, '%08d_optimizer.pth' % (config.train.start_epoch))) optimizer.load_state_dict(cpk['optimizer']) adjust_learning_rate(optimizer, config.train.lr) start_epoch = cpk['epoch'] else: start_epoch = 0 top1_batch, map5_batch = 0, 0 for epoch in range(start_epoch + 1, config.train.epochs): print('Starting:', epoch, 'Iterations:', len(train_loader)) for i, data in enumerate(train_loader): model.train() model.mode = 'train' images, labels = data images = images.cuda() labels = labels.cuda().long() global_feat, local_feat, results = data_parallel(model, images) model.getLoss(global_feat, local_feat, results, labels, config, verbose=(i % config.loss.verbose_interval == 0)) batch_loss = model.loss optimizer.zero_grad() batch_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2) optimizer.step() results = torch.sigmoid(results) train_loss += batch_loss.data.cpu().numpy() top1_batch += accuracy(results, labels, topk=[1])[0] map5_batch += mapk(labels, results, k=5) if i % config.train.verbose_interval == 0: print( 'epoch: %03d, iter: %05d, train_loss: %f, top1_batch: %f, map5_batch: %f' % (epoch, i, float(train_loss / config.train.verbose_interval), float(top1_batch / config.train.verbose_interval), float(map5_batch / config.train.verbose_interval))) # print(f'epoch: {epoch}, iter: {i}, train_loss: {float(train_loss / config.train.verbose_interval)}, top1_batch: {float(top1_batch / config.train.verbose_interval)}, map5_batch: {float(map5_batch / config.train.verbose_interval)}') train_loss, top1_batch, map5_batch = 0, 0, 0 valid_loss, top1_valid, map5_valid = valid_eval( config, model, validation_loader) print( 'epoch: %03d, iter: %05d, valid_loss: %f, valid_top1_batch: %f, valid_map5_batch: %f' % (epoch, i, valid_loss, top1_valid, map5_valid)) # print(f'epoch: {epoch}, iter: {i}, valid_loss: {valid_loss}, top1_batch: {top1_valid}, map5_batch: {map5_valid}') if 
epoch % config.train.save_period == 0: os.system("touch " + resultDir + "/checkpoint/%08d_model.pth" % (epoch)) os.system("touch " + resultDir + "/checkpoint/%08d_optimizer.pth" % (epoch)) time.sleep(1) torch.save(model.state_dict(), resultDir + '/checkpoint/%08d_model.pth' % (epoch)) torch.save({ 'optimizer': optimizer.state_dict(), 'epoch': epoch, }, resultDir + '/checkpoint/%08d_optimizer.pth' % (epoch))
    def __init__(self, TasNET, batch_size, checkpoint="checkpoint",
                 log_folder="./log", rnn_arch="LSTM", optimizer='radam',
                 rerun_mode=False, lr=1e-5, momentum=0.9, weight_decay=0,
                 num_epoches=20, clip_norm=False, sr=8000, cudnnBenchmark=True):
        logger.info('---Experiment Variables---')
        logger.info('RNN Architecture: ' + rnn_arch)
        logger.info('Batch Size : ' + str(batch_size))
        logger.info('Optimizer : ' + optimizer)
        logger.info('--------------------------\n')
        logger.info('Rerun mode: ' + str(rerun_mode))

        self.TasNET = TasNET
        self.log_folder = log_folder
        self.writer = SummaryWriter(log_folder)
        self.all_log = 'all_log.log'  # all-log filename
        self.log('Progress Log save path: ' + log_folder)
        self.log("TasNET:\n{}".format(self.TasNET))

        if type(lr) is str:
            lr = float(lr)
            logger.info("Transform lr from str to float => {}".format(lr))
        self.log('Batch size used: ' + str(batch_size))

        if optimizer == 'radam':
            self.optimizer = RAdam(self.TasNET.parameters(), lr=lr,
                                   weight_decay=weight_decay)
        else:
            self.optimizer = torch.optim.Adam(self.TasNET.parameters(), lr=lr,
                                              weight_decay=weight_decay)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', factor=0.5, patience=3, verbose=True)

        self.TasNET.to(device)
        self.checkpoint = checkpoint
        self.log('Model save path: ' + checkpoint)
        self.num_epoches = num_epoches
        self.clip_norm = clip_norm
        self.sr = sr
        if self.clip_norm:
            self.log("Clip gradient by 2-norm {}".format(clip_norm))
        if not os.path.exists(self.checkpoint):
            os.makedirs(checkpoint)

        torch.backends.cudnn.benchmark = cudnnBenchmark
        self.log('cudnn benchmark status: ' + str(torch.backends.cudnn.benchmark) + '\n')
def __init__(self, log_dir, cfg): self.path = log_dir self.cfg = cfg if cfg.TRAIN.FLAG: self.model_dir = os.path.join(self.path, 'Model') self.log_dir = os.path.join(self.path, 'Log') mkdir_p(self.model_dir) mkdir_p(self.log_dir) self.writer = SummaryWriter(log_dir=self.log_dir) sys.stdout = Logger(logfile=os.path.join(self.path, "logfile.log")) self.data_dir = cfg.DATASET.DATA_DIR self.max_epochs = cfg.TRAIN.MAX_EPOCHS self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL s_gpus = cfg.GPU_ID.split(',') self.gpus = [int(ix) for ix in s_gpus] self.num_gpus = len(self.gpus) self.batch_size = cfg.TRAIN.BATCH_SIZE self.lr = cfg.TRAIN.LEARNING_RATE torch.cuda.set_device(self.gpus[0]) cudnn.benchmark = True # load dataset cogent = cfg.DATASET.congent if not args.eval and not args.test: self.dataset = ClevrDataset(data_dir=self.data_dir, split="train" + cogent) self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=True, num_workers=cfg.WORKERS, drop_last=False, collate_fn=collate_fn) self.dataset_val = ClevrDataset(data_dir=self.data_dir, split="val" + cogent) self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=256, drop_last=False, shuffle=False, num_workers=cfg.WORKERS, collate_fn=collate_fn) # load model self.vocab = load_vocab(cfg) self.model, self.model_ema = mac.load_MAC(cfg, self.vocab) self.weight_moving_average(alpha=0) if cfg.TRAIN.RADAM: self.optimizer = RAdam(self.model.parameters(), lr=self.lr) else: self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr) self.previous_best_acc = 0.0 self.previous_best_epoch = 0 self.total_epoch_loss = 0 self.prior_epoch_loss = 10 self.print_info() self.loss_fn = torch.nn.CrossEntropyLoss().cuda() def print_info(self): print('Using config:') pprint.pprint(self.cfg) print("\n") pprint.pprint("Size of dataset: {}".format(len(self.dataset))) print("\n") print("Using MAC-Model:") pprint.pprint(self.model) print("\n") def weight_moving_average(self, alpha=0.999): for param1, param2 in zip(self.model_ema.parameters(), self.model.parameters()): param1.data *= alpha param1.data += (1.0 - alpha) * param2.data def set_mode(self, mode="train"): if mode == "train": self.model.train() self.model_ema.train() else: self.model.eval() self.model_ema.eval() def reduce_lr(self): epoch_loss = self.total_epoch_loss # / float(len(self.dataset) // self.batch_size) lossDiff = self.prior_epoch_loss - epoch_loss if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \ (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \ (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)): self.lr *= 0.5 print("Reduced learning rate to {}".format(self.lr)) for param_group in self.optimizer.param_groups: param_group['lr'] = self.lr self.prior_epoch_loss = epoch_loss self.total_epoch_loss = 0 def save_models(self, iteration): save_model(self.model, self.optimizer, iteration, self.model_dir, model_name="model") save_model(self.model_ema, None, iteration, self.model_dir, model_name="model_ema") def train_epoch(self, epoch): cfg = self.cfg total_loss = 0 total_correct = 0 total_samples = 0 self.labeled_data = iter(self.dataloader) self.set_mode("train") dataset = tqdm(self.labeled_data, total=len(self.dataloader)) for data in dataset: ###################################################### # (1) Prepare training data ###################################################### image, question, question_len, answer = data['image'], data['question'], 
data['question_length'], data['answer'] answer = answer.long() question = Variable(question) answer = Variable(answer) if cfg.CUDA: image = image.cuda() question = question.cuda() answer = answer.cuda().squeeze() else: question = question image = image answer = answer.squeeze() ############################ # (2) Train Model ############################ self.optimizer.zero_grad() scores = self.model(image, question, question_len) loss = self.loss_fn(scores, answer) loss.backward() if self.cfg.TRAIN.CLIP_GRADS: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.TRAIN.CLIP) self.optimizer.step() self.weight_moving_average() ############################ # (3) Log Progress ############################ correct = scores.detach().argmax(1) == answer total_correct += correct.sum().cpu().item() total_loss += loss.item() * answer.size(0) total_samples += answer.size(0) avg_loss = total_loss / total_samples train_accuracy = total_correct / total_samples # accuracy = correct.sum().cpu().numpy() / answer.shape[0] # if avg_loss == 0: # avg_loss = loss.item() # train_accuracy = accuracy # else: # avg_loss = 0.99 * avg_loss + 0.01 * loss.item() # train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy # self.total_epoch_loss += loss.item() * answer.size(0) dataset.set_description( 'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(epoch + 1, avg_loss, train_accuracy) ) self.total_epoch_loss = avg_loss print(self.total_epoch_loss) dict = { "avg_loss": avg_loss, "train_accuracy": train_accuracy } return dict def train(self): cfg = self.cfg print("Start Training") for epoch in range(self.max_epochs): dict = self.train_epoch(epoch) self.reduce_lr() self.log_results(epoch, dict) if cfg.TRAIN.EALRY_STOPPING: if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch: break self.save_models(self.max_epochs) self.writer.close() print("Finished Training") print("Highest validation accuracy: {} at epoch {}") def log_results(self, epoch, dict, max_eval_samples=None): epoch += 1 self.writer.add_scalar("avg_loss", dict["avg_loss"], epoch) self.writer.add_scalar("train_accuracy", dict["train_accuracy"], epoch) val_accuracy, val_accuracy_ema = self.calc_accuracy("validation", max_samples=max_eval_samples) self.writer.add_scalar("val_accuracy_ema", val_accuracy_ema, epoch) self.writer.add_scalar("val_accuracy", val_accuracy, epoch) print("Epoch: {}\tVal Acc: {},\tVal Acc EMA: {},\tAvg Loss: {},\tLR: {}". 
format(epoch, val_accuracy, val_accuracy_ema, dict["avg_loss"], self.lr)) if val_accuracy > self.previous_best_acc: self.previous_best_acc = val_accuracy self.previous_best_epoch = epoch if epoch % self.snapshot_interval == 0: self.save_models(epoch) def calc_accuracy(self, mode="train", max_samples=None): self.set_mode("validation") if mode == "train": loader = self.dataloader # num_imgs = len(self.dataset) elif mode == "validation": loader = self.dataloader_val # num_imgs = len(self.dataset_val) # batch_size = 256 # total_iters = num_imgs // batch_size # if max_samples is not None: # max_iter = max_samples // batch_size # else: # max_iter = None # all_accuracies = [] total_correct = 0 total_correct_ema = 0 total_samples = 0 # all_accuracies_ema = [] for data in tqdm(loader, total=len(loader)): # try: # data = next(eval_data) # except StopIteration: # break # if max_iter is not None and _iteration == max_iter: # break image, question, question_len, answer = data['image'], data['question'], data['question_length'], data['answer'] answer = answer.long() question = Variable(question) answer = Variable(answer) if self.cfg.CUDA: image = image.cuda() question = question.cuda() answer = answer.cuda().squeeze() with torch.no_grad(): scores = self.model(image, question, question_len) scores_ema = self.model_ema(image, question, question_len) correct_ema = scores_ema.detach().argmax(1) == answer total_correct_ema += correct_ema.sum().cpu().item() # accuracy_ema = correct_ema.sum().cpu().numpy() / answer.shape[0] # all_accuracies_ema.append(accuracy_ema) correct = scores.detach().argmax(1) == answer total_correct += correct.sum().cpu().item() # accuracy = correct.sum().cpu().numpy() / answer.shape[0] # all_accuracies.append(accuracy) total_samples += answer.size(0) accuracy_ema = total_correct_ema / total_samples accuracy = total_correct / total_samples return accuracy, accuracy_ema
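# --- Stand-alone sketch (illustrative, not part of the original trainer) of the
# exponential-moving-average update performed by weight_moving_average() above:
# ema_param <- alpha * ema_param + (1 - alpha) * param. The model is a placeholder;
# alpha=0.999 matches the default used by the trainer.
import copy
import torch
import torch.nn as nn

model = nn.Linear(8, 2)
model_ema = copy.deepcopy(model)

def update_ema(model_ema, model, alpha=0.999):
    with torch.no_grad():
        for p_ema, p in zip(model_ema.parameters(), model.parameters()):
            p_ema.mul_(alpha).add_(p, alpha=1.0 - alpha)

# Calling with alpha=0 copies the live weights into the EMA model, which is how the
# trainer above initializes model_ema (self.weight_moving_average(alpha=0)).
update_ema(model_ema, model, alpha=0)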
def get_optimizer(model, lr):
    return RAdam(model.parameters(), lr=lr, weight_decay=1e-5)
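# --- Minimal usage sketch for get_optimizer() above; the model and learning rate
# here are placeholders, not values from the original code.
import torch.nn as nn

model = nn.Linear(128, 10)
optimizer = get_optimizer(model, lr=3e-4)
# weight_decay=1e-5 applies mild L2 regularization to every parameter; use separate
# parameter groups if biases or normalization layers should be excluded.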
class DDPG(nn.Module): def __init__( self, d_state, d_action, device, gamma, tau, policy_lr, value_lr, value_loss, value_n_layers, value_n_units, value_activation, policy_n_layers, policy_n_units, policy_activation, grad_clip, policy_noise=0.2, noise_clip=0.5, expl_noise=0.1, tdg_error_weight=0, td_error_weight=1, ): super().__init__() self.actor = Actor(d_state, d_action, policy_n_layers, policy_n_units, policy_activation).to(device) self.actor_target = copy.deepcopy(self.actor) self.actor_optimizer = RAdam(self.actor.parameters(), lr=policy_lr) self.critic = ActionValueFunction(d_state, d_action, value_n_layers, value_n_units, value_activation).to(device) self.critic_target = copy.deepcopy(self.critic) self.critic_optimizer = RAdam(self.critic.parameters(), lr=value_lr) self.discount = gamma self.tau = tau self.policy_noise = policy_noise self.noise_clip = noise_clip self.expl_noise = expl_noise self.normalizer = None self.value_loss = value_loss self.grad_clip = grad_clip self.device = device self.tdg_error_weight = tdg_error_weight self.td_error_weight = td_error_weight self.step_counter = 0 def setup_normalizer(self, normalizer): self.normalizer = copy.deepcopy(normalizer) def get_action(self, states, deterministic=False): states = states.to(self.device) if self.normalizer is not None: states = self.normalizer.normalize_states(states) actions = self.actor(states) if not deterministic: actions = actions + torch.randn_like(actions) * self.expl_noise return actions.clamp(-1, +1) def get_action_with_logp(self, states): states = states.to(self.device) if self.normalizer is not None: states = self.normalizer.normalize_states(states) a = self.actor(states) return a, torch.ones( a.shape[0], device=a.device) * np.inf # inf: should not be used def get_action_value(self, states, actions): if self.normalizer is not None: states = self.normalizer.normalize_states(states) with torch.no_grad(): states = states.to(self.device) actions = actions.to(self.device) return self.critic(states, actions)[0] # just q1 def update(self, states, actions, logps, rewards, next_states, masks): if self.normalizer is not None: states = self.normalizer.normalize_states(states) next_states = self.normalizer.normalize_states(next_states) self.step_counter += 1 # Select action according to policy and add clipped noise noise = (torch.randn_like(actions) * self.policy_noise).clamp( -self.noise_clip, self.noise_clip) raw_next_actions = self.actor_target(next_states) next_actions = (raw_next_actions + noise).clamp(-1, 1) # Compute the target Q value next_Q = self.critic_target(next_states, next_actions) q_target = rewards.unsqueeze( 1) + self.discount * masks.float().unsqueeze(1) * next_Q zero_targets = torch.zeros_like(q_target, device=self.device) q = self.critic(states, actions) # Q(s,a) q_td_error = q_target - q critic_loss, standard_loss, gradient_loss = torch.tensor( 0, device=self.device), torch.tensor( 0, device=self.device), torch.tensor(0, device=self.device) if self.td_error_weight > 0: if self.value_loss == 'huber': standard_loss = 0.5 * F.smooth_l1_loss(q_td_error, zero_targets) elif self.value_loss == 'mse': standard_loss = 0.5 * F.mse_loss(q_td_error, zero_targets) critic_loss = critic_loss + self.td_error_weight * standard_loss if self.tdg_error_weight > 0: gradients_error_norms = torch.autograd.grad( outputs=q_td_error, inputs=actions, grad_outputs=torch.ones(q_td_error.size(), device=self.device), retain_graph=True, create_graph=True, only_inputs=True)[0].flatten(start_dim=1).norm(dim=1, keepdim=True) if 
self.value_loss == 'huber': gradient_loss = 0.5 * F.smooth_l1_loss(gradients_error_norms, zero_targets) elif self.value_loss == 'mse': gradient_loss = 0.5 * F.mse_loss(gradients_error_norms, zero_targets) critic_loss = critic_loss + self.tdg_error_weight * gradient_loss # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward(retain_graph=True) torch.nn.utils.clip_grad_value_(self.critic.parameters(), self.grad_clip) self.critic_optimizer.step() # Compute actor loss q = self.critic(states, self.actor(states)) # Q(s,pi(s)) actor_loss = -q.mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_value_(self.actor.parameters(), self.grad_clip) self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) return raw_next_actions[ 0, 0].item(), self.td_error_weight * standard_loss.item( ), self.tdg_error_weight * gradient_loss.item(), actor_loss.item() @staticmethod def catastrophic_divergence(q_loss, pi_loss): return q_loss > 1e2 or (pi_loss is not None and abs(pi_loss) > 1e5)
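# --- Isolated sketch (illustrative only) of the soft "Polyak" target-network update
# performed at the end of DDPG.update() above: target <- tau * online + (1 - tau) * target.
# The networks are placeholders; tau is typically a small value such as 0.005.
import copy
import torch
import torch.nn as nn

critic = nn.Linear(4, 1)
critic_target = copy.deepcopy(critic)

def soft_update(target, online, tau=0.005):
    with torch.no_grad():
        for p_t, p in zip(target.parameters(), online.parameters()):
            p_t.mul_(1.0 - tau).add_(p, alpha=tau)

soft_update(critic_target, critic)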
def main(): outPath = f"{args.results}_{args.model_name}_CBAM_WM" if not os.path.exists(outPath): os.makedirs(outPath) ts = str(datetime.datetime.now()).split(".")[0].replace(" ", "_") ts = ts.replace(":", "_").replace("-", "_") file_path = os.path.join(outPath, "{}_run_{}.json".format(args.model_name, ts)) ##############choose model########################## net = get_model(args.num_classes, args.model_name).to(device) if args.pre_train: net = torch.load(args.ckp)["model_state"] #load the pretrained model print("load pre-trained model sucessfully") if torch.cuda.device_count() > 1: print("using multi gpu") net = torch.nn.DataParallel(net, device_ids=[0, 1]) else: print('using one gpu') ##########hyper parameters setting################# # optimizer = Adam(net.parameters(), lr=args.lr) optimizer = RAdam(params=net.parameters(), lr=args.lr, weight_decay=0.0001) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.2, patience=4, verbose=True) # milestones = [x*40 for x in range(10)] # print(milestones) # scheduler = CyclicCosAnnealingLR(optimizer,milestones=milestones,eta_min=1e-7) # criterion = FocalLoss2d().to(device) # criterion = BCEDiceLossWeighted().to(device) criterion = WeightedBceLoss().to(device) criterion2 = SoftDiceLoss().to(device) ##########prepare dataset################################ train_loader, val_loader = build_loader(batch_size=args.batch_size, num_workers=4) history = collections.defaultdict(list) best_miou = -100 for epoch in range(args.num_epochs): print("Epoch: {}/{}".format(epoch + 1, args.num_epochs)) # optimizer.step() # scheduler.step(epoch) ####################train#################################### train_hist = train(train_loader, args.num_classes, device, net, optimizer, criterion) print('loss', train_hist["loss"], 'miou', train_hist["miou"], 'fg_iou', train_hist["fg_iou"], 'mcc', train_hist["mcc"]) for k, v in train_hist.items(): history["train " + k].append(v) ######################valid################################## val_hist = validate(val_loader, args.num_classes, device, net, scheduler, criterion2) print('loss', val_hist["loss"], 'miou', val_hist["miou"], 'fg_iou', val_hist["fg_iou"], 'mcc', val_hist["mcc"]) if val_hist["miou"] > best_miou: state = { "epoch": epoch + 1, "model_state": net, "best_miou": val_hist["miou"] } checkpoint = f'{args.model_name}_val_{val_hist["miou"]}_epoch{epoch + 1}.pth' torch.save(state, os.path.join(outPath, checkpoint)) # save model print("The model has saved successfully!") best_miou = val_hist["miou"] for k, v in val_hist.items(): history["val " + k].append(v) f = open(file_path, "w+") f.write(json.dumps(history)) f.close()
##### Build Model #####
from model.CADVFi import CAIN
print("Building model: CADVFi")
model = CAIN(depth=args.depth)
# Wrap every model in DataParallel
model = torch.nn.DataParallel(model).to(device)
#print(model)

##### Define Loss & Optimizer #####
criterion = Loss(args)
args.radam = False  # hard-coded override: the RAdam branch below is never taken
if args.radam:
    from radam import RAdam
    optimizer = RAdam(model.parameters(), lr=args.lr,
                      betas=(args.beta1, args.beta2))
else:
    from torch.optim import Adam
    optimizer = Adam(model.parameters(), lr=args.lr,
                     betas=(args.beta1, args.beta2))
print('# of parameters: %d' % sum(p.numel() for p in model.parameters()))

# If resume, load checkpoint: model + optimizer
if args.resume:
    utils.load_checkpoint(args, model, optimizer)

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
def main(): parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('--batch-size', type=int, default=128, metavar='N', help='input batch size for training (default: 128)') parser.add_argument('--batch-size-val', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser.add_argument('--epochs', type=int, default=1000, metavar='N', help='number of epochs to train (default: 1000)') parser.add_argument('--lr', type=float, default=1e-4, metavar='LR', help='learning rate (default: 1e-4)') parser.add_argument('--image-size', type=float, default=80, metavar='IMSIZE', help='input image size (default: 80)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--multi-gpu', action='store_true', default=False, help='parallel training on multiple GPUs') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model') parser.add_argument('--model-save-path', default='', type=str, metavar='PATH', help='For Saving the current Model') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') args = parser.parse_args() torch.set_default_tensor_type('torch.FloatTensor') device = torch.device("cpu" if args.no_cuda else "cuda") train_data = dataset(args.data, "train", args.image_size, transform=transforms.Compose([ToTensor()]), shuffle=True) valid_data = dataset(args.data, "val", args.image_size, transform=transforms.Compose([ToTensor()])) trainloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=8) validloader = DataLoader(valid_data, batch_size=args.batch_size_val, shuffle=False, num_workers=8) model = Model(args.image_size, args.image_size) optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=1e-8) if not args.no_cuda: model.cuda() if torch.cuda.device_count() > 1 and args.multi_gpu: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) if args.resume: model.load_state_dict( torch.load(os.path.join(args.resume, model_save_name))) optimizer.load_state_dict( torch.load(os.path.join(args.resume, optimizer_save_name))) train(model, optimizer, trainloader, validloader, device, args)
if __name__ == "__main__": device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") train_loader, validation_loader, test_loader, classes = load_CIFAR10_data() # model = CIFAR10Model() # model = CIFAR10ModelTest() # model = resnet18(3, 10) # model = CIFAR10ResLayer() # model = CIFAR100Model() # model, criterion, optimizer = transfer_radam_resnet101(100) model = resnet18(3, 10, drop=False) summary(model, (3, 32, 32), device="cpu") model.to(device) criterion = nn.CrossEntropyLoss() optimizer = RAdam(model.parameters()) # optimizer = optim.Adam(model.parameters()) accuracies = [] train_losses = [] losses = [] for epoch in range(1, EPOCH + 1): loss = train(model, train_loader, optimizer, criterion, device, epoch, 1) train_losses.append(loss) if not TEST: acc, loss = validate(model, validation_loader, criterion, device, epoch, classes, 1) accuracies.append(acc) losses.append(loss)
def main(args): # 1. prepare data & models train_transforms = transforms.Compose([ ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)), CropCenter(CROP_SIZE), AffineAugmenter(min_scale=0.9, max_offset=0.1, rotate=True), BrightnessContrastAugmenter(brightness=0.3, contrast=0.3), BlurAugmenter(max_kernel=5), TransformByKeys(transforms.ToPILImage(), ("image", )), TransformByKeys(transforms.ToTensor(), ("image", )), TransformByKeys( transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ("image", )), ]) test_transforms = transforms.Compose([ ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)), CropCenter(CROP_SIZE), TransformByKeys(transforms.ToPILImage(), ("image", )), TransformByKeys(transforms.ToTensor(), ("image", )), TransformByKeys( transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ("image", )), ]) print("Reading data...") train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'), train_transforms, split="train") train_dataloader = data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=4, pin_memory=True, shuffle=True, drop_last=True) val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'), test_transforms, split="val") val_dataloader = data.DataLoader(val_dataset, batch_size=args.batch_size, num_workers=4, pin_memory=True, shuffle=False, drop_last=False) print("Creating model...") device = torch.device("cuda: 0") if args.gpu else torch.device("cpu") model = models.resnext50_32x4d(pretrained=True) # for param in model.parameters(): # param.requires_grad = False model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True) # model.fc = nn.Sequential( # # nn.BatchNorm1d(model.fc.in_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), # # nn.Linear(model.fc.in_features, model.fc.in_features, bias=True), # # nn.ReLU(), # nn.BatchNorm1d(model.fc.in_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), # nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)) model.to(device) # optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=0.01, amsgrad=True) optimizer = RAdam(model.parameters(), lr=args.learning_rate) # , weight_decay=0.01) optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=3) # optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2) loss_fn = fnn.mse_loss # 2. train & validate print("Ready for training...") best_val_loss = np.inf for epoch in range(args.epochs): train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device) val_loss = validate(model, val_dataloader, loss_fn, device=device) print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format( epoch, train_loss, val_loss)) if val_loss < best_val_loss: best_val_loss = val_loss with open(f"{args.name}_best.pth", "wb") as fp: torch.save(model.state_dict(), fp) # with open(f"{args.name}_{epoch}_{train_loss:7.4}_{val_loss:7.4}.pth", "wb") as fp: # torch.save(model.state_dict(), fp) # 3. 
predict test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'), test_transforms, split="test") test_dataloader = data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=4, pin_memory=True, shuffle=False, drop_last=False) with open(f"{args.name}_best.pth", "rb") as fp: best_state_dict = torch.load(fp, map_location="cpu") model.load_state_dict(best_state_dict) test_predictions = predict(model, test_dataloader, device) with open(f"{args.name}_test_predictions.pkl", "wb") as fp: pickle.dump( { "image_names": test_dataset.image_names, "landmarks": test_predictions }, fp) create_submission(args.data, test_predictions, f"{args.name}_submit.csv") if args.draw: print("Drawing landmarks...") directory = os.path.join("result", test_dataset.image_names[0].split('.')[0]) if not os.path.exists(directory): os.makedirs(directory) random_idxs = np.random.choice(len(test_dataset.image_names), size=1000, replace=False) for i, idx in enumerate(random_idxs, 1): image = cv2.imread(test_dataset.image_names[idx]) image = draw_landmarks(image, test_predictions[idx]) cv2.imwrite(os.path.join("result", test_dataset.image_names[idx]), image)
def configure_optimizers(self):
    from radam import RAdam
    optimizer = RAdam(self.parameters())
    return optimizer
class Trainer: def __init__(self, args, train_loader, test_loader, tokenizer): self.args = args self.train_loader = train_loader self.test_loader = test_loader self.tokenizer = tokenizer self.vocab_size = tokenizer.vocab_size self.pad_id = tokenizer.pad_token_id self.eos_id = tokenizer.eos_token_id self.device = torch.device( 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu', args.local_rank) self.writer = SummaryWriter() if args.local_rank in [-1, 0] else None self.n_gpus = torch.distributed.get_world_size( ) if args.distributed else torch.cuda.device_count() assert args.pretrain != args.finetune # Do not set both finetune and pretrain arguments to the same (True, False) if args.pretrained_model: self.gpt = torch.load(args.pretrained_model) else: self.gpt = GPT(vocab_size=self.vocab_size, seq_len=args.max_seq_len, d_model=args.hidden, n_layers=args.n_layers, n_heads=args.n_attn_heads, d_ff=args.ffn_hidden, embd_pdrop=args.embd_dropout, attn_pdrop=args.attn_dropout, resid_pdrop=args.resid_dropout, pad_id=self.pad_id) if args.pretrain: self.model = GPTLMHead(self.gpt) self.model.to(self.device) if args.finetune: with open(args.cached_label_dict, 'r') as file: label_dict = json.load(file) self.model = GPTClsHead(self.gpt, n_class=len(label_dict), cls_token_id=self.eos_id) self.model.to(self.device) if args.distributed: self.model = DistributedDataParallel(self.model, device_ids=[args.local_rank], output_device=args.local_rank) self.optimizer = RAdam(self.model.parameters(), args.lr) self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id).to( self.device) self.cls_criterion = nn.CrossEntropyLoss().to(self.device) @timeit def train(self, epoch): if self.args.pretrain: self.pretrain(epoch) if self.args.finetune: self.finetune(epoch) def pretrain(self, epoch): losses = 0 n_batches, n_samples = len(self.train_loader), len( self.train_loader.dataset) self.model.train() for i, batch in enumerate(self.train_loader): inputs = batch[0].to(self.device) targets = inputs[:, 1:].contiguous() # |inputs| : (batch_size, seq_len), |targets| : (batch_size, seq_len-1) lm_logits = self.model(inputs) lm_logits = lm_logits[:, :-1].contiguous() # |lm_logits| : (batch_size, seq_len-1, vocab_size) loss = self.criterion(lm_logits.view(-1, self.vocab_size), targets.view(-1)) losses += loss.item() self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.args.local_rank in [-1, 0]: self.writer.add_scalar('Loss/pre-train', loss.item(), ((epoch - 1) * n_batches) + i) if i % (n_batches // 5) == 0 and i != 0: print('Iteration {} ({}/{})\tLoss: {:.4f}'.format( i, i, n_batches, losses / i)) print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format( epoch, self.args.local_rank, losses / n_batches)) def finetune(self, epoch): losses, accs = 0, 0 n_batches, n_samples = len(self.train_loader), len( self.train_loader.dataset) # n_batches = batch size per GPU self.model.train() for i, batch in enumerate(self.train_loader): inputs, labels = map(lambda x: x.to(self.device), batch) # |inputs| : (batch_size, seq_len), |labels| : (batch_size) lm_logits, cls_logits = self.model(inputs) lm_logits = lm_logits[:, :-1].contiguous() # |lm_logits| : (batch_size, seq_len-1, vocab_size), |cls_logits| : (batch_size, n_class) lm_loss = self.criterion(lm_logits.view(-1, self.vocab_size), inputs[:, 1:].contiguous().view(-1)) cls_loss = self.cls_criterion(cls_logits, labels) loss = cls_loss + (self.args.auxiliary_ratio * lm_loss) losses += loss.item() acc = (cls_logits.argmax(dim=-1) == labels).to( 
dtype=cls_logits.dtype).mean() accs += acc self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.args.local_rank in [-1, 0]: self.writer.add_scalar('Loss/fine-tune', loss.item(), ((epoch - 1) * n_batches) + i) self.writer.add_scalar('Accuracy/fine-tune', acc, ((epoch - 1) * n_batches) + i) if i % (n_batches // 5) == 0 and i != 0: print('Iteration {} ({}/{})\tLoss: {:.4f} Acc: {:.1f}%'. format(i, i, n_batches, losses / i, accs / i * 100.)) print( 'Train Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format( epoch, self.args.local_rank, losses / n_batches, accs / n_batches * 100.)) def evaluate(self, epoch): losses, accs = 0, 0 n_batches, n_samples = len(self.test_loader), len( self.test_loader.dataset) self.model.eval() with torch.no_grad(): for i, batch in enumerate(self.test_loader): if self.args.pretrain: inputs = batch.to(self.device) targets = inputs[:, 1:].contiguous() lm_logits = self.model(inputs) lm_logits = lm_logits[:, :-1].contiguous() loss = self.criterion(lm_logits.view(-1, self.vocab_size), targets.view(-1)) losses += loss.item() if self.args.local_rank in [-1, 0]: self.writer.add_scalar('Loss/pre-train(eval)', loss.item(), ((epoch - 1) * n_batches) + i) elif self.args.finetune: inputs, labels = map(lambda x: x.to(self.device), batch) lm_logits, cls_logits = self.model(inputs) lm_logits = lm_logits[:, :-1].contiguous() lm_loss = self.criterion( lm_logits.view(-1, self.vocab_size), inputs[:, 1:].contiguous().view(-1)) cls_loss = self.cls_criterion(cls_logits, labels) loss = cls_loss + (self.args.auxiliary_ratio * lm_loss) losses += loss.item() acc = (cls_logits.argmax(dim=-1) == labels).to( dtype=cls_logits.dtype).mean() accs += acc if self.args.local_rank in [-1, 0]: self.writer.add_scalar('Loss/fine-tune(eval)', loss.item(), ((epoch - 1) * n_batches) + i) self.writer.add_scalar('Accuracy/fine-tune(eval)', acc, ((epoch - 1) * n_batches) + i) print( 'Eval Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format( epoch, self.args.local_rank, losses / n_batches, accs / n_batches * 100.)) def save(self, epoch, model_prefix='model', root='.model'): path = Path(root) / (model_prefix + '.ep%d' % epoch) if not path.parent.exists(): path.parent.mkdir() if self.args.distributed: if self.args.local_rank == 0: torch.save(self.gpt, path) else: torch.save(self.gpt, path)
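# --- Toy sketch (illustrative shapes, not the original arguments) of the fine-tuning
# objective used in Trainer.finetune() above: classification loss plus a weighted
# auxiliary language-modeling loss.
import torch
import torch.nn as nn

vocab_size, n_class, seq_len, batch = 100, 4, 12, 2
lm_logits = torch.randn(batch, seq_len, vocab_size)   # per-token predictions
cls_logits = torch.randn(batch, n_class)              # sequence-level predictions
inputs = torch.randint(0, vocab_size, (batch, seq_len))
labels = torch.randint(0, n_class, (batch,))

criterion = nn.CrossEntropyLoss()
# Predict token t+1 from positions <= t: drop the last logit, shift targets left.
lm_loss = criterion(lm_logits[:, :-1].reshape(-1, vocab_size),
                    inputs[:, 1:].reshape(-1))
cls_loss = criterion(cls_logits, labels)
auxiliary_ratio = 0.5  # placeholder weight
loss = cls_loss + auxiliary_ratio * lm_loss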
def train_model(train_parameters): k = train_parameters["k"] loaders = train_parameters["loaders"] num_epochs = train_parameters["num_epochs"] net = train_parameters["net"] ENCODER = train_parameters["ENCODER"] ENCODER_WEIGHTS = train_parameters["ENCODER_WEIGHTS"] ACTIVATION = train_parameters["ACTIVATION"] model = load_model(net, ENCODER, ENCODER_WEIGHTS, ACTIVATION) """ multi-gpu """ if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = nn.DataParallel(model) model.to("cuda") # if k==0: # summary(model.module.encoder,(3,384,576)) logdir = "./logs/segmentation_{}_{}Fold".format(net, k) # model, criterion, optimizer optimizer = RAdam([ { 'params': model.module.decoder.parameters(), 'lr': 1e-2 }, { 'params': model.module.encoder.parameters(), 'lr': 1e-3 }, # {'params': model.decoder.parameters(), 'lr': 1e-2}, # {'params': model.encoder.parameters(), 'lr': 1e-3}, ]) criterion = smp.utils.losses.BCEDiceLoss(eps=1.) # criterion = FocalLoss() # criterion = FocalDiceLoss() # criterion = smp.utils.losses.DiceLoss(eps=1.) scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2) runner = SupervisedRunner() runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, callbacks=[ EarlyStoppingCallback(patience=10, min_delta=0.001), DiceCallback() ], # AUCCallback(), # IouCallback()], logdir=logdir, num_epochs=num_epochs, verbose=True) del loaders, optimizer, scheduler, model, runner torch.cuda.empty_cache() gc.collect() print("Collect GPU cache")
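# --- Minimal sketch (placeholders only) of the per-parameter-group learning rates
# used in train_model() above: decoder at 1e-2, encoder at 1e-3.
import torch.nn as nn
from radam import RAdam  # assumed local module, as elsewhere in this file

class TinySegModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(16, 8)
        self.decoder = nn.Linear(8, 16)

model = TinySegModel()
optimizer = RAdam([
    {'params': model.decoder.parameters(), 'lr': 1e-2},
    {'params': model.encoder.parameters(), 'lr': 1e-3},
])
# Note: when the model is wrapped in nn.DataParallel (as above), the submodules live
# under model.module, hence model.module.decoder / model.module.encoder.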
def configure_optimizers(self):
    return RAdam(self.parameters(), lr=self.hparams.lr)
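# --- Self-contained sketch of the same configure_optimizers() hook inside a small
# PyTorch Lightning module, with an LR scheduler added. The module and its
# hyperparameters are illustrative only, not taken from the original code.
import torch
import torch.nn as nn
import pytorch_lightning as pl
from radam import RAdam  # assumed local module, as elsewhere in this file

class LitRegressor(pl.LightningModule):
    def __init__(self, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.net = nn.Linear(16, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.mse_loss(self.net(x), y)

    def configure_optimizers(self):
        optimizer = RAdam(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
        return [optimizer], [scheduler]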
class TasNET_trainer(object): def __init__(self, TasNET, batch_size, checkpoint="checkpoint", log_folder="./log", rnn_arch="LSTM", optimizer='radam', rerun_mode=False, lr=1e-5, momentum=0.9, weight_decay=0, num_epoches=20, clip_norm=False, sr=8000, cudnnBenchmark=True): logger.info('---Experiment Variables---') logger.info('RNN Architecture: '+rnn_arch) logger.info('Batch Size : '+str(batch_size)) logger.info('Optimizer : '+optimizer) logger.info('--------------------------\n') logger.info('Rerun mode: '+str(rerun_mode)) self.TasNET = TasNET self.log_folder = log_folder self.writer = SummaryWriter(log_folder) self.all_log = 'all_log.log' #all log filename self.log('Progress Log save path: '+log_folder) self.log("TasNET:\n{}".format(self.TasNET)) if type(lr) is str: lr = float(lr) logger.info("Transfrom lr from str to float => {}".format(lr)) self.log('Batch size used: '+str(batch_size)) if optimizer == 'radam': self.optimizer = RAdam(self.TasNET.parameters(), lr=lr, weight_decay=weight_decay) else: self.optimizer = torch.optim.Adam( self.TasNET.parameters(), lr=lr, weight_decay=weight_decay) self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', factor=0.5, patience=3,verbose=True) self.TasNET.to(device) self.checkpoint = checkpoint self.log('Model save path: '+checkpoint) self.num_epoches = num_epoches self.clip_norm = clip_norm self.sr = sr if self.clip_norm: self.log("Clip gradient by 2-norm {}".format(clip_norm)) if not os.path.exists(self.checkpoint): os.makedirs(checkpoint) torch.backends.cudnn.benchmark=cudnnBenchmark self.log('cudnn benchmark status: '+str(torch.backends.cudnn.benchmark)+'\n') def SISNR(self, output, target): #output:(128,4000) batchsize = np.shape(output)[0] target = target.view(batchsize,-1) output = output - torch.mean(output,1,keepdim=True) target = target - torch.mean(target,1,keepdim=True) s_shat = torch.sum(output*target,1,keepdim=True) s_2 = torch.sum(target**2,1,keepdim=True) s_target = (s_shat / s_2) * target #(128,4000) e_noise = output - s_target return 10*torch.log10(torch.sum(e_noise**2,1,keepdim=True)\ /torch.sum(s_target**2,1,keepdim=True)) #(128,1) def loss(self,output1,output2,target1,target2): #PIT loss loss1 = self.SISNR(output1,target1)+self.SISNR(output2,target2) loss2 = self.SISNR(output1,target2)+self.SISNR(output2,target1) min = torch.min(loss1, loss2) #(128,1) return torch.mean(min) #scale def train(self, dataloader, epoch): self.TasNET.train() self.log("Training...") tot_loss = 0 tot_batch = len(dataloader) batch_indx = (epoch-1)*tot_batch currProcess = 0 fivePercentProgress = tot_batch//20 for mix_speech, speech1, speech2 in dataloader: self.optimizer.zero_grad() if torch.cuda.is_available(): mix_speech= mix_speech.cuda() speech1 = speech1.cuda() speech2 = speech2.cuda() mix_speech = Variable(mix_speech) speech1 = Variable(speech1) speech2 = Variable(speech2) output1, output2 = self.TasNET(mix_speech) cur_loss = self.loss(output1,output2,speech1,speech2) tot_loss += cur_loss.item() #write summary batch_indx += 1 self.writer.add_scalar('train_loss', cur_loss, batch_indx) cur_loss.backward() if self.clip_norm: nn.utils.clip_grad_norm_(self.TasNET.parameters(), self.clip_norm) self.optimizer.step() currProcess+=1 if currProcess % fivePercentProgress == 0: self.log('batch {}: {:.2f}% progress ({}/{})| LR: {}'.format(batch_indx, currProcess*100/tot_batch, currProcess, tot_batch, str(self.get_curr_lr()))) return tot_loss / tot_batch, tot_batch def validate(self, dataloader, epoch): """one epoch""" self.TasNET.eval() 
self.log("Evaluating...") tot_loss = 0 tot_batch = len(dataloader) batch_indx = (epoch-1)*tot_batch currProcess = 0 fivePercentProgress = tot_batch//20 #print(tot_batch) with torch.no_grad(): for mix_speech,speech1,speech2 in dataloader: if torch.cuda.is_available(): mix_speech = mix_speech.cuda() speech1 = speech1.cuda() speech2 = speech2.cuda() mix_speech = Variable(mix_speech) speech1 = Variable(speech1) speech2 = Variable(speech2) output1, output2 = self.TasNET(mix_speech) cur_loss = self.loss(output1,output2,speech1,speech2) tot_loss += cur_loss.item() #write summary batch_indx += 1 currProcess += 1 if currProcess % fivePercentProgress == 0: self.log('batch {}: {:.2f}% progress ({}/{})| LR: {}'.format(batch_indx, currProcess*100/tot_batch, currProcess, tot_batch, str(self.get_curr_lr()))) self.writer.add_scalar('dev_loss', cur_loss, batch_indx) return tot_loss / tot_batch, tot_batch def run(self, train_set, dev_set): init_loss, _ = self.validate(dev_set,1) self.log("Start training for {} epoches".format(self.num_epoches)) self.log("Epoch {:2d}: dev loss ={:.4e}".format(0, init_loss)) torch.save(self.TasNET.state_dict(), os.path.join(self.checkpoint, 'TasNET_0.pkl')) for epoch in range(1, self.num_epoches+1): train_start = time.time() train_loss, train_num_batch = self.train(train_set, epoch) valid_start = time.time() valid_loss, valid_num_batch = self.validate(dev_set, epoch) valid_end = time.time() self.scheduler.step(valid_loss) self.log( "Epoch {:2d}: train loss = {:.4e}({:.2f}s/{:d}) |" " dev loss= {:.4e}({:.2f}s/{:d})".format( epoch, train_loss, valid_start - train_start, train_num_batch, valid_loss, valid_end - valid_start, valid_num_batch)) save_path = os.path.join( self.checkpoint, "TasNET_{:d}_trainloss_{:.4e}_valloss_{:.4e}.pkl".format( epoch, train_loss, valid_loss)) torch.save(self.TasNET.state_dict(), save_path) self.log("Training for {} epoches done!".format(self.num_epoches)) def rerun(self, train_set, dev_set, model_path, epoch_done): self.TasNET.load_state_dict(torch.load(model_path)) # init_loss, _ = self.validate(dev_set,epoch_done) # logger.info("Start training for {} epoches".format(self.num_epoches)) # logger.info("Epoch {:2d}: dev loss ={:.4e}".format(0, init_loss)) # torch.save(self.TasNET.state_dict(), os.path.join(self.checkpoint, 'TasNET_0.pkl')) for epoch in range(epoch_done+1, self.num_epoches+1): train_start = time.time() train_loss, train_num_batch = self.train(train_set,epoch) valid_start = time.time() valid_loss, valid_num_batch = self.validate(dev_set,epoch) valid_end = time.time() self.scheduler.step(valid_loss) self.log( "Epoch {:2d}: train loss = {:.4e}({:.2f}s/{:d}) |" " dev loss= {:.4e}({:.2f}s/{:d})".format( epoch, train_loss, valid_start - train_start, train_num_batch, valid_loss, valid_end - valid_start, valid_num_batch)) save_path = os.path.join( self.checkpoint, "TasNET_{:d}_trainloss_{:.4e}_valloss_{:.4e}.pkl".format( epoch, train_loss, valid_loss)) torch.save(self.TasNET.state_dict(), save_path) self.log("Training for {} epoches done!".format(self.num_epoches)) def get_curr_lr(self): for i, param_group in enumerate(self.optimizer.param_groups): curr_lr = float(param_group['lr']) return curr_lr def log(self, log_data): logger.info(log_data) try: f = open(self.log_folder+'/'+self.all_log,'a+') f.write(log_data+'\n') f.close() except: logger.info('failed to save last log')
def train():
    # Init Training
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased',
                                           output_hidden_states=True,
                                           output_attentions=True)
    bert_model.eval()
    center_loss = None
    data_loaders = get_data_loader()
    coconut_model = CoconutModel(num_of_classes=args.num_of_classes,
                                 num_of_group=args.num_of_group,
                                 feature_size=args.feature_size)

    if torch.cuda.is_available():
        coconut_model = coconut_model.cuda()
        bert_model = bert_model.cuda()

    # Pass the parameter iterator, not the module itself, to the optimizer.
    optimizer = RAdam(params=coconut_model.parameters(),
                      lr=args.lr,
                      betas=(0.0, 0.999),
                      eps=1e-3,
                      weight_decay=args.l2_reg)
    # optimizer = torch.optim.SGD(params,
    #                             lr=args.lr,
    #                             momentum=0.9,
    #                             nesterov=True,
    #                             weight_decay=args.l2_reg)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                        milestones=[80, 150],
                                                        gamma=0.1)

    starting_epoch = 0
    if args.resume:
        checkpoints = load_model(model=coconut_model,
                                 optimizer=optimizer,
                                 lr_scheduler=lr_scheduler,
                                 center_loss=center_loss)
        (starting_epoch, coconut_model, optimizer, lr_scheduler,
         center_loss) = checkpoints

    for epoch in range(starting_epoch, args.epoch):
        train_model(epoch=epoch,
                    model=coconut_model,
                    optimizer=optimizer,
                    loader=data_loaders["train_loader"],
                    tokenizer=tokenizer,
                    bert_model=bert_model,
                    center_loss=center_loss)
        lr_scheduler.step()
        eval_model(epoch=epoch,
                   model=coconut_model,
                   loader=data_loaders["dev_loader"],
                   tokenizer=tokenizer,
                   bert_model=bert_model)
        save_mode(epoch=epoch,
                  model=coconut_model,
                  optimizer=optimizer,
                  lr_scheduler=lr_scheduler,
                  center_loss=center_loss)
    return
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms, split="train")
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=1,
                                       pin_memory=True,
                                       shuffle=True,
                                       drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           train_transforms, split="val")
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=1,
                                     pin_memory=True,
                                     shuffle=False,
                                     drop_last=False)

    print("Creating model...")
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")
    #model = models.resnet18(pretrained=True)
    #model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    #model = models.densenet161(pretrained=True)
    #model.classifier = nn.Linear(model.classifier.in_features, 2 * NUM_PTS, bias=True)
    model = EfficientNet.from_pretrained('efficientnet-b4',
                                         num_classes=2 * NUM_PTS)
    if args.oldname:
        with open(f"{args.oldname}_best.pth", "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)
    model.to(device)

    #optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    optimizer = RAdam(model.parameters(), lr=args.learning_rate)
    loss_fn = fnn.mse_loss
    lr_scheduler = ReduceLROnPlateau(optimizer)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tlr: {:5.2}".format(
            epoch, train_loss, val_loss, get_lr(optimizer)))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)
            # f-prefix added so the checkpoint name is interpolated into the copy command
            os.system(
                f'cp /content/MadeCvHw1/{args.name}_best.pth "/content/drive/My Drive/MADE/CV_HW1/"'
            )
        lr_scheduler.step(train_loss)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            train_transforms, split="test")
    test_dataloader = data.DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=1,
                                      pin_memory=True,
                                      shuffle=False,
                                      drop_last=False)
    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
    model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")
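# --- Minimal sketch (placeholders only) of the "save best, then reload for prediction"
# pattern used in main() above.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
best_val_loss, val_loss = float("inf"), 0.1
if val_loss < best_val_loss:
    best_val_loss = val_loss
    with open("example_best.pth", "wb") as fp:
        torch.save(model.state_dict(), fp)

# Later, restore the best weights before running inference on the test set.
with open("example_best.pth", "rb") as fp:
    state_dict = torch.load(fp, map_location="cpu")
model.load_state_dict(state_dict)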