train, val = split_by_individuals(glucose_file, meal_file, ratio=0.8)
glucose_train = GlucoseData(train, transform=combine_cgm_meals)
glucose_val = GlucoseData(val, transform=combine_cgm_meals)
loaders_dict = {
    'train': torch.utils.data.DataLoader(glucose_train, batch_size=400, shuffle=False, drop_last=True),
    'val': torch.utils.data.DataLoader(glucose_val, batch_size=400, shuffle=False, drop_last=True),
}

model = Seq2Seq()
model.to(device)

optimizer_base = RAdam(model.parameters(), lr=1e-1)
optimizer = Lookahead(optimizer=optimizer_base, k=5, alpha=0.5)

model.train()
train_model(model=model, dataloaders=loaders_dict, optimizer=optimizer,
            device=device, num_epochs=25)
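# For reference, a minimal sketch of the Lookahead rule used above (Zhang et
# al., 2019): the inner optimizer (here RAdam) updates the fast weights, and
# every k steps the slow weights move a fraction alpha toward them. This is an
# illustration only, not the Lookahead class imported in this snippet.
class LookaheadSketch:
    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k, self.alpha = k, alpha
        self.step_count = 0
        # slow weights start as a detached copy of the (fast) model weights
        self.slow_weights = [[p.clone().detach() for p in group['params']]
                             for group in optimizer.param_groups]

    def step(self):
        self.optimizer.step()  # inner (fast) update
        self.step_count += 1
        if self.step_count % self.k == 0:
            for group, slow in zip(self.optimizer.param_groups, self.slow_weights):
                for p, q in zip(group['params'], slow):
                    q.data.add_(self.alpha * (p.data - q.data))  # slow += alpha * (fast - slow)
                    p.data.copy_(q.data)                         # fast weights reset to slow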
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image",)),
        TransformByKeys(transforms.ToTensor(), ("image",)),
        TransformByKeys(transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ("image",)),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'), train_transforms, split="train")
    train_dataloader = data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=1,
                                       pin_memory=True, shuffle=True, drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'), train_transforms, split="val")
    val_dataloader = data.DataLoader(val_dataset, batch_size=args.batch_size, num_workers=1,
                                     pin_memory=True, shuffle=False, drop_last=False)

    print("Creating model...")
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")  # "cuda: 0" (with a space) is invalid
    # model = models.resnet18(pretrained=True)
    # model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    # model = models.densenet161(pretrained=True)
    # model.classifier = nn.Linear(model.classifier.in_features, 2 * NUM_PTS, bias=True)
    model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=2 * NUM_PTS)
    if args.oldname:  # warm-start from a previous checkpoint
        with open(f"{args.oldname}_best.pth", "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
            model.load_state_dict(best_state_dict)
    model.to(device)

    # optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    optimizer = RAdam(model.parameters(), lr=args.learning_rate)
    loss_fn = fnn.mse_loss
    lr_scheduler = ReduceLROnPlateau(optimizer)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tlr: {:5.2}".format(
            epoch, train_loss, val_loss, get_lr(optimizer)))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)
            # note the f-prefix: the original passed a literal '{args.name}' to cp
            os.system(f'cp /content/MadeCvHw1/{args.name}_best.pth "/content/drive/My Drive/MADE/CV_HW1/"')
        lr_scheduler.step(train_loss)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'), train_transforms, split="test")
    test_dataloader = data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=1,
                                      pin_memory=True, shuffle=False, drop_last=False)
    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump({"image_names": test_dataset.image_names,
                     "landmarks": test_predictions}, fp)
    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")
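# The loop above prints get_lr(optimizer), a helper not shown in this excerpt.
# A minimal sketch of what such a helper typically looks like (an assumption,
# since the real definition lives elsewhere in the repo):
def get_lr(optimizer):
    # read the current learning rate from the first parameter group;
    # ReduceLROnPlateau mutates this value in place
    for param_group in optimizer.param_groups:
        return param_group['lr']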
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--meta_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--classifier", default='guoday', type=str, required=True,
                        help="classifier type: guoday, MLP, GRU_MLP, ...")
    parser.add_argument("--optimizer", default='RAdam', type=str, required=True,
                        help="optimizer to use: RAdam, ...")
    parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True,
                        help="Whether to do label smoothing. yes or no.")
    parser.add_argument("--draw_loss_steps", default=1, type=int, required=True,
                        help="training steps between loss-plot updates")
    parser.add_argument("--label_name", default='label', type=str, required=True,
                        help="label column name in the original train set")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", default='yes', type=str, required=True,
                        help="Whether to run training. yes or no.")
    parser.add_argument("--do_test", default='yes', type=str, required=True,
                        help="Whether to run testing. yes or no.")
    parser.add_argument("--do_eval", default='yes', type=str, required=True,
                        help="Whether to run eval on the dev set. yes or no.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--eval_steps", default=200, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--dropout", default=0.5, type=float, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default='', help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which takes care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)
    os.makedirs(args.output_dir, exist_ok=True)  # original used a bare try/except: pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)

    # tensorboard_log_dir = args.output_dir
    # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now')
    # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean')
    # loss_now_variable = loss_now
    # loss_mean_variable = loss_mean
    # train_loss = tf.summary.scalar('train_loss', loss_now_variable)
    # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable)
    # merged = tf.summary.merge([train_loss, dev_loss_mean])

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    config.hidden_dropout_prob = args.dropout

    # Prepare model
    if args.do_train == 'yes':
        model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, args, config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train == 'yes':
        print('________________________now training______________________________')

        # Prepare data loader
        train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'),
                                       is_training=True, label_name=args.label_name)
        train_features = convert_examples_to_features(train_examples, tokenizer,
                                                      args.max_seq_length, args.split_num, True)
        # print('train_feature_size=', train_features.__sizeof__())
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # print('train_data=', train_data[0])

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer. The pooler is not used, so it produces None grads
        # that break apex; biases and LayerNorm weights get no weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        if args.optimizer == 'RAdam':
            optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate)
        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        loss_batch = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        # with tf.Session() as sess:
        #     summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph)
        #     sess.run(tf.global_variables_initializer())
        list_loss_mean = []
        bx = []
        eval_F1 = []
        ax = []

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                         attention_mask=input_mask, labels=label_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss_batch += loss.item()
            train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()

            # plot the running loss every draw_loss_steps documents
            if (step + 1) % int(args.draw_loss_steps /
                                (args.train_batch_size / args.gradient_accumulation_steps)) == 0:
                list_loss_mean.append(round(loss_batch, 4))
                bx.append(step + 1)
                plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1,
                         color='b', marker='o', markerfacecolor='green', markersize=2)
                plt.savefig(args.output_dir + '/labeled.jpg')
                loss_batch = 0

            # parameters update once per effective batch
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with the special warmup BERT uses;
                    # if args.fp16 is False, the scheduler handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # report results every eval_steps real batches
            if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            # evaluate roughly 10 times over the whole training run
            if args.do_eval == 'yes' and (step + 1) % int(num_train_optimization_steps / 10) == 0 and step > 450:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(args.data_dir, file),
                                                  is_training=True, label_name=args.label_name)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                                 args.max_seq_length, args.split_num, False)
                    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                                 batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                                  attention_mask=input_mask, labels=label_ids)
                            logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()

                    # per-class precision / recall / F1
                    num_gold_0 = np.sum(gold_labels == 0)
                    num_gold_1 = np.sum(gold_labels == 1)
                    num_gold_2 = np.sum(gold_labels == 2)
                    right_0 = right_1 = right_2 = 0
                    error_0 = error_1 = error_2 = 0
                    for gold_label, inference_label in zip(gold_labels, inference_labels):
                        if gold_label == inference_label:
                            if gold_label == 0:
                                right_0 += 1
                            elif gold_label == 1:
                                right_1 += 1
                            else:
                                right_2 += 1
                        elif inference_label == 0:
                            error_0 += 1
                        elif inference_label == 1:
                            error_1 += 1
                        else:
                            error_2 += 1
                    recall_0 = right_0 / (num_gold_0 + 1e-5)
                    recall_1 = right_1 / (num_gold_1 + 1e-5)
                    recall_2 = right_2 / (num_gold_2 + 1e-5)
                    precision_0 = right_0 / (error_0 + right_0 + 1e-5)
                    precision_1 = right_1 / (error_1 + right_1 + 1e-5)
                    precision_2 = right_2 / (error_2 + right_2 + 1e-5)
                    f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5)
                    f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5)
                    f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5)

                    output_dev_result_file = os.path.join(args.output_dir, "dev_results.txt")
                    with open(output_dev_result_file, 'a', encoding='utf-8') as f:
                        f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n')
                        f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n')
                        f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n')

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    # plot eval accuracy over training steps
                    eval_F1.append(round(eval_accuracy, 4))
                    ax.append(step)
                    plt.plot(ax, eval_F1, label='eval_F1', linewidth=1,
                             color='r', marker='o', markerfacecolor='blue', markersize=2)
                    for a, b in zip(ax, eval_F1):
                        plt.text(a, b, b, ha='center', va='bottom', fontsize=8)
                    plt.savefig(args.output_dir + '/labeled.jpg')

                    result = {'eval_loss': eval_loss,
                              'eval_F1': eval_accuracy,
                              'global_step': global_step,
                              'loss': train_loss}
                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')

                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("more accurate model arises, now best F1 =", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model, only save the model itself
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
            '''
            if (step + 1) / int(num_train_optimization_steps / 10) > 9.5:
                print("=" * 80)
                print("End of training. Saving Model......")
                # Save a trained model, only save the model itself
                model_to_save = model.module if hasattr(model, 'module') else model
                output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            '''

    if args.do_test == 'yes':
        start_time = time.time()
        print('___________________now testing for best eval f1 model_________________________')
        try:
            del model
        except NameError:
            pass
        gc.collect()

        args.do_train = 'no'
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config)
        model.half()
        # keep batch-norm layers in fp32 for numerical stability
        for layer in model.modules():
            if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm):
                layer.float()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError("Please install apex from "
                                  "https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False, label_name=args.label_name)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask).detach().cpu().numpy()
                # print('test_logits=', logits)
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            if flag == 'dev':
                print(flag, accuracy(logits, gold_labels))
            elif flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1', 'label_2']].to_csv(
                    os.path.join(args.output_dir, "sub.csv"), index=False)
                # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
            else:
                raise ValueError('flag not in [dev, test]')
        print('inference time used = {}s'.format(time.time() - start_time))
    '''
def main():
    seed_everything(69)

    outPath = f"{args.results}_{args.model_name}_YpUnet_AG_hnn234"
    if not os.path.exists(outPath):
        os.makedirs(outPath)
    ts = str(datetime.datetime.now()).split(".")[0].replace(" ", "_")
    ts = ts.replace(":", "_").replace("-", "_")
    file_path = os.path.join(outPath, "{}_run_{}.json".format(args.model_name, ts))

    ############## choose model ##############
    net = get_model(args.num_classes, args.model_name).to(device)
    if args.pre_train:
        net = torch.load(args.ckp)["model_state"]  # load the pretrained model
        print("loaded pre-trained model successfully")
    if torch.cuda.device_count() > 1:
        print("using multi gpu")
        net = torch.nn.DataParallel(net, device_ids=[0, 1, 2, 3])
    else:
        print("using one gpu")

    ########## hyper-parameter settings ##########
    # optimizer = Adam(net.parameters(), lr=args.lr)
    optimizer = RAdam(params=net.parameters(), lr=args.lr, weight_decay=0.0001)
    optimizer = Lookahead(optimizer)
    milestones = [5 + x * 30 for x in range(5)]
    scheduler_c = CyclicCosAnnealingLR(optimizer, milestones=milestones, eta_min=5e-5)
    # scheduler_r = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.2, patience=4, verbose=True)
    scheduler = LearningRateWarmUP(optimizer=optimizer, target_iteration=5,
                                   target_lr=0.003, after_scheduler=scheduler_c)
    # criterion = FocalLoss2d().to(device)
    # criterion = BCEDiceLossWeighted().to(device)
    criterion = WeightedBceLoss().to(device)
    # criterion2 = WeightedBceLoss().to(device)

    ########## prepare dataset ##########
    train_loader, val_loader = build_loader(batch_size=args.batch_size, num_workers=4)
    history = collections.defaultdict(list)

    best_miou = -100
    for epoch in range(args.num_epochs):
        print("Epoch: {}/{}".format(epoch + 1, args.num_epochs))
        # optimizer.step()
        # scheduler.step(epoch)

        #################### train ####################
        train_hist = train(train_loader, args.num_classes, device, net, optimizer, criterion)
        print('loss', train_hist["loss"], 'miou', train_hist["miou"],
              'fg_iou', train_hist["fg_iou"], 'mcc', train_hist["mcc"])
        for k, v in train_hist.items():
            history["train " + k].append(v)

        #################### valid ####################
        val_hist = validate(val_loader, args.num_classes, device, net, scheduler, criterion)
        print('loss', val_hist["loss"], 'miou', val_hist["miou"],
              'fg_iou', val_hist["fg_iou"], 'mcc', val_hist["mcc"])
        if val_hist["miou"] > best_miou:
            state = {"epoch": epoch + 1, "model_state": net, "best_miou": val_hist["miou"]}
            checkpoint = f'{args.model_name}_val_{val_hist["miou"]}_epoch{epoch + 1}.pth'
            torch.save(state, os.path.join(outPath, checkpoint))  # save the best model so far
            print("The model has been saved successfully!")
            best_miou = val_hist["miou"]
        for k, v in val_hist.items():
            history["val " + k].append(v)

        with open(file_path, "w") as f:
            f.write(json.dumps(history))
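# seed_everything(69) is called above but not defined in this excerpt. A
# common implementation (an assumption about this repo's helper) seeds all
# the RNGs that can affect training:
import os
import random

import numpy as np
import torch

def seed_everything(seed):
    # seed every RNG that can affect training, for reproducibility
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True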
    shuffle=True,
    num_workers=config.data_loader_numworkers)

# load data for testing
test_loader = torch.utils.data.DataLoader(
    RoadSequenceDataset(file_path=config.test_path, transforms=op_tranforms),
    batch_size=args.test_batch_size,
    shuffle=False,
    num_workers=1)

# load model
model = UNet_TwoConvGRU(3, 2).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam with betas=(0.9, 0.99)
# optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.99))
optimizer = RAdam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
# optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
class_weight = torch.Tensor(config.class_weight)
criterion = torch.nn.CrossEntropyLoss(weight=class_weight).to(device)
criterion2 = torch.nn.MSELoss().to(device)
# best_acc = 0

if config.pretrained_path:
    print('loading------------------')
    pretrained_dict = torch.load(config.pretrained_path)
    model_dict = model.state_dict()
    # keep only weights whose names match the current model
    pretrained_dict_1 = {
        k: v for k, v in pretrained_dict.items() if (k in model_dict)
# schedule = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.333, patience=2, verbose=True)
criterion = nn.CrossEntropyLoss()
# criterion = LabelSmoothSoftmaxCE(lb_pos=0.9, lb_neg=5e-3)

'''
dataset.get_step() returns the total number of iterations over the data
'''
best_score = 0
print(max_epoch)
print('------------------start training------------------------')
for i, net in enumerate(nets):
    min_loss = 1000
    print('----------------------start net{}---------------------'.format(i))
    optimizer = RAdam(net.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    schedule = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.333,
                                                    patience=5, verbose=True)
    for step in range(max_epoch):
        # model.save_model(net, MODEL_PATH, overwrite=True)
        # break
        train_loss, train_acc = train(net, train_loader, optimizer, criterion)
        valid_loss, valid_acc = valid(net, val_loader, optimizer, criterion)
        schedule.step(valid_loss)
        # '''
        # implement your own model-saving logic here
        # '''
        # if valid_loss < min_loss:
        model.save_model(net, MODEL_PATH, name='net_' + str(i) + '.pkl', overwrite=False)
def main():
    num_epochs = 100
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_data()
    model = CNN_LSTM()
    CUDA = torch.cuda.is_available()
    if CUDA:
        model = model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = RAdam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4, momentum=0.9)

    train_loss = []
    train_accuracy = []
    valid_loss = []
    valid_accuracy = []
    for epoch in range(num_epochs):
        start = time.time()
        # adjust_learning_rate(optimizer, epoch)
        loss_avg, accuracy_avg = train(train_x, train_y, model, criterion, optimizer)
        valid_loss_avg, valid_acc_avg = valid(valid_x, valid_y, model, criterion)
        train_loss.append(loss_avg)
        train_accuracy.append(accuracy_avg)
        valid_loss.append(valid_loss_avg)
        valid_accuracy.append(valid_acc_avg)
        stop = time.time()
        print('Epoch: [%d | %d] LR: %f Train: Loss_Avg=%f Accuracy_Avg=%f Valid: Loss=%f Accuracy=%f Time: %f'
              % (epoch + 1, num_epochs, learning_rate, loss_avg, accuracy_avg,
                 valid_loss_avg, valid_acc_avg, stop - start))

    test_accuracy = test(test_x, test_y, model)
    print('Test Accuracy: {:.3f} %'.format(test_accuracy))
    # torch.save(model, "base_cnn")

    if torch.cuda.is_available():
        inputs = test_x.cuda()
        labels = test_y.cuda()
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        correct = ((predicted == labels).sum().item()) / labels.size(0)
        print('Test Accuracy: {:.3f} %'.format(100 * correct))

        print("Precision")
        print(precision_score(predicted.cpu().numpy(), labels.cpu().numpy(), average=None))
        print(precision_score(predicted.cpu().numpy(), labels.cpu().numpy(), average='weighted'))
        print("Recall")
        print(recall_score(predicted.cpu().numpy(), labels.cpu().numpy(), average=None))
        print(recall_score(predicted.cpu().numpy(), labels.cpu().numpy(), average='weighted'))
        print("F1 Accuracy:", sk.metrics.f1_score(predicted.cpu().numpy(), labels.cpu().numpy(), average=None))
        print("F1 Accuracy:", sk.metrics.f1_score(predicted.cpu().numpy(), labels.cpu().numpy(), average='weighted'))

        np.save("predicted", predicted.cpu().numpy())
        np.save("labels", labels.cpu().numpy())

        print("Confusion Matrix:")
        class_names = np.array(['Start_gesture', 'Unknown'])
        plot_confusion_matrix(predicted.cpu().numpy(), labels.cpu().numpy(), classes=class_names,
                              normalize=True, title='Normalized confusion matrix')
        plt.figure(figsize=(10, 10))
        plt.savefig('foo.png')
        plt.show()
        print()

    # Loss
    f = plt.figure(figsize=(10, 10))
    plt.plot(train_loss, label='Training Loss')
    plt.plot(valid_loss, label='Valid Loss')
    plt.legend()
    plt.show()

    # Accuracy
    f = plt.figure(figsize=(10, 10))
    plt.plot(train_accuracy, label='Training Accuracy')
    plt.plot(valid_accuracy, label='Valid Accuracy')
    plt.legend()
    plt.show()

    np.save('train_loss', np.asarray(train_loss))
    np.save('train_accuracy', np.asarray(train_accuracy))
    np.save('valid_loss', np.asarray(valid_loss))
    np.save('valid_accuracy', np.asarray(valid_accuracy))
def train(train_loader, val_loader, model, device, experiment_name,
          epochs=20, optimizer_name='radam', lr=1e-3, weight_decay=0,
          unfreeze_extractor_epoch=0, extractor_lr=1e-4, log_interval=50):
    freeze_extractor = True
    model.feature_extractor.toggle_extractor(freeze=True)

    parameter_groups = [
        {'params': filter_params(model.parameters()), 'lr': lr},
    ]
    if optimizer_name == 'adam':
        optimizer = optim.Adam(parameter_groups, lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
    elif optimizer_name == 'radam':
        from radam import RAdam
        optimizer = RAdam(parameter_groups, lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(parameter_groups, lr=lr, weight_decay=weight_decay)
    else:
        raise ValueError(f'Unknown optimizer {optimizer_name}')

    for epoch in range(1, epochs + 1):
        print('Learning rate is {}'.format(optimizer.param_groups[0]['lr']))
        if epoch - 1 == unfreeze_extractor_epoch:
            print('Unfreezing extractor')
            freeze_extractor = False
            model.feature_extractor.toggle_extractor(freeze=False)
            pgroup = {
                'params': filter_params(model.feature_extractor.extractor.parameters()),
                'lr': extractor_lr,
            }
            optimizer.add_param_group(pgroup)

        start_time = time.time()
        train_epoch(train_loader, model, optimizer, device, epoch, log_interval,
                    freeze_extractor=freeze_extractor)
        end_time = time.time()
        print(f'Epoch took {end_time - start_time:.2f} seconds')
        val_epoch(val_loader, model, device)
        save_model(model, experiment_name)

    return model
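# filter_params is used above to build the parameter groups but is not shown
# in this excerpt. A minimal sketch consistent with its use here (hypothetical,
# since the real helper is defined elsewhere) keeps only trainable tensors,
# which matters because the feature extractor starts out frozen:
def filter_params(params):
    # keep only tensors that will actually receive gradients;
    # frozen extractor weights (requires_grad=False) are skipped
    return [p for p in params if p.requires_grad]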
EGG_prior.load_state_dict(torch.load('./models/AAI_EGGAE/best_val-cosloss-ranger.pth'))
EGG_prior.eval()

STZ = FCEncoder()
STZ.cuda()
ZTE = FCDecoder()
ZTE.cuda()
DC = SimpleDiscriminator()
DC.cuda()

# encoder/decoder optimizers
# STZ_optimizer_gen = torch.optim.Adam(STZ.parameters(), lr=gen_lr)
# ZTE_optimizer = torch.optim.Adam(ZTE.parameters(), lr=gen_lr)
STZ_optimizer_gen = RAdam(STZ.parameters(), lr=gen_lr)
ZTE_optimizer = RAdam(ZTE.parameters(), lr=gen_lr)
STZ_optimizer_gen = Lookahead(STZ_optimizer_gen, alpha=0.5, k=5)
ZTE_optimizer = Lookahead(ZTE_optimizer, alpha=0.5, k=5)

# regularizing optimizers
# STZ_optimizer_enc = torch.optim.Adam(STZ.parameters(), lr=reg_lr)
# DC_optimizer = torch.optim.Adam(DC.parameters(), lr=reg_lr)
STZ_optimizer_enc = RAdam(STZ.parameters(), lr=reg_lr)
DC_optimizer = RAdam(DC.parameters(), lr=reg_lr)
STZ_optimizer_enc = Lookahead(STZ_optimizer_enc, alpha=0.5, k=5)
DC_optimizer = Lookahead(DC_optimizer, alpha=0.5, k=5)
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
elif opt.optimizer == 'adam':
    optimizer = optim.Adam(
        parameters,
        lr=opt.learning_rate,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=opt.weight_decay)
elif opt.optimizer == 'radam':
    optimizer = RAdam(
        parameters,
        lr=opt.learning_rate,
        betas=(0.9, 0.999),
        weight_decay=opt.weight_decay)

normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

## prepare train
if not opt.no_train:
    temporal_transform = TemporalSegmentRandomCrop(opt.segment_number, opt.sample_duration)
    assert opt.train_crop in ['random', 'corner', 'center']
    if opt.train_crop == 'random':
        sceobj_crop_method = MultiScaleRandomCrop(opt.scales, opt.sceobj_frame_size)
    elif opt.train_crop == 'corner':
        sceobj_crop_method = MultiScaleCornerCrop(opt.scales, opt.sceobj_frame_size)
    elif opt.train_crop == 'center':
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoder', type=str, default='efficientnet-b0')
    parser.add_argument('--model', type=str, default='unet')
    parser.add_argument('--pretrained', type=str, default='imagenet')
    parser.add_argument('--logdir', type=str, default='../logs/')
    parser.add_argument('--exp_name', type=str)
    parser.add_argument('--data_folder', type=str, default='../input/')
    parser.add_argument('--height', type=int, default=320)
    parser.add_argument('--width', type=int, default=640)
    parser.add_argument('--batch_size', type=int, default=2)
    parser.add_argument('--accumulate', type=int, default=8)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--enc_lr', type=float, default=1e-2)
    parser.add_argument('--dec_lr', type=float, default=1e-3)
    parser.add_argument('--optim', type=str, default="radam")
    parser.add_argument('--loss', type=str, default="bcedice")
    parser.add_argument('--schedule', type=str, default="rlop")
    parser.add_argument('--early_stopping', type=bool, default=True)
    args = parser.parse_args()

    encoder = args.encoder
    model = args.model
    pretrained = args.pretrained
    logdir = args.logdir
    name = args.exp_name
    data_folder = args.data_folder
    height = args.height
    width = args.width
    bs = args.batch_size
    accumulate = args.accumulate
    epochs = args.epochs
    enc_lr = args.enc_lr
    dec_lr = args.dec_lr
    optim = args.optim
    loss = args.loss
    schedule = args.schedule
    early_stopping = args.early_stopping

    if model == 'unet':
        model = smp.Unet(encoder_name=encoder, encoder_weights=pretrained, classes=4, activation=None)
    if model == 'fpn':
        model = smp.FPN(encoder_name=encoder, encoder_weights=pretrained, classes=4, activation=None)
    if model == 'pspnet':
        model = smp.PSPNet(encoder_name=encoder, encoder_weights=pretrained, classes=4, activation=None)
    if model == 'linknet':
        model = smp.Linknet(encoder_name=encoder, encoder_weights=pretrained, classes=4, activation=None)
    if model == 'aspp':
        print('aspp can only be used with resnet34')
        model = aspp(num_class=4)

    preprocessing_fn = smp.encoders.get_preprocessing_fn(encoder, pretrained)
    log = os.path.join(logdir, name)

    ds = get_dataset(path=data_folder)
    prepared_ds = prepare_dataset(ds)
    train_set, valid_set = get_train_test(ds)
    train_ds = CloudDataset(df=prepared_ds, datatype='train', img_ids=train_set,
                            transforms=training1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)
    valid_ds = CloudDataset(df=prepared_ds, datatype='train', img_ids=valid_set,
                            transforms=valid1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)

    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True,
                              num_workers=multiprocessing.cpu_count())
    valid_loader = DataLoader(valid_ds, batch_size=bs, shuffle=False,
                              num_workers=multiprocessing.cpu_count())
    loaders = {'train': train_loader, 'valid': valid_loader}
    num_epochs = epochs

    if args.model != "aspp":
        # separate learning rates for encoder and decoder
        param_groups = [
            {'params': model.encoder.parameters(), 'lr': enc_lr},
            {'params': model.decoder.parameters(), 'lr': dec_lr},
        ]
    elif args.model == 'aspp':
        param_groups = [{'params': model.parameters(), 'lr': enc_lr}]
    if optim == "radam":
        optimizer = RAdam(param_groups)
    if optim == "adam":
        optimizer = Adam(param_groups)
    if optim == "adamw":
        optimizer = AdamW(param_groups)
    if optim == "sgd":
        optimizer = SGD(param_groups)

    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5)  # default, overridden below
    if schedule == "rlop":
        scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=3)
    if schedule == "noam":
        scheduler = NoamLR(optimizer, 10)

    if loss == "bcedice":
        criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    if loss == "dice":
        criterion = smp.utils.losses.DiceLoss(eps=1.)
    if loss == "bcejaccard":
        criterion = smp.utils.losses.BCEJaccardLoss(eps=1.)
    if loss == "jaccard":
        criterion = smp.utils.losses.JaccardLoss(eps=1.)  # the original had '==' here, a no-op comparison
    if loss == 'bce':
        criterion = NewBCELoss()

    callbacks = [NewDiceCallback(), CriterionCallback()]
    callbacks.append(OptimizerCallback(accumulation_steps=accumulate))
    if early_stopping:
        callbacks.append(EarlyStoppingCallback(patience=5, min_delta=0.001))

    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=log,
        num_epochs=num_epochs,
        verbose=True,
    )
def train(args):
    model, model_file = create_model(args.encoder_type, work_dir=args.work_dir)
    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
    model = model.cuda()

    loaders = get_train_val_loaders(args.encoder_type, batch_size=args.batch_size, ifold=args.ifold)

    # optimizer = RAdam([
    #     {'params': model.decoder.parameters(), 'lr': args.lr},
    #     {'params': model.encoder.parameters(), 'lr': args.lr / 10.},
    # ])
    if args.optim_name == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=args.lr)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=args.factor,
                                         patience=args.patience, min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr)

    best_metrics = 0.
    best_key = 'dice'

    print('epoch |    lr    |    %    |  loss  |  avg   | dloss  | closs  |  dice  |  best  |  time | save |')

    if not args.no_first_val:
        val_metrics = validate(args, model, loaders['valid'])
        print('val   |          |         |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} |       |      |'.format(
            val_metrics['dice_loss'], val_metrics['cls_loss'], val_metrics['dice'], val_metrics['dice']))
        best_metrics = val_metrics[best_key]

    if args.val:
        return

    model.train()
    # if args.lrs == 'plateau':
    #     lr_scheduler.step(best_metrics)
    # else:
    #     lr_scheduler.step()

    train_iter = 0
    for epoch in range(args.num_epochs):
        train_loss = 0
        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(loaders['train']):
            train_iter += 1
            img, mask_targets, cls_targets = data[0].cuda(), data[1][0].cuda(), data[1][1].cuda()
            batch_size = img.size(0)

            outputs = model(img)
            dice_loss, cls_loss = criterion(outputs, (mask_targets, cls_targets))
            ((dice_loss + cls_loss) * batch_size).backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += dice_loss.item() + cls_loss.item()
            print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1),
                loaders['train'].num, dice_loss.item() + cls_loss.item(),
                train_loss / (batch_idx + 1)), end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                save_model(model, model_file + '_latest')
                val_metrics = validate(args, model, loaders['valid'])

                _save_ckp = ''
                if val_metrics[best_key] > best_metrics:
                    best_metrics = val_metrics[best_key]
                    save_model(model, model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
                    val_metrics['dice_loss'], val_metrics['cls_loss'], val_metrics['dice'],
                    best_metrics, (time.time() - bg) / 60, _save_ckp))

                model.train()
                if args.lrs == 'plateau':
                    lr_scheduler.step(best_metrics)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
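# get_lrs(optimizer), used in the training loop above (and in a later snippet),
# is not defined in this excerpt. A minimal sketch, assuming the helper simply
# reads the current rates out of the parameter groups:
def get_lrs(optimizer):
    # one learning rate per parameter group, as last set by the scheduler
    return [group['lr'] for group in optimizer.param_groups]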
    from model.cain_noca import CAIN_NoCA
    print("Building model: CAIN_NoCA")
    model = CAIN_NoCA(depth=args.depth)
else:
    raise NotImplementedError("Unknown model!")

# Just make every model DataParallel
model = torch.nn.DataParallel(model).to(device)
# print(model)

##### Define Loss & Optimizer #####
criterion = Loss(args)

args.radam = False  # hard-coded: the RAdam branch below is currently disabled
if args.radam:
    from radam import RAdam
    optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
else:
    from torch.optim import Adam
    optimizer = Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
print('# of parameters: %d' % sum(p.numel() for p in model.parameters()))

# If resume, load checkpoint: model + optimizer
if args.resume:
    utils.load_checkpoint(args, model, optimizer)
    it = args.it

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, verbose=True)
def create_custom_optimizer(tvars, loss, bert_init_lr, task_init_lr, num_train_steps,
                            num_warmup_steps, use_tpu, global_step=None, freeze=-1,
                            task_opt='adam', eps=1e-6):
    """Creates an optimizer training op."""
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()

    bert_learning_rate = tf.constant(value=bert_init_lr, shape=[], dtype=tf.float32)
    task_learning_rate = tf.constant(value=task_init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    bert_learning_rate = tf.train.polynomial_decay(bert_learning_rate, global_step, num_train_steps,
                                                   end_learning_rate=0.0, power=1.0, cycle=False)
    task_learning_rate = tf.train.polynomial_decay(task_learning_rate, global_step, num_train_steps,
                                                   end_learning_rate=0.0, power=1.0, cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        bert_warmup_learning_rate = bert_init_lr * warmup_percent_done
        task_warmup_learning_rate = task_init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        bert_learning_rate = ((1.0 - is_warmup) * bert_learning_rate
                              + is_warmup * bert_warmup_learning_rate)

    # It is recommended that you use this optimizer for fine-tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint).
    bert_optimizer = AdamWeightDecayOptimizer(
        learning_rate=bert_learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=eps,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if task_opt == 'adam_weight_decay':
        task_optimizer = AdamWeightDecayOptimizer(
            learning_rate=task_learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=eps)
    elif task_opt == 'adam':
        task_optimizer = tf.train.AdamOptimizer(learning_rate=task_learning_rate)
    elif task_opt == 'radam':
        task_optimizer = RAdam(learning_rate=task_learning_rate, epsilon=1e-8, beta1=0.9, beta2=0.999)
    else:
        raise NotImplementedError('Check optimizer. {} is invalid.'.format(task_opt))

    # tvars = tf.trainable_variables()
    # Split trainable variables into BERT and task groups; optionally leave
    # BERT layers below `freeze` out of both groups (i.e. frozen).
    bert_vars, task_vars = [], []
    for var in tvars:
        if var.name.startswith('bert'):
            can_optimize = False
            if var.name.startswith('bert/encoder/layer_') and int(
                    var.name.split('/')[2][len('layer_'):]) >= freeze:
                can_optimize = True
            if freeze == -1 or can_optimize:
                bert_vars.append(var)
        else:
            task_vars.append(var)
    print('bert:task', len(bert_vars), len(task_vars))

    grads = tf.gradients(loss, bert_vars + task_vars)
    bert_grads = grads[:len(bert_vars)]
    task_grads = grads[len(bert_vars):]

    # This is how the model was pre-trained.
    (bert_grads, _) = tf.clip_by_global_norm(bert_grads, clip_norm=1.0)
    (task_grads, _) = tf.clip_by_global_norm(task_grads, clip_norm=1.0)

    # global_step1 = tf.Print(global_step, [global_step], 'before')
    bert_train_op = bert_optimizer.apply_gradients(zip(bert_grads, bert_vars), global_step=global_step)
    task_train_op = task_optimizer.apply_gradients(zip(task_grads, task_vars), global_step=global_step)

    if task_opt == 'adam_weight_decay':
        # AdamWeightDecayOptimizer does not increment global_step itself,
        # so it is bumped manually here.
        new_global_step = global_step + 1
        train_op = tf.group(bert_train_op, task_train_op, [global_step.assign(new_global_step)])
    else:
        train_op = tf.group(bert_train_op, task_train_op)
    return train_op
def train_vae(loader, device, stats_logger, lr=1e-3, schedule_lr=False, latent_dims=16,
              epochs=100, optimizer_name='adam', adam_beta1=0.5, loss_weights=None,
              extractor_lr=1e-5, clip_gradients=None, encoder_class=vae.Encoder,
              decoder_class=vae.Decoder, schedule_classes=None, beta_schedule_class=BetaSchedule):
    """Entry point for VAE training."""
    repr_dims = loader.dataset.dims
    encoder = encoder_class(repr_dims, latent_dims)
    decoder = decoder_class(repr_dims, latent_dims)
    model = vae.VAE(encoder, decoder).to(device)

    if schedule_lr:
        # LambdaLR multiplies the initial learning rate with the value
        # returned from the lambda each epoch. If we want to directly use the
        # value returned from the lambda as the learning rate, we can set an
        # initial learning rate of 1.
        initial_lr = lr
        lr = 1.0

    parameter_groups = [
        {'params': model.parameters(), 'lr': lr},
    ]
    if optimizer_name == 'adam':
        optimizer = optim.Adam(parameter_groups, lr=lr, betas=(adam_beta1, 0.999))
    elif optimizer_name == 'radam':
        from radam import RAdam
        optimizer = RAdam(parameter_groups, lr=lr, betas=(adam_beta1, 0.999))
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(parameter_groups, lr=lr)
    else:
        raise ValueError(f'Unknown optimizer {optimizer_name}')

    if schedule_lr:
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, LRSchedule(initial_lr))

    if loss_weights is None:
        loss_weights = {}
    else:
        assert isinstance(loss_weights, dict), \
            'Loss weights must be a dictionary `loss_name -> weight`'

    if schedule_classes is None:
        schedule_classes = {}
    else:
        assert isinstance(schedule_classes, dict), \
            'schedule_classes must be a dictionary `loss_name -> schedule_class`'
    schedule_classes['KLD'] = beta_schedule_class
    loss_schedules = {name: schedule_class()
                      for name, schedule_class in schedule_classes.items()}

    print('Training VAE on features...')
    for epoch in range(1, epochs + 1):
        print('Learning rate is {}'.format(optimizer.param_groups[0]['lr']))

        for name, schedule in loss_schedules.items():
            if name == 'KLD':
                # Special case for KLD's weight (beta)
                if isinstance(schedule, BetaSchedule):
                    beta = schedule.get_beta(epoch - 1)
                    loss_weights['KLD'] = beta
                    print(f'Beta is {beta}')
            else:
                loss_weights[name] = schedule.get_beta(epoch - 1)

        if model.reg_loss.use_bayes_factor_vae0_loss:
            variances = (1 / model.reg_loss.log_precision.exp()).cpu().detach().numpy()
            print(variances[variances > 1])

        start_time = time.time()
        epoch_stats = train_epoch(loader, model, optimizer, device, epoch, 1,
                                  loss_weights, stats_logger, clip_gradients)
        end_time = time.time()
        print(f'Epoch took {end_time - start_time:.2f} seconds')
        stats_logger.append(epoch - 1, epoch_stats)

        if schedule_lr:
            lr_scheduler.step()

    return model
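# BetaSchedule.get_beta(epoch) drives the KL annealing above, but its
# definition is outside this excerpt. A minimal linear warm-up sketch of such
# a schedule (an illustration with hypothetical parameters, not the repo's
# actual class):
class LinearBetaSchedule:
    """Hypothetical KL-weight schedule: ramp beta from 0 to beta_max over
    `warmup_epochs`, then hold it constant."""

    def __init__(self, beta_max=1.0, warmup_epochs=20):
        self.beta_max = beta_max
        self.warmup_epochs = warmup_epochs

    def get_beta(self, epoch):
        return self.beta_max * min(1.0, epoch / self.warmup_epochs)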
    attention=True,
)
model = model.to(device)

# convert to half precision, but keep batch-norm layers in fp32
model.half()
for layer in model.modules():
    if isinstance(layer, nn.BatchNorm2d):
        layer.float()

# Add weights from checkpoint model if specified
if opt.checkpoint_model:
    model.load_state_dict(torch.load(opt.checkpoint_model))

optimizer = RAdam(model.parameters(), lr=0.0001, eps=1e-04)

# Starting the epoch loop
for epoch in range(opt.num_epochs):
    # epoch += 104
    epoch_metrics = {"loss": [], "acc": []}
    prev_time = time.time()
    print(f"--- Epoch {epoch} ---")
    for batch_i, (X, y) in enumerate(train_dataloader):
        if X.size(0) == 1:
            continue
        image_sequences = Variable(X.to(device), requires_grad=True)
        labels = Variable(y.to(device), requires_grad=False)
        image_sequences = image_sequences.half()
# NB: in the original, the first branch was a bare `if`, so a plain 'sgd'
# choice fell through to the final `else` and was silently overwritten with
# momentum-SGD; an elif chain keeps each choice exclusive.
if args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)

# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer, lrs)
def lrs(batch):
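# Once the chain is an honest if/elif, an equivalent and more compact design
# is a name-to-constructor dispatch table. A sketch with a subset of the
# optimizers (not the repo's code; args, net and the optimizer classes are
# assumed to be in scope as above):
OPTIMIZERS = {
    'adam': lambda p: torch.optim.Adam(p, lr=args.lr, weight_decay=args.weight_decay),
    'adagrad': lambda p: optim.Adagrad(p, lr=args.lr, weight_decay=args.weight_decay),
    'radam': lambda p: RAdam(p, lr=args.lr, weight_decay=args.weight_decay),
}

def build_optimizer(name, params):
    # fall back to SGD with momentum, as the original else branch does
    default = lambda p: optim.SGD(p, lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    return OPTIMIZERS.get(name.lower(), default)(params)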
def train(args):
    print('start training...')
    model, model_file = create_model(args)
    train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size,
                                                     val_batch_size=args.val_batch_size)
    # train_loader, val_loader = get_frame_train_loader(batch_size=args.batch_size, val_batch_size=args.val_batch_size)
    # model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0001)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=args.factor,
                                         patience=args.patience, min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr)

    model = model.cuda()
    if torch.cuda.device_count() > 1:
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name
    # model = model.train()

    best_f2 = 0.
    best_key = 'top1'

    print('epoch |    lr     |    %    |  loss  |  avg   |  loss  |  top1  | top10  |  best  |  time | save |')

    if not args.no_first_val:
        val_metrics = validate(args, model, val_loader)
        print('val   |           |         |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} |       |      |'.format(
            val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'], val_metrics[best_key]))
        best_f2 = val_metrics[best_key]

    if args.val:
        return

    model.train()
    if args.lrs == 'plateau':
        lr_scheduler.step(best_f2)
    else:
        lr_scheduler.step()

    train_iter = 0
    for epoch in range(args.start_epoch, args.num_epochs):
        # train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size, val_num=args.val_num)
        train_loss = 0
        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(train_loader):
            train_iter += 1
            # if train_loader.seg:
            rgb, audio, labels = [x.cuda() for x in data]
            # else:
            #     rgb, audio, labels = data[0].cuda(), data[2].cuda(), data[4].cuda()

            output = model(rgb, audio)
            loss = criterion(output, labels)
            batch_size = rgb.size(0)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # with amp.scale_loss(loss, optimizer) as scaled_loss:
            #     scaled_loss.backward()

            train_loss += loss.item()
            print('\r {:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1),
                train_loader.num, loss.item(), train_loss / (batch_idx + 1)), end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(), model_file + '_latest')
                else:
                    torch.save(model.state_dict(), model_file + '_latest')

                val_metrics = validate(args, model, val_loader)

                _save_ckp = ''
                if args.always_save or val_metrics[best_key] > best_f2:
                    best_f2 = val_metrics[best_key]
                    if isinstance(model, DataParallel):
                        torch.save(model.module.state_dict(), model_file)
                    else:
                        torch.save(model.state_dict(), model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
                    val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'],
                    best_f2, (time.time() - bg) / 60, _save_ckp))

                model.train()
                if args.lrs == 'plateau':
                    lr_scheduler.step(best_f2)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
def __init__(self, log_dir, cfg):
    self.path = log_dir
    self.cfg = cfg

    if cfg.TRAIN.FLAG:
        self.model_dir = os.path.join(self.path, 'Model')
        self.log_dir = os.path.join(self.path, 'Log')
        mkdir_p(self.model_dir)
        mkdir_p(self.log_dir)
        self.writer = SummaryWriter(log_dir=self.log_dir)
        self.logfile = os.path.join(self.path, "logfile.log")
        sys.stdout = Logger(logfile=self.logfile)

    self.data_dir = cfg.DATASET.DATA_DIR
    self.max_epochs = cfg.TRAIN.MAX_EPOCHS
    self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

    s_gpus = cfg.GPU_ID.split(',')
    self.gpus = [int(ix) for ix in s_gpus]
    self.num_gpus = len(self.gpus)

    self.batch_size = cfg.TRAIN.BATCH_SIZE
    self.lr = cfg.TRAIN.LEARNING_RATE

    torch.cuda.set_device(self.gpus[0])
    cudnn.benchmark = True

    sample = cfg.SAMPLE
    self.dataset = []
    self.dataloader = []
    self.use_feats = cfg.model.use_feats
    eval_split = cfg.EVAL if cfg.EVAL else 'val'
    train_split = cfg.DATASET.train_split
    if cfg.DATASET.DATASET == 'clevr':
        clevr_collate_fn = collate_fn
        cogent = cfg.DATASET.COGENT
        if cogent:
            print(f'Using CoGenT {cogent.upper()}')
        if cfg.TRAIN.FLAG:
            self.dataset = ClevrDataset(data_dir=self.data_dir, split=train_split + cogent,
                                        sample=sample, **cfg.DATASET.params)
            self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE,
                                         shuffle=True, num_workers=cfg.WORKERS, drop_last=True,
                                         collate_fn=clevr_collate_fn)
        self.dataset_val = ClevrDataset(data_dir=self.data_dir, split=eval_split + cogent,
                                        sample=sample, **cfg.DATASET.params)
        self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=cfg.TEST_BATCH_SIZE,
                                         drop_last=False, shuffle=False, num_workers=cfg.WORKERS,
                                         collate_fn=clevr_collate_fn)
    elif cfg.DATASET.DATASET == 'gqa':
        if self.use_feats == 'spatial':
            gqa_collate_fn = collate_fn_gqa
        elif self.use_feats == 'objects':
            gqa_collate_fn = collate_fn_gqa_objs
        if cfg.TRAIN.FLAG:
            self.dataset = GQADataset(data_dir=self.data_dir, split=train_split, sample=sample,
                                      use_feats=self.use_feats, **cfg.DATASET.params)
            self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE,
                                         shuffle=True, num_workers=cfg.WORKERS, drop_last=True,
                                         collate_fn=gqa_collate_fn)
        self.dataset_val = GQADataset(data_dir=self.data_dir, split=eval_split, sample=sample,
                                      use_feats=self.use_feats, **cfg.DATASET.params)
        self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=cfg.TEST_BATCH_SIZE,
                                         shuffle=False, num_workers=cfg.WORKERS, drop_last=False,
                                         collate_fn=gqa_collate_fn)

    # load model
    self.vocab = load_vocab(cfg)
    self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
    self.weight_moving_average(alpha=0)
    if cfg.TRAIN.RADAM:
        self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
    else:
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    self.start_epoch = 0
    if cfg.resume_model:
        location = 'cuda' if cfg.CUDA else 'cpu'
        state = torch.load(cfg.resume_model, map_location=location)
        self.model.load_state_dict(state['model'])
        self.optimizer.load_state_dict(state['optim'])
        self.start_epoch = state['iter'] + 1
        state = torch.load(cfg.resume_model_ema, map_location=location)
        self.model_ema.load_state_dict(state['model'])

    if cfg.start_epoch is not None:
        self.start_epoch = cfg.start_epoch

    self.previous_best_acc = 0.0
    self.previous_best_epoch = 0
    self.previous_best_loss = 100
    self.previous_best_loss_epoch = 0

    self.total_epoch_loss = 0
    self.prior_epoch_loss = 10

    self.print_info()
    self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

    self.comet_exp = Experiment(
        project_name=cfg.COMET_PROJECT_NAME,
        api_key=os.getenv('COMET_API_KEY'),
        workspace=os.getenv('COMET_WORKSPACE'),
        disabled=cfg.logcomet is False,
    )
    if cfg.logcomet:
        exp_name = cfg_to_exp_name(cfg)
        print(exp_name)
        self.comet_exp.set_name(exp_name)
        self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
        self.comet_exp.log_asset(self.logfile)
        self.comet_exp.log_asset_data(json.dumps(cfg, indent=4), file_name='cfg.json')
        self.comet_exp.set_model_graph(str(self.model))
        if cfg.cfg_file:
            self.comet_exp.log_asset(cfg.cfg_file)

    with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
        json.dump(cfg, f, indent=4)
def focal_loss(y_true, y_pred):
    alpha, gamma = 0.25, 2
    y_pred = K.clip(y_pred, 1e-8, 1 - 1e-8)
    return - alpha * y_true * K.log(y_pred) * (1 - y_pred)**gamma \
           - (1 - alpha) * (1 - y_true) * K.log(1 - y_pred) * y_pred**gamma

loss1 = focal_loss(a1_in, pa1)
loss1 = K.sum(loss1 * p_mask[..., 0]) / K.sum(p_mask)
loss2 = focal_loss(a2_in, pa2)
loss2 = K.sum(loss2 * p_mask[..., 0]) / K.sum(p_mask)
loss = (loss1 + loss2) * 100  # scale by 100 for readability; does not affect Adam's optimization

train_model.add_loss(loss)
train_model.compile(optimizer=RAdam(1e-3))


class ExponentialMovingAverage:
    """Exponential moving average of the model weights.

    Usage: after model.compile and before the first training step,
    initialize the object, then call the inject method.
    """
    def __init__(self, model, momentum=0.9999):
        self.momentum = momentum
        self.model = model
        self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights]

    def inject(self):
        """Add the EMA update ops to model.metrics_updates."""
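# A minimal usage sketch for the EMA class above, following its docstring:
# create it after compile and inject before the first fit. The training
# inputs below are hypothetical placeholders, not from this snippet.
EMAer = ExponentialMovingAverage(train_model)  # initialize the object
EMAer.inject()                                 # register the EMA update ops
train_model.fit(train_inputs, epochs=1)        # EMA weights now track training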
                    help='base learning rate (default: 0.1)')
args = parser.parse_args()

if args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=1)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'
# log = open(optname + 'log.txt', 'w+')

def lrs(batch):
    low = math.log2(1e-5)
def train(opt):
    """ dataset preparation """
    if not opt.data_filtering_off:
        print('Filtering the images containing characters which are not in opt.character')
        print('Filtering the images whose label is longer than opt.batch_max_length')
        # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130

    opt.select_data = opt.select_data.split('-')
    opt.batch_ratio = opt.batch_ratio.split('-')
    train_dataset = Batch_Balanced_Dataset(opt)

    log = open(f'./saved_models/{opt.experiment_name}/log_dataset.txt', 'a')
    AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD)
    valid_dataset, valid_dataset_log = hierarchical_dataset(root=opt.valid_data, opt=opt)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=opt.batch_size,
        shuffle=True,  # 'True' to check training progress with validation function.
        num_workers=int(opt.workers),
        collate_fn=AlignCollate_valid, pin_memory=True)
    log.write(valid_dataset_log)
    print('-' * 80)
    log.write('-' * 80 + '\n')
    log.close()

    """ model configuration """
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    elif opt.Prediction == 'None':
        converter = TransformerConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial,
          opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class,
          opt.batch_max_length, opt.Transformation, opt.FeatureExtraction,
          opt.SequenceModeling, opt.Prediction)

    # weight initialization
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception:  # for batchnorm
            if 'weight' in name:
                param.data.fill_(1)
            continue

    # data parallel for multi-GPU
    # model = torch.nn.DataParallel(model).to(device)
    model = model.to(device)
    model.train()
    if opt.load_from_checkpoint:
        model.load_state_dict(torch.load(os.path.join(opt.load_from_checkpoint, 'checkpoint.pth')))
        print(f'loaded checkpoint from {opt.load_from_checkpoint}...')
    elif opt.saved_model != '':
        print(f'loading pretrained model from {opt.saved_model}')
        if opt.SequenceModeling == 'Transformer':
            # keep only the feature-extraction weights from the saved model
            fe_state = OrderedDict()
            state_dict = torch.load(opt.saved_model)
            for k, v in state_dict.items():
                if k.startswith('module.FeatureExtraction'):
                    new_k = re.sub('module.FeatureExtraction.', '', k)
                    fe_state[new_k] = v
            model.FeatureExtraction.load_state_dict(fe_state)
        else:
            if opt.FT:
                model.load_state_dict(torch.load(opt.saved_model), strict=False)
            else:
                model.load_state_dict(torch.load(opt.saved_model))

    if opt.freeze_fe:
        model.freeze(['FeatureExtraction'])

    print("Model:")
    print(model)

    """ setup loss """
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).to(device)
    elif opt.Prediction == 'None':
        criterion = LabelSmoothingLoss(classes=converter.n_classes,
                                       padding_idx=converter.pad_idx, smoothing=0.1)
        # criterion = torch.nn.CrossEntropyLoss(ignore_index=converter.pad_idx)
    else:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)  # ignore [GO] token = ignore index 0
    # loss averager
    loss_avg = Averager()

    # keep only parameters that require gradient descent
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))
    # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())]

    # setup optimizer
    if opt.adam:
        assert opt.adam in ['Adam', 'AdamW', 'RAdam'], 'adam optimizer must be in Adam, AdamW or RAdam'
        if opt.adam == 'Adam':
            optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
        elif opt.adam == "AdamW":
            optimizer = optim.AdamW(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
        else:
            optimizer = RAdam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
    else:
        optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps)
    print("Optimizer:")
    print(optimizer)

    if opt.load_from_checkpoint and opt.load_optimizer_state:
        optimizer.load_state_dict(torch.load(os.path.join(opt.load_from_checkpoint, 'optimizer.pth')))
        print(f'loaded optimizer state from {os.path.join(opt.load_from_checkpoint, "optimizer.pth")}')

    """ final options """
    with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += f'{str(k)}: {str(v)}\n'
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)

    """ start training """
    start_iter = 0
    if opt.saved_model != '':
        try:
            start_iter = int(opt.saved_model.split('_')[-1].split('.')[0])
            print(f'continue to train, start_iter: {start_iter}')
        except Exception:
            pass
    if opt.load_from_checkpoint:
        with open(os.path.join(opt.load_from_checkpoint, 'iter.json'), mode='r', encoding='utf8') as f:
            start_iter = json.load(f)
            print(f'continue to train, start_iter: {start_iter}')

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = -1

    bar = tqdm(range(start_iter, opt.num_iter))
    for i in bar:
        bar.set_description(f'Iter {i}: train_loss = {loss_avg.val():.5f}')

        # train part
        image_tensors, labels = train_dataset.get_batch()
        image = image_tensors.to(device)
        text, length = converter.encode(labels, batch_max_length=opt.batch_max_length)
        batch_size = image.size(0)

        if 'CTC' in opt.Prediction:
            preds = model(image, text).log_softmax(2)
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)

            # (ctc_a) For PyTorch 1.2.0 and 1.3.0: to avoid a ctc_loss issue,
            # disable cudnn for the computation of the ctc_loss.
            # https://github.com/jpuigcerver/PyLaia/issues/16
            torch.backends.cudnn.enabled = False
            cost = criterion(preds, text.to(device), preds_size.to(device), length.to(device))
            torch.backends.cudnn.enabled = True

            # (ctc_b) To reproduce the pretrained model / paper, use the line below
            # instead of (ctc_a). With PyTorch 1.2.0 it produces NaN, so PyTorch 1.1.0
            # may be needed; the CTCLoss result differs between 1.1.0 and 1.2.0.
            # https://github.com/clovaai/deep-text-recognition-benchmark/issues/56#issuecomment-526490707
            # cost = criterion(preds, text, preds_size, length)
        elif opt.Prediction == 'None':
            tgt_input = text['tgt_input']
            tgt_output = text['tgt_output']
            tgt_padding_mask = text['tgt_padding_mask']
            preds = model(image, tgt_input.transpose(0, 1), tgt_key_padding_mask=tgt_padding_mask)
            cost = criterion(preds.view(-1, preds.shape[-1]), tgt_output.contiguous().view(-1))
        else:
            preds = model(image, text[:, :-1])  # align with Attention.forward
            target = text[:, 1:]  # without [GO] symbol
            cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)  # gradient clipping with 5 (default)
        optimizer.step()

        loss_avg.add(cost)

        # validation part
        if (i + 1) % opt.valInterval == 0:
            elapsed_time = time.time() - start_time
            with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log:
                model.eval()
                with torch.no_grad():
                    valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation(
                        model, criterion, valid_loader, converter, opt)
                model.train()

                # training loss and validation loss
                loss_log = f'[{i}/{opt.num_iter}] Train loss: {loss_avg.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}'
                loss_avg.reset()

                current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}'

                # keep best accuracy model (on valid dataset)
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/best_accuracy.pth')
                if current_norm_ED > best_norm_ED:
                    best_norm_ED = current_norm_ED
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth')

                # checkpoint
                os.makedirs(f'./checkpoints/{opt.experiment_name}/', exist_ok=True)
                torch.save(model.state_dict(), f'./checkpoints/{opt.experiment_name}/checkpoint.pth')
                torch.save(optimizer.state_dict(), f'./checkpoints/{opt.experiment_name}/optimizer.pth')
                with open(f'./checkpoints/{opt.experiment_name}/iter.json', mode='w', encoding='utf8') as f:
                    json.dump(i + 1, f)
                with open(f'./checkpoints/{opt.experiment_name}/checkpoint.log', mode='a', encoding='utf8') as f:
                    f.write(f'Saved checkpoint with iter={i}\n')
                    f.write(f'\tCheckpoint at: ./checkpoints/{opt.experiment_name}/checkpoint.pth')
                    f.write(f'\tOptimizer at: ./checkpoints/{opt.experiment_name}/optimizer.pth')

                best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.2f}'
                loss_model_log = f'{loss_log}\n{current_model_log}\n{best_model_log}'
                print(loss_model_log)
                log.write(loss_model_log + '\n')

                # show some predicted results
                dashed_line = '-' * 80
                head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F'
                predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n'
                for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]):
                    if 'Attn' in opt.Prediction:
                        gt = gt[:gt.find('[s]')]
                        pred = pred[:pred.find('[s]')]
                    predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n'
                predicted_result_log += f'{dashed_line}'
                print(predicted_result_log)
                log.write(predicted_result_log + '\n')

        # save model every 1e+5 iterations
        if (i + 1) % 1e+5 == 0:
            torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')

    print('end training')
                              transforms=True)
train_dataloader = DataLoader(train_dataset, batch_size=opts.batch_size,
                              drop_last=False, shuffle=True)

if opts.visdom:
    vis = visdom.Visdom()
    train_loss_window = vis.line(X=torch.zeros((1, )).cpu(),
                                 Y=torch.zeros((1)).cpu(),
                                 opts=dict(xlabel='epoch',
                                           ylabel='Loss',
                                           title='Training Loss',
                                           legend=['Loss']))

optimizer = RAdam(filter(lambda p: p.requires_grad, model.parameters()),
                  lr=opts.learning_rate)
criterion = nn.BCELoss()
cudnn.benchmark = True

train_add_loss = []
train_epoch = []
one_epoch_iteration = len(train_dataloader)
early_stoping = EarlyStopping(patience=30, learning_rate=opts.learning_rate, verbose=True)

def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
def main():
    global args
    best_prec1, best_epoch = 0.0, 0

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if args.data.startswith('cifar'):
        IM_SIZE = 32
    else:
        IM_SIZE = 224

    model = getattr(models, args.arch)(args)
    n_flops, n_params = measure_model(model, IM_SIZE, IM_SIZE)
    torch.save(n_flops, os.path.join(args.save, 'flops.pth'))
    del model
    model = getattr(models, args.arch)(args)

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    criterion = nn.CrossEntropyLoss().cuda()

    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        from radam import RAdam
        optimizer = RAdam(model.parameters(), args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise NotImplementedError("Wrong optimizer.")

    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    train_loader, val_loader, test_loader = get_dataloaders(args)

    if args.evalmode is not None:
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)
        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    scores = ['epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
              '\tval_prec1\ttrain_prec5\tval_prec5']

    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_prec1, train_prec5, lr = train(train_loader, model, criterion, optimizer, epoch)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)

        scores.append(('{}\t{:.3f}' + '\t{:.4f}' * 6)
                      .format(epoch, lr, train_loss, val_loss,
                              train_prec1, val_prec1, train_prec5, val_prec5))

        is_best = val_prec1 > best_prec1
        if is_best:
            best_prec1 = val_prec1
            best_epoch = epoch
            print('Best val_prec1 {}'.format(best_prec1))

        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint({
            'epoch': epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, args, is_best, model_filename, scores)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))

    ### Test the final model
    print('********** Final prediction results **********')
    validate(test_loader, model, criterion)
    return
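# load_checkpoint above is not shown in this snippet; a minimal sketch of a
# compatible helper (hypothetical -- it only needs to return the dict that
# save_checkpoint wrote, or None when there is nothing to resume from; the
# filename pattern mirrors the model_filename used above):
import glob

def load_checkpoint(args):
    candidates = sorted(glob.glob(os.path.join(args.save, 'checkpoint_*.pth.tar')))
    if not candidates:
        return None
    return torch.load(candidates[-1])  # resume from the latest epoch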
def train_ccblock(model_options):
    # get train & valid datasets' paths
    if model_options.trainset_num > 1:
        train_file_paths = [
            model_options.trainset_path.format(i)
            for i in range(1, model_options.trainset_num + 1)
        ]
    else:
        train_file_paths = [model_options.trainset_path]

    # load datasets
    print(train_file_paths)
    label_paths = "/home/langruimin/BLSTM_pytorch/data/fcv/fcv_train_labels.mat"
    videoset = VideoDataset(train_file_paths, label_paths)
    print(len(videoset))

    # create model
    model = RCCAModule(1, 1)
    model_quan = Quantization(model_options.subLevel, model_options.subCenters, model_options.dim)
    params_path = os.path.join(model_options.model_save_path, model_options.params_filename)
    params_path_Q = os.path.join(model_options.model_save_path, model_options.Qparams_filename)
    if model_options.reload_params:
        print('Loading model params...')
        model.load_state_dict(torch.load(params_path))
        print('Done.')
    model = model.cuda()
    model_quan = model_quan.cuda()

    # optimizers: one for the CC block, one for the quantization block
    optimizer = RAdam(model.parameters(), lr=1e-3,
                      betas=(0.9, 0.999), weight_decay=1e-4)
    optimizer2 = RAdam(model_quan.parameters(), lr=1e-3,  # 7e-6
                       betas=(0.9, 0.999), weight_decay=1e-4)
    lr_C = ""
    lr_Q = ""
    # milestones = []
    # lr_schduler_C = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1)
    # lr_schduler_Q = torch.optim.lr_scheduler.MultiStepLR(optimizer2, milestones, gamma=0.6, last_epoch=-1)

    selector = AllTripletSelector()
    triplet_loss = OnlineTripletLoss(margin=512, triplet_selector=selector)

    batch_idx = 1
    train_loss_rec = open(os.path.join(model_options.records_save_path,
                                       model_options.train_loss_filename), 'w')
    error_ = 0.
    loss_ = 0.
    num = 0
    print("##########start train############")
    trainloader = torch.utils.data.DataLoader(videoset, batch_size=9, shuffle=True,
                                              num_workers=4, pin_memory=True)
    model.train()
    model_quan.train()
    init_train_label = np.load("/home/langruimin/BLSTM_pytorch/data/fcv/init_train_labels.npy")

    for l in range(100):
        # lr_schduler_C.step(l)
        # milestones.append(l + 2)
        # lr_schduler_Q.step(l)

        # training
        for i, (data, index, _, _) in enumerate(trainloader):
            data = data.to(model_options.default_dtype)
            data = data.unsqueeze(1)
            data = data.cuda()

            # cc_block
            output_ccblock_mean = torch.tanh(model(data))
            # quantization block
            Qhard, Qsoft, SoftDistortion, HardDistortion, JointCenter, error, _ = model_quan(output_ccblock_mean)
            Q_loss = 0.1 * SoftDistortion + HardDistortion + 0.1 * JointCenter

            tri_loss, tri_num = triplet_loss(output_ccblock_mean, init_train_label[index])

            optimizer2.zero_grad()
            Q_loss.backward(retain_graph=True)
            optimizer2.step()

            optimizer.zero_grad()
            tri_loss.backward()
            optimizer.step()

            error_ += error.item()
            loss_ += tri_loss.item()
            num += 1
            if batch_idx % model_options.disp_freq == 0:
                info = "epoch{0} Batch {1} loss:{2:.3f} distortion:{3:.3f} " \
                    .format(l, batch_idx, loss_ / num, error_ / num)
                print(info)
                train_loss_rec.write(info + '\n')
            batch_idx += 1

        batch_idx = 0
        error_ = 0.
        loss_ = 0.
        num = 0
        if (l + 1) % model_options.save_freq == 0:
            print('epoch: ', l, 'New best model. Saving model ...')
            torch.save(model.state_dict(), params_path)
            torch.save(model_quan.state_dict(), params_path_Q)
            for param_group in optimizer.param_groups:
                lr_C = param_group['lr']
            for param_group in optimizer2.param_groups:
                lr_Q = param_group['lr']
            record_inf = "saved model at epoch {0} lr_C:{1} lr_Q:{2}".format(l, lr_C, lr_Q)
            train_loss_rec.write(record_inf + '\n')
        print("##########epoch done##########")

    print('train done. Saving model ...')
    torch.save(model.state_dict(), params_path)
    torch.save(model_quan.state_dict(), params_path_Q)
    print("##########train done##########")
model = LeNet5(N_CLASSES).to(DEVICE)

if len(sys.argv) == 1:
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif sys.argv[1] == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=0.01)
elif sys.argv[1] == 'radam':
    optimizer = RAdam(model.parameters())
elif sys.argv[1] == 'lars':  # no tensorboardX
    optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
elif sys.argv[1] == 'lamb':
    optimizer = Lamb(model.parameters())
elif sys.argv[1] == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4)

def train(train_loader, model, criterion, optimizer, scheduler, device):
    '''
    Function for the training step of the training loop
    '''
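# The body of train() is not included in this snippet. Because T_max above is
# 3 * len(train_loader) (i.e. measured in batches), the cosine schedule is
# presumably advanced once per batch; a hypothetical sketch of such a step:
def train_step_sketch(train_loader, model, criterion, optimizer, scheduler, device):
    model.train()
    running_loss = 0.0
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the cosine schedule every batch
        running_loss += loss.item()
    return running_loss / len(train_loader)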
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9)
elif args.optim == '1cycle':
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=args.max_lr,
                                              steps_per_epoch=len(train_data),
                                              epochs=args.max_epochs)
elif args.optim == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9)
elif args.optim == 'schedule':
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr, betas=betas, eps=1e-9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=args.schedule_factor, patience=args.schedule_patience)

# **************** TRAINING ******************
print('Training starts...')
alignment = None
def build_pipeline(
        data_dir,
        model,
        save_every,
        batch_size,
        input_size,
        output_size,
        raw,
        labels,
        affs,
        affs_predicted,
        lr=1e-5):

    dataset_shape = zarr.open(str(data_dir))['train/raw'].shape
    num_samples = dataset_shape[0]
    sample_size = dataset_shape[1:]

    loss = torch.nn.MSELoss()
    optimizer = RAdam(model.parameters(), lr=lr)

    pipeline = (
        gp.ZarrSource(
            data_dir,
            {
                raw: 'train/raw',
                labels: 'train/gt'
            },
            array_specs={
                raw: gp.ArraySpec(
                    roi=gp.Roi((0, 0, 0), (num_samples,) + sample_size),
                    voxel_size=(1, 1, 1)),
                labels: gp.ArraySpec(
                    roi=gp.Roi((0, 0, 0), (num_samples,) + sample_size),
                    voxel_size=(1, 1, 1))
            }) +
        # raw: (d=1, h, w)
        # labels: (d=1, h, w)
        gp.RandomLocation() +
        # raw: (d=1, h, w)
        # labels: (d=1, h, w)
        gp.AddAffinities(
            affinity_neighborhood=[(0, 1, 0), (0, 0, 1)],
            labels=labels,
            affinities=affs) +
        gp.Normalize(affs, factor=1.0) +
        # raw: (d=1, h, w)
        # affs: (c=2, d=1, h, w)
        Squash(dim=-3) +  # get rid of z dim
        # raw: (h, w)
        # affs: (c=2, h, w)
        AddChannelDim(raw) +
        # raw: (c=1, h, w)
        # affs: (c=2, h, w)
        gp.PreCache() +
        gp.Stack(batch_size) +
        # raw: (b=10, c=1, h, w)
        # affs: (b=10, c=2, h, w)
        Train(
            model=model,
            loss=loss,
            optimizer=optimizer,
            inputs={'x': raw},
            target=affs,
            output=affs_predicted,
            save_every=save_every,
            log_dir='log') +
        # raw: (b=10, c=1, h, w)
        # affs: (b=10, c=2, h, w)
        # affs_predicted: (b=10, c=2, h, w)
        TransposeDims(raw, (1, 0, 2, 3)) +
        TransposeDims(affs, (1, 0, 2, 3)) +
        TransposeDims(affs_predicted, (1, 0, 2, 3)) +
        # raw: (c=1, b=10, h, w)
        # affs: (c=2, b=10, h, w)
        # affs_predicted: (c=2, b=10, h, w)
        RemoveChannelDim(raw) +
        # raw: (b=10, h, w)
        # affs: (c=2, b=10, h, w)
        # affs_predicted: (c=2, b=10, h, w)
        gp.Snapshot(
            dataset_names={
                raw: 'raw',
                labels: 'labels',
                affs: 'affs',
                affs_predicted: 'affs_predicted'
            },
            every=100) +
        gp.PrintProfilingStats(every=100)
    )
    return pipeline
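# A minimal usage sketch (assumed, not from this snippet): gunpowder pipelines
# are driven by repeatedly requesting batches inside gp.build(); the array keys,
# request size, and iteration count below are illustrative placeholders.
raw = gp.ArrayKey('RAW')
labels = gp.ArrayKey('LABELS')
affs = gp.ArrayKey('AFFS')
affs_predicted = gp.ArrayKey('AFFS_PREDICTED')

request = gp.BatchRequest()
request.add(raw, (1, 256, 256))
request.add(affs, (1, 256, 256))

pipeline = build_pipeline(data_dir, model, save_every=1000, batch_size=10,
                          input_size=(256, 256), output_size=(256, 256),
                          raw=raw, labels=labels, affs=affs,
                          affs_predicted=affs_predicted)
with gp.build(pipeline):
    for _ in range(10000):
        pipeline.request_batch(request)  # each call trains one step via Train()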
def train(config, num_classes=1108):
    model = model_whale(num_classes=num_classes, inchannels=6,
                        model_name=config.train.model_name,
                        pretrained=config.train.pretrained).cuda()
    if config.train.freeze:
        model.freeze()

    base_opt = RAdam(model.parameters(), lr=config.train.lr)
    optimizer = Lookahead(base_opt)
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr, betas=(0.9, 0.99), weight_decay=0.0002)

    resultDir = config.train.result_dir
    checkPoint = join(resultDir, 'checkpoint')
    # if not config.train.in_colab:
    #     os.makedirs(checkPoint, exist_ok=True)

    train_dataset = CustomDataset(config.train.csv_file, config.train.img_dir,
                                  transforms=transforms['train'])
    dataset_size = len(train_dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(config.train.validation_split * dataset_size))
    if config.train.shuffle_dataset:
        np.random.seed(config.train.random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.train.batch_size,
        sampler=train_sampler, num_workers=config.train.num_workers)
    validation_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.train.batch_size,
        sampler=valid_sampler, num_workers=config.train.num_workers)

    train_loss = 0.

    # load from checkpoint:
    if config.train.load_cpk:
        model.load_pretrain(os.path.join(checkPoint, '%08d_model.pth' % (config.train.start_epoch)), skip=[])
        cpk = torch.load(os.path.join(checkPoint, '%08d_optimizer.pth' % (config.train.start_epoch)))
        optimizer.load_state_dict(cpk['optimizer'])
        adjust_learning_rate(optimizer, config.train.lr)
        start_epoch = cpk['epoch']
    else:
        start_epoch = 0

    top1_batch, map5_batch = 0, 0
    for epoch in range(start_epoch + 1, config.train.epochs):
        print('Starting:', epoch, 'Iterations:', len(train_loader))
        for i, data in enumerate(train_loader):
            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()

            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat, local_feat, results, labels, config,
                          verbose=(i % config.loss.verbose_interval == 0))
            batch_loss = model.loss

            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
            optimizer.step()

            results = torch.sigmoid(results)
            train_loss += batch_loss.data.cpu().numpy()
            top1_batch += accuracy(results, labels, topk=[1])[0]
            map5_batch += mapk(labels, results, k=5)

            if i % config.train.verbose_interval == 0:
                print('epoch: %03d, iter: %05d, train_loss: %f, top1_batch: %f, map5_batch: %f' % (
                    epoch, i,
                    float(train_loss / config.train.verbose_interval),
                    float(top1_batch / config.train.verbose_interval),
                    float(map5_batch / config.train.verbose_interval)))
                train_loss, top1_batch, map5_batch = 0, 0, 0

        valid_loss, top1_valid, map5_valid = valid_eval(config, model, validation_loader)
        print('epoch: %03d, iter: %05d, valid_loss: %f, valid_top1_batch: %f, valid_map5_batch: %f' % (
            epoch, i, valid_loss, top1_valid, map5_valid))

        if epoch % config.train.save_period == 0:
            os.system("touch " + resultDir + "/checkpoint/%08d_model.pth" % (epoch))
            os.system("touch " + resultDir + "/checkpoint/%08d_optimizer.pth" % (epoch))
            time.sleep(1)
            torch.save(model.state_dict(), resultDir + '/checkpoint/%08d_model.pth' % (epoch))
            torch.save({
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }, resultDir + '/checkpoint/%08d_optimizer.pth' % (epoch))
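# Lookahead (wrapped around RAdam above) keeps a second, "slow" copy of the
# weights and pulls it toward the fast weights every k optimizer steps. A
# conceptual, self-contained sketch of that sync step (not the library code;
# k=5 and alpha=0.5 are the common defaults):
import torch

def lookahead_sync(slow_weights, fast_params, alpha=0.5):
    with torch.no_grad():
        for slow, fast in zip(slow_weights, fast_params):
            slow += alpha * (fast - slow)   # slow <- slow + alpha * (fast - slow)
            fast.copy_(slow)                # restart fast weights from the slow ones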
def train(args):
    # augmentations
    train_transform = Compose([
        Resize(args.img_size, args.img_size),
        Cutout(num_holes=8, max_h_size=20, max_w_size=20,
               fill_value=0, always_apply=False, p=0.5),
        Normalize(
            mean=[0.0692],
            std=[0.205],
        ),
        ToTensorV2()
    ])
    val_transform = Compose([
        Resize(args.img_size, args.img_size),
        Normalize(
            mean=[0.0692],
            std=[0.205],
        ),
        ToTensorV2()
    ])

    # Load data
    df_train = pd.read_csv("../input/train_folds.csv")
    if args.fold == -1:
        sys.exit()
    train = df_train[df_train['kfold'] != args.fold].reset_index(drop=True)
    val = df_train[df_train['kfold'] == args.fold].reset_index(drop=True)
    train_data = ImageDataset('../input/images', train_transform, train)
    train_loader = utils.DataLoader(train_data, shuffle=True, num_workers=5,
                                    batch_size=args.batch_size, pin_memory=True)
    val_data = ImageDataset('../input/images', val_transform, val)
    val_loader = utils.DataLoader(val_data, shuffle=False, num_workers=5,
                                  batch_size=args.batch_size, pin_memory=True)

    # create model
    device = torch.device(f"cuda:{args.gpu_n}")
    model = PretrainedCNN()
    if args.pretrain_path != "":
        model.load_state_dict(torch.load(args.pretrain_path, map_location=f"cuda:{args.gpu_n}"))
        print("weights loaded")
    model.to(device)

    optimizer = RAdam(model.parameters(), lr=args.start_lr)
    opt_level = 'O1'
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True, patience=8, factor=0.6)

    best_models = deque(maxlen=5)
    best_score = 0.99302
    for e in range(args.epoch):
        # Training:
        train_loss = []
        model.train()
        for image, target in tqdm(train_loader, ncols=70):
            optimizer.zero_grad()
            xs = image.to(device)
            ys = target.to(device)
            if np.random.rand() < 0.5:
                # the original noted a bug here: the mixed images were computed
                # but the forward pass still used xs; feed the cutmix images instead
                images, targets = cutmix(xs, ys[:, 0], ys[:, 1], ys[:, 2], 1.0)
                pred = model(images)
                output1 = pred[:, :168]
                output2 = pred[:, 168:179]
                output3 = pred[:, 179:]
                loss = cutmix_criterion(output1, output2, output3, targets)
            else:
                pred = model(xs)
                grapheme = pred[:, :168]
                vowel = pred[:, 168:179]
                cons = pred[:, 179:]
                loss = loss_fn(grapheme, ys[:, 0]) + loss_fn(vowel, ys[:, 1]) + loss_fn(cons, ys[:, 2])
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # Validation
        val_loss = []
        val_true = []
        val_pred = []
        model.eval()
        with torch.no_grad():
            for image, target in val_loader:
                xs = image.to(device)
                ys = target.to(device)
                pred = model(xs)
                grapheme = pred[:, :168]
                vowel = pred[:, 168:179]
                cons = pred[:, 179:]
                loss = loss_fn(grapheme, ys[:, 0]) + loss_fn(vowel, ys[:, 1]) + loss_fn(cons, ys[:, 2])
                val_loss.append(loss.item())
                grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
                vowel = vowel.cpu().argmax(dim=1).data.numpy()
                cons = cons.cpu().argmax(dim=1).data.numpy()
                val_true.append(target.numpy())
                val_pred.append(np.stack([grapheme, vowel, cons], axis=1))

        val_true = np.concatenate(val_true)
        val_pred = np.concatenate(val_pred)
        val_loss = np.mean(val_loss)
        train_loss = np.mean(train_loss)

        # weighted macro recall over the three heads
        scores = []
        for i in [0, 1, 2]:
            scores.append(sklearn.metrics.recall_score(val_true[:, i], val_pred[:, i], average='macro'))
        final_score = np.average(scores, weights=[2, 1, 1])

        print(f'Epoch: {e:03d}; train_loss: {train_loss:.05f}; val_loss: {val_loss:.05f}; ', end='')
        print(f'score: {final_score:.5f} ', end='')

        # Checkpoint model. For the 2nd stage (224x224), keep the best 5 checkpoints.
        if final_score > best_score:
            best_score = final_score
            state_dict = copy.deepcopy(model.state_dict())
            if args.save_queue == 1:
                best_models.append(state_dict)
                for i, m in enumerate(best_models):
                    path = f"models/{args.exp_name}"
                    os.makedirs(path, exist_ok=True)
                    torch.save(m, join(path, f"{i}.pt"))
            else:
                path = f"models/{args.exp_name}"
                os.makedirs(path, exist_ok=True)
                torch.save(state_dict, join(path, "model.pt"))
            print('+')
        else:
            print()
        scheduler.step(final_score)
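# cutmix() and cutmix_criterion() are not defined in this snippet; a common
# sketch of the pattern used with three-head Bengali.AI models (assumptions:
# lam drawn from Beta(alpha, alpha), a random box swapped between samples in
# the batch, and the loss mixed per head by the realized area ratio):
import math
import numpy as np
import torch

def cutmix(data, t1, t2, t3, alpha):
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(data.size(0))
    h, w = data.size(2), data.size(3)
    rh, rw = int(h * math.sqrt(1 - lam)), int(w * math.sqrt(1 - lam))
    cy, cx = np.random.randint(h), np.random.randint(w)
    y1, y2 = max(cy - rh // 2, 0), min(cy + rh // 2, h)
    x1, x2 = max(cx - rw // 2, 0), min(cx + rw // 2, w)
    data[:, :, y1:y2, x1:x2] = data[idx, :, y1:y2, x1:x2]
    lam = 1 - (y2 - y1) * (x2 - x1) / (h * w)  # actual area ratio kept
    return data, [t1, t1[idx], t2, t2[idx], t3, t3[idx], lam]

def cutmix_criterion(out1, out2, out3, targets, loss_fn=torch.nn.CrossEntropyLoss()):
    t1, s1, t2, s2, t3, s3, lam = targets
    return (lam * loss_fn(out1, t1) + (1 - lam) * loss_fn(out1, s1)
            + lam * loss_fn(out2, t2) + (1 - lam) * loss_fn(out2, s2)
            + lam * loss_fn(out3, t3) + (1 - lam) * loss_fn(out3, s3))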