def main():
    with timer('load data'):
        df = pd.read_csv(TRAIN_PATH)
        # Drop the corrupted image and repair the neighbour-slice references
        # that pointed at it.
        df = df[df.Image != "ID_6431af929"].reset_index(drop=True)
        df.loc[df.pre_SOPInstanceUID == "ID_6431af929", "pre1_SOPInstanceUID"] = \
            df.loc[df.pre_SOPInstanceUID == "ID_6431af929", "Image"]
        df.loc[df.post_SOPInstanceUID == "ID_6431af929", "post1_SOPInstanceUID"] = \
            df.loc[df.post_SOPInstanceUID == "ID_6431af929", "Image"]
        df.loc[df.prepre_SOPInstanceUID == "ID_6431af929", "pre2_SOPInstanceUID"] = \
            df.loc[df.prepre_SOPInstanceUID == "ID_6431af929", "pre1_SOPInstanceUID"]
        df.loc[df.postpost_SOPInstanceUID == "ID_6431af929", "post2_SOPInstanceUID"] = \
            df.loc[df.postpost_SOPInstanceUID == "ID_6431af929", "post1_SOPInstanceUID"]
        y = df[TARGET_COLUMNS].values
        df = df[["Image", "pre1_SOPInstanceUID", "post1_SOPInstanceUID",
                 "pre2_SOPInstanceUID", "post2_SOPInstanceUID"]]
        gc.collect()

    with timer('preprocessing'):
        train_augmentation = Compose([
            CenterCrop(512 - 50, 512 - 50, p=1.0),
            HorizontalFlip(p=0.5),
            OneOf([
                ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=1, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            Rotate(limit=30, border_mode=0, p=0.7),
            Resize(img_size, img_size, p=1)
        ])

        train_dataset = RSNADataset(df, y, img_size, IMAGE_PATH, id_colname=ID_COLUMNS,
                                    transforms=train_augmentation, black_crop=False,
                                    subdural_window=True, user_window=2)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=8, pin_memory=True)
        del df, train_dataset
        gc.collect()

    with timer('create model'):
        model = CnnModel(num_classes=N_CLASSES, encoder="se_resnext50_32x4d",
                         pretrained="imagenet", pool_type="avg")
        if model_path is not None:
            model.load_state_dict(torch.load(model_path))
        model.to(device)
        # The first target column is weighted twice as heavily as the rest.
        criterion = torch.nn.BCEWithLogitsLoss(
            weight=torch.FloatTensor([2, 1, 1, 1, 1, 1]).cuda())
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-4)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        for epoch in range(1, epochs + 1):
            if epoch == 5:
                # Decay the learning rate by 10x once, at epoch 5.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = param_group['lr'] * 0.1
            seed_torch(SEED + epoch)
            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))
            torch.save(model.module.state_dict(),
                       'models/{}_ep{}.pth'.format(EXP_ID, epoch))
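# --- Hedged sketch: the shared helpers every script in this file assumes.
# `timer` and `seed_torch` are defined elsewhere in the repo; these are
# minimal, hypothetical reimplementations shown only to make the flow
# readable, not the originals. ---
import contextlib
import os
import random
import time

import numpy as np
import torch


@contextlib.contextmanager
def timer(name):
    # Log the wall-clock time spent inside the `with` block.
    t0 = time.time()
    yield
    LOGGER.info('[{}] done in {:.1f} s'.format(name, time.time() - t0))


def seed_torch(seed=42):
    # Seed python, numpy and torch (CPU and GPU) for reproducibility.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True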
def main():
    with timer('load data'):
        df = pd.read_csv(TRAIN_PATH)
        df["loc_x"] = df["loc_x"] / 100
        df["loc_y"] = df["loc_y"] / 100
        y = df[TARGET_COLUMNS].values
        df = df[[ID_COLUMNS]]
        gc.collect()

    with timer("split data"):
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(df, y)
        for n_fold, (train_index, val_index) in enumerate(folds):
            train_df = df.loc[train_index]
            val_df = df.loc[val_index]
            y_train = y[train_index]
            y_val = y[val_index]
            if n_fold == fold_id:
                break

    with timer('preprocessing'):
        train_augmentation = Compose([
            HorizontalFlip(p=0.5),
            OneOf([
                ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=1, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            RandomBrightnessContrast(p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
            Resize(img_size, img_size, p=1)
        ])
        val_augmentation = Compose([
            Resize(img_size, img_size, p=1)
        ])

        train_dataset = KDDataset(train_df, y_train, img_size, IMAGE_PATH,
                                  id_colname=ID_COLUMNS, transforms=train_augmentation)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=2, pin_memory=True)
        val_dataset = KDDataset(val_df, y_val, img_size, IMAGE_PATH,
                                id_colname=ID_COLUMNS, transforms=val_augmentation)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                num_workers=2, pin_memory=True)
        del df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = CnnModel(num_classes=N_CLASSES, encoder="se_resnext50_32x4d",
                         pretrained="../input/pytorch-pretrained-models/se_resnext50_32x4d-a260b3a4.pth",
                         pool_type="avg")
        if model_path is not None:
            model.load_state_dict(torch.load(model_path))
        model.to(device)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-4)
        # model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    with timer('train'):
        best_score = 0
        for epoch in range(1, epochs + 1):
            seed_torch(SEED + epoch)
            if epoch == epochs - 3:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = param_group['lr'] * 0.1
            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, N_CLASSES)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            y_pred, target, val_loss = validate(model, val_loader, criterion,
                                                device, N_CLASSES)
            score = roc_auc_score(target, y_pred)
            LOGGER.info('Mean val loss: {}'.format(round(val_loss, 5)))
            LOGGER.info('val score: {}'.format(round(score, 5)))
            if score > best_score:
                best_score = score
                np.save("y_pred.npy", y_pred)
                torch.save(model.state_dict(), save_path)
                np.save("target.npy", target)

    with timer('predict'):
        test_df = pd.read_csv(TEST_PATH)
        test_ids = test_df["id"].values
        test_augmentation = Compose([
            Resize(img_size, img_size, p=1)
        ])
        test_dataset = KDDatasetTest(test_df, img_size, TEST_IMAGE_PATH,
                                     id_colname=ID_COLUMNS,
                                     transforms=test_augmentation, n_tta=2)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=2, pin_memory=True)

        model.load_state_dict(torch.load(save_path))
        pred = predict(model, test_loader, device, N_CLASSES, n_tta=2)
        print(pred.shape)
        results = pd.DataFrame({"id": test_ids, "is_star": pred.reshape(-1)})
        results.to_csv("results.csv", index=False)
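# --- Hedged sketch of the `predict` helper used above. The real
# implementation lives elsewhere; this assumes n_tta=2 means "original +
# horizontal flip" and that the dataset returns both views stacked per item,
# shape (batch, n_tta, C, H, W). Names are illustrative. ---
import numpy as np
import torch


def predict_tta(model, test_loader, device, n_tta=2):
    model.eval()
    preds = []
    with torch.no_grad():
        for images in test_loader:
            images = images.to(device)
            # Average sigmoid probabilities over the TTA views.
            logits = [model(images[:, i]) for i in range(n_tta)]
            prob = torch.stack([torch.sigmoid(l) for l in logits]).mean(0)
            preds.append(prob.cpu().numpy())
    return np.concatenate(preds)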
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        # Class 2 is dropped; classes 3 and 4 are renumbered down.
        df.drop("EncodedPixels_2", axis=1, inplace=True)
        df = df.rename(columns={"EncodedPixels_3": "EncodedPixels_2"})
        df = df.rename(columns={"EncodedPixels_4": "EncodedPixels_3"})
        y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape(-1, 1)
        y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape(-1, 1)
        y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape(-1, 1)
        # y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape(-1, 1)
        y = np.concatenate([y1, y2, y3], axis=1)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]
        y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0, class_y=y_train)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES,
                         encoder_se_module=True, decoder_semodule=True,
                         h_columns=False, skip=True, act="swish", freeze_bn=True,
                         classification=CLASSIFICATION, attention_type="cbam",
                         center=True, mode="train")
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        # The decoder gets a 10x larger learning rate than the pretrained encoder.
        optimizer = torch.optim.Adam([
            {'params': model.decoder.parameters(), 'lr': 3e-3},
            {'params': model.encoder.parameters(), 'lr': 3e-4},
        ])
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
            scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1,
                                               total_epoch=CLR_CYCLE * 2,
                                               after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        if EMA:
            ema_model = copy.deepcopy(model)
            if base_model_ema is not None:
                ema_model.load_state_dict(torch.load(base_model_ema))
            ema_model.to(device)
            ema_model = torch.nn.DataParallel(ema_model)
        else:
            ema_model = None
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ema_loss = 999
        best_model_ep = 0
        ema_decay = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            if epoch >= EMA_START:
                ema_decay = 0.99
            LOGGER.info("Starting {} epoch...".format(epoch))

            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, cutmix_prob=0.0,
                                      classification=CLASSIFICATION,
                                      ema_model=ema_model, ema_decay=ema_decay)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model, val_loader, criterion, device,
                                  classification=CLASSIFICATION)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            if EMA and epoch >= EMA_START:
                ema_valid_loss = validate(ema_model, val_loader, criterion, device,
                                          classification=CLASSIFICATION)
                LOGGER.info('Mean EMA valid loss: {}'.format(round(ema_valid_loss, 5)))
                if ema_valid_loss < best_model_ema_loss:
                    torch.save(ema_model.module.state_dict(),
                               'models/{}_fold{}_ckpt{}_ema.pth'.format(
                                   EXP_ID, FOLD_ID, checkpoint))
                    best_model_ema_loss = ema_valid_loss

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(
                               EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                # np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                # End of a cosine cycle: snapshot the latest weights and start
                # tracking a fresh "best" for the next cycle.
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                if EMA:
                    torch.save(ema_model.module.state_dict(),
                               'models/{}_fold{}_latest_ema.pth'.format(EXP_ID, FOLD_ID))
                    LOGGER.info('Best ema valid loss: {}'.format(
                        round(best_model_ema_loss, 5)))
                    best_model_ema_loss = 999
                checkpoint += 1
                best_model_loss = 999

            # del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
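# --- Hedged sketch of the EMA update that `train_one_epoch` above is assumed
# to perform after each optimizer step when `ema_decay > 0`. The real helper
# is defined elsewhere in the repo; this is an illustrative stand-in. ---
import torch


def update_ema(model, ema_model, decay):
    # ema_w = decay * ema_w + (1 - decay) * w, for every parameter.
    with torch.no_grad():
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1 - decay)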
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation, crop_rate=1.0)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS, transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('se_resnext50_32x4d', encoder_weights="imagenet",
                         classes=N_CLASSES, encoder_se_module=True,
                         decoder_semodule=True, h_columns=False, skip=True,
                         act="swish")
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
            scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1,
                                               total_epoch=CLR_CYCLE * 2,
                                               after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ep = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            LOGGER.info("Starting {} epoch...".format(epoch))

            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, cutmix_prob=0.0)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))
            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(
                               EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                # np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                checkpoint += 1
                best_model_loss = 999

            # del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
def main(
    cfg,
    model,
    log_dir,
    checkpoint=None,
):
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # Convert to suitable device
    # logging.info(model)
    model = model.to(device)
    logging.info("Number parameters of model: {:,}".format(
        sum(p.numel() for p in model.parameters())))

    # using parsed configurations to create a dataset
    num_of_class = len(cfg["data"]["label_dict"])
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    print("Dataset and Dataloaders created")

    # create a metric for evaluating
    metric_names = cfg["train"]["metrics"]
    train_metrics = metrics_loader.Metrics(metric_names)
    val_metrics = metrics_loader.Metrics(metric_names)
    print("Metrics implemented successfully")

    # initialize optimizer, scheduler and loss from the config file
    optimizer_module, optimizer_params = get_optimizer(cfg)
    optimizer = optimizer_module(model.parameters(), **optimizer_params)
    scheduler_module, scheduler_params = get_lr_scheduler(cfg)
    scheduler = scheduler_module(optimizer, **scheduler_params)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()

    print("\nTraining set: {} samples".format(len(train_loader.dataset)))
    print("Validation set: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")

    # initialize the early_stopping object
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["patience"]
    checkpoint_path = os.path.join(log_dir, "Checkpoint.ckpt")
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)

    # training models
    logging.info("--" * 50)
    num_epochs = int(cfg["train"]["num_epochs"])
    t0 = time.time()
    for epoch in range(num_epochs):
        t1 = time.time()
        if epoch == 3:
            # Unfreeze the whole backbone after a short warm-up.
            print('\t Releasing all model parameters')
            for param in model.parameters():
                param.requires_grad = True
        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))

        train_loss, train_acc, val_loss, val_acc, train_result, val_result = \
            trainer.train_one_epoch(
                epoch,
                num_epochs,
                model,
                device,
                train_loader,
                valid_loader,
                criterion,
                optimizer,
                train_metrics,
                val_metrics,
            )

        # lr scheduling
        scheduler.step(val_loss)

        # log to file
        logging.info(
            "\n------Epoch {} / {}, Training time: {:.4f} seconds------".format(
                epoch, num_epochs, (time.time() - t1)))
        logging.info(f"Training loss: {train_loss} \n Training metrics: {train_result}")
        logging.info(f"Validation loss: {val_loss} \n Validation metrics: {val_result}")

        # tensorboard writer
        tb_writer.add_scalar("Training Loss", train_loss, epoch)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch)
        for metric_name in metric_names:
            tb_writer.add_scalar(f"Training {metric_name}",
                                 train_result[metric_name], epoch)
            tb_writer.add_scalar(f"Validation {metric_name}",
                                 val_result[metric_name], epoch)

        train_checkpoint = {
            'epoch': epoch,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }

        # Save model (early stopping monitors loss in "min" mode, accuracy otherwise)
        if save_mode == "min":
            early_stopping(val_loss, train_checkpoint)
        else:
            early_stopping(val_acc, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break

    # testing on test set: load the best checkpoint and run inference
    print("\n==============Inference on the testing set==============")
    best_checkpoint = torch.load(checkpoint_path)
    test_model = best_checkpoint['model']
    test_model.load_state_dict(best_checkpoint['state_dict'])
    test_model = test_model.to(device)
    test_model.eval()

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info(f"\nClassification Report: \n {report}")
    logging.info("Completed in {:.3f} seconds. ".format(time.time() - t0))
    print(f"Classification Report: \n {report}")
    print("Completed in {:.3f} seconds.".format(time.time() - t0))
    print(f"-------- Checkpoints and logs are saved in ``{log_dir}`` --------")
    return checkpoint_path
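# --- Hedged sketch of the `callbacks.EarlyStopping` object used above: it
# tracks the monitored value, saves the checkpoint dict on improvement, and
# raises `early_stop` after `patience` stagnant epochs. A minimal, assumed
# implementation; the repo's actual class may differ. ---
import torch


class EarlyStopping:
    def __init__(self, patience=5, mode="min", path="Checkpoint.ckpt"):
        self.patience, self.mode, self.path = patience, mode, path
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, value, checkpoint):
        improved = (self.best is None or
                    (value < self.best if self.mode == "min" else value > self.best))
        if improved:
            self.best = value
            self.counter = 0
            torch.save(checkpoint, self.path)  # keep only the best checkpoint
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True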
def main():
    # train_df = pd.read_csv(TRAIN_PATH).sample(frac=1.0, random_state=seed)
    # train_size = int(len(train_df) * 0.9)
    train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size,
                                              random_state=seed)
    LOGGER.info(f'data_size is {len(train_df)}')
    LOGGER.info(f'train_size is {train_size}')

    y = np.where(train_df['target'] >= 0.5, 1, 0)
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    # Bias-aware sample weights: upweight rows that mention an identity, and
    # especially non-toxic rows that mention one.
    sample_weights = np.ones(len(train_df), dtype=np.float32)
    sample_weights += train_df[identity_columns_new].sum(axis=1)
    sample_weights += train_df['target_bin'] * (~train_df[identity_columns_new]).sum(axis=1)
    sample_weights += (~train_df['target_bin']) * train_df[identity_columns_new].sum(axis=1) * 5
    sample_weights /= sample_weights.mean()

    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,
                                                  do_lower_case=True)
        X_text = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"),
                               max_len, tokenizer)

    test_df = train_df[train_size:]

    with timer('train'):
        X_train, y_train, y_aux_train, w_train = (
            X_text[:train_size], y[:train_size], y_aux[:train_size],
            sample_weights[:train_size])
        X_val, y_val, y_aux_val, w_val = (
            X_text[train_size:], y[train_size:], y_aux[train_size:],
            sample_weights[train_size:])

        model = BertForSequenceClassification(bert_config, num_labels=n_labels)
        model.load_state_dict(torch.load(model_path))
        model.zero_grad()
        model = model.to(device)

        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.long),
            torch.tensor(y_train, dtype=torch.float))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long),
            torch.tensor(y_val, dtype=torch.float))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid,
                                                   batch_size=batch_size * 2,
                                                   shuffle=False)

        # w_train / w_val are plain numpy arrays here, so no `.values`.
        sample_weight_train = [w_train, np.ones_like(w_train)]
        sample_weight_val = [w_val, np.ones_like(w_val)]

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = int(epochs * train_size / batch_size /
                                           accumulation_steps)
        total_step = int(epochs * train_size / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5 * gamma,
                             warmup=0.05, t_total=num_train_optimization_steps)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1",
                                          verbosity=0)
        criterion = torch.nn.BCEWithLogitsLoss().to(device)

        LOGGER.info("Starting 1 epoch...")
        tr_loss, train_losses = train_one_epoch(model, train_loader, criterion,
                                                optimizer, device,
                                                accumulation_steps, total_step,
                                                n_labels)
        LOGGER.info(f'Mean train loss: {round(tr_loss, 5)}')
        torch.save(model.state_dict(), '{}_dic'.format(exp))

        valid_loss, oof_pred = validate(model, valid_loader, criterion, device,
                                        n_labels)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred.reshape(-1)
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)

    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')
    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
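# --- Hedged sketch: how per-sample weights like `sample_weight_train` are
# typically consumed, since the plain BCEWithLogitsLoss above does not use
# them. Assumes the training loop receives the weights batch-aligned; the
# helper name is illustrative. ---
import torch


def weighted_bce(logits, targets, weights):
    # Per-sample BCE, scaled by the bias-aware sample weight, then averaged.
    loss = torch.nn.functional.binary_cross_entropy_with_logits(
        logits, targets, reduction='none')
    return (loss * weights).mean()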
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        soft_df = pd.read_csv(SOFT_PATH)
        # Append the pseudo-labeled rows to both the hard- and soft-label
        # frames, then align the soft labels to the hard-label row order.
        df = df.append(pd.read_csv(PSEUDO_PATH)).reset_index(drop=True)
        soft_df = soft_df.append(pd.read_csv(PSEUDO_PATH)).reset_index(drop=True)
        soft_df = df[[ID_COLUMNS]].merge(soft_df, how="left", on=ID_COLUMNS)
        LOGGER.info(df.head())
        LOGGER.info(soft_df.head())

        for c in ["EncodedPixels_1", "EncodedPixels_2",
                  "EncodedPixels_3", "EncodedPixels_4"]:
            df[c] = df[c].astype(str)
            soft_df[c] = soft_df[c].astype(str)

        # Pseudo-labeled rows have no fold id; keep them out of the val fold.
        df["fold_id"] = df["fold_id"].fillna(FOLD_ID + 1)

        # The classification target averages the hard and soft "any defect" labels.
        y = (df.sum_target != 0).astype("float32").values
        y += (soft_df.sum_target != 0).astype("float32").values
        y = y / 2

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]
        train_soft_df, val_soft_df = soft_df[df.fold_id != FOLD_ID], soft_df[df.fold_id == FOLD_ID]
        y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0, class_y=y_train,
                                     soft_df=train_soft_df)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation,
                                   soft_df=val_soft_df)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp_old.Unet('resnet34', encoder_weights="imagenet",
                             classes=N_CLASSES, encoder_se_module=True,
                             decoder_semodule=True, h_columns=False, skip=True,
                             act="swish", freeze_bn=True,
                             classification=CLASSIFICATION)
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam([
            {'params': model.decoder.parameters(), 'lr': 3e-3},
            {'params': model.encoder.parameters(), 'lr': 3e-4},
        ])
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
            scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1,
                                               total_epoch=CLR_CYCLE * 2,
                                               after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ep = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            LOGGER.info("Starting {} epoch...".format(epoch))

            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, cutmix_prob=0.0,
                                      classification=CLASSIFICATION)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss, val_score = validate(model, val_loader, criterion, device,
                                             classification=CLASSIFICATION)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))
            LOGGER.info('Mean valid score: {}'.format(round(val_score, 5)))
            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(
                               EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                # np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                checkpoint += 1
                best_model_loss = 999

            # del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
def main(
    model,
    config=None,
    comment="No comment",
    checkpoint=None,
):
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # using parsed configurations to create a dataset
    num_of_class = len(cfg["data"]["label_dict"])
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    logging.info("Dataset and Dataloaders created")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # initialize optimizing methods: lr, scheduler of lr, optimizer
    learning_rate = cfg["optimizer"]["lr"]
    optimizer = get_optimizer(cfg)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()

    # Learning rate decay
    max_lr = 3e-3  # Maximum LR
    min_lr = cfg["optimizer"]["min_lr"]  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    save_method = cfg["optimizer"]["lr_scheduler_factor"]
    lr_patiences = cfg["optimizer"]["lr_patience"]
    lr_factor = cfg["optimizer"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer, mode=save_method, min_lr=min_lr,
                                  patience=lr_patiences, factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    print("\nTraining set: {} samples".format(len(train_loader.dataset)))
    print("Validation set: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)

    # initialize the early_stopping object
    checkpoint_path = os.path.join(log_dir, "Checkpoint.pt")
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["early_patience"]
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(num_epoch):
        t1 = time.time()
        train_loss, train_acc, val_loss, val_acc, train_result, val_result = \
            trainer.train_one_epoch(
                epoch,
                num_epoch,
                model,
                device,
                train_loader,
                valid_loader,
                criterion,
                optimizer,
                train_metrics,
                val_metrics,
            )

        train_checkpoint = {
            'epoch': epoch + 1,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        scheduler.step(val_loss)  # lr scheduling

        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info("Validation loss: {} - Other validation metrics: {}".format(
            val_loss, val_result))

        # tensorboard
        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy",
                             val_result["accuracy_score"], epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)

        # Save model (early stopping monitors loss in "min" mode, accuracy otherwise)
        if save_mode == "min":
            early_stopping(val_loss, train_checkpoint)
        else:
            early_stopping(val_acc, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break

    # testing on test set: load the best checkpoint and run inference
    print("\nInference on the testing set")
    checkpoint = torch.load(checkpoint_path)
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('Completed in %.3f seconds.' % (time.time() - t0))
    print("Classification Report: \n{}".format(report))
    print('Completed in %.3f seconds.' % (time.time() - t0))
    print('Start Tensorboard with tensorboard --logdir {}, view at '
          'http://localhost:6006/'.format(log_dir))
def main(model, dataset, validation_flag, comment="No comment", checkpoint=None,
         num_of_class=2):
    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data)

    # check the validation flag
    if validation_flag == 0:
        # use a custom validation set read from file
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid)
    else:
        # automatically split off a validation set
        print("Splitting dataset into train and valid....")
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set, _, _ = data_split(training_set, validation_split)
        print("Done Splitting !!!")

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # Create dataset
    training_set = dataset(training_set, data_path, transform.train_transform)
    valid_set = dataset(valid_set, data_path, transform.val_transform)
    # End sampler
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(valid_set,
                                             batch_size=batch_size,
                                             shuffle=False)
    logging.info("Dataset and Dataloaders created")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read optimizer settings from the config file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimizing methods: lr, scheduler of lr, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # otherwise fall back to a custom loss
        criterion = getattr(custom_loss, loss_function)
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers)

    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer, mode=save_method, min_lr=min_lr,
                                  patience=patiences, factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    print("\nTraining set: {} samples".format(len(train_loader.dataset)))
    print("Validation set: {} samples".format(len(val_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(0, num_epoch):
        t1 = time.time()
        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))
        train_loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        scheduler.step(val_loss)  # lr scheduling

        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info("Validation loss: {} - Other validation metrics: {}".format(
            val_loss, val_result))

        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy",
                             val_result["accuracy_score"], epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)

        # save the epoch with the best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " + str(val_result["accuracy_score"]))
            logging.info("====> Save best at epoch {}".format(epoch + 1))
            best_val_acc = val_result["accuracy_score"]
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss': val_loss,
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, log_dir + "/Checkpoint.pt")

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data)

    # prepare the dataset and dataloader
    testing_set = dataset(test_df, data_path, transform.val_transform)
    test_loader = torch.utils.data.DataLoader(testing_set, batch_size=32,
                                              shuffle=False)
    print("\nInference on the testing set")

    # load the best model and make inference
    checkpoint = torch.load(log_dir + "/Checkpoint.pt")
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('%d epochs completed in %.3f seconds.' %
                 (num_epoch, (time.time() - t0)))
    print("Classification Report: \n{}".format(report))
    print('%d epochs completed in %.3f seconds.' %
          (num_epoch, (time.time() - t0)))
    print(f'Start Tensorboard with "tensorboard --logdir {log_dir}", '
          'view at http://localhost:6006/')
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape(-1, 1)
        y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape(-1, 1)
        y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape(-1, 1)
        y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape(-1, 1)
        y = np.concatenate([y1, y2, y3, y4], axis=1)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]
        y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0, class_y=y_train, gamma=GAMMA)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation, gamma=GAMMA)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES,
                         encoder_se_module=True, decoder_semodule=True,
                         h_columns=False, skip=True, act="swish", freeze_bn=True,
                         classification=CLASSIFICATION, attention_type="cbam",
                         center=True)
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=0.01,
            momentum=0.9,
            weight_decay=0.0001,
            nesterov=False,
        )
        scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=0)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ep = 0
        best_model_score = 0
        best_model_ep_score = 0
        checkpoint = base_ckpt + 1

        # Training resumes from epoch 84 (a restarted run).
        for epoch in range(84, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            LOGGER.info("Starting {} epoch...".format(epoch))

            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, cutmix_prob=0.0,
                                      classification=CLASSIFICATION)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss, val_score = validate(model, val_loader, criterion, device,
                                             classification=CLASSIFICATION)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))
            LOGGER.info('Mean valid score: {}'.format(round(val_score, 5)))
            scheduler.step()

            if val_score > best_model_score:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}_score.pth'.format(
                               EXP_ID, FOLD_ID, checkpoint))
                best_model_score = val_score
                best_model_ep_score = epoch
            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(
                               EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                LOGGER.info('Best valid score: {} on epoch={}'.format(
                    round(best_model_score, 5), best_model_ep_score))
                checkpoint += 1
                best_model_loss = 999
                best_model_score = 0

            # del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
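# --- Hedged sketch: how the per-cycle snapshots saved above are typically
# combined at inference (snapshot ensembling). Assumes `model` is the bare
# (non-DataParallel) module, since the checkpoints were saved from
# `model.module.state_dict()`. Paths and the helper name are illustrative,
# not the repo's actual inference code. ---
import torch


def snapshot_ensemble(model, ckpt_paths, images):
    # Average sigmoid predictions over the cosine-cycle checkpoints.
    probs = []
    with torch.no_grad():
        for path in ckpt_paths:
            model.load_state_dict(torch.load(path))
            model.eval()
            probs.append(torch.sigmoid(model(images)))
    return torch.stack(probs).mean(0)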
def main():
    parser = argparse.ArgumentParser(description='NA')
    parser.add_argument('-c', '--configure', default='cfgs/chexphoto.cfg',
                        help='JSON file')
    parser.add_argument('-cp', '--checkpoint', default=None,
                        help='checkpoint path')
    args = parser.parse_args()
    checkpoint = args.checkpoint

    # read configure file
    with open(args.configure) as f:
        cfg = json.load(f)
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    tensorboard_writer = logger.make_writer(cfg["session"]["sess_name"], time_str)

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    valid = cfg['data']['test_csv_name']
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])

    # create dataset
    training_set = pd.read_csv(data, usecols=["file_name", "label"])
    valid_set = pd.read_csv(valid, usecols=["file_name", "label"])
    # train, test, _, _ = dataloader.data_split(training_set, validation_split)
    training_set = dataloader.ClassificationDataset(training_set, data_path,
                                                    transform.train_transform)
    testing_set = dataloader.ClassificationDataset(valid_set, data_path,
                                                   transform.val_transform)

    # create dataloaders
    # A sampler could be used here to counter imbalanced labels, e.g.:
    # train_loader = torch.utils.data.DataLoader(
    #     training_set,
    #     sampler=ImbalancedDatasetSampler(
    #         training_set,
    #         callback_get_label=lambda x, i: tuple(x[i][1].tolist())),
    #     batch_size=batch_size)
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )
    logging.info("Dataset and Dataloaders created")

    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()

    # load checkpoint to continue training
    if checkpoint is not None:
        print('...Load checkpoint from {}'.format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print('...Checkpoint loaded')

    # replace the last linear layer with a custom classifier
    classifier = nn.Sequential(nn.Linear(1408, 512, bias=True),
                               nn.ReLU(inplace=True),
                               nn.Linear(512, 6, bias=True))
    # model._avg_pooling = SPPLayer([1, 2, 4])
    model._fc = classifier
    # model.last_linear = self.cls

    # select which layers to unfreeze
    params = list(model.parameters())
    len_param = len(params)
    # for index, param in enumerate(model.parameters()):
    #     param.requires_grad = (index == len_param - 1)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # convert to suitable device
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read optimizer settings from the config file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimizing methods: lr, scheduler of lr, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # otherwise fall back to a custom loss
        criterion = getattr(custom_loss, loss_function)
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers)

    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    # scheduler = ReduceLROnPlateau(optimizer, save_method, patience=patiences, factor=lr_factor)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    # before training, create a file for logging model results
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")

    # export the result to log file
    f = open("saved/logs/traning_{}.txt".format(cfg["session"]["sess_name"]), "a")
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:
    # logging.info(f.read())

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )

        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
                i + 1, num_epoch, loss))
        print("Epoch {} / {} \n Training acc: {} - Other training metrics: ".format(
            i + 1, num_epoch, train_result["accuracy_score"]))
        print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
            i + 1, num_epoch, loss))
        f.write("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
            i + 1, num_epoch, loss))
        f.write("Epoch {} / {} \n Training acc: {} - Other training metrics: ".format(
            i + 1, num_epoch, train_result["accuracy_score"]))

        tensorboard_writer.add_scalar("training accuracy",
                                      train_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("training f1_score",
                                      train_result["f1_score"], i + 1)
        tensorboard_writer.add_scalar("training metrics", loss, i + 1)
        logging.info(train_result)

        logging.info(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
        print("Epoch {} / {} \n valid acc: {} - Other training metrics: ".format(
            i + 1, num_epoch, val_result["accuracy_score"]))
        f.write(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
        tensorboard_writer.add_scalar("valid accuracy",
                                      val_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("valid f1_score",
                                      val_result["f1_score"], i + 1)
        tensorboard_writer.add_scalar("valid metrics", val_loss, i + 1)
        logging.info(val_result)
        logging.info("\n")

        # save the epoch with the best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]) +
                         "===> Save best epoch")
            f.write("Validation accuracy= " +
                    str(val_result["accuracy_score"]) +
                    "===> Save best epoch")
            best_val_acc = val_result["accuracy_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" + cfg["train"]["save_as_name"],
            )
        # else:
        #     logging.info("Validation accuracy= " +
        #                  str(val_result["accuracy_score"]) + "===> No saving")
        # CosineAnnealingLR steps per epoch and takes no metric argument
        # (the metric-based call belonged to the commented-out ReduceLROnPlateau).
        scheduler.step()

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["file_name", "label"])

    # prepare the dataset and dataloader
    testing_set = dataloader.ClassificationDataset(test_df, data_path,
                                                   transform.val_transform)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set, batch_size=32,
                                              shuffle=False)
    print("Inference on the testing set")

    # load the best model and make inference
    test_model = cls.ClassificationModel(model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device, cfg))
def main():
    # read configure file
    with open("cfgs/tenes.cfg") as f:
        cfg = json.load(f)

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])

    # create dataset
    training_set = pd.read_csv(data, usecols=["image_name", "target"])
    training_set["image_name"] = training_set["image_name"] + '.jpg'
    training_set = shuffle(training_set)
    training_set = training_set.sample(25000)
    print(training_set['target'].value_counts())

    train, test, _, _ = dataloader.data_split(training_set, validation_split)
    training_set = dataloader.ClassificationDataset(train, data_path,
                                                    transform.train_transform)
    testing_set = dataloader.ClassificationDataset(test, data_path,
                                                   transform.val_transform)

    # create dataloaders
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )
    logging.info("Dataset and Dataloaders created")

    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # convert to suitable device
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read optimizer settings from the config file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimizing methods: lr, scheduler of lr, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # otherwise fall back to a custom loss
        criterion = getattr(custom_loss, loss_function)
    # the configured loss is overridden with focal loss here
    criterion = custom_loss.FocalLoss()
    optimizer = getattr(torch.optim, optimizers)
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer, save_method, patience=patiences,
                                  factor=lr_factor)

    # before training, create a file for logging model results
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")

    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("Training size: " + str(len(train)))
    logging.info("Validation size: " + str(len(test)))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )
        # lr scheduling
        scheduler.step(val_loss)

        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
                i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
        logging.info(val_result)
        logging.info("\n")

        # save the epoch with the best validation f1 score
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch")
            best_val_acc = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" + cfg["train"]["save_as_name"],
            )
        else:
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> No saving")
            continue

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["image_name", "target"])
    test_df['image_name'] = test_df['image_name'] + '.jpg'

    # prepare the dataset and dataloader
    testing_set = dataloader.TestDataset(test_df, 'dataset/test/test',
                                         transform.test_transform)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set, batch_size=16,
                                              shuffle=False)
    print("\n Inference on the testing set")

    # load the best model and make inference
    test_model = cls.ClassificationModel(model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device))
def main(collocation,
         model,
         dataset,
         validation_flag,
         current_fold,
         comment="No comment",
         checkpoint=None,
         logger=None,
         num_of_class=2):
    # read training set
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter="*", header=None)

    # check if a custom validation set is provided
    if validation_flag == 1:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter="*", header=None)
    else:
        # automatically split a validation set from the training set
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # create datasets and dataloaders
    training_set = dataset(training_set, data_path, padding=True, normalize=True)
    testing_set = dataset(valid_set, data_path, padding=True, normalize=True)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               collate_fn=collocation)
    val_loader = torch.utils.data.DataLoader(testing_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             collate_fn=collocation)
    # val_loader = torch.utils.data.DataLoader(testing_set, sampler=ImbalancedDatasetSampler(testing_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())), batch_size=batch_size)
    logging.info("Dataset and Dataloaders created")

    # create a model (note: the `model` argument is overridden here)
    # extractor_name = cfg["train"]["extractor"]
    # model = cls(model_name=extractor_name).create_model()
    # model = cls(num_blocks=6, in_channels=1, out_channels=64, bottleneck_channels=0, kernel_sizes=8, num_pred_classes=2)
    model = cls(class_num=2,
                num_of_blocks=9,
                training=True,
                dense_layers=[256, 256])
    for param in model.parameters():
        param.requires_grad = True

    # load checkpoint to continue training
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    time.sleep(4)
    logging.info("Model created...")

    # create metrics for evaluation
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read optimization settings from the config file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize the loss: look it up in torch.nn first, then fall back to
    # custom_loss (getattr with a default never raises, so the lookup must be
    # explicit rather than wrapped in try/except)
    if hasattr(nn, loss_function):
        criterion = getattr(nn, loss_function)
    elif hasattr(custom_loss, loss_function):
        criterion = getattr(custom_loss, loss_function)
    else:
        raise ValueError("The loss {} is not available".format(loss_function))
    # the configured loss is currently overridden by a weighted focal loss
    criterion = custom_loss.WeightedFocalLoss(weight=None, gamma=2, reduction="sum")
    # criterion = nn.CrossEntropyLoss(reduction='none')

    optimizer = getattr(torch.optim, optimizers)
    # unused cosine-schedule settings kept from an earlier experiment
    max_lr = 3e-3  # maximum LR
    min_lr = 1e-5  # minimum LR
    t_max = 10     # how many epochs to go from max_lr to min_lr
    # note: the momentum kwarg assumes an SGD-style optimizer in the config
    optimizer = optimizer(model.parameters(), lr=learning_rate, momentum=0.9)

    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode=save_method,
        factor=lr_factor,
        min_lr=1e-5,
        verbose=True,
        patience=patiences,
    )

    # before training, create a Neptune experiment for tracking
    neptune.init("deepbox/gtopia-ml")
    PARAMS = {
        "loss_function": cfg["optimizer"]["loss"],
        "optimizers": cfg["optimizer"]["name"],
        "learning_rate": cfg["optimizer"]["lr"],
        "lr_factor": cfg["train"]["reduce_lr_factor"],
        "patiences": cfg["train"]["patience"],
        "data_path": cfg["data"]["data_csv_name"],
        "batch_size": batch_size,
    }
    neptune.create_experiment(
        name=comment + "_" + str(current_fold),
        params=PARAMS,
        tags=[
            str(current_fold), cfg["train"]["model.class"], cfg["data"]["mode"]
        ],
    )
    logging.info("Created experiment tracking protocol")

    print("Beginning training...")
    print("Training shape: ", len(train_loader.dataset))
    print("Validation shape: ", len(val_loader.dataset))
    time.sleep(3)

    # export the run description to the log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    # training loop
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
            num_of_class,
        )
        # neptune logging
        neptune.log_metric("train_loss", loss)
        neptune.log_metric("validation_loss", val_loss)
        for single_metric in train_result.keys():
            neptune.log_metric("train_" + single_metric,
                               train_result[single_metric])
            neptune.log_metric("val_" + single_metric,
                               val_result[single_metric])
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation f1
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         " ===> Save best epoch \n")
            best_val_acc = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" + str(current_fold) + "-" +
                cfg["train"]["save_as_name"],
            )
        # lr scheduling on the validation loss
        scheduler.step(val_loss)

    # testing on the test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("Reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter="*", header=None)

    # prepare the dataset and dataloader
    testing_set = dataset(test_df, data_path, padding=False, normalize=True)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=1,
                                              shuffle=False,
                                              collate_fn=collocation)
    print("Inference on the testing set")

    # load the best checkpoint and run inference
    # test_model = cls(num_blocks=6, in_channels=1, out_channels=64, bottleneck_channels=0, kernel_sizes=8, num_pred_classes=2)
    test_model = cls(class_num=2,
                     num_of_blocks=9,
                     training=True,
                     dense_layers=[256, 256])
    model_path = os.path.join(
        "saved/models",
        time_str + "-" + str(current_fold) + "-" + cfg["train"]["save_as_name"],
    )
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    # run the test pass once and reuse the result for logging and the report
    test_report = tester.adaptive_test_result(test_model, test_loader, device,
                                              cfg, num_of_class)
    logging.info(test_report)
    with open("test_report.txt", "w") as f:
        f.write("Test results \n : {}".format(test_report))

    # attach the report and code snapshots to the experiment
    neptune.log_artifact("test_report.txt")
    neptune.log_artifact("data_loader/dataloader.py")
    neptune.log_artifact("cfgs/tenes.cfg")
    neptune.log_artifact("trainer.py")
    neptune.log_artifact("test.py")
    neptune.log_artifact("run_exp_2.py")
    if cfg["train"]["model.class"] == "Lecnet":
        neptune.log_artifact("model/classification.py")
    else:
        neptune.log_artifact("model/benchmark.py")

    print("---End of testing phase----")
    neptune.stop()
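# NOTE: `data_split`, used above to carve a validation set out of the training
# DataFrame, is not defined in this file. Below is a minimal sketch under the
# assumption that it performs a plain shuffled row split; the repository's real
# helper may stratify by label, so treat this as a hypothetical illustration.
import pandas as pd


def data_split(df: pd.DataFrame, validation_ratio: float, seed: int = 0):
    """Split df into (train, valid) with `validation_ratio` of rows in valid."""
    shuffled = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_valid = int(len(shuffled) * validation_ratio)
    valid_df = shuffled.iloc[:n_valid].reset_index(drop=True)
    train_df = shuffled.iloc[n_valid:].reset_index(drop=True)
    return train_df, valid_df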
def main(collocation,
         model,
         dataset,
         validation_flag,
         current_fold,
         comment="No comment",
         checkpoint=None,
         logger=None):
    # parser = argparse.ArgumentParser(description='NA')
    # parser.add_argument('-c', '--configure', default='cfgs/chexphoto.cfg', help='JSON file')
    # parser.add_argument('-cp', '--checkpoint', default=None, help='checkpoint path')
    # args = parser.parse_args()
    # checkpoint = args.checkpoint
    # # read configure file
    # with open(args.configure) as f:
    #     cfg = json.load(f)

    # using parsed configurations to create a dataset
    # read training set
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter="*", header=None)

    # check if a custom validation set is provided
    if validation_flag == 1:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter="*", header=None)
    else:
        # automatically split a validation set from the training set
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # create datasets
    # train, test, _, _ = dataloader.data_split(training_set, validation_split)
    training_set = dataset(training_set, data_path, padding=True, normalize=True)
    testing_set = dataset(valid_set, data_path, padding=True, normalize=True)

    # create dataloaders
    # a sampler can be used here to counter imbalanced labels:
    # train_loader = torch.utils.data.DataLoader(training_set, sampler=ImbalancedDatasetSampler(training_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())), batch_size=batch_size)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               collate_fn=collocation)
    val_loader = torch.utils.data.DataLoader(testing_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             collate_fn=collocation)
    # val_loader = torch.utils.data.DataLoader(testing_set, sampler=ImbalancedDatasetSampler(testing_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())), batch_size=batch_size)
    logging.info("Dataset and Dataloaders created")

    # create a model (note: the `model` argument is overridden here)
    # extractor_name = cfg["train"]["extractor"]
    # model = cls(model_name=extractor_name).create_model()
    model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])
    # model = cls(num_blocks=8, in_channels=1, out_channels=64, bottleneck_channels=0, kernel_sizes=8, num_pred_classes=2)

    # load checkpoint to continue training
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")

    # custom classifier head (currently unused; see the commented lines below)
    classifier = nn.Sequential(
        nn.Linear(1408, 512, bias=True),
        nn.ReLU(inplace=True),
        nn.Linear(512, 6, bias=True),
    )
    # replace the last linear layer with the custom classifier
    # model._avg_pooling = SPPLayer([1, 2, 4])
    # model._fc = classifier
    # model.last_linear = self.cls

    # select which layers to unfreeze
    params = list(model.parameters())
    len_param = len(params)
    # for index, param in enumerate(model.parameters()):
    #     param.requires_grad = (index == len_param - 1)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    time.sleep(4)
    logging.info("Model created...")

    # create metrics for evaluation
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read optimization settings from the config file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize the loss: look it up in torch.nn first, then fall back to
    # custom_loss (getattr with a default never raises, so check explicitly)
    if hasattr(nn, loss_function):
        criterion = getattr(nn, loss_function)
    elif hasattr(custom_loss, loss_function):
        criterion = getattr(custom_loss, loss_function)
    else:
        raise ValueError("The loss {} is not available".format(loss_function))
    # the configured loss is currently overridden by a weighted focal loss
    criterion = custom_loss.WeightedFocalLoss(weight=None, gamma=2, reduction="mean")
    # criterion = nn.CrossEntropyLoss(reduction='none')

    optimizer = getattr(torch.optim, optimizers)
    # unused cosine-schedule settings kept from an earlier experiment
    max_lr = 3e-3  # maximum LR
    min_lr = 1e-5  # minimum LR
    t_max = 10     # how many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    # note: these config values are read but the scheduler below is hardcoded
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    # scheduler = ReduceLROnPlateau(optimizer, save_method, patience=patiences, factor=lr_factor)
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode="min",
                                  factor=0.5,
                                  min_lr=1e-5,
                                  verbose=True,
                                  patience=5)

    # before training, log the model and run description
    print("Beginning training...")
    time.sleep(3)
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:
    # logging.info(f.read())

    # training loop
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
                i + 1, num_epoch, loss))
        # tensorboard_writer.add_scalar("training accuracy", train_result["accuracy_score"], i + 1)
        # tensorboard_writer.add_scalar("training f1_score", train_result["f1_score"], i + 1)
        # tensorboard_writer.add_scalar("training metrics", loss, i + 1)
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(val_loss))
        # tensorboard_writer.add_scalar("valid accuracy", val_result["accuracy_score"], i + 1)
        # tensorboard_writer.add_scalar("valid f1_score", val_result["f1_score"], i + 1)
        # tensorboard_writer.add_scalar("valid metrics", val_loss, i + 1)
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation f1
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         " ===> Save best epoch \n")
            best_val_acc = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" + str(current_fold) + "-" +
                cfg["train"]["save_as_name"],
            )
        # else:
        #     logging.info("Validation accuracy= " + str(val_result["accuracy_score"]) + " ===> No saving")
        # lr scheduling on the validation loss
        scheduler.step(val_loss)

    # testing on the test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("Reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter="*", header=None)

    # prepare the dataset and dataloader
    testing_set = dataset(test_df, data_path, padding=False, normalize=True)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=1,
                                              shuffle=False,
                                              collate_fn=collocation)
    print("Inference on the testing set")

    # load the best checkpoint and run inference
    test_model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])
    # test_model = cls(num_blocks=8, in_channels=1, out_channels=64, bottleneck_channels=0, kernel_sizes=8, num_pred_classes=2)
    model_path = os.path.join(
        "saved/models",
        time_str + "-" + str(current_fold) + "-" + cfg["train"]["save_as_name"],
    )
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.adaptive_test_result(test_model, test_loader, device, cfg))
    print("---End of testing phase----")
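# NOTE: both variants above override the configured loss with
# `custom_loss.WeightedFocalLoss`, whose module is not shown in this file.
# Below is a minimal sketch of a binary/multi-label weighted focal loss with
# the same constructor signature; it is an assumed implementation operating on
# logits, not the repository's actual code.
import torch
import torch.nn as nn
import torch.nn.functional as F


class WeightedFocalLoss(nn.Module):
    def __init__(self, weight=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.weight = weight        # optional per-class weights
        self.gamma = gamma          # focusing parameter
        self.reduction = reduction  # "mean", "sum", or "none"

    def forward(self, logits, targets):
        # unreduced BCE-with-logits, so the focal term can modulate per element
        bce = F.binary_cross_entropy_with_logits(
            logits, targets, weight=self.weight, reduction="none")
        p_t = torch.exp(-bce)  # probability assigned to the true class
        loss = (1.0 - p_t) ** self.gamma * bce
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss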
def main():
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                # ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            # OneOf([
            #     ShiftScaleRotate(p=0.5),
            #     RandomRotate90(p=0.5),
            #     Rotate(p=0.5)
            # ], p=0.5),
            OneOf([
                Blur(blur_limit=8, p=0.5),
                MotionBlur(blur_limit=8, p=0.5),
                MedianBlur(blur_limit=8, p=0.5),
                GaussianBlur(blur_limit=8, p=0.5)
            ], p=0.5),
            OneOf([
                # CLAHE(clip_limit=4, tile_grid_size=(4, 4), p=0.5),
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5)
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=2)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('se_resnext50_32x4d', encoder_weights='imagenet',
                         classes=N_CLASSES)
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1,
                                           total_epoch=CLR_CYCLE * 2,
                                           after_scheduler=scheduler_cosine)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ep = 0
        checkpoint = 0

        for epoch in range(1, EPOCHS + 1):
            # at the end of each warm-restart cycle, report per-class thresholds
            # for the best snapshot so far, then reset the best loss
            if epoch % (CLR_CYCLE * 2) == 0:
                y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                for i in range(N_CLASSES):
                    th, score, _, _ = search_threshold(y_val[:, i, :, :],
                                                       best_pred[:, i, :, :])
                    LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                        round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))
                checkpoint += 1
                best_model_loss = 999

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss, val_pred, y_val = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.state_dict(),
                           '{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                best_pred = val_pred

            del val_pred
            gc.collect()

    with timer('eval'):
        y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        for i in range(N_CLASSES):
            th, score, _, _ = search_threshold(y_val[:, i, :, :],
                                               best_pred[:, i, :, :])
            LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
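# NOTE: `search_threshold`, used above to pick a per-class binarization
# threshold, is not defined in this file. The sketch below assumes it sweeps a
# threshold grid and returns (best_threshold, best_score, thresholds, scores),
# matching the four values unpacked above; the grid and Dice details are
# assumptions, not the repository's exact implementation.
import numpy as np


def dice_coef(y_true, y_pred_bin, eps=1e-7):
    # mean Dice over samples for a single class
    inter = (y_true * y_pred_bin).sum(axis=(1, 2))
    denom = y_true.sum(axis=(1, 2)) + y_pred_bin.sum(axis=(1, 2))
    return ((2 * inter + eps) / (denom + eps)).mean()


def search_threshold(y_true, y_pred, grid=np.arange(0.1, 0.9, 0.05)):
    """Return (best_threshold, best_score, all_thresholds, all_scores)."""
    scores = [dice_coef(y_true, (y_pred > th).astype(np.float32)) for th in grid]
    best_idx = int(np.argmax(scores))
    return grid[best_idx], scores[best_idx], list(grid), scores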
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        y = (df.sum_target != 0).astype("float32").values

    with timer('preprocessing'):
        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None  # no validation loader in this run

        train_dataset = SeverDataset(df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0, class_y=y)
        train_sampler = MaskProbSampler(df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  sampler=train_sampler, num_workers=8)

        del df, train_dataset
        gc.collect()

    with timer('create model'):
        model = smp_old.Unet('resnet34', encoder_weights="imagenet",
                             classes=N_CLASSES, encoder_se_module=True,
                             decoder_semodule=True, h_columns=False, skip=True,
                             act="swish", freeze_bn=True,
                             classification=CLASSIFICATION)
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam([
            {'params': model.decoder.parameters(), 'lr': 3e-3},
            {'params': model.encoder.parameters(), 'lr': 3e-4},
        ])
        # if base_model is None:
        #     scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
        #     scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=CLR_CYCLE*2, after_scheduler=scheduler_cosine)
        # else:
        #     scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        # training resumes from epoch 71
        for epoch in range(71, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer,
                                      device, cutmix_prob=0.0,
                                      classification=CLASSIFICATION)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))
            # scheduler.step()

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_latest.pth'.format(EXP_ID))
            gc.collect()
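# NOTE: `seed_torch` is called each epoch above but is not defined in this
# file. Below is a conventional implementation (an assumption, not the
# repository's exact helper) that seeds Python, NumPy, and all CUDA devices;
# the cudnn flags are a common reproducibility convention, not confirmed here.
import os
import random

import numpy as np
import torch


def seed_torch(seed=0):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False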