# NOTE(review): fragment — the matching `if` header (presumably a CUDA-availability
# check that also sets params.device) precedes this chunk, and the trailing
# train_and_evaluate(...) call is cut off mid-argument-list.
    logger.info('Using Cuda...')
    model = net.Net(params).cuda()
else:
    params.device = torch.device('cpu')
    # torch.manual_seed(230)
    logger.info('Not using cuda...')
    model = net.Net(params)

# Route all subsequent log records to <model_dir>/train.log.
utils.set_logger(os.path.join(model_dir, 'train.log'))

logger.info('Loading the datasets...')
train_set = TrainDataset(data_dir, args.dataset, params.num_class)
test_set = TestDataset(data_dir, args.dataset, params.num_class)
sampler = WeightedSampler(data_dir, args.dataset)  # Use weighted sampler instead of random sampler
train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=sampler, num_workers=4)
# Test loader draws in random order; batch size is the prediction batch size.
test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
logger.info('Loading complete.')

logger.info(f'Model: \n{str(model)}')

optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

# fetch loss function
loss_fn = net.loss_fn

# Train the model
logger.info('Starting training for {} epoch(s)'.format(params.num_epochs))
train_and_evaluate(model, train_loader, test_loader, optimizer, loss_fn,
# NOTE(review): fragment — trailing OpenAIAdam(...) call is cut off mid-argument-list.
# Wrap both networks for multi-GPU training when more than one device is visible.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    discriminator = nn.DataParallel(discriminator)
model.to(params.device)
discriminator.to(params.device)

# Route all subsequent log records to <model_dir>/train.log.
utils.set_logger(os.path.join(model_dir, 'train.log'))

logger.info('Loading the datasets...')
train_set = TrainDataset(data_dir, args.dataset, params.num_class)
valid_set = ValidDataset(data_dir, args.dataset, params.num_class)
test_set = TestDataset(data_dir, args.dataset, params.num_class)
#sampler = WeightedSampler(data_dir, args.dataset) # Use weighted sampler instead of random sampler
train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=RandomSampler(train_set), num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=params.predict_batch, sampler=RandomSampler(valid_set), num_workers=4)
test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
logger.info('Loading complete.')

# Total optimizer steps over the whole run, used for the generator LR schedule.
n_updates_total = (train_set.__len__() // params.batch_size) * params.num_epochs

# Separate optimizers: RMSprop for the discriminator, warmup-scheduled Adam for the generator.
optimizer_D = optim.RMSprop(discriminator.parameters(), lr = params.lr_d)
optimizer_G = OpenAIAdam(model.parameters(),
                         lr=params.lr,
                         schedule=params.lr_schedule,
                         warmup=params.lr_warmup,
                         t_total=n_updates_total,
                         b1=0.9,
                         b2=0.999,
                         e=1e-8,
def main(args, logger):
    """Run 5-fold GroupKFold training of a RoBERTa multi-label classifier.

    For each fold: build train/valid QUEST datasets, train for MAX_EPOCH epochs
    (BERT body frozen during epoch 0), checkpoint every epoch, and finally log
    per-fold and aggregate best-metric statistics.

    Args:
        args: parsed CLI namespace; uses `args.checkpoint` (resume path or falsy)
            and `args.debug` (subsample folds to 100 rows).
        logger: logger passed through to `sel_log`.
    """
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # Group folds by the label-encoded question body so duplicated questions
    # never straddle a train/valid boundary.
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    # Per-fold history dicts: fold index -> list of per-epoch values.
    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # Folds fully covered by the loaded checkpoint: replay their best
        # metrics from history and skip retraining.
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        # FIX: removed a dead pd.Series/value_counts word-frequency pass whose
        # result (`tokens = temp[temp >= 10].index.tolist()`) was immediately
        # overwritten by the hard-coded category tokens below — it only burned
        # time and memory every fold.
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # Seed the classifier with pretrained RoBERTa weights.
        state_dict = RobertaModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = RobertaModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            # Warm-up epoch 0 trains only the head; BERT body is unfrozen after.
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            # Re-wrapped every epoch; unwrapped again via `.module` below.
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()

            # Append per-epoch stats, creating the fold's list on first epoch.
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)

            # Unwrap DataParallel and move to CPU before checkpointing.
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                            model, optimizer, scheduler, histories,
                            val_y_preds, val_y_trues, val_qa_ids,
                            fold, epoch, val_loss, val_metric)

        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer, clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
# NOTE(review): fragment — this chunk opens with the tail of a dataset-construction
# call whose header precedes it, and ends at a dangling `else:`.
        environment_provider=env, properties=['energy', 'forces'])

if not os.path.isdir(args.modelpath):
    os.makedirs(args.modelpath)
if args.split_path is not None:
    copyfile(args.split_path, split_path)

#from sklearn.model_selection import train_test_split
#train,test = train_test_split(df, test_size=0.20, random_state=42,stratify=df['Ebin'].values)
print(args.batch_size)

# Training loader shuffles; validation loader keeps default (sequential) order.
# num_workers scales with the visible GPU count.
train_loader = schnetpack2.custom.data.AtomsLoader(
    data_train,
    batch_size=args.batch_size,
    sampler=RandomSampler(data_train),
    num_workers=9 * torch.cuda.device_count(),
    pin_memory=True)
val_loader = schnetpack2.custom.data.AtomsLoader(
    data_val,
    batch_size=args.batch_size,
    num_workers=9 * torch.cuda.device_count(),
    pin_memory=True)

# Dataset statistics used to normalise targets; second arg presumably toggles
# per-atom normalisation — TODO confirm against AtomsLoader.get_statistics.
mean, stddev = train_loader.get_statistics('energy', False)
mean_forces, stddev_forces = train_loader.get_statistics('forces', True)
#mean = -4178.7568
#stddev = 29.6958

# Hard-coded statistics for the uncertainty head (second slot is the variance channel).
if args.uncertainty:
    mean, stddev = torch.tensor([-1202.6432, 0]), torch.tensor([12.3304, 1])
else:
def fit(
    model,
    train_dataset,
    val_dataset,
    optimizer_name="adam",
    samples_per_player=0,
    epochs=50,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    acc_steps=1,
    swa_first_epoch=50,
    num_classes_aux=0,
    aux_mode="sigmoid",
    verbose=1,
    first_epoch_eval=0,
    device="cuda",
):
    """
    Fitting function for the classification task.

    Args:
        model (torch model): Model to train.
        train_dataset (torch dataset): Dataset to train with.
        val_dataset (torch dataset): Dataset to validate with.
        optimizer_name (str, optional): Optimizer name. Defaults to 'adam'.
        samples_per_player (int, optional): Number of images to use per player. Defaults to 0.
        epochs (int, optional): Number of epochs. Defaults to 50.
        batch_size (int, optional): Training batch size. Defaults to 32.
        val_bs (int, optional): Validation batch size. Defaults to 32.
        warmup_prop (float, optional): Warmup proportion. Defaults to 0.1.
        lr (float, optional): Learning rate. Defaults to 1e-3.
        acc_steps (int, optional): Accumulation steps. Defaults to 1.
        swa_first_epoch (int, optional): Epoch to start applying SWA from. Defaults to 50.
        num_classes_aux (int, optional): Number of auxiliary classes. Defaults to 0.
        aux_mode (str, optional): Mode for auxiliary classification. Defaults to 'sigmoid'.
        verbose (int, optional): Period (in epochs) to display logs at. Defaults to 1.
        first_epoch_eval (int, optional): Epoch to start evaluating at. Defaults to 0.
        device (str, optional): Device for torch. Defaults to "cuda".

    Returns:
        numpy array [len(val_dataset)]: Last predictions on the validation data.
        numpy array [len(val_dataset) x num_classes_aux]: Last aux predictions on the val data.
    """
    optimizer = define_optimizer(optimizer_name, model.parameters(), lr=lr)
    # Wrap with SWA only if SWA will actually kick in during this run.
    if swa_first_epoch <= epochs:
        optimizer = SWA(optimizer)

    loss_fct = nn.BCEWithLogitsLoss()
    # Aux head: multi-label BCE in "sigmoid" mode, single-label CE otherwise.
    loss_fct_aux = nn.BCEWithLogitsLoss(
    ) if aux_mode == "sigmoid" else nn.CrossEntropyLoss()
    aux_loss_weight = 1 if num_classes_aux else 0

    if samples_per_player:
        # Cap the number of samples drawn per player each epoch.
        sampler = PlayerSampler(
            RandomSampler(train_dataset),
            train_dataset.players,
            batch_size=batch_size,
            drop_last=True,
            samples_per_player=samples_per_player,
        )
        train_loader = DataLoader(
            train_dataset,
            batch_sampler=sampler,
            num_workers=NUM_WORKERS,
            pin_memory=True,
        )
        print(
            f"Using {len(train_loader)} out of {len(train_dataset) // batch_size} "
            f"batches by limiting to {samples_per_player} samples per player.\n"
        )
    else:
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=NUM_WORKERS,
            pin_memory=True,
        )

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    num_training_steps = int(epochs * len(train_loader))
    num_warmup_steps = int(warmup_prop * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                                num_training_steps)

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        optimizer.zero_grad()
        avg_loss = 0

        # After SWA has started, weights were left swapped to the averaged set
        # at the end of the previous epoch — swap back to SGD weights to train.
        if epoch + 1 > swa_first_epoch:
            optimizer.swap_swa_sgd()

        for batch in train_loader:
            images = batch[0].to(device)
            y_batch = batch[1].to(device).view(-1).float()
            y_batch_aux = batch[2].to(device).float()
            # CE expects class indices (long); BCE expects float targets.
            y_batch_aux = y_batch_aux.float(
            ) if aux_mode == "sigmoid" else y_batch_aux.long()

            y_pred, y_pred_aux = model(images)

            loss = loss_fct(y_pred.view(-1), y_batch)
            if aux_loss_weight:
                loss += aux_loss_weight * loss_fct_aux(y_pred_aux, y_batch_aux)
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            scheduler.step()
            # Clear gradients by detaching instead of zero_grad (avoids a fill).
            for param in model.parameters():
                param.grad = None

        # Fold the current SGD weights into the running average, then swap to
        # the averaged weights so validation below evaluates the SWA model.
        if epoch + 1 >= swa_first_epoch:
            optimizer.update_swa()
            optimizer.swap_swa_sgd()

        preds = np.empty(0)
        preds_aux = np.empty((0, num_classes_aux))
        model.eval()
        avg_val_loss, auc, scores_aux = 0., 0., 0.

        if epoch + 1 >= first_epoch_eval or epoch + 1 == epochs:
            with torch.no_grad():
                for batch in val_loader:
                    images = batch[0].to(device)
                    y_batch = batch[1].to(device).view(-1).float()
                    y_aux = batch[2].to(device).float()
                    y_batch_aux = y_aux.float(
                    ) if aux_mode == "sigmoid" else y_aux.long()

                    y_pred, y_pred_aux = model(images)

                    loss = loss_fct(y_pred.detach().view(-1), y_batch)
                    if aux_loss_weight:
                        loss += aux_loss_weight * loss_fct_aux(
                            y_pred_aux.detach(), y_batch_aux)
                    avg_val_loss += loss.item() / len(val_loader)

                    y_pred = torch.sigmoid(y_pred).view(-1)
                    preds = np.concatenate(
                        [preds, y_pred.detach().cpu().numpy()])

                    if num_classes_aux:
                        y_pred_aux = (y_pred_aux.sigmoid() if aux_mode
                                      == "sigmoid" else y_pred_aux.softmax(-1))
                        preds_aux = np.concatenate(
                            [preds_aux, y_pred_aux.detach().cpu().numpy()])

            auc = roc_auc_score(val_dataset.labels, preds)
            if num_classes_aux:
                # Per-class AUC: one-vs-rest columns in sigmoid mode, class-index
                # indicator targets in softmax mode.
                if aux_mode == "sigmoid":
                    scores_aux = np.round(
                        [
                            roc_auc_score(val_dataset.aux_labels[:, i],
                                          preds_aux[:, i])
                            for i in range(num_classes_aux)
                        ],
                        3,
                    ).tolist()
                else:
                    scores_aux = np.round(
                        [
                            roc_auc_score((val_dataset.aux_labels == i).astype(int),
                                          preds_aux[:, i])
                            for i in range(num_classes_aux)
                        ],
                        3,
                    ).tolist()
            else:
                scores_aux = 0

        elapsed_time = time.time() - start_time
        if (epoch + 1) % verbose == 0:
            elapsed_time = elapsed_time * verbose
            lr = scheduler.get_last_lr()[0]
            print(
                f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={lr:.1e}\t t={elapsed_time:.0f}s \t"
                f"loss={avg_loss:.3f}",
                end="\t",
            )
            if epoch + 1 >= first_epoch_eval:
                print(
                    f"val_loss={avg_val_loss:.3f} \t auc={auc:.3f}\t aucs_aux={scores_aux}"
                )
            else:
                print("")

    del val_loader, train_loader, y_pred
    torch.cuda.empty_cache()
    return preds, preds_aux
article_torch = utils.to_tensor((X_data[index])) dict_ = {'article': article_torch} return dict_ def my_collate(batch): return batch if __name__ == '__main__': batch_size = 8 train_data, val_data, vocabulary = ( utils.to_tensor(np.concatenate(np.load('./dataset/wiki.train.npy'))), utils.to_tensor(np.concatenate(np.load('./dataset/wiki.valid.npy'))), np.load('./dataset/vocab.npy')) wiki_train_ds = WikiDataset(train_data) wiki_train_loader = data.DataLoader(wiki_train_ds, batch_size=batch_size, sampler=RandomSampler(wiki_train_ds), collate_fn=my_collate) for batch_index, batch_dict in enumerate(val_loader): print(batch_dict) if batch_index == 0: break
def construct_loader(cfg, split, is_precise_bn=False):
    """
    Constructs the data loader for the given dataset.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        split (str): the split of the data loader. Options include `train`,
            `val`, and `test`.
        is_precise_bn (bool): when True, the multigrid short-cycle batch
            sampler is not used even for the train split.
    """
    assert split in ["train", "val", "test"]

    # Per-split loader settings: which dataset config to read, and whether to
    # shuffle / drop the trailing partial batch.
    is_train = split == "train"
    if split == "test":
        dataset_name = cfg.TEST.DATASET
        per_gpu_batch = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS))
    else:
        dataset_name = cfg.TRAIN.DATASET
        per_gpu_batch = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
    shuffle = is_train
    drop_last = is_train

    # Construct the dataset
    dataset = build_dataset(dataset_name, cfg, split)

    short_cycle = cfg.MULTIGRID.SHORT_CYCLE and is_train and not is_precise_bn
    if short_cycle:
        # Multigrid short-cycle training: wrap the base sampler in a batch
        # sampler that varies the spatial/temporal batch shape.
        base_sampler = (
            DistributedSampler(dataset)
            if cfg.NUM_GPUS > 1
            else RandomSampler(dataset)
        )
        cycle_sampler = ShortCycleBatchSampler(
            base_sampler, batch_size=per_gpu_batch, drop_last=drop_last, cfg=cfg
        )
        return torch.utils.data.DataLoader(
            dataset,
            batch_sampler=cycle_sampler,
            num_workers=cfg.DATA_LOADER.NUM_WORKERS,
            pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
        )

    # Plain loader; a DistributedSampler replaces shuffling under multi-GPU.
    dist_sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=per_gpu_batch,
        shuffle=(False if dist_sampler else shuffle),
        sampler=dist_sampler,
        num_workers=cfg.DATA_LOADER.NUM_WORKERS,
        pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
        drop_last=drop_last,
        collate_fn=detection_collate if cfg.DETECTION.ENABLE else None,
    )
def load_data(args):
    """Load data from here and return.

    Note:
        Compose composes several transforms together; if augmentation is
        chosen an additional bunch of transforms is applied to the train data
        via the DataTransformer class, which returns the dataset used in the
        data loader. The data loader takes that dataset with a batch size and
        a sampler. The sampler defines the strategy to draw samples from the
        dataset: random draws over the training subset, and the validation
        subset in order.

    :param args:
        main_dir (string) : path to the main directory from the args.
        image_size (int) : size of the image to be resized.
        transform_prob (float) : probability to apply transformations on the data.
        batch_size (int) : batch size to be used in the data loader.

    :return: the train loader and validation loader to be used for training and validating.
    """
    # Local import keeps the fix self-contained (no module-header change).
    from torch.utils.data.sampler import SubsetRandomSampler

    # get data set file path
    data_path = os.path.join(args.main_dir, 'data', 'train-volume.tif')
    labels_path = os.path.join(args.main_dir, 'data', 'train-labels.tif')

    # compose the transforms for the train set
    train_data = Compose([Resize(args.image_size), ToTensor()])

    # choose between augmentations for train data
    if args.augment:
        train_augment = augmentations(args)
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=train_augment)
    else:
        # transforming the train data and returning a 4D tensor
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=None)

    # transform for validation data
    val_data = Compose([Resize(args.image_size), ToTensor()])
    val_transform = DataTransformer(data_path,
                                    labels_path,
                                    image_transform=val_data,
                                    image_augmentation=None)

    # split the train and validation indices
    train_indices, validation_indices = train_test_split(
        range(len(train_transform)), test_size=0.15)

    # BUG FIX: the original wrapped the *index lists* in RandomSampler /
    # SequentialSampler, which yield positions 0..len(indices)-1 rather than
    # the indices themselves — both loaders therefore ignored the split and
    # drew from the front of the full dataset (train and val overlapped).
    # Sample the actual subset indices instead.
    train_samples = SubsetRandomSampler(train_indices)
    # A plain index sequence is a valid sampler: validation subset in order.
    validation_samples = list(validation_indices)

    # load train and validation data
    train_loader = DataLoader(train_transform,
                              batch_size=args.batch_size,
                              sampler=train_samples)
    val_loader = DataLoader(val_transform,
                            batch_size=args.batch_size,
                            sampler=validation_samples)

    return train_loader, val_loader
# Build train/validation datasets keyed by unique frame numbers.
# NOTE(review): the train set uses get_valid_transforms() (no train-time
# augmentation is visible here) — confirm this is intentional.
train_dataset = DatasetRetriever(
    image_ids=train_csv.frame_no.unique(),
    marking=train_csv,
    transforms=get_valid_transforms(),
    test=False,
)

validation_dataset = DatasetRetriever(
    image_ids=test_csv.frame_no.unique(),
    marking=test_csv,
    transforms=get_valid_transforms(),
    test=True,
)

# Training: random order, drop the trailing partial batch.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    pin_memory=False,
    drop_last=True,
    num_workers=6,
    collate_fn=collate_fn,
)

# Validation: fixed sequential order, keep all samples.
val_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    num_workers=6,
    shuffle=False,
    sampler=SequentialSampler(validation_dataset),
    pin_memory=False,
    collate_fn=collate_fn,
)
def main():
    """Entry point: train and/or evaluate a DistilBERT QA model per CLI flags.

    Modes (from get_train_test_args): --do-finetune loads a saved checkpoint
    and freezes embeddings plus the first `freeze_layer` transformer layers;
    --do-train runs training with logging/checkpoints under save_dir;
    --do-eval evaluates and writes a CSV submission file.
    """
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")

    if args.do_finetune:
        # Replace the base model with the previously saved checkpoint, then
        # freeze embeddings and the lowest `freeze_layer` transformer layers.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        # NOTE(review): this `return` exits main() right after freezing, so a
        # finetune run never reaches the do_train branch below — confirm
        # whether the early return is intentional.
        return

    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        # Random order for training batches; fixed order for validation.
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        # Infer which split this eval_dir holds from its path.
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Always evaluate the saved checkpoint, not the in-memory model.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def run_training():
    """Iteration-based training loop for the CDiscount classifier.

    Resumes from `initial_checkpoint` (or loads `pretrained_file`), trains
    with SGD + gradient accumulation, periodically validates, and saves
    best-on-train / best-on-validation / periodic checkpoints.
    """
    #-------------------------------------------- Training settings --------------------------------------------
    out_dir = '../'  # s_xx1'
    # initial_checkpoint = None
    initial_checkpoint = '../checkpoint/best_train_model.pth'
    # pretrained_file = '../trained_models/LB=0.69565_inc3_00075000_model.pth'
    pretrained_file = None
    skip = []  #['fc.weight', 'fc.bias']

    num_iters = 1000 * 1000
    iter_smooth = 50        # reset running train stats every iter_smooth iters
    iter_valid = 100  #500  # validate every iter_valid iters
    iter_log = 5            # log a line every iter_log iters
    iter_save_freq = 50
    iter_save = [0, num_iters - 1] + list(
        range(0, num_iters,
              1 * iter_save_freq))  # first and last iters, then every 1000 iters
    validation_num = 10000
    batch_size = 128  #60 #512 #96 #256
    validation_batch_size = 128
    iter_accum = 4  #2 #448//batch_size

    valid_loss = 0.0
    valid_acc = 0.0
    batch_loss = 0.0
    batch_acc = 0.0
    best_valid_acc = 0.0
    best_train_acc = 0.0
    rate = 0
    iter_time_meter = AverageMeter()
    train_loss_meter = AverageMeter()
    train_acc_meter = AverageMeter()
    j = 0  # number of iters in total
    i = 0  # number of real iters where bp is conducted
    #-----------------------------------------------------------------------------------------------------------

    ## setup ---------------------------
    os.makedirs(out_dir + '/checkpoint/' + IDENTIFIER, exist_ok=True)
    os.makedirs(out_dir + '/backup/' + IDENTIFIER, exist_ok=True)

    log.write('\n--- [START %s] %s\n\n' %
              (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 64))
    log.write('** some experiment setting **\n')
    log.write('\tIDENTIFIER   = %s\n' % IDENTIFIER)
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH)
    log.write('\tout_dir      = %s\n' % out_dir)

    ## net -------------------------------
    log.write('** net setting **\n')
    net = Net(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH),
              num_classes=CDISCOUNT_NUM_CLASSES)
    if use_cuda: net.cuda()
    ####
    # if 0: #freeze early layers
    #     for p in net.layer0.parameters(): p.requires_grad = False
    #     for p in net.layer1.parameters(): p.requires_grad = False
    #     for p in net.layer2.parameters(): p.requires_grad = False
    #     for p in net.layer3.parameters(): p.requires_grad = False

    log.write('%s\n\n' % (type(net)))
    # log.write('\n%s\n'%(str(net)))
    # log.write(inspect.getsource(net.__init__)+'\n')
    # log.write(inspect.getsource(net.forward )+'\n')
    log.write('\n')

    ## optimiser ----------------------------------
    #LR = StepLR([ (0, 0.01), (200, 0.001), (300, -1)])
    # Step schedule in epochs; a negative rate terminates training below.
    LR = StepLR([(0, 0.01), (1, 0.001), (3, 0.0001)])
    ## optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005) ###0.0005
    # Only parameters with requires_grad are optimised (supports frozen layers).
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.01,
                          momentum=0.1,
                          weight_decay=0.0001)

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    transform_train = transforms.Compose([
        # transforms.ToTensor(): Converts a PIL.Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
        transforms.Lambda(lambda x: train_augment(x))
    ])
    transform_valid = transforms.Compose([
        # transforms.ToTensor(): Converts a PIL.Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
        transforms.Lambda(lambda x: valid_augment(x))
    ])

    train_dataset = CDiscountDataset(csv_dir + train_data_filename,
                                     root_dir,
                                     transform=transform_train)
    train_loader = DataLoader(
        train_dataset,
        #sampler = RandomSampler1(train_dataset,50000),
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=0,
        pin_memory=False)
    # if train_loader != None:
    print("Train loader loaded!")

    valid_dataset = CDiscountDataset(csv_dir + validation_data_filename,
                                     root_dir,
                                     transform=transform_valid)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=validation_batch_size,
                              drop_last=False,
                              num_workers=0,
                              pin_memory=False)
    # if valid_loader != None:
    print("Valid loader loaded!")

    # log.write('\ttrain_dataset.split = %s\n'%(train_dataset.split))
    # log.write('\tvalid_dataset.split = %s\n'%(valid_dataset.split))
    log.write('\tlen(train_dataset)  = %d\n' % (len(train_dataset)))
    log.write('\tlen(valid_dataset)  = %d\n' % (len(valid_dataset)))
    log.write('\tlen(train_loader)   = %d\n' % (len(train_loader)))
    log.write('\tlen(valid_loadernum_iters)   = %d\n' % (len(valid_loader)))
    log.write('\tbatch_size  = %d\n' % (batch_size))
    log.write('\titer_accum  = %d\n' % (iter_accum))
    log.write('\tbatch_size*iter_accum  = %d\n' % (batch_size * iter_accum))
    # log.write('\n')
    # log.write(inspect.getsource(train_augment)+'\n')
    # log.write(inspect.getsource(valid_augment)+'\n')
    # log.write('\n')
    ####
    # if 0: ## check data
    #     check_dataset(train_dataset, train_loader)
    #     exit(0)

    ## resume from previous ----------------------------------
    start_iter = 0
    start_epoch = 0.
    if initial_checkpoint is not None:
        # load a checkpoint and resume from previous training
        log.write('\tloading @ initial_checkpoint = %s\n' % initial_checkpoint)
        # load
        if os.path.isfile(initial_checkpoint):
            print("=> loading checkpoint '{}'".format(initial_checkpoint))
            checkpoint = torch.load(initial_checkpoint)
            start_epoch = checkpoint['epoch']
            start_iter = checkpoint['iter']
            best_train_acc = checkpoint['best_train_acc']
            best_valid_acc = checkpoint['best_valid_acc']
            net.load_state_dict(checkpoint['state_dict']
                                )  # load model weights from the checkpoint
            optimizer.load_state_dict(checkpoint['optimizer'])
            # net.load_state_dict(checkpoint)
            log.write(
                "=> loaded checkpoint '{}' (epoch: {}, iter: {}, best_train_acc: {}, best_valid_acc: {})"
                .format(initial_checkpoint, start_epoch, start_iter,
                        best_train_acc, best_valid_acc))
        else:
            print("=> no checkpoint found at '{}'".format(initial_checkpoint))
            exit(0)
    elif pretrained_file is not None:
        # load a pretrained model and train from the beginning
        log.write('\tloading @ pretrained_file = %s\n' % pretrained_file)
        net.load_pretrain_pytorch_file(pretrained_file, skip)

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write('\toptimizer=%s\n' % str(optimizer))
    # log.write(' LR=%s\n\n'%str(LR) )
    log.write(
        ' rate    iter  epoch  | valid_loss/acc | train_loss/acc | batch_loss/acc |  total time | avg iter time | i j |\n'
    )
    log.write(
        '----------------------------------------------------------------------------------------------------------------\n'
    )

    # Custom setting
    # start_iter = 75000
    # start_epoch= 2.98

    i = start_iter
    start = timer()
    end = time.time()
    while i < num_iters:
        net.train()
        optimizer.zero_grad()
        ##############################
        # for images, labels, indices in train_loader:
        #for images, labels in train_loader:#delete indices for testing
        ################################
        #print("start iteration")
        for k, data in enumerate(train_loader, 0):
            images, labels = data
            # NOTE(review): true division makes `i` a float in Python 3, so the
            # `% iter_log` / `in iter_save` checks only fire when j is a
            # multiple of iter_accum — confirm `//` was not intended.
            i = j / iter_accum + start_iter
            epoch = (i - start_iter) * batch_size * iter_accum / len(
                train_dataset) + start_epoch

            if i % iter_log == 0:
                # print('\r',end='',flush=True)
                log.write('\r%0.4f  %5.1f k   %4.2f  | %0.4f  %0.4f | %0.4f  %0.4f | %0.4f  %0.4f |  %5.0f min | %5.2f s  | %d,%d \n' % \
                    (rate, i/1000, epoch, valid_loss, valid_acc, train_loss_meter.avg, train_acc_meter.avg, batch_loss, batch_acc,(timer() - start)/60, iter_time_meter.avg, i, j))

            #if 1:
            if i in iter_save and i != start_iter:
                # torch.save(net.state_dict(),out_dir +'/checkpoint/%08d_model.pth'%(i))
                # torch.save({
                #     'optimizer': optimizer.state_dict(),
                #     'iter'     : i,
                #     'epoch'    : epoch,
                #     'state_dict': net.state_dict(),
                #     'best_valid_acc': best_valid_acc
                # }, out_dir +'/checkpoint/%08d_model.pth'%(i))
                save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                best_train_acc, out_dir,
                                '%08d_model.pth' % (i))

            if i % iter_valid == 0 and i != start_iter:
                net.eval()
                valid_loss, valid_acc = evaluate(net, valid_loader,
                                                 validation_num)
                net.train()
                # update best valida_acc and update best model
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc
                    # update best model on validation set
                    # torch.save(net.state_dict(), out_dir + '/checkpoint/best_model.pth')
                    save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                    best_train_acc, out_dir,
                                    "best_val_model.pth")
                    log.write(
                        "=> Best validation model updated: iter %d, validation acc %f\n"
                        % (i, best_valid_acc))

            # learning rate schduler -------------
            lr = LR.get_rate(epoch)
            # NOTE(review): this `break` only exits the inner loader loop; the
            # outer while re-enters and breaks again each pass — confirm a
            # `return`/flag was not intended once the schedule ends.
            if lr < 0: break
            adjust_learning_rate(optimizer, lr / iter_accum)
            rate = get_learning_rate(optimizer)[0] * iter_accum

            end = time.time()
            # one iteration update -------------
            images = Variable(images.type(
                torch.FloatTensor)).cuda() if use_cuda else Variable(
                    images.type(torch.FloatTensor))
            labels = Variable(labels).cuda() if use_cuda else Variable(labels)
            logits = net(images)
            probs = F.softmax(logits)
            loss = F.cross_entropy(logits, labels)
            # NOTE(review): `loss.data[0]` is PyTorch <= 0.3 style (use
            # `loss.item()` on modern versions).
            batch_loss = loss.data[0]
            train_loss_meter.update(batch_loss)
            ####
            # loss = FocalLoss()(logits, labels) #F.cross_entropy(logits, labels)
            # acc  = top_accuracy(probs, labels, top_k=(1,))
            ####
            # optimizer.zero_grad()
            # loss.backward()
            # optimizer.step()

            # accumulate gradients
            loss.backward()
            ## update gradients every iter_accum
            if j % iter_accum == 0:
                #torch.nn.utils.clip_grad_norm(net.parameters(), 1)
                #print("optim step")
                optimizer.step()
                optimizer.zero_grad()

            # measure elapsed time
            iter_time_meter.update(time.time() - end)

            # print statistics ------------
            batch_acc = get_accuracy(probs, labels)
            train_acc_meter.update(batch_acc)
            if i % iter_smooth == 0:
                # reset train stats every iter_smooth iters
                if train_acc_meter.avg > best_train_acc:
                    best_train_acc = train_acc_meter.avg
                    # update best model on train set
                    save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                    best_train_acc, out_dir,
                                    "best_train_model.pth")
                    log.write(
                        "=> Best train model updated: iter %d, train acc %f\n"
                        % (i, best_train_acc))
                train_loss_meter = AverageMeter()
                train_acc_meter = AverageMeter()

            print('\r%0.4f  %5.1f k   %4.2f  | %0.4f  %0.4f | %0.4f  %0.4f | %0.4f  %0.4f |  %5.0f min | %5.2f s  | %d,%d' % \
                (rate, i/1000, epoch, valid_loss, valid_acc, train_loss_meter.avg, train_acc_meter.avg, batch_loss, batch_acc,(timer() - start)/60, iter_time_meter.avg, i, j),\
                end='',flush=True)
            j = j + 1
        pass  #-- end of one data loader --
    pass  #-- end of all iterations --

    ## check : load model and re-test
    if 1:
        # torch.save(net.state_dict(),out_dir +'/checkpoint/%d_model.pth'%(i))
        # torch.save({
        #     'optimizer': optimizer.state_dict(),
        #     'iter'     : i,
        #     'epoch'    : epoch,
        # }, out_dir +'/checkpoint/%d_optimizer.pth'%(i))
        save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                        best_train_acc, out_dir, '%d_optimizer.pth' % (i))

    log.write('\n')
def main(args, logger):
    """Run 5-fold GroupKFold fine-tuning of a BERT multi-label classifier.

    For each fold: builds train/valid QUESTDataset loaders, trains for
    MAX_EPOCH epochs with Adam + cosine annealing, logs metrics, and saves
    a checkpoint after every epoch. Supports resuming from `args.checkpoint`
    (restores histories plus the fold/epoch to skip up to).

    Args:
        args: parsed CLI namespace; uses `args.checkpoint` and `args.debug`.
        logger: logger passed through to `sel_log`.
    """
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')

    # Group by question_body so duplicate questions never straddle folds.
    gkf = GroupKFold(n_splits=5).split(X=trn_df.question_body,
                                       groups=trn_df.question_body)
    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        # Resume: overwrite histories and remember where to pick up.
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # Skip folds already fully trained in a previous run.
        if fold < loaded_fold:
            continue
        fold_trn_df = trn_df.iloc[trn_idx]
        fold_val_df = trn_df.iloc[val_idx]
        if args.debug:
            # Tiny fixed subsample for a fast smoke-test run.
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        # Whitespace-token frequency over title/body/answer of the train fold.
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # NOTE(review): the frequency-based token list above is immediately
        # discarded here — looks like an experiment toggle; confirm intent.
        tokens = []

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # NOTE(review): validation also uses a RandomSampler — order should
        # not matter for the metric, but confirm `test()` is order-invariant.
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = soft_binary_cross_entropy  # loss function handed to the train step
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
        )
        # model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            model = model.to(DEVICE)
            # Skip epochs already completed when resuming mid-fold.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, val_loader)
            scheduler.step()
            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'epoch : {epoch} -- fold : {fold} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}',
                logger)
            # Move to CPU before serializing so the checkpoint is device-free.
            model = model.to('cpu')
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer)
        del model
    sel_log('now saving best checkpoints...', logger)
def training(model_name, model_type, optimizer_name, lr_scheduler_name, lr,
             batch_size, valid_batch_size, num_epoch, start_epoch,
             accumulation_steps, train_data_folder, checkpoint_folder,
             train_split, val_split, fold, load_pretrain):
    """Train a binary-segmentation model for one CV fold.

    Builds train/valid dataloaders from URESDataset, constructs the model
    (unet / deeplab / aspp family), the optimizer and LR scheduler, wraps
    everything in apex AMP (O1), then runs the epoch loop with gradient
    accumulation, periodic logging, per-`eval_step` validation, and saves
    the model whenever the validation loss improves.

    Args:
        model_name: backbone/architecture key (e.g. 'deep_se101').
        model_type: one of 'unet', 'deeplab', 'aspp'.
        optimizer_name: 'Adam', 'adamonecycle' or 'Ranger'.
        lr_scheduler_name: 'adamonecycle', 'CosineAnealing' or 'WarmRestart'.
        lr: base learning rate (mutated by the hand-rolled schedule below).
        batch_size / valid_batch_size: loader batch sizes.
        num_epoch / start_epoch: total epochs and resume offset.
        accumulation_steps: gradient-accumulation factor.
        train_data_folder / checkpoint_folder: data and output roots.
        train_split / val_split: dataset split identifiers.
        fold: fold index used in log/checkpoint names.
        load_pretrain: whether to load weights from the checkpoint path.
    """
    COMMON_STRING = '@%s: \n' % os.path.basename(__file__)
    COMMON_STRING += '\tset random seed\n'
    COMMON_STRING += '\t\tSEED = %d\n' % SEED

    torch.backends.cudnn.benchmark = False  ##uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms. -
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True

    COMMON_STRING += '\tset cuda environment\n'
    COMMON_STRING += '\t\ttorch.__version__ = %s\n' % torch.__version__
    COMMON_STRING += '\t\ttorch.version.cuda = %s\n' % torch.version.cuda
    COMMON_STRING += '\t\ttorch.backends.cudnn.version() = %s\n' % torch.backends.cudnn.version(
    )
    try:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\'] = %s\n' % os.environ[
            'CUDA_VISIBLE_DEVICES']
        NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    except Exception:
        # CUDA_VISIBLE_DEVICES unset: assume a single device.
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\'] = None\n'
        NUM_CUDA_DEVICES = 1

    COMMON_STRING += '\t\ttorch.cuda.device_count() = %d\n' % torch.cuda.device_count(
    )
    COMMON_STRING += '\n'

    # Per-model/fold log file (append mode, so resumed runs keep history).
    os.makedirs(checkpoint_folder + '/' + model_type + '/' + model_name,
                exist_ok=True)
    log = Logger()
    log.open(checkpoint_folder + '/' + model_type + '/' + model_name + '/' +
             model_name + '_fold_' + str(fold) + '_log_train.txt',
             mode='a+')
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')

    log.write('\tSEED = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % train_data_folder)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % checkpoint_folder)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=train_split,
        augment=transform_train,
        size=(1024, 1024),
    )
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size,
                                  drop_last=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    valid_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=val_split,
        augment=transform_valid,
        size=(1024, 1024),
    )
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=valid_batch_size,
                                  drop_last=False,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    log.write('train_dataset : \n%s\n' % (train_dataset))
    log.write('valid_dataset : \n%s\n' % (valid_dataset))
    log.write('\n')

    ############################################################################## define unet model with backbone
    def load(model, pretrain_file, skip=[]):
        # Copy matching keys from a pretrained state dict; print (and keep
        # the randomly-initialized value for) any key that is missing.
        pretrain_state_dict = torch.load(pretrain_file)
        state_dict = model.state_dict()
        keys = list(state_dict.keys())
        for key in keys:
            if any(s in key for s in skip):
                continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except:
                print(key)
        model.load_state_dict(state_dict)
        return model

    def get_deeplab_model(model_name="deep_se101", in_channel=3, num_classes=1, criterion=SoftDiceLoss_binary(), \
                          load_pretrain=False, checkpoint_filepath=None):
        # Factory for the deeplab-family models; returns None on unknown name.
        if model_name == 'deep_se50':
            model = DeepSRNX50V3PlusD_m1(in_channel=in_channel,
                                         num_classes=num_classes,
                                         criterion=criterion)
        elif model_name == 'deep_se101':
            model = DeepSRNX101V3PlusD_m1(in_channel=in_channel,
                                          num_classes=num_classes,
                                          criterion=criterion)
        elif model_name == 'WideResnet38':
            model = DeepWR38V3PlusD_m1(in_channel=in_channel,
                                       num_classes=num_classes,
                                       criterion=criterion)
        elif model_name == 'unet_ef3':
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            model = EfficientNet_5_unet()
        else:
            print('No model name in it')
            model = None
        if (load_pretrain):
            model = load(model, checkpoint_filepath)
        return model

    def get_unet_model(model_name="efficientnet-b3", IN_CHANNEL=3, NUM_CLASSES=1, \
                       WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=False, checkpoint_filepath=None):
        # Factory for the iMet-style unet; pretrain loading is delegated
        # to the model's own load_pretrain method.
        model = model_iMet(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if (load_pretrain):
            model.load_pretrain(checkpoint_filepath)
        return model

    def get_aspp_model(model_name="efficientnet-b3", NUM_CLASSES=1,
                       load_pretrain=False, checkpoint_filepath=None):
        # Factory for the ASPP model. NOTE(review): IN_CHANNEL/WIDTH/HEIGHT
        # here come from enclosing/module scope, not parameters — confirm
        # they are defined where this is called.
        model = Net(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if (load_pretrain):
            # map_location keeps CPU loading working regardless of save device
            state_dict = torch.load(checkpoint_filepath,
                                    map_location=lambda storage, loc: storage)
            model.load_state_dict(state_dict, strict=True)
        return model

    ############################################################################### training parameters
    checkpoint_filename = model_type + '/' + model_name + '/' + model_name + "_" + model_type + '_fold_' + str(
        fold) + "_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################################################### model and optimizer
    if model_type == 'unet':
        model = get_unet_model(model_name=model_name, IN_CHANNEL=3, NUM_CLASSES=NUM_CLASS, \
                               WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=load_pretrain,
                               checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'deeplab':
        model = get_deeplab_model(model_name=model_name, in_channel=3, num_classes=NUM_CLASS, \
                                  criterion=BCEDiceLoss(), load_pretrain=load_pretrain,
                                  checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'aspp':
        model = get_aspp_model(model_name=model_name, NUM_CLASSES=NUM_CLASS, \
                               load_pretrain=load_pretrain,
                               checkpoint_filepath=checkpoint_filepath)
    model = model.cuda()

    if optimizer_name == "Adam":
        if model_type != 'deeplab':
            # Discriminative LRs: full lr on the decoder, 5% on the encoder.
            optimizer = torch.optim.Adam([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        # Flatten the module tree into one fastai-style layer group.
        flatten_model = lambda m: sum(map(flatten_model, m.children()), []
                                      ) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]
        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        if model_type != 'deeplab':
            optimizer = Ranger([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = Ranger(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr,
                               weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer,
                                 len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        lr_scheduler_each_iter = True  # stepped per batch below
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               num_epoch,
                                                               eta_min=0,
                                                               last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    log.write('net\n %s\n' % (model_name))
    log.write('optimizer\n %s\n' % (optimizer_name))
    log.write('schduler\n %s\n' % (lr_scheduler_name))
    log.write('\n')

    # mix precision
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################################################### training
    log.write('** start training here! **\n')
    log.write(' batch_size=%d, accumulation_steps=%d\n' %
              (batch_size, accumulation_steps))
    log.write(' experiment = %s\n' % str(__file__.split('/')[-2:]))

    # Running stats: [loss, fscore_positive, fscore_negative].
    valid_loss = np.zeros(3, np.float32)
    train_loss = np.zeros(3, np.float32)
    valid_metric_optimal = np.inf  # lower validation loss == better
    eval_step = len(train_dataloader)  # or len(train_dataloader)
    log_step = 10
    eval_count = 0

    # define tensorboard writer and timer
    writer = SummaryWriter()
    start_timer = timer()

    # define criterion
    criterion = BCEDiceLoss()
    metric = FscoreMetric(activation=None)

    for epoch in range(1, num_epoch + 1):
        torch.cuda.empty_cache()

        # update lr and start from start_epoch
        # if (not lr_scheduler_each_iter):
        #     if epoch < 600:
        #         if epoch != 0:
        #             scheduler.step()
        #             scheduler = warm_restart(scheduler, T_mult=2)
        #     elif epoch > 600 and epoch < 800:
        #         optimizer.param_groups[0]['lr'] = 1e-5
        #     else:
        #         optimizer.param_groups[0]['lr'] = 5e-6

        # Hand-rolled cosine warm-up multiplier applied to the base lr.
        # NOTE(review): `lr` itself is reassigned below at epoch thresholds,
        # so the effective schedule is multiplier * (piecewise-constant lr).
        affect_rate = CosineAnnealingWarmUpRestarts(
            epoch,
            T_0=num_epoch,
            T_warmup=15,
            gamma=0.8,
        )
        optimizer.param_groups[0]['lr'] = affect_rate * lr
        if epoch < 100:
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        elif epoch < 150:
            lr = 4e-4
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        else:
            lr = 1e-4
        # optimizer.param_groups[0]['lr'] = rate * lr
        # optimizer.param_groups[1]['lr'] = rate * lr * 0.01

        # Resume support: skip epochs before start_epoch (lr state above
        # is still advanced so the schedule stays aligned).
        if (epoch < start_epoch):
            continue

        log.write("Epoch%s\n" % epoch)
        log.write('\n')

        # `rate` ends up as the last param group's lr, used only for logging.
        for param_group in optimizer.param_groups:
            rate = param_group['lr']

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        seed_everything(SEED + epoch)
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        for tr_batch_i, (X, truth_mask) in enumerate(train_dataloader):

            if (lr_scheduler_each_iter):
                scheduler.step(tr_batch_i)

            model.train()
            X = X.cuda().float()
            truth_mask = truth_mask.cuda()
            prediction = model(X)  # [N, C, H, W]

            # loss = criterion_mask(prediction, truth_mask, weight=None)
            loss = criterion(prediction, truth_mask)

            # AMP loss scaling; divide by accumulation_steps so the summed
            # gradients match a single large-batch step.
            with amp.scale_loss(loss / accumulation_steps,
                                optimizer) as scaled_loss:
                scaled_loss.backward()
            #loss.backward()

            if ((tr_batch_i + 1) % accumulation_steps == 0):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0,
                                               norm_type=2)
                optimizer.step()
                optimizer.zero_grad()

                writer.add_scalar(
                    'train_loss_' + str(fold), loss.item(),
                    (epoch - 1) * len(train_dataloader) * batch_size +
                    tr_batch_i * batch_size)

            # print statistics  --------
            # probability_mask  = prediction
            probability_mask = torch.sigmoid(prediction)
            # Positive mask: 1 where truth > 0.5, original value elsewhere.
            mask_positive = torch.where(truth_mask > 0.5,
                                        torch.ones_like(truth_mask),
                                        truth_mask)
            mask_negative = 1 - mask_positive
            fscore_positive = metric(probability_mask, mask_positive)
            fscore_negative = metric(1 - probability_mask, mask_negative)

            # probability_mask = torch.sigmoid(prediction)
            # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
            # mask_negative = 1 - mask_positive
            # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
            # mask_pred_negative = 1 - mask_pred_positive
            # fscore_positive = f1_score(mask_positive, mask_pred_positive)
            # fscore_negative = f1_score(mask_negative, mask_pred_negative)

            l = np.array(
                [loss.item() * batch_size, fscore_positive, fscore_negative])
            n = np.array([batch_size])
            sum_train_loss = sum_train_loss + l
            sum_train = sum_train + n

            # log for training
            if (tr_batch_i + 1) % log_step == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0
                log.write('lr: %f train loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                    (rate, train_loss[0], train_loss[1], train_loss[2]))

            # Validation pass once per eval_step batches (== once per epoch
            # with the default eval_step above).
            if (tr_batch_i + 1) % eval_step == 0:

                eval_count += 1
                valid_loss = np.zeros(3, np.float32)
                valid_num = np.zeros_like(valid_loss)
                valid_metric = []

                with torch.no_grad():
                    torch.cuda.empty_cache()

                    for val_batch_i, (
                            X, truth_mask) in enumerate(valid_dataloader):

                        model.eval()
                        X = X.cuda().float()
                        truth_mask = truth_mask.cuda()
                        prediction = model(X)  # [N, C, H, W]

                        # loss = criterion_mask(prediction, truth_mask, weight=None)
                        loss = criterion(prediction, truth_mask)

                        writer.add_scalar(
                            'val_loss_' + str(fold), loss.item(),
                            (eval_count - 1) * len(valid_dataloader) *
                            valid_batch_size + val_batch_i * valid_batch_size)

                        # print statistics  --------
                        # probability_mask  = prediction
                        probability_mask = torch.sigmoid(prediction)
                        mask_positive = torch.where(
                            truth_mask > 0.5, torch.ones_like(truth_mask),
                            truth_mask)
                        mask_negative = 1 - mask_positive
                        fscore_positive = metric(probability_mask,
                                                 mask_positive)
                        fscore_negative = metric(1 - probability_mask,
                                                 mask_negative)

                        # if (epoch == 1) and (val_batch_i == 0):
                        #     predict = probability_mask[0, :, :].detach().squeeze().cpu().numpy()
                        #     predict = predict > 0.5  # Threshould
                        #     predict = (1 - predict)*255
                        #     cv2.imwrite('result/0_0.tiff', predict.astype(np.uint8))

                        # probability_mask = torch.sigmoid(prediction)
                        # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
                        # mask_negative = 1 - mask_positive
                        # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
                        # mask_pred_negative = 1 - mask_pred_positive
                        # fscore_positive = f1_score(mask_positive, mask_pred_positive)
                        # fscore_negative = f1_score(mask_negative, mask_pred_negative)
                        #---
                        l = np.array([
                            loss.item() * valid_batch_size, fscore_positive,
                            fscore_negative
                        ])
                        n = np.array([valid_batch_size])
                        valid_loss = valid_loss + l
                        valid_num = valid_num + n

                    valid_loss = valid_loss / valid_num
                    log.write('validation loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                        (valid_loss[0], \
                         valid_loss[1], \
                         valid_loss[2]))

        # Checkpoint on improved (lower-or-equal) validation loss.
        val_metric_epoch = valid_loss[0]
        if (val_metric_epoch <= valid_metric_optimal):
            log.write('Validation metric improved ({:.6f} --> {:.6f}). Saving model ...'.format(\
                valid_metric_optimal, val_metric_epoch))
            valid_metric_optimal = val_metric_epoch
            torch.save(model.state_dict(), checkpoint_filepath)
def main(args, logger):
    """Domain-adaptive masked-LM pretraining of BERT on QUEST text.

    Concatenates train + test data (test rows get label -1 via fillna),
    builds a single training fold (only fold 0 runs), and trains
    BertForMaskedLM for MAX_EPOCH epochs, checkpointing each epoch.
    There is no validation pass: val histories are filled with trn_loss
    as placeholders so downstream checkpoint tooling keeps working.

    Args:
        args: parsed CLI namespace; uses `args.checkpoint` and `args.debug`.
        logger: logger passed through to `sel_log`.
    """
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    # Test rows have no labels; fillna(-1) marks their label columns.
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )
    # Histories keyed by fold index (dict-of-lists, unlike the flat-list
    # variant used by the fine-tuning script).
    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # MLM pretraining only needs one pass: run fold 0 then stop.
        if fold > 0:
            break
        if fold < loaded_fold:
            # Fold finished in a previous run: recover its best metric only.
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        # Whitespace-token frequency over title/body/answer of the train fold.
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        # NOTE(review): the frequency-based list is replaced by fixed
        # category special tokens — looks deliberate; confirm.
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        # NOTE(review): this overrides the fold split and trains the masked
        # LM on ALL rows (train + test). Fine for unsupervised pretraining,
        # but confirm no downstream leakage assumption is violated.
        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[SEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)

        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader,
                                          DEVICE)
            scheduler.step()
            # Append-or-create per-fold history lists. Placeholder: the
            # val_* series are filled with trn_loss (no validation here).
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [
                    trn_loss,
                ]
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            # Move to CPU before serializing so the checkpoint is device-free.
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
            )
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model
    send_line_notification('fini!')

    sel_log('now saving best checkpoints...', logger)
def mnist(argv=None):
    """Train a classifier on MNIST and print its test-set evaluation.

    Args:
        argv: optional argument list forwarded to ``parse_arguments``
            (``None`` means use ``sys.argv``).

    Side effects: downloads MNIST into ``../data`` if absent, trains for
    ``args.epochs`` epochs, and prints accuracy/loss of the final model.
    """
    args = parse_arguments(argv)

    use_cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    # Worker/pinning options only help when batches are copied to a GPU.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Train
    # -----
    train_source = datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]),
    )
    # NOTE: train_sampler is not given to the DataLoader (shuffle=True is
    # mutually exclusive with a sampler); it is only handed to the trainer,
    # which presumably uses it for epoch-dependent reshuffling — TODO confirm.
    train_sampler = RandomSampler(train_source)
    train_loader = torch.utils.data.DataLoader(
        train_source,
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs
    )

    # Test
    # ----
    test_source = datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
    )
    # FIX: the test loader previously passed sampler=RandomSampler(test_source)
    # alongside shuffle=False, needlessly randomizing evaluation order (and
    # raising on newer torch versions that reject sampler+shuffle combos).
    # Evaluation is order-independent, so iterate sequentially.
    test_loader = torch.utils.data.DataLoader(
        test_source,
        batch_size=args.test_batch_size,
        shuffle=False,
        **kwargs
    )

    model = Net().to(device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum
    )

    trainer = TrainClassifier(
        optimizer, nn.NLLLoss(reduction='sum'), model, train_sampler, device)
    trainer.fit(args.epochs, train_loader)

    result = trainer.eval_model(test_loader)
    print(f'Eval (acc: {result.acc * 100}) (loss: {result.loss})')
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, mel_idx):
    """Train the melanoma classifier for one CV fold and save checkpoints.

    Splits `df` by the `fold` column, trains for `args.n_epochs` epochs with
    Adam + warmup-then-cosine scheduling, and keeps three checkpoints: best
    overall AUC, best auc_20, and the final-epoch weights.

    Args:
        fold: validation-fold index (rows with df['fold'] == fold).
        df: dataframe with 'fold' and 'is_ext' columns plus image metadata.
        meta_features / n_meta_features: tabular feature names and count.
        transforms_train / transforms_val: image augmentation pipelines.
        mel_idx: index of the melanoma class, forwarded to val_epoch.

    Relies on module globals: args, device, DP, ModelClass, apex/amp.
    """
    if args.DEBUG:
        # Quick smoke test: 5 epochs on 5 batches of data.
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    dataset_train = MelanomaDataset(df_train, 'train', meta_features,
                                    transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid, 'valid', meta_features,
                                    transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)  # random sampling without replacement
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    model = ModelClass(
        args.enet_type,
        n_meta_features=n_meta_features,
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        # Convert BatchNorm to sync-BN before wrapping in DataParallel.
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        # amp.initialize must run before DataParallel wrapping.
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)
    # scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    # multiplier=10: lr ramps from init_lr to 10*init_lr over the first epoch,
    # then hands off to the cosine schedule.
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}', f'Fold {fold}')
        # scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime(
        ) + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            # Known GradualWarmupScheduler quirk: an extra step is needed
            # right after warmup finishes.
            scheduler_warmup.step()  # bug workaround

        # Track and save the best-AUC and best-auc_20 checkpoints separately.
        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20

    # Always save final-epoch weights regardless of metric.
    torch.save(model.state_dict(), model_file3)
def main():
    """Entry point for training and/or evaluating a DistilBERT QA model.

    Driven by CLI flags: `--do-train` fine-tunes and logs a model artifact
    to wandb; `--do-eval` evaluates a checkpoint and writes a CSV submission
    file. Both paths may run in one invocation (eval reuses the train run's
    `args.save_dir`, which training reassigns via `util.get_save_dir`).
    """
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)

    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)

        if args.do_train:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            # NOTE: args.save_dir is reassigned to a run-specific subdir;
            # the do_eval branch below depends on this new value.
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
            log = util.get_logger(args.save_dir, 'log_train')
            log.info(
                f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
            log.info("Preparing Training Data...")

            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')

            trainer = Trainer(args, log)
            train_dataset, _ = get_dataset(args, args.train_datasets,
                                           args.train_dir, tokenizer, 'train')
            log.info("Preparing Validation Data...")
            val_dataset, val_dict = get_dataset(args, args.val_datasets,
                                                args.val_dir, tokenizer, 'val')
            # Shuffle training batches; keep validation order fixed so
            # predictions align with val_dict.
            train_loader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      sampler=RandomSampler(train_dataset))
            val_loader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    sampler=SequentialSampler(val_dataset))
            best_scores = trainer.train(model, train_loader, val_loader,
                                        val_dict)
            # Upload the saved checkpoint directory as a wandb artifact.
            model_artifact = wandb.Artifact(
                args.run_name,
                type="model",
            )
            model_artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
            run.log_artifact(model_artifact)

        if args.do_eval:
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util.get_logger(args.save_dir, f'log_{split_name}')
            trainer = Trainer(args, log)
            # Prefer an explicit checkpoint path; otherwise fall back to the
            # checkpoint written under args.save_dir (e.g. by do_train above).
            if args.checkpoint_path != "":
                model = DistilBertForQuestionAnswering.from_pretrained(
                    args.checkpoint_path)
            else:
                checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
                model = DistilBertForQuestionAnswering.from_pretrained(
                    checkpoint_path)
            model.to(args.device)
            eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                                  args.eval_dir, tokenizer,
                                                  split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))
            eval_preds, eval_scores = trainer.evaluate(model,
                                                       eval_loader,
                                                       eval_dict,
                                                       return_preds=True,
                                                       split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}'
                                    for k, v in eval_scores.items())
            log.info(f'Eval {results_str}')

            # Write submission file
            sub_path = os.path.join(args.save_dir,
                                    split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                # Sorted by example id for a deterministic submission file.
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])
def main():
    """Train an image-level HPA protein-localization classifier for one CV fold.

    Builds model/criterion/scheduler from CLI args, optionally resumes from a
    checkpoint, assembles train/valid loaders (optionally adding public HPA
    data, class balancing, and copy-paste augmentation for the rare Mitotic
    spindle / Aggresome classes), then runs the epoch loop tracking best mAP.
    """
    args = parser.parse_args()
    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')
    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds (re-seeded per epoch inside the training loop below)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        init_pretrained = torch.load(args.load_state_dict_path)
        model.load_state_dict(init_pretrained['state_dict'])

    # move network to gpu
    # model = DataParallel(model)
    if args.clip_and_replace_grad_explosures:
        # Gradient hook: replace NaN/Inf entries with 0, then clamp to
        # [-0.5, 0.5] to keep rare gradient explosions from killing the run.
        def clip_and_replace_explosures(grad):
            grad[torch.logical_or(
                torch.isnan(grad),
                torch.isinf(grad))] = torch.tensor(0.0).cuda()
            grad = torch.clamp(grad, -0.5, 0.5)
            return grad

        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(clip_and_replace_explosures)
    model.cuda()

    # define loss function (criterion); args.loss names a class in scope.
    # NOTE: eval() on a CLI value is acceptable for a research script but
    # must never be exposed to untrusted input.
    try:
        criterion = eval(args.loss)().cuda()
    except Exception as e:
        raise RuntimeError("Loss {} not available!".format(args.loss)) from e

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_map = 0

    # define scheduler (same eval()-based lookup as the loss above)
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except Exception as e:
        raise RuntimeError(
            "Scheduler {} not available!".format(args.scheduler)) from e
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        # args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_map = checkpoint['best_score']
            # FIX: the model is not wrapped in DataParallel here (the wrap
            # above is commented out), so the previous
            # `model.module.load_state_dict(...)` raised AttributeError.
            # Unwrap only when a wrapper is actually present.
            (model.module if hasattr(model, 'module') else
             model).load_state_dict(checkpoint['state_dict'])
            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]
    train_df = get_train_df_ohe(clean_from_duplicates=args.clean_duplicates,
                                clean_mitotic=args.clean_mitotic_samples,
                                clean_aggresome=args.clean_aggresome)
    if args.ignore_negs:
        train_df['Negative'] = 0
    train_paths_set = set(train_df['img_base_path'])
    # image base path -> one-hot label vector (label columns start at 2)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }
    if not args.without_public_data:
        public_hpa_df_17 = get_public_df_ohe(
            clean_from_duplicates=args.clean_duplicates,
            clean_mitotic=args.clean_mitotic_samples,
            clean_aggresome=args.clean_aggresome)
        if args.ignore_negs:
            public_hpa_df_17['Negative'] = 0
        public_basepath_2_ohe_vector = {
            img_path: vec
            for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                     public_hpa_df_17.iloc[:, 2:].values)
        }
        basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    else:
        trn_img_paths = [
            path for path in trn_img_paths if path in train_paths_set
        ]
    # keep only the fold paths that survived the cleaning above
    if not args.without_public_data:
        available_paths = set(
            np.concatenate((train_df['img_base_path'].values,
                            public_hpa_df_17['img_base_path'].values)))
    else:
        available_paths = set(train_df['img_base_path'].values)
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]

    if args.copy_paste_augment_mitotic_aggresome:
        # Copy-paste augmentation pools for the two rarest classes, restricted
        # to images in this fold's training split.
        train_ids = {os.path.basename(x) for x in trn_img_paths}
        id_2_ohe_vector = {
            os.path.basename(path): ohe
            for path, ohe in basepath_2_ohe_vector.items()
        }
        cherrypicked_mitotic_spindle = pd.read_csv(
            '../input/mitotic_cells_selection.csv')
        cherrypicked_mitotic_spindle = cherrypicked_mitotic_spindle[
            cherrypicked_mitotic_spindle['ID'].isin(train_ids)]
        cherrypicked_aggresome = pd.read_csv(
            '../input/aggressome_cells_selection.csv')
        cherrypicked_aggresome = cherrypicked_aggresome[
            cherrypicked_aggresome['ID'].isin(train_ids)]
        cherrypicked_mitotic_spindle['ohe'] = cherrypicked_mitotic_spindle[
            'ID'].map(id_2_ohe_vector)
        cherrypicked_aggresome['ohe'] = cherrypicked_aggresome['ID'].map(
            id_2_ohe_vector)
        mitotic_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Mitotic spindle'
        ][0]
        aggresome_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Aggresome'
        ][0]
        mitotic_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        mitotic_ohe[mitotic_idx] = 1
        aggresome_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        aggresome_ohe[aggresome_idx] = 1
        # "pure" crops carry only their own class label
        cherrypicked_mitotic_spindle.loc[
            cherrypicked_mitotic_spindle['is_pure'] == 1, 'ohe'] = pd.Series(
                [
                    mitotic_ohe for _ in range(
                        sum(cherrypicked_mitotic_spindle['is_pure'] == 1))
                ],
                index=cherrypicked_mitotic_spindle.index[
                    cherrypicked_mitotic_spindle['is_pure'] == 1])
        # FIX: pure aggresome crops were previously labeled with mitotic_ohe;
        # aggresome_ohe was constructed above but never used.
        cherrypicked_aggresome.loc[
            cherrypicked_aggresome['is_pure'] == 1, 'ohe'] = pd.Series(
                [
                    aggresome_ohe for _ in range(
                        sum(cherrypicked_aggresome['is_pure'] == 1))
                ],
                index=cherrypicked_aggresome.index[
                    cherrypicked_aggresome['is_pure'] == 1])
        # pure crops are sampled 4x more often than mixed ones
        class_purity_2_weight = {1: 4, 0: 1}
        cherrypicked_mitotic_spindle[
            'sampling_weight'] = cherrypicked_mitotic_spindle['is_pure'].map(
                class_purity_2_weight)
        cherrypicked_aggresome['sampling_weight'] = cherrypicked_aggresome[
            'is_pure'].map(class_purity_2_weight)
    else:
        cherrypicked_mitotic_spindle = None
        cherrypicked_aggresome = None

    train_dataset = ProteinDatasetImageLevel(
        trn_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform,
        cherrypicked_mitotic_spindle_df=cherrypicked_mitotic_spindle,
        cherrypicked_aggresome_df=cherrypicked_aggresome)
    class_names = get_class_names()
    if args.balance_classes:
        sampler = BalancingSubSampler(trn_img_paths,
                                      basepath_2_ohe_vector,
                                      class_names,
                                      required_class_count=1500)
    else:
        sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        drop_last=True,
        num_workers=args.workers,
        pin_memory=True,
    )
    valid_dataset = ProteinDatasetImageLevel(
        val_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)
    focal_loss = FocalLoss().cuda()

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | train_loss/acc | valid_loss/acc/focal/map |best_epoch/best_map| min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1
    if args.eval_at_start:
        # one validation pass before any training, logged as epoch -1
        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, -1, focal_loss, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.4f | %6.1f %6.4f | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, valid_focal_loss, valid_map, best_epoch, best_map, -1))
    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()
        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)
        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]
        # train for one epoch on train set (renamed from `iter`, which
        # shadowed the builtin)
        batch_iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        if np.isnan(train_loss):
            print('@@@@@NAN!')
        else:
            print('norm')
        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, epoch, focal_loss, log)
        # remember best mAP / lowest focal loss and save checkpoint
        is_best = valid_map > best_map
        best_loss = min(valid_focal_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_map = valid_map if is_best else best_map
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.4f | %6.1f %6.4f | %3.1f min \n' % \
            (epoch, batch_iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, valid_focal_loss, valid_map, best_epoch, best_map, (time.time() - end) / 60))
        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_map)
dtype = torch.cuda.FloatTensor #csv_path='final_label.csv' #data=pd.read_csv(csv_path,sep='\t') #train,test=train_test_split(data,test_size=0.3,shuffle=True,random_state=1235) #train.to_csv('train.csv',index=False) #test.to_csv('test.csv',index=False) composed_transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((.5, .5, .5), (.5, .5, .5))]) train_ds = CDATA('train.csv', 'data/', transform=composed_transform) train_loader = data.DataLoader(train_ds, batch_size=1, sampler=RandomSampler(train_ds)) val_ds = CDATA('test.csv', 'data/', transform=composed_transform) val_loader = data.DataLoader(val_ds, batch_size=1, sampler=RandomSampler(val_ds)) retinet = RetiNet().type(dtype) optimizer = optim.Adam(retinet.parameters(), lr=.0002) loss = nn.BCELoss() print(len(train_loader)) num_epochs = 10 c = 0 fopen = open('loss.txt', 'w') for epoch in range(num_epochs): for x, y in train_loader:
def main(args, logger):
    """5-fold GroupKFold training of a BERT multi-label classifier (QUEST).

    Folds are grouped by question_body_le so duplicate questions never
    straddle train/valid. Supports resuming from args.checkpoint, records
    per-fold loss/metric histories, checkpoints every epoch, and sends
    summary notifications at the end.
    """
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # collapse host values to canonical lowercase HOST_* tokens
    for HOST in HOSTs:
        trn_df.loc[trn_df.host.str.contains(HOST).values,
                   'host'] = f'HOST_{HOST}'.casefold()
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0
    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )
    # histories: metric name -> {fold -> [per-epoch values]}
    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512
    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # folds finished in a previous run: just replay their best metrics
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        # words occurring >= 10 times across title/body/answer
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        # NOTE(review): the frequency-based `tokens` above is immediately
        # overwritten by this hard-coded category list, so the `temp`
        # computation is currently dead weight — confirm which is intended.
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
            # 'host_stackexchange',
            # 'host_askubuntu',
            # 'host_mathoverflow',
            # 'host_serverfault',
            # 'host_stackoverflow',
            # 'host_superuser',
        ]
        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # NOTE(review): validation uses a RandomSampler — aggregate metrics
        # are unaffected and qa_ids are carried through, but per-row output
        # order is shuffled; confirm this is deliberate.
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)
        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)
        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)
        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            # freeze BERT for the first epoch (head-only warmup)
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True,
                                           logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False,
                                           logger=logger)
            # wrapped per epoch; unwrapped again before checkpointing below
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)
            scheduler.step()
            # append this epoch's stats to the fold's history lists
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]
            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            # unwrap DataParallel before saving so checkpoints are portable
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer)
        del model
    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)
    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)
    sel_log('now saving best checkpoints...', logger)
def get_tpu_sampler(dataset: Dataset): if xm.xrt_world_size() <= 1: return RandomSampler(dataset) return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())
import torch from torch.utils.data import DataLoader, Dataset from model import NestedUNet from torch.utils.data.sampler import RandomSampler from dataloader import random_seed, PolypDataset # from model_without_effcientnet_encoder import NestedUNet test_images_file = "data/test_images.txt" test_labels_file = "data/test_masks.txt" input_size = (128, 128) torch.manual_seed(15) test_set = PolypDataset(test_images_file, test_labels_file, input_size) test_loader = DataLoader(test_set, batch_size=1, sampler=RandomSampler(test_set)) # Inference device device = 'cuda' if torch.cuda.is_available() else 'cpu' # Load Model model_path = "experiment_test/polyp_unet_deepsupervision1.pth" # model = NestedUNet(n_channels = 3, n_classes = 1, bilinear = False).to(device) model = NestedUNet(num_classes=1, input_channels=3, bilinear=False).to(device) model.load_state_dict(torch.load(model_path, map_location=device)) model.eval()
def main():
    """Train a cell-level HPA classifier for one CV fold.

    Like the image-level trainer, but operates on per-cell crops whose soft
    labels come from args.cell_level_labels_path, and tracks the best
    validation focal loss instead of mAP.
    """
    args = parser.parse_args()
    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')
    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)
    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False
    # set random seeds (re-seeded per epoch inside the training loop below)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)
    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)
    if args.load_state_dict_path is not None:
        # sentinel value selects a fixed image-level DenseNet checkpoint
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        model.load_state_dict(init_pretrained['state_dict'])
    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()
    # define loss function (criterion); args.loss names a class in scope.
    # NOTE(review): eval() on a CLI value — acceptable for a research
    # script, but never expose to untrusted input.
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))
    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_focal = float('inf')
    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]
    # optionally resume from a checkpoint
    if args.resume:
        args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            # the checkpoint key is 'best_map' for historical reasons; it
            # holds the best focal loss here (see save_model call below)
            best_focal = checkpoint['best_map']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))
    # Data loading code
    train_transform = train_multi_augment2
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]
    train_df = get_train_df_ohe(clean_from_duplicates=True)
    # image base path -> one-hot label vector (label columns start at 2)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }
    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]
    labels_df = pd.read_hdf(args.cell_level_labels_path)
    # modifying minor class labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')
    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID',
                                      'cell_i']].apply(tuple, axis=1).values)
    # the selection CSV's cell indices are 1-based; labels_df is 0-based
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }
    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')
    if args.include_nn_mitotic:
        # add NN-proposed positive mitotic cells
        cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_pos_nn_added.csv')
        cherrypicked_mitotic_spindle_img_cell.update(
            set(cherrypicked_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values))
    print('len cherrypicked_mitotic_spindle_img_cell',
          len(cherrypicked_mitotic_spindle_img_cell))
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    # in-place edit of one entry of a per-cell label vector
    def modify_label(labels, idx, val):
        labels[idx] = val
        return labels

    # force the mitotic-spindle label to 1 for the cherrypicked cells
    labels_df.loc[mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
        mitotic_bool_idx, 'image_level_pred'].map(
            lambda x: modify_label(x, mitotic_spindle_class_i, 1))
    if args.include_nn_mitotic:
        # and force it to 0 for the NN-proposed negatives
        cherrypicked_not_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_neg_nn_added.csv')
        cherrypicked_not_mitotic_spindle_based_on_nn = set(
            cherrypicked_not_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values)
        not_mitotic_bool_idx = labels_df.index.isin(
            cherrypicked_not_mitotic_spindle_based_on_nn)
        labels_df.loc[not_mitotic_bool_idx,
                      'image_level_pred'] = labels_df.loc[
                          not_mitotic_bool_idx,
                          'image_level_pred'].map(lambda x: modify_label(
                              x, mitotic_spindle_class_i, 0))
    if args.ignore_negative:
        raise NotImplementedError
    if args.upsample_minorities:
        # oversample mitotic-spindle cells plus high-confidence aggresome cells
        cells_to_upsample = list(cherrypicked_mitotic_spindle_img_cell)
        aggresome_class_i = class_names.index('Aggresome')
        confident_aggresome_indices = list(
            labels_df.index[labels_df['image_level_pred'].map(
                lambda x: x[aggresome_class_i] > 0.9)])
        print('confident_aggresome_indices len',
              len(confident_aggresome_indices))
        print('confident_aggresome_indices[:5]',
              confident_aggresome_indices[:5])
        cells_to_upsample += confident_aggresome_indices
    else:
        cells_to_upsample = None
    train_dataset = ProteinDatasetCellSeparateLoading(
        trn_img_paths,
        labels_df=labels_df,
        cells_to_upsample=cells_to_upsample,
        img_size=args.img_size,
        in_channels=args.in_channels,
        transform=train_transform,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # valid_dataset = ProteinDatasetCellLevel(val_img_paths,
    #                                         labels_df=labels_df,
    #                                         img_size=args.img_size,
    #                                         batch_size=64,
    #                                         is_trainset=True,
    #                                         in_channels=args.in_channels)
    valid_dataset = ProteinDatasetCellSeparateLoading(
        val_img_paths,
        labels_df=labels_df,
        img_size=args.img_size,
        in_channels=args.in_channels,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)
    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | train_loss/acc | valid_loss/acc/map/focal |best_epoch/best_focal| min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1
    if args.eval_at_start:
        # one validation pass before any training, logged as epoch -1
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_map_score, val_focal, best_epoch, best_focal, -1))
    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()
        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)
        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]
        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, epoch, log)
        # remember best (lowest) focal loss and save checkpoint
        is_best = val_focal < best_focal
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_focal = val_focal if is_best else best_focal
        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, val_map_score, val_focal, best_epoch, best_focal, (time.time() - end) / 60))
        # NOTE: 'best_map' kwarg carries the best focal loss here (see the
        # matching read from checkpoint['best_map'] in the resume path)
        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_focal)
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, target_idx, df_test):
    '''Main training routine for one cross-validation fold.

    :param fold: fold number held out for validation
    :param df: DataFrame listing the full training data (has a 'fold' column)
    :param meta_features, n_meta_features: non-image metadata configuration
    :param transforms_train, transforms_val: dataset transform callables
    :param target_idx: target-class index forwarded to val_epoch
    :param df_test: unused here; kept for interface compatibility
    '''
    if args.DEBUG:
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    # https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274
    # batch normalization can fail on a trailing batch of size 1, so drop one
    # sample when the split length is 1 mod batch_size.
    if len(df_train) % args.batch_size == 1:
        df_train = df_train.sample(len(df_train) - 1)
    if len(df_valid) % args.batch_size == 1:
        df_valid = df_valid.sample(len(df_valid) - 1)

    # build datasets and loaders
    dataset_train = MMC_ClassificationDataset(df_train,
                                              'train',
                                              meta_features,
                                              transform=transforms_train)
    dataset_valid = MMC_ClassificationDataset(df_valid,
                                              'valid',
                                              meta_features,
                                              transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    auc_max = 0.
    auc_no_ext_max = 0.
    # checkpoint targets: best AUC, best AUC excluding external data, final
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(
        args.model_dir, f'{args.kernel_type}_best_no_ext_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')

    # Build the model once; resume weights from the final checkpoint if it
    # exists (the two branches previously duplicated the constructor call).
    model = ModelClass(
        args.enet_type,
        n_meta_features=n_meta_features,
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if os.path.isfile(model_file3):
        model.load_state_dict(torch.load(model_file3))

    # parallelize across multiple GPUs
    # if DP:
    #     model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    # if args.use_amp:
    #     model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        # buggy when combined with amp (use_amp left disabled)
        model = nn.DataParallel(model)

    # 1-epoch warmup (x10 multiplier) feeding into cosine annealing over the
    # remaining epochs
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')
        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_no_ext = val_epoch(model,
                                                   valid_loader,
                                                   target_idx,
                                                   is_ext=0)
        if args.use_ext:
            content = time.ctime(
            ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, Acc: {(acc):.4f}, AUC: {(auc):.6f}, AUC_no_ext: {(auc_no_ext):.6f}.'
        else:
            content = time.ctime(
            ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, Acc: {(acc):.4f}, AUC: {(auc):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            # FIX: previously saved to `model_file_astraining`, whose
            # definition was commented out above, so reaching a new best AUC
            # raised NameError. Save the best-AUC weights to `model_file`,
            # which was built for this purpose but never used.
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        # when external data is used, also track the best model scored
        # without it and save those weights separately
        if args.use_ext:
            if auc_no_ext > auc_no_ext_max:
                print('auc_no_ext_max ({:.6f} --> {:.6f}). Saving model ...'.
                      format(auc_no_ext_max, auc_no_ext))
                torch.save(model.state_dict(), model_file2)
                auc_no_ext_max = auc_no_ext

    torch.save(model.state_dict(), model_file3)
def run(fold):
    """Train a 3D DenseNet-121 on one cross-validation fold.

    Splits the module-level ``df_study`` frame into train/valid by its
    ``'fold'`` column, trains for ``n_epochs`` (all hyperparameters are
    module-level globals), appends one log line per epoch to
    ``log_{kernel_type}.txt`` and saves:
      * ``{kernel_type}_best_fold{fold}.pth``  -- lowest validation loss
      * ``{kernel_type}_model_fold{fold}.pth`` -- final-epoch weights
    """
    # Rows whose 'fold' column equals this fold are held out for validation.
    df_train = df_study[(df_study['fold'] != fold)]
    df_valid = df_study[(df_study['fold'] == fold)]
    dataset_train = RSNADataset3D(df_train, 'train', transform=train_transforms)
    dataset_valid = RSNADataset3D(df_valid, 'val', transform=val_transforms)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=4,
        sampler=RandomSampler(dataset_train),
        num_workers=num_workers)
    # Validation keeps the default sequential order (no sampler, no shuffle).
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=4,
                                               num_workers=num_workers)
    model = monai.networks.nets.densenet.densenet121(
        spatial_dims=3, in_channels=3, out_channels=out_dim).to(device)
    val_loss_best = 1000  # sentinel; any real validation loss is lower
    model_file = f'{kernel_type}_best_fold{fold}.pth'
    optimizer = optim.Adam(model.parameters(), lr=init_lr)
    if use_amp:
        # NVIDIA apex mixed precision (legacy; superseded by torch.cuda.amp).
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    # if len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) > 1:
    #     model = nn.DataParallel(model)
    # Cosine annealing preceded by a 10x linear warmup over `warmup_epo` epochs.
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, cosine_epo)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=warmup_epo,
        after_scheduler=scheduler_cosine)
    print(len(dataset_train), len(dataset_valid))
    for epoch in range(1, n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)
        # NOTE(review): passing an explicit epoch to scheduler.step() is
        # deprecated in recent PyTorch; kept as-is to preserve the LR curve.
        scheduler_warmup.step(epoch - 1)
        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc = val_epoch(model, valid_loader)
        content = time.ctime() + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}'
        print(content)
        with open(f'log_{kernel_type}.txt', 'a') as appender:
            appender.write(content + '\n')
        # Checkpoint whenever validation loss improves.
        if val_loss < val_loss_best:
            print(
                'val_loss_best ({:.6f} --> {:.6f}). Saving model ...'.format(
                    val_loss_best, val_loss))
            torch.save(model.state_dict(), model_file)
            val_loss_best = val_loss
    # Always save the last-epoch weights as well.
    torch.save(model.state_dict(), f'{kernel_type}_model_fold{fold}.pth')
weight_decay=weight_decay) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, scheduler_step, min_lr) # Load data train_id = fold_train[idx] val_id = fold_valid[idx] X_train, y_train = trainImageFetch(train_id) X_val, y_val = trainImageFetch(val_id) train_data = DataSource1(X_train, mode='train', mask_list=y_train, fine_size=fine_size, pad_left=pad_left, pad_right=pad_right) train_loader = DataLoader( train_data, shuffle=RandomSampler(train_data), batch_size=batch_size, num_workers=8, pin_memory=True) val_data = DataSource1(X_val, mode='val', mask_list=y_val, fine_size=fine_size, pad_left=pad_left, pad_right=pad_left) val_loader = DataLoader( val_data, shuffle=False, batch_size=batch_size, num_workers=8, pin_memory=True) num_snapshot = 0 best_acc = 0
def get_dataloader(dataset, batchsize, use_hidden=True):
    """Build a randomly-sampled DataLoader for *dataset*.

    The collate function is chosen via ``get_collate_fn`` according to
    whether hidden-state fields should be batched (``use_hidden``).
    """
    collate = get_collate_fn(use_hidden)
    return data.DataLoader(
        dataset,
        batch_size=batchsize,
        sampler=RandomSampler(dataset),
        collate_fn=collate,
    )
def main():
    """Fine-tune BERT for sequence classification on the YELP task.

    Parses CLI arguments, seeds RNGs, builds the model and AdamW optimizer,
    then (when ``--do_train``) runs the training loop and, after every epoch,
    evaluates on the dev set — writing TensorBoard scalars, per-epoch
    prediction/metric files, and a model checkpoint to ``--output_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="Path to the configuration file for the BERT model.")

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--layers',
        type=int,
        nargs='+',
        default=[-2],
        help="choose the layers that used for downstream tasks, "
        "-2 means use pooled output, -1 means all layer,"
        "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    args = parser.parse_args()

    processors = {"yelp": YELPProcessor}

    # Device selection: single process (possibly multi-GPU via DataParallel)
    # vs. one-GPU-per-process distributed training keyed on --local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    # The per-step micro-batch shrinks when gradients are accumulated.
    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Refuse to clobber a previous run's outputs.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)
    summary_writer = SummaryWriter(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir,
                                                      data_num=args.num_datas)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    model = BertForSequenceClassification(bert_config,
                                          len(label_list),
                                          args.layers,
                                          pooling=args.pooling_type)
    if args.init_checkpoint is not None:
        # Only the encoder weights are restored; the classifier head
        # keeps its fresh initialization.
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Standard BERT recipe: no weight decay on biases and LayerNorm
    # ('gamma'/'beta') parameters.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]

    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    global_step = 0
    global_train_step = 0

    # The dev set is featurized once up front and reused after every epoch.
    eval_examples = processor.get_dev_examples(args.data_dir,
                                               data_num=args.num_test_datas)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    all_input_ids = eval_features['input_ids']
    all_input_mask = eval_features['attention_mask']
    all_segment_ids = eval_features['token_type_ids']
    all_label_ids = eval_features['labels']
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False)

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = train_features['input_ids']
        all_input_mask = train_features['attention_mask']
        all_segment_ids = train_features['token_type_ids']
        all_label_ids = train_features['labels']
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            # Each rank sees a disjoint shard of the training data.
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        print("TOTAL STEPS: ",
              (len(train_dataloader) * int(args.num_train_epochs)))
        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, label_ids = batch
                loss, _ = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=token_type_ids,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Apply the accumulated gradient every
                # `gradient_accumulation_steps` micro-batches.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enought gradients
                    # scheduler.step()
                    summary_writer.add_scalar('Loss/train', loss.item(),
                                              global_step)
                    # possibly comment this out
                    # NOTE(review): clipping here, AFTER optimizer.step(), has
                    # no effect on the update just applied; clipping normally
                    # belongs between backward() and step(). Flagged, not
                    # changed, to preserve existing behavior.
                    max_grad_norm = 1.0
                    _clip_grad_norm(optimizer_parameters, max_grad_norm)
                    model.zero_grad()
                    global_step += 1

            # ---- end-of-epoch evaluation on the dev set ----
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            pos_eval_prec, pos_eval_recall, pos_eval_f1 = 0, 0, 0
            neg_eval_prec, neg_eval_recall, neg_eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "results_ep" + str(epoch) + ".txt"),
                    "w") as f:
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluate"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        tmp_eval_loss, logits = model(
                            input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.detach().to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    # One predicted class index per line in the results file.
                    for output in outputs:
                        f.write(str(output) + "\n")
                    tmp_eval_accuracy = np.sum(outputs == label_ids)
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_neg_sent(
                        outputs, label_ids)
                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy
                    neg_eval_prec += tmp_eval_prec
                    neg_eval_recall += tmp_eval_recall
                    neg_eval_f1 += tmp_eval_f1
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_pos_sent(
                        outputs, label_ids)
                    pos_eval_prec += tmp_eval_prec
                    pos_eval_recall += tmp_eval_recall
                    pos_eval_f1 += tmp_eval_f1
                    global_train_step += 1
                    summary_writer.add_scalar("Loss/test",
                                              tmp_eval_loss.mean().item(),
                                              global_train_step)
                    summary_writer.add_scalar("Accuracy/test",
                                              tmp_eval_accuracy,
                                              global_train_step)
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            # Precision/recall/F1 are averaged per batch; accuracy per example.
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            pos_eval_prec = pos_eval_prec / nb_eval_steps
            pos_eval_recall = pos_eval_recall / nb_eval_steps
            pos_eval_f1 = pos_eval_f1 / nb_eval_steps
            neg_eval_prec = neg_eval_prec / nb_eval_steps
            neg_eval_recall = neg_eval_recall / nb_eval_steps
            neg_eval_f1 = neg_eval_f1 / nb_eval_steps
            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'pos_eval_precision': pos_eval_prec,
                'neg_eval_precision': neg_eval_prec,
                'pos_eval_recall': pos_eval_recall,
                'neg_eval_recall': neg_eval_recall,
                'pos_eval_f1': pos_eval_f1,
                'neg_eval_f1': neg_eval_f1
            }
            summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
            summary_writer.add_scalar("Epoch_loss/test", eval_loss, epoch)
            summary_writer.add_scalar("Epoch_accuracy/test", eval_accuracy,
                                      epoch)
            summary_writer.add_scalar("Epoch_positive_precision/test",
                                      pos_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_negative_precision/test",
                                      neg_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_positive_recall/test",
                                      pos_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_negative_recall/test",
                                      neg_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_positive_f1/test", pos_eval_f1,
                                      epoch)
            summary_writer.add_scalar("Epoch_negative_f1/test", neg_eval_f1,
                                      epoch)
            output_eval_file = os.path.join(
                args.output_dir, "eval_results_ep" + str(epoch) + ".txt")
            print("output_eval_file=", output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            print("Saving model")
            # NOTE(review): `model.module` assumes the model was wrapped in
            # DataParallel/DDP above; on a single unwrapped model this raises
            # AttributeError. Verify before running on 1 GPU / CPU.
            torch.save(
                model.module.state_dict(),
                os.path.join(
                    args.output_dir,
                    "yelp-finetuned-bert-model_" + str(epoch) + ".pth"))
hebtb) dev_inf_set = get_morph_dataset_partition('dev-inf', home_path, tb_vocab, hebtb) test_inf_set = get_morph_dataset_partition('test-inf', home_path, tb_vocab, hebtb) dev_uninf_set = get_morph_dataset_partition('dev-uninf', home_path, tb_vocab, hebtb) test_uninf_set = get_morph_dataset_partition('test-uninf', home_path, tb_vocab, hebtb) train_set = get_model_morpheme_dataset_partition(home_path, train_set) dev_inf_set = get_model_morpheme_dataset_partition(home_path, dev_inf_set) test_inf_set = get_model_morpheme_dataset_partition(home_path, test_inf_set) dev_uninf_set = get_model_morpheme_dataset_partition(home_path, dev_uninf_set) test_uninf_set = get_model_morpheme_dataset_partition(home_path, test_uninf_set) train_sampler = RandomSampler(train_set) dev_inf_sampler = SequentialSampler(dev_inf_set) test_inf_sampler = SequentialSampler(test_inf_set) dev_uninf_sampler = SequentialSampler(dev_uninf_set) test_uninf_sampler = SequentialSampler(test_uninf_set) train_dataloader = DataLoader(train_set, sampler=train_sampler) dev_inf_dataloader = DataLoader(dev_inf_set, sampler=dev_inf_sampler) test_inf_dataloader = DataLoader(test_inf_set, sampler=test_inf_sampler) dev_uninf_dataloader = DataLoader(dev_uninf_set, sampler=dev_uninf_sampler) test_uninf_dataloader = DataLoader(test_uninf_set, sampler=test_uninf_sampler) # Embedding ft_form_vec_file_path = Path( 'data/processed/spmrl/hebtb-morph-vocab/word-form.vec') ft_lemma_vec_file_path = Path( 'data/processed/spmrl/hebtb-morph-vocab/word-lemma.vec')
def train_model(model, dataset_train, dataset_val, lr, num_epochs, model_dir,
                exp_name, scale_lr=None):
    """Train the HOI binary ("no-interaction") classifier.

    Args:
        model: network taking a dict of CUDA feature tensors and returning a
            per-candidate binary score in [0, 1].
        dataset_train: indexable dataset yielding dicts with keys
            'human_feat', 'object_feat', 'box_feat', 'human_prob',
            'object_prob', 'verb_obj_vec', 'nis_labels', 'hoi_label'.
        dataset_val: validation dataset passed to ``eval_model``.
        lr: Adam learning rate.
        num_epochs: number of passes over ``dataset_train``.
        model_dir: directory under which ``model/hoi_classifier_{step}``
            checkpoints are written.
        exp_name: experiment name (printed alongside logged losses).
        scale_lr: optional global step at which the learning rate is
            multiplied by 0.1; ``None`` disables the decay.

    Side effects: prints progress, logs scalars via ``log_value`` and saves
    checkpoints; returns nothing.
    """
    # The original wrapped model.parameters() in itertools.chain() of a
    # single iterable, which is a no-op; pass the parameters directly.
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    step = 0
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        # Visit samples one at a time in a fresh random order each epoch
        # (effective batch size is 1).
        sampler = RandomSampler(dataset_train)
        for i, sample_id in enumerate(sampler):
            data = dataset_train[sample_id]
            feats = {
                'human_rcnn':
                Variable(torch.cuda.FloatTensor(data['human_feat'])),
                'object_rcnn':
                Variable(torch.cuda.FloatTensor(data['object_feat'])),
                'box':
                Variable(torch.cuda.FloatTensor(data['box_feat'])),
                "human_det_score":
                Variable(torch.cuda.FloatTensor(data["human_prob"])),
                "object_det_score":
                Variable(torch.cuda.FloatTensor(data["object_prob"])),
                # Keep only columns 300: of the verb/object embedding;
                # assumes the first 300 dims are the verb part -- TODO confirm.
                "object_word2vec":
                Variable(torch.cuda.FloatTensor(data['verb_obj_vec'][:, 300:])),
            }
            model.train()
            binary_score = model(feats)
            binary_label = Variable(torch.cuda.FloatTensor(data['nis_labels']))
            loss_binary = criterion(
                binary_score, binary_label.view(binary_score.size(0), 1))
            loss_binary.backward()
            # The original guarded this with `if step % 1 == 0`, which is
            # always true: an optimizer update is applied on every step.
            optimizer.step()
            optimizer.zero_grad()
            if step % 20 == 0:
                num_tp = np.sum(data['hoi_label'])
                num_fp = data['hoi_label'].shape[0] - num_tp
                log_str = \
                    'Epoch: {} | Iter: {} | Step: {} | ' + \
                    ' Train Loss binary: {:.8f}' \
                    '| TPs: {} | FPs: {} | lr:{} '
                # NOTE(review): `.data[0]` is legacy (pre-0.4) PyTorch; on
                # newer versions use `.item()`. Kept to match the codebase's
                # torch version (Variable / torch.cuda.FloatTensor above).
                log_str = log_str.format(epoch, i, step, loss_binary.data[0],
                                         num_tp, num_fp,
                                         optimizer.param_groups[0]['lr'])
                print(log_str)
            if step % 100 == 0:
                log_value('train_loss_binary', loss_binary.data[0], step)
                print(exp_name)
            if step % 1000 == 0 and step > 2000:
                val_loss_binary, recall, tp, fp = eval_model(
                    model, dataset_val)
                log_value('val_loss_binary', val_loss_binary, step)
                log_value('recall', recall, step)
                log_value('tp', tp, step)
                log_value('fp', fp, step)
                log_str = \
                    'Epoch: {} | Iter: {} | Step: {} | Val Loss binary: {:.8f}' \
                    '| recall: {:.2f} | tp: {:.2f}|fp: {:.2f}'
                log_str = log_str.format(epoch, i, step, val_loss_binary,
                                         recall, tp, fp)
                print(log_str)
            if step == 10 or (step % 1000 == 0 and step > 2000):
                hoi_classifier_pth = os.path.join(model_dir, "model",
                                                  f'hoi_classifier_{step}')
                torch.save(model.state_dict(), hoi_classifier_pth)
            step += 1
            # BUG FIX: the original called ``scale_lr(optimizer, 0.1)`` here,
            # but ``scale_lr`` is the integer step threshold compared against
            # ``step`` in the same condition (the parameter shadows any
            # module-level helper of the same name), so triggering the decay
            # raised "TypeError: 'int' object is not callable". Decay the
            # learning rate by 10x in place instead.
            if scale_lr is not None and step == scale_lr:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1