def training(session, training_op, cost, train_data, valid_data, y_upper_bound=None, seed=0, n_epochs=10, batch_size=128):
    """Run mini-batch training with per-epoch validation on a TF session.

    ``train_data`` and ``valid_data`` are both dictionaries mapping a
    TensorFlow placeholder (key) to the full numpy array to feed for it
    (value).  Training batches are drawn by random sampling with
    replacement; validation batches are consecutive, non-overlapping
    slices.  A fastprogress master bar renders a live learning curve and
    a per-epoch cost summary.
    """
    np.random.seed(seed)

    # Number of whole mini-batches per split (remainder samples dropped).
    n_samples_train = len(next(iter(train_data.values())))
    n_samples_valid = len(next(iter(valid_data.values())))
    n_batches_train = n_samples_train // batch_size
    n_batches_valid = n_samples_valid // batch_size

    mb = master_bar(range(n_epochs))

    # Per-epoch mean-cost history feeding the learning-curve plot.
    train_costs_lst = []
    valid_costs_lst = []
    x_bounds = [0, n_epochs]
    y_bounds = None

    for epoch in mb:
        # --- training pass: random batches sampled with replacement ---
        train_costs = []
        for _ in progress_bar(range(n_batches_train), parent=mb):
            batch_idx = np.random.randint(n_samples_train, size=batch_size)
            feed_dict = {ph: arr[batch_idx] for ph, arr in train_data.items()}
            _, train_cost = session.run([training_op, cost], feed_dict=feed_dict)
            train_costs.append(train_cost)

        # --- validation pass: sequential slices, cost only (no update) ---
        valid_costs = []
        for i in range(n_batches_valid):
            start = i * batch_size
            feed_dict = {ph: arr[start:start + batch_size] for ph, arr in valid_data.items()}
            valid_costs.append(session.run(cost, feed_dict=feed_dict))

        train_costs_mean = np.mean(train_costs)
        valid_costs_mean = np.mean(valid_costs)
        train_costs_lst.append(train_costs_mean)
        valid_costs_lst.append(valid_costs_mean)

        # Freeze the y-axis on the first epoch so the plot does not rescale.
        if y_bounds is None:
            top = train_costs_mean * 1.1 if y_upper_bound is None else y_upper_bound
            y_bounds = [0, top]
        t = np.arange(len(train_costs_lst))
        mb.update_graph([[t, train_costs_lst], [t, valid_costs_lst]], x_bounds, y_bounds)

        # Per-epoch textual summary under the progress bar.
        mb.write(
            'EPOCH: {0:02d}, Training cost: {1:10.5f}, Validation cost: {2:10.5f}'
            .format(epoch + 1, train_costs_mean, valid_costs_mean))
def main():
    """Run one full ALCON2019 experiment.

    For every fold listed in the experiment YAML this trains a
    SEResNeXtGRU2 model, checkpoints the weights with the best validation
    loss / 1-char accuracy / 3-char accuracy, predicts the test set with
    the best 3-char-accuracy weights, then averages the per-fold test
    logits and writes the final submission CSV and zip.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # Experiment-wide logger; a FileHandler is attached per fold below.
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                     now.hour, now.minute, now.second))

    # Load hyper-parameters for this experiment number.
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date

    # Fix RNG seeds for reproducibility.
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()
    for fold in param['fold']:
        # e.g. /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME, now_date, 'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            # FIX: include the missing path in the exception message.
            raise FileNotFoundError(param['save path'])

        file_handler = logging.FileHandler(os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)
        logger.debug('============= FOLD {} ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute, now.second))

        # Dataset — scale the batch size by the GPU count (at least x1).
        param['batch size'] = max(param['batch size'], param['batch size'] * param['GPU'])
        if param['debug']:
            # Debug mode: truncate both splits to a single batch.
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold').iloc[:param['batch size']],
                augmentation=get_train_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold').iloc[:param['batch size']],
                augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='train',
                margin_augmentation=True)
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='valid',
                margin_augmentation=False)
        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader
        train_dataloader = DataLoader(train_dataset, batch_size=param['batch size'],
                                      num_workers=param['thread'], pin_memory=False,
                                      drop_last=False)
        valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'],
                                      num_workers=param['thread'], pin_memory=False,
                                      drop_last=False)
        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # Model
        model = SEResNeXtGRU2(num_classes=48, hidden_size=512, bidirectional=True,
                              load_weight=None, dropout=param['dropout'])
        param['model'] = model.__class__.__name__

        # Optimizer
        if param['optim'].lower() == 'sgd':
            optimizer = torch.optim.SGD(model.parameters(), lr=param['lr'], momentum=0.9,
                                        weight_decay=1e-5, nesterov=False)
        elif param['optim'].lower() == 'adam':
            # FIX: this branch previously constructed torch.optim.SGD, so
            # requesting 'adam' silently trained with plain SGD.
            optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'])
        else:
            raise NotImplementedError

        # Scheduler expression is evaluated from the YAML config.
        # NOTE(review): eval() of a config string is fine for a trusted local
        # file, but unsafe if the YAML could come from untrusted sources.
        scheduler = eval(param['scheduler'])

        model = model.to(param['device'])
        if param['GPU'] > 0:
            model = nn.DataParallel(model)
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        # Best-so-far trackers used for checkpointing.
        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5

        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(EXP_NO, now_date, fold))
        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        mb = master_bar(range(param['epoch']))
        for epoch in mb:
            avg_train_loss, avg_train_accuracy, avg_three_train_acc = train_alcon_rnn(
                model, optimizer, train_dataloader, param['device'], loss_fn, eval_fn,
                epoch, scheduler=None, writer=writer, parent=mb)
            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)
            writer.add_scalars("data/metric/valid", {
                'loss': avg_valid_loss,
                'accuracy': avg_valid_accuracy,
                '3accuracy': avg_three_valid_acc
            }, epoch)
            logger.debug('======================== epoch {} ========================'.format(epoch + 1))
            logger.debug('lr : {:.5f}'.format(scheduler.get_lr()[0]))
            logger.debug('loss : train={:.5f} , test={:.5f}'.format(avg_train_loss, avg_valid_loss))
            logger.debug('acc(per 1 char) : train={:.3%} , test={:.3%}'.format(avg_train_accuracy, avg_valid_accuracy))
            logger.debug('acc(per 3 char) : train={:.3%} , test={:.3%}'.format(avg_three_train_acc, avg_three_valid_acc))

            # Checkpoint each of the three tracked metrics independently.
            if min_loss > avg_valid_loss:
                logger.debug('update best loss: {:.5f} ---> {:.5f}'.format(min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(), os.path.join(outdir, 'best_loss.pth'))
            if max_char_acc < avg_valid_accuracy:
                logger.debug('update best acc per 1 char: {:.3%} ---> {:.3%}'.format(max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(), os.path.join(outdir, 'best_acc.pth'))
            if max_3char_acc < avg_three_valid_acc:
                logger.debug('update best acc per 3 char: {:.3%} ---> {:.3%}'.format(max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(), os.path.join(outdir, 'best_3acc.pth'))

            # Log the current LR, then step the scheduler once per epoch.
            # (Removed a dead `if 1:` wrapper around this block.)
            if scheduler is not None:
                if writer is not None:
                    writer.add_scalar("data/learning rate", scheduler.get_lr()[0], epoch)
                scheduler.step()

        writer.add_scalars("data/metric/valid", {
            'best loss': min_loss,
            'best accuracy': max_char_acc,
            'best 3accuracy': max_3char_acc
        })
        logger.debug('================ FINISH TRAIN ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()
        local_cv['fold{}'.format(fold)] = {'accuracy': max_3char_acc,
                                           'valid_size': len(valid_dataset)}

        # Free training resources before the prediction pass.
        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        gc.collect()

        # Predict the test set with this fold's best 3-char-accuracy weights.
        logger.debug('=========== Prediction phrase ===========')
        logger.debug('load weight : {}'.format(os.path.join(outdir, 'best_3acc.pth')))
        model.load_state_dict(torch.load(os.path.join(outdir, 'best_3acc.pth')))
        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size']],
                augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'), mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'), mode='test')
        test_dataloader = DataLoader(test_dataset, batch_size=param['batch size'],
                                     num_workers=param['thread'], pin_memory=False,
                                     drop_last=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))
        output_list = pred_alcon_rnn(model, test_dataloader, param['device'])
        torch.save(output_list, os.path.join(outdir, 'prediction.pth'))
        pd.DataFrame(output_list).drop('logit', axis=1).sort_values('ID').set_index('ID') \
            .to_csv(os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()

    # Ensemble: average the per-fold test logits.
    print('======== Ensemble phase =========')
    prediction_dict = dict()
    mb = master_bar(param['fold'])
    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date, 'fold{}'.format(fold))
        # prediction is a list of dicts: {'ID': ..., 'logit': torch.tensor, ...}
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        if i == 0:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] = preds['logit'] / len(param['fold'])
        else:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] += preds['logit'] / len(param['fold'])

    # Write the cross-validation summary to the run-level result log.
    outdir = os.path.join(param['save path'], EXP_NAME, now_date)
    file_handler = logging.FileHandler(os.path.join(outdir, 'result.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.info(' ========== RESULT ========== \n')
    cv = 0.0
    train_data_size = 0
    for fold in param['fold']:
        acc = local_cv['fold{}'.format(fold)]['accuracy']
        valid_size = local_cv['fold{}'.format(fold)]['valid_size']
        train_data_size += valid_size
        logger.info(' fold {} : {:.3%} \n'.format(fold, acc))
        cv += acc * valid_size  # size-weighted accuracy
    logger.info(' Local CV : {:.3%} \n'.format(cv / train_data_size))
    logger.info(' ============================== \n')
    logger.removeHandler(file_handler)
    torch.save(prediction_dict, os.path.join(outdir, 'prediction.pth'))

    # Build the submission CSV and zip from the averaged logits.
    print('======== make submittion file =========')
    vocab = get_vocab(param['vocabdir'])
    submit_list = list()
    for ID, logits in progress_bar(prediction_dict.items()):
        submit_dict = dict()
        submit_dict["ID"] = ID
        preds = logits.softmax(dim=1).argmax(dim=1)
        submit_dict["Unicode1"] = vocab['index2uni'][preds[0]]
        submit_dict["Unicode2"] = vocab['index2uni'][preds[1]]
        submit_dict["Unicode3"] = vocab['index2uni'][preds[2]]
        submit_list.append(submit_dict)
    print()
    pd.DataFrame(submit_list).sort_values('ID').set_index('ID') \
        .to_csv(os.path.join(outdir, 'test_prediction.csv'))
    import zipfile
    with zipfile.ZipFile(os.path.join(outdir, 'submit_{}_{}.zip'.format(EXP_NAME, now_date)), 'w') as zf:
        zf.write(os.path.join(outdir, 'test_prediction.csv'))
    print('success!')
def _custom_train(
    self,
    train_dataset,
    tokenizer,
    model,
    num_train_examples,
    train_batch_size,
):
    """Distributed training loop with gradient accumulation for a token-
    classification model.

    Builds the loss/optimizer under the tf.distribute strategy taken from
    ``self.parent.config``, accumulates gradients over
    ``gradient_accumulation_steps`` mini-batches, applies them scaled by
    device count x accumulation steps, and periodically saves checkpoints.
    ``tokenizer`` is accepted but not used in this body.
    """
    # Work from a plain dict copy of the parent config; strategy and device
    # count are re-attached because _asdict() drops/needs them explicitly.
    config = self.parent.config._asdict()
    config["strategy"] = self.parent.config.strategy
    config["n_device"] = self.parent.config.n_device
    labels = config["ner_tags"]
    if config["max_steps"] > 0:
        # Fixed step budget: run exactly max_steps optimizer updates in one
        # pass over the data.
        num_train_steps = (
            config["max_steps"] * config["gradient_accumulation_steps"]
        )
        config["epochs"] = 1
    else:
        # Derive total optimizer steps from dataset size and epoch count.
        num_train_steps = (
            math.ceil(num_train_examples / train_batch_size)
            // config["gradient_accumulation_steps"]
            * config["epochs"]
        )
    # Loss, optimizer and accumulator must be created inside the strategy
    # scope so their variables are mirrored across devices.
    with config["strategy"].scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction=tf.keras.losses.Reduction.NONE
        )
        optimizer = create_optimizer(
            config["learning_rate"],
            num_train_steps,
            config["warmup_steps"],
        )
        if config["use_fp16"]:
            # Dynamic loss scaling guards fp16 gradients against underflow.
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic"
            )
        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()
    self.logger.info("***** Running training *****")
    self.logger.info(" Num examples = %d", num_train_examples)
    self.logger.info(" Num Epochs = %d", config["epochs"])
    self.logger.info(
        " Instantaneous batch size per device = %d",
        config["per_device_train_batch_size"],
    )
    self.logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * config["gradient_accumulation_steps"],
    )
    self.logger.info(
        " Gradient Accumulation steps = %d",
        config["gradient_accumulation_steps"],
    )
    self.logger.info(" Total training steps = %d", num_train_steps)
    self.logger.debug(model.summary())

    @tf.function
    def apply_gradients():
        # Flush the accumulated gradients into one optimizer update, scaled
        # down by (devices * accumulation steps); None gradients pass through.
        grads_and_vars = []
        for gradient, variable in zip(
            gradient_accumulator.gradients, model.trainable_variables
        ):
            if gradient is not None:
                scaled_gradient = gradient / (
                    config["n_device"] * config["gradient_accumulation_steps"]
                )
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))
        optimizer.apply_gradients(grads_and_vars, config["max_grad_norm"])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        # One distributed forward/backward pass; gradients go into the
        # accumulator, not the optimizer.
        def step_fn(train_features, train_labels):
            inputs = {
                "attention_mask": train_features["input_mask"],
                "training": True,
            }
            if config["model_architecture_type"] != "distilbert":
                # Only bert/xlnet consume token type ids; others get None.
                inputs["token_type_ids"] = (
                    train_features["segment_ids"]
                    if config["model_architecture_type"] in ["bert", "xlnet"]
                    else None
                )
            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Mask out padding positions before computing the loss.
                active_loss = tf.reshape(
                    train_features["input_mask"], (-1,)
                )
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (
                    1.0 / train_batch_size
                )
            grads = tape.gradient(loss, model.trainable_variables)
            gradient_accumulator(grads)
            return cross_entropy

        per_example_losses = config["strategy"].experimental_run_v2(
            step_fn, args=(train_features, train_labels)
        )
        mean_loss = config["strategy"].reduce(
            tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0
        )
        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(config["epochs"]))
    global_step = 0
    # NOTE(review): this looks like it was meant to be a local
    # `logging_loss = 0.0` (cf. the sibling train() function); as written it
    # sets an attribute that is never read again in this body.
    self.logger_loss = 0.0
    for epoch in train_iterator:
        # NOTE(review): `total` is the global step budget, not steps per
        # epoch — the bar's length is only accurate for single-epoch runs.
        epoch_iterator = progress_bar(
            train_dataset,
            total=num_train_steps,
            parent=train_iterator,
            display=config["n_device"] > 1,
        )
        step = 1
        with config["strategy"].scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)
                # Apply the accumulated update once every
                # gradient_accumulation_steps micro-batches.
                if step % config["gradient_accumulation_steps"] == 0:
                    config["strategy"].experimental_run_v2(apply_gradients)
                    loss_metric(loss)
                    global_step += 1
                    if (
                        config["save_steps"] > 0
                        and global_step % config["save_steps"] == 0
                    ):
                        # Save model checkpoint
                        output_dir = os.path.join(
                            config["output_dir"],
                            "checkpoint-{}".format(global_step),
                        )
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model.save_pretrained(output_dir)
                        self.logger.info(
                            "Saving model checkpoint to %s", output_dir
                        )
                train_iterator.child.comment = (
                    f"loss : {loss_metric.result()}"
                )
                step += 1
        train_iterator.write(
            f"loss epoch {epoch + 1}: {loss_metric.result()}"
        )
        # Reset the running mean so each epoch's reported loss is independent.
        loss_metric.reset_states()
    self.logger.debug(
        " Training took time = {}".format(
            datetime.datetime.now() - current_time
        )
    )
def begin_fit(self):
    """Create the epoch-level master bar and route run logging through it."""
    self.mbar = master_bar(range(self.epochs))
    epoch_bar = self.mbar
    epoch_bar.on_iter_begin()
    # Log rows are written via the bar, rendered as a table.
    self.run.logger = partial(epoch_bar.write, table=True)
def train(model, epochs, learning_rates, optimizer, criterion, dataset,
          batch_size=512, num_workers=0, drop_last=False, timer=None, balance=False):
    """Train ``model`` with a piecewise-linear LR schedule and return metrics.

    Parameters
    ----------
    model : the network to train (used by train_epoch/test_epoch).
    epochs : schedule knot positions; ``epochs[-1]`` is the epoch count.
    learning_rates : LR values at the knot positions in ``epochs``.
    optimizer, criterion : optimizer and loss passed through to the epoch fns.
    dataset : dict with 'train' and 'val' datasets.
    batch_size, num_workers, drop_last : DataLoader settings.
    timer : optional Timer; a fresh one is created when None.
    balance : when True, sample the train split with the dataset's balanced
        sampler instead of shuffling.  (FIX: this flag was referenced but
        never defined, which raised NameError.)

    Returns
    -------
    dict of per-epoch lists: epoch, learning rate, total time, train/val
    loss and accuracy.
    """
    t = timer or Timer()
    # FIX: removed a dead `train_batches = get_dataloader["train"]` line that
    # subscripted an undefined name and was immediately overwritten below.
    if balance:
        train_batches = torch.utils.data.DataLoader(
            dataset["train"], batch_size, shuffle=False, pin_memory=True,
            num_workers=num_workers, drop_last=drop_last,
            sampler=dataset["train"].get_balanced_sampler())
    else:
        train_batches = torch.utils.data.DataLoader(
            dataset["train"], batch_size, shuffle=True, pin_memory=True,
            num_workers=num_workers, drop_last=drop_last)
    test_batches = torch.utils.data.DataLoader(
        dataset["val"], batch_size, shuffle=False, pin_memory=True,
        num_workers=num_workers)

    train_size, val_size = len(dataset["train"]), len(dataset["val"])
    if drop_last:
        # Samples in the dropped tail batch never contribute to the averages.
        train_size -= (train_size % batch_size)
    num_epochs = epochs[-1]
    lr_schedule = LinearInterpolation(epochs, learning_rates)

    # FIX: `metric` was appended to but never initialized (NameError).
    metric = {"epoch": [], "learning rate": [], "total time": [],
              "train loss": [], "train acc": [], "val loss": [], "val acc": []}

    mb = master_bar(range(num_epochs))
    mb.write("Epoch\tTime\tLearRate\tT_loss\tT_accu\t\tV_loss\tV_accu")
    mb.write("-" * 70)
    for epoch in mb:
        # Per-batch LRs interpolated across this epoch, scaled by batch size.
        lrs = (lr_schedule(x) / batch_size
               for x in np.arange(epoch, epoch + 1, 1 / len(train_batches)))
        train_stats, train_time = train_epoch(mb, model, train_batches, optimizer,
                                              criterion, lrs, {
                                                  'loss': [],
                                                  'correct': []
                                              }), t()
        test_stats, test_time = test_epoch(mb, model, test_batches, criterion, {
            'loss': [],
            'correct': []
        }), t()
        metric["epoch"].append(epoch + 1)
        metric["learning rate"].append(lr_schedule(epoch + 1))
        metric["total time"].append(t.total_time)
        metric["train loss"].append(sum(train_stats['loss']) / train_size)
        metric["train acc"].append(sum(train_stats['correct']) / train_size)
        metric["val loss"].append(sum(test_stats['loss']) / val_size)
        metric["val acc"].append(sum(test_stats['correct']) / val_size)
        mb.write(
            "{}/{}\t{:.0f}:{:.0f}\t{:.4f}\t\t{:.4f}\t{:.4f}\t\t{:.4f}\t{:.4f}".
            format(metric["epoch"][-1], num_epochs,
                   metric["total time"][-1] // 60, metric["total time"][-1] % 60,
                   metric["learning rate"][-1], metric["train loss"][-1],
                   metric["train acc"][-1], metric["val loss"][-1],
                   metric["val acc"][-1]))
        # Live accuracy curves on the master bar.
        graphs = [[metric["epoch"], metric["train acc"]],
                  [metric["epoch"], metric["val acc"]]]
        mb.update_graph(graphs)
    return metric
def fit(self, X, y, epochs=100, validation_data=None, batch_size=32, verbose=True,
        early_stopping=False, trans=None, validation_trans=None):
    """Train ``self.net`` on (X, y) and return the per-epoch history dict.

    Parameters
    ----------
    X, y : training inputs/targets, wrapped via ``self.dataset``.
    epochs : maximum number of epochs.
    validation_data : optional (X_val, y_val) tuple; enables validation
        metrics and early stopping.
    batch_size : mini-batch size for both loaders.
    verbose : write a per-epoch summary line on the master bar.
    early_stopping : patience in epochs (falsy disables); restores the best
        weights from 'best_dict.pth' when triggered.
    trans, validation_trans : transforms forwarded to ``self.dataset``.

    Returns
    -------
    dict of lists: 'loss', one entry per metric name, plus 'val_*' keys when
    validation data is given and 'lr' when a scheduler is set.
    """
    dataset = self.dataset(X, y, trans=trans)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    self.history = {"loss": []}
    for metric in self.metrics:
        self.history[f'{metric.name}'] = []
    if validation_data:
        dataset = self.dataset(validation_data[0], validation_data[1],
                               trans=validation_trans)
        dataloader_val = DataLoader(dataset, shuffle=False, batch_size=batch_size)
        self.history["val_loss"] = []
        for metric in self.metrics:
            self.history[f'val_{metric.name}'] = []
    if self.scheduler:
        self.history["lr"] = []
    self.net.to(self.device)
    mb = master_bar(range(1, epochs + 1))
    best_loss, step, best_e = 1e10, 0, 0
    for epoch in mb:
        # train
        self.net.train()
        train_loss, train_metrics = [], [[] for m in self.metrics]
        for X, y in progress_bar(dataloader, parent=mb):
            X, y = X.to(self.device), y.to(self.device)
            self.optimizer.zero_grad()
            output = self.net(X)
            loss = self.loss(output, y)
            loss.backward()
            self.optimizer.step()
            train_loss.append(loss.item())
            comment = f'train_loss {np.mean(train_loss):.5f}'
            for i, metric in enumerate(self.metrics):
                train_metrics[i].append(metric.call(output, y))
                comment += f' train_{metric.name} {np.mean(train_metrics[i]):.5f}'
            mb.child.comment = comment
        self.history["loss"].append(np.mean(train_loss))
        for i, metric in enumerate(self.metrics):
            self.history[f'{metric.name}'].append(np.mean(train_metrics[i]))
        bar_text = f'Epoch {epoch}/{epochs} loss {np.mean(train_loss):.5f}'
        for i, metric in enumerate(self.metrics):
            bar_text += f' {metric.name} {np.mean(train_metrics[i]):.5f}'
        if self.scheduler:
            # FIX: this read the bare name `optimizer` (NameError) instead of
            # the instance attribute `self.optimizer`.
            self.history["lr"].append(self.optimizer.param_groups[0]['lr'])
            self.scheduler.step()
        # eval
        if validation_data:
            self.net.eval()
            val_loss, val_metrics = [], [[] for m in self.metrics]
            with torch.no_grad():
                for X, y in progress_bar(dataloader_val, parent=mb):
                    X, y = X.to(self.device), y.to(self.device)
                    output = self.net(X)
                    loss = self.loss(output, y)
                    val_loss.append(loss.item())
                    comment = f'val_loss {np.mean(val_loss):.5f}'
                    for i, metric in enumerate(self.metrics):
                        val_metrics[i].append(metric.call(output, y))
                        comment += f' val_{metric.name} {np.mean(val_metrics[i]):.5f}'
                    mb.child.comment = comment
            self.history["val_loss"].append(np.mean(val_loss))
            for i, metric in enumerate(self.metrics):
                self.history[f'val_{metric.name}'].append(np.mean(val_metrics[i]))
            bar_text += f' val_loss {np.mean(val_loss):.5f}'
            for i, metric in enumerate(self.metrics):
                bar_text += f' val_{metric.name} {np.mean(val_metrics[i]):.5f}'
            if early_stopping:
                step += 1
                if np.mean(val_loss) < best_loss:
                    # New best validation loss: snapshot weights and reset
                    # the patience counter.
                    best_loss = np.mean(val_loss)
                    torch.save(self.net.state_dict(), 'best_dict.pth')
                    best_e = epoch
                    step = 0
                if step >= early_stopping:
                    # Patience exhausted: restore the best snapshot and stop.
                    self.net.load_state_dict(torch.load('best_dict.pth'))
                    print(f"training stopped at epoch {epoch}")
                    print(f"best model found at epoch {best_e} with val_loss {best_loss:.5f}")
                    break
        if verbose:
            mb.write(bar_text)
    return self.history
def main(out: Param("dataset folder", Path, required=True),
         info: Param('info file', Path, required=True),
         tile: Param('generated tile size', int, nargs='+', required=True),
         n_train: Param('number of train tiles', int, required=True),
         n_valid: Param('number of validation tiles', int, required=True),
         crap_func: Param('crappifier name', str) = 'no_crap',
         n_frames: Param('number of frames', int) = 1,
         lr_type: Param('training input, (s)ingle, (t) multi or (z) multi', str) = 's',
         scale: Param('amount to scale', int) = 4,
         ftypes: Param('ftypes allowed e.g. - czi, tif', str, nargs='+') = None,
         upsample: Param('use upsample', action='store_true') = False,
         only: Param('limit to these categories', nargs='+') = None,
         # NOTE(review): mutable default list — harmless here because it is
         # only read, but a tuple/None default would be safer.
         skip: Param("categories to skip", str, nargs='+') = ['random', 'ArgoSIMDL'],
         clean: Param("wipe existing data first", action='store_true') = False):
    "generate tiles from source tiffs"
    up = 'up' if upsample else ''
    # Validate the low-res mode: single frame, multi over time, or multi over z.
    if lr_type not in ['s', 't', 'z']:
        print('lr_type should be s, t or z')
        return 1
    if lr_type == 's':
        z_frames, t_frames = 1, 1
    elif lr_type == 't':
        z_frames, t_frames = 1, n_frames
    elif lr_type == 'z':
        z_frames, t_frames = n_frames, 1
    # Output folder name encodes mode, frame count, info file and crappifier.
    out = ensure_folder(out / f'{lr_type}_{n_frames}_{info.stem}_{crap_func}')
    if clean:
        shutil.rmtree(out)
    # Resolve the crappifier by name.
    # NOTE(review): eval() on a CLI argument — fine for a trusted local tool,
    # unsafe if this ever runs on untrusted input.
    crap_func = eval(crap_func)
    if not crap_func is None:
        if not callable(crap_func):
            print('crap_func is not callable')
            crap_func = None
        else:
            # Bake scale/upsample into the crappifier once.
            crap_func = partial(crap_func, scale=scale, upsample=upsample)
    info = pd.read_csv(info)
    # Filter the catalog: file types, category whitelist/blacklist, and
    # minimum z/t frame counts for the chosen mode.
    if ftypes:
        info = info.loc[info.ftype.isin(ftypes)]
    if only:
        info = info.loc[info.category.isin(only)]
    elif skip:
        info = info.loc[~info.category.isin(skip)]
    info = info.loc[info.nz >= z_frames]
    info = info.loc[info.nt >= t_frames]
    tile_infos = []
    for mode, n_samples in [('train', n_train), ('valid', n_valid)]:
        mode_info = info.loc[info.dsplit == mode]
        categories = list(mode_info.groupby('category'))
        # `info` here deliberately shadows the outer DataFrame: it is each
        # category's sub-frame, so files are grouped within the category.
        files_by_category = {
            c: list(info.groupby('fn'))
            for c, info in categories
        }
        # Sample uniformly: first a category, then a file within it, then a
        # frame record that has enough t/z frames.
        for i in range(n_samples):
            category, cat_df = random.choice(categories)
            fn, item_df = random.choice(files_by_category[category])
            legal_choices = [
                item_info for ix, item_info in item_df.iterrows()
                if check_info(item_info, t_frames, z_frames)
            ]
            assert (legal_choices)
            item_info = random.choice(legal_choices)
            # One tile record per requested tile size.
            for tile_sz in tile:
                item_d = dict(item_info)
                item_d['tile_sz'] = tile_sz
                tile_infos.append(item_d)
    tile_info_df = pd.DataFrame(tile_infos).reset_index()
    print('num tile pulls:', len(tile_infos))
    print(tile_info_df.groupby('category').fn.count())
    last_stat = None
    tile_pull_info = []
    tile_puller = None
    multi_str = f'_{lr_type}_{n_frames}' if lr_type != 's' else ''
    # Group pulls by source file so each file is opened once.
    mbar = master_bar(tile_info_df.groupby('fn'))
    for fn, tile_stats in mbar:
        # Known-bad source file, skipped by name.
        if Path(
                fn
        ).stem == 'high res microtubules for testing before stitching - great quality':
            continue
        for i, tile_stat in progress_bar(list(tile_stats.iterrows()),
                                         parent=mbar):
            try:
                mode = tile_stat['dsplit']
                category = tile_stat['category']
                tile_sz = tile_stat['tile_sz']
                # hr tiles always; lr tiles only when a crappifier is active.
                tile_folder = ensure_folder(
                    out / f'hr_t_{tile_sz}{multi_str}' / mode / category)
                if crap_func:
                    crap_folder = ensure_folder(
                        out / f'lr{up}_t_{tile_sz}{multi_str}' / mode / category)
                else:
                    crap_folder = None
                # When the source stats change, close the old puller's cache
                # and build a new one for this file.
                if need_cache_flush(tile_stat, last_stat):
                    if tile_puller:
                        tile_puller(None, None, None, close_me=True)
                    last_stat = tile_stat.copy()
                    tile_sz = tile_stat['tile_sz']
                    tile_puller = get_tile_puller(tile_stat, crap_func,
                                                  t_frames, z_frames)
                tile_pull_info.append(
                    tile_puller(tile_stat, tile_folder, crap_folder))
            except MemoryError as error:
                # some files are too big to read
                fn = Path(tile_stat['fn'])
                print(f'too big: {fn.stem}')
    # Persist a manifest of everything that was pulled.
    pd.DataFrame(tile_pull_info).to_csv(out / f'tiles{multi_str}.csv',
                                        index=False)
def begin_fit(self):
    """Build the epoch master bar and wire it up as the run's logger."""
    self.master_bar = master_bar(range(self.epochs))
    bar = self.master_bar
    bar.on_iter_begin()
    # The Callback machinery stores the Learner() under self.run; its log
    # lines are written through the bar as a table.
    self.run.logger = partial(bar.write, table=True)
def train(self, num_epochs, max_lr=0.1):
    """Train for up to ``num_epochs`` with a triangular LR schedule
    (0 -> max_lr at num_epochs/4 -> 0), early-stopping after 10 epochs
    without validation-loss improvement.  Appends per-epoch metrics to
    ``self.log`` and saves the model whenever validation loss improves.
    """
    t = Timer()
    valid_loss_min = np.Inf
    patience = 10
    p = 0  # current number of epochs, where validation loss didn't increase
    #train_size, val_size = len(self.train_ds), len(self.valid_ds)
    #if drop_last: train_size -= (train_size % self.batch_size)
    # Triangular schedule knots: ramp up over the first quarter, then decay.
    self.epochs = [0, num_epochs / 4, num_epochs]  #[0, 15, 30, 35]
    self.learning_rates = [0, max_lr, 0]  #[0, 0.1, 0.005, 0]
    lr_schedule = LinearInterpolation(self.epochs, self.learning_rates)
    #mo_schedule = LinearInterpolation(epochs, momentum)
    # Fresh per-epoch stat buffers consumed by train_epoch/valid_epoch.
    stats = {
        "train_it": [],
        'train_loss': [],
        'train_metric': [],
        "valid_it": [],
        'valid_loss': [],
        'valid_metric': []
    }
    mb = master_bar(range(num_epochs))
    mb.names = ["train loss", "train acc", "val loss", "val acc"]
    mb.write("Epoch\tTime\tLearRate\tT_loss\tT_accu\t\tV_loss\tV_accu")
    mb.write("-" * 70)
    for epoch in mb:
        # NOTE(review): looks like leftover debug output — it prints the
        # literal word "epoch" every iteration; confirm before removing.
        mb.write("epoch")
        #self.train_batches.dataset.set_random_choices()
        # Per-batch LRs interpolated across this epoch, scaled by batch size.
        lrs = (lr_schedule(x) / self.batch_size
               for x in np.arange(epoch, epoch + 1,
                                  1 / len(self.train_batches)))
        stats, train_time = self.train_epoch(stats, epoch, mb, lrs), t()
        stats, valid_time = self.valid_epoch(
            stats,
            epoch,
            mb,
        ), t()
        self.log["epoch"].append(epoch + 1)
        self.log["learning rate"].append(lr_schedule(epoch + 1))
        self.log["total time"].append(t.total_time)
        self.log["train loss"].append(np.mean(
            stats['train_loss']))  # or np.mean
        self.log["train acc"].append(np.mean(stats['train_metric']))
        self.log["val loss"].append(np.mean(stats['valid_loss']))
        self.log["val acc"].append(np.mean(stats['valid_metric']))
        if self.log["val loss"][-1] <= valid_loss_min:
            # Val loss improve
            mb.write('Saving model!')
            self.save_model()
            valid_loss_min = self.log["val loss"][-1]
            p = 0
        else:
            # Val loss didn't improve
            p += 1
            if p > patience:
                mb.write('Stopping training')
                break
        # One formatted summary row per completed epoch.
        mb.write(
            "{}/{}\t{:.0f}:{:.0f}\t{:.4f}\t\t{:.4f}\t{:.4f}\t\t{:.4f}\t{:.4f}"
            .format(self.log["epoch"][-1], num_epochs,
                    self.log["total time"][-1] // 60,
                    self.log["total time"][-1] % 60,
                    self.log["learning rate"][-1], self.log["train loss"][-1],
                    self.log["train acc"][-1], self.log["val loss"][-1],
                    self.log["val acc"][-1]))
        #graphs = [[self.log["epoch"], self.log["train acc"]],
        #          [self.log["epoch"], self.log["val acc"]]]
        #mb.update_graph(graphs)
    torch.cuda.empty_cache()  # free cache mem after train
def training(self):
    """Train/validate per epoch with early stopping on validation loss.

    Runs a train and a valid phase each epoch, tracks the best epoch's
    scores in ``best_score`` (keys use "_qwk" names but the scorer is
    ``__auc_scoring``), updates the parent's training log / best model /
    learning curve, and stops after ``early_stopping_rounds`` epochs
    without validation-loss improvement.  Returns the best score dict.
    """
    condition = self.config["train"]["condition"]
    best_score = {"epoch": -1,
                  "train_loss": np.inf, "valid_loss": np.inf,
                  "train_qwk": 0.0, "valid_qwk": 0.0}
    non_improvement_round = 0
    mb = master_bar(range(condition["epoch"]))
    for epoch in mb:
        temp_score = {"epoch": epoch,
                      "train_loss": 0.0, "valid_loss": 0.0,
                      "train_qwk": 0.0, "valid_qwk": 0.0}
        for phase in ["train", "valid"]:
            if phase == "train":
                data_loader = self.train_loader
                # NOTE(review): scheduler.step() is called at the start of the
                # train phase, i.e. before any optimizer.step() this epoch —
                # PyTorch >=1.1 recommends stepping after the optimizer.
                self.scheduler.step()
                self.model.train()
            elif phase == "valid":
                data_loader = self.valid_loader
                self.model.eval()
            running_loss = 0.0
            # Accumulated targets/outputs as (N, 1) columns for scoring.
            y_true, y_pred = np.array([]).reshape((0, 1)), np.array([]).reshape((0, 1))
            for data in progress_bar(data_loader, parent=mb):
                mb.child.comment = ">> {} phase".format(phase)
                inputs = data["image"].to(self.device, dtype=torch.float)
                labels = data["label"].view(-1, 1).to(self.device, dtype=torch.float)
                self.optimizer.zero_grad()
                # NOTE(review): the forward pass sits OUTSIDE the
                # set_grad_enabled block, so the valid phase still builds the
                # autograd graph — moving it inside would save memory.
                outputs = self.model(inputs)
                with torch.set_grad_enabled(phase == "train"):
                    loss = self.criterion(outputs, labels)
                    if phase == "train":
                        loss.backward()
                        self.optimizer.step()
                running_loss += loss.item()
                if torch.cuda.is_available():
                    labels = labels.cpu()
                    outputs = outputs.cpu()
                y_true = np.vstack((y_true, labels.detach().numpy()))
                y_pred = np.vstack((y_pred, outputs.detach().numpy()))
            # Mean loss over batches and epoch-level score for this phase.
            temp_score["{}_loss".format(phase)] = running_loss / len(data_loader)
            temp_score["{}_qwk".format(phase)] = self.__auc_scoring(y_true, y_pred)
        super().update_training_log(temp_score)
        # Keep the epoch with the lowest validation loss.
        if best_score["valid_loss"] > temp_score["valid_loss"]:
            best_score = temp_score
            super().update_best_model(self.model.state_dict())
            non_improvement_round = 0
        else:
            non_improvement_round += 1
        # Progress report every 10 epochs.
        if epoch % 10 == 0:
            text = "[epoch {}] best epoch:{} train loss:{} valid loss:{} train auc:{} valid auc:{}".format(
                epoch,
                best_score["epoch"],
                np.round(best_score["train_loss"], 5),
                np.round(best_score["valid_loss"], 5),
                np.round(best_score["train_qwk"], 5),
                np.round(best_score["valid_qwk"], 5)
            )
            mb.write(text)
            super().update_learning_curve()
    # Early Stopping
        if non_improvement_round >= condition["early_stopping_rounds"]:
            print("\t Early stopping: {}[epoch]".format(epoch))
            break
    super().update_learning_curve()
    return best_score
def train(
    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
):
    """Train a token-classification model under a tf.distribute strategy.

    Supports gradient accumulation, optional fp16 loss scaling, periodic
    evaluation/logging to a TensorBoard writer, and periodic checkpointing
    via ``model.save_pretrained``.

    Args:
        args: dict of hyperparameters (max_steps, num_train_epochs,
            gradient_accumulation_steps, learning_rate, warmup_steps, fp16,
            n_device, model_type, logging_steps, save_steps,
            evaluate_during_training, output_dir, max_grad_norm, ...).
        strategy: tf.distribute strategy to run under.
        train_dataset: distributed dataset yielding (features, labels).
        tokenizer: tokenizer, forwarded to evaluate().
        model: Keras model with transformers-style save_pretrained().
        num_train_examples: total number of training examples.
        labels: label list; logits are reshaped to len(labels) + 1 classes.
        train_batch_size: total (global) batch size.
        pad_token_label_id: label id for padding, forwarded to evaluate().
    """
    # If max_steps is given it overrides epochs; one pass with a capped
    # number of optimizer steps.
    if args["max_steps"] > 0:
        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
        args["num_train_epochs"] = 1
    else:
        num_train_steps = (
            math.ceil(num_train_examples / train_batch_size)
            // args["gradient_accumulation_steps"]
            * args["num_train_epochs"]
        )
    writer = tf.summary.create_file_writer("/tmp/mylogs")
    with strategy.scope():
        # Per-example losses (no reduction) so masking can be applied first.
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
        if args["fp16"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()
    logging.info("***** Running training *****")
    logging.info(" Num examples = %d", num_train_examples)
    logging.info(" Num Epochs = %d", args["num_train_epochs"])
    logging.info(" Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
    logging.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * args["gradient_accumulation_steps"],
    )
    logging.info(" Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
    logging.info(" Total training steps = %d", num_train_steps)
    model.summary()

    @tf.function
    def apply_gradients():
        # Average accumulated gradients over devices * accumulation steps,
        # apply them, then clear the accumulator.
        grads_and_vars = []
        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))
        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        def step_fn(train_features, train_labels):
            # distilbert takes no token_type_ids; bert/xlnet take real ones.
            inputs = {"attention_mask": train_features["input_mask"], "training": True}
            if args["model_type"] != "distilbert":
                inputs["token_type_ids"] = (
                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
                )
            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Mask out padding positions before computing the loss.
                active_loss = tf.reshape(train_features["input_mask"], (-1,))
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)
                # Accumulate; gradients are applied every
                # gradient_accumulation_steps in the outer loop.
                gradient_accumulator(grads)
            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(args["num_train_epochs"]))
    global_step = 0
    logging_loss = 0.0
    for epoch in train_iterator:
        epoch_iterator = progress_bar(
            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
        )
        step = 1
        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)
                # Optimizer step only every gradient_accumulation_steps batches.
                if step % args["gradient_accumulation_steps"] == 0:
                    strategy.experimental_run_v2(apply_gradients)
                    loss_metric(loss)
                    global_step += 1
                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                        # Log metrics
                        if (
                            args["n_device"] == 1 and args["evaluate_during_training"]
                        ):  # Only evaluate when single GPU otherwise metrics may not average well
                            y_true, y_pred, eval_loss = evaluate(
                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
                            )
                            report = metrics.classification_report(y_true, y_pred, digits=4)
                            logging.info("Eval at step " + str(global_step) + "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))
                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)
                            with writer.as_default():
                                tf.summary.scalar("eval_loss", eval_loss, global_step)
                                tf.summary.scalar("precision", precision, global_step)
                                tf.summary.scalar("recall", recall, global_step)
                                tf.summary.scalar("f1", f1, global_step)
                        # NOTE(review): lr(step) uses the within-epoch step,
                        # not global_step — confirm this is the intended
                        # schedule index.
                        lr = optimizer.learning_rate
                        learning_rate = lr(step)
                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
                            tf.summary.scalar(
                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
                            )
                        logging_loss = loss_metric.result()
                    with writer.as_default():
                        tf.summary.scalar("loss", loss_metric.result(), step=step)
                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model.save_pretrained(output_dir)
                        logging.info("Saving model checkpoint to %s", output_dir)
                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                step += 1
        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
        # Reset the running mean so each epoch's reported loss is independent.
        loss_metric.reset_states()
    logging.info(" Training took time = {}".format(datetime.datetime.now() - current_time))
def main(config, pretrained=False, patience=1, lr_scale=1., pretrained_path=None, var_neighbor=5, random_neighbor=True, netname='att-gcn'):
    """Train a ResidualGatedGCN model for TSP with validation-driven LR decay,
    checkpointing and early stopping.

    Args:
        config: experiment settings object (expt_name, max_epochs, val_every,
            test_every, learning_rate, decay_rate, num_neighbors, ...).
        pretrained: if True, load weights before training.
        patience: early-stopping patience in validation rounds.
        lr_scale: multiplier applied to config.learning_rate.
        pretrained_path: explicit checkpoint path; falls back to
            ./tsp-models/<expt_name>/best_val_checkpoint.tar when None.
        var_neighbor: re-sample the neighbor count every this many epochs.
        random_neighbor: sample num_neighbors from config.num_neighbors if
            True, else use config.num_neighbors as-is.
        netname: prefix used for the timestamped log directory.

    Returns:
        The trained (DataParallel-wrapped) network.
    """
    # Instantiate the network
    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_id)
    # dtypeFloat / dtypeLong are module-level globals — presumably torch
    # dtype aliases; verify where they are defined.
    net = nn.DataParallel(ResidualGatedGCNModel(config, dtypeFloat, dtypeLong))
    if torch.cuda.is_available():
        net.cuda()
    if pretrained:
        # NOTE(review): checkpoints are only loaded when CUDA is available;
        # on CPU `pretrained=True` silently loads nothing — confirm intended.
        if pretrained_path is not None:
            log_dir = pretrained_path
            if torch.cuda.is_available():
                checkpoint = torch.load(log_dir)
                net.load_state_dict(checkpoint['model_state_dict'])
        else:
            log_dir = f"./tsp-models/{config.expt_name}/"
            if torch.cuda.is_available():
                checkpoint = torch.load(log_dir + "best_val_checkpoint.tar")
                net.load_state_dict(checkpoint['model_state_dict'])
    print(net)
    # Compute number of network parameters
    nb_param = 0
    for param in net.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters:', nb_param)
    # Create log directory
    tmp_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    tmp_time = '{}-{}'.format(netname, tmp_time)
    log_dir = f"./logs/{config.expt_name}/{tmp_time}/"
    os.makedirs(log_dir, exist_ok=True)
    json.dump(config, open(f"{log_dir}/config.json", "w"), indent=4)
    writer = SummaryWriter(log_dir)  # Define Tensorboard writer
    # Training parameters
    #batch_size = config.batch_size
    #batches_per_epoch = config.batches_per_epoch
    #accumulation_steps = config.accumulation_steps
    #num_nodes = config.num_nodes
    #num_neighbors = config.num_neighbors
    max_epochs = config.max_epochs
    val_every = config.val_every
    test_every = config.test_every
    learning_rate = config.learning_rate * lr_scale
    decay_rate = config.decay_rate
    num_patience = 0
    val_loss_old = 1e6  # For decaying LR based on validation loss
    val_loss_best = 1e6
    best_pred_tour_len = 1e6  # For saving checkpoints
    # Define optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    # optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate,
    #                             momentum=0.9, weight_decay=0.0005)
    print(optimizer)
    epoch_bar = master_bar(range(max_epochs))
    for epoch in epoch_bar:
        # Log to Tensorboard
        # Re-sample neighbor count every var_neighbor epochs (epoch 0 always
        # samples, so num_neighbors is defined before first use).
        if random_neighbor:
            if epoch % var_neighbor == 0:
                num_neighbors = np.random.choice(config.num_neighbors)
        else:
            num_neighbors = config.num_neighbors
        writer.add_scalar('learning_rate', learning_rate, epoch)
        # Train
        train_time, train_loss, train_err_edges, train_err_tour, train_err_tsp, train_pred_tour_len, train_gt_tour_len = train_one_epoch(
            net, optimizer, config, epoch_bar, num_neighbors)
        epoch_bar.write('t: ' + metrics_to_str(
            epoch, train_time, learning_rate, train_loss, train_err_edges,
            train_err_tour, train_err_tsp, train_pred_tour_len,
            train_gt_tour_len, num_neighbors))
        writer.add_scalar('loss/train_loss', train_loss, epoch)
        writer.add_scalar('pred_tour_len/train_pred_tour_len', train_pred_tour_len, epoch)
        writer.add_scalar('optimality_gap/train_opt_gap', train_pred_tour_len / train_gt_tour_len - 1, epoch)
        if epoch % val_every == 0 or epoch == max_epochs - 1:
            # Validate
            val_time, val_loss, val_err_edges, val_err_tour, val_err_tsp, val_pred_tour_len, val_gt_tour_len = test(
                net, config, epoch_bar, mode='val', num_neighbors=num_neighbors)
            epoch_bar.write('v: ' + metrics_to_str(
                epoch, val_time, learning_rate, val_loss, val_err_edges,
                val_err_tour, val_err_tsp, val_pred_tour_len,
                val_gt_tour_len, num_neighbors))
            writer.add_scalar('loss/val_loss', val_loss, epoch)
            writer.add_scalar('pred_tour_len/val_pred_tour_len', val_pred_tour_len, epoch)
            writer.add_scalar('optimality_gap/val_opt_gap', val_pred_tour_len / val_gt_tour_len - 1, epoch)
            # Save checkpoint
            if val_pred_tour_len < best_pred_tour_len:
                best_pred_tour_len = val_pred_tour_len  # Update best prediction
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'train_loss': train_loss,
                        'val_loss': val_loss,
                    }, log_dir + "best_val_checkpoint_{}.tar".format(epoch))
            # Update learning rate
            # Decay whenever validation loss fails to improve by at least 1%.
            if val_loss > 0.99 * val_loss_old:
                learning_rate /= decay_rate
                optimizer = update_learning_rate(optimizer, learning_rate)
            val_loss_old = val_loss  # Update old validation loss
            # Early Stopping
            if val_loss_best > val_loss:
                num_patience = 0
                val_loss_best = val_loss
            else:
                num_patience += 1
        # if epoch % test_every == 0 or epoch == max_epochs-1:
        #     # Test
        #     test_time, test_loss, test_err_edges, test_err_tour, test_err_tsp, test_pred_tour_len, test_gt_tour_len = test(net, config, epoch_bar, mode='test')
        #     epoch_bar.write('T: ' + metrics_to_str(epoch, test_time, learning_rate, test_loss, test_err_edges, test_err_tour, test_err_tsp, test_pred_tour_len, test_gt_tour_len))
        #     writer.add_scalar('loss/test_loss', test_loss, epoch)
        #     writer.add_scalar('pred_tour_len/test_pred_tour_len', test_pred_tour_len, epoch)
        #     writer.add_scalar('optimality_gap/test_opt_gap', test_pred_tour_len/test_gt_tour_len - 1, epoch)
        # Save training checkpoint at the end of epoch
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
            }, log_dir + "last_train_checkpoint.tar")
        # Save checkpoint after every 250 epochs
        if epoch != 0 and (epoch % 250 == 0 or epoch == max_epochs - 1):
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                }, log_dir + f"checkpoint_epoch{epoch}.tar")
        if num_patience >= patience:
            break
    return net
def begin_fit(self):
    """Set up the epoch-level progress bar and hook trainer logging into it.

    Creates a ``master_bar`` spanning ``self.epochs`` epochs, stores it on
    ``self.mbar``, and redirects ``self.trainer.logger`` to the bar's
    table-mode ``write``.
    """
    bar = master_bar(range(self.epochs))
    self.mbar = bar
    self.trainer.logger = partial(bar.write, table=True)
def train_vcae(n_epochs, model, train_iterator, val_iterator, optimizer, device, criterion, save_best=True, verbose=True, is_nf=False, nf=None):
    """Train a VCAE (optionally wrapped in a normalizing flow) and return the
    path of the best checkpoint.

    Each epoch runs a train pass and a validation pass via ``run_epoch``,
    appends the epoch losses to a log file, saves the model whenever the
    validation loss improves, and (when ``verbose``) updates the live loss
    plot on the progress bar.
    """
    if is_nf:
        model_name = 'NormalizingFlow' + model.__class__.__name__
    else:
        model_name = model.__class__.__name__
    writer, experiment_name, best_model_path = setup_experiment(model_name, log_dir="./tb")

    mb = master_bar(range(n_epochs))
    history_train, history_val = [], []
    best_val_loss = float('+inf')

    for epoch in mb:
        loss_train = run_epoch(model, train_iterator, optimizer, criterion, mb,
                               phase='train', epoch=epoch, writer=writer,
                               is_nf=is_nf, nf=nf, device=device)
        loss_val = run_epoch(model, val_iterator, None, criterion, mb,
                             phase='val', epoch=epoch, writer=writer,
                             is_nf=is_nf, nf=nf, device=device)

        # Persist this epoch's mean losses to the experiment log file.
        epoch_log = {'train_loss_mean': loss_train, 'test_loss_mean': loss_val}
        log_path = LOG_PATH + FILE_NAME + experiment_name + FILE_EXCITON
        save_to_file(log_path, epoch_log)

        # Checkpoint whenever validation improves.
        if save_best and loss_val < best_val_loss:
            best_val_loss = loss_val
            target = nf if is_nf else model
            save_model(target, best_model_path)

        if verbose:
            # Keep running histories for the live plot.
            history_train.append(loss_train)
            history_val.append(loss_val)
            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = f"train_loss:{round(loss_train, 3)}, val_loss:{round(loss_val, 3)}"
            plot_loss_update(epoch, n_epochs, mb, history_train, history_val)

    return best_model_path
def train(region):
    """Train a seq2seq (Encoder/Decoder with attention) forecaster for one
    region, save the weights under ./models/, evaluate on the test split and
    return a summary string "<region> RMSE <rmse> r <r>".

    The model consumes windows of length ``input_len`` with 2 input features
    and predicts ``predict_len`` steps of a single target, with scheduled
    teacher forcing during training.
    """
    # Fixed seeds for reproducibility of both init and teacher-forcing draws.
    np.random.seed(0)
    torch.manual_seed(0)
    # Hyperparameters (hard-coded for this experiment).
    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    epochs = 500
    force_teacher = 0.8  # probability of feeding ground truth to the decoder

    # train_max/train_min are the scaling bounds used to de-normalize later.
    train_dataset, test_dataset, train_max, train_min = create_dataset(
        input_len, predict_len, region)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len,
                  encoder_rnn_layers, encoder_dropout)
    # encoder_units*2: decoder consumes the bidirectional encoder output.
    dec = Decoder(encoder_units*2, decoder_units, input_len, input_len,
                  decoder_dropout, output_size)
    optimizer = AdaBound(list(enc.parameters()) +
                         list(dec.parameters()), 0.01, final_lr=0.1)
    # optimizer = optim.Adam(list(enc.parameters()) + list(dec.parameters()), 0.01)
    criterion = nn.MSELoss()

    mb = master_bar(range(epochs))
    for ep in mb:
        train_loss = 0
        enc.train()
        dec.train()
        for encoder_input, decoder_input, target in progress_bar(train_loader, parent=mb):
            optimizer.zero_grad()
            enc_vec = enc(encoder_input)
            # Decoder hidden state seeded from the last encoder output;
            # cell state from the decoder's own init.
            h = enc_vec[:, -1, :]
            _, c = dec.initHidden(batch_size)
            x = decoder_input[:, 0]
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                rand = np.random.random()
                pred += [x]
                if rand < force_teacher:
                    # NOTE(review): feeds decoder_input[:, pi] AFTER step pi,
                    # which re-feeds index 0 at the first step (x started as
                    # decoder_input[:, 0]) — confirm whether pi+1 was meant.
                    x = decoder_input[:, pi]
            pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        test_loss = 0
        enc.eval()
        dec.eval()
        for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
            with torch.no_grad():
                enc_vec = enc(encoder_input)
                h = enc_vec[:, -1, :]
                _, c = dec.initHidden(batch_size)
                x = decoder_input[:, 0]
                pred = []
                for pi in range(predict_len):
                    x, h, c = dec(x, h, c, enc_vec)
                    pred += [x]
                pred = torch.cat(pred, dim=1)
                # loss = quantile_loss(pred, target)
                loss = criterion(pred, target)
                test_loss += loss.item()
        print(
            f"Epoch {ep} Train Loss {train_loss/len(train_loader)} Test Loss {test_loss/len(test_loader)}")

    if not os.path.exists("models"):
        os.mkdir("models")
    torch.save(enc.state_dict(), f"models/{region}_enc.pth")
    torch.save(dec.state_dict(), f"models/{region}_dec.pth")

    # Final evaluation one sample at a time (no drop_last, batch of 1).
    test_loader = DataLoader(test_dataset, batch_size=1,
                             shuffle=False, drop_last=False)
    rmse = 0
    p = 0  # horizon index evaluated below (only the first predicted step)
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            # NOTE(review): here BOTH h and c come from dec.initHidden(1),
            # unlike training where h = enc_vec[:, -1, :] — confirm this
            # inconsistency is intentional.
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, p].item()]
    # De-normalize with the training min/max before computing metrics.
    predicted = np.array(predicted).reshape(1, -1)
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min
    rmse, peasonr = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {peasonr[0]}")
    return f"{region} RMSE {rmse} r {peasonr[0]}"
def main():
    """Run per-fold snapshot-ensemble training for the Alcon task, then
    ensemble the fold predictions and write a submission CSV.

    For each fold: build datasets/loaders, train DenseNet201GRU2 with
    cosine-annealing warm restarts taking snapshot checkpoints, compute the
    fold's local CV from the snapshot ensemble, and predict on the test set.
    Finally average the fold prediction logits and write the ensemble
    submission.

    Fixes in this revision:
      * ``cv`` is now initialized before the final accumulation loop (the
        initializer had been commented out, causing a NameError at the
        RESULT stage).
      * ``avg_train_loss`` now starts at 0.0 like its sibling accumulators
        (it was initialized to 10**5, inflating every logged mean train
        loss by 10**5 / len(train_dataloader)).
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')
    # stream_handler = logging.StreamHandler()
    # stream_handler.setLevel(logging.DEBUG)
    # stream_handler.setFormatter(handler_format)
    # logger.addHandler(stream_handler)
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                     now.hour, now.minute, now.second))
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date
    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()
    for fold in param['fold']:
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            raise FileNotFoundError
        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)
        logger.debug('============= FOLD {} ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute, now.second))
        # Dataset: scale the batch size by the GPU count (min 1x).
        param['batch size'] = max(param['batch size'],
                                  param['batch size'] * param['GPU'])
        if param['debug']:
            # Debug mode: tiny subsets (12 batches) for a fast smoke run.
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid != @fold').iloc[:param['batch size'] * 12],
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid == @fold').iloc[:param['batch size'] * 12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train', margin_augmentation=True)
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid', margin_augmentation=False)
        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False, shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False, shuffle=False)
        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model
        model = DenseNet201GRU2(num_classes=48, hidden_size=512,
                                bidirectional=True, load_weight=None,
                                dropout=param['dropout'])
        param['model'] = model.__class__.__name__

        # optim
        optimizer = torch.optim.SGD(model.parameters(), lr=0.001,
                                    momentum=0.9, weight_decay=1e-5,
                                    nesterov=False)
        # scheduler
        model = model.to(param['device'])
        # Mixed precision via apex AMP (O1: patched ops).
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if param['GPU'] > 0:
            model = nn.DataParallel(model)
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5

        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(
            EXP_NO, now_date, fold))
        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        # Re-initialize the best-score trackers (supersedes the values above)
        # and the snapshot bookkeeping.
        max_char_acc = -1e-5
        max_3char_acc = -1e-5
        min_loss = 1e+5
        snapshot = 0
        snapshot_loss_list = list()
        snapshot_eval_list = list()
        snapshot_eval3_list = list()
        snapshot_loss = 1e+5
        snapshot_eval = -1e-5
        snapshot_eval3 = -1e-5
        # Validate three times per epoch (every val_iter train steps).
        val_iter = math.ceil(len(train_dataloader) / 3)
        print('val_iter: {}'.format(val_iter))

        # Hyper params
        cycle_iter = 5   # epochs per cosine-annealing cycle
        snap_start = 2   # cycles to run before snapshotting begins
        n_snap = 8       # number of snapshots to take

        mb = master_bar(range((n_snap + snap_start) * cycle_iter))
        scheduler = CosineAnnealingWarmUpRestarts(
            optimizer, T_0=len(train_dataloader) * cycle_iter, T_mult=1,
            T_up=500, eta_max=0.1)

        for epoch in mb:
            # At the start of each cycle (after warm-up cycles), open a new
            # snapshot and reset its best-score trackers.
            if epoch % cycle_iter == 0 and epoch >= snap_start * cycle_iter:
                if snapshot > 1:
                    snapshot_loss_list.append(snapshot_loss)
                    snapshot_eval_list.append(snapshot_eval)
                    snapshot_eval3_list.append(snapshot_eval3)
                snapshot += 1
                snapshot_loss = 10**5
                snapshot_eval = 0.0
                snapshot_eval3 = 0.0
            model.train()
            # FIX: was 10**5, which inflated the logged mean train loss.
            avg_train_loss = 0.0
            avg_train_accuracy = 0.0
            avg_three_train_acc = 0.0
            for step, (inputs, targets, indice) in enumerate(
                    progress_bar(train_dataloader, parent=mb)):
                model.train()
                inputs = inputs.to(param['device'])
                targets = targets.to(param['device'])
                optimizer.zero_grad()
                logits = model(inputs)  # logits.size() = (batch*3, 48)
                preds = logits.view(targets.size(0), 3, -1).softmax(dim=2)
                loss = loss_fn(logits,
                               targets.view(-1, targets.size(2)).argmax(dim=1))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # loss.backward()
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                avg_train_loss += loss.item()
                _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item()
                avg_train_accuracy += _avg_accuracy
                _three_char_accuracy = accuracy_three_character(
                    preds, targets.argmax(dim=2), mean=True).item()
                avg_three_train_acc += _three_char_accuracy

                writer.add_scalar("data/learning rate",
                                  scheduler.get_lr()[0],
                                  step + epoch * len(train_dataloader))
                scheduler.step()
                writer.add_scalars(
                    "data/metric/train", {
                        'loss': loss.item(),
                        'accuracy': _avg_accuracy,
                        '3accuracy': _three_char_accuracy
                    }, step + epoch * len(train_dataloader))

                # Mid-epoch validation every val_iter steps.
                if step % val_iter == 0 and step != 0:
                    avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                        model, valid_dataloader, param['device'], loss_fn,
                        eval_fn)
                    writer.add_scalars(
                        "data/metric/valid", {
                            'loss': avg_valid_loss,
                            'accuracy': avg_valid_accuracy,
                            '3accuracy': avg_three_valid_acc
                        }, epoch)
                    logger.debug(
                        '======================== epoch {} | step {} ========================'
                        .format(epoch + 1, step + 1))
                    logger.debug('lr : {:.5f}'.format(
                        scheduler.get_lr()[0]))
                    logger.debug(
                        'loss : test={:.5f}'.format(avg_valid_loss))
                    logger.debug('acc(per 1 char) : test={:.3%}'.format(
                        avg_valid_accuracy))
                    logger.debug('acc(per 3 char) : test={:.3%}'.format(
                        avg_three_valid_acc))
                    if min_loss > avg_valid_loss:
                        logger.debug(
                            'update best loss: {:.5f} ---> {:.5f}'.format(
                                min_loss, avg_valid_loss))
                        min_loss = avg_valid_loss
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_loss.pth'))
                    if max_char_acc < avg_valid_accuracy:
                        logger.debug(
                            'update best acc per 1 char: {:.3%} ---> {:.3%}'.
                            format(max_char_acc, avg_valid_accuracy))
                        max_char_acc = avg_valid_accuracy
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_acc.pth'))
                    if max_3char_acc < avg_three_valid_acc:
                        logger.debug(
                            'update best acc per 3 char: {:.3%} ---> {:.3%}'.
                            format(max_3char_acc, avg_three_valid_acc))
                        max_3char_acc = avg_three_valid_acc
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_3acc.pth'))
                    # Per-snapshot best checkpoints (used for the ensemble).
                    if snapshot > 0:
                        if snapshot_loss > avg_valid_loss:
                            logger.debug(
                                '[snap] update best loss: {:.5f} ---> {:.5f}'.
                                format(snapshot_loss, avg_valid_loss))
                            snapshot_loss = avg_valid_loss
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_loss_{snapshot}.pth'))
                        if snapshot_eval < avg_valid_accuracy:
                            logger.debug(
                                '[snap] update best acc per 1 char: {:.3%} ---> {:.3%}'
                                .format(snapshot_eval, avg_valid_accuracy))
                            snapshot_eval = avg_valid_accuracy
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_acc_{snapshot}.pth'))
                        if snapshot_eval3 < avg_three_valid_acc:
                            logger.debug(
                                '[snap] update best acc per 3 char: {:.3%} ---> {:.3%}'
                                .format(snapshot_eval3, avg_three_valid_acc))
                            snapshot_eval3 = avg_three_valid_acc
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_3acc_{snapshot}.pth'))

            avg_train_loss /= len(train_dataloader)
            avg_train_accuracy /= len(train_dataloader)
            avg_three_train_acc /= len(train_dataloader)

            # End-of-epoch validation.
            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)
            writer.add_scalars(
                "data/metric/valid", {
                    'loss': avg_valid_loss,
                    'accuracy': avg_valid_accuracy,
                    '3accuracy': avg_three_valid_acc
                }, epoch)
            logger.debug(
                '======================== epoch {} ========================'.
                format(epoch + 1))
            logger.debug('lr : {:.5f}'.format(
                scheduler.get_lr()[0]))
            logger.debug(
                'loss : train={:.5f} , test={:.5f}'.format(
                    avg_train_loss, avg_valid_loss))
            logger.debug(
                'acc(per 1 char) : train={:.3%} , test={:.3%}'.format(
                    avg_train_accuracy, avg_valid_accuracy))
            logger.debug(
                'acc(per 3 char) : train={:.3%} , test={:.3%}'.format(
                    avg_three_train_acc, avg_three_valid_acc))
            if epoch == cycle_iter * snap_start:
                torch.save(
                    model.state_dict(),
                    os.path.join(outdir,
                                 f'model_epoch_{cycle_iter * snap_start}.pth'))
            if min_loss > avg_valid_loss:
                logger.debug('update best loss: {:.5f} ---> {:.5f}'.format(
                    min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_loss.pth'))
            if max_char_acc < avg_valid_accuracy:
                logger.debug(
                    'update best acc per 1 char: {:.3%} ---> {:.3%}'.format(
                        max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_acc.pth'))
            if max_3char_acc < avg_three_valid_acc:
                logger.debug(
                    'update best acc per 3 char: {:.3%} ---> {:.3%}'.format(
                        max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_3acc.pth'))
            if snapshot > 0:
                if snapshot_loss > avg_valid_loss:
                    logger.debug(
                        '[snap] update best loss: {:.5f} ---> {:.5f}'.format(
                            snapshot_loss, avg_valid_loss))
                    snapshot_loss = avg_valid_loss
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_loss_{snapshot}.pth'))
                if snapshot_eval < avg_valid_accuracy:
                    logger.debug(
                        '[snap] update best acc per 1 char: {:.3%} ---> {:.3%}'
                        .format(snapshot_eval, avg_valid_accuracy))
                    snapshot_eval = avg_valid_accuracy
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_acc_{snapshot}.pth'))
                if snapshot_eval3 < avg_three_valid_acc:
                    logger.debug(
                        '[snap] update best acc per 3 char: {:.3%} ---> {:.3%}'
                        .format(snapshot_eval3, avg_three_valid_acc))
                    snapshot_eval3 = avg_three_valid_acc
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_3acc_{snapshot}.pth'))
        # Close out the final snapshot.
        snapshot_loss_list.append(snapshot_loss)
        snapshot_eval_list.append(snapshot_eval)
        snapshot_eval3_list.append(snapshot_eval3)

        writer.add_scalars(
            "data/metric/valid", {
                'best loss': min_loss,
                'best accuracy': max_char_acc,
                'best 3accuracy': max_3char_acc
            })
        logger.debug('================ FINISH TRAIN ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()

        # Local cv: ensemble the per-snapshot best-loss checkpoints on the
        # validation set.
        target_list = list()
        for _, targets, _ in valid_dataloader:
            targets = targets.argmax(dim=2)
            target_list.append(targets)
        target_list = torch.cat(target_list)
        mb = master_bar(range(n_snap))
        valid_logit_dict = dict()
        init = True
        for i in mb:
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model, valid_dataloader, param['device'],
                            valid_logit_dict, div=n_snap, init=init)
            init = False
        pred_list = torch.stack(list(valid_logit_dict.values()))
        pred_list = pred_list.softmax(dim=2)
        local_accuracy = accuracy_three_character(pred_list, target_list)
        logger.debug('LOCAL CV : {:5%}'.format(local_accuracy))
        torch.save(valid_logit_dict,
                   os.path.join(outdir, f'fold{fold}_valid_logit.pth'))
        local_cv['fold{}'.format(fold)] = {
            'accuracy': local_accuracy,
            'valid_size': len(valid_dataset)
        }

        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        del valid_logit_dict, target_list
        gc.collect()

        logger.debug('=========== Prediction phrase ===========')
        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size'] * 12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False,
                                     drop_last=False, shuffle=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))

        # Snapshot-ensemble inference on the test set.
        test_logit_dict = dict()
        init = True
        for i in range(n_snap):
            logger.debug('load weight : {}'.format(
                os.path.join(outdir, f'best_loss_{i+1}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model, test_dataloader, param['device'],
                            test_logit_dict, div=n_snap, init=init)
            init = False
        torch.save(test_logit_dict, os.path.join(outdir, 'prediction.pth'))
        output_list = make_submission(test_logit_dict)
        pd.DataFrame(output_list).sort_values('ID').set_index('ID').to_csv(
            os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()

    # Ensemble: average the per-fold logits, ID by ID.
    print('======== Ensemble phase =========')
    emsemble_prediction = dict()
    mb = master_bar(param['fold'])
    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        # prediction is a dict: {ID: logit tensor, ...}
        if i == 0:
            for ID, logit in progress_bar(prediction.items(), parent=mb):
                emsemble_prediction[ID] = logit / len(param['fold'])
        else:
            for ID, logit in progress_bar(prediction.items(), parent=mb):
                emsemble_prediction[ID] += logit / len(param['fold'])

    # outdir = os.path.join(param['save path'], EXP_NAME, now_date)
    # file_handler = logging.FileHandler(os.path.join(outdir, 'result.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.info(' ========== RESULT ========== \n')
    # FIX: this initializer was commented out, which made the `cv += ...`
    # below raise NameError.
    cv = 0.0
    train_data_size = 0
    for fold in param['fold']:
        acc = local_cv['fold{}'.format(fold)]['accuracy']
        valid_size = local_cv['fold{}'.format(fold)]['valid_size']
        train_data_size += valid_size
        logger.info(' fold {} : {:.3%} \n'.format(fold, acc))
        cv += acc * valid_size
    logger.info(' Local CV : {:.3%} \n'.format(cv / train_data_size))
    logger.info(' ============================== \n')
    # logger.removeHandler(file_handler)
    #
    # torch.save(emsemble_prediction, os.path.join(outdir, 'prediction.pth'))
    #
    print('======== make submittion file =========')
    submit_list = make_submission(emsemble_prediction)
    pd.DataFrame(submit_list).sort_values('ID').set_index('ID').to_csv(
        os.path.join(outdir, 'test_prediction.csv'))
    print('success!')
def calculate_map(self,
                  model: nn.Module,
                  conf_threshold: float = 0.01,
                  iou_threshold: float = 0.50,
                  same_threshold: float = 0.45,
                  max_preds: int = 200,
                  plot: bool = True,
                  use_gpu: bool = True) -> float:
    """Compute the mean average precision (mAP) achieved by a model on this dataset.

    Args:
        model: detection model; called on image batches drawn from this dataset.
        conf_threshold: minimum confidence for a predicted box to survive NMS.
        iou_threshold: minimum IoU between a prediction and a ground-truth box
            for the prediction to count as a true positive.
        same_threshold: IoU above which two predictions are treated as
            duplicates during NMS.
        max_preds: maximum number of predictions kept per image.
        plot: if True, show a precision/recall curve for every class.
        use_gpu: if True, move image batches to CUDA before the forward pass.

    Returns:
        mAP: mean over the per-class average precisions.
    """
    model.eval()
    dl = DataLoader(self, batch_size=32, drop_last=False, num_workers=32)
    # First, populate a dictionary with class ids as keys, and tuples of
    # the form (confidence, is_correct, bbox) for each bounding box
    # predicted for that class
    predictions = defaultdict(list)
    num_targs = defaultdict(
        int)  # Dict with number of ground truth boxes for each class
    mb = master_bar(dl)
    for imgs, targs in mb:
        if use_gpu:
            imgs = imgs.cuda()
        preds = model(imgs).cpu()
        for pred, targ in progress_bar(list(zip(preds, targs)), parent=mb):
            # Process Targets
            targ_boxes, targ_classes, _ = tensor2boxes(
                self.matcher.default_boxes, targ)
            # Collapse near-duplicate ground-truth boxes (IoU > 0.95).
            targ_boxes, filtered_idxs = filter_overlapping_boxes(
                targ_boxes, iou_threshold=0.95)
            targ_classes = targ_classes[filtered_idxs]
            for targ_class in targ_classes:
                num_targs[targ_class.item()] += 1

            # Process Predictions
            pred_boxes, pred_classes, pred_confs = nms(
                pred, self.matcher.default_boxes, conf_threshold,
                same_threshold, max_preds)

            # Match Prediction to Targets for each Class
            matched_targs = set()
            for pred_box, pred_class, pred_conf in zip(
                    pred_boxes, pred_classes, pred_confs):
                # Indices of targets in the same class as the current prediction
                same_classes = (targ_classes == pred_class).float()
                same_class_idxs = set(
                    same_classes.nonzero().flatten().numpy())
                # Indices of targets that overlap sufficiently with the current prediction
                overlaps = jaccard_overlap(pred_box, targ_boxes)
                above_thresholds = (overlaps > iou_threshold).float()
                above_threshold_idxs = set(
                    above_thresholds.nonzero().flatten().numpy())
                # Indices of targets that are both in the same class and overlap sufficiently
                # with the current prediction
                valid_idxs = same_class_idxs.intersection(
                    above_threshold_idxs)
                # Target indices in order of decreasing overlap with the current prediction
                valid_idxs = list(valid_idxs)
                valid_idxs.sort(key=lambda idx: overlaps[idx], reverse=True)
                # Each ground-truth box may be matched by at most one prediction.
                valid_idxs = [
                    idx for idx in valid_idxs if idx not in matched_targs
                ]
                pred_box_matched = False
                if len(valid_idxs):
                    targ_idx = valid_idxs[0]
                    matched_targs.add(targ_idx)
                    pred_box_matched = True
                pred_conf = pred_conf.item()
                pred_box = pred_box.detach().cpu().numpy().tolist()
                predictions[pred_class].append(
                    (pred_conf, pred_box_matched, pred_box))

    # Calculate Average Precision for each Class
    all_classes = set(num_targs.keys()).union(predictions.keys())
    avg_precisions = []
    for class_idx in all_classes:
        tps, fps, fns = 0, 0, num_targs[class_idx]
        if fns == 0:
            # No ground-truth boxes for this class: scored as a perfect 1.
            # NOTE(review): this inflates mAP when spurious classes are
            # predicted — confirm this convention is intended.
            avg_precisions.append(1)
            continue
        precisions, recalls = [], []
        # Sort Predictions in order of decreasing confidence
        class_preds = predictions[class_idx]
        class_preds = [(conf, is_correct)
                       for conf, is_correct, _ in class_preds]
        class_preds.sort(
            reverse=True)  # Sort in order of decreasing confidence
        for _, is_correct in class_preds:
            if is_correct:
                tps += 1
                fns -= 1
            else:
                fps += 1
            precision = tps / (tps + fps) if tps + fps > 0 else 0
            recall = tps / (tps + fns) if tps + fns > 0 else 0
            # Keep a single precision point per distinct recall value.
            if not (recalls and recalls[-1] == recall):
                precisions.append(precision)
                recalls.append(recall)
        # Interpolated precision: best precision at this recall or any higher recall.
        precisions_adj = [
            max(precisions[idx:]) for idx in range(len(precisions))
        ]
        # Integrate precision over recall increments (area under the PR curve).
        avg_precision = 0
        for idx, precision in enumerate(precisions_adj[:-1]):
            increment = recalls[idx + 1] - recalls[idx]
            avg_precision += precision * increment
        print(
            f"\nAP for {self.categories[class_idx].capitalize()}: {round(avg_precision, 4)}"
        )
        if plot:
            plt.plot(recalls, precisions_adj)
            plt.title(self.categories[class_idx].capitalize())
            plt.xlabel("Recall")
            plt.ylim(0, 1)
            plt.xlim(0, 1)
            plt.ylabel("Precision")
            plt.show()
        avg_precisions.append(avg_precision)
    mean_avg_precision = np.mean(avg_precisions)
    return mean_avg_precision
def train(self, train_df, target_df):
    """Run fold-wise training and return the weighted cross-validation score.

    For every fold column in ``self.fold_df``, trains a fresh model, tracks
    the best epoch by validation score, writes the best weights to
    ``../logs/{run_name}/weight_best_{fold}.pt``, and accumulates
    out-of-fold predictions into ``self.oof``.

    Args:
        train_df: full training dataframe; rows are selected per fold.
        target_df: targets aligned with ``train_df``.

    Returns:
        cv: sum over folds of (best validation score * fold weight).
    """
    oof = np.zeros((len(train_df), self.cfg.model.n_classes))
    cv = 0
    for fold_, col in enumerate(self.fold_df.columns):
        print(
            f'\n========================== FOLD {fold_} ... ==========================\n'
        )
        logging.debug(
            f'\n========================== FOLD {fold_} ... ==========================\n'
        )
        # Rows with 0 in the fold column are train, >0 are validation.
        trn_x, val_x = train_df[self.fold_df[col] == 0], train_df[
            self.fold_df[col] > 0]
        val_y = target_df[self.fold_df[col] > 0].values

        train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

        # All components are resolved from the config via the factory module.
        model = factory.get_nn_model(self.cfg).to(device)
        criterion = factory.get_loss(self.cfg)
        optimizer = factory.get_optim(self.cfg, model.parameters())
        scheduler = factory.get_scheduler(self.cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(self.cfg.model.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = self._train_epoch(model, train_loader,
                                                criterion, optimizer, mb)
            valid_preds, avg_val_loss = self._val_epoch(
                model, valid_loader, criterion)

            val_score = factory.get_metrics(self.cfg.common.metrics.name)(
                val_y, valid_preds)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau needs the monitored metric; others step blindly.
            if self.cfg.scheduler.name != 'ReduceLROnPlateau':
                scheduler.step()
            elif self.cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)

            elapsed = time.time() - start_time
            mb.write(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            )
            logging.debug(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            )

            # Track the best epoch (higher val_score is better).
            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                if self.cfg.model.multi_gpu:
                    best_model = model.module.state_dict()
                else:
                    best_model = model.state_dict()

        oof[val_x.index, :] = best_valid_preds
        # NOTE(review): fold weight is fold_df[col].max() — presumably the
        # fold's sample fraction; confirm against how fold_df is built.
        cv += best_val_score * self.fold_df[col].max()

        torch.save(best_model,
                   f'../logs/{self.run_name}/weight_best_{fold_}.pt')
        self._save_loss_png(train_loss_list, val_loss_list, val_score_list,
                            fold_)

        print(f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}')
        logging.debug(
            f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}')

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    # NOTE(review): reshape(-1, 5) assumes n_classes == 5 — confirm.
    self.oof = oof.reshape(-1, 5)
    return cv
def get_opt_rf_params(x_trn:np.ndarray, y_trn:np.ndarray, x_val:np.ndarray, y_val:np.ndarray, objective:str,
                      w_trn:Optional[np.ndarray]=None, w_val:Optional[np.ndarray]=None,
                      params:Optional[OrderedDict]=None, n_estimators:int=40, verbose=True) \
        -> Tuple[Dict[str,float],Union[RandomForestRegressor,RandomForestClassifier]]:
    r'''
    Use an ordered parameter-scan to roughly optimise Random Forest hyper-parameters.

    Arguments:
        x_trn: training input data
        y_trn: training target data
        x_val: validation input data
        y_val: validation target data
        objective: string representation of objective: either 'classification' or 'regression'
        w_trn: training weights
        w_val: validation weights
        params: ordered dictionary mapping parameters to optimise to list of values to consider
        n_estimators: number of trees to use in each forest
        verbose: Print extra information and show a live plot of model performance

    Returns:
        params: dictionary mapping parameters to their optimised values
        rf: best performing Random Forest
    '''
    if params is None:
        # Default scan order: leaf size first, then feature fraction.
        params = OrderedDict({
            'min_samples_leaf': [1, 3, 5, 10, 25, 50, 100],
            'max_features': [0.3, 0.5, 0.7, 0.9]
        })
    # Pick the estimator class from the objective string.
    rf = RandomForestClassifier if 'class' in objective.lower(
    ) else RandomForestRegressor
    best_params = {
        'n_estimators': n_estimators,
        'n_jobs': -1,
        'max_features': 'sqrt'
    }
    best_scores = []  # running best score after each candidate tried
    scores = []  # score of every candidate tried, in order
    mb = master_bar(params)
    mb.names = ['Best', 'Scores']
    if verbose:
        mb.update_graph([[[], []], [[], []]])
    for param in mb:
        pb = progress_bar(params[param], parent=mb)
        pb.comment = f'{param} = {params[param][0]}'
        for i, value in enumerate(pb):
            # Display the *next* value while the current candidate trains.
            pb.comment = f'{param} = {params[param][min(i+1, len(params[param])-1)]}'
            m = rf(**{**best_params, param: value})
            m.fit(X=x_trn, y=y_trn, sample_weight=w_trn)
            scores.append(m.score(X=x_val, y=y_val, sample_weight=w_val))
            # First candidate always becomes the incumbent, so best_m is
            # guaranteed to be bound before the function returns.
            if len(best_scores) == 0 or scores[-1] > best_scores[-1]:
                best_scores.append(scores[-1])
                best_params[param] = value
                if verbose:
                    # Fixed typo in user-facing message ('schieved' -> 'achieved').
                    print(
                        f'Better score achieved: {param} @ {value} = {best_scores[-1]:.4f}'
                    )
                best_m = m
            else:
                best_scores.append(best_scores[-1])
            if verbose:
                mb.update_graph([[range(len(best_scores)), best_scores],
                                 [range(len(scores)), scores]])
    if verbose:
        delattr(mb, 'fig')
    if verbose:
        plt.clf()
    return best_params, best_m
def begin_fit(self, e: Event): self.mbar = master_bar(range(e.learn.epochs)) self.mbar.on_iter_begin() e.learn.logger = partial(self.mbar.write, table=True)
def run_training(model, optimizer, loss_function, device, num_epochs,
                 train_dataloader, val_dataloader, early_stopper=None, verbose=False):
    """Run model training.

    Args:
        model (nn.Module): Torch model to train
        optimizer: Torch optimizer object
        loss_function: Torch loss function for training
        device (torch.device): Torch device to use for training
        num_epochs (int): Max. number of epochs to train
        train_dataloader (DataLoader): Torch DataLoader object to load the
            training data
        val_dataloader (DataLoader): Torch DataLoader object to load the
            validation data
        early_stopper (EarlyStopper, optional): If passed, model will be trained
            with early stopping. Defaults to None.
        verbose (bool, optional): Print information about model training.
            Defaults to False.

    Returns:
        list, list, list, list, torch.Tensor shape (10,10): Return list of train losses,
            validation losses, train accuracies, validation accuracies per epoch and the
            confusion matrix evaluated in the last epoch.
    """
    start_time = time.time()
    # NOTE: this local name shadows fastprogress.master_bar inside the function.
    master_bar = fastprogress.master_bar(range(num_epochs))
    train_losses, val_losses, train_accs, val_accs = [],[],[],[]

    for epoch in master_bar:
        # Train the model
        epoch_train_loss, epoch_train_acc = train(train_dataloader, optimizer,
                                                  model, loss_function, device,
                                                  master_bar)
        # Validate the model
        epoch_val_loss, epoch_val_acc, confusion_matrix = validate(val_dataloader,
                                                                   model,
                                                                   loss_function,
                                                                   device,
                                                                   master_bar)

        # Save loss and acc for plotting
        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
        train_accs.append(epoch_train_acc)
        val_accs.append(epoch_val_acc)

        if verbose:
            master_bar.write(f'Train loss: {epoch_train_loss:.2f}, val loss: {epoch_val_loss:.2f}, train acc: {epoch_train_acc:.3f}, val acc {epoch_val_acc:.3f}')

        if early_stopper:
            # Stop once validation accuracy stops improving and restore the
            # best checkpoint seen so far.
            early_stopper.update(epoch_val_acc, model)
            if early_stopper.early_stop:
                early_stopper.load_checkpoint(model)
                print("Early stopping, since the validation accuracy did not increase. Epoch: {}".format(epoch))
                break

    time_elapsed = np.round(time.time() - start_time, 0).astype(int)
    print(f'Finished training after {time_elapsed} seconds.')
    # NOTE(review): confusion_matrix is unbound if num_epochs == 0 — confirm
    # callers never pass zero epochs.
    return train_losses, val_losses, train_accs, val_accs, confusion_matrix
mito_train = [fn for fn in hr_mito] neuron_path = datasources / 'live_neuron_mito_timelapse_for_deep_learning' two_channel = list(neuron_path.glob('*MTGreen*.czi')) one_channel = [x for x in neuron_path.glob('*.czi') if x not in two_channel] airyscan_path = datasources / 'Airyscan_processed_data_from_the_server' hr_airyscan = list(airyscan_path.glob('*.czi')) for fn in hr_mito: if '03-Airyscan' in fn.stem: valid_files.append(fn) else: train_files.append(fn) for lst in [hr_airyscan, one_channel, two_channel]: lst.sort() random.shuffle(lst) split_idx = int(valid_pct * len(lst)) print(split_idx) valid_files += lst[-split_idx:] train_files += lst[:-split_idx] for subdir, file_list in [('train', train_files), ('valid', valid_files)]: print(f'\n\ncopy, crappify and upsample {subdir} files\n\n') pbar = master_bar(file_list) for czi_fn in pbar: czi_to_multiframe(czi_fn, hr_path / subdir, lr_path / subdir, lr_up_path / subdir, pbar=pbar)
def train_old(model, optimizer, criterion, dataset, batch_size, num_epochs, num_workers=0, half=True):
    """Legacy train/validation loop over ``dataset['train']`` and ``dataset['val']``.

    Args:
        model: network to train; batches are moved to the module-level ``device``.
        optimizer: optimizer whose LR is decayed by the StepLR scheduler below.
        criterion: loss applied to (outputs, labels).
        dataset: dict holding 'train' and 'val' torch Datasets.
        batch_size: mini-batch size for both loaders.
        num_epochs: number of epochs to run.
        num_workers: DataLoader worker count.
        half: if True, cast inputs to float16 before the forward pass.

    Side effects: appends per-epoch loss/accuracy to the module-level
    ``metrics`` dict, updates the progress-bar graph, prints a summary.
    Returns nothing.
    """
    dataloader = {
        x: torch.utils.data.DataLoader(dataset[x],
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=num_workers)
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(dataset[x]) for x in ['train', 'val']}

    # Decay LR by a factor of 0.1 every 7 epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=7,
                                                gamma=0.1)

    since = time.time()
    best_acc = 0.0
    #best_model_wts = copy.deepcopy(model.state_dict())

    mb = master_bar(range(num_epochs))
    mb.names = ['train', 'val']
    mb.write("Epoch\tTrn_loss\tVal_loss\tTrn_acc\t\tVal_acc")

    # Iterate epochs
    #for epoch in range(num_epochs):
    for epoch in mb:
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                # NOTE(review): scheduler.step() before optimizer.step()
                # triggers an ordering warning on modern PyTorch — confirm.
                scheduler.step()  # Scheduling the learning rate
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            #for inputs, labels in dataloader[phase]:
            for inputs, labels in progress_bar(dataloader[phase], parent=mb):
                inputs = inputs.to(device)
                labels = labels.to(device)
                if half:
                    inputs = inputs.half()
                optimizer.zero_grad()  # zero the parameter gradients
                outputs = model(inputs)  # forward
                preds = torch.argmax(outputs, dim=1)  # prediction
                loss = criterion(outputs, labels)  # loss
                if phase == 'train':
                    loss.backward()  # backward
                if phase == 'train':
                    optimizer.step()  # optimize

                # statistics
                running_loss += loss.item() * inputs.size(
                    0
                )  # multiply by batch size since nn.CrossEntropyLoss averages by default
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            metrics[phase]["loss"].append(epoch_loss)
            metrics[phase]["acc"].append(epoch_acc)
            #draw_plot(metrics)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                #best_model_wts = copy.deepcopy(model.state_dict())

        # Refresh the accuracy graph and the per-epoch summary row.
        x = list(range(len(metrics["train"]["acc"])))
        graphs = [[x, metrics["train"]["acc"]], [x, metrics["val"]["acc"]]]
        mb.update_graph(graphs)
        mb.write("{}/{}\t{:06.6f}\t{:06.6f}\t{:06.6f}\t{:06.6f}".format(
            epoch + 1, num_epochs, metrics["train"]["loss"][-1],
            metrics["val"]["loss"][-1], metrics["train"]["acc"][-1],
            metrics["val"]["acc"][-1]))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
def main(args):
    """Train a torchvision classifier on the OpenFire dataset.

    Builds train/validation loaders with standard ImageNet normalization,
    replaces the backbone's final ``fc`` layer to match the dataset's class
    count, trains with Adam + OneCycleLR, and saves a checkpoint whenever
    the validation loss improves.

    Args:
        args: parsed CLI namespace (device, resize, data_path, batch_size,
            workers, model, pretrained, weight_decay, lr, epochs,
            div_factor, final_div_factor, output_dir, checkpoint,
            deterministic).
    """
    if args.deterministic:
        set_seed(42)

    # Set device
    if args.device is None:
        if torch.cuda.is_available():
            args.device = 'cuda:0'
        else:
            args.device = 'cpu'

    # ImageNet statistics, matching the pretrained backbone.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop((args.resize, args.resize)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(), normalize
    ])
    test_transforms = transforms.Compose([
        transforms.Resize((args.resize, args.resize)),
        transforms.ToTensor(), normalize
    ])

    # Train & test sets
    train_set = OpenFire(root=args.data_path, train=True, download=True,
                         valid_pct=0.2, transform=train_transforms)
    val_set = OpenFire(root=args.data_path, train=False, download=True,
                       valid_pct=0.2, transform=test_transforms)
    num_classes = len(train_set.classes)

    # Samplers
    train_sampler = torch.utils.data.RandomSampler(train_set)
    test_sampler = torch.utils.data.SequentialSampler(val_set)

    # Data loader
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(val_set,
                                              batch_size=args.batch_size,
                                              sampler=test_sampler,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # Model definition
    model = torchvision.models.__dict__[args.model](pretrained=args.pretrained)

    # Change fc: swap the final layer for the dataset's class count.
    in_features = getattr(model, 'fc').in_features
    setattr(model, 'fc', nn.Linear(in_features, num_classes))
    model.to(args.device)

    # Loss function
    criterion = nn.CrossEntropyLoss()
    # optimizer
    # NOTE(review): no lr passed to Adam — OneCycleLR below drives the
    # learning rate via max_lr; confirm this is intentional.
    optimizer = optim.Adam(model.parameters(),
                           betas=(0.9, 0.99),
                           weight_decay=args.weight_decay)
    # Scheduler
    lr_scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=args.lr,
        epochs=args.epochs,
        steps_per_epoch=len(train_loader),
        cycle_momentum=(not isinstance(optimizer, optim.Adam)),
        div_factor=args.div_factor,
        final_div_factor=args.final_div_factor)

    best_loss = math.inf
    mb = master_bar(range(args.epochs))
    for epoch_idx in mb:
        # Training
        train_loss = train_epoch(model, train_loader, optimizer, criterion,
                                 master_bar=mb, epoch=epoch_idx,
                                 scheduler=lr_scheduler, device=args.device)

        # Evaluation
        val_loss, acc = evaluate(model, test_loader, criterion,
                                 device=args.device)

        mb.first_bar.comment = f"Epoch {epoch_idx+1}/{args.epochs}"
        mb.write(
            f'Epoch {epoch_idx+1}/{args.epochs} - Training loss: {train_loss:.4} | Validation loss: {val_loss:.4} | Error rate: {1 - acc:.4}'
        )

        # State saving: checkpoint whenever validation loss improves.
        if val_loss < best_loss:
            print(
                f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state..."
            )
            best_loss = val_loss
            if args.output_dir:
                torch.save(
                    dict(model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         lr_scheduler=lr_scheduler.state_dict(),
                         epoch=epoch_idx,
                         args=args),
                    Path(args.output_dir, f"{args.checkpoint}.pth"))
def experiment_blue_print(
        output_root_dir=None,
        cv_run_num=None,
        ds_train_name=None,
        ds_test_name=None,
        ds_normalization=None,
        num_train_samples=None,
        num_augmentations=None,
        type_augmentation=None,
        num_intra_samples=None,
        model_name=None,
        batch_size=None,
        num_epochs=None,
        cls_loss_fn=None,
        lr_init=None,
        w_top_loss=None,
        top_scale=None,
        weight_decay_cls=None,
        weight_decay_feat_ext=None,
        normalize_gradient=None,
        pers_type=None,
        compute_persistence=None,
        track_model=None,
        tag=''):
    """Run one full experiment: cross-validated training with an optional
    topological (persistence-based) regularizer on the feature space.

    All keyword arguments except ``tag`` are mandatory; None values are
    rejected below so a sweep script cannot silently omit a setting.
    Logs, metric values and model checkpoints are written under a fresh
    experiment directory inside ``output_root_dir``.
    """
    # Snapshot of all arguments; also persisted by the ExperimentLogger.
    args = dict(locals())
    print(args)
    if not all(((v is not None) for k, v in args.items())):
        s = ', '.join((k for k, v in args.items() if v is None))
        raise AssertionError("Some kwargs are None: {}!".format(s))
    # A topological loss weight without persistence computation is a misconfig.
    if w_top_loss > 0 and not compute_persistence:
        raise AssertionError('w_top_loss > 0 and compute_persistence == False')

    exp_id = get_experiment_id(tag)
    output_dir = Path(output_root_dir) / exp_id
    output_dir.mkdir()

    logger = ExperimentLogger(output_dir, args)

    track_accuracy = True

    """
    Get the splits for the training data.
    """
    DS_TRAIN_ORIGINAL_SPLITS = ds_factory_stratified_shuffle_split(
        ds_train_name, num_train_samples)
    DS_TEST_ORIGINAL = ds_factory(ds_test_name)

    assert len(DS_TRAIN_ORIGINAL_SPLITS) >= cv_run_num
    DS_TRAIN_ORIGINAL_SPLITS = DS_TRAIN_ORIGINAL_SPLITS[:cv_run_num]

    pers_fn = persistence_fn_factory(args['pers_type'])
    cls_loss_fn = cls_loss_fn_factory(args['cls_loss_fn'])

    """
    Run over the dataset splits; the splits are fixed for each number of
    training samples (500,1000,4000, etc.)
    """
    for run_i, DS_TRAIN_ORIGINAL in enumerate(DS_TRAIN_ORIGINAL_SPLITS):
        assert len(DS_TRAIN_ORIGINAL) == num_train_samples
        logger.new_run()

        dl_train, DS_TRAIN, DS_TEST, num_classes = setup_data_for_training(
            ds_train_original=DS_TRAIN_ORIGINAL,
            ds_test_original=DS_TEST_ORIGINAL,
            ds_normalization=ds_normalization,
            type_augmentation=type_augmentation,
            num_augmentations=num_augmentations,
            num_intra_samples=num_intra_samples,
            batch_size=batch_size
        )

        model = model_factory(model_name, num_classes)
        model = model.to(DEVICE)
        print(model)

        # Separate weight decay for the feature extractor and classifier head.
        opt = torch.optim.SGD(
            [
                {'params': model.feat_ext.parameters(
                ), 'weight_decay': weight_decay_feat_ext},
                {'params': model.cls.parameters(), 'weight_decay': weight_decay_cls}
            ],
            lr=lr_init,
            momentum=0.9,
            nesterov=True)

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, T_max=num_epochs, eta_min=0, last_epoch=-1)

        mb = master_bar(range(num_epochs))
        mb_comment = ''
        for epoch_i in mb:
            model.train()
            epoch_loss = 0

            # progress_bar drives one fewer step than len(dl_train).
            L = len(dl_train)-1
            for b_i, ((batch_x, batch_y), _) in enumerate(zip(dl_train, progress_bar(range(L), parent=mb))):
                # Each element of batch_x carries num_intra_samples*num_augmentations samples.
                n = batch_x[0].size(0)
                assert n == num_intra_samples*num_augmentations
                assert all(((x.size(0) == n) for x in batch_x))

                x, y = torch.cat(batch_x, dim=0), torch.cat(batch_y, dim=0)
                x, y = x.to(DEVICE), y.to(DEVICE)

                y_hat, z = model(x)
                l_cls = cls_loss_fn(y_hat, y)

                l_top = torch.tensor(0.0).to(DEVICE)
                if compute_persistence:
                    # Topological penalty: pull each group's persistence
                    # values (column 1 of the diagram) towards top_scale.
                    for i in range(batch_size):
                        z_sample = z[i*n: (i+1)*n, :].contiguous()
                        lt = pers_fn(z_sample)[0][0][:, 1]
                        logger.log_value('batch_lt', lt)
                        l_top = l_top + (lt-top_scale).abs().sum()
                    l_top = l_top / float(batch_size)

                l = l_cls + w_top_loss * l_top

                opt.zero_grad()
                l.backward()

                # gradient norm and (optional) gradient normalization
                grad_vec_abs = torch.cat(
                    [p.grad.data.view(-1) for p in model.parameters()], dim=0).abs()
                grad_norm = grad_vec_abs.pow(2).sum().sqrt().item()
                if grad_norm > 0 and normalize_gradient:
                    for p in model.parameters():
                        p.grad.data /= grad_norm

                opt.step()

                epoch_loss += l.item()
                logger.log_value('batch_cls_loss', l_cls)
                logger.log_value('batch_top_loss', l_top)
                logger.log_value('batch_grad_norm', grad_norm)
                logger.log_value('batch_grad_abs_max', grad_vec_abs.max())
                logger.log_value('batch_grad_abs_min', grad_vec_abs.min())
                logger.log_value('batch_grad_abs_mean', grad_vec_abs.mean())
                logger.log_value('batch_grad_abs_std', grad_vec_abs.std())

            logger.log_value('lr', scheduler.get_last_lr()[0])
            logger.log_value(
                'cls_norm', model.cls[0].weight.data.view(-1).norm())
            scheduler.step()
            mb_comment = "Last loss: {:.2f} {:.4f} ".format(
                epoch_loss, w_top_loss)

            track_accuracy = True
            if track_accuracy:
                # Accuracy on the (augmented) training set and the test set.
                X, Y = apply_model(model, DS_TRAIN, device=DEVICE)
                acc_train = argmax_and_accuracy(X, Y)
                logger.log_value('acc_train', acc_train)
                mb_comment += " | acc. train {:.2f} ".format(acc_train)

                X, Y = apply_model(model, DS_TEST, device=DEVICE)
                acc_test = argmax_and_accuracy(X, Y)
                logger.log_value('acc_test', acc_test)
                mb_comment += " | acc. test {:.2f} ".format(acc_test)

            logger.log_value('epoch_i', epoch_i)
            mb.main_bar.comment = mb_comment
            logger.write_logged_values_to_disk()
            if track_model:
                logger.write_model_to_disk('model_epoch_{}'.format(epoch_i), model)

        # Final model of this run.
        logger.write_model_to_disk('model', model)
3, skip=0, is_transform=True, crop=CROP_SIZE) train = torch.utils.data.DataLoader(train_folder, batch_size=BATCH_SIZE, shuffle=True, num_workers=12) valid = torch.utils.data.DataLoader(valid_folder, batch_size=BATCH_SIZE, shuffle=True, num_workers=12) lr = INIT_LR #loss = torch.nn.MSELoss(reduction='mean') loss = torch.nn.MSELoss(reduction='mean') mb = master_bar(range(EPOCHS)) best_loss = float('inf') valid_loss = float('inf') count = 0 sum_loss = 0 model = model.eval() for X, Y in progress_bar(valid, parent=mb, txt_len=100): X = X.cuda() #*2/255-1 #E = E.cuda() Y = Y.cuda() #*2/255-1 with torch.set_grad_enabled(False): out = model(X, None) l = loss(out, Y) * X.size()[0] count += X.size()[0] sum_loss += l
def train_model(x_trn, x_val, config, num_classes, weights, device):
    """Train a Bengali grapheme classifier on one train/validation split.

    Args:
        x_trn: training dataframe consumed by BengaliDataset.
        x_val: validation dataframe; must contain the columns
            'grapheme_root', 'vowel_diacritic' and 'consonant_diacritic'.
        config: experiment config (model_params, loss, optimizer, scheduler,
            augmentation, batch_size, epochs, early_stopping, ...).
        num_classes: three class counts (grapheme root, vowel diacritic,
            consonant diacritic) in that order.
        weights: dict of per-class weight arrays keyed by target name.
        device: torch device for the model and batches.

    Returns:
        (best_model_state_dict,
         [best_gr_preds, best_vo_preds, best_co_preds],
         best_val_score, train_loss_list, val_loss_list, val_score_list)
    """
    # Ground-truth labels of the three validation targets.
    y_gr_val = x_val['grapheme_root']
    y_vo_val = x_val['vowel_diacritic']
    y_co_val = x_val['consonant_diacritic']

    model_params = config.model_params

    # Augmentation is only applied to the training split.
    train_dataset = BengaliDataset(x_trn,
                                   n_channels=model_params.n_channels,
                                   img_size=config.img_size,
                                   transforms=config.augmentation)
    valid_dataset = BengaliDataset(x_val,
                                   n_channels=model_params.n_channels,
                                   img_size=config.img_size,
                                   transforms=None)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=3)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.batch_size,
                              shuffle=False,
                              num_workers=3)

    del train_dataset, valid_dataset
    gc.collect()

    # Backbone selection by substring match on the model name.
    if 'se_resnext' in model_params.model_name:
        model = SeNet(model_name=model_params.model_name,
                      n_channels=model_params.n_channels,
                      n_classes=model_params.n_classes,
                      pretrained=model_params.pretrained).to(device)
    elif 'resnetd' in model_params.model_name:
        model = ResNetD(model_name=model_params.model_name,
                        n_channels=model_params.n_channels,
                        n_classes=model_params.n_classes).to(device)
    elif 'resne' in model_params.model_name:
        model = ResNet(model_name=model_params.model_name,
                       n_channels=model_params.n_channels,
                       n_classes=model_params.n_classes,
                       pretrained=model_params.pretrained).to(device)
    elif 'densenet' in model_params.model_name:
        model = DenseNet(model_name=model_params.model_name,
                         n_channels=model_params.n_channels,
                         n_classes=model_params.n_classes,
                         pretrained=model_params.pretrained).to(device)
    elif 'efficient' in model_params.model_name:
        model = ENet(model_name=model_params.model_name,
                     n_channels=model_params.n_channels,
                     n_classes=model_params.n_classes,
                     pretrained=model_params.pretrained).to(device)

    # Optionally warm-start from a previous run's best weights.
    if config.model_state_fname is not None:
        model.load_state_dict(
            torch.load(f'../logs/{config.model_state_fname}/weight_best.pt'))
    # relu_replace(model)
    # bn_replace(model)

    weights_gr = torch.from_numpy(weights['grapheme_root']).cuda()
    weights_vo = torch.from_numpy(weights['vowel_diacritic']).cuda()
    weights_co = torch.from_numpy(weights['consonant_diacritic']).cuda()

    # One criterion per target head.
    if config.loss == 'CrossEntropyLoss':
        # criterion_gr = nn.CrossEntropyLoss(weight=weights_gr)
        # criterion_vo = nn.CrossEntropyLoss(weight=weights_vo)
        # criterion_co = nn.CrossEntropyLoss(weight=weights_co)
        criterion_gr = nn.CrossEntropyLoss()
        criterion_vo = nn.CrossEntropyLoss()
        criterion_co = nn.CrossEntropyLoss()
    elif config.loss == 'SmoothCrossEntropyLoss':
        criterion_gr = SmoothCrossEntropyLoss()
        criterion_vo = SmoothCrossEntropyLoss()
        criterion_co = SmoothCrossEntropyLoss()
    elif config.loss == 'FocalLoss':
        criterion_gr = FocalLoss()
        criterion_vo = FocalLoss()
        criterion_co = FocalLoss()
    elif config.loss == 'ClassBalancedLoss':
        criterion_gr = ClassBalancedLoss(samples_per_cls=weights_gr,
                                         no_of_classes=num_classes[0],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
        criterion_vo = ClassBalancedLoss(samples_per_cls=weights_vo,
                                         no_of_classes=num_classes[1],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
        criterion_co = ClassBalancedLoss(samples_per_cls=weights_co,
                                         no_of_classes=num_classes[2],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
    elif config.loss == 'OhemLoss':
        criterion_gr = OhemLoss(rate=1.0)
        criterion_vo = OhemLoss(rate=1.0)
        criterion_co = OhemLoss(rate=1.0)

    if config.optimizer.type == 'Adam':
        optimizer = Adam(params=model.parameters(),
                         lr=config.optimizer.lr,
                         amsgrad=False,
                         weight_decay=1e-4)
    elif config.optimizer.type == 'SGD':
        optimizer = SGD(params=model.parameters(),
                        lr=config.optimizer.lr,
                        momentum=0.9,
                        weight_decay=1e-4,
                        nesterov=True)

    # scheduler_flg marks whether any scheduler was configured.
    scheduler_flg = False
    if config.scheduler.type == 'cosine':
        scheduler_flg = True
        scheduler = CosineAnnealingLR(optimizer,
                                      T_max=config.scheduler.t_max,
                                      eta_min=config.scheduler.eta_min)
    elif config.scheduler.type == 'cosine-warmup':
        scheduler_flg = True
        scheduler = CosineAnnealingWarmUpRestarts(
            optimizer,
            T_0=config.scheduler.t_0,
            T_mult=config.scheduler.t_mult,
            eta_max=config.scheduler.eta_max,
            T_up=config.scheduler.t_up,
            gamma=config.scheduler.gamma)
    elif config.scheduler.type == 'step':
        scheduler_flg = True
        scheduler = StepLR(optimizer,
                           step_size=config.scheduler.step_size,
                           gamma=config.scheduler.gamma)
    elif config.scheduler.type == 'reduce':
        scheduler_flg = True
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=config.scheduler.factor,
                                      patience=config.scheduler.patience,
                                      min_lr=config.scheduler.min_lr)

    best_epoch = -1
    best_val_score = -np.inf
    mb = master_bar(range(config.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []

    counter = 0  # epochs since the last improvement (early stopping)
    for epoch in mb:
        start_time = time.time()

        model.train()
        avg_loss = 0.
        for images, labels_gr, labels_vo, labels_co in progress_bar(
                train_loader, parent=mb):
            images = Variable(images).to(device)
            labels_gr = Variable(labels_gr).to(device)
            labels_vo = Variable(labels_vo).to(device)
            labels_co = Variable(labels_co).to(device)

            # OHEM keep-rate is annealed over the course of training.
            if config.loss == 'OhemLoss':
                if epoch < config.epochs * 0.2:
                    new_rate = 1.0
                elif epoch < config.epochs * 0.4:
                    new_rate = 0.8
                elif epoch < config.epochs * 0.6:
                    new_rate = 0.75
                elif epoch < config.epochs * 0.8:
                    new_rate = 0.7
                else:
                    new_rate = 0.6
                criterion_gr.update_rate(new_rate)
                criterion_vo.update_rate(new_rate)
                criterion_co.update_rate(new_rate)

            # Randomly apply mixup / cutmix with configured probabilities.
            r = np.random.rand()
            mix_params = config.augmentation.mix_params
            if r < mix_params.mixup:
                images, targets = mixup(images, labels_gr, labels_vo,
                                        labels_co, 1.0)
                preds_gr, preds_vo, preds_co = model(images)
                loss = mixup_criterion(preds_gr, preds_vo, preds_co, targets,
                                       criterion_gr, criterion_vo,
                                       criterion_co)
            elif r < (mix_params.mixup + mix_params.cutmix):
                images, targets = cutmix(images, labels_gr, labels_vo,
                                         labels_co, 1.0)
                preds_gr, preds_vo, preds_co = model(images)
                loss = cutmix_criterion(preds_gr, preds_vo, preds_co, targets,
                                        criterion_gr, criterion_vo,
                                        criterion_co)
            else:
                preds_gr, preds_vo, preds_co = model(images.float())
                loss = criterion_gr(preds_gr, labels_gr) \
                    + criterion_vo(preds_vo, labels_vo) \
                    + criterion_co(preds_co, labels_co)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        train_loss_list.append(avg_loss)

        model.eval()
        valid_gr_preds = np.zeros((len(valid_loader.dataset), num_classes[0]))
        valid_vo_preds = np.zeros((len(valid_loader.dataset), num_classes[1]))
        valid_co_preds = np.zeros((len(valid_loader.dataset), num_classes[2]))
        avg_val_loss = 0.
        for i, (images, labels_gr, labels_vo,
                labels_co) in enumerate(valid_loader):
            images = Variable(images).to(device)
            labels_gr = Variable(labels_gr).to(device)
            labels_vo = Variable(labels_vo).to(device)
            labels_co = Variable(labels_co).to(device)

            preds_gr, preds_vo, preds_co = model(images.float())
            loss_gr = criterion_gr(preds_gr, labels_gr)
            loss_vo = criterion_vo(preds_vo, labels_vo)
            loss_co = criterion_co(preds_co, labels_co)
            valid_gr_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_gr.cpu().detach().numpy()
            valid_vo_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_vo.cpu().detach().numpy()
            valid_co_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_co.cpu().detach().numpy()
            avg_val_loss += (loss_gr.item() + loss_vo.item() +
                             loss_co.item()) / len(valid_loader)

        # Competition metric: macro recall with grapheme root double-weighted.
        recall_gr = recall_score(y_gr_val,
                                 np.argmax(valid_gr_preds, axis=1),
                                 average='macro')
        recall_vo = recall_score(y_vo_val,
                                 np.argmax(valid_vo_preds, axis=1),
                                 average='macro')
        recall_co = recall_score(y_co_val,
                                 np.argmax(valid_co_preds, axis=1),
                                 average='macro')
        val_score = np.average([recall_gr, recall_vo, recall_co],
                               weights=[2, 1, 1])

        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        if scheduler_flg and config.scheduler.type != 'reduce':
            scheduler.step()
        elif scheduler_flg and config.scheduler.type == 'reduce':
            scheduler.step(avg_val_loss)

        elapsed = time.time() - start_time
        mb.write(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} val_gr_score: {recall_gr:.4f} val_vo_score: {recall_vo:.4f} val_co_score: {recall_co:.4f} time: {elapsed:.0f}s'
        )
        logging.debug(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} val_gr_score: {recall_gr:.4f} val_vo_score: {recall_vo:.4f} val_co_score: {recall_co:.4f} time: {elapsed:.0f}s'
        )

        if best_val_score < val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_recall_gr = recall_gr
            best_recall_vo = recall_vo
            best_recall_co = recall_co
            best_valid_gr_preds = valid_gr_preds
            best_valid_vo_preds = valid_vo_preds
            best_valid_co_preds = valid_co_preds
            best_model = model.state_dict()
            counter = 0
        # NOTE(review): counter is incremented even right after a reset, so an
        # improving epoch still counts as 1 — confirm early_stopping semantics.
        counter += 1
        if counter == config.early_stopping:
            break

    print('\n\n===================================\n')
    print(f'CV: {best_val_score}\n')
    print(f'BEST EPOCH: {best_epoch}')
    print(f'BEST RECALL GR: {best_recall_gr}')
    print(f'BEST RECALL VO: {best_recall_vo}')
    print(f'BEST RECALL CO: {best_recall_co}')
    logging.debug(f'\n\nCV: {best_val_score}\n')
    logging.debug(f'BEST EPOCH: {best_epoch}')
    logging.debug(f'BEST RECALL GR: {best_recall_gr}')
    logging.debug(f'BEST RECALL VO: {best_recall_vo}')
    logging.debug(f'BEST RECALL CO: {best_recall_co}\n\n')
    print('\n===================================\n\n')
    return best_model, [
        best_valid_gr_preds, best_valid_vo_preds, best_valid_co_preds
    ], best_val_score, train_loss_list, val_loss_list, val_score_list
def train_model(x_train, y_train, train_transforms,
                num_epochs=80, batch_size=64, test_batch_size=256,
                lr=3e-3, eta_min=1e-5, t_max=10):
    """Train a multi-label Classifier on a 80/20 split of (x_train, y_train).

    Parameters
    ----------
    x_train, y_train : array-like
        Full training inputs and multi-hot label matrix; ``y_train.shape[1]``
        defines the number of classes.
    train_transforms :
        Augmentation pipeline passed to ``FATTrainDataset`` (note: the
        original code applies it to the validation split as well — kept
        as-is to preserve behavior; TODO confirm this is intended).
    num_epochs, batch_size, test_batch_size, lr, eta_min, t_max : optional
        Training hyperparameters; defaults match the previous hard-coded
        values, so existing callers are unaffected.

    Returns
    -------
    dict with keys ``'best_epoch'`` and ``'best_lwlrap'``. The best model
    weights (by validation lwlrap) are saved to ``'weight_best.pt'`` as a
    side effect.
    """
    num_classes = y_train.shape[1]

    # Hold out 20% for validation; SEED is a module-level constant.
    x_trn, x_val, y_trn, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=SEED)

    train_dataset = FATTrainDataset(x_trn, y_trn, train_transforms)
    valid_dataset = FATTrainDataset(x_val, y_val, train_transforms)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=test_batch_size,
                              shuffle=False)

    model = Classifier(num_classes=num_classes).cuda()
    criterion = nn.BCEWithLogitsLoss().cuda()
    optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    best_epoch = -1
    best_lwlrap = 0.

    mb = master_bar(range(num_epochs))
    for epoch in mb:
        start_time = time.time()

        # ---- training pass ----
        model.train()
        avg_loss = 0.
        for x_batch, y_batch in progress_bar(train_loader, parent=mb):
            preds = model(x_batch.cuda())
            loss = criterion(preds, y_batch.cuda())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running mean: each batch contributes loss / n_batches.
            avg_loss += loss.item() / len(train_loader)

        # ---- validation pass ----
        model.eval()
        valid_preds = np.zeros((len(x_val), num_classes))
        avg_val_loss = 0.
        # FIX: run inference under no_grad so no autograd graph is built
        # (the previous per-tensor .detach() only discarded it afterwards).
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                preds = model(x_batch.cuda())
                loss = criterion(preds, y_batch.cuda())

                preds = torch.sigmoid(preds)
                valid_preds[i * test_batch_size:(i + 1) *
                            test_batch_size] = preds.cpu().numpy()
                avg_val_loss += loss.item() / len(valid_loader)

        # Label-weighted lwlrap: per-class score weighted by class frequency.
        score, weight = calculate_per_class_lwlrap(y_val, valid_preds)
        lwlrap = (score * weight).sum()

        scheduler.step()

        if (epoch + 1) % 5 == 0:
            elapsed = time.time() - start_time
            mb.write(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_lwlrap: {lwlrap:.6f} time: {elapsed:.0f}s'
            )

        # Checkpoint whenever validation lwlrap improves.
        if lwlrap > best_lwlrap:
            best_epoch = epoch + 1
            best_lwlrap = lwlrap
            torch.save(model.state_dict(), 'weight_best.pt')

    return {
        'best_epoch': best_epoch,
        'best_lwlrap': best_lwlrap,
    }
args, abs_, optimize_both_exp=False, batchnorm=True, prior_var=80, device=device) model = model.to(device) model.apply(EXPVAEWAVE.weight_init) optimizer = optim.Adam(list(model.parameters()) + [model.exps], lr=args.learning_rate, weight_decay=0, betas=(args.beta_one, args.beta_two)) train_losses = [] n_epochs = args.epochs mb = master_bar(range(20)) y_ax_index = 0 for i in mb: epoch = i for j in progress_bar(range(int(n_epochs / 20)), parent=mb): model = EXPVAEWAVE.train(model, device, args, optimizer, train_loader, epoch, train_losses) # x = range(len(train_losses)) # y = train_losses # graphs = [[x,y]] # y_bounds = [0,train_losses[0]] # mb.update_graph(graphs, y_bounds=y_bounds) mb.write(f'Avg. Training Loss: {train_losses[-1]}.') # To get the location estimates for each spike in the recording, we run them through the inference network and then average the location estimates belonging to the same event (this is described in the manuscript in the amplitude jitter portion of the paper).
def main(): n_epoch = 10 now = datetime.datetime.now() now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format( now.year, now.month, now.day, now.hour, now.minute, now.second) # set logger logger = logging.getLogger("Log") logger.setLevel(logging.DEBUG) handler_format = logging.Formatter( '%(asctime)s - %(levelname)s - %(message)s') print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour, now.minute, now.second)) with open('../params/stacking.yaml', "r+") as f: param = yaml.load(f, Loader=yaml.FullLoader) seed_setting(param['seed']) if torch.cuda.is_available(): torch.backends.cudnn.benchmark = True local_cv = dict() for fold in range(5): outdir = os.path.join(param['save path'], EXP_NAME, now_date, 'fold{}'.format(fold)) if os.path.exists(param['save path']): os.makedirs(outdir, exist_ok=True) else: print("Not find {}".format(param['save path'])) raise FileNotFoundError file_handler = logging.FileHandler( os.path.join(outdir, 'experiment.log')) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(handler_format) logger.addHandler(file_handler) logger.debug('============= FOLD {} ============='.format(fold)) logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour, now.minute, now.second)) print(f'fold - {fold}') print('load data set') train_dataset = StackingDataset( df=get_train_df(param['tabledir']).query('valid != @fold'), logit_path='/mnt/hdd1/alcon2019/logits_for_oof.pth', mode='train') valid_dataset = StackingDataset( df=get_train_df(param['tabledir']).query('valid == @fold'), logit_path='/mnt/hdd1/alcon2019/logits_for_oof.pth', mode='valid') print('load data loader') train_dataloader = DataLoader(train_dataset, batch_size=param['batch size'], num_workers=param['thread'], pin_memory=False, drop_last=False, shuffle=True) valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'], num_workers=param['thread'], pin_memory=False, drop_last=False, shuffle=False) print('model set') model = 
MLP() optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) model = model.to(param['device']) loss_fn = nn.CrossEntropyLoss().to(param['device']) eval_fn = accuracy_one_character max_char_acc = -1e-5 max_3char_acc = -1e-5 min_loss = 1e+5 mb = master_bar(range(n_epoch)) for epoch in mb: model.train() avg_train_loss = 10**5 avg_train_accuracy = 0.0 avg_three_train_acc = 0.0 for step, (inputs, targets, indice) in enumerate( progress_bar(train_dataloader, parent=mb)): model.train() inputs = inputs.to(param['device']) targets = targets.to(param['device']) optimizer.zero_grad() logits = model(inputs) # logits.size() = (batch*3, 48) preds = logits.view(targets.size(0), 3, -1).softmax(dim=2) loss = loss_fn(logits, targets.view(-1, targets.size(2)).argmax(dim=1)) loss.backward() avg_train_loss += loss.item() _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item() avg_train_accuracy += _avg_accuracy _three_char_accuracy = accuracy_three_character( preds, targets.argmax(dim=2), mean=True).item() avg_three_train_acc += _three_char_accuracy avg_train_loss /= len(train_dataloader) avg_train_accuracy /= len(train_dataloader) avg_three_train_acc /= len(train_dataloader) avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn( model, valid_dataloader, param['device'], loss_fn, eval_fn) if min_loss > avg_valid_loss: logger.debug('update best loss: {:.5f} ---> {:.5f}'.format( min_loss, avg_valid_loss)) min_loss = avg_valid_loss torch.save(model.state_dict(), os.path.join(outdir, 'best_loss.pth')) if max_char_acc < avg_valid_accuracy: logger.debug( 'update best acc per 1 char: {:.3%} ---> {:.3%}'.format( max_char_acc, avg_valid_accuracy)) max_char_acc = avg_valid_accuracy torch.save(model.state_dict(), os.path.join(outdir, 'best_acc.pth')) if max_3char_acc < avg_three_valid_acc: logger.debug( 'update best acc per 3 char: {:.3%} ---> {:.3%}'.format( max_3char_acc, avg_three_valid_acc)) max_3char_acc = avg_three_valid_acc torch.save(model.state_dict(), 
os.path.join(outdir, 'best_3acc.pth')) logger.debug( '======================== epoch {} ========================'. format(epoch + 1)) logger.debug('lr : {:.5f}'.format( scheduler.get_lr()[0])) logger.debug( 'loss : train={:.5f} , test={:.5f}'.format( avg_train_loss, avg_valid_loss)) logger.debug( 'acc(per 1 char) : train={:.3%} , test={:.3%}'.format( avg_train_accuracy, avg_valid_accuracy)) logger.debug( 'acc(per 3 char) : train={:.3%} , test={:.3%}'.format( avg_three_train_acc, avg_three_valid_acc)) logger.debug('================ FINISH TRAIN ================') logger.debug('Result') logger.debug('Best loss : {}'.format(min_loss)) logger.debug('Best 1 acc : {}'.format(max_char_acc)) logger.debug('Best 3 acc : {}'.format(max_3char_acc))