def test_zero_sample():
    """Computing RMSE before any update must raise NotComputableError."""
    metric = RootMeanSquaredError()
    # The error message comes from the underlying MeanSquaredError accumulator.
    expected_msg = r"MeanSquaredError must have at least one example before it can be computed"
    with pytest.raises(NotComputableError, match=expected_msg):
        metric.compute()
def __init__(self, column, title=None, metrics=None, figures=None, output_transform=lambda x: x):
    """Set up the streaming metric/figure accumulators for one score column.

    ``metrics`` / ``figures`` default to every known metric/figure name;
    ``title`` defaults to the empty string for display purposes, but the
    histogram receives the *original* (possibly None) title, as before.
    """
    self.column = column
    if title is None:
        self.title = ''
    else:
        self.title = title
    self.metrics = set(LocalMetrics.METRICS if metrics is None else metrics)
    self.figures = set(LocalMetrics.FIGURES if figures is None else figures)
    # One streaming accumulator per metric so update() can be called per batch.
    self._rmse = RootMeanSquaredError()
    self._pearson = PearsonR()
    self._per_model_pearson = Mean()
    self._hist = ScoreHistogram(title=title)
    super(LocalMetrics, self).__init__(output_transform=output_transform)
def test_compute():
    """RMSE of predictions (+v, -v) against zero targets is exactly |v|.

    Also exercises reset(): the accumulator must forget the first pass.
    """
    rmse = RootMeanSquaredError()
    for value in (2.0, 3.0):
        rmse.reset()
        predictions = torch.Tensor([[value], [-value]])
        targets = torch.zeros(2)
        rmse.update((predictions, targets))
        assert rmse.compute() == value
def test_compute():
    """Check RMSE against a NumPy reference, for single-shot and batched updates."""
    rmse = RootMeanSquaredError()

    def _run_case(y_pred, y, batch_size):
        rmse.reset()
        if batch_size > 1:
            # Round up so the trailing partial slice is also fed to the metric.
            n_iters = y.shape[0] // batch_size + 1
            for i in range(n_iters):
                start = i * batch_size
                rmse.update((y_pred[start:start + batch_size], y[start:start + batch_size]))
        else:
            rmse.update((y_pred, y))

        np_y = y.numpy().ravel()
        np_y_pred = y_pred.numpy().ravel()
        expected = np.sqrt(np.power((np_y - np_y_pred), 2.0).sum() / np_y.shape[0])

        result = rmse.compute()
        assert isinstance(result, float)
        assert pytest.approx(result) == expected

    def _make_cases():
        # (y_pred, y, batch_size) triples; batch_size > 1 exercises batched updates.
        return [
            (torch.empty(10,).uniform_(0, 10), torch.empty(10,).uniform_(0, 10), 1),
            (torch.empty(10, 1).uniform_(-10, 10), torch.empty(10, 1).uniform_(-10, 10), 1),
            # updated batches
            (torch.empty(50,).uniform_(0, 10), torch.empty(50).uniform_(0, 10), 16),
            (torch.empty(50, 1).uniform_(-10, 10), torch.empty(50, 1).uniform_(-10, 10), 16),
        ]

    # check multiple random inputs as random exact occurencies are rare
    for _ in range(5):
        for y_pred, y, batch_size in _make_cases():
            _run_case(y_pred, y, batch_size)
def _test(metric_device):
    """Attach RMSE on ``metric_device`` to an Engine and compare to NumPy.

    NOTE(review): relies on closure variables from the enclosing test —
    ``update``, ``n_iters``, ``offset``, ``device``, ``y_true`` and ``tol``.
    """
    engine = Engine(update)
    metric = RootMeanSquaredError(device=metric_device)
    metric.attach(engine, "rmse")
    engine.run(data=list(range(n_iters)), max_epochs=1)

    assert "rmse" in engine.state.metrics
    res = engine.state.metrics["rmse"]

    # Reconstruct the predictions produced by every distributed rank
    # (rank i predicts the constant i + 1 for its chunk of `offset` samples).
    world_size = idist.get_world_size()
    y_preds_full = torch.stack(
        [(rank + 1) * torch.ones(offset) for rank in range(world_size)]
    ).to(device).flatten()

    true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy())))
    assert pytest.approx(res, rel=tol) == true_res
def _test_distrib_itegration(device):
    """Distributed integration check: RMSE attached to an Engine must match a
    NumPy reference computed over the predictions of all ranks.

    (Function name kept as-is — including the historical typo — since it is
    the public entry point other code may reference.)
    """
    import numpy as np
    import torch.distributed as dist

    from ignite.engine import Engine

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    n_iters = 100
    s = 50
    offset = n_iters * s

    # Ground truth is one global ramp; each rank predicts the constant rank + 1
    # for its own contiguous chunk of `offset` samples.
    y_true = torch.arange(0, offset * world_size, dtype=torch.float).to(device)
    y_preds = (rank + 1) * torch.ones(offset, dtype=torch.float).to(device)

    def update(engine, i):
        lo = i * s
        hi = (i + 1) * s
        return (y_preds[lo:hi], y_true[lo + offset * rank : hi + offset * rank])

    engine = Engine(update)
    metric = RootMeanSquaredError(device=device)
    metric.attach(engine, "rmse")
    engine.run(data=list(range(n_iters)), max_epochs=1)

    assert "rmse" in engine.state.metrics
    res = engine.state.metrics["rmse"]

    # Rebuild the full prediction tensor exactly as the ranks produced it.
    y_preds_full = torch.stack(
        [(i + 1) * torch.ones(offset) for i in range(world_size)]
    ).to(device).flatten()

    true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy())))
    assert pytest.approx(res) == true_res
def test_compute(n_times, test_data):
    """Compare RMSE (single or batched updates) against a NumPy reference."""
    rmse = RootMeanSquaredError()
    y_pred, y, batch_size = test_data
    rmse.reset()

    if batch_size > 1:
        # Round up so the trailing partial batch is included.
        n_iters = y.shape[0] // batch_size + 1
        for i in range(n_iters):
            lo = i * batch_size
            hi = lo + batch_size
            rmse.update((y_pred[lo:hi], y[lo:hi]))
    else:
        rmse.update((y_pred, y))

    flat_y = y.numpy().ravel()
    flat_pred = y_pred.numpy().ravel()
    expected = np.sqrt(np.power((flat_y - flat_pred), 2.0).sum() / flat_y.shape[0])

    result = rmse.compute()
    assert isinstance(result, float)
    assert pytest.approx(result) == expected
def RMSEMetric(key):
    """Create RMSE metric on key.

    Returns a DictMetric that pulls ``key`` out of the output dict and feeds
    it to an ignite RootMeanSquaredError accumulator.
    """
    metric = RootMeanSquaredError()
    return DictMetric(key, metric)
class LocalMetrics(ignite.metrics.Metric):
    """Streaming per-residue ("local") quality metrics for decoy scoring.

    Accumulates RMSE, Pearson correlation, a per-(protein, model) mean Pearson
    correlation, and a score histogram over batches, then exposes them via
    ``compute()`` as ``{'metrics': {...}, 'figures': {...}}``.
    """

    # Names of the scalar metrics this class can compute.
    METRICS = (
        'rmse',
        'pearson',
        'per_model_pearson',
    )
    # Names of the figures this class can produce.
    FIGURES = (
        'hist',
    )

    def __init__(self, column, title=None, metrics=None, figures=None, output_transform=lambda x: x):
        """Configure which column to score and which metrics/figures to keep.

        :param column: index into ``batch.node_features`` holding the predicted score.
        :param title: optional title forwarded to the histogram figure.
        :param metrics: iterable of metric names to report (default: all of METRICS).
        :param figures: iterable of figure names to report (default: all of FIGURES).
        :param output_transform: standard ignite hook mapping engine output to metric input.
        """
        self.column = column
        self.title = title if title is not None else ''
        self.metrics = set(metrics if metrics is not None else LocalMetrics.METRICS)
        self.figures = set(figures if figures is not None else LocalMetrics.FIGURES)
        self._rmse = RootMeanSquaredError()
        self._pearson = PearsonR()
        self._per_model_pearson = Mean()
        self._hist = ScoreHistogram(title=title)
        super(LocalMetrics, self).__init__(output_transform=output_transform)

    def reset(self):
        """Clear all internal accumulators (called by ignite at epoch start)."""
        self._rmse.reset()
        self._pearson.reset()
        self._per_model_pearson.reset()
        self._hist.reset()

    def update(self, batch: DecoyBatch):
        """Accumulate metrics from one batch of decoys.

        Residues are kept only if they belong to a non-native decoy AND have a
        finite ground-truth lddt score.
        """
        # Skip native structures and ignore residues that don't have a ground-truth score
        non_native = np.repeat(np.char.not_equal(batch.decoy_name, 'native'), repeats=batch.num_nodes_by_graph.cpu().numpy())
        has_score = torch.isfinite(batch.lddt).cpu().numpy()
        valid_scores = np.logical_and(non_native, has_score)

        # Used to uniquely identify a (protein, model) pair without using their str names
        target_model_id = batch.node_index_by_graph[valid_scores].cpu().numpy()
        node_preds = batch.node_features[valid_scores, self.column].detach().cpu().numpy()
        node_targets = batch.lddt[valid_scores].detach().cpu().numpy()

        # Streaming metrics on local scores (they expect torch tensors, not numpy arrays)
        self._rmse.update((torch.from_numpy(node_preds), torch.from_numpy(node_targets)))
        self._pearson.update((torch.from_numpy(node_preds), torch.from_numpy(node_targets)))

        # Per model metrics: pandas is the easiest way to get a groupby.
        grouped = pd.DataFrame({
            'target_model': target_model_id,
            'preds': node_preds,
            'true': node_targets
        }).groupby('target_model')
        per_model_pearsons = grouped.apply(lambda df: pearson(df['preds'], df['true']))
        self._per_model_pearson.update(torch.from_numpy(per_model_pearsons.values))

        self._hist.update(node_preds, node_targets)

    def compute(self):
        """Return ``{'metrics': {...}, 'figures': {...}}`` for the selected names.

        When the histogram is requested, correlation values already computed
        are appended to its title as LaTeX annotations.
        """
        metrics = {}
        figures = {}
        if 'rmse' in self.metrics:
            metrics['rmse'] = self._rmse.compute()
        if 'pearson' in self.metrics:
            metrics['pearson'] = self._pearson.compute()
        if 'per_model_pearson' in self.metrics:
            metrics['per_model_pearson'] = self._per_model_pearson.compute()
        if 'hist' in self.figures:
            extra_title = []
            if 'pearson' in self.metrics:
                extra_title.append(f'$R$ {metrics["pearson"]:.3f}')
            if 'per_model_pearson' in self.metrics:
                extra_title.append(f'$R_\\mathrm{{model}}$ {metrics["per_model_pearson"]:.3f}')
            figures['hist'] = self._hist.compute('\n'.join(extra_title))
        return {'metrics': metrics, 'figures': figures}

    def completed(self, engine, prefix):
        """Store computed metrics/figures on the engine state under ``prefix``.

        NOTE(review): ``engine.state.figures`` is not a standard ignite field —
        presumably it is added elsewhere in this project; verify before reuse.
        """
        result = self.compute()
        for name, metric in result['metrics'].items():
            engine.state.metrics[prefix + '/' + name] = metric
        for name, fig in result['figures'].items():
            engine.state.figures[prefix + '/' + name] = fig
def test_zero_div():
    """compute() on a freshly constructed RMSE metric raises NotComputableError."""
    metric = RootMeanSquaredError()
    with pytest.raises(NotComputableError):
        metric.compute()
def run(args, seed):
    """Train a ConvNet with ignite, logging to TensorBoard and checkpointing.

    :param args: parsed CLI namespace; fields used here are ``args.verbose``
        (log gradient norms) and ``args.ckpt`` (optional checkpoint to resume from).
    :param seed: seed for ``torch.random.manual_seed`` (reproducibility).
    """
    config.make_paths()
    torch.random.manual_seed(seed)
    train_loader, val_loader, shape = get_data_loaders(
        config.Training.batch_size,
        proportion=config.Training.proportion,
        test_batch_size=config.Training.batch_size * 2,
    )
    # NOTE(review): shape unpacks as (n, d, t); presumably (num samples,
    # feature dim, sequence length) — n is unused below. Confirm with loader.
    n, d, t = shape
    model = models.ConvNet(d, seq_len=t)
    writer = tb.SummaryWriter(log_dir=config.TENSORBOARD)
    model.to(config.device)  # Move model before creating optimizer
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=config.device)
    trainer.logger = setup_logger("trainer")
    # Save the last 2 model checkpoints every `save_every` epochs.
    checkpointer = ModelCheckpoint(
        config.MODEL,
        model.__class__.__name__,
        n_saved=2,
        create_dir=True,
        save_as_state_dict=True,
    )
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.Training.save_every),
        checkpointer,
        {"model": model},
    )
    val_metrics = {
        "mse": Loss(criterion),
        "mae": MeanAbsoluteError(),
        "rmse": RootMeanSquaredError(),
    }
    evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=config.device)
    evaluator.logger = setup_logger("evaluator")
    # Autoregressive evaluator: rolls the model forward on its own predictions.
    ar_evaluator = create_ar_evaluator(model, metrics=val_metrics, device=config.device)
    ar_evaluator.logger = setup_logger("ar")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.save_every))
    def log_ar(engine):
        # Plot autoregressive predictions vs. ground truth to TensorBoard.
        ar_evaluator.run(val_loader)
        y_pred, y = ar_evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/ar", fig, engine.state.epoch)
        plt.close()

    # desc = "ITERATION - loss: {:.2f}"
    # pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=config.Training.log_every))
    def log_training_loss(engine):
        # pbar.desc = desc.format(engine.state.output)
        # pbar.update(log_interval)
        if args.verbose:
            # Total gradient norm across all parameters (verbose mode only,
            # since it walks every parameter tensor each logging step).
            grad_norm = torch.stack(
                [p.grad.norm() for p in model.parameters()]).sum()
            writer.add_scalar("train/grad_norm", grad_norm, engine.state.iteration)
        writer.add_scalar("train/loss", engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_training_results(engine):
        # pbar.refresh()
        # Evaluate on the *training* set to track fit.
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        for k, v in metrics.items():
            writer.add_scalar(f"train/{k}", v, engine.state.epoch)
        # tqdm.write(
        #     f"Training Results - Epoch: {engine.state.epoch} Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        for k, v in metrics.items():
            writer.add_scalar(f"eval/{k}", v, engine.state.epoch)
        # tqdm.write(
        #     f"Validation Results - Epoch: {engine.state.epoch} Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )
        # pbar.n = pbar.last_print_n = 0
        y_pred, y = evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/preds", fig, engine.state.epoch)
        plt.close()

    # @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    # def log_time(engine):
    #     #tqdm.write(
    #     #    f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds"
    #     #)

    if args.ckpt is not None:
        # Resume model weights from an explicit checkpoint path.
        ckpt = torch.load(args.ckpt)
        ModelCheckpoint.load_objects({"model": model}, ckpt)

    try:
        trainer.run(train_loader, max_epochs=config.Training.max_epochs)
    except Exception as e:
        # Best-effort: print the traceback but still close the writer below.
        import traceback

        print(traceback.format_exc())

    # pbar.close()
    writer.close()
def run(
    train_batch_size: int,
    val_batch_size: int,
    epochs: int,
    lr: float,
    model_name: str,
    architecture: str,
    momentum: float,
    log_interval: int,
    log_dir: str,
    save_dir: str,
    save_step: int,
    val_step: int,
    num_workers: int,
    patience: int,
    eval_only: bool = False,
    overfit_on_few_samples: bool = False,
):
    """Train (or just evaluate) a ConvMOS downscaling model.

    Trains with a land-masked MSE loss, checkpoints periodically and on best
    validation loss, supports resuming from the latest checkpoint in
    ``save_dir``, then writes NetCDF predictions and a JSON results file.

    NOTE(review): ``momentum`` is accepted but never used (the optimizer is
    Adam) — kept for interface compatibility; confirm with callers.
    """
    train_loader, val_loader, test_loader = get_data_loaders(
        train_batch_size,
        val_batch_size,
        num_workers=num_workers,
        overfit_on_few_samples=overfit_on_few_samples,
    )
    models_available = {'convmos': ConvMOS}
    model = models_available[model_name](architecture=architecture)
    writer = create_summary_writer(model, train_loader, log_dir)
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    model = model.to(device=device)

    # E-OBS only provides observational data for land so we need to use a mask to avoid fitting on the sea
    land_mask_np = np.load('remo_eobs_land_mask.npy')
    # Convert booleans to 1 and 0, and convert numpy array to torch Tensor
    land_mask = torch.from_numpy(1 * land_mask_np).to(device)
    print('Land mask:')
    print(land_mask)

    loss_fn = partial(masked_mse_loss, mask=land_mask)

    optimizer = Adam(model.parameters(), lr=lr)
    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
    metrics = {
        'rmse': RootMeanSquaredError(),
        'mae': MeanAbsoluteError(),
        'mse': Loss(loss_fn),
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    # Everything needed to resume training exactly where it left off.
    to_save = {'model': model, 'optimizer': optimizer, 'trainer': trainer}
    checkpoint_handler = Checkpoint(
        to_save,
        DiskSaver(save_dir, create_dir=True, require_empty=False),
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=save_step), checkpoint_handler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def score_function(engine):
        # Higher is better for ignite score functions, hence the negation.
        val_loss = engine.state.metrics['mse']
        return -val_loss

    # Separate checkpoint stream that keeps the 2 best-validation-loss models.
    best_checkpoint_handler = Checkpoint(
        to_save,
        DiskSaver(save_dir, create_dir=True, require_empty=False),
        n_saved=2,
        filename_prefix='best',
        score_function=score_function,
        score_name='val_loss',
        global_step_transform=global_step_from_engine(trainer),
    )
    val_evaluator.add_event_handler(Events.COMPLETED, best_checkpoint_handler)

    earlystop_handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, earlystop_handler)

    # Maybe load model
    checkpoint_files = glob(join(save_dir, 'checkpoint_*.pt'))
    if len(checkpoint_files) > 0:
        # latest_checkpoint_file = sorted(checkpoint_files)[-1]
        # Numeric sort on the epoch suffix (lexicographic sort would be wrong,
        # e.g. 'checkpoint_9.pt' > 'checkpoint_10.pt').
        epoch_list = [
            int(c.split('.')[0].split('_')[-1]) for c in checkpoint_files
        ]
        last_epoch = sorted(epoch_list)[-1]
        latest_checkpoint_file = join(save_dir, f'checkpoint_{last_epoch}.pt')
        print('Loading last checkpoint', latest_checkpoint_file)
        # NOTE(review): this re-derives last_epoch from the filename — it is
        # redundant with the value computed just above.
        last_epoch = int(latest_checkpoint_file.split('.')[0].split('_')[-1])
        if last_epoch >= epochs:
            print('Training was already completed')
            eval_only = True
            # return
        checkpoint = torch.load(latest_checkpoint_file, map_location=device)
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # NOTE(review): `iter` shadows the builtin; harmless here but worth renaming.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # Evaluate on the training set every epoch to track fit.
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        avg_rmse = metrics['rmse']
        avg_mae = metrics['mae']
        avg_mse = metrics['mse']
        print(
            "Training Results - Epoch: {} Avg RMSE: {:.2f} Avg loss: {:.2f} Avg MAE: {:.2f}"
            .format(engine.state.epoch, avg_rmse, avg_mse, avg_mae))
        writer.add_scalar("training/avg_loss", avg_mse, engine.state.epoch)
        writer.add_scalar("training/avg_rmse", avg_rmse, engine.state.epoch)
        writer.add_scalar("training/avg_mae", avg_mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_step))
    def log_validation_results(engine):
        # Running val_evaluator also triggers best-checkpointing and early stopping
        # via the COMPLETED handlers registered above.
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_rmse = metrics['rmse']
        avg_mae = metrics['mae']
        avg_mse = metrics['mse']
        print(
            "Validation Results - Epoch: {} Avg RMSE: {:.2f} Avg loss: {:.2f} Avg MAE: {:.2f}"
            .format(engine.state.epoch, avg_rmse, avg_mse, avg_mae))
        writer.add_scalar("validation/avg_loss", avg_mse, engine.state.epoch)
        writer.add_scalar("validation/avg_rmse", avg_rmse, engine.state.epoch)
        writer.add_scalar("validation/avg_mae", avg_mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=save_step))
    def log_model_weights(engine):
        # Histogram every parameter tensor for TensorBoard inspection.
        for name, param in model.named_parameters():
            writer.add_histogram(f"model/weights_{name}", param, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=save_step))
    def regularly_predict_val_data(engine):
        predict_data(engine.state.epoch, val_loader)

    def predict_data(epoch: int, data_loader) -> xr.Dataset:
        # Predict all test data points and write the predictions
        print(f'Predicting {data_loader.dataset.mode} data...')
        data_loader_iter = iter(data_loader)
        pred_np = None
        for i in range(len(data_loader)):
            x, y = next(data_loader_iter)
            # print(x)
            # Keep only channel 0 of the model output.
            # NOTE(review): presumably output is (batch, channel, lat, lon) — confirm.
            pred = (model.forward(x.to(device=device)).to(
                device='cpu').detach().numpy()[:, 0, :, :])
            # print('=======================================')
            # print(pred)
            if pred_np is None:
                pred_np = pred
            else:
                pred_np = np.concatenate((pred_np, pred), axis=0)

        preds = xr.Dataset(
            {
                'pred': (['time', 'lat', 'lon'], pred_np),
                'input': (['time', 'lat', 'lon'], data_loader.dataset.X),
                'target': (['time', 'lat', 'lon'], data_loader.dataset.Y[:, :, :, 0]),
            },
            coords={
                'time': data_loader.dataset.times,  # list(range(len(val_loader.dataset))),
                'lon_var': (
                    ('lat', 'lon'),
                    data_loader.dataset.lons[0],
                ),  # list(range(x.shape[-2])),
                'lat_var': (('lat', 'lon'), data_loader.dataset.lats[0]),
            },  # list(range(x.shape[-1]))}
        )
        preds.to_netcdf(
            join(save_dir, f'predictions_{data_loader.dataset.mode}_{epoch}.nc'))
        return preds

    # kick everything off
    if not eval_only:
        trainer.run(train_loader, max_epochs=epochs)

    # Load best model
    best_checkpoint = best_checkpoint_handler.last_checkpoint
    print('Loading best checkpoint from', best_checkpoint)
    checkpoint = torch.load(join(save_dir, best_checkpoint_handler.last_checkpoint),
                            map_location=device)
    Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    writer.close()

    val_preds = predict_data(trainer.state.epoch, val_loader)
    test_preds = predict_data(trainer.state.epoch, test_loader)

    val_res = mean_metrics(calculate_metrics(val_preds.pred, val_preds.target))
    test_res = mean_metrics(
        calculate_metrics(test_preds.pred, test_preds.target))
    # val_evaluator.run(val_loader)

    results = {}
    # Store the config, ...
    results.update({
        section_name: dict(config[section_name])
        for section_name in config.sections()
    })
    # ... the last training metrics,
    results.update(
        {f'train_{k}': v for k, v in train_evaluator.state.metrics.items()})
    # ... the last validation metrics from torch,
    results.update(
        {f'val_torch_{k}': v for k, v in val_evaluator.state.metrics.items()})
    # ... the validation metrics that I calculate,
    results.update({f'val_{k}': v for k, v in val_res.items()})
    # ... and the test metrics that I calculate
    results.update({f'test_{k}': v for k, v in test_res.items()})
    write_results_file(join('results', 'results.json'), pd.json_normalize(results))