def main():
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    # Resolve the TPU cluster only if TPU execution is requested.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu,
            zone=FLAGS.tpu_zone,
            project=None,
            job_name='worker',
            coordinator_name=None,
            coordinator_address=None,
            credentials='default',
            service=None,
            discovery_url=None)

    tpu_config = tf.compat.v1.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_cores_per_replica=FLAGS.num_tpu_cores,
        per_host_input_for_training=True)

    run_config = tf.compat.v1.estimator.tpu.RunConfig(
        tpu_config=tpu_config,
        evaluation_master=None,
        session_config=tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=True),
        master=None,
        cluster=tpu_cluster_resolver,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.random_seed,
        model_dir=FLAGS.output_dir,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        log_step_count_steps=FLAGS.log_step_count_steps)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        model_fn=model_fn_builder(FLAGS.init_checkpoint, FLAGS.learning_rate,
                                  FLAGS.num_train_steps, FLAGS.use_tpu),
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        config=run_config,
        params={
            "conv1d_filter_width": FLAGS.conv1d_filter_width,
            "GRU_hidden_size": FLAGS.GRU_hidden_size,
            "fc_hidden_size": FLAGS.fc_hidden_size,
            "VAE_latent_space_dimension": FLAGS.VAE_latent_space_dimension,
            "gamma": FLAGS.gamma,
            "initializer_range": FLAGS.initializer_range,
            "num_features": FLAGS.num_features,
            "dropout_prob": FLAGS.dropout_prob,
            "use_tpu": FLAGS.use_tpu,
            "prediction_task": FLAGS.prediction_task,
            "threshold": FLAGS.threshold,
        })

    if FLAGS.action == 'TRAIN':
        estimator.train(input_fn=make_input_fn(FLAGS.train_file,
                                               is_training=True,
                                               drop_reminder=True),
                        max_steps=FLAGS.num_train_steps)
    elif FLAGS.action == 'EVALUATE':
        # TPUs require statically shaped batches, so drop the last partial batch.
        eval_drop_remainder = FLAGS.use_tpu
        results = estimator.evaluate(
            input_fn=make_input_fn(FLAGS.test_file,
                                   is_training=False,
                                   drop_reminder=eval_drop_remainder),
            steps=None)
        for key in sorted(results.keys()):
            tf.compat.v1.logging.info(" %s = %s", key, str(results[key]))
    elif FLAGS.action == 'PREDICT':
        predict_drop_remainder = FLAGS.use_tpu
        results = estimator.predict(
            input_fn=make_input_fn(FLAGS.test_file,
                                   is_training=False,
                                   drop_reminder=predict_drop_remainder))
        if FLAGS.prediction_task == 'RMS_loss':
            # Write the per-sample reconstruction losses to a CSV file.
            output_predict_file = os.path.join("./", "RMS_loss.csv")
            with tf.io.gfile.GFile(output_predict_file, "w") as writer:
                for prediction in results:
                    writer.write(str(prediction["RMS_loss"]) + "\n")
        elif FLAGS.prediction_task == 'EVALUATE':
            labels = []
            anomalies = []
            for prediction in results:
                labels.append(prediction["label"])
                anomalies.append(prediction["predicted"])
            metrics = calculate_metrics(anomalies, labels, True)
            tf.compat.v1.logging.info(" %s = %s", "threshold", FLAGS.threshold)
            for key in sorted(metrics.keys()):
                tf.compat.v1.logging.info(" %s = %s", key, str(metrics[key]))
        else:
            output_predict_file = os.path.join("./", "Anomaly.csv")
            with tf.io.gfile.GFile(output_predict_file, "w") as writer:
                for prediction in results:
                    writer.write(str(prediction["anomaly"]) + "\n")
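
# For context: make_input_fn is called above but not defined in this excerpt.
# The following is a minimal sketch of what a TPUEstimator-compatible input
# builder could look like, assuming the data lives in TFRecord files. The
# feature spec (the "features"/"label" names and shapes) is a placeholder,
# not the real one; the repository's actual implementation may differ.
import tensorflow as tf


def make_input_fn(input_file, is_training, drop_reminder):
    # Hypothetical feature spec; the real feature names and shapes are not
    # shown in this excerpt.
    feature_spec = {
        "features": tf.io.FixedLenFeature([FLAGS.num_features], tf.float32),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse_record(record):
        return tf.io.parse_single_example(record, feature_spec)

    def input_fn(params):
        # TPUEstimator injects the per-core batch size through params.
        batch_size = params["batch_size"]
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            dataset = dataset.repeat().shuffle(buffer_size=10000)
        dataset = dataset.map(_parse_record)
        # TPUs need static shapes, hence dropping the last partial batch.
        dataset = dataset.batch(batch_size, drop_remainder=drop_reminder)
        return dataset

    return input_fn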
def run(
    model_name: str = 'linear',
    num_jobs: int = 1,
    n_components: float = 0.95,
    non_local_dist: int = 5,
    n_param_sets: int = 20,
    save_dir: str = '.',
):
    X_train, y_train, X_val, y_val, X_test, y_test, _, val_ds, test_ds = get_datasets()
    non_local = 'nonlocal' in model_name

    # Do not use elevation, since it is static across all samples.
    # Elevation is always the last predictor.
    X_train = X_train[:, :-1]
    X_val = X_val[:, :-1]
    X_test = X_test[:, :-1]
    print(cur_time_string(), X_train.shape, y_train.shape, X_val.shape,
          y_val.shape, X_test.shape, y_test.shape)

    model_dict = {
        'linear': LinearRegression,
        'linear-nonlocal': LinearRegression,
        'rf': RandomForestRegressor,
        'rf-nonlocal': RandomForestRegressor,
        # Note: SVMs take too long to train on our datasets, which is why we
        # could not evaluate them.
        'svm': SVR,
        'svm-nonlocal': SVR,
    }

    # There is one model and one PCA per grid cell.
    models = []
    pcas = []
    for i in range(X_train.shape[-2] * X_train.shape[-1]):
        models.append(model_dict[model_name]())
        pcas.append(PCA(n_components=n_components))  # Sa'adi2017 does it like this

    y_val_pred = np.empty(y_val.shape)
    y_test_pred = np.empty(y_test.shape)
    best_ks = np.empty((X_train.shape[-2], X_train.shape[-1]))
    best_features = np.empty((X_train.shape[-2], X_train.shape[-1])).tolist()

    for i in range(X_train.shape[-2]):
        print(cur_time_string(), 'Row', i)
        for j in range(X_train.shape[-1]):
            print(cur_time_string(), 'Column', j)
            model = models[i * X_train.shape[-1] + j]
            if non_local:
                # Use the predictors of all cells in a window of up to
                # non_local_dist cells around (i, j), clipped at the borders.
                dist = non_local_dist
                min_i = max(i - dist, 0)
                max_i = min(i + dist + 1, X_train.shape[-2])
                min_j = max(j - dist, 0)
                max_j = min(j + dist + 1, X_train.shape[-1])
                X_train_loc = X_train[:, :, min_i:max_i,
                                      min_j:max_j].reshape(X_train.shape[0], -1)
                X_val_loc = X_val[:, :, min_i:max_i,
                                  min_j:max_j].reshape(X_val.shape[0], -1)
                X_test_loc = X_test[:, :, min_i:max_i,
                                    min_j:max_j].reshape(X_test.shape[0], -1)
            else:
                X_train_loc = X_train[:, :, i, j]
                X_val_loc = X_val[:, :, i, j]
                X_test_loc = X_test[:, :, i, j]
            y_train_loc = y_train[:, 0, i, j]
            y_val_loc = y_val[:, 0, i, j]

            # Speed things up by skipping feature selection and PCA for cells
            # where the label is always static.
            if y_train_loc.std() == 0.0:
                pipe = Pipeline(steps=[('model', model)])
                best_ks[i, j] = np.nan
                pipe.fit(X_train_loc, y_train_loc)
            else:
                # Select the best number of features for this cell.
                best_k = find_best_k_for_SelectKBest(X_train_loc, y_train_loc,
                                                     X_val_loc, y_val_loc,
                                                     n_jobs=num_jobs)
                best_ks[i, j] = best_k
                pca = pcas[i * X_train.shape[-1] + j]
                optimal_model = find_best_model_parameters(
                    X_train_loc,
                    y_train_loc,
                    X_val_loc,
                    y_val_loc,
                    model_dict[model_name],
                    pca,
                    best_k,
                    n_jobs=num_jobs,
                    n_iter=n_param_sets,
                )
                models[i * X_train.shape[-1] + j] = optimal_model
                pipe = Pipeline(steps=[
                    ('kbest', SelectKBest(k=best_k, score_func=f_regression)),
                    ('pca', pca),
                    ('model', optimal_model),
                ])
                pipe.fit(X_train_loc, y_train_loc)
                best_features[i][j] = pipe[0].get_support(indices=True)
            y_val_pred[:, 0, i, j] = pipe.predict(X_val_loc)
            y_test_pred[:, 0, i, j] = pipe.predict(X_test_loc)
            # Free the fitted model since it is not needed anymore; especially
            # random forests take a lot of RAM.
            models[i * X_train.shape[-1] + j] = None

    print(y_val_pred.shape, y_val.shape)
    print(y_test_pred.shape, y_test.shape)
    np.set_printoptions(threshold=sys.maxsize)
    print('best_ks:')
    print(best_ks)
    print(np.nanmean(best_ks), np.nanmin(best_ks), np.nanmax(best_ks),
          np.nanstd(best_ks))
    print('best_features:')
    print(best_features)

    metrics = calculate_metrics(y_val_pred[:, 0], y_val[:, 0])
    val_res = mean_metrics(metrics)
    print('Validation metrics:')
    print(val_res)

    metrics = calculate_metrics(y_test_pred[:, 0], y_test[:, 0])
    test_res = mean_metrics(metrics)
    print('Test metrics:')
    print(test_res)

    results = {}
    # Store the config, ...
    results.update({
        section_name: dict(config[section_name])
        for section_name in config.sections()
    })
    # ... the validation metrics that I calculate, ...
    results.update({f'val_{k}': v for k, v in val_res.items()})
    # ... and the test metrics that I calculate.
    results.update({f'test_{k}': v for k, v in test_res.items()})
    write_results_file(join('results', 'results.json'),
                       pd.json_normalize(results))

    # Store the unstandardized inputs from the datasets, since X_val and
    # X_test cannot be used directly (they are standardized).
    val_preds = xr.Dataset(
        {
            'pred': (['time', 'lat', 'lon'], y_val_pred[:, 0]),
            'input': (['time', 'lat', 'lon'], val_ds.X),
            'target': (['time', 'lat', 'lon'], val_ds.Y[:, :, :, 0]),
        },
        coords={
            'time': val_ds.times,
            'lon_var': (('lat', 'lon'), val_ds.lons[0]),
            'lat_var': (('lat', 'lon'), val_ds.lats[0]),
        },
    )
    test_preds = xr.Dataset(
        {
            'pred': (['time', 'lat', 'lon'], y_test_pred[:, 0]),
            'input': (['time', 'lat', 'lon'], test_ds.X),
            'target': (['time', 'lat', 'lon'], test_ds.Y[:, :, :, 0]),
        },
        coords={
            'time': test_ds.times,
            'lon_var': (('lat', 'lon'), test_ds.lons[0]),
            'lat_var': (('lat', 'lon'), test_ds.lats[0]),
        },
    )

    makedirs(save_dir, exist_ok=True)
    val_preds.to_netcdf(join(save_dir, 'val_predictions.nc'))
    test_preds.to_netcdf(join(save_dir, 'test_predictions.nc'))
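
# For context: find_best_k_for_SelectKBest is called above but not shown in
# this excerpt. A minimal sketch under the assumption that it scans candidate
# k values with a plain LinearRegression probe and keeps the k with the lowest
# validation MSE. The probe model is an assumption, and the real helper
# presumably parallelizes the scan over n_jobs workers, which is omitted here.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def find_best_k_for_SelectKBest(X_train, y_train, X_val, y_val, n_jobs=1):
    best_k, best_mse = 1, np.inf
    for k in range(1, X_train.shape[1] + 1):
        # Fit the selector on training data only, then evaluate a simple
        # probe model on the validation set.
        selector = SelectKBest(k=k, score_func=f_regression)
        X_train_sel = selector.fit_transform(X_train, y_train)
        probe = LinearRegression().fit(X_train_sel, y_train)
        mse = mean_squared_error(y_val, probe.predict(selector.transform(X_val)))
        if mse < best_mse:
            best_k, best_mse = k, mse
    return best_k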
def run(
    train_batch_size: int,
    val_batch_size: int,
    epochs: int,
    lr: float,
    model_name: str,
    architecture: str,
    momentum: float,
    log_interval: int,
    log_dir: str,
    save_dir: str,
    save_step: int,
    val_step: int,
    num_workers: int,
    patience: int,
    eval_only: bool = False,
    overfit_on_few_samples: bool = False,
):
    train_loader, val_loader, test_loader = get_data_loaders(
        train_batch_size,
        val_batch_size,
        num_workers=num_workers,
        overfit_on_few_samples=overfit_on_few_samples,
    )

    models_available = {'convmos': ConvMOS}
    model = models_available[model_name](architecture=architecture)

    writer = create_summary_writer(model, train_loader, log_dir)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device=device)

    # E-OBS only provides observational data over land, so we need a mask to
    # avoid fitting on the sea.
    land_mask_np = np.load('remo_eobs_land_mask.npy')
    # Convert booleans to 1 and 0, and the numpy array to a torch Tensor.
    land_mask = torch.from_numpy(1 * land_mask_np).to(device)
    print('Land mask:')
    print(land_mask)
    loss_fn = partial(masked_mse_loss, mask=land_mask)

    optimizer = Adam(model.parameters(), lr=lr)
    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
    metrics = {
        'rmse': RootMeanSquaredError(),
        'mae': MeanAbsoluteError(),
        'mse': Loss(loss_fn),
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                device=device)

    to_save = {'model': model, 'optimizer': optimizer, 'trainer': trainer}
    checkpoint_handler = Checkpoint(
        to_save,
        DiskSaver(save_dir, create_dir=True, require_empty=False),
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=save_step),
                              checkpoint_handler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def score_function(engine):
        # Checkpoint and EarlyStopping maximize the score, so return the
        # negative validation loss.
        val_loss = engine.state.metrics['mse']
        return -val_loss

    best_checkpoint_handler = Checkpoint(
        to_save,
        DiskSaver(save_dir, create_dir=True, require_empty=False),
        n_saved=2,
        filename_prefix='best',
        score_function=score_function,
        score_name='val_loss',
        global_step_transform=global_step_from_engine(trainer),
    )
    val_evaluator.add_event_handler(Events.COMPLETED, best_checkpoint_handler)
    earlystop_handler = EarlyStopping(patience=patience,
                                      score_function=score_function,
                                      trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, earlystop_handler)

    # Resume from the latest checkpoint if one exists.
    checkpoint_files = glob(join(save_dir, 'checkpoint_*.pt'))
    if len(checkpoint_files) > 0:
        epoch_list = [
            int(c.split('.')[0].split('_')[-1]) for c in checkpoint_files
        ]
        last_epoch = sorted(epoch_list)[-1]
        latest_checkpoint_file = join(save_dir, f'checkpoint_{last_epoch}.pt')
        print('Loading last checkpoint', latest_checkpoint_file)
        if last_epoch >= epochs:
            print('Training was already completed')
            eval_only = True
        checkpoint = torch.load(latest_checkpoint_file, map_location=device)
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            print('Epoch[{}] Iteration[{}/{}] Loss: {:.2f}'.format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        avg_rmse = metrics['rmse']
        avg_mae = metrics['mae']
        avg_mse = metrics['mse']
        print('Training Results - Epoch: {} Avg RMSE: {:.2f} Avg loss: {:.2f} '
              'Avg MAE: {:.2f}'.format(engine.state.epoch, avg_rmse, avg_mse,
                                       avg_mae))
        writer.add_scalar("training/avg_loss", avg_mse, engine.state.epoch)
        writer.add_scalar("training/avg_rmse", avg_rmse, engine.state.epoch)
        writer.add_scalar("training/avg_mae", avg_mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_step))
    def log_validation_results(engine):
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_rmse = metrics['rmse']
        avg_mae = metrics['mae']
        avg_mse = metrics['mse']
        print('Validation Results - Epoch: {} Avg RMSE: {:.2f} Avg loss: {:.2f} '
              'Avg MAE: {:.2f}'.format(engine.state.epoch, avg_rmse, avg_mse,
                                       avg_mae))
        writer.add_scalar("validation/avg_loss", avg_mse, engine.state.epoch)
        writer.add_scalar("validation/avg_rmse", avg_rmse, engine.state.epoch)
        writer.add_scalar("validation/avg_mae", avg_mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=save_step))
    def log_model_weights(engine):
        for name, param in model.named_parameters():
            writer.add_histogram(f"model/weights_{name}", param,
                                 engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=save_step))
    def regularly_predict_val_data(engine):
        predict_data(engine.state.epoch, val_loader)

    def predict_data(epoch: int, data_loader) -> xr.Dataset:
        # Predict all data points of the given loader and write the
        # predictions to a NetCDF file.
        print(f'Predicting {data_loader.dataset.mode} data...')
        pred_np = None
        for x, _ in data_loader:
            pred = (model.forward(x.to(device=device)).to(
                device='cpu').detach().numpy()[:, 0, :, :])
            if pred_np is None:
                pred_np = pred
            else:
                pred_np = np.concatenate((pred_np, pred), axis=0)
        preds = xr.Dataset(
            {
                'pred': (['time', 'lat', 'lon'], pred_np),
                'input': (['time', 'lat', 'lon'], data_loader.dataset.X),
                'target': (['time', 'lat', 'lon'],
                           data_loader.dataset.Y[:, :, :, 0]),
            },
            coords={
                'time': data_loader.dataset.times,
                'lon_var': (('lat', 'lon'), data_loader.dataset.lons[0]),
                'lat_var': (('lat', 'lon'), data_loader.dataset.lats[0]),
            },
        )
        preds.to_netcdf(
            join(save_dir,
                 f'predictions_{data_loader.dataset.mode}_{epoch}.nc'))
        return preds

    # Kick everything off.
    if not eval_only:
        trainer.run(train_loader, max_epochs=epochs)

    # Load the best model.
    best_checkpoint = best_checkpoint_handler.last_checkpoint
    print('Loading best checkpoint from', best_checkpoint)
    checkpoint = torch.load(join(save_dir, best_checkpoint),
                            map_location=device)
    Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)
    writer.close()

    val_preds = predict_data(trainer.state.epoch, val_loader)
    test_preds = predict_data(trainer.state.epoch, test_loader)
    val_res = mean_metrics(calculate_metrics(val_preds.pred, val_preds.target))
    test_res = mean_metrics(
        calculate_metrics(test_preds.pred, test_preds.target))

    results = {}
    # Store the config, ...
    results.update({
        section_name: dict(config[section_name])
        for section_name in config.sections()
    })
    # ... the last training metrics, ...
    results.update(
        {f'train_{k}': v for k, v in train_evaluator.state.metrics.items()})
    # ... the last validation metrics from torch, ...
    results.update(
        {f'val_torch_{k}': v for k, v in val_evaluator.state.metrics.items()})
    # ... the validation metrics that I calculate, ...
    results.update({f'val_{k}': v for k, v in val_res.items()})
    # ... and the test metrics that I calculate.
    results.update({f'test_{k}': v for k, v in test_res.items()})
    write_results_file(join('results', 'results.json'),
                       pd.json_normalize(results))
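
# For context: masked_mse_loss is bound to the land mask via partial above but
# not defined in this excerpt. A minimal sketch, assuming the mask holds 0/1
# entries of shape (lat, lon) that broadcast over the leading dimensions of
# the predictions; the repository's actual implementation may differ.
import torch


def masked_mse_loss(y_pred: torch.Tensor, y: torch.Tensor,
                    mask: torch.Tensor) -> torch.Tensor:
    # Zero out the squared error over sea cells and average over land cells
    # only. Since the mask broadcasts over the leading dimensions of y_pred,
    # its element count is scaled by the broadcast factor.
    squared_error = (y_pred - y) ** 2 * mask
    num_land = mask.sum() * (squared_error.numel() // mask.numel())
    return squared_error.sum() / num_land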