check_randomness("Before fitting") logging.info("Fitting the network...") history = model.fit(train_x, train_y, validation_data=(val_x,val_y), epochs=EPOCHS, batch_size=BATCH_SIZE, sample_weight=train_y_weights, callbacks=[metrics_callback], shuffle=False) if SHOW_PLOTS : plots.plot_accuracy(history) plots.plot_loss(history) plots.plot_prf(metrics_callback) if SAVE_MODEL : model.save(MODEL_PATH) logging.info("Model saved in %s", MODEL_PATH) else : logging.info("Loading existing model from %s...",MODEL_PATH) model = load_model(MODEL_PATH) logging.info("Completed loading model from file") logging.info("Predicting on test set...") output = model.predict(x=test_x, verbose=1) logging.debug("Shape of output array: %s",np.shape(output))
def main(df_path: str = '/project/cq-training-1/project1/data/catalog.helios.public.20100101-20160101.pkl',
         image_size: int = 32,
         model: str = 'dummy',
         epochs: int = 20,
         optimizer: str = 'adam',
         lr: float = 1e-4,
         batch_size: int = 100,
         subset_perc: float = 1,
         subset_dates: bool = False,
         saved_model_dir: str = None,
         seq_len: int = 6,
         seed: bool = True,
         scale_label: bool = True,
         use_csky: bool = False,
         cache: bool = True,
         timesteps_minutes: int = 15):

    # Warn if no GPU is detected
    if len(tf.config.list_physical_devices('GPU')) == 0:
        logger.warning('No GPU detected, training will run on CPU.')
    elif len(tf.config.list_physical_devices('GPU')) > 1:
        logger.warning('Multiple GPUs detected, training will run on only one GPU.')

    if subset_dates and subset_perc != 1:
        raise Exception(f'Invalid configuration. Argument --subset_dates=True and --subset_perc={subset_perc}.')

    # Set random seed
    if seed:
        tf.random.set_seed(SEED)
        np.random.seed(SEED)

    # Load dataframe
    logger.info('Loading and preprocessing dataframe...')
    df = pd.read_pickle(df_path)
    df = preprocessing.preprocess(df, shuffle=False, scale_label=scale_label)
    metadata = data.Metadata(df, scale_label)

    # Pre-crop data
    logger.info('Getting crops...')
    images = data.Images(metadata, image_size)
    # images.crop(dest=SLURM_TMPDIR)
    images.crop(dest=images.shared_storage)

    # Split into train and valid
    if subset_dates:
        metadata_train, metadata_valid = metadata.split_with_dates()
    else:
        metadata, _ = metadata.split(1 - subset_perc)
        metadata_train, metadata_valid = metadata.split(VALID_PERC)
    nb_train_examples = metadata_train.get_number_of_examples()
    nb_valid_examples = metadata_valid.get_number_of_examples()
    logger.info(f'Number of training examples : {nb_train_examples}, '
                f'number of validation examples : {nb_valid_examples}')

    # Create model
    if model == 'dummy':
        model = baselines.DummyModel()
    elif model == 'sunset':
        model = baselines.SunsetModel()
    elif model == 'cnndem':
        model = baselines.ConvDemModel(image_size)
    elif model == 'sunset3d':
        model = baselines.Sunset3DModel()
    elif model == 'convlstm':
        model = baselines.ConvLSTM()
    elif model == 'cnngru':
        model = CnnGru(seq_len)
    elif model == 'cnngruatt':
        model = CnnGruAtt(seq_len)
    elif model == 'cnnlstm':
        model = LSTM_Resnet(seq_len)
    elif model == 'resnet':
        model = baselines.ResNetModel()
    else:
        raise Exception(f'Model "{model}" not recognized.')

    # Load model weights
    if saved_model_dir is not None:
        model.load_weights(os.path.join(saved_model_dir, "model"))

    # Loss and optimizer
    mse = tf.keras.losses.MeanSquaredError()
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(lr)
    elif optimizer == 'sgd':
        optimizer = tf.keras.optimizers.SGD(lr)
    else:
        raise Exception(f'Optimizer "{optimizer}" not recognized.')

    # Create data loaders
    dataloader_train = SequenceDataset(metadata_train, images, seq_len, batch_size,
                                       timesteps=datetime.timedelta(minutes=timesteps_minutes),
                                       cache=cache)
    dataloader_valid = SequenceDataset(metadata_valid, images, seq_len, batch_size,
                                       timesteps=datetime.timedelta(minutes=timesteps_minutes),
                                       cache=cache)

    # Training loop
    logger.info('Training...')
    losses = {'train': [], 'valid': []}
    best_valid_loss = float('inf')
    for epoch in range(epochs):
        train_epoch(model, dataloader_train, batch_size, mse, optimizer,
                    nb_train_examples, scale_label, use_csky)
        test_epoch(model, dataloader_valid, batch_size, mse,
                   nb_valid_examples, scale_label, use_csky)
        train_loss = np.sqrt(train_mse_metric.result().numpy())
        valid_loss = np.sqrt(valid_mse_metric.result().numpy())
        csky_valid_loss = np.sqrt(valid_csky_mse_metric.result().numpy())

        # Keep the weights with the best validation loss so far
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            utils.save_model(model)

        # Logs
        logger.info(f'Epoch {epoch} - Train Loss : {train_loss:.4f}, '
                    f'Valid Loss : {valid_loss:.4f}, Csky Valid Loss : {csky_valid_loss:.4f}')
        losses['train'].append(train_loss)
        losses['valid'].append(valid_loss)
        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
        with test_summary_writer.as_default():
            tf.summary.scalar('loss', valid_loss, step=epoch)

    # Plot losses
    plots.plot_loss(losses['train'], losses['valid'], csky_valid_loss)
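# --- Sketch (assumption, not from the original source) ----------------------
# train_epoch() above is called but not shown. Under the assumptions that
# train_mse_metric is a module-level tf.keras.metrics.Mean and that the
# dataloader yields (images, labels) batches, one plausible implementation is:
import tensorflow as tf

train_mse_metric = tf.keras.metrics.Mean()

def train_epoch(model, dataloader, batch_size, loss_fn, optimizer,
                nb_examples, scale_label, use_csky):
    # scale_label/use_csky handling (un-scaling predictions, clear-sky
    # baseline) is omitted; this sketch only shows the gradient step.
    train_mse_metric.reset_states()
    for images, labels in dataloader:
        with tf.GradientTape() as tape:
            preds = model(images, training=True)
            loss = loss_fn(labels, preds)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_mse_metric.update_state(loss)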
if value < 60000: correct_test_losses.append(value) else: correct_test_losses.append(correct_test_losses[-1]) test_losses = np.array(correct_test_losses) correct_test_bpd = [] for value in test_bpd: if value < 60000/(3072*np.log(2)): correct_test_bpd.append(value) else: correct_test_bpd.append(correct_test_bpd[-1]) test_bpd = np.array(correct_test_bpd) print(test_bpd) # Plot plot_loss(train_losses, 'Train Loss', 'output/cifar/train_loss.png') plot_loss(test_losses, 'Test Loss', 'output/cifar/test_loss.png') plot_bpd(train_bpd, 'Train bits/dim', 'output/cifar/train_bpd.png') plot_bpd(test_bpd, 'Test bits/dim', 'output/cifar/test_bpd.png') # 2) Dog CIFAR-10 model = Trainer(lr=1e-5, epochs=50, device='cpu', subset=True, label='dog') print('[==> Visualize training images ...') model.visualize(fname='output/dogs_cifar_trainset.png') model.build() # Load pre-trained model print('[==> Loading pre-trained model') model.load_model('input/pre_trained/dogs_cifar/net_final.model') print('> Sampling') model.save_samples('output/dogs_cifar/pre_trained_dogs_cifar.png') print('> Plotting loss and bits/dim')