def load(features_path, features_format, to_tensor=True, to_cuda=True):
    loader = loaders[features_format]
    features = loader(features_path)
    if to_tensor:
        features = torch.from_numpy(features)
    if to_cuda:
        features = to_cuda_if_available(features)
    return features
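# A minimal usage sketch for `load` (not part of the original script). The path and the
# "dense" format name are assumptions for illustration; the formats actually available are
# defined by this repo's `loaders` / `data_formats`:
#
#   features = load("data/features.npy", "dense", to_tensor=True, to_cuda=False)
#   print(features.shape)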
def train(vae,
          train_data,
          test_data,
          output_path,
          output_loss_path,
          batch_size=100,
          start_epoch=0,
          num_epochs=100,
          l2_regularization=0.001,
          learning_rate=0.001,
          variable_sizes=None,
          max_seconds_without_save=300):
    start_time = time.time()

    vae = to_cuda_if_available(vae)

    optim = Adam(vae.parameters(), weight_decay=l2_regularization, lr=learning_rate)

    logger = Logger(output_loss_path, append=start_epoch > 0)

    saver = Saver({vae: output_path}, logger, max_seconds_without_save)

    trainer = Trainer(vae, train_data, test_data, batch_size, optim, variable_sizes)

    for epoch_index in range(start_epoch, num_epochs):
        # train vae
        logger.start_timer()
        train_losses = trainer.train()
        logger.log(epoch_index, num_epochs, "vae", "train_mean_loss", np.mean(train_losses))

        # test imputation
        logger.start_timer()
        test_loss = trainer.test()
        logger.log(epoch_index, num_epochs, "vae", "test_loss", test_loss)

        # save models for the epoch
        saver.delayed_save()

    saver.save()
    logger.close()

    print("Total time: {:.2f}s".format(time.time() - start_time))
def main(args=None):
    options_parser = argparse.ArgumentParser(description="Impute missing values with iterative VAE. "
                                                         + "Define 'temperature' to use multi-output.")

    options_parser.add_argument("data", type=str, help="See 'data_format' parameter.")

    options_parser.add_argument("metadata", type=str,
                                help="Information about the categorical variables in json format.")

    options_parser.add_argument("model", type=str, help="Model input file.")

    options_parser.add_argument("output_loss", type=str, help="Loss output file.")

    options_parser.add_argument(
        "--data_format",
        type=str,
        default="sparse",
        choices=data_formats,
        help="Either a dense numpy array, a sparse csr matrix, or any of those formats split into several files."
    )

    options_parser.add_argument(
        "--split_size",
        type=int,
        default=128,
        help="Dimension of the encoder output that is split into the latent mean and log-variance."
    )

    options_parser.add_argument(
        "--code_size",
        type=int,
        default=128,
        help="Dimension of the VAE latent space."
    )

    options_parser.add_argument(
        "--encoder_hidden_sizes",
        type=str,
        default="",
        help="Size of each hidden layer in the encoder separated by commas (no spaces)."
    )

    options_parser.add_argument(
        "--decoder_hidden_sizes",
        type=str,
        default="",
        help="Size of each hidden layer in the decoder separated by commas (no spaces)."
    )

    options_parser.add_argument(
        "--max_iterations",
        type=int,
        default=1000,
        help="Maximum number of iterations."
    )

    options_parser.add_argument(
        "--tolerance",
        type=float,
        default=0.001,
        help="Minimum RMSE to continue iterating."
    )

    options_parser.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Gumbel-Softmax temperature. Only used if metadata is also provided."
    )

    options_parser.add_argument(
        "--missing_probability",
        type=float,
        default=0.5,
        help="Probability of a value being missing."
    )

    options_parser.add_argument(
        "--noise_learning_rate",
        type=float,
        help="Learning rate used to modify the noise for the missing values via backpropagation."
    )

    options_parser.add_argument("--seed", type=int, help="Random number generator seed.", default=42)

    options = options_parser.parse_args(args=args)

    seed_all(options.seed)

    variable_sizes = load_variable_sizes_from_metadata(options.metadata)

    features = load(options.data, options.data_format)

    mask = generate_mask_for(features, options.missing_probability, variable_sizes)
    mask = to_cuda_if_available(mask)

    temperature = options.temperature  # None unless explicitly provided

    vae = VAE(
        features.shape[1],
        options.split_size,
        options.code_size,
        encoder_hidden_sizes=parse_int_list(options.encoder_hidden_sizes),
        decoder_hidden_sizes=parse_int_list(options.decoder_hidden_sizes),
        variable_sizes=(None if temperature is None else variable_sizes),  # do not use multi-output without temperature
        temperature=temperature
    )

    load_without_cuda(vae, options.model)

    impute(
        vae,
        features,
        mask,
        create_parent_directories_if_needed(options.output_loss),
        max_iterations=options.max_iterations,
        tolerance=options.tolerance,
        variable_sizes=variable_sizes,
        noise_learning_rate=options.noise_learning_rate
    )
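# A hedged command-line sketch of how `main` might be invoked (the script name and file
# paths below are hypothetical; the flags match the parser defined above):
#
#   python impute.py features.npy metadata.json vae_model.torch imputation_loss.csv \
#       --data_format=dense --max_iterations=500 --temperature=0.666 --noise_learning_rate=0.01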
def impute(vae,
           features,
           mask,
           output_loss_path,
           max_iterations=1000,
           tolerance=1e-3,
           variable_sizes=None,
           noise_learning_rate=None):
    start_time = time.time()

    vae = to_cuda_if_available(vae)

    logger = Logger(output_loss_path, append=False)

    loss_function = MSELoss()

    inverted_mask = 1 - mask

    observed = features * mask
    # start from random noise in the missing positions
    missing = torch.randn_like(features)

    # if a noise learning rate is given, the noise itself is optimized by backpropagation
    if noise_learning_rate is not None:
        missing = Variable(missing, requires_grad=True)
        optim = Adam([missing], weight_decay=0, lr=noise_learning_rate)

    vae.train(mode=True)

    for iteration in range(max_iterations):
        logger.start_timer()

        if noise_learning_rate is not None:
            optim.zero_grad()

        # fill the missing positions with the current noise and reconstruct
        noisy_features = observed + missing * inverted_mask
        _, reconstructed, _, _ = vae(noisy_features, training=True)

        # reconstruction losses over the observed and the missing values
        observed_loss = masked_reconstruction_loss_function(reconstructed, features, mask, variable_sizes)
        missing_loss = masked_reconstruction_loss_function(reconstructed, features, inverted_mask, variable_sizes)

        # RMSE between the imputed data and the ground truth
        loss = torch.sqrt(loss_function(compose_with_mask(features, reconstructed, mask), features))

        if noise_learning_rate is None:
            # iterative imputation: replace the noise with the reconstruction of the missing values
            missing = reconstructed * inverted_mask
        else:
            # backpropagation: update the noise to minimize the observed reconstruction loss
            observed_loss.backward()
            optim.step()

        observed_loss, missing_loss, loss = to_cpu_if_available(observed_loss, missing_loss, loss)
        observed_loss = observed_loss.data.numpy()
        missing_loss = missing_loss.data.numpy()
        loss = loss.data.numpy()

        logger.log(iteration, max_iterations, "vae", "observed_loss", observed_loss)
        logger.log(iteration, max_iterations, "vae", "missing_loss", missing_loss)
        logger.log(iteration, max_iterations, "vae", "loss", loss)

        # stop when the observed loss falls below the tolerance
        if observed_loss < tolerance:
            break

    logger.close()

    print("Total time: {:.2f}s".format(time.time() - start_time))
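# A toy, self-contained sketch of the masking arithmetic used in `impute` (the tensors and
# the helper name `_masking_example` are made up for illustration; `compose_with_mask` from
# this repo is not used here so the snippet stays independent of its signature):
def _masking_example():
    features = torch.tensor([[1.0, 2.0, 3.0]])
    mask = torch.tensor([[1.0, 0.0, 1.0]])               # 1 = observed, 0 = missing
    inverted_mask = 1 - mask
    observed = features * mask                           # [[1., 0., 3.]]
    missing = torch.randn_like(features)                 # noise for the missing positions
    noisy_features = observed + missing * inverted_mask  # VAE input: observed values plus noise elsewhere
    return noisy_features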
def create_noisy_dataset(features, missing_probability, variable_sizes, return_all=False):
    mask = generate_mask_for(features, missing_probability, variable_sizes)
    mask = to_cuda_if_available(mask)
    return NoisyDataset(features, mask, return_all=return_all)
def smooth_label_ones_like(batch):
    # one-sided label smoothing: real labels are sampled in [0.9, 1) instead of being exactly 1
    smooth_label_ones = Variable(torch.FloatTensor(len(batch)).uniform_(0.9, 1))
    return to_cuda_if_available(smooth_label_ones)
def label_zeros_like(batch):
    label_zeros = Variable(torch.zeros(len(batch)))
    return to_cuda_if_available(label_zeros)
def generate_noise(num_samples, num_features):
    noise = Variable(torch.FloatTensor(num_samples, num_features).normal_())
    return to_cuda_if_available(noise)
def generate_hint_for(mask, hint_probability, variable_sizes):
    # reveal each observed-mask entry to the discriminator with probability hint_probability
    return mask * to_cuda_if_available(
        generate_mask_for(mask, 1.0 - hint_probability, variable_sizes))
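# A toy sketch of the hint mechanism above (assuming `generate_mask_for` samples entries that
# are 1 with probability 1 - missing_probability; the helper name below is made up for illustration):
def _hint_example():
    mask = torch.tensor([[1.0, 0.0, 1.0, 1.0]])      # 1 = observed, 0 = missing
    reveal = (torch.rand_like(mask) < 0.9).float()   # keep each entry with probability ~0.9
    hint = mask * reveal                             # observed positions revealed to the discriminator
    return hint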
def train(generator,
          discriminator,
          train_data,
          test_data,
          output_gen_path,
          output_disc_path,
          output_loss_path,
          batch_size=64,
          start_epoch=0,
          num_epochs=10000,
          num_disc_steps=1,
          num_gen_steps=1,
          l2_regularization=0,
          learning_rate=0.001,
          variable_sizes=None,
          reconstruction_loss_weight=1,
          hint_probability=0.9,
          max_seconds_without_save=300,
          early_stopping_patience=100,
          ):
    start_time = time.time()

    generator, discriminator = to_cuda_if_available(generator, discriminator)

    optim_gen = Adam(generator.parameters(), weight_decay=l2_regularization, lr=learning_rate)
    optim_disc = Adam(discriminator.parameters(), weight_decay=l2_regularization, lr=learning_rate)

    logger = Logger(output_loss_path, append=start_epoch > 0)

    saver = Saver({
        generator: output_gen_path,
        discriminator: output_disc_path
    }, logger, max_seconds_without_save)

    trainer = Trainer(train_data, test_data, generator, discriminator, optim_gen, optim_disc, batch_size,
                      variable_sizes, num_disc_steps, num_gen_steps, reconstruction_loss_weight, hint_probability)

    # initialize early stopping
    best_test_mean_loss = None
    bad_epochs = 0

    for epoch_index in range(start_epoch, num_epochs):
        # train discriminator and generator
        logger.start_timer()
        disc_losses, gen_losses = trainer.train()
        logger.log(epoch_index, num_epochs, "discriminator", "train_mean_loss", np.mean(disc_losses))
        logger.log(epoch_index, num_epochs, "generator", "train_mean_loss", np.mean(gen_losses))

        # test imputation
        logger.start_timer()
        reconstruction_losses = trainer.test()
        test_mean_loss = np.mean(reconstruction_losses)
        logger.log(epoch_index, num_epochs, "generator", "test_mean_loss", test_mean_loss)

        # check if the test loss is improving
        if best_test_mean_loss is None or test_mean_loss < best_test_mean_loss:
            best_test_mean_loss = test_mean_loss
            bad_epochs = 0

            # save models for the epoch
            saver.delayed_save(keep_parameters=True)

        # if the test loss is not improving check if early stopping should be executed
        else:
            bad_epochs += 1
            if bad_epochs >= early_stopping_patience:
                break

    saver.save(only_use_kept=True)
    logger.close()

    print("Total time: {:.2f}s".format(time.time() - start_time))
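# A hedged usage sketch for the GAN training loop above (the model constructors, data loading,
# and file names are placeholders; the actual classes and their arguments live elsewhere in this repo):
#
#   generator, discriminator = ...             # imputation generator and discriminator models
#   train_data, test_data = ...                # e.g. NoisyDataset instances built with create_noisy_dataset
#   train(generator, discriminator, train_data, test_data,
#         "models/generator.torch", "models/discriminator.torch", "loss.csv",
#         batch_size=64, num_epochs=1000, hint_probability=0.9, early_stopping_patience=100)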