def create_test_performance_estimators(self):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("reconstruction_loss")]
    performance_estimators += [LossHelper("test_loss")]
    performance_estimators += [AccuracyHelper("test_")]
    performance_estimators += [FloatHelper("weight")]
    self.test_performance_estimators = performance_estimators
    return performance_estimators
def create_training_performance_estimators(self):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("train_critic_loss")]
    performance_estimators += [FloatHelper("train_encoder_loss")]
    performance_estimators += [FloatHelper("train_accuracy")]
    performance_estimators += [FloatHelper("train_encoded_accuracy")]
    performance_estimators += [FloatHelper("ratio")]
    self.training_performance_estimators = performance_estimators
    return performance_estimators
def create_training_performance_estimators(self):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("reconstruction_loss")]
    performance_estimators += [FloatHelper("discriminator_loss")]
    performance_estimators += [FloatHelper("generator_loss")]
    performance_estimators += [FloatHelper("semisup_loss")]
    performance_estimators += [FloatHelper("weight")]
    self.training_performance_estimators = performance_estimators
    return performance_estimators
def train_autoencoder(self, epoch, performance_estimators=None):
    if performance_estimators is None:
        performance_estimators = PerformanceList()
        performance_estimators += [LossHelper("train_loss")]
        performance_estimators += [FloatHelper("train_grad_norm")]
    print('\nTraining, epoch: %d' % epoch)
    self.net.train()
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    num_batches = 0
    train_loader_subset = self.problem.train_loader_subset_range(0, self.args.num_training)
    for batch_idx, (_, data_dict) in enumerate(train_loader_subset):
        inputs = data_dict["input"].to(self.device)
        num_batches += 1
        # The autoencoder reconstructs its input, so the input also serves as target:
        inputs, targets = Variable(inputs), Variable(inputs, requires_grad=False)
        # Outputs used to calculate the loss of the supervised model
        # must be computed with the model prior to regularization:
        self.net.train()
        self.optimizer_training.zero_grad()
        outputs = self.net(inputs)
        supervised_loss = self.criterion(outputs, targets)
        optimized_loss = supervised_loss
        optimized_loss.backward()
        self.optimizer_training.step()
        performance_estimators.set_metric_with_outputs(batch_idx, "train_loss", supervised_loss.item(),
                                                       outputs, targets)
        supervised_grad_norm = grad_norm(self.net.parameters())
        performance_estimators.set_metric(batch_idx, "train_grad_norm", supervised_grad_norm)
        performance_estimators.set_metric_with_outputs(batch_idx, "optimized_loss", optimized_loss.item(),
                                                       outputs, targets)
        progress_bar(batch_idx * self.mini_batch_size, self.max_training_examples,
                     " ".join([performance_estimator.progress_message()
                               for performance_estimator in performance_estimators]))
        if (batch_idx + 1) * self.mini_batch_size > self.max_training_examples:
            break
    return performance_estimators
def train_supervised(self, epoch):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("supervised_loss")]
    performance_estimators += [AccuracyHelper("train_")]
    if self.use_cuda:
        self.tensor_cache.cuda()
    print('\nTraining, epoch: %d' % epoch)
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    train_loader_subset = self.problem.train_loader_subset_range(0, self.args.num_training)
    data_provider = MultiThreadedCpuGpuDataProvider(
        iterator=zip(train_loader_subset),
        is_cuda=self.use_cuda,
        batch_names=["training"],
        requires_grad={"training": ["sbi"]},
        volatile={"training": ["metaData"]},
        recode_functions={
            "softmaxGenotype": lambda x: recode_for_label_smoothing(x, self.epsilon),
        })
    cudnn.benchmark = False
    try:
        for batch_idx, (_, data_dict) in enumerate(data_provider):
            sbi = data_dict["training"]["sbi"]
            target_s = data_dict["training"]["softmaxGenotype"]
            metadata = data_dict["training"]["metaData"]
            self.train_one_batch(performance_estimators, batch_idx, sbi, target_s, metadata)
            if (batch_idx + 1) * self.mini_batch_size > self.max_training_examples:
                break
    finally:
        data_provider.close()
    return performance_estimators
def create_training_performance_estimators(self):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("supervised_loss")]
    performance_estimators += [AccuracyHelper("train_")]
    self.training_performance_estimators = performance_estimators
    return performance_estimators
def supervised_somatic(self, epoch, performance_estimators=None):
    if performance_estimators is None:
        performance_estimators = PerformanceList()
        performance_estimators += [LossHelper("train_loss")]
        performance_estimators += [LossHelper("classification_loss")]
        performance_estimators += [LossHelper("frequency_loss")]
        performance_estimators += [FloatHelper("train_grad_norm")]
    print('\nTraining, epoch: %d' % epoch)
    self.net.train()
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    num_batches = 0
    train_loader_subset = self.problem.train_loader_subset_range(0, self.args.num_training)
    cross_entropy_loss = CrossEntropyLoss()
    mse_loss = MSELoss()
    for batch_idx, (_, data_dict) in enumerate(train_loader_subset):
        inputs = data_dict["input"].to(self.device)
        is_mutated_base_target = data_dict["isBaseMutated"].to(self.device)
        # Transform the one-hot encoding into a class index:
        _, indices = is_mutated_base_target.max(dim=1)
        is_mutated_base_target = indices
        somatic_frequency_target = data_dict["somaticFrequency"].to(self.device)
        num_batches += 1
        # Outputs used to calculate the loss of the supervised model
        # must be computed with the model prior to regularization:
        self.optimizer_training.zero_grad()
        output_mut, output_frequency = self.net(inputs)
        classification_loss = cross_entropy_loss(output_mut, is_mutated_base_target)
        frequency_loss = mse_loss(output_frequency, somatic_frequency_target)
        optimized_loss = classification_loss + frequency_loss
        optimized_loss.backward()
        self.optimizer_training.step()
        performance_estimators.set_metric(batch_idx, "train_loss", optimized_loss.item())
        performance_estimators.set_metric(batch_idx, "classification_loss", classification_loss.item())
        performance_estimators.set_metric(batch_idx, "frequency_loss", frequency_loss.item())
        supervised_grad_norm = grad_norm(self.net.parameters())
        performance_estimators.set_metric(batch_idx, "train_grad_norm", supervised_grad_norm)
        progress_bar(batch_idx * self.mini_batch_size, self.max_training_examples,
                     performance_estimators.progress_message(["classification_loss", "frequency_loss"]))
        if (batch_idx + 1) * self.mini_batch_size > self.max_training_examples:
            break
    return performance_estimators
def train_semisup(self, epoch):
    performance_estimators = PerformanceList()
    performance_estimators += [FloatHelper("optimized_loss")]
    performance_estimators += [FloatHelper("supervised_loss")]
    performance_estimators += [FloatHelper("reconstruction_loss")]
    performance_estimators += [AccuracyHelper("train_")]
    print('\nTraining, epoch: %d' % epoch)
    self.net.train()
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    num_batches = 0
    train_loader_subset = self.problem.train_loader_subset_range(0, self.args.num_training)
    unlabeled_loader = self.problem.unlabeled_loader()
    data_provider = MultiThreadedCpuGpuDataProvider(
        iterator=zip(train_loader_subset, unlabeled_loader),
        is_cuda=self.use_cuda,
        batch_names=["training", "unlabeled"],
        requires_grad={"training": ["input"], "unlabeled": ["input"]},
        volatile={"training": ["metaData"], "unlabeled": []},
        recode_functions={"softmaxGenotype": lambda x: recode_for_label_smoothing(x, self.epsilon)})
    self.net.autoencoder.train()
    try:
        for batch_idx, (_, data_dict) in enumerate(data_provider):
            input_s = data_dict["training"]["input"]
            metadata = data_dict["training"]["metaData"]
            target_s = data_dict["training"]["softmaxGenotype"]
            input_u = data_dict["unlabeled"]["input"]
            num_batches += 1
            # We need copies of input_u and input_s to serve as reconstruction targets:
            target_u = Variable(input_u.data, requires_grad=False)
            target_output_s = Variable(input_s.data, requires_grad=False)
            # Outputs used to calculate the loss of the supervised model
            # must be computed with the model prior to regularization.
            # Zero gradients:
            self.net.zero_grad()
            self.net.autoencoder.zero_grad()
            self.optimizer_training.zero_grad()
            output_s = self.net(input_s)
            output_u = self.net.autoencoder(input_u)
            input_output_s = self.net.autoencoder(input_s)
            output_s_p = self.get_p(output_s)
            _, target_index = torch.max(target_s, dim=1)
            supervised_loss = self.criterion_classifier(output_s, target_s)
            reconstruction_loss_unsup = self.criterion_autoencoder(output_u, target_u)
            reconstruction_loss_sup = self.criterion_autoencoder(input_output_s, target_output_s)
            reconstruction_loss = self.args.gamma * reconstruction_loss_unsup + reconstruction_loss_sup
            optimized_loss = supervised_loss + reconstruction_loss
            optimized_loss.backward()
            self.optimizer_training.step()
            performance_estimators.set_metric(batch_idx, "supervised_loss", supervised_loss.item())
            performance_estimators.set_metric(batch_idx, "reconstruction_loss", reconstruction_loss.item())
            performance_estimators.set_metric(batch_idx, "optimized_loss", optimized_loss.item())
            performance_estimators.set_metric_with_outputs(batch_idx, "train_accuracy", supervised_loss.item(),
                                                           output_s_p, targets=target_index)
            progress_bar(batch_idx * self.mini_batch_size, self.max_training_examples,
                         performance_estimators.progress_message(
                             ["supervised_loss", "reconstruction_loss", "train_accuracy"]))
            if (batch_idx + 1) * self.mini_batch_size > self.max_training_examples:
                break
    finally:
        data_provider.close()
    return performance_estimators
def train_semisup_aae(self, epoch, performance_estimators=None):
    if performance_estimators is None:
        performance_estimators = PerformanceList()
        performance_estimators += [FloatHelper("reconstruction_loss")]
        performance_estimators += [FloatHelper("discriminator_loss")]
        performance_estimators += [FloatHelper("generator_loss")]
        performance_estimators += [FloatHelper("supervised_loss")]
        performance_estimators += [FloatHelper("weight")]
    print('\nTraining, epoch: %d' % epoch)
    self.net.train()
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    num_batches = 0
    train_loader_subset1 = self.problem.train_loader_subset_range(0, self.args.num_training)
    train_loader_subset2 = self.problem.train_loader_subset_range(0, self.args.num_training)
    data_provider = MultiThreadedCpuGpuDataProvider(
        iterator=zip(train_loader_subset1, train_loader_subset2),
        is_cuda=self.use_cuda,
        batch_names=["training1", "training2"],
        requires_grad={"training1": ["input"], "training2": ["input"]},
        volatile={"training1": ["metaData"], "training2": ["metaData"]},
        recode_functions={
            "softmaxGenotype": recode_for_label_smoothing,
            "input": self.normalize_inputs
        })
    indel_weight = self.args.indel_weight_factor
    snp_weight = 1.0
    try:
        for batch_idx, (_, data_dict) in enumerate(data_provider):
            input_s1 = data_dict["training1"]["input"]
            input_s2 = data_dict["training2"]["input"]
            target_s1 = data_dict["training1"]["softmaxGenotype"]
            target_s2 = data_dict["training2"]["softmaxGenotype"]
            meta_data1 = data_dict["training1"]["metaData"]
            meta_data2 = data_dict["training2"]["metaData"]
            num_batches += 1
            self.zero_grad_all_optimizers()

            # Train the reconstruction phase:
            self.net.decoder.train()
            reconstruction_loss = self.net.get_crossconstruction_loss(input_s1, input_s2, target_s2)
            reconstruction_loss.backward()
            for opt in [self.decoder_opt, self.encoder_reconstruction_opt]:
                opt.step()

            # Train the discriminators:
            self.net.encoder.train()
            self.net.discriminator_cat.train()
            self.net.discriminator_prior.train()
            self.zero_grad_all_optimizers()
            genotype_frequencies = self.class_frequencies["softmaxGenotype"]
            category_prior = (genotype_frequencies / torch.sum(genotype_frequencies)).numpy()
            discriminator_loss = self.net.get_discriminator_loss(input_s1, category_prior=category_prior)
            discriminator_loss.backward()
            for opt in [self.discriminator_cat_opt, self.discriminator_prior_opt]:
                opt.step()
            self.zero_grad_all_optimizers()

            # Train the generator:
            self.net.encoder.train()
            generator_loss = self.net.get_generator_loss(input_s1)
            generator_loss.backward()
            for opt in [self.encoder_generator_opt]:
                opt.step()
            self.zero_grad_all_optimizers()

            # Estimate the example weight, either from the latent-code density or from metadata:
            if self.use_pdf:
                self.net.encoder.train()
                _, latent_code = self.net.encoder(input_s1)
                weight = self.estimate_example_density_weight(latent_code)
            else:
                weight = self.estimate_batch_weight(meta_data1, indel_weight=indel_weight,
                                                    snp_weight=snp_weight)

            # Train the semi-supervised phase:
            self.net.encoder.train()
            supervised_loss = self.net.get_crossencoder_supervised_loss(input_s1, target_s1) * weight
            supervised_loss.backward()
            for opt in [self.encoder_semisup_opt]:
                opt.step()
            self.zero_grad_all_optimizers()

            performance_estimators.set_metric(batch_idx, "reconstruction_loss", reconstruction_loss.item())
            performance_estimators.set_metric(batch_idx, "discriminator_loss", discriminator_loss.item())
            performance_estimators.set_metric(batch_idx, "generator_loss", generator_loss.item())
            performance_estimators.set_metric(batch_idx, "supervised_loss", supervised_loss.item())
            performance_estimators.set_metric(batch_idx, "weight", weight)
            if not self.args.no_progress:
                progress_bar(batch_idx * self.mini_batch_size, self.max_training_examples,
                             performance_estimators.progress_message(
                                 ["reconstruction_loss", "discriminator_loss", "generator_loss",
                                  "supervised_loss"]))
            if (batch_idx + 1) * self.mini_batch_size > self.max_training_examples:
                break
    finally:
        data_provider.close()
    return performance_estimators
def test_semisup_aae(self, epoch, performance_estimators=None):
    print('\nTesting, epoch: %d' % epoch)
    if performance_estimators is None:
        performance_estimators = PerformanceList()
        performance_estimators += [FloatHelper("reconstruction_loss")]
        performance_estimators += [LossHelper("test_loss")]
        performance_estimators += [AccuracyHelper("test_")]
        performance_estimators += [FloatHelper("weight")]
    self.net.eval()
    for performance_estimator in performance_estimators:
        performance_estimator.init_performance_metrics()
    validation_loader_subset = self.problem.validation_loader_range(0, self.args.num_validation)
    data_provider = MultiThreadedCpuGpuDataProvider(
        iterator=zip(validation_loader_subset),
        is_cuda=self.use_cuda,
        batch_names=["validation"],
        requires_grad={"validation": []},
        volatile={"validation": ["input", "softmaxGenotype"]},
        recode_functions={"input": self.normalize_inputs})
    try:
        for batch_idx, (_, data_dict) in enumerate(data_provider):
            input_s = data_dict["validation"]["input"]
            target_s = data_dict["validation"]["softmaxGenotype"]
            # Estimate the reconstruction loss on validation examples:
            reconstruction_loss = self.net.get_crossconstruction_loss(input_s, input_s, target_s)
            # Now evaluate the prediction of categories:
            categories_predicted, latent_code = self.net.encoder(input_s)
            categories_predicted_p = self.get_p(categories_predicted)
            # Replace any NaN probabilities with zero:
            categories_predicted_p[categories_predicted_p != categories_predicted_p] = 0.0
            _, target_index = torch.max(target_s, dim=1)
            categories_loss = self.net.semisup_loss_criterion(categories_predicted, target_s)
            weight = self.estimate_example_density_weight(latent_code)
            performance_estimators.set_metric(batch_idx, "reconstruction_loss", reconstruction_loss.item())
            performance_estimators.set_metric(batch_idx, "weight", weight)
            performance_estimators.set_metric_with_outputs(batch_idx, "test_accuracy", reconstruction_loss.item(),
                                                           categories_predicted_p, target_index)
            performance_estimators.set_metric_with_outputs(batch_idx, "test_loss", categories_loss.item() * weight,
                                                           categories_predicted_p, target_s)
            if not self.args.no_progress:
                progress_bar(batch_idx * self.mini_batch_size, self.max_validation_examples,
                             performance_estimators.progress_message(
                                 ["test_loss", "test_accuracy", "reconstruction_loss"]))
            if (batch_idx + 1) * self.mini_batch_size > self.max_validation_examples:
                break
    finally:
        data_provider.close()
    # Apply the learning-rate schedules:
    test_metric = performance_estimators.get_metric(self.get_test_metric_name())
    assert test_metric is not None, \
        self.get_test_metric_name() + " must be found among the estimated performance metrics"
    if not self.args.constant_learning_rates:
        for scheduler in self.schedulers:
            scheduler.step(test_metric, epoch)
    # Run the garbage collector to try to release memory we no longer need:
    import gc
    gc.collect()
    return performance_estimators
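
# --- Hypothetical usage sketch (not part of the original trainer code) ---
# The epoch-level methods above each return the PerformanceList they filled,
# so a driver loop can alternate training and validation and read back metrics.
# "trainer" and "num_epochs" are illustrative assumptions; only the method
# signatures train_semisup_aae(epoch) and test_semisup_aae(epoch) come from
# the code above, and test_semisup_aae already steps the learning-rate schedulers.
def run_semisup_aae_training(trainer, num_epochs):
    for epoch in range(num_epochs):
        train_perfs = trainer.train_semisup_aae(epoch)
        test_perfs = trainer.test_semisup_aae(epoch)
        # Inspect the configured test metric for logging or early stopping:
        test_metric = test_perfs.get_metric(trainer.get_test_metric_name())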