def test_adda(self, epoch):
        """Evaluate the network on the validation loader for one epoch.

        Every batch yielded by the data provider is forwarded to
        ``self.test_one_batch``; iteration stops once the number of examples
        seen exceeds ``self.max_validation_examples``.

        :param epoch: epoch number, only used for the console banner.
        :return: the estimators obtained from
            ``self.create_test_performance_estimators()``.
        """
        print('\nTesting, epoch: %d' % epoch)
        estimators = self.create_test_performance_estimators()
        for estimator in estimators:
            estimator.init_performance_metrics()

        self.reset_before_test_epoch()
        loader = self.problem.validation_loader_range(0,
                                                      self.args.num_validation)
        provider_options = dict(
            iterator=zip(loader),
            device=self.device,
            batch_names=["validation"],
            requires_grad={"validation": []},
            vectors_to_keep=["input", "softmaxGenotype"],
        )
        provider = MultiThreadedCpuGpuDataProvider(**provider_options)
        try:
            for step, (_, batches) in enumerate(provider):
                inputs = batches["validation"]["input"]
                targets = batches["validation"]["softmaxGenotype"]
                # Evaluation mode is (re)asserted before every batch.
                self.net.eval()

                self.test_one_batch(estimators, step, inputs, targets)

                examples_seen = (step + 1) * self.mini_batch_size
                if examples_seen > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            provider.close()

        self.compute_after_test_epoch()

        return estimators
    def train_supervised_mixup(self, epoch):
        """Run one training epoch over pairs of training batches.

        Two loaders built over the same training range are zipped, so every
        step receives a "training_1" and a "training_2" batch. Inputs, targets
        and metadata of both batches are handed to ``self.train_one_batch``.
        Iteration stops once the number of examples seen exceeds
        ``self.max_training_examples``.

        :param epoch: epoch number, only used for the console banner.
        :return: the estimators obtained from
            ``self.create_training_performance_estimators()``.
        """
        performance_estimators = self.create_training_performance_estimators()

        print('\nTraining, epoch: %d' % epoch)

        self.net.train()

        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()

        train_loader_subset_1 = self.problem.train_loader_subset_range(
            0, self.args.num_training)
        train_loader_subset_2 = self.problem.train_loader_subset_range(
            0, self.args.num_training)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset_1, train_loader_subset_2),
            is_cuda=self.use_cuda,
            batch_names=["training_1", "training_2"],
            requires_grad={
                "training_1": ["input"],
                "training_2": ["input"]
            },
            volatile={
                "training_1": ["metaData"],
                "training_2": ["metaData"]
            },
            recode_functions={
                # Targets are label-smoothed with the trainer's epsilon.
                "softmaxGenotype":
                lambda x: recode_for_label_smoothing(x, self.epsilon),
                "input":
                self.normalize_inputs
            })

        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s_1 = data_dict["training_1"]["input"]
                target_s_1 = data_dict["training_1"]["softmaxGenotype"]
                input_s_2 = data_dict["training_2"]["input"]
                target_s_2 = data_dict["training_2"]["softmaxGenotype"]
                metadata_1 = data_dict["training_1"]["metaData"]
                metadata_2 = data_dict["training_2"]["metaData"]

                self.train_one_batch(performance_estimators, batch_idx,
                                     input_s_1, input_s_2, target_s_1,
                                     target_s_2, metadata_1, metadata_2)

                if (batch_idx +
                        1) * self.mini_batch_size > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()

        return performance_estimators
Пример #3
0
    def train_adda(self, epoch):
        """Run one ADDA training epoch over labeled and unlabeled batches.

        Labeled training batches are zipped with batches from the unlabeled
        loader and only their "input" vectors are passed to
        ``self.train_one_batch``. After the epoch the estimators are stored in
        ``self.training_perfs`` and, unless constant learning rates were
        requested, ``self.scheduler_train`` is stepped with the metric named by
        ``self.get_test_metric_name()``.

        :param epoch: epoch number, used for the banner and the scheduler step.
        :return: the estimators obtained from
            ``self.create_training_performance_estimators()``.
        :raises AssertionError: when the scheduler metric is not available.
        """
        performance_estimators = self.create_training_performance_estimators()
        self.reset_before_train_epoch()
        print('\nTraining, epoch: %d' % epoch)

        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()

        # Use the entire training set to draw examples, even if num_training
        # is limiting the length of an epoch.
        train_loader_subset = self.problem.train_loader_subset_range(
            0, len(self.problem.train_set()))
        unlabeled_loader_subset = self.problem.unlabeled_loader()
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset, unlabeled_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["training", "unlabeled"],
            requires_grad={
                "training": ["input"],
                "unlabeled": ["input"]
            },
            volatile={
                "training": ["metaData"],
                "unlabeled": ["metaData"]
            },
        )

        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s_1 = data_dict["training"]["input"]
                input_u_2 = data_dict["unlabeled"]["input"]

                # allow some epochs of pre-training the critic without
                # training the encoder (currently always enabled):
                self.do_train_encoder = True  #epoch>2

                self.train_one_batch(performance_estimators, batch_idx,
                                     input_s_1, input_u_2)

                if (batch_idx +
                        1) * self.mini_batch_size > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()
        self.training_perfs = performance_estimators
        # Apply learning rate schedule:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            self.scheduler_train.step(test_metric, epoch)
        return performance_estimators
Пример #4
0
    def test_semi_sup(self, epoch):
        """Evaluate classifier and autoencoder on the validation loader.

        For every validation batch the supervised loss (classifier output vs.
        target) and the reconstruction loss (autoencoder output vs. its own
        input) are recorded, together with the test accuracy. Unless constant
        learning rates were requested, ``self.scheduler_train`` is stepped with
        the metric named by ``self.get_test_metric_name()``.

        :param epoch: epoch number, used for the banner and the scheduler step.
        :return: a ``PerformanceList`` holding the test estimators.
        :raises AssertionError: when the scheduler metric is not available.
        """
        print('\nTesting, epoch: %d' % epoch)

        performance_estimators = PerformanceList()
        performance_estimators += [LossHelper("test_supervised_loss")]
        performance_estimators += [LossHelper("test_reconstruction_loss")]
        performance_estimators += [AccuracyHelper("test_")]

        self.net.eval()
        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()
        validation_loader_subset = self.problem.validation_loader_range(
            0, self.args.num_validation)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(validation_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["validation"],
            requires_grad={"validation": []},
            volatile={"validation": ["input", "softmaxGenotype"]})
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s = data_dict["validation"]["input"]
                target_s = data_dict["validation"]["softmaxGenotype"]
                # we need copies of the same tensors: the autoencoder is fed
                # the input and is also scored against it.
                input_u = Variable(input_s.data, volatile=True)
                target_u = Variable(input_s.data, volatile=True)

                output_s = self.net(input_s)
                output_u = self.net.autoencoder(input_u)
                output_s_p = self.get_p(output_s)

                # Index of the largest target entry along dim 1.
                _, target_index = torch.max(target_s, dim=1)

                supervised_loss = self.criterion_classifier(output_s, target_s)
                reconstruction_loss = self.criterion_autoencoder(
                    output_u, target_u)

                performance_estimators.set_metric(
                    batch_idx, "test_supervised_loss", supervised_loss.data[0])
                performance_estimators.set_metric(
                    batch_idx, "test_reconstruction_loss",
                    reconstruction_loss.data[0])
                performance_estimators.set_metric_with_outputs(
                    batch_idx, "test_accuracy", supervised_loss.data[0],
                    output_s_p, targets=target_index)

                progress_bar(
                    batch_idx * self.mini_batch_size,
                    self.max_validation_examples,
                    performance_estimators.progress_message([
                        "test_supervised_loss", "test_reconstruction_loss",
                        "test_accuracy"
                    ]))

                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()
        # Apply learning rate schedule:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            self.scheduler_train.step(test_metric, epoch)
        return performance_estimators
Пример #5
0
    def test_semisupervised_mixup(self, epoch):
        """Evaluate the network on the validation loader for one epoch.

        Each validation batch is passed to ``self.test_one_batch``. Afterwards,
        unless constant learning rates were requested,
        ``self.scheduler_train`` is stepped with the metric named by
        ``self.get_test_metric_name()``. ``self.best_model`` is initialised to
        the current network when it has not been set yet.

        :param epoch: epoch number, used for the banner and the scheduler step.
        :return: the estimators obtained from
            ``self.create_test_performance_estimators()``.
        :raises AssertionError: when the scheduler metric is not available.
        """
        print('\nTesting, epoch: %d' % epoch)
        # NOTE(review): errors is never reassigned and test_one_batch receives
        # errors=None, so the per-class report below always prints None.
        errors = None
        performance_estimators = self.create_test_performance_estimators()
        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()
        validation_loader_subset = self.problem.validation_loader_range(
            0, self.args.num_validation)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(validation_loader_subset),
            device=self.device,
            batch_names=["validation"],
            requires_grad={"validation": []},
            recode_functions={"input": self.normalize_inputs},
            vectors_to_keep=["softmaxGenotype"])
        if self.best_model is None:
            self.best_model = self.net
        self.reset_before_test_epoch()

        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s = data_dict["validation"]["input"]
                target_s = data_dict["validation"]["softmaxGenotype"]
                self.net.eval()

                self.test_one_batch(performance_estimators,
                                    batch_idx,
                                    input_s,
                                    target_s,
                                    errors=None)

                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()
        print("test errors by class: ", str(errors))
        # Apply learning rate schedule:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            self.scheduler_train.step(test_metric, epoch)

        self.compute_after_test_epoch()

        return performance_estimators
Пример #6
0
    def test_semisup_aae(self, epoch, performance_estimators=None):
        """Evaluate the network on the validation loader for one epoch.

        Each validation batch (input, target and metadata) is passed to
        ``self.test_one_batch``. Afterwards, unless constant learning rates
        were requested, every scheduler in ``self.schedulers`` is stepped with
        the metric named by ``self.get_test_metric_name()``.

        :param epoch: epoch number, used for the banner and scheduler steps.
        :param performance_estimators: estimators to fill; when None they are
            created with ``self.create_test_performance_estimators()``.
        :return: the performance estimators that were filled.
        :raises AssertionError: when the scheduler metric is not available.
        """
        print('\nTesting, epoch: %d' % epoch)
        if performance_estimators is None:
            performance_estimators = self.create_test_performance_estimators()

        self.net.eval()

        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()
        validation_loader_subset = self.problem.validation_loader_range(
            0, self.args.num_validation)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(validation_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["validation"],
            requires_grad={"validation": []},
            volatile={
                "validation": ["input", "softmaxGenotype"],
            },
            recode_functions={"input": self.normalize_inputs})
        self.reset_before_test_epoch()
        # NOTE(review): errors is never reassigned here, so test_one_batch
        # always receives None for it.
        errors = None
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s = data_dict["validation"]["input"]
                target_s = data_dict["validation"]["softmaxGenotype"]
                meta_data = data_dict["validation"]["metaData"]
                self.test_one_batch(performance_estimators, batch_idx, input_s,
                                    target_s, meta_data, errors)
                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()
        # Apply learning rate schedules:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            for scheduler in self.schedulers:
                scheduler.step(test_metric, epoch)
        self.compute_after_test_epoch()
        return performance_estimators
    def predict(self, iterator, output_filename, max_examples=sys.maxsize):
        """Run the model over an iterator and write its outputs to disk.

        Batches are read through a data provider chosen by
        ``self.processing_type`` ("multithreaded" or "sequential"), passed
        through ``self.model`` and appended to a ``VectorWriterBinary`` opened
        on ``output_filename``. Iteration stops once the number of examples
        seen exceeds ``max_examples``.

        :param iterator: source of batches; it is wrapped in ``zip`` and given
            to the data provider under the batch name "unlabeled".
        :param output_filename: path with basename handed to the writer.
        :param max_examples: upper bound on the number of examples to predict.
        :raises Exception: when ``self.processing_type`` is not recognized.
        """
        self.model.eval()
        if self.processing_type == "multithreaded":
            # Enable fake_GPU_on_CPU to debug on CPU
            data_provider = MultiThreadedCpuGpuDataProvider(
                iterator=zip(iterator),
                is_cuda=self.use_cuda,
                batch_names=["unlabeled"],
                volatile={"unlabeled": [self.input_name]},
                recode_functions=self.recode_fn,
                fake_gpu_on_cpu=False)

        elif self.processing_type == "sequential":
            data_provider = DataProvider(
                iterator=zip(iterator),
                is_cuda=self.use_cuda,
                batch_names=["unlabeled"],
                volatile={"unlabeled": [self.input_name]},
                recode_functions=self.recode_fn)
        else:
            raise Exception("Unrecognized processing type {}".format(
                self.processing_type))

        try:
            with VectorWriterBinary(sample_id=0,
                                    path_with_basename=output_filename,
                                    tensor_names=self.problem.get_output_names(),
                                    domain_descriptor=self.domain_descriptor,
                                    feature_mapper=self.feature_mapper,
                                    samples=self.samples,
                                    input_files=self.input_files,
                                    problem=self.problem,
                                    model=self.model) as writer:
                for batch_idx, (indices_dict,
                                data_dict) in enumerate(data_provider):
                    input_u = data_dict["unlabeled"][self.input_name]
                    idxs_u = indices_dict["unlabeled"]
                    outputs = self.model(input_u)
                    writer.append(list(idxs_u), outputs, inverse_logit=True)
                    progress_bar(batch_idx * self.mini_batch_size,
                                 max_examples)

                    if ((batch_idx + 1) * self.mini_batch_size) > max_examples:
                        break
        finally:
            # Release the provider even when the writer or the model raises.
            data_provider.close()
        print("Done")
Пример #8
0
    def test_supervised(self, epoch):
        """Evaluate the network on the validation loader for one epoch.

        Each validation batch ("sbi" vector and target) is passed to
        ``self.test_one_batch``. Afterwards, unless constant learning rates
        were requested, ``self.scheduler_train`` is stepped with the metric
        named by ``self.get_test_metric_name()``.

        :param epoch: epoch number, used for the banner and the scheduler step.
        :return: the estimators obtained from
            ``self.create_test_performance_estimators()``.
        :raises AssertionError: when the scheduler metric is not available.
        """
        print('\nTesting, epoch: %d' % epoch)
        # NOTE(review): errors is never reassigned in this method, so it is
        # still None when printed and when passed to reweight_by_val_errors;
        # confirm whether test_one_batch was meant to return it.
        errors = None
        performance_estimators = self.create_test_performance_estimators()
        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()
        validation_loader_subset = self.problem.validation_loader_range(
            0, self.args.num_validation)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(validation_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["validation"],
            requires_grad={"validation": []},
            volatile={"validation": ["sbi", "softmaxGenotype"]},
        )
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                sbi = data_dict["validation"]["sbi"]
                target_s = data_dict["validation"]["softmaxGenotype"]
                self.net.eval()
                self.test_one_batch(performance_estimators,
                                    batch_idx,
                                    sbi,
                                    target_s,
                                    errors=errors)

                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()
        print("test errors by class: ", str(errors))
        if self.reweight_by_validation_error:
            self.reweight_by_val_errors(errors)
        # Apply learning rate schedule:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            self.scheduler_train.step(test_metric, epoch)
        return performance_estimators
Пример #9
0
    def train_supervised(self, epoch):
        """Run one supervised training epoch over the training loader.

        Each batch ("sbi" vector, target and metadata) is passed to
        ``self.train_one_batch``. Iteration stops once the number of examples
        seen exceeds ``self.max_training_examples``.

        :param epoch: epoch number, only used for the console banner.
        :return: a ``PerformanceList`` with the "supervised_loss" and
            "train_" accuracy estimators.
        """
        performance_estimators = PerformanceList()
        performance_estimators += [FloatHelper("supervised_loss")]
        performance_estimators += [AccuracyHelper("train_")]
        if self.use_cuda:
            self.tensor_cache.cuda()
        print('\nTraining, epoch: %d' % epoch)

        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()

        train_loader_subset = self.problem.train_loader_subset_range(
            0, self.args.num_training)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["training"],
            requires_grad={"training": ["sbi"]},
            volatile={"training": ["metaData"]},
            recode_functions={
                # Targets are label-smoothed with the trainer's epsilon.
                "softmaxGenotype":
                lambda x: recode_for_label_smoothing(x, self.epsilon),
            })
        cudnn.benchmark = False
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                sbi = data_dict["training"]["sbi"]
                target_s = data_dict["training"]["softmaxGenotype"]
                metadata = data_dict["training"]["metaData"]

                self.train_one_batch(performance_estimators, batch_idx, sbi,
                                     target_s, metadata)
                if (batch_idx +
                        1) * self.mini_batch_size > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()

        return performance_estimators
    def class_frequency(self,
                        recode_as_multi_label=False,
                        class_frequencies=None):
        """
        Estimate class frequencies for the output vectors of the problem and rebuild criterions
        with weights that correct class imbalances.

        When ``class_frequencies`` is None, frequencies are counted from the
        training loader: for every output name a counter vector starting at
        one is incremented once for each non-zero target entry. When
        frequencies are supplied they are stored in ``self.class_frequencies``
        and used as is. In both cases each output's criterion is rebuilt with
        weights ``1 - f / sum(|f|)``.

        NOTE(review): frequencies estimated here are returned but not stored
        in ``self.class_frequencies``; confirm that callers store them.

        :param recode_as_multi_label: not used by this method.
        :param class_frequencies: optional dict mapping output name to a
            frequency tensor; estimation is skipped when provided.
        :return: dict mapping output name to its frequency tensor.
        """
        if class_frequencies is None:

            train_loader_subset = self.problem.train_loader_subset_range(
                0,
                min(self.args.num_estimate_class_frequencies, 100000,
                    self.args.num_training))
            data_provider = MultiThreadedCpuGpuDataProvider(
                iterator=zip(train_loader_subset),
                is_cuda=False,
                batch_names=["training"],
                volatile={"training": self.problem.get_vector_names()},
            )

            class_frequencies = {}
            try:
                for batch_idx, (_, data_dict) in enumerate(data_provider):
                    if batch_idx * self.mini_batch_size > self.args.num_estimate_class_frequencies:
                        break
                    for output_name in self.problem.get_output_names():
                        target_s = data_dict["training"][output_name]
                        if output_name not in class_frequencies:
                            # Counters start at one, not zero.
                            class_frequencies[output_name] = torch.ones(
                                target_s[0].size())
                        cf = class_frequencies[output_name]
                        indices = torch.nonzero(target_s.data)
                        # Keep the second coordinate of each non-zero entry
                        # (assumes 2-D targets, batch x class -- TODO confirm).
                        indices = indices[:, 1]
                        # Incremented one at a time so that repeated class
                        # indices accumulate.
                        for index in range(indices.size(0)):
                            cf[indices[index]] += 1

                    progress_bar(batch_idx * self.mini_batch_size,
                                 self.max_training_examples,
                                 "Class frequencies")
            finally:
                # Release the provider even when counting fails or stops early.
                data_provider.close()
        else:
            self.class_frequencies = class_frequencies
        for output_name in self.problem.get_output_names():

            class_frequencies_output = class_frequencies[output_name]
            # normalize with 1-f, where f is normalized frequency vector:
            weights = torch.ones(class_frequencies_output.size())
            weights -= class_frequencies_output / torch.norm(
                class_frequencies_output, p=1, dim=0)
            if self.use_cuda:
                weights = weights.cuda()

            self.rebuild_criterions(output_name=output_name, weights=weights)
        return class_frequencies
Пример #11
0
    def train_semisup(self, epoch):
        """Run one semi-supervised training epoch (classifier + autoencoder).

        Labeled training batches are zipped with unlabeled batches. For each
        step the optimized loss is the supervised classifier loss plus a
        reconstruction loss made of ``self.args.gamma`` times the unlabeled
        reconstruction loss plus the labeled reconstruction loss. One step of
        ``self.optimizer_training`` is taken per batch.

        :param epoch: epoch number, only used for the console banner.
        :return: a ``PerformanceList`` with the loss and accuracy estimators.
        """
        performance_estimators = PerformanceList()
        performance_estimators += [FloatHelper("optimized_loss")]
        performance_estimators += [FloatHelper("supervised_loss")]
        performance_estimators += [FloatHelper("reconstruction_loss")]
        performance_estimators += [AccuracyHelper("train_")]

        print('\nTraining, epoch: %d' % epoch)

        self.net.train()

        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()

        train_loader_subset = self.problem.train_loader_subset_range(
            0, self.args.num_training)
        unlabeled_loader = self.problem.unlabeled_loader()
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset, unlabeled_loader),
            is_cuda=self.use_cuda,
            batch_names=["training", "unlabeled"],
            requires_grad={"training": ["input"], "unlabeled": ["input"]},
            volatile={"training": ["metaData"], "unlabeled": []},
            recode_functions={
                # Targets are label-smoothed with the trainer's epsilon.
                "softmaxGenotype":
                lambda x: recode_for_label_smoothing(x, self.epsilon)
            })
        self.net.autoencoder.train()
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s = data_dict["training"]["input"]
                metadata = data_dict["training"]["metaData"]
                target_s = data_dict["training"]["softmaxGenotype"]
                input_u = data_dict["unlabeled"]["input"]

                # need a copy of input_u and input_s as output: the
                # autoencoder is scored against its own inputs.
                target_u = Variable(input_u.data, requires_grad=False)
                target_output_s = Variable(input_s.data, requires_grad=False)

                # Zero gradients:
                self.net.zero_grad()
                self.net.autoencoder.zero_grad()
                self.optimizer_training.zero_grad()

                output_s = self.net(input_s)
                output_u = self.net.autoencoder(input_u)
                input_output_s = self.net.autoencoder(input_s)
                output_s_p = self.get_p(output_s)

                # Index of the largest target entry along dim 1.
                _, target_index = torch.max(target_s, dim=1)
                supervised_loss = self.criterion_classifier(output_s, target_s)
                reconstruction_loss_unsup = self.criterion_autoencoder(
                    output_u, target_u)
                reconstruction_loss_sup = self.criterion_autoencoder(
                    input_output_s, target_output_s)
                # gamma scales only the unlabeled term.
                reconstruction_loss = (
                    self.args.gamma * reconstruction_loss_unsup
                ) + reconstruction_loss_sup
                optimized_loss = supervised_loss + reconstruction_loss
                optimized_loss.backward()
                self.optimizer_training.step()
                performance_estimators.set_metric(
                    batch_idx, "supervised_loss", supervised_loss.data[0])
                performance_estimators.set_metric(
                    batch_idx, "reconstruction_loss",
                    reconstruction_loss.data[0])
                performance_estimators.set_metric(
                    batch_idx, "optimized_loss", optimized_loss.data[0])
                performance_estimators.set_metric_with_outputs(
                    batch_idx, "train_accuracy", supervised_loss.data[0],
                    output_s_p, targets=target_index)

                progress_bar(
                    batch_idx * self.mini_batch_size,
                    self.max_training_examples,
                    performance_estimators.progress_message([
                        "supervised_loss", "reconstruction_loss",
                        "train_accuracy"
                    ]))

                if (batch_idx +
                        1) * self.mini_batch_size > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()

        return performance_estimators
Пример #12
0
                sum_sdm_n += torch.sum(torch.pow(x - mean.expand_as(x), 2),
                                       dim=0)

    datasets = [
        problem.train_set(),
        problem.validation_set(),
        problem.unlabeled_set(),
        problem.test_set()
    ]
    # First, get the sum of all of the example vectors
    for index, dataset in enumerate(datasets):
        train_loader_subset = problem.loader_for_dataset(dataset, shuffle=True)
        print("Summing dataset {}/{}".format(index + 1, len(datasets)))
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset),
            is_cuda=False,
            batch_names=["dataset"],
            volatile={"dataset": [args.vector_name]},
            recode_functions={args.vector_name: add_to_sum})
        batch_index = 0
        for example in data_provider:
            batch_index += 1
            if batch_index * args.mini_batch_size > args.n:
                break
        data_provider.close()
    # Calculate the mean
    print("Calculating the mean")
    mean = sum_n / n
    # Calculate the sum of squared deviations from means (sum_sdm)
    for index, dataset in enumerate(datasets):
        train_loader_subset = problem.loader_for_dataset(dataset, shuffle=True)
        print("Calculating sum of squared deviations for dataset {}/{}".format(
Пример #13
0
    def train_semisup_aae(self, epoch, performance_estimators=None):
        """Run one training epoch of the semi-supervised adversarial autoencoder.

        Two loaders over the same training range are zipped. Every step runs
        four phases, each followed by the matching optimizer step(s):
        reconstruction (decoder and encoder-reconstruction optimizers),
        discriminators (categorical and prior discriminator optimizers),
        generator (encoder-generator optimizer) and the weighted supervised
        loss (encoder semi-supervised optimizer).

        :param epoch: epoch number, only used for the console banner.
        :param performance_estimators: estimators to fill; when None a default
            ``PerformanceList`` is created.
        :return: the performance estimators that were filled.
        """
        if performance_estimators is None:
            performance_estimators = PerformanceList()
            performance_estimators += [FloatHelper("reconstruction_loss")]
            performance_estimators += [FloatHelper("discriminator_loss")]
            performance_estimators += [FloatHelper("generator_loss")]
            performance_estimators += [FloatHelper("supervised_loss")]
            performance_estimators += [FloatHelper("weight")]
        # Announce the epoch whether or not estimators were supplied.
        print('\nTraining, epoch: %d' % epoch)
        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()

        self.net.train()

        train_loader_subset1 = self.problem.train_loader_subset_range(
            0, self.args.num_training)
        train_loader_subset2 = self.problem.train_loader_subset_range(
            0, self.args.num_training)

        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset1, train_loader_subset2),
            is_cuda=self.use_cuda,
            batch_names=["training1", "training2"],
            requires_grad={
                "training1": ["input"],
                "training2": ["input"]
            },
            volatile={
                "training1": ["metaData"],
                "training2": ["metaData"]
            },
            recode_functions={
                # NOTE(review): called with the vector only, so label
                # smoothing relies on the helper's own epsilon default.
                "softmaxGenotype": recode_for_label_smoothing,
                "input": self.normalize_inputs
            })

        indel_weight = self.args.indel_weight_factor
        snp_weight = 1.0

        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s1 = data_dict["training1"]["input"]
                input_s2 = data_dict["training2"]["input"]
                target_s1 = data_dict["training1"]["softmaxGenotype"]
                target_s2 = data_dict["training2"]["softmaxGenotype"]

                meta_data1 = data_dict["training1"]["metaData"]
                self.zero_grad_all_optimizers()

                # Train reconstruction phase:
                self.net.decoder.train()
                reconstruction_loss = self.net.get_crossconstruction_loss(
                    input_s1, input_s2, target_s2)
                reconstruction_loss.backward()
                for opt in [self.decoder_opt, self.encoder_reconstruction_opt]:
                    opt.step()

                # Train discriminators:
                self.net.encoder.train()
                self.net.discriminator_cat.train()
                self.net.discriminator_prior.train()
                self.zero_grad_all_optimizers()
                genotype_frequencies = self.class_frequencies[
                    "softmaxGenotype"]
                # Class frequencies normalized to sum to one.
                category_prior = (genotype_frequencies /
                                  torch.sum(genotype_frequencies)).numpy()
                discriminator_loss = self.net.get_discriminator_loss(
                    input_s1, category_prior=category_prior)
                discriminator_loss.backward()
                for opt in [
                        self.discriminator_cat_opt,
                        self.discriminator_prior_opt
                ]:
                    opt.step()
                self.zero_grad_all_optimizers()

                # Train generator:
                self.net.encoder.train()
                generator_loss = self.net.get_generator_loss(input_s1)
                generator_loss.backward()
                for opt in [self.encoder_generator_opt]:
                    opt.step()
                self.zero_grad_all_optimizers()

                # Weight for the supervised loss: from the latent code when
                # use_pdf is set, otherwise from the batch metadata.
                if self.use_pdf:
                    self.net.encoder.train()
                    _, latent_code = self.net.encoder(input_s1)
                    weight = self.estimate_example_density_weight(latent_code)
                else:
                    weight = self.estimate_batch_weight(
                        meta_data1,
                        indel_weight=indel_weight,
                        snp_weight=snp_weight)
                self.net.encoder.train()
                supervised_loss = self.net.get_crossencoder_supervised_loss(
                    input_s1, target_s1) * weight
                supervised_loss.backward()

                for opt in [self.encoder_semisup_opt]:
                    opt.step()
                self.zero_grad_all_optimizers()

                performance_estimators.set_metric(batch_idx,
                                                  "reconstruction_loss",
                                                  reconstruction_loss.data[0])
                performance_estimators.set_metric(batch_idx,
                                                  "discriminator_loss",
                                                  discriminator_loss.data[0])
                performance_estimators.set_metric(batch_idx, "generator_loss",
                                                  generator_loss.data[0])
                performance_estimators.set_metric(batch_idx, "supervised_loss",
                                                  supervised_loss.data[0])
                performance_estimators.set_metric(batch_idx, "weight", weight)
                if not self.args.no_progress:
                    # Report the metrics actually recorded above.
                    progress_bar(
                        batch_idx * self.mini_batch_size,
                        self.max_training_examples,
                        performance_estimators.progress_message([
                            "reconstruction_loss", "discriminator_loss",
                            "generator_loss", "supervised_loss"
                        ]))
                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even when a batch fails.
            data_provider.close()

        return performance_estimators
Пример #14
0
    def test_semisup_aae(self, epoch, performance_estimators=None):
        """Evaluate the network on validation batches and step the LR schedulers.

        For each validation batch this records the reconstruction loss, the
        example-density weight, the category accuracy and the weighted
        category loss in ``performance_estimators``.  After the loop, the
        metric named by ``self.get_test_metric_name()`` is passed to every
        scheduler in ``self.schedulers`` unless
        ``self.args.constant_learning_rates`` is set.

        Args:
            epoch: Epoch number; printed and forwarded to the schedulers.
            performance_estimators: Optional estimator list to fill.  When
                ``None``, a fresh ``PerformanceList`` holding
                "reconstruction_loss", "test_loss", "test_" accuracy and
                "weight" helpers is created.

        Returns:
            The performance estimators populated during this epoch.

        Raises:
            AssertionError: If the test metric is not among the estimators.
        """
        print('\nTesting, epoch: %d' % epoch)
        if performance_estimators is None:
            performance_estimators = PerformanceList()
            performance_estimators += [FloatHelper("reconstruction_loss")]
            performance_estimators += [LossHelper("test_loss")]
            performance_estimators += [AccuracyHelper("test_")]
            performance_estimators += [FloatHelper("weight")]

        # Evaluation mode is set once; nothing below switches it back.
        self.net.eval()
        for performance_estimator in performance_estimators:
            performance_estimator.init_performance_metrics()
        validation_loader_subset = self.problem.validation_loader_range(
            0, self.args.num_validation)
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(validation_loader_subset),
            is_cuda=self.use_cuda,
            batch_names=["validation"],
            requires_grad={"validation": []},
            volatile={
                "validation": ["input", "softmaxGenotype"],
            },
            recode_functions={"input": self.normalize_inputs})
        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                input_s = data_dict["validation"]["input"]
                target_s = data_dict["validation"]["softmaxGenotype"]

                # Estimate the reconstruction loss on validation examples:
                reconstruction_loss = self.net.get_crossconstruction_loss(
                    input_s, input_s, target_s)

                # Now evaluate prediction of categories:
                categories_predicted, latent_code = self.net.encoder(input_s)

                categories_predicted_p = self.get_p(categories_predicted)
                # NaN is the only value that differs from itself, so this
                # mask zeroes out NaN probabilities.
                categories_predicted_p[
                    categories_predicted_p != categories_predicted_p] = 0.0
                _, target_index = torch.max(target_s, dim=1)
                categories_loss = self.net.semisup_loss_criterion(
                    categories_predicted, target_s)

                weight = self.estimate_example_density_weight(latent_code)
                # NOTE(review): `.data[0]` matches the older tensor API used
                # throughout this method -- confirm before porting to `.item()`.
                performance_estimators.set_metric(batch_idx,
                                                  "reconstruction_loss",
                                                  reconstruction_loss.data[0])
                performance_estimators.set_metric(batch_idx, "weight", weight)
                # NOTE(review): the loss value handed to "test_accuracy" is the
                # reconstruction loss, not the category loss -- confirm that
                # the accuracy helper ignores it.
                performance_estimators.set_metric_with_outputs(
                    batch_idx, "test_accuracy", reconstruction_loss.data[0],
                    categories_predicted_p, target_index)
                performance_estimators.set_metric_with_outputs(
                    batch_idx, "test_loss", categories_loss.data[0] * weight,
                    categories_predicted_p, target_s)

                if not self.args.no_progress:
                    progress_bar(
                        batch_idx * self.mini_batch_size,
                        self.max_validation_examples,
                        performance_estimators.progress_message([
                            "test_loss", "test_accuracy", "reconstruction_loss"
                        ]))

                # Stop once the configured number of validation examples
                # has been exceeded.
                if ((batch_idx + 1) *
                        self.mini_batch_size) > self.max_validation_examples:
                    break
        finally:
            # Always release the provider, even if a batch fails.
            data_provider.close()
        # Apply learning rate schedules:
        test_metric = performance_estimators.get_metric(
            self.get_test_metric_name())
        # The leading space keeps the metric name and the rest of the message
        # from running together.
        assert test_metric is not None, (
            self.get_test_metric_name() +
            " must be found among estimated performance metrics")
        if not self.args.constant_learning_rates:
            for scheduler in self.schedulers:
                scheduler.step(test_metric, epoch)
        # Run the garbage collector to try to release memory we no longer need:
        import gc
        gc.collect()
        return performance_estimators
Пример #15
0
    def do_training(epoch, thread_executor):
        """Run one training epoch, handing every batch to all trainers.

        The data provider is selected from ``args.mode``
        ("supervised_direct", "supervised_mixup" or "semisupervised").  For
        each batch, one task per entry of ``trainers`` is submitted to
        ``thread_executor``; the loop waits for all tasks of the batch and
        then calls ``raise_to_do_exceptions`` on their futures before moving
        on to the next batch.

        Args:
            epoch: Epoch number, used only in the progress message.
            thread_executor: Executor providing ``submit``; the returned
                futures are awaited with ``concurrent.futures.wait``.

        Raises:
            ValueError: If ``args.mode`` is not one of the supported modes.
        """
        print('Training, epoch: %d' % epoch)
        for model_trainer in trainers:
            model_trainer.training_performance_estimators.init_performance_metrics()

        mode = args.mode
        train_loaders = []
        if mode == "supervised_direct":
            train_loader_subset = problem.train_loader_subset_range(0, args.num_training)
            data_provider = DataProvider(
                iterator=zip(train_loader_subset),
                device=device,
                batch_names=["training"],
                requires_grad={"training": ["input"]},
                vectors_to_keep=["metaData", "softmaxGenotype"]
            )
            train_loaders += [train_loader_subset]
        elif mode == "supervised_mixup":
            train_loader_subset1 = problem.train_loader_subset_range(0, args.num_training)
            train_loader_subset2 = problem.train_loader_subset_range(0, args.num_training)
            data_provider = DataProvider(
                iterator=zip(train_loader_subset1, train_loader_subset2),
                device=device,
                batch_names=["training_1", "training_2"],
                requires_grad={"training_1": ["input"], "training_2": ["input"]},
                vectors_to_keep=["metaData", "softmaxGenotype"]
            )
            train_loaders += [train_loader_subset1, train_loader_subset2]
        elif mode == "semisupervised":
            train_loader_subset = problem.train_loader_subset_range(0, args.num_training)
            unlabeled_loader = problem.unlabeled_loader()
            data_provider = MultiThreadedCpuGpuDataProvider(
                iterator=zip(train_loader_subset, unlabeled_loader),
                device=device,
                batch_names=["training", "unlabeled"],
                requires_grad={"training": ["input"],
                               "unlabeled": ["input"]},
                vectors_to_keep=["metaData", "softmaxGenotype"]
            )
            train_loaders += [train_loader_subset, unlabeled_loader]
        else:
            # Fail early with a clear message instead of hitting an unbound
            # `data_provider` further down.
            raise ValueError("Unsupported training mode: {!r}".format(mode))

        def to_do(model_trainer, batch_idx, todo_arguments):
            # Defined once per epoch rather than once per trainer per batch.
            # Every tensor is cloned before use, presumably so that trainers
            # running concurrently do not share batch tensors -- TODO confirm.
            epsilon = model_trainer.args.epsilon_label_smoothing
            if mode == "supervised_direct":
                input_s, target_s, metadata = todo_arguments
                input_s_local = input_s.clone()
                target_s_local = target_s.clone()
                metadata_local = metadata.clone()
                target_s_smoothed = recode_for_label_smoothing(target_s_local, epsilon)
                model_trainer.net.train()
                model_trainer.train_one_batch(model_trainer.training_performance_estimators,
                                              batch_idx, input_s_local, target_s_smoothed, metadata_local)
            elif mode == "supervised_mixup":
                input_s_1, target_s_1, metadata_1, input_s_2, target_s_2, metadata_2 = todo_arguments
                input_s_1_local = input_s_1.clone()
                target_s_1_local = target_s_1.clone()
                target_s_1_smoothed = recode_for_label_smoothing(target_s_1_local, epsilon)
                input_s_2_local = input_s_2.clone()
                target_s_2_local = target_s_2.clone()
                target_s_2_smoothed = recode_for_label_smoothing(target_s_2_local, epsilon)
                metadata_1_local = metadata_1.clone()
                metadata_2_local = metadata_2.clone()
                model_trainer.net.train()
                model_trainer.train_one_batch(model_trainer.training_performance_estimators,
                                              batch_idx, input_s_1_local, input_s_2_local,
                                              target_s_1_smoothed, target_s_2_smoothed,
                                              metadata_1_local, metadata_2_local)
            elif mode == "semisupervised":
                input_s, target_s, metadata, input_u = todo_arguments
                input_s_local = input_s.clone()
                target_s_local = target_s.clone()
                target_s_smoothed = recode_for_label_smoothing(target_s_local, epsilon)
                input_u_local = input_u.clone()
                metadata_local = metadata.clone()
                model_trainer.net.train()
                model_trainer.train_one_batch(model_trainer.training_performance_estimators, batch_idx,
                                              input_s_local, target_s_smoothed, metadata_local,
                                              input_u_local)

        try:
            for batch_idx, (_, data_dict) in enumerate(data_provider):
                if mode == "supervised_direct":
                    training = data_dict["training"]
                    todo_arguments = [training["input"],
                                      training["softmaxGenotype"],
                                      training["metaData"]]
                elif mode == "supervised_mixup":
                    training_1 = data_dict["training_1"]
                    training_2 = data_dict["training_2"]
                    todo_arguments = [training_1["input"],
                                      training_1["softmaxGenotype"],
                                      training_1["metaData"],
                                      training_2["input"],
                                      training_2["softmaxGenotype"],
                                      training_2["metaData"]]
                else:  # "semisupervised"; other modes were rejected above.
                    training = data_dict["training"]
                    todo_arguments = [training["input"],
                                      training["softmaxGenotype"],
                                      training["metaData"],
                                      data_dict["unlabeled"]["input"]]
                batch_size = len(todo_arguments[0])

                # One task per trainer, all working on the same batch.
                futures = [thread_executor.submit(to_do, model_trainer, batch_idx, todo_arguments)
                           for model_trainer in trainers]

                concurrent.futures.wait(futures)
                # Report any exceptions encountered in to_do:
                raise_to_do_exceptions(futures)
                if (batch_idx + 1) * batch_size > args.num_training:
                    break
        finally:
            data_provider.close()
            # Drop the list's references to the loaders.  A
            # `for x in train_loaders: del x` loop would only unbind the loop
            # variable and release nothing.
            train_loaders.clear()
    def train_semisup_aae(self, epoch,
                          performance_estimators=None):
        """Train for one epoch on paired labeled and unlabeled batches.

        Each batch is passed to ``self.train_one_batch``.  When
        ``self.args.latent_code_output`` is set, per-column histograms of
        ``self.latent_codes`` and ``self.gaussian_codes`` are saved to
        ``"<latent_code_output>_<epoch>.pt"`` after the epoch.

        Args:
            epoch: Epoch number, used in the progress message and in the
                histogram file name.
            performance_estimators: Optional estimators to fill.  When
                ``None``, they are created with
                ``self.create_training_performance_estimators()``.

        Returns:
            The performance estimators used during this epoch.
        """
        if performance_estimators is None:
            performance_estimators = self.create_training_performance_estimators()

            print('\nTraining, epoch: %d' % epoch)
        for estimator in performance_estimators:
            estimator.init_performance_metrics()

        self.net.train()

        labeled_loader = self.problem.train_loader_subset_range(0, self.args.num_training)
        unlabeled_loader = self.problem.unlabeled_loader()

        def smooth_labels(labels):
            # self.epsilon is read at call time, each time labels are recoded.
            return recode_for_label_smoothing(labels, self.epsilon)

        provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(labeled_loader, unlabeled_loader),
            device=self.device,
            batch_names=["training", "unlabeled"],
            requires_grad={"training": ["input"], "unlabeled": ["input"]},
            recode_functions={
                "softmaxGenotype": smooth_labels,
                "input": self.normalize_inputs
            },
            vectors_to_keep=["metaData"]
        )

        self.reset_before_train_epoch()
        try:
            for batch_idx, (_, data_dict) in enumerate(provider):
                labeled_input = data_dict["training"]["input"]
                labeled_target = data_dict["training"]["softmaxGenotype"]
                unlabeled_input = data_dict["unlabeled"]["input"]
                labeled_meta = data_dict["training"]["metaData"]
                self.train_one_batch(performance_estimators, batch_idx,
                                     labeled_input, labeled_target,
                                     labeled_meta, unlabeled_input)

                # Stop once the configured number of training examples has
                # been exceeded.
                examples_seen = (batch_idx + 1) * self.mini_batch_size
                if examples_seen > self.max_training_examples:
                    break
        finally:
            # Always release the provider, even if a batch fails.
            provider.close()

        if self.args.latent_code_output is None:
            return performance_estimators

        cpu = torch.device("cpu")
        bin_count = self.args.latent_code_bins

        def column_histograms(codes):
            # Each dimension in latent code should be Gaussian distributed, so
            # take a histogram of each column; plot them later to see how they
            # compare to Gaussian.
            stacked = torch.stack(codes).to(cpu)
            return [torch.histc(stacked[:, column], bins=bin_count).data.numpy()
                    for column in range(stacked.size()[1])]

        torch.save({
            "latent": column_histograms(self.latent_codes),
            "gaussian": column_histograms(self.gaussian_codes),
        }, "{}_{}.pt".format(self.args.latent_code_output, epoch))
        return performance_estimators
Пример #17
0
                sum_sdm_n += torch.sum(torch.pow(x - mean.expand_as(x), 2),
                                       dim=0)

    datasets = [
        problem.train_set(),
        problem.validation_set(),
        problem.unlabeled_set(),
        problem.test_set()
    ]
    # First, get the sum of all of the example vectors
    for index, dataset in enumerate(datasets):
        train_loader_subset = problem.loader_for_dataset(dataset, shuffle=True)
        print("Summing dataset {}/{}".format(index + 1, len(datasets)))
        data_provider = MultiThreadedCpuGpuDataProvider(
            iterator=zip(train_loader_subset),
            device=torch.device("cpu"),
            batch_names=["dataset"],
            recode_functions={args.vector_name: add_to_sum})
        batch_index = 0
        for example in data_provider:
            batch_index += 1
            if batch_index * args.mini_batch_size > args.n:
                break
        data_provider.close()
    # Calculate the mean
    print("Calculating the mean")
    mean = sum_n / n
    # Calculate the sum of squared deviations from means (sum_sdm)
    for index, dataset in enumerate(datasets):
        train_loader_subset = problem.loader_for_dataset(dataset, shuffle=True)
        print("Calculating sum of squared deviations for dataset {}/{}".format(