Example #1
 model_name = f'models/CUT{n_epoch}.pt'
 for epoch in range(start_epoch, n_epoch + 1):
     start_time = time.time()
     train_loss, train_loss_G, train_loss_D = train(
         G, H, D, optimizer_G, optimizer_H, optimizer_D, criterion_GAN,
         criterion_NCE, layers_nce, train_loader, device, epoch,
         args.log_interval)
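     # CUT-style setup (inferred from the names above): G is the generator, H the
     # projection head used by the patchwise NCE loss on layers_nce, D the discriminator.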
     end_time = time.time()
     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
     # print statistics and update training progress
     print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
     im_fake_B = visualize(G)
     if args.tensorboard:
         writer.add_scalar('Loss/train', train_loss, epoch)
         writer.add_scalars('Loss/G_D', {
             'G': train_loss_G,
             'D': train_loss_D
         }, epoch)
         writer.add_image('Image/Fake Dog', im_fake_B, epoch)
     # log results to model dictionary
     model_dict['train_loss']['G'].append(train_loss_G)
     model_dict['train_loss']['D'].append(train_loss_D)
     model_dict['train_loss']['total'].append(train_loss)
     model_dict['metrics']['last']['loss'] = train_loss
     model_dict['metrics']['last']['epoch'] = epoch
     if epoch == 1 or train_loss < model_dict['metrics']['best']['loss']:
         model_dict['model_state_dict'] = G.state_dict()
         model_dict['optimizer_state_dict'] = optimizer_G.state_dict()
         model_dict['metrics']['best']['epoch'] = epoch
         model_dict['metrics']['best']['loss'] = train_loss
     if args.save:
         torch.save(model_dict, model_name)
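
 # The epoch_time helper used above is not defined in this snippet; a minimal
 # sketch (an assumption, matching the (mins, secs) unpacking at the call site):
 def epoch_time(start_time, end_time):
     """Split an elapsed wall-clock interval into whole minutes and seconds."""
     elapsed = end_time - start_time
     elapsed_mins = int(elapsed / 60)
     elapsed_secs = int(elapsed - elapsed_mins * 60)
     return elapsed_mins, elapsed_secs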
Example #2
# Imports assumed by this snippet (GeneratorNet, DiscriminatorNet, device and
# configure_data are defined elsewhere in the project):
import numpy as np
import torch
import torch.nn as nn
import torch.optim as opt
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import save_image


def main():
    writer = SummaryWriter()
    data_loader = configure_data()
    g = GeneratorNet().to(device)
    d = DiscriminatorNet().to(device)
    loss_func = nn.BCELoss().to(device)

    optimizer_g = opt.Adam(g.parameters(),
                           lr=args.g_lr,
                           betas=(args.b1, args.b2))
    optimizer_d = opt.Adam(d.parameters(),
                           lr=args.d_lr,
                           betas=(args.b1, args.b2))

    for e in range(args.epoch):
        total_d_loss = torch.tensor(0.0, device=device)
        total_r_loss = torch.tensor(0.0, device=device)
        total_f_loss = torch.tensor(0.0, device=device)
        total_g_loss = torch.tensor(0.0, device=device)
        for i, (imgs, _) in enumerate(data_loader):
            fake = torch.zeros(imgs.shape[0], 1).to(device)
            real = torch.ones(imgs.shape[0], 1).to(device)
            noise_a = torch.randn(imgs.shape[0], args.noise_dim).to(device)
            noise_b = torch.randn(imgs.shape[0], args.noise_dim).to(device)
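            # Two noise batches: generations for the D update are produced under
            # no_grad (no generator gradients are needed there), while the G update
            # further below runs a fresh forward pass that keeps the graph.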
            with torch.no_grad():
                gen_pictures_a = g(noise_a)
            gen_pictures_b = g(noise_b)
            real_pictures = imgs.to(device)

            # Train the discriminator (D)
            gen_scores = d(gen_pictures_a)
            real_scores = d(real_pictures)
            optimizer_d.zero_grad()
            r_loss = loss_func(real_scores, real)
            f_loss = loss_func(gen_scores, fake)
            d_loss = r_loss + f_loss
            # detach before accumulating so the running totals do not keep the
            # autograd graph of every batch alive
            total_d_loss += d_loss.detach()
            total_r_loss += r_loss.detach()
            total_f_loss += f_loss.detach()
            d_loss.backward()
            optimizer_d.step()

            # Train the generator (G)
            optimizer_g.zero_grad()
            # g_loss = -loss_func(d(gen_pictures.detach()), fake)
            g_loss = loss_func(d(gen_pictures_b), real)
            total_g_loss += g_loss.detach()
            g_loss.backward()
            optimizer_g.step()

            print(
                f"[Epoch:{e+1}] [Batch:{i+1}] [D loss:{d_loss}] [G loss:{g_loss}]"
            )

            batches_done = e * len(data_loader) + i
            if batches_done % args.save_interval == 0:
                # sample 25 generated images; bound the indices by the actual
                # batch size so the last (possibly smaller) batch is safe
                idx = np.random.randint(0, gen_pictures_a.shape[0], size=25)
                save_image(gen_pictures_a[idx],
                           f"images/{batches_done}.png",
                           nrow=5,
                           normalize=True)

        writer.add_scalars(
            'loss', {
                'd_loss_expectation': total_d_loss / len(data_loader),
                'real_loss_expectation': total_r_loss / len(data_loader),
                'fake_loss_expectation': total_f_loss / len(data_loader),
                'g_loss_expectation': total_g_loss / len(data_loader)
            }, e)
        if (e + 1) % 50 == 0:
            torch.save(g.state_dict(), f"models/g{e+1}.pth")
            torch.save(d.state_dict(), f"models/d{e+1}.pth")
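
# The args namespace read above is assumed to come from argparse; a minimal
# sketch covering only the fields this snippet uses (field names are taken from
# the code, default values are illustrative assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--g_lr', type=float, default=2e-4)
parser.add_argument('--d_lr', type=float, default=2e-4)
parser.add_argument('--b1', type=float, default=0.5)
parser.add_argument('--b2', type=float, default=0.999)
parser.add_argument('--epoch', type=int, default=200)
parser.add_argument('--noise_dim', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--save_interval', type=int, default=400)
args = parser.parse_args()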
Example #3
import os
import sys
import time
from collections import defaultdict

import numpy as np
import tqdm
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoModelForSequenceClassification

# build_experiment_folder, save_statistics and save_to_json are project-local
# helpers imported from elsewhere in the repository.


class ExperimentBuilder(object):
    def __init__(self, args, data, model, device):
        """
        Initializes an experiment builder using a named tuple (args), a data provider (data), a meta learning system
        (model) and a device (e.g. gpu/cpu)
        :param args: A namedtuple containing all experiment hyperparameters
        :param data: A data provider of instance MetaLearningSystemDataLoader
        :param model: A meta learning system instance
        :param device: Device/s to use for the experiment
        """
        self.args, self.device = args, device

        self.model = model
        (
            self.saved_models_filepath,
            self.logs_filepath,
            self.samples_filepath,
        ) = build_experiment_folder(experiment_name=self.args.experiment_name)

        self.per_task_performance = defaultdict(lambda: 0)
        self.total_losses = dict()
        self.state = dict()
        self.state["best_val_loss"] = 10**6
        self.state["best_val_accuracy"] = 0
        self.state["best_val_iter"] = 0
        self.state["current_iter"] = 0
        self.start_epoch = 0
        self.num_epoch_no_improvements = 0
        self.patience = args.patience
        self.create_summary_csv = False

        self.writer = SummaryWriter("runs/{}".format(
            self.args.experiment_name))

        if self.args.continue_from_epoch == "from_scratch":
            self.create_summary_csv = True

        elif self.args.continue_from_epoch == "latest":
            checkpoint = os.path.join(self.saved_models_filepath,
                                      "train_model_latest")
            print("attempting to find existing checkpoint", )
            if os.path.exists(checkpoint):
                self.state = self.model.load_model(
                    model_save_dir=self.saved_models_filepath,
                    model_name="train_model",
                    model_idx="latest",
                )
                self.start_epoch = int(self.state["current_iter"] /
                                       self.args.total_iter_per_epoch)

            else:
                self.args.continue_from_epoch = "from_scratch"
                self.create_summary_csv = True
        elif int(self.args.continue_from_epoch) >= 0:
            self.state = self.model.load_model(
                model_save_dir=self.saved_models_filepath,
                model_name="train_model",
                model_idx=self.args.continue_from_epoch,
            )
            self.start_epoch = int(self.state["current_iter"] /
                                   self.args.total_iter_per_epoch)

        self.data = data(args=args, current_iter=self.state["current_iter"])

        self.idx_to_class_name = self.data.dataset.load_from_json(
            self.data.dataset.index_to_label_name_dict_file)

        print("train_seed {}, val_seed: {}, at start time".format(
            self.data.dataset.seed["train"], self.data.dataset.seed["val"]))
        self.total_epochs_before_pause = self.args.total_epochs_before_pause
        self.state["best_epoch"] = int(self.state["best_val_iter"] /
                                       self.args.total_iter_per_epoch)
        self.epoch = int(self.state["current_iter"] /
                         self.args.total_iter_per_epoch)

        self.start_time = time.time()
        self.epochs_done_in_this_run = 0
        print(
            self.state["current_iter"],
            int(self.args.total_iter_per_epoch * self.args.total_epochs),
        )

        if self.epoch == 0:
            for param_name, param in self.model.named_parameters():
                self.writer.add_histogram(param_name, param, 0)

            self.writer.flush()

    def build_summary_dict(self, total_losses, phase, summary_losses=None):
        """
        Builds/Updates a summary dict directly from the metric dict of the current iteration.
        :param total_losses: Current dict with total losses (not aggregations) from experiment
        :param phase: Current training phase
        :param summary_losses: Current summarised (aggregated/summarised) losses stats means, stdv etc.
        :return: A new summary dict with the updated summary statistics information.
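        Example (hypothetical values): build_summary_dict({'loss': [0.4, 0.6]}, phase='train')
        returns {'train_loss_mean': 0.5, 'train_loss_std': 0.1}.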
        """
        if summary_losses is None:
            summary_losses = dict()

        for key in total_losses:
            summary_losses["{}_{}_mean".format(phase, key)] = np.mean(total_losses[key])
            summary_losses["{}_{}_std".format(phase, key)] = np.std(total_losses[key])

        return summary_losses

    def build_loss_summary_string(self, summary_losses):
        """
        Builds a progress bar summary string given current summary losses dictionary
        :param summary_losses: Current summary statistics
        :return: A summary string ready to be shown to humans.
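        Example: {'train_loss_mean': 0.5, 'lr': 0.001} yields "train_loss_mean: 0.5000, "
        (keys without "loss" or "accuracy" in their name are skipped).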
        """
        output_update = ""
        for key, value in summary_losses.items():
            if "loss" in key or "accuracy" in key:
                value = float(value)
                output_update += "{}: {:.4f}, ".format(key, value)

        return output_update

    def merge_two_dicts(self, first_dict, second_dict):
        """Given two dicts, merge them into a new dict as a shallow copy."""
        z = first_dict.copy()
        z.update(second_dict)
        return z

    def write_task_lang_log(self, log):
        """
        Writes the log from a train iteration in tidy format to the task/lang log file
        :param log: list containing [task name, language, iteration, support loss, support accuracy, query loss, query accuracy]
        :return:
        """
        for line in log:
            save_statistics(self.logs_filepath,
                            line,
                            filename="task_lang_log.csv",
                            create=False)

    def train_iteration(
        self,
        train_sample,
        sample_idx,
        epoch_idx,
        total_losses,
        current_iter,
        pbar_train,
    ):
        """
        Runs a training iteration, updates the progress bar and returns the total and current epoch train losses.
        :param train_sample: A sample from the data provider
        :param sample_idx: The index of the incoming sample, in relation to the current training run.
        :param epoch_idx: The epoch index.
        :param total_losses: The current total losses dictionary to be updated.
        :param current_iter: The current training iteration in relation to the whole experiment.
        :param pbar_train: The progress bar of the training.
        :return: Updates total_losses, train_losses, current_iter
        """
        (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
            seed,
        ) = train_sample

        # Get teacher names and languages
        teacher_names, langs = zip(*[t.split("_") for t in selected_classes])

        data_batch = (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
        )

        losses, task_lang_log = self.model.run_train_iter(
            data_batch=data_batch, epoch=epoch_idx)
        for log, lang in zip(task_lang_log, langs):
            log.insert(1, lang)

        self.write_task_lang_log(task_lang_log)

        for key, value in losses.items():
            if key not in total_losses:
                total_losses[key] = [float(value)]
            else:
                total_losses[key].append(float(value))

        train_losses = self.build_summary_dict(total_losses=total_losses,
                                               phase="train")
        train_output_update = self.build_loss_summary_string(losses)

        pbar_train.update(1)
        pbar_train.set_description("training phase {} -> {}".format(
            self.epoch, train_output_update))

        current_iter += 1

        return train_losses, total_losses, current_iter

    def full_task_set_evaluation(self, epoch, set_name="val", **kwargs):
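        """
        Fine-tunes and evaluates the model on every task in the given set, averaging
        accuracy over several evaluation seeds; for the test set the best saved model
        is loaded first. Tracks the best per-task performance seen so far.
        :param epoch: The current epoch, forwarded to finetune_epoch.
        :param set_name: The task set to evaluate on ("val" or "test").
        :return: A dict with mean/std of accuracy and loss across tasks and seeds.
        """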

        if set_name == "test":
            print("Loading best model for evaluation..")
            self.model.load_model(
                model_save_dir=self.saved_models_filepath,
                model_name="train_model",
                model_idx="best",
            )

        set_meta_loss_back = False
        if self.model.meta_loss.lower() == "kl" and self.args.val_using_cross_entropy:
            # Use cross entropy on gold labels as no teacher encoding is available
            self.model.meta_loss = "ce"
            set_meta_loss_back = True
        # list the tasks in the requested set
        val_tasks = list(self.data.dataset.task_set_sizes[set_name].keys())
        # generate seeds
        seeds = [42 + i for i in range(self.args.num_evaluation_seeds)]

        per_val_set_performance = {k: [] for k in val_tasks}
        # perform finetuning and evaluation
        result = {}
        losses = []
        accuracies = []
        saved_already = False
        for task_name in val_tasks:
            for seed in seeds:
                print("Evaluating {} with seed {}...".format(task_name, seed))
                train_dataloader, dev_dataloader = self.data.get_finetune_dataloaders(
                    task_name, 0, seed)

                _, best_loss, curr_loss, accuracy = self.model.finetune_epoch(
                    None,
                    self.model.classifier.config,
                    train_dataloader,
                    dev_dataloader,
                    task_name=task_name,
                    epoch=epoch,
                    eval_every=1,
                    model_save_dir=self.saved_models_filepath,
                    best_loss=0,
                )

                per_val_set_performance[task_name].append(accuracy)
                accuracies.append(accuracy)
                losses.append(curr_loss)
            # Store and compare performance per validation task
            avg_accuracy = np.mean(per_val_set_performance[task_name])
            if avg_accuracy > self.per_task_performance[task_name]:
                print("New best performance for task", task_name)
                self.per_task_performance[task_name] = avg_accuracy
                self.state["best_epoch_{}".format(task_name)] = int(
                    self.state["current_iter"] /
                    self.args.total_iter_per_epoch)

        result["{}_accuracy_mean".format(set_name)] = np.mean(accuracies)
        result["{}_loss_std".format(set_name)] = np.std(accuracies)
        result["{}_loss_mean".format(set_name)] = np.mean(losses)
        result["{}_loss_std".format(set_name)] = np.std(losses)

        if set_meta_loss_back:
            self.model.meta_loss = "kl"

        return result

    def evaluation_iteration(self, val_sample, total_losses, pbar_val, phase):
        """
        Runs a validation iteration, updates the progress bar and returns the total and current epoch val losses.
        :param val_sample: A sample from the data provider
        :param total_losses: The current total losses dictionary to be updated.
        :param pbar_val: The progress bar of the val stage.
        :param phase: The phase name ("val" or "test") used to prefix summary keys.
        :return: The updated val_losses, total_losses
        """
        (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
            seed,
        ) = val_sample

        # Convert selected_classes to their pretrained directories
        if self.args.sets_are_pre_split:
            teacher_names = [t.split("_")[0] for t in selected_classes]
        else:
            teacher_names = [
                self.idx_to_class_name[selected_class].split("_")[0]
                for selected_class in selected_classes
            ]
        data_batch = (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            teacher_names,
        )

        losses = self.model.run_validation_iter(data_batch=data_batch)
        for key, value in losses.items():
            if key not in total_losses:
                total_losses[key] = [float(value)]
            else:
                total_losses[key].append(float(value))

        val_losses = self.build_summary_dict(total_losses=total_losses,
                                             phase=phase)
        val_output_update = self.build_loss_summary_string(losses)

        pbar_val.update(1)
        pbar_val.set_description("val_phase {} -> {}".format(
            self.epoch, val_output_update))

        return val_losses, total_losses

    def test_evaluation_iteration(self, val_sample, pbar_test):
        """
        Runs a test iteration, updates the progress bar and returns the current losses.
        :param val_sample: A sample from the data provider
        :param pbar_test: The progress bar of the test stage.
        :return: The losses dict of the current sample.
        """
        (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
            seed,
        ) = val_sample
        # Convert selected_classes to their pretrained directories
        if self.args.sets_are_pre_split:
            teacher_names = [t.split("_")[0] for t in selected_classes]
        else:
            teacher_names = [
                self.idx_to_class_name[selected_class].split("_")[0]
                for selected_class in selected_classes
            ]
        data_batch = (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            teacher_names,
        )

        losses = self.model.run_validation_iter(data_batch=data_batch)

        test_output_update = self.build_loss_summary_string(losses)

        pbar_test.update(1)
        pbar_test.set_description("test_phase {} -> {}".format(
            self.epoch, test_output_update))

        return losses

    def save_models(self, model, epoch, state, new_best):
        """
        Saves two separate instances of the current model. One to be kept for history and reloading later and another
        one marked as "latest" to be used by the system for the next epoch training. Useful when the training/val
        process is interrupted or stopped. Leads to fault tolerant training and validation systems that can continue
        from where they left off before.
        :param model: Current meta learning model of any instance within the few_shot_learning_system.py
        :param epoch: Current epoch
        :param state: Current model and experiment state dict.
        :param new_best: Only save a second copy of the model when it performs better than all previous models
        """
        print("New best: ", new_best)
        if new_best:
            model.save_model(
                model_save_dir=os.path.join(self.saved_models_filepath,
                                            "train_model_best"),
                state=state,
            )

        model.save_model(
            model_save_dir=os.path.join(self.saved_models_filepath,
                                        "train_model_latest"),
            state=state,
        )

        print("saved models to", self.saved_models_filepath)

    def pack_and_save_metrics(self, start_time, create_summary_csv,
                              train_losses, val_losses, state):
        """
        Given the current epoch's start_time, train losses, val losses and whether to create a new stats csv file,
        pack the stats and save them into a statistics csv file. Return a new start time for the new epoch.
        :param start_time: The start time of the current epoch
        :param create_summary_csv: A boolean variable indicating whether to create a new statistics file or
        append results to existing one
        :param train_losses: A dictionary with the current train losses
        :param val_losses: A dictionary with the current val losses
        :return: The current time (to be used as the next epoch's start time) and the updated state.
        """
        epoch_summary_losses = self.merge_two_dicts(first_dict=train_losses,
                                                    second_dict=val_losses)

        if "per_epoch_statistics" not in state:
            state["per_epoch_statistics"] = dict()

        for key, value in epoch_summary_losses.items():

            if key not in state["per_epoch_statistics"]:
                state["per_epoch_statistics"][key] = [value]
            else:
                state["per_epoch_statistics"][key].append(value)

        epoch_summary_string = self.build_loss_summary_string(
            epoch_summary_losses)
        epoch_summary_losses["epoch"] = self.epoch
        epoch_summary_losses["epoch_run_time"] = time.time() - start_time

        if create_summary_csv:
            self.summary_statistics_filepath = save_statistics(
                self.logs_filepath,
                list(epoch_summary_losses.keys()),
                create=True)
            self.create_summary_csv = False

        start_time = time.time()
        print("epoch {} -> {}".format(epoch_summary_losses["epoch"],
                                      epoch_summary_string))

        self.summary_statistics_filepath = save_statistics(
            self.logs_filepath, list(epoch_summary_losses.values()))
        return start_time, state

    def evaluate_test_set_using_the_best_models(self, top_n_models):
        per_epoch_statistics = self.state["per_epoch_statistics"]
        # rank checkpoints by mean validation loss, ascending (lowest loss first)
        val_loss = np.copy(per_epoch_statistics["val_loss_mean"])
        val_idx = np.arange(len(val_loss))
        sorted_idx = np.argsort(val_loss, axis=0).astype(dtype=np.int32)[:top_n_models]

        sorted_val_loss = val_loss[sorted_idx]
        val_idx = val_idx[sorted_idx]
        print(sorted_idx)
        print(sorted_val_loss)

        top_n_idx = val_idx[:top_n_models]
        per_model_per_batch_loss = [[] for _ in range(top_n_models)]
        # per_model_per_batch_targets = [[] for i in range(top_n_models)]
        for idx, model_idx in enumerate(top_n_idx):
            self.state = self.model.load_model(
                model_save_dir=self.saved_models_filepath,
                model_name="train_model",
                model_idx=model_idx + 1,
            )
            with tqdm.tqdm(total=int(self.args.num_evaluation_tasks /
                                     self.args.batch_size)) as pbar_test:
                for sample_idx, test_sample in enumerate(
                        self.data.get_test_batches(
                            total_batches=int(self.args.num_evaluation_tasks /
                                              self.args.batch_size),
                            augment_images=False,
                        )):
                    # print(test_sample[4])
                    # per_model_per_batch_targets[idx].extend(np.array(test_sample[3]))
                    # test_evaluation_iteration accepts only (val_sample, pbar_test)
                    # and returns the loss dict of the batch; collect the loss per
                    # model (assumes the dict exposes its objective under "loss")
                    losses = self.test_evaluation_iteration(
                        val_sample=test_sample,
                        pbar_test=pbar_test,
                    )
                    per_model_per_batch_loss[idx].append(float(losses["loss"]))

        per_batch_loss = np.mean(per_model_per_batch_loss, axis=0)
        loss = np.mean(per_batch_loss)
        loss_std = np.std(per_batch_loss)

        test_losses = {"test_loss_mean": loss, "test_loss_std": loss_std}

        _ = save_statistics(
            self.logs_filepath,
            list(test_losses.keys()),
            create=True,
            filename="test_summary.csv",
        )

        summary_statistics_filepath = save_statistics(
            self.logs_filepath,
            list(test_losses.values()),
            create=False,
            filename="test_summary.csv",
        )
        print(test_losses)
        print("saved test performance at", summary_statistics_filepath)

    def prep_finetuning(
        self,
        task_name,
        is_baseline,
        percentage_train,
        seed,
    ):
        """
        Takes the best performing model and fine-tunes it using all available data for a task
        :param task_name:
        :return:
        """

        # Get dataloader with all task data
        train_dataloader, dev_dataloader = self.data.get_finetune_dataloaders(
            task_name, percentage_train, seed)
        #############################
        # Load the model to finetune
        #############################
        if is_baseline:

            teacher_name = task_name.split("_")[0].replace("val/", "").replace("train/", "")
            model = AutoModelForSequenceClassification.from_pretrained(
                os.path.join(self.args.teacher_dir, teacher_name),
                output_hidden_states=False,
            )
            return train_dataloader, dev_dataloader, model
        else:
            per_epoch_statistics = self.state["per_epoch_statistics"]
            val_loss = np.copy(per_epoch_statistics["val_loss_mean"])
            # Load the model with the lowest validation loss
            model_idx = np.argsort(val_loss, axis=0).astype(dtype=np.int32)[0]

            best_val_loss = val_loss[model_idx]
            print("Loading model {} with validation loss {}".format(
                model_idx, best_val_loss))

            self.state = self.model.load_model(
                model_save_dir=self.saved_models_filepath,
                model_name="train_model",
                model_idx="best",  # model_idx + 1,
            )
            del self.state

            return train_dataloader, dev_dataloader, self.model.classifier

    def run_experiment(self):
        """
        Runs a full training experiment with evaluations of the model on the val set at every epoch. Furthermore,
        will return the test set evaluation results on the best performing validation model.
        """

        # pr = cProfile.Profile()
        # pr.enable()
        with tqdm.tqdm(
                initial=self.state["current_iter"],
                total=int(self.args.total_iter_per_epoch *
                          self.args.total_epochs),
        ) as pbar_train:

            while (self.state["current_iter"] <
                   (self.args.total_epochs * self.args.total_iter_per_epoch)
                   ) and not self.args.evaluate_on_test_set_only:

                for train_sample_idx, train_sample in enumerate(
                        self.data.get_train_batches(
                            total_batches=int(self.args.total_iter_per_epoch *
                                              self.args.total_epochs) -
                            self.state["current_iter"])):
                    (
                        train_losses,
                        total_losses,
                        self.state["current_iter"],
                    ) = self.train_iteration(
                        train_sample=train_sample,
                        total_losses=self.total_losses,
                        epoch_idx=(self.state["current_iter"] /
                                   self.args.total_iter_per_epoch),
                        pbar_train=pbar_train,
                        current_iter=self.state["current_iter"],
                        sample_idx=self.state["current_iter"],
                    )

                    if self.state["current_iter"] % self.args.total_iter_per_epoch == 0:
                        # pr.disable()
                        # pr.print_stats()
                        epoch = (self.state["current_iter"] //
                                 self.args.total_iter_per_epoch)
                        total_losses = dict()
                        val_losses = dict()
                        new_best = False

                        # evaluate on the whole available task set
                        if self.args.eval_using_full_task_set:
                            val_losses = self.full_task_set_evaluation(
                                epoch=epoch)
                        else:  # evaluate in few-shot fashion / on query set only
                            with tqdm.tqdm(total=int(self.args.num_evaluation_tasks /
                                                     self.args.batch_size)) as pbar_val:
                                for _, val_sample in enumerate(
                                        self.data.get_val_batches(
                                            total_batches=int(self.args.num_evaluation_tasks /
                                                              self.args.batch_size))):
                                    (
                                        val_losses,
                                        total_losses,
                                    ) = self.evaluation_iteration(
                                        val_sample=val_sample,
                                        total_losses=total_losses,
                                        pbar_val=pbar_val,
                                        phase="val",
                                    )
                        # Write train/val metrics to tensorboard
                        self.writer.add_scalars(
                            "loss",
                            {
                                "train": train_losses["train_loss_mean"],
                                "val": val_losses["val_loss_mean"],
                            },
                            epoch,
                        )

                        self.writer.add_scalars(
                            "Accuracy",
                            {
                                "train": train_losses["train_accuracy_mean"],
                                "val": val_losses["val_accuracy_mean"],
                            },
                            epoch,
                        )

                        # log weight distributions and gradients of slow weights
                        for param_name, param in self.model.named_parameters():
                            self.writer.add_histogram(param_name, param, epoch)

                        self.writer.flush()

                        if (val_losses["val_accuracy_mean"] >
                                self.state["best_val_accuracy"]):
                            self.num_epoch_no_improvements = 0
                            new_best = True
                            print(
                                "Best validation accuracy",
                                val_losses["val_accuracy_mean"],
                                "with loss",
                                val_losses["val_loss_mean"],
                            )

                            self.state["best_val_accuracy"] = (
                                val_losses["val_accuracy_mean"], )

                            self.state["best_val_iter"] = self.state[
                                "current_iter"]
                            self.state["best_epoch"] = int(
                                self.state["best_val_iter"] /
                                self.args.total_iter_per_epoch)

                        else:
                            self.num_epoch_no_improvements += 1
                        self.epoch += 1
                        self.state = self.merge_two_dicts(
                            first_dict=self.merge_two_dicts(
                                first_dict=self.state,
                                second_dict=train_losses),
                            second_dict=val_losses,
                        )

                        self.save_models(
                            model=self.model,
                            epoch=self.epoch,
                            state=self.state,
                            new_best=new_best,
                        )

                        self.start_time, self.state = self.pack_and_save_metrics(
                            start_time=self.start_time,
                            create_summary_csv=self.create_summary_csv,
                            train_losses=train_losses,
                            val_losses=val_losses,
                            state=self.state,
                        )

                        self.total_losses = dict()

                        self.epochs_done_in_this_run += 1

                        save_to_json(
                            filename=os.path.join(self.logs_filepath,
                                                  "summary_statistics.json"),
                            dict_to_store=self.state["per_epoch_statistics"],
                        )

                        if (self.epochs_done_in_this_run >=
                                self.total_epochs_before_pause):
                            print("Pause time, evaluating on test set...")
                            print(
                                self.full_task_set_evaluation(
                                    set_name="test", epoch=self.epoch))
                            print("train_seed {}, val_seed: {}, at pause time".
                                  format(
                                      self.data.dataset.seed["train"],
                                      self.data.dataset.seed["val"],
                                  ))

                            sys.exit()
                        if self.num_epoch_no_improvements > self.patience:
                            print(
                                "{} epochs no improvement, early stopping applied."
                                .format(self.num_epoch_no_improvements))
                            print(
                                self.full_task_set_evaluation(
                                    set_name="test", epoch=self.epoch))
                            print("train_seed {}, val_seed: {}, at pause time".
                                  format(
                                      self.data.dataset.seed["train"],
                                      self.data.dataset.seed["val"],
                                  ))

                            sys.exit()

            print(
                self.full_task_set_evaluation(epoch=self.epoch,
                                              set_name="test"))
Example #4
                test_avg_loss += avg1_loss.item() * batch_size
                test_max_loss += max1_loss.item() * batch_size
                test_concat_loss += concat_loss.item() * batch_size
                test_metric_loss += metric_loss.item() * batch_size
        test_acc = float(test_correct) / total
        test_loss = test_loss / total
        test_avg_loss = test_avg_loss / total
        test_max_loss = test_max_loss / total
        test_concat_loss = test_concat_loss / total
        test_metric_loss = 5.0 * test_metric_loss / total
        print(
            "epoch:{} - test loss: {:.3f} and test acc: {:.3f} total sample:{}"
            .format(epoch, test_loss, test_acc, total))

        write.add_scalars("lOSS", {
            'train': train_loss,
            "test": test_loss
        }, epoch)
        write.add_scalars("AVG_loss", {
            'train': train_avg_loss,
            "test": test_avg_loss
        }, epoch)
        write.add_scalars("MAX_loss", {
            'train': train_max_loss,
            "test": test_max_loss
        }, epoch)
        write.add_scalars("Cat_loss", {
            'train': train_concat_loss,
            "test": test_concat_loss
        }, epoch)
        write.add_scalars("Metric_loss", {
            'train': train_metric_loss,
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training '''

    # Use tensorboard to plot curves, e.g. perplexity, accuracy, learning rate
    if opt.use_tb:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(
            log_dir=os.path.join(opt.output_dir, 'tensorboard'))

    log_train_file = os.path.join(opt.output_dir, 'train.log')
    log_valid_file = os.path.join(opt.output_dir, 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.
          format(log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file,
                                                   'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, ppl, accu, start_time, lr, Num_parameters):
        print('  - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, lr: {lr:8.5f}, '\
              'elapse: {elapse:3.3f} min, ParameterNumber: {Num_parameters: 8.2f}'.format(
                  header=f"({header})", ppl=ppl,
                  accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr, Num_parameters=Num_parameters))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(model,
                                             training_data,
                                             optimizer,
                                             opt,
                                             device,
                                             smoothing=opt.label_smoothing)

        train_ppl = math.exp(min(train_loss, 100))

        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']

        # Calculate the number of parameters of the model
        Num_parameters = count_parameters(model)
        print_performances('Training', train_ppl, train_accu, start, lr,
                           Num_parameters)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device,
                                            opt)
        valid_ppl = math.exp(min(valid_loss, 100))

        # Calculate the number of parameters of the model
        Num_parameters = count_parameters(model)
        print_performances('Validation', valid_ppl, valid_accu, start, lr,
                           Num_parameters)

        valid_losses += [valid_loss]

        checkpoint = {
            'epoch': epoch_i,
            'settings': opt,
            'model': model.state_dict()
        }

        if opt.save_mode == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100 * valid_accu)
            torch.save(checkpoint, os.path.join(opt.output_dir, model_name))
        elif opt.save_mode == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt.output_dir,
                                                    model_name))
                print('    - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file,
                                                       'a') as log_vf:
            log_tf.write(
                '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i,
                    loss=train_loss,
                    ppl=train_ppl,
                    accu=100 * train_accu))
            log_vf.write(
                '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i,
                    loss=valid_loss,
                    ppl=valid_ppl,
                    accu=100 * valid_accu))

        if opt.use_tb:
            tb_writer.add_scalars('ppl', {
                'train': train_ppl,
                'val': valid_ppl
            }, epoch_i)
            tb_writer.add_scalars('accuracy', {
                'train': train_accu * 100,
                'val': valid_accu * 100
            }, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)
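
# count_parameters is not defined in this snippet; a minimal sketch (an
# assumption) that counts the trainable parameters of a model:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
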
class Dense_U_Net_lidar_Agent:
    def __init__(self, config=None, torchvision_init=True):
        '''
        Handles everything
        - training, validation, testing
        - checkpoint loading and saving
        - logging | tensorboard summaries

        Accordingly everything is specified here
        - model
        - loss
        - optimizer
        - lr scheduling

        Arguments:  
            torchvision_init: boolean
                - True:     load densenet state dict from torchvision
                - False:    load checkpoint; if no checkpoint just normal init
        '''

        self.logger = logging.getLogger('Agent')

        # model and config if lazy
        self.model = densenet121_u_lidar(pretrained=torchvision_init, 
            config=config)
        
        # in case config is empty it is created in model
        self.config = self.model.config

        # dataloader
        self.data_loader = WaymoDataset_Loader(self.config)

        # pixel-wise binary cross-entropy loss; moved to the chosen device below,
        # so no unconditional .cuda() call that would fail on CPU-only machines
        self.loss = torch.nn.BCEWithLogitsLoss(reduction='none')

        # optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), 
            lr=self.config.optimizer.learning_rate, 
            betas=(self.config.optimizer.beta1, self.config.optimizer.beta2), 
            eps=self.config.optimizer.eps, weight_decay=self.config.optimizer.weight_decay, 
            amsgrad=self.config.optimizer.amsgrad)

        # learning rate decay scheduler
        if self.config.optimizer.lr_scheduler.want:
            self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 
                step_size=self.config.optimizer.lr_scheduler.every_n_epochs, 
                gamma=self.config.optimizer.lr_scheduler.gamma)

        # initialize counters; updated in load_checkpoint
        self.current_epoch = 0
        self.current_train_iteration = 0
        self.current_val_iteration = 0
        self.best_val_iou = 0

        # if cuda is available export model to gpu
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.device = torch.device('cuda')
            torch.cuda.manual_seed_all(self.config.agent.seed)
            self.logger.info('Operation will be on *****GPU-CUDA***** ')
        else:
            self.device = torch.device('cpu')
            torch.manual_seed(self.config.agent.seed)
            self.logger.info('Operation will be on *****CPU***** ')
        self.model = self.model.to(self.device)
        self.loss = self.loss.to(self.device)

        if not torchvision_init:
            self.load_checkpoint()

        # Tensorboard Writers 
        Path(self.config.dir.current_run.summary).mkdir(exist_ok=True, parents=True)
        self.train_summary_writer = SummaryWriter(log_dir=self.config.dir.current_run.summary, comment='Dense_U_Net')
        self.val_summary_writer = SummaryWriter(log_dir=self.config.dir.current_run.summary, comment='Dense_U_Net')

    def save_checkpoint(self, filename='checkpoint.pth.tar', is_best=False):
        '''
        Saving the latest checkpoint of the training

        Arguments:
            filename: filename which will contain the state
            is_best: flag indicating whether this is the best model
        '''

        # aggregate important data; the dict keys are the strings stored under config.agent.checkpoint
        state = {
            self.config.agent.checkpoint.epoch: self.current_epoch,
            self.config.agent.checkpoint.train_iteration: self.current_train_iteration,
            self.config.agent.checkpoint.val_iteration: self.current_val_iteration,
            self.config.agent.checkpoint.best_val_iou: self.best_val_iou,
            self.config.agent.checkpoint.state_dict: self.model.state_dict(),
            self.config.agent.checkpoint.optimizer: self.optimizer.state_dict()
        }
        
        if is_best:
            filename = self.config.agent.best_checkpoint_name

        # create dir if not exists
        Path(self.config.dir.current_run.checkpoints).mkdir(exist_ok=True, parents=True)

        # Save the state
        torch.save(state, os.path.join(self.config.dir.current_run.checkpoints, filename))
    
    def load_checkpoint(self, filename=None):
        '''
        load checkpoint from file
        should contain the following keys:
            'epoch', 'iteration', 'best_val_iou', 'state_dict', 'optimizer'
            where state_dict is the model state dict
            and optimizer is the optimizer state dict
        
        Arguments:
            filename: file name with extension only; the directory is taken from config.dir.current_run.checkpoints
        '''

        # use best if not specified
        if filename is None:
            filename = self.config.agent.best_checkpoint_name

        # load according to key
        filepath = os.path.join(self.config.dir.current_run.checkpoints, filename)
        try:
            self.logger.info('Loading checkpoint {}'.format(filename))
            checkpoint = torch.load(filepath)

            self.current_epoch = checkpoint[self.config.agent.checkpoint.epoch]
            self.current_train_iteration = checkpoint[
                self.config.agent.checkpoint.train_iteration]
            self.current_val_iteration = checkpoint[
                self.config.agent.checkpoint.val_iteration]
            self.best_val_iou = checkpoint[
                self.config.agent.checkpoint.best_val_iou]
            self.model.load_state_dict(checkpoint[
                self.config.agent.checkpoint.state_dict])
            self.optimizer.load_state_dict(checkpoint[
                self.config.agent.checkpoint.optimizer])

            self.logger.info('Checkpoint loaded successfully from {} at (epoch {}) at (iteration {})\n'
                             .format(self.config.dir.current_run.checkpoints,
                                     checkpoint[self.config.agent.checkpoint.epoch],
                                     checkpoint[self.config.agent.checkpoint.train_iteration]))
        except OSError:
            warnings.warn('No checkpoint exists from {}. Skipping...'.format(filepath))
            self.logger.info('No checkpoint exists from {}. Skipping...'.format(filepath))
            self.logger.info('**First time to train**')

    def run(self):
        '''
        starts training or testing, as specified under config.loader.mode
        can handle keyboard interrupt
        '''

        print('starting ' + self.config.loader.mode + ' at ' + str(datetime.now()))
        try:
            if self.config.loader.mode == 'test':
                with torch.no_grad():
                    self.validate()
            else:
                self.train()

        except KeyboardInterrupt:
            self.logger.info('You have entered CTRL+C.. Wait to finalize')

    def train(self):
        '''
        training one epoch at a time
        validating after each epoch
        saving checkpoint after each epoch
        check if the val IoU is the best and store that checkpoint separately
        '''

        # add selected loss and optimizer to config  | not added in init as may be changed before training
        self.config.loss.func = str(self.loss)
        self.config.optimizer.func = str(self.optimizer)

        # make sure to remember the hyper params
        self.add_hparams_summary_writer()
        self.save_hparams_json()

        # Iterate epochs | train one epoch | validate | save checkpoint
        for epoch in range(self.current_epoch, self.config.agent.max_epoch):
            self.current_epoch = epoch
            self.train_one_epoch()

            with torch.no_grad():
                avg_val_iou_per_class = self.validate()

            val_iou = sum(avg_val_iou_per_class)/len(avg_val_iou_per_class)
            is_best = val_iou > self.best_val_iou
            if is_best:
                self.best_val_iou = val_iou
            self.save_checkpoint(is_best=is_best)

        self.train_summary_writer.close()
        self.val_summary_writer.close()

    def train_one_epoch(self):
        '''
        One epoch training function
        '''

        # Initialize progress visualization and get batch
        tqdm_batch = tqdm(self.data_loader.train_loader, total=self.data_loader.train_iterations,
                          desc='Epoch-{}-'.format(self.current_epoch))
        
        # Set the model to be in training mode
        self.model.train()

        # metric counters; one row per batch
        current_batch = 0
        number_of_batches = len(self.data_loader.train_loader)
        epoch_loss = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        epoch_iou = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_iou_nans = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_acc = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        
        for image, lidar, ht_map in tqdm_batch:
            
            # push to gpu if possible
            if self.cuda:
                image = image.cuda(non_blocking=self.config.loader.async_loading)
                lidar = lidar.cuda(non_blocking=self.config.loader.async_loading)
                ht_map = ht_map.cuda(non_blocking=self.config.loader.async_loading)

            # forward pass
            prediction = self.model(image, lidar)
            
            # pixel-wise loss
            current_loss = self.loss(prediction, ht_map)
            loss_per_class = torch.sum(current_loss.detach(), dim=(0,2,3))
            epoch_loss[current_batch, :] = loss_per_class

            # whole-image IoU per class; NaNs are excluded from the mean and counted separately
            iou_per_instance_per_class = utils.compute_IoU_whole_img_batch(prediction.detach(), ht_map.detach(), self.config.agent.iou_threshold)
            iou_per_class = torch.tensor(np.nanmean(iou_per_instance_per_class, axis=0))
            iou_per_class[torch.isnan(iou_per_class)] = 0
            epoch_iou[current_batch, :] = iou_per_class
            epoch_iou_nans[current_batch, :] = torch.sum(torch.isnan(iou_per_instance_per_class), axis=0)
            
            # compute class-wise accuracy of current batch
            acc_per_class = utils.compute_accuracy(ht_map.detach(), prediction.detach(), self.config.agent.iou_threshold)
            epoch_acc[current_batch, :] = acc_per_class

            # backprop; the loss uses reduction='none', so backward needs an explicit
            # gradient tensor: passing ones is equivalent to current_loss.sum().backward()
            self.optimizer.zero_grad()
            current_loss.backward(torch.ones_like(current_loss.detach(), device=self.device))
            self.optimizer.step()

            # logging for visualization during training: separate plots for loss, acc, iou | each-classwise + overall
            loss_dict = {
                'Vehicle': loss_per_class[0],
                'Pedestrian': loss_per_class[1],
                'Cyclist': loss_per_class[2],
                'Overall': torch.mean(loss_per_class)
            }
            self.train_summary_writer.add_scalars('Training/Loss', loss_dict, self.current_train_iteration)
            acc_dict = {
                'Vehicle': acc_per_class[0],
                'Pedestrian': acc_per_class[1],
                'Cyclist': acc_per_class[2],
                'Overall': torch.mean(acc_per_class)
            }
            self.train_summary_writer.add_scalars('Training/Accuracy', acc_dict, self.current_train_iteration)
            iou_dict = {
                'Vehicle': iou_per_class[0],
                'Pedestrian': iou_per_class[1],
                'Cyclist': iou_per_class[2],
                'Overall': torch.mean(iou_per_class)
            }
            self.train_summary_writer.add_scalars('Training/IoU', iou_dict, self.current_train_iteration)

            # counters
            self.current_train_iteration += 1
            current_batch += 1

        tqdm_batch.close()

        # learning rate decay update; after validate; after each epoch
        if self.config.optimizer.lr_scheduler.want:
            self.lr_scheduler.step()

        # log
        avg_epoch_loss = torch.mean(epoch_loss, axis=0).tolist()
        avg_epoch_iou = torch.mean(epoch_iou, axis=0).tolist()
        cum_epoch_nans = torch.sum(epoch_iou_nans, axis=0).tolist()
        avg_epoch_acc = torch.mean(epoch_acc, axis=0).tolist()
        self.logger.info('Training at Epoch-' + str(self.current_epoch) + ' | ' + 'Average Loss: ' + str(
             avg_epoch_loss) + ' | ' + 'Average IoU: ' + str(avg_epoch_iou) + ' | ' + 'Number of NaNs: ' + str(
                 cum_epoch_nans) + ' | ' + 'Average Accuracy: ' + str(avg_epoch_acc))

    def validate(self):
        '''
        One epoch validation
        
        return: 
            average IoU per class
        '''

        # Initialize progress visualization and get batch
        # !self.data_loader.valid_loader works for both valid and test 
        tqdm_batch = tqdm(self.data_loader.valid_loader, total=self.data_loader.valid_iterations,
                          desc='Validation at -{}-'.format(self.current_epoch))

        # set the model to evaluation mode
        self.model.eval()

        # metric counters; one row per batch
        current_batch = 0
        number_of_batches = len(self.data_loader.valid_loader)
        epoch_loss = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        epoch_iou = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_iou_nans = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_acc = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        
        for image, lidar, ht_map in tqdm_batch:
            
            # push to gpu if possible
            if self.cuda:
                image = image.cuda(non_blocking=self.config.loader.async_loading)
                lidar = lidar.cuda(non_blocking=self.config.loader.async_loading)
                ht_map = ht_map.cuda(non_blocking=self.config.loader.async_loading)

            # forward pass
            prediction = self.model(image, lidar)
            
            # pixel-wise loss
            current_loss = self.loss(prediction, ht_map)
            loss_per_class = torch.sum(current_loss.detach(), dim=(0,2,3))
            epoch_loss[current_batch, :] = loss_per_class
            
            # whole-image IoU per class; NaNs are excluded from the mean and counted separately
            iou_per_instance_per_class = utils.compute_IoU_whole_img_batch(prediction.detach(), ht_map.detach(), self.config.agent.iou_threshold)
            iou_per_class = torch.tensor(np.nanmean(iou_per_instance_per_class, axis=0))
            iou_per_class[torch.isnan(iou_per_class)] = 0
            epoch_iou[current_batch, :] = iou_per_class
            epoch_iou_nans[current_batch, :] = torch.sum(torch.isnan(iou_per_instance_per_class), axis=0)

            # compute class-wise accuracy of current batch
            acc_per_class = utils.compute_accuracy(ht_map.detach(), prediction.detach(), self.config.agent.iou_threshold)
            epoch_acc[current_batch, :] = acc_per_class

            # logging for visualization during training: separate plots for loss, acc, iou | each-classwise + overall
            loss_dict = {
                'Vehicle': loss_per_class[0],
                'Pedestrian': loss_per_class[1],
                'Cyclist': loss_per_class[2],
                'Overall': torch.mean(loss_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/Loss', loss_dict, self.current_val_iteration)
            acc_dict = {
                'Vehicle': acc_per_class[0],
                'Pedestrian': acc_per_class[1],
                'Cyclist': acc_per_class[2],
                'Overall': torch.mean(acc_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/Accuracy', acc_dict, self.current_val_iteration)
            iou_dict = {
                'Vehicle': iou_per_class[0],
                'Pedestrian': iou_per_class[1],
                'Cyclist': iou_per_class[2],
                'Overall': torch.mean(iou_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/IoU', iou_dict, self.current_val_iteration)

            # counters
            self.current_val_iteration += 1
            current_batch += 1

        # log
        avg_epoch_loss = torch.mean(epoch_loss, axis=0).tolist()
        avg_epoch_iou = torch.mean(epoch_iou, axis=0).tolist()
        cum_epoch_nans = torch.sum(epoch_iou_nans, axis=0).tolist()
        avg_epoch_acc = torch.mean(epoch_acc, axis=0).tolist()
        self.logger.info(f'Validation at Epoch-{self.current_epoch} | Average Loss: {avg_epoch_loss} | '
                         f'Average IoU: {avg_epoch_iou} | Number of NaNs: {cum_epoch_nans} | '
                         f'Average Accuracy: {avg_epoch_acc}')

        tqdm_batch.close()
        
        return avg_epoch_iou

    def add_hparams_summary_writer(self):
        '''
        Add hyperparameters to the TensorBoard summary writers using .add_hparams.
        They can be accessed under the HPARAMS tab in TensorBoard.
        '''

        hyper_params = {
            'loss_func': self.config.loss.func,
            'loss_alpha': torch.tensor(self.config.loss.alpha),                                                         
            'loss_gamma': torch.tensor(self.config.loss.gamma),    
            'loss_skip_v_every_n_its': self.config.loss.skip_v_every_n_its,
            'loss_skip_p_every_n_its': self.config.loss.skip_p_every_n_its,
            'loss_skip_b_every_n_its': self.config.loss.skip_b_every_n_its,
            'optimizer': self.config.optimizer.func,
            'learning_rate': self.config.optimizer.learning_rate,
            'beta1': self.config.optimizer.beta1,
            'beta2': self.config.optimizer.beta2,
            'eps': self.config.optimizer.eps,
            'amsgrad': self.config.optimizer.amsgrad,
            'weight_decay': self.config.optimizer.weight_decay,
            'lr_scheduler': self.config.optimizer.lr_scheduler.want,
            'lr_scheduler_every_n_epochs': self.config.optimizer.lr_scheduler.every_n_epochs,
            'lr_scheduler_gamma': self.config.optimizer.lr_scheduler.gamma,
        }
       
        self.train_summary_writer.add_hparams(hyper_params, {})
        self.val_summary_writer.add_hparams(hyper_params, {})

    def save_hparams_json(self):
        '''
        Uses config information to generate a hyperparameter dict and saves it as a json file
        into the current_run directory
        '''

        hparams = {
            'loss': self.config.loss.__dict__,
            'optimizer': self.config.optimizer.__dict__
        }

        utils.save_json_file(os.path.join(self.config.dir.current_run.summary, 'hyperparams.json'), 
                                hparams , indent=4)
    
    def finalize(self):
        '''
        Close all Writers and print time
        '''

        self.logger.info('Please wait while finalizing the operation.. Thank you')
        self.train_summary_writer.close()
        self.val_summary_writer.close()
        print('ending ' + self.config.loader.mode + ' at ' + str(datetime.now()))
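
The per-class IoU helper used in validate above is not shown. Below is a minimal sketch of what utils.compute_IoU_whole_img_batch might look like, assuming heatmap inputs of shape (B, C, H, W) and the convention that a class absent from both prediction and target yields NaN, so validate can exclude it from the mean and count it separately. This is an illustration, not the project's actual implementation.

import torch

def compute_IoU_whole_img_batch(pred, target, threshold):
    # Hypothetical stand-in for the real utils function.
    pred_bin = pred > threshold        # (B, C, H, W) boolean masks
    target_bin = target > threshold
    intersection = (pred_bin & target_bin).sum(dim=(2, 3)).float()
    union = (pred_bin | target_bin).sum(dim=(2, 3)).float()
    # 0/0 produces NaN exactly where a class is empty in both maps.
    return intersection / union        # (B, C); NaN marks absent classes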
示例#7
0
        # set new states to current states for determining next actions
        states = next_states
        # Update episode score for each agent
        agent_scores += reward

        if iteration_step % iteration_interval == 0:

            buildings_reward_dict = {}
            for building_idx, building in enumerate(reward, start=1):
                buildings_reward_dict["Building {}".format(building_idx)] = building
            # Building reward
            writer.add_scalars("Reward/Buildings", buildings_reward_dict,
                               iteration_step)

            agent_scores_dict = {}
            for agent_idx, agent_score in enumerate(agent_scores, start=1):
                agent_scores_dict["Agent {}".format(agent_idx)] = agent_score
            # Agent scores
            #writer.add_scalars("Scores/Agents", agent_scores_dict, iteration_step)

            # Plot losses for critic and actor
            if agent.critic_loss is not None:
                writer.add_scalar("Losses/Critic Loss", agent.critic_loss,
                                  iteration_step)
            if agent.actor_loss is not None:
                writer.add_scalar("Losses/Actor Loss", agent.actor_loss,
示例#8
0
File: train.py  Project: nhendy/re-id
def train(data_root, epochs, log_dir):
    input_transforms = torchvision.transforms.Compose([
        lambda x: x / 255.0,
        torchvision.transforms.ToTensor(), lambda x: x.type(torch.FloatTensor)
    ])
    # Skip the first 2 elements in the label, which are the ID and age, and subtract 1
    # to make it one-hot encoded
    target_transforms = torchvision.transforms.Compose([lambda x: x[2:] - 1])
    loaders = {}
    for mode in _modes():
        dataset = Market1501Dataset(root=data_root,
                                    train=mode == 'train',
                                    input_transforms=input_transforms,
                                    target_transforms=target_transforms)
        loaders[mode] = torch.utils.data.DataLoader(dataset=dataset,
                                                    batch_size=_batch(mode),
                                                    drop_last=True)
    net = EDNet(input_shape=(3, 128, 64), num_classes=26, num_downsamples=3)
    print(net)
    net = net.to(DEVICE)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
    global_step = 0
    writer = SummaryWriter(
        log_dir=os.path.join(log_dir,
                             datetime.now().strftime('%Y-%m-%d-%H:%M:%S')),
        flush_secs=20,
        filename_suffix=datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))
    for epoch in range(epochs):
        running_loss = {'train': 0.0, 'test': 0.0}
        for mode in _modes():
            print("Running {} on {} samples".format(mode, len(loaders[mode])))
            if mode == "test":
                net.eval()
            else:
                net.train()
            for i, data in enumerate(loaders[mode]):
                inputs, labels = data
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                running_loss[mode] += loss.item()
                if mode == "train":
                    loss.backward()
                    optimizer.step()
                if mode == "train" and i % 500 == 0:
                    writer.add_scalar('loss/500th_iter_train_loss',
                                      loss.item(), global_step)
                    print("Training loss, iter {}: {}".format(
                        i, running_loss['train'] / (i + 1)))
                if mode == 'train':
                    global_step += 1
            writer.add_scalars('loss/epoch_loss', {
                '{}_loss'.format(mode):
                running_loss[mode] / len(loaders[mode])
            }, global_step)
        print("Epoch {}: Train Loss {}, Validation Loss {}.".format(
            epoch, running_loss['train'] / len(loaders['train']),
            running_loss['test'] / len(loaders['test'])))
    writer.close()
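
示例#8 relies on two small helpers, _modes() and _batch(), that are not shown. Plausible minimal versions, inferred purely from how they are called (the names and batch sizes here are hypothetical):

def _modes():
    # The loop runs 'train' before 'test' so running_loss['train'] is ready for printing.
    return ('train', 'test')

def _batch(mode):
    # Any per-mode integer batch size satisfies the call site.
    return 32 if mode == 'train' else 64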
示例#9
0
class Trainer:
    def __init__(self, kwargs):
        kwargs["env_cls"] = Atari
        env = kwargs["env_cls"](kwargs["env_id"])
        kwargs["state_shape"] = env.observation_space.shape
        kwargs["state_dtype"] = np.uint8
        kwargs["n_actions"] = env.action_space.n
        kwargs["device"] = torch.device(kwargs["device_id"])
        env.close()
        self.__dict__.update(kwargs)
        self.agent = DQNAgent(**kwargs)
        self.writer = SummaryWriter("./log/")
        self.cuda_eval = torch.cuda.Stream(self.device)

        mem_kwargs = dict(
            capacity=self.mem_capacity,
            history_len=self.history_len,
            state_shape=self.state_shape,
            state_dtype=self.state_dtype,
            batch_sz=self.batch_sz,
            alpha=self.mem_alpha,
            beta=LinearScheduler(self.mem_beta, 1., self.train_steps),
            priority_eps=self.mem_priority_eps,
            priority_upper=self.mem_priority_upper,
            prioritized_replay=self.prioritized_replay,
            device=self.device,
        )
        mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory
        self.mem = mem_cls(**mem_kwargs)
        self.mem_lock = Lock()
        self.sync = Queue(maxsize=1)
        self.sync.put(None)

    def play_thread(self):
        env = self.env_cls(self.env_id)
        terminal = True
        eps = LinearScheduler(self.eps_init, self.eps_final, self.eps_steps)
        behavior = list()
        with torch.cuda.stream(torch.cuda.Stream(self.device)):
            for global_step in range(-self.mem_init_sz, self.train_steps + 1):
                if terminal:
                    state = env.reset()
                actions, mu, sigma = self.agent.policy(
                    np.expand_dims(state, 0),
                    training=True,
                    eps=eps.get() if global_step > 0 else 1.,
                    return_streams=True,
                )
                action = actions[0]
                if mu is not None and sigma is not None:
                    mu = mu.cpu()[0]
                    behavior.append(mu.argmax(0).item() != action)
                state, reward, terminal, lost_live = env.step(action)
                with self.mem_lock:
                    self.mem.put(state[-2], action, np.sign(reward), terminal
                                 or lost_live)
                if global_step < 0:
                    continue
                eps.step()

                if global_step % self.optimize_freq == 0:
                    try:
                        self.sync.get(block=True, timeout=10.)
                    except Empty:
                        continue
                if len(behavior) > 0:
                    if self.adaptive_eps is not None and global_step % self.adaptive_freq == 0:
                        real_eps = np.mean(behavior[-self.adaptive_freq:])
                        self.agent.c += 0.01 * np.sign(self.adaptive_eps -
                                                       real_eps)
                        self.agent.c = max(0.01, self.agent.c)
                    if global_step % self.log_freq == 0:
                        if self.adaptive_eps is not None:
                            self.write(self.agent.c, "c", global_step)
                        self.write(np.mean(behavior), "behavior", global_step)
                        behavior = list()
        env.close()

    def train(self):
        Thread(target=self.play_thread, ).start()
        self.sync.put(None)
        start_t = datetime.now()
        for global_step in range(0, self.train_steps + 1):

            if global_step % self.print_freq == 0:
                step_time = (datetime.now() - start_t) / self.print_freq
                start_t = datetime.now()
                print(
                    "every {} steps {}\t4M {}\t200M {}\tremain {}M,{}".format(
                        self.optimize_freq,
                        step_time * self.optimize_freq,
                        step_time * 10**6,
                        step_time * (50 * 10**6),
                        (self.train_steps - global_step) * 4 // 10**6,
                        step_time * (self.train_steps - global_step),
                    ))
            if global_step % self.update_target_freq == 0:
                self.agent.update_target()
            if global_step % self.eval_freq == 0:
                self.agent.update_eval()
                eval_thread = Thread(target=self.eval, args=(global_step, ))
                eval_thread.start()

            if global_step % self.optimize_freq == 0:
                try:
                    self.sync.put(None, block=True, timeout=10.)
                except Full:
                    continue
                with self.mem_lock:
                    batch = self.mem.sample()
                idx, td_err = self.agent.optimize(*batch)
                if self.prioritized_replay:
                    with self.mem_lock:
                        self.mem.update_priority(idx,
                                                 np.abs(td_err.cpu().numpy()))
        self.sync.task_done()
        eval_thread.join()
        return

    def eval(self, global_step):
        eval_func = dict(
            frames=self.eval_by_frames,
            episodes=self.eval_by_episodes,
        )[self.eval_method]
        reward = eval_func()
        self.write(reward, "reward", global_step)
        self.writer.flush()
        return

    def eval_by_episodes(self):
        n_trials = self.eval_episodes
        envs = [Atari(self.env_id) for _ in range(n_trials)]
        states = np.stack([u.reset() for u in envs])
        actions = np.empty(n_trials, dtype=np.int64)
        reward = np.zeros(n_trials, dtype=np.float32)
        terminal = np.zeros(n_trials, dtype=bool)  # np.bool/np.int aliases were removed in NumPy 1.24
        with torch.cuda.stream(self.cuda_eval):
            while not terminal.all():
                not_t = ~terminal
                actions[not_t] = self.agent.policy(
                    states=states[not_t],
                    training=False,
                    eps=self.eps_eval,
                    return_streams=False,
                )
                for i, nt in enumerate(not_t):
                    if nt:
                        states[i], r, terminal[i], _ = envs[i].step(actions[i])
                        reward[i] += r
        for e in envs:
            e.close()
        return np.mean(reward)

    def eval_by_frames(self):
        rewards = list()
        reward = 0.
        env = Atari(self.env_id)
        state = env.reset()
        with torch.cuda.stream(self.cuda_eval):
            for step in range(self.eval_frames // 4):
                action = self.agent.policy(
                    np.expand_dims(state, 0),
                    training=False,
                    eps=self.eps_eval,
                    return_streams=False,
                )[0]
                state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    rewards.append(reward)
                    reward = 0.
                    state = env.reset()
        env.close()
        return np.mean(rewards)

    def write(self, value, category, step):
        frm_idx = step * 4
        self.writer.add_scalars(
            main_tag="{}/{}".format(category, self.env_id),
            tag_scalar_dict={self.label: value},
            global_step=frm_idx,
        )
        if not os.path.exists(CSV_FOLDER):
            os.makedirs(CSV_FOLDER)
        path = os.path.join(
            CSV_FOLDER,
            "{}--{}--{}.csv".format(category, self.env_id, self.label),
        )
        has_header = os.path.exists(path)
        with open(path, "a") as fp:
            if not has_header:
                fp.write("frame (millions), {}\n".format(category))
            fp.write("{:.2f}, {:.3f}\n".format(frm_idx / 10**6, value))
        return
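
LinearScheduler is used above both for epsilon and for the replay-memory beta, but its definition is omitted. A minimal sketch matching the observed interface, a (start, end, steps) constructor plus get() and step(), could be:

class LinearScheduler:
    # Linearly anneals a value from start to end over a fixed number of steps.
    def __init__(self, start, end, steps):
        self.value = start
        self.delta = (end - start) / steps
        self.steps_left = steps

    def get(self):
        return self.value

    def step(self):
        if self.steps_left > 0:
            self.value += self.delta
            self.steps_left -= 1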
示例#10
0
class SelfPlay:
    """
    Class which run in a dedicated thread to play games and save them to the replay-buffer.
    """

    def __init__(self, initial_weights, game, config, test=False, idx=-1, render=False):
        self.config: MuZeroConfigBase = config
        self.game = game
        self.idx = idx
        self.episode = 0
        self.render = render
        self.writer = SummaryWriter(self.config.results_path / f"self_play_{idx}")

        # Initialize the network
        self.model = models.MuZeroExtendedNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()

        self.continuous_self_play(test)

    def continuous_self_play(self, test_mode=False):
        while True:
            if self.config.v_self_play_count.value > 0:
                # Update the model if the trainer is running
                self.model.set_weights(self.config.q_weights.get())

            # Take the best action (no exploration) in test mode
            temperature = (
                0
                if test_mode
                else self.config.visit_softmax_temperature_fn(
                    trained_steps=self.config.v_training_step.value
                )
            )
            game_history = self.play_game(temperature, False)

            # Save to the shared storage
            score = sum(game_history.rewards)
            self.writer.add_scalars(
                f"1.Total reward/{'test' if test_mode else 'train'}",
                {f"env_{self.idx}": score},
                global_step=self.episode,
            )
            self.episode += 1
            if test_mode:
                self.config.v_total_reward.value = int(score)

            if not test_mode:
                self.config.q_save_game.put(game_history)

            if not test_mode and self.config.self_play_delay:
                time.sleep(self.config.self_play_delay)

    def play_game(self, temperature, render: bool = None):
        """
        Play one game, with actions based on a Monte Carlo tree search at each move.
        """
        if render is None:
            render = self.render
        game_history = GameHistory()
        observation = self.game.reset()
        game_history.observation_history.append(observation)
        done = False

        with torch.no_grad():
            while not done and len(game_history.action_history) < self.config.max_moves:

                root = MCTS(self.config).run(
                    self.model,
                    observation,
                    self.game.to_play(),
                    True if temperature else False,
                    self.game
                )

                action = select_action(root, temperature, self.game)

                observation, reward, done = self.game.step(action)

                if render:
                    self.game.render()

                game_history.observation_history.append(observation)
                game_history.rewards.append(reward)
                game_history.action_history.append(action)
                game_history.store_search_statistics(root, self.config.action_space)

        self.game.close()
        return game_history
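
play_game fills a GameHistory object whose definition is omitted. Judging from the attributes accessed above, a minimal container could look like the sketch below; the names touched on the MCTS root (children, visit_count, value()) are assumptions about the node API, not confirmed by this snippet.

class GameHistory:
    # Per-game buffers appended to during self-play (hypothetical reconstruction).
    def __init__(self):
        self.observation_history = []
        self.action_history = []
        self.rewards = []
        self.child_visits = []   # assumed: filled by store_search_statistics
        self.root_values = []    # assumed: filled by store_search_statistics

    def store_search_statistics(self, root, action_space):
        # Assumed behavior: record normalized root visit counts and the root
        # value as MCTS policy/value targets.
        total_visits = sum(child.visit_count for child in root.children.values())
        self.child_visits.append([
            root.children[a].visit_count / total_visits if a in root.children else 0
            for a in action_space
        ])
        self.root_values.append(root.value())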
示例#11
0
def train_model(model, dataloaders, criterion, optimizer, num_epochs,
                scheduler, string_name, device):

    logger = SummaryWriter()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_rank1_v, best_rank5_v = 0.0, 0.0
    feats_val, labels_val = torch.ones(len(dataloaders['val'].dataset),
                                       1024), torch.ones(
                                           len(dataloaders['val'].dataset))

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'val']:
            # configure model functionality - train/val
            if phase == 'train':
                model.train()
                running_loss, nr_batch_triplets = (0.0, 0)
            else:
                model.eval()

            for index, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                embeddings = F.adaptive_avg_pool2d(
                    F.relu(model.features(inputs), inplace=True),
                    (1, 1)).view(inputs.size(0), -1)
                # for each batch update
                if phase == 'train':
                    with torch.set_grad_enabled(phase == 'train'):

                        batch_triplets = triplet_selector.get_triplets(
                            embeddings, labels)
                        nr_batch_triplets += batch_triplets.size(0)
                        loss = criterion(embeddings[batch_triplets[:, 0]],
                                         embeddings[batch_triplets[:, 1]],
                                         embeddings[batch_triplets[:, 2]])
                        running_loss += batch_triplets.size(0) * loss.item()
                        loss.backward()
                        optimizer.step()

                else:
                    with torch.no_grad():
                        feats_val[index * dataloaders[phase].
                                  batch_size:dataloaders[phase].batch_size *
                                  (index + 1)] = embeddings
                        labels_val[index * dataloaders[phase].
                                   batch_size:dataloaders[phase].batch_size *
                                   (index + 1)] = labels
            # for each epoch
            if phase == 'train':
                epoch_loss = running_loss / nr_batch_triplets
                print('{} Triplet Loss: {:.4f} Informative triplets: {:.4f}'.
                      format(
                          phase, epoch_loss,
                          round(nr_batch_triplets / len(dataloaders[phase]))))
            else:
                rank1_v, rank5_v = compute_rank(feats_val, labels_val,
                                                dataloaders[phase], device)
                print('{} Rank-1: {:.4f} Rank-5: {:.4f}'.format(
                    phase, rank1_v, rank5_v))
                scheduler.step()

            # deep copy the model
            if phase == 'val' and rank1_v > best_rank1_v:
                best_rank1_v, best_rank5_v = rank1_v, rank5_v
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    }, string_name + '.pt')

        logger.add_scalars(
            string_name, {
                'train_loss': epoch_loss,
                'nr_triplets': nr_batch_triplets,
                'rank1_val': rank1_v,
                'rank5_val': rank5_v
            }, epoch + 1)
    logger.close()
    print(
        'Training Finished. Best Validation Rank-1: {:4f} and Best val Rank-5: {:4f} '
        .format(best_rank1_v, best_rank5_v))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
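
compute_rank is not shown either. A hedged sketch of a Rank-1/Rank-5 computation over the collected validation embeddings, assuming nearest-neighbour matching on normalized features with self-matches excluded (the loader argument is unused in this sketch):

import torch
import torch.nn.functional as F

def compute_rank(feats, labels, loader, device):
    feats = F.normalize(feats, dim=1).to(device)
    labels = labels.to(device)
    dist = torch.cdist(feats, feats)         # pairwise distances
    dist.fill_diagonal_(float('inf'))        # exclude self-matches
    knn = dist.argsort(dim=1)[:, :5]         # 5 nearest neighbours per query
    match = labels[knn] == labels.unsqueeze(1)
    rank1 = match[:, 0].float().mean().item()
    rank5 = match.any(dim=1).float().mean().item()
    return rank1, rank5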
示例#12
0
class BaseSolver():
    ''' 
    Prototype Solver for all kinds of tasks
    Arguments
        config - yaml-styled config
        paras  - argparse outcome
    '''
    def __init__(self, config, paras, mode):
        # General Settings
        self.config = config
        self.paras = paras
        self.mode = mode
        for k, v in default_hparas.items():
            setattr(self, k, v)
        self.device = torch.device(
            'cuda') if self.paras.gpu and torch.cuda.is_available(
            ) else torch.device('cpu')
        self.amp = paras.amp

        # Name experiment
        self.exp_name = paras.name
        if self.exp_name is None:
            # By default, exp is named after config file
            self.exp_name = paras.config.split('/')[-1].replace('.yaml', '')
            if mode == 'train':
                self.exp_name += '_sd{}'.format(paras.seed)

        # Plugin list
        self.emb_decoder = None

        if mode == 'train':
            # Filepath setup
            os.makedirs(paras.ckpdir, exist_ok=True)
            self.ckpdir = os.path.join(paras.ckpdir, self.exp_name)
            os.makedirs(self.ckpdir, exist_ok=True)

            # Logger settings
            self.logdir = os.path.join(paras.logdir, self.exp_name)
            self.log = SummaryWriter(self.logdir,
                                     flush_secs=self.TB_FLUSH_FREQ)
            self.timer = Timer()

            # Hyperparameters
            self.step = 0
            self.valid_step = config['hparas']['valid_step']
            self.max_step = config['hparas']['max_step']

            self.verbose('Exp. name : {}'.format(self.exp_name))
            self.verbose('Loading data... large corpus may take a while.')

        elif mode == 'test':
            # Output path
            os.makedirs(paras.outdir, exist_ok=True)
            self.ckpdir = os.path.join(paras.outdir, self.exp_name)

            # Load training config to get acoustic feat, text encoder and build model
            self.src_config = yaml.load(open(config['src']['config'], 'r'),
                                        Loader=yaml.FullLoader)
            self.paras.load = config['src']['ckpt']

            self.verbose('Evaluating result of tr. config @ {}'.format(
                config['src']['config']))

    def backward(self, loss):
        '''
        Standard backward step with self.timer and debugger
        Arguments
            loss - the loss to perform loss.backward()
        '''
        self.timer.set()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.GRAD_CLIP)
        if math.isnan(grad_norm):
            self.verbose('Error : grad norm is NaN @ step ' + str(self.step))
        else:
            self.optimizer.step()
        self.timer.cnt('bw')
        return grad_norm

    def load_ckpt(self):
        ''' Load ckpt if --load option is specified '''
        if self.paras.load:
            # Load weights
            ckpt = torch.load(
                self.paras.load,
                map_location=self.device if self.mode == 'train' else 'cpu')
            self.model.load_state_dict(ckpt['model'])
            # if self.emb_decoder is not None:
            # self.emb_decoder.load_state_dict(ckpt['emb_decoder'])
            # if self.amp:
            #    amp.load_state_dict(ckpt['amp'])
            # Load task-dependent items
            if self.mode == 'train':
                self.step = ckpt['global_step']
                self.optimizer.load_opt_state_dict(ckpt['optimizer'])
                self.verbose('Load ckpt from {}, restarting at step {}'.format(
                    self.paras.load, self.step))
            else:
                for k, v in ckpt.items():
                    if type(v) is float:
                        metric, score = k, v
                self.model.eval()
                if self.emb_decoder is not None:
                    self.emb_decoder.eval()
                self.verbose(
                    'Evaluation target = {} (recorded {} = {:.2f} %)'.format(
                        self.paras.load, metric, score))

    def verbose(self, msg):
        ''' Verbose function for printing information to stdout '''
        if self.paras.verbose:
            if type(msg) == list:
                for m in msg:
                    print('[INFO]', m.ljust(100))
            else:
                print('[INFO]', msg.ljust(100))

    def progress(self, msg):
        ''' Verbose function for updating progress on stdout (do not include newline) '''
        if self.paras.verbose:
            sys.stdout.write("\033[K")  # Clear line
            print('[{}] {}'.format(human_format(self.step), msg), end='\r')

    def write_log(self, log_name, log_dict):
        '''
        Write log to TensorBoard
            log_name - <str> name of the TensorBoard variable
            log_dict - <dict>/<tuple>/<str> value to log (e.g. dict of losses);
                       None/NaN entries are filtered out before logging
        '''
        if type(log_dict) is dict:
            log_dict = {
                key: val
                for key, val in log_dict.items()
                if (val is not None and not math.isnan(val))
            }
        if log_dict is None:
            pass
        elif len(log_dict) > 0:
            if 'align' in log_name or 'spec' in log_name:
                img, form = log_dict
                self.log.add_image(log_name,
                                   img,
                                   global_step=self.step,
                                   dataformats=form)
            elif 'text' in log_name or 'hyp' in log_name:
                self.log.add_text(log_name, log_dict, self.step)
            else:
                self.log.add_scalars(log_name, log_dict, self.step)

    def save_checkpoint(self, f_name, metric, score, show_msg=True):
        '''
        Ckpt saver
            f_name - <str> name of the ckpt file (w/o path prefix) to store; overwritten if it already exists
            score  - <float> value of the metric used to evaluate the model
        '''
        ckpt_path = os.path.join(self.ckpdir, f_name)
        full_dict = {
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.get_opt_state_dict(),
            "global_step": self.step,
            metric: score
        }
        # Additional modules to save
        # if self.amp:
        #    full_dict['amp'] = self.amp_lib.state_dict()
        if self.emb_decoder is not None:
            full_dict['emb_decoder'] = self.emb_decoder.state_dict()

        torch.save(full_dict, ckpt_path)
        if show_msg:
            self.verbose(
                "Saved checkpoint (step = {}, {} = {:.2f}) and status @ {}".
                format(human_format(self.step), metric, score, ckpt_path))

    def enable_apex(self):
        if self.amp:
            # Enable mixed precision computation (ToDo: Save/Load amp)
            from apex import amp
            self.amp_lib = amp
            self.verbose(
                "AMP enabled (check https://github.com/NVIDIA/apex for more details)."
            )
            self.model, self.optimizer.opt = self.amp_lib.initialize(
                self.model, self.optimizer.opt, opt_level='O1')

    # ----------------------------------- Abstract Methods ------------------------------------------ #
    @abc.abstractmethod
    def load_data(self):
        '''
        Called by main to load all data
        After this call, data related attributes should be setup (e.g. self.tr_set, self.dev_set)
        No return value
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def set_model(self):
        '''
        Called by main to set models
        After this call, model related attributes should be setup (e.g. self.l2_loss)
        The following MUST be set up
            - self.model (torch.nn.Module)
            - self.optimizer (src.Optimizer),
                init. w/ self.optimizer = src.Optimizer(self.model.parameters(),**self.config['hparas'])
        Loading pre-trained model should also be performed here 
        No return value
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def exec(self):
        '''
        Called by main to execute training/inference
        '''
        raise NotImplementedError
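
write_log above dispatches on substrings of the tag name. A short usage sketch (all variable names here are hypothetical):

# Grouped scalar curves end up under one chart via add_scalars:
self.write_log('loss', {'tr': train_loss, 'dv': dev_loss})
# 'align'/'spec' tags expect an (image, dataformats) tuple for add_image:
self.write_log('align', (attention_img, 'HWC'))
# 'text'/'hyp' tags are written with add_text:
self.write_log('hyp', 'decoded hypothesis text')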
示例#13
0
class GpuPynvmlLogger(Thread):
    """
    Logger for the GPU resources GPU and RAM utilization.

    `CpuInfo` is implemented on a separate thread as any attachment to an event would would effectively measure
    the GPU/CPU-utilization of the downtime, as all events are not fired during the `Engine().process()` where the
    GPU/CPU is in use. Triggering the logging independently will randomize the measurements during
    _up times_ (when `Engine().process()` is running) and _down times_.

    Args:
        logger_directory: directory for tensorboard logs
        logger_name: name of logger
        log_interval_seconds: logging interval in seconds. Decreasing the `log_interval_seconds < 0.1` may \
        reasonably increase (> ~5-30%) the GPU-utilization by the measurement task
        unit (['KB', 'MB', 'GB']): logging unit defaults to `'GB'`
    """
    def __init__(self,
                 logger_directory,
                 logger_name='GPULogger',
                 log_interval_seconds=1,
                 unit='GB'):
        super(GpuPynvmlLogger, self).__init__(name=logger_name, daemon=True)
        # CAUTION: Always avoid more than one `SummaryWriter` logging to the same directory
        # because this will lead to log file losses
        self.logger_directory = logger_directory
        self.log_interval_seconds = log_interval_seconds
        nvmlInit()
        self.gpu_count = nvmlDeviceGetCount()
        self.gpu_handles = {}
        for gpu_idx in range(self.gpu_count):
            hdl = nvmlDeviceGetHandleByIndex(gpu_idx)
            name = nvmlDeviceGetName(hdl).decode('ascii').replace(' ', '_')
            self.gpu_handles['GPU{}_{}'.format(gpu_idx, name)] = hdl
        self._tb_logger = SummaryWriter(logdir=self.logger_directory)
        self._memory_stats_to_log = ['total', 'used', 'free']
        self._log_gpu = True
        self._unit = unit
        self._units = {'KB': 1024, 'MB': 1024**2, 'GB': 1024**3}
        self._start_time = None

    def run(self):
        """
        Target function of the thread that logs GPU resources to tensoboard till it is closed.
        CAUTION:
            DO NOT CALL `self.run()`  on its own but CALL `self.start()` inherited from `Thread`.
            Otherwise `self.run()` will simple be executed in the `MainThread` instead of passed
            as target function to the new thread.
        :return:
        """
        self._start_time = time()
        while self._log_gpu:
            self._log_gpu_utilization()
            self._log_gpu_memory()
            sleep(self.log_interval_seconds)

    def _log_gpu_memory(self):
        # Get memory statistics for each GPU
        for gpu_name, gpu_hdl in self.gpu_handles.items():
            # Get current memory stats
            memory_sizes = nvmlDeviceGetMemoryInfo(handle=gpu_hdl)
            memory_stats = {}
            # Select memory statistics to be logged and calculate units
            for mem_stat in self._memory_stats_to_log:
                memory_stats[mem_stat] = memory_sizes.__getattribute__(
                    mem_stat) / self._units[self._unit]
            # log memory statistics to tensorboard
            self._tb_logger.add_scalars(main_tag='{}_memory_{}'.format(
                gpu_name, self._unit),
                                        tag_scalar_dict=memory_stats,
                                        global_step=time() - self._start_time)

    def _log_gpu_utilization(self):
        gpu_utilizations = {}
        # Get current GPU utilizations in percent
        for gpu_name, gpu_hdl in self.gpu_handles.items():
            gpu_percentage = nvmlDeviceGetUtilizationRates(handle=gpu_hdl).gpu
            gpu_utilizations[gpu_name] = gpu_percentage
        # log CPU utilization to tensorboard
        self._tb_logger.add_scalars(main_tag='GPUs_utilization_percentage',
                                    tag_scalar_dict=gpu_utilizations,
                                    global_step=time() - self._start_time)

    def close(self):
        # Quit while-loop in `self.run()`
        self._log_gpu = False
        # Close tensorboard logger
        self._tb_logger.close()
        # Join thread
        self.join()
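
Typical usage of the logger: start the thread, run the workload, then close it. The training function and directory name below are placeholders:

gpu_logger = GpuPynvmlLogger(logger_directory='./runs/gpu', log_interval_seconds=1)
gpu_logger.start()   # NOT .run(); see the docstring above
try:
    run_training()   # placeholder for the actual workload
finally:
    gpu_logger.close()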
示例#14
0
class train_config(ut_cfg.config):
    def __init__(self):
        super(train_config, self).__init__(pBs=32, pWn=4, p_force_cpu=False)
        self.path_save_mdroot = self.check_path_valid(
            os.path.join(ROOT, "outputs"))
        localtime = time.localtime(time.time())
        self.path_save_mdid = "alexMNIST" + "%02d%02d" % (localtime.tm_mon,
                                                          localtime.tm_mday)

        self.save_epoch_begin = 50
        self.save_epoch_interval = 25

        self.log_epoch_txt = open(
            os.path.join(self.path_save_mdroot, "conv_epoch_loss_log.txt"),
            'a+')
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.path_save_mdroot, "board"))

        self.height_in = 28
        self.width_in = 28
        self.class_num = 10

        self.method_init = "xavier"  #"preTrain" #"kaming"#"xavier"
        self.training_epoch_amount = 150

        self.dtroot = os.path.join(ROOT, "datasets")

        self.opt_baseLr = 1e-3
        self.opt_beta1 = 0.9
        self.opt_weightdecay = 3e-6

    def init_net(self, pNet):
        if self.method_init == "xavier":
            ut_init.init_xavier(pNet)
        elif self.method_init == "kaiming":
            ut_init.init_kaiming(pNet)

        elif self.method_init == "preTrain":
            assert self.preTrain_model_path is not None, "weight path not given"
            pNet.load_state_dict(torch.load(self.preTrain_model_path))

        pNet.to(self.device).train()

    def create_dataset(self, istrain):
        if istrain:
            imgUbyte_absfilename = r"datasets\MNIST\train-images-idx3-ubyte.gz"
            labelUbyte_absfilename = r"datasets\MNIST\train-labels-idx1-ubyte.gz"
        else:
            imgUbyte_absfilename = r"datasets\MNIST\t10k-images-idx3-ubyte.gz"
            labelUbyte_absfilename = r"datasets\MNIST\t10k-labels-idx1-ubyte.gz"
        q_dataset = mnstld.minist_Loader(imgUbyte_absfilename,
                                         labelUbyte_absfilename)

        return q_dataset

    def name_save_model(self, save_mode, epochX=None):
        model_filename = self.path_save_mdid
        if save_mode == "processing":
            assert epochX is not None, "miss the epoch info"
            model_filename += "_%03d" % (epochX) + ".pth"
        elif save_mode == "ending":
            model_filename += "_%03d" % (self.training_epoch_amount) + ".pth"
        elif save_mode == "interrupt":
            model_filename += "_interrupt" + ".pth"
        assert os.path.splitext(model_filename)[-1] == ".pth"
        q_abs_path = os.path.join(self.path_save_mdroot, model_filename)
        return q_abs_path

    def log_in_file(self, *print_paras):
        for para_i in print_paras:
            print(para_i, end="")
            print(para_i, end="", file=self.log_epoch_txt)
        print("")
        print("", file=self.log_epoch_txt)

    def log_in_board(self, chartname, data_Dic, epoch):
        # for key_i, val_i in data_Dic:
        self.writer.add_scalars(chartname, data_Dic, epoch)

    def validate(self, pNet):
        # use the classification acc to validate the convNet performance
        valid_dataset = self.create_dataset(istrain=False)
        validloader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                                  batch_size=self.ld_batchsize,
                                                  shuffle=True,
                                                  num_workers=self.ld_workers)

        acc_Lst = []  # len(validloader) = 313
        for iter_idx, (img_Tsor_batch_i,
                       label_Tsor_batch_i) in enumerate(validloader):
            img_Tsor_batch_i = img_Tsor_batch_i.to(self.device)
            label_Tsor_batch_i = label_Tsor_batch_i.to(self.device)
            # use the network passed in (pNet), not an undefined global
            pred_Tsor_batch_i = pNet(img_Tsor_batch_i)
            max_likeli_pred_batch_i = torch.argmax(pred_Tsor_batch_i, dim=-1)

            error_num = (max_likeli_pred_batch_i -
                         label_Tsor_batch_i).nonzero().shape[0]
            cur_acc = 1 - error_num / label_Tsor_batch_i.shape[0]
            acc_Lst.append(cur_acc)

        return sum(acc_Lst) / len(acc_Lst)
示例#15
0
def train(appliance_name, model, mains, appliance, epochs, batch_size, pretrain=False, checkpoint_interval=None,
          train_patience=3):
    # Model configuration
    if USE_CUDA:
        model = model.cuda()
    if not pretrain:
        model.apply(initialize)
    summary(model, (1, mains.shape[1]))
    # Split the train and validation set
    train_mains, valid_mains, train_appliance, valid_appliance = train_test_split(mains, appliance, test_size=.2,
                                                                                  random_state=random_seed)

    # Create optimizer, loss function, and dataloader
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss(reduction='mean')

    train_dataset = TensorDataset(torch.from_numpy(train_mains).float().permute(0, 2, 1),
                                  torch.from_numpy(train_appliance).float())
    train_loader = tud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)

    valid_dataset = TensorDataset(torch.from_numpy(valid_mains).float().permute(0, 2, 1),
                                  torch.from_numpy(valid_appliance).float())
    valid_loader = tud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)

    writer = SummaryWriter(comment='train_visual')
    patience, best_loss = 0, None

    for epoch in range(epochs):
        # Early stopping
        if patience == train_patience:
            print("val_loss did not improve for {} epochs; stopping early".format(train_patience))
            break
        # train the model
        model.train()
        st = time.time()
        for i, (batch_mains, batch_appliance) in enumerate(train_loader):
            if USE_CUDA:
                batch_mains = batch_mains.cuda()
                batch_appliance = batch_appliance.cuda()

            batch_pred = model(batch_mains)
            loss = loss_fn(batch_appliance, batch_pred)

            model.zero_grad()
            loss.backward()
            optimizer.step()
        ed = time.time()

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            cnt, loss_sum = 0, 0
            for i, (batch_mains, batch_appliance) in enumerate(valid_loader):
                if USE_CUDA:
                    batch_mains = batch_mains.cuda()
                    batch_appliance = batch_appliance.cuda()

                batch_pred = model(batch_mains)
                loss = loss_fn(batch_appliance, batch_pred)
                loss_sum += loss
                cnt += 1

        final_loss = loss_sum / cnt
        # Save best only
        if best_loss is None or final_loss < best_loss:
            best_loss = final_loss
            patience = 0
            net_state_dict = model.state_dict()
            path_state_dict = "./" + appliance_name + "_AttentionCNN_best_state_dict.pt"
            torch.save(net_state_dict, path_state_dict)
        else:
            patience = patience + 1

        print("Epoch: {}, Valid_Loss: {}, Time consumption: {}s.".format(epoch, final_loss, ed - st))

        # For the visualization of training process
        for name, param in model.named_parameters():
            writer.add_histogram(name + '_grad', param.grad, epoch)
            writer.add_histogram(name + '_data', param, epoch)
        writer.add_scalars("MSELoss", {"Valid": final_loss}, epoch)

        # Save checkpoint
        if (checkpoint_interval is not None) and ((epoch + 1) % checkpoint_interval == 0):
            checkpoint = {"model_state_dict": model.state_dict(),
                          "optimizer_state_dict": optimizer.state_dict(),
                          "epoch": epoch}
            path_checkpoint = "./" + appliance_name + "_AttentionCNN_{}_epoch.pkl".format(epoch)
            torch.save(checkpoint, path_checkpoint)
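
The snippet applies an initialize function to the model that is not shown. A common weight-initialization pattern that would fit model.apply(initialize) is sketched below; this is an assumption, not the project's actual initializer:

import torch.nn as nn

def initialize(layer):
    # model.apply(initialize) visits every submodule.
    if isinstance(layer, (nn.Conv1d, nn.Linear)):
        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)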
示例#16
0
class Train:
    def __init__(self, root):

        self.summaryWriter = SummaryWriter("./logs")

        # Load the training data
        self.train_dataset = datasets.CIFAR10(root,
                                              True,
                                              transform=transforms.ToTensor(),
                                              download=True)
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=100,
            shuffle=True,
        )

        # Load the test data
        self.test_dataset = datasets.CIFAR10(root,
                                             False,
                                             transform=transforms.ToTensor(),
                                             download=True)
        self.test_dataloder = DataLoader(
            self.test_dataset,
            100,
        )

        # Create the model
        self.net = NetV2()
        # self.net.load_state_dict(torch.load("./checkpoint/2.t"))
        # self.net.to(DEVICE)

        # Create the optimizer
        self.opt = optim.Adam(self.net.parameters())

        self.loss_fn = nn.CrossEntropyLoss()

    # Training code
    def __call__(self):
        for epoch in range(100000):
            self.net.train()
            sum_loss = 0.
            for i, (imgs, tags) in enumerate(self.train_dataloader):
                y = self.net(imgs)

                # Regularization
                # L2 = []
                # for param in self.net.parameters():
                #     L2 += torch.sum(param ** 2)

                # loss = torch.mean((tags - y) ** 2)
                loss = self.loss_fn(y, tags)
                # loss = self.loss_fn(y, tags) + 0.01 * L2

                self.opt.zero_grad()
                loss.backward()
                self.opt.step()

                sum_loss += loss.cpu().detach().item()

            avg_loss = sum_loss / len(self.train_dataloader)

            self.net.eval()
            sum_score = 0.
            test_sum_loss = 0.
            for i, (imgs, tags) in enumerate(self.test_dataloder):
                # imgs, tags = imgs.to(DEVICE), tags.to(DEVICE)
                test_y = self.net(imgs)
                # test_loss = torch.mean((tags - test_y) ** 2)
                test_loss = self.loss_fn(test_y, tags)
                test_sum_loss += test_loss.cpu().detach().item()

                pred_tags = torch.argmax(test_y, dim=1)
                # label_tags = torch.argmax(tags, dim=1)
                sum_score += torch.sum(torch.eq(
                    pred_tags, tags).float()).cpu().detach().item()

            # Log sample test images
            self.summaryWriter.add_images("imgs", imgs[:10], epoch)

            test_avg_loss = test_sum_loss / len(self.test_dataloder)
            score = sum_score / len(self.test_dataset)

            self.summaryWriter.add_scalars("loss", {
                "train_loss": avg_loss,
                "test_loss": test_avg_loss
            }, epoch)
            self.summaryWriter.add_scalar("score", score, epoch)

            layer1_weight = self.net.seq[1].weight
            layer2_weight = self.net.seq[5].weight
            layer3_weight = self.net.seq[9].weight

            self.summaryWriter.add_histogram("later1", layer1_weight, epoch)
            self.summaryWriter.add_histogram("later2", layer2_weight, epoch)
            self.summaryWriter.add_histogram("later3", layer3_weight, epoch)

            print(epoch, avg_loss, test_avg_loss, score)

            torch.save(self.net.state_dict(), f"./checkpoint/{epoch}.t")
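
Driving this class is a matter of constructing it with a dataset root and calling it; the path below is hypothetical:

if __name__ == '__main__':
    trainer = Train(root='./data')  # hypothetical CIFAR-10 root directory
    trainer()                       # __call__ runs the train/eval loop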
示例#17
0
class Run():
    def __init__(self,
                 modeln="MyModel",
                 val_length=10,
                 batch_size=2,
                 classifications_file="classifications.pkl",
                 learning_rate=3e-2):

        # SAMPLE FOR VALIDATION AND TEST SETS
        self.val_length = val_length  # , self.test_length = 20, 0
        self.batch_size = batch_size
        self.classifications_file = classifications_file
        self.lr = learning_rate

        sample = random.sample(range(0, len(master_list)),
                               k=self.val_length)  # + self.test_length
        # sample  = sample_by_label(master_list, val_size=self.val_length, n_min=2)
        # self.test_list = [e for i, e in enumerate(master_list) if i in sample[self.val_length:]]
        self.val_list = [e for i, e in enumerate(master_list) if i in sample]
        self.train_list = [
            e for i, e in enumerate(master_list) if i not in sample
        ]

        print("train length: %s \t val length: %s \t test length: " %
              (len(self.train_list), len(self.val_list)))  # , len(self.te)))

        self.train_dataset = PET_CT_Dataset(self.train_list)
        self.val_dataset = PET_CT_Dataset(self.val_list)
        # self.test_dataset = PET_CT_Dataset(self.test_list)

        self.train_loader = DataLoader(self.train_dataset,
                                       batch_size=self.batch_size,
                                       num_workers=4,
                                       shuffle=True)
        self.val_loader = DataLoader(self.val_dataset,
                                     batch_size=self.batch_size,
                                     num_workers=4,
                                     shuffle=False,
                                     drop_last=False)
        # self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

        self.writer = SummaryWriter()
        self.modeln = modeln
        self.model = self._init_model(model_name=self.modeln)
        self.model = self.model.to(device)

        self.loss_ce = nn.BCELoss()
        # self.loss_ce = nn.BCEWithLogitsLoss()  # BCE loss is supposed to be used with a sigmoid, not with softmax!!

        # both optimizers work quite well :)
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.lr,
                                         weight_decay=5e-3,
                                         momentum=0.9)  # works better?
        # self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-3)  # weight_decay=5e-3, momentum=0.9)
        self.global_step = 0
        self.val_top_loss = 1e5
        self.train_top_loss = 1e5

    def _init_model(self, model_name):
        if model_name == "MyModel":
            return MyModel(num_classes=5)
        if model_name == 'resnet10':
            return resnet10(num_classes=5, activation="softmax")
        else:
            return None

    # TODO: implement forward function
    def forward(self, *inputs):
        raise NotImplementedError

    def epoch_train(self):
        self.model = self.model.train()
        epoch_loss = 0
        for ct, pet, merged, label, _ in self.train_loader:
            self.optimizer.zero_grad()
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)  # no schema error!!
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.sum().detach().cpu()
        epoch_loss /= len(self.train_list)
        self.writer.add_scalar("train_loss",
                               epoch_loss,
                               global_step=self.global_step)
        return epoch_loss

    def epoch_val(self):
        self.model = self.model.eval()
        epoch_loss = 0
        log_txt = ""
        for ct, pet, merged, label, _ in self.val_loader:
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            epoch_loss += loss.sum().detach().cpu()
            # otpt = F.sigmoid(otpt)

            log_txt += f'truth: \t{str(label.detach().cpu().numpy())} output: \t{str(otpt.detach().cpu().numpy())}\n'
        epoch_loss /= len(self.val_list)
        self.writer.add_text("val_",
                             text_string=log_txt,
                             global_step=self.global_step)
        self.writer.add_scalar("val_loss",
                               epoch_loss,
                               global_step=self.global_step)
        return epoch_loss

    def evaluate_classification(self):
        self.model = torch.load(
            os.path.join(self.writer.log_dir, "best_val.pth"))
        label_list = m_list_settings['encoding'][1]
        try:
            with open(self.classifications_file, "rb") as f:
                classifications = pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            classifications = dict()
            classifications['val_loss'] = list()  # runs at the end anyway, once the model is already optimized
            classifications['model_version'] = list()
            classifications['truth'] = dict()
            classifications['pred'] = dict()
            classifications['CT_dirs'] = list()
            classifications['PET_dirs'] = list()
            for l in label_list:
                classifications['truth'][l] = list()
                classifications['pred'][l] = list()

        self.model = self.model.eval()
        val_loss = 0
        for ct, pet, merged, label, entry in self.val_loader:
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            val_loss += loss.sum().detach().cpu()
            otpt = otpt.detach().cpu().numpy()
            label = label.detach().cpu().numpy()
            for b in range(otpt.shape[0]):  # for example in batch
                classifications['model_version'].append(self.writer.log_dir)
                classifications['CT_dirs'].append(entry['CT_dir'])
                classifications['PET_dirs'].append(entry['PET_dir'])
                for il, l in enumerate(label_list):
                    classifications['truth'][l].append(label[b, il])
                    classifications['pred'][l].append(otpt[b, il])
        classifications['val_loss'].append(val_loss)

        with open(self.classifications_file, "wb") as f:
            pickle.dump(classifications, f)

    def train(self, no_epochs=10):

        for i in range(no_epochs):
            t0 = time()
            self.global_step += 1
            tr = self.epoch_train()
            val = self.epoch_val()
            self.writer.add_scalars(main_tag="losses",
                                    tag_scalar_dict={
                                        'train_loss': tr,
                                        "val_loss": val
                                    },
                                    global_step=self.global_step)
            if val < self.val_top_loss:
                torch.save(self.model,
                           os.path.join(self.writer.log_dir, "best_val.pth"))
                self.val_top_loss = val
                print("saved_top_model_val")
            if tr < self.train_top_loss:
                torch.save(self.model,
                           os.path.join(self.writer.log_dir, "best_tr.pth"))
                self.train_top_loss = tr
                print("saved_top_model_tr")

            print(f"STEP: {i} TRAINLOSS: {tr} VALLOSS {val} dt {time() - t0}")
        self.writer.close()
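
A typical driver for this class might look as follows; the epoch budget is hypothetical:

run = Run(modeln='resnet10', val_length=10, batch_size=2)
run.train(no_epochs=50)          # hypothetical epoch budget
run.evaluate_classification()    # reloads best_val.pth and appends results to the pickle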
示例#18
0
File: main.py  Project: kimshun09/lenet
def main():
    epoch = 500
    history = {
        'train_loss': [],
        'test_loss': [],
        'train_acc': [],
        'test_acc': []
    }
    loader = load_cifar10()

    net = CNN()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(params=net.parameters(),
                                lr=0.001,
                                momentum=0.9)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    net.to(device)
    print(device)

    writer = SummaryWriter(log_dir='./logs')

    for e in range(epoch):
        net.train()
        loss = None
        for i, (images, labels) in enumerate(loader['train']):
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            output = net(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            if i % 10 == 0:
                print(
                    f'Training log: {e+1:03} epoch ({(i+1)*128:05} / 50000 train. data). Loss: {loss.item()}'
                )

        history['train_loss'].append(loss.item())
        net.eval()

        correct = 0
        with torch.no_grad():
            for i, (images, labels) in enumerate(loader['train']):
                images = images.to(device)
                labels = labels.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
        acc = float(correct / 50000)
        history['train_acc'].append(acc)
        print(f'Accuracy on train. data: {acc}')

        loss_test_sum = 0.0
        correct = 0
        with torch.no_grad():
            for i, (images, labels) in enumerate(loader['test']):
                images = images.to(device)
                labels = labels.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
                loss_test_sum += criterion(outputs, labels).item()
        # average over all test batches instead of keeping only the last batch's loss
        loss_test = loss_test_sum / len(loader['test'])
        acc_test = float(correct / 10000)
        history['test_acc'].append(acc_test)
        history['test_loss'].append(loss_test)
        print(f'Accuracy on test data: {acc_test}')
        print(f'Loss on test: {loss_test}')

        writer.add_scalars('Loss', {
            'train': loss.item(),
            'test': loss_test
        }, e)
        writer.add_scalars('Accuracy', {'train': acc, 'test': acc_test}, e)

    print(history)
    writer.close()
Example #19
class Trainer(object):
    def __init__(self, model, train_loader, val_loader, args, device, logging):
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.args = args
        self.device = device
        self.logging = logging

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.AdamW(model.parameters(),
                                           lr=args.lr,
                                           weight_decay=0)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='max',
            factor=0.5,
            patience=3,
            verbose=True,
            min_lr=1e-5)
        if args.action == 'train':
            self.writer = SummaryWriter(log_dir=args.tensorboard_dir)
            self.inputs = next(iter(train_loader))[0]
            self.writer.add_graph(model,
                                  self.inputs.to(device, dtype=torch.float32))
        if args.DataParallel:
            self.model = torch.nn.DataParallel(model)
        else:
            self.model = model

    def train(self):
        epochs = self.args.epochs
        n_train = len(self.train_loader.dataset)
        step = 0
        best_acc = 0.
        accs = AverageMeter()
        for epoch in range(epochs):
            self.model.train()
            epoch_loss = 0
            # training
            with tqdm(total=n_train,
                      desc=f'Epoch {epoch + 1}/{epochs}',
                      unit='img') as pbar:
                for batch in self.train_loader:
                    images, labels = batch[0], batch[1]

                    images = images.to(device=self.device, dtype=torch.float32)
                    labels = labels.to(device=self.device, dtype=torch.long)
                    preds = self.model(images)
                    loss = self.criterion(preds, labels)

                    epoch_loss += loss.item()
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    accs.update((preds.argmax(1) == labels).sum().item() /
                                images.size(0), images.size(0))
                    pbar.set_postfix(**{'loss': loss.item(), 'acc': accs.avg})
                    self.writer.add_scalar('acc/train', accs.avg, step)
                    self.writer.add_scalar('Loss/train', loss.item(), step)
                    pbar.update(images.shape[0])
                    step = step + 1
            # eval
            if (epoch + 1) % self.args.val_epoch == 0:
                acc = self.test(mode='val')
                if acc > best_acc:
                    best_acc = acc
                    if self.args.save_path:
                        if not os.path.exists(self.args.save_path):
                            os.makedirs(self.args.save_path)
                        torch.save(self.model.state_dict(),
                                   f'{self.args.save_path}/best_model.pth')
                        self.logging.info(
                            char_color('best model saved!', word=33))

                self.logging.info(f'acc: {acc}')
                self.writer.add_scalars('Valid', {'acc': acc}, step)
                self.writer.add_scalar('learning_rate',
                                       self.optimizer.param_groups[0]['lr'],
                                       step)
                self.scheduler.step(acc)
            if (epoch + 1) % self.args.save_model_epoch == 0:
                if self.args.save_path:
                    if not os.path.exists(self.args.save_path):
                        os.makedirs(self.args.save_path)
                    model_name = f'{self.args.task}_'
                    torch.save(
                        self.model.state_dict(),
                        f'{self.args.save_path}/{model_name}{epoch + 1}.pth')
                    self.logging.info(
                        char_color(f'Checkpoint {epoch + 1} saved!'))
        self.writer.close()

    def test(self, mode='val', model_path=None, aug=False):
        self.model.eval()  # train(False) and eval() are equivalent; one call suffices

        accs = AverageMeter()
        test_len = len(self.val_loader)
        step = 0
        with torch.no_grad():
            with tqdm(total=test_len, desc=f'{mode}', unit='batch') as pbar:
                for batch in self.val_loader:
                    images, labels = batch[0], batch[1]
                    images = images.to(device=self.device, dtype=torch.float32)
                    labels = labels.to(device=self.device, dtype=torch.long)
                    preds = self.model(images)
                    accs.update((preds.argmax(1) == labels).sum().item() /
                                images.size(0), images.size(0))
                    pbar.set_postfix(**{'acc': accs.avg})
                    pbar.update(1)  # the progress total is measured in batches, not images
                    step = step + 1
        return accs.avg
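Example #19 leans on an AverageMeter helper (and a char_color formatter) that the snippet does not define. The meter is the conventional running-average tracker; a minimal sketch of the usual implementation (an assumption, the project's own class may differ):

class AverageMeter:
    """Tracks a running weighted average, e.g. of per-batch accuracy."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        # value is a per-batch average; n is the batch size it was computed over
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count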
Example #20
class Train:
    def __init__(self, root):
        self.epoch = 100000
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.param_path = 'param/params.pt'

        # Load the training set
        self.train_dataset = MyDataset(root)
        self.train_dataloader = DataLoader(self.train_dataset,
                                           batch_size=100,
                                           shuffle=True,
                                           num_workers=8)

        # Load the validation set
        self.test_dataset = MyDataset(root, is_train=False)
        self.test_dataloader = DataLoader(self.test_dataset,
                                          batch_size=10,
                                          shuffle=False,
                                          num_workers=0)

        # Define the network
        # self.net = RnnNet().to(self.device)
        self.net = Seq2seqNet().to(self.device)

        # Load saved parameters
        if os.path.exists(self.param_path):
            self.net.load_state_dict(torch.load(self.param_path))

        # Define the optimizer and loss
        self.optim = torch.optim.Adam(self.net.parameters())
        self.loss_func = nn.MSELoss()

    def __call__(self):
        self.summ = SummaryWriter('./logs')
        for epoch in range(self.epoch):
            loss_train_sum = 0
            for i, (img, label) in enumerate(tqdm(self.train_dataloader)):
                img = img.to(self.device)
                label = label.to(self.device)

                out = self.net(img)
                loss = self.loss_func(out, label)

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

                loss_train_sum += loss.detach().cpu().item()
            loss_train_avg = torch.true_divide(loss_train_sum,
                                               len(self.train_dataloader))

            loss_test_sum = 0
            acc = 0
            for i, (img, tage) in enumerate(tqdm(self.test_dataloader)):
                # Move the data and labels onto the device for computation
                input, test_tage = img.to(self.device), tage.to(self.device)
                test_output = self.net(input)

                pred = torch.argmax(test_output, 2).detach().cpu().numpy()
                label = torch.argmax(tage, 2).detach().cpu().numpy()

                loss = self.loss_func(test_output, test_tage)
                loss_test_sum += loss.cpu().item()
                acc += np.mean(np.all(pred == label, axis=1))  # a sequence counts as correct only if every position matches

            loss_test_avg = torch.true_divide(loss_test_sum,
                                              len(self.test_dataloader))
            acc_avg = torch.true_divide(acc, len(self.test_dataloader))
            # add_scalars logs several values under one tag; add_scalar logs only one
            self.summ.add_scalars("loss", {
                "train_avg_loss": loss_train_avg,
                "test_avg_loss": loss_test_avg
            }, epoch)
            self.summ.add_scalar("acc", acc_avg, epoch)

            # Save the network parameters (w, b). torch.save does not create
            # directories automatically, so the folder must exist beforehand; the
            # checkpoint is a binary file. Saving every epoch guards against
            # unexpected interruptions.
            torch.save(self.net.state_dict(), self.param_path)
            print(epoch, "train loss", loss_train_avg.item(), "test loss",
                  loss_test_avg.item(), "score", acc_avg.item())
Example #21
for epoch in range(20):
    total_train_loss, train_avg_auc, train_auc, train_data_pr, train_duration = one_epoch_train(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
        scaler,
        iters_to_accumulate=8,
        clip_grads=False)
    total_val_loss, val_avg_auc, val_auc, val_data_pr, val_duration = eval_model(
        model, val_loader, device, criterion, scaler)
    scheduler.step()

    writer.add_scalars('avg/loss', {
        'train': total_train_loss,
        'val': total_val_loss
    }, epoch)
    writer.add_scalars('avg/auc', {
        'train': train_avg_auc,
        'val': val_avg_auc
    }, epoch)
    writer.flush()

    print(
        'EPOCH %d:\tTRAIN [duration %.3f sec, loss: %.3f, avg auc: %.3f]\t\t'
        'VAL [duration %.3f sec, loss: %.3f, avg auc: %.3f]\tCurrent time %s' %
        (epoch + 1, train_duration, total_train_loss, train_avg_auc,
         val_duration, total_val_loss, val_avg_auc,
         str(datetime.now(timezone('Europe/Moscow')))))

    torch.save(
Example #22
def train_gan(training_config):
    writer = SummaryWriter()
    device = torch.device("cpu")

    # Download MNIST dataset in the directory data
    mnist_data_loader = utils.get_mnist_data_loader(
        training_config['batch_size'])

    discriminator_net, generator_net = utils.get_gan(device,
                                                     GANType.CLASSIC.name)
    discriminator_opt, generator_opt = utils.get_optimizers(
        discriminator_net, generator_net)

    adversarial_loss = nn.BCELoss()
    real_image_gt = torch.ones((training_config['batch_size'], 1),
                               device=device)
    fake_image_gt = torch.zeros((training_config['batch_size'], 1),
                                device=device)

    ref_batch_size = 16
    ref_noise_batch = utils.get_gaussian_latent_batch(ref_batch_size, device)
    discriminator_loss_values = []
    generator_loss_values = []
    img_cnt = 0

    ts = time.time()

    utils.print_training_info_to_console(training_config)
    for epoch in range(training_config['num_epochs']):
        for batch_idx, (real_images, _) in enumerate(mnist_data_loader):
            real_images = real_images.to(device)

            # Train discriminator
            discriminator_opt.zero_grad()

            real_discriminator_loss = adversarial_loss(
                discriminator_net(real_images), real_image_gt)

            fake_images = generator_net(
                utils.get_gaussian_latent_batch(training_config['batch_size'],
                                                device))
            fake_images_predictions = discriminator_net(fake_images.detach())
            fake_discriminator_loss = adversarial_loss(fake_images_predictions,
                                                       fake_image_gt)

            discriminator_loss = real_discriminator_loss + fake_discriminator_loss
            discriminator_loss.backward()
            discriminator_opt.step()

            # Train generator
            generator_opt.zero_grad()

            generated_images_prediction = discriminator_net(
                generator_net(
                    utils.get_gaussian_latent_batch(
                        training_config['batch_size'], device)))

            generator_loss = adversarial_loss(generated_images_prediction,
                                              real_image_gt)

            generator_loss.backward()
            generator_opt.step()

            # Logging and checkpoint creation
            generator_loss_values.append(generator_loss.item())
            discriminator_loss_values.append(discriminator_loss.item())

            if training_config['enable_tensorboard']:
                writer.add_scalars(
                    'Losses/g-and-d', {
                        'g': generator_loss.item(),
                        'd': discriminator_loss.item()
                    },
                    len(mnist_data_loader) * epoch + batch_idx + 1)

                if training_config[
                        'debug_imagery_log_freq'] is not None and batch_idx % training_config[
                            'debug_imagery_log_freq'] == 0:
                    with torch.no_grad():
                        log_generated_images = generator_net(ref_noise_batch)
                        log_generated_images_resized = nn.Upsample(
                            scale_factor=2,
                            mode='nearest')(log_generated_images)
                        intermediate_imagery_grid = make_grid(
                            log_generated_images_resized,
                            nrow=int(np.sqrt(ref_batch_size)),
                            normalize=True)
                        writer.add_image(
                            'intermediate generated imagery',
                            intermediate_imagery_grid,
                            len(mnist_data_loader) * epoch + batch_idx + 1)

            if training_config[
                    'console_log_freq'] is not None and batch_idx % training_config[
                        'console_log_freq'] == 0:
                print(
                    f'GAN training: time elapsed = {(time.time() - ts):.2f} [s] | epoch={epoch + 1} | batch= [{batch_idx + 1}/{len(mnist_data_loader)}]'
                )

            # Save intermediate generator images
            if training_config[
                    'debug_imagery_log_freq'] is not None and batch_idx % training_config[
                        'debug_imagery_log_freq'] == 0:
                with torch.no_grad():
                    log_generated_images = generator_net(ref_noise_batch)
                    log_generated_images_resized = nn.Upsample(
                        scale_factor=2, mode='nearest')(log_generated_images)
                    save_image(log_generated_images_resized,
                               os.path.join(training_config['debug_path'],
                                            f'{str(img_cnt).zfill(6)}.jpg'),
                               nrow=int(np.sqrt(ref_batch_size)),
                               normalize=True)
                    img_cnt += 1

            # Save generator checkpoint
            if training_config['checkpoint_freq'] is not None and (
                    epoch + 1
            ) % training_config['checkpoint_freq'] == 0 and batch_idx == 0:
                ckpt_model_name = f"Classic_ckpt_epoch_{epoch + 1}_batch_{batch_idx + 1}.pth"
                torch.save(
                    utils.get_training_state(generator_net,
                                             GANType.CLASSIC.name),
                    os.path.join(CHECKPOINTS_PATH, ckpt_model_name))

    torch.save(utils.get_training_state(generator_net, GANType.CLASSIC.name),
               os.path.join(BINARIES_PATH, utils.get_available_binary_name()))
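Example #22 follows the standard alternating GAN scheme: during the discriminator step the generator output is passed through .detach() so no gradients flow back into the generator, and the generator step then makes a fresh forward pass and drives the discriminator's prediction toward the real label (the non-saturating loss). A toy, runnable sketch of just that update order, with stand-in linear networks:

import torch
import torch.nn as nn

G = nn.Linear(4, 2)                               # stand-in generator
D = nn.Sequential(nn.Linear(2, 1), nn.Sigmoid())  # stand-in discriminator
opt_g = torch.optim.Adam(G.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(D.parameters(), lr=1e-3)
bce = nn.BCELoss()

real = torch.randn(8, 2) + 3.0                    # toy "real" samples
ones, zeros = torch.ones(8, 1), torch.zeros(8, 1)

# Discriminator step: detach() blocks gradients from reaching G.
opt_d.zero_grad()
fake = G(torch.randn(8, 4))
d_loss = bce(D(real), ones) + bce(D(fake.detach()), zeros)
d_loss.backward()
opt_d.step()

# Generator step: fresh forward pass, push D's output toward "real".
opt_g.zero_grad()
g_loss = bce(D(G(torch.randn(8, 4))), ones)
g_loss.backward()
opt_g.step()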
Example #23
            mb_loss = loss_fn(y_hat_mb, y_mb)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_hat_mb, y_mb)

            tr_loss += mb_loss.item()
            tr_acc += mb_acc.item()

            if (epoch * len(tr_dl) + step) % model_config.summary_step == 0:
                val_loss = evaluate(model, val_dl, {'loss': loss_fn},
                                    device)['loss']
                writer.add_scalars('loss', {
                    'train': tr_loss / (step + 1),
                    'val': val_loss
                },
                                   epoch * len(tr_dl) + step)
                model.train()
        else:  # for/else: runs once the loop finishes without a break
            tr_loss /= (step + 1)
            tr_acc /= (step + 1)

            tr_summary = {'loss': tr_loss, 'acc': tr_acc}
            val_summary = evaluate(model, val_dl, {
                'loss': loss_fn,
                'acc': acc
            }, device)
            scheduler.step(val_summary['loss'])
            tqdm.write('epoch : {}, tr_loss: {:.3f}, val_loss: '
                       '{:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'.format(
Example #24
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
        tr_loss /= (step + 1)

        model.eval()
        val_loss = 0
        for step, batch in enumerate(val_dl):
            h, t, r = map(lambda elm: elm.to(device), batch)
            n_h, n_t = sampler.corrupt_batch(h, t, r)
            with torch.no_grad():
                pos, neg = model(h, t, n_h, n_t, r)
                loss = criterion(pos, neg)
                val_loss += loss.item()
        val_loss /= (step + 1)
        writer.add_scalars('loss', {'train': tr_loss, 'val': val_loss}, epoch)
        if (epoch + 1) % args.summary_step == 0:
            tqdm.write(
                'Epoch {} | train loss: {:.5f}, valid loss: {:.5f}'.format(
                    epoch + 1, tr_loss, val_loss))
        model.normalize_parameters()
        is_best = val_loss < best_val_loss
        if is_best:
            state = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            summary = {
                'training loss': round(tr_loss, 4),
                'validation loss': round(val_loss, 4)
Example #25
def main():
    env = gym.make(args.environment)
    agent_ = getattr(Agent, args.agent.replace(' ', '') + 'Agent')
    if args.test:
        args.load_models = True
        args.render = True
    print(args)
    if args.agent == 'DDPG':
        max_action = float(env.action_space.high[0])
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.shape,
                       hidden_dims=args.hidden_dims,
                       max_action=max_action,
                       gamma=args.gamma,
                       tau=args.tau,
                       critic_lr=args.critic_lr,
                       critic_wd=args.critic_wd,
                       actor_lr=args.actor_lr,
                       actor_wd=args.actor_wd,
                       batch_size=args.batch_size,
                       final_init=args.final_init,
                       maxsize=int(args.maxsize),
                       sigma=args.sigma,
                       theta=args.theta,
                       dt=args.dt,
                       checkpoint=args.checkpoint)
    elif args.agent == 'TD3':
        max_action = float(env.action_space.high[0])
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.shape,
                       hidden_dims=args.hidden_dims,
                       max_action=max_action,
                       gamma=args.gamma,
                       tau=args.tau,
                       critic_lr=args.critic_lr,
                       critic_wd=args.critic_wd,
                       actor_lr=args.actor_lr,
                       actor_wd=args.actor_wd,
                       batch_size=args.batch_size,
                       final_init=args.final_init,
                       maxsize=int(args.maxsize),
                       sigma=args.sigma,
                       theta=args.theta,
                       dt=args.dt,
                       checkpoint=args.checkpoint,
                       actor_update_iter=args.actor_update_iter,
                       action_sigma=args.action_sigma,
                       action_clip=args.action_clip)
    elif args.agent == 'SAC':
        max_action = float(env.action_space.high[0])
        agent = agent_(
            state_dim=env.observation_space.shape,
            action_dim=env.action_space.shape,
            hidden_dims=args.hidden_dims,
            max_action=max_action,
            gamma=args.gamma,
            tau=args.tau,
            alpha=args.alpha,
            lr=args.critic_lr,
            batch_size=args.batch_size,
            maxsize=int(args.maxsize),
            log_std_min=args.log_std_min,
            log_std_max=args.log_std_max,
            epsilon=args.epsilon,
            checkpoint=args.checkpoint,
        )

    else:
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.n,
                       hidden_dims=args.hidden_dims,
                       gamma=args.gamma,
                       lr=args.lr)

    Path(args.logdir).mkdir(parents=True, exist_ok=True)
    Path(args.checkpoint).mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(args.logdir)

    if args.load_models:
        agent.load_models(args.agent + '_' + args.environment)
    pbar = tqdm(range(args.n_episodes))
    score_history = deque(maxlen=args.window_legnth)
    best_score = -np.inf
    for e in pbar:
        done, score, observation = False, 0, env.reset()

        # reset DDPG OU noise and also keep track of actor/critic losses
        if args.agent in ['DDPG', 'TD3', 'SAC']:
            if args.agent == 'DDPG':
                agent.noise.reset()
            actor_losses, critic_losses = [], []
        while not done:
            if args.render:
                env.render(mode='human')

            action = agent.choose_action(observation, args.test)
            next_observation, reward, done, _ = env.step(action)
            score += reward

            # update for td methods, recording for mc methods
            if args.test:
                continue
            elif args.agent == 'Actor Critic':
                agent.update(reward, next_observation, done)
            elif args.agent in ['DDPG', 'TD3', 'SAC']:
                agent.store_transition(observation, action, reward,
                                       next_observation, done)
                # if we have memory smaller than batch size, do not update
                if agent.memory.idx < args.batch_size or (
                        args.agent == 'TD3' and agent.ctr < args.warmup_steps):
                    continue
                else:
                    actor_loss, critic_loss = agent.update()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)
                pbar.set_postfix({
                    'Reward': reward,
                    'Actor Loss': actor_loss,
                    'Critic Loss': critic_loss
                })
            else:
                agent.store_reward(reward)
            observation = next_observation

        score_history.append(score)

        if args.test:
            continue
        # update for mc methods w/ full trajectory
        elif args.agent == 'Policy Gradient':
            agent.update()

        # logging & saving
        elif args.agent in ['DDPG', 'TD3', 'SAC']:
            writer.add_scalars('Scores', {
                'Episodic': score,
                'Windowed Average': np.mean(score_history)
            },
                               global_step=e)

            if actor_losses:
                loss_dict = {
                    'Actor': np.mean(actor_losses),
                    'Critic': np.mean(critic_losses)
                }
                writer.add_scalars('Losses', loss_dict, global_step=e)
            actor_losses, critic_losses = [], []

            if np.mean(score_history) > best_score:
                best_score = np.mean(score_history)
                agent.save_models(args.agent + '_' + args.environment)

        tqdm.write(
            f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}'
        )
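The deque(maxlen=...) above keeps only the most recent episode scores, so np.mean(score_history) is a sliding-window average with no manual trimming:

import numpy as np
from collections import deque

scores = deque(maxlen=3)   # only the 3 most recent values are retained
for s in [1, 2, 3, 4, 5]:
    scores.append(s)
print(np.mean(scores))     # mean of [3, 4, 5] -> 4.0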
Example #26
def main(_A: argparse.Namespace):
    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        #sampler= Sampler(train_dataset),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler = Sampler(val_dataset),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex:
            if _C.FP16_OPT > 0:
                from apex import amp

                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

        # Create checkpoint manager and tensorboard writer (only in master process).
        if dist.is_master_process():
            checkpoint_manager = CheckpointManager(
                _A.serialization_dir,
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
            )
            tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
            tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()

        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)

        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
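Example #26 sums per-component validation losses with a collections.Counter: Counter.update called with a mapping adds values key-wise, which makes accumulating loss dictionaries across batches a one-liner. A small illustration (the component names here are made up):

from collections import Counter

val_loss_counter = Counter()
for batch_losses in [{'captioning': 2.0, 'masked_lm': 1.0},
                     {'captioning': 1.0, 'masked_lm': 3.0}]:
    val_loss_counter.update(batch_losses)  # key-wise sum

n_batches = 2
val_loss_dict = {k: v / n_batches for k, v in val_loss_counter.items()}
print(val_loss_dict)  # {'captioning': 1.5, 'masked_lm': 2.0}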
Example #27
class Trainer(object):
    def __init__(self,
                 dataset,
                 model,
                 optimizer,
                 batch_size=64,
                 annealing_strategy='logistic',
                 kl_anneal_rate=0.01,
                 kl_anneal_time=100,
                 kl_anneal_target=1.,
                 label_anneal_rate=0.01,
                 label_anneal_time=100,
                 label_anneal_target=1.,
                 add_bow_loss=False,
                 force_cpu=False,
                 run_dir=None,
                 alpha=1.):

        self.force_cpu = force_cpu
        self.dataset = dataset
        self.model = model
        self.optimizer = optimizer
        self.i2w = dataset.i2w
        self.i2int = dataset.i2int
        self.w2i = dataset.w2i
        self.int2i = dataset.int2i

        self.batch_size = batch_size

        self.annealing_strategy = annealing_strategy
        self.kl_anneal_time = kl_anneal_time
        self.kl_anneal_rate = kl_anneal_rate
        self.kl_anneal_target = kl_anneal_target
        self.label_anneal_time = label_anneal_time
        self.label_anneal_rate = label_anneal_rate
        self.label_anneal_target = label_anneal_target
        self.add_bow_loss = add_bow_loss
        self.alpha = alpha

        self.epoch = 0
        self.step = 0
        self.latent_rep = {intent: [] for intent in self.i2int}

        self.run_logs = {
            'train': {
                'recon_loss': [],
                'kl_losses': [[] for _ in range(self.model.z_size)],
                'conditioning_accuracy': [],
                'total_loss': [],
                'classifications': {
                    real_intent:
                    {pred_intent: 0
                     for pred_intent in self.i2int}
                    for real_intent in self.i2int
                },
                'transfer': {
                    real_intent: torch.zeros(model.cat_size)
                    for real_intent in self.i2int
                }
            },
            'dev': {
                'recon_loss': [],
                'kl_loss': [],
                'conditioning_accuracy': [],
                'total_loss': [],
                'classifications': {
                    real_intent:
                    {pred_intent: 0
                     for pred_intent in self.i2int}
                    for real_intent in self.i2int
                },
                'transfer': {
                    real_intent: torch.zeros(model.cat_size)
                    for real_intent in self.i2int
                }
            }
        }
        self.summary_writer = SummaryWriter(log_dir=run_dir)

    def run(self, n_epochs, dev_step_every_n_epochs=1):
        train_iter, val_iter = self.dataset.get_iterators(
            batch_size=self.batch_size)

        for idx in range(n_epochs):
            import gc
            gc.collect()
            torch.cuda.empty_cache()

            self.epoch += 1
            is_last_epoch = self.epoch == n_epochs - 1
            train_loss, train_recon_loss, train_kl_loss, train_acc = self.do_one_sweep(
                train_iter, is_last_epoch, "train")

            # logging
            LOGGER.info('Training loss after epoch %d: %f', self.epoch,
                        train_loss.item())
            LOGGER.info('Training reconstruction loss after epoch %d: %f',
                        self.epoch, train_recon_loss.item())
            LOGGER.info('Training KL loss after epoch %d: %f', self.epoch,
                        train_kl_loss.item())
            LOGGER.info('Training accuracy after epoch %d: %f', self.epoch,
                        train_acc)
            self.summary_writer.add_scalar(
                'train/total-loss',
                train_loss.cpu().detach().numpy().item(), self.epoch)
            self.run_logs['train']['total_loss'].append(
                train_loss.cpu().detach().numpy().item())

            if (idx + 1) % dev_step_every_n_epochs == 0:
                dev_loss, dev_recon_loss, dev_kl_loss, dev_acc = self.do_one_sweep(
                    val_iter, is_last_epoch, "dev")

                # logging
                LOGGER.info('Dev loss after epoch %d: %f', self.epoch,
                            dev_loss)
                LOGGER.info('Dev recon loss after epoch %d: %f', self.epoch,
                            dev_recon_loss)
                LOGGER.info('Dev KL loss after epoch %d: %f', self.epoch,
                            dev_kl_loss)
                LOGGER.info('Dev acc after epoch %d: %f', self.epoch, dev_acc)
                # summaries
                self.summary_writer.add_scalar(
                    'dev/recon-loss',
                    dev_recon_loss.detach().cpu().item(), self.epoch)
                self.run_logs['dev']['recon_loss'].append(
                    dev_recon_loss.detach().cpu().item())
                self.summary_writer.add_scalar(
                    'dev/kl-loss',
                    dev_kl_loss.detach().cpu().item(), self.epoch)
                self.run_logs['dev']['kl_loss'].append(
                    dev_kl_loss.detach().cpu().item())
                self.summary_writer.add_scalar(
                    'dev/total-loss',
                    dev_loss.detach().cpu().item(), self.epoch)
                self.run_logs['dev']['total_loss'].append(
                    dev_loss.detach().cpu().item())

    def do_one_sweep(self, data_iter, is_last_epoch, train_or_dev):
        if train_or_dev not in ['train', 'dev']:
            raise TypeError("train_or_dev should be either train or dev")

        if train_or_dev == "train":
            self.model.train()
        else:
            self.model.eval()

        sweep_loss = 0
        sweep_recon_loss = 0
        sweep_kl_loss = 0
        sweep_accuracy = 0
        n_batches = 0
        for iteration, batch in enumerate(tqdm(data_iter)):
            if train_or_dev == "train":
                self.step += 1
                self.optimizer.zero_grad()

            # forward pass
            x, lengths = getattr(batch, self.dataset.input_type)
            input = x[:, :-1]  # remove <eos>
            target = x[:, 1:]  # remove <sos>
            lengths -= 1  # account for the removal
            input, target = to_device(input, self.force_cpu), to_device(
                target, self.force_cpu)

            y = None
            if self.model.conditional is not None:
                y = batch.intent.squeeze()
                y = to_device(y, self.force_cpu)
                sorted_lengths, sorted_idx = torch.sort(lengths,
                                                        descending=True)
                y = y[sorted_idx]

            logp, mean, logv, logc, z, bow = self.model(input, lengths)

            if is_last_epoch:
                _, reversed_idx = torch.sort(sorted_idx)
                y = y[reversed_idx]
                logc = logc[reversed_idx]
                real_labels = [self.i2int[label] for label in y]
                pred_labels = [
                    self.i2int[label] if label < len(self.i2int) else 'None'
                    for label in logc.max(1)[1]
                ]
                for real_label, pred_label in zip(real_labels, pred_labels):
                    self.run_logs[train_or_dev]['classifications'][real_label][
                        pred_label] += 1
                for real_label in real_labels:
                    self.run_logs[train_or_dev]['transfer'][
                        real_label] += logc.sum(dim=0).cpu().detach()

                # save latent representation
                if train_or_dev == "train" and self.model.conditional:
                    for i, intent in enumerate(y):
                        self.latent_rep[self.i2int[intent]].append(
                            z[i].cpu().detach().numpy())

            # loss calculation
            loss, recon_loss, kl_loss, accuracy = self.compute_loss(
                logp, bow, target, lengths, mean, logv, logc, y, train_or_dev)

            sweep_loss += loss
            sweep_recon_loss += recon_loss
            sweep_kl_loss += kl_loss
            sweep_accuracy += accuracy

            n_batches += 1
            if train_or_dev == "train":
                loss.backward()
                self.optimizer.step()

        if is_last_epoch:
            for intent1 in self.i2int:
                n_sentences = sum(self.run_logs[train_or_dev]
                                  ['classifications'][intent1].values())
                self.run_logs[train_or_dev]['transfer'][intent1] /= n_sentences
                for intent2 in self.i2int:
                    self.run_logs[train_or_dev]['classifications'][intent1][
                        intent2] /= n_sentences

        return sweep_loss / n_batches, sweep_recon_loss / n_batches, \
               sweep_kl_loss / n_batches, sweep_accuracy / n_batches

    def compute_loss(self, logp, bow, target, length, mean, logv, logc, y,
                     train_or_dev):
        batch_size, seqlen, vocab_size = logp.size()
        target = target.view(batch_size, -1)

        # reconstruction loss
        recon_loss = compute_recon_loss(self.dataset.pad_idx, vocab_size,
                                        length, logp, target)

        # kl loss
        kl_weight, kl_losses = compute_kl_loss(logv, mean,
                                               self.annealing_strategy,
                                               self.step, self.kl_anneal_rate,
                                               self.kl_anneal_time,
                                               self.kl_anneal_target)
        kl_loss = torch.sum(kl_losses)

        total_loss = (recon_loss + kl_weight * kl_loss)

        # bow loss
        if self.add_bow_loss:
            total_loss += compute_bow_loss(batch_size, bow, target)

        # labels loss
        if self.model.conditional == 'supervised':
            if 'None' in self.i2int:
                none_idx = self.int2i['None']
            else:
                none_idx = -100
            label_loss, label_weight = compute_label_loss(
                logc, y, self.annealing_strategy, self.step,
                self.label_anneal_time, self.label_anneal_rate,
                self.label_anneal_target, none_idx, self.alpha)
            total_loss += label_weight * label_loss
        elif self.model.conditional == 'unsupervised':
            entropy = torch.sum(
                torch.exp(logc) *
                torch.log(self.model.n_classes * torch.exp(logc)))
            total_loss += entropy

        # summaries
        if train_or_dev == "train":
            self.summary_writer.add_scalar(
                train_or_dev + '/recon-loss',
                recon_loss.detach().cpu().item() / batch_size,
                self.step)
            self.run_logs[train_or_dev]['recon_loss'].append(
                recon_loss.detach().cpu().item() / batch_size)
            for i in range(self.model.z_size):
                self.summary_writer.add_scalars(
                    train_or_dev + '/kl-losses', {
                        str(i):
                        kl_losses[i].cpu().detach().numpy().item() / batch_size
                    }, self.step)
                self.run_logs[train_or_dev]['kl_losses'][i].append(
                    kl_losses[i].cpu().detach().numpy().item() / batch_size)
        n_correct = 0
        if self.model.conditional is not None:
            mask = y != self.int2i['None']  # ignore nones
            pred_labels = logc[mask].data.max(1)[1].long()
            true_labels = y[mask].data
            n_correct = pred_labels.eq(true_labels).cpu().sum().float().item()
        self.summary_writer.add_scalar(train_or_dev + '/conditioning-accuracy',
                                       n_correct / len(true_labels), self.step)
        self.run_logs[train_or_dev]['conditioning_accuracy'].append(
            n_correct / len(true_labels))

        return total_loss / batch_size, recon_loss / batch_size, \
               kl_loss / batch_size, n_correct / len(true_labels)
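Example #27 delegates the KL weighting to a compute_kl_loss helper driven by annealing_strategy, kl_anneal_rate, kl_anneal_time and kl_anneal_target. The usual 'logistic' schedule ramps the weight smoothly from a small value toward the target as steps accumulate; a sketch of what such a schedule typically looks like (an assumption about the helper's behaviour, not its actual code):

import math

def kl_anneal_weight(step, strategy='logistic', rate=0.01, time=100, target=1.0):
    # Ramp the KL weight toward `target` as `step` grows.
    if strategy == 'logistic':
        return target / (1 + math.exp(-rate * (step - time)))
    if strategy == 'linear':
        return min(target, target * step / time)
    raise ValueError(strategy)

print(kl_anneal_weight(0), kl_anneal_weight(1000))  # ~0.27 ... ~1.0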
Example #28
'''
After installing tensorboard, check that scalars can be logged.
'''
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment='test_tensorboard')

for x in range(100):
    writer.add_scalar('y=2x', x * 2, x)
    writer.add_scalar('y=pow(2, x)', 2**x, x)

    writer.add_scalars('data/scalar_group', {
        "xsinx": x * np.sin(x),
        "xcosx": x * np.cos(x),
        "arctanx": np.arctan(x)
    }, x)
writer.close()
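A detail worth knowing about add_scalars: under the hood it creates a separate event-file subdirectory per key ('xsinx', 'xcosx', 'arctanx' here) beneath the writer's log directory, which is what lets TensorBoard overlay the curves on a single chart. Since SummaryWriter(comment='test_tensorboard') writes under ./runs/ by default, the result can be viewed with tensorboard --logdir runs.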
Example #29
            cum_labels = torch.Tensor().to(device)
            for batch_n, batch in enumerate(masked_loader):
                batch = batch.to(device)
                out, _ = model(batch)
                labels = batch.y.to(device)
                weights = generate_weights(labels).to(device)
                te_loss = F.binary_cross_entropy(
                    out, target=labels, weight=weights)  # reuse the weights computed above
                pred = out.detach().round().to(device)
                cum_labels = torch.cat((cum_labels, labels.clone().detach()),
                                       dim=0)
                cum_pred = torch.cat((cum_pred, pred.clone().detach()), dim=0)
            roc_auc_masked = roc_auc_score(cum_labels.cpu(), cum_pred.cpu())

            writer.add_scalars('Loss', {
                'train': tr_loss,
                'test': te_loss
            }, epoch)
            writer.add_scalars('ROC AUC', {
                'train': roc_auc,
                'test': roc_auc_te,
                'masked': roc_auc_masked
            }, epoch)
            writer.add_scalar('learning rate', learn_rate, epoch)

            print("---- Round {}: tr_loss={:.4f} te_roc_auc:{:.4f} lr:{:.6f}".
                  format(epoch, loss, roc_auc_te, learn_rate))

            #   -------------- MODEL SAVING ------------------------
            if roc_auc_te > max_roc_auc:
                max_roc_auc = roc_auc_te
                path = './{}/best_{}.pt'.format(modelpath, model_n)
Example #30
class TensorBoardVisualize():

    def __init__(self, experiment_name, logdir, dic, hyperparam={"hyper":1}):
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        experiment_name = experiment_name+"_"+current_time
        self.tensorboard_writer = SummaryWriter(
            log_dir=pth.join(logdir,experiment_name),
            filename_suffix=experiment_name)

        self.comet_exp = comet_ml.Experiment(project_name="masterthesis")
        self.comet_exp.log_parameters(hyperparam)

        self.comet_exp.log_asset("train.py")
        self.comet_exp.log_asset("visualize.py")
        self.comet_exp.log_asset("dataset.py")
        self.comet_exp.log_asset("model.py")      

        self.word_dic = {v: k for k, v in dic['word_dic'].items()}
        self.answer_dic = {v: k for k, v in dic['answer_dic'].items()}

        self.word_vect = np.vectorize(lambda x: self.word_dic[x] if x > 0 else "")
        self.answer_vect = np.vectorize(lambda x: self.answer_dic[x])

        self.hooks = {}

        self.epoch = 0
        self.step = 0

    def set_epoch_step(self,epoch,step):
        self.epoch = epoch
        self.step = step


    def register_hook(self,key,hook):
        self.hooks[key] = hook

    def append_histogram(self, x, y, chart):

        self.tensorboard_writer.add_histogram(chart, y, x)
        #self.tensorboard_writer.close()

    def append_line(self, x, y_dic, chart):
        
        self.tensorboard_writer.add_scalars(chart, y_dic, x)
        #self.tensorboard_writer.close()

    def comet_line(self,y_dic,prefix):
        self.comet_exp.log_metrics(y_dic,prefix=prefix,epoch=self.epoch,step=self.step)

    def comet_image(self,images,chart):

        for i,comet_image in enumerate(images):
            self.comet_exp.log_image(
                comet_image.squeeze(0), name=f"{chart}_{i}", 
                image_format="png",
                image_channels="first", step=self.step)

    def add_images(self,x,images,chart):

        self.tensorboard_writer.add_images(
                chart, images, global_step=x, 
                walltime=None, dataformats='NCHW')

        self.comet_image(images,chart)

    # def add_conv2(self,x,module,chart,hook_name,mask,n_act,suffix=""):

    #     #weights and gradients
    #     weights = module.weight.data.cpu().numpy()
    #     gradients = module.weight.grad.cpu().numpy()
    #     self.append_histogram(x, weights.reshape(-1), f"{chart}_weights")
    #     self.append_histogram(x, gradients.reshape(-1), f"{chart}_gradients")

    #     #need hook
    #     act_hook = self.hooks[hook_name]
    #     act = act_hook.get_features()[mask][:n_act].mean(1,keepdim=True).cpu()
    #     self.add_images(
    #        x,
    #        act,
    #        f"{chart}_activations{suffix}")

    def add_conv2(self,x,module,chart,hook_name,mask,n_act,suffix=""):

        #weights and gradients
        if isinstance(module,Conv2dBatchAct):
            module = module.conv2d_batch_act[0]
        weights = module.weight.data.cpu().numpy()
        gradients = module.weight.grad.cpu().numpy()

        self.comet_exp.log_histogram_3d(weights, name=f"{chart}_weights", step=self.step)
        self.comet_exp.log_histogram_3d(gradients, name=f"{chart}_gradients", step=self.step)
        
        self.append_histogram(x, gradients.reshape(-1), f"{chart}_gradients")
        self.append_histogram(x, weights.reshape(-1), f"{chart}_weights")
        
        #need hook
        act_hook = self.hooks[hook_name]
        act = act_hook.get_features()[mask][0].unsqueeze(1).cpu()
        act = act - act.min()
        act = act / (act.max() - act.min())
        self.add_images(
           x,
           act,
           f"{chart}_act_first_image{suffix}")

    def add_figure_with_question(self,x,image,question,answer,output,index,chart,suffix=""):
        norm_img = mpl.colors.Normalize(vmin=-1,vmax=1)
        visu_question = self.word_vect(question)
        visu_answer = self.answer_vect(answer)
        visu_output = self.answer_vect(output)

        figures = []
        for idx in range(image.shape[0]):
            fig = plt.figure()
            a = fig.add_subplot(111)

            plt.imshow(
                norm_img(np.transpose(image[idx],[1,2,0])),
                vmin=0.,vmax=1.)
            a.text(0, 0, textwrap.fill(
                    f"{index[idx]}: " + " ".join(visu_question[idx]) + f"Answer/Output: {visu_answer[idx]}/{visu_output[idx]}",
                    60),wrap=True,ha='left',va='bottom')

            figures.append(fig)
            self.comet_exp.log_figure(figure_name=f"{chart}/sample{suffix}_{idx}", figure=fig, overwrite=False, step=self.step)
        
        self.tensorboard_writer.add_figure(
            f"{chart}/sample{suffix}",
            figures,
            x)

    def close(self):
        self.tensorboard_writer.close()
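Example #30 routes every metric through one wrapper that logs to TensorBoard and mirrors the call to a comet_ml Experiment. Keeping both backends behind a single interface (append_line, append_histogram, add_images) means the training loop never needs to know which logger is enabled, and either backend can be dropped or swapped without touching the loop.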