Example #1
class ClusteringTraining(Training):
    def __init__(self,
                 net,
                 data,
                 clustering=KMeans(3),
                 order_less=True,
                 loss=nn.CrossEntropyLoss(),
                 optimizer=torch.optim.Adam,
                 max_epochs=50,
                 batch_size=128,
                 device="cpu",
                 report_interval=10,
                 checkpoint_interval=1000,
                 path_prefix=".",
                 network_name="network"):
        super(ClusteringTraining, self).__init__()
        self.net = net.to(device)
        self.clustering = clustering
        self.data = data
        self.train_data = None
        self.loss = loss
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.device = device

        self.report_interval = report_interval
        self.checkpoint_interval = checkpoint_interval

        self.checkpoint_path = f"{path_prefix}/{network_name}"
        self.order_less = order_less

        if not order_less:
            self.classifier = nn.Linear(256, 50)
            self.classifier = self.classifier.to(self.device)
        else:
            self.embedding = nn.Linear(256, 256)
            self.embedding = self.embedding.to(self.device)

        # Include the head's parameters too; otherwise the classifier/embedding
        # layer would never be updated by the optimizer.
        head = self.embedding if order_less else self.classifier
        self.optimizer = optimizer(
            list(self.net.parameters()) + list(head.parameters()))

        self.network_name = network_name
        self.writer = SummaryWriter(network_name)

        self.epoch_id = 0
        self.step_id = 0

    def save_path(self):
        return f"{self.checkpoint_path}-save.torch"

    def checkpoint(self):
        the_net = self.net
        if isinstance(the_net, torch.nn.DataParallel):
            the_net = the_net.module
        netwrite(
            the_net,
            f"{self.checkpoint_path}-encoder-epoch-{self.epoch_id}-step-{self.step_id}.torch"
        )
        self.each_checkpoint()

    def step(self, data, label, centers):
        self.optimizer.zero_grad()
        attention = self.net(data.to(self.device)).squeeze()
        centers = centers.to(self.device).unsqueeze(0)
        if self.order_less:
            center_embedding = self.embedding(centers.squeeze())
            logits = center_embedding.matmul(attention.unsqueeze(2)).squeeze()
        else:
            logits = self.classifier(attention.reshape(attention.size(0), -1))
        label = label.long().to(self.device)
        loss_val = self.loss(logits, label)
        loss_val.backward()
        self.writer.add_scalar("cluster assignment loss", float(loss_val),
                               self.step_id)
        self.optimizer.step()
        self.each_step()

    def embed_all(self):
        self.net.eval()
        with torch.no_grad():
            embedding = []
            batch_loader = DataLoader(self.data,
                                      batch_size=self.batch_size,
                                      shuffle=False)
            # Embed at most ~5000 samples to keep the clustering step cheap.
            for point, *_ in islice(batch_loader, 5000 // self.batch_size):
                latent_point = self.net(point.to(self.device))
                latent_point = latent_point.to("cpu")
                latent_point = latent_point.reshape(latent_point.size(0), -1)
                embedding.append(latent_point)
            embedding = torch.cat(embedding, dim=0)
        self.net.train()
        return embedding

    def cluster(self, embedding):
        fit = self.clustering.fit(embedding.squeeze())
        labels = list(fit.labels_)
        try:
            cluster_centers = fit.cluster_centers_
        except AttributeError:
            # Some clustering algorithms expose no centers; use each
            # cluster's mean embedding instead.
            emb = embedding.squeeze().numpy()
            label_array = np.array(labels)
            cluster_centers = np.stack([
                emb[label_array == label].mean(axis=0)
                for label in set(labels)
            ], axis=0)
        if len(set(labels)) == 1:
            # Degenerate clustering (a single cluster): assign random labels
            # and jitter the center into N synthetic centers below.
            N = random.randint(2, 10)
            labels = [random.choice(range(N)) for _ in labels]
            offsets = [
                np.random.randn(*cluster_centers.shape) * 2.0 for _ in range(N)
            ]
            cluster_centers = np.concatenate(
                [cluster_centers + offsets[idx] for idx in range(N)],
                axis=0).squeeze()
        counts = [labels.count(label) for label in range(len(set(labels)))]
        weights = [1 / counts[label] for label in labels]
        centers = torch.Tensor(cluster_centers)
        return weights, labels, centers

    def _cluster_image(self, labels):
        count = 10
        n_clusters = 50  # fixed upper bound; could instead use max(labels) + 1
        indices = list(range(len(labels)))
        random.shuffle(indices)
        cluster_done = [False for _ in range(n_clusters)]
        cluster_images = [[] for _ in range(n_clusters)]
        for index in indices:
            label = labels[index]
            if all(cluster_done):
                break
            if len(cluster_images[label]) < count:
                img, *_ = self.data[index]
                img = img - img.min()
                img = img / img.max()
                cluster_images[label].append(img)
            else:
                cluster_done[label] = True
        rows = [
            torch.cat(image_list, dim=2) for image_list in cluster_images
            if image_list
        ]
        for idx, row in enumerate(rows):
            self.writer.add_image(f"cluster samples {idx}", row, self.step_id)

    def _cluster_plot(self, embedding, labels):
        silhouette = silhouette_score(embedding.squeeze(), labels)
        chs = calinski_harabasz_score(embedding.squeeze(), labels)  # named calinski_harabaz_score before scikit-learn 0.20
        dbs = davies_bouldin_score(embedding.squeeze(), labels)

        n_labels = len(set(labels))

        self.writer.add_scalar(f"silhouette {n_labels}", silhouette,
                               self.step_id)
        self.writer.add_scalar(f"chs {n_labels}", chs, self.step_id)
        self.writer.add_scalar(f"dbs {n_labels}", dbs, self.step_id)

        indices = list(range(len(labels)))
        random.shuffle(indices)
        samples_to_plot = indices[:1000]
        sample_labels = [labels[idx] for idx in samples_to_plot]
        sample_embedding = embedding[samples_to_plot]
        pca = PCA(2).fit_transform(sample_embedding.squeeze())
        fig, ax = plt.subplots()
        ax.scatter(pca[:, 0], pca[:, 1], c=sample_labels, cmap="tab20")
        self.writer.add_figure(f"clustering {n_labels}", fig, self.step_id)

    def each_cluster(self, embedding, labels):
        self._cluster_image(labels)
        self._cluster_plot(embedding, labels)

    def train(self):
        for epoch_id in range(self.max_epochs):
            self.epoch_id = epoch_id
            embedding = self.embed_all()
            weights, labels, centers = self.cluster(embedding)

            self.each_cluster(embedding, labels)

            self.data.labels = labels
            self.train_data = None
            self.train_data = DataLoader(self.data,
                                         batch_size=self.batch_size,
                                         num_workers=8,
                                         sampler=WeightedRandomSampler(
                                             weights,
                                             len(self.data) * 4,
                                             replacement=True))
            for data, label in self.train_data:
                self.step(data, label, centers)
                if self.step_id % self.checkpoint_interval == 0:
                    self.checkpoint()
                self.step_id += 1

        return self.net
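A minimal usage sketch (not from the original source): it assumes the Training base class, netwrite and the each_* hooks used above are importable from the surrounding project, and that the encoder emits a 256-dimensional code, since both heads are nn.Linear(256, ...).

import torch
import torch.nn as nn
from sklearn.cluster import KMeans

# Hypothetical dataset: exposes a writable .labels attribute, which
# ClusteringTraining.train() reassigns every epoch.
class ToyDataset(torch.utils.data.Dataset):
    def __init__(self, points):
        self.points = points
        self.labels = [0] * len(points)

    def __len__(self):
        return len(self.points)

    def __getitem__(self, idx):
        return self.points[idx], self.labels[idx]

encoder = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 256))  # 256-dim code
training = ClusteringTraining(encoder,
                              ToyDataset(torch.randn(1024, 1, 28, 28)),
                              clustering=KMeans(n_clusters=10),
                              order_less=True,
                              max_epochs=5)
trained_encoder = training.train()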
Example #2
def main():
    # Create the log and model directories if they're not present.
    model_dir = os.path.join(
        args.log_dir,
        'models_' + time.strftime('%d_%b_%Y_%H_%M_%S', time.localtime()))
    pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)

    log_writer = SummaryWriter(log_dir=model_dir)

    train_data_path = os.path.join(args.data_dir, args.train_data_dict)
    with open(train_data_path, 'rb') as f:
        train_data_dict = pickle.load(f, encoding='latin1')
    train_dt = train_data_dict['dt']
    print('Loaded training data from %s, train_dt = %.2f' %
          (train_data_path, train_dt))

    if args.eval_every is not None:
        eval_data_path = os.path.join(args.data_dir, args.eval_data_dict)
        with open(eval_data_path, 'rb') as f:
            eval_data_dict = pickle.load(f, encoding='latin1')
        eval_dt = eval_data_dict['dt']
        print('Loaded evaluation data from %s, eval_dt = %.2f' %
              (eval_data_path, eval_dt))

    if args.incl_robot_node:
        robot_node = stg_node.STGNode('0', 'Pedestrian')
    else:
        robot_node = None

    for key in train_data_dict['input_dict'].keys():
        if isinstance(key, stg_node.STGNode):
            random_node = key
            break

    model_registrar = ModelRegistrar(model_dir, args.device)
    hyperparams['state_dim'] = train_data_dict['input_dict'][
        random_node].shape[2]
    hyperparams['pred_dim'] = len(train_data_dict['pred_indices'])
    hyperparams['pred_indices'] = train_data_dict['pred_indices']
    hyperparams['dynamic_edges'] = args.dynamic_edges
    hyperparams['edge_state_combine_method'] = args.edge_state_combine_method
    hyperparams[
        'edge_influence_combine_method'] = args.edge_influence_combine_method
    hyperparams['nodes_standardization'] = train_data_dict[
        'nodes_standardization']
    hyperparams['labels_standardization'] = train_data_dict[
        'labels_standardization']
    hyperparams['edge_radius'] = args.edge_radius

    if args.eval_every is not None:
        eval_hyperparams = copy.deepcopy(hyperparams)
        eval_hyperparams['nodes_standardization'] = eval_data_dict[
            "nodes_standardization"]
        eval_hyperparams['labels_standardization'] = eval_data_dict[
            "labels_standardization"]

    kwargs_dict = {
        'dynamic_edges':
        hyperparams['dynamic_edges'],
        'edge_state_combine_method':
        hyperparams['edge_state_combine_method'],
        'edge_influence_combine_method':
        hyperparams['edge_influence_combine_method']
    }

    stg = SpatioTemporalGraphCVAEModel(robot_node, model_registrar,
                                       hyperparams, kwargs_dict, None,
                                       args.device)
    print('Created training STG model.')

    if args.eval_every is not None:
        # It is important that eval_stg uses the same model_registrar as
        # the stg being trained, otherwise you're just repeatedly evaluating
        # randomly-initialized weights!
        eval_stg = SpatioTemporalGraphCVAEModel(robot_node, model_registrar,
                                                eval_hyperparams, kwargs_dict,
                                                None, args.eval_device)
        print('Created evaluation STG model.')

    # Create the aggregate scene_graph for all the data, allowing
    # for batching, just like the old one. Then, for speed tests
    # we'll show how much faster this method is than keeping the
    # full version. Can show graphs of forward inference time vs problem size
    # with two lines (using aggregate graph, using online-computed graph).
    agg_scene_graph = create_batch_scene_graph(
        train_data_dict['input_dict'],
        float(hyperparams['edge_radius']),
        use_old_method=(args.dynamic_edges == 'no'))
    print('Created aggregate training scene graph.')

    if args.dynamic_edges == 'yes':
        agg_scene_graph.compute_edge_scaling(args.edge_addition_filter,
                                             args.edge_removal_filter)
        train_data_dict['input_dict'][
            'edge_scaling_mask'] = agg_scene_graph.edge_scaling_mask
        print('Computed edge scaling for the training scene graph.')

    stg.set_scene_graph(agg_scene_graph)
    stg.set_annealing_params()

    if args.eval_every is not None:
        eval_agg_scene_graph = create_batch_scene_graph(
            eval_data_dict['input_dict'],
            float(hyperparams['edge_radius']),
            use_old_method=(args.dynamic_edges == 'no'))
        print('Created aggregate evaluation scene graph.')

        if args.dynamic_edges == 'yes':
            eval_agg_scene_graph.compute_edge_scaling(
                args.edge_addition_filter, args.edge_removal_filter)
            eval_data_dict['input_dict'][
                'edge_scaling_mask'] = eval_agg_scene_graph.edge_scaling_mask
            print('Computed edge scaling for the evaluation scene graph.')

        eval_stg.set_scene_graph(eval_agg_scene_graph)
        eval_stg.set_annealing_params()

    # model_registrar.print_model_names()
    optimizer = optim.Adam(model_registrar.parameters(),
                           lr=hyperparams['learning_rate'])
    lr_scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer, gamma=hyperparams['learning_decay_rate'])

    # Keeping colors consistent throughout training.
    color_dict = defaultdict(dict)

    print_training_header(newline_start=True)
    for curr_iter in range(args.num_iters):
        # Necessary because we sometimes move the weights between GPU and CPU.
        model_registrar.to(args.device)

        # Setting the current iterator value for internal logging.
        stg.set_curr_iter(curr_iter)

        # Stepping forward the learning rate scheduler and annealers.
        # NOTE: since PyTorch 1.1.0, scheduler.step() is meant to follow
        # optimizer.step(); the original ordering is kept here.
        lr_scheduler.step()
        log_writer.add_scalar('dynstg/learning_rate',
                              lr_scheduler.get_lr()[0], curr_iter)
        stg.step_annealers()

        # Zeroing gradients for the upcoming iteration.
        optimizer.zero_grad()

        train_losses = list()
        for mb_num in range(args.batch_multiplier):
            # Obtaining the batch's training loss.
            train_inputs, train_labels = sample_inputs_and_labels(
                train_data_dict, batch_size=hyperparams['batch_size'])

            # Compute the training loss.
            train_loss = stg.train_loss(
                train_inputs, train_labels,
                hyperparams['prediction_horizon']) / args.batch_multiplier
            train_losses.append(train_loss.item())

            # Calculating gradients.
            train_loss.backward()

        # Print training information (no newline here; it is added further below).
        iter_train_loss = sum(train_losses)
        print('{:9} | {:10} | '.format(curr_iter, '%.2f' % iter_train_loss),
              end='',
              flush=True)

        log_writer.add_histogram('dynstg/train_minibatch_losses',
                                 np.asarray(train_losses), curr_iter)
        log_writer.add_scalar('dynstg/train_loss', iter_train_loss, curr_iter)

        # Clipping gradients.
        if hyperparams['grad_clip'] is not None:
            nn.utils.clip_grad_value_(model_registrar.parameters(),
                                      hyperparams['grad_clip'])

        # # Logging gradient norms.
        # len_prefix = len('model_dict.')
        # for name, param in model_registrar.named_parameters():
        #     if param.grad is None:
        #         # print(name, 'grad is None')
        #         continue

        #     log_writer.add_scalar('gradient_norms/' + name[len_prefix:],
        #                           param.grad.norm(),
        #                           curr_iter)

        # Performing a gradient step.
        optimizer.step()

        # Freeing up memory.
        del train_loss

        if args.eval_every is not None and (curr_iter +
                                            1) % args.eval_every == 0:
            with torch.no_grad():
                # First plotting training predictions.
                pred_fig = plot_utils.plot_predictions_during_training(
                    stg,
                    train_inputs,
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    dt=train_dt,
                    max_speed=max_speed,
                    color_dict=color_dict,
                    most_likely=True)
                log_writer.add_figure('dynstg/train_prediction', pred_fig,
                                      curr_iter)

                train_mse_batch_errors, train_fse_batch_errors = eval_utils.compute_batch_statistics(
                    stg,
                    train_data_dict,
                    hyperparams['minimum_history_length'],
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    num_runs=100,
                    dt=train_dt,
                    max_speed=max_speed,
                    robot_node=robot_node)
                log_writer.add_histogram('dynstg/train_mse',
                                         train_mse_batch_errors, curr_iter)
                log_writer.add_histogram('dynstg/train_fse',
                                         train_fse_batch_errors, curr_iter)

                mse_boxplot_fig, fse_boxplot_fig = plot_utils.plot_boxplots_during_training(
                    train_mse_batch_errors, train_fse_batch_errors)
                log_writer.add_figure('dynstg/train_mse_boxplot',
                                      mse_boxplot_fig, curr_iter)
                log_writer.add_figure('dynstg/train_fse_boxplot',
                                      fse_boxplot_fig, curr_iter)

                log_writer.add_scalars(
                    'dynstg/train_sq_error', {
                        'mean_mse': torch.mean(train_mse_batch_errors),
                        'mean_fse': torch.mean(train_fse_batch_errors),
                        'median_mse': torch.median(train_mse_batch_errors),
                        'median_fse': torch.median(train_fse_batch_errors)
                    }, curr_iter)

                # Then computing evaluation values and predictions.
                model_registrar.to(args.eval_device)
                eval_stg.set_curr_iter(curr_iter)
                eval_inputs, eval_labels = sample_inputs_and_labels(
                    eval_data_dict,
                    device=args.eval_device,
                    batch_size=args.eval_batch_size)

                (eval_loss_q_is, eval_loss_p,
                 eval_loss_exact) = eval_stg.eval_loss(
                     eval_inputs, eval_labels,
                     hyperparams['prediction_horizon'])
                log_writer.add_scalars(
                    'dynstg/eval', {
                        'nll_q_is': eval_loss_q_is,
                        'nll_p': eval_loss_p,
                        'nll_exact': eval_loss_exact
                    }, curr_iter)

                pred_fig = plot_utils.plot_predictions_during_training(
                    eval_stg,
                    eval_inputs,
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    dt=eval_dt,
                    max_speed=max_speed,
                    color_dict=color_dict,
                    most_likely=True)
                log_writer.add_figure('dynstg/eval_prediction', pred_fig,
                                      curr_iter)

                eval_mse_batch_errors, eval_fse_batch_errors = eval_utils.compute_batch_statistics(
                    eval_stg,
                    eval_data_dict,
                    hyperparams['minimum_history_length'],
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    num_runs=100,
                    dt=eval_dt,
                    max_speed=max_speed,
                    robot_node=robot_node)
                log_writer.add_histogram('dynstg/eval_mse',
                                         eval_mse_batch_errors, curr_iter)
                log_writer.add_histogram('dynstg/eval_fse',
                                         eval_fse_batch_errors, curr_iter)

                mse_boxplot_fig, fse_boxplot_fig = plot_utils.plot_boxplots_during_training(
                    eval_mse_batch_errors, eval_fse_batch_errors)
                log_writer.add_figure('dynstg/eval_mse_boxplot',
                                      mse_boxplot_fig, curr_iter)
                log_writer.add_figure('dynstg/eval_fse_boxplot',
                                      fse_boxplot_fig, curr_iter)

                log_writer.add_scalars(
                    'dynstg/eval_sq_error', {
                        'mean_mse': torch.mean(eval_mse_batch_errors),
                        'mean_fse': torch.mean(eval_fse_batch_errors),
                        'median_mse': torch.median(eval_mse_batch_errors),
                        'median_fse': torch.median(eval_fse_batch_errors)
                    }, curr_iter)

                print('{:15} | {:10} | {:14}'.format(
                    '%.2f' % eval_loss_q_is.item(),
                    '%.2f' % eval_loss_p.item(),
                    '%.2f' % eval_loss_exact.item()),
                      end='',
                      flush=True)

                # Freeing up memory.
                del eval_loss_q_is
                del eval_loss_p
                del eval_loss_exact

        else:
            print('{:15} | {:10} | {:14}'.format('', '', ''),
                  end='',
                  flush=True)

        # Here's the newline that ends the current training information printing.
        print('')

        if args.save_every is not None and (curr_iter +
                                            1) % args.save_every == 0:
            model_registrar.save_models(curr_iter)
            print_training_header()
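The inner batch_multiplier loop above implements gradient accumulation: each minibatch loss is divided by batch_multiplier before backward(), so the summed gradients match those of one large batch. A self-contained sketch of the same pattern (model, data and sizes are stand-ins):

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch_multiplier = 4  # effective batch size = 4 * 32

optimizer.zero_grad()
for _ in range(batch_multiplier):
    x = torch.randn(32, 10)  # stand-in minibatch
    y = torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y) / batch_multiplier
    loss.backward()  # gradients accumulate across minibatches
optimizer.step()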
Example #3
def main():
    global args
    args = parser.parse_args()
    out_dir = os.path.join(args.out_root, args.name)

    writer = SummaryWriter(out_dir)

    model = define_model(is_resnet=False,
                         is_densenet=False,
                         is_senet=True,
                         model=args.model,
                         parallel=args.parallel,
                         semff=args.semff,
                         pcamff=args.pcamff)

    gpu_num = torch.cuda.device_count()
    batch_size_per_gpu = 4

    device_ids = []
    for i in range(gpu_num):
        device_ids.append(i)

    model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    batch_size = gpu_num * batch_size_per_gpu

    cudnn.benchmark = True
    """ Set Different Learning Rate """
    """
    params_with_lr = []
    for name, param in model.named_parameters():
        if "SEMFF.se" in name:
            params_with_lr.append({"params": param, "lr": args.lr/10})
        else:
            params_with_lr.append({"params": param})

    optimizer = torch.optim.Adam(params_with_lr, args.lr, weight_decay=args.weight_decay)
    """

    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_size,
                                                gamma=0.1)
    print("step_size is set to %d" % scheduler.step_size)

    if args.dataset == 'nyud':
        train_loader = loaddata.getTrainingData(nyud_root_path, batch_size)
        test_loader = loaddata.getTestingData(nyud_root_path, batch_size)
    elif args.dataset == 'sun':
        train_loader = loaddata_sun.getTrainingData(sun_root_path, batch_size)
        test_loader = loaddata_sun.getTestingData(sun_root_path, batch_size)
    else:
        raise NotImplementedError('Specify dataset in [\'nyud\', \'sun\']')

    vis_out_dir = osp.join(out_dir, "outputs")

    for epoch in range(args.epochs):
        # NOTE: since PyTorch 1.1.0, scheduler.step() belongs after the
        # training pass; it is kept at the top here as in the original.
        scheduler.step()
        e = epoch + args.start_epoch
        # Mirror StepLR's decay schedule so the current lr can be logged.
        lr = args.lr * (0.1**(epoch // args.step_size))
        train(train_loader, model, optimizer, e, writer)

        results = None
        images = None
        pred_depths = None
        if isinstance(model.module, net.TBDPNet):
            mff_results, semff_results, images, mff_depths, semff_depths =\
                test.test_tbdp(test_loader, model, dataset=args.dataset, returnValue=True, returnSamples=True,
                                     sample_idx=[0, 1, 2, 3, 4, 5])
            results = [mff_results, semff_results]
            pred_depths = [mff_depths, semff_depths]

        elif isinstance(model.module, net.Hu):
            results, images, pred_depths = test.test_hu(
                test_loader,
                model,
                dataset=args.dataset,
                returnValue=True,
                returnSamples=True,
                sample_idx=[0, 1, 2, 3, 4, 5])

        for i in range(len(images)):
            image = np.clip(
                denormalize_image(images[i].data.cpu().numpy(),
                                  mode='nyud').astype(np.uint8), 0, 254)
            if epoch == 0:
                # Save the input images to disk once, on the first epoch.
                os.makedirs(osp.join(vis_out_dir, "images"), exist_ok=True)
                image = visualize_array(image,
                                        f_name=osp.join(
                                            vis_out_dir, "images",
                                            str(i + 1) + ".png"))
            else:
                image = visualize_array(image)

            writer.add_figure("image/%d" % (i + 1), image, e)

            if isinstance(model.module, net.TBDPNet):
                mff_depths, semff_depths = pred_depths
                os.makedirs(osp.join(vis_out_dir, "mff_depths", str(i + 1)),
                            exist_ok=True)
                os.makedirs(osp.join(vis_out_dir, "semff_depths", str(i + 1)),
                            exist_ok=True)

                mff_depth = visualize_array(mff_depths[i].data.cpu().numpy(),
                                            f_name=osp.join(
                                                vis_out_dir, "mff_depths",
                                                str(i + 1),
                                                "%d_%d.png" % (i + 1, e)))
                semff_depth = visualize_array(
                    semff_depths[i].data.cpu().numpy(),
                    f_name=osp.join(vis_out_dir, "semff_depths", str(i + 1),
                                    "%d_%d.png" % (i + 1, e)))

                writer.add_figure("mff_prediction/%d" % (i + 1), mff_depth, e)
                writer.add_figure("semff_prediction/%d" % (i + 1), semff_depth,
                                  e)
            elif isinstance(model.module, net.Hu):
                os.makedirs(osp.join(vis_out_dir, "pred", str(i + 1)),
                            exist_ok=True)

                depth = visualize_array(pred_depths[i].data.cpu().numpy(),
                                        f_name=osp.join(
                                            vis_out_dir, "pred", str(i + 1),
                                            "%d_%d.png" % (i + 1, e)))
                writer.add_figure("prediction/%d" % (i + 1), depth, e)

        if isinstance(model.module, net.TBDPNet):
            mff_results, semff_results = results

            writer.add_scalar("mff/RMSE", mff_results["RMSE"], e)
            writer.add_scalar("mff/ABS_REL", mff_results["ABS_REL"], e)
            writer.add_scalar("mff/LG10", mff_results["LG10"], e)
            writer.add_scalar("mff/DELTA1", mff_results["DELTA1"], e)
            writer.add_scalar("mff/DELTA2", mff_results["DELTA2"], e)
            writer.add_scalar("mff/DELTA3", mff_results["DELTA3"], e)
            writer.add_scalar("mff/lr", lr, e)
            writer.add_scalar("semff/RMSE", semff_results["RMSE"], e)
            writer.add_scalar("semff/ABS_REL", semff_results["ABS_REL"], e)
            writer.add_scalar("semff/LG10", semff_results["LG10"], e)
            writer.add_scalar("semff/DELTA1", semff_results["DELTA1"], e)
            writer.add_scalar("semff/DELTA2", semff_results["DELTA2"], e)
            writer.add_scalar("semff/DELTA3", semff_results["DELTA3"], e)
            writer.add_scalar("semff/lr", lr, e)
        elif isinstance(model.module, net.Hu):
            writer.add_scalar("data/RMSE", results["RMSE"], e)
            writer.add_scalar("data/ABS_REL", results["ABS_REL"], e)
            writer.add_scalar("data/LG10", results["LG10"], e)
            writer.add_scalar("data/DELTA1", results["DELTA1"], e)
            writer.add_scalar("data/DELTA2", results["DELTA2"], e)
            writer.add_scalar("data/DELTA3", results["DELTA3"], e)
            writer.add_scalar("data/lr", lr, e)

        save_checkpoint(model.state_dict(), e, out_dir)
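One note on the epoch loop above: the learning rate is re-derived by hand as args.lr * (0.1 ** (epoch // args.step_size)), mirroring the StepLR schedule. It can instead be read directly off the optimizer, which stays correct even if the schedule changes; a small sketch:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

for epoch in range(12):
    # ... training pass would go here ...
    optimizer.step()  # keep the optimizer/scheduler step order PyTorch expects
    scheduler.step()
    lr = optimizer.param_groups[0]['lr']  # the value StepLR actually set
    print(epoch, lr)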
Example #4
def main(dataset_train,
         dataset_validation,
         mmscaler,
         modelo='B',
         batch_size=100,
         num_epoch=50,
         p_dropout=0.5):
    writer = SummaryWriter('selected/' + modelo + '_' + str(batch_size) + '_' +
                           str(num_epoch) + '_' + str(p_dropout))
    red = Red_ArquitecturaB(input_size=168, output_size=24, p_drop=p_dropout)

    funcion_perdida = nn.MSELoss()
    optimizer = torch.optim.Adam(params=red.parameters(), lr=0.001)

    train = np.array(
        pd.read_csv(dataset_train, decimal=".", sep=",", header=None).values)
    train_ds = TemperaturaDataSet(train)
    train_dataloader = DataLoader(dataset=train_ds,
                                  shuffle=True,
                                  batch_size=batch_size)

    # train the network
    red.train()
    for epoch in range(num_epoch):
        batch = 0
        for x_train, y_train in train_dataloader:
            optimizer.zero_grad()
            x_train = x_train.type(torch.float)
            y_train = y_train.type(torch.float)

            y_pred_train = red(x_train)  # equivalent to red.forward(x_train)

            loss = funcion_perdida(y_pred_train, y_train)
            loss.backward()
            optimizer.step()
            print("Epoch: %2d Batch: %6d Loss: %2.8f ErrorMean: %2.8f" %
                  (epoch, batch, loss.item(), (y_pred_train - y_train).mean()))
            batch = batch + 1

        writer.add_scalar('data/train/loss', loss.item(), epoch)
        writer.add_scalar('data/train/ErrorMean',
                          (y_pred_train - y_train).mean(), epoch)
        for name, param in red.named_parameters():
            writer.add_histogram(name, param.clone().data.numpy(), epoch)

    # evaluate on the validation set
    test = np.array(
        pd.read_csv(dataset_validation, decimal=".", sep=",",
                    header=None).values)
    test_ds = TemperaturaDataSet(test)
    test_dataloader = DataLoader(dataset=test_ds, shuffle=False, batch_size=1)

    temp_scaler = pickle.load(open(mmscaler, 'rb'))
    print(temp_scaler)

    red.eval()  # evaluation mode; ideally the loop below would also run under torch.no_grad()

    batch = 0
    for x_test, y_test in test_dataloader:
        x_test = x_test.type(torch.float)
        y_test = y_test.type(torch.float)

        y_pred_test = red(x_test)
        loss = funcion_perdida(y_pred_test, y_test)
        print("Batch: %6d Loss: %2.8f ErrorMean: %2.8f" %
              (batch, loss.item(), (y_pred_test - y_test).mean()))
        writer.add_scalar('data/test/loss', loss.item(), batch)
        writer.add_scalar('data/test/ErrorMean', (y_pred_test - y_test).mean(),
                          batch)
        if batch % 100 == 0:
            fig = plt.figure(figsize=(13, 6))
            plt.plot(
                temp_scaler.inverse_transform(y_test.data.numpy().reshape(
                    -1, 1)), 'b',
                temp_scaler.inverse_transform(y_pred_test.data.numpy().reshape(
                    -1, 1)), 'r')
            writer.add_figure('data/test/resultados', fig, batch)

        batch = batch + 1

    writer.close()
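TemperaturaDataSet is defined elsewhere in the project. Given input_size=168 and output_size=24 above, a plausible minimal implementation (an assumption about the CSV layout, not the original class) splits each row into 168 inputs and 24 targets:

import numpy as np
import torch
from torch.utils.data import Dataset

class TemperaturaDataSet(Dataset):
    """Hypothetical sketch: each row = 168 input values + 24 target values."""
    def __init__(self, data: np.ndarray, input_size: int = 168):
        self.x = torch.from_numpy(data[:, :input_size]).float()
        self.y = torch.from_numpy(data[:, input_size:]).float()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]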
Example #5
class TBLogger(object):
    """
    xyz_dummies: stretch the screen with empty plots so the legend would
                 always fit for other plots
    """
    def __init__(self, local_rank, log_dir, name, interval=1, dummies=False):
        self.enabled = (local_rank == 0)
        self.interval = interval
        self.cache = {}
        if local_rank == 0:
            self.summary_writer = SummaryWriter(log_dir=os.path.join(
                log_dir, name),
                                                flush_secs=120,
                                                max_queue=200)
            atexit.register(self.summary_writer.close)
            if dummies:
                for key in ('aaa', 'zzz'):
                    self.summary_writer.add_scalar(key, 0.0, 1)

    def log_value(self, step, key, val, stat='mean'):
        if self.enabled:
            if key not in self.cache:
                self.cache[key] = []
            self.cache[key].append(val)
            if len(self.cache[key]) == self.interval:
                agg_val = getattr(np, stat)(self.cache[key])
                self.summary_writer.add_scalar(key, agg_val, step)
                del self.cache[key]

    def log_meta(self, step, meta):
        for k, v in meta.items():
            self.log_value(step, k, v.item())

    def log_grads(self, step: int, model):
        """Log max, min, mean gradients of the `model` at `step`.

        Args:
            step (int): Iteration number.
            model (nn.Model): 
        """
        if self.enabled:
            norms = [
                p.grad.norm().item() for p in model.parameters()
                if p.grad is not None
            ]
            for stat in ('max', 'min', 'mean'):
                self.log_value(step,
                               f'grad_{stat}',
                               getattr(np, stat)(norms),
                               stat=stat)

    def log_image(self,
                  tag: str,
                  img_tensor,
                  global_step=None,
                  walltime: float = None,
                  dataformats: str = 'CHW'):
        """Add image to log via tensorboardX.

        Args:
            tag (str): Data identifier
            img_tensor (Union[torch.Tensor, numpy.ndarray]): A uint8 or float Tensor
                of shape [channel, height, width] where channel is 1, 3, or 4.
                The elements in img_tensor can either have values in [0, 1] (float32)
                or [0, 255] (uint8). Users are responsible for scaling the data to
                the correct range/type.
            walltime (float, optional): Optional override default walltime (time.time()) of event.
                Defaults to None.
            dataformats (str, optional): Specifies the meaning of each dimension of
                the input tensor. Supported: CHW, HWC, HW. Defaults to 'CHW'.
        """
        if self.enabled:
            self.summary_writer.add_image(tag,
                                          img_tensor,
                                          global_step=global_step,
                                          walltime=walltime,
                                          dataformats=dataformats)

    def log_figure(self,
                   tag: str,
                   figure,
                   global_step=None,
                   walltime: float = None):
        """Add a matplotlib figure to log via tensorboardX.

        Args:
            tag (str): Data identifier
            figure (matplotlib.pyplot.figure): Figure, or a list of figures, to log.
            global_step (int, optional): Global step value to record. Defaults to None.
            walltime (float, optional): Optional override default walltime (time.time()) of event.
                Defaults to None.
        """
        if self.enabled:
            self.summary_writer.add_figure(tag,
                                           figure,
                                           global_step=global_step,
                                           walltime=walltime)
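A usage sketch (directory and tag names made up): with interval=10, log_value buffers ten values and flushes their aggregate as a single scalar point.

import numpy as np

logger = TBLogger(local_rank=0, log_dir='runs', name='demo', interval=10)
for step in range(100):
    fake_loss = float(np.exp(-step / 50)) + float(np.random.rand()) * 0.01
    # every tenth call writes mean(cache) to TensorBoard at this step
    logger.log_value(step, 'train/loss', fake_loss, stat='mean')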
Example #6
class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        self._n_logged_samples = n_logged_samples
        if summary_writer is not None:
            self._summ_writer = summary_writer
        else:
            self._summ_writer = SummaryWriter(log_dir)

    def _loop_batch(self, fn, name, val, *argv, **kwargs):
        """Loops the logging function n times."""
        for log_idx in range(min(self._n_logged_samples, len(val))):
            name_i = os.path.join(name, "_%d" % log_idx)
            fn(name_i, val[log_idx], *argv, **kwargs)

    @staticmethod
    def _check_size(val, size):
        if isinstance(val, torch.Tensor) or isinstance(val, np.ndarray):
            assert len(
                val.shape
            ) == size, "Size of tensor does not fit required size, {} vs {}".format(
                len(val.shape), size)
        elif isinstance(val, list):
            assert len(
                val[0].shape
            ) == size - 1, "Size of list element does not fit required size, {} vs {}".format(
                len(val[0].shape), size - 1)
        else:
            raise NotImplementedError(
                "Input type {} not supported for dimensionality check!".format(
                    type(val)))
        if (val[0].shape[1] > 10000) or (val[0].shape[2] > 10000):
            raise ValueError("Image dimensions exceed 10000 pixels; refusing to log.")

    def log_scalar(self, scalar, name, step, phase):
        self._summ_writer.add_scalar('{}_{}'.format(name, phase), scalar, step)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_images(self, image, name, step, phase):
        self._check_size(image, 4)  # [N, C, H, W]
        self._loop_batch(self._summ_writer.add_image,
                         '{}_{}'.format(name, phase), image, step)

    def log_video(self, video_frames, name, step, phase):
        assert len(
            video_frames.shape
        ) == 4, "Need [T, C, H, W] input tensor for single video logging!"
        if not isinstance(video_frames, torch.Tensor):
            video_frames = torch.tensor(video_frames)
        video_frames = torch.transpose(video_frames, 0,
                                       1)  # tbX requires [C, T, H, W]
        video_frames = video_frames.unsqueeze(
            0)  # add an extra dimension to get grid of size 1
        self._summ_writer.add_video('{}_{}'.format(name, phase), video_frames,
                                    step)

    def log_videos(self, video_frames, name, step, phase, fps=3):
        assert len(
            video_frames.shape
        ) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        video_frames = video_frames.unsqueeze(
            1)  # add an extra dimension after batch to get grid of size 1
        self._loop_batch(self._summ_writer.add_video,
                         '{}_{}'.format(name, phase),
                         video_frames,
                         step,
                         fps=fps)

    def log_image(self, images, name, step, phase):
        self._summ_writer.add_image('{}_{}'.format(name, phase), images, step)

    def log_image_grid(self, images, name, step, phase, nrow=8):
        assert len(
            images.shape
        ) == 4, "Image grid logging requires input shape [batch, C, H, W]!"
        img_grid = torchvision.utils.make_grid(images, nrow=nrow)
        # make_grid returns a single [C, H, W] image, so log it as one image.
        self.log_image(img_grid, name, step, phase)

    def log_video_grid(self, video_frames, name, step, phase, fps=3):
        assert len(
            video_frames.shape
        ) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}_{}'.format(name, phase),
                                    video_frames,
                                    step,
                                    fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[
            0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._loop_batch(self._summ_writer.add_figure,
                         '{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(
            self._log_dir,
            "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)
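A usage sketch (all values made up) of the scalar and video interfaces; log_videos expects an [N, T, C, H, W] tensor:

import torch

logger = Logger('runs/demo')
logger.log_scalar(0.42, 'loss', 1, 'train')
# two fake 8-frame RGB videos of size 64x64
fake_videos = torch.randint(0, 255, (2, 8, 3, 64, 64), dtype=torch.uint8)
logger.log_videos(fake_videos, 'rollout', 1, 'train')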
Example #7
def infer(model,
          path,
          detections_file,
          resize,
          max_size,
          batch_size,
          mixed_precision=True,
          is_master=True,
          world=0,
          annotations=None,
          use_dali=True,
          is_validation=False,
          verbose=True,
          logdir=None,
          iteration=100):
    'Run inference on images from path'

    backend = 'pytorch' if isinstance(model, (Model, DDP)) else 'tensorrt'

    stride = model.module.stride if isinstance(model, DDP) else model.stride

    # Create annotations if none were provided
    if not annotations:
        annotations = tempfile.mktemp('.json')  # NOTE: mktemp is deprecated; mkstemp would be safer
        images = [{
            'id': i,
            'file_name': f
        } for i, f in enumerate(os.listdir(path))]
        with open(annotations, 'w') as out:
            json.dump({'images': images}, out)

    # TensorRT only supports fixed input sizes, so override input size accordingly
    if backend == 'tensorrt': max_size = max(model.input_size)

    # Prepare dataset
    if verbose: print('Preparing dataset...')
    data_iterator = (DaliDataIterator if use_dali else DataIterator)(
        path,
        resize,
        max_size,
        batch_size,
        stride,
        world,
        annotations,
        training=False)
    if verbose: print(data_iterator)

    # Prepare model
    if backend == 'pytorch':
        # If we are doing validation during training,
        # no need to register model with AMP again
        if not is_validation:
            if torch.cuda.is_available(): model = model.cuda()
            model = amp.initialize(model,
                                   None,
                                   opt_level='O2' if mixed_precision else 'O0',
                                   keep_batchnorm_fp32=True,
                                   verbosity=0)

        model.eval()

    if verbose:
        print('   backend: {}'.format(backend))
        print('    device: {} {}'.format(
            world, 'cpu' if not torch.cuda.is_available() else
            'gpu' if world == 1 else 'gpus'))
        print('     batch: {}, precision: {}'.format(
            batch_size, 'unknown' if backend == 'tensorrt' else
            'mixed' if mixed_precision else 'full'))
        print('Running inference...')

    results = []
    profiler = Profiler(['infer', 'fw'])
    with torch.no_grad():
        for i, (data, ids, ratios) in enumerate(data_iterator):
            # Forward pass
            profiler.start('fw')
            scores, boxes, classes = model(data)
            profiler.stop('fw')

            results.append([scores, boxes, classes, ids, ratios])

            profiler.bump('infer')
            if verbose and (profiler.totals['infer'] > 60
                            or i == len(data_iterator) - 1):
                size = len(data_iterator.ids)
                msg = '[{:{len}}/{}]'.format(min((i + 1) * batch_size, size),
                                             size,
                                             len=len(str(size)))
                msg += ' {:.3f}s/{}-batch'.format(profiler.means['infer'],
                                                  batch_size)
                msg += ' (fw: {:.3f}s)'.format(profiler.means['fw'])
                msg += ', {:.1f} im/s'.format(batch_size /
                                              profiler.means['infer'])
                print(msg, flush=True)

                profiler.reset()

    # Gather results from all devices
    if verbose: print('Gathering results...')
    results = [torch.cat(r, dim=0) for r in zip(*results)]
    if world > 1:
        for r, result in enumerate(results):
            all_result = [
                torch.ones_like(result, device=result.device)
                for _ in range(world)
            ]
            torch.distributed.all_gather(list(all_result), result)
            results[r] = torch.cat(all_result, dim=0)

    if is_master:
        # Copy buffers back to host
        results = [r.cpu() for r in results]

        # Collect detections
        detections = []
        processed_ids = set()
        for scores, boxes, classes, image_id, ratios in zip(*results):
            image_id = image_id.item()
            if image_id in processed_ids:
                continue
            processed_ids.add(image_id)

            keep = (scores > 0).nonzero()
            scores = scores[keep].view(-1)
            boxes = boxes[keep, :].view(-1, 4) / ratios
            classes = classes[keep].view(-1).int()

            for score, box, cat in zip(scores, boxes, classes):
                x1, y1, x2, y2 = box.data.tolist()
                cat = cat.item()
                if 'annotations' in data_iterator.coco.dataset:
                    cat = data_iterator.coco.getCatIds()[cat]
                detections.append({
                    'image_id': image_id,
                    'score': score.item(),
                    'bbox': [x1, y1, x2 - x1 + 1, y2 - y1 + 1],
                    'category_id': cat
                })

        if detections:
            # Save detections
            if detections_file and verbose:
                print('Writing {}...'.format(detections_file))
            detections = {'annotations': detections}
            detections['images'] = data_iterator.coco.dataset['images']
            if 'categories' in data_iterator.coco.dataset:
                detections['categories'] = [
                    data_iterator.coco.dataset['categories']
                ]
            if detections_file:
                json.dump(detections, open(detections_file, 'w'), indent=4)

            # Evaluate model on dataset
            if 'annotations' in data_iterator.coco.dataset:
                if verbose: print('Evaluating model...')
                with redirect_stdout(None):
                    coco_pred = data_iterator.coco.loadRes(
                        detections['annotations'])
                    coco_eval = COCOeval(data_iterator.coco, coco_pred, 'bbox')
                    coco_eval.evaluate()
                    coco_eval.accumulate()
                coco_eval.summarize()
                results = coco_eval.stats
                # Create TensorBoard writer
                if logdir is not None:
                    from tensorboardX import SummaryWriter
                    if is_master and verbose:
                        print('Infer writer: Writing TensorBoard logs to: {}'.
                              format(logdir))
                    writer = SummaryWriter(logdir=logdir)
                    if len(results) > 0:
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=all/maxDets=100',
                            results[0], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50/area=all/maxDets=100',
                            results[1], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.75/area=all/maxDets=100',
                            results[2], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=small/maxDets=100',
                            results[3], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=medium/maxDets=100',
                            results[4], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=large/maxDets=100',
                            results[5], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=1',
                            results[6], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=10',
                            results[7], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=100',
                            results[8], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=small/maxDets=100',
                            results[9], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=medium/maxDets=100',
                            results[10], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=large/maxDets=100',
                            results[11], iteration)
                    writer.close()
        else:
            print('No detections!')

        if logdir is not None and detections_file is not None:
            from tensorboardX import SummaryWriter
            if is_master and verbose:
                print('Writing TensorBoard logs to: {}'.format(logdir))
            writer = SummaryWriter(logdir=logdir)

            def get_bounding_boxes(annotations: List, image_id: int) -> List:
                return [a for a in annotations if a["image_id"] == image_id]

            with open(detections_file, "r") as file:
                all_detections = json.load(file)

            with open(annotations, "r") as file:
                all_ground_truths = json.load(file)

            i = 0
            for image_json in all_detections["images"][:3]:
                image_id = image_json["id"]
                image_path = path + '/' + image_json["file_name"]
                image = io.imread(image_path)

                assert (image_json["file_name"] == [
                    x["file_name"] for x in all_ground_truths["images"]
                    if x["id"] == image_id
                ][0])

                fig, ax = plt.subplots(figsize=(16, 16))
                ax.imshow(image)

                detections = get_bounding_boxes(all_detections["annotations"],
                                                image_id)
                detections = [d for d in detections if d["score"] > 0.5]

                ground_truths = get_bounding_boxes(
                    all_ground_truths["annotations"], image_id)

                for d in detections:
                    x, y, width, height = d["bbox"]
                    score = d["score"]
                    category_id = d["category_id"]

                    rectangle = patches.Rectangle(
                        (x, y),
                        width,
                        height,
                        linewidth=2,
                        edgecolor="r",
                        facecolor="none",
                    )
                    ax.add_patch(rectangle)

                    ax.text(
                        x,
                        y - 4,
                        f"{category_id}: {score:0.2f}",
                        color="r",
                        fontsize=20,
                        fontweight="bold",
                    )

                for gt in ground_truths:
                    x, y, width, height = gt["bbox"]

                    rectangle = patches.Rectangle(
                        (x, y),
                        width,
                        height,
                        linewidth=2,
                        edgecolor="b",
                        facecolor="none",
                    )
                    ax.add_patch(rectangle)

                ax.axis("off")
                writer.add_figure('images', fig, i)
                i += 1

            writer.close()
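The multi-GPU merge above relies on torch.distributed.all_gather, which requires every rank to contribute a tensor of the same shape (true here since each rank sees the same number of batches). A single-process sketch of the pattern (backend, address and port are illustrative):

import torch
import torch.distributed as dist

dist.init_process_group('gloo', init_method='tcp://127.0.0.1:29500',
                        rank=0, world_size=1)
result = torch.arange(6).reshape(2, 3)
# one receive buffer per rank; all must match the sent tensor's shape
all_result = [torch.empty_like(result) for _ in range(dist.get_world_size())]
dist.all_gather(all_result, result)
merged = torch.cat(all_result, dim=0)  # (2 * world_size, 3)
dist.destroy_process_group()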
Example #8
with torch.no_grad():
    # Everything is in one batch, so this loop will only happen once
    for i, data in enumerate(vizloader):
        maze_loc_goal_ssps, directions, locs, goals = data

        outputs = model(maze_loc_goal_ssps)

        loss = criterion(outputs, directions)

        # print(loss.data.item())

    if args.logdir != '':
        fig_pred = plot_path_predictions(directions=outputs,
                                         coords=locs,
                                         type='colour')
        writer.add_figure('viz set predictions', fig_pred)
        fig_truth = plot_path_predictions(directions=directions,
                                          coords=locs,
                                          type='colour')
        writer.add_figure('ground truth', fig_truth)

        fig_pred_quiver = plot_path_predictions(directions=outputs,
                                                coords=locs,
                                                dcell=xs[1] - xs[0])
        writer.add_figure('viz set predictions quiver', fig_pred_quiver)
        fig_truth_quiver = plot_path_predictions(directions=directions,
                                                 coords=locs,
                                                 dcell=xs[1] - xs[0])
        writer.add_figure('ground truth quiver', fig_truth_quiver)

        writer.add_scalar('viz_loss', loss.item())  # no global_step given, so this logs a single point
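plot_path_predictions is project-specific; a minimal stand-in (names and shapes are illustrative) that renders 2D direction predictions as a quiver figure suitable for add_figure:

import matplotlib.pyplot as plt
import numpy as np

def quiver_figure(coords, directions):
    """coords, directions: (N, 2) arrays of positions and predicted headings."""
    fig, ax = plt.subplots()
    ax.quiver(coords[:, 0], coords[:, 1], directions[:, 0], directions[:, 1])
    ax.set_aspect('equal')
    return fig

coords = np.random.uniform(0, 1, size=(64, 2))
directions = np.random.uniform(-1, 1, size=(64, 2))
fig = quiver_figure(coords, directions)
# writer.add_figure('viz set predictions quiver', fig)  # given a SummaryWriter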
Example #9
def main(args):
    ''' --- SELECT DEVICES --- '''
    # Select either gpu or cpu
    device = torch.device("cuda" if args.cuda else "cpu")
    # Select among available GPUs.
    # NOTE: CUDA_VISIBLE_DEVICES only takes effect if set before CUDA is
    # first initialized in this process.
    if args.cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            str(x) for x in args.gpudevice)
    ''' --- CREATE EXPERIMENTS DIRECTORY AND LOGGERS IN TENSORBOARD --- '''
    projdir = sys.path[0]
    # Path for saving and loading the network.
    saveloadpath = os.path.join(projdir, 'experiment\\checkpoints',
                                args.exp_name + '.pth')
    Path(os.path.dirname(saveloadpath)).mkdir(exist_ok=True, parents=True)
    # timestamp = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))
    tblogdir = os.path.join(projdir, 'experiment\\tensorboardX',
                            args.exp_name)  # + '_' + timestamp )
    Path(tblogdir).mkdir(exist_ok=True, parents=True)
    # Create tb_writer (used to write information to TensorBoard) via SummaryWriter;
    # flush_secs defines how many seconds to wait between flushes to disk.
    tb_writer = SummaryWriter(logdir=tblogdir,
                              flush_secs=3,
                              write_to_disk=True)
    ''' --- INIT DATASETS AND DATALOADER (FOR SINGLE EPOCH) --- '''
    # Read data from file and split it into training and testing data, both spanning
    # multiple frames. Ts is the per-frame recording period: the automotive radar
    # records one frame every 82 ms (the LSTM needs this timing information).
    train_dataset, test_dataset, class_names = read_dataset(
        args.datapath, Ts=0.082, train_test_split=0.8)

    # Prepare the training and testing datasets. Both contain multiple frames of data;
    # each frame holds a unified number of detection points (NMAX points per frame).

    # Init the test dataset (beware: NO data augmentation here). Resampling(maxPointsPerFrame=10)
    # is needed as long as PointNet/PointNet++ is the feature extractor; it is not needed
    # if we rely on feature engineering to extract features.
    test_dataTransformations = transforms.Compose([
        NormalizeTime(),
        FeatureEngineering() if args.head_network == 'manual' else Resampling(
            maxPointsPerFrame=10)
    ])
    testDataset = RadarClassDataset(dataset=test_dataset,
                                    transforms=test_dataTransformations,
                                    sequence_length=args.sequence_length)
    # Init train dataset. As above, Resampling(maxPointsPerFrame=10) is needed whenever PointNet/PointNet++
    # is the feature extractor, and not needed when we rely on manual feature engineering.
    train_dataTransformations = transforms.Compose([
        NormalizeTime(),
        DataAugmentation(),
        FeatureEngineering() if args.head_network == 'manual' else Resampling(
            maxPointsPerFrame=10)
    ])
    trainDataset = RadarClassDataset(dataset=train_dataset,
                                     transforms=train_dataTransformations,
                                     sequence_length=args.sequence_length)
    # Create dataloader for training by using batch_size frames' data in each batch
    trainDataLoader = DataLoader(trainDataset,
                                 batch_size=args.batchsize,
                                 shuffle=True,
                                 num_workers=args.num_workers)
    ''' --- INIT NETWORK MODEL --- '''
    # Load PointLSTM network model and put it to right device
    classifier = PointLSTM(
        head_name=args.head_network,
        num_class=args.numclasses,
        pointCoordDim=6,
        # If args.head_network=='manual', num_features must match the number of features extracted by the
        # FeatureEngineering class in file RadarFeatureTransforms.py
        num_features=34 if args.head_network == 'manual' else
        128  # num_features is 34 (not 15) because 'hist_v' and 'hist_RCS' are vectors of size [10, 1] and 'eig_cov_xy' is a vector of size [2, 1]
    ).to(device)
    ''' --- LOAD NETWORK IF EXISTS --- '''
    if os.path.exists(saveloadpath):
        print('Using pretrained model found...')
        checkpoint = torch.load(saveloadpath)
        start_epoch = checkpoint[
            'epoch'] + 1  # +1 so the printed epoch count starts from 1 rather than 0
        iteration = checkpoint['iteration']
        best_test_acc = checkpoint['test_accuracy']
        classifier.load_state_dict(checkpoint['model_state_dict'])
    else:
        print('No existing model, starting training from scratch...')
        start_epoch = 1  # start counting from 1 rather than 0 when printing start_epoch
        iteration = 1  # start counting from 1 rather than 0 when printing iteration
        best_test_acc = 0
    ''' --- CREATE OPTIMIZER ---'''
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.lr,
                                    momentum=0.9)
    elif args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=args.decay_rate)
    else:
        raise ValueError('Unsupported optimizer: {}'.format(args.optimizer))
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_epoch_half,
        gamma=0.5)  # halve the learning rate every 'step_size' epochs

    # log info
    printparams = 'Model parameters:' + json.dumps(
        vars(args), indent=4, sort_keys=True)
    print(printparams)
    tb_writer.add_text('hyper-parameters', printparams,
                       iteration)  # tb_writer.add_hparam(args)
    tb_writer.add_text(
        'dataset', 'dataset sample size: training: {}, test: {}'.format(
            train_dataset.shape[0], test_dataset.shape[0]), iteration)
    ''' --- START TRANING ---'''
    for epoch in range(start_epoch, args.epoch + 1):
        print('Epoch %d/%s:' % (epoch, args.epoch))

        # Add the "learning rate" into tensorboard scalar which will be shown in tensorboard
        tb_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'],
                             iteration)

        for batch_id, data in tqdm(enumerate(trainDataLoader, 0),
                                   total=len(trainDataLoader),
                                   smoothing=0.9):
            points, target = data  # (B:batch x S:seq x C:features x N:points) , (B x S:seq)
            # Convert everything to float (avoids dtype errors inside the model) and move to device
            points, target = points.float().to(device), target.float().to(
                device)  # (B x S x C x N) , (B)
            # Reset gradients
            optimizer.zero_grad()
            # Sets the module in training mode
            classifier = classifier.train()
            # Forward propagation
            pred = classifier(points)
            # Calculate cross entropy loss. The pointnet/pointnet2 model outputs log_softmax, and
            # log_softmax followed by nll_loss equals CrossEntropyLoss, so F.nll_loss suffices here.
            loss = F.nll_loss(pred, target.long())
            if args.head_network == 'pointnet':
                loss += feature_transform_regularizer(
                    classifier.head.trans) * 0.001
                if args.feature_transform:
                    loss += feature_transform_regularizer(
                        classifier.head.trans_feat) * 0.001
            # Back propagate
            loss.backward()
            # Update weights
            optimizer.step()
            # Log once for every 5 batches, add the "train_loss/cross_entropy" into tensorboard scalar which will be shown in tensorboard
            if not batch_id % 5:
                tb_writer.add_scalar('train_loss/cross_entropy', loss.item(),
                                     iteration)
            iteration += 1
            # if batch_id> 2: break

        scheduler.step()
        ''' --- TEST AND SAVE NETWORK --- '''
        if not epoch % 10:  # evaluate and checkpoint every 10 epochs
            # Perform predictions on the training data
            train_targ, train_pred = test(classifier,
                                          trainDataset,
                                          device,
                                          num_workers=args.num_workers,
                                          batch_size=1800)
            # Perform predictions on the testing data
            test_targ, test_pred = test(classifier,
                                        testDataset,
                                        device,
                                        num_workers=args.num_workers,
                                        batch_size=1800)

            # Calculate the accuracy rate for training data
            train_acc = metrics_accuracy(train_targ, train_pred)
            # Calculate the accuracy rate for testing data
            test_acc = metrics_accuracy(test_targ, test_pred)
            print('\r Training loss: {}'.format(loss.item()))
            print('Train Accuracy: {}\nTest Accuracy: {}'.format(
                train_acc, test_acc))
            # Add the "train_acc" "test_acc" into tensorboard scalars which will be shown in tensorboard
            tb_writer.add_scalars('metrics/accuracy', {
                'train': train_acc,
                'test': test_acc
            }, iteration)

            # Calculate confusion matrix
            confmatrix_test = metrics_confusion_matrix(test_targ, test_pred)
            print('Test confusion matrix: \n', confmatrix_test)
            # Log confusion matrix
            fig, ax = plot_confusion_matrix(confmatrix_test,
                                            class_names,
                                            normalize=False,
                                            title='Test Confusion Matrix')
            # Log normalized confusion matrix
            fig_n, ax_n = plot_confusion_matrix(
                confmatrix_test,
                class_names,
                normalize=True,
                title='Test Confusion Matrix - Normalized')
            # Add the "confusion matrix" "normalized confusion matrix" into tensorboard figure which will be shown in tensorboard
            tb_writer.add_figure('test_confusion_matrix/abs',
                                 fig,
                                 global_step=iteration,
                                 close=True)
            tb_writer.add_figure('test_confusion_matrix/norm',
                                 fig_n,
                                 global_step=iteration,
                                 close=True)

            # Log precision recall curves
            for idx, clsname in enumerate(class_names):
                # Convert log_softmax to softmax(which is actual probability) and select the desired class
                test_pred_binary = torch.exp(test_pred[:, idx])
                test_targ_binary = test_targ.eq(idx)
                # Add the "precision recall curves" which will be shown in tensorboard
                tb_writer.add_pr_curve(tag='pr_curves/' + clsname,
                                       labels=test_targ_binary,
                                       predictions=test_pred_binary,
                                       global_step=iteration)
            ''' --- SAVE NETWORK --- '''
            # if test_acc >= best_test_acc:  # for now, save every time, since we only test on a subset of the test dataset
            best_test_acc = test_acc  # if test_acc > best_test_acc else best_test_acc
            state = {
                'epoch': epoch,
                'iteration': iteration,
                'train_accuracy': train_acc if args.train_metric else 0.0,
                'test_accuracy': best_test_acc,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(state, saveloadpath)
            print('Model saved!!!')

    print('Best Accuracy: %f' % best_test_acc)

    tb_writer.close()
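
The comment in the training loop above leans on the identity that log_softmax followed by nll_loss equals CrossEntropyLoss; a minimal self-contained check of that equivalence (the tensor shapes here are illustrative, not taken from the snippet):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)          # (batch, classes)
target = torch.randint(0, 10, (4,))  # integer class labels

# nll_loss applied to a log_softmax output...
loss_a = F.nll_loss(F.log_softmax(logits, dim=1), target)
# ...matches cross_entropy applied to the raw logits.
loss_b = F.cross_entropy(logits, target)
assert torch.allclose(loss_a, loss_b)
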
Exemplo n.º 10
0
def train(batch_size, num_train_steps, generator, discriminator, model_dir,
          beat_type, device):

    #
    # Support for tensorboard:
    #
    writer = SummaryWriter(model_dir)

    #
    # 1. create the ECG dataset:
    #
    positive_configs = dataset_configs.DatasetConfigs(
        'train',
        beat_type,
        one_vs_all=True,
        lstm_setting=False,
        over_sample_minority_class=False,
        under_sample_majority_class=False,
        only_take_heartbeat_of_type=beat_type,
        add_data_from_gan=False,
        gan_configs=None)

    dataset = ecg_dataset_pytorch.EcgHearBeatsDatasetPytorch(
        positive_configs, transform=ecg_dataset_pytorch.ToTensor())

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=1)
    print("Size of real dataset is {}".format(len(dataset)))

    #
    # 2. Create the Networks:
    #
    netG = generator.float()
    netD = discriminator.float()

    num_d_iters = 5
    weight_clipping_limit = 0.01
    #
    # The WGAN losses are computed inline below from the mean critic outputs.
    #

    # Optimizers:
    # WGAN values from paper
    lr = 0.00005

    writer.add_scalar('Learning_Rate', lr)
    # WGAN with gradient clipping uses RMSprop instead of ADAM
    optimizer_d = torch.optim.RMSprop(netD.parameters(), lr=lr)
    optimizer_g = torch.optim.RMSprop(netG.parameters(), lr=lr)

    # Noise for validation:
    val_noise = torch.from_numpy(np.random.uniform(
        0, 1, (4, 100))).float().to(device)
    loss_d_real_hist = []
    loss_d_fake_hist = []
    loss_g_fake_hist = []
    norm_grad_g = []
    norm_grad_d = []
    d_real_pred_hist = []
    d_fake_pred_hist = []
    epoch = 0
    iters = 0
    while True:
        num_of_beats_seen = 0
        if iters == num_train_steps:
            break
        for i, data in enumerate(dataloader):
            if iters == num_train_steps:
                break

            # Train the Discriminator (forward - loss - backward - update) num_d_iters times for each single
            # Generator forward - loss - backward - update
            for p in netD.parameters():
                p.requires_grad = True
            for d_iter in range(num_d_iters):

                netD.zero_grad()

                # Clamp parameters to the range [-c, c], c = weight_clipping_limit
                for p in netD.parameters():
                    p.data.clamp_(-weight_clipping_limit, weight_clipping_limit)

                ecg_batch = data['cardiac_cycle'].float().to(device)
                b_size = ecg_batch.shape[0]

                # Check for batch to have full batch_size
                if (b_size != batch_size):
                    continue
                num_of_beats_seen += ecg_batch.shape[0]

                output = netD(ecg_batch)

                # Adversarial loss
                loss_d_real = -torch.mean(output)

                writer.add_scalar('Discriminator/cross_entropy_on_real_batch',
                                  loss_d_real.item(),
                                  global_step=iters)
                writer.add_scalars(
                    'Merged/losses',
                    {'d_cross_entropy_on_real_batch': loss_d_real.item()},
                    global_step=iters)
                loss_d_real.backward()
                loss_d_real_hist.append(loss_d_real.item())

                mean_d_real_output = output.mean().item()
                d_real_pred_hist.append(mean_d_real_output)

                #
                # D loss from fake:
                #
                noise_input = torch.from_numpy(
                    np.random.uniform(0, 1, (b_size, 100))).float().to(device)

                output_g_fake = netG(noise_input)
                output = netD(output_g_fake.detach())

                loss_d_fake = torch.mean(output)
                # ce_loss_d_fake = cross_entropy_loss(output, labels)
                writer.add_scalar('Discriminator/cross_entropy_on_fake_batch',
                                  loss_d_fake.item(), iters)
                writer.add_scalars(
                    'Merged/losses',
                    {'d_cross_entropy_on_fake_batch': loss_d_fake.item()},
                    global_step=iters)
                loss_d_fake.backward()

                loss_d_fake_hist.append(loss_d_fake.item())

                mean_d_fake_output = output.mean().item()
                d_fake_pred_hist.append(mean_d_fake_output)
                total_loss_d = loss_d_fake + loss_d_real
                writer.add_scalar(tag='Discriminator/total_loss',
                                  scalar_value=total_loss_d.item(),
                                  global_step=iters)
                optimizer_d.step()

            #
            # Generator updates:
            #
            for p in netD.parameters():
                p.requires_grad = False  # to avoid computation

            netG.zero_grad()

            noise_input = torch.from_numpy(
                np.random.uniform(0, 1, (batch_size, 100))).float().to(device)

            output_g_fake = netG(noise_input)

            output = netD(output_g_fake)

            # Adversarial loss
            loss_g_fake = -torch.mean(output)

            loss_g_fake.backward()
            loss_g_fake_hist.append(loss_g_fake.item())
            writer.add_scalar(tag='Generator/cross_entropy_on_fake_batch',
                              scalar_value=loss_g_fake.item(),
                              global_step=iters)
            writer.add_scalars(
                'Merged/losses',
                {'g_cross_entropy_on_fake_batch': loss_g_fake.item()},
                global_step=iters)
            mean_d_fake_output_2 = output.mean().item()

            optimizer_g.step()

            print(
                "{}/{}: Epoch #{}: Iteration #{}: Mean D(real_hb_batch) = {}, mean D(G(z)) = {}."
                .format(num_of_beats_seen, len(dataset), epoch, iters,
                        mean_d_real_output, mean_d_fake_output),
                end=" ")
            print("mean D(G(z)) = {} After backprop of D".format(
                mean_d_fake_output_2))

            print(
                "Loss D from real beats = {}. Loss D from Fake beats = {}. Total Loss D = {}"
                .format(loss_d_real, loss_d_fake, total_loss_d),
                end=" ")
            print("Loss G = {}".format(loss_g_fake))

            # Norm of gradients:
            gNormGrad = get_gradient_norm_l2(netG)
            dNormGrad = get_gradient_norm_l2(netD)
            writer.add_scalar('Generator/gradients_norm', gNormGrad, iters)
            writer.add_scalar('Discriminator/gradients_norm', dNormGrad, iters)
            norm_grad_d.append(dNormGrad)
            norm_grad_g.append(gNormGrad)
            print(
                "Generator Norm of gradients = {}. Discriminator Norm of gradients = {}."
                .format(gNormGrad, dNormGrad))

            if iters % 25 == 0:
                with torch.no_grad():
                    output_g = netG(val_noise)
                    fig = plt.figure()
                    plt.title(
                        "Fake beats from Generator. iteration {}".format(iters))
                    for p in range(4):
                        plt.subplot(2, 2, p + 1)
                        plt.plot(output_g[p].cpu().detach().numpy(),
                                 label="fake beat")
                        plt.plot(ecg_batch[p].cpu().detach().numpy(),
                                 label="real beat")
                        plt.legend()
                    writer.add_figure('Generator/output_example', fig, iters)
                    plt.close()
            iters += 1
        epoch += 1

    torch.save(
        {
            'epoch': epoch,
            'generator_state_dict': netG.state_dict(),
            'discriminator_state_dict': netD.state_dict(),
            'optimizer_g_state_dict': optimizer_g.state_dict(),
            'optimizer_d_state_dict': optimizer_d.state_dict(),
        }, model_dir + '/checkpoint_epoch_{}_iters_{}'.format(epoch, iters))
    writer.close()
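
get_gradient_norm_l2 is an external helper not shown in this snippet; a minimal sketch, assuming it returns the global L2 norm over all parameter gradients:

import torch

def get_gradient_norm_l2(model: torch.nn.Module) -> float:
    # Sum the squared per-parameter gradient norms, then take the square root.
    total = 0.0
    for p in model.parameters():
        if p.grad is not None:
            total += p.grad.detach().norm(2).item() ** 2
    return total ** 0.5
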
Exemplo n.º 11
0
def test(test_loader, model):
    top1 = AverageMeter(config)
    top5 = AverageMeter(config)
    matrix = runningScore(config=config)
    matrix.reset()
    times = 0.0
    timeall = 0.0
    precision1 = 0
    precision5 = 0
    # 3.1 confirm the model is converted to cuda
    # progress bar
    test_progressor = ProgressBar(mode="test",
                                  model_name=config.model_name,
                                  total=len(test_loader),
                                  weights=config.weights,
                                  Status=config.Status,
                                  current_time=config.time)
    # 2.2 switch to evaluate mode and confirm the model has been transferred to cuda
    model.cuda()
    model.eval()
    with torch.no_grad():
        for i, sample in enumerate(test_loader):
            image = sample['image']
            target = sample['label']
            test_progressor.current = i
            input2_size = image.size()
            input2 = np.zeros(input2_size).astype(np.float32)
            input2 = torch.from_numpy(input2).cuda()
            input = image.cuda()
            target = target.cuda()
            #target = Variable(target).cuda()
            # 2.2.1 compute output
            torch.cuda.synchronize()
            start = time.time()
            _,output= model(input2,input)
            torch.cuda.synchronize()
            end = time.time()
            times = end - start
            timeall = timeall + times
            # output=output.squeeze(2)
            # output=output.squeeze(2)
        
            # 2.2.2 measure accuracy and record loss
            precision1, precision5 = accuracy(output, target, topk=(1, 5))
            matrix.update(output,target)
            top1.update(precision1[0],input.size(0))
            # top1.perclass(class_correct,class_total)
            top5.update(precision5[0], input.size(0))
            test_progressor.current_top1 = top1.avg
            test_progressor.current_top5 = top5.avg
            test_progressor()

            _, predicted = torch.max(output, 1) 

            tag = obj[predicted.item()]
            right_label = obj[target.item()]
            resultdir = os.path.join(config.weights, config.model_name,
                                     config.Status, config.time)
            os.makedirs(resultdir, exist_ok=True)
            with open(resultdir + '/upload.csv', 'a') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow([right_label, tag])
            # img_path=resultdir+'/'+right_label+str(i)+'.png'
            # shutil.copy(str(origin_path),img_path)
        test_progressor.done()
        logdir = os.path.join(config.weights, config.model_name, config.Status,
                              config.time)
        writer = SummaryWriter(logdir)
        confusion_matrix = matrix.get_value()
        np.save(logdir + '/confusion.npy', confusion_matrix)
        # writer.add_figure('confusion matrix',figure=plot_confusion_matrix(confusion_matrix, object_names=obj, title='Not Normalized confusion matrix',normalize=False,),global_step=1)
        writer.add_figure('confusion matrix',
                          figure=plot_confusion_matrix(confusion_matrix,
                                                       object_names=obj,
                                                       title='Normalized confusion matrix',
                                                       config=config,
                                                       normalize=True),
                          global_step=1)
        # fig=plot_confusion_matrix(confusion_matrix,obj,'Test Confusion_matrix')
        writer.close()
        precision, recall = matrix.get_scores()
        with open(os.path.join(config.weights, config.model_name, config.Status,
                               config.time) + "/%s_test.txt" % config.model_name,
                  "a") as f:
            for i in range(config.num_classes):
                print('Precision of %5s : %f %%' % (obj[i], 100 * precision[i]),
                      file=f)
                print('Recall of %5s: %f%%' % (obj[i], 100 * recall[i]), file=f)
            print("Top1:%f,Top5:%f" % (top1.avg, top5.avg), file=f)
            print("avg Time:", timeall * 1000 / len(test_loader), "ms", file=f)
Exemplo n.º 12
0
            writer.add_scalar('loss-gen', loss_gen, n_gen_update)
            writer.add_scalar('n_clip_params', float(n_clip_params)/n_params, n_gen_update)
            writer.add_scalar('gen-grad-norm', gen_grad_norm, n_gen_update)
            writer.add_scalar('dis-grad-norm', dis_grad_norm, n_gen_update)

            writer.add_scalar('avg_loss-dis', loss_dis_avg, n_gen_update)
            writer.add_scalar('avg_loss-gen', loss_gen_avg, n_gen_update)
            writer.add_scalar('avg_n_clip_params', float(n_clip_params_avg)/n_params, n_gen_update)
            writer.add_scalar('avg_grad-norm-gen', gen_grad_norm_avg, n_gen_update)
            writer.add_scalar('avg_grad-norm-dis', dis_grad_norm_avg, n_gen_update)


            x_gen = gen(z)
            fig = plt.figure()
            plt.hist(x_gen.cpu().squeeze().data, bins=100)
            writer.add_figure('hist', fig, n_gen_update)
            plt.clf()

            fig = plt.figure()
            plt.hist(x_gen_avg.cpu().squeeze().data, bins=100)
            writer.add_figure('hist_avg', fig, n_gen_update)
            plt.clf()

            if args.save_stats:
                if n_gen_update == 1:
                    checkpoint_1 = torch.load(os.path.join(OUTPUT_PATH, 'checkpoints/%i.state'%(n_gen_update)), map_location=device)

                if n_gen_update > 1:
                    checkpoint_2 = torch.load(os.path.join(OUTPUT_PATH, 'checkpoints/%i.state'%(n_gen_update)), map_location=device)
                    hist = compute_path_stats(gen, dis, checkpoint_1, checkpoint_2, dataloader,
                                              args, model_loss_gen, model_loss_dis, device, verbose=True)
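
The fragment above logs n_clip_params / n_params, apparently the fraction of weights sitting at the WGAN clipping boundary; a hedged sketch of how such a count could be produced (the helper name and the clip value c are assumptions, not taken from the snippet):

import torch

def count_clipped_params(model: torch.nn.Module, c: float = 0.01) -> int:
    # Count weights whose magnitude has reached the clip limit after clamp_(-c, c).
    n = 0
    for p in model.parameters():
        n += int((p.data.abs() >= c).sum().item())
    return n
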
Exemplo n.º 13
0
def eval_epoch(model, data_loader, fold, epoch):
    writer = SummaryWriter(
        os.path.join(args.experiment_path, 'fold{}'.format(fold), 'eval'))

    metrics = {
        'loss': utils.Mean(),
    }

    model.eval()
    with torch.no_grad():
        fold_labels = []
        fold_logits = []
        fold_exps = []

        for images, feats, exps, labels, _ in tqdm(
                data_loader, desc='epoch {} evaluation'.format(epoch)):
            images, feats, labels = images.to(DEVICE), feats.to(
                DEVICE), labels.to(DEVICE)
            logits = model(images, feats)

            loss = compute_loss(input=logits, target=labels)
            metrics['loss'].update(loss.data.cpu().numpy())

            fold_labels.append(labels)
            fold_logits.append(logits)
            fold_exps.extend(exps)

        fold_labels = torch.cat(fold_labels, 0)
        fold_logits = torch.cat(fold_logits, 0)

        if epoch % 10 == 0:
            temp, metric, fig = find_temp_global(probs=fold_logits,
                                                 target=fold_labels,
                                                 exps=fold_exps)
            writer.add_scalar('temp', temp, global_step=epoch)
            writer.add_scalar('metric_final', metric, global_step=epoch)
            writer.add_figure('temps', fig, global_step=epoch)

        temp = 1.  # use default temp
        fold_preds = assign_classes(probs=to_prob(fold_logits,
                                                  temp).data.cpu().numpy(),
                                    exps=fold_exps)
        fold_preds = torch.tensor(fold_preds).to(fold_logits.device)
        metric = compute_metric(input=fold_preds,
                                target=fold_labels,
                                exps=fold_exps)

        metrics = {k: metrics[k].compute_and_reset() for k in metrics}
        for k in metric:
            metrics[k] = metric[k].mean().data.cpu().numpy()
        images = images_to_rgb(images)[:16]
        print('[FOLD {}][EPOCH {}][EVAL] {}'.format(
            fold, epoch,
            ', '.join('{}: {:.4f}'.format(k, metrics[k]) for k in metrics)))
        for k in metrics:
            writer.add_scalar(k, metrics[k], global_step=epoch)
        writer.add_image('images',
                         torchvision.utils.make_grid(
                             images,
                             nrow=math.ceil(math.sqrt(images.size(0))),
                             normalize=True),
                         global_step=epoch)

        return metrics
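
to_prob and find_temp_global are external helpers; a minimal sketch of temperature-scaled probabilities consistent with how to_prob is called above (treating its first argument as raw logits is an assumption):

import torch
import torch.nn.functional as F

def to_prob(logits: torch.Tensor, temp: float) -> torch.Tensor:
    # Divide the logits by the temperature before the softmax; temp=1 recovers a plain softmax.
    return F.softmax(logits / temp, dim=1)
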
Exemplo n.º 14
0
def main(args):
    # Get device
    #     device = torch.device('cuda'if torch.cuda.is_available()else 'cpu')
    device = 'cuda'
    # Define model
    model = FastSpeech().to(device)
    print("Model Has Been Defined")
    num_param = utils.get_param_num(model)
    print('Number of FastSpeech Parameters:', num_param)

    current_time = time.strftime("%Y-%m-%dT%H:%M", time.localtime())
    writer = SummaryWriter(log_dir='log/' + current_time)

    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(0.9, 0.98),
                                 eps=1e-9)

    # Load checkpoint if exists
    try:
        with open(os.path.join(hp.checkpoint_path,
                               'checkpoint.txt'), 'r') as checkpoint_in:
            args.restore_step = int(checkpoint_in.readline().strip())
        checkpoint = torch.load(
            os.path.join(hp.checkpoint_path,
                         'checkpoint_%08d.pth' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step %d---\n" % args.restore_step)
    except Exception:
        print("\n---Start New Training---\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)
    # Get dataset
    dataset = FastSpeechDataset()

    # Optimizer and loss

    scheduled_optim = ScheduledOptim(optimizer, hp.d_model, hp.n_warm_up_step,
                                     args.restore_step)
    fastspeech_loss = FastSpeechLoss().to(device)
    print("Defined Optimizer and Loss Function.")

    # Init logger
    if not os.path.exists(hp.logger_path):
        os.mkdir(hp.logger_path)

    # Define Some Information
    Time = np.array([])
    Start = time.perf_counter()

    # Training
    model = model.train()
    t_l = 0.0
    for epoch in range(hp.epochs):
        # Get Training Loader
        training_loader = DataLoader(dataset,
                                     batch_size=hp.batch_size**2,
                                     shuffle=True,
                                     collate_fn=collate_fn,
                                     drop_last=True,
                                     num_workers=0)
        total_step = hp.epochs * len(training_loader) * hp.batch_size

        for i, batchs in enumerate(training_loader):
            for j, data_of_batch in enumerate(batchs):
                start_time = time.perf_counter()

                current_step = i * hp.batch_size + j + args.restore_step + \
                    epoch * len(training_loader)*hp.batch_size + 1

                # Init
                scheduled_optim.zero_grad()

                # Get Data
                condition1 = torch.from_numpy(
                    data_of_batch["condition1"]).long().to(device)
                condition2 = torch.from_numpy(
                    data_of_batch["condition2"]).long().to(device)
                mel_target = torch.from_numpy(
                    data_of_batch["mel_target"]).long().to(device)
                norm_f0 = torch.from_numpy(
                    data_of_batch["norm_f0"]).long().to(device)
                mel_in = torch.from_numpy(
                    data_of_batch["mel_in"]).float().to(device)
                D = torch.from_numpy(data_of_batch["D"]).int().to(device)
                mel_pos = torch.from_numpy(
                    data_of_batch["mel_pos"]).long().to(device)
                src_pos = torch.from_numpy(
                    data_of_batch["src_pos"]).long().to(device)
                lens = data_of_batch["lens"]
                max_mel_len = data_of_batch["mel_max_len"]
                #                 print(condition1,condition2)
                # Forward
                mel_output = model(src_seq1=condition1,
                                   src_seq2=condition2,
                                   mel_in=mel_in,
                                   src_pos=src_pos,
                                   mel_pos=mel_pos,
                                   mel_max_length=max_mel_len,
                                   length_target=D)

                #                 print(mel_target.size())
                #                 print(mel_output)
                #                 print(mel_postnet_output)

                # Cal Loss
                #  mel_loss, mel_postnet_loss = fastspeech_loss(mel_output, mel_postnet_output, mel_target)
                #                 print(mel_output.shape,mel_target.shape)
                Loss = torch.nn.CrossEntropyLoss()
                predict = mel_output.transpose(1, 2)
                target1 = mel_target.long().squeeze()
                target2 = norm_f0.long().squeeze()
                target = ((target1 + target2) / 2).long().squeeze()

                #                 print(predict.shape,target.shape)
                #                 print(target.float().mean())
                losses = []
                #                 print(lens,target)
                for index in range(predict.shape[0]):
                    #                     print(predict[i,:,:lens[i]].shape,target[i,:lens[i]].shape)
                    losses.append(
                        Loss(predict[index, :, :lens[index]].transpose(0, 1),
                             target[index, :lens[index]]).unsqueeze(0))

                    # losses.append(0.5 * Loss(predict[index, :, :lens[index]].transpose(0, 1), target2[index, :lens[index]]).unsqueeze(0))
                total_loss = torch.cat(losses).mean()
                t_l += total_loss.item()

                #                 assert np.isnan(t_l)==False

                with open(os.path.join("logger", "total_loss.txt"),
                          "a") as f_total_loss:
                    f_total_loss.write(str(t_l) + "\n")

                # Backward
                if not np.isnan(t_l):
                    total_loss.backward()
                else:
                    print(condition1, condition2, D)

                # Clipping gradients to avoid gradient explosion
                nn.utils.clip_grad_norm_(model.parameters(),
                                         hp.grad_clip_thresh)

                # Update weights
                if args.frozen_learning_rate:
                    scheduled_optim.step_and_update_lr_frozen(
                        args.learning_rate_frozen)
                else:
                    scheduled_optim.step_and_update_lr()

                # Print
                if current_step % hp.log_step == 0:
                    Now = time.perf_counter()

                    str1 = "Epoch[{}/{}] Step[{}/{}]:".format(
                        epoch + 1, hp.epochs, current_step, total_step)
                    str2 = "Loss:{:.4f} ".format(t_l / hp.log_step)

                    str3 = "LR:{:.6f}".format(
                        scheduled_optim.get_learning_rate())
                    str4 = "T: {:.1f}s ETR:{:.1f}s.".format(
                        (Now - Start),
                        (total_step - current_step) * np.mean(Time))

                    print('\r' + str1 + ' ' + str2 + ' ' + str3 + ' ' + str4,
                          end='')
                    writer.add_scalar('loss', t_l / hp.log_step, current_step)
                    writer.add_scalar('learning rate',
                                      scheduled_optim.get_learning_rate(),
                                      current_step)

                    if hp.gpu_log_step != -1 and current_step % hp.gpu_log_step == 0:
                        os.system('nvidia-smi')

                    with open(os.path.join("logger", "logger.txt"),
                              "a") as f_logger:
                        f_logger.write(str1 + "\n")
                        f_logger.write(str2 + "\n")
                        f_logger.write(str3 + "\n")
                        f_logger.write(str4 + "\n")
                        f_logger.write("\n")

                    t_l = 0.0

                if current_step % hp.fig_step == 0 or current_step == 20:
                    f = plt.figure()
                    plt.matshow(mel_output[0].cpu().detach().numpy())
                    plt.savefig('out_predicted.png')
                    plt.matshow(
                        F.softmax(predict, dim=1).transpose(
                            1, 2)[0].cpu().detach().numpy())
                    plt.savefig('out_predicted_softmax.png')
                    writer.add_figure('predict', f, current_step)
                    plt.cla()

                    f = plt.figure(figsize=(8, 6))
                    #                   plt.matshow(mel_target[0].cpu().detach().numpy())
                    #                   x=np.arange(mel_target.shape[1])
                    #                   y=sample_from_discretized_mix_logistic(mel_output.transpose(1,2)).cpu().detach().numpy()[0]
                    #                   plt.plot(x,y)
                    sample = []
                    p = F.softmax(predict, dim=1).transpose(
                        1, 2)[0].detach().cpu().numpy()
                    for index in range(p.shape[0]):
                        sample.append(np.random.choice(200, 1, p=p[index]))
                    sample = np.array(sample)
                    plt.plot(np.arange(sample.shape[0]),
                             sample,
                             color='grey',
                             linewidth=1)
                    for index in range(D.shape[1]):
                        x = np.arange(D[0][index].cpu().numpy()
                                      ) + D[0][:index].cpu().numpy().sum()
                        y = np.arange(D[0][index].detach().cpu().numpy())
                        if condition2[0][index].cpu().numpy() != 0:
                            y.fill(
                                (condition2[0][index].cpu().numpy() - 40.0) *
                                5)
                            plt.plot(x, y, color='blue')
                    plt.plot(np.arange(target.shape[1]),
                             target[0].squeeze().detach().cpu().numpy(),
                             color='red',
                             linewidth=1)
                    plt.savefig('out_target.png', dpi=300)
                    writer.add_figure('target', f, current_step)
                    plt.cla()

                    plt.close("all")

                if current_step % (hp.save_step) == 0:
                    print("save model at step %d ..." % current_step, end='')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()
                        },
                        os.path.join(hp.checkpoint_path,
                                     'checkpoint_%08d.pth' % current_step))
                    with open(os.path.join(hp.checkpoint_path,
                                           'checkpoint.txt'), 'w') as checkpoint_out:
                        checkpoint_out.write(str(current_step))

                    #                     os.system('python savefig.py')

                    print('save completed')

                end_time = time.perf_counter()
                Time = np.append(Time, end_time - start_time)
                if len(Time) == hp.clear_Time:
                    # Collapse the timing history to its mean to keep the ETA estimate cheap
                    temp_value = np.mean(Time)
                    Time = np.array([temp_value])
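
The per-sample loop above restricts the cross-entropy to the first lens[index] frames of each sequence; an equivalent vectorized sketch using a padding mask (shapes follow the snippet: predict is (B, C, T) logits, target is (B, T) class ids):

import torch
import torch.nn.functional as F

def masked_cross_entropy(predict, target, lens):
    # Boolean mask of valid frames: (B, T), True where t < lens[b].
    T = predict.shape[2]
    lens = torch.as_tensor(lens, device=predict.device)
    mask = torch.arange(T, device=predict.device)[None, :] < lens[:, None]
    per_frame = F.cross_entropy(predict, target, reduction='none')  # (B, T)
    # Average over valid frames per sample, then over the batch,
    # matching the loop's per-sample-then-batch mean.
    per_sample = (per_frame * mask).sum(1) / mask.sum(1).clamp(min=1)
    return per_sample.mean()
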
Exemplo n.º 15
0
class Watcher(logging.getLoggerClass()):
    def __init__(self, log_path=None,
                 rank=0):  # local-rank is the most important term
        super().__init__(name="watcher-transformer")

        self.rank = rank

        self.progress_bar = None
        self.best_tracker = None
        self.tb_writer = None
        self.info_logger = None

        if self.rank == 0:

            formatter = logging.Formatter(
                '%(asctime)s %(levelname)s: - %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S')

            if log_path is not None:
                fh = logging.FileHandler(log_path)
                fh.setLevel(logging.DEBUG)
                fh.setFormatter(formatter)
                self.addHandler(fh)

            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.addHandler(ch)
            self.setLevel(logging.DEBUG)

        else:
            self.setLevel(logging.CRITICAL)

    def info(self, msg, *args, **kwargs):
        if self.rank == 0:
            super().info(msg, *args, **kwargs)

    # ----- progress bar ---- #
    def close_progress_bar(self):
        if self.rank == 0:
            if self.progress_bar is not None:
                self.progress_bar.close()

    def set_progress_bar(self, steps=0):
        if self.rank == 0:
            self.progress_bar = tqdm(
                total=steps, desc="start a new progress-bar", position=0)

    def step_progress_bar(self, info_str=None, step=1):
        if self.rank == 0:
            self.progress_bar.update(step)
            if info_str is not None:
                self.progress_bar.set_description(info_str)

    def set_languages(self, langs):
        try:
            from langid.langid import LanguageIdentifier, model
        except ImportError:
            raise ImportError('Please install the langid package')

        self.langid = LanguageIdentifier.from_modelstring(
            model, norm_probs=True)
        try:
            self.langid.set_languages(langs)
        except ValueError:
            self.langid.set_languages(['en'])

    # ----- tensorboard ---- #
    def set_tensorboard(self, path):
        if self.rank == 0:
            self.tb_writer = SummaryWriter(path)

    def add_tensorboard(self, name, value, iters, dtype='scalar'):
        if self.rank == 0:
            if dtype == 'scalar':
                self.tb_writer.add_scalar(name, value, iters)
            elif dtype == 'figure':
                self.tb_writer.add_figure(name, value, iters)
            elif dtype == 'text':
                self.tb_writer.add_text(name, value, iters)
            else:
                raise NotImplementedError

    # ----- best performance tracker ---- #
    def set_best_tracker(self, model, opt, save_path, device, *names):
        self.best_tracker = Best(
            max, *names, 'i', model=model, opt=opt, path=save_path, gpu=device)

    def acc_best_tracker(self, iters, *values):
        if self.rank == 0:
            self.best_tracker.accumulate(*values, iters)

    def detect_lang(self, line):
        return self.langid.classify(line)[0]

    def match_lang(self, line, lang):
        scores = {l: v for l, v in self.langid.rank(line)}
        if lang not in scores:
            raise KeyError
        return scores[lang]
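
A short usage sketch of the rank-0 gating above (the paths and tag names are placeholders): every distributed worker can issue the same calls, but only rank 0 writes logs, TensorBoard events, or progress-bar updates.

watcher = Watcher(log_path='run/watcher.log', rank=0)
watcher.set_tensorboard('run/tb')
watcher.set_progress_bar(steps=100)
for step in range(100):
    watcher.add_tensorboard('train/loss', 1.0 / (step + 1), step, dtype='scalar')
    watcher.step_progress_bar(info_str='step {}'.format(step))
watcher.close_progress_bar()
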
Exemplo n.º 16
0
class Mytensorboard(NetManager):
    INSTANCE = None

    def __init__(self, comment=''):
        self.writer = SummaryWriter(comment='_' + comment)
        self.writerLayout = {'Loss': {},
                             'PSNR': {},
                             'MSE': {}}
        self.step = 0

    @classmethod
    def get_instance(cls, comment=''):
        if cls.INSTANCE is None:
            cls.INSTANCE = Mytensorboard(comment=comment)
        return cls.INSTANCE

    def plotToTensorboard(self, fig, name):
        self.writer.add_figure(name, fig, global_step=self.step, close=True)

    def imgToTensorboard(self, img, name):
        # img = np.swapaxes(img, 0, 2)  # needed if your TensorFlow/TensorBoard version is >= 1.8 or you log via TensorFlow
        self.writer.add_image(name, img, global_step=self.step)

    def batchImageToTensorBoard(self, recon, resi, name):
        if recon is not None:
            img = (recon.cpu().detach().numpy() + resi.cpu().detach().numpy()) * 255.0
        else:
            img = (resi.cpu().detach().numpy()) * 255.0
        img = np.clip(img, 0, 255).astype(int)
        self.writer.add_image(name, img, global_step=self.step)

    def SaveImageToTensorBoard(self, name, image):
        image = np.clip(image * 255.0, 0, 255).astype(int)
        self.writer.add_image(name, image, global_step=self.step)

    def saveImageFromTest(self, recon, resi, name):
        img = (recon.cpu().detach().numpy() + resi.cpu().detach().numpy()) * 255.0
        img = np.clip(img, 0, 255).astype(np.uint8)
        img = Image.fromarray(img[0, 0])
        img.save(name + '.png')

    @staticmethod
    def Makegrid(imgs, nrow=None):
        if nrow is None:
            nrow = math.ceil(math.sqrt(imgs.shape[0]))
        return vutils.make_grid(imgs, nrow)

    def setObjectStep(self, num_set):
        self.object_step = num_set * self.OBJECT_EPOCH

    def plotScalars(self):
        for key, values in self.writerLayout.items():
            self.writer.add_scalars(key, values, self.step)

    def plotDifferent(self, img, name, percentile=90):
        if isinstance(img, torch.Tensor):
            img = (img.cpu().detach().numpy()) * 1023.0
        else:
            img = img * 1023.0
        percentile = percentile + (100 - percentile) // 2
        img = np.clip(img,
                      np.percentile(img, 100 - percentile, interpolation='higher'),
                      np.percentile(img, percentile, interpolation='lower'))
        img = np.clip(img, -1023.0, 1023.0)
        fig, ax = plt.subplots()
        if img.min() < 0 and img.max() > 0:
            mymax = max(abs(img.min()), img.max())
            mymin = -mymax
        else:
            mymin = img.min()
            mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax, interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name + '_percentile' + str(percentile))
        return

    def plotMSEImage(self, resi, name):
        img = ((resi.cpu().detach().numpy()) ** 2) * 1023.0 * 1023.0
        fig, ax = plt.subplots()
        if img.min() < 0 and img.max() > 0:
            mymax = max(abs(img.min()), img.max())
            mymin = -mymax
        else:
            mymin = img.min()
            mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax, interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name)
        return

    def plotMAEImage(self, resi, name, percentile=90):
        img = (resi.cpu().detach().numpy()) * 1023.0
        np.abs(img, out=img)
        fig, ax = plt.subplots()
        if percentile < 100:
            np.clip(img,
                    0,
                    np.percentile(img, percentile, interpolation='lower'), out=img)
        mymin = img.min()
        mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax, interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name + '_percentile' + str(percentile))
        return

    """
    Use to plot input data 2d
    vminmax is range as list or tuple, example [22, 37]
    for example is qp is 22, 27, 32, 37
    plotMap(qpmap, 'QPMap', vminmax = [22, 37], color_num = 4)
    """

    def plotMap(self, img, name, vminmax=None, color_num=None):
        if vminmax is None:
            vminmax = (img.min().cpu(), img.max().cpu())

        img = self.Makegrid(img)
        fig, ax = plt.subplots()
        img = img.cpu()

        if color_num is None:
            color_num = len(img.unique())
        imgs = ax.imshow((img.numpy()[0]).astype(int), vmin=vminmax[0], vmax=vminmax[1], interpolation='nearest',
                         cmap=plt.cm.get_cmap('viridis', color_num))
        v1 = np.round(np.linspace(vminmax[0], vminmax[1], 10, endpoint=True))
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name)
        return

    def SetLoss(self, name, value):
        self.writerLayout['Loss'][name] = value

    def SetPSNR(self, name, value):
        self.writerLayout['PSNR'][name] = value

    def SetMSE(self, name, value):
        self.writerLayout['MSE'][name] = value

    def SetLearningRate(self, value):
        self.writer.add_scalars('LearningRate', {'lr': value}, self.step)
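
A brief usage sketch of the layout dictionary above: scalar values are staged per group with SetLoss/SetPSNR/SetMSE and then written in one call to plotScalars at the current step (the values here are placeholders).

tb = Mytensorboard.get_instance(comment='demo')
tb.SetLoss('train', 0.12)
tb.SetPSNR('train', 38.5)
tb.SetMSE('train', 9.3)
tb.plotScalars()  # writes the 'Loss', 'PSNR' and 'MSE' scalar groups at tb.step
tb.step += 1
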
Exemplo n.º 17
0
def main(config_file):
    # read from config
    local_config = __import__(config_file)
    model_name = local_config.INPUTS['MODEL']
    model = getattr(__import__('birdsong.models', fromlist=[model_name]),
                    model_name)
    batch_size = local_config.INPUTS['BATCHSIZE']
    optimizer_name = local_config.INPUTS['OPTIMIZER']
    optimizer = getattr(__import__('torch.optim', fromlist=[optimizer_name]),
                        optimizer_name)
    num_epochs = local_config.INPUTS['EPOCHS']
    no_classes = local_config.INPUTS['CLASSES']
    learning_rate = local_config.INPUTS['LR']

    # logging
    start_time = time.time()
    date = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
    log_path = f'./birdsong/run_log/{model_name}_{date}'
    state_fname, log_fname, summ_tensor_board = logger.create_log(log_path)
    writer = SummaryWriter(str(summ_tensor_board))

    # Enhancement
    enh = None  #Exponent(0.17)

    # Augmentation
    aug = SoundscapeNoise('storage/noise_slices', scaling=0.4)

    # Datasets and Dataloaders
    ds_train = SpectralDataset(TRAIN,
                               INPUT_DIR,
                               enhancement_func=enh,
                               augmentation_func=aug)
    ds_test = SpectralDataset(TEST, INPUT_DIR, enhancement_func=enh)

    dl_train = DataLoader(ds_train,
                          batch_size,
                          num_workers=4,
                          pin_memory=PIN,
                          shuffle=True)
    dl_test = DataLoader(ds_test,
                         batch_size,
                         num_workers=4,
                         pin_memory=PIN,
                         shuffle=True)
    print('Dataloaders initialized')

    # Model
    time_axis = ds_test.shape[1]
    freq_axis = ds_test.shape[0]
    net = model(time_axis=time_axis,
                freq_axis=freq_axis,
                no_classes=no_classes)

    criterion = nn.CrossEntropyLoss()
    optimizer = optimizer(net.parameters(), lr=learning_rate)

    # Logging general run information:
    info = f""" INFO: \n
    File type: {FILE_TYPE} \n
    Optimizer: {optimizer_name} \n
    Batch Size: {batch_size} \n
    Classes: {no_classes} \n
    Enhancement: {ds_train.enhancement_func.__repr__()} \n
    Augmentation: {ds_train.augmentation_func.__repr__()} \n
    Supposed to run for: {num_epochs} epochs \n
    Date: {date}"""

    writer.add_text('Info: ', info)

    # local vars
    best_acc = 0
    for epoch in range(num_epochs):
        train(net, dl_train, epoch, optimizer, criterion, DEVICE)

        train_stats, train_preds = evaluate(net, dl_train, criterion,
                                            no_classes, DEVICE)
        print(
            f'Training: Loss: {train_stats[0]:.5f}, Acc: {train_stats[1]:.5f}, Top 5: {train_stats[2]:.5f}'
        )

        test_stats, test_preds = evaluate(net, dl_test, criterion, no_classes,
                                          DEVICE)
        print(
            f'Validation: Loss: {test_stats[0]:.5f}, Acc: {test_stats[1]:.5f}, Top 5: {test_stats[2]:.5f}'
        )

        is_best = test_stats[1] > best_acc
        best_acc = max(test_stats[1], best_acc)
        print('Best Accuracy: {:.5f}'.format(best_acc))

        logger.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_accuracy': best_acc,
            },
            is_best,
            filename=state_fname)

        # Store confusion matrix every 5 epochs or at the end of training
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            cm_train = plot_confusion_matrix(train_preds[0],
                                             train_preds[1],
                                             np.arange(no_classes),
                                             normalize=True)
            cm_val = plot_confusion_matrix(test_preds[0],
                                           test_preds[1],
                                           np.arange(no_classes),
                                           normalize=True)
            writer.add_figure('Training', cm_train, epoch)
            writer.add_figure('Validation', cm_val, epoch)

        logger.write_summary(writer, epoch, train_stats, test_stats)
        logger.dump_log_txt(date, start_time, local_config, train_stats,
                            test_stats, best_acc, epoch + 1, log_fname)

        # LR schedule
        update_lr(optimizer, epoch, learning_rate, 0.05)

    writer.close()
    print('Finished Training')
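
update_lr is not shown in this snippet; a minimal sketch of a manual decay helper consistent with the call update_lr(optimizer, epoch, learning_rate, 0.05) above (the exponential-decay form is an assumption):

def update_lr(optimizer, epoch, base_lr, decay):
    # Exponentially decay the base learning rate and write it into every param group.
    lr = base_lr * (1.0 - decay) ** epoch
    for group in optimizer.param_groups:
        group['lr'] = lr
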
Exemplo n.º 18
0
class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)

    def log_image(self, image, name, step):
        assert(len(image.shape) == 3)  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)

    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):

        # reshape each rollout to [T, C, H, W]
        videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
        videos = np.flip(videos, 3)
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        # max rollout length
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts to all be same length
        for i in range(max_videos_to_save):
            if videos[i].shape[0]<max_length:
                padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1))
                videos[i] = np.concatenate([videos[i], padding], 0)

        # log videos to tensorboard event file
        videos = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(videos, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """array: data array rendered via plot_graph and logged as an image"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
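
log_video asserts a [N, T, C, H, W] input; a tiny self-contained example that feeds it correctly shaped dummy frames (the log directory is a placeholder):

import torch

logger = Logger('run/logs')
# Two rollouts of 16 frames each, 3-channel 64x64 images, float values in [0, 1].
frames = torch.rand(2, 16, 3, 64, 64)
logger.log_video(frames, 'rollouts', step=0, fps=10)
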
Exemplo n.º 19
0
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))

            # write the data to TensorBoard
            # ...log the running loss
            writer.add_scalar('image training loss', running_loss / 2000,
                              epoch * len(trainloader) + i)

            # ...log a Matplotlib Figure showing the model's predictions on a
            # random mini-batch
            writer.add_figure('predictions vs. actuals',
                              plot_classes_preds(net, inputs, labels),
                              global_step=epoch * len(trainloader) + i)

            running_loss = 0.0

torch.save(net.state_dict(), PATH)

print('Finished Training')
print("Time taken:", datetime.now() - startTime)
print("***************************")

# get some random test data
print("Getting some random test data")
dataiter = iter(testloader)
images, labels = next(dataiter)
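
plot_classes_preds is an external helper (it matches the one used in the official PyTorch TensorBoard tutorial); a hedged sketch of a figure-producing helper in the same spirit, assuming images is an (N, C, H, W) tensor scaled to [0, 1]:

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

def plot_classes_preds(net, images, labels):
    # Build a figure showing up to four images with predicted vs. true class ids.
    with torch.no_grad():
        probs = F.softmax(net(images), dim=1)
        preds = probs.argmax(dim=1)
    fig = plt.figure(figsize=(12, 4))
    for i in range(min(4, images.size(0))):
        ax = fig.add_subplot(1, 4, i + 1)
        ax.imshow(images[i].permute(1, 2, 0).squeeze().cpu(), cmap='gray')
        ax.set_title('pred {} / true {}'.format(preds[i].item(), labels[i].item()))
        ax.axis('off')
    return fig
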
Exemplo n.º 20
0
def test(config,logger):
    """
    Note: when 'config.mode' is 'test', you don't neet to set 'config.forward_only' and 'config.resume' to true
    
    test model on test set
    """
    #init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TESTING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    
    #load data
    test_dataset=RafDB(mode="test")
    test_loader=RafDBLoader(dataset=test_dataset,batch_size=config.bsz,shuffle=True)
    logger.info('create net...')
    
    #define net
    net_class=getattr(networks,config.net)
    net=net_class()
    logger.info('check and set GPU...')
    
    #check gpu
    if config.use_gpu==True and torch.cuda.is_available():
#         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#         os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device=torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count()>1:
            device_ids=[idx for idx in range(torch.cuda.device_count())]
            torch.distributed.init_process_group(backend='nccl', init_method=f'tcp://localhost:{config.localhost}', rank=0, world_size=1)
            net=torch.nn.parallel.DistributedDataParallel(net, device_ids=device_ids, find_unused_parameters=True)
    logger.info('create loss instance...')
    
    #define loss
    net_criterion=getattr(losses,config.loss)
    if config.loss_is_weighted:
        weights=torch.tensor([float(weight) for weight in config.loss_weights.split(',')])
        if config.use_gpu:
            weights=weights.to(device)
        criterion=net_criterion(weight=weights/torch.sum(weights,dim=0))
    else:
        criterion=net_criterion()
    
    #load checkpoint if needed
    start_n_iter=0
    start_epoch=0
    if len(config.ckpt_path)>0:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt=load_checkpoint(config.ckpt_path)
        start_n_iter=ckpt['n_iter']
        start_epoch=ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        logger.info(f"Epoch={start_epoch}, N_iter={start_n_iter}")
    
    #tensorboardX
    logger.info("set tensorboardX...")
    writer_dir=os.path.join(config.output_dir,'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer=SummaryWriter(writer_dir)
    
    
    logger.info(f"test on test set...")
    result=np.zeros((7,7))
    net.eval()
    with torch.no_grad():
        pbar=tqdm(enumerate(test_loader),total=len(test_loader))
        start_time=time.time()
        tot_loss=0
        TOT,TP=0,0
        for i,data in pbar:
            #prepare
            x_data,y_data=data
            if len(x_data.size())!=4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
            if len(y_data.size())>1:
                y_data=y_data.squeeze(-1)
            if config.use_gpu==True:
                x_data=x_data.to(device)
                y_data=y_data.to(device)
            prepare_time=time.time()-start_time
            #forward and predict
            pred_data=net(x_data)
            loss=criterion(pred_data,y_data)
            loss=torch.mean(loss,dim=0)
            embedding_addable=False
            if isinstance(pred_data,list):
                embedding=pred_data[1]
                pred_data=pred_data[0]
                embedding_addable=True
            pred_data=torch.argmax(pred_data,dim=1)
            result,tp=log_result(result,pred_data,y_data)
            #log
            tot_loss+=loss.item()
            TOT+=y_data.size(0)
            TP+=tp
            process_time=time.time()-start_time-prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time/(process_time+prepare_time), i, len(test_loader)))
            writer.add_scalars('loss',{'Test':loss.item()},i)
            if config.add_embedding and embedding_addable:
                add_embedding(writer,mat=embedding,metadata=y_data,label_img=x_data,global_step=i,tag="test set")
        
        #write and log
        writer.add_figure('confusion_matrix_on_test_set',figure=plot_confusion_matrix(result, classes=test_dataset.CLASSNAMES, normalize=True,title='confusion matrix on test set'),global_step=start_n_iter)
        normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
        mdv=sum([normalized_result[i][i] for i in range(7)])/7
        writer.add_scalar('mean_diagonal_value',mdv,start_n_iter)
        writer.add_scalar('accuracy',TP/TOT,start_n_iter)
        logger.info(f"loss of checkpoint in {config.ckpt_path}: {tot_loss/len(test_loader)}")
        logger.info(f"mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
        logger.info(f"accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
        logger.info(f"confusion matrix on test set: {normalized_result}")
        logger.info("exit 0.")
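
`log_result` is an external helper in this example; a plausible sketch consistent with how it is called, accumulating a true-by-predicted confusion matrix and returning the batch's count of correct predictions:

import numpy as np

def log_result(result, pred, target):
    """Accumulate a confusion matrix (rows: true class, columns: predicted
    class) and return it together with the number of correct predictions."""
    pred = pred.cpu().numpy()
    target = target.cpu().numpy()
    for t, p in zip(target, pred):
        result[int(t)][int(p)] += 1
    tp = int((pred == target).sum())
    return result, tp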
Exemplo n.º 21
class Logger(object):
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)
        self.train_stats = {}
        self.eval_stats = {}

    def tb_model_weights(self, model, step):
        layer_num = 1
        for name, param in model.named_parameters():
            if param.numel() == 1:
                self.writer.add_scalar(
                    "layer{}-{}/value".format(layer_num, name), param.max(),
                    step)
            else:
                self.writer.add_scalar(
                    "layer{}-{}/max".format(layer_num, name), param.max(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/min".format(layer_num, name), param.min(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/mean".format(layer_num, name), param.mean(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/std".format(layer_num, name), param.std(),
                    step)
                self.writer.add_histogram(
                    "layer{}-{}/param".format(layer_num, name), param, step)
                self.writer.add_histogram(
                    "layer{}-{}/grad".format(layer_num, name), param.grad,
                    step)
            layer_num += 1

    def dict_to_tb_scalar(self, scope_name, stats, step):
        for key, value in stats.items():
            self.writer.add_scalar('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_figure(self, scope_name, figures, step):
        for key, value in figures.items():
            self.writer.add_figure('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key),
                                      value,
                                      step,
                                      sample_rate=sample_rate)
            except Exception:
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
        self.dict_to_tb_scalar("TrainIterStats", stats, step)

    def tb_train_epoch_stats(self, step, stats):
        self.dict_to_tb_scalar("TrainEpochStats", stats, step)

    def tb_train_figures(self, step, figures):
        self.dict_to_tb_figure("TrainFigures", figures, step)

    def tb_train_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("TrainAudios", audios, step, sample_rate)

    def tb_eval_stats(self, step, stats):
        self.dict_to_tb_scalar("EvalStats", stats, step)

    def tb_eval_figures(self, step, figures):
        self.dict_to_tb_figure("EvalFigures", figures, step)

    def tb_eval_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate)

    def tb_test_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("TestAudios", audios, step, sample_rate)

    def tb_test_figures(self, step, figures):
        self.dict_to_tb_figure("TestFigures", figures, step)
Exemplo n.º 22
def train(config,logger):
    
    #init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TRAINING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    
    #load data
    train_dataset=RafDB(mode="train")
    val_dataset=RafDB(mode="test")
    train_loader=RafDBLoader(dataset=train_dataset,batch_size=config.bsz,shuffle=True)
    val_loader=RafDBLoader(dataset=val_dataset,batch_size=config.bsz,shuffle=True)
    logger.info('create net instance...')
    
    #define net
    net_class=getattr(networks,config.net)
    net=net_class()
    logger.info('check and set GPU...')
    
    #check gpu
    if config.use_gpu==True and torch.cuda.is_available():
#         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#         os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device=torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count()>1:
            device_ids=[idx for idx in range(torch.cuda.device_count())]
            torch.distributed.init_process_group(backend='nccl', init_method=f'tcp://localhost:{config.localhost}', rank=0, world_size=1)
            net=torch.nn.parallel.DistributedDataParallel(net, device_ids=device_ids, find_unused_parameters=True)
    logger.info('create loss instance...')
    
    #define loss
    net_criterion=getattr(losses,config.loss)
    if config.loss_is_weighted:
        weights=torch.tensor([float(weight) for weight in config.loss_weights.split(',')])
        if config.use_gpu:
            weights=weights.to(device)
        criterion=net_criterion(weight=weights/torch.sum(weights,dim=0))
    else:
        criterion=net_criterion()
    logger.info('create optimizer...')
    
    #define optimizer
    optimizer=config_optimizer(net.parameters(),config)
    logger.info("check LR scheduler...")
    
    #define lr_scheduler
    schedule_on_iter=config.schedule_on_iter
    schedule_on_epoch=config.schedule_on_epoch
    if schedule_on_iter:
        iter_scheduler=config_scheduler(optimizer,config,mode='iter')
    if schedule_on_epoch:
        epoch_scheduler=config_scheduler(optimizer,config,mode='epoch')
    
    #load checkpoint if needed
    start_n_iter=0
    start_epoch=0
    if config.resume==True:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt=load_checkpoint(config.ckpt_path)
        start_n_iter=ckpt['n_iter']
        start_epoch=ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        optimizer.load_state_dict(ckpt['optim'])
    
    #tensorboardX
    logger.info("set tensorboardX...")
    writer_dir=os.path.join(config.output_dir,'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer=SummaryWriter(writer_dir)
    
    #ckpt dir
    ckpt_dir=os.path.join(config.output_dir,'ckpt')
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    
    #start
    logger.info(f'start with epoch range of [{start_epoch}, {config.epoch})...')
    n_iter=start_n_iter
    mmdv=0
    m_n_iter=0
    if n_iter==0:
        optimizer.zero_grad()
    for epoch in range(start_epoch,config.epoch):
        if config.forward_only==False:
            net.train()
            pbar=tqdm(enumerate(train_loader),total=len(train_loader))
            start_time=time.time()
            tot_loss=0
            for i,data in pbar:
                #prepare
                x_data,y_data=data
                if len(x_data.size())!=4:
                    logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                    raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
                if len(y_data.size())>1:
                    y_data=y_data.squeeze(-1)
                if config.use_gpu==True:
                    x_data=x_data.to(device)
                    y_data=y_data.to(device)
                prepare_time=time.time()-start_time
                #forward and backward
                pred_data=net(x_data)
                loss=criterion(pred_data,y_data)
                loss=torch.mean(loss,dim=0)
                loss/=config.gd_acc
                loss.backward()
                if n_iter%config.gd_acc==config.gd_acc-1:
                    optimizer.step()
                    optimizer.zero_grad()
                if schedule_on_iter:
                    iter_scheduler.step()
                #log
                tot_loss+=loss.item()*config.gd_acc
                writer.add_scalars('loss',{'Train':loss.item()*config.gd_acc},n_iter)
                process_time=time.time()-start_time-prepare_time
                pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                    process_time/(process_time+prepare_time), epoch, config.epoch))
                if config.add_embedding and isinstance(pred_data,list):
                    add_embedding(writer,mat=pred_data[1],metadata=y_data,label_img=x_data,global_step=n_iter,tag="train set")
                n_iter+=1
                
            logger.info(f"[Epoch: {epoch}]TrainLoss:{tot_loss/len(train_loader)}")
            if schedule_on_epoch:
                epoch_scheduler.step()
            
            #val and save
            if epoch%config.save_per_epoch==config.save_per_epoch-1:
                logger.info(f"pausing to save a checkpoint and validate on the val set now that epoch[{epoch}] is complete...")
                
                result=np.zeros((7,7))
                net.eval()
                with torch.no_grad():
                    pbar=tqdm(enumerate(val_loader),total=len(val_loader))
                    start_time=time.time()
                    tot_loss=0
                    TOT,TP=0,0
                    for i,data in pbar:
                        #prepare
                        x_data,y_data=data
                        if len(x_data.size())!=4:
                            logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                            raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
                        if len(y_data.size())>1:
                            y_data=y_data.squeeze(-1)
                        if config.use_gpu==True:
                            x_data=x_data.to(device)
                            y_data=y_data.to(device)
                        prepare_time=time.time()-start_time
                        #forward and predict
                        pred_data=net(x_data)
                        loss=criterion(pred_data,y_data)
                        loss=torch.mean(loss,dim=0)
                        if isinstance(pred_data,list):
                            pred_data=pred_data[0]
                        pred_data=torch.argmax(pred_data,dim=1)
                        result,tp=log_result(result,pred_data,y_data)
                        #log
                        tot_loss+=loss.item()
                        TP+=tp
                        TOT+=y_data.size(0)
                        process_time=time.time()-start_time-prepare_time
                        pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                            process_time/(process_time+prepare_time), epoch, config.epoch))
                    writer.add_scalars('loss',{'Val':tot_loss/len(val_loader)},n_iter)
                    normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
                    tmdv=sum([normalized_result[i][i] for i in range(7)])/7
                    writer.add_scalar('mean_diagonal_value',tmdv,n_iter)
                    writer.add_figure('confusion_matrix_on_val_set',figure=plot_confusion_matrix(result, classes=val_dataset.CLASSNAMES, normalize=True,title='confusion matrix on val set'),global_step=n_iter)
                    writer.add_scalar('accuracy',TP/TOT,n_iter)
                    
                    if check_save(tmdv,mmdv):
                        mmdv=tmdv
                        if m_n_iter:
                            os.remove(os.path.join(ckpt_dir,f'ckpt-{m_n_iter}.pickle'))
                        m_n_iter=n_iter
                        save_checkpoint(os.path.join(ckpt_dir,f'ckpt-{n_iter}.pickle'),net,optimizer,epoch,n_iter)
                    
                    logger.info(f"[Epoch: {epoch}]ValLoss:{tot_loss/len(val_loader)}")
        else:
            logger.info(f"forward only! So validate on train set when epoch[{epoch}] is complete...")
            result=np.zeros((7,7))
            net.eval()
            with torch.no_grad():
                pbar=tqdm(enumerate(train_loader),total=len(train_loader))
                start_time=time.time()
                tot_loss=0
                TOT,TP=0,0
                for i,data in pbar:
                    #prepare
                    x_data,y_data=data
                    if len(x_data.size())!=4:
                        logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                        raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
                    if len(y_data.size())>1:
                        y_data=y_data.squeeze(-1)
                    if config.use_gpu==True:
                        x_data=x_data.to(device)
                        y_data=y_data.to(device)
                    prepare_time=time.time()-start_time
                    #forward and predict
                    pred_data=net(x_data)
                    loss=criterion(pred_data,y_data)
                    loss=torch.mean(loss,dim=0)
                    embedding_addable=False
                    if isinstance(pred_data,list):
                        embedding=pred_data[1]
                        pred_data=pred_data[0]
                        embedding_addable=True
                    pred_label=torch.argmax(pred_data,dim=1)
                    result,tp=log_result(result,pred_label,y_data)
                    #log
                    tot_loss+=loss.item()
                    TP+=tp
                    TOT+=y_data.size(0)
                    process_time=time.time()-start_time-prepare_time
                    pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                        process_time/(process_time+prepare_time), epoch, config.epoch))
                    writer.add_scalars('loss',{'Train':loss.item()},i)
                    if config.add_embedding and embedding_addable:
                        add_embedding(writer,mat=pred_data,metadata=y_data,label_img=x_data,global_step=i,tag="7-dim vectors on train set")
                writer.add_figure('confusion_matrix_on_train_set',figure=plot_confusion_matrix(result, classes=train_dataset.CLASSNAMES, normalize=True,title='confusion matrix on train set'),global_step=n_iter)
                normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
                writer.add_scalar('mean_diagonal_value',sum([normalized_result[i][i] for i in range(7)])/7,n_iter)
                writer.add_scalar('accuracy',TP/TOT,n_iter)
                logger.info(f"[Epoch: {epoch}]TrainLoss:{tot_loss/len(train_loader)}")
            break
    logger.info("exit 0.")
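
`check_save`, `save_checkpoint`, and `load_checkpoint` are external helpers; a sketch consistent with how they are called here (the checkpoint keys match the ones read back in the resume branch):

import torch

def check_save(current_mdv, best_mdv):
    """Save only when the mean diagonal value of the confusion matrix improves."""
    return current_mdv > best_mdv

def save_checkpoint(path, net, optimizer, epoch, n_iter):
    torch.save({'net': net.state_dict(),
                'optim': optimizer.state_dict(),
                'epoch': epoch,
                'n_iter': n_iter}, path)

def load_checkpoint(path):
    return torch.load(path, map_location='cpu')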
Exemplo n.º 23
def main(model_restart, lr, hidden_layers, epochs):
    # Load params
    if model_restart:
        with open(op.join("models", model_restart+'.json'), "r") as f:
            params = json.load(f)
    else:
        params = dict(
            hidden_layers=[4, 4],
            epochs=100000,
            batch_size=500,
            lr=6e-4
        )
    if lr:
        params['lr'] = lr
    if epochs:
        params['epochs'] = epochs
    if hidden_layers:
        hidden_layers = hidden_layers[1:-1] if hidden_layers[0] == '[' else hidden_layers
        # write the parsed list back into params (it was parsed but never used)
        params['hidden_layers'] = [int(l) for l in hidden_layers.split(',')]

    # Load data
    loader, dataset, inp, out, msk, inp_mean, out_mean, inp_std, out_std, e231 = load_data("e231.csv", params)
    writer = SummaryWriter()

    # Build model
    model = build_model(params["hidden_layers"])
    if model_restart:
        model.load_state_dict(torch.load(op.join("models", model_restart + ".mdl")))

    # model = nn.Linear(2,3)
    opt = torch.optim.Adam(model.parameters(),
                        lr=params['lr'],
                        weight_decay=0.001,
                        betas=(0.9, 0.999), 
                        eps=1e-08
    )
                        # momentum=0.9,
                        # dampening=0.,
                        # nesterov=True)

    # Train
    model.train()
    # plt.plot(e231.alpha, e231.Cl, '+-')
    f, a = plt.subplots()
    a.plot(inp.data[:,0], model(inp)[:,0].data, '+')
    a.plot(inp.data[:,0], out[:,0].data, '+')
    f.savefig("tmp.png")
    f_, a_ = plt.subplots()
    LL = []

    try:
        for e in range(params["epochs"]):
            L = 0
            ite = 0
            for i, o, m in loader:
                loss = (((model(i) - o)*m)**2).sum()
                opt.zero_grad()
                loss.backward()
                opt.step()
                L += loss.data
                ite += i.shape[0]
            LL.append(L/ite)

            if e%10 ==0:
                writer.add_scalar('loss',L/ite, e)

            if e%100 == 0:
                a.clear()
                a.plot(inp.data[:,0], model(inp)[:,0].data, '+')
                a.plot(inp.data[:,0], out[:,0].data, '+')
                writer.add_figure("data_fit",f)
                # f.savefig("tmp.png")
                # a_.clear()
                # a_.loglog(LL)
                # f_.savefig("learning.png")
                # writer.add_figure(f_)

    except KeyboardInterrupt:
        pass

    # Save model
    model_file_name = os.path.join("models", os.path.basename(writer.logdir)+ ".mdl")
    torch.save(model.state_dict(), model_file_name)
    with open(model_file_name[:-4]+'.json', "w") as f:
        params["epochs"] = e
        json.dump(params, f)
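
`build_model` is defined elsewhere; a minimal sketch consistent with `params['hidden_layers']` and the commented-out `nn.Linear(2, 3)` baseline. The input/output widths and the `Tanh` nonlinearity are assumptions:

import torch.nn as nn

def build_model(hidden_layers, n_in=2, n_out=3):
    """Fully connected regressor with the given hidden-layer widths."""
    sizes = [n_in] + list(hidden_layers) + [n_out]
    layers = []
    for a, b in zip(sizes[:-1], sizes[1:-1]):
        layers += [nn.Linear(a, b), nn.Tanh()]
    layers.append(nn.Linear(sizes[-2], sizes[-1]))
    return nn.Sequential(*layers)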
Exemplo n.º 24
def svm_fit_and_test(config,logger):
    """
    Note: only when config.mode=='test' and config.classifier=='svm'
    """
    #init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TESTING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    
    #load data
    train_dataset=RafDB(mode="train")
    train_loader=RafDBLoader(dataset=train_dataset,batch_size=config.bsz,shuffle=True)
    test_dataset=RafDB(mode="test")
    test_loader=RafDBLoader(dataset=test_dataset,batch_size=config.bsz,shuffle=True)
    logger.info('create net...')
    
    #define net
    net_class=getattr(networks,config.net)
    net=net_class()
    logger.info('check and set GPU...')
    
    #check gpu
    if config.use_gpu==True and torch.cuda.is_available():
#         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#         os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device=torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count()>1:
            device_ids=[idx for idx in range(torch.cuda.device_count())]
            torch.distributed.init_process_group(backend='nccl', init_method=f'tcp://localhost:{config.localhost}', rank=0, world_size=1)
            net=torch.nn.parallel.DistributedDataParallel(net, device_ids=device_ids, find_unused_parameters=True)
    logger.info('create loss instance...')
    
    #load checkpoint if needed
    start_n_iter=0
    start_epoch=0
    if len(config.ckpt_path)>0:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt=load_checkpoint(config.ckpt_path)
        start_n_iter=ckpt['n_iter']
        start_epoch=ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        logger.info(f"Epoch={start_epoch}, N_iter={start_n_iter}")
    
    #tensorboardX
    logger.info("set tensorboardX...")
    writer_dir=os.path.join(config.output_dir,'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer=SummaryWriter(writer_dir)
    
    #train svm
    logger.info(f"extract features on train set...")
    net.eval()
    with torch.no_grad():
        pbar=tqdm(enumerate(train_loader),total=len(train_loader))
        start_time=time.time()
        feats,labels=[],[]
        for i,data in pbar:
            #prepare
            x_data,y_data=data
            if len(x_data.size())!=4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
            if len(y_data.size())>1:
                y_data=y_data.squeeze(-1)
            if config.use_gpu==True:
                x_data=x_data.to(device)
            prepare_time=time.time()-start_time
            #forward and predict
            pred_data=net(x_data)
            embedding_addable=False
            if isinstance(pred_data,list):
                embedding=pred_data[1].detach().cpu()
                pred_data=pred_data[0].detach().cpu()
                embedding_addable=True
                feats+=[embedding]
                labels+=[y_data]
            process_time=time.time()-start_time-prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time/(process_time+prepare_time), i, len(train_loader)))
            
            if config.add_embedding and embedding_addable:
                add_embedding(writer,mat=embedding,metadata=y_data,label_img=x_data,global_step=i,tag="train set")
        
        logger.info("create svm...")
        svmX=torch.cat(feats,dim=0).numpy()
        svmY=torch.cat(labels,dim=0).numpy()
        sample_weight=class_weight=None
        if config.loss_is_weighted:
            weights=np.array([float(weight) for weight in config.loss_weights.split(',')])
            weights=weights/sum(weights)
            class_weight=dict(enumerate(weights))
            sample_weight=np.array([class_weight[c] for c in svmY])
        msvm_class=getattr(sklearn.svm,config.classifier)
        if config.classifier!='LinearSVC':
            msvm=msvm_class(class_weight=class_weight,decision_function_shape='ovo')
        else:
            msvm=msvm_class(class_weight=class_weight)
            
        logger.info(f"fit svm on train set...")
        msvm.fit(svmX,svmY,sample_weight=sample_weight)
        
        logger.info("predict on train set...")
        result=np.zeros((7,7))
        TOT=svmY.shape[0]
        if config.classifier!='LinearSVC':
            msvm.decision_function_shape="ovr"
        pred_data=msvm.decision_function(svmX)
        pred_data=torch.argmax(torch.from_numpy(pred_data),dim=1)
        result,TP=log_result(result,pred_data,torch.from_numpy(svmY))
        
        #write and log
        writer.add_figure('confusion_matrix_on_train_set',figure=plot_confusion_matrix(result, classes=train_dataset.CLASSNAMES, normalize=True,title='confusion matrix on train set'),global_step=start_n_iter)
        normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
        mdv=sum([normalized_result[i][i] for i in range(7)])/7
        writer.add_scalar('mean_diagonal_value_on_train_set',mdv,start_n_iter)
        writer.add_scalar('accuracy_on_train_set',TP/TOT,start_n_iter)
        logger.info(f"train mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
        logger.info(f"train accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
        logger.info(f"confusion matrix on train set: {normalized_result}")
        
        #predict on test set
        logger.info("extract features on test set...")
        pbar=tqdm(enumerate(test_loader),total=len(test_loader))
        start_time=time.time()
        feats,labels=[],[]
        for i,data in pbar:
            #prepare
            x_data,y_data=data
            if len(x_data.size())!=4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                raise ValueError(f"x_data size wrong! Now the size is {x_data.size()}")
            if len(y_data.size())>1:
                y_data=y_data.squeeze(-1)
            if config.use_gpu==True:
                x_data=x_data.to(device)
            prepare_time=time.time()-start_time
            #forward and predict
            pred_data=net(x_data)
            embedding_addable=False
            if isinstance(pred_data,list):
                embedding=pred_data[1].detach().cpu()
                pred_data=pred_data[0].detach().cpu()
                embedding_addable=True
                feats+=[embedding]
                labels+=[y_data]
            process_time=time.time()-start_time-prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time/(process_time+prepare_time), i, len(test_loader)))
            
            if config.add_embedding and embedding_addable:
                add_embedding(writer,mat=embedding,metadata=y_data,label_img=x_data,global_step=i,tag="test set")
        
        logger.info("predict on test set...")
        svmX=torch.cat(feats,dim=0).numpy()
        svmY=torch.cat(labels,dim=0).numpy()
        result=np.zeros((7,7))
        TOT=svmY.shape[0]
        pred_data=msvm.decision_function(svmX)
        pred_data=torch.argmax(torch.from_numpy(pred_data),dim=1)
        result,TP=log_result(result,pred_data,torch.from_numpy(svmY))
        
        #write and log
        writer.add_figure('confusion_matrix_on_test_set',figure=plot_confusion_matrix(result, classes=test_dataset.CLASSNAMES, normalize=True,title='confusion matrix on test set'),global_step=start_n_iter)
        normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
        mdv=sum([normalized_result[i][i] for i in range(7)])/7
        writer.add_scalar('mean_diagonal_value_on_test_set',mdv,start_n_iter)
        writer.add_scalar('accuracy_on_test_set',TP/TOT,start_n_iter)
        logger.info(f"test mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
        logger.info(f"test accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
        logger.info(f"confusion matrix on test set: {normalized_result}")
        logger.info("exit 0.")
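
One detail worth calling out: the SVC is fit with `decision_function_shape='ovo'` and then switched to `'ovr'` before prediction, so `decision_function` returns an [n_samples, n_classes] score matrix whose argmax is a class index. A standalone illustration on toy data (not the RAF-DB features):

import numpy as np
from sklearn.svm import SVC

X = np.random.randn(60, 5)
y = np.arange(60) % 7                  # make sure all 7 classes are present

clf = SVC(decision_function_shape='ovo')
clf.fit(X, y)
clf.decision_function_shape = 'ovr'    # scores become [n_samples, n_classes]
scores = clf.decision_function(X)
pred = scores.argmax(axis=1)
print(scores.shape)                    # (60, 7)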
Exemplo n.º 25
                num_iters += 1

        epoch_loss = running_loss / len(dataloader[stage].dataset)
        epoch_acc = running_corrects.double() / len(dataloader[stage].dataset)
        writer.add_scalar('{}/epoch_loss'.format(stage), epoch_loss, ep + 1)
        writer.add_scalar('{}/epoch_acc'.format(stage), epoch_acc, ep + 1)

        print('{} Loss: {:.4f}, acc: {:.4f}'.format(stage, epoch_loss,
                                                    epoch_acc))

        if stage == 'test' and epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())

if args.lr_range_test:
    fig = plt.figure(figsize=(12, 10))
    plt.plot(lr_history, step_acc_history)
    writer.add_figure('train/clr', fig)

time_elapsed = time.time() - since
print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
    time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))

with open(
        os.path.join(args.summary_dir,
                     'model_bestValACC_{:.3f}.pkl'.format(best_acc)),
        'wb') as f:
    pkl.dump(best_model_wts, f)
Exemplo n.º 26
def train(resume=False):

    writer = SummaryWriter('../runs/' + hparams.exp_name)

    for k in hparams.__dict__.keys():
        writer.add_text(str(k), str(hparams.__dict__[k]))

    train_dataset = AudioData(
        data_csv=hparams.train_csv,
        data_file=hparams.dev_file,
        ds_type='train',  # augment=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]))

    validation_dataset = AudioData(data_csv=hparams.valid_csv,
                                   data_file=hparams.dev_file,
                                   ds_type='valid',
                                   augment=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                   ]))

    # train_sampler = WeightedRandomSampler()

    train_loader = DataLoader(train_dataset,
                              batch_size=hparams.batch_size,
                              shuffle=True,
                              num_workers=2)

    validation_loader = DataLoader(validation_dataset,
                                   batch_size=hparams.batch_size,
                                   shuffle=True,
                                   num_workers=2)

    print('loaded train data of length : {}'.format(len(train_dataset)))

    adversarial_loss = torch.nn.CrossEntropyLoss().to(hparams.gpu_device)
    discriminator = Discriminator().to(hparams.gpu_device)

    if hparams.cuda:
        discriminator = nn.DataParallel(discriminator,
                                        device_ids=hparams.device_ids)

    params_count = 0
    for param in discriminator.parameters():
        params_count += np.prod(param.size())
    print('Model has {0} trainable parameters'.format(params_count))

    if not hparams.pretrained:
        discriminator.apply(weights_init_normal)

    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=hparams.learning_rate)

    scheduler_D = ReduceLROnPlateau(optimizer_D,
                                    mode='min',
                                    factor=0.3,
                                    patience=4,
                                    verbose=True,
                                    cooldown=0)

    Tensor = torch.cuda.FloatTensor if hparams.cuda else torch.FloatTensor

    def validation(discriminator, send_stats=False, epoch=0):
        print('Validating model on {0} examples. '.format(
            len(validation_dataset)))
        discriminator_ = discriminator.eval()

        with torch.no_grad():
            pred_logits_list = []
            labels_list = []

            for (inp, labels, imgs_names) in tqdm(validation_loader):
                inp = inp.float()          # Variable wrappers are no-ops since PyTorch 0.4
                labels = labels.long()

                if hparams.dim3:
                    inp = inp.view(-1, 1, 640, 64)
                    inp = torch.cat([inp] * 3, dim=1)

                inp = inp.to(hparams.gpu_device)
                labels = labels.to(hparams.gpu_device)

                pred_logits = discriminator_(inp)

                pred_logits_list.append(pred_logits)
                labels_list.append(labels)

            pred_logits = torch.cat(pred_logits_list, dim=0)
            labels = torch.cat(labels_list, dim=0)

            val_loss = adversarial_loss(pred_logits, labels)

        return accuracy_metrics(
            labels.long(), pred_logits
        ), val_loss  #, plot_auc='train_val_'+str(epoch+1), plot_path=hparams.result_dir+'train_val_{}_'.format(epoch)), val_loss

    print('Starting training.. (log saved in:{})'.format(hparams.exp_name))
    start_time = time.time()
    best_valid_acc = 0

    # print(model)
    for epoch in range(hparams.num_epochs):
        train_logits = []
        train_labels = []
        for batch, (inp, labels, imgs_name) in enumerate(tqdm(train_loader)):

            inp = inp.float()              # Variable wrappers are no-ops since PyTorch 0.4
            labels = labels.long()

            inp = inp.to(hparams.gpu_device)
            labels = labels.to(hparams.gpu_device)

            if hparams.dim3:
                inp = inp.view(-1, 1, 640, 64)
                inp = torch.cat([inp] * 3, dim=1)

            # ---------------------
            #  Train Discriminator
            # ---------------------
            optimizer_D.zero_grad()

            pred_logits = discriminator(inp)
            train_logits.append(pred_logits)
            train_labels.append(labels)

            d_loss = adversarial_loss(pred_logits, labels)

            d_loss.backward()
            optimizer_D.step()

            writer.add_scalar('d_loss',
                              d_loss.item(),
                              global_step=batch + epoch * len(train_loader))

            # if batch % hparams.print_interval == 0:
            #     pred_labels = (pred_logits >= hparams.thresh)
            #     pred_labels = pred_labels.float()
            #     auc, f1, acc, _, _ = accuracy_metrics(pred_labels, labels.long(), pred_logits)
            #     print('[Epoch - {0:.1f}, batch - {1:.3f}, d_loss - {2:.6f}, acc - {3:.4f}, f1 - {4:.5f}, auc - {5:.4f}]'.\
            #     format(1.0*epoch, 100.0*batch/len(train_loader), d_loss.item(), acc['avg'], f1[hparams.avg_mode], auc[hparams.avg_mode]))

        (val_auc, val_f1, val_acc,
         val_conf_mat), val_loss = validation(discriminator, epoch=epoch)

        train_logits = torch.cat(train_logits, dim=0)
        train_labels = torch.cat(train_labels, dim=0)

        train_auc, train_f1, train_acc, train_conf_mat = accuracy_metrics(
            train_labels.long(), train_logits)

        fig = plot_cf(val_conf_mat)
        writer.add_figure('val_conf', fig, global_step=epoch)
        plt.close(fig)
        for lbl in range(hparams.num_classes):
            writer.add_scalar('val_f1_{}'.format(hparams.id_to_class[lbl]),
                              val_f1[lbl],
                              global_step=epoch)
            writer.add_scalar('val_auc_{}'.format(hparams.id_to_class[lbl]),
                              val_auc[lbl],
                              global_step=epoch)
            writer.add_scalar('val_acc_{}'.format(hparams.id_to_class[lbl]),
                              val_acc[lbl],
                              global_step=epoch)
        writer.add_scalar('val_f1_{}'.format('micro'),
                          val_f1['micro'],
                          global_step=epoch)
        writer.add_scalar('val_auc_{}'.format('micro'),
                          val_auc['micro'],
                          global_step=epoch)
        writer.add_scalar('val_f1_{}'.format('macro'),
                          val_f1['macro'],
                          global_step=epoch)
        writer.add_scalar('val_auc_{}'.format('macro'),
                          val_auc['macro'],
                          global_step=epoch)
        writer.add_scalar('val_loss', val_loss, global_step=epoch)
        writer.add_scalar('val_f1',
                          val_f1[hparams.avg_mode],
                          global_step=epoch)
        writer.add_scalar('val_auc',
                          val_auc[hparams.avg_mode],
                          global_step=epoch)
        writer.add_scalar('val_acc', val_acc['avg'], global_step=epoch)
        scheduler_D.step(val_loss)
        writer.add_scalar('learning_rate',
                          optimizer_D.param_groups[0]['lr'],
                          global_step=epoch)

        # torch.save({
        #     'epoch': epoch,
        #     'discriminator_state_dict': discriminator.state_dict(),
        #     'optimizer_D_state_dict': optimizer_D.state_dict(),
        #     }, hparams.model+'.'+str(epoch))
        if best_valid_acc <= val_acc['avg']:
            best_valid_acc = val_acc['avg']
            fig = plot_cf(val_conf_mat)
            writer.add_figure('best_val_conf', fig, global_step=epoch)
            plt.close(fig)
            torch.save(
                {
                    'epoch': epoch,
                    'discriminator_state_dict': discriminator.state_dict(),
                    'optimizer_D_state_dict': optimizer_D.state_dict(),
                }, hparams.model + '.best')
            print('best model on validation set saved.')

        print('[Epoch - {0:.1f} ---> train_acc - {1:.4f}, current_lr - {2:.6f}, val_loss - {3:.4f}, best_val_acc - {4:.4f}, val_acc - {5:.4f}, val_f1 - {6:.4f}] - time - {7:.1f}'\
            .format(1.0*epoch, train_acc['avg'], optimizer_D.param_groups[0]['lr'], val_loss, best_valid_acc, val_acc['avg'], val_f1[hparams.avg_mode], time.time()-start_time))
        start_time = time.time()
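
`plot_cf` is assumed by this example; a simple sketch that renders a confusion matrix as a matplotlib figure suitable for `add_figure` (the colormap and cell annotations are choices, not the original implementation):

import matplotlib.pyplot as plt
import numpy as np

def plot_cf(conf_mat, class_names=None):
    """Render a confusion matrix as a figure with per-cell counts."""
    n = conf_mat.shape[0]
    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(conf_mat, cmap='Blues')
    fig.colorbar(im, ax=ax)
    ticks = class_names if class_names is not None else list(range(n))
    ax.set_xticks(np.arange(n)); ax.set_xticklabels(ticks)
    ax.set_yticks(np.arange(n)); ax.set_yticklabels(ticks)
    ax.set_xlabel('predicted'); ax.set_ylabel('true')
    for i in range(n):
        for j in range(n):
            ax.text(j, i, format(conf_mat[i, j], '.0f'), ha='center', va='center')
    return fig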
Exemplo n.º 27
def lr_search(train_eval_data):
    train_eval_dataset = TrainEvalDataset(train_eval_data, transform=train_transform)
    train_eval_data_loader = torch.utils.data.DataLoader(
        train_eval_dataset,
        batch_size=config.batch_size,
        drop_last=True,
        shuffle=True,
        num_workers=args.workers,
        worker_init_fn=worker_init_fn)

    min_lr = 1e-7
    max_lr = 10.
    gamma = (max_lr / min_lr)**(1 / len(train_eval_data_loader))

    lrs = []
    losses = []
    lim = None

    model = Model(config.model, NUM_CLASSES)
    model = model.to(DEVICE)

    optimizer = build_optimizer(config.opt, model.parameters())
    for param_group in optimizer.param_groups:
        param_group['lr'] = min_lr
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)

    optimizer.train()  # assumes build_optimizer returns a wrapper exposing train(); stock torch.optim optimizers do not
    update_transforms(1.)
    model.train()
    optimizer.zero_grad()
    for i, (images, feats, _, labels, real, _) in enumerate(tqdm(train_eval_data_loader, desc='lr search'), 1):
        images, feats, labels, real = images.to(DEVICE), feats.to(DEVICE), labels.to(DEVICE), real.to(DEVICE)
        logits = model(images, feats, labels)

        loss = compute_loss(input=logits, target=labels, real=real)

        lrs.append(np.squeeze(scheduler.get_lr()))
        losses.append(loss.data.cpu().numpy().mean())

        if lim is None:
            lim = losses[0] * 1.1

        if lim < losses[-1]:
            break

        (loss.mean() / config.opt.acc_steps).backward()

        if i % config.opt.acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        scheduler.step()

    writer = SummaryWriter(os.path.join(args.experiment_path, 'lr_search'))

    with torch.no_grad():
        losses = np.clip(losses, 0, lim)
        minima_loss = losses[np.argmin(utils.smooth(losses))]
        minima_lr = lrs[np.argmin(utils.smooth(losses))]

        step = 0
        for loss, loss_sm in zip(losses, utils.smooth(losses)):
            writer.add_scalar('search_loss', loss, global_step=step)
            writer.add_scalar('search_loss_sm', loss_sm, global_step=step)
            step += config.batch_size

        fig = plt.figure()
        plt.plot(lrs, losses)
        plt.plot(lrs, utils.smooth(losses))
        plt.axvline(minima_lr)
        plt.xscale('log')
        plt.title('loss: {:.8f}, lr: {:.8f}'.format(minima_loss, minima_lr))
        writer.add_figure('search', fig, global_step=0)

        return minima_lr
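
`utils.smooth` is not shown; a plausible sketch using a bias-corrected exponential moving average, the usual choice for locating the minimum of an LR-range-test curve (`beta=0.9` is an assumption):

import numpy as np

def smooth(xs, beta=0.9):
    """Bias-corrected exponential moving average over a 1-D sequence."""
    xs = np.asarray(xs, dtype=np.float64)
    out = np.empty_like(xs)
    avg = 0.0
    for i, x in enumerate(xs):
        avg = beta * avg + (1.0 - beta) * x
        out[i] = avg / (1.0 - beta ** (i + 1))   # correct the startup bias
    return out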
Exemplo n.º 28
class Writer:
    _STDOUT = sys.stdout
    _STDERR = sys.stderr

    def __init__(self, logdir, make_subdir, tag_group):
        if make_subdir:
            os.makedirs(logdir, exist_ok=True)

            timestamp = f"{datetime.datetime.now().strftime('%b%d_%H-%M-%S')}"
            logdir = os.path.join(logdir, timestamp)

        self._writer = SummaryWriter(logdir=logdir)

        assert logdir == self._writer.logdir
        self._logdir = logdir

        self._tag_group = tag_group

        LINE_BUFFERING = 1

        sys.stdout = Tee(primary_file=self._STDOUT,
                         secondary_file=open(os.path.join(logdir, "stdout"),
                                             "a",
                                             buffering=LINE_BUFFERING))

        sys.stderr = Tee(primary_file=self._STDERR,
                         secondary_file=open(os.path.join(logdir, "stderr"),
                                             "a",
                                             buffering=LINE_BUFFERING))

    def write_scalar(self, tag, scalar_value, global_step=None):
        self._writer.add_scalar(self._tag(tag),
                                scalar_value,
                                global_step=global_step)

    def write_image(self, tag, img_tensor, global_step=None):
        self._writer.add_image(self._tag(tag),
                               img_tensor,
                               global_step=global_step)

    def write_figure(self, tag, figure, global_step=None):
        self._writer.add_figure(self._tag(tag),
                                figure,
                                global_step=global_step)

    def write_hparams(self, hparam_dict=None, metric_dict=None):
        self._writer.add_hparams(hparam_dict=hparam_dict,
                                 metric_dict=metric_dict)

    def write_json(self, tag, data):
        text = json.dumps(data, indent=4)

        self._writer.add_text(
            self._tag(tag),
            4 * " " +
            text.replace("\n", "\n" +
                         4 * " ")  # Indent by 4 to ensure codeblock formatting
        )

        json_path = os.path.join(self._logdir, f"{tag}.json")

        with open(json_path, "w") as f:
            f.write(text)

    def write_textfile(self, tag, text):
        path = os.path.join(self._logdir, f"{tag}.txt")
        with open(path, "w") as f:
            f.write(text)

    def write_checkpoint(self, tag, data):
        os.makedirs(self._checkpoints_dir, exist_ok=True)
        checkpoint_path = self._checkpoint_path(tag)

        tmp_checkpoint_path = os.path.join(
            os.path.dirname(checkpoint_path),
            f"{os.path.basename(checkpoint_path)}.tmp")

        torch.save(data, tmp_checkpoint_path)
        # replace is atomic, so we guarantee our checkpoints are always good
        os.replace(tmp_checkpoint_path, checkpoint_path)

    def load_checkpoint(self, tag, device):
        return torch.load(self._checkpoint_path(tag), map_location=device)

    def _checkpoint_path(self, tag):
        return os.path.join(self._checkpoints_dir, f"{tag}.pt")

    @property
    def _checkpoints_dir(self):
        return os.path.join(self._logdir, "checkpoints")

    def _tag(self, tag):
        return f"{self._tag_group}/{tag}"
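
`Tee` is assumed by the constructor above; a minimal sketch that duplicates every write to both streams, so console output is also captured in the log directory:

class Tee:
    """File-like object that mirrors writes to a primary and a secondary stream."""

    def __init__(self, primary_file, secondary_file):
        self.primary_file = primary_file
        self.secondary_file = secondary_file

    def write(self, data):
        self.primary_file.write(data)
        self.secondary_file.write(data)

    def flush(self):
        self.primary_file.flush()
        self.secondary_file.flush()

    def isatty(self):
        return self.primary_file.isatty()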
Exemplo n.º 29
def run(seed):

    assert torch.cuda.is_available()
    device = torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create training data.
    data_transform = tvtransforms.Compose(
        [tvtransforms.ToTensor(),
         tvtransforms.Lambda(torch.bernoulli)])

    if args.dataset_name == 'mnist':
        dataset = datasets.MNIST(root=os.path.join(utils.get_data_root(),
                                                   'mnist'),
                                 train=True,
                                 download=True,
                                 transform=data_transform)
        test_dataset = datasets.MNIST(root=os.path.join(
            utils.get_data_root(), 'mnist'),
                                      train=False,
                                      download=True,
                                      transform=data_transform)
    elif args.dataset_name == 'fashion-mnist':
        dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                        train=True,
                                        download=True,
                                        transform=data_transform)
        test_dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                             train=False,
                                             download=True,
                                             transform=data_transform)
    elif args.dataset_name == 'omniglot':
        dataset = data_.OmniglotDataset(split='train',
                                        transform=data_transform)
        test_dataset = data_.OmniglotDataset(split='test',
                                             transform=data_transform)
    elif args.dataset_name == 'emnist':
        rotate = partial(tvF.rotate, angle=-90)
        hflip = tvF.hflip
        data_transform = tvtransforms.Compose([
            tvtransforms.Lambda(rotate),
            tvtransforms.Lambda(hflip),
            tvtransforms.ToTensor(),
            tvtransforms.Lambda(torch.bernoulli)
        ])
        dataset = datasets.EMNIST(root=os.path.join(utils.get_data_root(),
                                                    'emnist'),
                                  split='letters',
                                  train=True,
                                  transform=data_transform,
                                  download=True)
        test_dataset = datasets.EMNIST(root=os.path.join(
            utils.get_data_root(), 'emnist'),
                                       split='letters',
                                       train=False,
                                       transform=data_transform,
                                       download=True)
    else:
        raise ValueError

    if args.dataset_name == 'omniglot':
        split = -1345
    elif args.dataset_name == 'emnist':
        split = -20000
    else:
        split = -10000
    indices = np.arange(len(dataset))
    np.random.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)
    train_loader = data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=4 if args.dataset_name == 'emnist' else 0)
    train_generator = data_.batch_generator(train_loader)
    val_loader = data.DataLoader(dataset=dataset,
                                 batch_size=1024,
                                 sampler=val_sampler,
                                 shuffle=False,
                                 drop_last=False)
    val_batch = next(iter(val_loader))[0]
    test_loader = data.DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False,
        drop_last=False,
    )

    def create_linear_transform():
        if args.linear_type == 'lu':
            return transforms.CompositeTransform([
                transforms.RandomPermutation(args.latent_features),
                transforms.LULinear(args.latent_features, identity_init=True)
            ])
        elif args.linear_type == 'svd':
            return transforms.SVDLinear(args.latent_features,
                                        num_householder=4,
                                        identity_init=True)
        elif args.linear_type == 'perm':
            return transforms.RandomPermutation(args.latent_features)
        else:
            raise ValueError

    def create_base_transform(i, context_features=None):
        if args.prior_type == 'affine-coupling':
            return transforms.AffineCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm))
        elif args.prior_type == 'rq-coupling':
            return transforms.PiecewiseRationalQuadraticCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm),
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                apply_unconditional_transform=args.
                apply_unconditional_transform,
            )
        elif args.prior_type == 'affine-autoregressive':
            return transforms.MaskedAffineAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        elif args.prior_type == 'rq-autoregressive':
            return transforms.MaskedPiecewiseRationalQuadraticAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        else:
            raise ValueError

    # ---------------
    # prior
    # ---------------
    def create_prior():
        if args.prior_type == 'standard-normal':
            prior = distributions_.StandardNormal((args.latent_features, ))

        else:
            distribution = distributions_.StandardNormal(
                (args.latent_features, ))
            transform = transforms.CompositeTransform([
                transforms.CompositeTransform(
                    [create_linear_transform(),
                     create_base_transform(i)])
                for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            prior = flows.Flow(transform, distribution)

        return prior

    # ---------------
    # inputs encoder
    # ---------------
    def create_inputs_encoder():
        if args.approximate_posterior_type == 'diagonal-normal':
            inputs_encoder = None
        else:
            inputs_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
        return inputs_encoder

    # ---------------
    # approximate posterior
    # ---------------
    def create_approximate_posterior():
        if args.approximate_posterior_type == 'diagonal-normal':
            context_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
            approximate_posterior = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)

        else:
            context_encoder = nn.Linear(args.context_features,
                                        2 * args.latent_features)
            distribution = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)

            transform = transforms.CompositeTransform([
                transforms.CompositeTransform([
                    create_linear_transform(),
                    create_base_transform(
                        i, context_features=args.context_features)
                ]) for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            approximate_posterior = flows.Flow(
                transforms.InverseTransform(transform), distribution)

        return approximate_posterior

    # ---------------
    # likelihood
    # ---------------
    def create_likelihood():
        latent_decoder = nn_.ConvDecoder(
            latent_features=args.latent_features,
            channels_multiplier=16,
            dropout_probability=args.dropout_probability_encoder_decoder)

        likelihood = distributions_.ConditionalIndependentBernoulli(
            shape=[1, 28, 28], context_encoder=latent_decoder)

        return likelihood

    prior = create_prior()
    approximate_posterior = create_approximate_posterior()
    likelihood = create_likelihood()
    inputs_encoder = create_inputs_encoder()

    model = vae.VariationalAutoencoder(
        prior=prior,
        approximate_posterior=approximate_posterior,
        likelihood=likelihood,
        inputs_encoder=inputs_encoder)

    n_params = utils.get_num_parameters(model)
    print('There are {} trainable parameters in this model.'.format(n_params))

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=args.num_training_steps, eta_min=0)
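    # (Added note.) CosineAnnealingLR decays the rate as
    #   lr(t) = eta_min + (lr_0 - eta_min) * (1 + cos(pi * t / T_max)) / 2,
    # so with eta_min=0 it falls smoothly from args.learning_rate to 0 over
    # args.num_training_steps steps.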

    def get_kl_multiplier(step):
        if args.kl_multiplier_schedule == 'constant':
            return args.kl_multiplier_initial
        elif args.kl_multiplier_schedule == 'linear':
            multiplier = min(
                step / (args.num_training_steps * args.kl_warmup_fraction), 1.)
            return args.kl_multiplier_initial * (1. + multiplier)
        else:
            raise ValueError('unknown KL multiplier schedule')
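    # Worked example (my addition, with assumed values kl_multiplier_initial=0.5,
    # num_training_steps=100000, kl_warmup_fraction=0.1): the linear schedule gives
    #   step 0        -> 0.5 * (1 + 0.0) = 0.5
    #   step 5000     -> 0.5 * (1 + 0.5) = 0.75
    #   step >= 10000 -> 0.5 * (1 + 1.0) = 1.0 (capped by min(..., 1.))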

    # create summary writer and write to log directory
    timestamp = cutils.get_timestamp()
    if cutils.on_cluster():
        timestamp += '||{}'.format(os.environ['SLURM_JOB_ID'])
    log_dir = os.path.join(cutils.get_log_root(), args.dataset_name, timestamp)
    while True:
        try:
            writer = SummaryWriter(log_dir=log_dir, max_queue=20)
            break
        except FileExistsError:
            sleep(5)
    filename = os.path.join(log_dir, 'config.json')
    with open(filename, 'w') as file:
        json.dump(vars(args), file)

    best_val_elbo = -np.inf
    tbar = tqdm(range(args.num_training_steps))
    for step in tbar:
        model.train()
        optimizer.zero_grad()

        batch = next(train_generator)[0].to(device)
        elbo = model.stochastic_elbo(batch,
                                     kl_multiplier=get_kl_multiplier(step))
        loss = -torch.mean(elbo)
        loss.backward()
        optimizer.step()
        scheduler.step(step)

        if (step + 1) % args.monitor_interval == 0:
            model.eval()
            with torch.no_grad():
                elbo = model.stochastic_elbo(val_batch.to(device))
                mean_val_elbo = elbo.mean()

            if mean_val_elbo > best_val_elbo:
                best_val_elbo = mean_val_elbo
                path = os.path.join(
                    cutils.get_checkpoint_root(),
                    '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
                torch.save(model.state_dict(), path)

            writer.add_scalar(tag='val-elbo',
                              scalar_value=mean_val_elbo,
                              global_step=step)

            writer.add_scalar(tag='best-val-elbo',
                              scalar_value=best_val_elbo,
                              global_step=step)

            with torch.no_grad():
                samples = model.sample(64)
            fig, ax = plt.subplots(figsize=(10, 10))
            cutils.gridimshow(make_grid(samples.view(64, 1, 28, 28), nrow=8),
                              ax)
            writer.add_figure(tag='vae-samples', figure=fig, global_step=step)
            plt.close()

    # load best val model
    path = os.path.join(
        cutils.get_checkpoint_root(),
        '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
    model.load_state_dict(torch.load(path))
    model.eval()

    np.random.seed(5)
    torch.manual_seed(5)

    # compute elbo on test set
    with torch.no_grad():
        elbo = torch.Tensor([])
        log_prob_lower_bound = torch.Tensor([])
        for batch in tqdm(test_loader):
            elbo_ = model.stochastic_elbo(batch[0].to(device))
            elbo = torch.cat([elbo, elbo_])
            log_prob_lower_bound_ = model.log_prob_lower_bound(
                batch[0].to(device), num_samples=1000)
            log_prob_lower_bound = torch.cat(
                [log_prob_lower_bound, log_prob_lower_bound_])
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-elbo.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(elbo))
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-log-prob-lower-bound.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(log_prob_lower_bound))

    # save elbo and log prob lower bound
    mean_elbo = elbo.mean()
    std_elbo = elbo.std()
    mean_log_prob_lower_bound = log_prob_lower_bound.mean()
    std_log_prob_lower_bound = log_prob_lower_bound.std()
    s = 'ELBO: {:.2f} +- {:.2f}, LOG PROB LOWER BOUND: {:.2f} +- {:.2f}'.format(
        mean_elbo.item(), 2 * std_elbo.item() / np.sqrt(len(test_dataset)),
        mean_log_prob_lower_bound.item(),
        2 * std_log_prob_lower_bound.item() / np.sqrt(len(test_dataset)))
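    # (Added note.) 2 * std / sqrt(N) is roughly a 95% confidence half-width
    # for the mean under a normal approximation to the sampling distribution.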
    filename = os.path.join(log_dir, 'test-results.txt')
    with open(filename, 'w') as file:
        file.write(s)
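
    # Minimal sketch (my addition, not part of the original script): the
    # importance-weighted bound reported above can be written generically in
    # plain torch. `log_joint` and `log_q` are hypothetical stand-ins for
    # log p(x, z_k) and log q(z_k | x) at K sampled latents, each shaped
    # [num_samples, batch]:
    def iwae_bound_sketch(log_joint, log_q, num_samples):
        import math
        log_w = log_joint - log_q  # importance weights in log space
        # log (1/K) sum_k w_k, computed stably with logsumexp
        return torch.logsumexp(log_w, dim=0) - math.log(num_samples)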
Example No. 30
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
    specified in the configs.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
        cf.dim, cf.fold, cf.exp_dir, cf.model))
    
    writer = SummaryWriter(os.path.join(cf.exp_dir, 'tensorboard'))

    net = model.net(cf, logger).cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)  # e.g. 'val_sampling'

    starting_epoch = 1

    # prepare monitoring
    if cf.resume_to_checkpoint:  # default: False
        best_epoch = np.load(cf.resume_to_checkpoint + 'epoch_ranking.npy')[0]
        with open(cf.resume_to_checkpoint + 'monitor_metrics.pickle', 'rb') as f:
            monitor_metrics = pickle.load(f)
        starting_epoch = utils.load_checkpoint(cf.resume_to_checkpoint, net, optimizer)
        logger.info('resumed to checkpoint {} at epoch {}'.format(cf.resume_to_checkpoint, starting_epoch))
        num_batch = starting_epoch * cf.num_train_batches + 1
        num_val = starting_epoch * cf.num_val_batches + 1
    else:
        monitor_metrics = utils.prepare_monitoring(cf)
        num_batch = 0  # running batch counter used for loss logging
        num_val = 0
    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)
    best_train_recall, best_val_recall = 0, 0
    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
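        # set this epoch's learning rate; cf.learning_rate is presumably a
        # per-epoch list of rates (indexed by epoch - 1)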
        for param_group in optimizer.param_groups:
            param_group['lr'] = cf.learning_rate[epoch - 1]

        start_time = time.time()

        net.train()
        train_results_list = []  # collects [boxes, pid] pairs for this epoch

        for bix in range(cf.num_train_batches):  # e.g. 200 batches per epoch
            num_batch += 1
            batch = next(batch_gen['train'])  # keys: data, seg, pid, class_target, bb_target, roi_masks, roi_labels
            # binarize ROI labels: any positive class becomes 1, background -1
            for ii, label in enumerate(batch['roi_labels']):
                if label[0] > 0:
                    batch['roi_labels'][ii] = [1]
                else:
                    batch['roi_labels'][ii] = [-1]

            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()

            optimizer.zero_grad()
            results_dict['torch_loss'].backward()  # total loss
            optimizer.step()

            if num_batch % cf.show_train_images == 0:
                fig = plot_batch_prediction(batch, results_dict, cf, 'train')
                writer.add_figure('Train/results', fig, num_batch)
                fig.clear()
            logger.info('tr. batch {0}/{1} (ep. {2}) fw {3:.3f}s / bw {4:.3f}s / total {5:.3f}s || '
                        .format(bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw,
                                time.time() - tic_bw, time.time() - tic_fw) + results_dict['logger_string'])
            writer.add_scalar('Train/total_loss', results_dict['torch_loss'].item(), num_batch)
            writer.add_scalar('Train/rpn_class_loss', results_dict['monitor_losses']['rpn_class_loss'], num_batch)
            writer.add_scalar('Train/rpn_bbox_loss', results_dict['monitor_losses']['rpn_bbox_loss'], num_batch)
            writer.add_scalar('Train/mrcnn_class_loss', results_dict['monitor_losses']['mrcnn_class_loss'], num_batch)
            writer.add_scalar('Train/mrcnn_bbox_loss', results_dict['monitor_losses']['mrcnn_bbox_loss'], num_batch)
            if 'mrcnn' in cf.model_path:
                writer.add_scalar('Train/mrcnn_mask_loss', results_dict['monitor_losses']['mrcnn_mask_loss'], num_batch)
            if 'ufrcnn' in cf.model_path:
                writer.add_scalar('Train/seg_dice_loss', results_dict['monitor_losses']['seg_loss_dice'], num_batch)
            train_results_list.append([results_dict['boxes'], batch['pid']])  # ground-truth and detected boxes only
            monitor_metrics['train']['monitor_values'][epoch].append(results_dict['monitor_values'])

        count_train = train_evaluator.evaluate_predictions(train_results_list, epoch, cf, flag='train')
        print('tp_patient {}, tp_roi {}, fp_roi {}, total_num {}'.format(
            count_train[0], count_train[1], count_train[2], count_train[3]))

        precision = count_train[0] / (count_train[0] + count_train[2] + 0.01)  # +0.01 avoids division by zero
        recall = count_train[0] / count_train[3]
        print('precision: {}, recall: {}'.format(precision, recall))
        monitor_metrics['train']['train_recall'].append(recall)
        # misspelled key kept as-is; it presumably matches its definition in prepare_monitoring
        monitor_metrics['train']['train_percision'].append(precision)
        writer.add_scalar('Train/train_precision', precision, epoch)
        writer.add_scalar('Train/train_recall', recall, epoch)

        train_time = time.time() - start_time
        print('*' * 50 + ' finished epoch {}'.format(epoch))

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                dice_val = []
                for _ in range(batch_gen['n_val']):  # e.g. 50 validation batches
                    num_val += 1
                    batch = next(batch_gen[cf.val_mode])
                    # binarize ROI labels, mirroring the training loop above
                    for ii, label in enumerate(batch['roi_labels']):
                        if label[0] > 0:
                            batch['roi_labels'][ii] = [1]
                        else:
                            batch['roi_labels'][ii] = [-1]
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch, is_validation=True)
                        if num_val % cf.show_val_images == 0:
                            fig = plot_batch_prediction(batch, results_dict, cf, 'val')
                            writer.add_figure('Val/results', fig, num_val)
                            fig.clear()

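                    # (Added note.) DiceLoss presumably returns 1 - Dice(P, G)
                    # with Dice(P, G) = 2|P∩G| / (|P| + |G|); subtracting from 1
                    # below recovers the Dice score for monitoring.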
                    this_batch_seg_label = torch.FloatTensor(
                        mutils.get_one_hot_encoding(batch['seg'], cf.num_seg_classes)).cuda()
                    this_batch_dice = DiceLoss()
                    dice = 1 - this_batch_dice(F.softmax(results_dict['seg_logits'], dim=1), this_batch_seg_label)
                    dice_val.append(dice)
                    val_results_list.append([results_dict['boxes'], batch['pid']])
                    monitor_metrics['val']['monitor_values'][epoch].append(results_dict['monitor_values'])

                count_val = val_evaluator.evaluate_predictions(val_results_list, epoch, cf, flag='val')
                print('tp_patient {}, tp_roi {}, fp_roi {}, total_num {}'.format(
                    count_val[0], count_val[1], count_val[2], count_val[3]))
                precision = count_val[0] / (count_val[0] + count_val[2] + 0.01)
                recall = count_val[0] / count_val[3]
                print('precision: {}, recall: {}'.format(precision, recall))
                monitor_metrics['val']['val_recall'].append(recall)
                monitor_metrics['val']['val_percision'].append(precision)
                writer.add_scalar('Val/val_precision', precision, epoch)
                writer.add_scalar('Val/val_recall', recall, epoch)
                writer.add_scalar('Val/val_dice', sum(dice_val) / float(len(dice_val)), epoch)
                model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)

            # update monitoring and prediction plots
            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} sec. ({} train / {} val)'.format(
                epoch, epoch_time, train_time, epoch_time-train_time))
    writer.close()
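
# A hedged sketch (my addition, not from this repository) of a soft-Dice loss
# of the kind DiceLoss above likely implements; the real class may differ in
# smoothing and reduction details:
import torch
import torch.nn as nn

class SoftDiceLoss(nn.Module):
    def __init__(self, smooth=1e-5):
        super().__init__()
        self.smooth = smooth

    def forward(self, probs, one_hot_target):
        # probs, one_hot_target: [batch, classes, *spatial]
        spatial_dims = tuple(range(2, probs.dim()))
        intersection = (probs * one_hot_target).sum(spatial_dims)
        denom = probs.sum(spatial_dims) + one_hot_target.sum(spatial_dims)
        dice = (2 * intersection + self.smooth) / (denom + self.smooth)
        return 1 - dice.mean()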