Example No. 1
class TensorBoard(Callback):

    # TODO: add option to write images; find fix for graph

    def __init__(self, log_dir, update_frequency = 10):
        super(TensorBoard, self).__init__()
        self.log_dir = log_dir
        self.writer = None
        self.update_frequency = update_frequency

    def on_train_begin(self, **_):
        self.writer = SummaryWriter(os.path.join(self.log_dir, datetime.datetime.now().__str__()))
        rndm_input = torch.autograd.Variable(torch.rand(1, *self.model.input_shape), requires_grad = True).to(self.logger['device'])
        # fwd_pass = self.model(rndm_input)
        self.writer.add_graph(self.model, rndm_input)
        return self

    def on_epoch_end(self, **_):
        if (self.logger['epoch'] % self.update_frequency) == 0:
            epoch_metrics = self.logger['epoch_metrics'][self.logger['epoch']]
            for e_metric, e_metric_dct in epoch_metrics.items():
                for e_metric_split, e_metric_val in e_metric_dct.items():
                    self.writer.add_scalar('{}/{}'.format(e_metric_split, e_metric), e_metric_val, self.logger['epoch'])
            for name, param in self.model.named_parameters():
                self.writer.add_histogram(name.replace('.', '/'), param.clone().cpu().data.numpy(), self.logger['epoch'])
        return self

    def on_train_end(self, **_):
        return self.writer.close()
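A minimal, self-contained sketch of the same add_graph / add_scalar / add_histogram pattern the callback relies on, assuming torch.utils.tensorboard (the original may use tensorboardX instead) and an illustrative model:

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
writer = SummaryWriter('runs/demo')

dummy_input = torch.rand(1, 8)          # one sample with the model's input shape
writer.add_graph(model, dummy_input)    # trace and log the graph once

for epoch in range(3):
    loss = float(torch.rand(1))         # placeholder metric for illustration
    writer.add_scalar('train/loss', loss, epoch)
    for name, param in model.named_parameters():
        writer.add_histogram(name.replace('.', '/'), param.detach().cpu().numpy(), epoch)

writer.close()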
Example No. 2
    np.random.shuffle(lines)
    np.random.seed(None)
    num_val = int(len(lines) * val_split)
    num_train = len(lines) - num_val
    print('Train dataset : {0}, Val dataset: {1}'.format(num_train, num_val))

    writer = SummaryWriter(log_dir='logs', flush_secs=60)
    if Cuda:
        graph_inputs = torch.from_numpy(
            np.random.rand(1, 3, input_shape[0],
                           input_shape[1])).type(torch.FloatTensor).cuda()
    else:
        graph_inputs = torch.from_numpy(
            np.random.rand(1, 3, input_shape[0],
                           input_shape[1])).type(torch.FloatTensor)
    writer.add_graph(model, (graph_inputs, ))

    if True:
        lr = 1e-3
        Batch_size = 4
        Init_Epoch = 0
        Freeze_Epoch = 50

        optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
        if Cosine_lr:
            lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                T_max=5,
                                                                eta_min=1e-5)
        else:
            lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                     step_size=1,
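The two branches above differ only in the device of the dummy tensor; a device-agnostic sketch, assuming the same model, input_shape, Cuda flag, and writer as in this example, would be:

device = torch.device('cuda' if Cuda else 'cpu')
graph_inputs = torch.rand(1, 3, input_shape[0], input_shape[1],
                          dtype=torch.float32, device=device)  # dummy input used only for tracing
writer.add_graph(model, (graph_inputs, ))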
Example No. 3
                      % (args.session, epoch, step, iters_per_epoch, loss_temp, lr))
                print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start))
                print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
                      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))
                if args.use_tfboard:
                    info = {
                        'loss': loss_temp,
                        'loss_rpn_cls': loss_rpn_cls,
                        'loss_rpn_box': loss_rpn_box,
                        'loss_rcnn_cls': loss_rcnn_cls,
                        'loss_rcnn_box': loss_rcnn_box
                    }
                    logger.add_scalars("logs_s_{}/losses".format(args.session), info,
                                       (epoch - 1) * iters_per_epoch + step)
            if args.use_tfboard and step == 1:
                logger.add_graph(fasterRCNN, (im_data, im_info, gt_boxes, num_boxes))

                loss_temp = 0
                start = time.time()

        save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
        save_checkpoint({
            'session': args.session,
            'epoch': epoch + 1,
            'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(),
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE,
            'class_agnostic': args.class_agnostic,
        }, save_name)
        print('save model: {}'.format(save_name))
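add_scalars, as used above, groups a dictionary of related values under one main tag so they share a plot. A standalone sketch with made-up loss values (this sketch uses torch.utils.tensorboard; the example's logger object is created elsewhere, likely with tensorboardX):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/losses_demo')
for step in range(5):
    writer.add_scalars('losses', {
        'rpn_cls': 0.5 / (step + 1),   # illustrative values only
        'rpn_box': 0.3 / (step + 1),
        'rcnn_cls': 0.4 / (step + 1),
        'rcnn_box': 0.2 / (step + 1),
    }, step)
writer.close()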
Example No. 4

if __name__ == '__main__':
    resnet50 = ResNet50().to(device)

    optimizer = torch.optim.SGD(resnet50.parameters(),
                                lr=lr,
                                momentum=0.9,
                                weight_decay=0.0005,
                                nesterov=True)
    scheduler = StepLR(optimizer, step_size=step_size, gamma=0.5)
    loss_func = torch.nn.CrossEntropyLoss()

    summary_writer = SummaryWriter()
    dump_input = torch.rand(1, 3, 224, 224).to(device)
    summary_writer.add_graph(resnet50, (dump_input, ), verbose=False)

    for epoch in range(num_epoches):

        resnet50.train()
        running_loss = 0.0
        running_acc = 0.0
        for step, (
                batch_x,
                batch_y) in enumerate(train_loader):  # each step, the loader yields one mini-batch for training
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            out = resnet50(batch_x)
            loss = loss_func(out, batch_y)
            running_loss += loss.data.item() * batch_y.size(0)
            _, pred = torch.max(out, 1)
Example No. 5
class LossHistory():
    def __init__(self, model, patience=5):
        import datetime
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y_%m_%d_%H_%M_%S')
        self.log_dir = "logs//SegNet/"
        self.time_str = time_str
        self.save_path = os.path.join(self.log_dir,
                                      "loss_" + str(self.time_str))
        self.losses = []
        self.val_loss = []
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.log_dir, "run_" + str(self.time_str)))
        self.freeze = False

        # write model summary
        x = threading.Thread(target=self.write_summary,
                             args=([deepcopy(model.module).cpu()]))
        x.start()

        # launch tensorboard
        t = threading.Thread(target=self.launchTensorBoard,
                             args=([self.log_dir]))
        t.start()

        # initial EarlyStopping
        self.patience = patience
        self.reset_stop()

        os.makedirs(self.save_path)

    def write_summary(self, cpu_model):
        print("write model summary ready")
        rndm_input = torch.autograd.Variable(torch.rand(1, 3, 512, 512),
                                             requires_grad=False).cpu()
        self.writer.add_graph(cpu_model, rndm_input)

        print("tensroboard model summary finished")

        f = io.StringIO()
        with redirect_stdout(f):
            summary(cpu_model, (3, 512, 512), device="cpu")
        summary_text = f.getvalue()
        with open(os.path.join(self.log_dir, "summary.txt"), "w") as out_file:
            out_file.write(summary_text)

        print("write model summary finished")
        return

    def launchTensorBoard(self, tensorBoardPath, port=8888):
        os.system('tensorboard --logdir=%s --port=%s' %
                  (tensorBoardPath, port))
        url = "http://localhost:%s/" % (port)
        # webbrowser.open_new(url)
        return

    def reset_stop(self):
        self.best_epoch_loss = np.Inf
        self.stopping = False
        self.counter = 0

    def set_status(self, freeze):
        self.freeze = freeze

    def epoch_loss(self, loss, val_loss, epoch):
        self.losses.append(loss)
        self.val_loss.append(val_loss)
        with open(
                os.path.join(self.save_path,
                             "epoch_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(loss))
            f.write("\n")
        with open(
                os.path.join(self.save_path,
                             "epoch_val_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(val_loss))
            f.write("\n")

        self.loss_plot()

        prefix = "Freeze_epoch/" if self.freeze else "UnFreeze_epoch/"
        self.writer.add_scalar(prefix + 'Loss/Train', loss, epoch)
        self.writer.add_scalar(prefix + 'Loss/Val', val_loss, epoch)
        self.decide(val_loss)

    def epoch_loss_no_val(self, loss, epoch):
        self.losses.append(loss)
        with open(
                os.path.join(self.save_path,
                             "epoch_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(loss))
            f.write("\n")

        self.loss_plot()

        prefix = "Freeze_epoch/" if self.freeze else "UnFreeze_epoch/"
        self.writer.add_scalar(prefix + 'Loss/Train', loss, epoch)

    def step(self, steploss, stepfscore, iteration):
        prefix = "Freeze_step/" if self.freeze else "UnFreeze_step/"
        self.writer.add_scalar(prefix + 'Train/Loss', steploss, iteration)
        self.writer.add_scalar(prefix + 'Train/F_Score', stepfscore, iteration)

    def decide(self, epoch_loss):
        if epoch_loss > self.best_epoch_loss:
            self.counter += 1
            print(
                f'EarlyStopping counter: {self.counter} out of {self.patience}'
            )
            if self.counter >= self.patience:
                print(f'Best lower loss:{self.best_epoch_loss}')
                self.stopping = True
        else:
            self.best_epoch_loss = epoch_loss
            self.counter = 0
            self.stopping = False

    def loss_plot(self):
        iters = range(len(self.losses))

        plt.figure()
        plt.plot(iters, self.losses, 'red', linewidth=2, label='train loss')
        plt.plot(iters, self.val_loss, 'coral', linewidth=2, label='val loss')
        try:
            if len(self.losses) < 25:
                num = 5
            else:
                num = 15

            plt.plot(iters,
                     scipy.signal.savgol_filter(self.losses, num, 3),
                     'green',
                     linestyle='--',
                     linewidth=2,
                     label='smooth train loss')
            plt.plot(iters,
                     scipy.signal.savgol_filter(self.val_loss, num, 3),
                     '#8B4513',
                     linestyle='--',
                     linewidth=2,
                     label='smooth val loss')
        except:
            pass

        plt.grid(True)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc="upper right")

        plt.savefig(
            os.path.join(self.save_path,
                         "epoch_loss_" + str(self.time_str) + ".png"))

        plt.cla()
        plt.close("all")
Example No. 6
def train(args, data_root, save_root):
    weight_dir = "{}weights/".format(save_root)
    log_dir = "{}logs/MobileNetV2Vortex-{}".format(
        save_root, time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 1. Setup Augmentations
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    net_h, net_w = int(args.img_rows * args.crop_ratio), int(args.img_cols *
                                                             args.crop_ratio)

    augment_train = Compose([
        RandomHorizontallyFlip(),
        RandomSized((0.5, 0.75)),
        RandomRotate(5),
        RandomCrop((net_h, net_w))
    ])
    augment_valid = Compose([
        RandomHorizontallyFlip(),
        Scale((args.img_rows, args.img_cols)),
        CenterCrop((net_h, net_w))
    ])

    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> 0. Setting up DataLoader...")
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    train_loader = CityscapesLoader(data_root,
                                    gt="gtFine",
                                    is_transform=True,
                                    split='train',
                                    img_size=(args.img_rows, args.img_cols),
                                    augmentations=augment_train)
    valid_loader = CityscapesLoader(data_root,
                                    gt="gtFine",
                                    is_transform=True,
                                    split='val',
                                    img_size=(args.img_rows, args.img_cols),
                                    augmentations=augment_valid)

    n_classes = train_loader.n_classes

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 2. Setup Metrics
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    running_metrics = RunningScore(n_classes)

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 4. Setup Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> 1. Setting up Model...")

    model = MobileNetV2Vortex(n_class=19,
                              in_size=(net_h, net_w),
                              width_mult=1.,
                              out_sec=256,
                              rate_sec=(3, 9, 27),
                              norm_act=partial(InPlaceABNWrapper,
                                               activation="leaky_relu",
                                               slope=0.1))
    """

    model = MobileNetV2Plus(n_class=n_classes, in_size=(net_h, net_w), width_mult=1.0,
                            out_sec=256, aspp_sec=(12, 24, 36),
                            norm_act=partial(InPlaceABNWrapper, activation="leaky_relu", slope=0.1))
    """
    # np.arange(torch.cuda.device_count())
    model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()

    # 4.1 Setup Optimizer
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # Check if model has custom optimizer / loss
    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.l_rate,
                                    momentum=0.90,
                                    weight_decay=5e-4,
                                    nesterov=True)

        # for pg in optimizer.param_groups:
        #     print(pg['lr'])

        # optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999),
        #                             eps=1e-08, weight_decay=0, amsgrad=True)
        # optimizer = YFOptimizer(model.parameters(), lr=2.5e-3, mu=0.9, clip_thresh=10000, weight_decay=5e-4)

    # 4.2 Setup Loss
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    class_weight = None
    if hasattr(model.module, 'loss'):
        print('> Using custom loss')
        loss_fn = model.module.loss
    else:
        # loss_fn = cross_entropy2d

        class_weight = np.array([
            0.05570516, 0.32337477, 0.08998544, 1.03602707, 1.03413147,
            1.68195437, 5.58540548, 3.56563995, 0.12704978, 1., 0.46783719,
            1.34551528, 5.29974114, 0.28342531, 0.9396095, 0.81551811,
            0.42679146, 3.6399074, 2.78376194
        ],
                                dtype=float)
        """
        class_weight = np.array([3.045384,  12.862123,   4.509889,  38.15694,  35.25279,  31.482613,
                                 45.792305,  39.694073,  6.0639296,  32.16484,  17.109228,   31.563286,
                                 47.333973,  11.610675,  44.60042,   45.23716,  45.283024,  48.14782,
                                 41.924667], dtype=float)/10.0
        """
        class_weight = torch.from_numpy(class_weight).float().cuda()
        loss_fn = bootstrapped_cross_entropy2d
        # loss_fn = cross_entropy2d

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 5. Resume Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    best_iou = -100.0
    args.start_epoch = 0
    if args.resume is not None:
        full_path = "{}{}".format(weight_dir, args.resume)
        if os.path.isfile(full_path):
            print("> Loading model and optimizer from checkpoint '{}'".format(
                args.resume))

            checkpoint = torch.load(full_path)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['best_iou']
            model.load_state_dict(checkpoint['model_state'])  # weights
            optimizer.load_state_dict(
                checkpoint['optimizer_state'])  # gradient state

            # for param_group in optimizer.param_groups:
            # s    param_group['lr'] = 1e-5

            del checkpoint
            print("> Loaded checkpoint '{}' (epoch {}, iou {})".format(
                args.resume, args.start_epoch, best_iou))

        else:
            print("> No checkpoint found at '{}'".format(args.resume))
    else:
        if args.pre_trained is not None:
            print("> Loading weights from pre-trained model '{}'".format(
                args.pre_trained))
            full_path = "{}{}".format(weight_dir, args.pre_trained)

            pre_weight = torch.load(full_path)
            pre_weight = pre_weight["model_state"]
            # pre_weight = pre_weight["state_dict"]

            model_dict = model.state_dict()

            pretrained_dict = {
                k: v
                for k, v in pre_weight.items() if k in model_dict
            }
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)

            del pre_weight
            del model_dict
            del pretrained_dict

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 3. Setup tensor_board for visualization
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    writer = None
    if args.tensor_board:
        writer = SummaryWriter(log_dir=log_dir, comment="MobileNetV2Vortex")

    if args.tensor_board:
        dummy_input = Variable(torch.rand(1, 3, net_h, net_w).cuda(),
                               requires_grad=True)
        writer.add_graph(model, dummy_input)

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 6. Train Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    print("> 2. Model Training start...")
    train_loader = data.DataLoader(train_loader,
                                   batch_size=args.batch_size,
                                   num_workers=6,
                                   shuffle=True)
    valid_loader = data.DataLoader(valid_loader,
                                   batch_size=args.batch_size,
                                   num_workers=6)

    num_batches = int(
        math.ceil(
            len(train_loader.dataset.files[train_loader.dataset.split]) /
            float(train_loader.batch_size)))

    lr_period = 20 * num_batches
    swa_weights = model.state_dict()

    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.90)
    # scheduler = CyclicLR(optimizer, base_lr=1.0e-3, max_lr=6.0e-3, step_size=2*num_batches)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=32, gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)

    topk_init = 512
    # topk_multipliers = [64, 128, 256, 512]
    for epoch in np.arange(args.start_epoch, args.n_epoch):
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # 7.1 Mini-Batch Learning
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # print("> Training Epoch [%d/%d]:" % (epoch + 1, args.n_epoch))
        model.train()

        last_loss = 0.0
        topk_base = topk_init
        pbar = tqdm(np.arange(num_batches))
        for train_i, (images, labels) in enumerate(
                train_loader):  # One mini-Batch data, One iteration
            full_iter = (epoch * num_batches) + train_i + 1

            # poly_lr_scheduler(optimizer, init_lr=args.l_rate, iter=full_iter,
            #                   lr_decay_iter=1, max_iter=args.n_epoch*num_batches, power=0.9)

            batch_lr = args.l_rate * cosine_annealing_lr(lr_period, full_iter)
            optimizer = set_optimizer_lr(optimizer, batch_lr)

            topk_base = poly_topk_scheduler(init_topk=topk_init,
                                            iter=full_iter,
                                            topk_decay_iter=1,
                                            max_iter=args.n_epoch *
                                            num_batches,
                                            power=0.95)

            images = Variable(
                images.cuda(),
                requires_grad=True)  # Image feed into the deep neural network
            labels = Variable(labels.cuda(), requires_grad=False)

            optimizer.zero_grad()
            net_out = model(images)  # Here we have 3 outputs for 3 losses

            topk = topk_base * 512
            if random.random() < 0.20:
                train_loss = loss_fn(input=net_out,
                                     target=labels,
                                     K=topk,
                                     weight=class_weight)
            else:
                train_loss = loss_fn(input=net_out,
                                     target=labels,
                                     K=topk,
                                     weight=None)

            last_loss = train_loss.data[0]
            pbar.update(1)
            pbar.set_description("> Epoch [%d/%d]" % (epoch + 1, args.n_epoch))
            pbar.set_postfix(Loss=last_loss, TopK=topk_base, LR=batch_lr)

            train_loss.backward()
            optimizer.step()

            if full_iter % lr_period == 0:
                swa_weights = update_aggregated_weight_average(
                    model, swa_weights, full_iter, lr_period)
                state = {'model_state': swa_weights}
                torch.save(
                    state, "{}{}_mobilenetv2vortex_swa_model.pkl".format(
                        weight_dir, args.dataset))

            if (train_i + 1) % 31 == 0:
                loss_log = "Epoch [%d/%d], Iter: %d Loss: \t %.4f" % (
                    epoch + 1, args.n_epoch, train_i + 1, last_loss)

                net_out = F.softmax(net_out, dim=1)
                pred = net_out.data.max(1)[1].cpu().numpy()
                gt = labels.data.cpu().numpy()

                running_metrics.update(gt, pred)
                score, class_iou = running_metrics.get_scores()

                metric_log = ""
                for k, v in score.items():
                    metric_log += " {}: \t %.4f, ".format(k) % v
                running_metrics.reset()

                logs = loss_log + metric_log
                # print(logs)

                if args.tensor_board:
                    writer.add_scalar('Training/Losses', last_loss, full_iter)
                    writer.add_scalars('Training/Metrics', score, full_iter)
                    writer.add_text('Training/Text', logs, full_iter)

                    for name, param in model.named_parameters():
                        writer.add_histogram(name,
                                             param.clone().cpu().data.numpy(),
                                             full_iter)

        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # 7.2 Mini-Batch Validation
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # print("> Validation for Epoch [%d/%d]:" % (epoch + 1, args.n_epoch))
        model.eval()

        mval_loss = 0.0
        vali_count = 0
        for i_val, (images_val, labels_val) in enumerate(valid_loader):
            vali_count += 1

            images_val = Variable(images_val.cuda(), volatile=True)
            labels_val = Variable(labels_val.cuda(), volatile=True)

            net_out = model(images_val)  # Here we have 4 outputs for 4 losses

            topk = topk_base * 512
            val_loss = loss_fn(input=net_out,
                               target=labels_val,
                               K=topk,
                               weight=None)

            mval_loss += val_loss.data[0]

            net_out = F.softmax(net_out, dim=1)
            pred = net_out.data.max(1)[1].cpu().numpy()
            gt = labels_val.data.cpu().numpy()
            running_metrics.update(gt, pred)

        mval_loss /= vali_count

        loss_log = "Epoch [%d/%d] Loss: \t %.4f" % (epoch + 1, args.n_epoch,
                                                    mval_loss)
        metric_log = ""
        score, class_iou = running_metrics.get_scores()
        for k, v in score.items():
            metric_log += " {} \t %.4f, ".format(k) % v
        running_metrics.reset()

        logs = loss_log + metric_log
        # print(logs)
        pbar.set_postfix(Train_Loss=last_loss,
                         Vali_Loss=mval_loss,
                         Vali_mIoU=score['Mean_IoU'])

        if args.tensor_board:
            writer.add_scalar('Validation/Losses', mval_loss, epoch)
            writer.add_scalars('Validation/Metrics', score, epoch)
            writer.add_text('Validation/Text', logs, epoch)

            for name, param in model.named_parameters():
                writer.add_histogram(name,
                                     param.clone().cpu().data.numpy(), epoch)

            # export scalar data to JSON for external processing
            # writer.export_scalars_to_json("{}/all_scalars.json".format(log_dir))

        if score['Mean_IoU'] >= best_iou:
            best_iou = score['Mean_IoU']
            state = {
                'epoch': epoch + 1,
                "best_iou": best_iou,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict()
            }
            torch.save(
                state, "{}{}_mobilenetv2vortex_best_model.pkl".format(
                    weight_dir, args.dataset))

        # scheduler.step()
        # scheduler.batch_step()
        pbar.close()

    if args.tensor_board:
        # export scalar data to JSON for external processing
        # writer.export_scalars_to_json("{}/all_scalars.json".format(log_dir))
        writer.close()
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> Training Done!!!")
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
Example No. 7
class BaseTrainer(object):
    """Operations of training a model, including data loading, gradient descent, and validation.
    """
    def __init__(self, **kwargs):
        """
        :param kwargs: dict of (key, value), or dict-like object. key is str.
        The base trainer requires the following keys:
            - epochs: int, the number of epochs in training
            - validate: bool, whether or not to validate on dev set
            - batch_size: int
            - pickle_path: str, the path to pickle files for pre-processing
        """
        super(BaseTrainer, self).__init__()
        """
            "default_args" provides default value for important settings. 
            The initialization arguments "kwargs" with the same key (name) will override the default value. 
            "kwargs" must have the same type as "default_args" on corresponding keys. 
            Otherwise, error will raise.
        """
        default_args = {
            "epochs": 3,
            "batch_size": 8,
            "validate": True,
            "use_cuda": True,
            "pickle_path": "./save/",
            "save_best_dev": True,
            "model_name": "default_model_name.pkl",
            "print_every_step": 1,
            "loss": Loss(None),  # used to pass type check
            "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
        }
        """
            "required_args" is the collection of arguments that users must pass to Trainer explicitly. 
            This is used to warn users of essential settings in the training. 
            Obviously, "required_args" is the subset of "default_args". 
            The value in "default_args" to the keys in "required_args" is simply for type check. 
        """
        #add required arguments here
        required_args = {}

        for req_key in required_args:
            if req_key not in kwargs:
                logger.error("Trainer lacks argument {}".format(req_key))
                raise ValueError("Trainer lacks argument {}".format(req_key))

        for key in default_args:
            if key in kwargs:
                if isinstance(kwargs[key], type(default_args[key])):
                    default_args[key] = kwargs[key]
                else:
                    msg = "Argument %s type mismatch: expected %s while get %s" % (
                        key, type(default_args[key]), type(kwargs[key]))
                    logger.error(msg)
                    raise ValueError(msg)
            else:
                # BaseTrainer doesn't care about extra arguments
                pass
        print(default_args)

        self.n_epochs = default_args["epochs"]
        self.batch_size = default_args["batch_size"]
        self.pickle_path = default_args["pickle_path"]
        self.validate = default_args["validate"]
        self.save_best_dev = default_args["save_best_dev"]
        self.use_cuda = default_args["use_cuda"]
        self.model_name = default_args["model_name"]
        self.print_every_step = default_args["print_every_step"]

        self._model = None
        self._loss_func = default_args["loss"].get(
        )  # return a pytorch loss function or None
        self._optimizer = None
        self._optimizer_proto = default_args["optimizer"]
        self._summary_writer = SummaryWriter(self.pickle_path +
                                             'tensorboard_logs')
        self._graph_summaried = False

    def train(self, network, train_data, dev_data=None):
        """General Training Procedure

        :param network: a model
        :param train_data: three-level list, the training set.
        :param dev_data: three-level list, the validation data (optional)
        """
        # transfer model to gpu if available
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
            # self._model is used to access model-specific loss
        else:
            self._model = network

        # define Tester over dev data
        if self.validate:
            default_valid_args = {
                "save_output": True,
                "validate_in_training": True,
                "save_dev_input": True,
                "save_loss": True,
                "batch_size": self.batch_size,
                "pickle_path": self.pickle_path,
                "use_cuda": self.use_cuda,
                "print_every_step": 0
            }
            validator = self._create_validator(default_valid_args)
            logger.info("validator defined as {}".format(str(validator)))

        # optimizer and loss
        self.define_optimizer()
        logger.info("optimizer defined as {}".format(str(self._optimizer)))
        self.define_loss()
        logger.info("loss function defined as {}".format(str(self._loss_func)))

        # main training procedure
        start = time.time()
        logger.info("training epochs started")
        for epoch in range(1, self.n_epochs + 1):
            logger.info("training epoch {}".format(epoch))

            # turn on network training mode
            self.mode(network, test=False)
            # prepare mini-batch iterator
            data_iterator = iter(
                Batchifier(RandomSampler(train_data),
                           self.batch_size,
                           drop_last=False))
            logger.info("prepared data iterator")

            # one forward and backward pass
            self._train_step(data_iterator,
                             network,
                             start=start,
                             n_print=self.print_every_step,
                             epoch=epoch)

            # validation
            if self.validate:
                logger.info("validation started")
                validator.test(network, dev_data)

                if self.save_best_dev and self.best_eval_result(validator):
                    self.save_model(network, self.model_name)
                    print("Saved better model selected by validation.")
                    logger.info("Saved better model selected by validation.")

                valid_results = validator.show_metrics()
                print("[epoch {}] {}".format(epoch, valid_results))
                logger.info("[epoch {}] {}".format(epoch, valid_results))

    def _train_step(self, data_iterator, network, **kwargs):
        """Training process in one epoch.

            kwargs should contain:
                - n_print: int, print training information every n steps.
                - start: time.time(), the starting time of this step.
                - epoch: int,
        """
        step = 0
        for batch_x, batch_y in self.make_batch(data_iterator):

            prediction = self.data_forward(network, batch_x)

            loss = self.get_loss(prediction, batch_y)
            self.grad_backward(loss)
            self.update()
            self._summary_writer.add_scalar("loss",
                                            loss.item(),
                                            global_step=step)

            if not self._graph_summaried:
                self._summary_writer.add_graph(network, batch_x)
                self._graph_summaried = True

            if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                end = time.time()
                diff = timedelta(seconds=round(end - kwargs["start"]))
                print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
                    kwargs["epoch"], step, loss.data, diff)
                print(print_output)
                logger.info(print_output)
            step += 1

    def cross_validate(self, network, train_data_cv, dev_data_cv):
        """Training with cross validation.
        :param network: the model
        :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        """
        if len(train_data_cv) != len(dev_data_cv):
            logger.error(
                "the number of folds in train and dev data differ: {} != {}".
                format(len(train_data_cv), len(dev_data_cv)))
            raise RuntimeError(
                "the number of folds in train and dev data differ")
        if self.validate is False:
            logger.warn(
                "Cross validation requires self.validate to be True. Please turn it on. "
            )
            print(
                "[warning] Cross validation requires self.validate to be True. Please turn it on. "
            )
            self.validate = True

        n_fold = len(train_data_cv)
        logger.info("perform {} folds cross validation.".format(n_fold))
        for i in range(n_fold):
            print("CV:", i)
            logger.info("running the {} of {} folds cross validation".format(
                i + 1, n_fold))
            network_copy = copy.deepcopy(network)
            self.train(network_copy, train_data_cv[i], dev_data_cv[i])

    def make_batch(self, iterator):
        raise NotImplementedError

    def mode(self, network, test):
        Action.mode(network, test)

    def define_optimizer(self):
        """Define framework-specific optimizer specified by the models.

        """
        self._optimizer = self._optimizer_proto.construct_from_pytorch(
            self._model.parameters())

    def update(self):
        """
        Perform weight update on a model.

        For PyTorch, just call optimizer to update.
        """
        self._optimizer.step()

    def data_forward(self, network, x):
        raise NotImplementedError

    def grad_backward(self, loss):
        """Compute gradient with link rules.

        :param loss: a scalar where back-prop starts
        For PyTorch, just do "loss.backward()"
        """
        self._model.zero_grad()
        loss.backward()

    def get_loss(self, predict, truth):
        """Compute loss given prediction and ground truth.

        :param predict: prediction label vector
        :param truth: ground truth label vector
        :return: a scalar
        """
        return self._loss_func(predict, truth)

    def define_loss(self):
        """Define a loss for the trainer.

        If the model defines a loss, use model's loss.
        Otherwise, the Trainer must have a loss argument, which is used as the loss.
        These two losses cannot be defined at the same time.
        Trainer does not handle loss definition or choose default losses.
        """
        if hasattr(self._model, "loss") and self._loss_func is not None:
            raise ValueError(
                "Both the model and Trainer define loss. Please take out your loss."
            )

        if hasattr(self._model, "loss"):
            self._loss_func = self._model.loss
            logger.info("The model has a loss function, use it.")
        else:
            if self._loss_func is None:
                raise ValueError("Please specify a loss function.")
            logger.info("The model didn't define loss, use Trainer's loss.")

    def best_eval_result(self, validator):
        """Check if the current epoch yields better validation results.

        :param validator: a Tester instance
        :return: bool, True means current results on dev set is the best.
        """
        accuracy = validator.metrics()
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            return True
        else:
            return False

    def save_model(self, network, model_name):
        """Save this model with such a name.
        This method may be called multiple times by the Trainer to overwrite the saved model with a better one.
        :param network: the PyTorch model
        :param model_name: str
        """
        if model_name[-4:] != ".pkl":
            model_name += ".pkl"
        ModelSaver(os.path.join(self.pickle_path,
                                model_name)).save_pytorch(network)

    def _create_validator(self, valid_args):
        raise NotImplementedError
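A concrete trainer has to supply make_batch, data_forward, and _create_validator; a minimal hypothetical subclass sketch (the names below are illustrative, not part of the original code) could be:

class ToyTrainer(BaseTrainer):
    """Hypothetical subclass sketch; names and tensor shapes are illustrative only."""

    def make_batch(self, iterator):
        # yield (batch_x, batch_y) pairs as tensors, as _train_step expects
        for batch_x, batch_y in iterator:
            yield torch.as_tensor(batch_x), torch.as_tensor(batch_y)

    def data_forward(self, network, x):
        return network(x)

    def _create_validator(self, valid_args):
        return Tester(**valid_args)  # assumed Tester class, mirroring the docstrings above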
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    learning_rate_schedule = {"0": 1e-5, "5": 1e-4, "80": 1e-5, "110": 1e-6}
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True,
        "collate_fn": custom_collate_fn
    }

    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False,
        "collate_fn": custom_collate_fn
    }

    training_set = []
    training_generator = []
    training_set.append(
        COCODataset(opt.data_path, "2014", "train", opt.image_size))
    training_set.append(
        COCODataset(opt.data_path, "2014", "val", opt.image_size))
    training_set.append(
        COCODataset(opt.data_path, "2017", "train", opt.image_size))
    training_generator.append(DataLoader(training_set[0], **training_params))
    training_generator.append(DataLoader(training_set[1], **training_params))
    training_generator.append(DataLoader(training_set[2], **training_params))

    test_set = COCODataset(opt.data_path,
                           "2017",
                           "val",
                           opt.image_size,
                           is_training=False)
    test_generator = DataLoader(test_set, **test_params)

    if torch.cuda.is_available():
        if opt.pre_trained_model_type == "model":
            model = torch.load(opt.pre_trained_model_path)
        else:
            model = Yolo(training_set[0].num_classes)
            model.load_state_dict(torch.load(opt.pre_trained_model_path))
    else:
        if opt.pre_trained_model_type == "model":
            model = torch.load(opt.pre_trained_model_path,
                               map_location=lambda storage, loc: storage)
        else:
            model = Yolo(training_set[0].num_classes)
            model.load_state_dict(
                torch.load(opt.pre_trained_model_path,
                           map_location=lambda storage, loc: storage))
    # The following line will re-initialize the weights of the last layer, which is useful
    # when you want to retrain the model based on my trained weights. If you uncomment it,
    # you will see the loss is already very small at the beginning.
    nn.init.normal_(list(model.modules())[-1].weight, 0, 0.01)
    log_path = os.path.join(opt.log_path, "{}".format("2014and2017"))
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    writer = SummaryWriter(log_path)
    if torch.cuda.is_available():
        writer.add_graph(
            model, torch.rand(opt.batch_size, 3, opt.image_size,
                              opt.image_size))
        model.cuda()
    else:
        writer.add_graph(
            model, torch.rand(opt.batch_size, 3, opt.image_size,
                              opt.image_size))
    criterion = YoloLoss(training_set[0].num_classes, model.anchors,
                         opt.reduction)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=1e-5,
                                momentum=opt.momentum,
                                weight_decay=opt.decay)
    best_loss = 1e10
    best_epoch = 0
    model.train()
    num_iter_per_epoch = 0
    for generator in training_generator:
        num_iter_per_epoch += len(generator)
    for epoch in range(opt.num_epoches):
        if str(epoch) in learning_rate_schedule.keys():
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate_schedule[str(epoch)]
        for generator in training_generator:
            for iter, batch in enumerate(generator):
                image, label = batch
                if torch.cuda.is_available():
                    image = Variable(image.cuda(), requires_grad=True)
                else:
                    image = Variable(image, requires_grad=True)
                optimizer.zero_grad()
                logits = model(image)
                loss, loss_coord, loss_conf, loss_cls = criterion(
                    logits, label)
                loss.backward()
                optimizer.step()
                print(
                    "Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss:{:.2f} (Coord:{:.2f} Conf:{:.2f} Cls:{:.2f})"
                    .format(epoch + 1, opt.num_epoches, iter + 1,
                            num_iter_per_epoch,
                            optimizer.param_groups[0]['lr'], loss, loss_coord,
                            loss_conf, loss_cls))
                writer.add_scalar('Train/Total_loss', loss,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Coordination_loss', loss_coord,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Confidence_loss', loss_conf,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Class_loss', loss_cls,
                                  epoch * num_iter_per_epoch + iter)
        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            loss_coord_ls = []
            loss_conf_ls = []
            loss_cls_ls = []
            for te_iter, te_batch in enumerate(test_generator):
                te_image, te_label = te_batch
                num_sample = len(te_label)
                if torch.cuda.is_available():
                    te_image = te_image.cuda()
                with torch.no_grad():
                    te_logits = model(te_image)
                    batch_loss, batch_loss_coord, batch_loss_conf, batch_loss_cls = criterion(
                        te_logits, te_label)
                loss_ls.append(batch_loss * num_sample)
                loss_coord_ls.append(batch_loss_coord * num_sample)
                loss_conf_ls.append(batch_loss_conf * num_sample)
                loss_cls_ls.append(batch_loss_cls * num_sample)
            te_loss = sum(loss_ls) / test_set.__len__()
            te_coord_loss = sum(loss_coord_ls) / test_set.__len__()
            te_conf_loss = sum(loss_conf_ls) / test_set.__len__()
            te_cls_loss = sum(loss_cls_ls) / test_set.__len__()
            print(
                "Epoch: {}/{}, Lr: {}, Loss:{:.2f} (Coord:{:.2f} Conf:{:.2f} Cls:{:.2f})"
                .format(epoch + 1, opt.num_epoches,
                        optimizer.param_groups[0]['lr'], te_loss,
                        te_coord_loss, te_conf_loss, te_cls_loss))
            writer.add_scalar('Test/Total_loss', te_loss, epoch)
            writer.add_scalar('Test/Coordination_loss', te_coord_loss, epoch)
            writer.add_scalar('Test/Confidence_loss', te_conf_loss, epoch)
            writer.add_scalar('Test/Class_loss', te_cls_loss, epoch)
            model.train()
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                # torch.save(model, opt.saved_path + os.sep + "trained_yolo_coco")
                torch.save(
                    model.state_dict(),
                    opt.saved_path + os.sep + "only_params_trained_yolo_coco")
                torch.save(
                    model,
                    opt.saved_path + os.sep + "whole_model_trained_yolo_coco")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    "Stop training at epoch {}. The lowest loss achieved is {}"
                    .format(epoch, te_loss))
                break
    # writer.export_scalars_to_json(log_path + os.sep + "all_logs.json")
    writer.close()
Example No. 9
def main():
    if args.arch == "resnet50":
        backbone = resnet50()
    elif args.arch == "resnet18":
        backbone = resnet18()
    else:
        raise NotImplementedError
    

    net = RetinaFace(backbone, pretrained_model_path=args.pretrained)
    if torch.cuda.is_available():
        if args.cuda:
            # torch.set_default_tensor_type('torch.cuda.FloatTensor')
            if args.num_workers>1:
                net = torch.nn.DataParallel(net)  # must come after loading the model weights
            else:
                # raise NotImplementedError
                pass
            net.cuda()
            # net.to(device)
            cudnn.benchmark = True

    if args.use_tensorboard:
        from tensorboardX import SummaryWriter
        if not osp.exists(args.log_dir):
            os.mkdir(args.log_dir)
        if args.log_dir:
            if not osp.exists(args.log_dir):
                os.mkdir(args.log_dir)
        train_writer = SummaryWriter(log_dir="{}".format(args.log_dir), comment=args.arch)
        
        dummy_input = torch.rand(1, 3, 640, 640).cuda()
        train_writer.add_graph(backbone, (dummy_input, ))

    train_dataset = WiderFaceDetection(root_path=args.root, data_path=args.dataset_root, phase="train", 
                                       dataset_name="WiderFace", transform=None)
    train_loader = data.DataLoader(train_dataset, args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=detection_collate)

    print("sucess train_loader")

    anchors = Anchor_Box()
    with torch.no_grad():
        anchors = anchors.forward()
        anchors = anchors.cuda()
    print("anchors ready")
    
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    
    start_epoch = 0
    end_epoch = args.max_epoch
    criterion = MultiTaskLoss()
    for epoch in range(start_epoch + 1, end_epoch + 1):
        lr = adjust_learning_rate(optimizer=optimizer, 
                                  epoch=epoch, 
                                  step_epoch=[55, 68, 80], 
                                  gamma=0.1, 
                                  base_lr=args.lr,  # 0.001
                                  warm_up_end_lr=0.01, 
                                  warmup_epoch=5
                                 )
        print("Epoch[{}]  lr: {}".format(epoch, lr))
        if args.use_tensorboard:
            train_writer.add_scalar('learning_rate', lr, epoch)
            # train
            train_net(train_loader, net, criterion, optimizer, epoch, anchors, train_writer=train_writer)
        else:
            train_net(train_loader, net, criterion, optimizer, epoch, anchors)
        
        if epoch % 5 == 0:
            pass # TODO

        if (epoch == end_epoch) or (epoch % 5 == 0):
            torch.save(net.state_dict(), "/home/shanma/Workspace/zhubin/github_file/RetinaFace-pytorch/weights/retinaface_epoch{}_{}.pth".format(epoch, get_cur_time()))
            # torch.save(net.state_dict(), "/home/dc2-user/zhubin/RetinaFace-pytorch/weights/retinaface_epoch{}_{}.pth".format(epoch, get_cur_time()))

    #     if (epoch >= 50 and epoch % 10 == 0):
    #         eval_net(
    #             val_dataset,
    #             val_loader,
    #             net,
    #             detector,
    #             cfg,
    #             ValTransform,
    #             top_k,
    #             thresh=thresh,
    #             batch_size=batch_size)

    # save_checkpoint(net, end_epoch, size, optimizer)

    if args.use_tensorboard:
        train_writer.close()
Example No. 10
def main(args):
    r"""Performs the main training loop
	"""
    # Load dataset
    print('> Loading dataset ...')
    dataset_train = Dataset(train=True, gray_mode=args.gray, shuffle=True)
    dataset_val = Dataset(train=False, gray_mode=args.gray, shuffle=False)
    loader_train = DataLoader(dataset=dataset_train, num_workers=6, \
             batch_size=args.batch_size, shuffle=True)
    print("\t# of training samples: %d\n" % int(len(dataset_train)))

    # Init loggers
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    writer = SummaryWriter(args.log_dir)
    logger = init_logger(args)

    # Create model
    if not args.gray:
        in_ch = 3
    else:
        in_ch = 1
    net = FFDNet(num_input_channels=in_ch)
    # Initialize model with He init
    net.apply(weights_init_kaiming)
    # Define loss
    criterion = nn.MSELoss(size_average=False)

    # Move to GPU
    device_ids = [0]
    model = nn.DataParallel(net, device_ids=device_ids).cuda()
    criterion.cuda()

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Resume training or start anew
    if args.resume_training:
        resumef = os.path.join(args.log_dir, 'ckpt.pth')
        if os.path.isfile(resumef):
            checkpoint = torch.load(resumef)
            print("> Resuming previous training")
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            new_epoch = args.epochs
            new_milestone = args.milestone
            current_lr = args.lr
            args = checkpoint['args']
            training_params = checkpoint['training_params']
            start_epoch = training_params['start_epoch']
            args.epochs = new_epoch
            args.milestone = new_milestone
            args.lr = current_lr
            print("=> loaded checkpoint '{}' (epoch {})"\
               .format(resumef, start_epoch))
            print("=> loaded parameters :")
            print("==> checkpoint['optimizer']['param_groups']")
            print("\t{}".format(checkpoint['optimizer']['param_groups']))
            print("==> checkpoint['training_params']")
            for k in checkpoint['training_params']:
                print("\t{}, {}".format(k, checkpoint['training_params'][k]))
            argpri = vars(checkpoint['args'])
            print("==> checkpoint['args']")
            for k in argpri:
                print("\t{}, {}".format(k, argpri[k]))

            args.resume_training = False
        else:
            raise Exception("Couldn't resume training with checkpoint {}".\
                format(resumef))
    else:
        start_epoch = 0
        training_params = {}
        training_params['step'] = 0
        training_params['current_lr'] = 0
        training_params['no_orthog'] = args.no_orthog

    # Training
    for epoch in range(start_epoch, args.epochs):
        # Learning rate value scheduling according to args.milestone
        if epoch > args.milestone[1]:
            current_lr = args.lr / 1000.
            training_params['no_orthog'] = True
        elif epoch > args.milestone[0]:
            current_lr = args.lr / 10.
        else:
            current_lr = args.lr

        # set learning rate in optimizer
        for param_group in optimizer.param_groups:
            param_group["lr"] = current_lr
        print('learning rate %f' % current_lr)

        # train
        for i, data in enumerate(loader_train, 0):
            # Pre-training step
            model.train()
            model.zero_grad()
            optimizer.zero_grad()

            # inputs: noise and noisy image
            img_train = data
            noise = torch.zeros(img_train.size())
            stdn = np.random.uniform(args.noiseIntL[0], args.noiseIntL[1], \
                size=noise.size()[0])
            for nx in range(noise.size()[0]):
                sizen = noise[0, :, :, :].size()
                noise[nx, :, :, :] = torch.FloatTensor(sizen).\
                     normal_(mean=0, std=stdn[nx])
            imgn_train = img_train + noise
            # Create input Variables
            img_train = Variable(img_train.cuda())
            imgn_train = Variable(imgn_train.cuda())
            noise = Variable(noise.cuda())
            stdn_var = Variable(torch.cuda.FloatTensor(stdn))

            # Evaluate model and optimize it
            out_train = model(imgn_train, stdn_var)
            loss = criterion(out_train, noise) / (imgn_train.size()[0] * 2)
            loss.backward()
            optimizer.step()

            # Results
            model.eval()
            out_train = torch.clamp(imgn_train - model(imgn_train, stdn_var),
                                    0., 1.)
            psnr_train = batch_psnr(out_train, img_train, 1.)
            # PyTorch v0.4.0: loss.data[0] --> loss.item()

            if training_params['step'] % args.save_every == 0:
                # Apply regularization by orthogonalizing filters
                if not training_params['no_orthog']:
                    model.apply(svd_orthogonalization)

                # Log the scalar values
                writer.add_scalar('loss', loss.item(), training_params['step'])
                writer.add_scalar('PSNR on training data', psnr_train, \
                   training_params['step'])
                print("[epoch %d][%d/%d] loss: %.4f PSNR_train: %.4f" %\
                 (epoch+1, i+1, len(loader_train), loss.item(), psnr_train))
            training_params['step'] += 1
        # The end of each epoch
        model.eval()

        # Validation
        psnr_val = 0
        for valimg in dataset_val:
            img_val = torch.unsqueeze(valimg, 0)
            noise = torch.FloatTensor(img_val.size()).\
              normal_(mean=0, std=args.val_noiseL)
            imgn_val = img_val + noise
            img_val, imgn_val = Variable(img_val.cuda()), Variable(
                imgn_val.cuda())
            sigma_noise = Variable(torch.cuda.FloatTensor([args.val_noiseL]))
            out_val = torch.clamp(imgn_val - model(imgn_val, sigma_noise), 0.,
                                  1.)
            psnr_val += batch_psnr(out_val, img_val, 1.)
        psnr_val /= len(dataset_val)
        print("\n[epoch %d] PSNR_val: %.4f" % (epoch + 1, psnr_val))
        writer.add_scalar('PSNR on validation data', psnr_val, epoch)
        writer.add_scalar('Learning rate', current_lr, epoch)

        # Log val images
        try:
            if epoch == 0:
                # Log graph of the model
                writer.add_graph(
                    model,
                    (imgn_val, sigma_noise),
                )
                # Log validation images
                for idx in range(2):
                    imclean = utils.make_grid(img_val.data[idx].clamp(0., 1.), \
                          nrow=2, normalize=False, scale_each=False)
                    imnsy = utils.make_grid(imgn_val.data[idx].clamp(0., 1.), \
                          nrow=2, normalize=False, scale_each=False)
                    writer.add_image('Clean validation image {}'.format(idx),
                                     imclean, epoch)
                    writer.add_image('Noisy validation image {}'.format(idx),
                                     imnsy, epoch)
            for idx in range(2):
                imrecons = utils.make_grid(out_val.data[idx].clamp(0., 1.), \
                      nrow=2, normalize=False, scale_each=False)
                writer.add_image('Reconstructed validation image {}'.format(idx), \
                    imrecons, epoch)
            # Log training images
            imclean = utils.make_grid(img_train.data, nrow=8, normalize=True, \
                scale_each=True)
            writer.add_image('Training patches', imclean, epoch)

        except Exception as e:
            logger.error("Couldn't log results: {}".format(e))

        # save model and checkpoint
        training_params['start_epoch'] = epoch + 1
        torch.save(model.state_dict(), os.path.join(args.log_dir, 'net.pth'))
        save_dict = { \
         'state_dict': model.state_dict(), \
         'optimizer' : optimizer.state_dict(), \
         'training_params': training_params, \
         'args': args\
         }
        torch.save(save_dict, os.path.join(args.log_dir, 'ckpt.pth'))
        if epoch % args.save_every_epochs == 0:
            torch.save(save_dict, os.path.join(args.log_dir, \
                    'ckpt_e{}.pth'.format(epoch+1)))
        del save_dict
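The loop above applies a helper called svd_orthogonalization through model.apply() without showing it. Below is a minimal sketch of what such a hook could look like, assuming it projects every Conv2d kernel matrix onto the nearest orthogonal matrix by forcing its singular values to 1; the actual utility used by this project may differ.

import torch
import torch.nn as nn

def svd_orthogonalization(layer):
    # Hypothetical sketch: flatten each Conv2d weight into an
    # (out_channels, in_channels*kh*kw) matrix, take its SVD and rebuild it
    # as U @ V^T, i.e. with unit singular values, which orthogonalizes the filters.
    if isinstance(layer, nn.Conv2d):
        weight = layer.weight.data
        c_out, c_in, kh, kw = weight.shape
        mat = weight.view(c_out, -1)
        u, _, v = torch.svd(mat)
        layer.weight.data = (u @ v.t()).view(c_out, c_in, kh, kw)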
Exemplo n.º 11
0
        num_workers=0
    )
    return dataloader
from torchnet.meter import AverageValueMeter

if __name__ == '__main__':
    dataloader = getdataloader(mode=True)
    model = LinearModel()
    model.cuda()
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("trainable parameters ",trainable_num)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # model.parameters() automatically handles parameter initialization
    summary = SummaryWriter('log')
    x = torch.rand(size=(17,IN_FEATURE)).cuda()
    summary.add_graph(model,x)
    loss_meter = AverageValueMeter()
    # training cycle forward, backward, update
    for i in range(EPOCH):
        myloss = 0
        iter = 0
        for epoch, (x_data, y_data) in enumerate(dataloader):
            y_pred = model(x_data)  # forward:predict
            loss = criterion(y_pred, y_data)  # forward: loss
            myloss += loss.item()
            loss_meter.add(loss.item())
            # print(myloss)
            optimizer.zero_grad()  # gradients computed by .backward() are accumulated, so zero them before each backward pass
            loss.backward()  # backward: autograd
            optimizer.step()  # update the parameters, i.e. update the values of w and b
            iter = epoch
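The fragment above is cut off from the definitions of getdataloader, LinearModel, IN_FEATURE and EPOCH. A minimal sketch of the missing pieces, purely as assumptions so the loop can be read in context (sizes and names are illustrative, not the original code):

import torch

IN_FEATURE = 8    # assumed input dimensionality
EPOCH = 100       # assumed number of training epochs

class LinearModel(torch.nn.Module):
    # A single linear layer mapping IN_FEATURE inputs to one output,
    # matching the MSE regression setup in the loop above.
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(IN_FEATURE, 1)

    def forward(self, x):
        return self.linear(x)

getdataloader(mode=True) is then assumed to return a DataLoader yielding (x, y) float batches of matching shapes.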
Exemplo n.º 12
0
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES,
        gamma=0.2)  #learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                   settings.TIME_NOW)

    #use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    writer = SummaryWriter(
        logdir=os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW))
    input_tensor = torch.Tensor(12, 3, 32, 32).cuda()
    writer.add_graph(net, Variable(input_tensor, requires_grad=True))

    #create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, settings.EPOCH):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train(epoch)
        acc = eval_training(epoch)

        # start to save the best-performing model after the learning rate decays to 0.01
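WarmUpLR above drives a per-iteration warm-up during the first args.warm epochs, but its definition is not included. A sketch of a linear warm-up scheduler consistent with that usage (an assumption about the referenced class, not its verbatim source):

from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    # Linearly scale each base learning rate from ~0 up to its full value
    # over `total_iters` optimizer steps.
    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]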
Exemplo n.º 13
0
print(model)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.001)  # optimize all cnn parameters
writer = SummaryWriter(comment='Action_Net')

for epoch in range(5):
    for step, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()  # clear last grad
        inputs = torch.tensor(inputs, dtype=torch.float32)
        out = model(inputs)
        loss = criterion(out, labels)  # calculate loss
        loss.backward()  # backpropagate to compute gradients
        optimizer.step()  # update the network parameters
        writer.add_graph(model, inputs)
        writer.add_scalar('Loss', loss, epoch * 100 + step)
        if step % 100 == 0:
            for i, (test_data, test_label) in enumerate(test_loader):
                test_data = torch.tensor(test_data, dtype=torch.float32)
                test_output = model(test_data)
                pred_y = torch.max(test_output, 1)[1].data.numpy()
                accuracy = float(
                    (pred_y
                     == test_label.data.numpy()).astype(int).sum()) / float(
                         test_label.size(0))
                writer.add_scalar('Accuracy', accuracy, epoch * 100 + step)
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy(),
                  '| test accuracy: %.2f' % accuracy)
writer.close()
torch.save(model, 'model/net.pkl')  # save the entire network structure and parameters
Exemplo n.º 14
0
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=args.bs,
                                    shuffle=False,
                                    num_workers=args.workers,
                                    pin_memory=True)

        if args.ckpt:
            pass
        else:
            # save graph and clips_order samples
            for data in train_dataloader:
                clips, idxs = data
                writer.add_video('train/clips', clips, 0, fps=8)
                writer.add_text('train/idxs', str(idxs.tolist()), 0)
                clips = clips.to(device)
                writer.add_graph(model, clips)
                break
            # save init params at step 0
            for name, param in model.named_parameters():
                writer.add_histogram('params/{}'.format(name), param, 0)

        ### loss function, optimizer and scheduler ###
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.wd)
        #scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5, patience=50, factor=0.1)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=[150],
                                                   gamma=0.1)
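As a side note on the add_video call above: tensorboardX expects video tensors shaped (N, T, C, H, W), with float values in [0, 1], so the clips batch presumably already arrives in that layout. A tiny illustrative example (sizes are assumptions):

import torch

clips = torch.rand(4, 16, 3, 112, 112)  # 4 clips, 16 frames each, 3x112x112 pixels
# writer.add_video('train/clips', clips, 0, fps=8)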
class TrickLearner(object):
    """ Performs vanilla training with tricks:
    1. Label Smooth;
    2. Mixup;
    3. SE-module is deployed in nets/decode models.
  """
    def __init__(self, model, loaders, args, device):
        self.args = args
        self.device = device
        self.model = model
        self.__build_path()
        self.train_loader, self.test_loader = loaders
        self.setup_optim()
        self.criterion = nn.CrossEntropyLoss().cuda()
        if self.args.label_smooth:
            classes = 10 if args.dataset == 'cifar10' else (
                1000 if args.dataset == 'ilsvrc_12' else 100)
            self.criterion_smooth = CrossEntropyLabelSmooth(
                classes, args.label_smooth_eps).cuda()

        if self.check_is_primary():
            self.writer = SummaryWriter(os.path.dirname(self.save_path))
            # self.add_graph()

    def train(self, train_sampler=None):
        for epoch in range(self.args.epochs):
            if self.args.distributed:
                assert train_sampler is not None
                train_sampler.set_epoch(epoch)

            self.model.train()
            if self.check_is_primary():
                logging.info("Training at Epoch: %d" % epoch)
            train_acc, train_loss = self.epoch(True)

            if self.check_is_primary():
                self.writer.add_scalar('train_acc', train_acc, epoch)
                self.writer.add_scalar('train_loss', train_loss, epoch)

            if (epoch + 1) % self.args.eval_epoch == 0:
                # evaluate on every GPU, but only show the results from a single one
                if self.check_is_primary():
                    logging.info("Evaluation at Epoch: %d" % epoch)
                self.evaluate(True, epoch)

                if self.check_is_primary():
                    self.save_model()

    def evaluate(self, is_train=False, epoch=None):
        self.model.eval()
        # NOTE: synchronizing the BN statistics
        if self.args.distributed:
            sync_bn_stat(self.model, self.args.world_size)

        if not is_train:
            self.load_model()

        with torch.no_grad():
            test_acc, test_loss = self.epoch(False)

        if is_train and epoch and self.check_is_primary():
            self.writer.add_scalar('test_acc', test_acc, epoch)
            self.writer.add_scalar('test_loss', test_loss, epoch)
        return test_acc, test_loss

    def finetune(self, train_sampler):
        self.load_model()
        self.evaluate()

        for epoch in range(self.args.epochs):
            if self.args.distributed:
                assert train_sampler is not None
                train_sampler.set_epoch(epoch)

            self.model.train()

            # NOTE: use the preset learning rate for all epochs.
            ft_acc, ft_loss = self.epoch(True)

            if self.check_is_primary():
                self.writer.add_scalar('ft_acc', ft_acc, epoch)
                self.writer.add_scalar('ft_loss', ft_loss, epoch)

            # evaluate every eval_epoch epochs
            if (epoch + 1) % self.args.eval_epoch == 0:
                if self.check_is_primary():
                    logging.info("Evaluation at Epoch: %d" % epoch)
                self.evaluate(True, epoch)

            # save the model
            if self.check_is_primary():
                self.save_model()

    def misc(self):
        raise NotImplementedError(
            "Misc functions are implemented in sub classes")

    def epoch(self, is_train):
        """ Rewrite this function if necessary in the sub-classes. """

        loader = self.train_loader if is_train else self.test_loader

        # setup statistics
        batch_time = AverageMeter('Time', ':3.3f')
        # data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        lrs = AverageMeter('Lr', ':.4e')
        top1 = AverageMeter('Acc@1', ':3.3f')
        top5 = AverageMeter('Acc@5', ':3.3f')
        metrics = [batch_time, lrs, top1, top5, losses]

        loader_len = len(loader)
        progress = ProgressMeter(loader_len,
                                 *metrics,
                                 prefix='Job id: %s, ' % self.args.job_id)
        end = time.time()

        for idx, (X, y) in enumerate(loader):

            # data_time.update(time.time() - end)
            criterion = self.criterion_smooth if is_train and self.args.label_smooth else self.criterion
            X, y = X.to(self.device), y.to(self.device)

            if is_train and self.args.mixup:
                mixed_X, y_a, y_b, lam = self.mixup_data(X, y)
                mixed_yp = self.model(mixed_X)
                loss = (lam * criterion(mixed_yp, y_a) + (1.0 - lam) *
                        criterion(mixed_yp, y_b)) / self.args.world_size
                acc1_a, acc5_a = accuracy(mixed_yp, y_a, topk=(1, 5))
                acc1_b, acc5_b = accuracy(mixed_yp, y_b, topk=(1, 5))
                acc1, acc5 = lam * acc1_a + (
                    1 - lam) * acc1_b, lam * acc5_a + (1 - lam) * acc5_b
            else:
                yp = self.model(X)
                loss = criterion(yp, y) / self.args.world_size
                acc1, acc5 = accuracy(yp, y, topk=(1, 5))

            reduced_loss = loss.data.clone()
            reduced_acc1 = acc1.clone() / self.args.world_size
            reduced_acc5 = acc5.clone() / self.args.world_size

            if self.args.distributed:
                dist.all_reduce(reduced_loss)
                dist.all_reduce(reduced_acc1)
                dist.all_reduce(reduced_acc5)

            if is_train:
                self.opt.zero_grad()
                loss.backward()
                if self.args.distributed:
                    average_gradients(self.model)  # NOTE: important
                self.opt.step()

            if self.lr_scheduler:
                self.lr_scheduler.step()

            # update statistics
            top1.update(reduced_acc1[0].item(), X.shape[0])
            top5.update(reduced_acc5[0].item(), X.shape[0])
            losses.update(reduced_loss.item(), X.shape[0])
            lrs.update(self.lr_scheduler.get_lr()[0])
            batch_time.update(time.time() - end)
            end = time.time()

            # show the training/evaluating statistics
            if self.check_is_primary() and ((idx % self.args.print_freq == 0)
                                            or (idx + 1) % loader_len == 0):
                progress.show(idx)

        return top1.avg, losses.avg

    def setup_optim(self):
        max_iter = len(self.train_loader) * self.args.epochs

        if self.args.model_type.startswith('model_'):
            self.opt = optim.SGD(self.model.parameters(), lr=self.args.lr, \
                momentum=self.args.momentum, nesterov=self.args.nesterov, \
                weight_decay=self.args.weight_decay)
            self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
                self.opt,
                milestones=[int(max_iter * 0.5),
                            int(max_iter * 0.75)])

        elif self.args.model_type.startswith('resnet_'):
            self.opt = optim.SGD(self.model.parameters(), lr=self.args.lr, \
                momentum=self.args.momentum, nesterov=self.args.nesterov, \
                weight_decay=self.args.weight_decay)
            if self.args.lr_decy_type == 'cosine':
                self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
                    self.opt, max_iter, eta_min=0)
            elif self.args.lr_decy_type == 'multi_step':
                self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    self.opt,
                    milestones=[int(max_iter * 0.5),
                                int(max_iter * 0.75)])
            else:
                raise ValueError("Unknown learning rate decay type")

        elif self.args.model_type.startswith('mobilenet_v2'):
            # default on 8-gpu: 250 epochs, 2e-1 lr with cosine, 4e-5 wd, no dropout, warmup to 8e-1, nesterov, no wd for BN and bias
            param_groups = self.get_param_group()
            self.opt = optim.SGD(param_groups, lr=self.args.lr, \
                momentum=self.args.momentum, nesterov=self.args.nesterov, \
                weight_decay=self.args.weight_decay)
            # self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(self.opt, self.args.epochs, eta_min=0)
            warmup_lr = 4 * self.args.lr
            warmup_steps = 1250
            self.lr_scheduler = WarmUpCosineLRScheduler(self.opt,
                                                        max_iter,
                                                        self.args.lr_min,
                                                        self.args.lr,
                                                        warmup_lr,
                                                        warmup_steps,
                                                        last_iter=-1)

        else:
            raise ValueError("Unknown model, failed to initalize optim")

    def add_graph(self):
        # create dummy input
        x = torch.randn(self.args.batch_size, 3, 32, 32)
        with self.writer:
            self.writer.add_graph(self.model, (x, ))

    def __build_path(self):
        if self.args.exec_mode == 'finetune':
            self.load_path = self.args.load_path
            self.save_path = os.path.join(os.path.dirname(self.load_path),
                                          'model_ft.pt')
        elif self.args.exec_mode == 'train':
            self.save_path = os.path.join(
                self.args.save_path,
                '_'.join([self.args.model_type,
                          self.args.learner]), self.args.job_id, 'model.pt')
            self.load_path = self.save_path
        else:
            self.load_path = self.args.load_path
            self.save_path = self.load_path

    def check_is_primary(self):
        if (self.args.distributed and self.args.rank == 0) or \
            not self.args.distributed:
            return True
        else:
            return False

    def save_model(self):
        state = {'state_dict': self.model.state_dict(), \
            'optimizer': self.opt.state_dict()}
        torch.save(state, self.save_path)
        logging.info("Model stored at: " + self.save_path)

    def load_model(self):
        if self.args.distributed:
            # load parameters onto each GPU separately
            loc = 'cuda:{}'.format(torch.cuda.current_device())
            checkpoint = torch.load(self.load_path, map_location=loc)
        else:
            checkpoint = torch.load(self.load_path)

        self.model.load_state_dict(checkpoint['state_dict'])
        self.opt.load_state_dict(checkpoint['optimizer'])
        logging.info("Model succesfully restored from %s" % self.load_path)

        if self.args.distributed:
            broadcast_params(self.model)

    def mixup_data(self, X, y):
        batch_size = X.size()[0]
        alpha = self.args.mixup_alpha
        if alpha > 0:
            lam = np.random.beta(alpha, alpha)
        else:
            lam = 1
        index = torch.randperm(batch_size).to(self.device)
        mixed_X = lam * X + (1 - lam) * X[index, :]
        y_a, y_b = y, y[index]
        return mixed_X, y_a, y_b, lam

    def get_param_group(self):
        param_group_no_wd = []
        names_no_wd = []
        param_group_normal = []

        for name, m in self.model.named_modules():
            if isinstance(m, nn.Conv2d):
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')
            elif isinstance(m, nn.Linear):
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')
            elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
                if m.weight is not None:
                    param_group_no_wd.append(m.weight)
                    names_no_wd.append(name + '.weight')
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')

        for name, p in self.model.named_parameters():
            if name not in names_no_wd:
                param_group_normal.append(p)
        return [{
            'params': param_group_normal
        }, {
            'params': param_group_no_wd,
            'weight_decay': 0.0
        }]
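The TrickLearner docstring mentions label smoothing via CrossEntropyLabelSmooth, which is imported from elsewhere in the project. A minimal sketch of such a criterion, assuming the usual formulation that mixes the one-hot target with a uniform distribution:

import torch
import torch.nn as nn

class CrossEntropyLabelSmooth(nn.Module):
    # Cross entropy against a smoothed target: (1 - eps) on the true class,
    # eps / num_classes spread uniformly over all classes.
    def __init__(self, num_classes, epsilon=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, logits, targets):
        log_probs = self.log_softmax(logits)
        one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        smoothed = (1 - self.epsilon) * one_hot + self.epsilon / self.num_classes
        return (-smoothed * log_probs).sum(dim=1).mean()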
Exemplo n.º 16
0
class model:
    def __init__(self,
                 package_name,
                 model_name,
                 description='',
                 model_path=None,
                 args=None):

        self.package_name = __import__(package_name)
        self.model_name = model_name
        self.model_path = model_path
        self.model = None
        self.writer = None
        self.loss = ''
        self.optimizer_name = ''
        self.accuracy_name = ''
        self.checkpoint_name = ''
        self.optimizer = None
        self.accuracy = None
        self.criterion = None
        self.epoch = 0
        self.description = description
        self.create_model(args=args)
        # self.create_writer()

    def create_model(self, args):
        if self.model_path is not None:
            print("=> Loading checkpoint '{}'".format(self.model_path))
            self.load_checkpoint(self.model_path, args=args)
        else:
            print("=> Creating new model")
            self.model = getattr(self.package_name, self.model_name)(args)
        if torch.cuda.is_available():
            print("Using GPU")
            # self.model = nn.DataParallel(self.model,  device_ids=[0])
            self.model = self.model.cuda()

    def create_writer(self, checkpoint_name=''):
        if checkpoint_name == '':
            self.checkpoint_name = datetime.datetime.now().strftime(
                '%b%d_%H-%M') + '_' + self.model_name + '_' + self.description
        else:
            self.checkpoint_name = checkpoint_name
        writer_name = '{checkpoint_name}_{optimizer}_{loss_name}.pth.tar'\
            .format(checkpoint_name=self.checkpoint_name, optimizer=self.optimizer_name, loss_name=self.loss)
        writer_dir = os.path.join('runs', writer_name)
        self.writer = SummaryWriter(log_dir=writer_dir)

    def fit(self,
            loss='MSELoss',
            optimizer_name='Adam',
            lr=0.01,
            weight_decay=0,
            accuracy_name='',
            create_writer=True):
        self.loss = loss
        self.optimizer_name = optimizer_name
        self.accuracy_name = accuracy_name
        self.criterion = getattr(nn, loss)()
        # self.criterion = getattr(self.package_name, loss)()
        if optimizer_name == 'SGD':
            self.optimizer = getattr(optim,
                                     optimizer_name)(self.model.parameters(),
                                                     lr=lr,
                                                     momentum=0.9)
        else:
            self.optimizer = getattr(optim,
                                     optimizer_name)(self.model.parameters(),
                                                     lr=lr)
        if accuracy_name == 'argmax':
            self.accuracy = argmax
        elif accuracy_name == 'count_success':
            self.accuracy = count_success
        elif accuracy_name != '':
            self.accuracy = getattr(nn, accuracy_name)()
            # self.accuracy = getattr(self.package_name, accuracy_name)()
        if create_writer:
            self.create_writer(self.checkpoint_name)

    def save_checkpoint(self, save_dir='', epochs_per_save=1):
        # TODO: add is_best to save, to save the best model in the training
        filename = '{checkpoint_name}_{epoch_num}_{optimizer}_{loss_name}.pth.tar'\
            .format(checkpoint_name=self.checkpoint_name, epoch_num=self.epoch,
                    optimizer=self.optimizer_name, loss_name=self.loss)
        torch.save(
            {
                'epoch': self.epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': self.loss,
                'optimizer_name': self.optimizer_name,
                'accuracy_name': self.accuracy_name,
                'checkpoint_name': self.checkpoint_name
            }, os.path.join(save_dir, filename))

        print("Saved checkpoint as: {}".format(os.path.join(
            save_dir, filename)))

        # removing the old checkpoint:
        # TODO: need to check if the remove is working
        old_filename = '{checkpoint_name}_{epoch_num}_{optimizer}_{loss_name}.pth.tar'\
            .format(checkpoint_name=self.checkpoint_name, epoch_num=self.epoch-epochs_per_save,
                    optimizer=self.optimizer_name, loss_name=self.loss)
        #TODO check it
        # if os.path.exists(os.path.join(save_dir, old_filename)):
        #     os.remove(os.path.join(save_dir, old_filename))

    def load_checkpoint(self, filename, args):
        """
        Loads a checkpoint (that was saved with save_checkpoint).
        No need to call .fit afterwards.
        :param filename: path to the checkpoint
        :return:
        """
        self.model = getattr(self.package_name, self.model_name)(args)

        checkpoint = torch.load(filename)

        self.model.load_state_dict(checkpoint['model_state_dict'])

        self.epoch = checkpoint['epoch']
        self.loss = checkpoint['loss']
        self.optimizer_name = checkpoint['optimizer_name']
        self.accuracy_name = checkpoint['accuracy_name']
        # self.checkpoint_name = None
        self.checkpoint_name = checkpoint['checkpoint_name']

        self.fit(self.loss,
                 self.optimizer_name,
                 accuracy_name=self.accuracy_name)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        print("Loaded checkpoint as: {}".format(filename))

    def print_summary(self, input_size=(1, 32, 128, 128)):
        summ = summary(self.model, input_size=input_size)
        # self.writer.add_text('Summary', summ)

    def print_graph(self, dummy_input):
        # dummy_input = Variable(torch.rand(1, 1, 32, 64, 64))
        if self.writer is not None:
            self.writer.add_graph(model=self.model,
                                  input_to_model=(dummy_input, ))

    def print_epoch_statistics(self,
                               epoch,
                               epoch_time,
                               running_loss,
                               running_accuracy,
                               validation_accuracy=None):
        """
        :param epoch: number of epoch this results from
        :param running_loss: array of all the losses in this epoch
        :param running_accuracy: array of all the training accuracies in this epoch
        :param validation_accuracy: array of all the validation accuracies in this epoch
        :return: prints the results to stdout and logs them to tensorboard if a writer is defined
        """
        if validation_accuracy is None:
            print(
                "End of epoch {:3d} in {:3d} sec | Training loss = {:5.4f} | Training acc = {:5.4f}"
                .format(epoch, int(epoch_time), np.mean(running_loss),
                        np.mean(running_accuracy)))
        else:
            print(
                "End of epoch {:3d} in {:3d} sec | Training loss = {:5.4f} | Training acc = {:5.4f} | Valid acc =  {:5.4f}"
                .format(epoch, int(epoch_time), np.mean(running_loss),
                        np.mean(running_accuracy),
                        np.mean(validation_accuracy)))
        if self.writer is not None:
            self.writer.add_scalar('Train/Loss', float(np.mean(running_loss)),
                                   epoch)
            self.writer.add_scalar('Train/accuracy',
                                   float(np.mean(running_accuracy)), epoch)
            if validation_accuracy is not None:
                self.writer.add_scalar('Validation/accuracy',
                                       float(np.mean(validation_accuracy)),
                                       epoch)

    def add_images_tensorboard(self, inputs, labels,
                               outputs):  # TODO: check this function
        """

        :param inputs: the net input, a 5 dim tensor shape: [batch, channels, z, x, y]
        :param labels: the ground truth, a 5 dim tensor shape: [batch, channels, z, x, y]
        :param outputs: the net output, a 5 dim tensor shape: [batch, channels, z, x, y]
        :return: add images to tensorboard
        """

    def test_validation(self, validationloader=None):
        validation_accuracy = None
        if validationloader is not None:
            self.model.eval()  # changing to eval mode
            valid_running_accuracy = []
            with torch.no_grad():
                for k, sample in enumerate(validationloader, 0):
                    if isinstance(sample, dict):
                        if self.model_name == 'LSTMClassifaierAndDenoise':
                            valid_mfcc = sample['mfcc'].reshape(-1, 1, 39)
                            valid_stft = sample['stft'].reshape(-1, 1, 257)
                            valid_labels = sample['ground_truth'].reshape(
                                -1, 257)
                        else:
                            valid_mfcc = sample['mfcc'].reshape(-1, 1, 351)
                            valid_stft = sample['stft'].reshape(-1, 1, 2313)
                            valid_labels = sample['ground_truth'].reshape(
                                -1, 1, 257)
                    else:
                        valid_mfcc, valid_stft, valid_labels = sample

                    # wrap them in Variable
                    if torch.cuda.is_available():
                        valid_mfcc,valid_stft, valid_labels = Variable(valid_mfcc.cuda()).float(),\
                                                              Variable(valid_stft.cuda()).float(),\
                                                              Variable(valid_labels.cuda()).float()
                    else:
                        valid_mfcc,valid_stft, valid_labels = Variable(valid_mfcc), Variable(valid_stft), \
                                                     Variable(valid_labels)

                    valid_outputs = self.model(valid_stft, valid_mfcc).cuda()
                    acc = self.accuracy(valid_outputs.cpu().data,
                                        valid_labels.cpu().data)
                    valid_running_accuracy.append(acc)
            validation_accuracy = valid_running_accuracy
            self.model.train()  # back to train mode
        return validation_accuracy

    def train(self,
              num_epochs,
              trainloader,
              valloader=None,
              epochs_per_save=10):
        print("Start training")
        start_train_time = time.time()
        for epoch in range(
                self.epoch, num_epochs
        ):  # loop over the dataset multiple times; after loading a checkpoint, training resumes from the saved epoch
            start_epoch_time = time.time()
            self.epoch = epoch
            running_loss = []
            running_accuracy = []
            for i, sample in enumerate(trainloader, 0):
                # print(i)
                if isinstance(sample, dict):
                    if self.model_name == 'LSTMClassifaierAndDenoise':
                        mfcc = sample['mfcc'].reshape(-1, 1, 39)
                        stft = sample['stft'].reshape(-1, 1, 257)
                        labels = sample['ground_truth'].reshape(-1, 257)
                    else:
                        # reshape because we entered batch as one sample
                        mfcc = sample['mfcc'].reshape(-1, 1, 351)
                        stft = sample['stft'].reshape(-1, 1, 2313)
                        labels = sample['ground_truth'].reshape(-1, 1, 257)
                else:
                    inputs, labels = sample

                # wrap them in Variable
                if torch.cuda.is_available():
                    mfcc, stft, labels = Variable(
                        mfcc.cuda()).float(), Variable(
                            stft.cuda()).float(), Variable(
                                labels.cuda()).float()
                else:
                    mfcc, stft, labels = Variable(mfcc), Variable(
                        stft), Variable(labels)

                # forward + backward + optimize
                outputs = self.model(stft, mfcc).cuda()
                loss = self.criterion(outputs, labels)
                # zero the parameter gradients
                self.optimizer.zero_grad()
                loss.backward()
                # if i >0:
                #     loss.backward()
                # else:
                #     loss.backward(retain_graph=True)
                self.optimizer.step()

                # for loss per epoch
                running_loss.append(loss.item())
                if self.accuracy is not None:
                    # for accuracy per epoch
                    running_accuracy.append(
                        self.accuracy(outputs.cpu().data,
                                      labels.cpu().data))
                    if i % 10 == 0:
                        print('tmp accuracy {} in i = {} in epoch {}'.format(
                            np.mean(running_accuracy), i, epoch))
            validation_accuracy = self.test_validation(valloader)
            self.print_epoch_statistics(
                epoch,
                int(time.time() - start_epoch_time),
                running_loss,
                running_accuracy,
                validation_accuracy,
            )
            if epoch % epochs_per_save == 0:
                self.save_checkpoint('saved_models', epochs_per_save)
                # self.add_images_tensorboard(inputs, labels, outputs)
        self.save_checkpoint('saved_models')
        print('=' * 89)
        print("Finish Training, {} epochs in {} seconds".format(
            num_epochs, int(time.time() - start_train_time)))
        print('=' * 89)
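fit() above maps accuracy_name == 'argmax' to an argmax helper that is not part of the snippet. A hedged sketch of what such a helper could look like (the project's real implementation may differ):

import torch

def argmax(outputs, labels):
    # Fraction of samples whose highest-scoring class matches the label.
    preds = torch.argmax(outputs, dim=1)
    return (preds == labels).float().mean().item()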
Exemplo n.º 17
0
class SummaryWorker(multiprocessing.Process):
    def __init__(self, env):
        super(SummaryWorker, self).__init__()
        self.env = env
        self.config = env.config
        self.queue = multiprocessing.Queue()
        try:
            self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar'))
        except configparser.NoOptionError:
            self.timer_scalar = lambda: False
        try:
            self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image'))
        except configparser.NoOptionError:
            self.timer_image = lambda: False
        try:
            self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram'))
        except configparser.NoOptionError:
            self.timer_histogram = lambda: False
        with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f:
            self.histogram_parameters = utils.RegexList([line.rstrip() for line in f])
        self.draw_points = utils.visualize.DrawPoints(env.limbs_index, colors=env.config.get('draw_points', 'colors').split())
        self._draw_points = utils.visualize.DrawPoints(env.limbs_index, thickness=1)
        self.draw_bbox = utils.visualize.DrawBBox()
        self.draw_feature = utils.visualize.DrawFeature()
        self.draw_cluster = utils.visualize.DrawCluster()

    def __call__(self, name, **kwargs):
        if getattr(self, 'timer_' + name)():
            kwargs = getattr(self, 'copy_' + name)(**kwargs)
            self.queue.put((name, kwargs))

    def stop(self):
        self.queue.put((None, {}))

    def run(self):
        self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run))
        try:
            height, width = tuple(map(int, self.config.get('image', 'size').split()))
            tensor = torch.randn(1, 3, height, width)
            step, epoch, dnn, stages = self.env.load()
            inference = model.Inference(self.config, dnn, stages)
            forward = inference.forward
            inference.forward = lambda self, *x: list(forward(self, *x)[-1].values())
            self.writer.add_graph(inference, (tensor,))
        except:
            traceback.print_exc()
        while True:
            name, kwargs = self.queue.get()
            if name is None:
                break
            func = getattr(self, 'summary_' + name)
            try:
                func(**kwargs)
            except:
                traceback.print_exc()

    def copy_scalar(self, **kwargs):
        step, loss_total, losses, losses_hparam = (kwargs[key] for key in 'step, loss_total, losses, losses_hparam'.split(', '))
        loss_total = loss_total.detach().cpu().numpy()
        losses = [{name: l.detach().cpu().numpy() for name, l in loss.items()} for loss in losses]
        losses_hparam = [{name: l.detach().cpu().numpy() for name, l in loss.items()} for loss in losses_hparam]
        return dict(
            step=step,
            loss_total=loss_total,
            losses=losses, losses_hparam=losses_hparam,
        )

    def summary_scalar(self, **kwargs):
        step, loss_total, losses, losses_hparam = (kwargs[key] for key in 'step, loss_total, losses, losses_hparam'.split(', '))
        for i, loss in enumerate(losses):
            for name, l in loss.items():
                self.writer.add_scalar('loss/%s%d' % (name, i), l, step)
        if self.config.getboolean('summary_scalar', 'loss_hparam'):
            self.writer.add_scalars('loss_hparam', {'%s%d' % (name, i): l for i, loss in enumerate(losses_hparam) for name, l in loss.items()}, step)
        self.writer.add_scalar('loss_total', loss_total, step)

    def copy_image(self, **kwargs):
        step, height, width, data, outputs = (kwargs[key] for key in 'step, height, width, data, outputs'.split(', '))
        image, mask, keypoints, yx_min, yx_max, parts, limbs, index = (data[key].clone().cpu().numpy() for key in 'image, mask, keypoints, yx_min, yx_max, parts, limbs, index'.split(', '))
        output = outputs[self.config.getint('summary_image', 'stage')]
        output = {name: output[name].detach().cpu().numpy() for name in self.config.get('summary_image', 'output').split()}
        return dict(
            step=step, height=height, width=width,
            image=image, mask=mask, keypoints=keypoints, yx_min=yx_min, yx_max=yx_max, parts=parts, limbs=limbs, index=index,
            output=output,
        )

    def summary_image(self, **kwargs):
        step, height, width, image, mask, keypoints, yx_min, yx_max, parts, limbs, index, output = (kwargs[key] for key in 'step, height, width, image, mask, keypoints, yx_min, yx_max, parts, limbs, index, output'.split(', '))
        limit = min(self.config.getint('summary_image', 'limit'), image.shape[0])
        image = image[:limit, :, :, :]
        if self.config.getboolean('summary_image', 'estimate'):
            canvas = np.copy(image)
            fn = pybenchmark.profile('output/estimate')(self.draw_clusters)
            canvas = [fn(canvas, parts[:-1], limbs) for canvas, parts, limbs in zip(canvas, *(output[name] for name in 'parts, limbs'.split(', ')))]
            self.writer.add_image('output/estimate', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'data_keypoints'):
            canvas = np.copy(image)
            fn = pybenchmark.profile('data/keypoints')(self.draw_keypoints)
            canvas = [fn(canvas, mask, keypoints, yx_min, yx_max, index) for canvas, mask, keypoints, yx_min, yx_max, index in zip(canvas, mask, keypoints, yx_min, yx_max, index)]
            self.writer.add_image('data/keypoints', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'data_parts'):
            fn = pybenchmark.profile('data/parts')(self.draw_feature)
            for i in range(parts.shape[1]):
                canvas = np.copy(image)
                canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, parts)]
                self.writer.add_image('data/parts%d' % i, torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'data_limbs'):
            fn = pybenchmark.profile('data/limbs')(self.draw_feature)
            for i in range(limbs.shape[1]):
                canvas = np.copy(image)
                canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, limbs)]
                self.writer.add_image('data/limbs%d' % i, torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        for name, feature in output.items():
            fn = pybenchmark.profile('output/' + name)(self.draw_feature)
            for i in range(feature.shape[1]):
                canvas = np.copy(image)
                canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, feature)]
                self.writer.add_image('output/%s%d' % (name, i), torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)

    def draw_keypoints(self, image, mask, keypoints, yx_min, yx_max, index):
        image = utils.visualize.draw_mask(image, mask, 1)
        size = yx_max - yx_min
        target = np.logical_and(*[np.squeeze(a, -1) > 0 for a in np.split(size, size.shape[-1], -1)])
        keypoints, yx_min, yx_max = (a[target] for a in (keypoints, yx_min, yx_max))
        for i, points in enumerate(keypoints):
            if i == index:
                image = self.draw_points(image, points)
            else:
                image = self._draw_points(image, points)
        image = self.draw_bbox(image, yx_min.astype(int), yx_max.astype(int))
        return image

    def draw_clusters(self, image, parts, limbs):
        try:
            interpolation = getattr(cv2, 'INTER_' + self.config.get('estimate', 'interpolation').upper())
            parts, limbs = (np.stack([cv2.resize(feature, image.shape[1::-1], interpolation=interpolation) for feature in a]) for a in (parts, limbs))
        except configparser.NoOptionError:
            pass
        clusters = pyopenpose.estimate(
            parts, limbs,
            self.env.limbs_index,
            self.config.getfloat('nms', 'threshold'),
            self.config.getfloat('integration', 'step'), tuple(map(int, self.config.get('integration', 'step_limits').split())), self.config.getfloat('integration', 'min_score'), self.config.getint('integration', 'min_count'),
            self.config.getfloat('cluster', 'min_score'), self.config.getint('cluster', 'min_count'),
        )
        scale_y, scale_x = np.array(image.shape[1::-1], parts.dtype) / np.array(parts.shape[-2:], parts.dtype)
        for cluster in clusters:
            cluster = [((i1, int(y1 * scale_y), int(x1 * scale_x)), (i2, int(y2 * scale_y), int(x2 * scale_x))) for (i1, y1, x1), (i2, y2, x2) in cluster]
            image = self.draw_cluster(image, cluster)
        return image

    def copy_histogram(self, **kwargs):
        return {
            'step': kwargs['step'],
            'state_dict': self.env.dnn.state_dict(),
        }

    def summary_histogram(self, **kwargs):
        step, state_dict = (kwargs[key] for key in 'step, state_dict'.split(', '))
        for name, var in state_dict.items():
            if self.histogram_parameters(name):
                self.writer.add_histogram(name, var, step)
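A short usage sketch for the SummaryWorker above, assuming an env object that provides the expected config, model_dir and args; the keyword arguments mirror what copy_scalar reads off the queue:

# Hypothetical driver; `env`, `step`, `loss_total`, `losses` and
# `losses_hparam` come from the surrounding training code.
worker = SummaryWorker(env)
worker.start()   # run() opens the SummaryWriter and tries to log the graph
worker('scalar', step=step, loss_total=loss_total,
       losses=losses, losses_hparam=losses_hparam)  # queued only if the scalar timer fires
worker.stop()    # puts the (None, {}) sentinel so run() leaves its loop
worker.join()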
Exemplo n.º 18
0
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True,
        "collate_fn": custom_collate_fn
    }

    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False,
        "collate_fn": custom_collate_fn
    }

    training_set = VOCDataset(opt.data_path, opt.dataset, opt.image_size)
    training_generator = DataLoader(training_set, **training_params)

    test_set = VOCDataset(opt.data_path,
                          opt.dataset,
                          opt.image_size,
                          is_training=False)
    test_generator = DataLoader(test_set, **test_params)

    model = Deeplab(num_classes=training_set.num_classes + 1)
    model.load_state_dict(torch.load(opt.pre_trained_model))
    log_path = os.path.join(opt.log_path, "{}".format(opt.dataset))
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    writer = SummaryWriter(log_path)
    writer.add_graph(
        model, torch.rand(opt.batch_size, 3, opt.image_size, opt.image_size))
    if torch.cuda.is_available():
        model.cuda()

    best_loss = 1e10
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)
    for epoch in range(opt.num_epoches):
        for iter, batch in enumerate(training_generator):
            current_step = epoch * num_iter_per_epoch + iter
            current_lr = update_lr(opt.lr, current_step,
                                   num_iter_per_epoch * opt.num_epoches)
            optimizer = get_optimizer(model, current_lr, opt.momentum,
                                      opt.decay)
            if torch.cuda.is_available():
                batch = [torch.Tensor(record).cuda() for record in batch]
            else:
                batch = [torch.Tensor(record) for record in batch]
            image, gt1, gt2 = batch
            gt1 = gt1.long()
            gt2 = gt2.long()
            optimizer.zero_grad()
            results = model(image)

            mul_losses = multiple_losses(results, [gt1, gt1, gt2, gt1])
            mul_losses[4].backward()
            optimizer.step()
            print(
                "Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})"
                .format(epoch + 1, opt.num_epoches, iter + 1,
                        num_iter_per_epoch, optimizer.param_groups[0]['lr'],
                        mul_losses[4], mul_losses[0], mul_losses[1],
                        mul_losses[2], mul_losses[3]))
            writer.add_scalar('Train/Total_loss', mul_losses[4], current_step)
            writer.add_scalar('Train/1x_scale_loss', mul_losses[0],
                              current_step)
            writer.add_scalar('Train/0.75x_scale_loss', mul_losses[1],
                              current_step)
            writer.add_scalar('Train/0.5x_scale_loss', mul_losses[2],
                              current_step)
            writer.add_scalar('Train/Max_merged_loss', mul_losses[3],
                              current_step)

        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            loss_scale_1_ls = []
            loss_scale_2_ls = []
            loss_scale_3_ls = []
            loss_max_merged_ls = []

            for te_batch in test_generator:
                if torch.cuda.is_available():
                    te_batch = [
                        torch.Tensor(record).cuda() for record in te_batch
                    ]
                else:
                    te_batch = [torch.Tensor(record) for record in te_batch]
                te_image, te_gt1, te_gt2 = te_batch
                te_gt1 = te_gt1.long()
                te_gt2 = te_gt2.long()
                num_sample = len(te_gt1)

                with torch.no_grad():
                    te_results = model(te_image)
                    te_mul_losses = multiple_losses(
                        te_results, [te_gt1, te_gt1, te_gt2, te_gt1])
                loss_ls.append(te_mul_losses[4] * num_sample)
                loss_scale_1_ls.append(te_mul_losses[0] * num_sample)
                loss_scale_2_ls.append(te_mul_losses[1] * num_sample)
                loss_scale_3_ls.append(te_mul_losses[2] * num_sample)
                loss_max_merged_ls.append(te_mul_losses[3] * num_sample)

            te_loss = sum(loss_ls) / len(test_set)
            te_scale_1_loss = sum(loss_scale_1_ls) / len(test_set)
            te_scale_2_loss = sum(loss_scale_2_ls) / len(test_set)
            te_scale_3_loss = sum(loss_scale_3_ls) / len(test_set)
            te_max_merged_loss = sum(loss_max_merged_ls) / len(test_set)

            print(
                "Epoch: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})"
                .format(epoch + 1, opt.num_epoches,
                        optimizer.param_groups[0]['lr'], te_loss,
                        te_scale_1_loss, te_scale_2_loss, te_scale_3_loss,
                        te_max_merged_loss))

            writer.add_scalar('Test/Total_loss', te_loss, epoch)
            writer.add_scalar('Test/1x_scale_loss', te_scale_1_loss, epoch)
            writer.add_scalar('Test/0.75x_scale_loss', te_scale_2_loss, epoch)
            writer.add_scalar('Test/0.5x_scale_loss', te_scale_3_loss, epoch)
            writer.add_scalar('Test/Max_merged_loss', te_max_merged_loss,
                              epoch)

            model.train()
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(
                    model.state_dict(), opt.saved_path + os.sep +
                    "only_params_trained_deeplab_voc")
                torch.save(
                    model, opt.saved_path + os.sep +
                    "whole_model_trained_deeplab_voc")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    "Stop training at epoch {}. The lowest loss achieved is {}"
                    .format(epoch, te_loss))
                break
    writer.close()
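update_lr and get_optimizer above come from the project's own utilities. A sketch of a plausible update_lr, assuming the common DeepLab-style 'poly' decay (an assumption, not the original helper):

def update_lr(base_lr, current_step, total_steps, power=0.9):
    # 'poly' schedule: decay the learning rate towards 0 as training progresses.
    return base_lr * (1 - current_step / total_steps) ** power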
Exemplo n.º 19
0
net = build_refinedet(Config.INPUT_SIZE, len(Config.CLASSES), is_refine=True)
if torch.cuda.device_count() > 1:  # check whether more than one GPU is available
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))

device = torch.device('cpu')
if torch.cuda.is_available() and Config.DEVICE == 'gpu':
    device = torch.device('cuda')
    net.to(device)
    cudnn.benchmark = True

if Config.IS_TENSORBOARDX:
    net_input_size = torch.zeros(Config.BATCH_SIZE, 3, Config.INPUT_SIZE[0],
                                 Config.INPUT_SIZE[1])
    writer.add_graph(net, (net_input_size, ))

model_info = {'RESUME_EPOCH': 0, 'RESUME_MODEL': None}
if not op.exists('tools/generate_dep_info/model_info.json'):
    with open('tools/generate_dep_info/model_info.json', 'w',
              encoding='utf-8') as f:
        json.dump(model_info, f)
with open('tools/generate_dep_info/model_info.json', 'r',
          encoding='utf-8') as f:
    model_info = json.load(f)

if model_info['RESUME_MODEL'] is None or not op.exists(
        model_info['RESUME_MODEL']):
    model_info['RESUME_EPOCH'] = 0
    print('Loading base network...')
Exemplo n.º 20
0
def train_challenge2020(hype_space):
    # Paths to save log, checkpoint, tensorboard logs and results
    run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    base_path = save_path + '/' + run_id
    os.makedirs(base_path)
    write_json(hype_space, base_path + '/hype_space.json')

    checkpoint_dir = base_path + '/checkpoints'
    log_dir = base_path + '/log'
    tb_dir = base_path + '/tb_log'
    result_dir = base_path + '/results'

    os.makedirs(result_dir)
    os.makedirs(log_dir)
    os.makedirs(checkpoint_dir)
    os.makedirs(tb_dir)

    # Logger for train
    logger = get_logger(log_dir + '/info.log', name='train' + run_id)
    logger.info(hype_space)

    # Tensorboard
    train_writer = SummaryWriter(tb_dir + '/train')
    val_writer = SummaryWriter(tb_dir + '/valid')

    # Hyper Parameters
    split_index = "../process/data_split/" + hype_space['data_split']

    # Setup Cuda
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Data_loader
    train_loader = ChallengeDataLoader2(
        label_dir,
        data_dir,
        split_index,
        batch_size=hype_space['trainer']['batch_size'],
        normalization=hype_space['data_normalization'],
        augmentations=hype_space['augmentation']['method'],
        p=hype_space['augmentation']['prob'])
    valid_loader = train_loader.valid_data_loader
    test_loader = train_loader.test_data_loader

    # Build model architecture
    global model
    for file, types in files_models.items():
        for type in types:
            if hype_space["arch"]["type"] == type:
                model = init_obj(hype_space, 'arch',
                                 eval("module_arch_" + file))

    dummy_input = Variable(torch.rand(16, 12, 3000))
    train_writer.add_graph(model, (dummy_input, ))

    model.to(device)

    # Get function handles of loss and metrics
    criterion = getattr(module_loss, hype_space['loss']['type'])

    # Get function handles of metrics
    challenge_metrics = ChallengeMetric(label_dir)
    metric = challenge_metrics.challenge_metric

    # Get indices of the scored labels
    if hype_space['only_scored']:
        indices = challenge_metrics.indices
    else:
        indices = None

    # Build optimizer, learning rate scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = init_obj(hype_space, 'optimizer', torch.optim,
                         trainable_params)
    if hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler':
        params = hype_space["lr_scheduler"]["args"]
        scheduler_steplr_args = dict(params["after_scheduler"]["args"])
        scheduler_steplr = getattr(torch.optim.lr_scheduler,
                                   params["after_scheduler"]["type"])(
                                       optimizer, **scheduler_steplr_args)
        lr_scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=params["multiplier"],
            total_epoch=params["total_epoch"],
            after_scheduler=scheduler_steplr)
    else:
        lr_scheduler = init_obj(hype_space, 'lr_scheduler',
                                torch.optim.lr_scheduler, optimizer)

    # Begin training process
    trainer = hype_space['trainer']
    epochs = trainer['epochs']

    # Full train and valid logic
    mnt_metric_name, mnt_mode, mnt_best, early_stop = get_mnt_mode(trainer)
    not_improved_count = 0

    for epoch in range(epochs):
        best = False
        train_loss, train_metric = train(model,
                                         optimizer,
                                         train_loader,
                                         criterion,
                                         metric,
                                         indices,
                                         epoch,
                                         device=device)
        val_loss, val_metric = valid(model,
                                     valid_loader,
                                     criterion,
                                     metric,
                                     indices,
                                     device=device)

        if hype_space['lr_scheduler']['type'] == 'ReduceLROnPlateau':
            # if hype_space['lr_scheduler']['args']['mode'] == 'min':
            #     lr_scheduler.step(train_loss)
            # else:
            #     lr_scheduler.step(train_metric)
            lr_scheduler.step(val_loss)
        elif hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler':
            lr_scheduler.step(epoch, val_loss)
        else:
            lr_scheduler.step()

        logger.info('Epoch:[{}/{}]\t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
            epoch, epochs, 'loss', train_loss, 'metric', train_metric))
        logger.info('             \t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
            'val_loss', val_loss, 'val_metric', val_metric))
        logger.info('             \t learning_rate: {}'.format(
            optimizer.param_groups[0]['lr']))

        # check whether model performance improved or not, according to the specified metric (mnt_metric)
        if mnt_mode != 'off':
            mnt_metric = val_loss if mnt_metric_name == 'val_loss' else val_metric
            improved = (mnt_mode == 'min' and mnt_metric <= mnt_best) or \
                       (mnt_mode == 'max' and mnt_metric >= mnt_best)
            if improved:
                mnt_best = mnt_metric
                not_improved_count = 0
                best = True
            else:
                not_improved_count += 1

            if not_improved_count > early_stop:
                logger.info(
                    "Validation performance didn\'t improve for {} epochs. Training stops."
                    .format(early_stop))
                break

        if best:
            save_checkpoint(model,
                            epoch,
                            optimizer,
                            mnt_best,
                            hype_space,
                            checkpoint_dir,
                            save_best=True)
            logger.info("Saving current best: model_best.pth ...")

        # Tensorboard log
        train_writer.add_scalar('loss', train_loss, epoch)
        train_writer.add_scalar('metric', train_metric, epoch)
        train_writer.add_scalar('learning_rate',
                                optimizer.param_groups[0]['lr'], epoch)

        val_writer.add_scalar('loss', val_loss, epoch)
        val_writer.add_scalar('metric', val_metric, epoch)

    # Logger for test
    logger = get_logger(result_dir + '/info.log', name='test' + run_id)
    logger.propagate = False

    # Load model_best checkpoint
    model = load_checkpoint(model, checkpoint_dir + '/model_best.pth', logger)

    # Testing
    test_loss, test_metric = test(model,
                                  test_loader,
                                  criterion,
                                  metric,
                                  device=device)
    logger.info('    {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
        'loss', test_loss, 'metric', test_metric))

    challenge_metrics.return_metric_list()
    analyze(model,
            test_loader,
            criterion,
            challenge_metrics,
            logger,
            result_dir,
            device=device)

    write_json(hype_space, '{}/{}_{:.5f}.json'.format(save_path, run_id,
                                                      test_metric))

    return -test_metric
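# Added sketch (not part of the original source): because the function above takes a sampled
# `hype_space` and returns -test_metric, it can be driven by a minimizing optimizer such as
# hyperopt. `objective` stands for that function (its real name is not shown here) and the
# search space is heavily truncated; a real space would need every key the objective reads
# (optimizer, criterion, lr_scheduler, trainer, ...).
from hyperopt import Trials, fmin, hp, tpe

space = {
    'optimizer': {'type': 'Adam', 'args': {'lr': hp.loguniform('lr', -9, -3)}},
    # ... remaining keys omitted ...
}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=Trials())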
Exemplo n.º 21
0
# -*- coding:utf-8 -*-
import torch
import torchvision
from tensorboardX import SummaryWriter
from models.I2T import Encoder_Image, Generator_I2T
from models.T2I import Generator_T2I
from models.cyclegan_TI import CycleGAN_TI
net1 = Encoder_Image(num_channels=100).cuda()
net2 = Generator_I2T(vocab_size=1000, input_size=200, hidden_size=100).cuda()
net3 = Generator_T2I(
    embedding_size=100,
    filter_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
    num_filters=[100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160,
                 160]).cuda()
model = CycleGAN_TI().cuda()
writer1 = SummaryWriter(log_dir='../logs1', comment='Encoder_Image')
writer2 = SummaryWriter(log_dir='../logs2', comment='Generator_T2I')

images = torch.ones([9, 3, 64, 64]).cuda()
text_input = torch.ones([9, 1, 20, 100]).cuda()
with writer1:
    writer1.add_graph(net1, input_to_model=(images, ), verbose=True)
with writer2:
    writer2.add_graph(net3, input_to_model=(text_input, ), verbose=True)
    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 44180)
        x = F.relu(self.fc1(x))
        #x = F.dropout(x,p=0.1, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


# -----------------------------------------------------------------------------------
cnn = CNN()

# ---- write the model on tensorboard
writer_train.add_graph(
    cnn, Variable(
        (torch.Tensor(train_loader.dataset.dataImages[0:1])).cpu(), ))
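# Added note (not in the original snippet): since PyTorch 0.4 the Variable wrapper is a
# no-op, so the same graph could be traced with a plain tensor built from the same sample:
#   dummy_batch = torch.as_tensor(train_loader.dataset.dataImages[0:1]).float()
#   writer_train.add_graph(cnn, dummy_batch)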

if is_cuda:
    cnn.cuda()
# -----------------------------------------------------------------------------------
# Loss and Optimizer

learning_rate = 0.001
criterion = nn.CrossEntropyLoss()

#criterion = F.cross_entropy()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)


# -----------------------------------------------------------------------------------
Exemplo n.º 23
0
targets = autograd.Variable(targets_tens)

# Set up network
model = nn.Sequential(
    nn.Linear(3, 4, bias=False),
    nn.Sigmoid(),
    nn.Linear(4, 1, bias=False),
    nn.Sigmoid()
    )
loss_func = nn.MSELoss(size_average=False)
params = list(model.parameters())

# For tensorboard, evaluate loss once to make the graph
writer = SummaryWriter(comment='_ann_basic')
loss = loss_func(model(inputs), targets)
writer.add_graph(model, loss)
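# Added note: recent tensorboardX / torch.utils.tensorboard releases trace the graph from the
# model *inputs* rather than from an output or loss Variable, so the equivalent call today
# would be writer.add_graph(model, inputs).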
# add random image just for testing
image = torch.Tensor(scipy.misc.ascent())
writer.add_image('ascent', image)

# Parameters for learning
learning_rate = 1.0
niter = 5000


# Function mapping output to predicted labels
def predicted_labels(x):
    if isinstance(x, autograd.Variable):
        x = x.data

    out = torch.zeros_like(x)
Exemplo n.º 24
0
class TensorBoardLogger(Callback):
    """Callback that logs epoch results to a TensorBoard file."""
    def __init__(self,
                 log_dir=None,
                 comment='',
                 ignores=None,
                 log_model_graph=False,
                 log_param_interval=0,
                 *args,
                 **kwargs):
        """Initialization for TensorBoardLogger.

        Parameters
        ----------
        log_dir: str
            Path to save the TensorBoard file.
            Default: 'runs/{fmt_datetime}_{hostname}{comment}'.
        comment: str
            Comment that appends to the log_dir. Default: ''.
        ignores: list
            A list of names will be not logged. Default: None.
        log_model_graph: bool
            Whether to save model graph definition. Default: False.
        log_param_interval: int
            Number of epochs between logging parameter histograms.
            Default: 0 (no logging).
        """
        super(TensorBoardLogger, self).__init__(*args, **kwargs)
        self.writer = SummaryWriter(log_dir, comment=comment)
        if ignores is None:
            ignores = []
        self.ignores = ignores
        self.log_model_graph = log_model_graph
        self.log_param_interval = log_param_interval
        self.epochs_since_logged_params = 0

    def _teardown(self):
        self.writer.close()

    def log(self, step, meter):
        log_type = meter.meter_type
        method = getattr(self, 'log_' + log_type, None)
        if not method:
            return
        method(meter.alias, meter.value, step)

    def log_image(self, tag, img_tensor, step=None):
        self.writer.add_image(tag, img_tensor, step)

    def log_scalar(self, tag, scalar_value, step=None):
        self.writer.add_scalar(tag, scalar_value, step)

    def log_graph(self, model, input):
        self.writer.add_graph(model, input)

    def log_hist(self, tag, value, step=None, bins='tensorflow'):
        self.writer.add_histogram(tag, value, step, bins)

    def log_text(self):
        pass

    def log_audio(self):
        pass

    def _log_model_and_params(self, trainer, state):
        if state['mode'] != TRAIN_MODE:
            return

        if self.log_model_graph:
            model = state['model']
            input = state['input']
            self.log_graph(model, input)
            self.log_model_graph = False

        if self.log_param_interval == 0:
            return

        self.epochs_since_logged_params += 1
        if self.epochs_since_logged_params < self.log_param_interval:
            return
        self.epochs_since_logged_params = 0

        model = state['model']
        epochs = state['epochs']
        for name, params in model.named_parameters():
            self.log_hist(name, params.clone().cpu().data.numpy(), epochs)

    def __on_batch_end(self, trainer, state):
        """Deprecated"""
        iters = state['iters']
        mode = state['mode']
        for name, meter in state['meters'].items():
            if meter.meter_mode != mode:
                continue
            if meter.reset_mode == BATCH_RESET and \
                    name not in self.ignores and meter.can_call:
                self.log(iters, meter)

    def on_epoch_end(self, trainer, state):
        self._log_model_and_params(trainer, state)

        epochs = state['epochs']
        mode = state['mode']
        for meter in state['meters'].values():
            if meter.mode != mode:
                continue
            alias = meter.alias
            if (meter.reset_mode == EPOCH_RESET and alias not in self.ignores):
                self.log(epochs, meter)

    def on_validate_end(self, trainer, state):
        self.on_epoch_end(trainer, state)
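# Usage sketch (added; not from the original source). The surrounding Callback/trainer
# framework is assumed to invoke `log`, `on_epoch_end`, etc. itself, so only the constructor
# arguments documented above are shown, and the registration call is hypothetical:
#   tb_logger = TensorBoardLogger(log_dir='runs/exp1',
#                                 comment='_baseline',
#                                 ignores=['lr'],          # meters that should not be logged
#                                 log_model_graph=True,    # dump the graph once, on the first train epoch
#                                 log_param_interval=5)    # parameter histograms every 5 epochs
#   trainer.register(tb_logger)                            # hypothetical registration API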
Exemplo n.º 25
0
# coding=utf-8
from mypackage.utils import Model
from tensorboardX import SummaryWriter
import torch
import torchvision
model = Model(50).model
writer = SummaryWriter(log_dir="log/network")

input = torch.autograd.Variable(torch.Tensor(1, 1, 256, 256),
                                requires_grad=True)
writer.add_graph(model=model, input_to_model=(input, ))

# model = torchvision.models.AlexNet(num_classes=10)
# # Prepare the TensorBoard writer; it must be created before '.to(device)', otherwise an error is raised
# writer = SummaryWriter(log_dir="log/network")
# dummy_input = torch.autograd.Variable(torch.rand(1, 3, 227, 227))
# writer.add_graph(model=model, input_to_model=dummy_input)
Exemplo n.º 26
0
        model = save["model"]
        params_list = changegrad(model)
        model.cuda()
        optimizer = torch.optim.Adam(params_list, lr=lr, weight_decay=0.00008)
        try:
            optimizer.load_state_dict(save["optimizer"])
        except Exception:
            pass  # fall back to the freshly constructed optimizer state
        epoch = save["epoch"]
        print(f"load from './model/{NetTitle}Newest'")
    else:
        print("全新训练")
        params_list = changegrad(modelNet)
        optimizer = torch.optim.Adam(params_list, lr=lr, weight_decay=0.00008)
        epoch = 0
        writer.add_graph(modelNet, (torch.rand([1, 1, 224, 224])))
        model = modelNet.cuda()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.96,
        patience=2 * len(train_loader),
        verbose=True,
        threshold=0.0001,
        threshold_mode='rel',
        cooldown=1,
        min_lr=0,
        eps=1e-08)
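    # Added note: patience here is measured in optimizer steps (2 * len(train_loader), i.e.
    # roughly two epochs' worth of batches), which suggests scheduler.step(<metric>) is
    # called once per batch rather than once per epoch, e.g. (with a hypothetical per-batch
    # loss value `batch_loss`):
    #     scheduler.step(batch_loss)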

    rate1, rate2 = test()
Exemplo n.º 27
0
class Trainer(object):
    def __init__(self, model, datasets, criterion, args):
        """
        :param model: `torch.nn.Module` to be trained
        :param datasets: dict of datasets including 'train', 'valid', and 'test'
        :param criterion: callable loss function, returns dict of losses
        :param args: parsed results of `ArgumentParser`
        """
        self.is_main_process = args.local_rank is None or args.local_rank == 0
        self.datasets = datasets
        log_dir = Path(args.log_dir)

        # iteration counters
        self.iteration = 0
        self.start_epoch = 0
        self.min_epoch_loss = float('inf')
        self.max_metric_score = 0
        optimizer_state = None
        lr_scheduler_state = None
        self.args = args

        if self.is_main_process:
            # print args
            args_yaml = yaml.dump(vars(args))
            terminal_columns = shutil.get_terminal_size().columns
            self.println("=" * terminal_columns)
            self.println(args_yaml + ("=" * terminal_columns))

        if args.local_rank is not None:
            assert args.device in ('auto', 'cuda')
            torch.cuda.set_device(args.local_rank)
            args.device = 'cuda'  # only support GPU

        if args.resume:
            self.println('resume checkpoint ...')
            resume_checkpoint = torch.load(
                args.resume_checkpoint_file,
                map_location=lambda storage, loc: storage)
            model = model.load(resume_checkpoint['model_file'])
            self.start_epoch = resume_checkpoint['epoch']
            self.min_epoch_loss = resume_checkpoint.get(
                'min_epoch_loss', self.min_epoch_loss)
            self.max_metric_score = resume_checkpoint.get(
                'max_metric_score', self.max_metric_score)
            self.iteration = resume_checkpoint['iteration']
            optimizer_state = resume_checkpoint['optimizer']
            lr_scheduler_state = resume_checkpoint['lr_scheduler']
            self.println('resume epoch {} iteration {}'.format(
                self.start_epoch, self.iteration))

        device = choose_device(args.device)
        self.use_cuda = device.type == 'cuda'

        self.mixup_epochs = args.no_mixup_epochs if args.no_mixup_epochs > 1.0 else (
            1 - args.no_mixup_epochs) * args.max_epochs
        self.criterion = criterion
        self.model = model
        self.net, self.optimizer = create_optimizer(
            model,
            args.optim,
            args.learning_rate,
            args.weight_decay,
            args.momentum,
            args.apex_opt_level,
            optimizer_state=optimizer_state,
            device=device,
            no_bn_wd=args.no_bn_wd,
            local_rank=args.local_rank,
            sync_bn=args.sync_bn)

        self.data_loaders = {
            k: create_dataset_loaders(d,
                                      args,
                                      self.use_cuda,
                                      shuffle=(k == 'train'))
            for k, d in datasets.items() if k != 'test'
        }

        self.lr_scheduler = create_lr_scheduler(self.optimizer, **vars(args))
        if lr_scheduler_state:
            print(f'resume lr_scheduler_state {lr_scheduler_state}')
            self.lr_scheduler.load_state_dict(lr_scheduler_state)
            print(
                f'resumed lr_scheduler_state{self.lr_scheduler.state_dict()}')

        self.checkpoints_folder = log_dir / 'checkpoints'
        self.checkpoints_folder.mkdir(parents=True, exist_ok=True)

        datasets_text = '\n '.join(
            ['{} {}'.format(k, v) for k, v in datasets.items()])
        self.println("datasets:\n")
        self.println(datasets_text)

        self.tb_writer = None
        if self.is_main_process:
            print("logging into {}".format(log_dir))
            self.tb_writer = SummaryWriter(log_dir=str(log_dir))
            self.tb_writer.add_text('args', repr(args_yaml)[1:-1], 0)
            self.tb_writer.add_text('datasets', repr(datasets_text)[1:-1], 0)
            with (log_dir / 'args.yml').open('w') as f:
                f.write(args_yaml)
            #tb_writer.add_text('cfg', str(ssd_net) + "\n" + str(ssd_net.cfg), 0)

            #if not use_fp16:
            if args.write_graph:
                # write graph
                with torch.no_grad():
                    images = next(iter(self.data_loaders['train']))['image']
                    images = images.to(device)
                    model.trace_mode = True
                    self.tb_writer.add_graph(model, images)
                    model.trace_mode = False

    def println(self, *args, **kwargs):
        if self.is_main_process:
            print(*args, **kwargs)

    def run_epoch(self, epoch, phase):
        data_loader = self.data_loaders[phase]
        if isinstance(data_loader.sampler, DistributedSampler):
            data_loader.sampler.set_epoch(epoch)
        # loss counters
        epoch_loss_dict = {}

        is_train = phase == 'train'
        if is_train:
            if self.lr_scheduler.name != 'plateau':
                self.lr_scheduler.step(epoch=epoch)
            self.optimizer.zero_grad()
            if self.tb_writer:
                self.tb_writer.add_scalar('learning_rate', self.get_lr(),
                                          epoch)

        self.net.train(is_train)
        torch.set_grad_enabled(is_train)

        desc = f"Epoch {epoch} {phase}"
        if self.args.local_rank is not None:
            desc = f"[{self.args.local_rank}]" + desc
        pbar_disable = False if epoch == self.start_epoch + 1 else None
        pbar = tqdm(data_loader,
                    desc=desc,
                    unit="images",
                    unit_scale=data_loader.batch_size,
                    leave=False,
                    disable=pbar_disable,
                    mininterval=10,
                    smoothing=1)
        it = 0
        # for logging images
        min_loss_in_epoch = float("inf")
        max_loss_in_epoch = 0
        batch_of_min_loss_in_epoch = None
        batch_of_max_loss_in_epoch = None

        for batch in pbar:
            inputs = batch.pop('input', None)
            targets = batch
            if inputs is None:
                warnings.warn(
                    f'no input, skip (data in batch are {batch.keys()})')
                assert False
                continue

            if self.use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                targets = targets_to_cuda(targets)

            criterion = self.criterion
            if phase == 'train' and self.args.mixup > 0:
                if epoch < self.mixup_epochs:
                    inputs, criterion = mixup(inputs,
                                              alpha=self.args.mixup,
                                              criterion=criterion)

            # forward
            outputs = self.net(inputs)
            losses = criterion(outputs, targets)

            # compute the overall loss if multiple losses are returned
            if isinstance(losses, dict):
                if 'All' not in losses:
                    losses['All'] = sum(losses.values())
            elif isinstance(losses, torch.Tensor):
                losses = dict(All=losses)
            else:
                raise RuntimeError(type(losses))
            loss = losses['All']
            optimize_step = False
            if phase == 'train':
                self.optimizer.backward(loss / self.args.gradient_accumulation)
                if self.iteration % self.args.gradient_accumulation == 0:
                    optimize_step = True
                    it += self.args.gradient_accumulation
                    if self.args.clip_grad_norm > 0:
                        clip_grad_norm_(self.net.parameters(),
                                        self.args.clip_grad_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                self.iteration += 1
                if self.lr_scheduler and self.lr_scheduler.name == 'findlr':
                    self.lr_scheduler.step(self.iteration)
                    if self.tb_writer:
                        self.tb_writer.add_scalar('learning_rate',
                                                  self.get_lr(),
                                                  self.iteration)
            elif phase == 'valid':
                it += 1

            if self.args.local_rank is not None:
                # sync loss between processes
                world_size = torch.distributed.get_world_size()
                for l in losses.values():
                    torch.distributed.reduce(l, dst=0)
                    if self.is_main_process:
                        l /= world_size

            if not self.is_main_process:
                continue
            # Below is the logging done at each optimization step

            batch_loss_dict = {k: v.item() for k, v in losses.items()}

            if self.tb_writer and optimize_step and self.args.log_loss_interval > 0 and self.iteration % self.args.log_loss_interval == 0:
                # tb_writer.add_scalars('Loss', batch_loss_dict, iteration)
                for k, v in batch_loss_dict.items():
                    self.tb_writer.add_scalar(phase + '/Loss/' + k, v, epoch)

            epoch_loss_dict = {
                k: epoch_loss_dict.get(k, 0) + v
                for k, v in batch_loss_dict.items()
            }

            batch_loss = batch_loss_dict['All']
            if batch_loss < min_loss_in_epoch:
                min_loss_in_epoch = batch_loss
                batch_of_min_loss_in_epoch = (inputs, targets)
            if batch_loss > max_loss_in_epoch:
                max_loss_in_epoch = batch_loss
                batch_of_max_loss_in_epoch = (inputs, targets)

            if it > 0:
                # update the progress bar
                scalars = {
                    k: "%.03f" % (v / it)
                    for k, v in epoch_loss_dict.items()
                }
                pbar.set_postfix(scalars, refresh=False)

        if not self.is_main_process:
            return 0

        epoch_loss_dict = {k: v / it for k, v in epoch_loss_dict.items()}
        if self.tb_writer:
            if self.args.log_images:
                name_batch = {
                    "min_loss": batch_of_min_loss_in_epoch,
                    "max_loss": batch_of_max_loss_in_epoch
                }
                for name, batch in name_batch.items():
                    if batch is not None:
                        images = self.visualize_batch(*batch)
                        images_grid = vutils.make_grid(images, normalize=False)
                        self.tb_writer.add_image('/'.join([phase, name]),
                                                 images_grid, epoch)

            #scalars = {phase + k: v for k, v in epoch_loss_dict.items()}
            #tb_writer.add_scalars('EpochLoss', scalars, epoch)
            for k, v in epoch_loss_dict.items():
                self.tb_writer.add_scalar(phase + '/EpochLoss/' + k, v, epoch)

        return epoch_loss_dict['All']

    def get_lr(self):
        return self.optimizer.param_groups[0]['lr']

    def save_checkpoint(self, epoch, model_filename, checkpoint_filename=None):
        if not checkpoint_filename:
            checkpoint_filename = model_filename
        model_filename = str(
            self.checkpoints_folder / model_filename) + '.model.pth'
        checkpoint_filename = str(
            self.checkpoints_folder / checkpoint_filename) + '.checkpoint.pth'

        self.model.save(model_filename)

        optimizer_state_dict = optimizer_cpu_state_dict(self.optimizer)

        torch.save(
            {
                'epoch': epoch,
                'min_epoch_loss': self.min_epoch_loss,
                'max_metric_score': self.max_metric_score,
                'iteration': self.iteration,
                'optimizer': optimizer_state_dict,
                'lr_scheduler': self.lr_scheduler.state_dict(),
                'model_file': model_filename,
                'args': self.args
            }, checkpoint_filename)

        checkpoint_saved = Path(checkpoint_filename)
        last_checkpoint_file = self.checkpoints_folder / 'last.checkpoint'
        if last_checkpoint_file.exists():
            last_checkpoint_file.unlink()
        last_checkpoint_file.symlink_to(
            checkpoint_saved.relative_to(self.checkpoints_folder))

    def run(self):
        self.println('Training', repr(self.model), 'Epochs:', self.start_epoch,
                     '/', self.args.max_epochs)
        pbar_epoch = trange(self.start_epoch + 1,
                            self.args.max_epochs + 1,
                            unit="epoch",
                            disable=not self.is_main_process)

        for epoch in pbar_epoch:
            epoch_state = {}
            for phase in self.data_loaders:
                if phase == 'valid' and epoch % self.args.validation_interval != 0:
                    continue

                epoch_loss = self.run_epoch(epoch, phase)

                evaluation = None
                if 'test' in self.datasets and phase == 'valid':
                    evaluation = self.test()

                if not self.is_main_process:
                    continue
                # Below is the processing done between epochs, e.g. saving checkpoints, logging, etc.

                early_stopping = False
                if evaluation is not None:
                    epoch_state['metric'] = metric_score = evaluation['score']
                    for k, v in evaluation.items():
                        if isinstance(v, dict) and 'score' in v:
                            self.tb_writer.add_scalar(
                                'test/' + k.replace(' ', '_'), v['score'],
                                epoch)

                    if metric_score > self.max_metric_score:
                        self.max_metric_score = metric_score
                        print(
                            '\nsave checkpoint at epoch {} with best {} metric {}'
                            .format(epoch, phase, self.max_metric_score))
                        self.save_checkpoint(epoch, "best_metric")

                if phase == 'valid' or 'valid' not in self.data_loaders:
                    if self.min_epoch_loss > epoch_loss:
                        self.min_epoch_loss = epoch_loss
                        print(
                            '\nsave checkpoint at epoch {} with best {} loss {}'
                            .format(epoch, phase, self.min_epoch_loss))
                        self.save_checkpoint(epoch, 'best_loss')

                    if epoch % self.args.validation_interval == 0:
                        if self.args.lr_scheduler == 'plateau':
                            self.lr_scheduler.step(metrics=epoch_loss)

                        early_stopping = (self.get_lr() <
                                          self.args.stopping_learning_rate)

                if (early_stopping or (epoch == self.args.max_epochs)
                        or (phase == 'valid')
                        or (self.args.checkpoints_interval > 0
                            and epoch % self.args.checkpoints_interval == 0
                            and epoch % self.args.validation_interval != 0)):
                    print(
                        '\nsave checkpoint at epoch {} with {} loss {}'.format(
                            epoch, phase, epoch_loss))
                    self.save_checkpoint(epoch, "last")

                epoch_state[phase + '_loss'] = epoch_loss

                if early_stopping:
                    print('early stopping!')
                    print('Metric Score = {}'.format(self.max_metric_score))
                    return

                if self.args.lr_scheduler == 'findlr':
                    print('finish find lr')
                    return

            if self.is_main_process:
                epoch_state['time'] = datetime.now().strftime('%d%b%H:%M')
                epoch_state['min_loss'] = self.min_epoch_loss
                epoch_state['lr'] = self.get_lr()
                pbar_epoch.set_postfix(epoch_state, refresh=False)

        self.println('Metric Score = {}'.format(self.max_metric_score))

    def test(self):
        raise NotImplementedError()

    def visualize_batch(self, inputs, targets):
        raise NotImplementedError()
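# Added, standalone sketch (not part of the Trainer above) of the gradient-accumulation
# pattern used in run_epoch: losses from several micro-batches are scaled and their gradients
# summed before a single optimizer step, emulating a larger effective batch size. The model,
# data, and `accumulation` value below are illustrative assumptions.
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()
accumulation = 4  # the Trainer reads the analogous value from args.gradient_accumulation

optimizer.zero_grad()
for step in range(1, 17):
    x, y = torch.randn(8, 10), torch.randn(8, 1)  # dummy micro-batch
    loss = loss_fn(model(x), y)
    (loss / accumulation).backward()  # scale so the accumulated gradient matches the mean
    if step % accumulation == 0:
        optimizer.step()
        optimizer.zero_grad()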
def pre_trained(judge):

    writer = SummaryWriter(log_dir='./loss/pre_train_loss_model1/pre_train_loss_SMI_2020_07_31_0%d' % date_num)

    if judge == 0:
        model = BSsequential_net_lstm().to(device)
        print("Total number of paramerters in networks is {}  ".format(sum(x.numel() for x in model.parameters())))
        # model.apply(weights_init)
        temp = 10000000000000
        epoch_num = 1
    else:
        model = BSsequential_net_lstm().to(device)
        mode_patch = './model_file/pre_trained_network_model_model1/pre_trained_network_model_SMI_2020_07_24_02.pth'
        model.load_state_dict(torch.load(mode_patch))
        temp = 10000000000000
        epoch_num = 1
        # path_temp = './Temporary_parameters/pre_temp_model1.mat'
        # temp = scipio.loadmat(path_temp)
        # temp = temp['temp'].item()
        # path_epoch = './Temporary_parameters/pre_epoch_num_model1.mat'
        # epoch_num = scipio.loadmat(path_epoch)
        # epoch_num = epoch_num['epoch_num'].item()+1
    if is_consistent == 0:
        map_xline = np.zeros(0)
        map_inline = np.zeros(0)
    else:
        is_path = './SMI_out/map_number_2020_05_14_06.mat'
        Random_path = scipio.loadmat(is_path)
        map_xline = Random_path['map_xline']
        map_inline = Random_path['map_inline']
    count = 0  # counter used to check whether the network weights have changed
    lr = 0.001  # learning rate
    for epoch in range(epoch_num, EPOCHS+1):
        print(epoch, count)
        # temp_weight = model.fc60.weight   # initial network parameters used to check whether the weights change
        # temp_a = torch.sum(temp_weight.data)
        # print(temp_weight)
        temp_weight = model.lstm60
        temp_a = torch.sum(temp_weight.weight_hh_l0.data) + torch.sum(temp_weight.weight_ih_l0.data)
        # print(a)

        if np.mod(epoch + 1, 200) == 0:
            lr = lr * 0.99
        optimizer = optim.Adam(model.parameters(), lr=lr)
        if is_consistent == 1:
            trace_number = int(map_xline[0, epoch-1]*142+map_inline[0, epoch-1])
        else:
            temp_1 = np.random.randint(0, 142, 1)  # 29
            temp_2 = np.random.randint(0, 110, 1)  # 22
            trace_number = temp_2*142+temp_1
            map_xline = np.append(map_xline, temp_2)
            map_inline = np.append(map_inline, temp_1)
            # trace_number = temp_2*5*142+temp_1*5
            # map_xline = np.append(map_xline, temp_2 * 5)
            # map_inline = np.append(map_inline, temp_1 * 5)
        # trace_number = np.random.randint(0, 142*110*data_rate, 1)
        # print(trace_number)

        # compute correlation coefficients
        coef_seismic = np.zeros((105, Xline1_110_label_impedance.shape[1]))
        coef_seismic[0, :] = train1_110_seismic[trace_number, :]
        coef_seismic[1:105, :] = train_well_seismic[:, :]
        temp_coef = np.corrcoef(coef_seismic)

        # select wells whose correlation coefficient exceeds the threshold and that lie within the radius
        tempval_1 = np.zeros(0)
        temp_train_well_1 = np.zeros(0)
        temp_train_well_seisic_1 = np.zeros(0)
        absCORcoef = np.abs(temp_coef[0, 1:105])
        if which_choose_well == 1:
            num = 0
            for k in range(0, 104):
                if absCORcoef[k] > coefval:
                    # coordinates of the well data
                    wellxline = Xline_Inline_number[0, k]
                    wellinline = Xline_Inline_number[1, k]
                    # coordinates of the target seismic trace
                    seismicinline = np.mod(trace_number + 1, 142)
                    seismicxline = (trace_number + 1 - seismicinline) / 142 + 1
                    R = np.sqrt((seismicxline - wellxline) * (seismicxline - wellxline) + (seismicinline - wellinline) * (
                            seismicinline - wellinline))
                    if R < Rval:
                        tempval_1 = np.append(tempval_1, absCORcoef[k])
                        temp_train_well_1 = np.append(temp_train_well_1, train_well[k, :])
                        temp_train_well_seisic_1 = np.append(temp_train_well_seisic_1, train_well_seismic[k, :])
                        num = num + 1

            temp_train_well = np.zeros(0)
            temp_train_well_seisic = np.zeros(0)
            if num < num_well:
                num = num_well
                tempval = np.zeros(0)
                for max_num in range(0, num):
                    temp_tempval = max(absCORcoef)
                    tempval = np.append(tempval, temp_tempval)
                    for max_num2 in range(0, 104):
                        if temp_tempval == absCORcoef[max_num2]:
                            absCORcoef[max_num2] = 0
                            temp_train_well = np.append(temp_train_well, train_well[max_num2, :])
                            temp_train_well_seisic = np.append(temp_train_well_seisic, train_well_seismic[max_num2, :])
            else:
                tempval = np.zeros(0)
                temp_train_well_1 = torch.from_numpy(temp_train_well_1)
                temp_train_well_1 = temp_train_well_1.view(num, -1)
                temp_train_well_1 = temp_train_well_1.cpu().detach().numpy()
                temp_train_well_seisic_1 = torch.from_numpy(temp_train_well_seisic_1)
                temp_train_well_seisic_1 = temp_train_well_seisic_1.view(num, -1)
                temp_train_well_seisic_1 = temp_train_well_seisic_1.cpu().detach().numpy()
                for max_num in range(0, num_well):
                    temp_tempval = max(tempval_1)
                    tempval = np.append(tempval, temp_tempval)
                    for max_num2 in range(0, num):
                        if temp_tempval == tempval_1[max_num2]:
                            tempval_1[max_num2] = 0
                            temp_train_well = np.append(temp_train_well, temp_train_well_1[max_num2, :])
                            temp_train_well_seisic = np.append(temp_train_well_seisic, temp_train_well_seisic_1[max_num2, :])
        else:
            num = num_well
            tempval = np.zeros(0)
            temp_train_well = np.zeros(0)
            temp_train_well_seisic = np.zeros(0)
            for max_num in range(0, num):
                temp_tempval = max(absCORcoef)
                tempval = np.append(tempval, temp_tempval)
                for max_num2 in range(0, 104):
                    if temp_tempval == absCORcoef[max_num2]:
                        absCORcoef[max_num2] = 0
                        temp_train_well = np.append(temp_train_well, train_well[max_num2, :])
                        temp_train_well_seisic = np.append(temp_train_well_seisic, train_well_seismic[max_num2, :])

        num = num_well
        maxval = max(tempval)
        minval = min(tempval)
        max_minlen = maxval - minval
        tempval = (tempval - minval) / max_minlen
        valsum = sum(tempval)
        tempval = tempval / valsum

        tempval = torch.from_numpy(tempval)
        tempval = tempval.view(1, -1)
        tempval = tempval.float()
        tempval = tempval.to(device)

        temp_train_well = torch.from_numpy(temp_train_well)
        temp_train_well = temp_train_well.view(num, -1)
        temp_train_well = temp_train_well.float()
        # temp_train_well = temp_train_well.to(device)
        # temp_train_well = temp_train_well.view(num, -1)

        # temp_train_well_seisic = torch.from_numpy(temp_train_well_seisic)
        # temp_train_well_seisic = temp_train_well_seisic.float()
        # temp_train_well_seisic = temp_train_well_seisic.to(device)
        # temp_train_well_seisic = temp_train_well_seisic.view(num, -1)
        # temp_seismic = torch.from_numpy(train1_75_seismic[trace_number, :])
        # temp_seismic = temp_seismic.float()
        # temp_seismic = temp_seismic.to(device)
        # temp_seismic = temp_seismic.view(1, -1)

        temp_lable = torch.from_numpy(Xline1_110_label_impedance[trace_number, :])
        temp_lable = temp_lable.float()
        # temp_lable = temp_lable.to(device)
        temp_lable = temp_lable.view(1, -1)
        # for rand in range(0, 60 - BATCH_LEN + 1):
        for num_rand in range(0, number):
            rand = np.random.randint(0, 60 - BATCH_LEN + 1, 1)
            temp_train_seismic = train1_110_seismic[trace_number, rand[0]:rand[0] + BATCH_LEN]
            temp_train_seismic = torch.from_numpy(temp_train_seismic)
            temp_train_seismic = temp_train_seismic.float()
            temp_train_seismic = temp_train_seismic.to(device)
            temp_train_seismic = temp_train_seismic.view(1, -1)

            # the selected well data, the well-side traces, and one target trace together form the network input

            train_dataset = MyDataset2(temp_train_well[:, rand[0]:rand[0] + BATCH_LEN], temp_lable[:, rand[0]:rand[0] + BATCH_LEN])
            train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=1, shuffle=True, drop_last=False)
            epoch_loss = []

            for itr, (train_dt, train_lable) in enumerate(train_dataloader):
                train_dt, train_lable = train_dt.to(device), train_lable.to(device)
                train_dt = train_dt.float()
                train_lable = train_lable.float()

                model.train()
                optimizer.zero_grad()
                output = model(train_dt, temp_train_seismic)
                if is_synseismic == 1:
                    syn_seismic = syn_seismic_fun2(output, wavelet)
                    syn_seismic = syn_seismic.float()
                    loss = F.mse_loss(syn_seismic, temp_train_seismic) + F.mse_loss(output, train_lable)
                else:
                    loss = F.mse_loss(output, train_lable)

                loss.backward()
                optimizer.step()

                # print(model.conv1.weight)
                # print(model.conv2.weight)
                # print(model.lstm.weight)
                # print(model.fc1.weight.data[:, 0])

                epoch_loss.append(loss.item())

        # temp_b = torch.sum(model.fc60.weight.data)
        temp_b = torch.sum(model.lstm60.weight_hh_l0.data) + torch.sum(model.lstm60.weight_ih_l0.data)
        # print(b)
        if temp_a == temp_b:
            count = count + 1
        else:
            count = 0
        if count > 50:
            break

        epoch_loss = np.sum(np.array(epoch_loss))
        writer.add_scalar('Train/MSE', epoch_loss, epoch)
        epoch_num = epoch
        print('Train set: Average loss: {:.15f}'.format(epoch_loss))
        if epoch_loss < temp:
            path = './model_file/pre_trained_network_model_model1/pre_trained_network_model_SMI_2020_07_31_0%d.pth' % date_num
            torch.save(model.state_dict(), path)
        path_loss = './Temporary_parameters/pre_temp_model1.mat'
        path_epoch = './Temporary_parameters/pre_epoch_num_model1.mat'
        scipio.savemat(path_loss, {'epoch_loss': epoch_loss})
        scipio.savemat(path_epoch, {'epoch_num': epoch_num})
    if is_consistent == 0:
        pathmat = './SMI_out/map_number_2020_07_31_0%d.mat' % date_num
        scipio.savemat(pathmat, {'map_xline': map_xline, 'map_inline': map_inline})
    writer.add_graph(model, (train_dt, temp_train_seismic))
    writer.close()
Exemplo n.º 29
0
class GN():
    def __init__(self, lr=1e-3, batchs=8, cuda=True):
        '''
        :param tt: train_test
        :param tag: 1 - evaluation on testing data, 0 - without evaluation on testing data
        :param lr: learning rate
        :param batchs: batch size
        :param cuda: whether to run on GPU
        '''
        # all tensors should have 'volatile' set to True, and set to False when updating the network
        self.hungarian = Munkres()
        self.device = torch.device("cuda" if cuda else "cpu")
        self.nEpochs = 999
        self.lr = lr
        self.batchsize = batchs
        self.numWorker = 4

        self.show_process = 0  # interaction
        self.step_input = 1

        print '     Preparing the model...'
        self.resetU()

        self.Uphi = uphi().to(self.device)
        self.Ephi = ephi().to(self.device)

        self.criterion = nn.MSELoss() if criterion_s else nn.CrossEntropyLoss()
        self.criterion = self.criterion.to(self.device)

        self.optimizer = optim.Adam([{
            'params': self.Uphi.parameters()
        }, {
            'params': self.Ephi.parameters()
        }],
                                    lr=lr)

        # seqs = [2, 4, 5, 9, 10, 11, 13]
        # lengths = [600, 1050, 837, 525, 654, 900, 750]
        seqs = [2, 4, 5, 10]
        lengths = [600, 1050, 837, 654]

        for i in xrange(len(seqs)):
            self.writer = SummaryWriter()
            # print '     Loading Data...'
            seq = seqs[i]
            self.seq_index = seq
            start = time.time()
            sequence_dir = 'MOT16/train/MOT16-%02d' % seq
            self.outName = t_dir + 'result_%02d.txt' % seq
            self.train_set = DatasetFromFolder(sequence_dir, self.outName)

            self.train_test = lengths[i]
            self.tag = 0
            self.loss_threhold = 0.03
            self.update()

            print '     Logging...'
            t_data = time.time() - start
            self.log(t_data)

    def getEdges(self):
        # statistics of the graph edges between two frames' detections
        self.train_set.setBuffer(1)
        step = 1
        edge_counter = 0.0
        for head in xrange(1, self.train_test):
            self.train_set.loadNext()  # Get the next frame
            edge_counter += self.train_set.m * self.train_set.n
            step += 1
            self.train_set.swapFC()
        out = open(self.outName, 'a')
        print >> out, 'Average edge:', edge_counter * 1.0 / step
        out.close()

    def showNetwork(self):
        # add the graph into tensorboard
        E = torch.rand(1, 2).to(self.device)
        V = torch.rand(1, 512).to(self.device)
        u = torch.rand(1, 100).to(self.device)
        self.writer.add_graph(self.Uphi, (E, V, u))

        E = torch.rand(1, 2).to(self.device)
        V1 = torch.rand(1, 512).to(self.device)
        V2 = torch.rand(1, 512).to(self.device)
        u = torch.rand(1, 100).to(self.device)
        self.writer.add_graph(self.Ephi, (E, V1, V2, u))

    def log(self, t_data):
        out = open(self.outName, 'w')
        print >> out, self.criterion
        print >> out, 'lr:{}'.format(self.lr)
        print >> out, self.optimizer.state_dict()
        print >> out, self.Uphi
        print >> out, self.Ephi
        print >> out, 'Time consuming for loading datasets:', t_data
        out.close()
        # self.showNetwork()

    def resetU(self):
        if u_initial:
            self.u = torch.FloatTensor(
                [random.random() for i in xrange(u_num)]).view(1, -1)
        else:
            self.u = torch.FloatTensor([0.0
                                        for i in xrange(u_num)]).view(1, -1)
        self.u = self.u.to(self.device)

    def updateNetwork(self):
        self.train_set.setBuffer(1)
        step = 1
        average_epoch = 0
        edge_counter = 0.0
        for head in xrange(1, self.train_test):
            self.train_set.loadNext()  # Get the next frame
            edge_counter += self.train_set.m * self.train_set.n
            start = time.time()
            show_name = 'LOSS_{}'.format(step)
            # print '         Step -', step
            data_loader = DataLoader(dataset=self.train_set,
                                     num_workers=self.numWorker,
                                     batch_size=self.batchsize,
                                     shuffle=True)
            for epoch in xrange(1, self.nEpochs):
                num = 0
                epoch_loss = 0.0
                arpha_loss = 0.0
                for iteration in enumerate(data_loader, 1):
                    index, (e, gt, vs_index, vr_index) = iteration
                    # print '*'*36
                    # print e.size()
                    # print gt.size()
                    e = e.to(self.device)
                    gt = gt.to(self.device)

                    self.optimizer.zero_grad()

                    u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)
                    v1 = self.train_set.getApp(1, vs_index)
                    v2 = self.train_set.getApp(0, vr_index)
                    e_ = self.Ephi(e, v1, v2, u_)

                    if self.show_process:
                        print '-' * 66
                        print vs_index, vr_index
                        print 'e:', e.cpu().data.numpy()[0][0],
                        print 'e_:', e_.cpu().data.numpy()[0][0],
                        if criterion_s:
                            print 'GT:', gt.cpu().data.numpy()[0][0]
                        else:
                            print 'GT:', gt.cpu().data.numpy()[0]

                    # Penalize u so that its value does not grow too large
                    arpha = torch.mean(torch.abs(u_))
                    arpha_loss += arpha.item()
                    arpha.backward(retain_graph=True)

                    #  The regular loss
                    # print e_.size(), e_
                    # print gt.size(), gt
                    loss = self.criterion(e_, gt.squeeze(1))
                    # print loss
                    epoch_loss += loss.item()
                    loss.backward()

                    # update the network: Uphi and Ephi
                    self.optimizer.step()

                    # Show the parameters of Uphi and Ephi to check the optimizer's progress
                    # print self.Uphi.features[0].weight.data
                    # print self.Ephi.features[0].weight.data
                    # raw_input('continue?')

                    num += self.batchsize

                if self.show_process and self.step_input:
                    a = raw_input(
                        'Continue(0-step, 1-run, 2-run with showing)?')
                    if a == '1':
                        self.show_process = 0
                    elif a == '2':
                        self.step_input = 0

                epoch_loss /= num
                # print '         Loss of epoch {}: {}.'.format(epoch, epoch_loss)
                self.writer.add_scalars(show_name, {
                    'regular': epoch_loss,
                    'u': arpha_loss / num * self.batchsize
                }, epoch)
                if epoch_loss < self.loss_threhold:
                    break

            # print '         Time consuming:{}\n\n'.format(time.time()-start)
            self.updateUE()
            self.train_set.showE()
            self.showU()
            average_epoch += epoch
            self.writer.add_scalar('epoch', epoch, step)
            step += 1
            self.train_set.swapFC()
        out = open(self.outName, 'a')
        print >> out, 'Average edge:', edge_counter * 1.0 / step, '.',
        print >> out, 'Average epoch:', average_epoch * 1.0 / step, 'for',
        print >> out, 'Random' if edge_initial else 'IoU'
        out.close()

    def saveModel(self):
        print 'Saving the Uphi model...'
        torch.save(self.Uphi, t_dir + 'uphi_%02d.pth' % self.seq_index)
        print 'Saving the Ephi model...'
        torch.save(self.Ephi, t_dir + 'ephi_%02d.pth' % self.seq_index)
        print 'Saving the global variable u...'
        torch.save(self.u, t_dir + 'u_%02d.pth' % self.seq_index)
        print 'Done!'

    def updateUE(self):
        u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)

        self.u = u_.data

        # update the edges
        for edge in self.train_set:
            e, gt, vs_index, vr_index = edge
            e = e.to(self.device).view(1, -1)
            v1 = self.train_set.getApp(1, vs_index)
            v2 = self.train_set.getApp(0, vr_index)
            e_ = self.Ephi(e, v1, v2, u_)
            self.train_set.edges[vs_index][vr_index] = e_.data.view(-1)

    def update(self):
        start = time.time()
        self.evaluation(1)
        if self.tag:
            self.evaluation(self.train_test)
        self.updateNetwork()
        self.saveModel()
        self.evaluation(1)
        if self.tag:
            self.evaluation(self.train_test)
        out = open(self.outName, 'a')
        print >> out, 'The final time consuming:{}\n\n'.format(
            (time.time() - start) / 60)
        out.close()
        self.outputScalars()

    def outputScalars(self):
        self.writer.export_scalars_to_json(t_dir + 'scalars_%02d.json' %
                                           self.seq_index)
        self.writer.close()

    def evaluation(self, head):
        self.train_set.setBuffer(head)
        total_gt = 0.0
        total_ed = 0.0
        for step in xrange(1, self.train_test):
            self.train_set.loadNext()
            # print head+step, 'F',

            u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)

            # print 'Fo'
            m = self.train_set.m
            n = self.train_set.n
            ret = [[0.0 for i in xrange(n)] for j in xrange(m)]
            step_gt = self.train_set.step_gt
            total_gt += step_gt

            # update the edges
            # print 'T',
            for edge in self.train_set.candidates:
                e, gt, vs_index, vr_index = edge
                e = e.to(self.device).view(1, -1)
                v1 = self.train_set.getApp(1, vs_index)
                v2 = self.train_set.getApp(0, vr_index)
                e_ = self.Ephi(e, v1, v2, u_)
                self.train_set.edges[vs_index][vr_index] = e_.data.view(-1)
                tmp = F.softmax(e_)
                tmp = tmp.cpu().data.numpy()[0]
                ret[vs_index][vr_index] = float(tmp[0])

            self.train_set.showE()
            self.showU()

            # for j in ret:
            #     print j
            results = self.hungarian.compute(ret)
            # print head+step, results,
            step_ed = 0.0
            for (j, k) in results:
                step_ed += self.train_set.gts[j][k].numpy()[0]
            total_ed += step_ed

            # print 'Fi'
            # print 'Step ACC:{}/{}({}%)'.format(int(step_ed), int(step_gt), step_ed/step_gt*100)
            self.train_set.swapFC()

        tra_tst = 'training sets' if head == 1 else 'testing sets'
        # print 'Final {} ACC:{}/{}({}%)'.format(tra_tst, int(total_ed), int(total_gt), total_ed/total_gt*100)
        out = open(self.outName, 'a')
        print >> out, 'Final {} ACC:{}/{}({}%)'.format(
            tra_tst, int(total_ed), int(total_gt), total_ed / total_gt * 100)
        out.close()

    def showU(self):
        out = open(self.outName, 'a')
        print >> out, '     u'
        print >> out, self.u.view(
            10, -1)  # reshape u to a 10 x 10 view for printing
        out.close()
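# Added sketch of the Munkres (Hungarian) solver used in evaluation() above: compute() takes
# a rectangular cost matrix (a list of lists) and returns the (row, column) pairs of a
# minimum-total-cost assignment. The matrix below is purely illustrative.
from munkres import Munkres

cost = [[4, 1, 3],
        [2, 0, 5],
        [3, 2, 2]]
assignment = Munkres().compute(cost)  # row0 -> col1, row1 -> col0, row2 -> col2 (total cost 5)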
Exemplo n.º 30
0
                                    num_workers=args.workers,
                                    pin_memory=True)

        if args.ckpt:
            pass
        else:
            # save graph and clips_order samples
            for i, data in enumerate(train_dataloader):
                tuple_clips, targets = data
                for i in range(args.tl):
                    writer.add_video('train/tuple_clips',
                                     tuple_clips[:, i, :, :, :, :],
                                     i,
                                     fps=8)
                tuple_clips = tuple_clips.to(device)
                writer.add_graph(vcpn, tuple_clips)
                break
            # save init params at step 0
            for name, param in vcpn.named_parameters():
                writer.add_histogram('params/{}'.format(name), param, 0)

        ### loss function, optimizer and scheduler ###
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(vcpn.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.wd)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         'min',
                                                         min_lr=1e-5,
                                                         patience=50,
class Train:
    __device = []
    __writer = []
    __model = []
    __transformations = []
    __dataset_train = []
    __train_loader = []
    __loss_func = []
    __optimizer = []
    __exp_lr_scheduler = []

    def __init__(self, gpu='0'):
        # Device configuration
        self.__device = torch.device('cuda:'+gpu if torch.cuda.is_available() else 'cpu')
        self.__writer = SummaryWriter('logs')
        self.__model = CNNDriver()
        # Set model to train mode
        self.__model.train()
        print(self.__model)
        self.__writer.add_graph(self.__model, torch.rand(10, 3, 66, 200))
        # Put model on GPU
        self.__model = self.__model.to(self.__device)

    def train(self, num_epochs=100, batch_size=400, lr=0.0001, l2_norm=0.001, save_dir='./save', input='./DataLMDB'):
        # Create log/save directory if it does not exist
        if not os.path.exists('./logs'):
            os.makedirs('./logs')
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.__transformations = transforms.Compose([AugmentDrivingTransform(), 
                                                     RandomBrightness(), ConvertToGray(), 
                                                     ConvertToSepia(), AddNoise(), DrivingDataToTensor(),])
        self.__dataset_train = DriveData_LMDB(input, self.__transformations)
        self.__train_loader = DataLoader(self.__dataset_train, batch_size=batch_size, shuffle=True, num_workers=4)

        # Loss and Optimizer
        self.__loss_func = nn.MSELoss()
        # self.__loss_func = nn.SmoothL1Loss()
        self.__optimizer = torch.optim.Adam(self.__model.parameters(), lr=lr, weight_decay=l2_norm)

        # Decay LR by a factor of 0.1 every 10 epochs
        self.__exp_lr_scheduler = lr_scheduler.StepLR(self.__optimizer, step_size=15, gamma=0.1)

        print('Train size:', len(self.__dataset_train), 'Batch size:', batch_size)
        print('Batches per epoch:', len(self.__dataset_train) // batch_size)

        # Train the Model
        iteration_count = 0
        for epoch in range(num_epochs):
            for batch_idx, samples in enumerate(self.__train_loader):

                # Send inputs/labels to GPU
                images = samples['image'].to(self.__device)
                labels = samples['label'].to(self.__device)

                self.__optimizer.zero_grad()

                # Forward + Backward + Optimize
                outputs = self.__model(images)
                loss = self.__loss_func(outputs, labels.unsqueeze(dim=1))

                loss.backward()
                self.__optimizer.step()
                self.__exp_lr_scheduler.step(epoch)

                # Send loss to tensorboard
                self.__writer.add_scalar('loss/', loss.item(), iteration_count)
                self.__writer.add_histogram('steering_out', outputs.clone().detach().cpu().numpy(), iteration_count, bins='doane')
                self.__writer.add_histogram('steering_in', 
                                            labels.unsqueeze(dim=1).clone().detach().cpu().numpy(), iteration_count, bins='doane')

                # Get current learning rate (To display on Tensorboard)
                for param_group in self.__optimizer.param_groups:
                    curr_learning_rate = param_group['lr']
                    self.__writer.add_scalar('learning_rate/', curr_learning_rate, iteration_count)

                # Display on each epoch
                if batch_idx == 0:
                    # Send image to tensorboard
                    self.__writer.add_image('Image', images, epoch)
                    self.__writer.add_text('Steering', 'Steering:' + str(outputs[batch_idx].item()), epoch)
                    # Print Epoch and loss
                    print('Epoch [%d/%d] Loss: %.4f' % (epoch + 1, num_epochs, loss.item()))
                    # Save the Trained Model parameters
                    torch.save(self.__model.state_dict(), save_dir+'/cnn_' + str(epoch) + '.pkl')

                iteration_count += 1
Exemplo n.º 32
0
def train(h5file, h5key, pklfile, validationh5, trainedlossplot, train_target,
          train_lossh5):

    # ******* load the dataset from h5, then split it into train and test sets (16:1)

    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = Net(n_feature=75, n_output=1)
    # pklfile6 = 'train6/NN_train_params_3975284924_2.pkl'
    # net.load_state_dict(torch.load(pklfile6))
    net.cuda()
    net = net.double()
    print(net)

    # optimizer = torch.optim.SGD(net.parameters(), lr=LR, weight_decay=0.01,momentum=0.9)
    # optimizer = torch.optim.SGD(net.parameters(), lr=LR, momentum=0.5)
    # optimizer = torch.optim.Adagrad(net.parameters(), lr=LR, lr_decay=0.01)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=LR, weight_decay=5e-2)
    loss_func = nn.MSELoss()

    train_mode_file = Dir_training + "train_mode.txt"
    train_mode = open(train_mode_file, "w")
    train_mode.write(str(net) + '\n')
    train_mode.write("Activation:  " + "Relu" + '\n')
    train_mode.write("Optimizer:  " + str(optimizer) + '\n')
    train_mode.write("EPOCH:  " + str(EPOCH) + '\n')
    train_mode.write("BATCH_SIZE:  " + str(BATCH_SIZE) + '\n')
    train_mode.write("Leaning rate:  " + str(LR) + '\n')
    train_mode.write("Training data set  :  " + h5file + '\n')
    train_mode.write("Test data size  :  " + "1000" + '\n')
    train_mode.write("Additional  :  " + "For crystal 2626. And wide layer." +
                     '\n')
    train_mode.close()

    logdir = Dir_training + 'NN_logs_' + h5key
    if os.path.isdir(logdir):
        shutil.rmtree(logdir)
    logger = Logger(logdir)

    if os.path.exists(train_lossh5):
        print("The file", train_lossh5, " exist, will remove it!")
        os.remove(train_lossh5)
    else:
        print("The file", train_lossh5, "does not exist!")

    plt.ion()
    plt.figure(figsize=(10, 4))
    loss_list_train = []
    loss_list_test = []
    step_list = []
    # par_np = net.parameters()

    Step = 0
    lri = LR

    # ****** test dataset
    mydf_test = pd.read_hdf(h5file, h5key, start=0, stop=400)
    test_data_np = mydf_test.iloc[:, 4:].replace(np.nan, 0.0).values
    test_data_tensor = torch.from_numpy(test_data_np).double()

    if train_target == 'phi':
        test_labels_np = mydf_test.mcPhi.values.reshape(
            (mydf_test.shape[0], 1))
        test_rec_np = mydf_test.phi.values.reshape((mydf_test.shape[0], 1))
    elif train_target == 'theta':
        test_labels_np = mydf_test.mcTheta.values.reshape(
            (mydf_test.shape[0], 1))
        test_rec_np = mydf_test.theta.values.reshape((mydf_test.shape[0], 1))
    else:
        raise ValueError("Unknown train_target: {!r} (expected 'phi' or 'theta')".format(train_target))

    test_labels_tensor = torch.from_numpy(test_labels_np).double()
    test_rec_tensor = torch.from_numpy(test_rec_np).double()
    test_dataset = Data.TensorDataset(test_data_tensor, test_labels_tensor)
    test_loader = Data.DataLoader(test_dataset, batch_size=BATCH_SIZE_test)
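    # The first 400 rows of the HDF5 key serve as a fixed held-out test set; the chunked
    # training reader below starts at row 400, so the two sets do not overlap.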

    res = test_data_tensor.cuda()
    #res = Variable(torch.rand(75,640))
    writer = SummaryWriter(logdir)
    writer.add_graph(net, (res, ))
    writer.close()
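    # add_graph traces `net` with the (CUDA) test data tensor as example input and writes the
    # graph to the log directory; the SummaryWriter is closed right away because the per-step
    # scalars/histograms/images below go through the custom Logger instead.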

    for epoch in range(EPOCH):
        print('EPOCH:  ', epoch)
        loss_df_EPOCH_i = pd.DataFrame(columns=['step', 'train', 'test'])
        reader = pd.read_hdf(h5file,
                             h5key,
                             chunksize=BATCH_SIZE * 2,
                             start=400)
        for mydf_readd5 in reader:

            mydf_train = mydf_readd5
            # mydf_train = mydf_readd5.iloc[: int(mydf_readd5.shape[0]*15/16)]
            # mydf_test  = mydf_readd5.iloc[int(mydf_readd5.shape[0]*15/16):]
            # print(mydf_train.iloc[:,54:].head())
            # print(mydf_test.iloc[:,54:].head())
            # print(mydf_train.shape)

            # ****** train dataset
            train_data_np = mydf_train.iloc[:, 4:].replace(np.nan, 0.0).values
            train_data_tensor = torch.from_numpy(train_data_np).double()
            if train_target == 'phi':
                train_labels_np = mydf_train.mcPhi.values.reshape(
                    (mydf_train.shape[0], 1))
            elif train_target == 'theta':
                train_labels_np = mydf_train.mcTheta.values.reshape(
                    (mydf_train.shape[0], 1))
            else:
                raise ValueError("Unknown train_target: {!r} (expected 'phi' or 'theta')".format(train_target))

            train_labels_tensor = torch.from_numpy(train_labels_np).double()
            train_dataset = Data.TensorDataset(train_data_tensor,
                                               train_labels_tensor)
            train_loader = Data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=8)

            for step, data in enumerate(train_loader):
                # b_x, b_y = data
                b_X, b_Y = data
                b_x = b_X.cuda()
                b_y = b_Y.cuda()

                # ****** L2 regularization
                reg_lambda = torch.tensor(0.2)
                l2_reg = torch.tensor(0.)
                for param in net.parameters():
                    l2_reg += param.cpu().float().norm(2)
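                # Note: l2_reg is computed but currently unused, since the line adding
                # reg_lambda*l2_reg to the loss is commented out below; the commented-out
                # SGD optimizer above shows the more common weight_decay alternative.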

                prediction = net(b_x).cuda()
                loss = loss_func(prediction, b_y)
                # loss +=  (reg_lambda*l2_reg).cuda().double()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                Step += 1

                if (Step + 1) % 100 == 0:
                    test_output = net(test_data_tensor.cuda())
                    test_pred_y = test_output.cpu().data.numpy()
                    # test_pred_y = test_output.data.numpy()
                    # summed signed residual on the test set (logged below under the tag 'accuracy')
                    accuracy_test = sum(test_pred_y - test_labels_np)
                    loss_test = loss_func(test_output,
                                          test_labels_tensor.cuda())
                    # loss_rec = loss_func(test_rec_tensor.cuda(), test_labels_tensor.cuda())
                    print('Epoch:', epoch, '|step:', Step,
                          '|train loss:%.8f' % loss.item(),
                          '|test loss:%.8f' % loss_test.item())
                    step_list.append(Step)
                    loss_list_train.append(loss.item())
                    loss_list_test.append(loss_test.item())

                    loss_df = pd.DataFrame.from_dict({
                        'step': [Step],
                        'train': [loss.item()],
                        'test': [loss_test.item()]
                    })
                    loss_df.to_hdf(train_lossh5,
                                   key=h5key + 'step',
                                   append=True,
                                   mode='a')
                    loss_df_EPOCH_i = pd.DataFrame.from_dict({
                        'epoch': [epoch],
                        'train': [loss.item()],
                        'test': [loss_test.item()]
                    })

                    plt.subplot(131)
                    plt.cla()
                    plt.plot(step_list,
                             loss_list_train,
                             'b-',
                             lw=1,
                             label='train')
                    plt.plot(step_list,
                             loss_list_test,
                             'r-',
                             lw=3,
                             label='test')
                    plt.xlabel('step')
                    plt.ylabel('loss')
                    plt.text(10,
                             0.027,
                             'Loss_train=%.8f' % loss.item(),
                             fontdict={
                                 'size': 10,
                                 'color': 'blue'
                             })
                    plt.text(10,
                             0.025,
                             'Loss_test=%.8f' % loss_test.item(),
                             fontdict={
                                 'size': 10,
                                 'color': 'red'
                             })
                    # plt.text(10, 0.023, 'Loss_rec=%.8f' % loss_rec.data[0], fontdict={'size': 10, 'color':  'red'})
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent

                    Theta1 = 0.8336485385269553
                    Theta2 = 0.8647267287924316
                    if train_target == 'phi':
                        Range = [-3.2, 3.2]
                    elif train_target == 'theta':
                        Range = [Theta1 * 0.995, Theta2 * 1.005]  # [0.4, 2.4]

                    plt.subplot(133)
                    plt.cla()
                    plt.hist(test_labels_np,
                             bins=200,
                             range=Range,
                             color='red',
                             alpha=0.7,
                             fill=False,
                             histtype='step',
                             label='test_truth')
                    plt.hist(test_pred_y,
                             bins=200,
                             range=Range,
                             color='blue',
                             alpha=0.7,
                             fill=False,
                             histtype='step',
                             label='test_pre')
                    plt.hist(test_rec_np,
                             bins=200,
                             range=Range,
                             color='green',
                             alpha=0.7,
                             fill=False,
                             histtype='step',
                             label='test_rec')
                    plt.xlabel(r'$' + '\\' + train_target + '$')
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent

                    plt.subplot(132)
                    plt.cla()
                    plt.hist(b_y.cpu().data.numpy(),
                             bins=200,
                             range=Range,
                             color='red',
                             alpha=0.7,
                             fill=False,
                             histtype='step',
                             label='train_truth')
                    plt.hist(prediction.cpu().data.numpy(),
                             bins=200,
                             range=Range,
                             color='blue',
                             alpha=0.7,
                             fill=False,
                             histtype='step',
                             label='train_pre')
                    plt.xlabel(r'$' + '\\' + train_target + '$')
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent
                    plt.pause(0.1)

                    # ================================================================== #
                    #                        Tensorboard Logging                         #
                    # ================================================================== #

                    # 1. Log scalar values (scalar summary)
                    info = {
                        'loss': loss.item(),
                        'loss_test': loss_test.item(),
                        'accuracy': accuracy_test.item()
                    }

                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, Step + 1)

                    # 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.data.cpu().numpy(),
                                             Step + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.data.cpu().numpy(),
                                             Step + 1)

                    # 3. Log training images (image summary)
                    info = {'images': b_x.view(-1, 5, 5)[:10].cpu().numpy()}
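                    # each 75-feature input row is viewed as three 5x5 "images" (75 = 3 * 25);
                    # only the first 10 such patches are sent to the logger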

                    for tag, images in info.items():
                        logger.image_summary(tag, images, Step + 1)

        #lri = lri/(1 + 0.005)
        # print("lri:  ",lri)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = lri
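        # loss_df_EPOCH_i is rebuilt on every 100-step logging pass, so only the most recent
        # snapshot of this epoch's train/test loss is appended here (one row per epoch).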
        loss_df_EPOCH_i.to_hdf(train_lossh5,
                               key=h5key + 'epoch',
                               append=True,
                               mode='a')
        if (epoch + 1) % 50 == 0:
            pklfile_epoch = Dir_pkl + 'NN_train_params_epoch' + str(
                epoch) + '.pkl'
            torch.save(net.state_dict(), pklfile_epoch)

    plt.ioff()
    plt.savefig(trainedlossplot, dpi=300)
    plt.show()

    #loss_df = pd.DataFrame.from_dict({'step' : step_list, 'train' : loss_list_train, 'test' : loss_list_test})
    #loss_df.to_hdf(train_lossh5, key=h5key, mode='w')

    test_output = net(test_data_tensor[:10].cuda())
    test_pred_y = test_output.cpu().data.numpy()
    # test_pred_y = test_output.data.numpy()
    print('prediction number:  ', test_pred_y)
    print('real number:  ', test_labels_np[:10])

    # ****** The model after train
    for name, param in net.state_dict().items():
        print(name, param.size())

    # ****** save the whole model
    # torch.save(model_object, 'model.pkl')
    # only save the parameters ((recommended))
    torch.save(net.state_dict(), pklfile)

    test_pred_y = np.empty((0, 1))
    for step, data in enumerate(test_loader):
        t_X, t_Y = data
        t_x = t_X.cuda()
        t_y = t_Y.cuda()
        test_output = net(t_x).cuda()
        test_pred_y = np.vstack([test_pred_y, test_output.cpu().data.numpy()])

    # test_pred_y = np.delete(test_pred_y, 0, 0)
    print("shapes:  ", test_pred_y.shape)
    pred_df = pd.DataFrame(mydf_test[['mcPhi', 'phi', 'mcTheta', 'theta']])
    print("shapes:  ", test_pred_y.shape, pred_df.shape)
    if train_target == 'phi':
        pred_df['prePhi'] = test_pred_y
    elif train_target == 'theta':
        pred_df['preTheta'] = test_pred_y
    pred_df.to_hdf(validationh5, key=h5key, mode='w')
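
The training loop above logs scalars, parameter histograms and input images through a custom
Logger helper (scalar_summary, histo_summary, image_summary). A minimal, hypothetical sketch of
the same three kinds of calls done directly with torch.utils.tensorboard.SummaryWriter is shown
below; the log directory, tag names, stand-in model and tensor shapes are illustrative
assumptions, not values taken from the example.

import torch
from torch.utils.tensorboard import SummaryWriter

# Hypothetical stand-ins for the objects used in the training loop above.
writer = SummaryWriter('NN_logs_sketch')      # assumed log directory
net = torch.nn.Linear(75, 1)                  # stand-in for the trained Net
step = 1

# 1. Scalars (equivalent of Logger.scalar_summary)
writer.add_scalar('loss', 0.5, step)
writer.add_scalar('loss_test', 0.6, step)

# 2. Parameter histograms (equivalent of Logger.histo_summary)
for name, param in net.named_parameters():
    writer.add_histogram(name.replace('.', '/'), param.detach().cpu().numpy(), step)

# 3. A batch of images as an NCHW float tensor (equivalent of Logger.image_summary)
images = torch.rand(10, 1, 5, 5)
writer.add_images('images', images, step)

writer.close()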