Example #1
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]], [anchors[i + 2], anchors[i + 3]], [anchors[i + 4], anchors[i + 5]]] for i
               in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])
    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    # lr_scheduler = optim.lr_scheduler.StepLR(
    #     optimizer,
    #     step_size=config["lr"]["decay_step"],
    #     gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                                     config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(COCODataset(config["train_path"],
                                                         (config["img_w"], config["img_h"]),
                                                         is_training=True, is_scene=True),
                                             batch_size=config["batch_size"],
                                             shuffle=True, drop_last=True, num_workers=0, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    best_acc = 0.5
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1
            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls", "recall"]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()
            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']

            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            # if (losses[7] / 3 >= recall / (step + 1)):  # taken when mini_batch is 0
            recall += losses[7] / 3
            print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]' %
                (strftime, epoch, config["epochs"], step, dataload_len,
                 losses[1], losses[2], losses[3],
                 losses[4], losses[5], losses[6],
                 _loss, losses[7] / 3, recall / (step + 1), lr))

        if recall / len(dataloader) > best_acc:
            best_acc = recall / len(dataloader)
            if epoch > 0:
                torch.save(net.state_dict(), '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))

        lr_scheduler.step()
        net.train(is_training)
        torch.cuda.empty_cache()
    # net.train(True)
    logging.info("Bye bye")
Example #2
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    # real_model = net.module
    # real_model.eval()
    # dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    # save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    # logging.info("Exporting onnx to {}".format(save_path))
    # torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    # logging.info("Done. Exiting now.")
    # sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    # logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    # evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    # config["online_net"] = net

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(config["yolo"]["anchors"][i], config["yolo"]["classes"],
                     (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(COCODataset(
        config["train_path"], (config["img_w"], config["img_h"]),
        is_training=True),
                                             batch_size=config["batch_size"],
                                             shuffle=True,
                                             num_workers=32,
                                             pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = []
            for _ in range(len(losses_name)):
                losses.append([])
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f "
                    % (epoch, step, _loss, example_per_second, lr))
                config["tensorboard_writer"].add_scalar(
                    "lr", lr, config["global_step"])
                config["tensorboard_writer"].add_scalar(
                    "example/sec", example_per_second, config["global_step"])
                for i, name in enumerate(losses_name):
                    value = _loss if i == 0 else losses[i]
                    config["tensorboard_writer"].add_scalar(
                        name, value, config["global_step"])

        # if step > 0 and step % 1000 == 0:
        # net.train(False)
        # _save_checkpoint(net.state_dict(), config)
        # net.train(True)

        _save_checkpoint(net.state_dict(), config)
        lr_scheduler.step()

    # net.train(False)
    _save_checkpoint(net.state_dict(), config)
    # net.train(True)
    logging.info("Bye~")
Example #3
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]], [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])
    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    # real_model = net.module
    # real_model.eval()
    # dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    # save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    # logging.info("Exporting onnx to {}".format(save_path))
    # torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    # logging.info("Done. Exiting now.")
    # sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    # logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    # evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    # config["online_net"] = net

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                      config["yolo"]["classes"],
                      (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(COCODataset(
        config["train_path"], (config["img_w"], config["img_h"]),
        is_training=True,
        is_scene=True),
                                             batch_size=config["batch_size"],
                                             shuffle=True,
                                             drop_last=True,
                                             num_workers=0,
                                             pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1
            for mini_batch in range(3):
                mini_step += 1
                # Forward and backward
                optimizer.zero_grad()
                outputs = net(images)
                losses_name = [
                    "total_loss", "x", "y", "w", "h", "conf", "cls", "recall"
                ]
                losses = [0] * len(losses_name)
                for i in range(3):
                    _loss_item = yolo_losses[i](outputs[i], labels)
                    for j, l in enumerate(_loss_item):
                        losses[j] += l
                # losses = [sum(l) for l in losses]
                loss = losses[0]
                loss.backward()
                optimizer.step()
                _loss = loss.item()
                # example_per_second = config["batch_size"] / duration
                # lr = optimizer.param_groups[0]['lr']

                strftime = datetime.datetime.now().strftime("%H:%M:%S")
                if (losses[7] / 3 >= recall / (step + 1)
                        or mini_batch == 3 - 1):  # taken when mini_batch is 0
                    recall += losses[7] / 3
                    print(
                        '%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %d]'
                        %
                        (strftime, epoch, config["epochs"], step, dataload_len,
                         losses[1], losses[2], losses[3], losses[4], losses[5],
                         losses[6], _loss, losses[7] / 3, recall /
                         (step + 1), mini_batch))
                    break
                else:
                    print(
                        '%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,conf %.5f,cls %.5f,total %.5f,rec %.3f,prerc %.3f %d]'
                        % (strftime, epoch, config["epochs"], step,
                           dataload_len, losses[1], losses[2], losses[3],
                           losses[4], losses[5], losses[6], _loss,
                           losses[7] / 3, recall / step, mini_batch))
                    # logging.info(epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f "%
                    #     (epoch, step, _loss, example_per_second, lr))
                    # config["tensorboard_writer"].add_scalar("lr",
                    #                                         lr,
                    #                                         config["global_step"])
                    # config["tensorboard_writer"].add_scalar("example/sec",
                    #                                         example_per_second,
                    #                                         config["global_step"])
                    # for i, name in enumerate(losses_name):
                    #     value = _loss if i == 0 else losses[i]
                    #     config["tensorboard_writer"].add_scalar(name,
                    #                                             value,
                    #                                             config["global_step"])

        if ((epoch % 2 == 0 and recall / len(dataloader) > 0.7)
                or recall / len(dataloader) > 0.96):
            torch.save(
                net.state_dict(), '%s/%.4f_%04d.weights' %
                (checkpoint_dir, recall / len(dataloader), epoch))

        lr_scheduler.step()
    # net.train(True)
    logging.info("Bye bye")
Example #4
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)
    # lr_scheduler = optim.lr_scheduler.StepLR(
    #     optimizer,
    #     step_size=config["lr"]["decay_step"],
    #     gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)  # NOTE: dataloader and yolo_losses are assumed to be defined globally
    epoch_size = 4
    start = time.time()
    pruned_pct = 0
    global index, pruned_book, num_pruned
    global num_weights
    global weight_masks, bias_masks
    for epoch in range(config["epochs"]):
        if epoch % 4 == 0:
            index = 0
            num_pruned = 0
            num_weights = 0
            net.apply(prune)
            torch.save(net.state_dict(),
                       '%s/%.4f_%04d.weights' % (checkpoint_dir, 0.01, 1))
            print('previously pruned: %.3f %%' % (100 * pruned_pct))
            print('number pruned: %.3f %%' % (100 * num_pruned / num_weights))
            new_pruned = num_pruned / num_weights - pruned_pct
            pruned_pct = num_pruned / num_weights
            # if new_pruned <= 0.01:
            #     time_elapse = time.time() - start
            #     print('training time:', str(timedelta(seconds=time_elapse)))
            #     break
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            index = 0
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = [
                "total_loss", "x", "y", "w", "h", "conf", "cls", "recall"
            ]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()

            net.apply(set_grad)
            optimizer.step()
            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']

            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            recall += losses[7] / 3
            print(
                '%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]'
                % (strftime, epoch, config["epochs"], step, dataload_len,
                   losses[1], losses[2], losses[3], losses[4], losses[5],
                   losses[6], _loss, losses[7] / 3, recall / (step + 1), lr))

        if ((epoch % 2 == 0 and recall / len(dataloader) > 0.5)
                or recall / len(dataloader) > 0):
            # torch.save(net.state_dict(), '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
            torch.save(
                net.state_dict(), '%s/%.4f_%04d.weights' %
                (checkpoint_dir, recall / len(dataloader), epoch))

        lr_scheduler.step()
    # net.train(True)
    logging.info("Bye bye")
Example #5
def train():
    global_step = 0
    is_training = True

    # Load and Initialize Network
    net = ModelMain(is_training)
    net.train(is_training)

    # Optimizer and Lr
    optimizer = _get_optimizer(net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=lr_decay_step,  #20
        gamma=lr_decay_gamma)  # 0.1

    # Set Data Parallel
    net = nn.DataParallel(net)
    net = net.cuda()
    logging.info("Net of Cuda is Done!")

    # Restore pretrained model
    if pretrain_snapshot:
        logging.info(
            "Load pretrained weights from {}".format(pretrain_snapshot))
        state_dict = torch.load(pretrain_snapshot)
        net.load_state_dict(state_dict)

    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(anchors[i], classes, (img_w, img_h)).cuda())
    print('YOLO_Losses: \n', yolo_losses)

    # DataLoader
    train_data_loader = DATA.DataLoader(dataset=COCODataset(train_path,
                                                            (img_w, img_h),
                                                            is_training=True),
                                        batch_size=batch_size,
                                        shuffle=True,
                                        pin_memory=False)
    # Start the training loop
    logging.info("Start training......")
    for epoch in range(epochs):
        for step, samples in enumerate(train_data_loader):
            images, labels = samples['image'].cuda(), samples["label"].cuda()
            start_time = time.time()
            global_step += 1

            # Forward & Backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = [[] for _ in losses_name]  # one independent list per loss term
            for i in range(3):  # YOLO 3 scales
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    # print('j: ', j, 'l: ', l)  # j: index (0-6); l holds: total loss, x, y, w, h, conf, cls
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]  # losses[0] is the total loss
            conf = losses[5]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                _conf = conf.item()
                duration = float(time.time() - start_time)  # elapsed time for this step
                example_per_second = batch_size / duration  # samples per second
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f conf = %.2f example/sec = %.3f lr = %.5f "
                    % (epoch, step, _loss, _conf, example_per_second, lr))
            if step >= 0 and step % 1000 == 0:
                # net.train(False)
                _save_checkpoint(net.state_dict(), epoch, step)
                # net.train(True)

        lr_scheduler.step()

    _save_checkpoint(net.state_dict(), 100, 9999)
    logging.info("Bye~")
Example #6
def train(imgs, labels, checkpoint_path, config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if checkpoint_path:
        logging.info("Load pretrained weights from {}".format(checkpoint_path))
        state_dict = torch.load(checkpoint_path)
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(config["yolo"]["anchors"][i], config["yolo"]["classes"],
                     (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(SatDataset(
        imgs, labels, (config["img_w"], config["img_h"]), is_training=True),
                                             batch_size=config["batch_size"],
                                             shuffle=True,
                                             num_workers=1,
                                             pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = [[] for _ in losses_name]  # one independent list per loss term
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f "
                    % (epoch, step, _loss, example_per_second, lr))
                config["tensorboard_writer"].add_scalar(
                    "lr", lr, config["global_step"])
                config["tensorboard_writer"].add_scalar(
                    "example/sec", example_per_second, config["global_step"])
                for i, name in enumerate(losses_name):
                    value = _loss if i == 0 else losses[i]
                    config["tensorboard_writer"].add_scalar(
                        name, value, config["global_step"])
        lr_scheduler.step()

    # net.train(False)
    checkpoint_path = _save_checkpoint(net.state_dict(), config)
    # net.train(True)
    logging.info("Bye~")
    return checkpoint_path
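
Unlike the other examples, Example #6 takes the previous checkpoint as an argument and returns the new one, so training rounds can be chained. An illustrative driver; data_rounds is a hypothetical iterable of (imgs, labels) pairs:

# Each round resumes from the checkpoint the previous round produced.
checkpoint = None
for imgs, labels in data_rounds:
    checkpoint = train(imgs, labels, checkpoint, config)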
Example #7
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]], [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])
    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    t_max = 50
    # lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max,eta_min=1e-05)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()
    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    # real_model = net.module
    # real_model.eval()
    # dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    # save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    # logging.info("Exporting onnx to {}".format(save_path))
    # torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    # logging.info("Done. Exiting now.")
    # sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    # logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    # evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    # config["online_net"] = net

    # YOLO loss with 3 scales

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True,
                    is_scene=True),
        batch_size=config["batch_size"] * config["parallels"],
        shuffle=True,
        drop_last=True,
        num_workers=0,
        pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    best_acc = 0.2
    last_recall = 0.6
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            start = time.time()
            images, labels = samples["image"], samples["label"]
            config["global_step"] += 1
            # Forward and backward
            optimizer.zero_grad()
            losses = net(images.cuda(), labels.cuda())

            # current_recall = mAP(detections, labels, config["img_w"])
            # current_recall = np.mean(current_recall)

            if config["parallels"] > 1:
                losses = losses.view(config["parallels"], 8)[0] + losses.view(
                    config["parallels"], 8)[1]
            loss = losses[0]
            if epoch > 0:
                loss = loss * 20
            current_recall = float(losses[7] / 3 / config["parallels"])
            loss = loss + 20 * (1 - current_recall)  # penalty shrinks as recall approaches 1

            loss.backward()
            optimizer.step()
            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']
            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            # # if (losses[7] / 3 >= recall / (step + 1)):  # taken when mini_batch is 0
            recall += current_recall
            print(
                '%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]'
                % (strftime, epoch, config["epochs"], step, dataload_len,
                   losses[1], losses[2], losses[3], losses[4], losses[5],
                   losses[6], _loss, current_recall, recall / (step + 1), lr))
        last_recall = recall / len(dataloader)
        if recall / len(dataloader) > best_acc:
            best_acc = recall / len(dataloader)
            torch.save(
                net.state_dict(), '%s/%.4f_%04d.weights' %
                (checkpoint_dir, recall / len(dataloader), epoch))

        lr_scheduler.step()
        # if epoch % (lr_scheduler.T_max + next_need) == (lr_scheduler.T_max + next_need - 1):
        #     next_need += float(lr_scheduler.T_max)
        #     lr_scheduler.T_max += 2
        #     lr_scheduler.last_epoch = 0
        # lr_scheduler.base_lrs*=0.98
        # lr_scheduler.base_lrs[0] *= 0.95
        # lr_scheduler.base_lrs[1] *= 0.95

        # net.train(is_training)
        # torch.cuda.empty_cache()
    # net.train(True)
    logging.info("Bye bye")
Example #8
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]], [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])
    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                      config["yolo"]["classes"],
                      (config["img_w"], config["img_h"])))

    total_loss = 0
    last_total_loss = 0

    manager = Manager()
    # The parent process creates the Queue and hands it to each worker process:
    q = manager.Queue(1)
    lock = manager.Lock()  # create a lock shared across processes
    p = Pool()
    pw = p.apply_async(get_data, args=(q, lock))

    batch_len = q.get()
    if batch_len[0] == "len":
        batch_len = batch_len[1]
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        recall = 0
        for step in range(batch_len):
            samples = q.get()
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = [
                "total_loss", "x", "y", "w", "h", "conf", "cls", "recall"
            ]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 2 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']

                strftime = datetime.datetime.now().strftime("%H:%M:%S")
                recall += losses[7] / 3
                print(
                    '%s [Epoch %d/%d, Batch %03d/%d losses: x %.5f, y %.5f, w %.5f, h %.5f, conf %.5f, cls %.5f, total %.5f, recall: %.3f]'
                    % (strftime, epoch, config["epochs"], step, batch_len,
                       losses[1], losses[2], losses[3], losses[4], losses[5],
                       losses[6], _loss, losses[7] / 3))
                # logging.info(epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f "%
                #     (epoch, step, _loss, example_per_second, lr))
                # config["tensorboard_writer"].add_scalar("lr",
                #                                         lr,
                #                                         config["global_step"])
                # config["tensorboard_writer"].add_scalar("example/sec",
                #                                         example_per_second,
                #                                         config["global_step"])
                # for i, name in enumerate(losses_name):
                #     value = _loss if i == 0 else losses[i]
                #     config["tensorboard_writer"].add_scalar(name,
                #                                             value,
                #                                             config["global_step"])

        if (epoch % 2 == 0
                and recall / batch_len > 0.7) or recall / batch_len > 0.96:
            torch.save(net.state_dict(),
                       '%s/%04d.weights' % (checkpoint_dir, epoch))

        lr_scheduler.step()
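
Example #8 replaces the DataLoader with a producer process feeding a Manager queue; get_data itself is not shown. A minimal compatible producer, assuming it announces the batch count as a ("len", n) tuple before streaming batches (build_batches and the epoch count are assumptions):

def get_data(q, lock):
    # Hypothetical producer matching the consumer loop above.
    batches = build_batches()  # assumed helper yielding {"image": ..., "label": ...}
    q.put(("len", len(batches)))  # the consumer reads this first
    for _ in range(config["epochs"]):  # the consumer drains len(batches) items per epoch
        for samples in batches:
            q.put(samples)  # blocks while the size-1 queue is full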
Example #9
def train(config):
    # Hyper-parameters
    config["global_step"] = config.get("start_step", 0)
    is_training = True

    # Net & Loss & Optimizer
    ## Net Main
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    ## YOLO Loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_loss = YOLOLoss(config["yolo"]["anchors"][i],
                             config["yolo"]["classes"], (config["img_w"], config["img_h"]))
        yolo_losses.append(yolo_loss)

    ## Optimizer and LR scheduler
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=config["lr"]["decay_step"], gamma=config["lr"]["decay_gamma"])

    net = nn.DataParallel(net)
    net = net.cuda()

    # Load checkpoint
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # DataLoader
    dataloader = torch.utils.data.DataLoader(AIPrimeDataset(config["train_path"]),
                                             batch_size=config["batch_size"],
                                             shuffle=True, num_workers=16, pin_memory=False)

    # Start the training
    logging.info("Start training.")
    for epoch in range(config["start_epoch"], config["epochs"]):
        for step, (images, labels) in enumerate(dataloader):
            start_time = time.time()
            config["global_step"] += 1

            # Forward
            outputs = net(images)

            # Loss
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = [[] for _ in losses_name]  # one independent list per loss term
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]

            # Zero & Backward & Step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Logging
            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                    (epoch, step, _loss, example_per_second, lr)
                )

        # Things to be done for every epoch
        ## LR schedule
        lr_scheduler.step()
        ## Save checkpoint
        _save_checkpoint(net.state_dict(), config, epoch)

    # Finish training
    logging.info("QiaJiaBa~ BeiBei")