Example #1
def do_training(model, torch_dataset, torch_dataset_test, num_epochs, writer):
    data_loader = DataLoader(
        torch_dataset, batch_size=8, shuffle=True, collate_fn=utils.collate_fn
    )
    data_loader_test = DataLoader(
        torch_dataset_test, batch_size=2, shuffle=False, collate_fn=utils.collate_fn
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print("Using device {}".format(device))
    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        log = train_one_epoch(
            model, optimizer, data_loader, device, epoch, print_freq=10
        )

        writer.add_scalar("Train/Learning rate", log.meters["lr"].value, epoch)
        writer.add_scalar("Train/Loss", log.meters["loss"].value, epoch)

        lr_scheduler.step()

        evaluate(model, data_loader_test, device)
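
Both loaders above rely on utils.collate_fn from the torchvision detection references. If that module is not on your path, a minimal equivalent (matching the reference implementation) is:

def collate_fn(batch):
    # detection targets vary in size per image, so return tuples of images
    # and targets instead of stacking them into tensors
    return tuple(zip(*batch))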
Example #2
File: bbox.py Project: p-ai-org/P-Agent
def main():
    """
    For data_dir, we assume hierarchy (data_dir/Normal, data_dir/Segmentation)
    and all images include package
    model_out_path: the path/name of the model file saved
    """
    data_dir = sys.argv[1]
    model_out_path = sys.argv[2]
    num_epochs = int(sys.argv[3]) if len(sys.argv) > 3 else 6
    plot_train_loss = bool(distutils.util.strtobool(sys.argv[4])) if len(sys.argv) > 4 else False
    plot_valid_loss = bool(distutils.util.strtobool(sys.argv[5])) if len(sys.argv) > 5 else False

    # using dataset and defined transformations
    dataset = PackageDataset(data_dir, get_transform(train=True))
    dataset_test = PackageDataset(data_dir, get_transform(train=False))

    # split dataset into train and test set
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-20])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-20:])
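    # note: dataset and dataset_test wrap the same directory with different
    # transforms; the complementary index slices keep the splits disjoint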

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=references.utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=references.utils.collate_fn)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # only background and package for this dataset
    num_classes = 2

    # get model
    my_model = get_model_instance_segmentation(num_classes)

    # move model to correct device
    my_model.to(device)

    # construct optimizer
    params = [p for p in my_model.parameters() if p.requires_grad]

    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)

    # lr_scheduler that decreases learning rate by 10x every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # TRAINING
    all_train_loss = []
    all_valid_loss = []
    for epoch in range(num_epochs):
        print(f"TRAIN")
        # train for one epoch and printing every 10 iterations
        # train_one_epoch(my_model, optimizer, data_loader, device, epoch, print_freq=10)
        _, all_loss_epoch = train_one_epoch(my_model, optimizer, data_loader, device, epoch, print_freq=60)
        
        if plot_train_loss:
            num_recorded = len(all_loss_epoch)
            avg_loss_epoch = sum(all_loss_epoch) / num_recorded
            all_train_loss.extend(all_loss_epoch)

        print(f"AVERAGE LOSS EPOCH {epoch}: {avg_loss_epoch}")

        # Validation loss
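        # note: torchvision detection models return the loss dict only in
        # training mode; train_one_epoch leaves the model in train mode, so
        # this loop yields losses rather than predictions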
        if plot_valid_loss:
            with torch.no_grad():
                for images, targets in data_loader_test:
                    images = list(image.to(device) for image in images)
                    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                    loss_dict = my_model(images, targets)

                    loss_dict_reduced = references.utils.reduce_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                    loss_value = losses_reduced.item()

                    all_valid_loss.append(loss_value)

        # update learning rate
        lr_scheduler.step()
        print(f"EVALUATE")

        # evaluate on test dataset
        evaluate(my_model, data_loader_test, device=device)

    if plot_train_loss:
        # map each recorded iteration onto a fractional epoch for the x-axis
        train_x = [a * num_epochs / len(all_train_loss) for a in range(len(all_train_loss))]
        plt.plot(train_x, all_train_loss, label="training loss")

    if plot_valid_loss:
        # scale by num_epochs (not len(all_train_loss)) so the x-axis stays in
        # epochs even when the training loss is not plotted
        valid_x = [a * num_epochs / len(all_valid_loss) for a in range(len(all_valid_loss))]
        plt.plot(valid_x, all_valid_loss, label="validation loss")
    
    if plot_train_loss or plot_valid_loss:
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.title("Loss vs. Epochs")

        plt.xticks(range(num_epochs + 1))
        plt.savefig(f"{model_out_path}-loss.png")

    torch.save(my_model, model_out_path)
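
A hypothetical invocation of this script, assuming the directory layout from the docstring (paths and flag values are illustrative):

# python bbox.py ./data ./package_model.pth 6 True True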
Example #3
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params,
                            lr=0.005,
                            momentum=0.9,
                            weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

# Saving Model for Inference
torch.save(model.state_dict(), "dict.pth")

print("That's it!")
Example #4
def main(args):

    experiment_name = ''
    output_base_url = osp.join(
        project_root, 'weights',
        '{}_{}'.format(experiment_name,
                       datetime.now().strftime("%d-%m-%Y-%H-%M")))

    # get model
    print('loading model...')
    model = get_resnet50_pretrained_model()

    # create datasets
    print('loading train dataset...')
    train_dataset = PoseDataset([
        osp.join(project_root, 'data/vzf/freestyle/freestyle_1'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_2'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_3'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_4')
    ],
                                train=True)
    print('train dataset size: {}'.format(len(train_dataset)))

    print('loading val dataset...')
    val_dataset = PoseDataset([
        osp.join(project_root, 'data/vzf/freestyle/freestyle_5'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_6')
    ],
                              train=False)
    print('test dataset size: {}'.format(len(val_dataset)))

    # split the dataset in train and test set
    #indices = torch.randperm(len(dataset)).tolist()
    #dataset_train = torch.utils.data.Subset(dataset, indices[:-20])
    #dataset_test = torch.utils.data.Subset(dataset, indices[-20:])

    # create dataloaders
    print('creating dataloaders...')
    data_loader = DataLoader(train_dataset,
                             batch_size=10,
                             shuffle=True,
                             num_workers=4,
                             collate_fn=collate_fn)

    data_loader_test = DataLoader(val_dataset,
                                  batch_size=10,
                                  shuffle=True,
                                  num_workers=4,
                                  collate_fn=collate_fn)

    # get device
    device = select_best_gpu(
        min_mem=6100) if torch.cuda.is_available() else torch.device('cpu')
    print('selected device: {}'.format(device))

    # only set roi_heads trainable
    train_only_roi_heads(model)

    # grab trainable parameters
    params = [p for p in model.parameters() if p.requires_grad]

    # create optimizer and scheduler
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # train
    print('loading model onto device')
    model.to(device)

    # training mode
    model.train()

    start = time.time()

    # track the best validation losses seen so far, and keep a fallback
    # state dict in case the first validation pass does not improve
    min_box_loss = float("inf")
    min_kp_loss = float("inf")
    temp_state_dict = copy.deepcopy(model.state_dict())

    num_epochs = 100
    get_validation_error(model, data_loader_test, device)  # initial validation pass (result unused)
    for epoch in tqdm(range(0, num_epochs)):
        train_one_epoch(model,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=10)

        if epoch % 5 == 0 or epoch == num_epochs - 1:
            lr_scheduler.step()
            # validation
            box_loss, kp_loss = get_validation_error(model, data_loader_test,
                                                     device)
            print('box_loss: {}, kp_loss: {}'.format(box_loss, kp_loss))
            if kp_loss < min_kp_loss:
                print('improved val score, saving state dict...')
                # lower validation score found
                min_kp_loss = kp_loss
                min_box_loss = box_loss

                temp_state_dict = copy.deepcopy(model.state_dict())
            else:
                print(
                    'loading previous state dict (current best: {})...'.format(
                        min_kp_loss))
                model.load_state_dict(temp_state_dict)

        # every 10 epochs use coco to evaluate
        if epoch % 10 == 0 or epoch == num_epochs - 1:
            print('COCO EVAL EPOCH {}'.format(epoch))
            evaluate(model, data_loader_test, device=device)

    evaluator = evaluate(model, data_loader_test, device=device)
    torch.save(
        model.state_dict(),
        output_base_url + '_epoch{}-{}_min_val_loss_{}.wth'.format(
            epoch, num_epochs, min_kp_loss))
    end = time.time()

    duration_min = int((end - start) / 60)

    # post result to slack channel
    slack_message(
        "Done Training, took {}min \n box loss: {}, KP loss: {}".format(
            duration_min, min_box_loss, min_kp_loss),
        channel='#training')
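
train_only_roi_heads is a project-specific helper; a plausible implementation for a torchvision detection model, which exposes its ROI heads as model.roi_heads, might look like:

def train_only_roi_heads(model):
    # freeze every parameter, then re-enable gradients for the ROI heads only
    for p in model.parameters():
        p.requires_grad = False
    for p in model.roi_heads.parameters():
        p.requires_grad = True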
Example #5
File: train.py Project: mlej8/iWildCam
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.00001,
                                momentum=0.9,
                                weight_decay=0.0005)

    # and a learning rate scheduler which decreases the learning rate by 10x every 10 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=10,
                                                   gamma=0.1)
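    # with lr=1e-5, step_size=10, gamma=0.1 and one scheduler step per epoch,
    # epochs 1-10 run at 1e-5, epochs 11-20 at 1e-6, and so on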

    for epoch in range(1, NUM_EPOCHS + 1):
        train_one_epoch(model,
                        optimizer,
                        train_loader,
                        device,
                        epoch,
                        print_freq=200)

        # update the learning rate
        lr_scheduler.step()

        # save a checkpoint every 2 epochs (the evaluation call is disabled)
        if epoch % 2 == 0:
            # evaluate_and_write_result_files(model, data_loader_test)
            torch.save(model.state_dict(),
                       os.path.join("model", f"model_epoch_{epoch}.model"))
Example #6
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset = BirdDataset(name=args.dataset, transforms=get_transform(True), train=True,
                          small_set=args.small_set, only_instance=args.only_instance)
    dataset_test = BirdDataset(name=args.dataset, transforms=get_transform(False), train=False,
                               small_set=args.small_set, only_instance=args.only_instance)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    if args.model == 'normal':
        print('normal model')
        model = get_model_attention(num_classes=args.num_classes,
            use_focal_loss=args.use_focal_loss, focal_gamma=args.focal_gamma,
            use_attention=False)
    elif args.model == 'attention':
        print('attention model')
        model = get_model_attention(num_classes=args.num_classes,
                                    attention_head_output_channels=args.num_parts,
                                    use_focal_loss=args.use_focal_loss, focal_gamma=args.focal_gamma,
                                    use_attention=True)
    elif args.model == 'attention_transformer':
        print('attention transformer model')
        model = get_model_attention(transformer=True, num_classes=args.num_classes,
                                    attention_head_output_channels=args.num_parts,
                                    use_focal_loss=args.use_focal_loss, focal_gamma=args.focal_gamma,
                                    use_attention=True)
    else:
        raise ValueError("'model' must be 'normal', 'attention' or 'attention_transformer'")
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
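    # MultiStepLR drops the lr by lr_gamma at each epoch listed in lr_steps,
    # rather than at a fixed interval like the commented-out StepLR above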

    if args.resume:
        print("load resume")
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.ft:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluator = evaluate(model, data_loader_test, device=device, epoch=0, name=Path(args.output_dir).name, do_record=True)
        return evaluator

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq,
                        name=Path(args.output_dir).name, use_aug=args.use_aug)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        if not args.no_eval:
            do_record = False
            if epoch == args.epochs - 1:
                do_record = True
            evaluate(model, data_loader_test, epoch=epoch, device=device, name=Path(args.output_dir).name, do_record=do_record)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
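
Since this script calls utils.init_distributed_mode from the torchvision references, it is typically launched with torchrun; a hypothetical invocation (process count and flag spellings are illustrative):

# torchrun --nproc_per_node=4 train.py --model attention --batch-size 2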
Example #7
File: train.py Project: chan8616/PoAI
def do(config):
    # use our dataset and defined transformations
    if config.data_type == 'PennFudanPed':
        dataset = PennFudanDataset(config.root, get_transform(train=True))
        dataset_test = PennFudanDataset(config.root,
                                        get_transform(train=False))
    else:
        dataset = UserDataset(config.root, get_transform(train=True))
        dataset_test = UserDataset(config.root, get_transform(train=False))

    # split the dataset in train and test set
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=utils.collate_fn)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # the number of classes (e.g. background + person) comes from config.num_classes

    # get the model using our helper function
    model = get_instance_segmentation_model(config.num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    # and a learning rate scheduler which decreases the learning rate by
    # 10x every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=config.step_size,
                                                   gamma=config.gamma)

    for epoch in range(config.num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    torch.save(model, os.path.join(config.save_directory, config.ckpt_name))
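
Note that this saves the entire module via pickle rather than a state_dict, so reloading requires the original class definitions to be importable; a minimal sketch:

model = torch.load(os.path.join(config.save_directory, config.ckpt_name))
model.eval()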