Exemplo n.º 1
0
def run_srgan(device,
              image_size,
              batch_size,
              config,
              run_dir,
              saved_dir,
              run_name,
              num_epochs,
              val_dataset,
              train_dataset=None,
              checkpoints=None,
              mode='train',
              gpu_num=1):
    """Train or evaluate an SRGAN and pickle the resulting FID scores.

    Args:
        device: device the generator and discriminator are moved to.
        image_size: side length of the discriminator input used for the
            summary; the generator summary uses ``image_size // 4``.
        batch_size: forwarded to ``train_gan`` / ``calc_inception_FID_score``.
        config: forwarded to ``train_gan``.
        run_dir: forwarded to ``train_gan``.
        saved_dir: directory checkpoints are loaded from and the score
            pickle is written to; also forwarded to ``train_gan``.
        run_name: run label; it is prefixed with ``'SRGAN_'`` before being
            forwarded to ``train_gan``.
        num_epochs: forwarded to ``train_gan``.
        val_dataset: validation dataset used for scoring.
        train_dataset: training dataset, only used when ``mode == 'train'``.
        checkpoints: optional mapping with ``"generator"`` and
            ``"discriminator"`` entries passed to
            ``utils.load_from_checkpoint``.
        mode: ``'train'`` to train the networks, ``'test'`` to only compute
            the FID score of the (possibly restored) generator.
        gpu_num: not used by this function.

    Returns:
        In train mode, whatever ``train_gan`` returns (it is called with
        ``calc_IS=False``); in test mode, a one-element list holding the
        result of ``calc_inception_FID_score``.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail fast: an unknown mode would otherwise build both networks and then
    # crash on the unbound score variable when the results are saved.
    if mode not in ('train', 'test'):
        raise ValueError(
            "mode should be either 'train' or 'test'. "
            "The provided value is {!r}".format(mode))

    srgan_generator = SRGenerator(device).to(device)
    srgan_discriminator = SRDiscriminator().to(device)
    summary(srgan_discriminator, (3, image_size, image_size))
    summary(srgan_generator, (3, image_size // 4, image_size // 4))

    if checkpoints is not None:
        utils.load_from_checkpoint(srgan_generator, saved_dir,
                                   checkpoints["generator"])
        utils.load_from_checkpoint(srgan_discriminator, saved_dir,
                                   checkpoints["discriminator"])

    run_name = 'SRGAN' + '_' + run_name
    if mode == 'train':
        inception_FID_scores = train_gan(num_epochs,
                                         batch_size,
                                         None,
                                         device,
                                         train_dataset,
                                         val_dataset,
                                         srgan_generator,
                                         srgan_discriminator,
                                         type='SRGAN',
                                         config=config,
                                         run_dir=run_dir,
                                         saved_dir=saved_dir,
                                         run_name=run_name,
                                         calc_IS=False)
    else:  # mode == 'test'
        inception_FID_scores = [
            calc_inception_FID_score(batch_size,
                                     device,
                                     val_dataset,
                                     srgan_generator,
                                     type='SRGAN')
        ]

    # The file name only carries month/day/year/hour, so two runs finishing
    # within the same hour write to the same pickle.
    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    utils.save_to_pickle(
        inception_FID_scores,
        os.path.join(saved_dir, 'srgan_fid_' + date_str + ".pickle"))

    return inception_FID_scores
Exemplo n.º 2
0
def ternausnet(num_classes, state_dict_path):
    """Build a UNet11 network and load its weights from a checkpoint.

    Args:
        num_classes: passed to the ``UNet11`` constructor.
        state_dict_path: checkpoint location handed to
            ``load_from_checkpoint`` together with the new network.

    Returns:
        Whatever ``load_from_checkpoint`` returns for the freshly built
        network (presumably the network with its weights restored).
    """
    return load_from_checkpoint(state_dict_path, UNet11(num_classes))
Exemplo n.º 3
0
def main(params):
    """
    Run inference (classification or semantic segmentation) on the images
    listed in a csv file.
    :param params: (dict) Parameters found in the yaml config file.

    """
    start_time = time.time()
    csv_file = params['inference']['img_csv_file']

    bucket = None
    bucket_name = params['global']['bucket_name']

    # The third value returned by net() (the model name) is not needed here.
    model, state_dict_path, _ = net(params, inference=True)
    if torch.cuda.is_available():
        model = model.cuda()

    # With a bucket configured, the csv is fetched from S3 before being read.
    if bucket_name:
        bucket = boto3.resource('s3').Bucket(bucket_name)
        bucket.download_file(csv_file, 'img_csv_file.csv')
        csv_file = 'img_csv_file.csv'
    list_img = read_csv(csv_file, inference=True)

    task = params['global']['task']
    if task == 'classification':
        classifier(params, list_img, model)

    elif task == 'segmentation':
        # Weights are loaded from a local copy when they live in the bucket.
        checkpoint_file = state_dict_path
        if bucket:
            checkpoint_file = "saved_model.pth.tar"
            bucket.download_file(state_dict_path, checkpoint_file)
        model = load_from_checkpoint(checkpoint_file, model)

        chunk_size, nbr_pix_overlap = calc_overlap(params)
        num_classes = params['global']['num_classes']
        for entry in list_img:
            tif_path = entry['tif']
            img_name = os.path.basename(tif_path)
            # Output name is built from the part of the name before the first dot.
            output_name = f"{img_name.split('.')[0]}_inference.tif"
            if bucket:
                local_img = f"Images/{img_name}"
                bucket.download_file(tif_path, local_img)
                inference_image = f"Classified_Images/{output_name}"
            else:
                local_img = tif_path
                inference_image = os.path.join(
                    params['inference']['working_folder'], output_name)

            assert_band_number(local_img, params['global']['number_of_bands'])

            raster_array = image_reader_as_array(local_img)
            predictions = sem_seg_inference(model, raster_array,
                                            nbr_pix_overlap, chunk_size,
                                            num_classes)
            create_new_raster_from_base(local_img, inference_image,
                                        predictions)
            print(f"Semantic segmentation of image {img_name} completed")
            if bucket:
                remote_target = os.path.join(
                    params['inference']['working_folder'], output_name)
                bucket.upload_file(inference_image, remote_target)
    else:
        raise ValueError(
            f"The task should be either classification or segmentation. The provided value is {params['global']['task']}"
        )

    time_elapsed = time.time() - start_time
    print('Inference completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
Exemplo n.º 4
0
def main(bucket_name, data_path, output_path, num_trn_samples, num_val_samples,
         pretrained, batch_size, num_epochs, learning_rate, weight_decay,
         step_size, gamma, num_classes, class_weights, batch_metrics, model,
         classifier, model_name):
    """Function to train and validate a models for semantic segmentation or classification.
    Args:
        bucket_name: bucket in which data is stored if using AWS S3
        data_path: full file path of the folder containing h5py files
        output_path: full file path in which the model will be saved. When a bucket is used, this is the
            destination inside the bucket and a local folder named 'output_path' is used as working directory.
        num_trn_samples: number of training samples
        num_val_samples: number of validation samples
        pretrained: path of a checkpoint to load the model and optimizer from. An empty string skips loading.
        batch_size: number of samples to process simultaneously
        num_epochs: number of epochs
        learning_rate: learning rate
        weight_decay: weight decay
        step_size: step size
        gamma: multiplicative factor of learning rate decay
        num_classes: number of classes
        class_weights: weights to apply to each class. A value > 1.0 will apply more weights to the learning of the class
        batch_metrics:(int) Metrics computed every (int) batches. If left blank, will not perform metrics.
        model: CNN model (tensor)
        classifier: True if doing image classification, False if doing semantic segmentation.
        model_name: name of the model used for training.
    Returns:
        None. Files 'checkpoint.pth.tar' (best validation loss so far) and 'last_epoch.pth.tar' containing
        trained weight are written to the output folder, and uploaded to the bucket when one is used.
    """
    if bucket_name:
        # Keep the remote destination (may be None) and work in a local folder.
        bucket_output_path = output_path
        output_path = 'output_path'
        os.makedirs(output_path, exist_ok=True)
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if classifier:
            class_file = os.path.join(output_path, 'classes.csv')
            if bucket_output_path:
                remote_class_file = os.path.join(bucket_output_path,
                                                 'classes.csv')
            else:
                remote_class_file = 'classes.csv'
            for split in ['trn', 'val']:
                get_s3_classification_images(split, bucket, bucket_name,
                                             data_path, output_path,
                                             num_classes)
                bucket.upload_file(class_file, remote_class_file)
            data_path = 'Images'
        else:
            # Sample files keep the same relative name locally as in the bucket.
            for split in ('trn', 'val'):
                sample_file = f'samples/{split}_samples.hdf5'
                if data_path:
                    remote_file = os.path.join(data_path, sample_file)
                else:
                    remote_file = sample_file
                bucket.download_file(remote_file, sample_file)
            verify_sample_count(num_trn_samples, num_val_samples, data_path,
                                bucket_name)
    elif classifier:
        get_local_classes(num_classes, data_path, output_path)
    else:
        verify_sample_count(num_trn_samples, num_val_samples, data_path,
                            bucket_name)
    verify_weights(num_classes, class_weights)

    since = time.time()
    # Infinity guarantees the first finite validation loss is always recorded
    # as the best one, whatever the scale of the loss.
    best_loss = float('inf')

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')

    weight = torch.tensor(class_weights) if class_weights else None
    criterion = nn.CrossEntropyLoss(weight=weight)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=step_size, gamma=gamma)  # learning rate decay

    if pretrained != '':
        model, optimizer = load_from_checkpoint(pretrained, model, optimizer)

    if classifier:
        trn_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "trn"),
            transform=transforms.Compose([
                transforms.RandomRotation((0, 275)),
                transforms.RandomHorizontalFlip(),
                transforms.Resize(299),
                transforms.ToTensor()
            ]),
            loader=loader)
        val_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "val"),
            transform=transforms.Compose(
                [transforms.Resize(299),
                 transforms.ToTensor()]),
            loader=loader)
    else:
        # Samples downloaded from a bucket live in the local 'samples' folder.
        if bucket_name:
            samples_folder = 'samples'
        else:
            samples_folder = os.path.join(data_path, "samples")
        trn_dataset = CreateDataset.SegmentationDataset(
            samples_folder,
            num_trn_samples,
            "trn",
            transform=transforms.Compose([
                aug.RandomRotationTarget(),
                aug.HorizontalFlip(),
                aug.ToTensorTarget()
            ]))
        val_dataset = CreateDataset.SegmentationDataset(
            samples_folder,
            num_val_samples,
            "val",
            transform=transforms.Compose([aug.ToTensorTarget()]))

    # Shuffle must be set to True.
    trn_dataloader = DataLoader(trn_dataset,
                                batch_size=batch_size,
                                num_workers=4,
                                shuffle=True)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                num_workers=4,
                                shuffle=True)

    # NOTE: %I is the 12-hour clock and the format ends with a space; both end
    # up in the uploaded log names and are kept so existing names do not change.
    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    for epoch in range(0, num_epochs):
        print()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        trn_report = train(trn_dataloader, model, criterion, optimizer,
                           lr_scheduler, num_classes, batch_size, classifier)
        trn_log.add_values(trn_report, epoch)

        val_report = validation(val_dataloader, model, criterion, num_classes,
                                batch_size, classifier, batch_metrics)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch, log_metrics=True)
        else:
            val_log.add_values(val_report, epoch)

        # Only keep a checkpoint when the validation loss improves.
        if val_loss < best_loss:
            print("save checkpoint")
            filename = os.path.join(output_path, 'checkpoint.pth.tar')
            best_loss = val_loss
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                if bucket_output_path:
                    bucket_filename = os.path.join(bucket_output_path,
                                                   'checkpoint.pth.tar')
                else:
                    bucket_filename = 'checkpoint.pth.tar'
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                batch_metrics)

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    # Always save the state reached after the final epoch, best or not.
    filename = os.path.join(output_path, 'last_epoch.pth.tar')
    save_checkpoint(
        {
            'epoch': epoch,
            'arch': model_name,
            'model': model.state_dict(),
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict()
        }, filename)

    if bucket_name:
        if bucket_output_path:
            bucket_filename = os.path.join(bucket_output_path,
                                           'last_epoch.pth.tar')
            bucket.upload_file(
                "output.txt",
                os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        else:
            bucket_filename = 'last_epoch.pth.tar'
            bucket.upload_file("output.txt", f"Logs/{now}_output.txt")
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
Exemplo n.º 5
0
def main(bucket, work_folder, img_list, weights_file_name, model,
         number_of_bands, overlay, classify, num_classes):
    """Identify the class to which each image belongs.
    Args:
        bucket: bucket in which data is stored if using AWS S3
        work_folder: full file path of the folder containing images
        img_list: list containing images to classify
        weights_file_name: full file path of the file containing weights
        model: loaded model with which inference should be done
        number_of_bands: number of bands in the input rasters
        overlay: amount of overlay to apply
        classify: True if doing a classification task, False if doing semantic segmentation
        num_classes: number of classes; the results table has 2 + num_classes columns
    """
    if torch.cuda.is_available():
        model = model.cuda()

    if bucket:
        bucket.download_file(weights_file_name, "saved_model.pth.tar")
        model = load_from_checkpoint("saved_model.pth.tar", model)
    else:
        model = load_from_checkpoint(weights_file_name, model)

    if classify:
        # classes.csv is expected next to the weights file.
        parent_parts = weights_file_name.split('/')[:-1]
        if bucket:
            bucket.download_file(
                os.path.join('', *parent_parts, 'classes.csv'), 'classes.csv')
            classes_path = 'classes.csv'
        else:
            classes_path = '/'.join(parent_parts + ['classes.csv'])
        with open(classes_path, 'rt') as file:
            classes = list(csv.reader(file))

    since = time.time()
    classified_results = np.empty((0, 2 + num_classes))

    for img in img_list:
        img_name = os.path.basename(img['tif'])
        # Output name is built from the part of the name before the first dot.
        output_name = f"{img_name.split('.')[0]}_inference.tif"
        if bucket:
            local_img = f"Images/{img_name}"
            bucket.download_file(img['tif'], local_img)
            inference_image = f"Classified_Images/{output_name}"
        else:
            local_img = img['tif']
            inference_image = os.path.join(work_folder, output_name)

        assert_band_number(local_img, number_of_bands)
        if classify:
            outputs, predicted = classifier(bucket, model, img['tif'])
            # Move the scores to the host once instead of on every access.
            scores = outputs.cpu().numpy()[0]
            # Rank by index so tied scores map to distinct classes, and cap
            # at the number of classes so fewer than 5 classes cannot raise.
            top_count = min(5, len(scores))
            top_loc = np.argsort(scores)[::-1][:top_count]
            print(f"Image {img_name} classified as {classes[0][predicted]}")
            print('Top 5 classes:')
            for loc in top_loc:
                print(f"\t{classes[0][loc]} : {scores[loc]}")
            classified_results = np.append(
                classified_results,
                [np.append([img['tif'], classes[0][predicted]], scores)],
                axis=0)
            print()
        else:
            sem_seg_results = sem_seg_inference(bucket, model, img['tif'],
                                                overlay)
            create_new_raster_from_base(local_img, inference_image,
                                        sem_seg_results)
            print(f"Semantic segmentation of image {img_name} completed")

        # Only segmentation produces a raster to send back to the bucket.
        if bucket and not classify:
            bucket.upload_file(inference_image,
                               os.path.join(work_folder, output_name))

    if classify:
        csv_results = 'classification_results.csv'
        if bucket:
            np.savetxt(csv_results,
                       classified_results,
                       fmt='%s',
                       delimiter=',')
            bucket.upload_file(csv_results,
                               os.path.join(work_folder, csv_results))
        else:
            np.savetxt(os.path.join(work_folder, csv_results),
                       classified_results,
                       fmt='%s',
                       delimiter=',')

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
Exemplo n.º 6
0
                        action='store',
                        dest='top_k',
                        default=5,
                        help='the number of probabilities to show')
    parser.add_argument('--category_names',
                        action='store',
                        dest='category_names',
                        default=None,
                        help='the label names')
    parser.add_argument('--gpu',
                        action='store',
                        dest='gpu',
                        default=True,
                        help='the model architecture')
    results = parser.parse_args()
    image_path = results.input
    checkpoint = results.checkpoint
    top_k = int(results.top_k)
    category_names_json = results.category_names
    gpu = results.gpu

    category_names = None
    if category_names_json == None:
        print("No label to index mapping provided")
    else:
        with open(str(category_names_json), 'r') as f:
            category_names = json.load(f)

    model = utils.load_from_checkpoint(checkpoint)
    pil_image = Image.open(image_path, 'r')
    predict(pil_image, model, gpu, category_names, topk=top_k)
Exemplo n.º 7
0
def run_dcgan(device,
              image_size,
              noise_size,
              batch_size,
              config,
              run_dir,
              saved_dir,
              run_name,
              num_epochs,
              val_dataset,
              train_dataset=None,
              checkpoints=None,
              mode='train',
              gpu_num=1):
    """Train or evaluate a DCGAN and pickle its FID and inception scores.

    Args:
        device: device the generator and discriminator are moved to.
        image_size: image side length passed to both networks.
        noise_size: size of the generator's noise input.
        batch_size: forwarded to ``train_gan`` / ``calc_inception_FID_score``.
        config: forwarded to ``train_gan``.
        run_dir: forwarded to ``train_gan``.
        saved_dir: directory checkpoints are loaded from and the score
            pickles are written to; also forwarded to ``train_gan``.
        run_name: run label; it is prefixed with ``'DCGAN_'`` and becomes
            part of the pickle file names.
        num_epochs: forwarded to ``train_gan``.
        val_dataset: validation dataset used for scoring; in test mode its
            length is the evaluation size of the inception score.
        train_dataset: training dataset, only used when ``mode == 'train'``.
        checkpoints: optional mapping with ``"generator"`` and
            ``"discriminator"`` entries passed to
            ``utils.load_from_checkpoint``.
        mode: ``'train'`` to train the networks, ``'test'`` to only compute
            the scores of the (possibly restored) generator.
        gpu_num: when greater than 1 on a CUDA device, both networks are
            wrapped in ``nn.DataParallel`` over device ids ``0..gpu_num-1``.

    Returns:
        Tuple ``(inception_FID_scores, inception_scores)``: the pair returned
        by ``train_gan`` in train mode, or two one-element lists in test mode.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail fast: an unknown mode would otherwise build both networks and then
    # crash on the unbound score variables when the results are saved.
    if mode not in ('train', 'test'):
        raise ValueError(
            "mode should be either 'train' or 'test'. "
            "The provided value is {!r}".format(mode))

    #Run DCGAN
    gan_type = 'DCGAN'
    dcgan_generator = Generator(noise_size=noise_size,
                                image_size=image_size).to(device)
    dcgan_discriminator = Discriminator(image_size=image_size).to(device)
    #Parallel for improved performance
    if device.type == 'cuda' and gpu_num > 1:
        device_ids = list(range(gpu_num))
        dcgan_generator = nn.DataParallel(dcgan_generator, device_ids)
        dcgan_discriminator = nn.DataParallel(dcgan_discriminator, device_ids)

    #Print networks
    print('Discriminator')
    summary(dcgan_discriminator, (3, image_size, image_size))
    print('Generator')
    summary(dcgan_generator, (noise_size, 1, 1))

    if checkpoints is not None:
        utils.load_from_checkpoint(dcgan_generator, saved_dir,
                                   checkpoints["generator"])
        utils.load_from_checkpoint(dcgan_discriminator, saved_dir,
                                   checkpoints["discriminator"])

    run_name = 'DCGAN' + '_' + run_name
    #We train the model in train phase and only calculate scores in test mode
    if mode == 'train':
        inception_FID_scores, inception_scores = train_gan(num_epochs,
                                                           batch_size,
                                                           noise_size,
                                                           device,
                                                           train_dataset,
                                                           val_dataset,
                                                           dcgan_generator,
                                                           dcgan_discriminator,
                                                           type=gan_type,
                                                           config=config,
                                                           run_dir=run_dir,
                                                           saved_dir=saved_dir,
                                                           run_name=run_name)
    else:  # mode == 'test'
        inception_FID_scores = [
            calc_inception_FID_score(batch_size, device, val_dataset,
                                     dcgan_generator, gan_type, noise_size)
        ]
        inception_scores = [
            calc_inception_score(device,
                                 noise_size,
                                 dcgan_generator,
                                 eval_size=len(val_dataset))
        ]
    #Return list of all score accumulated in epochs

    # The timestamp only carries month/day/year/hour, so two runs with the
    # same name finishing within the same hour write to the same pickles.
    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    save_to_pickle(
        inception_FID_scores,
        os.path.join(saved_dir,
                     'dcgan_fid_' + run_name + date_str + ".pickle"))
    save_to_pickle(
        inception_scores,
        os.path.join(saved_dir, 'dcgan_IS_' + run_name + date_str + ".pickle"))

    return inception_FID_scores, inception_scores
Exemplo n.º 8
0
def run_sagan(device,
              image_size,
              noise_size,
              batch_size,
              config,
              run_dir,
              saved_dir,
              run_name,
              num_epochs,
              val_dataset,
              train_dataset=None,
              checkpoints=None,
              mode='train',
              gpu_num=1):
    """Train or evaluate a SAGAN and pickle its FID and inception scores.

    Args:
        device: device the generator and discriminator are moved to.
        image_size: image side length passed to both networks.
        noise_size: size of the generator's noise input.
        batch_size: forwarded to ``train_gan`` / ``calc_inception_FID_score``.
        config: forwarded to ``train_gan``.
        run_dir: forwarded to ``train_gan``.
        saved_dir: directory checkpoints are loaded from and the score
            pickles are written to; also forwarded to ``train_gan``.
        run_name: run label; it is prefixed with ``'SAGAN_'`` and becomes
            part of the pickle file names.
        num_epochs: forwarded to ``train_gan``.
        val_dataset: validation dataset used for scoring; in test mode its
            length is the evaluation size of the inception score.
        train_dataset: training dataset, only used when ``mode == 'train'``.
        checkpoints: optional mapping with ``"generator"`` and
            ``"discriminator"`` entries passed to
            ``utils.load_from_checkpoint``.
        mode: ``'train'`` to train the networks, ``'test'`` to only compute
            the scores of the (possibly restored) generator.
        gpu_num: when greater than 1 on a CUDA device, both networks are
            wrapped in ``nn.DataParallel`` over device ids ``0..gpu_num-1``.

    Returns:
        Tuple ``(inception_FID_scores, inception_scores)``: the pair returned
        by ``train_gan`` in train mode, or two one-element lists in test mode.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail fast: an unknown mode would otherwise build both networks and then
    # crash on the unbound score variables when the results are saved.
    if mode not in ('train', 'test'):
        raise ValueError(
            "mode should be either 'train' or 'test'. "
            "The provided value is {!r}".format(mode))

    gan_type = 'SAGAN'
    sagan_generator = SAGenerator(noise_size=noise_size,
                                  image_size=image_size).to(device)
    sagan_discriminator = SADiscriminator(image_size=image_size).to(device)
    # Parallel for improved performance
    if device.type == 'cuda' and gpu_num > 1:
        device_ids = list(range(gpu_num))
        sagan_generator = nn.DataParallel(sagan_generator, device_ids)
        sagan_discriminator = nn.DataParallel(sagan_discriminator, device_ids)
    # Print networks
    print('Discriminator')
    summary(sagan_discriminator, (3, image_size, image_size))
    print('Generator')
    summary(sagan_generator, (noise_size, 1, 1))
    if checkpoints is not None:
        utils.load_from_checkpoint(sagan_generator, saved_dir,
                                   checkpoints["generator"])
        utils.load_from_checkpoint(sagan_discriminator, saved_dir,
                                   checkpoints["discriminator"])

    run_name = 'SAGAN' + '_' + run_name
    if mode == 'train':
        inception_FID_scores, inception_scores = train_gan(num_epochs,
                                                           batch_size,
                                                           noise_size,
                                                           device,
                                                           train_dataset,
                                                           val_dataset,
                                                           sagan_generator,
                                                           sagan_discriminator,
                                                           type=gan_type,
                                                           config=config,
                                                           run_dir=run_dir,
                                                           saved_dir=saved_dir,
                                                           run_name=run_name)
    else:  # mode == 'test'
        inception_FID_scores = [
            calc_inception_FID_score(batch_size, device, val_dataset,
                                     sagan_generator, gan_type, noise_size)
        ]
        inception_scores = [
            calc_inception_score(device,
                                 noise_size,
                                 sagan_generator,
                                 eval_size=len(val_dataset))
        ]

    # The timestamp only carries month/day/year/hour, so two runs with the
    # same name finishing within the same hour write to the same pickles.
    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    utils.save_to_pickle(
        inception_FID_scores,
        os.path.join(saved_dir,
                     'sagan_fid_' + run_name + date_str + ".pickle"))
    utils.save_to_pickle(
        inception_scores,
        os.path.join(saved_dir, 'sagan_IS_' + run_name + date_str + ".pickle"))

    return inception_FID_scores, inception_scores
Exemplo n.º 9
0
def main(options=None):
    """Set up data, model, optimizer and trainer, then run the selected action.

    Args:
        options: optional overrides merged into the parsed arguments through
            ``utils.load_options``; ignored when ``None``.

    Depending on the parsed arguments, this runs an experiment, an
    evaluation epoch, active-learning sample generation, or training.
    """
    args = get_args()
    if options is not None:
        args = utils.load_options(args, options)

    # Do NOT modify the seed. The captions have been generated from images generated from this seed.
    # (Unrelated to ``args.seed`` below, which selects a checkpoint to load.)
    seed = 1
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # -------------------------------- INSTANTIATE MAIN ACTORS ----------------------------- #

    # --------------- Create dataset ---------------- #
    print('Creating dataset', flush=True)
    transform = transforms.Compose([
        transforms.Resize(128),  # Smaller edge will be matched to this number
        transforms.CenterCrop((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    dataset_root = args.folder_dataset + args.name_dataset

    def build_loader(split, shuffle, **dataset_kwargs):
        # One dataset and its loader per split; only shuffling and the extra
        # dataset keyword arguments vary between splits.
        split_dataset = dataset.ImageAudioDataset(
            dataset_root,
            split=split,
            transform=transform,
            loading_image=args.loading_image,
            **dataset_kwargs)
        return torch.utils.data.DataLoader(split_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=shuffle,
                                           num_workers=args.workers,
                                           pin_memory=True)

    train_loader = build_loader('train', True, random_sampling=True)
    val_loader = build_loader('val', False)
    test_loader = build_loader('test', False)

    # -------------- Create model --------------- #
    print('Creating model', flush=True)
    model_class = getattr(__import__('models'), args.model)
    model = torch.nn.DataParallel(model_class(args)).cuda()
    # Print model information
    utils.print_model_report(model)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Load model
    resume_epoch = 0
    if args.seed:
        if args.seed != 'EXPDIR':
            # Any other value is used directly as the checkpoint path.
            path_load = args.seed
        else:
            if args.name_checkpoint != '':
                name = args.name_checkpoint
            else:
                name = args.model + '_' + args.name_dataset
            path_load = args.expdir + 'model_best_' + name + '.pth.tar'
        utils.load_from_checkpoint(model,
                                   path_load,
                                   submodels_load=args.submodels_load,
                                   optimizer=None)
        if args.resume:
            # Resuming also picks up the epoch stored in the checkpoint.
            resume_epoch = torch.load(path_load)['epoch']

    # --------------- Instantiate trainer --------------- #
    print('Instantiating trainer', flush=True)
    all_loaders = {
        'val': val_loader,
        'train': train_loader,
        'test': test_loader
    }
    trainer = Trainer(model,
                      optimizer,
                      all_loaders,
                      args,
                      resume_epoch=resume_epoch)

    # ------------------------- Others ----------------------- #
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    run_label = args.name_checkpoint + '_' + current_time + '_' + socket.gethostname()
    args.writer = SummaryWriter(
        log_dir=os.path.join(args.results, 'runs', run_label))

    # ----------------------------------- TRAIN ------------------------------------------ #
    if args.experiment:
        print("Running experiment", flush=True)
        experiments.experiment(args.experiment_name, trainer)
    elif args.evaluate:
        print("Performing evaluation epoch", flush=True)
        trainer.eval()
    elif args.generate_active_learning:
        print("Generating active learning samples", flush=True)
        active_learning.generate_active_learning(trainer)
    else:
        print("Beginning training", flush=True)
        trainer.train()
Exemplo n.º 10
0
def main(params):
    """
    Train and validate a model for semantic segmentation or classification, then
    evaluate the best checkpoint on the test dataset.

    For every epoch the model is trained, evaluated on the validation set and, when
    the validation loss improves, saved to ``<output_path>/checkpoint.pth.tar``.
    When a bucket name is configured, the checkpoint and logs are also uploaded to
    that bucket.

    :param params: (dict) Parameters found in the yaml config file.

    """
    model, state_dict_path, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    output_path = params['training']['output_path']
    data_path = params['global']['data_path']
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']
    # Loop-invariant config values, read once instead of on every epoch.
    num_epochs = params['training']['num_epochs']
    batch_metrics = params['training']['batch_metrics']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(
            bucket_name=bucket_name,
            data_path=data_path,
            output_path=output_path,
            num_classes=num_classes,
            task=task)

    elif task == 'classification':
        # Reaching this branch already implies that no bucket is configured.
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    # Start from +inf so the first finite validation loss always produces a
    # checkpoint; a fixed sentinel such as 999 would skip saving for larger losses
    # and leave nothing to reload after the training loop.
    best_loss = float('inf')

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header. The context manager closes the handle right away instead
        # of leaking it until garbage collection.
        with progress_log.open('w', buffering=1) as header_file:
            header_file.write(
                tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    model, criterion, optimizer, lr_scheduler, num_devices = set_hyperparameters(
        params, model, state_dict_path)

    num_samples = get_num_samples(data_path=data_path, params=params)
    print(f"Number of samples : {num_samples}")
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(
        data_path=data_path,
        num_samples=num_samples,
        batch_size=batch_size,
        task=task)

    # NOTE(review): %I is the 12-hour clock (no AM/PM marker) and the format ends
    # with a space; kept as-is so the names of uploaded logs do not change.
    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    for epoch in range(0, num_epochs):
        print()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           num_devices=num_devices)
        trn_log.add_values(trn_report,
                           epoch,
                           ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(
            eval_loader=val_dataloader,
            model=model,
            criterion=criterion,
            num_classes=num_classes,
            batch_size=batch_size,
            task=task,
            ep_idx=epoch,
            progress_log=progress_log,
            batch_metrics=batch_metrics,
            dataset='val',
            num_devices=num_devices)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch)
        else:
            # Without batch metrics these entries are excluded from the log.
            val_log.add_values(val_report,
                               epoch,
                               ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            torch.save(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path,
                                               'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                batch_metrics)

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    # load checkpoint model and evaluate it on test dataset.
    model = load_from_checkpoint(filename, model)
    tst_report = evaluation(eval_loader=tst_dataloader,
                            model=model,
                            criterion=criterion,
                            num_classes=num_classes,
                            batch_size=batch_size,
                            task=task,
                            ep_idx=num_epochs,
                            progress_log=progress_log,
                            batch_metrics=batch_metrics,
                            dataset='tst',
                            num_devices=num_devices)
    tst_log.add_values(tst_report, num_epochs)

    if bucket_name:
        # NOTE(review): the file uploaded as 'last_epoch.pth.tar' is the same
        # best-validation checkpoint file saved above - confirm this is intended.
        bucket_filename = os.path.join(bucket_output_path,
                                       'last_epoch.pth.tar')
        bucket.upload_file(
            "output.txt",
            os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
Exemplo n.º 11
0
def set_hyperparameters(params, model, state_dict_path):
    """
    Build the loss criterion, optimizer and learning rate scheduler from the yaml config.

    Each hyperparameter starts from the default declared in the signature of the
    matching torch class (nn.CrossEntropyLoss, optim.Adam, optim.lr_scheduler.StepLR)
    and is replaced only when the config provides a value that is not None.
    The model and criterion are moved to GPU when Cuda devices are selected.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param state_dict_path: (str) Full file path to the state dict; '' skips loading
    :return: model, criterion, optimizer, lr_scheduler, num_devices
    """
    ce_defaults = inspect.signature(nn.CrossEntropyLoss).parameters
    adam_defaults = inspect.signature(optim.Adam).parameters
    steplr_defaults = inspect.signature(optim.lr_scheduler.StepLR).parameters

    training_cfg = params['training']

    # When the signature default for step_size is not an int, fall back to a
    # step larger than the number of epochs.
    fallback_step = steplr_defaults['step_size'].default
    if not isinstance(fallback_step, int):
        fallback_step = training_cfg['num_epochs'] + 1

    class_weights = ce_defaults['weight'].default
    # Keyed by the name used in the 'training' section of the config.
    settings = {
        'ignore_index': ce_defaults['ignore_index'].default,
        'learning_rate': adam_defaults['lr'].default,
        'weight_decay': adam_defaults['weight_decay'].default,
        'step_size': fallback_step,
        'gamma': steplr_defaults['gamma'].default,
    }

    if training_cfg['class_weights'] is not None:
        class_weights = torch.tensor(training_cfg['class_weights'])
        verify_weights(params['global']['num_classes'], class_weights)

    # Config values override the defaults only when they are not None.
    for key in ('ignore_index', 'learning_rate', 'weight_decay', 'step_size',
                'gamma'):
        configured = training_cfg[key]
        if configured is not None:
            settings[key] = configured

    requested_gpus = params['global']['num_gpus']
    num_devices = requested_gpus if requested_gpus is not None else 0

    lst_device_ids = get_device_ids(
        num_devices) if torch.cuda.is_available() else []
    loss_kwargs = {
        'weight': class_weights,
        'ignore_index': settings['ignore_index']
    }

    if not lst_device_ids:
        warnings.warn(
            f"No Cuda device available. This process will only run on CPU")
        num_devices = 0
        criterion = nn.CrossEntropyLoss(**loss_kwargs)
    else:
        device_count = len(lst_device_ids)
        if device_count == 1:
            print(f"Using Cuda device {lst_device_ids[0]}")
            torch.cuda.set_device(lst_device_ids[0])

        model = model.cuda()
        criterion = nn.CrossEntropyLoss(**loss_kwargs).cuda()
        num_devices = device_count
        if device_count > 1:
            print(
                f"Using data parallel on devices {str(lst_device_ids)[1:-1]}")
            model = nn.DataParallel(model, device_ids=lst_device_ids)

    optimizer = optim.Adam(params=model.parameters(),
                           lr=settings['learning_rate'],
                           weight_decay=settings['weight_decay'])
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                             step_size=settings['step_size'],
                                             gamma=settings['gamma'])

    if state_dict_path != '':
        model, optimizer = load_from_checkpoint(state_dict_path,
                                                model,
                                                optimizer=True)

    return model, criterion, optimizer, lr_scheduler, num_devices