예제 #1
0
def main():
    """Run one training session configured from the command line.

    The function parses the command line options, creates a time-stamped
    directory below ``runs/`` through the TensorBoard ``SummaryWriter``, stores
    the parsed options in a ``stdout`` text file inside that directory, builds
    the dataset, the model, an SGD optimizer and a multi-step learning-rate
    scheduler, optionally restores a checkpoint and finally alternates
    training, validation and testing until ``args.epochs`` epochs are done.
    A checkpoint is written after every epoch; it is flagged as "best" when
    the validation loss improved.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.debug:
        pdb.set_trace()

    # Use CUDA when it is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Time-stamped output directory, also used as the TensorBoard log dir
    save_path = 'runs/' + strftime(
        "%Y-%m-%d_%H-%M-%S",
        gmtime()) + '_' + args.dataset + '_' + args.architecture

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # Save the parsed args to file. The context manager guarantees the handle
    # is closed even if a later setup step (e.g. dataset loading) raises.
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading: the dataset class is looked up by name
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # NOTE(review): num_classes is read here but the model below is built from
    # args.num_class - confirm which of the two is the intended source.
    num_classes = dataset.num_classes

    # Isolated training: the total amount of epochs is exactly args.epochs
    epoch_multiplier = 1

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    # build the model
    model = architectures.Inos_model(args.num_class, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    if not args.pretrained:
        # Initialize the weights of the model with the configured scheme
        print("Initializing network with: " + args.weight_init)
        WeightInitializer = WeightInit(args.weight_init)
        WeightInitializer.init_model(model)

    # Define optimizer and learning-rate schedule
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=0.9,
                                weight_decay=2e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[30, 60, 80, 100], gamma=0.5)

    epoch = 0
    best_prec = 0
    # +infinity is the neutral start value for "lowest loss seen so far"
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint written on a GPU machine be
            # restored on a CPU-only machine (and vice versa)
            checkpoint = torch.load(args.resume, map_location=device)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            # The optimizer state is deliberately not restored here.
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # NOTE(review): `criterion` is not defined inside this function; it is
    # presumably a module-level name - confirm against the file's imports.
    # Optimize until the final amount of epochs is reached.
    while epoch < (args.epochs * epoch_multiplier):
        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device,
              args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # evaluate on test set (results are only logged by `test` itself)
        prec_t, loss_t = test(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # remember the best validation results and save a checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.architecture,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, is_best, save_path)

        # increment epoch counters
        epoch += 1
        scheduler.step()

    writer.close()
예제 #2
0
def main():
    """Train a model, optionally in a continual-learning setting.

    The function parses the command line options, creates a time-stamped
    output directory below ``runs/`` (its name encodes the chosen options),
    stores the options in a ``stdout`` text file, builds the dataset and the
    model, optionally restores a checkpoint and then trains and validates for
    ``args.epochs * epoch_multiplier`` epochs.

    With ``args.incremental_data`` set, a task order is determined (from a
    file, from the command line or randomly), the dataset is wrapped into an
    incremental dataset, and every ``args.epochs`` epochs the dataset and the
    classifier are grown and the optimizer is re-created.

    Raises:
        ValueError: if cross-dataset training is requested without the
            incremental-data flag, or if the number of tasks cannot be split
            evenly into increments.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.cross_dataset and not args.incremental_data:
        raise ValueError(
            'cross-dataset training possible only if incremental-data flag set'
        )

    # Use CUDA when it is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Time-stamped output directory, also used as the TensorBoard log dir
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\
                '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    # add option specific naming to separate tensorboard log files later
    if args.autoregression:
        save_path += '_pixelcnn'

    if args.incremental_data:
        save_path += '_incremental'
        if args.train_incremental_upper_bound:
            save_path += '_upper_bound'
        if args.generative_replay:
            save_path += '_genreplay'
        if args.openset_generative_replay:
            save_path += '_opensetreplay'
    if args.cross_dataset:
        save_path += '_cross_dataset_' + args.dataset_order

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # Save the parsed args to file. The handle is closed right away so it
    # cannot leak if one of the many setup steps below raises.
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading: the dataset class is looked up by name
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the dataset
    num_classes = dataset.num_classes

    # The epoch multiplier is 1 for isolated training and grows with the
    # number of task increments in continual learning.
    epoch_multiplier = 1
    if args.incremental_data:
        from lib.Datasets.incremental_dataset import get_incremental_dataset

        # get the method to create the incremental dataset (built on top of
        # the chosen data loader)
        inc_dataset_init_method = get_incremental_dataset(
            data_init_method, args)

        # different options for class incremental vs. cross-dataset experiments
        if args.cross_dataset:
            order_file = args.load_task_order
            # A task order file is only used if it exists and ends in '.txt';
            # in every other case the command line order is used.
            if order_file and os.path.isfile(order_file) \
                    and order_file.endswith('.txt'):
                print("=> loading task order from '{}'".format(order_file))
                # NOTE(review): pickle.load executes arbitrary code from the
                # file - only load task order files from trusted sources.
                with open(order_file, 'rb') as fp:
                    task_order = pickle.load(fp)
            else:
                # split the string at commas and drop blanks in dataset names
                task_order = [
                    name.replace(" ", "")
                    for name in args.dataset_order.split(',')
                ]

            # sum up the number of classes of the datasets in the base tasks
            num_classes = 0
            for i in range(args.num_base_tasks):
                temp_dataset_init_method = getattr(datasets, task_order[i])
                temp_dataset = temp_dataset_init_method(
                    torch.cuda.is_available(), args)
                num_classes += temp_dataset.num_classes
                del temp_dataset

            # multiply epochs by number of tasks
            if args.num_increment_tasks:
                epoch_multiplier = ((len(task_order) - args.num_base_tasks) /
                                    args.num_increment_tasks) + 1
            else:
                # Active if num_increment_tasks is zero. Useful when training
                # an isolated upper bound with all datasets present from the
                # start.
                epoch_multiplier = 1.0
        else:
            # class incremental
            # if specified load task order from file
            if args.load_task_order:
                if os.path.isfile(args.load_task_order):
                    print("=> loading task order from '{}'".format(
                        args.load_task_order))
                    task_order = np.load(args.load_task_order).tolist()
                else:
                    # if no file is found a random task order is created
                    print(
                        "=> no task order found. Creating randomized task order"
                    )
                    task_order = np.random.permutation(num_classes).tolist()
            elif args.randomize_task_order:
                task_order = np.random.permutation(num_classes).tolist()
            else:
                # default: sequential task order
                task_order = list(range(dataset.num_classes))

            # save the task order
            np.save(os.path.join(save_path, 'task_order.npy'), task_order)
            # Set the number of classes to base tasks + 1 because base tasks is
            # always one less (e.g. 2 classes are one task). The naming is a
            # little inconsistent, but a single variable serves both class
            # incremental and cross-dataset experiments.
            num_classes = args.num_base_tasks + 1
            # multiply epochs by number of tasks
            epoch_multiplier = (
                (len(task_order) -
                 (args.num_base_tasks + 1)) / args.num_increment_tasks) + 1

        print("Task order: ", task_order)
        # log the task order into the text file
        with open(log_file, "a") as log:
            log.write('task_order:' + str(task_order) + '\n')
        args.task_order = task_order

        # This has to happen before the incremental dataset is created because
        # that step pops items from task_order.
        args_to_tensorboard(writer, args)

        # Explicit exception instead of `assert`: it carries a real message
        # and is not stripped when Python runs with -O.
        if not epoch_multiplier.is_integer():
            raise ValueError(
                "uneven task division, make sure number of tasks are integers.")

        # Get the incremental dataset
        dataset = inc_dataset_init_method(torch.cuda.is_available(), device,
                                          task_order, args)
    else:
        # add command line options to TensorBoard
        args_to_tensorboard(writer, args)

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # Without autoregression the number of output channels equals the number
    # of input channels. For autoregressive models the output channels of the
    # non-autoregressive decoder come from the command line option.
    if not args.autoregression:
        args.out_channels = num_colors

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # optionally add the autoregressive decoder
    if args.autoregression:
        model.pixelcnn = PixelCNN(device,
                                  num_colors,
                                  args.out_channels,
                                  args.pixel_cnn_channels,
                                  num_layers=args.pixel_cnn_layers,
                                  k=args.pixel_cnn_kernel_size,
                                  padding=args.pixel_cnn_kernel_size // 2)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model with the configured scheme
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

    epoch = 0
    best_prec = 0
    # +infinity is the neutral start value for "lowest loss seen so far"
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint written on a GPU machine be
            # restored on a CPU-only machine (and vice versa)
            checkpoint = torch.load(args.resume, map_location=device)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # NOTE(review): `criterion` is not defined inside this function; it is
    # presumably a module-level name - confirm against the file's imports.
    # Optimize until the final amount of epochs is reached.
    while epoch < (args.epochs * epoch_multiplier):
        # True on the first epoch of every task except the very first one
        at_task_start = epoch > 0 and epoch % args.epochs == 0
        # True on the very last epoch of the whole training
        at_training_end = (epoch + 1) % (args.epochs * epoch_multiplier) == 0

        # visualize the latent space before each task increment and at the end
        # of training if it is 2-D
        if (at_task_start or at_training_end) \
                and model.module.latent_dim == 2:
            print("Calculating and visualizing dataset embedding")
            # infer the number of current tasks to plot the different classes
            # in the embedding
            if args.incremental_data:
                if args.cross_dataset:
                    num_tasks = sum(
                        dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                else:
                    num_tasks = len(dataset.seen_tasks)
            else:
                num_tasks = num_classes

            zs = get_latent_embedding(model, dataset.train_loader, num_tasks,
                                      device)
            visualize_dataset_in_2d_embedding(writer,
                                              zs,
                                              args.dataset,
                                              save_path,
                                              task=num_tasks)

        # continual learning specific part
        if args.incremental_data:
            # at the end of each task increment
            if at_task_start:
                print('Saving the last checkpoint from the previous task ...')
                save_task_checkpoint(save_path, epoch // args.epochs)

                print("Incrementing dataset ...")
                dataset.increment_tasks(
                    model,
                    args.batch_size,
                    args.workers,
                    writer,
                    save_path,
                    is_gpu=torch.cuda.is_available(),
                    upper_bound_baseline=args.train_incremental_upper_bound,
                    generative_replay=args.generative_replay,
                    openset_generative_replay=args.openset_generative_replay,
                    openset_threshold=args.openset_generative_replay_threshold,
                    openset_tailsize=args.openset_weibull_tailsize,
                    autoregression=args.autoregression)

                # grow the classifier and update the overall number of classes
                if args.cross_dataset:
                    total_classes = sum(
                        dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                    grow_classifier(model.module.classifier,
                                    total_classes - model.module.num_classes,
                                    WeightInitializer)
                    model.module.num_classes = total_classes
                else:
                    model.module.num_classes += args.num_increment_tasks
                    grow_classifier(model.module.classifier,
                                    args.num_increment_tasks,
                                    WeightInitializer)

                # reset moving averages etc. of the optimizer
                optimizer = torch.optim.Adam(model.parameters(),
                                             args.learning_rate)

            # change the number of seen classes (also on the very first epoch)
            if epoch % args.epochs == 0:
                model.module.seen_tasks = dataset.seen_tasks

        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device,
              args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # remember the best validation results and save a checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.architecture,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, is_best, save_path)

        # increment epoch counters
        epoch += 1

        # if a new task begins reset the best values so that a new best model
        # can be stored for that task
        if args.incremental_data and epoch % args.epochs == 0:
            best_prec = 0
            best_loss = float('inf')

    writer.close()
예제 #3
0
    def __train_val_net(self, state_list, state_space_parameters, dataset):
        """Build, train and validate one generator/discriminator pair.

        A generator is built from ``state_list`` and a discriminator from
        ``state_space_parameters``; both are trained for ``self.args.epochs``
        epochs. After every epoch the reciprocal of the summed validation
        losses is computed and the largest value seen is kept as the reward.

        Args:
            state_list: architecture description handed to ``net``.
            state_space_parameters: search-space settings handed to ``net``
                and ``discriminator``.
            dataset: object providing ``train_loader`` and ``val_loader``.

        Returns:
            ``(reward, train_flag)`` after training, or ``[None, None]`` when
            the estimated memory demand exceeds the available GPU memory or
            the generator has no transposed-conv, WRN or fully-connected
            layers (judged by its ``convT_no``/``wrnT_bb_no``/``fc_no``
            counters).
        """
        # TODO: for average as reward
        # reward = AverageMeter()
        # TODO: for best reward
        reward = 0.
        # a sample batch is handed to the model constructors
        net_input, _ = next(iter(dataset.val_loader))

        gen_model = net(state_list, state_space_parameters, net_input,
                        self.args.batch_norm, self.args.drop_out_drop)
        disc_model = discriminator(state_space_parameters, net_input,
                                   self.args.discriminator_classes)
        print(gen_model)
        print('-' * 80)
        print('Input size: {}'.format(gen_model.input_size))
        print('-' * 80)
        print('Estimated total gpu usage of model: {gpu_usage:.4f} GB'.format(
            gpu_usage=gen_model.gpu_usage))
        model_activations_gpu = gen_model.gpu_usage
        cudnn.benchmark = True
        self.WeightInitializer.init_model(gen_model)
        self.WeightInitializer.init_model(disc_model)
        gen_model = gen_model.to(self.device)
        disc_model = disc_model.to(self.device)

        # Query the GPU utilisation once so that the printed value and the
        # value used for the decision below cannot disagree.
        total_mem = self.gpu_mem_0.total_mem
        available_mem = (total_mem -
                         total_mem * self.gpu_mem_0.get_mem_util()) / 1024.
        # factor 3 per GPU plus a buffer of 1, as in the printed message
        required_mem = (
            3. / float(self.args.no_gpus) * model_activations_gpu) + 1
        print('available:{}'.format(available_mem))
        print('required per gpu with buffer: {}'.format(required_mem))
        print('-' * 80)

        # skip architectures that do not fit into memory
        if available_mem < required_mem:
            del gen_model, disc_model
            return [None] * 2
        # skip generators without any of the counted layer types
        if not (gen_model.convT_no > 0 or gen_model.wrnT_bb_no > 0
                or gen_model.fc_no > 0):
            del gen_model, disc_model
            return [None] * 2

        if int(self.args.no_gpus) > 1:
            gen_model = torch.nn.DataParallel(gen_model)
            disc_model = torch.nn.DataParallel(disc_model)

        # Mean reduction is the BCELoss default; the deprecated
        # `size_average=True` argument requested exactly that.
        criterion = nn.BCELoss().to(self.device)
        gen_optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         gen_model.parameters()),
                                  lr=self.args.learning_rate,
                                  momentum=self.args.momentum,
                                  weight_decay=self.args.weight_decay)
        disc_optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                          disc_model.parameters()),
                                   lr=self.args.learning_rate,
                                   momentum=self.args.momentum,
                                   weight_decay=self.args.weight_decay)

        lr_scheduler = LearningRateScheduler(self.args.lr_wr_epochs,
                                             len(dataset.train_loader.dataset),
                                             self.args.batch_size,
                                             self.args.learning_rate,
                                             self.args.lr_wr_mul,
                                             self.args.lr_wr_min)

        # Per-architecture output directory handed to `validate`;
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        save_path_pictures = os.path.join(self.save_path, str(self.count + 1))
        os.makedirs(save_path_pictures, exist_ok=True)

        train_flag = True
        epoch = 0
        while epoch < self.args.epochs:
            disc_losses_train, gen_losses_train = train(
                dataset, gen_model, disc_model, criterion, epoch,
                gen_optimizer, disc_optimizer, lr_scheduler, self.device,
                self.args)
            disc_losses_valid, gen_losses_valid = validate(
                dataset, gen_model, disc_model, criterion, epoch, self.device,
                self.args, save_path_pictures)
            # lower summed validation loss -> higher reward; keep the best
            reward = max(reward, 1. / (disc_losses_valid + gen_losses_valid))
            # TODO: include early stopping criterion, plotting
            epoch += 1

        # drop the references to the per-architecture objects
        del gen_model, disc_model, criterion, disc_optimizer, gen_optimizer, lr_scheduler
        return reward, train_flag
def main():
    """Train every valid skew-normal model variant and log results to CSV.

    The function parses the command line options, creates a time-stamped
    directory below ``runs/``, loads the dataset and asks ``SkewNormalModels``
    for all valid filter configurations. Starting at ``args.resume_model_id``
    each configuration is built, trained and validated for up to
    ``args.epochs`` epochs. After each model a CSV file with the results of
    all models trained so far in this run is written to the save directory.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Use CUDA when it is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True
    num_GPUs = torch.cuda.device_count()

    # Create the directory for runs if needed; exist_ok avoids the
    # check-then-create race of exists() + mkdir().
    os.makedirs('runs', exist_ok=True)

    # Create a time-stamped save path for the individual experiment
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + \
                ';' + args.dataset + ';' + args.architecture
    os.mkdir(save_path)

    # List of values to log to csv
    columns_list = [
        'Filters', 'Parameters', 'Mean', 'Variance', 'Skew', 'BestVal',
        'BestValsTrain', 'BestEpoch', 'LastValPrec', 'LastTrainPrec',
        'AllTrain', 'AllVal'
    ]
    # One entry per trained model. The table is rebuilt from these rows
    # because DataFrame.append was removed in pandas 2.0.
    result_rows = []

    # Dataset loading: the dataset class is looked up by name
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the amount of color channels in the input images
    net_input, _ = next(iter(dataset.train_loader))
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # Get the parameters for all valid skewed models
    SNModels = SkewNormalModels(depth=args.vgg_depth,
                                num_classes=dataset.num_classes,
                                patch_size=args.patch_size)
    skew_model_params = SNModels.get_valid_models()
    print("Total number of models: ", len(skew_model_params["filters"]))

    # Weight-init method
    WeightInitializer = WeightInit(args.weight_init)

    # Optionally resume a previous experiment: start at the given model id
    current_id = args.resume_model_id
    for model_id in range(current_id, len(skew_model_params["filters"])):
        filters = skew_model_params["filters"][model_id]
        total_params = skew_model_params["total_params"][model_id]
        mean = skew_model_params["means"][model_id]
        var = skew_model_params["vars"][model_id]
        skew = skew_model_params["skews"][model_id]

        print("Model filters: ", filters)
        print("Model parameters: ", total_params, " mean: ", mean, " var: ",
              var, " skew: ", skew)

        model = net_init_method(device,
                                dataset.num_classes,
                                num_colors,
                                args,
                                filters,
                                custom_filters=True)

        # Parallel container for multi GPU use and cast to available device
        model = torch.nn.DataParallel(model).to(device)
        print(model)

        # Initialize the weights of the model
        print("Initializing networks with: " + args.weight_init)
        WeightInitializer.init_model(model)

        # Define criterion and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Initialize SGDWR learning rate scheduler
        lr_scheduler = LearningRateScheduler(args.lr_wr_epochs,
                                             len(dataset.train_loader.dataset),
                                             args.batch_size,
                                             args.learning_rate,
                                             args.lr_wr_mul, args.lr_wr_min)

        # Compare the estimated memory usage of the model with the available
        # GPU memory and split the batch if too little memory is available
        batch_seq_split_size = args.batch_size
        if torch.cuda.is_available():
            GPUMemory = GPUMem(torch.cuda.is_available())
            # TODO: WEIRD factor... why is this necessary and where does it
            # come from?
            mem_scale_factor = 4.0
            # Query the utilisation once so that the printed value and the
            # value used for the decision below cannot disagree.
            available_mem = (
                GPUMemory.total_mem -
                GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.
            # TODO: the + 1 Gb should be taken from the cache allocator
            required_mem = (mem_scale_factor / float(num_GPUs) *
                            model.module.gpu_usage) + 1.
            print('available:{}'.format(available_mem))
            print('required per gpu with buffer: {}'.format(required_mem))

            if available_mem < required_mem:
                # number of samples that fit, estimated from the memory that
                # one sample of the full batch accounts for
                approx_small_batch_size = (
                    (available_mem - 1.) * float(num_GPUs) /
                    mem_scale_factor) // (model.module.gpu_usage /
                                          float(args.batch_size))

                # Pick the proper divisor of the batch size closest to the
                # estimate (ties go to the smaller divisor). Fall back to 1
                # when the batch size is too small to have a candidate.
                divisors = [
                    j for j in range(1, (args.batch_size // 2) + 1)
                    if args.batch_size % j == 0
                ]
                batch_seq_split_size = min(
                    divisors,
                    key=lambda j: abs(j - approx_small_batch_size),
                    default=1)

        # Get training and validation dataset loaders
        dataset.train_loader, dataset.val_loader = dataset.get_dataset_loader(
            batch_seq_split_size, args.workers, device)

        print(
            'sequential batch size split size:{}'.format(batch_seq_split_size))

        epoch = 0
        best_epoch = 0
        best_prec = 0
        best_val_train_prec = 0
        # Defined up front so the result row below can be written even when
        # args.epochs is 0 and the loop body never runs.
        train_prec = 0
        prec = 0
        all_train = []
        all_val = []

        while epoch < args.epochs:
            # train for one epoch
            train_prec = train(dataset.train_loader, model, criterion, epoch,
                               optimizer, lr_scheduler, device,
                               batch_seq_split_size, args)
            # evaluate on validation set
            prec = validate(dataset.val_loader, model, criterion, epoch,
                            device, args)

            all_train.append(train_prec)
            all_val.append(prec)

            # remember the best validation precision and its epoch
            is_best = prec > best_prec
            if is_best:
                best_epoch = epoch
                best_val_train_prec = train_prec
                best_prec = prec

            # If the architecture doesn't train at all skip it: after the
            # first warm-restart cycle the training precision has to exceed
            # 2 * 100 / num_classes.
            if epoch == args.lr_wr_epochs - 1 and train_prec < (
                    2 * 100.0 / dataset.num_classes):
                break

            # increment epoch counters
            epoch += 1
            lr_scheduler.scheduler_epoch += 1

        # add the architecture results and write all rows collected so far
        result_rows.append([
            filters, total_params, mean, var, skew, best_prec,
            best_val_train_prec, best_epoch, prec, train_prec, all_train,
            all_val
        ])
        df = pd.DataFrame(result_rows, columns=columns_list)
        df.to_csv(save_path + '/model_%03d' % (model_id + 1) + '.csv')

        del model
        del optimizer
    def __train_val_net(self, state_list, state_space_parameters, dataset):
        """Build, train and validate one network and report its precision.

        The network is built from ``state_list`` and trained for up to
        ``self.args.epochs`` epochs. Training stops early when the validation
        precision after epoch ``self.args.lr_wr_epochs - 1`` is below 15.

        Args:
            state_list: architecture description handed to ``net``.
            state_space_parameters: search-space settings handed to ``net``.
            dataset: object providing ``train_loader`` and ``val_loader``; the
                validation dataset must expose ``class_to_idx``.

        Returns:
            ``(spp_filter_size, best_prec, train_flag)`` after training, where
            ``train_flag`` is ``False`` if training was stopped early. When
            the estimated memory demand exceeds the available GPU memory a
            list of twelve ``None`` values is returned instead.
        """
        # NOTE(review): the early-exit list has 12 entries while the regular
        # return has 3 - confirm what the caller expects before changing it.
        best_prec = 0.
        num_classes = len(dataset.val_loader.dataset.class_to_idx)
        # a sample batch is handed to the model constructor
        net_input, _ = next(iter(dataset.val_loader))

        model = net(state_list, state_space_parameters, num_classes, net_input,
                    self.args.batch_norm, self.args.drop_out_drop)

        print(model)
        print('-' * 80)
        print('SPP levels: {}'.format(model.spp_filter_size))
        print('-' * 80)
        print('Estimated total gpu usage of model: {gpu_usage:.4f} GB'.format(
            gpu_usage=model.gpu_usage))
        model_activations_gpu = model.gpu_usage
        cudnn.benchmark = True
        self.WeightInitializer.init_model(model)
        model = model.to(self.device)

        # Query the GPU utilisation once so that the printed value and the
        # value used for the decision below cannot disagree.
        total_mem = self.gpu_mem_0.total_mem
        available_mem = (total_mem -
                         total_mem * self.gpu_mem_0.get_mem_util()) / 1024.
        # factor 3 per GPU plus a buffer of 1, as in the printed message
        required_mem = (
            3. / float(self.args.no_gpus) * model_activations_gpu) + 1
        print('available:{}'.format(available_mem))
        print('required per gpu with buffer: {}'.format(required_mem))
        print('-' * 80)

        # skip architectures that do not fit into memory
        if available_mem < required_mem:
            del model
            return [None] * 12

        # Convert once so both multi-GPU checks agree, even if the option
        # arrives as a string.
        multi_gpu = int(self.args.no_gpus) > 1
        if multi_gpu:
            model = torch.nn.DataParallel(model)

        # Mean reduction is the BCELoss default; the deprecated
        # `size_average=True` argument requested exactly that.
        criterion = nn.BCELoss().to(self.device)
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=self.args.learning_rate,
                              momentum=self.args.momentum,
                              weight_decay=self.args.weight_decay)
        lr_scheduler = LearningRateScheduler(self.args.lr_wr_epochs,
                                             len(dataset.train_loader.dataset),
                                             self.args.batch_size,
                                             self.args.learning_rate,
                                             self.args.lr_wr_mul,
                                             self.args.lr_wr_min)

        train_flag = True
        epoch = 0
        while epoch < self.args.epochs:
            train(dataset, model, criterion, epoch, optimizer, lr_scheduler,
                  self.device, self.args)
            prec = validate(dataset, model, criterion, epoch, self.device,
                            self.args)
            best_prec = max(prec, best_prec)
            # TODO: hard-coded early stopping criterion of last prec < 15%
            # NOTE(review): the divisor 10 looks like a fixed class count
            # although num_classes is available above - confirm the intent.
            if epoch == (self.args.lr_wr_epochs -
                         1) and float(prec) < (1.5 * 100. / 10):
                train_flag = False
                break
            epoch += 1

        # DataParallel keeps the wrapped network in `.module`
        if multi_gpu:
            spp_filter_size = model.module.spp_filter_size
        else:
            spp_filter_size = model.spp_filter_size
        del model, criterion, optimizer, lr_scheduler
        return spp_filter_size, best_prec, train_flag
예제 #6
0
def main():
    """
    Entry point: trains and validates a model according to the command line options.

    Parses the arguments, picks the train / validate / loss functions that match the requested model type
    (variational and/or joint), sets up a TensorBoard writer plus an argument log under a time-stamped run path,
    builds and initializes the model, optionally resumes from a checkpoint and then alternates training and
    validation until ``args.epochs`` is reached, saving a checkpoint after every epoch.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # import the correct loss and training functions depending which model to optimize
    # TODO: these could easily be refactored into one function, but we kept it this way for modularity
    if args.train_var:
        if args.joint:
            from lib.Training.train import train_var_joint as train
            from lib.Training.validate import validate_var_joint as validate
            from lib.Training.loss_functions import var_loss_function_joint as criterion
        else:
            from lib.Training.train import train_var as train
            from lib.Training.validate import validate_var as validate
            from lib.Training.loss_functions import var_loss_function as criterion
    else:
        if args.joint:
            from lib.Training.train import train_joint as train
            from lib.Training.validate import validate_joint as validate
            from lib.Training.loss_functions import loss_function_joint as criterion
        else:
            from lib.Training.train import train as train
            from lib.Training.validate import validate as validate
            from lib.Training.loss_functions import loss_function as criterion

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\
                '_dropout_' + str(args.dropout)

    if args.train_var:
        save_path += '_variational_samples_' + str(
            args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    if args.joint:
        save_path += '_joint'

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file; the context manager closes the handle even if a write fails
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    epoch = 0
    best_prec = 0
    # any finite validation loss compares as smaller than infinity, so the first epoch is always flagged as best
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint written on a GPU machine be restored when only the CPU is available
            checkpoint = torch.load(args.resume, map_location=device)
            # NOTE(review): the checkpoint stores the index of the epoch that had just finished (it is saved before
            # the counter is incremented below), so resuming repeats that epoch index - confirm this is intended
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until final amount of epochs is reached; the writer is closed even if training is interrupted
    try:
        while epoch < args.epochs:
            # train
            train(dataset, model, criterion, epoch, optimizer, writer, device,
                  args)

            # evaluate on validation set
            prec, loss = validate(dataset, model, criterion, epoch, writer,
                                  device, args)

            # remember best prec@1 / lowest loss and save checkpoint (the "best" flag is driven by the loss)
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            best_prec = max(prec, best_prec)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.architecture,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, is_best, save_path)

            # increment epoch counters
            epoch += 1
    finally:
        writer.close()
예제 #7
0
def train_val_net(state_list, dataset, weight_initializer, device, args,
                  save_path):
    """
    builds a net given a state list, and trains and validates it

    Parameters:
        state_list (list): list of states to build the net
        dataset (lib.Datasets.datasets.CODEBRIM): dataset to train and validate the net on
        weight_initializer (lib.Models.initialization.WeightInit): weight initializer for initializing the weights of
                                                                   the network
        device (torch.device): type of computational device available (cpu / gpu)
        args (argparse.ArgumentParser): parsed command line arguments
        save_path (string): path for saving results to

    Returns:
        A 13-tuple. If the net doesn't fit the memory even after batch splitting, memfit and train_flag are False
        and every other entry is None. Otherwise:
        memfit (bool): True, the network fits the memory (possibly after batch splitting)
        spp_size: value of the net's spp_size attribute
        hard_best_val: maximum of the 'hard' validation metric over all epochs
        hard_val_all_epochs (list): 'hard' validation metric of every epoch
        soft_best_val: maximum of the 'soft' validation metric over all epochs
        soft_val_all_epochs (list): 'soft' validation metric of every epoch
        train_flag (bool): False if net's been early-stopped, True otherwise
        hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,
        hard_best_efflorescence, hard_best_corrosion_stain: per-class 'hard' validation values from the epoch with
                                                            the best overall 'hard' validation metric
    """
    # reset the data loaders
    dataset.train_loader, dataset.val_loader, dataset.test_loader = dataset.get_dataset_loader(
        args.batch_size, args.workers, torch.cuda.is_available())
    net_input, _ = next(iter(dataset.train_loader))

    num_classes = dataset.num_classes
    batch_size = net_input.size(0)

    # gets number of available gpus and total gpu memory; the count is clamped to 1 because it's used as a divisor
    # below and would otherwise raise ZeroDivisionError on a machine without any gpu
    num_gpu = float(max(torch.cuda.device_count(), 1))
    gpu_mem = GPUMem(torch.device('cuda') == device)

    # builds the net from the state list
    model = Net(state_list, num_classes, net_input, args.batch_norm,
                args.drop_out_drop)

    print(model)
    print('*' * 80)
    print('no. of spp scales: {}'.format(model.spp_size))
    print('*' * 80)

    # sets cudnn benchmark flag
    cudnn.benchmark = True

    # initializes weights
    weight_initializer.init_model(model)

    # puts model on gpu/cpu
    model = model.to(device)

    # gets available gpu memory
    gpu_avail = (gpu_mem.total_mem -
                 gpu_mem.total_mem * gpu_mem.get_mem_util()) / 1024.
    print('gpu memory available:{gpu_avail:.4f}'.format(gpu_avail=gpu_avail))

    # prints estimated gpu requirement of model but actual memory requirement is higher than what's estimated (from
    # experiments)
    print("model's estimated gpu memory requirement: {gpu_mem_req:.4f} GB".
          format(gpu_mem_req=model.gpu_mem_req))

    # scaling factor and buffer for matching expected memory requirement with empirically observed memory requirement
    scale_factor = 4.0
    scale_buffer = 1.0
    scaled_gpu_mem_req = (scale_factor /
                          num_gpu) * model.gpu_mem_req + scale_buffer
    print(
        "model's empirically scaled gpu memory requirement: {scaled_gpu_mem_req:.4f}"
        .format(scaled_gpu_mem_req=scaled_gpu_mem_req))

    split_batch_size = batch_size
    # splits batch into smaller batches
    if gpu_avail < scaled_gpu_mem_req:
        # estimates split batch size as per available gpu mem. (may not be a factor of original batch size)
        approx_split_batch_size = int(
            ((gpu_avail - scale_buffer) * num_gpu / scale_factor) //
            (model.gpu_mem_req / float(batch_size)))
        split_batch_size = _closest_split_batch_size(batch_size,
                                                     approx_split_batch_size,
                                                     len(dataset.train_set))

    print('split batch size:{}'.format(split_batch_size))
    print('*' * 80)

    # returns memfit = False if model doesn't fit in memory even after splitting the batch size to as small as 1
    if split_batch_size < 2:
        return False, None, None, None, None, None, False, None, None, None, None, None, None

    # set the data loaders using the split batch size
    dataset.train_loader, dataset.val_loader, dataset.test_loader = dataset.get_dataset_loader(
        split_batch_size, args.workers, torch.cuda.is_available())

    # use data parallelism for multi-gpu machine
    model = torch.nn.DataParallel(model)

    # binary cross entropy loss criterion, averaged over the elements
    criterion = nn.BCELoss(reduction='mean').to(device)

    # SGD optimizer with warm restarts
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # quarter cosine learning rate schedule for SGD with warm restarts; it's given the original (unsplit) batch size
    lr_scheduler = LearningRateScheduler(args.lr_wr_epochs,
                                         len(dataset.train_loader.dataset),
                                         args.batch_size, args.learning_rate,
                                         args.lr_wr_mul, args.lr_wr_min)

    train_flag = True
    epoch = 0
    hard_val_all_epochs = []
    soft_val_all_epochs = []
    hard_best_background = 0.0
    hard_best_crack = 0.0
    hard_best_spallation = 0.0
    hard_best_exposed_bars = 0.0
    hard_best_efflorescence = 0.0
    hard_best_corrosion_stain = 0.0

    while epoch < args.epochs:
        # train and validate the model
        train(dataset.train_loader, model, criterion, epoch, optimizer,
              lr_scheduler, device, args, split_batch_size)
        _, hard_val, soft_val, hard_background, hard_crack, hard_spallation, hard_exposed_bars,\
            hard_efflorescence, hard_corrosion_stain = val(dataset.val_loader, model, criterion, device)
        if int(args.task) == 2:
            _ = val(dataset.test_loader,
                    model,
                    criterion,
                    device,
                    is_val=False)

        hard_val_all_epochs.append(hard_val)
        soft_val_all_epochs.append(soft_val)

        # the current epoch is the best one if its value equals the maximum over all epochs *including itself*;
        # comparing against the previous epochs only would miss every strictly better epoch
        is_best = max(hard_val_all_epochs) == hard_val
        if is_best:
            hard_best_background = hard_background
            hard_best_crack = hard_crack
            hard_best_spallation = hard_spallation
            hard_best_exposed_bars = hard_exposed_bars
            hard_best_efflorescence = hard_efflorescence
            hard_best_corrosion_stain = hard_corrosion_stain

        if int(args.task) == 2:
            # saves model dict while training fixed net
            state = {
                'epoch':
                epoch,
                'arch':
                'Fixed net: replay buffer - {}, index no - {}'.format(
                    args.replay_buffer_csv_path, args.fixed_net_index_no),
                'state_dict':
                model.state_dict(),
                'hard_val':
                hard_val,
                'optimizer':
                optimizer.state_dict()
            }
            save_checkpoint(state, is_best, save_path)

        # checks for early stopping; early-stops if the mean of the validation accuracy from the last 5 epochs up to
        # the early stopping epoch isn't at least as high as the early stopping threshold
        if epoch == (args.early_stopping_epoch - 1) and float(np.mean(hard_val_all_epochs[-5:])) <\
                (args.early_stopping_thresh * 100.):
            train_flag = False
            break

        epoch += 1
    hard_best_val = max(hard_val_all_epochs)
    soft_best_val = max(soft_val_all_epochs)

    # free up memory by deleting objects
    spp_size = model.module.spp_size
    del model, criterion, optimizer, lr_scheduler

    return True, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, train_flag,\
           hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,\
           hard_best_efflorescence, hard_best_corrosion_stain


def _closest_split_batch_size(batch_size, approx_split_batch_size,
                              num_train_samples):
    """
    picks the split batch size closest to the estimated one that is also a factor of the original batch size

    Parameters:
        batch_size (int): original batch size
        approx_split_batch_size (int): split batch size estimated from the available memory
        num_train_samples (int): number of samples in the training set

    Returns:
        split_batch_size (int): chosen split batch size, 1 if no candidate in [2, approx_split_batch_size] qualifies
    """
    diff = float('inf')
    split_batch_size = 1
    for j in range(2, approx_split_batch_size + 1):
        # candidate must divide the original batch size and leave a terminal batch of more than 1 sample
        # NOTE(review): `% j > 1` also rejects a remainder of 0, i.e. a training set that splits into full batches
        # only - confirm whether `!= 1` was intended
        if batch_size % j == 0 and abs(
                j - approx_split_batch_size) < diff and (
                    num_train_samples % j > 1):
            diff = abs(j - approx_split_batch_size)
            split_batch_size = j
    return split_batch_size