def train(self, config: ConfigurationNode = None):
        """
        Take a configuration node and train the model from it.
        :param config:
        :return:
        """
        if config is None:
            config = self.config
        # Create writable timestamp for easier record keeping
        timestamp = datetime.now().isoformat(sep="T", timespec="auto")
        name_timestamp = timestamp.replace(":", "_")

        # Start the mlflow run:
        mlflow.start_run(run_name=name_timestamp)

        # Check that the output path is set, then resolve the output, training and validation data paths from the config
        assert config.OUTPUT_PATH != ''
        path_output = config.OUTPUT_PATH  # output folder
        path_train = config.DATASET.TRAIN_DATA_PATH  # training data folder
        path_val = config.DATASET.VAL_DATA_PATH  # validation data folder

        # Make the output dir and its parents if they do not exist.
        if not os.path.exists(path_output):
            os.makedirs(path_output)

        # Make result folders if they do not exist.
        self.results_dir = (Path(path_output) / name_timestamp)
        if not os.path.exists(self.results_dir):
            os.makedirs(self.results_dir)

        # Make backup folders if they do not exist.
        self.backup_dir = os.path.join(self.results_dir, 'model_backups')
        if not os.path.exists(self.backup_dir):
            os.makedirs(self.backup_dir)

        writer_tensorboard = SummaryWriter(log_dir=Path(self.results_dir /
                                                        "logs_tensorflow"))

        # The config has been fully merged at this point; dump a copy alongside the results for traceability.
        config.dump(stream=open(
            os.path.join(self.results_dir, f'config{name_timestamp}.yaml'),
            'w'))

        # file path to store the state of the model.
        state_fpath = os.path.join(self.results_dir,
                                   f'model{name_timestamp}.pt')

        # Path and buffer for the per-epoch performance trace.
        perf_path = os.path.join(self.results_dir, f'trace{name_timestamp}.p')
        perf_trace = []

        # Load data, create the data loader objects from them.
        data_train = pickle.load(open(path_train, 'rb'))
        data_val = pickle.load(open(path_val, 'rb'))
        self.loader_train = build_data_loader(data_train, config.DATASET, True)
        self.loader_val = build_data_loader(data_val, config.DATASET, False)

        # Build the model from the config node
        self.model = build_model(config.MODEL)

        # Enable parallel multi-GPU mode if the config specifies it.
        if config.MODEL.PARALLEL:
            print("Using parallel processing")
            self.model = torch.nn.DataParallel(self.model)

        current_epoch = 0

        # For resuming training (i.e. load checkpoint)
        if config.RESUME_PATH != "":
            checkpoint = torch.load(config.RESUME_PATH, map_location='cpu')
            current_epoch = checkpoint['epoch']
            self.model.load_state_dict(checkpoint["model_state"])
        _ = self.model.cuda()

        # SOLVER EVALUATOR
        cfg_solver = config.MODEL.SOLVER

        # Build the optimizer from the solver portion of the configuration.
        optimizer = build_optimizer(self.model, cfg_solver)

        # Build the evaluator from the solver portion of the configuration.
        evaluator = build_evaluator(cfg_solver)

        evaluator.float().cuda()
        total_epochs = cfg_solver.TOTAL_EPOCHS

        # Main training epoch loop starts here.
        for epoch in range(current_epoch, total_epochs):

            # Train a single epoch
            self.train_epoch(epoch, evaluator, optimizer, perf_path,
                             perf_trace, state_fpath, writer_tensorboard)

        mlflow.end_run()
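
# Hedged usage sketch: `train` above is a method of a trainer object whose class
# definition is not shown in this snippet. The wrapper and config helper names
# below are assumptions, purely for illustration:
#
#     cfg = get_cfg_defaults()               # hypothetical YACS-style config loader
#     cfg.merge_from_file("experiments/baseline.yaml")
#     trainer = BengaliTrainer(config=cfg)   # hypothetical trainer class
#     trainer.train()                        # falls back to self.config when no node is passed
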
def find_lr(cfg, max_iter=400, init_value=1e-6, final_value=1.0):
    '''
    WIP: learning-rate range test.
    We track the loss obtained at different lr values: same training loop as usual,
    but the lr is multiplied by a fixed update step at every batch iteration.
    A smoothing spline is applied to the losses for better visualization afterwards.
    '''
    # FILES, PATHS
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    val_data = pickle.load(open(val_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    current_epoch = 0
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    _ = model.cuda()

    # SOLVER EVALUATOR
    solver_cfg = cfg.MODEL.SOLVER
    optimizer = build_optimizer(model, solver_cfg)
    evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    total_epochs = solver_cfg.TOTAL_EPOCHS

    # find_lr variables: the lr is multiplied by update_step every batch.
    # With an exponent of 2 / number_in_epoch, the lr reaches final_value
    # after roughly half an epoch.
    number_in_epoch = len(train_loader) - 1
    update_step = (final_value / init_value)**(2 / number_in_epoch)
    lr = init_value
    optimizer.param_groups[0]["lr"] = lr
    best_loss, batch_num = 0.0, 0
    losses, log_lrs = [], []

    model.train()
    train_itr = iter(train_loader)
    total_err = 0
    total_acc = 0
    for idx, (inputs, labels) in enumerate(train_itr):
        batch_num += 1
        # compute
        input_data = inputs.float().cuda()
        labels = labels.cuda()
        grapheme_logits, vowel_logits, consonant_logits = model(input_data)

        eval_result = evaluator(grapheme_logits, vowel_logits,
                                consonant_logits, labels)

        # keep track of the loss
        loss = eval_result['loss']

        # Stopping condition: the loss explodes or we reach max_iter batches
        if (batch_num > 1 and loss > 4 * best_loss) or idx == max_iter:
            losses = [x.item() for x in losses]
            losses = smoothen_by_spline(log_lrs, losses, s=4)
            return log_lrs[10:-5], losses[10:-5]

        # Record the best loss
        if loss < best_loss or batch_num == 1:
            best_loss = loss

        # Store the values
        losses.append(loss)
        log_lrs.append(math.log10(lr))

        # Do the backward pass and optimize

        optimizer.zero_grad()
        eval_result['loss'].backward()
        optimizer.step()

        eval_result = {k: eval_result[k].item() for k in eval_result}
        total_err += eval_result['loss']
        total_acc += eval_result['acc']
        if idx % 100 == 0:
            print(idx, eval_result['loss'], eval_result['acc'])

        # update the lr
        lr *= update_step
        optimizer.param_groups[0]["lr"] = lr
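

# Hedged usage sketch: one way the lr sweep above might be consumed. matplotlib
# and the helper name are assumptions, not part of this repository.
def _plot_lr_sweep(cfg):
    import matplotlib.pyplot as plt

    # Run the range test and plot smoothed loss against log10(lr); a common
    # heuristic is to pick an lr about an order of magnitude below the loss minimum.
    log_lrs, losses = find_lr(cfg)
    plt.plot(log_lrs, losses)
    plt.xlabel('log10(learning rate)')
    plt.ylabel('smoothed loss')
    plt.show()
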
def train(cfg, debug=False):
    
    #############################
    # Pre-training 
    #############################

    # PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    # sample is 1/4th of the train images - aka 1 .parquet file
    train_path_sample = cfg.DATASET.TRAIN_DATA_SAMPLE
    valid_path_sample = cfg.DATASET.VALID_DATA_SAMPLE

    # Create writable timestamp for easier record keeping
    timestamp = datetime.now().isoformat(sep="T", timespec="auto")
    name_timestamp = timestamp.replace(":", "_")

    # Make output dir and its parents if they do not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Make backup folders if they do not exist
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)

    # Make result folders if they do not exist 
    results_dir = os.path.join(output_path, 'results')
    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    # to initialize Tensorboard
    writer_tensorboard = SummaryWriter(log_dir=os.path.join(results_dir, "logs_tensorflow"))

    # Save configs
    cfg.dump(stream=open(os.path.join(results_dir, f'config_{name_timestamp}.yaml'), 'w'))
    
    # File path to store the state of the model
    state_fpath = os.path.join(output_path, 'model.pt')

    # Performance path where we'll save our metrics to trace.p
    perf_path = os.path.join(results_dir, 'trace.p')
    perf_trace = []

    # Data selection: debug mode loads the smaller sampled files,
    # otherwise use the folds split or the full train/val pickles.
    if debug:
        train_data = pickle.load(open(train_path_sample, 'rb'))
        val_data = pickle.load(open(valid_path_sample, 'rb'))
    elif cfg.DATASET.USE_FOLDS_DATA:
        # Folds: hold out one fold for validation, train on the rest
        data_path = cfg.DATASET.FOLDS_PATH
        all_data_folds = pickle.load(open(data_path, 'rb'))
        val_fold = cfg.DATASET.VALIDATION_FOLD
        train_data = []
        val_data = []
        for idx, entries in enumerate(all_data_folds):
            if idx == val_fold:
                val_data = entries
            else:
                train_data = train_data + entries
    else:
        train_data = pickle.load(open(train_path, 'rb'))
        val_data = pickle.load(open(val_path, 'rb'))

    # Optionally restrict training/validation to a subset of (focus) classes
    focus_cls = cfg.DATASET.FOCUS_CLASS
    if len(focus_cls) > 0:
        train_data = [x for x in train_data if x[1][0] in focus_cls]
        val_data = [x for x in val_data if x[1][0] in focus_cls]

    # DataLoader
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # Build model using config dict node
    model = build_model(cfg.MODEL)

    # Solver evaluator
    solver_cfg = cfg.MODEL.SOLVER

    # Epochs
    total_epochs = solver_cfg.TOTAL_EPOCHS

    # Loss function
    loss_fn = solver_cfg.LOSS.NAME

    # for weighted focal loss, initialize last layer bias weights as constant
    if loss_fn == 'weighted_focal_loss':
        last_layer = model.head.fc_layers[-1]
        for m in last_layer.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.constant_(m.bias, -3.0)
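
    # Hedged note: initializing the last-layer bias to a negative constant follows
    # the focal-loss "prior probability" trick, bias = -log((1 - pi) / pi);
    # bias = -3.0 corresponds to an initial positive-class prior of roughly pi = 0.047.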

    current_epoch = 0

    # MultiGPU training
    multi_gpu_training = cfg.MULTI_GPU_TRAINING
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    if multi_gpu_training:
        model = torch.nn.DataParallel(model)
    _ = model.cuda()

    # Optimizer, scheduler, amp
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)
    use_amp = solver_cfg.AMP

    # ------ Uncomment if we use apex library --------
    # if use_amp:
    #     opt_level = 'O1'
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    # Build Scheduler
    scheduler_cfg = solver_cfg.SCHEDULER
    scheduler_type = scheduler_cfg.NAME
    scheduler = build_scheduler(optimizer, scheduler_cfg)

    # Resume training with the matching optimizer and scheduler state
    if cfg.RESUME_PATH != "":
        if 'optimizer_state' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state'])
        if 'scheduler_state' in checkpoint and scheduler is not None:
            scheduler.load_state_dict(checkpoint['scheduler_state'])

        # ------ Uncomment if we use apex library --------
        # if use_amp and 'amp_state' in checkpoint:
        #     amp.load_state_dict(checkpoint['amp_state'])

    # Build evaluator with or without Mixup
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)

    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    ##########################################
    # Main training epoch loop starts here   
    ##########################################

    s_time = time.time()
    parameters = list(model.parameters())
    for epoch in range(current_epoch, total_epochs):
        model.train()
        if multi_gpu_training:
            # model is wrapped in DataParallel here, so reach the underlying module
            model.module.freeze_bn()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0

        for idx, (inputs, labels) in enumerate(train_itr):

            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()

            # Use the model to produce the classification
            if mixup_training:
                input_data, labels = mixup_augmenter(input_data, labels)
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)

            # Calling MultiHeadsEval forward function to produce evaluator results
            if mixup_training:
                eval_result = mixup_evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            else:
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            optimizer.zero_grad()
            loss = eval_result['loss']


            # ------ Uncomment if we use apex library --------
            # if use_amp:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()

            # get loss, back propagate, step
            loss.backward()
            max_grad = torch.max(parameters[-1].grad)
            if not torch.isnan(max_grad):
                optimizer.step()
            else:
                print('NAN in gradient, skip this step')
                optimizer.zero_grad()

            # tabulate the steps from the evaluation
            eval_result = {k: eval_result[k].item() for k in eval_result}
            
            # OneCycleLR is stepped once per batch; other schedulers step once per epoch after validation
            if scheduler_type == 'OneCycleLR':
                scheduler.step()
            
            if idx % 100 == 0:
                t_time = time.time()
                print(idx, eval_result['loss'], eval_result['acc'], t_time - s_time)
                s_time = time.time()

        ###############################
        # Send images to Tensorboard 
        # -- could also do this outside the loop with xb, yb = next(iter(DL))
        ###############################

        if epoch == 0: 
            # Get the std and mean of each channel
            std = torch.FloatTensor(cfg.DATASET.NORMALIZE_STD).view(3,1,1)
            m = torch.FloatTensor(cfg.DATASET.NORMALIZE_MEAN).view(3,1,1)

            # Un-normalize the images; move mean and std to the GPU for the mixup batch
            imgs, imgs_mixup = ((inputs*std)+m)*255, ((input_data*std.cuda())+m.cuda())*255
            imgs, imgs_mixup = imgs.type(torch.uint8), imgs_mixup.type(torch.uint8)
            img_grid = torchvision.utils.make_grid(imgs)
            img_grid_mixup = torchvision.utils.make_grid(imgs_mixup)


            writer_tensorboard.add_image("images no mixup", img_grid)
            writer_tensorboard.add_image("images with mixup", img_grid_mixup)

        ####################
        # Training metrics
        ####################
        if mixup_training:
            train_result = mixup_evaluator.evalulate_on_cache()
            mixup_evaluator.clear_cache()
        else:
            train_result = evaluator.evalulate_on_cache()

        # Store training loss, accuracy, kaggle score and write to Tensorboard    
        train_total_err = train_result['loss']
        writer_tensorboard.add_scalar('Loss/train', train_total_err, global_step=epoch)

        train_total_acc = train_result['acc']
        writer_tensorboard.add_scalar('Accuracy/train', train_total_acc, global_step=epoch)

        train_kaggle_score = train_result['kaggle_score']
        writer_tensorboard.add_scalar('Kaggle_Score/train', train_kaggle_score, global_step=epoch)
        
        lr = optimizer.param_groups[-1]['lr']
        
        
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(epoch, train_total_err, train_total_acc))
        evaluator.clear_cache()

        ###############################
        # Compute validation error
        ###############################

        model.eval()
        val_itr = iter(val_loader)
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']

        val_result = evaluator.evalulate_on_cache()
        val_total_err = val_result['loss']
        val_total_acc = val_result['acc']
        val_kaggle_score = val_result['kaggle_score']

        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(epoch, val_total_err, val_total_acc))
        evaluator.clear_cache()

        # Update the scheduler here if it is not OneCycleLR (which steps per batch)
        if scheduler is not None and scheduler_type != 'OneCycleLR':
            if scheduler_type == 'reduce_on_plateau':
                scheduler.step(val_total_err)
            else:
                scheduler.step()


        ######################################
        # Saving the model + performance
        ######################################

        print("Saving the model (epoch %d)" % epoch)
        save_state = {
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        if scheduler is not None:
            save_state['scheduler_state'] = scheduler.state_dict()

        # ------ Uncomment if we use apex library --------
        # if use_amp:
        #     save_state['amp_state'] = amp.state_dict()
        torch.save(save_state, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir, "model_bak_%06d.pt" % (epoch,))
        torch.save(save_state, backup_fpath)

        # Dump the traces
        perf_trace.append(
            {
                'epoch': epoch,
                'train_err': train_total_err,
                'train_acc': train_total_acc,
                'train_kaggle_score': train_kaggle_score,
                'val_err': val_total_err,
                'val_acc': val_total_acc,
                'val_kaggle_score': val_kaggle_score
            }
        )
        pickle.dump(perf_trace, open(perf_path, 'wb'))

        # store epoch full result separately
        epoch_result = {
            'epoch': epoch,
            'train_result': train_result,
            'val_result': val_result
        }
        pickle.dump(epoch_result, open(os.path.join(results_dir, 'result_epoch_{0}.p'.format(epoch)), 'wb'))

        # output_path_base = os.path.basename(output_path)
        # os.system('aws s3 sync /root/bengali_data/{0} s3://eaitest1/{1}'.format(output_path_base, output_path_base))
        # os.system('rm -r /root/bengali_data/{0}/model_backups'.format(output_path_base))
        # os.system('mkdir /root/bengali_data/{0}/model_backups'.format(output_path_base))

        
    # After training: add the model graph to Tensorboard to inspect the architecture, then close the writer
    writer_tensorboard.add_graph(model, input_data)
    writer_tensorboard.close()
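

# Hedged usage sketch (the config object and the tensorboard invocation are
# assumptions): run a quick pass on the sampled .parquet split, then inspect
# the logged curves.
#
#     train(cfg, debug=True)
#     # shell: tensorboard --logdir <OUTPUT_PATH>/results/logs_tensorflow
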
def find_lr(cfg, max_iter=1400, init_value=1e-6, final_value=1.0):
    '''
    WIP: learning-rate range test.
    We track the loss obtained at different lr values: same training loop as usual,
    but the lr is multiplied by a fixed update step at every batch iteration.
    A smoothing spline is applied to the losses for better visualization afterwards.
    '''
    # FILES, PATHS
    train_path = cfg.DATASET.TRAIN_DATA_PATH

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)

    # MODEL
    model = build_model(cfg.MODEL)
    model.cuda()

    # Solver evaluator
    solver_cfg = cfg.MODEL.SOLVER

    total_epochs = solver_cfg.SCHEDULER.TOTAL_EPOCHS

    # Build optimizer
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)

    # Build scheduler
    sched_cfg = solver_cfg.SCHEDULER
    scheduler = build_scheduler(optimizer,
                                sched_cfg,
                                steps_per_epoch=len(train_loader),
                                epochs=total_epochs)

    # Build evaluator with or without Mixup
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)
    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    # find_lr variables
    number_in_epoch = len(train_loader) - 1
    update_step = (final_value / init_value)**(2 / number_in_epoch)
    lr = init_value
    optimizer.param_groups[0]["lr"] = lr
    best_loss, batch_num = 0.0, 0
    losses, log_lrs = [], []

    model.train()
    train_itr = iter(train_loader)
    for idx, (inputs, labels) in enumerate(train_itr):
        batch_num += 1
        # compute
        input_data = inputs.float().cuda()
        labels = labels.cuda()

        # Use the model to produce the classification
        if mixup_training:
            input_data, labels = mixup_augmenter(input_data, labels)
        grapheme_logits, vowel_logits, consonant_logits = model(input_data)

        # Calling MultiHeadsEval forward function to produce evaluator results
        if mixup_training:
            eval_result = mixup_evaluator(grapheme_logits, vowel_logits,
                                          consonant_logits, labels)
        else:
            eval_result = evaluator(grapheme_logits, vowel_logits,
                                    consonant_logits, labels)

        # get loss, back propagate, step
        loss = eval_result['loss']

        # Stopping condition: the loss explodes or we reach max_iter batches
        if (batch_num > 1 and loss > 4 * best_loss) or idx == max_iter:
            losses = [x.item() for x in losses]
            losses = smoothen_by_spline(log_lrs, losses, s=4)
            return log_lrs[10:-5], losses[10:-5]

        # Record the best loss
        if loss < best_loss or batch_num == 1:
            best_loss = loss

        # Store the values
        losses.append(loss)
        log_lrs.append(math.log10(lr))

        # Do the backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        eval_result = {k: eval_result[k].item() for k in eval_result}

        if idx % 100 == 0:
            print(idx, eval_result['loss'], eval_result['acc'])

        # update the lr
        lr *= update_step
        optimizer.param_groups[0]["lr"] = lr
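

# Hedged follow-up sketch: one way to turn the sweep into a peak learning rate
# for a OneCycleLR-style schedule. The argmin heuristic, the numpy usage and the
# helper name are assumptions, not part of this repository.
def _suggest_max_lr(cfg):
    import numpy as np

    log_lrs, losses = find_lr(cfg, max_iter=1400)
    # Take the lr at the smoothed-loss minimum and back off by one order of magnitude.
    best_log_lr = log_lrs[int(np.argmin(losses))]
    return 10 ** (best_log_lr - 1)
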
def train(cfg, debug=False):

    # FILES, PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH
    train_path_sample = cfg.DATASET.TRAIN_DATA_SAMPLE
    valid_path_sample = cfg.DATASET.VALID_DATA_SAMPLE

    if not os.path.exists(output_path):
        os.mkdir(output_path)
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)

    results_dir = os.path.join(output_path, 'results')
    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    cfg.dump(stream=open(os.path.join(output_path, 'config.yaml'), 'w'))
    state_fpath = os.path.join(output_path, 'model.pt')

    perf_path = os.path.join(results_dir, 'trace.p')
    perf_trace = []

    # DATA LOADER: debug mode loads the smaller sampled files,
    # otherwise use the folds split or the full train/val pickles.
    if debug:
        train_data = pickle.load(open(train_path_sample, 'rb'))
        val_data = pickle.load(open(valid_path_sample, 'rb'))
    elif cfg.DATASET.USE_FOLDS_DATA:
        data_path = cfg.DATASET.FOLDS_PATH
        all_data_folds = pickle.load(open(data_path, 'rb'))
        val_fold = cfg.DATASET.VALIDATION_FOLD
        train_data = []
        val_data = []
        for idx, entries in enumerate(all_data_folds):
            if idx == val_fold:
                val_data = entries
            else:
                train_data = train_data + entries
    else:
        train_data = pickle.load(open(train_path, 'rb'))
        val_data = pickle.load(open(val_path, 'rb'))

    # Optionally restrict training/validation to a subset of (focus) classes
    focus_cls = cfg.DATASET.FOCUS_CLASS
    if len(focus_cls) > 0:
        train_data = [x for x in train_data if x[1][0] in focus_cls]
        val_data = [x for x in val_data if x[1][0] in focus_cls]

    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    solver_cfg = cfg.MODEL.SOLVER
    total_epochs = solver_cfg.TOTAL_EPOCHS
    loss_fn = solver_cfg.LOSS.NAME
    # for weighted focal loss, initialize last layer bias weights as constant
    if loss_fn == 'weighted_focal_loss':
        last_layer = model.head.fc_layers[-1]
        for m in last_layer.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.constant_(m.bias, -3.0)

    current_epoch = 0
    multi_gpu_training = cfg.MULTI_GPU_TRAINING
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    if multi_gpu_training:
        model = torch.nn.DataParallel(model)
    _ = model.cuda()

    # optimizer, scheduler, amp
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)
    use_amp = solver_cfg.AMP

    # ------ Uncomment if we use apex library --------
    # if use_amp:
    #     opt_level = 'O1'
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    if cfg.RESUME_PATH != "":
        if 'optimizer_state' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state'])

        # ------ Uncomment if we use apex library --------
        # if use_amp and 'amp_state' in checkpoint:
        #     amp.load_state_dict(checkpoint['amp_state'])

    scheduler_cfg = solver_cfg.SCHEDULER
    scheduler_type = scheduler_cfg.NAME
    scheduler = build_scheduler(optimizer, scheduler_cfg)

    # evaluator
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)

    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    s_time = time.time()
    parameters = list(model.parameters())
    for epoch in range(current_epoch, total_epochs):
        model.train()
        if multi_gpu_training:
            # model is wrapped in DataParallel here, so reach the underlying module
            model.module.freeze_bn()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0

        for idx, (inputs, labels) in enumerate(train_itr):

            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()
            if mixup_training:
                input_data, labels = mixup_augmenter(input_data, labels)
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)

            if mixup_training:
                eval_result = mixup_evaluator(grapheme_logits, vowel_logits,
                                              consonant_logits, labels)
            else:
                eval_result = evaluator(grapheme_logits, vowel_logits,
                                        consonant_logits, labels)
            optimizer.zero_grad()
            loss = eval_result['loss']

            # ------ Uncomment if we use apex library --------
            # if use_amp:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()

            loss.backward()
            max_grad = torch.max(parameters[-1].grad)
            if not torch.isnan(max_grad):
                optimizer.step()
            else:
                print('NAN in gradient, skip this step')
                optimizer.zero_grad()

            eval_result = {k: eval_result[k].item() for k in eval_result}
            if idx % 100 == 0:
                t_time = time.time()
                print(idx, eval_result['loss'], eval_result['acc'],
                      t_time - s_time)
                s_time = time.time()
        if mixup_training:
            train_result = mixup_evaluator.evalulate_on_cache()
            mixup_evaluator.clear_cache()
        else:
            train_result = evaluator.evalulate_on_cache()
        train_total_err = train_result['loss']
        train_total_acc = train_result['acc']
        train_kaggle_score = train_result['kaggle_score']
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(
            epoch, train_total_err, train_total_acc))
        evaluator.clear_cache()

        # compute validation error
        model.eval()
        val_itr = iter(val_loader)
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(
                    input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits,
                                        consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']

        val_result = evaluator.evalulate_on_cache()
        val_total_err = val_result['loss']
        val_total_acc = val_result['acc']
        val_kaggle_score = val_result['kaggle_score']

        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(
            epoch, val_total_err, val_total_acc))
        evaluator.clear_cache()

        if scheduler is not None:
            if scheduler_type == 'reduce_on_plateau':
                scheduler.step(val_total_err)
            else:
                scheduler.step()

        print("Saving the model (epoch %d)" % epoch)
        save_state = {
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        if scheduler is not None:
            save_state['scheduler_state'] = scheduler.state_dict()

        # ------ Uncomment if we use apex library --------
        # if use_amp:
        #     save_state['amp_state'] = amp.state_dict()
        torch.save(save_state, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir,
                                    "model_bak_%06d.pt" % (epoch, ))
        torch.save(save_state, backup_fpath)

        perf_trace.append({
            'epoch': epoch,
            'train_err': train_total_err,
            'train_acc': train_total_acc,
            'train_kaggle_score': train_kaggle_score,
            'val_err': val_total_err,
            'val_acc': val_total_acc,
            'val_kaggle_score': val_kaggle_score
        })
        pickle.dump(perf_trace, open(perf_path, 'wb'))

        # store epoch full result separately
        epoch_result = {
            'epoch': epoch,
            'train_result': train_result,
            'val_result': val_result
        }
        pickle.dump(
            epoch_result,
            open(os.path.join(results_dir, 'result_epoch_{0}.p'.format(epoch)),
                 'wb'))
def train(cfg):
    # FILES, PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    if not os.path.exists(output_path):
        os.mkdir(output_path)
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)
    state_fpath = os.path.join(output_path, 'model.pt')
    perf_path = os.path.join(output_path, 'trace.json')
    perf_trace = []

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    val_data = pickle.load(open(val_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    current_epoch = 0
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    _ = model.cuda()

    # SOLVER EVALUATOR
    solver_cfg = cfg.MODEL.SOLVER
    optimizer = build_optimizer(model, solver_cfg)
    evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    total_epochs = solver_cfg.TOTAL_EPOCHS

    for epoch in range(current_epoch, total_epochs):
        model.train()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0
        for idx, (inputs, labels) in enumerate(train_itr):

            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)

            eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            optimizer.zero_grad()
            eval_result['loss'].backward()
            optimizer.step()

            eval_result = {k: eval_result[k].item() for k in eval_result}
            total_err += eval_result['loss']
            total_acc += eval_result['acc']
            if idx % 100 == 0:
                print(idx, eval_result['loss'], eval_result['acc'])

        train_total_err = total_err / (1 + idx)
        train_total_acc = total_acc / (1 + idx)
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(epoch, train_total_err, train_total_acc))

        # compute validation error
        model.eval()
        val_itr = iter(val_loader)
        total_err = 0
        total_acc = 0
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']
                # print(total_err / (1 + idx), total_acc / (1 + idx))

        val_total_err = total_err / (1 + idx)
        val_total_acc = total_acc / (1 + idx)
        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(epoch, val_total_err, val_total_acc))

        print("Saving the model (epoch %d)" % epoch)
        torch.save({
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir, "model_bak_%06d.pt" % (epoch,))
        torch.save({
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, backup_fpath)

        perf_trace.append(
            {
                'epoch': epoch,
                'train_err': train_total_err,
                'train_acc': train_total_acc,
                'val_err': val_total_err,
                'val_acc': val_total_acc
            }
        )
        json.dump(perf_trace, open(perf_path, 'w'))
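

# Hedged usage sketch: reload the per-epoch trace written above for offline
# plotting. matplotlib and the helper name are assumptions.
def _plot_trace(cfg):
    import json
    import os
    import matplotlib.pyplot as plt

    with open(os.path.join(cfg.OUTPUT_PATH, 'trace.json')) as f:
        trace = json.load(f)
    epochs = [t['epoch'] for t in trace]
    plt.plot(epochs, [t['train_err'] for t in trace], label='train loss')
    plt.plot(epochs, [t['val_err'] for t in trace], label='val loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()
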
import pickle

from src.data.bengali_data import build_data_loader
from src.modeling.solver.optimizer import build_optimizer
from src.modeling.solver.evaluation import build_evaluator
from src.config.config import cfg
# NOTE: this snippet also calls build_model below; import it from the project's
# model-building module (its exact location is not shown here).

# FILES, PATHS
assert cfg.OUTPUT_PATH != ''
output_path = cfg.OUTPUT_PATH
train_path = cfg.DATASET.TRAIN_DATA_0

# DATA LOADER
train_data = pickle.load(open(train_path, 'rb'))
train_loader = build_data_loader(train_data, cfg.DATASET, True)

# MODEL
model = build_model(cfg.MODEL)

# SOLVER EVALUATOR
solver_cfg = cfg.MODEL.SOLVER
optimizer = build_optimizer(model, solver_cfg)
evaluator = build_evaluator(solver_cfg)
evaluator.float().cuda()
total_epochs = solver_cfg.TOTAL_EPOCHS

model.train()
print('Start training')
train_itr = iter(train_loader)
total_err = 0
total_acc = 0
for idx, (inputs, labels) in enumerate(train_itr):