예제 #1
0
def do_test(test_file, batch_size, gpu_mode, num_workers, model_path,
            print_details):
    """
    Load a saved model and evaluate it on a test set.

    The evaluation itself is delegated to ``test``; progress and error
    messages are written to stderr.
    :param test_file: A CSV file containing test image information
    :param batch_size: Batch size for testing
    :param gpu_mode: If true the model will be evaluated on GPU
    :param num_workers: Number of workers for data loading
    :param model_path: Path to a saved model
    :param print_details: Forwarded unchanged to ``test`` as ``print_details``
    :return: None. The process exits with status 1 if model_path is not a file.
    """
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    if not os.path.isfile(model_path):
        # Reset the terminal colour before exiting so the shell prompt is not
        # left in red.
        sys.stderr.write(TextColor.RED + "ERROR: INVALID PATH TO MODEL\n" +
                         TextColor.END)
        sys.exit(1)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADING\n" + TextColor.END)

    # The checkpoint also stores the epoch count, which is not needed here.
    transducer_model, hidden_size, gru_layers, _ = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADED\n" + TextColor.END)

    if gpu_mode:
        transducer_model = torch.nn.DataParallel(transducer_model).cuda()

    test(test_file,
         batch_size,
         gpu_mode,
         transducer_model,
         num_workers,
         gru_layers,
         hidden_size,
         num_classes=ImageSizeOptions.TOTAL_LABELS,
         print_details=print_details)

    sys.stderr.write(TextColor.PURPLE + 'DONE\n' + TextColor.END)
예제 #2
0
def _train_transducer_epoch(transducer_model, train_loader, criterion,
                            model_optimizer, gpu_mode, gru_layers, hidden_size,
                            num_classes, epoch, train_loss_logger):
    """
    Run one training pass over train_loader, one optimizer step per window.

    :param transducer_model: Model called as ``model(image_chunk, hidden)``
    :param train_loader: Iterable yielding ``(images, labels)`` batches
    :param criterion: Loss applied to the flattened outputs and labels
    :param model_optimizer: Optimizer stepped after every window
    :param gpu_mode: If true the batch and hidden state are moved to GPU
    :param gru_layers: Number of GRU layers the model was built with
    :param hidden_size: GRU hidden size the model was built with
    :param num_classes: Size of the last output dimension
    :param epoch: Zero-based epoch index, logged as ``epoch + 1``
    :param train_loss_logger: Open file receiving "epoch,batch,avg_loss"
                              rows, or None to skip logging
    :return: None
    """
    total_loss = 0
    total_images = 0
    batch_no = 1

    with tqdm(total=len(train_loader), desc='Loss', leave=True,
              ncols=100) as progress_bar:
        # make sure the model is in train mode. BN is different in train and eval.
        transducer_model.train()
        for images, labels in train_loader:
            labels = labels.type(torch.LongTensor)
            images = images.type(torch.FloatTensor)

            # Size the initial hidden state from the configuration the model
            # was actually built/loaded with, not from global defaults.
            hidden = torch.zeros(images.size(0), 2 * gru_layers, hidden_size)

            if gpu_mode:
                images = images.cuda()
                labels = labels.cuda()
                hidden = hidden.cuda()

            for i in range(0, ImageSizeOptions.SEQ_LENGTH,
                           TrainOptions.WINDOW_JUMP):
                # Only full windows are trained on.
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break

                image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                label_chunk = labels[:, i:i + TrainOptions.TRAIN_WINDOW]

                model_optimizer.zero_grad()
                output_, hidden = transducer_model(image_chunk, hidden)

                loss = criterion(output_.contiguous().view(-1, num_classes),
                                 label_chunk.contiguous().view(-1))
                loss.backward()
                model_optimizer.step()

                total_loss += loss.item()
                total_images += image_chunk.size(0)

                # Carry the hidden state into the next window, but detach it
                # so gradients do not flow across windows.
                hidden = hidden.detach()

            # update the progress bar
            avg_loss = (total_loss / total_images) if total_images else 0
            progress_bar.set_description("Loss: " + str(avg_loss))

            if train_loss_logger is not None:
                train_loss_logger.write(
                    str(epoch + 1) + "," + str(batch_no) + "," +
                    str(avg_loss) + "\n")
            progress_bar.refresh()
            progress_bar.update(1)
            batch_no += 1


def train(train_file, test_file, batch_size, epoch_limit, gpu_mode,
          num_workers, retrain_model, retrain_model_path, gru_layers,
          hidden_size, lr, decay, model_dir, stats_dir, train_mode):
    """
    Train the GRU transducer model, evaluating with ``test`` after each epoch.

    :param train_file: Passed to SequenceDataset to build the training set
    :param test_file: Passed to ``test`` for the per-epoch evaluation
    :param batch_size: Batch size for training and evaluation
    :param epoch_limit: Number of epochs to run; when retraining in train
                        mode it is added to the epoch count from the checkpoint
    :param gpu_mode: If true the model, loss and batches are moved to GPU
    :param num_workers: Number of workers for data loading
    :param retrain_model: If true, resume from retrain_model_path
    :param retrain_model_path: Path to a saved checkpoint
    :param gru_layers: Number of GRU layers (replaced by the checkpoint's
                       value when retraining)
    :param hidden_size: GRU hidden size (replaced by the checkpoint's value
                        when retraining)
    :param lr: Learning rate for Adam
    :param decay: Weight decay for Adam
    :param model_dir: String prefix of the checkpoint files written per epoch
    :param stats_dir: String prefix of the three log files
    :param train_mode: If true, write logs and checkpoints. Otherwise nothing
                       is written and training stops early once ``epoch + 1 >= 10``
                       with accuracy below 98 (hyperband setup).
    :return: ``(transducer_model, model_optimizer, stats)`` where stats holds
             the latest 'loss' and 'accuracy' plus the per-epoch lists
             'loss_epoch' and 'accuracy_epoch'. The process exits with
             status 1 if retrain_model_path is not a file.
    """
    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    # Everything runs inside try/finally so the log files are closed on every
    # exit path, including sys.exit and the early-stopping return.
    try:
        if train_mode is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)
        num_classes = ImageSizeOptions.TOTAL_LABELS

        if retrain_model is True:
            if not os.path.isfile(retrain_model_path):
                sys.stderr.write(
                    TextColor.RED +
                    "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                    TextColor.END)
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" +
                             TextColor.END)
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model_for_training(retrain_model_path,
                                                            input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                            image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                            seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                            num_classes=num_classes)

            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" +
                             TextColor.END)
        else:
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_classes=num_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters()
                          if p.requires_grad)
        sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t" +
                         str(param_count) + "\n" + TextColor.END)

        if gpu_mode:
            transducer_model = torch.nn.DataParallel(transducer_model).cuda()

        class_weights = torch.Tensor(CLASS_WEIGHTS)
        # Loss
        criterion = nn.CrossEntropyLoss(class_weights)

        if gpu_mode is True:
            criterion = criterion.cuda()

        model_optimizer = torch.optim.Adam(transducer_model.parameters(),
                                           lr=lr,
                                           weight_decay=decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" +
                             TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(
                model_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" +
                             TextColor.END)

        # Bind the scheduler only after the optimizer may have been replaced
        # by the checkpoint loader, so it always drives the optimizer in use.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            model_optimizer, 'min')

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1) +
                         ' End: ' + str(epoch_limit) + "\n")
        for epoch in range(start_epoch, epoch_limit, 1):
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) +
                             "\n")

            _train_transducer_epoch(transducer_model, train_loader, criterion,
                                    model_optimizer, gpu_mode, gru_layers,
                                    hidden_size, num_classes, epoch,
                                    train_loss_logger)

            stats_dictioanry = test(test_file,
                                    batch_size,
                                    gpu_mode,
                                    transducer_model,
                                    num_workers,
                                    gru_layers,
                                    hidden_size,
                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
            stats['loss'] = stats_dictioanry['loss']
            stats['accuracy'] = stats_dictioanry['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            # The scheduler runs in 'min' mode on the evaluation loss.
            lr_scheduler.step(stats['loss'])

            # update the loggers
            if train_mode is True:
                # save the model after each epoch
                save_best_model(
                    transducer_model, model_optimizer, hidden_size, gru_layers,
                    epoch,
                    model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(
                    str(epoch + 1) + "," + str(stats['loss']) + "," +
                    str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(
                    str(epoch + 1) + "\n" +
                    str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            else:
                # this setup is for hyperband
                if epoch + 1 >= 10 and stats['accuracy'] < 98:
                    sys.stderr.write(
                        TextColor.PURPLE +
                        'EARLY STOPPING AS THE MODEL NOT DOING WELL\n' +
                        TextColor.END)
                    return transducer_model, model_optimizer, stats

        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return transducer_model, model_optimizer, stats
    finally:
        for logger in (train_loss_logger, test_loss_logger,
                       confusion_matrix_logger):
            if logger is not None:
                logger.close()
예제 #3
0
def _train_seq2seq_epoch(encoder_model, decoder_model, train_loader, criterion,
                         encoder_optimizer, decoder_optimizer, gpu_mode,
                         gru_layers, hidden_size, epoch, train_loss_logger):
    """
    Run one training pass over train_loader, one optimizer step per batch.

    :param encoder_model: Model called as ``encoder(images, encoder_hidden)``
    :param decoder_model: Model called once per sequence position with a
                          one-hot position tensor and the encoder outputs
    :param train_loader: Iterable yielding ``(images, labels)`` batches
    :param criterion: Loss applied to each decoder output and label column
    :param encoder_optimizer: Optimizer for the encoder parameters
    :param decoder_optimizer: Optimizer for the decoder parameters
    :param gpu_mode: If true the batch and hidden state are moved to GPU
    :param gru_layers: Number of GRU layers the encoder was built with
    :param hidden_size: GRU hidden size the encoder was built with
    :param epoch: Zero-based epoch index, logged as ``epoch + 1``
    :param train_loss_logger: Open file receiving "epoch,batch,avg_loss"
                              rows, or None to skip logging
    :return: None
    """
    total_loss = 0
    total_images = 0
    batch_no = 1

    # make sure the model is in train mode. BN is different in train and eval.
    encoder_model.train()
    decoder_model.train()

    with tqdm(total=len(train_loader), desc='Loss', leave=True,
              ncols=100) as progress_bar:
        for images, labels in train_loader:
            if gpu_mode:
                images = images.cuda()
                labels = labels.cuda()

            current_batch_size = images.size(0)
            # The decoder is queried once per index along dimension 2.
            total_seq_length = images.size(2)

            encoder_hidden = torch.zeros(current_batch_size,
                                         gru_layers * 2,
                                         hidden_size,
                                         dtype=torch.float)
            if gpu_mode:
                encoder_hidden = encoder_hidden.cuda()

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            context_vector, hidden_encoder = encoder_model(
                images, encoder_hidden)

            loss = 0
            for seq_index in range(total_seq_length):
                y = labels[:, seq_index]

                # One-hot row per sample marking the position being decoded.
                attention_index_onehot = torch.zeros(current_batch_size,
                                                     total_seq_length,
                                                     dtype=torch.float)
                attention_index_onehot[:, seq_index] = 1

                output_dec, _, _ = decoder_model(
                    attention_index_onehot,
                    context_vector=context_vector,
                    encoder_hidden=hidden_encoder)
                # The loss is summed over all positions before one backward pass.
                loss += criterion(output_dec, y)

            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            total_loss += loss.item()
            total_images += labels.size(0)

            # update the progress bar
            avg_loss = (total_loss / total_images) if total_images else 0
            progress_bar.set_description("Loss: " + str(avg_loss))
            if train_loss_logger is not None:
                train_loss_logger.write(
                    str(epoch + 1) + "," + str(batch_no) + "," +
                    str(avg_loss) + "\n")
            progress_bar.refresh()
            progress_bar.update(1)
            batch_no += 1


def train(train_file, test_file, batch_size, epoch_limit, gpu_mode,
          num_workers, retrain_model, retrain_model_path, gru_layers,
          hidden_size, encoder_lr, encoder_decay, decoder_lr, decoder_decay,
          model_dir, stats_dir, train_mode):
    """
    Train the encoder/decoder pair, evaluating with ``test`` after each epoch.

    :param train_file: Passed to SequenceDataset to build the training set
    :param test_file: Passed to ``test`` for the per-epoch evaluation
    :param batch_size: Batch size for training and evaluation
    :param epoch_limit: Number of epochs to run; when retraining in train
                        mode it is added to the epoch count from the checkpoint
    :param gpu_mode: If true the models, loss and batches are moved to GPU
    :param num_workers: Number of workers for data loading
    :param retrain_model: If true, resume from retrain_model_path
    :param retrain_model_path: Path to a saved checkpoint
    :param gru_layers: Number of GRU layers (replaced by the checkpoint's
                       value when retraining)
    :param hidden_size: GRU hidden size (replaced by the checkpoint's value
                        when retraining)
    :param encoder_lr: Learning rate for the encoder's Adam optimizer
    :param encoder_decay: Weight decay for the encoder's Adam optimizer
    :param decoder_lr: Learning rate for the decoder's Adam optimizer
    :param decoder_decay: Weight decay for the decoder's Adam optimizer
    :param model_dir: String prefix of the checkpoint files written per epoch
    :param stats_dir: String prefix of the three log files
    :param train_mode: If true, write logs and checkpoints. Otherwise nothing
                       is written and training stops early once ``epoch + 1 >= 2``
                       with accuracy below 90 (hyperband setup).
    :return: ``(encoder_model, decoder_model, encoder_optimizer,
             decoder_optimizer, stats)`` where stats holds the latest 'loss'
             and 'accuracy' plus the per-epoch lists 'loss_epoch' and
             'accuracy_epoch'. The process exits with status 1 if
             retrain_model_path is not a file.
    """
    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    # Everything runs inside try/finally so the log files are closed on every
    # exit path, including sys.exit and the early-stopping return.
    try:
        if train_mode is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)

        if retrain_model is True:
            if not os.path.isfile(retrain_model_path):
                sys.stderr.write(
                    TextColor.RED +
                    "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                    TextColor.END)
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" +
                             TextColor.END)
            encoder_model, decoder_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_model_for_training(retrain_model_path,
                                                     input_channels=5,
                                                     seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                     num_classes=3)

            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" +
                             TextColor.END)
        else:
            encoder_model, decoder_model = ModelHandler.get_new_model(
                input_channels=5,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                seq_len=ImageSizeOptions.SEQ_LENGTH,
                num_classes=3)
            prev_ite = 0

        encoder_optimizer = torch.optim.Adam(encoder_model.parameters(),
                                             lr=encoder_lr,
                                             weight_decay=encoder_decay)
        decoder_optimizer = torch.optim.Adam(decoder_model.parameters(),
                                             lr=decoder_lr,
                                             weight_decay=decoder_decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" +
                             TextColor.END)
            encoder_optimizer, decoder_optimizer = ModelHandler.load_optimizer(
                encoder_optimizer, decoder_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" +
                             TextColor.END)

        if gpu_mode:
            encoder_model = torch.nn.DataParallel(encoder_model).cuda()
            decoder_model = torch.nn.DataParallel(decoder_model).cuda()

        class_weights = torch.FloatTensor(CLASS_WEIGHTS)
        # Loss
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        if gpu_mode is True:
            criterion = criterion.cuda()

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        # The last epoch reported below is epoch_limit (one-based), so that is
        # the value shown as the end of the run.
        sys.stderr.write(TextColor.PURPLE + 'Start: ' + str(start_epoch + 1) +
                         ' End: ' + str(epoch_limit) + "\n")
        for epoch in range(start_epoch, epoch_limit, 1):
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) +
                             "\n")

            _train_seq2seq_epoch(encoder_model, decoder_model, train_loader,
                                 criterion, encoder_optimizer,
                                 decoder_optimizer, gpu_mode, gru_layers,
                                 hidden_size, epoch, train_loss_logger)

            stats_dictioanry = test(test_file,
                                    batch_size,
                                    gpu_mode,
                                    encoder_model,
                                    decoder_model,
                                    num_workers,
                                    gru_layers,
                                    hidden_size,
                                    num_classes=3)
            stats['loss'] = stats_dictioanry['loss']
            stats['accuracy'] = stats_dictioanry['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            # update the loggers
            if train_mode is True:
                # save the model after each epoch
                save_best_model(
                    encoder_model, decoder_model, encoder_optimizer,
                    decoder_optimizer, hidden_size, gru_layers, epoch,
                    model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(
                    str(epoch + 1) + "," + str(stats['loss']) + "," +
                    str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(
                    str(epoch + 1) + "\n" +
                    str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            else:
                # this setup is for hyperband
                if epoch + 1 >= 2 and stats['accuracy'] < 90:
                    sys.stderr.write(
                        TextColor.PURPLE +
                        'EARLY STOPPING AS THE MODEL NOT DOING WELL\n' +
                        TextColor.END)
                    return encoder_model, decoder_model, encoder_optimizer, decoder_optimizer, stats

        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return encoder_model, decoder_model, encoder_optimizer, decoder_optimizer, stats
    finally:
        for logger in (train_loss_logger, test_loss_logger,
                       confusion_matrix_logger):
            if logger is not None:
                logger.close()