def main():
    """Train a character-level RNN language model and save its weights.

    Relies on module-level globals: ``RNN`` (model class), ``chars``
    (vocabulary string), ``book`` (training text), ``getSequence``
    (random excerpt sampler) and ``test_words`` (sample-text generator).
    Saves the trained state dict to 'trainedBook_v2.pt'.
    """
    epochs = 301
    seq_batch_size = 200  # characters per training sequence
    print_every = 100     # progress-report interval, in epochs
    iscuda = False

    # Create our network, optimizer and loss function.
    net = RNN(len(chars), 100, 150, 2)  # instantiate an RNN object
    optim = torch.optim.Adam(net.parameters(), lr=6e-4)
    loss_func = torch.nn.functional.nll_loss

    if iscuda:
        net = net.cuda()

    # Main training loop.
    for epoch in range(epochs):
        dat = getSequence(book, seq_batch_size)
        # Find the corresponding index for each character and store in a tensor.
        dat = torch.LongTensor([chars.find(item) for item in dat])

        # Inputs are chars [0, n-1]; targets are the next chars [1, n].
        x_t = dat[:-1]
        y_t = dat[1:]
        hidden = net.init_hidden()
        if iscuda:
            x_t, y_t, hidden = x_t.cuda(), y_t.cuda(), hidden.cuda()

        # Forward pass — call the module itself (not .forward) so that
        # nn.Module hooks run.
        logprob, hidden = net(x_t, hidden)
        loss = loss_func(logprob, y_t)

        # Backward pass and parameter update.
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Print the loss (and a generated sample) every kth iteration;
        # .item() avoids printing the full tensor repr.
        if epoch % print_every == 0:
            print('*' * 60)
            print('\n epoch {}, loss:{} \n'.format(epoch, loss.item()))
            print('sample speech:\n', test_words(net, chars, seq_batch_size))

    torch.save(net.state_dict(), 'trainedBook_v2.pt')
示例#2
0
# Train/validation loaders share one dataset; the samplers select disjoint
# index subsets, so shuffle must stay False (the sampler controls ordering).
loader_train = DataLoader(train_set, batch_size=BATCH_SIZE, sampler=train_sampler,
                          shuffle=False, num_workers=4)

loader_val = DataLoader(train_set, batch_size=BATCH_SIZE, sampler=validation_sampler,
                        shuffle=False, num_workers=4)

print("Train Size: ", len(loader_train.sampler.indices))
print("Validation Size: ", len(loader_val.sampler.indices))

# One output unit per encoded label class.
num_classes = len(train_set.label_encoder.classes_)
model = RNN(embeddings, num_classes=num_classes, **_hparams)

# Per-class loss weights to counter label imbalance.
# NOTE: helper name is spelled "class_weigths" elsewhere in this codebase.
weights = class_weigths(train_set.labels)

if torch.cuda.is_available():
    model.cuda()
    weights = weights.cuda()

criterion = torch.nn.CrossEntropyLoss(weight=weights)

# Optimize only trainable parameters (e.g. frozen embeddings are excluded).
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=lr)

#############################################################
# Train
#############################################################

best_val_loss = None

# Column headers for the loss / accuracy tracking tables built later.
colms_l = ['Train_Loss', 'Val_Loss']
colms_acc = ['Train_Acc', 'Val_Acc']
示例#3
0
def main(argv):
    """End-to-end training driver for an RNN on visit-sequence data.

    Parses CLI arguments, loads pickled train/valid/test sequences and
    labels, trains with SGD + ReduceLROnPlateau, tracks the best validation
    loss, and evaluates AUROC/AUPR on the test set whenever validation
    improves. Results and model checkpoints are written under ``args.save``.
    """
    global args
    args = parser.parse_args(argv)
    if args.threads == -1:
        # Leave one core free; `or 1` guards the single-core case.
        args.threads = torch.multiprocessing.cpu_count() - 1 or 1
    print('===> Configuration')
    print(args)

    cuda = args.cuda
    if cuda:
        if torch.cuda.is_available():
            print('===> {} GPUs are available'.format(
                torch.cuda.device_count()))
        else:
            raise Exception("No GPU found, please run with --no-cuda")

    # Fix the random seed for reproducibility
    # random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    # Data loading.
    # NOTE(review): pickle.load is unsafe on untrusted files — these are
    # assumed to be locally generated artifacts.
    print('===> Loading entire datasets')
    with open(args.data_path + 'train.seqs', 'rb') as f:
        train_seqs = pickle.load(f)
    with open(args.data_path + 'train.labels', 'rb') as f:
        train_labels = pickle.load(f)
    with open(args.data_path + 'valid.seqs', 'rb') as f:
        valid_seqs = pickle.load(f)
    with open(args.data_path + 'valid.labels', 'rb') as f:
        valid_labels = pickle.load(f)
    with open(args.data_path + 'test.seqs', 'rb') as f:
        test_seqs = pickle.load(f)
    with open(args.data_path + 'test.labels', 'rb') as f:
        test_labels = pickle.load(f)

    # Largest code index across all splits; +1 gives the feature count
    # (codes presumably 0-indexed — TODO confirm).
    max_code = max(
        map(lambda p: max(map(lambda v: max(v), p)),
            train_seqs + valid_seqs + test_seqs))
    num_features = max_code + 1

    print("     ===> Construct train set")
    train_set = VisitSequenceWithLabelDataset(train_seqs,
                                              train_labels,
                                              num_features,
                                              reverse=False)
    print("     ===> Construct validation set")
    valid_set = VisitSequenceWithLabelDataset(valid_seqs,
                                              valid_labels,
                                              num_features,
                                              reverse=False)
    print("     ===> Construct test set")
    test_set = VisitSequenceWithLabelDataset(test_seqs,
                                             test_labels,
                                             num_features,
                                             reverse=False)

    # Only the training loader shuffles; eval loaders use a (possibly larger)
    # eval batch size.
    train_loader = DataLoader(dataset=train_set,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=visit_collate_fn,
                              num_workers=args.threads)
    valid_loader = DataLoader(dataset=valid_set,
                              batch_size=args.eval_batch_size,
                              shuffle=False,
                              collate_fn=visit_collate_fn,
                              num_workers=args.threads)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=args.eval_batch_size,
                             shuffle=False,
                             collate_fn=visit_collate_fn,
                             num_workers=args.threads)
    print('===> Dataset loaded!')

    # Create model
    print('===> Building a Model')

    model = RNN(dim_input=num_features, dim_emb=128, dim_hidden=128)

    if cuda:
        model = model.cuda()
    print(model)
    print('===> Model built!')

    # Class weights for binary cross-entropy: each class is weighted by the
    # prevalence of the *other* class, offsetting label imbalance
    # (mean of 0/1 labels == fraction of positives).
    weight_class0 = torch.mean(torch.FloatTensor(train_set.labels))
    weight_class1 = 1.0 - weight_class0
    weight = torch.FloatTensor([weight_class0, weight_class1])

    criterion = nn.CrossEntropyLoss(weight=weight)
    if args.cuda:
        criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=False,
                                weight_decay=args.weight_decay)
    # Reduce the learning rate when validation loss plateaus.
    scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_valid_epoch = 0
    best_valid_loss = sys.float_info.max

    train_losses = []
    valid_losses = []

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    for ei in trange(args.epochs, desc="Epochs"):
        # Train
        _, _, train_loss = rnn_epoch(train_loader,
                                     model,
                                     criterion=criterion,
                                     optimizer=optimizer,
                                     train=True)
        train_losses.append(train_loss)

        # Eval
        _, _, valid_loss = rnn_epoch(valid_loader, model, criterion=criterion)
        valid_losses.append(valid_loss)

        scheduler.step(valid_loss)

        is_best = valid_loss < best_valid_loss

        if is_best:
            best_valid_epoch = ei
            best_valid_loss = valid_loss

            # Evaluate on the test set only when validation improves, and
            # snapshot results + model at this best-so-far epoch.
            test_y_true, test_y_pred, test_loss = rnn_epoch(
                test_loader, model, criterion=criterion)

            if args.cuda:
                test_y_true = test_y_true.cpu()
                test_y_pred = test_y_pred.cpu()

            # Column 1 of the predictions is the positive-class score.
            test_auc = roc_auc_score(test_y_true.numpy(),
                                     test_y_pred.numpy()[:, 1],
                                     average="weighted")
            test_aupr = average_precision_score(test_y_true.numpy(),
                                                test_y_pred.numpy()[:, 1],
                                                average="weighted")

            # Overwrite the result file each time a new best is found.
            with open(args.save + 'train_result.txt', 'w') as f:
                f.write('Best Validation Epoch: {}\n'.format(ei))
                f.write('Best Validation Loss: {}\n'.format(valid_loss))
                f.write('Train Loss: {}\n'.format(train_loss))
                f.write('Test Loss: {}\n'.format(test_loss))
                f.write('Test AUROC: {}\n'.format(test_auc))
                f.write('Test AUPR: {}\n'.format(test_aupr))

            # Save both the full module and its state dict.
            torch.save(model, args.save + 'best_model.pth')
            torch.save(model.state_dict(), args.save + 'best_model_params.pth')

        # plot
        if args.plot:
            plt.figure(figsize=(12, 9))
            plt.plot(np.arange(len(train_losses)),
                     np.array(train_losses),
                     label='Training Loss')
            plt.plot(np.arange(len(valid_losses)),
                     np.array(valid_losses),
                     label='Validation Loss')
            plt.xlabel('epoch')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.tight_layout()
            plt.savefig(args.save + 'loss_plot.eps', format='eps')
            plt.close()

    # NOTE(review): test_loss/test_auc/test_aupr (and train_loss) are only
    # defined if at least one epoch ran and validation improved at least
    # once; otherwise these prints raise NameError — confirm acceptable.
    print('Best Validation Epoch: {}\n'.format(best_valid_epoch))
    print('Best Validation Loss: {}\n'.format(best_valid_loss))
    print('Train Loss: {}\n'.format(train_loss))
    print('Test Loss: {}\n'.format(test_loss))
    print('Test AUROC: {}\n'.format(test_auc))
    print('Test AUPR: {}\n'.format(test_aupr))
示例#4
0
    params = {'LR': LR, 'VOCAB_SIZE': VOCAB_SIZE, 'NO_WORD_EMBEDDINGS': NO_WORD_EMBEDDINGS, 'HIDDEN_SIZE': HIDDEN_SIZE,
              'BATCH_SIZE': BATCH_SIZE, 'NUM_LAYERS': NUM_LAYERS, 'train_imagepaths_and_captions': train_imagepaths_and_captions,
              'val_imagepaths_and_captions': val_imagepaths_and_captions, 'pretrained_cnn_dir': pretrained_cnn_dir,
              'pretrained_word_embeddings_file': pretrained_word_embeddings_file, 'transform_train': transform_train, 
              'transform_val': transform_val, 'WEIGHT_DECAY': WEIGHT_DECAY, 'ADAM_FLAG': ADAM_FLAG, 'RNN_DROPOUT':RNN_DROPOUT,
              'CNN_DROPOUT': CNN_DROPOUT, 'GRAD_CLIP': GRAD_CLIP}


    print('Initializing models...')
    encoder = CNN(NO_WORD_EMBEDDINGS, pretrained_cnn_dir, freeze=True, dropout_prob=CNN_DROPOUT, model_name='resnet152')
    decoder = RNN(VOCAB_SIZE, NO_WORD_EMBEDDINGS, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS,
                  pre_trained_file=pretrained_word_embeddings_file, freeze=False, dropout_prob=RNN_DROPOUT)
    params['encoder'] = encoder
    params['decoder'] = decoder
    encoder.cuda()
    decoder.cuda()

    print('Initializing optimizer...')
    model_paras = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(model_paras, lr=LR, weight_decay=WEIGHT_DECAY)
    params['optimizer'] = optimizer


    pickle.dump(params, open(init_params_file, 'wb'))


# Initialize training-progress accumulators (epoch/batch counters start at 1).
current_epoch = 1
batch_step_count = 1
time_used_global = 0.0  # cumulative training time — presumably seconds; verify against caller
checkpoint = 1  # index of the next checkpoint to write
示例#5
0
class TextClassifier:
    """Trains and evaluates an RNN text classifier on DataReader splits."""

    def __init__(self, batch_size, iterations, initial_lr, hidden_size,
                 dropout, kernel_sz, num_layers):
        """Build data iterators and the RNN model.

        Args:
            batch_size: mini-batch size for all three splits.
            iterations: number of training passes (epochs).
            initial_lr: SGD learning rate.
            hidden_size: RNN hidden dimension.
            dropout: dropout probability.
            kernel_sz: kept for interface compatibility (used only by a
                removed CNN variant).
            num_layers: number of stacked RNN layers.
        """
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')

        self.data = DataReader()
        train_iter, val_iter, test_iter = self.data.init_dataset(
            batch_size, ('cuda:0' if self.use_cuda else 'cpu'))
        self.train_batch_loader = BatchGenerator(train_iter, 'text', 'label')
        self.val_batch_loader = BatchGenerator(val_iter, 'text', 'label')
        self.test_batch_loader = BatchGenerator(test_iter, 'text', 'label')

        # Store hyperparameters
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr

        # Create the model; the embedding matrix comes from pretrained vectors.
        emb_size, emb_dim = self.data.TEXT.vocab.vectors.size()

        self.model = RNN(emb_size=emb_size,
                         emb_dimension=emb_dim,
                         pretrained_emb=self.data.TEXT.vocab.vectors,
                         output_size=len(self.data.LABEL.vocab),
                         num_layers=num_layers,
                         hidden_size=hidden_size,
                         dropout=dropout)

        if self.use_cuda:
            self.model.cuda()

    def train(self, min_stride=3):
        """Run the full training loop.

        Returns:
            Tuple of (train_loss_hist, train_acc_hist, val_loss_hist,
            val_acc_hist, test_acc_hist). Test accuracy is measured each
            time validation accuracy improves; the final value is appended
            to test_acc_hist after training.
        """
        train_loss_hist = []
        val_loss_hist = []
        train_acc_hist = []
        val_acc_hist = []
        test_acc_hist = []

        best_score = 0.0
        # Fix: define test metrics up front so the append after the loop
        # cannot raise UnboundLocalError when validation never improves.
        test_loss, test_acc = 0.0, 0.0

        # Fix: create the optimizer once instead of re-instantiating it every
        # epoch (plain SGD without momentum is stateless, so the parameter
        # updates are identical).
        optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

        for itr in range(self.iterations):
            print("\nIteration: " + str(itr + 1))
            self.model.train()
            total_loss = 0.0
            total_acc = 0.0
            steps = 0

            data_iter = iter(self.train_batch_loader)

            for i in range(len(self.train_batch_loader)):
                ((x_batch, x_len_batch), y_batch) = next(data_iter)

                optimizer.zero_grad()

                # The model computes its own loss: returns (loss, logits).
                loss, logits = self.model.forward(x_batch, y_batch)

                # Count of correct predictions in this batch.
                acc = torch.sum(torch.argmax(logits, dim=1) == y_batch)

                total_loss += loss.item()
                total_acc += acc.item()
                steps += 1

                loss.backward()
                optimizer.step()

            train_loss_hist.append(total_loss / steps)
            train_acc_hist.append(total_acc / len(self.data.train_data))

            val_loss, val_acc = self.eval_model(self.val_batch_loader,
                                                len(self.data.val_data))

            val_loss_hist.append(val_loss)
            val_acc_hist.append(val_acc)

            # Evaluate on the test set only when validation accuracy improves.
            if val_acc > best_score:
                best_score = val_acc
                test_loss, test_acc = self.eval_model(self.test_batch_loader,
                                                      len(self.data.test_data))

            print("Train: {Loss: " + str(total_loss / steps) + ", Acc: " +
                  str(total_acc / len(self.data.train_data)) + " }")
            print("Val: {Loss: " + str(val_loss) + ", Acc: " + str(val_acc) +
                  " }")

        test_acc_hist.append(test_acc)

        return train_loss_hist, train_acc_hist, val_loss_hist, val_acc_hist, test_acc_hist

    def eval_model(self, batch_loader, N, min_stride=3):
        """Evaluate the model over a loader without gradient tracking.

        Args:
            batch_loader: BatchGenerator to iterate.
            N: number of examples; used as the normalizing denominator for
               both loss and accuracy.
            min_stride: unused; kept for interface compatibility.

        Returns:
            (mean_loss, accuracy) averaged over N examples.
        """
        self.model.eval()

        total_loss = 0.0
        total_acc = 0.0

        batch_iterator = iter(batch_loader)

        with torch.no_grad():
            for i in range(len(batch_loader)):
                ((x_batch, x_len_batch), y_batch) = next(batch_iterator)

                loss, logits = self.model(x_batch, y_batch)

                acc = torch.sum(torch.argmax(logits, dim=1) == y_batch)

                total_loss += loss.item()
                total_acc += acc.item()

        return (total_loss / N), (total_acc / N)
示例#6
0
def main(args):
    """Train a CNN-encoder + RNN-decoder image-captioning model on COCO.

    Builds train/val loaders, optionally resumes from a checkpoint, trains
    with cross-entropy over packed caption targets, logs validation loss and
    sample captions every ``log_step`` steps, and checkpoints every
    ``save_step`` steps (plus a final save on exit).

    NOTE(review): ``volatile=True`` and ``tensor.data[0]`` are pre-0.4
    PyTorch idioms; this script presumably targets an old torch version —
    confirm before upgrading.
    """
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image Preprocessing (ImageNet mean/std normalization).
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # load COCOs dataset
    IMAGES_PATH = 'data/train2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'

    vocab = load_vocab()
    train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                        json=CAPTION_FILE_PATH,
                                        vocab=vocab,
                                        transform=transform,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=num_workers)

    IMAGES_PATH = 'data/val2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                      json=CAPTION_FILE_PATH,
                                      vocab=vocab,
                                      transform=transform,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 1e-3
    num_epochs = 3
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNN(embed_size)
    decoder = RNN(embed_size,
                  num_hiddens,
                  len(vocab),
                  1,
                  rec_unit=args.rec_unit)

    # Loss
    criterion = nn.CrossEntropyLoss()

    # Resume from a checkpoint if one is given; otherwise optimize the
    # decoder plus only the encoder's trainable head (linear + batchnorm).
    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Sampling-only mode: generate captions and exit.
    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the Models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):

            for step, (images, captions,
                       lengths) in enumerate(train_loader, start=initial_step):

                # Set mini-batch dataset
                images = utils.to_var(images, volatile=True)
                captions = utils.to_var(captions)
                # Flatten padded captions into one packed target vector.
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # run on multiple GPU
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    outputs = nn.parallel.data_parallel(
                        decoder, features, range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)

                train_loss = criterion(outputs, targets)
                losses_train.append(train_loss.data[0])
                train_loss.backward()
                optimizer.step()

                # Run validation set and predict
                if step % log_step == 0:
                    # Freeze batchnorm statistics while validating.
                    encoder.batchnorm.eval()
                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions,
                                   lengths) in enumerate(val_loader):
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)

                        targets = pack_padded_sequence(captions,
                                                       lengths,
                                                       batch_first=True)[0]
                        features = encoder(images)
                        outputs = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        batch_loss_val.append(val_loss.data[0])

                    losses_val.append(np.mean(batch_loss_val))

                    # Predict a caption from the last validation batch's
                    # features and show it against the ground truth.
                    sampled_ids = decoder.sample(features)
                    sampled_ids = sampled_ids.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(sampled_ids, vocab)
                    print('Sample:', sentence)

                    true_ids = captions.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(true_ids, vocab)
                    print('Target:', sentence)

                    print(
                        'Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}'
                        .format(epoch, step, losses_train[-1], losses_val[-1]))
                    # Restore training mode for batchnorm.
                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step, epoch,
                                      losses_train, losses_val, checkpoint_dir)
                    utils.dump_losses(
                        losses_train, losses_val,
                        os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        # NOTE(review): if interrupted before the first batch completes,
        # `step`/`epoch` are undefined here and this raises NameError.
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
示例#7
0
def main(args):
    """Generate captions for every image under ``args.image_path``.

    Loads a CNN encoder and RNN decoder from ``args.checkpoint_file``,
    beam-searches a caption per image, and writes the results (COCO
    submission format) to 'captions_model.json' — even on interrupt.
    """
    # Hyperparameters.
    num_workers = 2

    # Same preprocessing pipeline the model was trained with.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    vocab = load_vocab()

    data_loader = get_basic_loader(dir_path=os.path.join(args.image_path),
                                   transform=preprocess,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   num_workers=num_workers)

    # Build the models.
    checkpoint_path = 'checkpoints'
    encoder = CNN(args.embed_size)
    decoder = RNN(args.embed_size,
                  args.num_hidden,
                  len(vocab),
                  1,
                  rec_unit=args.rec_unit)

    # Restore trained weights.
    encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
        args.checkpoint_file)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Caption every batch of images.
    try:
        results = []
        with torch.no_grad():
            for step, (images, image_ids) in enumerate(tqdm(data_loader)):
                images = utils.to_var(images)

                features = encoder(images)
                generated = beam_sample(decoder, features)
                # generated = decoder.sample(features)
                id_arrays = generated.cpu().data.numpy()
                texts = [
                    utils.convert_back_to_text(ids, vocab) for ids in id_arrays
                ]
                batch_entries = [{
                    'image_id': int(img_id),
                    'caption': text
                } for img_id, text in zip(image_ids, texts)]
                results.extend(batch_entries)
                print('Sample:', batch_entries[0])
    except KeyboardInterrupt:
        print('Ok bye!')
    finally:
        # Persist whatever was captioned so far.
        import json
        file_name = 'captions_model.json'
        with open(file_name, 'w') as out_file:
            json.dump(results, out_file)
示例#8
0
def prepare_datasets():
    """
    Prepares the training, validation and test (Kaggle) datasets used by the XGBoost model \n
    This function looks into the XGBOOST_CONFIG dictionary in config.py for the following information: \n
    * which embedding NN to use (looks for the .pth checkpoint in XGBOOST_CONFIG['embedder'])
    * how to extract the embedding: XGBOOST_CONFIG['embedding_use_hidden', 'embedding_use_output', 'embedding_size']
    * how many numeric variables to add as input
    * where to dump the prepared .npy files: XGBOOST_CONFIG['train_file', 'val_file', 'test_file']
    """
    # Load the pretrained embedding network, switch it to eval mode, and move
    # it to the GPU (this function assumes CUDA is available).
    checkpoint = torch.load(XGBOOST_CONFIG['embedder'])
    embed = RNN(config=checkpoint['net_config']).eval()
    embed.load_state_dict(checkpoint['model'])
    embed = embed.cuda()

    annotated_dataset = TweetDataset(dataset_type='all')
    test_dataset = TweetDataset(dataset_type='test')

    def get_data(dataset, message):
        # Build an (N, numeric + embedding + 1) matrix; last column = target.
        N = len(dataset)
        data = np.zeros((N, XGBOOST_CONFIG['numeric_data_size'] +
                         XGBOOST_CONFIG['embedding_size'] + 1))  # 1 for answer
        loader = DataLoader(dataset,
                            batch_size=TRAIN_CONFIG['batch_size'],
                            num_workers=TRAIN_CONFIG['workers'],
                            collate_fn=collate_function,
                            shuffle=False)
        current_idx = 0
        n = len(loader)
        print('')
        for batch_index, batch in enumerate(loader):
            printProgressBar(batch_index, n, prefix=message)
            batch_size = batch['numeric'].shape[0]

            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            # Three embedding-extraction strategies, chosen by config:
            if XGBOOST_CONFIG['embedding_use_hidden']:
                # 1) take the hidden state (second output) of the network
                embedding = embed(
                    text,
                    numeric[:, :checkpoint['net_config']['numeric_data_size']]
                )[1]
            elif XGBOOST_CONFIG['embedding_use_output']:
                # 2) take the network output; exp(x) - 1 presumably inverts a
                #    log1p transform applied at training time — TODO confirm
                embedding = torch.exp(
                    embed(
                        text, numeric[:, :checkpoint['net_config']
                                      ['numeric_data_size']])[0]) - 1
            else:  # expecting a built-in embedding layer -> taking the mean of the embeddings
                embedding = embed.emb(text).mean(axis=1)

            # Write this batch's rows: numeric block, embedding block, target.
            data[current_idx:current_idx+batch_size, XGBOOST_CONFIG['numeric_data_size']:-1] = \
                embedding.detach().cpu().numpy()
            data[current_idx:current_idx+batch_size, :XGBOOST_CONFIG['numeric_data_size']] = \
                numeric.detach().cpu().numpy()
            data[current_idx:current_idx + batch_size,
                 -1] = batch['target'].numpy()

            current_idx += batch_size

        return data

    annotated_data = get_data(annotated_dataset, "Preparing train.csv ...")
    split = int(len(annotated_dataset) * DATASET_CONFIG['train_percent'])
    # NOTE(review): the train slice starts at 1, so the first row is dropped —
    # confirm this is intentional and not an off-by-one.
    np.save(XGBOOST_CONFIG['train_file'], annotated_data[1:split])
    np.save(XGBOOST_CONFIG['val_file'], annotated_data[split:])

    test_data = get_data(test_dataset, "Preparing evaluation.csv ...")
    # Read the tweet ids from the test csv (skipping the header row) so the
    # prepared test matrix can be matched back to submission rows.
    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        ids = [line[0] for line in list(csv.reader(csvfile))[1:]]

    # Appending string ids upcasts the whole array; saved as-is to test_file.
    ids = np.array(ids).reshape(np.shape(ids)[0], 1)
    prepared_test_data = np.concatenate((test_data, ids), axis=1)
    np.save(XGBOOST_CONFIG['test_file'], prepared_test_data)