Example #1
def main():
    if len(sys.argv) <= 2:
        print(
            "please specify a number of examples and a model name (e.g. models.baseline.random_handle)"
        )
        sys.exit(1)

    eval_set_size = int(sys.argv[1])
    module_name = sys.argv[2]

    # splitting training data
    print("splitting training data into", eval_set_size,
          "(test) v. rest (train)")
    data.load_train()
    tweets = np.array(data.TRAIN)
    np.random.seed(SEED)
    np.random.shuffle(tweets)
    test_tweets, train_tweets = tweets[:eval_set_size], tweets[eval_set_size:]

    hyper_parameters = models.parse_hyper_parameters(sys.argv[3:])
    model_class = importlib.import_module(module_name).Model
    print("Model:", module_name, hyper_parameters)

    print("Training...")
    model = model_class(tqdm(train_tweets, dynamic_ncols=True),
                        **hyper_parameters)
    print("Evaluating...")
    accuracy, correct, tests = eval.evaluate(
        model, tqdm(test_tweets, dynamic_ncols=True))
    print(f"Label accuracy: {correct}/{tests} ({accuracy:%})")
Example #2
def train(model: keras.Model):
    from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, \
        ReduceLROnPlateau
    from tensorflow.keras.optimizers import SGD
    model.compile(optimizer=SGD(learning_rate=0.1, momentum=0.9,
                                nesterov=True),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    weight_path = 'weights/%s.h5' % model.name
    x_train, y_train = data.load_train('cifar10', channel_first=False)
    train_iter, val_iter = data.get_train_val_iterator(x_train, y_train)
    callbacks = [
        ReduceLROnPlateau(patience=10, min_lr=1e-3, verbose=1),
        ModelCheckpoint(weight_path,
                        verbose=1,
                        save_best_only=True,
                        save_weights_only=True),
        TensorBoard(),
    ]
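    # steps_per_epoch covers only the training fraction (1 - data.val_split) of the samples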
    steps_per_epoch = int(len(x_train) *
                          (1 - data.val_split)) // common.batch_size
    model.fit(train_iter,
              epochs=200,
              callbacks=callbacks,
              steps_per_epoch=steps_per_epoch,
              validation_data=val_iter)
Example #3
def train(options):
    attributes_train, labels_train = preprocess(load_train(),
                                                normalize=options.normalize)
    attributes_val, labels_val = preprocess(load_val(),
                                            normalize=options.normalize)
    n_attributes = attributes_train.shape[1]
    model = get_model(options, n_attributes)
    model.train(attributes_train, labels_train, attributes_val, labels_val)

    # save model
    if options.save_model is not None:
        model.save(options.save_model)

    # compute validation scores
    predictions_val = model.predict(attributes_val)
    return get_binary_class_scores(labels_val, predictions_val)
Example #4
from keras.optimizers import Adam, Adadelta
from keras.utils import to_categorical  # needed for the one-hot encoding below
from sklearn.model_selection import train_test_split
from keras import backend as K
from matplotlib import pyplot as plt

from data import load_train
from models.cnn_vgg import VGG

print(K.tensorflow_backend._get_available_gpus())

# Data properties
num_classes = 10
img_x, img_y = 64, 64

# Load training data
train_images, train_labels = load_train()
x_train, x_valid, y_train, y_valid = train_test_split(train_images, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

# Reshape and normalize images
x_train = x_train.reshape(x_train.shape[0], img_x, img_y, 1)
x_valid = x_valid.reshape(x_valid.shape[0], img_x, img_y, 1)
x_train = x_train.astype('float32')
x_valid = x_valid.astype('float32')
x_train /= 255.
x_valid /= 255.

# One-hot encode labels
y_train = to_categorical(y_train, num_classes)
y_valid = to_categorical(y_valid, num_classes)

print(f'Train images dim: {x_train.shape}')
Example #5
def plot_confusion_matrix(X, Y, figsize=(10, 6), cmap=plt.cm.Greens):
    Y_pred = model.predict(X)
    Y_pred = np.argmax(Y_pred, axis=1)
    Y_true = np.argmax(Y, axis=1)
    cm = confusion_matrix(Y_true, Y_pred)

    plt.figure(figsize=figsize)
    ax = sns.heatmap(cm, cmap=cmap, annot=True, square=True)
    ax.set_ylabel('Actual', fontsize=30)
    ax.set_xlabel('Predicted', fontsize=30)
    plt.show()
    

if __name__ == '__main__':
    train_data = load_train()
    X_train, Y_train = separate_train(train_data)
    X_train, Y_train = preprocess_input(X_train, Y_train)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=0.1,
                                                      random_state=SEED)
    
    # train_model(X_train, X_val, Y_train, Y_val)
    model = load_model('model.h5')

    # To test new data, load the data, separate the features from the labels
    # and preprocess the data. Then change X_val and Y_val to the desired data
    final_loss, final_accuracy = model.evaluate(X_val, Y_val, verbose=0)
    print('Final Loss: {:.4f}, Final Accuracy: {:.4f}'.format(
        final_loss, final_accuracy))
    
Example #6
def main():
    options = parse_arguments()
    functional_features, non_functional_features, normal_ff, normal_nff = split_features(load_train(), selected_attack_class=options.attack)
    nff_attributes, labels_mal = preprocess(non_functional_features, normalize=options.normalize)
    normal_attributes, labels_nor = preprocess(normal_nff, normalize=options.normalize)
    n_attributes = nff_attributes.shape[1]
    trainingset = (normal_attributes, nff_attributes, labels_nor, labels_mal)

    functional_features, non_functional_features, normal_ff, normal_nff = split_features(load_val(), selected_attack_class=options.attack)
    nff_attributes, labels_mal = preprocess(non_functional_features, normalize=options.normalize)
    normal_attributes, labels_nor = preprocess(normal_nff, normalize=options.normalize)
    n_attributes = nff_attributes.shape[1]
    validationset = (normal_attributes, nff_attributes, labels_nor, labels_mal)

    model = WGAN(options, n_attributes)
    model.train(trainingset, validationset)

    # save model
    if options.save_model is not None:
        save_model_directory = os.path.join(options.save_model, options.name)
        os.makedirs(save_model_directory, exist_ok=True)
        model.save(save_model_directory)
Example #7
            writer.writerow(review.__dict__)


def remove_diacritic(input):
    """
    Accept a unicode string, and return a normal string without any diacritical marks.
    input arguments:
        input: the string to strip accents from
    output arguments:
        the stripped input
    """
    return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')


if __name__ == "__main__":
    dataset = sys.argv[1]
    if dataset == 'train':
        reviews = data.load_train()
    elif dataset == 'test':
        reviews = data.load_test()
    else:
        raise ValueError('No dataset ' + dataset + ' found!')
    print "reviews loaded"
    reviews_dict_languages = split_by_language(reviews)

    for k, v in reviews_dict_languages.items():
        print(k)
        review_list = correct_spelling_and_stem(k, v)
        print("corrected and stemmed")
        save_reviews_to_csv(k, review_list, dataset)
        print("saved to csv")
Example #8
'''

from nltk.corpus import stopwords
from textblob import TextBlob
import nltk
from nltk import word_tokenize as wt
from nltk.stem import WordNetLemmatizer

from gensim import corpora, models, similarities
from gensim.models import LdaModel

stop = set(stopwords.words())

from data import load_train

sentences, label = load_train()

texts = [[word for word in document.lower().split() if word not in stop]
         for document in sentences]

dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

topic_num = 100

lda = LdaModel(corpus, num_topics=topic_num, eval_every=5,
               passes=200)  # train model

max_num = topic_num + 10
Example #9
    print(title)
    start = time.time()
    pipeline.fit(x, y)
    train_time = time.time()
    print(f'Training time: {train_time-start}')
    print(f'Training accuracy: {pipeline.score(x, y)}')
    print(f'Validation accuracy: {pipeline.score(x_v, y_v)}')
    print(f'Scoring time: {time.time()-train_time}')
    save_model(pipeline, f'{model_name}.joblib')


if __name__ == '__main__':
    print('Loading data...')
    start = time.time()
    max_features = None
    data_train = load_train()
    train, validation = train_test_split(data_train,
                                         test_size=0.3,
                                         random_state=42)

    x, y = zip(*train)
    x_v, y_v = zip(*validation)
    print(f'Time to load data: {time.time()-start}')
    print(f'Training with max_features: {max_features}')

    logreg_unigram_tfidf = logreg_bigram_tfidf = True
    sgd_unigram_tfidf = sgd_bigram_tfidf = True
    linsvc_unigram_tfidf = linsvc_bigram_tfidf = True
    nb_unigram = True
    # logreg_unigram_tfidf = False
    # logreg_bigram_tfidf = False
    # sgd_unigram_tfidf = False
    # sgd_bigram_tfidf = False
    # linsvc_unigram_tfidf = False
Example #10
import data
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from data import load_train
from data import load_test
from matplotlib import pyplot as plt
from create_submission import write_predictions_to_csv
import math

#Loading training set
tot_train = load_train()

#Extracting the reviews' contents
tot_x_content_train = [review.content for review in tot_train][:196539]
#Creating the labels for the training set, taking them directly from the data
tot_y_train = np.array([review.rating for review in tot_train])[:196539]

import nltk
nltk.download('stopwords')  # fetch just the stopword corpus instead of the interactive downloader
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
print(stop_words)

# Add words that are very common in hotel reviews to the stop-word list.
stop_words.extend([
    u'hotel', u'hotels', u'room', u'rooms', u'night', u'nights', u'location',
    u'bed', u'beds', u'place', u'breakfast', u'position', u'station', u'stay',
    u'stayed', u'staff', u'accomodation', u'accommodations', u'during',
Example #11
            pass

    if len(vecs) == 0:
        printv('Warning: entirely OOV tweet, zeroing...')
        return np.zeros(300)

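    # a tweet embedding is the mean of its in-vocabulary word vectors (300-dim)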
    return np.mean(vecs, axis=0)


if __name__ == '__main__':
    print('Loading word2vec...')
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC,
                                                                 binary=True)

    print('Loading training data...')
    train = data.load_train()  #[:LIMIT]
    dic = data.load_dic()

    print('Computing tweet averages...')
    X = np.zeros(shape=(len(train), 300))
    y = np.zeros(shape=(len(train), ), dtype=int)
    for i, tweet in enumerate(train):
        X[i] = tweet_embedding_by_average(tweet[0], dic, embeddings)
        y[i] = tweet[1]

    print('Training the model...')
    clf = RandomForestClassifier(n_estimators=200, max_depth=10)
    clf.fit(X, y)

    print('Loading test data...')
    test = data.load_test()  #[:LIMIT]

Example #12
                batch_scores = sess.run(cnn.q_ap_cosine, feed_dict)
                for score, qid, label in zip(batch_scores, qids, labels):
                    scoreDict.setdefault(qid, [])
                    scoreDict[qid].append([score, label])
            lev1 = .0
            lev0 = .0
            for k, v in scoreDict.items():
                v.sort(key=operator.itemgetter(0), reverse=True)
                score, flag = v[0]
                if flag == '1':
                    lev1 += 1
                if flag == '0':
                    lev0 += 1
            # counts of correct and incorrect answers
            print('correct answers: ' + str(lev1))
            print('incorrect answers: ' + str(lev0))
            print('accuracy: ' + str(float(lev1)/(lev1+lev0)))

        # evaluate every 5000 steps
        evaluate_every = 5000
        # start training and evaluation
        sess.run(tf.global_variables_initializer())
        for i in range(config.num_epochs):
            for (_, x_batch_1, x_batch_2, x_batch_3) in data.load_train(config.batch_size, config.sequence_length, config.sequence_length):
                train_step(x_batch_1, x_batch_2, x_batch_3)
                if (i+1) % evaluate_every == 0:
                    print("\n测试{}:".format((i+1)/evaluate_every))
                    dev_step()
                    print()

Example #13
def main():
    logs = {
        'start-time': now(),
        'lock': LOCK,
        'num_workers': WORKERS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='logs/tmp_logs.txt', level=logging.WARNING)

    with Manager() as manager:
        logging.warning("{}:Loading Training Data...".format(now()))
        logging.warning("{}:FULL TEST {}".format(now(), FULL_TEST))
        logging.warning("{}:WORKERS {}".format(now(), WORKERS))
        logging.warning("{}:LOCK {}".format(now(), LOCK))

        val, train = data.load_train()
        train = manager.dict(train)
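        # weight-vector dimension = largest feature index in the training set + 1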
        dim = max([max(k) for k in train['features']]) + 1
        init_w = [0.0] * dim

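        # shared weight vector: with LOCK the workers synchronize updates through a lock,
        # otherwise they write to a RawArray without locking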
        if LOCK:
            lock = Lock()
            w = Array(c_double, init_w, lock=lock)
        else:
            w = RawArray(c_double, init_w)

        logs['start-compute-time'] = now()
        start_time = time()
        logging.warning("{}:Starting SGD...".format(
            logs['start-compute-time']))

        val_queue = Queue()
        workers = []
        for worker in range(WORKERS):
            p = Process(target=sgd, args=(worker, train, w, val_queue))
            p.start()
            workers.append(p)

        logs['epochs-stats'] = []

        # Initial early stopping variables
        persistence = [0.0] * PERSISTENCE
        smallest_val_loss = float('inf')
        workers_done = [False] * WORKERS
        while True:
            workers_alive = any([p.is_alive() for p in workers])
            if not workers_alive:
                logging.warning("{}:WORKERS DONE!".format(now()))
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(),
                    time() - start_time))
            if not workers_alive and val_queue.empty():
                logging.warning("{}:WORKERS DONE AND QUEUE EMPTY!".format(
                    now()))
                final_weights = w[:]
                break
            # Block until getting a message
            val_queue_item = val_queue.get()
            worker = val_queue_item['worker']
            epoch = val_queue_item['epoch']
            weights = val_queue_item['weights']

            val_loss = loss(val, weights)

            logging.warning("{}:EPOCH:{}".format(now(), epoch))
            logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))
            logs['epochs-stats'].append({
                'epoch_number': epoch,
                'val_loss': val_loss
            })

            # Early stopping criteria
            persistence[epoch % PERSISTENCE] = val_loss
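            # stop when the best loss so far beats every loss from the last PERSISTENCE epochs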
            if smallest_val_loss < min(persistence):
                # Early stop
                logging.warning("{}:EARLY STOP!".format(now()))
                # Terminate all workers, but save the weights before
                # because a worker could have a lock on them. Terminating
                # a worker doesn't release its lock.
                final_weights = w[:]
                for p in workers:
                    p.terminate()
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(),
                    time() - start_time))
                break
            else:
                smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss

        # Close queue
        val_queue.close()
        val_queue.join_thread()

        logging.warning("{}:Calculating Train Accuracy".format(now()))
        train_accuracy = accuracy(train, final_weights)
        logs['train_accuracy'] = train_accuracy
        logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

        # Calculate test accuracy
        logging.warning("{}:Calculating Test Accuracy".format(now()))
        test = data.load_test(FULL_TEST)
        test_accuracy = accuracy(test, final_weights)
        logs['test_accuracy'] = test_accuracy
        logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

        logs['end_time'] = now()
        with open(
                'logs/logs.w_{}.l_{}.e_{}.time_{}.json'.format(
                    WORKERS, LOCK, EPOCHS, logs['start-time']), 'w') as f:
            json.dump([logs], f)
Example #14
        emb_dropout=0.1)

    model = MoCo(dim=args.moco_dim,
                 K=args.moco_k,
                 m=args.moco_m,
                 T=args.moco_t,
                 ver=args.version,
                 arch=args.arch,
                 bn_splits=args.bn_splits,
                 symmetric=args.symmetric,
                 v3_encoder=vit).cuda()

    print(model)
    # exit(0)

    train_data, train_loader = load_train(args)
    memory_data, memory_loader = load_memory(args)
    test_data, test_loader = load_test(args)

    # define optimizer
    if args.version == 3:
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.wd)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=args.wd,
                                    momentum=0.9)

    # load model if resume
Example #15
def main(unused_argv):
    '''
    Start training and evaluation
    '''
    with tf.device('/gpu:0'), tf.Session(config=config.cf) as sess:
        # build the CNN network
        cnn = QACNN(config, sess)
        # save metrics data (TensorBoard summary writer)
        tf_writer = tf.summary.FileWriter(logdir=os.path.join(
            curdir, 'sdist/'),
                                          graph=sess.graph)
        # Summaries for loss and accuracy during training
        summary_loss = tf.summary.scalar("train/loss", cnn.loss)
        summary_accu = tf.summary.scalar("train/accuracy", cnn.accu)
        summary_op = tf.summary.merge([summary_loss, summary_accu])

        # training step function
        def train_step(x_batch_1, x_batch_2, x_batch_3):
            feed_dict = {
                cnn.q: x_batch_1,
                cnn.aplus: x_batch_2,
                cnn.aminus: x_batch_3,
                cnn.keep_prob: config.keep_prob
            }
            _, step, loss, accuracy, summaries = sess.run([
                cnn.train_op, cnn.global_step, cnn.loss, cnn.accu, summary_op
            ], feed_dict)
            tf_writer.add_summary(summaries, step)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(
                time_str, step, loss, accuracy))
            return time_str, step, loss, accuracy

        # evaluation step function
        def dev_step(step):
            # evaluate with a confusion matrix
            # http://www.uta.fi/sis/tie/tl/index/Rates.pdf
            quality = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}
            losses = []
            labels = []
            scores = []
            pbar = tqdm(config.test_data)
            pbar.set_description("evaluate step %s" % step)
            for x in pbar:
                _, loss, score = cnn.predict(
                    dict({
                        'question': x[1],
                        'utterance': x[2]
                    }), x[3])
                scores.append(score)
                losses.append(loss)
                labels.append(x[3])

            # derive the decision threshold from the ROC curve
            # http://alexkong.net/2013/06/introduction-to-auc-and-roc/
            fpr, tpr, th = metrics.roc_curve(labels, scores)
            threshold = round(metrics.auc(fpr, tpr), 5)

            # classify every example against the threshold and fill the confusion matrix
            for score, label in zip(scores, labels):
                if score >= threshold and label == 1:
                    quality['tp'] += 1
                elif score >= threshold and label == 0:
                    quality['fp'] += 1
                elif score < threshold and label == 1:
                    quality['fn'] += 1
                else:
                    quality['tn'] += 1

            accuracy = float(quality['tp'] + quality['tn']) / (
                quality['tp'] + quality['tn'] + quality['fp'] + quality['fn'])
            loss = tf.reduce_mean(losses).eval()
            tf_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag="evaluate/loss", simple_value=loss),
                    tf.Summary.Value(tag="evaluate/accuracy",
                                     simple_value=accuracy)
                ]), step)

            print('evaluation @ step %d: accuracy: %f, loss: %s, threshold: %f' %
                  (step, accuracy, loss, threshold))

        # evaluate every 500 steps
        # start training and evaluation
        sess.run(tf.global_variables_initializer())
        for i in range(config.num_epochs):
            for (_, x_question, x_utterance,
                 y) in data.load_train(config.batch_size,
                                       config.sequence_length,
                                       config.sequence_length):
                # the last mini-batch of an epoch may contain fewer than batch_size examples
                if len(_) == config.batch_size:
                    _, global_step, _, _ = train_step(x_question, x_utterance,
                                                      y)

                if global_step % FLAGS.evaluate_every == 0:
                    dev_step(global_step)
Example #16
def main():
    logs = {
        'start-time': now(),
        'num_workers': PARTITIONS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'batch': BATCH,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='/data/logs/tmp_logs.txt',
                        level=logging.WARNING)

    logging.warning("{}:Loading Training Data...".format(now()))
    # Load data
    val_df, train_df = data.load_train(spark)

    # Collect validation for loss computation
    val_collected = val_df.collect()

    # Create initial weight vector
    dimensions = train_df.rdd \
                         .map(lambda row: max(row.features.keys())).max() + 1
    w = [0.0] * dimensions

    # Create the partitions of the train dataset
    partitions = train_df.rdd.zipWithIndex() \
                             .map(lambda x: (x[1], x[0])) \
                             .partitionBy(PARTITIONS)

    persistence = [0.0] * PERSISTENCE
    smallest_val_loss = float('inf')

    logs['start-compute-time'] = now()
    logging.warning("{}:Starting SGD...".format(logs['start-compute-time']))
    logs['epochs-stats'] = []
    for epoch in range(EPOCHS):
        epoch_stat = {'epoch_number': epoch, 'epoch_start': now()}
        logging.warning("{}:EPOCH:{}".format(now(), epoch))
        # Broadcast w to make it available for each worker
        w_b = sc.broadcast(w)
        # Calculate Mini Batch Gradient Descent for each partition
        partition_deltas_w = \
            partitions.mapPartitions(lambda x: sgd(x, w_b)).collect()
        # Collect total update weights for all workers in one epoch
        total_delta_w = {}
        for delta_w in partition_deltas_w:
            for k, v in delta_w.items():
                if k in total_delta_w:
                    total_delta_w[k] += v
                else:
                    total_delta_w[k] = v

        # Update weights
        for k, v in total_delta_w.items():
            w[k] += LEARNING_RATE * v

        val_loss = loss(val_collected, w)
        epoch_stat['val_loss'] = val_loss
        epoch_stat['epoch_end'] = now()
        logs['epochs-stats'].append(epoch_stat)
        logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))

        # Early stopping criteria
        persistence[epoch % PERSISTENCE] = val_loss
        if smallest_val_loss < min(persistence):
            # Early stop
            logging.warning("{}:EARLY STOP!".format(now()))
            break
        else:
            smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss

    logs['end-compute-time'] = now()

    logging.warning("{}:Calculating Train Accuracy".format(now()))
    train_accuracy = accuracy(train_df, w)
    logs['train_accuracy'] = train_accuracy

    logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

    logging.warning("{}:Calculating Test Accuracy".format(now()))
    test_df = data.load_test(spark)
    test_accuracy = accuracy(test_df, w)
    logs['test_accuracy'] = test_accuracy

    logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

    spark.stop()

    logs['end_time'] = now()
    with open(
            '/data/logs/logs.workers_{}.batch_{}.epochs_{}.time_{}.json'.
            format(PARTITIONS, BATCH, EPOCHS, logs['start-time']), 'w') as f:
        json.dump([logs], f)
Example #17
import sys
import importlib

from tqdm import tqdm

import data
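# populate data.TRAIN at import time; it is read below when training the model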
data.load_train()
import models


def main():
    # interpret command line arguments
    if len(sys.argv) <= 1:
        print(
            "please specify a model name (e.g. models.baseline.random_handle)")
        sys.exit(1)
    module_name = sys.argv[1]
    hyper_parameters = models.parse_hyper_parameters(sys.argv[2:])

    # training model
    module = importlib.import_module(module_name)
    print(
        f"Training {module_name}.Model with hyperparameters {hyper_parameters}"
    )
    model = module.Model(tqdm(data.TRAIN, dynamic_ncols=True),
                         **hyper_parameters)
    print("Training done!")

    models.save(model, module_name, hyper_parameters)

Example #18
def main(
    data_path,
    train_data_path,
    val_data_path,
    test_data_path,
    output_path,
    prediction_name='suggestion.json',
    cache_dir=None,
    model_type='lda',
):
    '''
    train a model and make a prediction

    Args:
        data_path: path to the data json file
        train_data_path: path to the train data
        val_data_path: path to the val data
        test_data_path: path to the test data
        output_path: path to the output dir
        prediction_name: the name of prediction output file
        cache_dir: where to save cache
        model_type: which model to use ('lda' or 'doc2vec')

    Returns:
        None
    '''
    # load data
    print('Loading data')
    documents, titles = data.load_doc_title(
        data_path,
        cache_path=os.path.join(cache_dir, 'preproccessed')
        if cache_dir is not None else None,
    )
    train_data = data.load_train(train_data_path)
    val_data = data.load_val(val_data_path)
    test_data = data.load_test(test_data_path)

    # convert to corpus if needed
    if model_type in ('lda', ):
        print('Preparing corpus')
        dictionary = utils.make_dictionary(
            documents.content,
            cache_path=os.path.join(cache_dir, 'dictionary')
            if cache_dir is not None else None,
            filter_=False,
        )
        documents['bow'] = utils.make_corpus(documents.content, dictionary)
        titles['bow'] = utils.make_corpus(titles.content, dictionary)

    # train
    print('Training model')
    if model_type == 'lda':
        model = engine.CustomLDA(documents, titles, dictionary)
        model = model.train(train_data, val_data, output_path)
    elif model_type == 'doc2vec':
        model = engine.CustomDoc2vec(documents, titles)
        model = model.train(train_data, val_data, output_path)
    else:
        raise ValueError(model_type)

    # inference
    prediction = model.predict(test_data)
    prediction_output = os.path.join(output_path, prediction_name)
    data.dump_prediction(prediction, prediction_output)
    return