def train():
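    """End-to-end pipeline: preprocess the raw data, build the dataset, fit the
    MF and item-to-item models, generate ranking candidates, then train, save,
    and validate the ranker."""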
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()

    dataset = Dataset(preprocessor)

    print('Training MF')
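    # MF model (presumably matrix factorization); a previously saved model is
    # reused if one exists.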
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
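    # Item-to-item model (likely item-similarity based) built from the dataset.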
    i2i = Item2Item(dataset)

    print('Generating candidates')
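    # Candidate generation uses the MF and item-to-item models to build feature
    # matrices (X), labels (y), and per-query grouping info (q) for train and
    # validation; the *_reader values feed the NDCG validation below.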
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    # Cache the generated candidates so they can be inspected or reused without
    # regenerating them.
    import pickle
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except (OSError, pickle.PicklingError) as e:
        print("Couldn't save puke.pkl:", e)

    print('Training ranker')
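    # The ranker is trained on the generated candidates; the validation split is
    # passed to train() as well, and the saved model is scored with NDCG below.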
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
Example #2
import random

import torch

# Ranker, RankerTrainer, load_data, load_eval_data, and logger are assumed to
# be defined elsewhere in the module this example comes from.


def main(args):
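    """Load training/dev/evaluation instances, train the Ranker, and save the
    trained model under output/."""
    # Fix all RNG seeds for reproducibility.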
    torch.manual_seed(333)
    use_cuda = torch.cuda.is_available()  # assumed here; the original likely defines use_cuda at module level
    if use_cuda:
        torch.cuda.manual_seed(333)
    random.seed(333)
    train_data_path = "data/training.dat"
    train_eval_data_path = "data/train-eval.dat"
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    with open(feats_path) as f:
        num_feats = sum(1 for _ in f)
    batch_size = 80
    ranker = Ranker(num_feats, 256)
    # Instances for training are loaded as pairs.
    feat_indices = set(range(num_feats))
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(train_eval_data_path, num_feats,
                                          feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(dev_data_path, num_feats, feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)
    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))
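    # RankerTrainer consumes the instance sets in batches of batch_size and
    # writes its outputs under output/.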
    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')