metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
# model = LambdaMART(n_estimators=10000, max_depth=2, shrinkage=0.07,
#                    estopping=100, n_jobs=-1)
# model.fit(metric, train_queries)  # , validation=valid_queries)

model = LambdaMART(
    metric='NDCG@38',
    max_leaf_nodes=7,
    shrinkage=0.07,
    estopping=100,
    n_jobs=-1,
    random_state=42,
    use_pines=True,
    pines_kwargs=dict(
        switch_criterion=ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        tree_type=TreeType.OBLIVIOUS_CART,
        max_n_splits=10,
        min_samples_leaf=50,
        max_depth=2,
    ))

model.fit(train_queries, validation_queries=valid_queries)
logging.info('===============================================================')
# Save the model to files
# os.mkdir(MODELLER_DIR)
# logging.info('New folder is created: %s' % MODELLER_DIR)
# joblib.dump(model, MODELLER_DIR + model_name + '.pkl')
# logging.info('Model is saved as: %s' % (MODELLER_DIR + model_name + '.pkl'))
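# A minimal sketch of the reverse step (not part of the original snippet),
# assuming the commented-out joblib persistence above is enabled and that
# MODELLER_DIR, model_name and test_queries are defined as in this example:
#
# restored_model = joblib.load(MODELLER_DIR + model_name + '.pkl')
# logging.info('%s on the test queries: %.8f'
#              % (restored_model.metric,
#                 restored_model.evaluate(test_queries, n_jobs=-1)))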
Example #2
    # Parameters
    metric = 'nDCG@10'
    max_leaf_nodes = 7
    min_samples_split = 2
    estopping = 10
    # TODO: change these to the optimal values found by the grid search
    n_estimators = 1000
    max_features = None
    min_samples_leaf = 50
    learn_rate = 0.2

    model = LambdaMART(metric=metric,
                       max_leaf_nodes=max_leaf_nodes,
                       min_samples_split=min_samples_split,
                       estopping=estopping,
                       n_estimators=n_estimators,
                       max_features=max_features,
                       min_samples_leaf=min_samples_leaf,
                       shrinkage=learn_rate,
                       n_jobs=-1,
                       random_state=42)

    model.fit(training_queries, validation_queries=validation_queries)

    logging.info(
        '================================================================================'
    )

    test_ranks = model.predict_rankings(test_queries)
    dcg_score = NDCG(test_queries, test_ranks).mean_ndcg()
    map_score = MAP(test_queries, test_ranks).mean_average_precision()
    dcg_folds_scores.append(dcg_score)
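
    # Hedged sketch, not part of the original snippet: once the enclosing
    # cross-validation loop has filled dcg_folds_scores (and, presumably, a
    # matching map_folds_scores list for map_score), the per-fold scores can
    # be summarised -- assuming numpy is imported as np:
    #
    # logging.info('Mean NDCG over folds: %.4f (+/- %.4f)'
    #              % (np.mean(dcg_folds_scores), np.std(dcg_folds_scores)))
    # logging.info('Mean MAP over folds: %.4f (+/- %.4f)'
    #              % (np.mean(map_folds_scores), np.std(map_folds_scores)))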
Example #3
cfs = find_constant_features(
    [training_queries, validation_queries, test_queries])

# Get rid of constant features and (possibly) remove useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)
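
# For reference, a possible shape of the find_constant_features helper used
# above -- purely an illustrative sketch, assuming numpy is imported as np and
# that each Queries object exposes its document-feature matrix as a
# feature_vectors ndarray (the attribute name is an assumption, not confirmed
# by this snippet):
#
# def find_constant_features(query_sets):
#     '''Return indices of features that never vary across the given sets.'''
#     stacked = np.vstack([qs.feature_vectors for qs in query_sets])
#     return np.where(stacked.min(axis=0) == stacked.max(axis=0))[0]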

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='NDCG@10',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=50,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f' %
             (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
#train_queries.save('../data/train_bin')
#valid_queries.save('../data/validation_bin')
#test_queries.save('../data/test_bin')

# ... because loading them will then be faster.
#train_queries = Queries.load('../data/train_bin')
#valid_queries = Queries.load('../data/validation_bin')
#test_queries = Queries.load('../data/test_bin')

logging.info('================================================================================')

# Print basic info about query datasets.
logging.info('Train queries: %s' % train_queries)
logging.info('Valid queries: %s' % valid_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('================================================================================')

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000,
                   max_depth=4,
                   shrinkage=0.08,
                   estopping=100,
                   n_jobs=-1)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' %
             (metric,
              metric.evaluate_queries(test_queries,
                                      model.predict(test_queries, n_jobs=-1))))

valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info(
    '================================================================================'
)
logging.info('train LambdaMart')

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000,
                   max_depth=5,
                   shrinkage=0.08,
                   estopping=100,
                   n_jobs=-1,
                   n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info(
    '================================================================================'
)
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' %
             (metric,
              metric.evaluate_queries(test_queries,
                                      model.predict(test_queries, n_jobs=-1))))
Example #6
logging.info(
    '================================================================================'
)
metrics = {}
# Prepare metric for this set of queries.
metrics[0] = NormalizedDiscountedCumulativeGain(
    10, queries=[train_queries, valid_queries, test_queries])
# metrics[1] = SeznamRank(10, queries=[train_queries, valid_queries, test_queries])
metrics[1] = DiscountedCumulativeGain(
    10, queries=[train_queries, valid_queries, test_queries])
metrics[2] = WinnerTakesAll(
    10, queries=[train_queries, valid_queries, test_queries])
# metrics[4] = ExpectedReciprocalRank(10, queries=[train_queries, valid_queries, test_queries])
# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=50000,
                   max_depth=4,
                   shrinkage=0.1,
                   estopping=args.iter,
                   n_jobs=4)
metric = metrics[args.metric]
model.fit(metric, train_queries, validation=valid_queries)

logging.info(
    '================================================================================'
)

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' %
             (metric,
              metric.evaluate_queries(test_queries,
                                      model.predict(test_queries, n_jobs=-1))))
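
# A possible extension, not in the original snippet: report the test
# performance under every metric prepared above, reusing one set of
# predictions (only calls already used in this example are involved).
#
# predictions = model.predict(test_queries, n_jobs=-1)
# for _, m in sorted(metrics.items()):
#     logging.info('%s on the test queries: %.8f'
#                  % (m, m.evaluate_queries(test_queries, predictions)))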
#EOF