# Evaluation metric shared by all three query splits (train / validation /
# test).  NOTE(review): this object is not referenced again in this chunk --
# the model below selects its metric via the string 'NDCG@38' -- verify it is
# needed by code outside this chunk before removing it.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Gradient-boosted LambdaMART ranker built on oblivious PINES trees.
# Training early-stops after `estopping` rounds without validation improvement.
model = LambdaMART(
    metric='NDCG@38',
    max_leaf_nodes=7,
    shrinkage=0.07,
    estopping=100,
    n_jobs=-1,
    random_state=42,
    use_pines=True,
    pines_kwargs={
        'switch_criterion': ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        'tree_type': TreeType.OBLIVIOUS_CART,
        'max_n_splits': 10,
        'min_samples_leaf': 50,
        'max_depth': 2,
    })

model.fit(train_queries, validation_queries=valid_queries)

logging.info('===============================================================')

# Persisting the trained model stays disabled, as in the original script:
# os.mkdir(MODELLER_DIR)
# logging.info('New folder is created: %s' % MODELLER_DIR)
# joblib.dump(model, MODELLER_DIR + model_name + '.pkl')
# logging.info('Model is saves as: %s' % MODELLER_DIR + model_name + '.pkl')
#parameters metric = 'nDCG@10' max_leaf_nodes = 7 min_samples_split = 2 estopping = 10 #TODO change these to the optimal ones found from the grid search n_estimators = 1000 max_features = None min_samples_leaf = 50 learn_rate = 0.2 model = LambdaMART(metric=metric, max_leaf_nodes=max_leaf_nodes, min_samples_split=min_samples_split, estopping=estopping, n_estimators=n_estimators, max_features=max_features, min_samples_leaf=min_samples_leaf, shrinkage=learn_rate, n_jobs=-1, random_state=42) model.fit(training_queries, validation_queries=validation_queries) logging.info( '================================================================================' ) test_ranks = model.predict_rankings(test_queries) dcg_score = NDCG(test_queries, test_ranks).mean_ndcg() map_score = MAP(test_queries, test_ranks).mean_average_precision() dcg_folds_scores.append(dcg_score)
# Features that are constant across every split carry no ranking signal;
# find them once over all three query sets and strip them everywhere.
constant_features = find_constant_features(
    [training_queries, validation_queries, test_queries])

# Train/validation sets may also purge queries made useless by the removal;
# the test set is never purged so evaluation covers every test query.
training_queries.adjust(remove_features=constant_features,
                        purge=remove_useless_queries)
validation_queries.adjust(remove_features=constant_features,
                          purge=remove_useless_queries)
test_queries.adjust(remove_features=constant_features)

# Log a short summary of each query set.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

# LambdaMART optimising NDCG@10, early-stopped on the validation split.
model = LambdaMART(metric='NDCG@10',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=50,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=-1)))

# Persist the model; the filename encodes leaves / shrinkage / estopping.
model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
#train_queries.save('../data/train_bin') #valid_queries.save('../data/validation_bin') #test_queries.save('../data/test_bin') # ... because loading them will be then faster. #train_queries = Queries.load('../data/train_bin') #valid_queries = Queries.load('../data/validation_bin') #test_queries = Queries.load('../data/test_bin') logging.info('================================================================================') # Print basic info about query datasets. logging.info('Train queries: %s' % train_queries) logging.info('Valid queries: %s' % valid_queries) logging.info('Test queries: %s' %test_queries) logging.info('================================================================================') # Prepare metric for this set of queries. metric = NormalizedDiscountedCumulativeGain(38, queries=[train_queries, valid_queries, test_queries]) # Initialize LambdaMART model and train it. model = LambdaMART(n_estimators=10000, max_depth=4, shrinkage=0.08, estopping=100, n_jobs=-1) model.fit(metric, train_queries, validation=valid_queries) logging.info('================================================================================') # Print out the performance on the test set. logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1))))
# Load validation and test query sets from SVMlight-formatted text files.
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info(
    '================================================================================'
)
logging.info('train LambdaMart')

# Prepare metric for this set of queries: NDCG truncated at rank 38,
# shared by all three splits.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
# FIX(review): dropped the stray `n_iterations=100` keyword.  The boosting
# round budget is configured via `n_estimators` plus the `estopping`
# patience -- exactly as in the sibling training scripts in this file --
# and an unrecognised keyword would be rejected by the constructor while
# also contradicting n_estimators=10000.
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08,
                   estopping=100, n_jobs=-1)

model.fit(metric, train_queries, validation=valid_queries)

logging.info(
    '================================================================================'
)
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f'
             % (metric,
                metric.evaluate_queries(
                    test_queries, model.predict(test_queries, n_jobs=-1))))
'================================================================================' ) metrics = {} # Prepare metric for this set of queries. metrics[0] = NormalizedDiscountedCumulativeGain( 10, queries=[train_queries, valid_queries, test_queries]) # metrics[1] = SeznamRank(10, queries=[train_queries, valid_queries, test_queries]) metrics[1] = DiscountedCumulativeGain( 10, queries=[train_queries, valid_queries, test_queries]) metrics[2] = WinnerTakesAll( 10, queries=[train_queries, valid_queries, test_queries]) # metrics[4] = ExpectedReciprocalRank(10, queries=[train_queries, valid_queries, test_queries]) # Initialize LambdaMART model and train it. model = LambdaMART(n_estimators=50000, max_depth=4, shrinkage=0.1, estopping=args.iter, n_jobs=4) metric = metrics[args.metric] model.fit(metric, train_queries, validation=valid_queries) logging.info( '================================================================================' ) # Print out the performance on the test set. logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1)))) #EOF