# NOTE(review): this chunk opens mid-statement -- the start of the
# `logging.basicConfig(format=..., ` call lies outside this view.
level=logging.INFO)

# Load the query datasets from LETOR plain-text format (MQ2007, fold 1).
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will be then faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')

logging.info('=' * 80)

# Set this to True in order to remove queries containing all documents
# of the same relevance score -- these are useless for LambdaMART.
remove_useless_queries = False

# Find query-document features that are constant across all three splits;
# presumably they carry no ranking signal -- confirm against rankpy docs.
cfs = find_constant_features(
    [training_queries, validation_queries, test_queries])

# Get rid of constant features and (possibly) remove useless queries.
# NOTE(review): only the training split is adjusted in this view; the
# matching adjust() calls for validation/test presumably follow later.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
# LambdaMART cross-validation driver over the five MSLR-WEB10K folds.
#
# NOTE(review): this chunk starts with the import block, i.e. the top of
# the script, yet `logging` was used below without being imported (would
# raise NameError on the basicConfig call) -- added the missing import.
import logging

from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART
from sklearn.grid_search import ParameterGrid  # legacy sklearn (<0.18) API
from ndcg import NDCG
from maprec import MAP

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Per-fold evaluation scores; filled in by the rest of the loop body,
# which continues beyond this chunk.
dcg_folds_scores = []
map_folds_scores = []

# Load the data for each fold.
# (`range` instead of `xrange`: identical for this small span and keeps
# the script runnable on both Python 2 and 3.)
for i in range(1, 6):
    # Load training, validation and testing sets for the current fold.
    # Queries.load reads the binary format produced by Queries.save.
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')

    logging.info('=' * 80)

    # Print basic info about query datasets. Lazy %-style arguments avoid
    # formatting the message when the INFO level is disabled; the rendered
    # output is identical to the original '%' interpolation.
    logging.info('Train queries: %s', training_queries)
    logging.info('Valid queries: %s', validation_queries)
    logging.info('Test queries: %s', test_queries)

    logging.info('=' * 80)
# NOTE(review): this chunk opens mid-statement -- it is the tail of a
# `logging.basicConfig(format='... %(message)s', level=logging.INFO)`
# call whose opening lies outside this view.
'%(message)s', level=logging.INFO)

# Load the query datasets from LETOR plain-text format (MQ2007, fold 1).
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will be then faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')

logging.info('=' * 80)

# Set this to True in order to remove queries containing all documents
# of the same relevance score -- these are useless for LambdaMART.
remove_useless_queries = False

# Find query-document features that are constant across all three splits;
# presumably they carry no ranking signal -- confirm against rankpy docs.
cfs = find_constant_features([training_queries, validation_queries, test_queries])

# Get rid of constant features and (possibly) remove useless queries.
# NOTE(review): the adjust() calls that this comment announces are not in
# this view; presumably they follow in the next chunk.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO) # Load the query datasets. train_queries = Queries.load_from_text('data/train.txt') valid_queries = Queries.load_from_text('data/vali.txt') test_queries = Queries.load_from_text('data/test.txt') logging.info('================================================================================') # Save them to binary format ... train_queries.save('data/fold2_train') valid_queries.save('data/fold2_vali') test_queries.save('data/fold2_test') # ... because loading them will be then faster. train_queries = Queries.load('data/fold2_train') valid_queries = Queries.load('data/fold2_vali') test_queries = Queries.load('data/fold2_test') logging.info('================================================================================') # Print basic info about query datasets. logging.info('Train queries: %s' % train_queries) logging.info('Valid queries: %s' % valid_queries) logging.info('Test queries: %s' %test_queries) logging.info('================================================================================') metrics = {} # Prepare metric for this set of queries. metrics[0] = NormalizedDiscountedCumulativeGain(10, queries=[train_queries, valid_queries, test_queries]) # metrics[1] = SeznamRank(10, queries=[train_queries, valid_queries, test_queries])
# Load the query datasets. train_queries = Queries.load_from_text('data/train.txt') valid_queries = Queries.load_from_text('data/vali.txt') test_queries = Queries.load_from_text('data/test.txt') logging.info( '================================================================================' ) # Save them to binary format ... train_queries.save('data/fold2_train') valid_queries.save('data/fold2_vali') test_queries.save('data/fold2_test') # ... because loading them will be then faster. train_queries = Queries.load('data/fold2_train') valid_queries = Queries.load('data/fold2_vali') test_queries = Queries.load('data/fold2_test') logging.info( '================================================================================' ) # Print basic info about query datasets. logging.info('Train queries: %s' % train_queries) logging.info('Valid queries: %s' % valid_queries) logging.info('Test queries: %s' % test_queries) logging.info( '================================================================================' )