level=logging.INFO)

# Parse the MQ2007 Fold1 query sets from their original text format.
fold_dir = 'data/MQ2007/Fold1'

training_queries = Queries.load_from_text(fold_dir + '/train.txt')
validation_queries = Queries.load_from_text(fold_dir + '/vali.txt')
test_queries = Queries.load_from_text(fold_dir + '/test.txt')

logging.info('=' * 80)

# Serialize every query set into rankpy's binary format ...
for queries, stem in ((training_queries, 'training'),
                      (validation_queries, 'validation'),
                      (test_queries, 'test')):
    queries.save(fold_dir + '/' + stem)

# ... so that reloading them is much faster than re-parsing the text.
training_queries = Queries.load(fold_dir + '/training')
validation_queries = Queries.load(fold_dir + '/validation')
test_queries = Queries.load(fold_dir + '/test')

logging.info('=' * 80)

# Flip to True to drop queries whose documents all carry the same
# relevance label -- such queries contribute nothing to LambdaMART.
remove_useless_queries = False

# Identify query-document features that are constant across all sets.
cfs = find_constant_features(
    [training_queries, validation_queries, test_queries])

# Drop the constant features and (optionally) the useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
# Example #2
# 0
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART
from sklearn.grid_search import ParameterGrid

from ndcg import NDCG
from maprec import MAP

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Per-fold evaluation scores, accumulated across the 5 folds.
dcg_folds_scores = []
map_folds_scores = []

# Load the data for each of the 5 MSLR-WEB10K folds.
# NOTE: `range` replaces the Python-2-only `xrange`, which raises
# NameError on Python 3; the loop iterates identically on both.
for i in range(1, 6):
    # Load training, validation and testing sets for the current fold.
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) +
                                      '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')

    logging.info(
        '================================================================================'
    )
    # Print basic info about query datasets (lazy %-args avoid building
    # the message unless the INFO level is actually enabled).
    logging.info('Train queries: %s', training_queries)
    logging.info('Valid queries: %s', validation_queries)
    logging.info('Test queries: %s', test_queries)

    logging.info(
        '================================================================================'
    )
# Example #3
# 0
                    '%(message)s', level=logging.INFO)

# Parse the MQ2007 Fold1 query files from their raw text representation.
fold = 'data/MQ2007/Fold1'

training_queries = Queries.load_from_text(fold + '/train.txt')
validation_queries = Queries.load_from_text(fold + '/vali.txt')
test_queries = Queries.load_from_text(fold + '/test.txt')

logging.info('=' * 80)

# Cache the parsed queries in binary form ...
training_queries.save(fold + '/training')
validation_queries.save(fold + '/validation')
test_queries.save(fold + '/test')

# ... and reload from the cache, which is considerably faster.
training_queries = Queries.load(fold + '/training')
validation_queries = Queries.load(fold + '/validation')
test_queries = Queries.load(fold + '/test')

logging.info('=' * 80)

# Set to True to purge queries whose documents all share a single
# relevance score -- they are useless for LambdaMART training.
remove_useless_queries = False

# Detect query-document features that never vary across the three sets.
cfs = find_constant_features([training_queries,
                              validation_queries,
                              test_queries])

# Get rid of constant features and (possibly) remove useless queries.
# Example #4
# 0
# File: rank-py.py  Project: matulma4/esc
# Turn on timestamped INFO logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Parse the query sets from their original text files.
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

logging.info('================================================================================')

# Persist the parsed queries in binary form ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... then reload them, which is much faster than re-parsing the text.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')
test_queries = Queries.load('data/fold2_test')

logging.info('================================================================================')

# Log a one-line summary for every query set.
for label, queries in (('Train', train_queries),
                       ('Valid', valid_queries),
                       ('Test', test_queries)):
    logging.info('%s queries: %s' % (label, queries))

logging.info('================================================================================')

# Prepare the evaluation metric(s) for this collection of query sets.
metrics = {}
metrics[0] = NormalizedDiscountedCumulativeGain(
    10, queries=[train_queries, valid_queries, test_queries])
# metrics[1] = SeznamRank(10, queries=[train_queries, valid_queries, test_queries])
# Example #5
# 0
# Read the query sets from their original text files.
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

# Visual separator used between the log sections below.
separator = (
    '================================================================================'
)
logging.info(separator)

# Cache each query set in binary form ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... then reload from the cache, which beats re-parsing the text.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')
test_queries = Queries.load('data/fold2_test')

logging.info(separator)

# Summarize each query set in the log.
logging.info('Train queries: %s' % train_queries)
logging.info('Valid queries: %s' % valid_queries)
logging.info('Test queries: %s' % test_queries)

logging.info(separator)