def predict(self, X, ids, weight, feature_names=None):
    """Score the rows of ``X`` with the trained ranking model.

    Parameters
    ----------
    X : feature matrix, shape (n_samples, n_features).
    ids : per-row query identifiers used to group rows into queries.
    weight : accepted for interface compatibility; not used here.
    feature_names : optional feature names, stored on the instance.

    Returns
    -------
    Model scores, one per row of ``X``.
    """
    self.feature_names = feature_names
    indptr, qids = self._build_query_indptr(ids)
    # Queries refuses to instantiate without a relevance vector, even
    # though prediction never reads it -- hand it an all-zeros stub.
    dummy_relevance = np.zeros(X.shape[0])
    queries = Queries(X, dummy_relevance, indptr, query_ids=qids)
    return self.model.predict(queries, n_jobs=self.params['n_jobs'])
def _build_queries(self, X, y, ids, w):
    """Wrap (X, y) into a rankpy ``Queries`` object plus per-query weights.

    Parameters
    ----------
    X : feature matrix, shape (n_samples, n_features).
    y : relevance labels, one per row.
    ids : per-row query identifiers; rows with equal ids form a query.
    w : per-row sample weights.

    Returns
    -------
    (queries, weights) where ``weights`` is a contiguous float64 array
    with one entry per query, taken from the query's first row.
    """
    query_indptr, query_ids = self._build_query_indptr(ids)
    q = Queries(X, y, query_indptr, query_ids=query_ids)
    # Collapse per-row weights to one weight per query by taking the
    # first row of each query.  (The original also computed a per-query
    # mean but immediately overwrote it; that dead computation has been
    # removed.)
    # NOTE(review): if a per-query *mean* was actually intended, use
    # np.mean(w[query_indptr[i]:query_indptr[i + 1]]) instead.
    wn = [w[i] for i in query_indptr[:-1]]
    return q, np.ascontiguousarray(wn, dtype='float64')
def output_prediction(test_data, head=True):
    """Score the test chunk and append a ranked submission to CSV.

    Parameters
    ----------
    test_data : DataFrame with at least 'srch_id' and 'prop_id' columns,
        row-aligned with the SVMLight test chunk loaded below.
    head : write the CSV header when True -- use only for the first
        chunk, since the output file is opened in append mode.
    """
    test_queries = Queries.load_from_text(
        '../data/svmlight_true_test_chunk_temp1.txt', purge=None)
    # NOTE(review): metric is constructed but never used below; kept in
    # case the constructor validates the queries -- confirm and drop.
    metric = NormalizedDiscountedCumulativeGain(38, queries=test_queries)
    # Make the prediction with the module-level trained model.
    predict_scores = model.predict(test_queries, n_jobs=-1)
    # Extract the srch_id and prop_id into a dataframe and attach scores.
    result_unordered = test_data.loc[:, ['srch_id', 'prop_id']]
    result_unordered['scores'] = predict_scores
    # Rank properties per search: ascending srch_id, descending score.
    # FIX: DataFrame.sort() was removed in pandas 0.20; sort_values is
    # the supported equivalent (available since pandas 0.17).
    result_ordered = result_unordered.sort_values(
        ['srch_id', 'scores'], ascending=[1, 0])
    # Append this chunk's ranking to the submission file.
    result_ordered.loc[:, ['srch_id', 'prop_id']].to_csv(
        '../data/predict_test_data.csv', index=False, mode='a', header=head)
    print("Prediction has been written into the file...")
def output_prediction(test_data, head=True):
    """Score the test chunk and append a ranked submission to CSV.

    Parameters
    ----------
    test_data : DataFrame with at least 'srch_id' and 'prop_id' columns,
        row-aligned with the SVMLight test chunk loaded below.
    head : write the CSV header when True -- use only for the first
        chunk, since the output file is opened in append mode.
    """
    test_queries = Queries.load_from_text(
        '../data/svmlight_true_test_chunk_temp1.txt', purge=None)
    # NOTE(review): metric is constructed but never used below; kept in
    # case the constructor validates the queries -- confirm and drop.
    metric = NormalizedDiscountedCumulativeGain(38, queries=test_queries)
    # Make the prediction with the module-level trained model.
    predict_scores = model.predict(test_queries, n_jobs=-1)
    # Extract the srch_id and prop_id into a dataframe and attach scores.
    result_unordered = test_data.loc[:, ['srch_id', 'prop_id']]
    result_unordered['scores'] = predict_scores
    # Rank properties per search: ascending srch_id, descending score.
    # FIX: DataFrame.sort() was removed in pandas 0.20; sort_values is
    # the supported equivalent (available since pandas 0.17).
    result_ordered = result_unordered.sort_values(
        ['srch_id', 'scores'], ascending=[1, 0])
    # Append this chunk's ranking to the submission file.
    result_ordered.loc[:, ['srch_id', 'prop_id']].to_csv(
        '../data/predict_test_data.csv', index=False, mode='a', header=head)
    print("Prediction has been written into the file...")
# NOTE(review): the line below is the truncated tail of a
# dump_svmlight_file(...) call whose opening lies before this chunk
# boundary; it is kept byte-for-byte and is not valid Python on its own.
'booking_bool']].apply(relevance, axis=1), '../data/svmlight_test_avg_mean_std_competitors_m2.txt', query_id=test_data.srch_id)

# Please set it for each training
model_name = 'model_012'

# Parameters for file recording
# LOG = model_log_folder + model_name + '.log'
# MODELLER_DIR = modeller_folder + model_name + '/'
# Turn on logging.
# logging.basicConfig(filename=LOG, format='%(asctime)s : %(message)s',
#                     level=logging.INFO)

# Load the query datasets from SVMLight text and cache each one as a
# pickle so later runs can skip the slow text parse.  Files are opened
# in binary mode ('wb'), as pickle requires, and 'with' guarantees the
# handles are closed even if pickling raises.
train_queries = Queries.load_from_text(
    '../data/svmlight_training_avg_mean_std_competitors_m2.txt')
with open('../data/train_queries2.pkl', 'wb') as pickle_output_train:
    pickle.dump(train_queries, pickle_output_train)
valid_queries = Queries.load_from_text(
    '../data/svmlight_validation_avg_mean_std_competitors_m2.txt')
with open('../data/valid_queries2.pkl', 'wb') as pickle_output_valid:
    pickle.dump(valid_queries, pickle_output_valid)
test_queries = Queries.load_from_text(
    '../data/svmlight_test_avg_mean_std_competitors_m2.txt')
with open('../data/test_queries2.pkl', 'wb') as pickle_output_test:
    pickle.dump(test_queries, pickle_output_test)
# pickle_output_test.close()

logging.info('===============================================================')
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaRandomForest
from rankpy.gridsearch import gridsearch
from rankpy.gridsearch import train_test_split

# Turn on logging.
# NOTE(review): 'logging' is used below but not imported in this chunk;
# presumably imported earlier in the file -- confirm.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : '
                    '%(message)s', level=logging.INFO)

# Load the MQ2007 Fold1 query datasets from LETOR/SVMLight text (slow).
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will be then faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')
# Script driver: load query sets and train a LambdaMART ranker.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

logging.info('dump train file')
# listwise_sampling()
logging.info('================================================================================')
logging.info('dump test file')
# listwise_sampling_test()
logging.info('================================================================================')

logging.info('load query database')
train_queries = Queries.load_from_text('../data/svmlight_train.txt')
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')
logging.info('================================================================================')

logging.info('train LambdaMart')
# Prepare metric for this set of queries.
# NOTE(review): 38 is presumably the NDCG cutoff rank -- confirm it
# matches the maximum results per query in this dataset.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it, early-stopping on the
# validation set (estopping=100 trees without improvement).
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08,
                   estopping=100, n_jobs=-1, n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)
logging.info('================================================================================')
logging.info('test')
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART

from sklearn.grid_search import ParameterGrid

from ndcg import NDCG
from maprec import MAP

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Per-fold scores accumulated across the five MSLR-WEB10K folds.
dcg_folds_scores = []
map_folds_scores = []

# load the data for each fold (Python 2 script: uses xrange)
for i in xrange(1, 6):
    # load training, validation and testing sets for the current fold
    # (binary rankpy format produced by an earlier conversion step)
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')

    logging.info(
        '================================================================================'
    )
    # Print basic info about query datasets.
    logging.info('Train queries: %s' % training_queries)
    logging.info('Valid queries: %s' % validation_queries)
    logging.info('Test queries: %s' % test_queries)
    logging.info(
        '================================================================================'
    )
import numpy as np
import logging

import rankpy
from rankpy.queries import Queries

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Convert all five MSLR-WEB10K folds from SVMLight text to rankpy's
# binary format (Python 2 script: uses xrange).
for i in xrange(1, 6):
    # Load the query datasets.
    training_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) + '/train.txt')
    validation_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) + '/vali.txt')
    test_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) + '/test.txt')

    logging.info(
        '================================================================================'
    )

    # Save them to binary format ...
    training_queries.save('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries.save('MSLR-WEB10K/Fold' + str(i) + '/validation')
    test_queries.save('MSLR-WEB10K/Fold' + str(i) + '/test')

    # ... because loading them will be then faster.
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')
import logging,argparse
from rankpy.queries import Queries
from rankpy.models import LambdaMART
from rankpy.metrics import *

# Command line: positional <metric> and <iter>, both ints.
# NOTE(review): 'metric' is presumably an NDCG cutoff and 'iter' the
# number of training iterations -- confirm where args is consumed.
parser = argparse.ArgumentParser(description="Rank py.")
parser.add_argument("metric", help="metric", type=int)
parser.add_argument("iter", help="iterations", type=int)
args = parser.parse_args()

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets from SVMLight text (slow parse).
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

logging.info('================================================================================')

# Save them to binary format ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... because loading them will be then faster.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')
test_queries = Queries.load('data/fold2_test')
dump_svmlight_file(validation_data.iloc[:,1:-2].values,validation_data.iloc[:,-2:].apply(relevance,axis = 1),'../data/svmlight_validation.txt',query_id=validation_data.srch_id) # In[52]: dump_svmlight_file(test_data.iloc[:,1:-2].values,test_data.iloc[:,-2:].apply(relevance,axis = 1),'../data/svmlight_test.txt',query_id = test_data.srch_id) # In[54]: # Turn on logging. logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO) # Load the query datasets. train_queries = Queries.load_from_text('../data/svmlight_training.txt') valid_queries = Queries.load_from_text('../data/svmlight_validation.txt') test_queries = Queries.load_from_text('../data/svmlight_test.txt') logging.info('================================================================================') # Save them to binary format ... #train_queries.save('../data/train_bin') #valid_queries.save('../data/validation_bin') #test_queries.save('../data/test_bin') # ... because loading them will be then faster. #train_queries = Queries.load('../data/train_bin') #valid_queries = Queries.load('../data/validation_bin') #test_queries = Queries.load('../data/test_bin')
# Generate the SVMLight-format feature files, one per data split.
# Labels come from applying relevance() to the last two columns;
# rows are grouped into queries by srch_id.
dump_svmlight_file(training_data_new[col_names].values,
                   training_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_training_avg_mean_std_competitors_m2.txt',
                   query_id=training_data_new.srch_id)
dump_svmlight_file(validation_data_new[col_names].values,
                   validation_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_validation_avg_mean_std_competitors_m2.txt',
                   query_id=validation_data_new.srch_id)
dump_svmlight_file(test_data_new[col_names].values,
                   test_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_test_avg_mean_std_competitors_m2.txt',
                   query_id=test_data_new.srch_id)

# Please set it for each training
model_name = 'model_012'

# Parameters for file recording
LOG = model_log_folder + model_name + '.log'
MODELLER_DIR = modeller_folder + model_name + '/'

# Turn on logging.
logging.basicConfig(filename=LOG, format='%(asctime)s : %(message)s',
                    level=logging.INFO)

# Load the query datasets and cache each as a pickle so later runs can
# skip the slow SVMLight text parse.
# FIX: pickle files must be opened in *binary* mode ('wb', not 'w');
# text mode corrupts the stream on Windows.  'with' also guarantees the
# handle is closed even if pickling raises.
train_queries = Queries.load_from_text(
    '../data/svmlight_training_avg_mean_std_competitors_m2.txt')
with open('../data/train_queries2.pkl', 'wb') as pickle_output_train:
    pickle.dump(train_queries, pickle_output_train)
valid_queries = Queries.load_from_text(
    '../data/svmlight_validation_avg_mean_std_competitors_m2.txt')
with open('../data/valid_queries2.pkl', 'wb') as pickle_output_valid:
    pickle.dump(valid_queries, pickle_output_valid)
test_queries = Queries.load_from_text(
    '../data/svmlight_test_avg_mean_std_competitors_m2.txt')
with open('../data/test_queries2.pkl', 'wb') as pickle_output_test:
    pickle.dump(test_queries, pickle_output_test)

logging.info('================================================================================')
# Save them to binary format ...
import logging
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART
from rankpy.gridsearch import gridsearch
from rankpy.gridsearch import train_test_split

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : '
                    '%(message)s', level=logging.INFO)

# Load the MQ2007 Fold1 query datasets from LETOR/SVMLight text (slow).
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will be then faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')
# -*- coding: utf-8 -*- import numpy as np import pandas as pd import logging from rankpy.queries import Queries from rankpy.queries import find_constant_features from rankpy.models import LambdaMART # Turn on logging. logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO) # Load the query datasets. training_queries = Queries.load_from_text('train.svmlight') validation_queries = Queries.load_from_text('vali.svmlight') test_queries = Queries.load_from_text('test.svmlight') logging.info( '================================================================================' ) # Print basic info about query datasets. logging.info('Train queries: %s' % training_queries) logging.info('Valid queries: %s' % validation_queries) logging.info('Test queries: %s' % test_queries) logging.info( '================================================================================' )
import numpy as np
import pandas as pd
import logging
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets from SVMLight files in the working directory.
training_queries = Queries.load_from_text('train.svmlight')
validation_queries = Queries.load_from_text('vali.svmlight')
test_queries = Queries.load_from_text('test.svmlight')

logging.info('================================================================================')

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('================================================================================')

# Set this to True in order to remove queries containing all documents
# of the same relevance score -- these are useless for LambdaMART.
remove_useless_queries = False
import logging, argparse
from rankpy.queries import Queries
from rankpy.models import LambdaMART
from rankpy.metrics import *

# Command line: positional <metric> and <iter>, both ints.
# NOTE(review): 'metric' is presumably an NDCG cutoff and 'iter' the
# number of training iterations -- confirm where args is consumed.
parser = argparse.ArgumentParser(description="Rank py.")
parser.add_argument("metric", help="metric", type=int)
parser.add_argument("iter", help="iterations", type=int)
args = parser.parse_args()

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets from SVMLight text (slow parse).
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

logging.info(
    '================================================================================'
)

# Save them to binary format ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... because loading them will be then faster.
# NOTE(review): this chunk appears truncated -- the reload of
# test_queries present in the sibling copy of this script is missing.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')