예제 #1
0
def compareSystems(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click):
    
    print "-Loading Data-"
    clf = pickle.load( open( classifierPath ) )
    basic_ranker=pickle.load( open( basic_ranker_path ) )
    clusterData=pickle.load(open(clust_data_path))
    queryData=pickle.load(open(data))
    
    ranker_tie="random"
    feature_count=basic_ranker.feature_count
    ranker_args="3"
    arg_str=""
    sample_send="sample_unit_sphere"
    iterations=100
    
    rankers=[0]*2
    rankers[0]=basic_ranker
    
    
    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave=ProbabilisticInterleave(None)

    first_win=0
    print "-Calculating-"
    
    for i in range(iterations):
        if i%(iterations/10)==0:
            print str(float(i)*100/float(iterations))+"%"
        q = training_queries.get_query(random.choice(queryData.query_ranker.keys()))
        
        test=queryData.query_ranker[q.get_qid()][0]
        testWeights=str(test)
        testWeights=testWeights.replace("[", "")
        testWeights=testWeights.replace("]", "")
        weights = np.array([float(num) for num in testWeights.split(",")])
        print len(weights)
        ranker_tie="random"
        ranker_args="3"
        sample_send="sample_unit_sphere"

        rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args,
                                                ranker_tie,
                                                feature_count,
                                                sample=sample_send,
                                                init=testWeights)
        
        
        l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
        c = user_model.get_clicks(l, q.get_labels())
        o = compar_interleave.infer_outcome(l, a, c, q)
        if(o<0):
            first_win+=1
        elif(o==0):
            coin=random.random()
            if(coin>0.5):
                first_win+=1
    result_com=float(first_win)/float(iterations)
    print "Basic ranker win rate:"+ str(result_com)
예제 #2
0
def compareSystems(vali_queries, classifierPath, basic_ranker_path,
                   clust_data_path, click):

    print "-Loading Data-"
    clf = pickle.load(open(classifierPath))
    basic_ranker = pickle.load(open(basic_ranker_path))
    clusterData = pickle.load(open(clust_data_path))

    ranker_tie = "random"
    feature_count = basic_ranker.feature_count
    ranker_args = "3"
    arg_str = ""
    sample_send = "sample_unit_sphere"
    iterations = 100

    rankers = [0] * 2
    rankers[0] = basic_ranker

    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave = ProbabilisticInterleave(None)

    second_win = 0
    second_win_or_e = 0
    generic_win = 0
    equal = 0
    print "-Calculating-"
    for i in range(iterations):
        if i % (iterations / 10) == 0:
            print str(float(i) * 100 / float(iterations)) + "%"
        q = training_queries[random.choice(training_queries.keys())]
        rankers[1] = classifier.getRanker(clf, basic_ranker, q, clusterData)
        l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
        c = user_model.get_clicks(l, q.get_labels())
        o = compar_interleave.infer_outcome(l, a, c, q)
        if (o > 0):
            second_win += 1
            second_win_or_e += 1
        elif (o == 0):
            equal += 1
            coin = random.random()
            if (coin > 0.5):
                second_win_or_e += 1
        else:
            generic_win += 1

    result_com = float(second_win_or_e) / float(iterations)
    result_win = float(second_win) / float(iterations)
    result_win_generic = float(generic_win) / float(iterations)
    print "Our ranker win rate (with random choice if result was equal):" + str(
        result_com)
    print "Our ranker win rate:" + str(result_win)
    print "Generic ranker win rate:" + str(result_win_generic)
    print "Number win ours:" + str(second_win)
    print "Number win generic:" + str(generic_win)
    print "Number equal:" + str(equal)
    print "Total number iterations:" + str(iterations)
예제 #3
0
    def queryRanker(self):
        """Learn a dedicated ranker for every high-frequency training query.

        Queries whose label count exceeds ``self.minFreqCount`` are treated as
        high-frequency.  For a random ``self.threshold`` fraction of them,
        ``self.rankersPerQuery`` listwise learners are each trained over
        ``self.iterationCount`` simulated click sessions; the initial and
        final weight vectors, the last ranked list, and the final NDCG
        evaluation are collected into a ``queryRankers`` object that is
        pickled to ``QueryData/<dataset><iterationCount>.data``.
        """
        #Extract the high frequency queries from the training_queries
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        for index in training_queries.get_qids():
            highQuery = training_queries.get_query(index)
            #only keep the frequent queries (label count is used as the frequency proxy)
            if(len(highQuery.__labels__) > self.minFreqCount):
                HighFreqQueries.append(highQuery)    
        print "found "+ str(len(HighFreqQueries)) + " high frequency queries"

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        for highQuery in HighFreqQueries:
            # Subsample: train only on a random self.threshold fraction of queries.
            ran=random.random()
            iter=iter+1
            if ran<self.threshold:
                print str(iter*100/len(HighFreqQueries))+"%"
                for i in xrange(self.rankersPerQuery):
                    learner = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
                    # Record the randomly initialised weights before any training.
                    BestRanker.addInitRank(highQuery.get_qid(),learner.get_solution().w)
                    q = highQuery
                    # Simulated click sessions: rank, observe clicks, update weights.
                    for t in range(self.iterationCount):
                        l = learner.get_ranked_list(q)
                        c = user_model.get_clicks(l, q.get_labels())
                        s = learner.update_solution(c)
                        e = evaluation2.evaluate_all(s, test_queries)
                    
    
                    # Store the trained weights, the last ranked list and the
                    # final NDCG evaluation (l and e come from the last iteration).
                    BestRanker.add(highQuery.get_qid(),learner.get_solution().w)
                    BestRanker.addList(highQuery.get_qid(),l)
                    BestRanker.addEval(highQuery.get_qid(),e)

        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(BestRanker, open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "wb" ) )
        # Reload immediately as a sanity check that the dump is readable.
        test = pickle.load( open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "rb" ) )
        print test.query_ranker.values()
예제 #4
0
    def groupRanker(self):
        #Extract the high frequency queries from the training_queries
        clusterData=pickle.load(open( self.clusterDataPath, "rb" ) )
        queryData= self.queryData

        
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        learner=[0]*len(clusterData.clusterToRanker.keys())
        for cluster in clusterData.clusterToRanker:
            learner[cluster] = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')  
        for t in range(self.iterationCount):
            q = training_queries[random.choice(training_queries.keys())]
            temp=(float(np.sum(clusterData.queryToCluster[q.get_qid()])))/(float(len(clusterData.queryToCluster[q.get_qid()])))
            temp=int(temp+0.5)
            cluster=temp
            #cluster=clusterData.queryToCluster[q.get_qid()][0]
            
            iter=iter+1
            if iter%200==0:
                print str(iter*100/self.iterationCount)+"%"
            l = learner[cluster].get_ranked_list(q)
            c = user_model.get_clicks(l, q.get_labels())
            s = learner[cluster].update_solution(c)
            #e = evaluation2.evaluate_all(s, test_queries)
        for cluster in clusterData.clusterToRanker:
             clusterData.clusterToRanker[cluster]=[learner[cluster].get_solution().w.tolist()]
      
            
        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(clusterData, open( "ClusterData/"+self.dataset+".data", "wb" ) )
예제 #5
0
def perform():
    # init
    test_num_features = 64
    queries = query.load_queries('../../data/NP2004/Fold1/test.txt', 64)
    bi = comparison.BalancedInterleave()
    user_model = environment.CascadeUserModel(
        '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')

    # make rankers
    rankers = []
    for i in range(0, 5):
        rankers.append(
            ranker.ProbabilisticRankingFunction(
                '3',
                'random',
                64,
                init=parseRanker('../../data/features64/ranker-0' + str(i) +
                                 '.txt'),
                sample='sample_unit_sphere'))

    # main loop
    for N in [100, 1000, 10000]:
        pref_matrix = [[0 for x in xrange(5)] for x in xrange(5)]
        for iter in range(0, N):
            q = queries[random.choice(queries.keys())]
            for i in range(0, 5):
                for j in range(0, 5):
                    if i != j:
                        list, context = bi.interleave(rankers[i], rankers[j],
                                                      q, 10)
                        clicks = user_model.get_clicks(list, q.get_labels())
                        result = bi.infer_outcome(list, context, clicks, q)
                        if result < 0:
                            pref_matrix[i][j] += 1
                    else:
                        pref_matrix[i][j] = 0.50
        pref_matrix = generateProbabilityMatrix(pref_matrix, N)
        printMatrix(pref_matrix)
        print 'Best ranker is ' + '0' + str(
            getBestRanker(pref_matrix)) + ' (N = ' + str(N) + ').'
    print 'done!'
예제 #6
0
	def __init__(self, data_path, features):
		"""Set up the comparison environment.

		data_path -- path to the query file to load
		features  -- number of features per document vector
		"""
		# Balanced interleaving for pairwise ranker comparison.
		self.bi = comparison.BalancedInterleave()
		# Cascade click model: relevant docs are always clicked, never a stop.
		self.user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')	
		self.queries = query.load_queries(data_path, features)
예제 #7
0
import environment, evaluation, query, retrieval_system
import time
import datetime
import numpy as np

# init data, query_samples, d's
train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64)
test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64)
query_samples = 5000  # how many queries we sample in total

# d is passed to ListwiseLearningSystemREMBO below — presumably the REMBO
# low-dimensional subspace size; k is the number of repetitions (one random
# embedding per repetition, per the comment in the training loop).
d = 3
k = 10
number_of_evaluation = query_samples / k  # sessions per repetition (int division)

# init user model, evaluation methods
user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
# NOTE(review): rebinding `evaluation` shadows the imported module of the
# same name — the module is unreachable after this line.
evaluation = evaluation.NdcgEval()

# NDCG traces collected over the run for the REMBO and full-dim learners.
rem_ndcg_evaluation_train = []
full_ndcg_evaluation_train = []
rem_ndcg_evaluation_test = []
full_ndcg_evaluation_test = []

for m in range(0, k):
    # for each k, we have different A matrix
    # as mentioned on the REMBO paper
    rem_learner = retrieval_system.ListwiseLearningSystemREMBO(
        64, d,
        '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunctionREMBO -s 3 -d 0.1 -a 0.01'
    )
    full_learner = retrieval_system.ListwiseLearningSystem(
예제 #8
0
def compareSystemsHist(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click):
    
    print "-Loading Data-"
    clf = pickle.load( open( classifierPath ) )
    basic_ranker=pickle.load( open( basic_ranker_path ) )
    clusterData=pickle.load(open(clust_data_path))
    queryData=pickle.load(open(data))
    
    ranker_tie="random"
    feature_count=basic_ranker.feature_count
    ranker_args="3"
    arg_str=""
    sample_send="sample_unit_sphere"
    iterations=100
    
    rankers=[0]*2
    rankers[0]=basic_ranker
    
    
    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave=ProbabilisticInterleave(None)

    print "-Calculating-"
    
    ii=0


    results=[]
    for qid in queryData.query_ranker.keys():
        print str(float(ii)*100/float(len(queryData.query_ranker.keys())))+"%"
        ii+=1
        q=training_queries.get_query(qid)
        for val in queryData.query_ranker[qid]:
            test=val
            #test=queryData.query_ranker[q][0]
            testWeights=str(test.tolist())
            testWeights=testWeights.replace("[", "")
            testWeights=testWeights.replace("]", "")
            #weights = np.array([float(num) for num in testWeights.split(",")])
            #print len(weights)
            ranker_tie="random"
            ranker_args="3"
            sample_send="sample_unit_sphere"
    
            rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args,
                                                    ranker_tie,
                                                    feature_count,
                                                    sample=sample_send,
                                                    init=testWeights)
           
            second_win=0
            for i in range(iterations):
                #q = training_queries.get_query(random.choice(training_queries.keys()))          
                l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
                c = user_model.get_clicks(l, q.get_labels())
                o = compar_interleave.infer_outcome(l, a, c, q)
                if(o>0):
                    second_win+=1
                elif(o==0):
                    coin=random.random()
                    if(coin>0.5):
                        second_win+=1
            result_com=float(second_win)/float(iterations)
            results.append(result_com)

    g=P.hist(results, bins = 20,range=[0,1])
    P.xlabel("The win rate of the ranker",fontsize=20)
    P.ylabel("Number of rankers",fontsize=20)
    P.show(g)