def Save(self):
    """Rank every training query with the stored ranker and pickle the
    top-ranked feature vectors (keyed by qid) to QueryData/<dataset>.data.

    Reads: self.testQueries, self.feature_count, self.rankerPath, self.dataset.
    Side effects: creates the QueryData/ directory if missing and writes one
    pickle file into it.
    """
    print("Loading Data")
    training_queries = queryClass.load_queries(self.testQueries, self.feature_count)
    # NOTE(review): pickle.load is only safe on trusted files — confirm
    # self.rankerPath is always produced by this project.
    with open(self.rankerPath) as ranker_file:
        ranker = pickle.load(ranker_file)
    max_docs = 100  # max number of docs taken from each ranking
    best_ranker = queryFeatures()
    print("Loading training objects")
    for query in training_queries:
        ranker.init_ranking(query)
        doc_ids = ranker.get_ranking()
        for rank, doc_id in enumerate(doc_ids):
            # Original used `if iter > max: break` with increment-before-check,
            # which let 101 docs through; cap at exactly max_docs.
            if rank >= max_docs:
                break
            features = query.get_feature_vector(doc_id)
            best_ranker.add(query.get_qid(), features)
    # Mirror Train(), which creates its output directory before dumping.
    if not os.path.exists("QueryData"):
        os.makedirs("QueryData")
    with open("QueryData/" + self.dataset + ".data", "wb") as out_file:
        pickle.dump(best_ranker, out_file)
def Train(self):
    """Train an SVM that maps a document's feature vector to the cluster of
    its query, and pickle the classifier to Classifier/<name>.data, where
    <name> is the cluster-data filename up to its first '.'.

    Reads: self.clusterDataPath, self.testQueries, self.rankerPath.
    Side effects: creates the Classifier/ directory if missing and writes one
    pickle file into it.
    """
    print("Loading Data")
    # NOTE(review): pickle.load is only safe on trusted files — confirm these
    # paths are always produced by this project.
    with open(self.clusterDataPath, "rb") as cluster_file:
        clusterData = pickle.load(cluster_file)
    # Feature dimensionality is taken from the first ranker of cluster 0.
    feature_count = len(clusterData.clusterToRanker[0][0])
    training_queries = queryClass.load_queries(self.testQueries, feature_count)
    with open(self.rankerPath) as ranker_file:
        ranker = pickle.load(ranker_file)
    X = []  # document feature vectors
    Y = []  # cluster label of the document's query
    max_docs = 100  # max number of docs taken from each ranking
    print("Loading training objects")
    for qid in clusterData.queryToCluster:
        query = training_queries.get_query(qid)
        ranker.init_ranking(query)
        doc_ids = ranker.get_ranking()
        for rank, doc_id in enumerate(doc_ids):
            # Original used `if iter > max: break` with increment-after-use,
            # which let 101 docs through; cap at exactly max_docs.
            if rank >= max_docs:
                break
            X.append(query.get_feature_vector(doc_id))
            Y.append(clusterData.queryToCluster[qid][0])
    X = np.array(X)
    Y = np.array(Y)
    print("Training")
    clf = svm.SVC()
    clf.fit(X, Y)
    if not os.path.exists("Classifier"):
        os.makedirs("Classifier")
    # Keep the original naming rule: basename truncated at the FIRST dot
    # (e.g. "set.cluster.data" -> "set"), so existing files keep their names.
    name = os.path.basename(self.clusterDataPath).split('.')[0]
    with open("Classifier/" + name + ".data", "wb") as out_file:
        pickle.dump(clf, out_file)