Example #1
def PerformCV(qid_file, diff_feat_dir, feat_file, true_labels_file, noisy_labels_dir, batch_size, count_annts):
#"""
#diff_feat_dir: directory where the diff features are stored
#noisy_labels_dir: directory where the noisy labels are stored
#true_labels_file: the true labels file used for test set evaluation
#qid_file: the qid file location
#noisy_labels_dir: directory containing noisy preferences corresponding to
#features in the diff_feat_dir
#count_annts: number of annotators
#
#Example values:
#qid_file= '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/qids'
#diff_feat_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_features/labels/'
#feat_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/features'
#true_labels_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/labels'
#noisy_labels_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_labels_pairwise'
#batch_size=1
#count_annts=6
#"""

   qids = numpy.genfromtxt(qid_file,dtype='int')
   qids_unique = numpy.unique(qids)

   features = numpy.genfromtxt(feat_file,delimiter=',')
   labels = numpy.genfromtxt(true_labels_file,delimiter=',')

   if numpy.remainder(len(qids_unique),batch_size):
      print "Please provide a split that divides number of unique qids"
      return

   num_batches = len(qids_unique)/batch_size
   mean_result_storage = numpy.zeros((4,5+count_annts))
   # Columns: 0=True baseline, 1=EM, 2=EMRelEst, 3=Borda count, 4=Majority vote, 5..=one per annotator
   # Rows 0-1: correct pairwise identification (first for test, second for dev)
   # Rows 2-3: Spearman correlation (first for test, second for dev)
   
   for i in range(num_batches):
      # Determine the qids in test, dev and train sets
      test_id = i 
      test_batch_qids = qids_unique[numpy.arange(batch_size*test_id,batch_size*(test_id+1))]

      dev_id = numpy.remainder(i+1,num_batches)
      dev_batch_qids = qids_unique[numpy.arange(batch_size*dev_id,batch_size*(dev_id+1))]
      
      train_batch_qids = numpy.setdiff1d(qids_unique,numpy.union1d(test_batch_qids,dev_batch_qids))  
   
      # find the features and labels for the train and the dev set
      test_features = features[numpy.in1d(qids,test_batch_qids).T,:] 
      test_labels = numpy.matrix(labels[numpy.in1d(qids,test_batch_qids)]).T
      test_diff_features = numpy.empty([0,test_features.shape[1]])
      for test_batch_qid in test_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(test_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         test_diff_features = numpy.vstack((test_diff_features,feature_diff))

      dev_features = features[numpy.in1d(qids,dev_batch_qids),:] 
      dev_labels = numpy.matrix(labels[numpy.in1d(qids,dev_batch_qids)]).T
      dev_diff_features = numpy.empty([0,dev_features.shape[1]])
      for dev_batch_qid in dev_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(dev_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         dev_diff_features = numpy.vstack((dev_diff_features,feature_diff))

      # get all train set features together
      train_diff_features = numpy.empty([0,test_features.shape[1]])
      annt_labels = numpy.empty([count_annts,0]).tolist()
      for train_batch_qid in train_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(train_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         train_diff_features = numpy.vstack((train_diff_features,feature_diff))

         # getting the labels on train set from different annotators
         for annt_id in range(count_annts):
            cur_annt_labels = annt_labels[annt_id]
            annt_labels_for_qid_file = noisy_labels_dir + '/' + str(train_batch_qid) + '.noisy_labels' + str(annt_id+1)
            annt_labels_for_qid = numpy.genfromtxt(annt_labels_for_qid_file)
            cur_annt_labels = numpy.hstack((cur_annt_labels,annt_labels_for_qid))
            annt_labels[annt_id] = cur_annt_labels

      print annt_labels[0].shape

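      # Note that the annotator labels loaded above are replaced below with
      # random binary labels (roughly 70% ones), so this run trains on
      # synthetic noise rather than the files from noisy_labels_dir.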
      for annt_id in range(count_annts):
         annt_labels[annt_id] = numpy.ravel(numpy.random.uniform(0,1,size=(train_diff_features.shape[0],1)) > .3)*1

      print annt_labels[0].shape

      ext_diff_feats = numpy.hstack((train_diff_features,numpy.ones((train_diff_features.shape[0],1))))
      max_iter = 20
      w,k = TrainEM.TrainModel(ext_diff_feats,annt_labels,max_iter)
      print numpy.mean(k>0.5)
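      # The EM-trained w is discarded: only the inferred pairwise labels k are
      # kept, and the ranker below is retrained from scratch on 1*(k > 0.5).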
      w = .01*numpy.ones((1,1+train_diff_features.shape[1]))
      n_epochs, learning_rate, lambda_w = 2800, .01, .001

      last_dev_result = 0
      for train_epoch in range(n_epochs): 
         w = SVRankerSoft.svr_optimization(train_diff_features,1*(k>0.5),w,.01,1,.001)      
         cur_dev_result = GetResults(w,dev_diff_features)
         delta_performance = cur_dev_result - last_dev_result
         if delta_performance < 0:  # stop if the dev result got worse
            print 'break at iter ',train_epoch
            break 
         last_dev_result = cur_dev_result

      print 'Basic multiple annotator Test Results:'
      mean_result_storage[2,1] += PrintResultsStats(w,test_features,test_labels)
      mean_result_storage[0,1] += PrintResults(w,test_diff_features) 
 
      print 'Basic multiple annotator Dev Results:'
      mean_result_storage[3,1] += PrintResultsStats(w,dev_features,dev_labels)
      mean_result_storage[1,1] += PrintResults(w,dev_diff_features) 

      print 'Correct identifications on train set: %f' %(numpy.mean(k>.5))

      print '-----------------------------'
      print ''

#      # Training model using TrainEMRelEst function
#      w,k = TrainEMRelEst.TrainModel(ext_diff_feats,annt_labels,max_iter)
#      w = SVRankerSoft.svr_optimization(train_diff_features,numpy.around(k),w,.02,2000,.001)      
#      
#      print 'Rel est multiple annotator Test Results:'
#      mean_result_storage[2,2] += PrintResultsStats(w,test_features,test_labels)
#      mean_result_storage[0,2] += PrintResults(w,test_diff_features) 
# 
#      print 'Rel est multiple annotator Dev Results:'
#      mean_result_storage[3,2] += PrintResultsStats(w,dev_features,dev_labels)
#      mean_result_storage[1,2] += PrintResults(w,dev_diff_features) 
#
#      print 'Correct identifications on train set: %f' %(numpy.mean(k>.5))
#
#      print '-----------------------------'
#      print ''

      # Getting results using majority vote 
      majority_vote = ((numpy.mean(numpy.matrix(annt_labels),axis=0) > .5)*1).T
      w = .01*numpy.ones((1,1+train_diff_features.shape[1]))
      #w = SVRankerSoft.svr_optimization(train_diff_features,majority_vote,w,.02,2000,.001)      
      
      last_dev_result = 0
      for train_epoch in range(n_epochs): 
         w = SVRankerSoft.svr_optimization(train_diff_features,majority_vote,w,.01,1,.001)      
         cur_dev_result = GetResults(w,dev_diff_features)
         delta_performance = cur_dev_result - last_dev_result
         if delta_performance < 0:  # stop if the dev result got worse
            print 'break at iter ',train_epoch
            break 
         last_dev_result = cur_dev_result
      print 'Majority vote Test Results:'
      
      mean_result_storage[2,4] += PrintResultsStats(w,test_features,test_labels)
      mean_result_storage[0,4] += PrintResults(w,test_diff_features) 
  
      print 'Majority vote Dev Results:'
      mean_result_storage[3,4] += PrintResultsStats(w,dev_features,dev_labels)
      mean_result_storage[1,4] += PrintResults(w,dev_diff_features) 
 
      print 'Correct identifications on train set: %f' %(numpy.mean(majority_vote))
      print '-----------------------------'
      print ''
      
      # Getting results using true labels during training
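      # (All-ones labels stand in for the true ordering here, since the diff
      # features were built from the true-label direction in diff_feat_dir/labels.)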
      w = .01*numpy.ones((1,1+train_diff_features.shape[1]))
      print 'training true baseline model for iter ... %d' % (i)
      n_epochs, learning_rate, lambda_w = 2000, .01, .001
      #w = SVRanker3.svr_optimization(train_diff_features,w,learning_rate,n_epochs,lambda_w)
      w = SVRankerSoft.svr_optimization(train_diff_features,numpy.ones(majority_vote.shape),w,learning_rate,n_epochs,lambda_w)
   
      print 'True Baseline Test Results:'
      mean_result_storage[2,0] += PrintResultsStats(w,test_features,test_labels)
      mean_result_storage[0,0] += PrintResults(w,test_diff_features) 
  
      print 'True Baseline Dev Results:'
      mean_result_storage[3,0] += PrintResultsStats(w,dev_features,dev_labels)
      mean_result_storage[1,0] += PrintResults(w,dev_diff_features) 
 
      print '-----------------------------'
      print ''

      print 'Getting results for each annotator'
      # Getting results on each annotator
      w_borda = numpy.zeros((1,1+train_diff_features.shape[1]))
      for noisy_annt_id in range(count_annts):
         print 'at noisy annotor id: %d' %(noisy_annt_id)
#         train_diff_features = numpy.empty([0,test_features.shape[1]])
#         for train_batch_qid in train_batch_qids:
#            feature_diff_file = diff_feat_dir + '/noisy_labels' + str(noisy_annt_id+1) + '/' + str(int(train_batch_qid)) + '.features'
#            feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
#            train_diff_features = numpy.vstack((train_diff_features,feature_diff)) 
         cur_annt_labels = numpy.matrix(annt_labels[noisy_annt_id]).T
         w = .01*numpy.ones((1,1+train_diff_features.shape[1])) # initial w
         w = SVRankerSoft.svr_optimization(train_diff_features,cur_annt_labels,w,learning_rate,n_epochs,lambda_w)
  
         print 'Annotator %d Test Results:' %(noisy_annt_id)
         mean_result_storage[2,5+noisy_annt_id] += PrintResultsStats(w,test_features,test_labels)
         mean_result_storage[0,5+noisy_annt_id] += PrintResults(w,test_diff_features) 
  
         print 'Annotator %d Dev Results:' %(noisy_annt_id)
         mean_result_storage[3,5+noisy_annt_id] += PrintResultsStats(w,dev_features,dev_labels)
         mean_result_storage[1,5+noisy_annt_id] += PrintResults(w,dev_diff_features) 

         w_borda = w_borda + w/numpy.linalg.norm(w,2) 
         print 'Correct identifications on train set: %f' %(numpy.mean(cur_annt_labels))

      print 'Borda count Test Results:' 
      mean_result_storage[2,3] += PrintResultsStats(w_borda,test_features,test_labels)
      mean_result_storage[0,3] += PrintResults(w_borda,test_diff_features) 
  
      print 'Borda count Dev Results:' 
      mean_result_storage[3,3] += PrintResultsStats(w_borda,dev_features,dev_labels)
      mean_result_storage[1,3] += PrintResults(w_borda,dev_diff_features) 


      print 'Running mean of performances:'  
      print mean_result_storage 
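
GetResults, PrintResults and PrintResultsStats are helper functions that are not shown in this listing. Below is a minimal sketch of what GetResults might compute, assuming w is a row vector with a trailing bias weight and each row of the diff-feature matrices is a preferred-minus-unpreferred difference; the body illustrates pairwise-identification accuracy and is not the actual implementation.

import numpy

def GetResults(w, diff_features):
   # Append a bias column to match the (1, 1 + n_features) weight vector,
   # score every feature difference, and return the fraction of pairs
   # scored in the preferred direction (pairwise identification accuracy).
   ext = numpy.hstack((diff_features, numpy.ones((diff_features.shape[0], 1))))
   scores = ext.dot(numpy.asarray(w).ravel())
   return numpy.mean(scores > 0)

PrintResults presumably reports the same pairwise accuracy and returns it, since its return value is accumulated into mean_result_storage.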
Example #2
def TrainModel(ext_diff_feats,annt_comparison_labels,max_iter=100):
   """
   ext_diff_feats: feature differences, extended with a column of ones
   annt_comparison_labels: per-annotator comparison labels (0/1): 1 if the annotator judged x_g > x_l, 0 if x_g < x_l
   """  
   
   N = ext_diff_feats.shape[0] # number of data point comparisons  
   R = len(annt_comparison_labels) # number of annotators
   D = ext_diff_feats.shape[1] # feature dimensionality
 
   # Initialization
   k = InitializeK(N) 
   w = InitializeW(D)
   A = numpy.empty((R,0)).tolist()
   for i in range(R):
      A[i] = InitializeA(N)


   convergence_flag = 1
   iter_counter = 0
   while convergence_flag:
      iter_counter = iter_counter + 1 
      
      # E step. Estimating k
      model_probs = SigmoidProb(ext_diff_feats, w)
      prod_probs_E1 = model_probs # probability that assumed diff is correct 
      prod_probs_E0 = numpy.ones(model_probs.shape) - model_probs # probability that assumed diff is incorrect
      for i in range(R):
         A_cur = A[i]
         cur_annt_labels = annt_comparison_labels[i]       
         cur_label_mat = numpy.vstack((numpy.logical_not(cur_annt_labels),cur_annt_labels))         
 
         # For E1: if an annotator said 0 the flip probability is multiplied in, otherwise the keep probability
         cur_annt_probs_E1 = numpy.matrix(numpy.sum(numpy.multiply(A_cur,cur_label_mat),axis=0))
         prod_probs_E1 = numpy.multiply(prod_probs_E1,cur_annt_probs_E1.T)

         # For E0: if an annotator said 1 the flip probability is multiplied in, otherwise the keep probability
         cur_annt_probs_E0 = numpy.matrix(numpy.sum(numpy.multiply(A_cur,numpy.logical_not(cur_label_mat)),axis=0))
         prod_probs_E0 = numpy.multiply(prod_probs_E0,cur_annt_probs_E0.T)

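      # Per-pair posterior that the assumed preference direction is correct:
      # k_n = sigma(w.x_n) * prod_r P(y_rn | correct)
      #       / [ that + (1 - sigma(w.x_n)) * prod_r P(y_rn | incorrect) ]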
      k_term1 = prod_probs_E1
      k_term2 = prod_probs_E1+prod_probs_E0 #+ .001*numpy.ones(prod_probs_E1.shape)
      k = numpy.divide(k_term1,k_term2)

      # M step. 
      # Estimating w 
      diff_feats = ext_diff_feats[:,:-1] # unfortunately ones are appended again in SVRankerSoft 
      learning_rate = 0.02
      n_epochs = 20
      lambda_w = .001  
      w = SVRankerSoft.svr_optimization(diff_feats,k,w,learning_rate,n_epochs,lambda_w)

      # Estimating A's
      for i in range(R):
         cur_annt_labels = annt_comparison_labels[i]     
         A[i] = ComputeA(k,cur_annt_labels,ext_diff_feats) 
      if iter_counter > max_iter:
         convergence_flag = 0

   print 'Finished training'      
   for i in range(R):
      print numpy.mean(A[i], axis=1)

   return w,k
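
SigmoidProb, InitializeK, InitializeW, InitializeA and ComputeA are defined elsewhere. A minimal sketch of SigmoidProb, consistent with how its output is used above (an N x 1 matrix of sigma(x_n . w) values), is given below; it is an assumption about the helper, not its actual implementation.

import numpy

def SigmoidProb(ext_diff_feats, w):
   # Probability under the current linear ranker that each assumed preference
   # direction is correct: sigma(x_n . w) for every extended difference row,
   # returned as an N x 1 matrix so it can be multiplied elementwise with the
   # per-annotator probability columns in the E step.
   scores = numpy.asarray(ext_diff_feats).dot(numpy.asarray(w).ravel())
   return numpy.matrix(1.0 / (1.0 + numpy.exp(-scores))).T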
Example #3
def PerformCV(qid_file, diff_feat_dir, feat_file, true_labels_file, noisy_labels_dir, batch_size, count_annts):
#"""
#diff_feat_dir: directory where the diff features are stored
#noisy_labels_dir: directory where the noisy labels are stored
#true_labels_file: the true labels file used for test set evaluation
#qid_file: the qid file location
#noisy_labels_dir: directory containing noisy preferences corresponding to
#features in the diff_feat_dir
#count_annts: number of annotators
#
#Example values:
#qid_file= '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/qids'
#diff_feat_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_features/labels/'
#feat_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/features'
#true_labels_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/labels'
#noisy_labels_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_labels_pairwise'
#batch_size=1
#count_annts=6
#"""

   qids = numpy.genfromtxt(qid_file,dtype='int')
   qids_unique = numpy.unique(qids)

   features = numpy.genfromtxt(feat_file,delimiter=',')
   labels = numpy.genfromtxt(true_labels_file,delimiter=',')

   if numpy.remainder(len(qids_unique),batch_size):
      print "Please provide a split that divides number of unique qids"
      return

   num_batches = len(qids_unique)/batch_size
   mean_result_storage = numpy.zeros((4,5+count_annts))
   # Columns: 0=True baseline, 1=EM, 2=EMRelEst, 3=Borda count, 4=Majority vote, 5..=one per annotator
   # Rows 0-1: correct pairwise identification (first for test, second for dev)
   # Rows 2-3: Spearman correlation (first for test, second for dev)
   
   for i in range(num_batches):
      # Determine the qids in test, dev and train sets
      test_id = i 
      test_batch_qids = qids_unique[numpy.arange(batch_size*test_id,batch_size*(test_id+1))]

      dev_id = numpy.remainder(i+1,num_batches)
      dev_batch_qids = qids_unique[numpy.arange(batch_size*dev_id,batch_size*(dev_id+1))]
      
      train_batch_qids = numpy.setdiff1d(qids_unique,numpy.union1d(test_batch_qids,dev_batch_qids))  
   
      # find the features and labels for the train and the dev set
      test_features = features[numpy.in1d(qids,test_batch_qids).T,:] 
      test_labels = numpy.matrix(labels[numpy.in1d(qids,test_batch_qids)]).T
      test_diff_features = numpy.empty([0,test_features.shape[1]])
      for test_batch_qid in test_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(test_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         test_diff_features = numpy.vstack((test_diff_features,feature_diff))

      dev_features = features[numpy.in1d(qids,dev_batch_qids),:] 
      dev_labels = numpy.matrix(labels[numpy.in1d(qids,dev_batch_qids)]).T
      dev_diff_features = numpy.empty([0,dev_features.shape[1]])
      for dev_batch_qid in dev_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(dev_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         dev_diff_features = numpy.vstack((dev_diff_features,feature_diff))

      # get all train set features together
      train_diff_features = numpy.empty([0,test_features.shape[1]])
      annt_labels = numpy.empty([count_annts,0]).tolist()
      for train_batch_qid in train_batch_qids:
         feature_diff_file = diff_feat_dir + '/labels/' + str(int(train_batch_qid)) + '.features' 
         feature_diff = numpy.genfromtxt(feature_diff_file,delimiter=',')
         train_diff_features = numpy.vstack((train_diff_features,feature_diff))

         # getting the labels on train set from different annotators
         for annt_id in range(count_annts):
            cur_annt_labels = annt_labels[annt_id]
            annt_labels_for_qid_file = noisy_labels_dir + '/' + str(train_batch_qid) + '.noisy_labels' + str(annt_id+1)
            annt_labels_for_qid = numpy.genfromtxt(annt_labels_for_qid_file)
            cur_annt_labels = numpy.hstack((cur_annt_labels,annt_labels_for_qid))
            annt_labels[annt_id] = cur_annt_labels

      ext_diff_feats = numpy.hstack((train_diff_features,numpy.ones((train_diff_features.shape[0],1))))
      
      # Label-noise sensitivity check: train with random pairwise labels of varying correctness instead of the true labels
      for check_id in range(10):
         w = .01*numpy.ones((1,1+train_diff_features.shape[1]))
         print 'training true baseline model for iter ... %d' % (i)
         n_epochs, learning_rate, lambda_w = 2000, .01, .001
         random_scores = (numpy.random.uniform(0,1,size=(train_diff_features.shape[0],1)) > (.05*check_id))*1
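         # check_id sweeps the label noise: the expected fraction of 1-labels
         # supplied to the ranker is 1 - 0.05*check_id (from 1.0 down to 0.55).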
         print 'correct supplied: ', numpy.mean(random_scores)
         w = SVRankerSoft.svr_optimization(train_diff_features,random_scores,w,learning_rate,n_epochs,lambda_w)

         print 'True Baseline Test Results:'
         mean_result_storage[2,0] += PrintResultsStats(w,test_features,test_labels)
         mean_result_storage[0,0] += PrintResults(w,test_diff_features) 

         print 'True Baseline Dev Results:'
         mean_result_storage[3,0] += PrintResultsStats(w,dev_features,dev_labels)
         mean_result_storage[1,0] += PrintResults(w,dev_diff_features) 

         print '-----------------------------'
         print ''
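
PrintResultsStats is also not shown in this listing. Given the storage-layout comment above (rows 2-3 of mean_result_storage hold Spearman correlations), a plausible sketch follows; the Spearman-based behaviour is assumed rather than taken from the source.

import numpy
from scipy.stats import spearmanr

def PrintResultsStats(w, features, labels):
   # Score every item with the linear ranker (bias column appended) and report
   # the Spearman correlation between the predicted scores and the true labels.
   ext = numpy.hstack((features, numpy.ones((features.shape[0], 1))))
   scores = ext.dot(numpy.asarray(w).ravel())
   rho = spearmanr(scores, numpy.asarray(labels).ravel())[0]
   print 'Spearman correlation: %f' % rho
   return rho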