def PerformCV(qid_file, diff_feat_dir, feat_file, labels_file_dir, cnt_noisy_labels_file, batch_size=5):
    """
    Cross-validated training and evaluation over query-id (qid) batches.

    Given the list of qids, training features are taken from the diff_feat
    folder per noisy annotator; test and dev features come from feat_file.
    A model is trained per annotator on the diff_feat_dir files and evaluated
    on the feat_file features (Kendall tau / Spearman / Pearson), both per
    annotator and as an unweighted (Borda-count) fusion of annotator scores.

    qid_file              -- path to the qid list (one qid per row)
    diff_feat_dir         -- directory holding 'noisy_labels<K>/<qid>.features' diff files
    feat_file             -- CSV of per-item features used for test/dev scoring
    labels_file_dir       -- CSV of true labels aligned with feat_file rows
    cnt_noisy_labels_file -- number of noisy annotators (one model trained per annotator)
    batch_size            -- qids per CV fold; must divide the number of unique qids

    Prints results; returns None. Make sure batch_size divides the number of
    unique qids, otherwise the function prints a message and returns early.
    """
    qids = numpy.genfromtxt(qid_file)
    qids_unique = numpy.unique(qids)
    features = numpy.genfromtxt(feat_file, delimiter=',')

    # BUG FIX: the original referenced the undefined name `labels_file`
    # (the parameter is `labels_file_dir`), causing a NameError at runtime.
    true_labels = numpy.genfromtxt(labels_file_dir, delimiter=',')

    if numpy.remainder(len(qids_unique), batch_size):
        print("Please provide a split that divides number of unique qids")
        return

    # Floor division: `/` yields a float under Python 3, which breaks range().
    num_batches = len(qids_unique) // batch_size

    all_test_scores, all_dev_scores = numpy.empty([0, 1]), numpy.empty([0, 1])
    all_test_labels, all_dev_labels = numpy.empty([0, 1]), numpy.empty([0, 1])

    for i in range(num_batches):
        # Determine the qids in test, dev and train sets.  Fold i is the test
        # batch; the next fold (cyclically) is dev; everything else is train.
        test_id = i
        test_batch_qids = qids_unique[numpy.arange(batch_size * test_id, batch_size * (test_id + 1))]
        dev_id = numpy.remainder(i + 1, num_batches)
        dev_batch_qids = qids_unique[numpy.arange(batch_size * dev_id, batch_size * (dev_id + 1))]
        train_batch_qids = numpy.setdiff1d(qids_unique, numpy.union1d(test_batch_qids, dev_batch_qids))

        # Slice out features/labels for the test and the dev sets.
        test_features = features[numpy.in1d(qids, test_batch_qids).T, :]
        test_labels = numpy.matrix(true_labels[numpy.in1d(qids, test_batch_qids)]).T
        dev_features = features[numpy.in1d(qids, dev_batch_qids), :]
        dev_labels = numpy.matrix(true_labels[numpy.in1d(qids, dev_batch_qids)]).T

        # Train one ranker per noisy annotator on that annotator's diff features.
        w_per_annt = numpy.empty((cnt_noisy_labels_file, 0)).tolist()
        print('training model for iter ... %d' % (i))
        for noisy_annt_id in range(cnt_noisy_labels_file):
            print('at noisy annotor id: %d' % (noisy_annt_id))
            train_diff_features = numpy.empty([0, test_features.shape[1]])
            for train_batch_qid in train_batch_qids:
                feature_diff_file = diff_feat_dir + '/noisy_labels' + str(noisy_annt_id + 1) + '/' + str(int(train_batch_qid)) + '.features'
                feature_diff = numpy.genfromtxt(feature_diff_file, delimiter=',')
                train_diff_features = numpy.vstack((train_diff_features, feature_diff))
            w = numpy.ones((1, 1 + train_diff_features.shape[1]))  # initial w (incl. bias)
            # Renamed from `iter` — it shadowed the builtin.  Single restart kept
            # as in the original (loop runs exactly once).
            for restart in range(1):
                n_epochs, learning_rate, lambda_w = 2400, .02, .001
                w = SVRanker3.svr_optimization(train_diff_features, w, learning_rate, n_epochs, lambda_w)
            w_per_annt[noisy_annt_id] = w

        print('Model Trained.')
        print('Results:')

        # Unweighted fusion (Borda count) of per-annotator scores on test/dev.
        # A column of ones is appended to match the bias term in w.
        test_features_ext = numpy.hstack((test_features, numpy.ones((test_features.shape[0], 1))))
        test_scores = 0
        dev_features_ext = numpy.hstack((dev_features, numpy.ones((dev_features.shape[0], 1))))
        dev_scores = 0
        for noisy_annt_id in range(cnt_noisy_labels_file):
            cur_test_scores = numpy.dot(test_features_ext, w_per_annt[noisy_annt_id].T)
            print('Annotator specific results for annotator %d' % (noisy_annt_id))
            print('TEST: Kendall Tau: %f, Spearman correlation: %f, Pearson correlation: %f'
                  % (stats.kendalltau(cur_test_scores, test_labels)[0],
                     stats.spearmanr(cur_test_scores, test_labels)[0],
                     numpy.corrcoef(cur_test_scores.T, test_labels.T)[0, 1]))
            test_scores += numpy.dot(test_features_ext, w_per_annt[noisy_annt_id].T)

            cur_dev_scores = numpy.dot(dev_features_ext, w_per_annt[noisy_annt_id].T)
            print('DEV: Kendall Tau: %f, Spearman correlation: %f, Pearson correlation: %f'
                  % (stats.kendalltau(cur_dev_scores, dev_labels)[0],
                     stats.spearmanr(cur_dev_scores, dev_labels)[0],
                     numpy.corrcoef(cur_dev_scores.T, dev_labels.T)[0, 1]))
            dev_scores += numpy.dot(dev_features_ext, w_per_annt[noisy_annt_id].T)
            print('')

        print('Borda count results')
        print('TEST: Kendall Tau: %f, Spearman correlation: %f, Pearson correlation: %f'
              % (stats.kendalltau(test_scores, test_labels)[0],
                 stats.spearmanr(test_scores, test_labels)[0],
                 numpy.corrcoef(test_scores.T, test_labels.T)[0, 1]))
        print('DEV: Kendall Tau: %f, Spearman correlation: %f, Pearson correlation: %f'
              % (stats.kendalltau(dev_scores, dev_labels)[0],
                 stats.spearmanr(dev_scores, dev_labels)[0],
                 numpy.corrcoef(dev_scores.T, dev_labels.T)[0, 1]))
def TrainOnAll(qid_file, diff_feat_dir, feat_file, true_labels_file, noisy_labels_dir, batch_size, count_annts):
    """
    Train on ALL qids (no held-out split) and print train-set diagnostics for
    EM, EM-with-reliability-estimation, majority vote, each individual
    annotator, and a Borda-count fusion of the annotator models.

    Restored from the commented-out header in the original:
    diff_feat_dir    -- directory where the diff features are stored
    noisy_labels_dir -- directory containing noisy preferences corresponding to
                        features in the diff_feat_dir
    true_labels_file -- the true labels file used for test set evaluation
    qid_file         -- the qid file location
    count_annts      -- number of annotators

    Example values:
      qid_file         = '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/qids'
      diff_feat_dir    = '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_features/labels/'
      feat_file        = '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/features'
      true_labels_file = '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/labels'
      noisy_labels_dir = '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_labels_pairwise'
      batch_size       = 1
      count_annts      = 6

    Prints results; returns None.
    """
    qids = numpy.genfromtxt(qid_file, dtype='int')
    qids_unique = numpy.unique(qids)
    features = numpy.genfromtxt(feat_file, delimiter=',')
    labels = numpy.genfromtxt(true_labels_file, delimiter=',')

    if numpy.remainder(len(qids_unique), batch_size):
        print("Please provide a split that divides number of unique qids")
        return

    # NOTE: the original also allocated an unused `mean_result_storage`
    # results matrix here; it was dead code and has been removed.
    num_batches = 1  # single "batch" covering the entire training set
    for i in range(num_batches):
        train_batch_qids = qids_unique

        # Gather all train-set diff features, and the per-annotator noisy
        # pairwise labels aligned with them.
        train_diff_features = numpy.empty([0, features.shape[1]])
        annt_labels = numpy.empty([count_annts, 0]).tolist()
        for train_batch_qid in train_batch_qids:
            feature_diff_file = diff_feat_dir + '/labels/' + str(int(train_batch_qid)) + '.features'
            feature_diff = numpy.genfromtxt(feature_diff_file, delimiter=',')
            train_diff_features = numpy.vstack((train_diff_features, feature_diff))
            # Labels on the train set from the different annotators.
            for annt_id in range(count_annts):
                cur_annt_labels = annt_labels[annt_id]
                annt_lables_for_qid_file = noisy_labels_dir + '/' + str(train_batch_qid) + '.noisy_labels' + str(annt_id + 1)
                annt_lables_for_qid = numpy.genfromtxt(annt_lables_for_qid_file)
                cur_annt_labels = numpy.hstack((cur_annt_labels, annt_lables_for_qid))
                annt_labels[annt_id] = cur_annt_labels

        # Bias column appended to the diff features for the EM trainers.
        ext_diff_feats = numpy.hstack((train_diff_features, numpy.ones((train_diff_features.shape[0], 1))))
        max_iter = 20

        # Training model using TrainEM function.
        w, k = TrainEM.TrainModel(ext_diff_feats, annt_labels, max_iter)
        print('TrainEM Results:')
        print('Correct identifications on train set: %f' % (numpy.mean(k > .5)))
        print('-----------------------------')
        print('')

        # Training model using TrainEMRelEst function (ties at .5 count half).
        w, k = TrainEMRelEst.TrainModel(ext_diff_feats, annt_labels, max_iter)
        print('TrainEMRelEst Results:')
        print('Correct identifications on train set: %f' % (numpy.mean(k > .5) + (.5 * numpy.mean(k == .5))))
        print('-----------------------------')
        print('')

        # Majority vote across annotators (ties at .5 count half).
        majority_vote = (((numpy.mean(numpy.matrix(annt_labels), axis=0) > .5) + .5 * (numpy.mean(numpy.matrix(annt_labels), axis=0) == .5)) * 1).T
        print('Majority vote Results:')
        print('Correct identifications on train set: %f' % (numpy.mean(majority_vote)))
        print('-----------------------------')
        print('')

        print('Getting results for each annotator')
        # Per-annotator models; w_borda accumulates L2-normalized weights.
        w_borda = numpy.zeros((1, 1 + train_diff_features.shape[1]))
        for noisy_annt_id in range(count_annts):
            print('at noisy annotor id: %d' % (noisy_annt_id))
            train_diff_features = numpy.empty([0, features.shape[1]])
            for train_batch_qid in train_batch_qids:
                feature_diff_file = diff_feat_dir + '/noisy_labels' + str(noisy_annt_id + 1) + '/' + str(int(train_batch_qid)) + '.features'
                feature_diff = numpy.genfromtxt(feature_diff_file, delimiter=',')
                train_diff_features = numpy.vstack((train_diff_features, feature_diff))
            w = numpy.ones((1, 1 + train_diff_features.shape[1]))  # initial w (incl. bias)
            n_epochs, learning_rate, lambda_w = 2000, .02, .001
            w = SVRanker3.svr_optimization(train_diff_features, w, learning_rate, n_epochs, lambda_w)
            print('Annotator %d results:' % (noisy_annt_id))
            PrintResults(w, train_diff_features)
            print('Annotator correct identification: %f' % numpy.mean(annt_labels[noisy_annt_id]))
            w_borda = w_borda + w / numpy.linalg.norm(w, 2)

        print('Borda count Results:')
        PrintResults(w_borda, train_diff_features)