def __task_aggregation__(self, raw_classifications, gold_standard=None):
    """Aggregate raw classifications, running IBCC only when it is needed.

    ``__ibcc_setup__`` decides whether IBCC has to be run at all (it does not
    if there was no confusion - a borderline degenerate case that still has
    to be handled) and also returns ``highest_class``, which the degenerate
    path needs.

    Args:
        raw_classifications: forwarded unchanged to ``__ibcc_setup__`` and to
            whichever analysis method ends up being called.
        gold_standard: gold-standard data forwarded to ``__ibcc_setup__``.
            ``None`` (the default) is replaced by a fresh empty dict.

    Returns:
        The result of ``__ibcc_analyze__`` when IBCC was run, otherwise the
        result of ``__degenerate_ibcc__``.
    """
    # Build the default per call: a ``{}`` default in the signature would be
    # one dict object shared (and possibly mutated) across every call.
    if gold_standard is None:
        gold_standard = {}

    run_ibcc, highest_class = self.__ibcc_setup__(raw_classifications, gold_standard)

    if run_ibcc:
        # NOTE(review): the config path is hard-coded; presumably it is
        # written by __ibcc_setup__ - confirm.
        ibcc.load_and_run_ibcc("/tmp/config.py")
        # now analyze the results
        print("not degenerate")
        return self.__ibcc_analyze__(raw_classifications)

    print("degenerate case")
    return self.__degenerate_ibcc__(raw_classifications, highest_class)
def testTable_withGold_5classes_balanced(self):
    # Runs BalancedIBCC on the table_gold5 config, then checks the output
    # size and the multi-class accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5.py'
    expected_size = (5, 5, 5)
    posterior, model = ibcc.load_and_run_ibcc(
        config_path, ibcc_class=BalancedIBCC)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
def __task_aggregation__(self,raw_classifications,gold_standard=False):
    """Aggregate raw classifications, skipping IBCC in the degenerate case.

    ``__ibcc_setup__`` reports whether IBCC needs to run (it does not when
    there was no confusion) together with the highest class, which is only
    used by the degenerate path. ``gold_standard`` is accepted but not used
    by this implementation.

    Returns the result of ``__ibcc_analyze__`` when IBCC was run, otherwise
    the result of ``__degenerate_ibcc__``.
    """
    needs_ibcc, top_class = self.__ibcc_setup__(raw_classifications)

    if not needs_ibcc:
        print("degenerate case")
        return self.__degenerate_ibcc__(raw_classifications, top_class)

    # run ibcc, then analyze the results
    ibcc.load_and_run_ibcc("/tmp/config.py")
    print("not degenerate")
    return self.__ibcc_analyze__(raw_classifications)
def test_Table_lowerbound_5classes_cbcc(self):
    # Runs CBCC on the table_gold5_lowerbound config; the third entry of the
    # expected output size is taken from the combiner's own cluster count.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5_lowerbound.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=CBCC)
    expected_size = (5, 5, model.nclusters)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 0.99)
def testTable_withGold_5classes_opt(self):
    # Runs the default combiner (ibcc_class=None) on the table_gold5 config
    # with hyperparameter optimisation switched on.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5.py'
    expected_size = (5, 5, 5)
    posterior, model = ibcc.load_and_run_ibcc(
        config_path, ibcc_class=None, optimise_hyperparams=True)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
def testTable_shortGold(self):
    # Runs the default combiner (ibcc_class=None) on the table_shortgold
    # config and checks output size and accuracy.
    # Author's note: the gold labels are shorter than the number of
    # crowd-labelled data points.
    config_path = './config/table_shortgold.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model)
    check_accuracy(posterior, 0.95)
def test_Table_lowerbound_5classes_dyn(self):
    # Runs DynIBCC on the table_gold5_lowerbound config and checks output
    # size and multi-class accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5_lowerbound.py'
    expected_size = (5, 5, 500)
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=DynIBCC)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
def test_SparseList_lowerbound_dyn(self):
    # Runs DynIBCC on the sparse_gold_lowerbound config and checks output
    # size and accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/sparse_gold_lowerbound.py'
    expected_size = (2, 2, 375)
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=DynIBCC)
    check_outputsize(posterior, model, expected_size)
    check_accuracy(posterior, 0.93)
def test_Table_withGold_dyn(self):
    # Runs DynIBCC on the table_gold config and checks output size and
    # accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold.py'
    expected_size = (2, 2, 500)
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=DynIBCC)
    check_outputsize(posterior, model, expected_size)
    check_accuracy(posterior, 0.94)
def test_Table_withGold_dyn(self):
    # DynIBCC run against the table_gold config, followed by the output-size
    # and accuracy checks.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=DynIBCC)
    expected_size = (2, 2, 500)
    check_outputsize(posterior, model, expected_size)
    check_accuracy(posterior, 0.94)
def __classify__(self,subject_ids,gold_standard=False): self.results = {} # might be over doing the elections analogy but can't think of a better way to describe things # ridings is a list of tuples (subject_ids, cluster_center) so we can match up the results from IBCC # if no clustering was involved (so only one classification per subject_id) then cluster_center should # be None ridings = [] # ridings_dict stores the "ridings" by subject id - that way, we don't need to search through all # of the ridings, everytime we want to find the "elections" for a given subject_id ridings_dict = {} # candidates = [] users = [] agreement = 0 nonagreement = 0 notenough = 0 # all_elections = {} # self.create_configfile(len(self.species)) nclasses = len(self.species) nu0 = [100/nclasses for i in range(nclasses)] confusion_matrix = [[0.2 for i in range(nclasses)] for j in range(nclasses)] # classifer = ibcc.IBCC(nclasses=nclasses,nscores=nclasses,alpha0=confusion_matrix,nu0=nu0) priors = {s:1 for s in self.candidates} # confusion = [[1 for i in self.candidates] for j in self.candidates] # for i in range(nclasses): # confusion[i][i] = 20 with open(self.base_directory+"Databases/plankton_ibcc.csv",'wb') as f: f.write("a,b,c\n") for subject_id in subject_ids: # print "-----" # print self.project.gold_annotations[subject_id] self.results[subject_id] = [] # cluster centers only make sense if we have a clustering setup - otherwise they should just be empty cluster_centers,polls = self.project.__get_classifications__(subject_id,cluster_alg=self.cluster_alg,gold_standard=gold_standard) for poll_index,(center,poll) in enumerate(zip(cluster_centers,polls)): print center print poll print # local_candidates = set() vote_counts = {} if len(poll) >=4: # classification_counter += 1 ridings.append((subject_id,center)) if not(subject_id in ridings_dict): ridings_dict[subject_id] = [center] else: ridings_dict[subject_id].append(center) for user,vote,pt in poll: # assert isinstance(vote,unicode) # 
local_candidates.add(vote) # use majority voting to establish priors if not(vote in vote_counts): vote_counts[vote] = 1 else: vote_counts[vote] += 1 # if not(vote in candidates): # candidates.append(vote) if not(user in users): users.append(user) # print vote,self.species[vote.lower()],pt f.write(str(users.index(user))+","+str(len(ridings)-1)+","+str(self.candidates.index(vote.lower()))+"\n") # print users.index(user),classification_counter,self.candidates.index(vote) most_votes = max(vote_counts,key=lambda x:vote_counts[x]) priors[most_votes.lower()] += 1 # now that we know what the majority vote estimate is, estimate the confusion matrix most_votes_index = self.candidates.index(most_votes.lower()) for user,vote,pt in poll: confusion_matrix[most_votes_index][self.candidates.index(vote.lower())] += 1/float(len(poll)) if len(vote_counts) ==1: agreement +=1 else: nonagreement += 1 # print local_candidates # local_candidates = tuple(sorted(list(local_candidates))) # if not(local_candidates in all_elections): # all_elections[local_candidates] = 1 # else: # all_elections[local_candidates] += 1 else: notenough +=1 # confusion_matrix = [] print "^^^^^" for i,row in enumerate(confusion_matrix): # print c confusion_matrix[i] = [int(a/min(row)) for a in row] # print print print sum(priors.values()) self.create_configfile(priors,confusion_matrix) # ibcc.runIbcc(self.base_directory+"Databases/config.py") ibcc.load_and_run_ibcc(self.base_directory+"Databases/config.py") results = {} with open(self.base_directory+"Databases/plankton_ibcc.out","rb") as f: for i,l in enumerate(f.readlines()): # print "===-----" subject_id,center = ridings[i] if not(subject_id in results): results[subject_id] = [] # print elections[i] probabilities = [float(p) for j,p in enumerate(l.split(" ")[1:])] results[subject_id].append(probabilities) # print probabilities # ibcc_most_likely = max(probabilities, key= lambda x:x[1]) # print ibcc_most_likely # print self.candidates[ibcc_most_likely[0]] # 
self.results[subject_id].append(max(vote_counts,key=lambda x:vote_counts[x])) # print all_elections # G=nx.Graph() # species_keys = self.species.keys() # G.add_nodes_from(range(len(species_keys))) # for e in all_elections.keys(): # for a,b in findsubsets(e,2): # G.add_edge(species_keys.index(a.lower()),species_keys.index(b.lower())) # # nx.draw(G) # plt.show() # print agreement,nonagreement,notenough return self.candidates,ridings_dict,results
def testTable_noGold(self):
    # Runs the default combiner (ibcc_class=None) on the table_nogold config
    # and checks output size and accuracy.
    # Author's note: the crowd labels contain some NaNs and some -1s.
    config_path = './config/table_nogold.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model)
    check_accuracy(posterior, 0.82)
def testSparseList_noGold(self):
    # Runs the default combiner (ibcc_class=None) on the sparse_nogold
    # config; accuracy is checked against the gold_mixed_verify file.
    config_path = './config/sparse_nogold.py'
    verify_file = './data/gold_mixed_verify.csv'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model, ptlength=199)
    check_accuracy(posterior, 0.82, goldfile=verify_file)
def testTable_withGold_5classes_opt(self):
    # Default combiner (ibcc_class=None) on the table_gold5 config with
    # optimise_hyperparams=True, followed by size and accuracy checks.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5.py'
    posterior, model = ibcc.load_and_run_ibcc(
        config_path, ibcc_class=None, optimise_hyperparams=True)
    expected_size = (5, 5, 5)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
def testTable_withGold_5classes_balanced(self):
    # BalancedIBCC on the table_gold5 config, followed by size and
    # multi-class accuracy checks.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/table_gold5.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=BalancedIBCC)
    expected_size = (5, 5, 5)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
def test_SparseList_lowerbound_cbcc(self):
    # Runs CBCC on the sparse_gold_lowerbound config; the third entry of the
    # expected output size is taken from the combiner's own cluster count.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/sparse_gold_lowerbound.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=CBCC)
    expected_size = (2, 2, model.nclusters)
    check_outputsize(posterior, model, expected_size)
    check_accuracy(posterior, 0.95)
def testTable_shortGoldMatrix(self):
    # Runs the default combiner (ibcc_class=None) on the table_shortgoldmat
    # config; accuracy is checked against the gold_mixed_verify file.
    # Author's note: the gold labels are shorter than the number of
    # crowd-labelled data points.
    config_path = './config/table_shortgoldmat.py'
    verify_file = './data/gold_mixed_verify.csv'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model, ptlength=199)
    check_accuracy(posterior, 0.94, goldfile=verify_file)
def testSparseList_withGold(self):
    # Runs the default combiner (ibcc_class=None) on the sparse_gold config
    # and checks output size and accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/sparse_gold.py'
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model)
    check_accuracy(posterior, 0.95)
def testSparseList_withGold_5classes(self):
    # Runs the default combiner (ibcc_class=None) on the sparse_gold5 config
    # and checks output size and multi-class accuracy.
    # Author's note: the gold labels are longer than the number of
    # crowd-labelled data points.
    config_path = './config/sparse_gold5.py'
    expected_size = (5, 5, 5)
    posterior, model = ibcc.load_and_run_ibcc(config_path, ibcc_class=None)
    check_outputsize(posterior, model, expected_size)
    check_accuracy_multi(posterior, 1)
else: data = np.zeros((self.N, self.K), dtype=float) for l in range(self.nscores): if self.table_format_flag: data += self.lnPi[j, l, self.tauidxs_test] * self.Ctest[l] else: data[self.Cobjects_test, self.Cagents_test] += self.lnPi[j, l, self.tauidxs_test] * self.Ctest[l] if not self.table_format_flag: data = data[self.testidxs,:] self.lnpCT[self.testidxs, j] = np.sum(data, 1) + self.lnkappa[j] def post_lnpi(self): if self.alpha0_tau==[]: if self.table_format_flag: self.alpha0_tau = np.tile(self.alpha0, (1,1,self.N)) self.piprior_const = np.sum(gammaln(np.sum(self.alpha0,1)) - np.sum(gammaln(self.alpha0),1)) * self.N else: self.alpha0_tau = self.alpha0[:,:,self.Cagents] self.piprior_const = np.sum(gammaln(np.sum(self.alpha0_tau,1)) - np.sum(gammaln(self.alpha0_tau),1)) return np.sum(np.sum((self.alpha0_tau-1)*self.lnPi,1)) + self.piprior_const # Loader and Runner helper functions ------------------------------------------------------------------------------- if __name__ == '__main__': if len(sys.argv)>1: configFile = sys.argv[1] else: configFile = './config/my_project.py' ibcc.load_and_run_ibcc(configFile, DynIBCC)