Exemplo n.º 1
0
    def __task_aggregation__(self, raw_classifications, gold_standard=None):
        """
        Aggregate the raw classifications, running IBCC only when it is needed.

        :param raw_classifications: handed unchanged to the setup and analysis helpers
        :param gold_standard: optional gold standard data forwarded to __ibcc_setup__;
            a fresh empty dict is used when omitted (avoids a shared mutable default)
        :return: the result of __ibcc_analyze__ when IBCC was run, otherwise the
            result of __degenerate_ibcc__
        """
        if gold_standard is None:
            gold_standard = {}

        # do we actually need to run ibcc - no if there wasn't any confusion
        # borderline degenerate case but we need to be prepared for it
        # highest_class is needed for helping the degenerate cases
        run_ibcc, highest_class = self.__ibcc_setup__(raw_classifications, gold_standard)
        print(gold_standard)
        # NOTE: the interactive "Please enter something" pause that used to follow was a
        # debugging aid whose result was never used; it blocked non-interactive runs.

        if not run_ibcc:
            print("degenerate case")
            return self.__degenerate_ibcc__(raw_classifications, highest_class)

        # run ibcc on the config file and then analyze the results
        ibcc.load_and_run_ibcc("/tmp/config.py")
        print("not degenerate")
        return self.__ibcc_analyze__(raw_classifications)
Exemplo n.º 2
0
    def __task_aggregation__(self, raw_classifications, gold_standard=None):
        """
        Decide whether IBCC has to be run for these classifications and return the
        aggregated result.

        :param raw_classifications: passed through to __ibcc_setup__ and to whichever
            of __ibcc_analyze__ / __degenerate_ibcc__ produces the result
        :param gold_standard: optional gold standard data for __ibcc_setup__; when
            omitted a new empty dict is created per call instead of sharing one default
        :return: whatever __ibcc_analyze__ (IBCC was run) or __degenerate_ibcc__
            (IBCC was not needed) returns
        """
        if gold_standard is None:
            gold_standard = {}

        # IBCC is not needed if there wasn't any confusion - a borderline degenerate
        # case, but we need to be prepared for it. highest_class is only used there.
        run_ibcc, highest_class = self.__ibcc_setup__(raw_classifications,
                                                      gold_standard)
        print(gold_standard)
        # The blocking raw_input() debugging prompt that used to sit here has been
        # removed: its return value was unused and it stalled unattended runs.

        if run_ibcc:
            ibcc.load_and_run_ibcc("/tmp/config.py")

            # now analyze the results
            print("not degenerate")
            return self.__ibcc_analyze__(raw_classifications)

        print("degenerate case")
        return self.__degenerate_ibcc__(raw_classifications, highest_class)
Exemplo n.º 3
0
 def testTable_withGold_5classes_balanced(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the BalancedIBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_gold5.py', ibcc_class=BalancedIBCC)
     expected_shape = (5, 5, 5)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy_multi(posterior, 1)
Exemplo n.º 4
0
    def __task_aggregation__(self,raw_classifications,gold_standard=False):
        """
        Aggregate the raw classifications, only invoking IBCC when it is required.

        :param raw_classifications: forwarded to the setup and analysis helpers
        :param gold_standard: accepted but not used by this implementation
        :return: result of __ibcc_analyze__ if IBCC was run, else of __degenerate_ibcc__
        """
        # IBCC can be skipped when there wasn't any confusion (borderline degenerate
        # case); top_class is what the degenerate handler needs in that situation
        needs_ibcc, top_class = self.__ibcc_setup__(raw_classifications)

        if not needs_ibcc:
            print("degenerate case")
            return self.__degenerate_ibcc__(raw_classifications, top_class)

        # regular path: run ibcc, then analyze what it produced
        ibcc.load_and_run_ibcc("/tmp/config.py")
        print("not degenerate")
        return self.__ibcc_analyze__(raw_classifications)
Exemplo n.º 5
0
 def test_Table_lowerbound_5classes_cbcc(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the CBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_gold5_lowerbound.py', ibcc_class=CBCC)
     # the third expected dimension is taken from the combiner itself
     expected_shape = (5, 5, model.nclusters)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy_multi(posterior, 0.99)
Exemplo n.º 6
0
 def testTable_withGold_5classes_opt(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; hyperparameter optimisation is switched on.
     config_path = './config/table_gold5.py'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=None,
                                               optimise_hyperparams=True)
     check_outputsize(posterior, model, (5, 5, 5))
     check_accuracy_multi(posterior, 1)
Exemplo n.º 7
0
 def testTable_shortGold(self):
     # Scenario: the gold labels are shorter than the number of crowd-labelled
     # data points.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_shortgold.py', ibcc_class=None)
     check_outputsize(posterior, model)
     check_accuracy(posterior, 0.95)
Exemplo n.º 8
0
 def test_Table_lowerbound_5classes_dyn(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the DynIBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_gold5_lowerbound.py', ibcc_class=DynIBCC)
     expected_shape = (5, 5, 500)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy_multi(posterior, 1)
Exemplo n.º 9
0
 def test_SparseList_lowerbound_dyn(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the DynIBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/sparse_gold_lowerbound.py', ibcc_class=DynIBCC)
     expected_shape = (2, 2, 375)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy(posterior, 0.93)
Exemplo n.º 10
0
 def test_Table_withGold_dyn(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the DynIBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_gold.py', ibcc_class=DynIBCC)
     expected_shape = (2, 2, 500)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy(posterior, 0.94)
Exemplo n.º 11
0
 def test_Table_withGold_dyn(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; the DynIBCC class is used for the run.
     config_path = './config/table_gold.py'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=DynIBCC)
     check_outputsize(posterior, model, (2, 2, 500))
     check_accuracy(posterior, 0.94)
Exemplo n.º 12
0
    def __classify__(self,subject_ids,gold_standard=False):
        """
        Classify the given subjects with IBCC.

        Steps performed:
          1. write one "user index,riding index,candidate index" row per vote to
             <base_directory>Databases/plankton_ibcc.csv (only polls with >= 4 votes)
          2. use the per-poll majority vote to build priors and a confusion matrix
             and pass both to self.create_configfile
          3. run IBCC on <base_directory>Databases/config.py
          4. read the probabilities back from <base_directory>Databases/plankton_ibcc.out

        :param subject_ids: iterable of subject ids to process
        :param gold_standard: forwarded unchanged to self.project.__get_classifications__
        :return: tuple (self.candidates, ridings_dict, results); ridings_dict maps a
            subject id to the list of its cluster centers, results maps a subject id to
            a list of probability lists (one per output line belonging to that subject)
        """
        # NOTE(review): every subject gets an empty list here, but the parsed
        # probabilities are stored in the local `results` that is returned - confirm
        # whether self.results was meant to be filled as well.
        self.results = {}

        # might be over doing the elections analogy but can't think of a better way to describe things
        # ridings is a list of tuples (subject_id, cluster_center) so we can match up the results from IBCC
        # if no clustering was involved (so only one classification per subject_id) then cluster_center
        # should be None
        ridings = []
        # ridings_dict stores the "ridings" by subject id - that way, we don't need to search through
        # all of the ridings every time we want to find the "elections" for a given subject_id
        ridings_dict = {}
        # user -> index in order of first appearance; a dict keeps the lookup O(1) instead of
        # the O(n) list.index() per vote (assumes user ids are hashable)
        user_index = {}

        # NOTE(review): the matrix is sized by self.species but indexed through
        # self.candidates below - presumably both have the same length; confirm.
        nclasses = len(self.species)
        confusion_matrix = [[0.2 for i in range(nclasses)] for j in range(nclasses)]
        priors = {s:1 for s in self.candidates}

        with open(self.base_directory+"Databases/plankton_ibcc.csv",'wb') as f:
            f.write("a,b,c\n")
            for subject_id in subject_ids:
                self.results[subject_id] = []

                # cluster centers only make sense if we have a clustering setup - otherwise they should just be empty
                cluster_centers,polls = self.project.__get_classifications__(subject_id,cluster_alg=self.cluster_alg,gold_standard=gold_standard)

                for center,poll in zip(cluster_centers,polls):
                    print(center)
                    print(poll)
                    print("")

                    # polls with fewer than 4 votes are not passed on to IBCC
                    if len(poll) < 4:
                        continue

                    ridings.append((subject_id,center))
                    ridings_dict.setdefault(subject_id,[]).append(center)
                    riding_index = len(ridings)-1

                    vote_counts = {}
                    for user,vote,pt in poll:
                        # tally the votes - majority voting is used to establish the priors
                        vote_counts[vote] = vote_counts.get(vote,0) + 1
                        if user not in user_index:
                            user_index[user] = len(user_index)
                        f.write(str(user_index[user])+","+str(riding_index)+","+str(self.candidates.index(vote.lower()))+"\n")

                    most_votes = max(vote_counts,key=lambda x:vote_counts[x])
                    priors[most_votes.lower()] += 1

                    # now that we know what the majority vote estimate is, estimate the confusion matrix:
                    # every vote in the poll adds an equal share to the majority vote's row
                    majority_row = confusion_matrix[self.candidates.index(most_votes.lower())]
                    share = 1/float(len(poll))
                    for user,vote,pt in poll:
                        majority_row[self.candidates.index(vote.lower())] += share

        print("^^^^^")
        # rescale every row by its smallest entry and truncate to integers
        for i,row in enumerate(confusion_matrix):
            smallest = min(row)
            confusion_matrix[i] = [int(a/smallest) for a in row]

        print("")
        print(sum(priors.values()))
        self.create_configfile(priors,confusion_matrix)

        ibcc.load_and_run_ibcc(self.base_directory+"Databases/config.py")

        results = {}
        with open(self.base_directory+"Databases/plankton_ibcc.out","rb") as f:
            # output line i is matched with riding i; the first space separated field
            # of each line is skipped, the remaining fields are parsed as floats
            for i,l in enumerate(f):
                subject_id,center = ridings[i]
                subject_results = results.setdefault(subject_id,[])
                subject_results.append([float(p) for p in l.split(" ")[1:]])

        return self.candidates,ridings_dict,results
Exemplo n.º 13
0
 def testTable_shortGold(self):
     # Scenario: the gold labels are shorter than the number of crowd-labelled
     # data points.
     config_path = './config/table_shortgold.py'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=None)
     check_outputsize(posterior, model)
     check_accuracy(posterior, 0.95)
Exemplo n.º 14
0
 def testTable_noGold(self):
     # Scenario: the crowd labels contain some NaNs and some -1s.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_nogold.py', ibcc_class=None)
     check_outputsize(posterior, model)
     check_accuracy(posterior, 0.82)
Exemplo n.º 15
0
 def testTable_noGold(self):
     # Scenario: the crowd labels contain some NaNs and some -1s.
     config_path = './config/table_nogold.py'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=None)
     check_outputsize(posterior, model)
     check_accuracy(posterior, 0.82)
Exemplo n.º 16
0
 def testSparseList_noGold(self):
     # Runs IBCC from the sparse_nogold config, then checks the output size
     # (ptlength=199) and the accuracy against gold_mixed_verify.csv.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/sparse_nogold.py', ibcc_class=None)
     check_outputsize(posterior, model, ptlength=199)
     verification_gold = './data/gold_mixed_verify.csv'
     check_accuracy(posterior, 0.82, goldfile=verification_gold)
Exemplo n.º 17
0
 def testTable_withGold_5classes_opt(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; hyperparameter optimisation is enabled for the run.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_gold5.py',
         ibcc_class=None,
         optimise_hyperparams=True)
     expected_shape = (5, 5, 5)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy_multi(posterior, 1)
Exemplo n.º 18
0
 def testTable_withGold_5classes_balanced(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; the BalancedIBCC class is used for the run.
     config_path = './config/table_gold5.py'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=BalancedIBCC)
     check_outputsize(posterior, model, (5, 5, 5))
     check_accuracy_multi(posterior, 1)
Exemplo n.º 19
0
 def test_SparseList_lowerbound_cbcc(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points; IBCC is run with the CBCC class.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/sparse_gold_lowerbound.py', ibcc_class=CBCC)
     # the third expected dimension is taken from the combiner itself
     expected_shape = (2, 2, model.nclusters)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy(posterior, 0.95)
Exemplo n.º 20
0
 def testSparseList_noGold(self):
     # Runs IBCC from the sparse_nogold config and verifies the result against
     # the separate gold_mixed_verify.csv file.
     config_path = './config/sparse_nogold.py'
     gold_path = './data/gold_mixed_verify.csv'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=None)
     check_outputsize(posterior, model, ptlength=199)
     check_accuracy(posterior, 0.82, goldfile=gold_path)
Exemplo n.º 21
0
 def testTable_shortGoldMatrix(self):
     # Scenario: the gold labels are shorter than the number of crowd-labelled
     # data points.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/table_shortgoldmat.py', ibcc_class=None)
     check_outputsize(posterior, model, ptlength=199)
     verification_gold = './data/gold_mixed_verify.csv'
     check_accuracy(posterior, 0.94, goldfile=verification_gold)
Exemplo n.º 22
0
 def testSparseList_withGold(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/sparse_gold.py', ibcc_class=None)
     check_outputsize(posterior, model)
     check_accuracy(posterior, 0.95)
Exemplo n.º 23
0
 def testSparseList_withGold_5classes(self):
     # Scenario: the gold labels are longer than the number of crowd-labelled
     # data points.
     posterior, model = ibcc.load_and_run_ibcc(
         './config/sparse_gold5.py', ibcc_class=None)
     expected_shape = (5, 5, 5)
     check_outputsize(posterior, model, expected_shape)
     check_accuracy_multi(posterior, 1)
Exemplo n.º 24
0
 def testTable_shortGoldMatrix(self):
     # Scenario: the gold labels are shorter than the number of crowd-labelled
     # data points; accuracy is checked against gold_mixed_verify.csv.
     config_path = './config/table_shortgoldmat.py'
     gold_path = './data/gold_mixed_verify.csv'
     posterior, model = ibcc.load_and_run_ibcc(config_path,
                                               ibcc_class=None)
     check_outputsize(posterior, model, ptlength=199)
     check_accuracy(posterior, 0.94, goldfile=gold_path)
Exemplo n.º 25
0
                else:
                    data = np.zeros((self.N, self.K), dtype=float)
                for l in range(self.nscores):
                    if self.table_format_flag:
                        data += self.lnPi[j, l, self.tauidxs_test] * self.Ctest[l]
                    else:
                        data[self.Cobjects_test, self.Cagents_test] += self.lnPi[j, l, self.tauidxs_test] * self.Ctest[l]
                if not self.table_format_flag:
                    data = data[self.testidxs,:]
                self.lnpCT[self.testidxs, j] = np.sum(data, 1) + self.lnkappa[j]


    def post_lnpi(self):
        """
        Return sum((alpha0_tau - 1) * lnPi) + piprior_const.

        On the first call (while self.alpha0_tau is still empty) self.alpha0_tau and
        self.piprior_const are computed from self.alpha0 and cached on the instance:
        in table format alpha0 is tiled self.N times along the third axis, otherwise
        it is indexed along the third axis with self.Cagents.

        :return: the value of the expression above, computed with numpy sums
        """
        # np.size() is 0 for [] as well as for an empty array. The previous
        # `self.alpha0_tau==[]` became an elementwise comparison between arrays of
        # incompatible shapes once alpha0_tau had been filled in, which recent numpy
        # versions reject with an error instead of evaluating to False.
        if np.size(self.alpha0_tau) == 0:
            if self.table_format_flag:
                self.alpha0_tau = np.tile(self.alpha0, (1,1,self.N))
                # every tiled copy contributes the same constant, hence the factor N
                self.piprior_const = np.sum(gammaln(np.sum(self.alpha0,1)) - np.sum(gammaln(self.alpha0),1)) * self.N
            else:
                self.alpha0_tau = self.alpha0[:,:,self.Cagents]
                self.piprior_const = np.sum(gammaln(np.sum(self.alpha0_tau,1)) - np.sum(gammaln(self.alpha0_tau),1))
        return np.sum(np.sum((self.alpha0_tau-1)*self.lnPi,1)) + self.piprior_const

# Loader and Runner helper functions -------------------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: the first command line argument, when present, is the
    # config file to use; otherwise the default project config is loaded.
    configFile = sys.argv[1] if len(sys.argv) > 1 else './config/my_project.py'
    ibcc.load_and_run_ibcc(configFile, DynIBCC)