# assumed imports for these snippets: os, datetime, and the pyIBCC driver module are used
# throughout, but the import lines were not part of the original fragments
import os
import datetime
import ibcc


def update(individual_classifications):
    # start by removing all temp files
    try:
        os.remove("/home/greg/Databases/condor_ibcc.out")
    except OSError:
        pass
    try:
        os.remove("/home/greg/Databases/condor_ibcc.mat")
    except OSError:
        pass
    try:
        os.remove("/home/greg/Databases/condor_ibcc.csv.dat")
    except OSError:
        pass

    with open("/home/greg/Databases/condor_ibcc.csv", "a") as f:
        for u, s, b in individual_classifications:
            f.write(str(u) + "," + str(s) + "," + str(b) + "\n")

    print datetime.datetime.time(datetime.datetime.now())
    ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py")
    print datetime.datetime.time(datetime.datetime.now())
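# For reference, a hypothetical example of the condor_ibcc.csv input that runIbcc reads:
# a header row "a,b,c" (written by the other condor snippets below; update() above only
# appends rows) followed by one row per classification - user index, subject index, score:
#
#   a,b,c
#   0,0,1
#   0,1,0
#   1,1,1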
def __ibcc__2(results_dict, users_per_subject):
    # results_dict needs to be a dictionary which maps from a subject ID to a list of found clusters
    # users_per_subject needs to be passed separately, just in case someone clicked on nothing
    assert type(users_per_subject) == dict

    # get a list of all of the users
    global_users = []
    for u_list in users_per_subject.values():
        for u in u_list:
            if not (u in global_users):
                global_users.append(u)

    things_in_subject = {}
    off_by_one_clusters = []
    things_list = []
    thing_index = 0

    for zooinverse_id in results_dict:
        things_in_subject[zooinverse_id] = []
        centers, clusters, users = results_dict[zooinverse_id]
        pairs = __find_closest__(centers, clusters, users, user_threshold=1, offset=thing_index)
        off_by_one_clusters.extend(list(pairs))

        for users_per_marking in users:
            things_in_subject[zooinverse_id].append(thing_index)

            # find out who saw or did not see this "thing" - out of everyone who viewed this subject
            t = []
            for u in users_per_subject[zooinverse_id]:
                if u in users_per_marking:
                    t.append((global_users.index(u), 1))
                else:
                    t.append((global_users.index(u), 0))

            things_list.append(t[:])
            thing_index += 1

    # run ibcc without combining any of the clusters
    with open(base_directory + "/Databases/base_ibcc.csv", "wb") as f:
        f.write("a,b,c\n")
        for thing_index in range(len(things_list)):
            for user_index, marked in things_list[thing_index]:
                f.write(str(user_index) + "," + str(thing_index) + "," + str(marked) + "\n")

    __ibcc_init__("base")
    ibcc.runIbcc(base_directory + "/Databases/base_ibcc.py")

    # read back the per-user confusion matrices
    confusions = []
    with open(base_directory + "/Databases/base_ibcc.mat") as f:
        # note: the loop variable must not shadow the file handle f
        for user_index, l in enumerate(f.readlines()):
            confusions.append([float(v) for v in l[:-1].split(" ")])

    for count, (c1, c2, overlap) in enumerate(off_by_one_clusters):
        print things_list[c1]
        print things_list[c2]
        users = zip(*things_list[c1])[0]
        for u in users:
            print confusions[u][2], confusions[u][3]
        break
def __ibcc__(self):
    for species in self.speciesList:
        # check to see whether or not this file already exists
        if not (os.path.isfile(self.baseDir + "ibcc/" + species + "_ibcc.out" + str(self.cutOff))):
            ibcc.runIbcc(self.baseDir + "ibcc/" + str(species) + str(self.cutOff) + "config.py")

# i = IBCCsetup()
# i.__createConfigs__()
# i.__filterUserClassifications__()
# i.__ibcc__()
def __classify__(self):
    userNames = self.userDict.keys()
    subjectNames = self.subjectDict.keys()

    f = open(self.baseDir + "ibcc/input", 'wb')
    for u in self.userDict:
        classifications = self.userDict[u].__getClassifications__()
        for (s, r) in classifications:
            f.write(str(userNames.index(u)) + "," + str(subjectNames.index(s)) + "," + str(r) + "\n")
    f.close()

    # now - write the config file
    f = open(self.baseDir + "ibcc/config.py", 'wb')
    f.write("import numpy as np\nscores = np.array([0,1])\n")
    f.write("nScores = len(scores)\n")
    f.write("nClasses = 2\n")
    f.write("inputFile = '" + self.baseDir + "ibcc/input'\n")
    f.write("outputFile = '" + self.baseDir + "ibcc/output'\n")
    f.write("confMatFile = '" + self.baseDir + "ibcc/confusion'\n")
    # f.write("nu0 = np.array([45.0,55.0])\n")
    f.close()

    ibcc.runIbcc(self.baseDir + "ibcc/config.py")
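# With baseDir set to, say, "/home/greg/" (an assumed value, purely for illustration),
# the config file generated by __classify__ above would contain:
#
#   import numpy as np
#   scores = np.array([0,1])
#   nScores = len(scores)
#   nClasses = 2
#   inputFile = '/home/greg/ibcc/input'
#   outputFile = '/home/greg/ibcc/output'
#   confMatFile = '/home/greg/ibcc/confusion'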
try:
    os.remove(data_directory + "/galaxy_zoo_ibcc.out")
except OSError:
    pass
try:
    os.remove(data_directory + "/galaxy_zoo_ibcc.mat")
except OSError:
    pass
try:
    os.remove(data_directory + "/galaxy_zoo_ibcc.csv.dat")
except OSError:
    pass

import datetime
print datetime.datetime.time(datetime.datetime.now())
print base_directory + "/Databases/galaxy_zoo_ibcc.py"
ibcc.runIbcc(data_directory + "/galaxy_zoo_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

# read in the gold standard data
pos0 = []
pos1 = []
pos2 = []
with open(data_directory + "/candels_t01_a00_positive.dat", "rb") as f:
    for l in f.readlines():
        pos0.append(l[:-1])
with open(data_directory + "/candels_t01_a01_positive.dat", "rb") as f:
    for l in f.readlines():
        pos1.append(l[:-1])
with open(data_directory + "/candels_t01_a02_positive.dat", "rb") as f:
    for l in f.readlines():
        pos2.append(l[:-1])
def __signal_ibcc_majority__(self, split_ip_address=True):
    """
    Run ibcc to determine which clusters are signal or noise.
    Use majority voting to determine the priors.
    :param split_ip_address: for user ids which are ip addresses - i.e. people who were not
        logged in - should we treat each subject completely separately? That is, if ip address X
        marked subjects A and B, should we treat X as two completely different people for those two
        classifications? There is no guarantee that they are the same person, but it seems like a
        lot of information to throw away. This param allows exploring both options and results.
    :return:
    """
    # todo: implement a middle ground for split_ip_address where we treat the same ip address as
    # todo: the same person as long as the classifications are close enough together time-wise

    # get all users who have viewed any of the subjects being processed - also get the list of
    # those who did so while not logged in
    all_users = list(self.project_api.__all_users__())
    all_ips = list(self.project_api.__all_ips__())

    # global cluster count - across all images/subjects
    cluster_count = -1

    # need to give the ip addresses unique indices, so update ip_offset after every subject
    ip_offset = 0

    # needed for determining priors for IBCC
    real_animals = 0
    fake_animals = 0

    # needed for the prior confusion matrix
    true_pos = []
    true_neg = []

    # intermediate holder variable
    # because ibcc needs indices to be nice and ordered with no gaps, we have to make two passes through the data
    to_ibcc = []

    # for each global cluster index, store what image/subject it is from and what its local index
    # is wrt that subject
    self.global_to_local = []

    # print out the classifications and set up the priors using majority voting
    for zooniverse_id in self.clusterResults:
        if self.clusterResults[zooniverse_id] is None:
            continue

        # get the list of all the users who viewed this subject
        # and the ip addresses of every user who was not logged in while viewing the subject
        users_per_subject = self.project_api.__users__(zooniverse_id)
        ips_per_subject = self.project_api.__ips__(zooniverse_id)

        # process each cluster (possible animal), one at a time
        # only the names of users who marked this cluster matter - the specific x,y points are irrelevant right now
        for local_index, user_per_cluster in enumerate(self.clusterResults[zooniverse_id][2]):
            # moving on to the next animal so increase the universal counter over all images
            cluster_count += 1

            # needed for determining priors for IBCC
            pos = 0
            neg = 0

            # note that the cluster with index cluster_count is from subject zooniverse_id
            self.global_to_local.append((zooniverse_id, local_index))

            # check whether or not each user who viewed this subject marked this cluster
            for user_id in users_per_subject:
                # if the user was not logged in
                if user_id in ips_per_subject:
                    # if we are considering the ip addresses of each user (i.e. those that were not
                    # logged in) separately for each image - assign a user index based only on this image
                    # use negative indices to differentiate ip addresses from logged-in users
                    # the +1 assures that we don't have 0 - which is "both" positive and negative
                    if split_ip_address:
                        user_index = -(ips_per_subject.index(user_id) + ip_offset + 1)
                    else:
                        # we are treating all occurrences of this ip address as being from the same user
                        user_index = -all_ips.index(user_id) - 1
                else:
                    # user was logged in
                    # todo: use bisect to increase speed
                    user_index = all_users.index(user_id)

                # did the user mark this cluster or not?
                if user_id in user_per_cluster:
                    to_ibcc.append((user_id, user_index, cluster_count, 1))
                    pos += 1
                else:
                    to_ibcc.append((user_id, user_index, cluster_count, 0))
                    neg += 1

            # if a majority of people say that there is an animal - use this for the prior values
            if pos > neg:
                real_animals += 1
                # for estimating the confusion matrix
                true_pos.append(pos / float(pos + neg))
            else:
                fake_animals += 1
                true_neg.append(neg / float(pos + neg))

        ip_offset += len(ips_per_subject)

    # now run through again - this will make sure that all of the indices are ordered with no gaps
    # since the user list is created by reading through all the users, even those who haven't
    # annotated any of the specific images we are currently looking at
    ibcc_user_list = []
    # this is also for other functions to be able to interpret the results
    self.ibcc_users = []
    for user, user_index, animal_index, found in to_ibcc:
        # can't use bisect or the indices will be out of order
        if not (user_index in ibcc_user_list):
            ibcc_user_list.append(user_index)
            self.ibcc_users.append(user)

    # write out the input file for IBCC
    with open(self.base_directory + "/Databases/" + self.alg + "_ibcc.csv", "wb") as f:
        f.write("a,b,c\n")
        for marking_count, (user, user_index, animal_index, found) in enumerate(to_ibcc):
            # note: this cap looks like leftover debugging - only the first 200 markings are written
            if marking_count == 200:
                break
            i = ibcc_user_list.index(user_index)
            f.write(str(i) + "," + str(animal_index) + "," + str(found) + "\n")

    # create the prior estimate and the default confusion matrix
    prior = real_animals / float(real_animals + fake_animals)
    t = np.mean(true_pos)
    f = np.mean(true_neg)

    # what the weight should be
    # todo: should this be hard coded or set as a param?
    weight = 10

    # the confusion matrix cannot have any zero values
    confusion = [[max(int(t * weight), 1), max(int((1 - t) * weight), 1)],
                 [max(int((1 - f) * weight), 1), max(int(f * weight), 1)]]

    # create the config file
    with open(self.base_directory + "/Databases/" + self.alg + "_ibcc.py", "wb") as f:
        f.write("import numpy as np\n")
        f.write("scores = np.array([0,1])\n")
        f.write("nScores = len(scores)\n")
        f.write("nClasses = 2\n")
        f.write("inputFile = \"" + self.base_directory + "/Databases/" + self.alg + "_ibcc.csv\"\n")
        f.write("outputFile = \"" + self.base_directory + "/Databases/" + self.alg + "_signal.out\"\n")
        f.write("confMatFile = \"" + self.base_directory + "/Databases/" + self.alg + "_ibcc.mat\"\n")
        f.write("nu0 = np.array([" + str(max(int((1 - prior) * 100), 1)) + "," + str(max(int(prior * 100), 1)) + "])\n")
        f.write("alpha0 = np.array(" + str(confusion) + ")\n")

    # start by removing all temp files
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_signal.out")
    except OSError:
        pass
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_ibcc.mat")
    except OSError:
        pass
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_ibcc.csv.dat")
    except OSError:
        pass

    # pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
    ibcc.runIbcc(self.base_directory + "/Databases/" + self.alg + "_ibcc.py")
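# A minimal sketch of how the resulting <alg>_signal.out file could be read back. This
# helper is hypothetical (not part of the original code); the space-separated
# "index p(noise) p(signal)" row layout is assumed from how the other snippets in this
# file parse IBCC output (e.g. the penguins_ibcc.out reader below):
def __read_signal_output__(self):
    probabilities = {}
    with open(self.base_directory + "/Databases/" + self.alg + "_signal.out", "rb") as f:
        for l in f.readlines():
            global_index, noise_prob, signal_prob = l.split(" ")
            # map the global cluster index back to (zooniverse_id, local_index)
            zooniverse_id, local_index = self.global_to_local[int(float(global_index))]
            probabilities[(zooniverse_id, local_index)] = float(signal_prob)
    return probabilities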
def __signal_ibcc_gold__(self, global_indices, gold_standard_pts, split_ip_address=True):
    """
    Uses gold standard data from experts instead of priors based on majority voting.
    :param global_indices:
    :param split_ip_address:
    :param gold_standard_pts: the list of global indices for which we are going to provide gold
        standard data - using a negative index is a way of giving a false positive
    :return:
    """
    # intermediate holder variable
    # because ibcc needs indices to be nice and ordered with no gaps, we have to make two passes through the data
    to_ibcc = []

    # there will be some redundancy reading in the subject list - so keep track of the current
    # subject_id and only update when necessary
    users_per_subject = None
    ips_per_subject = None
    current_subject = None

    # we may skip over some indices if they correspond to gold standard points which no one marked
    # and are not being used as provided gold standard data
    actually_used_clusters = []

    for global_cluster_index, (subject_id, local_index) in enumerate(global_indices):
        # only update when necessary - when we have moved on to a new subject
        if subject_id != current_subject:
            # get the list of all the users who viewed this subject
            # and the ip addresses of every user who was not logged in while viewing the subject
            users_per_subject = self.project_api.__users__(subject_id)
            # ips_per_subject = self.project_api.__ips__(subject_id)
            current_subject = subject_id

        if local_index is None:
            # in this case, we know that no user marked this animal
            # this is either provided gold standard data, or a test - in which case we should
            # ignore this data
            if not (global_cluster_index in gold_standard_pts):
                continue
            else:
                user_per_cluster = []
        else:
            user_per_cluster = self.clusterResults[subject_id][2][local_index]

        actually_used_clusters.append(global_cluster_index)

        for user_id in list(users_per_subject):
            # check to see if this user was logged in - if not, the user_id should be an ip address
            # if not logged in, we just need to decide whether to append the subject_id to the
            # user_id, which as a result treats the same ip address for different subjects as
            # completely different users
            try:
                socket.inet_aton(user_id)
                if split_ip_address:
                    user_id += subject_id
            except (socket.error, UnicodeEncodeError):
                # logged in user, nothing to do
                pass

            if user_id in user_per_cluster:
                to_ibcc.append((user_id, global_cluster_index, 1))
            else:
                to_ibcc.append((user_id, global_cluster_index, 0))

    # gives each user an index with no gaps in the list
    user_indices = []

    # write out the input file for IBCC
    with open(self.base_directory + "/Databases/" + self.alg + "_ibcc.csv", "wb") as f:
        f.write("a,b,c\n")
        for user, cluster_index, found in to_ibcc:
            if not (user in user_indices):
                user_indices.append(user)
            i = user_indices.index(user)
            j = actually_used_clusters.index(cluster_index)
            f.write(str(i) + "," + str(j) + "," + str(found) + "\n")

    # return user_indices,actually_used_clusters

    # create the config file
    with open(self.base_directory + "/Databases/" + self.alg + "_ibcc.py", "wb") as f:
        f.write("import numpy as np\n")
        f.write("scores = np.array([0,1])\n")
        f.write("nScores = len(scores)\n")
        f.write("nClasses = 2\n")
        f.write("inputFile = \"" + self.base_directory + "/Databases/" + self.alg + "_ibcc.csv\"\n")
        f.write("outputFile = \"" + self.base_directory + "/Databases/" + self.alg + "_signal.out\"\n")
        f.write("confMatFile = \"" + self.base_directory + "/Databases/" + self.alg + "_ibcc.mat\"\n")
        f.write("goldFile = \"" + self.base_directory + "/Databases/" + self.alg + "_gold.csv\"\n")

    # start by removing all temp files
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_signal.out")
    except OSError:
        pass
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_ibcc.mat")
    except OSError:
        pass
    try:
        os.remove(self.base_directory + "/Databases/" + self.alg + "_ibcc.csv.dat")
    except OSError:
        pass

    # pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
    ibcc.runIbcc(self.base_directory + "/Databases/" + self.alg + "_ibcc.py")

    return self.base_directory + "/Databases/" + self.alg + "_signal.out", user_indices, actually_used_clusters
try:
    os.remove(base_directory + "/Databases/condor_ibcc.out")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/condor_ibcc.mat")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

# pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
ibcc.runIbcc(base_directory + "/Databases/condor_ibcc.py")

ibcc_v = []
with open(base_directory + "/Databases/condor_ibcc.out", "rb") as f:
    ibcc_results = csv.reader(f, delimiter=' ')
    for row in ibcc_results:
        ibcc_v.append(float(row[2]))

with open(base_directory + "/Databases/condor_ibcc.mat", "rb") as f:
    ibcc_results = csv.reader(f, delimiter=' ')
    for row in ibcc_results:
        ibcc_v.append(float(row[2]))

for ii, zooniverse_id in enumerate(results_dict):
    print zooniverse_id
f.write(str(user_index) + "," + str(subject_index) + "," + str(ann) + "\n")

print "number of users " + str(len(user_ids))
print "number of gold labels " + str(len(list(gold_set)))

with open(baseDir + "Databases/supernova_ibcc.py", 'wb') as f:
    f.write("import numpy as np\nscores = np.array([0,1])\n")
    f.write("nScores = len(scores)\n")
    f.write("nClasses = 2\n")
    f.write("inputFile = '" + baseDir + "Databases/supernova_ibcc.csv'\n")
    f.write("outputFile = '" + baseDir + "Databases/supernova_ibcc.out'\n")
    f.write("confMatFile = '" + baseDir + "Databases/supernova_ibcc.mat'\n")
    f.write("goldFile = '" + baseDir + "Databases/supernova_ibcc_gold.csv'\n")

# the .dat file may not exist on a first run, so ignore a missing file like the other snippets do
try:
    os.remove(baseDir + "Databases/supernova_ibcc.csv.dat")
except OSError:
    pass

ibcc.runIbcc(baseDir + "Databases/supernova_ibcc.py")
print "done IBCC"

x_values = []
y_values = []
with open(baseDir + "Databases/supernova_ibcc.mat", "rb") as f:
    reader = csv.reader(f, delimiter=" ")
    for user_index, r in enumerate(reader):
        count = classification_counts[user_index]
        if min(count) < 5:
            continue
        x = float(r[0])
        y = float(r[-1])
        x_values.append(x)
try:
    os.remove(base_directory + "/Databases/penguins_ibcc.out")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/penguins_ibcc.mat")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/penguins_ibcc.in.dat")
except OSError:
    pass

ibcc.runIbcc(base_directory + "/Databases/penguins_ibcc_config.py")
print "done that"

total = 0
true_positives = []
false_positives = []
with open(base_directory + "/Databases/penguins_ibcc.out", 'rb') as f:
    for l in f.readlines():
        penguin_index, neg_prob, pos_prob = l.split(" ")
        penguin = penguins[max_users][image_index][1][int(float(penguin_index))][0]
        # is this penguin "real", i.e. is it in the gold standard?
        if cluster_compare(gold_standard, [penguin, ]) == []:
            # yes - penguin is real
f.write("import numpy as np\n") f.write("scores = np.array([0,1,2])\n") f.write("nScores = len(scores)\n") f.write("nClasses = 3\n") f.write("inputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.csv\"\n") f.write("outputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.out\"\n") f.write("confMatFile = \"/home/greg/Databases/galaxy_zoo_ibcc.mat\"\n") f.write("nu0 = np.array([40,40,10])\n") f.write("alpha0 = np.array([[5, 2, 2], [2, 5, 2], [3, 3, 3]])\n") try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.out") except OSError: pass try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.mat") except OSError: pass try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.csv.dat") except OSError: pass import datetime print datetime.datetime.time(datetime.datetime.now()) ibcc.runIbcc("/home/greg/Databases/galaxy_zoo_ibcc.py") print datetime.datetime.time(datetime.datetime.now())
def __ibcc__(results_dict, users_per_subject):
    # create a global index of all users
    global_users = []
    for u_list in users_per_subject.values():
        for u in u_list:
            if not (u in global_users):
                global_users.append(u)

    things_in_subject = {}
    things_list = []
    thing_index = 0
    off_by_one_clusters = []

    for zooinverse_id in results_dict:
        things_in_subject[zooinverse_id] = []
        centers, clusters, users = results_dict[zooinverse_id]
        pairs = __find_closest__(centers, clusters, users, user_threshold=1, offset=thing_index)
        off_by_one_clusters.extend(list(pairs))

        for users_per_marking in users:
            things_in_subject[zooinverse_id].append(thing_index)

            # find out who saw or did not see this "thing" - out of everyone who viewed this subject
            t = []
            for u in users_per_subject[zooinverse_id]:
                if u in users_per_marking:
                    t.append((global_users.index(u), 1))
                else:
                    t.append((global_users.index(u), 0))

            things_list.append(t[:])
            thing_index += 1

    # run ibcc without combining any of the clusters
    with open(base_directory + "/Databases/base_ibcc.csv", "wb") as f:
        f.write("a,b,c\n")
        for thing_index in range(len(things_list)):
            for user_index, marked in things_list[thing_index]:
                f.write(str(user_index) + "," + str(thing_index) + "," + str(marked) + "\n")

    __ibcc_init__("base")
    ibcc.runIbcc(base_directory + "/Databases/base_ibcc.py")

    # now try merging each possible pair and running ibcc on the resulting setup
    # yes, this is going to be tedious and time consuming - hope for a better implementation later on
    for count, (c1, c2, overlap) in enumerate(off_by_one_clusters):
        # most of the time, thing_index and thing_prime_index will be the same
        # but there can be an off-by-one difference, to account for the fact that we are skipping over c2
        thing_prime_index = 0
        print (c1, c2)

        with open(base_directory + "/Databases/merged_ibcc.csv", "wb") as f:
            f.write("a,b,c\n")
            for thing_index in range(len(things_list)):
                if thing_index == c2:
                    # we are skipping this one
                    pass
                else:
                    if thing_index == c1:
                        # merge c1 and c2
                        assert thing_index == thing_prime_index
                        assert len(list(overlap)) <= 1
                        for (user_index1, marked1), (user_index2, marked2) in zip(things_list[c1], things_list[c2]):
                            assert user_index1 == user_index2
                            f.write(str(user_index1) + "," + str(thing_prime_index) + "," + str(marked1 or marked2) + "\n")
                    else:
                        # proceed as normal
                        for user_index, marked in things_list[thing_index]:
                            f.write(str(user_index) + "," + str(thing_prime_index) + "," + str(marked) + "\n")
                    thing_prime_index += 1

        __ibcc_init__("merged")
        ibcc.runIbcc(base_directory + "/Databases/merged_ibcc.py")

        p1 = load_ibcc_probabilities("base", c1)
        p2 = load_ibcc_probabilities("base", c2)
        p3 = load_ibcc_probabilities("merged", c1)
        print (p1, p2, p3)
        if p3 < (max(p1, p2) - 0.01):
            break
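# __ibcc_init__ and load_ibcc_probabilities are called above but not defined in these
# snippets. Hypothetical sketches, assuming __ibcc_init__ writes a two-class config file
# like the ones elsewhere in this file (inputFile <prefix>_ibcc.csv, outputFile
# <prefix>_ibcc.out) and that the .out file has space-separated rows of
# "index p(noise) p(signal)":

def __ibcc_init__(prefix):
    # assumed: generate the pyIBCC config file for the given prefix
    with open(base_directory + "/Databases/" + prefix + "_ibcc.py", "wb") as f:
        f.write("import numpy as np\n")
        f.write("scores = np.array([0,1])\n")
        f.write("nScores = len(scores)\n")
        f.write("nClasses = 2\n")
        f.write("inputFile = \"" + base_directory + "/Databases/" + prefix + "_ibcc.csv\"\n")
        f.write("outputFile = \"" + base_directory + "/Databases/" + prefix + "_ibcc.out\"\n")
        f.write("confMatFile = \"" + base_directory + "/Databases/" + prefix + "_ibcc.mat\"\n")


def load_ibcc_probabilities(prefix, thing_index):
    # assumed: return p(signal) for the cluster with the given index
    with open(base_directory + "/Databases/" + prefix + "_ibcc.out", "rb") as f:
        for l in f.readlines():
            index, neg_prob, pos_prob = l.split(" ")
            if int(float(index)) == thing_index:
                return float(pos_prob)
    return None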
def __runIBCC__(self):
    collection = self.db['merged_classifications' + str(self.cutoff)]
    self.user_list = []
    self.subject_list = []

    shutil.rmtree(self.baseDir + "ibcc")
    os.makedirs(self.baseDir + "ibcc")

    counter = -1
    for speciesGroup in self.species_groups:
        self.user_list = []
        self.subject_list = []
        count = []

        required_l = list(powerset(speciesGroup))
        prohibited_l = [[s for s in speciesGroup if not (s in r)] for r in required_l]

        counter += 1
        self.__createConfigFile(counter, len(required_l))

        ibcc_input_file = open(self.baseDir + "ibcc/" + str(counter) + ".in", "wb")
        for document in collection.find():
            user_name = document["user_name"]
            subject_zooniverse_id = document["subject_zooniverse_id"]
            user_species_list = document["species_list"]

            if not (subject_zooniverse_id in self.nonempty_list):
                continue

            # IBCC requires an int ID for both user and subject - so convert
            if user_name in self.user_list:
                userID = self.user_list.index(user_name)
                count[userID] += 1
            else:
                self.user_list.append(user_name)
                userID = len(self.user_list) - 1
                count.append(1)

            if subject_zooniverse_id in self.subject_list:
                subjectID = self.subject_list.index(subject_zooniverse_id)
            else:
                self.subject_list.append(subject_zooniverse_id)
                subjectID = len(self.subject_list) - 1

            # which class does this classification count as?
            meet_required = [sorted(list(set(user_species_list).intersection(r))) == sorted(list(r)) for r in required_l]
            meet_prohibited = [tuple(set(user_species_list).intersection(p)) == () for p in prohibited_l]
            meet_overall = [r and p for (r, p) in zip(meet_required, meet_prohibited)]
            assert sum([1. for o in meet_overall if o]) == 1
            class_id = meet_overall.index(True)

            print(str(userID) + "," + str(subjectID) + "," + str(class_id), file=ibcc_input_file)

        ibcc_input_file.close()

        # now run IBCC
        ibcc.runIbcc(self.baseDir + "ibcc/" + str(counter) + "config.py")
        print(count)
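# `powerset` is not defined in this snippet; a minimal sketch based on the standard
# itertools recipe, assuming the empty set should be included as one of the classes
# (i.e. "none of the species in this group were reported"):
from itertools import chain, combinations

def powerset(iterable):
    """Return all subsets, e.g. powerset([1,2]) -> (), (1,), (2,), (1,2)."""
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))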
classifications.append((user_index, subject_index, blank))

print "====----"
print errorCount

try:
    os.remove("/home/greg/Databases/condor_ibcc.out")
except OSError:
    pass
try:
    os.remove("/home/greg/Databases/condor_ibcc.mat")
except OSError:
    pass
try:
    os.remove("/home/greg/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

with open("/home/greg/Databases/condor_ibcc.csv", "wb") as f:
    f.write("a,b,c\n")
    for u, s, b in classifications:
        f.write(str(u) + "," + str(s) + "," + str(b) + "\n")

print datetime.datetime.time(datetime.datetime.now())
ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

pickle.dump(subjects, open("/home/greg/Databases/condor_ibcc.pickle", "wb"))
else:
    userIndex = users.index(userName)

if not (photoName in photos):
    photos.append(photoName)
    photoIndex = len(photos) - 1
else:
    photoIndex = photos.index(photoName)

if i in classification:
    print(str(userIndex) + "," + str(photoIndex) + ",1", file=f)
else:
    print(str(userIndex) + "," + str(photoIndex) + ",0", file=f)

f.close()
ibcc.runIbcc(baseDir + "ibcc/" + str(i) + "config.py")

# merge the results into the existing ones
# assume all photos are now in the list - should be
reader = csv.reader(open(baseDir + "ibcc/" + str(i) + ".out", "rU"), delimiter=" ")
for photoIndex, neg, pos in reader:
    photoIndex = int(float(photoIndex))
    pos = float(pos)
    if len(ibccClassifications) < (photoIndex + 1):
        ibccClassifications.append([])
    if pos > 0.5:
        # print(photoIndex, len(ibccClassifications))
        ibccClassifications[photoIndex].append(s)
with open("/home/greg/Databases/galaxy_zoo_ibcc.py", "wb") as f: f.write("import numpy as np\n") f.write("scores = np.array([0,1,2])\n") f.write("nScores = len(scores)\n") f.write("nClasses = 3\n") f.write("inputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.csv\"\n") f.write("outputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.out\"\n") f.write("confMatFile = \"/home/greg/Databases/galaxy_zoo_ibcc.mat\"\n") f.write("nu0 = np.array([40,40,10])\n") f.write("alpha0 = np.array([[5, 2, 2], [2, 5, 2], [3, 3, 3]])\n") try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.out") except OSError: pass try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.mat") except OSError: pass try: os.remove("/home/greg/Databases/galaxy_zoo_ibcc.csv.dat") except OSError: pass import datetime print datetime.datetime.time(datetime.datetime.now()) ibcc.runIbcc("/home/greg/Databases/galaxy_zoo_ibcc.py") print datetime.datetime.time(datetime.datetime.now())
try:
    os.remove(base_directory + "/Databases/condor_ibcc.out")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/condor_ibcc.mat")
except OSError:
    pass
try:
    os.remove(base_directory + "/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

# pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
ibcc.runIbcc(base_directory + "/Databases/condor_ibcc.py")

# now analyze the data
# assume for starters that each image does not have a condor
X = []
Y = []
X_2 = []
Y_2 = []
contains_condor = {zooniverse_id: False for zooniverse_id in zooniverse_list}
condor_probabilities = {zooniverse_id: [] for zooniverse_id in zooniverse_list}

with open(base_directory + "/Databases/condor_ibcc.out", "rb") as f:
    ibcc_results = csv.reader(f, delimiter=" ")
    for row in ibcc_results:
        animal_index = int(float(row[0]))
        condor_p = float(row[2])
print "====----" print errorCount try: os.remove("/home/greg/Databases/condor_ibcc.out") except OSError: pass try: os.remove("/home/greg/Databases/condor_ibcc.mat") except OSError: pass try: os.remove("/home/greg/Databases/condor_ibcc.csv.dat") except OSError: pass with open("/home/greg/Databases/condor_ibcc.csv", "wb") as f: f.write("a,b,c\n") for u, s, b in classifications: f.write(str(u) + "," + str(s) + "," + str(b) + "\n") print datetime.datetime.time(datetime.datetime.now()) ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py") print datetime.datetime.time(datetime.datetime.now()) pickle.dump(subjects, open("/home/greg/Databases/condor_ibcc.pickle", "wb"))