def __marking_header_setup__(self,workflow_id,task_id,shapes,output_directory):
    """
    - create the csv output files for each workflow/task pairing where the task is a marking,
      and also write out the header line
    - since different tools (for the same task) can have completely different shapes, these shapes
      should be printed out to different files - hence the multiple output files
    - we will give both a summary file and a detailed report file
    """
    for shape in shapes:
        fname = str(task_id) + self.instructions[workflow_id][task_id]["instruction"][:50]
        fname = helper_functions.csv_string(fname)
        # fname += ".csv"

        self.file_names[(task_id,shape,"detailed")] = fname + "_" + shape + ".csv"
        self.file_names[(task_id,shape,"summary")] = fname + "_" + shape + "_summary.csv"

        # polygons - since they have an arbitrary number of points - are handled slightly differently
        if shape == "polygon":
            id_ = task_id,shape,"detailed"
            self.csv_files[id_] = open(output_directory+fname+"_"+shape+".csv","wb")
            self.csv_files[id_].write("subject_id,cluster_index,most_likely_tool,area,list_of_xy_polygon_coordinates\n")

            id_ = task_id,shape,"summary"
            self.csv_files[id_] = open(output_directory+fname+"_"+shape+"_summary.csv","wb")
            # self.csv_files[id_].write("subject_id,\n")
            polygon_tools = [t_index for t_index,t in enumerate(self.workflows[workflow_id][1][task_id]) if t == "polygon"]
            header = "subject_id,"
            for tool_id in polygon_tools:
                tool = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
                tool = helper_functions.csv_string(tool)
                header += "area("+tool+"),"
            self.csv_files[id_].write(header+"\n")
        else:
            id_ = task_id,shape,"detailed"
            # fname += "_"+shape+".csv"
            self.csv_files[id_] = open(output_directory+fname+"_"+shape+".csv","wb")
            header = "subject_id,cluster_index,most_likely_tool,"
            if shape == "point":
                header += "x,y,"
            elif shape == "rectangle":
                # todo - fix this
                header += "x1,y1,x2,y2,"
            elif shape == "line":
                header += "x1,y1,x2,y2,"
            elif shape == "ellipse":
                header += "x1,y1,r1,r2,theta,"
            header += "p(most_likely_tool),p(true_positive),num_users"
            self.csv_files[id_].write(header+"\n")

            # do the summary output elsewhere
            self.__summary_header_setup__(output_directory,workflow_id,fname,task_id,shape)

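# helper_functions.csv_string is used throughout this file but defined elsewhere.
# A minimal sketch of the behaviour implied by the comments in this class
# (stripping characters which shouldn't be in a csv column or file name) - the
# exact replacement rules are an assumption, not taken from the original source:
#
#     def csv_string(s):
#         # drop commas/quotes and replace spaces with underscores
#         return s.replace(",", "").replace("\"", "").replace(" ", "_")
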
def __get_filename__(self,workflow_id,task_id,summary=False,tool_id=None,followup_id=None):
    """
    use the user's instructions to help create a file name to store the results in
    :param workflow_id:
    :param task_id:
    :param summary:
    :return:
    """
    assert (tool_id is None) or (followup_id is not None)

    # read in the instructions
    # if just a simple classification question
    if tool_id is None:
        instructions = self.instructions[workflow_id][task_id]["instruction"]
    # else a follow up question to a marking - so the instructions are stored in a slightly different spot
    else:
        instructions = (
            self.instructions[workflow_id][task_id]["tools"][tool_id]
            ["followup_questions"][followup_id].get("question", "")
        )

    fname = str(task_id) + instructions[:50]
    if summary:
        fname += "_summary"

    # get rid of any characters (like extra ","s) that could cause problems
    fname = helper_functions.csv_string(fname)
    fname += ".csv"

    return fname

def __marking_summary_header__(self,workflow_id,task_id,shape):
    """
    set up the summary csv file for a given marking shape
    all shape aggregation will have a summary file - with one line per subject
    DON'T call this for polygons - they need to be handled differently
    :return:
    """
    assert shape != "polygon"
    id_ = task_id,shape,"summary"
    with open(self.file_names[id_],"w") as csv_file:
        # the summary file will contain just one line per subject
        csv_file.write("subject_id")
        # extract only the tools which can actually make markings of the desired shape
        # [1] is the list of marking tasks, i.e. [0] is the list of classification tasks and
        # [2] is the list of survey tasks
        for tool_id,tool_shape in enumerate(self.workflows[workflow_id][1][task_id]):
            # does this particular tool use the desired shape?
            if tool_shape != shape:
                continue

            # what is the label given to this tool - this is what we want to use in our column header
            # i.e. we don't want to say tool 0, or shape rectangle, we want to say "zebra"
            tool_label = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
            # remove any characters (such as spaces) which shouldn't be in a csv column header
            tool_label = helper_functions.csv_string(tool_label)
            csv_file.write(",median(" + tool_label + ")")

        # and some final stats to add
        csv_file.write(",mean_probability,median_probability,mean_tool,median_tool\n")

def __polygon_row__(self,workflow_id,task_id,subject_id,aggregations):
    """
    print out results for a polygon - include the outline in pixels, the area (as a percentage of
    image area) and the most likely tool to have created this polygon
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    id_ = task_id,"polygon","detailed"
    with open(self.file_names[id_],"a") as csv_file:
        for p_index,cluster in aggregations["polygon clusters"].items():
            if p_index == "all_users":
                continue

            tool_classification = cluster["tool_classification"][0].items()
            most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
            tool = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
            tool = helper_functions.csv_string(tool)

            row = str(subject_id) + "," + str(p_index) + "," + tool + "," + str(cluster["area"]) + ",\"" + str(cluster["center"]) + "\""
            csv_file.write(row+"\n")

def __single_choice_classification_row__(self,answers,task_id,subject_id,results,cluster_index=None):
    """
    output a row for a classification task which only allowed one answer
    global_task_id => the task might actually be a subtask, in which case the id needs to contain
    the task id, tool and follow up question id
    :param global_task_id:
    :param subject_id:
    :param results:
    :return:
    """
    # since only one choice is allowed, go for the maximum
    votes,num_users = results
    if votes == {}:
        return

    most_likely,top_probability = max(votes.items(), key = lambda x:x[1])

    # extract the text corresponding to the most likely answer
    most_likely_label = answers[int(most_likely)]

    # this corresponds to when the question is a follow up
    if isinstance(most_likely_label,dict):
        most_likely_label = most_likely_label["label"]
    most_likely_label = helper_functions.csv_string(most_likely_label)

    probabilities = votes.values()
    entropy = self.__shannon_entropy__(probabilities)

    row = str(subject_id)+","
    if cluster_index is not None:
        row += str(cluster_index) + ","
    row += most_likely_label+","+str(top_probability)+","+str(entropy)+","+str(num_users)+"\n"

    # finally write the stuff out to file
    self.csv_files[task_id].write(row)

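# __shannon_entropy__ is defined elsewhere in this class. For reference, a
# minimal sketch of the standard calculation it presumably performs on the
# per-answer vote percentages (assumes the probabilities already sum to 1):
#
#     import math
#
#     def __shannon_entropy__(self, probabilities):
#         return -sum(p * math.log(p) for p in probabilities if p > 0)
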
def __polygon_row__(self,workflow_id,task_id,subject_id,aggregations):
    id_ = task_id,"polygon","detailed"

    # for p_index,cluster in aggregations["polygon clusters"].items():
    #     if p_index == "all_users":
    #         continue
    #
    #     tool_classification = cluster["tool_classification"][0].items()
    #     most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
    #     total_area[int(most_likely_tool)] += cluster["area"]

    for p_index,cluster in aggregations["polygon clusters"].items():
        if p_index == "all_users":
            continue

        tool_classification = cluster["tool_classification"][0].items()
        most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
        tool = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
        tool = helper_functions.csv_string(tool)

        for polygon in cluster["center"]:
            p = geometry.Polygon(polygon)
            row = str(subject_id) + "," + str(p_index) + "," + tool + "," + str(p.area/float(cluster["image area"])) + ",\"" + str(polygon) + "\""
            self.csv_files[id_].write(row+"\n")

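# geometry.Polygon above presumably comes from the shapely package
# ("from shapely import geometry"). A quick illustration of the area
# calculation used above, with made-up coordinates:
#
#     >>> from shapely import geometry
#     >>> geometry.Polygon([(0, 0), (4, 0), (4, 3)]).area
#     6.0
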
def __marking_file_setup__(self,output_directory,workflow_id):
    """
    - create the csv output files for each workflow/task pairing where the task is a marking,
      and also write out the header line
    - since different tools (for the same task) can have completely different shapes, these shapes
      should be printed out to different files - hence the multiple output files
    - we will give both a summary file and a detailed report file
    """
    tasks = self.workflows[workflow_id]

    # iterate over each task and the shapes (not tools) available for each task
    for task_id,tools in tasks['marking'].items():
        for shape in set(tools):
            # get the file name - and remove any characters (such as spaces) which should not be in a file name
            fname = str(task_id) + self.instructions[workflow_id][task_id]["instruction"][:50]
            fname = helper_functions.csv_string(fname)

            # create the files - both detailed and summary
            self.file_names[(task_id,shape,"detailed")] = output_directory+"/"+fname + "_" + shape + ".csv"
            self.file_names[(task_id,shape,"summary")] = output_directory+"/"+fname + "_" + shape + "_summary.csv"

            # polygons - since they have an arbitrary number of points - are handled slightly differently
            if shape == "polygon":
                self.__add_polygon_summary_row__()
                self.__polygon_detailed_setup__(task_id)
            else:
                # write the headers for the csv summary files
                self.__marking_summary_header__(workflow_id,task_id,shape)
                # and for the detailed ones
                self.__marking_detailed_header__(task_id,shape)

def __survey_file_setup__(self,output_directory,workflow_id):
    """
    set up the csv files for surveys. we will just have one output file
    :param output_directory:
    :param workflow_id:
    :return:
    """
    tasks = self.workflows[workflow_id]

    for task_id in tasks['survey']:
        instructions = self.instructions[workflow_id][task_id]

        self.file_names[task_id] = output_directory+str(task_id) + ".csv"
        with open(self.file_names[task_id],"w") as csv_file:
            # now write the header
            header = "subject_id,num_classifications,pielou_score,species,"
            header += "percentage_of_votes_for_species,number_of_votes_for_species"

            # always include these headers for the HWMN follow up question - these columns may be NA
            # in the output, but at least we have the header explaining why
            header += ",minimum_number_of_animals,most_likely_number_of_animals,percentage,maximum_number_of_animals"

            # todo - we'll assume, for now, that "how many" is always the first question
            # for followup_id in instructions["questionsOrder"]:
            for followup_id in instructions["questions"].keys():
                multiple_answers = instructions["questions"][followup_id]["multiple"]
                label = instructions["questions"][followup_id]["label"]

                # the question "how many" is treated differently - we give the minimum, maximum and most likely
                if followup_id == "HWMN":
                    continue
                else:
                    if "behavior" in label or "behaviour" in label:
                        stem = "behaviour:"
                    else:
                        stem = helper_functions.csv_string(label)

                    if not multiple_answers:
                        header += ",most_likely(" + stem + ")"

                    for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                        header += ",percentage(" + stem + helper_functions.csv_string(instructions["questions"][followup_id]["answers"][answer_id]["label"]) + ")"

            csv_file.write(header+"\n")

def __classification_summary_row__(self,workflow_id,task_id,subject_id,aggregations,followup_id=None,tool_id=None,cluster_index=None):
    """
    given a result for a specific subject (and possibly a specific cluster within that specific subject)
    add one row of results to the summary file. that row contains
    subject_id,tool_index,cluster_index,most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
    tool_index & cluster_index are only there if we have a follow up to a marking task
    :param id_:
    :param subject_id:
    :param results:
    :param answer_dict:
    :return:
    """
    # key for accessing the csv output in the dictionary
    id_ = (task_id,tool_id,followup_id,"summary")

    # get what percentage of users voted for each classification
    votes,num_users = aggregations[task_id]
    try:
        most_likely,top_probability = max(votes.items(), key = lambda x:x[1])

        # if tool_id is not None -> we have a follow up question
        # extract the text corresponding to the most likely answer
        # follow up questions for markings have a different structure
        if tool_id is not None:
            answers = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_id]["answers"]
            most_likely_label = answers[int(most_likely)]["label"]
        else:
            most_likely_label = self.instructions[workflow_id][task_id]["answers"][int(most_likely)]

        # and get rid of any bad characters
        most_likely_label = helper_functions.csv_string(most_likely_label)

        # calculate some summary values such as entropy and the mean and median percentage of votes
        # for each classification
        probabilities = list(votes.values())
        entropy = self.__shannon_entropy__(probabilities)
        mean_p = np.mean(probabilities)
        median_p = np.median(probabilities)

        with open(self.file_names[id_],"a") as results_file:
            results_file.write(str(subject_id)+",")
            if cluster_index is not None:
                results_file.write(str(cluster_index)+",")

            # write out details regarding the top choice
            # this might not be a useful value if multiple choices are allowed - in which case just ignore it
            results_file.write(str(most_likely_label)+","+str(top_probability))

            # write out some summaries of the distributions of people's answers
            # again, entropy probably only makes sense if only one answer is allowed
            # and mean_p and median_p probably only make sense if multiple answers are allowed
            # so people will need to pick and choose what they want
            results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))

            # finally - how many people have seen this subject for this task
            results_file.write(","+str(num_users)+"\n")
    # empty vote dictionaries should be ignored - but shouldn't happen too often either
    except ValueError:
        pass

def __make_files__(self,workflow_id):
    """
    create all of the files necessary for this workflow
    :param workflow_id:
    :return:
    """
    # close any previously used files (and delete their pointers)
    for f in self.csv_files.values():
        f.close()
    self.csv_files = {}

    # now create a sub directory specific to the workflow
    try:
        workflow_name = self.workflow_names[workflow_id]
    except KeyError:
        warning(self.workflows)
        warning(self.workflow_names)
        raise

    workflow_name = helper_functions.csv_string(workflow_name)
    output_directory = "/tmp/"+str(self.project_id)+"/"+str(workflow_id)+"_"+workflow_name+"/"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    self.workflow_directories[workflow_id] = output_directory

    classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

    # go through the classification tasks - they will either be simple classification tasks (one answer
    # allowed), multiple-answer classification tasks, or possibly a follow up question to a marking
    for task_id in classification_tasks:
        # is this task a simple classification task?
        # we don't care whether the question allows for multiple answers or requires a single one
        if classification_tasks[task_id] in ["single","multiple"]:
            self.__classification_header__(output_directory,workflow_id,task_id)
        else:
            # this classification task is actually a follow up to a marking task
            for tool_id in classification_tasks[task_id]:
                for followup_id,answer_type in enumerate(classification_tasks[task_id][tool_id]):
                    self.__classification_header__(output_directory,workflow_id,task_id,tool_id,followup_id)

    # now set things up for the marking tasks
    for task_id in marking_tasks:
        shapes = set(marking_tasks[task_id])
        self.__marking_header_setup__(workflow_id,task_id,shapes,output_directory)

    # and finally the survey tasks
    for task_id in survey_tasks:
        instructions = self.instructions[workflow_id][task_id]
        self.__survey_header_setup__(output_directory,task_id,instructions)

    return output_directory

def __survey_header_setup__(self,output_directory,task_id,instructions):
    """
    create the csv output file for a survey task and write the header row
    :param output_directory:
    :param task_id:
    :param instructions:
    :return:
    """
    # # start with the summary files
    # fname = output_directory+str(task_id) + "_survey_summary.csv"
    # self.file_names[(task_id,"summary")] = fname
    # with open(fname,"wb") as f:
    #     f.write("subject_id,pielou_index\n")

    # and then the detailed files
    fname = output_directory+str(task_id) + "_survey_detailed.csv"
    self.file_names[(task_id,"detailed")] = fname

    # now write the header
    header = "subject_id,num_classifications,pielou_score,species,number_of_votes_for_species"

    # todo - we'll assume, for now, that "how many" is always the first question
    for followup_id in instructions["questionsOrder"]:
        multiple_answers = instructions["questions"][followup_id]["multiple"]
        label = instructions["questions"][followup_id]["label"]

        # the question "how many" is treated differently - we give the minimum, maximum and most likely
        if followup_id == "HWMN":
            header += ",minimum_number_of_animals,most_likely_number_of_animals,percentage,maximum_number_of_animals"
        else:
            if "behavior" in label or "behaviour" in label:
                stem = "behaviour:"
            else:
                stem = helper_functions.csv_string(label)

            if not multiple_answers:
                header += ",most_likely(" + stem + ")"

            for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                header += ",percentage(" + stem + helper_functions.csv_string(instructions["questions"][followup_id]["answers"][answer_id]["label"]) + ")"

    with open(fname,"wb") as f:
        f.write(header+"\n")

def __detailed_marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
    """
    output one detailed row per cluster for markings of the given shape
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    id_ = (task_id,shape,"detailed")
    for cluster_index,cluster in aggregations[shape + " clusters"].items():
        if cluster_index == "all_users":
            continue

        # convert to int - not really sure why, but it gets stored as unicode
        cluster_index = int(cluster_index)

        # build up the row bit by bit to have the following structure
        # "subject_id,most_likely_tool,x,y,p(most_likely_tool),p(true_positive),num_users"
        row = str(subject_id)+","
        # todo: for now - always give the cluster index
        row += str(cluster_index)+","

        # extract the most likely tool for this particular marking and convert it to a string label
        # not completely sure why some clusters are missing this value, but it does seem to happen
        most_likely_tool = cluster["most_likely_tool"]
        # again - not sure why this percentage would ever be 0, but it does seem to happen
        tool_probability = cluster["percentage"]
        assert tool_probability > 0

        # convert the tool into the string label
        tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
        row += helper_functions.csv_string(tool_str) + ","

        # get the central coordinates next
        for center_param in cluster["center"]:
            if isinstance(center_param,(list,tuple)):
                # if we have a list, split it up into subpieces
                for param in center_param:
                    row += str(param) + ","
            else:
                row += str(center_param) + ","

        # add on how likely the most likely tool was
        row += str(tool_probability) + ","
        # how likely the cluster is to be a true positive and how many users (out of those who saw this
        # subject) actually marked it. For the most part p(true positive) is equal to the percentage
        # of people, so slightly redundant, but it allows for things like weighted voting and IBCC in the future
        prob_true_positive = cluster["existence"][0]["1"]
        num_users = cluster["existence"][1]
        row += str(prob_true_positive) + "," + str(num_users)

        with open(self.file_names[id_],"a") as csvfile:
            csvfile.write(row+"\n")

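# For reference, a hedged sketch of the cluster structure the method above
# expects - the key names are taken from the lookups in the code, but the
# values are invented for illustration:
#
#     cluster = {
#         "most_likely_tool": 0,          # index into the task's tools
#         "percentage": 0.8,              # p(most_likely_tool)
#         "center": [150.2, 98.7],        # shape-dependent parameters
#         "existence": [{"1": 0.9}, 5],   # p(true positive), num_users
#     }
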
def __detailed_classification_file_setup__(self,output_directory,workflow_id,task_id,tool_id=None,followup_id=None):
    """
    create a csv file for the detailed results of a classification task and set up the headers
    :param output_directory:
    :param workflow_id:
    :param task_id:
    :param tool_id:
    :param followup_id:
    :return:
    """
    # the file name will be based on the task label - which we need to make sure isn't too long and doesn't
    # have any characters which might cause trouble, such as spaces
    fname = self.__get_filename__(workflow_id,task_id,tool_id=tool_id,followup_id=followup_id)

    # start with the detailed results
    id_ = (task_id,tool_id,followup_id,"detailed")
    self.file_names[id_] = output_directory+fname

    # open the file and add the column headers
    with open(output_directory+fname,"wb") as detailed_results:
        # now write the headers
        detailed_results.write("subject_id")

        # the answer dictionary is structured differently for follow up questions to markings
        if tool_id is not None:
            # if a follow up question - we will also add a column for the cluster id
            detailed_results.write(",cluster_id")

            answer_dict = dict()
            answers = self.instructions[workflow_id][task_id]["tools"]
            answers = answers[tool_id]["followup_questions"][followup_id]
            answers = answers.get("answers", {})
            for answer_key, answer in answers.items():
                answer_dict[answer_key] = answer["label"]
        else:
            answer_dict = self.instructions[workflow_id][task_id]["answers"]

        # each possible response will have a separate column - this column will be the percentage of people
        # who selected a certain response. This works whether a single response or multiple ones are allowed
        for answer_key in sorted(answer_dict.keys()):
            # break this up into multiple lines so we can be sure that the answers are sorted correctly
            # order might not matter in the end, but just to be sure
            answer = answer_dict[answer_key]
            answer_string = helper_functions.csv_string(answer)[:50]
            detailed_results.write(",p("+answer_string+")")

        # the final column will give the number of users
        # for a follow up question - num_users should be the number of users with markings in the cluster
        detailed_results.write(",num_users\n")

def __marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
    """
    output one row per cluster for markings of the given shape
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    key = task_id,shape,"detailed"
    for cluster_index,cluster in aggregations[shape + " clusters"].items():
        if cluster_index == "all_users":
            continue

        # build up the row bit by bit to have the following structure
        # "subject_id,most_likely_tool,x,y,p(most_likely_tool),p(true_positive),num_users"
        row = str(subject_id)+","
        # todo: for now - always give the cluster index
        row += str(cluster_index)+","

        # extract the most likely tool for this particular marking and convert it to a string label
        try:
            tool_classification = cluster["tool_classification"][0].items()
        except KeyError:
            warning(shape)
            warning(cluster)
            raise
        most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
        tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
        row += helper_functions.csv_string(tool_str) + ","

        # get the central coordinates next
        for center_param in cluster["center"]:
            if isinstance(center_param,(list,tuple)):
                row += "\"" + str(tuple(center_param)) + "\","
            else:
                row += str(center_param) + ","

        # add on how likely the most likely tool was
        row += str(tool_probability) + ","
        # how likely the cluster is to be a true positive and how many users (out of those who saw this
        # subject) actually marked it. For the most part p(true positive) is equal to the percentage
        # of people, so slightly redundant, but it allows for things like weighted voting and IBCC in the future
        prob_true_positive = cluster["existence"][0]["1"]
        num_users = cluster["existence"][1]
        row += str(prob_true_positive) + "," + str(num_users)

        self.csv_files[key].write(row+"\n")

def __add_summary_row__(self,id_,subject_id,results,answer_dict):
    """
    given a result for a specific subject (and possibly a specific cluster within that specific subject)
    add one row of results to the summary file. that row contains
    subject_id,tool_index,cluster_index,most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
    tool_index & cluster_index are only there if we have a follow up to a marking task
    :param id_:
    :param subject_id:
    :param results:
    :param answer_dict:
    :return:
    """
    votes,num_users = results

    # get the top choice
    try:
        most_likely,top_probability = max(votes.items(), key = lambda x:x[1])
    except ValueError:
        warning(results)
        raise

    # extract the text corresponding to the most likely answer
    most_likely_label = answer_dict[int(most_likely)]
    # and get rid of any bad characters
    most_likely_label = helper_functions.csv_string(most_likely_label)

    probabilities = list(votes.values())
    entropy = self.__shannon_entropy__(probabilities)

    mean_p = np.mean(probabilities)
    median_p = np.median(probabilities)

    with open(self.file_names[id_],"a") as results_file:
        results_file.write(str(subject_id)+",")

        # write out details regarding the top choice
        # this might not be a useful value if multiple choices are allowed - in which case just ignore it
        results_file.write(str(most_likely_label)+","+str(top_probability))
        # write out some summaries of the distributions of people's answers
        # again, entropy probably only makes sense if only one answer is allowed
        # and mean_p and median_p probably only make sense if multiple answers are allowed
        # so people will need to pick and choose what they want
        results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))
        # finally - how many people have seen this subject for this task
        results_file.write(","+str(num_users)+"\n")

def __summary_header_setup__(self,output_directory,workflow_id,fname,task_id,shape):
    """
    all shape aggregation will have a summary file - with one line per subject
    :return:
    """
    # the summary file will contain just one line per subject
    id_ = task_id,shape,"summary"
    self.csv_files[id_] = open(output_directory+fname+"_"+shape+"_summary.csv","wb")
    header = "subject_id"

    # extract only the tools which can actually make markings of the desired shape
    for tool_id in sorted(self.instructions[workflow_id][task_id]["tools"].keys()):
        tool_id = int(tool_id)
        # self.workflows[workflow_id][0] is the list of classification tasks
        # we want [1] which is the list of marking tasks
        found_shape = self.workflows[workflow_id][1][task_id][tool_id]
        if found_shape == shape:
            tool_label = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
            tool_label = helper_functions.csv_string(tool_label)
            header += ",median(" + tool_label + ")"

    header += ",mean_probability,median_probability,mean_tool,median_tool"
    self.csv_files[id_].write(header+"\n")

def __classification_header__(self,output_directory,workflow_id,task_id,tool_id=None,followup_id=None):
    assert (tool_id is None) or (followup_id is not None)

    # start with the detailed results
    fname = self.__get_filename__(workflow_id,task_id,tool_id=tool_id,followup_id=followup_id)
    id_ = (workflow_id,task_id,tool_id,followup_id,"detailed")
    self.file_names[id_] = output_directory+fname
    with open(output_directory+fname,"wb") as detailed_results:
        # now write the headers
        detailed_results.write("subject_id")
        if tool_id is not None:
            detailed_results.write(",cluster_id")

        answer_dict = self.instructions[workflow_id][task_id]["answers"]
        for answer_key in sorted(answer_dict.keys()):
            # break this up into multiple lines so we can be sure that the answers are sorted correctly
            # order might not matter in the end, but just to be sure
            answer = answer_dict[answer_key]
            answer_string = helper_functions.csv_string(answer)[:50]
            detailed_results.write(",p("+answer_string+")")
        detailed_results.write(",num_users\n")

    # now set up the summary file
    fname = self.__get_filename__(workflow_id,task_id,summary=True,tool_id=tool_id,followup_id=followup_id)
    id_ = (workflow_id,task_id,tool_id,followup_id,"summary")
    self.file_names[id_] = output_directory+fname
    # keep this handle open - later summary rows are written through self.csv_files,
    # so wrapping it in a "with" block would leave a closed file behind
    self.csv_files[id_] = open(output_directory+fname,"wb")
    self.csv_files[id_].write("subject_id,")
    if tool_id is not None:
        self.csv_files[id_].write("cluster_id,")
    self.csv_files[id_].write("most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users\n")

def __make_files__(self,workflow_id):
    """
    create all of the files necessary for this workflow
    :param workflow_id:
    :return:
    """
    # delete any reference to previous csv outputs - this means we don't have to worry about using
    # workflow ids in the keys and makes things simpler
    self.file_names = {}

    # now create a sub directory specific to the workflow
    try:
        workflow_name = self.workflow_names[workflow_id]
    except KeyError:
        warning(self.workflows)
        warning(self.workflow_names)
        raise

    # workflow names might have characters (such as spaces) which shouldn't be part of a file name,
    # so clean up the workflow names
    workflow_name = helper_functions.csv_string(workflow_name)
    output_directory = "/tmp/"+str(self.project_id)+"/"+str(workflow_id)+"_"+workflow_name+"/"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    self.workflow_directories[workflow_id] = output_directory

    # create the csv files for the classification tasks (both simple and follow up ones)
    self.__classification_file_setup__(output_directory,workflow_id)

    # now set things up for the marking tasks
    self.__marking_file_setup__(output_directory,workflow_id)

    # and finally the survey tasks
    self.__survey_file_setup__(output_directory,workflow_id)

    return output_directory

def __survey_row__(self,instructions,aggregations):
    """
    for a given workflow, task and subject, print one row of aggregations per species found to a csv file
    where the task corresponds to a survey task
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    # what we are returning (to be printed out to file elsewhere)
    rows = []

    # in dev - for a small project a few bad aggregations got into the system - so filter them out
    if max(aggregations["num species"]) == 0:
        return []

    # on average, how many species did people see?
    # note - nothing here (or empty or whatever) counts as a species - we just won't give any follow up
    # answer responses
    species_in_subject = aggregations.get("num species in image", [])
    views_of_subject = aggregations["num users"]
    pielou = aggregations["pielou index"]

    # only go through the top X species - where X is the median number of species seen
    for species_id,_ in species_in_subject:
        if species_id == "num users":
            continue

        # how many people voted for this species?
        num_votes = aggregations[species_id]["num votes"]
        percentage = num_votes/float(views_of_subject)

        # extract the species name - just to be sure, make sure that the label is "csv safe"
        species_label = helper_functions.csv_string(instructions["species"][species_id])

        row = "," + str(views_of_subject) + "," + str(pielou) + "," + species_label + "," + str(percentage) + "," + str(num_votes)

        # if there is nothing here - there are no follow up questions so just move on
        # same with FR - fire, NTHNG - nothing
        if species_id in ["NTHNGHR","NTHNG","FR"]:
            break

        # do the how many question first
        row += self.__survey_how_many__(instructions,aggregations,species_id)

        # now go through each of the other follow up questions
        # for followup_id in instructions["questionsOrder"]:
        for followup_id in instructions["questions"].keys():
            followup_question = instructions["questions"][followup_id]

            if followup_question["label"] == "How many?":
                # this gets dealt with separately
                continue

            multiple_answers = instructions["questions"][followup_id]["multiple"]

            # this follow up question might not be relevant to the particular species
            if followup_id not in aggregations[species_id]["followup"]:
                # if we do not allow for multiple answers, include a blank column for the top candidate
                if not multiple_answers:
                    row += ","
                # add in a blank column for each follow up answer (since none of these answers are relevant)
                for _ in instructions["questions"][followup_id]["answersOrder"]:
                    row += ","
            else:
                votes = aggregations[species_id]["followup"][followup_id]

                # if users are only allowed to pick a single answer - return the most likely answer
                # but still give the individual break downs
                if not multiple_answers:
                    answers = instructions["questions"][followup_id]["answers"]
                    top_candidate,percent = self.__get_top_survey_followup__(votes,answers)
                    row += "," + str(top_candidate)  # + "," + str(percent)

                for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                    if answer_id in votes:
                        row += "," + str(votes[answer_id]/float(num_votes))
                    else:
                        row += ",0"

        rows.append(row+"\n")

    return rows

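# __survey_how_many__ is defined elsewhere. Based on the csv headers written in
# __survey_file_setup__ (minimum, most likely + percentage, maximum number of
# animals), a minimal sketch - the "HWMN" vote structure and the numeric answer
# keys are assumptions, not taken from the original source:
#
#     def __survey_how_many__(self, instructions, aggregations, species_id):
#         votes = aggregations[species_id]["followup"].get("HWMN", {})
#         if votes == {}:
#             return ",NA,NA,NA,NA"
#         total = float(sum(votes.values()))
#         most_likely, count = max(votes.items(), key=lambda x: x[1])
#         ordered = sorted(votes.keys(), key=int)
#         return "," + str(ordered[0]) + "," + str(most_likely) + "," + str(count/total) + "," + str(ordered[-1])
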
def __survey_row__(self,instructions,aggregations):
    """
    for a given workflow, task and subject, print one row of aggregations per species found to a csv file
    where the task corresponds to a survey task
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    # what we are returning (to be printed out to file elsewhere)
    rows = []

    # in dev - for a small project a few bad aggregations got into the system - so filter them out
    if max(aggregations["num species"]) == 0:
        return []

    # on average, how many species did people see?
    # note - nothing here (or empty or whatever) counts as a species - we just won't give any follow up
    # answer responses
    species_in_subject = self.__get_species_in_subject(aggregations)
    views_of_subject = aggregations["num users"]
    pielou = self.__calc__pielou__(aggregations)

    # only go through the top X species - where X is the median number of species seen
    for species_id,_ in species_in_subject:
        if species_id == "num users":
            continue

        # how many people voted for this species?
        num_votes = aggregations[species_id]["num votes"]
        # percentage = num_votes/float(views_of_subject)

        # extract the species name - just to be sure, make sure that the label is "csv safe"
        species_label = helper_functions.csv_string(instructions["species"][species_id])

        row = "," + str(views_of_subject) + "," + str(pielou) + "," + species_label + "," + str(num_votes)

        # if there is nothing here - there are no follow up questions so just move on
        # same with FR - fire, NTHNG - nothing
        if species_id in ["NTHNGHR","NTHNG","FR"]:
            break

        # do the how many question first
        row += self.__survey_how_many__(instructions,aggregations,species_id)

        # now go through each of the other follow up questions
        for followup_id in instructions["questionsOrder"]:
            followup_question = instructions["questions"][followup_id]

            if followup_question["label"] == "How many?":
                # this gets dealt with separately
                continue

            # this follow up question might not be relevant to the particular species
            if followup_id not in aggregations[species_id]["followup"]:
                for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                    row += ","
            else:
                votes = aggregations[species_id]["followup"][followup_id]

                # if users are only allowed to pick a single answer - return the most likely answer
                # but still give the individual break downs
                multiple_answers = instructions["questions"][followup_id]["multiple"]
                if not multiple_answers:
                    answers = instructions["questions"][followup_id]["answers"]
                    # pass the (answer,count) pairs along without rebinding votes - the
                    # membership checks below still need the dict
                    top_candidate,percent = self.__get_top_survey_followup__(votes.items(),answers)
                    row += "," + str(top_candidate) + "," + str(percent)

                for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                    if answer_id in votes:
                        row += "," + str(votes[answer_id]/float(num_votes))
                    else:
                        row += ",0"

        rows.append(row+"\n")

    return rows

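# __calc__pielou__ is defined elsewhere. Pielou's evenness index is the Shannon
# entropy of the species vote distribution divided by its maximum possible
# value, ln(S). A minimal sketch, assuming the per-species vote counts can be
# read straight out of the aggregations dict:
#
#     import math
#
#     def __calc__pielou__(self, aggregations):
#         counts = [v["num votes"] for v in aggregations.values()
#                   if isinstance(v, dict) and "num votes" in v]
#         if len(counts) <= 1:
#             return 0
#         total = float(sum(counts))
#         entropy = -sum((c/total) * math.log(c/total) for c in counts if c > 0)
#         return entropy / math.log(len(counts))
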