Example #1
    def __set_tags__(self,text):
        # convert to ascii
        try:
            text = text.encode('ascii','ignore')
        except AttributeError:
            warning(text)
            raise

        # good place to check if there is a newline character in the transcription
        # if so, someone tried to transcribe multiple lines at once - this is no longer allowed
        # but there are some legacy transcriptions with \n - such transcriptions are simply ignored
        if "\n" in text:
            return ""

        # the order of the keys matters - we need it to be constant across all use cases
        # we could sort .items() but that would be a rather large statement
        # replace each tag with a single non-standard ascii character (given by chr(num) for some number)
        text = text.strip()

        for key in sorted(self.tags.keys()):
            tag = self.tags[key]
            text = re.sub(tag,chr(key),text)

        # get rid of some other random tags and commands that shouldn't be included at all
        # todo - generalize
        text = re.sub("<br>","",text)
        text = re.sub("<font size=\"1\">","",text)
        text = re.sub("</font>","",text)
        text = re.sub("&nbsp","",text)
        text = re.sub("&amp","&",text)
        text = re.sub("\?\?\?","",text)

        return text
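
For reference, a minimal standalone sketch of the same tag-tokenizing idea: each multi-character tag is swapped for a single placeholder character above the printable ASCII range so that later processing treats the whole tag as one symbol. The tags dictionary and the function name here are invented for illustration.

import re

# hypothetical tag table: placeholder code point -> tag text
tags = {200: "[unclear]", 201: "[deletion]"}

def tokenize_tags(text, tags):
    # drop non-ascii characters and reject multi-line transcriptions
    text = text.encode('ascii', 'ignore').decode('ascii')
    if "\n" in text:
        return ""
    text = text.strip()
    # iterate in sorted key order so every call uses the same substitution order
    for key in sorted(tags):
        text = re.sub(re.escape(tags[key]), chr(key), text)
    return text

print(tokenize_tags("a [unclear]word[deletion]", tags))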
Example #2
    def __set_tags__(self, text):
        # convert to ascii
        try:
            text = text.encode('ascii', 'ignore')
        except AttributeError:
            warning(text)
            raise

        # good place to check if there is a newline character in the transcription
        # if so, someone tried to transcribe multiple lines at once - this is no longer allowed
        # but there are some legacy transcriptions with \n - such transcriptions are simply ignored
        if "\n" in text:
            return ""

        # the order of the keys matters - we need it to be constant across all use cases
        # we could sort .items() but that would be a rather large statement
        # replace each tag with a single non-standard ascii character (given by chr(num) for some number)
        text = text.strip()

        for key in sorted(self.tags.keys()):
            tag = self.tags[key]
            text = re.sub(tag, chr(key), text)

        # get rid of some other random tags and commands that shouldn't be included at all
        # todo - generalize
        text = re.sub("<br>", "", text)
        text = re.sub("<font size=\"1\">", "", text)
        text = re.sub("</font>", "", text)
        text = re.sub("&nbsp", "", text)
        text = re.sub("&amp", "&", text)
        text = re.sub("\?\?\?", "", text)

        return text
Example #3
    def __make_files__(self,workflow_id):
        """
        create all of the files necessary for this workflow
        :param workflow_id:
        :return:
        """
        # close any previously used files (and delete their pointers)
        for f in self.csv_files.values():
            f.close()
        self.csv_files = {}

        # now create a sub directory specific to the workflow
        try:
            workflow_name = self.workflow_names[workflow_id]
        except KeyError:
            warning(self.workflows)
            warning(self.workflow_names)
            raise

        workflow_name = helper_functions.csv_string(workflow_name)
        output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        self.workflow_directories[workflow_id] = output_directory

        classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

        # go through the classification tasks - each is either a simple classification task (one answer allowed),
        # a multiple-choice classification task (more than one answer allowed), or a follow-up question to a marking task
        for task_id in classification_tasks:
            # is this task a simple classification task?
            # don't care whether the question allows multiple answers or requires a single one
            if classification_tasks[task_id] in ["single","multiple"]:
                self.__classification_header__(output_directory,workflow_id,task_id)

            else:
                # this classification task is actually a follow up to a marking task
                for tool_id in classification_tasks[task_id]:
                    for followup_id,answer_type in enumerate(classification_tasks[task_id][tool_id]):
                        # instructions = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_index]["question"]
                        self.__classification_header__(output_directory,workflow_id,task_id,tool_id,followup_id)
                        # id_ = (task_id,tool_id,followup_index)
                        # if answer_type == "single":
                        #     self.__single_response_csv_header__(output_directory,id_,instructions)
                        # else:
                        #     self.__multi_response_csv_header__(output_directory,id_,instructions)

        # now set things up for the marking tasks
        for task_id in marking_tasks:
            shapes = set(marking_tasks[task_id])
            self.__marking_header_setup__(workflow_id,task_id,shapes,output_directory)

        # and finally the survey tasks
        for task_id in survey_tasks:
            instructions = self.instructions[workflow_id][task_id]
            self.__survey_header_setup__(output_directory,task_id,instructions)

        return output_directory
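
helper_functions.csv_string is not shown in these examples; a rough sketch of what the sanitizing plus per-workflow directory step might look like, with csv_safe_name written here as an assumption rather than the project's actual helper, and the ids invented.

import os
import re

def csv_safe_name(name):
    # assumed behaviour: keep only characters that are safe in a filename/CSV header
    return re.sub(r"[^A-Za-z0-9_]+", "_", name).strip("_")

def workflow_directory(project_id, workflow_id, workflow_name, root="/tmp"):
    # one sub directory per workflow, e.g. /tmp/<project_id>/<workflow_id>_<name>/
    directory = os.path.join(root, str(project_id),
                             str(workflow_id) + "_" + csv_safe_name(workflow_name))
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory + "/"

# made-up project and workflow ids
print(workflow_directory(376, 121, "Transcribe Text?"))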
Example #4
    def __aggregate__(self,raw_classifications,workflow_id,aggregations):
        # start by looking for empty subjects

        self.to_retire = set()
        for subject_id in raw_classifications["T0"]:
            user_ids,is_subject_empty = zip(*raw_classifications["T0"][subject_id])
            if is_subject_empty:
                empty_count = sum([1 for i in is_subject_empty if i == True])
                if empty_count >= 3:
                    self.to_retire.add(subject_id)

        blank_retirement = len(self.to_retire)

        non_blanks = []

        # now look to see if everything has been transcribed
        for subject_id in raw_classifications["T3"]:
            user_ids,completely_transcribed = zip(*raw_classifications["T3"][subject_id])

            completely_count = sum([1 for i in completely_transcribed if i == True])
            if completely_count >= 3:
                self.to_retire.add(subject_id)
                non_blanks.append(subject_id)

        # get an updated token
        assert isinstance(self.project,AggregationAPI)
        self.project.__panoptes_connect__()
        token = self.project.token

        for retired_subject in self.to_retire:
            try:
                headers = {"Accept":"application/vnd.api+json; version=1","Content-Type": "application/json", "Authorization":"Bearer "+token}
                params = {"subject_id":retired_subject}
                r = requests.post("https://panoptes.zooniverse.org/api/workflows/"+str(workflow_id)+"/retired_subjects",headers=headers,data=json.dumps(params))
                # rollbar.report_message("results from trying to retire subjects","info",extra_data=r.text)

            except TypeError as e:
                warning(e)
                rollbar.report_exc_info()

        print("we would have retired " + str(len(self.to_retire)))
        print("with non-blanks " + str(len(self.to_retire)-blank_retirement))
        print(str(len(self.to_retire)-blank_retirement))

        self.num_retired = len(self.to_retire)
        self.non_blanks_retired = len(self.to_retire)-blank_retirement

        return aggregations
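
The retirement rule above is just a count: retire a subject once at least three volunteers flag it as blank (T0) or completely transcribed (T3). A self-contained sketch of that counting on made-up data:

def subjects_to_retire(responses, threshold=3):
    """responses maps subject_id -> list of (user_id, flag) tuples."""
    retired = set()
    for subject_id, votes in responses.items():
        agree = sum(1 for _, flag in votes if flag)
        if agree >= threshold:
            retired.add(subject_id)
    return retired

# made-up data: subject 1 has three "blank" votes, subject 2 only one
raw = {1: [(10, True), (11, True), (12, True)], 2: [(10, True), (13, False)]}
print(subjects_to_retire(raw))  # {1}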
Example #5
    def __marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
        """
        output for line segments
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations:
        :return:
        """
        key = task_id,shape,"detailed"
        for cluster_index,cluster in aggregations[shape + " clusters"].items():
            if cluster_index == "all_users":
                continue

            # build up the row bit by bit to have the following structure
            # "subject_id,most_likely_tool,x,y,p(most_likely_tool),p(true_positive),num_users"
            row = str(subject_id)+","
            # todo for now - always give the cluster index
            row += str(cluster_index)+","

            # extract the most likely tool for this particular marking and convert it to
            # a string label
            try:
                tool_classification = cluster["tool_classification"][0].items()
            except KeyError:
                warning(shape)
                warning(cluster)
                raise
            most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
            tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
            row += helper_functions.csv_string(tool_str) + ","

            # get the central coordinates next
            for center_param in cluster["center"]:
                if isinstance(center_param,list) or isinstance(center_param,tuple):
                    row += "\"" + str(tuple(center_param)) + "\","
                else:
                    row += str(center_param) + ","

            # add on how likely the most likely tool was
            row += str(tool_probability) + ","
            # how likely the cluster is to being a true positive and how many users (out of those who saw this
            # subject) actually marked it. For the most part p(true positive) is equal to the percentage
            # of people, so slightly redundant but allows for things like weighted voting and IBCC in the future
            prob_true_positive = cluster["existence"][0]["1"]
            num_users = cluster["existence"][1]
            row += str(prob_true_positive) + "," + str(num_users)
            self.csv_files[key].write(row+"\n")
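
A sketch of the row layout the comment describes (subject_id, cluster_index, most_likely_tool, center coordinates, p(most_likely_tool), p(true_positive), num_users), built from a toy cluster dictionary; the field order mirrors the method above, but the data and helper name are invented.

def marking_row(subject_id, cluster_index, cluster, tool_labels):
    # pick the tool with the highest probability for this cluster
    tool_probs = cluster["tool_classification"][0]
    most_likely_tool, tool_probability = max(tool_probs.items(), key=lambda x: x[1])
    fields = [str(subject_id), str(cluster_index), tool_labels[int(most_likely_tool)]]
    for c in cluster["center"]:
        if isinstance(c, (list, tuple)):
            fields.append('"' + str(tuple(c)) + '"')  # quote so the embedded comma survives CSV
        else:
            fields.append(str(c))
    fields.append(str(tool_probability))
    fields.append(str(cluster["existence"][0]["1"]))   # p(true positive)
    fields.append(str(cluster["existence"][1]))        # number of users
    return ",".join(fields)

toy = {"tool_classification": [{"0": 0.8, "1": 0.2}],
       "center": [10.5, 20.0],
       "existence": [{"1": 0.75}, 4]}
print(marking_row(17, 0, toy, {0: "line", 1: "point"}))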
Example #6
    def __classification_output__(self,workflow_id,task_id,subject_id,aggregations,shape_id=None,followup_id=None):
        """
        add a row to both the summary and detailed csv output files
        """
        # a dictionary containing the index id of each answer and its corresponding label
        answer_dict = self.instructions[workflow_id][task_id]["answers"]

        # start with the summary file
        id_ = (workflow_id,task_id,shape_id,followup_id,"summary")

        try:
            self.__add_summary_row__(id_,subject_id,aggregations,answer_dict)

            id_ = (workflow_id,task_id,shape_id,followup_id,"detailed")
            self.__add_detailed_row__(id_,subject_id,aggregations,answer_dict)
        except ValueError:
             warning("empty aggregations for workflow id " + str(workflow_id) + " task id " + str(task_id) + " and subject id" + str(subject_id) + " -- skipping")
Example #7
    def __add_summary_row__(self,id_,subject_id,results,answer_dict):
        """
        given a result for a specific subject (and possibly a specific cluster within that specific subject)
        add one row of results to the summary file. that row contains
        subject_id,tool_index,cluster_index,most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
        tool_index & cluster_index are only there if we have a follow up to marking task
        :param id_:
        :param subject_id:
        :param results:
        :param answer_dict:
        :return:
        """
        votes,num_users = results

        # get the top choice
        try:
            most_likely,top_probability = max(votes.items(), key = lambda x:x[1])
        except ValueError:
            warning(results)
            raise

        # extract the text corresponding to the most likely answer
        most_likely_label = answer_dict[int(most_likely)]
        # and get rid of any bad characters
        most_likely_label = helper_functions.csv_string(most_likely_label)

        probabilities = votes.values()
        entropy = self.__shannon_entropy__(probabilities)

        mean_p = np.mean(votes.values())
        median_p = np.median(votes.values())

        with open(self.file_names[id_],"a") as results_file:
            results_file.write(str(subject_id)+",")

            # write out details regarding the top choice
            # this might not be a useful value if multiple choices are allowed - in which case just ignore it
            results_file.write(str(most_likely_label)+","+str(top_probability))
            # write out some summaries about the distributions of people's answers
            # again entropy probably only makes sense if only one answer is allowed
            # and mean_p and median_p probably only make sense if multiple answers are allowed
            # so people will need to pick and choose what they want
            results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))
            # finally - how many people have seen this subject for this task
            results_file.write(","+str(num_users)+"\n")
    def __cluster__(self,used_shapes,raw_markings,image_dimensions,aggregations):
        """
        :param aggregations: we're working on a subject by subject basis - aggregations is from previous subjects
        """
        if raw_markings == {}:
            warning("skipping")
            return aggregations

        # start by clustering text
        # print("clustering text")
        # cluster_aggregations = {}
        cluster_aggregations = self.text_algorithm.__aggregate__(raw_markings,image_dimensions)

        aggregations = self.__merge_aggregations__(aggregations,cluster_aggregations)
        # print("clustering images")
        image_aggregations = self.image_algorithm.__aggregate__(raw_markings,image_dimensions)

        aggregations = self.__merge_aggregations__(aggregations,image_aggregations)
        return aggregations
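
__merge_aggregations__ is also not shown; one reasonable sketch, written as an assumption rather than the project's implementation, is a recursive dictionary merge so the text clusters and image clusters for a subject end up in a single structure.

def merge_aggregations(base, extra):
    """Recursively fold extra into base; nested dicts are merged, other values overwritten."""
    merged = dict(base)
    for key, value in extra.items():
        if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
            merged[key] = merge_aggregations(merged[key], value)
        else:
            merged[key] = value
    return merged

a = {"subject 1": {"text clusters": {"0": "..."}}}
b = {"subject 1": {"image clusters": {"0": "..."}}}
print(merge_aggregations(a, b))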
Example #9
    def __setup_aligned_text__(self,aligned_text,text_coordinates,user_ids,x1,y1,x2,y2):
        """
        when printing out the individual transcriptions that make up a cluster we need to do a few things
        including sorting them.
        :return:
        """
        # todo - honestly not sure if most of this function is necessary

        new_aligned = []

        for t in aligned_text:
            # todo - figure out if this is necessary or useful
            if t is None:
                warning("text was none - really not sure why but skipping")
                continue
            # put tags back into multicharacter format
            t = self.__reset_tags__(t)
            # instead of chr(24), use "\u0018" - postgres prefers that
            new_aligned.append(t.replace(chr(24),unicode("\u0018")))

        # if the text is horizontal - i.e. the angle of the center is less than 45 degrees
        # sort the aligned text by x coordinates - otherwise sort by DECREASING y coordinates
        # (since 0,0 is at the top left)
        try:
            tan_theta = math.fabs(y1-y2)/math.fabs(x1-x2)
            theta = math.atan(tan_theta)
        except ZeroDivisionError:
            theta = math.pi/2.

        # horizontal
        # pretty sure that X1 < X2 but don't want to make an assumption
        if math.fabs(theta) <= math.pi/4.:
            starting_coordinates = [min(x1,x2) for x1,x2,_,_ in text_coordinates]
        # vertical text
        # not sure whether Y1 < Y2 so playing it safe
        else:
            starting_coordinates = [-max(y1,y2) for _,_,y1,y2 in text_coordinates]
        text_and_ids_with_coordinates = zip(starting_coordinates,new_aligned,user_ids)
        # sort
        text_and_ids_with_coordinates.sort(key = lambda x:x[0])
        _,aligned_text,user_id = zip(*text_and_ids_with_coordinates)

        return aligned_text
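
A compact sketch of the sort rule the comments describe: if the cluster's line is within 45 degrees of horizontal, order the member transcriptions by their starting x coordinate, otherwise by decreasing y (the origin is at the top left). The argument shapes follow the code above; the function name and sample values are invented.

import math

def sort_by_reading_order(texts, coords, x1, y1, x2, y2):
    """coords is a list of (x1, x2, y1, y2) per transcription, matching texts."""
    try:
        theta = math.atan(abs(y1 - y2) / abs(x1 - x2))
    except ZeroDivisionError:
        theta = math.pi / 2.0
    if abs(theta) <= math.pi / 4.0:           # roughly horizontal
        keys = [min(cx1, cx2) for cx1, cx2, _, _ in coords]
    else:                                     # roughly vertical
        keys = [-max(cy1, cy2) for _, _, cy1, cy2 in coords]
    ordered = sorted(zip(keys, texts), key=lambda pair: pair[0])
    return [t for _, t in ordered]

print(sort_by_reading_order(["b", "a"], [(50, 90, 10, 10), (0, 40, 10, 10)],
                            0, 10, 90, 12))   # ['a', 'b']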
Example #10
    def __cluster__(self,used_shapes,raw_markings,image_dimensions):
        """
        for when I want to see raw classifications in addition to markings
        :param workflow_id:
        :return:
        """

        if raw_markings == {}:
            warning("warning - empty set of images")
            return {}

        # start by clustering text
        print("clustering text")
        cluster_aggregation = self.text_algorithm.__aggregate__(raw_markings,image_dimensions)
        print("clustering images")
        image_aggregation = self.image_algorithm.__aggregate__(raw_markings,image_dimensions)

        cluster_aggregation = self.__merge_aggregations__(cluster_aggregation,image_aggregation)

        return cluster_aggregation
Example #11
    def __line_alignment__(self,lines):
        """
        align the text using MAFFT
        :param lines:
        :return:
        """

        aligned_text = []

        if len(lines) == 1:
            return lines


        with tempfile.NamedTemporaryFile(suffix=".fasta") as in_file, tempfile.NamedTemporaryFile("r") as out_file:
            for line in lines:
                if isinstance(line,tuple):
                    # we have a list of text segments which we should join together
                    line = "".join(line)

                # line = unicodedata.normalize('NFKD', line).encode('ascii','ignore')
                # assert isinstance(line,str)

                # for i in range(max_length-len(line)):
                #     fasta_line += "-"

                try:
                    in_file.write(">\n"+line+"\n")
                except UnicodeEncodeError:
                    warning(line)
                    warning(unicodedata.normalize('NFKD', line).encode('ascii','ignore'))
                    raise
            in_file.flush()

            # todo - play around with gap penalty --op 0.5
            t = "mafft --op 0.85 --text " + in_file.name + " > " + out_file.name +" 2> /dev/null"
            # t = "mafft --text " + in_file.name + " > " + out_file.name +" 2> /dev/null"

            os.system(t)

            cumulative_line = ""
            for line in out_file.readlines():
                if (line == ">\n"):
                    if (cumulative_line != ""):
                        aligned_text.append(cumulative_line)
                        cumulative_line = ""
                else:
                    cumulative_line += line[:-1]

            if cumulative_line == "":
                warning(lines)
                assert False
            aligned_text.append(cumulative_line)

        # no idea why mafft sometimes includes this line in the output
        # also might just be affecting Greg's computer
        if aligned_text[0] == '/usr/lib/mafft/lib/mafft':
            return aligned_text[1:]
        else:
            return aligned_text
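
The same call pattern sketched with subprocess instead of os.system, so a non-zero exit status is surfaced; the --text and --op options come from the command above, everything else is illustrative, and mafft must be installed for this to run.

import subprocess
import tempfile

def mafft_align(lines, gap_penalty=0.85):
    """Write one FASTA record per line, run mafft --text, and read back the alignment."""
    with tempfile.NamedTemporaryFile("w+", suffix=".fasta") as in_file, \
            tempfile.NamedTemporaryFile("w+") as out_file:
        for line in lines:
            in_file.write(">\n" + line + "\n")
        in_file.flush()

        subprocess.run(["mafft", "--op", str(gap_penalty), "--text", in_file.name],
                       stdout=out_file, stderr=subprocess.DEVNULL, check=True)
        out_file.seek(0)

        aligned, current = [], ""
        for line in out_file.readlines():
            if line == ">\n":
                if current:
                    aligned.append(current)
                    current = ""
            else:
                current += line.rstrip("\n")
        if current:
            aligned.append(current)
        return aligned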
    def __cluster__(self, used_shapes, raw_markings, image_dimensions,
                    aggregations):
        """
        :param aggregations: we're working on a subject by subject basis - aggregations is from previous subjects
        """
        if raw_markings == {}:
            warning("skipping")
            return aggregations

        # start by clustering text
        # print("clustering text")
        # cluster_aggregations = {}
        cluster_aggregations = self.text_algorithm.__aggregate__(
            raw_markings, image_dimensions)

        aggregations = self.__merge_aggregations__(aggregations,
                                                   cluster_aggregations)
        # print("clustering images")
        image_aggregations = self.image_algorithm.__aggregate__(
            raw_markings, image_dimensions)

        aggregations = self.__merge_aggregations__(aggregations,
                                                   image_aggregations)
        return aggregations
Example #13
    def __make_files__(self,workflow_id):
        """
        create all of the files necessary for this workflow
        :param workflow_id:
        :return:
        """
        # delete any reference to previous csv outputs - this means we don't have to worry about using
        # workflow ids in the keys and makes things simpler
        self.file_names = {}

        # now create a sub directory specific to the workflow
        try:
            workflow_name = self.workflow_names[workflow_id]
        except KeyError:
            warning(self.workflows)
            warning(self.workflow_names)
            raise

        # workflow names might have characters (such as spaces) which shouldn't be part of a filename, so clean up the
        # workflow names
        workflow_name = helper_functions.csv_string(workflow_name)
        output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        self.workflow_directories[workflow_id] = output_directory

        # create the csv files for the classification tasks (both simple and follow up ones)
        self.__classification_file_setup__(output_directory,workflow_id)

        # now set things up for the marking tasks
        self.__marking_file_setup__(output_directory,workflow_id)

        self.__survey_file_setup__(output_directory,workflow_id)

        return output_directory
    def __aggregate__(self,raw_classifications,workflow,aggregations,workflow_id):
        """
        classification aggregation for annotate/folger means looking for subjects which we can retire
        :param raw_classifications:
        :param workflow:
        :param aggregations:
        :param workflow_id:
        :return:
        """

        if not isinstance(workflow_id, int):
            raise TypeError('workflow_id must be an int')

        to_retire = set()
        # start by looking for empty subjects
        # "T0" really should always be there but we may have a set of classifications (really old ones before
        # the workflow changed) where it is missing - if "T0" isn't there, just skip
        if "T0" in raw_classifications:
            to_retire.update(self.__get_blank_subjects__(raw_classifications))

        # now look to see what has been completely transcribed
        if "T3" in raw_classifications:
            to_retire.update(self.__get_completed_subjects__(raw_classifications))

        # call the Panoptes API to retire these subjects
        # get an updated token
        time_delta = datetime.datetime.now()-self.token_date
        # update every 30 minutes
        if time_delta.seconds > (30*60):
            self.token_date = datetime.datetime.now()
            if not isinstance(self.project, AggregationAPI):
                raise TypeError(
                    'self.project must be an AggregationAPI instance'
                )
            self.project.__panoptes_connect__()

        token = self.project.token

        self.total_retired += len(to_retire)

        # need to retire the subjects one by one
        for retired_subject in to_retire:
            self.to_retire.add(retired_subject)
            try:
                headers = {"Accept":"application/vnd.api+json; version=1","Content-Type": "application/json", "Authorization":"Bearer "+token}
                params = {"subject_id":retired_subject}
                r = requests.post("https://panoptes.zooniverse.org/api/workflows/"+str(workflow_id)+"/retired_subjects",headers=headers,data=json.dumps(params))
                r.raise_for_status()
            except TypeError as e:
                warning(e)
                rollbar.report_exc_info()

        # if to_retire != set():
        #     print("total retired so far " + str(self.total_retired))

        # print("we would have retired " + str(len(self.to_retire)))
        # print("with non-blanks " + str(len(self.to_retire)-blank_retirement))
        # print(str(len(self.to_retire)-blank_retirement))
        #
        # self.num_retired = len(self.to_retire)
        # self.non_blanks_retired = len(self.to_retire)-blank_retirement

        return aggregations
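
The 30-minute token refresh above can be isolated into a tiny helper; a sketch with the Panoptes login stubbed out (the real code delegates to AggregationAPI.__panoptes_connect__).

import datetime

class TokenCache(object):
    """Re-authenticate only when the cached token is older than max_age."""
    def __init__(self, connect, max_age=datetime.timedelta(minutes=30)):
        self.connect = connect            # callable returning a fresh bearer token
        self.max_age = max_age
        self.token = None
        self.token_date = datetime.datetime.min

    def get(self):
        if datetime.datetime.now() - self.token_date > self.max_age:
            self.token = self.connect()
            self.token_date = datetime.datetime.now()
        return self.token

cache = TokenCache(lambda: "fake-token")   # stand-in for the real Panoptes login
print(cache.get())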
                    }
                }
            },
            ReplyToAddresses=[
                '*****@*****.**',
            ],
            ReturnPath='*****@*****.**'
        )
        print("response from emailing results")
        print(response)

if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:],"shi:e:d:",["summary","project_id=","environment=","end_date="])
    except getopt.GetoptError:
        warning('transcription.py -i <project_id> -e <environment> -d <end_date>')
        sys.exit(2)

    environment = os.environ.get('ENVIRONMENT', 'development')
    project_id = None
    end_date = None
    summary = False

    for opt, arg in opts:
        if opt in ["-i","--project_id"]:
            project_id = int(arg)
        elif opt in ["-e","--environment"]:
            environment = arg
        elif opt in ["-d","--end_date"]:
            end_date = parser.parse(arg)
        elif opt in ["-s","--summary"]:
Example #16
    def __subject_output__(self,workflow_id,subject_id,aggregations):
        """
        add csv rows for all the output related to this particular workflow/subject_id
        :param workflow_id:
        :param subject_id:
        :param aggregations:
        :return:
        """

        classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

        for task_id,task_type in classification_tasks.items():
            # a subject might not have results for all tasks
            if task_id not in aggregations:
                continue

            # we have follow up questions
            if isinstance(task_type,dict):
                for tool_id in task_type:
                    for followup_index,answer_type in enumerate(task_type[tool_id]):
                        # what sort of shape are we looking for - help us find relevant clusters
                        shape = self.workflows[workflow_id][1][task_id][tool_id]
                        for cluster_index,cluster in aggregations[task_id][shape + " clusters"].items():
                            if cluster_index == "all_users":
                                continue

                            classification = cluster["tool_classification"][0]
                            most_likely_tool,_ = max(classification.items(),key = lambda x:x[1])

                            # only consider clusters which most likely correspond to the correct tool
                            if int(most_likely_tool) != int(tool_id):
                                continue

                            possible_answers = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_index]["answers"]
                            if "followup_question" not in aggregations[task_id][shape + " clusters"][cluster_index]:
                                print("missing follow up response")
                                continue

                            try:
                                results = aggregations[task_id][shape + " clusters"][cluster_index]["followup_question"][str(followup_index)]
                            except KeyError:
                                warning(aggregations[task_id][shape + " clusters"][cluster_index])
                                raise
                            id_ = task_id,tool_id,followup_index
                            if answer_type == "single":
                                self.__single_choice_classification_row__(possible_answers,id_,subject_id,results,cluster_index)
                            else:
                                self.__multi_choice_classification_row__(possible_answers,id_,subject_id,results,cluster_index)
            else:
                results = aggregations[task_id]
                self.__classification_output__(workflow_id,task_id,subject_id,results)

        for task_id,possible_shapes in marking_tasks.items():
            for shape in set(possible_shapes):
                # not every task has been done for every aggregation
                if task_id in aggregations:
                    if shape == "polygon":
                        self.__polygon_row__(workflow_id,task_id,subject_id,aggregations[task_id])
                        self.__polygon_summary_output__(workflow_id,task_id,subject_id,aggregations[task_id])
                    else:
                        self.__marking_row__(workflow_id,task_id,subject_id,aggregations[task_id],shape)
                        self.__shape_summary_output__(workflow_id,task_id,subject_id,aggregations,shape)

        for task_id in survey_tasks:
            instructions = self.instructions[workflow_id][task_id]

            # id_ = (task_id,"summary")
            # with open(self.file_names[id_],"a") as f:
            #     summary_line = self.__survey_summary_row(aggregations)
            #     f.write(str(subject_id)+summary_line)

            id_ = (task_id,"detailed")
            with open(self.file_names[id_],"a") as f:
                detailed_lines = self.__survey_row__(instructions,aggregations)
                for l in detailed_lines:
                    f.write(str(subject_id)+l)
Example #17
    def __restructure_json__(self):
        print("restructing json results")
        workflow_id = self.workflows.keys()[0]

        cur = self.postgres_session.cursor()

        stmt = "select subject_id,aggregation from aggregations where workflow_id = " + str(workflow_id)
        cur.execute(stmt)

        new_json = {}

        subjects_with_results = 0

        for ii,(subject_id,aggregation) in enumerate(cur.fetchall()):
            #
            if subject_id not in self.classification_alg.to_retire:
                continue
            try:
                clusters_by_line = {}

                if isinstance(aggregation,str):
                    print("converting aggregation to string")
                    aggregation = json.loads(aggregation)

                for key,cluster in aggregation["T2"]["text clusters"].items():
                    if key == "all_users":
                        continue
                    if isinstance(cluster,str):
                        warning("cluster is in string format for some reason")
                        cluster = json.loads(cluster)

                    try:
                        # for dev only since we may not have updated every transcription
                        if cluster["cluster members"] == []:
                            continue
                    except TypeError:
                        warning(cluster)
                        raise

                    index = cluster["set index"]
                    # text_y_coord.append((cluster["center"][2],cluster["center"][-1]))

                    if index not in clusters_by_line:
                        clusters_by_line[index] = [cluster]
                    else:
                        clusters_by_line[index].append(cluster)

                cluster_set_coordinates = {}

                for set_index,cluster_set in clusters_by_line.items():
                    # clusters are based on purely horizontal lines so we don't need to take the
                    # average or anything like that.
                    # todo - figure out what to do with vertical lines, probably keep them completely separate
                    cluster_set_coordinates[set_index] = cluster_set[0]["center"][2]

                sorted_sets = sorted(cluster_set_coordinates.items(), key = lambda x:x[1])

                for set_index,_ in sorted_sets:
                    cluster_set = clusters_by_line[set_index]

                    # now, on the (admittedly slight) chance that there are multiple clusters for this line, sort them
                    # by x coordinates
                    line = [(cluster["center"][0],cluster["center"][-1]) for cluster in cluster_set]
                    line.sort(key = lambda x:x[0])
                    _,text = zip(*line)

                    text = list(text)
                    # for combining the possible multiple clusters for this line into one
                    merged_line = ""
                    for t in text:
                        # think that storing in postgres converts from str to unicode
                        # for general display, we don't need ord(24) ie skipped characters
                        new_t = t.replace(chr(24),"")
                        merged_line += new_t

                    # we seem to occasionally get lines that are just skipped characters (i.e. the string
                    # is just chr(24)) - don't report these lines
                    if merged_line != "":
                        # is this the first line we've encountered for this subject?
                        if subject_id not in new_json:
                            new_json[subject_id] = {"text":[],"individual transcriptions":[], "accuracy":[], "coordinates" : [],"users_per_line":[]}

                            # add in the metadata
                            metadata = self.__get_subject_metadata__(subject_id)["subjects"][0]["metadata"]
                            new_json[subject_id]["metadata"] = metadata

                            new_json[subject_id]["zooniverse subject id"] = subject_id

                        # add in the line of text
                        new_json[subject_id]["text"].append(merged_line)

                        # now add in the aligned individual transcriptions
                        # use the first cluster we found for this line as a "representative cluster"
                        rep_cluster = cluster_set[0]

                        zooniverse_ids = []
                        for user_id in rep_cluster["cluster members"]:
                            zooniverse_login_name = self.__get_login_name__(user_id)

                            # todo - not sure why None can be returned but does seem to happen
                            if zooniverse_login_name is not None:
                                # new_json[subject_id]["users_per_line"].append(zooniverse_login_name)
                                zooniverse_ids.append(zooniverse_login_name)
                            else:
                                zooniverse_ids.append("None")

                        # todo - if a line is transcribed completely but in distinct separate parts
                        # todo - this may cause trouble
                        new_json[subject_id]["individual transcriptions"].append(rep_cluster["aligned_text"])
                        new_json[subject_id]["users_per_line"].append(zooniverse_ids)

                        # what was the accuracy for this line?
                        accuracy = len([c for c in merged_line if ord(c) != 27])/float(len(merged_line))
                        new_json[subject_id]["accuracy"].append(accuracy)

                        # add in the coordinates
                        # this is only going to work with horizontal lines
                        line_segments = [cluster["center"][:-1] for cluster in cluster_set]
                        x1,x2,y1,y2 = zip(*line_segments)

                        # find the line segments which define the start and end of the line overall
                        x_start = min(x1)
                        x_end = max(x2)

                        start_index = np.argmin(x1)
                        end_index = np.argmax(x2)

                        y_start = y1[start_index]
                        y_end = y1[end_index]

                        new_json[subject_id]["coordinates"].append([x_start,x_end,y_start,y_end])

                # count once per subject
                subjects_with_results += 1
            except KeyError:
                pass

        json.dump(new_json,open("/tmp/"+str(self.project_id)+".json","wb"))

        aws_tar = self.__get_aws_tar_name__()
        print("saving json results")
        with tarfile.open("/tmp/"+aws_tar,mode="w") as t:
            t.add("/tmp/"+str(self.project_id)+".json")
            ReplyToAddresses=[
                '*****@*****.**',
            ],
            ReturnPath='*****@*****.**')
        print("response from emailing results")
        print(response)


if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "shi:e:d:",
            ["summary", "project_id=", "environment=", "end_date="])
    except getopt.GetoptError:
        warning(
            'transcription.py -i <project_id> -e <environment> -d <end_date>'
        )
        sys.exit(2)

    environment = os.environ.get('ENVIRONMENT', 'development')
    project_id = None
    end_date = None
    summary = False

    for opt, arg in opts:
        if opt in ["-i", "--project_id"]:
            project_id = int(arg)
        elif opt in ["-e", "--environment"]:
            environment = arg
        elif opt in ["-d", "--end_date"]:
            end_date = parser.parse(arg)
Example #19
    def __line_alignment__(self, lines):
        """
        align the text using MAFFT
        :param lines:
        :return:
        """

        aligned_text = []

        if len(lines) == 1:
            return lines

        with tempfile.NamedTemporaryFile(
                suffix=".fasta") as in_file, tempfile.NamedTemporaryFile(
                    "r") as out_file:
            for line in lines:
                if isinstance(line, tuple):
                    # we have a list of text segments which we should join together
                    line = "".join(line)

                # line = unicodedata.normalize('NFKD', line).encode('ascii','ignore')
                # assert isinstance(line,str)

                # for i in range(max_length-len(line)):
                #     fasta_line += "-"

                try:
                    in_file.write(">\n" + line + "\n")
                except UnicodeEncodeError:
                    warning(line)
                    warning(
                        unicodedata.normalize('NFKD',
                                              line).encode('ascii', 'ignore'))
                    raise
            in_file.flush()

            # todo - play around with gap penalty --op 0.5
            t = "mafft --op 0.85 --text " + in_file.name + " > " + out_file.name + " 2> /dev/null"
            # t = "mafft --text " + in_file.name + " > " + out_file.name +" 2> /dev/null"

            os.system(t)

            cumulative_line = ""
            for line in out_file.readlines():
                if (line == ">\n"):
                    if (cumulative_line != ""):
                        aligned_text.append(cumulative_line)
                        cumulative_line = ""
                else:
                    cumulative_line += line[:-1]

            if cumulative_line == "":
                warning(lines)
                assert False
            aligned_text.append(cumulative_line)

        # no idea why mafft sometimes includes this line in the output
        # also might just be affecting Greg's computer
        if aligned_text[0] == '/usr/lib/mafft/lib/mafft':
            return aligned_text[1:]
        else:
            return aligned_text
    def __aggregate__(self, raw_classifications, workflow, aggregations,
                      workflow_id):
        """
        classification aggregation for annotate/folger means looking for subjects which we can retire
        :param raw_classifications:
        :param workflow:
        :param aggregations:
        :param workflow_id:
        :return:
        """

        if not isinstance(workflow_id, int):
            raise TypeError('workflow_id must be an int')

        to_retire = set()
        # start by looking for empty subjects
        # "T0" really should always be there but we may have a set of classifications (really old ones before
        # the workflow changed) where it is missing - if "T0" isn't there, just skip
        if "T0" in raw_classifications:
            to_retire.update(self.__get_blank_subjects__(raw_classifications))

        # now look to see what has been completely transcribed
        if "T3" in raw_classifications:
            to_retire.update(
                self.__get_completed_subjects__(raw_classifications))

        # call the Panoptes API to retire these subjects
        # get an updated token
        time_delta = datetime.datetime.now() - self.token_date
        # update every 30 minutes
        if time_delta.seconds > (30 * 60):
            self.token_date = datetime.datetime.now()
            if not isinstance(self.project, AggregationAPI):
                raise TypeError(
                    'self.project must be an AggregationAPI instance')
            self.project.__panoptes_connect__()

        token = self.project.token

        self.total_retired += len(to_retire)

        # need to retire the subjects one by one
        for retired_subject in to_retire:
            self.to_retire.add(retired_subject)
            try:
                headers = {
                    "Accept": "application/vnd.api+json; version=1",
                    "Content-Type": "application/json",
                    "Authorization": "Bearer " + token
                }
                params = {"subject_id": retired_subject}
                r = requests.post(
                    "https://panoptes.zooniverse.org/api/workflows/" +
                    str(workflow_id) + "/retired_subjects",
                    headers=headers,
                    data=json.dumps(params))
                r.raise_for_status()
            except TypeError as e:
                warning(e)
                rollbar.report_exc_info()

        # if to_retire != set():
        #     print("total retired so far " + str(self.total_retired))

        # print("we would have retired " + str(len(self.to_retire)))
        # print("with non-blanks " + str(len(self.to_retire)-blank_retirement))
        # print(str(len(self.to_retire)-blank_retirement))
        #
        # self.num_retired = len(self.to_retire)
        # self.non_blanks_retired = len(self.to_retire)-blank_retirement

        return aggregations
Example #21
class FolgerClustering(TextClustering):
    def __init__(self, shape, project, param_dict):
        TextClustering.__init__(self, shape, project, param_dict)

        self.folger_safe_tags = dict()

        # for folger the tags in the transcriptions are not actually the tags that folger wants
        for key, tag in self.tags.items():
            self.folger_safe_tags[key] = tag.replace("sw-", "")

        self.total = 0
        self.error = 0

    def __accuracy__(self, s):
        assert isinstance(s, str)
        assert len(s) > 0
        return sum([1 for c in s if c != "-"]) / float(len(s))

    def __reset_tags__(self, text):
        """
        within the text, tags are represented by a single character (with ord() > 128 to indicate
        that something is special). Convert these back to the full multi-character representation
        and handle the Folger-specific tag names as well
        :param text:
        :return:
        """
        assert type(text) in [str, unicode]

        # reverse_map = {v: k for k, v in self.tags.items()}
        # also go with something different for "not sure"
        # this matter when the function is called on the aggregate text
        # reverse_map[200] = chr(27)
        # and for gaps inserted by MAFFT
        # reverse_map[201] = chr(24)

        ret_text = ""

        for c in text:
            if ord(c) > 128:
                ret_text += self.folger_safe_tags[ord(c)]
            else:
                ret_text += c

        return ret_text

    def __find_completed_components__(self, aligned_text, coordinates):
        """
        go through the aggregated text looking for subsets where at least 3 people have transcribed everything
        :param aligned_text:
        :param coordinates:
        :return:
        """
        completed_indices = []

        for char_index in range(len(aligned_text[0])):
            num_char = len(
                [1 for text in aligned_text if ord(text[char_index]) != 25])

            if num_char >= 3:
                completed_indices.append(char_index)

        starting_points = {}
        ending_points = {}

        # transcription_range = {}

        # find consecutive blocks
        if completed_indices != []:
            # find the contiguous blocks of completed transcriptions
            blocks = [
                [completed_indices[0]],
            ]
            for i, char_index in list(enumerate(completed_indices))[1:]:
                # do we have a jump - if so, start a new block
                if completed_indices[i - 1] != (char_index - 1):
                    blocks[-1].append(completed_indices[i - 1])
                    blocks.append([char_index])

            # if the last character started a new block (kinda weird but happens)
            # then we have a block only one character long - skip it
            blocks[-1].append(completed_indices[-1])
            if blocks[-1][0] == blocks[-1][1]:
                blocks = blocks[:-1]

            # technically we can have multiple transcriptions from the same user so
            # instead of user_index, I'll use transcription_index
            # also, technically the same user could transcribe the same piece of text twice (or more)
            # and include those transcriptions in different annotations. Going to assume that doesn't happen
            for transcription_index, (text, coord) in enumerate(
                    zip(aligned_text, coordinates)):
                x1, x2, y1, y2 = coord
                non_space_characters = [
                    i for (i, c) in enumerate(text) if ord(c) != 25
                ]

                first_char = min(non_space_characters)
                last_char = max(non_space_characters)

                # transcription_range[transcription_index] = (first_char,last_char)

                # look for transcriptions which exactly match up with the completed segment
                # match on either starting OR ending point matching up
                # we'll use these transcriptions to determine where to place the red dots
                # telling people to no longer transcribe that text
                # such transcriptions may not exist - in which case we cannot really do anything
                for b in blocks:
                    b = tuple(b)
                    # does the start of the transcription match up with the start of the completed segment
                    if b[0] == first_char:
                        if b in starting_points:
                            starting_points[b].append((x1, y1))
                        else:
                            starting_points[b] = [(x1, y1)]

                    # does the end of the transcription match up with the end of the completed segment?
                    if b[1] == last_char:
                        if (first_char, last_char) in ending_points:
                            ending_points[b].append((x2, y2))
                        else:
                            ending_points[b] = [(x2, y2)]

        return starting_points, ending_points
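
The block-finding loop above starts a new block at every gap in the completed indices; the same idea as a self-contained sketch that returns inclusive (start, end) runs, on a made-up index list.

def contiguous_blocks(indices):
    """Group sorted indices into inclusive (start, end) runs, dropping length-1 runs."""
    if not indices:
        return []
    blocks = [[indices[0], indices[0]]]
    for index in indices[1:]:
        if index == blocks[-1][1] + 1:
            blocks[-1][1] = index          # extend the current run
        else:
            blocks.append([index, index])  # gap - start a new run
    return [tuple(b) for b in blocks if b[0] != b[1]]

print(contiguous_blocks([2, 3, 4, 7, 9, 10]))   # [(2, 4), (9, 10)]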

    def __create_clusters__(self, (starting_points, ending_points),
                            aggregated_text, cluster_index, aligned_text,
                            variants, user_ids, text_coordinates):
        """
        take the aggregated text, split it into completed components and make a result (aggregate) cluster for each
        of those components
        :param aggregated_text:
        :param transcription_range: where (relative to the aggregate text) each transcription string starts and stops
        useful for differentiating between gap markers before or after the text and gaps inside the text
        :param markings: the original markings - without the tags tokenized
        :return:
        """
        clusters = []

        # go through every segment that is considered done
        for (lb, ub) in starting_points:
            # not sure how likely this is to happen, but just to be sure
            # make sure that we have both a starting and ending point
            if (lb, ub) not in ending_points:
                continue

            new_cluster = {}

            X1, Y1 = zip(*starting_points[(lb, ub)])
            X2, Y2 = zip(*ending_points[(lb, ub)])

            x1 = np.median(X1)
            x2 = np.median(X2)
            y1 = np.median(Y1)
            y2 = np.median(Y2)

            completed_text = self.__reset_tags__(aggregated_text[lb:ub + 1])
            # chr(26) means not enough people have transcribed at a given position
            # but we specifically chose this substring as a substring where all the characters have
            # been transcribed by enough people. So sanity check
            assert chr(26) not in completed_text
            assert isinstance(completed_text, str)

            new_cluster["center"] = (x1, x2, y1, y2, completed_text)
            #
            # new_cluster["cluster members"] = list(user_ids)
            new_cluster["individual points"] = zip(X1, Y1, X2, Y2)
            # print(new_cluster["individual points"])
            # assert False

            new_cluster["set index"] = cluster_index

            new_aligned = []

            for t in aligned_text:
                # todo - figure out if this is necessary or useful
                if t is None:
                    warning("text was none - really not sure why but skipping")
                    continue
                # put tags back into multicharacter format
                t = self.__reset_tags__(t)
                # instead of chr(24), use "\u0018" - postgres prefers that
                new_aligned.append(t.replace(chr(24), unicode("\u0018")))

            # if the text is horizontal - i.e. the angle of the center is less than 45 degrees
            # sort the aligned text by x coordinates - otherwise sort by DECREASING y coordinates
            # (since 0,0 is at the top left)
            try:
                tan_theta = math.fabs(y1 - y2) / math.fabs(x1 - x2)
                theta = math.atan(tan_theta)
            except ZeroDivisionError:
                theta = math.pi / 2.

            # horizontal
            # pretty sure that X1 < X2 but don't want to make an assumption
            if math.fabs(theta) <= math.pi / 4.:
                starting_coordinates = [
                    min(x1, x2) for x1, x2, _, _ in text_coordinates
                ]
            # vertical text
            # not sure whether Y1 < Y2 so playing it safe
            else:
                starting_coordinates = [
                    -max(y1, y2) for _, _, y1, y2 in text_coordinates
                ]
            text_and_ids_with_coordinates = zip(starting_coordinates,
                                                new_aligned, user_ids)
            # sort
            text_and_ids_with_coordinates.sort(key=lambda x: x[0])
            _, aligned_text, user_id = zip(*text_and_ids_with_coordinates)

            new_cluster["aligned_text"] = aligned_text
            new_cluster["cluster members"] = user_ids

            new_cluster["num users"] = len(new_cluster["cluster members"])

            new_cluster["variants"] = []
            # since a simple spelling mistake can count as a variant, look for cases where at least
            # two people have given the same variant
            variant_count = dict()
            for variant_list in variants:
                for v in variant_list:
                    if v not in variant_count:
                        variant_count[v] = 1
                    else:
                        variant_count[v] += 1
                        if variant_count[v] == 2:
                            new_cluster["variants"].append(v)

            clusters.append(new_cluster)

        return clusters
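
The variant filter at the end only keeps a variant once at least two volunteers have proposed it, so a single spelling slip does not get recorded; the same rule expressed with collections.Counter on invented input.

from collections import Counter

def shared_variants(variants_per_user, minimum=2):
    """variants_per_user is a list of variant lists, one per transcription."""
    counts = Counter(v for variant_list in variants_per_user for v in variant_list)
    return [variant for variant, n in counts.items() if n >= minimum]

print(shared_variants([["colour"], ["colour", "coulor"], ["coulor"], []]))
# ['colour', 'coulor'] - both appear at least twice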