Exemplo n.º 1
0
    def process_sequence(self, name, assignments):
        """Build the sequence `name` from `assignments` and emit it.

        Nothing happens when `assignments` is empty.  Otherwise the sequence
        length is read from the first assignment, every assignment is
        registered with overlap checking switched off, and the populated
        sequence is handed to ``self.output_formatter.process_assignments``.
        """
        if assignments:
            first = assignments[0]
            sequence = SequenceWithAssignments(name, first.length)
            for item in assignments:
                sequence.assign(item, overlap_check=False)
            self.output_formatter.process_assignments(sequence)
Exemplo n.º 2
0
    def process_sequence(self, name, assignments):
        """Forward the assignments of sequence `name` to the output formatter.

        An empty `assignments` collection is ignored.  For a non-empty one, a
        ``SequenceWithAssignments`` is created with the length reported by
        the first assignment, all assignments are added to it without overlap
        checking, and the result is passed to
        ``self.output_formatter.process_assignments``.
        """
        if not assignments:
            return None

        sequence_length = assignments[0].length
        collected = SequenceWithAssignments(name, sequence_length)
        for entry in assignments:
            collected.assign(entry, overlap_check=False)

        self.output_formatter.process_assignments(collected)
Exemplo n.º 3
0
    def process_infile(self, fname):
        """Read the assignments in `fname` and add them to ``self.seqcat``.

        Every assignment yielded by ``AssignmentReader(fname)`` is attached
        to the catalogue entry stored under its ``id``.  A missing entry is
        created on first use with the sequence length the assignment reports.

        Raises:
            ValueError: if an assignment reports a sequence length that
                differs from the length already recorded for its sequence.
        """
        self.log.info("Processing input file: %s" % fname)

        for assignment in AssignmentReader(fname):
            try:
                seq = self.seqcat[assignment.id]
            except KeyError:
                seq = SequenceWithAssignments(assignment.id, assignment.length)
                self.seqcat[assignment.id] = seq
            if seq.length != assignment.length:
                # Call form of ``raise``: the ``raise E, msg`` statement form
                # is a syntax error on Python 3, this one works on 2 and 3.
                raise ValueError(
                    "different lengths encountered for %s: %d and %d"
                    % (seq.name, seq.length, assignment.length))
            seq.assign(assignment)
Exemplo n.º 4
0
 def process_infile(self, fname, interpro=None):
     """Read the assignments in `fname` and add them to ``self.seqcat``.

     Every assignment yielded by ``AssignmentReader(fname)`` is attached to
     the catalogue entry stored under its ``id``; a missing entry is created
     on first use with the sequence length the assignment reports.

     Args:
         fname: name of the input file handed to ``AssignmentReader``.
         interpro: optional object; when it is not ``None`` each assignment
             is first replaced by ``assignment.resolve_interpro_ids(interpro)``.

     Raises:
         ValueError: if an assignment reports a sequence length that differs
             from the length already recorded for its sequence.
     """
     self.log.info("Processing input file: %s" % fname)
     for assignment in AssignmentReader(fname):
         try:
             seq = self.seqcat[assignment.id]
         except KeyError:
             seq = SequenceWithAssignments(assignment.id, assignment.length)
             self.seqcat[assignment.id] = seq
         if seq.length != assignment.length:
             # Call form of ``raise``: the ``raise E, msg`` statement form is
             # a syntax error on Python 3, this one works on 2 and 3.
             raise ValueError(
                 "different lengths encountered for %s: %d and %d"
                 % (seq.name, seq.length, assignment.length))
         if interpro is not None:
             assignment = assignment.resolve_interpro_ids(interpro)
         # NOTE(review): `interpro` is passed as the second positional
         # argument of `assign`; other callers pass an overlap-check flag in
         # that position and `interpro` by keyword -- confirm this is intended.
         seq.assign(assignment, interpro)
Exemplo n.º 5
0
 def process_interpro_file(self, interpro_file):
     """Populate ``self.seqcat`` from `interpro_file`.

     A ``FindUnassignedApp`` is configured with this object's sequence ID
     regexp, fed the sequences file and then `interpro_file` (together with
     ``self.interpro``); its sequence catalogue becomes ``self.seqcat``.
     Every sequence ID the helper knows a length for but which is absent
     from the catalogue is added as a ``SequenceWithAssignments`` without
     any assignments.
     """
     from gfam.scripts.find_unassigned import FindUnassignedApp

     helper = FindUnassignedApp()
     helper.set_sequence_id_regexp(self.options.sequence_id_regexp)
     helper.process_sequences_file_old(self.options.sequences_file)
     helper.process_infile(interpro_file, self.interpro)
     self.seqcat = helper.seqcat

     # IDs with a known length that received no catalogue entry
     ids_with_length = set(helper.seq_ids_to_length.keys())
     catalogued_ids = set(self.seqcat.keys())
     for missing_id in ids_with_length - catalogued_ids:
         length = helper.seq_ids_to_length[missing_id]
         self.seqcat[missing_id] = SequenceWithAssignments(missing_id, length)
Exemplo n.º 6
0
    def sort_by_domain_architecture(self):
        """Group the sequences in ``self.seqcat`` by domain architecture.

        For each sequence the assignments are ordered by start position and
        passed through ``resolve_interpro_ids``.  A ``TreeRepresentation`` of
        the resolved assignments supplies the architecture string and its
        positional variant, both stored on the sequence, and the sequence ID
        is appended to ``self.domain_archs[architecture]``.  Finally the
        sequence's ``assignments`` are replaced by the resolved ones.

        When ``self.details_file`` is set, a per-sequence report is written
        to it: primary source(s), data sources, coverage with and without
        assignments whose source is ``"Novel"``, and one row per assignment
        followed on the same line by its InterPro annotation, if any.
        """
        self.domain_archs = defaultdict(list)
        # Domains starting with this prefix are reported as stage "novel"
        prefix = self.options.prefix or "NOVEL"
        details = self.details_file

        for seq_id, seq in self.seqcat.items():
            assignments = sorted(seq.assignments,
                                 key=operator.attrgetter("start"))
            if details:
                print(seq_id, file=details)

            new_assignments = [
                assignment.resolve_interpro_ids(self.interpro)
                for assignment in assignments
            ]
            # Sources of the assignments whose comment marks stage "1"
            primary_sources = {
                assignment.source for assignment in assignments
                if assignment.comment == "1"
            }

            tree_arch = TreeRepresentation(new_assignments, self.interpro)
            seq.architecture = tree_arch.get_string()
            seq.architecture_pos = tree_arch.get_string_positions()
            self.domain_archs[seq.architecture].append(seq_id)

            if details:
                # Sorted so that the report does not depend on set ordering
                if primary_sources:
                    primary_source = ", ".join(sorted(primary_sources))
                else:
                    primary_source = None

                non_novel = [
                    assignment for assignment in assignments
                    if assignment.source != "Novel"
                ]
                seq2 = SequenceWithAssignments(seq.name, seq.length)
                seq2.assignments = non_novel
                sources = sorted({a.source for a in non_novel})

                print(
                    "    Primary assignment source: {}".format(primary_source),
                    file=details)
                print(
                    "    Number of data sources used: {}".format(len(sources)),
                    file=details)
                print("    Data sources: %s" % ", ".join(sources),
                      file=details)
                # `seq` still holds its unresolved assignments at this point
                print("    Coverage: %.3f" % seq.coverage(), file=details)
                print("    Coverage w/o novel domains: %.3f" % seq2.coverage(),
                      file=details)

                for assignment in assignments:
                    attrs = assignment._asdict()
                    if assignment.comment is None and \
                       assignment.domain.startswith(prefix):
                        attrs["comment"] = "novel"
                    row = "    %(start)4d-%(end)4d: %(domain)s "\
                          "(%(source)s, stage: %(comment)s)" % attrs

                    interpro_id = assignment.interpro_id
                    if not interpro_id\
                       and assignment.domain in self.interpro.mapping:
                        interpro_id = self.interpro.mapping[assignment.domain]

                    if interpro_id:
                        anc = self.interpro.tree.get_most_remote_ancestor(
                            interpro_id)
                        if interpro_id == anc:
                            note = "(InterPro ID: %s)" % anc
                        else:
                            note = "(InterPro ID: %s --> %s)" % (
                                interpro_id, anc)
                        name_key = anc
                    else:
                        note = ""
                        name_key = assignment.domain

                    # The annotation belongs on the same line as the row;
                    # printing the row on its own would push the annotation
                    # to the next line and leave a blank line when there is
                    # no annotation.
                    print(row, note, file=details)
                    if name_key in self.interpro_names:
                        # Indent the name up to the colon of the row above
                        print("{}{}".format(" " * (row.index(":") + 1),
                                            self.interpro_names[name_key]),
                              file=details)
                print("", file=details)

            seq.assignments = new_assignments
Exemplo n.º 7
0
    def sort_by_domain_architecture(self):
        """Group the sequences in ``self.seqcat`` by domain architecture.

        For each sequence the assignments are ordered by start position and
        passed through ``resolve_interpro_ids``; the tuple of the resolved
        domain names is the key under which the sequence ID is appended to
        ``self.domain_archs``.  Finally the sequence's ``assignments`` are
        replaced by the resolved ones.

        When ``self.details_file`` is set, a per-sequence report is written
        to it with Python 2 ``print >>`` statements: primary source(s), data
        sources, coverage with and without assignments whose source is
        ``"Novel"``, and one row per assignment followed by its InterPro
        annotation, if any.
        """
        self.domain_archs = defaultdict(list)
        for seq_id, seq in self.seqcat.iteritems():
            assignments = sorted(seq.assignments, key=operator.attrgetter("start"))
            domains = []
            if self.details_file:
                print >>self.details_file, seq_id

            # Sources of the assignments whose comment is "1"
            primary_source = set()

            new_assignments = []
            for assignment in assignments:
                new_assignment = assignment.resolve_interpro_ids(self.interpro)
                if assignment.comment == "1":
                    primary_source.add(assignment.source)
                domains.append(new_assignment.domain)
                new_assignments.append(new_assignment)
            # The architecture key is the ordered tuple of resolved domains
            self.domain_archs[tuple(domains)].append(seq_id)

            # From here on `primary_source` is a display string (or None)
            if not primary_source:
                primary_source = None
            else:
                primary_source = ", ".join(primary_source)

            if self.details_file:
                # seq2 only carries the assignments whose source is not
                # "Novel"; it is used for the second coverage figure below
                seq2 = SequenceWithAssignments(seq.name, seq.length)
                seq2.assignments = [assignment for assignment in assignments \
                                    if assignment.source != "Novel"]
                sources = sorted(set(assignment.source \
                        for assignment in assignments \
                        if assignment.source != "Novel"))

                print >>self.details_file, "    Primary assignment source:", primary_source
                print >>self.details_file, "    Number of data sources used:", len(sources)
                print >>self.details_file, "    Data sources: %s" % ", ".join(sources)
                print >>self.details_file, "    Coverage: %.3f" % seq.coverage()
                print >>self.details_file, "    Coverage w/o novel domains: %.3f" % seq2.coverage()
                for assignment in assignments:
                    attrs = assignment._asdict()
                    # Assignments without a comment whose domain name starts
                    # with "NOVEL" are reported with stage "novel"
                    if assignment.comment is None and \
                       assignment.domain.startswith("NOVEL"):
                        attrs["comment"] = "novel"
                    row = "    %(start)4d-%(end)4d: %(domain)s "\
                          "(%(source)s, stage: %(comment)s)" % attrs
                    # Trailing comma: no newline is written here, so the
                    # next print statement continues on the same line
                    print >>self.details_file, row,
                    interpro_id = assignment.interpro_id
                    # Fall back to the domain -> InterPro ID mapping
                    if not interpro_id and assignment.domain in self.interpro.mapping:
                        interpro_id = self.interpro.mapping[assignment.domain]
                    if interpro_id:
                        anc = self.interpro.tree.get_most_remote_ancestor(interpro_id)
                        if interpro_id == anc:
                            print >>self.details_file, "(InterPro ID: %s)" % anc
                        else:
                            print >>self.details_file, "(InterPro ID: %s --> %s)" % (interpro_id, anc)
                        # Name line, indented up to the colon of the row
                        if anc in self.interpro_names:
                            print >>self.details_file, " "*(row.index(":")+1), self.interpro_names[anc]
                    else:
                        # Terminates the row line started above
                        print >>self.details_file, ""
                        if assignment.domain in self.interpro_names:
                            print >>self.details_file, " "*(row.index(":")+1), self.interpro_names[assignment.domain]
                # Blank line between the reports of consecutive sequences
                print >>self.details_file, ""

            # Only now are the assignments swapped for the resolved ones; the
            # coverage printed above used the unresolved assignments
            seq.assignments = new_assignments
Exemplo n.º 8
0
    def filter_assignments(self, name, assignments_by_source):
        """Select a representative set of assignments for sequence `name`.

        The selection follows the staged rules outlined in the documentation
        of `FindUnassignedApp`.  In the first stage the single source with
        the largest coverage (among the sources allowed in that stage) is
        chosen and its assignments are added, longest first.  In every later
        stage the remaining assignments from the sources allowed in that
        stage are tried, again longest first.  An assignment is kept only if
        ``seq.assign`` accepts it with overlap checking enabled.

        Args:
            name: name of the sequence.
            assignments_by_source: mapping from data source to a list of
                ``(assignment, line)`` pairs, `line` being the input row the
                assignment came from.

        Returns:
            A list of output rows.  Each row is the stripped input line,
            padded with tabs to at least 13 tab characters, followed by a
            tab and the number of the stage that selected it.  The list is
            empty when there is nothing to select or when the assignments
            disagree about the sequence length.
        """
        if not assignments_by_source:
            self.log_exclusion(name, "no assignments in the input data file " +
                                     "passed the filters")
            return []

        # Determine the length of the sequence (and check that the length is
        # the same across all assignments; if not, then the input file is
        # inconsistent and the sequence will be skipped).
        # next(iter(...)) instead of keys()[0]: works on Python 2 and 3.
        first_source = next(iter(assignments_by_source))
        seq_length = assignments_by_source[first_source][0][0].length
        for assignments in assignments_by_source.values():
            if any(assignment.length != seq_length
                   for assignment, _ in assignments):
                self.log.warning("Sequence %s has multiple assignments with "
                                 "different sequence lengths in the "
                                 "input file, skipping" % name)
                self.log_exclusion(name, "ambiguous sequence length in input file")
                return []

        # Rows are padded to this many tabs before the stage is appended
        min_tabs = 13

        def with_stage(line, stage):
            """Return `line` stripped and tab-padded, with `stage` appended."""
            line = line.strip()
            missing = min_tabs - line.count("\t")
            if missing > 0:
                line = line + "\t" * missing
            return "%s\t%s" % (line, stage)

        # Initially, the result is empty
        result = []

        # Set up the stages. The first stage is treated specially as we have
        # to select a single source that has the largest coverage. In the
        # remaining stages, we are allowed to cherrypick from different
        # sources. Slicing (rather than pop) leaves the list returned by
        # get_stages_from_config() untouched.
        stages = self.get_stages_from_config()
        first_stage, later_stages = stages[0], stages[1:]

        # First, find the data source which covers the most of the sequence
        # and is allowed in stage 1
        coverage = {}
        for source, assignments in assignments_by_source.items():
            # Exclude those sources that we don't consider in the first stage
            if source not in first_stage:
                continue

            # Calculate the coverage: all assignments of the source are
            # added with the overlap check switched off
            trial = SequenceWithAssignments(name, seq_length)
            for assignment, _ in assignments:
                trial.assign(assignment, False, interpro=self.interpro)
            coverage[source] = trial.coverage()

        # Find the source giving the best coverage, add its domains into
        # the current assignment.
        seq = SequenceWithAssignments(name, seq_length)
        best_source = None
        if coverage:
            best_source = max(coverage, key=coverage.get)
            candidates = sorted(
                assignments_by_source[best_source],
                key=lambda pair: pair[0].get_assigned_length(),
                reverse=True)
            for assignment, line in candidates:
                if seq.assign(assignment, True, interpro=self.interpro):
                    result.append(with_stage(line, 1))

        # Collect the unused assignments (not from the best source)
        unused_assignments = []
        for source, assignments in assignments_by_source.items():
            if source != best_source:
                unused_assignments.extend(assignments)

        if not unused_assignments:
            return result

        # Try to fill the unassigned regions with the rest of the assignments
        # that were unused so far, starting from the longest assignment.
        unused_assignments.sort(key=lambda pair: -pair[0].get_assigned_length())

        # Okay, we're done with the first stage, process the rest.
        # idx_to_stage will contain the indices of the selected
        # assignments as keys and the number of the corresponding
        # stage in which they were selected as values.
        idx_to_stage = {}
        for stage_no, sources in enumerate(later_stages, 2):
            for idx, (assignment, _) in enumerate(unused_assignments):
                if assignment.source in sources and \
                   seq.assign(assignment, True, interpro=self.interpro):
                    idx_to_stage[idx] = stage_no
        for idx in sorted(idx_to_stage):
            result.append(
                with_stage(unused_assignments[idx][1], idx_to_stage[idx]))

        if not result:
            self.log_exclusion(name, "no assignments were selected after "
                                     "executing all the stages")

        return result
Exemplo n.º 9
0
    def filter_assignments(self, name, assignments_by_source):
        """Pick a representative set of assignments for the sequence `name`.

        The rules are the staged ones outlined in the documentation of
        `FindUnassignedApp`: stage 1 takes the single allowed source with the
        largest coverage and adds its assignments, longest first; each later
        stage tries the remaining assignments of the sources allowed in that
        stage, longest first.  An assignment is kept only when ``seq.assign``
        accepts it with overlap checking enabled.

        Args:
            name: name of the sequence.
            assignments_by_source: mapping from data source to a list of
                ``(assignment, line)`` pairs, `line` being the input row the
                assignment came from.

        Returns:
            A list of output rows: the stripped input line, padded with tabs
            to at least 13 tab characters, then a tab and the number of the
            stage that selected it.  Empty when there is nothing to select
            or when the assignments disagree about the sequence length.
        """
        if not assignments_by_source:
            self.log_exclusion(
                name, "no assignments in the input data file " +
                "passed the filters")
            return []

        # Determine the length of the sequence (and check that the length is
        # the same across all assignments; if not, then the input file is
        # inconsistent and the sequence will be skipped).
        # Dict views cannot be indexed on Python 3, so take the first key
        # through an iterator instead of keys()[0].
        first_source = next(iter(assignments_by_source))
        seq_length = assignments_by_source[first_source][0][0].length
        for assignments in assignments_by_source.values():
            if any(assignment.length != seq_length
                   for assignment, _ in assignments):
                self.log.warning("Sequence %s has multiple assignments with "
                                 "different sequence lengths in the "
                                 "input file, skipping" % name)
                self.log_exclusion(
                    name, "ambiguous sequence length in input file")
                return []

        # Rows are padded to this many tabs before the stage is appended
        min_tabs = 13

        def with_stage(line, stage):
            """Return `line` stripped and tab-padded, with `stage` appended."""
            line = line.strip()
            missing = min_tabs - line.count("\t")
            if missing > 0:
                line = line + "\t" * missing
            return "%s\t%s" % (line, stage)

        # Initially, the result is empty
        result = []

        # Set up the stages. The first stage is treated specially as we have
        # to select a single source that has the largest coverage. In the
        # remaining stages, we are allowed to cherrypick from different
        # sources. Slicing (rather than pop) leaves the list returned by
        # get_stages_from_config() untouched.
        stages = self.get_stages_from_config()
        first_stage, later_stages = stages[0], stages[1:]

        # First, find the data source which covers the most of the sequence
        # and is allowed in stage 1
        coverage = {}
        for source, assignments in assignments_by_source.items():
            # Exclude those sources that we don't consider in the first stage
            if source not in first_stage:
                continue

            # Calculate the coverage: all assignments of the source are
            # added with the overlap check switched off
            trial = SequenceWithAssignments(name, seq_length)
            for assignment, _ in assignments:
                trial.assign(assignment, False, interpro=self.interpro)
            coverage[source] = trial.coverage()

        # Find the source giving the best coverage, add its domains into
        # the current assignment.
        seq = SequenceWithAssignments(name, seq_length)
        best_source = None
        if coverage:
            best_source = max(coverage, key=coverage.get)
            candidates = sorted(
                assignments_by_source[best_source],
                key=lambda pair: pair[0].get_assigned_length(),
                reverse=True)
            for assignment, line in candidates:
                if seq.assign(assignment, True, interpro=self.interpro):
                    result.append(with_stage(line, 1))

        # Collect the unused assignments (not from the best source)
        unused_assignments = []
        for source, assignments in assignments_by_source.items():
            if source != best_source:
                unused_assignments.extend(assignments)

        if not unused_assignments:
            return result

        # Try to fill the unassigned regions with the rest of the assignments
        # that were unused so far, starting from the longest assignment.
        unused_assignments.sort(
            key=lambda pair: -pair[0].get_assigned_length())

        # Okay, we're done with the first stage, process the rest.
        # idx_to_stage will contain the indices of the selected
        # assignments as keys and the number of the corresponding
        # stage in which they were selected as values.
        idx_to_stage = {}
        for stage_no, sources in enumerate(later_stages, 2):
            for idx, (assignment, _) in enumerate(unused_assignments):
                if assignment.source in sources and seq.assign(
                        assignment, True, interpro=self.interpro):
                    idx_to_stage[idx] = stage_no
        for idx in sorted(idx_to_stage):
            result.append(
                with_stage(unused_assignments[idx][1], idx_to_stage[idx]))

        if not result:
            self.log_exclusion(
                name, "no assignments were selected after "
                "executing all the stages")

        return result
    def sort_by_domain_architecture(self):
        """Group the sequences in ``self.seqcat`` by domain architecture.

        For each sequence the assignments are ordered by start position and
        passed through ``resolve_interpro_ids``.  A ``TreeRepresentation`` of
        the resolved assignments supplies the architecture string and its
        positional variant, both stored on the sequence, and the sequence ID
        is appended to ``self.domain_archs[architecture]``.  Finally the
        sequence's ``assignments`` are replaced by the resolved ones.

        When ``self.details_file`` is set, a per-sequence report is written
        to it: primary source(s), data sources, coverage with and without
        assignments whose source is ``"Novel"``, and one row per assignment
        followed on the same line by its InterPro annotation, if any.
        """
        self.domain_archs = defaultdict(list)
        details = self.details_file

        for seq_id, seq in self.seqcat.items():
            assignments = sorted(seq.assignments,
                                 key=operator.attrgetter("start"))
            if details:
                print(seq_id, file=details)

            new_assignments = [
                assignment.resolve_interpro_ids(self.interpro)
                for assignment in assignments
            ]
            # Sources of the assignments whose comment marks stage "1"
            primary_sources = {
                assignment.source for assignment in assignments
                if assignment.comment == "1"
            }

            tree_arch = TreeRepresentation(new_assignments, self.interpro)
            seq.architecture = tree_arch.get_string()
            seq.architecture_pos = tree_arch.get_string_positions()
            self.domain_archs[seq.architecture].append(seq_id)

            if details:
                # Sorted so that the report does not depend on set ordering
                if primary_sources:
                    primary_source = ", ".join(sorted(primary_sources))
                else:
                    primary_source = None

                non_novel = [
                    assignment for assignment in assignments
                    if assignment.source != "Novel"
                ]
                seq2 = SequenceWithAssignments(seq.name, seq.length)
                seq2.assignments = non_novel
                sources = sorted({a.source for a in non_novel})

                print(
                    "    Primary assignment source: {}".format(primary_source),
                    file=details)
                print(
                    "    Number of data sources used: {}".format(len(sources)),
                    file=details)
                print("    Data sources: %s" % ", ".join(sources),
                      file=details)
                # `seq` still holds its unresolved assignments at this point
                print("    Coverage: %.3f" % seq.coverage(), file=details)
                print("    Coverage w/o novel domains: %.3f" % seq2.coverage(),
                      file=details)

                for assignment in assignments:
                    attrs = assignment._asdict()
                    # Uncommented assignments of a domain listed in
                    # self.hmm_domains are reported with stage "novel"
                    if assignment.comment is None and \
                       assignment.domain in self.hmm_domains:
                        attrs["comment"] = "novel"
                    row = "    %(start)4d-%(end)4d: %(domain)s "\
                          "(%(source)s, stage: %(comment)s)" % attrs

                    interpro_id = assignment.interpro_id
                    if not interpro_id\
                       and assignment.domain in self.interpro.mapping:
                        interpro_id = self.interpro.mapping[assignment.domain]

                    if interpro_id:
                        anc = self.interpro.tree.get_most_remote_ancestor(
                            interpro_id)
                        if interpro_id == anc:
                            note = "(InterPro ID: %s)" % anc
                        else:
                            note = "(InterPro ID: %s --> %s)" % (
                                interpro_id, anc)
                        name_key = anc
                    else:
                        note = ""
                        name_key = assignment.domain

                    # The annotation belongs on the same line as the row;
                    # printing the row on its own would push the annotation
                    # to the next line and leave a blank line when there is
                    # no annotation.
                    print(row, note, file=details)
                    if name_key in self.interpro_names:
                        # Indent the name up to the colon of the row above
                        print("{}{}".format(" " * (row.index(":") + 1),
                                            self.interpro_names[name_key]),
                              file=details)
                print("", file=details)

            seq.assignments = new_assignments