Example no. 1
    def load_slice_file(self, slice_file):
        """Loads the slice file into a dictionary of lists"""
        self.log.info("Loading slices from %s...", slice_file)
        self.parts = defaultdict(list)

        for line in open_anything(slice_file):
            parts = line.strip().split()
            if not parts:
                continue
            left, right = 1, -1

            if len(parts) == 3:
                # Three cases: (a) both limits (left,right) are specified
                left, right = int(parts[1]), int(parts[2])
            elif len(parts) == 2:
                # (b) only the left limit is specified
                left = int(parts[1])
                # (c) neither the left nor the right limit is specified
                # (this is why we set left=1 above)

            if left == 0 or right == 0:
                self.log.warning(
                    "Ignoring fragment ID: %s, "
                    "requested start or end position is zero", parts[0])
            else:
                self.parts[parts[0]].append(array.array('i', [left, right]))
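For reference, a minimal standalone sketch of the slice-file format as inferred from the parser above (`io.StringIO` stands in for `open_anything`; the sequence IDs are made up):

    import array
    from collections import defaultdict
    from io import StringIO

    # Hypothetical slice file: one "seq_id [left [right]]" record per line.
    slice_data = StringIO("seqA 5 20\nseqB 10\nseqC\n")

    parts = defaultdict(list)
    for line in slice_data:
        fields = line.strip().split()
        if not fields:
            continue
        left, right = 1, -1          # defaults select the whole sequence
        if len(fields) == 3:
            left, right = int(fields[1]), int(fields[2])
        elif len(fields) == 2:
            left = int(fields[1])
        if left == 0 or right == 0:  # positions are 1-based; zero is invalid
            continue
        parts[fields[0]].append(array.array('i', [left, right]))

    print(dict(parts))
    # {'seqA': [array('i', [5, 20])], 'seqB': [array('i', [10, -1])],
    #  'seqC': [array('i', [1, -1])]}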
Exemplo n.º 2
0
    def _read_goa_file(self, goa_file):
        """Reads the GOA file, return a dict that,
        for each protein id it has a set of strings with the
        associated GO terms after up-propagation
        """
        d = {}
        self.go_terms_gs = set()
        for line in open_anything(goa_file):
            if not line.startswith(("!", "#")):
                # split line, obtain protein_id, go_term, and ev_code
                fields = line.split("\t")
                prot_id, goterm, evcode = fields[1], fields[4], fields[6]
                if evcode in self.valid_ev_codes:
                    terms = set(map(str, self.go_tree.ancestors(goterm)))
                    d[prot_id] = terms
                    self.go_terms_gs.update(terms)

        self.go_terms_gs_ont = {}
        for ontology, terms in self.terms_per_ontology.items():
            gs_terms_o = self.go_terms_gs & terms
            self.go_terms_gs_ont[ontology] = gs_terms_o
            print("# of Terms in {} in GOA file: {}".format(
                ontology, len(gs_terms_o)))

        return d
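For orientation, the tab-separated fields picked out above follow the GAF column layout: 0-based field 1 is the DB object (protein) ID, field 4 the GO ID, and field 6 the evidence code. A toy line with a hypothetical accession:

    line = "UniProtKB\tP12345\tGENE1\t\tGO:0008150\tPMID:123\tIEA\t..."
    fields = line.split("\t")
    prot_id, goterm, evcode = fields[1], fields[4], fields[6]
    print(prot_id, goterm, evcode)  # P12345 GO:0008150 IEA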
Example no. 3
    def load_slice_file(self, slice_file):
        """Loads the slice file into a dictionary of lists"""
        self.log.info("Loading slices from %s..." % slice_file)
        self.parts = defaultdict(list)

        for line in open_anything(slice_file):
            parts = line.strip().split()
            if not parts:
                continue
            left, right = 1, -1 

            if len(parts) == 3:
                # Three cases: (a) both limits (left,right) are specified
                left, right = int(parts[1]), int(parts[2])
            elif len(parts) == 2:
                # (b) only the left limit is specified
                left = int(parts[1])
                # (c) neither the left nor the right limit is specified
                # (this is why we set left=1 above)

            if left == 0 or right == 0:
                self.log.warning("Ignoring fragment ID: %s, "
                    "requested start or end position is zero" % parts[0])
            else:
                self.parts[parts[0]].append(array.array('i', [left, right]))
Example no. 4
 def download_interpro(self, url):
     """Downloads the official InterPro ID-name mappings from the given
     URL and prints the mapping to the standard output.
     """
     self.log.info("Downloading InterPro names from %s..." % url)
     for line in open_anything(url):
         sys.stdout.write(line)
Example no. 5
    def process_file(self, filename):
        """Processes the given input file"""
        self.log.info("Processing %s..." % filename)

        parser = fasta.Parser(open_anything(filename))
        parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)
        for seq in parser:
            print(seq.id)
Example no. 6
 def load(self, filename):
     """Loads ID-name assignments from a simple tab-separated flat file.
     
     Lines not containing any tab characters are silently ignored."""
     for line in open_anything(filename):
         parts = line.strip().split("\t", 1)
         if len(parts) > 1:
             self.names[parts[0]] = parts[1]
Example no. 7
    def load(self, filename):
        """Loads ID-name assignments from a simple tab-separated flat file.

        Lines not containing any tab characters are silently ignored."""
        for line in open_anything(filename):
            parts = line.strip().split("\t", 1)
            if len(parts) > 1:
                self.names[parts[0]] = parts[1]
Example no. 8
    def process_file(self, slice_file):
        """Processes the given slice file"""
        self.log.info("Processing input file: %s..." % slice_file)

        writer = fasta.Writer(sys.stdout)

        for line in open_anything(slice_file):
            parts = line.strip().split()
            if not parts:
                continue

            seq_id, record = parts[0], None
            try:
                record = self.seqs[seq_id]
            except KeyError:
                if self.options.try_alternative_splicing:
                    try:
                        record = self.seqs[seq_id+".1"]
                    except KeyError:
                        pass

            if record is None:
                if self.options.ignore_unknown:
                    self.log.warning("Ignoring unknown sequence ID: %s" % seq_id)
                    continue
                self.log.fatal("Unknown sequence ID in input file: %s" % seq_id)
                return 1

            if len(parts) == 1:
                start, end = 1, len(record.seq)
            else:
                start = int(parts[1])
                if len(parts) == 2:
                    end = len(record.seq)
                else:
                    end = int(parts[2])

            if start == 0:
                self.log.warning("Ignoring sequence ID: %s, "
                        "requested start position is zero" % seq_id)
                continue
            if end == 0:
                self.log.warning("Ignoring sequence ID: %s, "
                        "requested end position is zero" % seq_id)
                continue

            if start < 0:
                start = len(record.seq) + start + 1
            if end < 0:
                end = len(record.seq) + end + 1

            if not self.options.keep_ids:
                new_id = "%s:%d-%d" % (record.id, start, end)
            else:
                new_id = seq_id

            new_record = SeqRecord(record.seq[(start-1):end],
                    id=new_id, name=record.name, description="")
            writer.write(new_record)
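Negative coordinates count from the end of the sequence; a quick worked example of the normalization step above:

    # For a hypothetical 100-residue sequence, (-10, -1) selects the
    # last ten residues in 1-based, inclusive coordinates.
    seq_len = 100
    start, end = -10, -1
    if start < 0:
        start = seq_len + start + 1   # -> 91
    if end < 0:
        end = seq_len + end + 1       # -> 100
    print(start, end)                 # 91 100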
Example no. 9
 def process_sequences_file(self, fname):
     self.log.info("Loading sequences from %s..." % fname)
     self.seq_ids_to_length = {}
     parser = fasta.Parser(open_anything(fname))
     parser = fasta.regexp_remapper(parser,
             self.sequence_id_regexp
     )
     for seq in parser:
         self.seq_ids_to_length[seq.id] = len(seq.seq)
Example no. 10
    def load_sequences(self, seq_file):
        """Loads the sequences from the given sequence file in FASTA format"""
        self.log.info("Loading sequences from %s..." % seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser,
                self.options.sequence_id_regexp)

        self.seqs = dict(((seq.id, seq) for seq in parser))
Example no. 11
    def download_superfamily(self, url):
        """Downloads the most recent mappings from SCOP sunids to human
        readable names, derives the Superfamily IDs from the SCOP sunids,
        and prints the mapping to the standard output.

        The most recent version of SCOP is identified by applying a regexp
        to the HTML output of the given page. The regexp assumes that the
        most recent description file is linked on the given page using
        an ``<a>`` tag with ``href`` equal to ``dir.des.scop.txt_X.XX``,
        where ``X.XX`` is the version number. If such an identification
        fails, this method will return without printing anything but a
        warning on the logging stream.
        """
        self.log.info("Downloading SCOP page from %s..." % url)
        contents = open_anything(url).read()

        des_link_regexp = re.compile(r"<a href=\"dir.des.scop.txt_([0-9.]+)\">",
                re.IGNORECASE)

        max_version, max_version_nosplit = None, None
        for match in des_link_regexp.finditer(contents):
            version = [int(comp) for comp in match.group(1).split(".")]
            if max_version is None or version > max_version:
                max_version = version
                max_version_nosplit = match.group(1)

        version = max_version_nosplit

        if version is None:
            self.log.warning("Cannot infer the most recent version of SCOP, "
                             "skipping Superfamily IDs")
            return

        self.log.info("Most recent SCOP version is assumed to be %s" % version)
        url = "%sdir.des.scop.txt_%s" % (url, version)

        for line in open_anything(url):
            if line[0] == '#':
                continue
            parts = line.split("\t", 4)
            if parts[1] != "sf" or not parts[4]:
                continue
            sys.stdout.write("SSF%s\t%s" % (parts[0], parts[4]))
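The version strings are split into lists of integers before comparison because plain string comparison would mis-order multi-digit components; a quick illustration:

    print("1.9" > "1.10")      # True  (lexicographic, wrong for versions)
    print([1, 9] > [1, 10])    # False (component-wise, correct)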
Example no. 12
    def download_superfamily(self, url):
        """Downloads the most recent mappings from SCOP sunids to human
        readable names, derives the Superfamily IDs from the SCOP sunids,
        and prints the mapping to the standard output.

        The most recent version of SCOP is identified by applying a regexp
        to the HTML output of the given page. The regexp assumes that the
        most recent description file is linked on the given page using
        an ``<a>`` tag with ``href`` equal to ``dir.des.scop.txt_X.XX``,
        where ``X.XX`` is the version number. If such an identification
        fails, this method will return without printing anything but a
        warning on the logging stream.
        """
        self.log.info("Downloading SCOP page from %s..." % url)
        contents = open_anything(url).read()

        des_link_regexp = re.compile(
            r"<a href=\"dir.des.scop.txt_([0-9.]+)\">", re.IGNORECASE)

        max_version, max_version_nosplit = None, None
        for match in des_link_regexp.finditer(contents):
            version = [int(comp) for comp in match.group(1).split(".")]
            if max_version is None or version > max_version:
                max_version = version
                max_version_nosplit = match.group(1)

        version = max_version_nosplit

        if version is None:
            self.log.warning("Cannot infer the most recent version of SCOP, "
                             "skipping Superfamily IDs")
            return

        self.log.info("Most recent SCOP version is assumed to be %s" % version)
        url = "%sdir.des.scop.txt_%s" % (url, version)

        for line in open_anything(url):
            if line[0] == '#':
                continue
            parts = line.split("\t", 4)
            if parts[1] != "sf" or not parts[4]:
                continue
            sys.stdout.write("SSF%s\t%s" % (parts[0], parts[4]))
Example no. 13
 def download_pfam(self, url):
     """Downloads the official PFam ID-name mappings from the given URL
     and prints the mapping to the standard output.
     """
     self.log.info("Downloading PFam names from %s..." % url)
     for line in open_anything(url):
         if line[0] == "#":
             continue
         parts = line.split("\t", 2)
         if len(parts) < 3 or not parts[2]:
             continue
         sys.stdout.write("%s\t%s" % (parts[0], parts[2]))
Example no. 14
 def download_smart(self, url):
     """Downloads the official Smart ID-name mappings from the given
     URL and prints the mapping to the standard output.
     """
     self.log.info("Downloading Smart names from %s..." % url)
     for line in open_anything(url):
         parts = line.split("\t", 3)
         if len(parts) < 3 or not parts[2]:
             continue
         if parts[1] == "ACC" and parts[2] == "DEFINITION":
             continue
         sys.stdout.write("%s\t%s\n" % (parts[1], parts[2]))
Example no. 15
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""

        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            arch = tuple([
                x for x in parts[3].replace("{", ";").replace("}", ";").split(
                    ";") if x
            ])
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                all_terms = set()
                for domain in arch:
                    all_terms.update(self.go_mapping.get_left(domain, []))
                for path in self.go_tree.paths_to_root(*list(all_terms)):
                    all_terms.difference_update(path[1:])
                all_terms = sorted(
                    all_terms,
                    key=lambda x: len(self.go_mapping.get_right(x, [])))
                cache[arch] = all_terms

            print(gene_id)
            for term in cache[arch]:
                print("  %s (%s)" % (term.id, term.name))
            print()

            if not cache[arch]:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info(
                "Could not assign functional label "
                "to %d sequences :(", num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(",
                          num_no_domains)
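For reference, a standalone sketch of the architecture-string normalization used above, where `{`/`}` (apparently marking inserted domains) are turned into plain `;` separators; the accessions are made up:

    # made-up accessions; "{...}" groups become ordinary separators
    raw = "PF00001;PF00002{PF00003}PF00004"
    arch = tuple(x for x in
                 raw.replace("{", ";").replace("}", ";").split(";") if x)
    print(arch)   # ('PF00001', 'PF00002', 'PF00003', 'PF00004')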
Example no. 16
    def __init__(self, file_handle):
        """Creates an annotation file parser that reads the given file-like
        object. You can also specify filenames. If it ends in ``.gz``,
        the file is assumed to contain gzipped data and it will be unzipped
        on the fly. Example::

          >>> import gfam.go as go
          >>> parser = go.AnnotationFile("gene_association.sgd.gz")

        To read the annotations in the file, you must iterate over the parser
        as if it were a list. The iterator yields `Annotation` objects.
        """
        self.file_handle = open_anything(file_handle)
        self.lineno = 0
Example no. 17
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""

        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            arch = tuple([x for x in
                          parts[3].replace("{", ";")
                          .replace("}", ";")
                          .split(";") if x])
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                all_terms = set()
                for domain in arch:
                    all_terms.update(self.go_mapping.get_left(domain, []))
                for path in self.go_tree.paths_to_root(*list(all_terms)):
                    all_terms.difference_update(path[1:])
                all_terms = sorted(all_terms,
                                   key=lambda x:
                                   len(self.go_mapping.get_right(x, [])))
                cache[arch] = all_terms

            print(gene_id)
            for term in cache[arch]:
                print("  %s (%s)" % (term.id, term.name))
            print()

            if not cache[arch]:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info("Could not assign functional label "
                          "to %d sequences :(", num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(",
                          num_no_domains)
Example no. 18
 def process_sequences_file_old(self, fname):
     """ This is the old version, all the entries are
         loaded into memory
     """
     self.log.info("Loading sequences from %s..." % fname)
     parser = fasta.Parser(open_anything(fname))
     parser = fasta.regexp_remapper(parser,
                                    self.sequence_id_regexp)
     seqs, lens = [], []
     for i, seq in enumerate(parser):
         seqs.append(seq.id)
         lens.append(len(seq.seq))
         if i % 1000000 == 0:
             self.log.info("Read {} seqs".format(i))
     self.log.info("...loaded")
     self.seq_ids_to_length = dict(zip(seqs, lens))
Example no. 19
    def separate_sequences(self, table, sequences_file):
        """Separates the fasta sequence file in individual fasta files,
           one per cluster"""
        reader = fasta.Parser(open_anything(sequences_file))
        seqs = dict(((seq.id, seq) for seq in reader))

        for cluster_name, cluster_seqs in table.items():
            output_file_name = self.sequences_dir + os.path.sep + cluster_name
            output_fd = open(output_file_name + ".faa", "w")
            writer = fasta.Writer(output_fd)

            for sequence in cluster_seqs:
                obj = SeqRecord(seqs[sequence].seq, sequence, "", "")
                writer.write(obj)

            output_fd.close()
Example no. 20
    def read_goa_file(self, goa_file, ev_codes):
        """Reads the GOA file, return a defaultdict that,
        for each protein id it has a set of strings with the
        associated GO terms
        """
        d = bidict()
        for line in open_anything(goa_file):
            if not line.startswith(("!", "#")):
                # split line, obtain protein_id, go_term, and ev_code
                fields = line.split("\t", 7)
                prot_id, goterm, evcode = fields[1], fields[4], fields[6]

                if evcode in ev_codes:
                    d.add_left(prot_id, self.go_tree.lookup(goterm))
        self.log.info("GOA file read. " + str(d.len_left()) + " proteins loaded")
        return d
Example no. 21
    def from_file(cls, filename, tree):
        """Constructs a mapping from a mapping file. The format of this
        file should be identical to the official ``interpro2go`` file
        provided by the Gene Ontology project. `tree` is a Gene Ontology
        tree object (see `gfam.go.Tree`) that will be used to
        look up terms from IDs."""

        regex = re.compile("InterPro:([A-Z0-9]+) .* > .* ; (GO:[0-9]+)")
        result = cls()
        for line in open_anything(filename):
            if line[0] == '!':
                continue
            match = regex.match(line)
            if not match:
                continue
            result.add_annotation(match.group(1), tree.lookup(match.group(2)))
        return result
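A sample line in the interpro2go layout and how the regex above picks it apart (the accession and GO term here are purely illustrative):

    import re

    regex = re.compile("InterPro:([A-Z0-9]+) .* > .* ; (GO:[0-9]+)")
    # an illustrative interpro2go-style line
    line = "InterPro:IPR000001 Kringle > GO:blood coagulation ; GO:0007596"
    match = regex.match(line)
    print(match.group(1), match.group(2))   # IPR000001 GO:0007596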
Example no. 22
    def read_goa_file(self, goa_file, ev_codes):
        """Reads the GOA file, return a defaultdict that,
        for each protein id it has a set of strings with the
        associated GO terms
        """
        goa_mapping = bidict()
        for line in open_anything(goa_file):
            if not line.startswith(("!", "#")):
                # split line, obtain protein_id, go_term, and ev_code
                fields = line.split("\t", 7)
                prot_id, goterm, evcode = fields[1], fields[4], fields[6]

                if evcode in ev_codes:
                    goa_mapping.add_left(prot_id, self.go_tree.lookup(goterm))
        self.log.info("GOA file read. %s proteins loaded",
                      str(goa_mapping.len_left()))
        return goa_mapping
Example no. 23
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s" % \
                      (self.options.confidence, self.options.correction))

        overrep = OverrepresentationAnalyser(self.go_tree, self.go_mapping,
                confidence=self.options.confidence,
                min_count=self.options.min_size,
                correction=self.options.correction)
        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id, arch = parts[0], tuple(parts[2].split(";"))
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)

            print(gene_id)
            for term, p_value in cache[arch]:
                print("  %.4f: %s (%s)" % (p_value, term.id, term.name))
            print()

            if len(cache[arch]) == 0:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d" % total_seqs)
        if num_no_annotations:
            self.log.info("%d sequences have no overrepresented annotations :("
                    % num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(" % num_no_domains)
Example no. 24
    def run_real(self):
        """Runs the application"""

        # Load valid sequence IDs (if necessary)
        if self.options.sequences_file:
            self.log.info("Loading sequences from %s..." %
                          self.options.sequences_file)
            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            parser = fasta.Parser(open_anything(self.options.sequences_file))
            parser = fasta.regexp_remapper(parser,
                                           self.options.sequence_id_regexp)
            for seq in parser:
                self.valid_sequence_ids.add(seq.id)
                self.total_sequence_length += len(seq.seq)
        else:
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None

        # Find which sources will be allowed
        if not self.options.include_sources:
            self.sources = complementerset()
        else:
            self.sources = set(self.options.include_sources)
        self.sources.difference_update(self.options.exclude_sources)
        if isinstance(self.sources, complementerset):
            self.log.info("Ignored sources: %s" %
                          ", ".join(self.sources.iterexcluded()))
        else:
            self.log.info("Accepted sources: %s" % ", ".join(self.sources))

        if not self.args:
            self.args = ["-"]

        for arg in self.args:
            # Set up the output formatter
            if self.options.print_totals:
                self.output_formatter = GenomeLevelOutputFormatter(self)
            else:
                self.output_formatter = SequenceLevelOutputFormatter(self)
            # Process the file
            self.process_infile(arg)
            # Print the results
            self.output_formatter.finish()
Example no. 25
    def run_real(self):
        """Runs the application"""

        # Load valid sequence IDs (if necessary)
        if self.options.sequences_file:
            self.log.info("Loading sequences from %s..." % self.options.sequences_file)

            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            parser = fasta.Parser(open_anything(self.options.sequences_file))
            parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)
            for seq in parser:
                self.valid_sequence_ids.add(seq.id)
                self.total_sequence_length += len(seq.seq)
        else:
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None

        # Find which sources will be allowed
        if not self.options.include_sources:
            self.sources = complementerset()
        else:
            self.sources = set(self.options.include_sources)
        self.sources.difference_update(self.options.exclude_sources)
        if isinstance(self.sources, complementerset):
            self.log.info("Ignored sources: %s" % ", ".join(self.sources.iterexcluded()))
        else:
            self.log.info("Accepted sources: %s" % ", ".join(self.sources))

        if not self.args:
            self.args = ["-"]

        for arg in self.args:
            # Set up the output formatter
            if self.options.print_totals:
                self.output_formatter = GenomeLevelOutputFormatter(self)
            else:
                self.output_formatter = SequenceLevelOutputFormatter(self)
            # Process the file
            self.process_infile(arg)
            # Print the results
            self.output_formatter.finish()
Example no. 26
    def process_sequences_file(self, fname):
        """ In this version we use `shelve` to save
            memory (the pairs (protein accession, length) are
            stored in a temporary database. See `process_sequences_file_old`
            for the old version.
        """
        self.log.info("Loading sequences from {}...".format(fname))
        parser = fasta.Parser(open_anything(fname))
        parser = fasta.regexp_remapper(parser,
                                       self.sequence_id_regexp)
        self.filename_shelve = os.path.join(tempfile.gettempdir(),
                                            "shelve_file")
        self.seq_ids_to_length = shelve.open(self.filename_shelve)

        for i, seq in enumerate(parser):
            self.seq_ids_to_length[seq.id] = len(seq.seq)
            if i % 1000000 == 0:
                self.log.info("Read {} seqs".format(i))
                self.seq_ids_to_length.sync()
        self.log.info("...loaded")
Example no. 27
    def read_low_complexity_regions(self, file_name):
        self.low_complexity_regions = defaultdict(list)
        current_prot_id = ""

        if self.sequence_id_regexp:
            import re 
            regexp = re.compile(self.sequence_id_regexp)
        else: 
            regexp = None

        for line in open_anything(file_name):
            line = line.strip()
            if not line:
                continue
            if line[0] == ">":
                current_prot_id = line.split()[0][1:]
                if regexp is not None:
                    current_prot_id = regexp.sub(r'\g<id>', current_prot_id)
            else:
                (left, _, right) = line.split()
                self.low_complexity_regions[current_prot_id].append(
                    (int(left), int(right)))
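A hypothetical input in the shape this parser expects, inferred from the three-field unpacking above: a FASTA-like listing where each `>id` header is followed by `left - right` coordinate lines:

    from collections import defaultdict
    from io import StringIO

    # made-up IDs and coordinates
    data = StringIO(">seqA\n1 - 12\n40 - 55\n>seqB\n3 - 9\n")
    regions = defaultdict(list)
    current = ""
    for line in data:
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            current = line.split()[0][1:]
        else:
            left, _, right = line.split()
            regions[current].append((int(left), int(right)))
    print(dict(regions))
    # {'seqA': [(1, 12), (40, 55)], 'seqB': [(3, 9)]}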
Example no. 28
    def process_file(self, filename):
        """Processes the input file with the given filename"""
        threshold = self.options.threshold
        adj_list = defaultdict(set)
        idgen = UniqueIdGenerator()

        self.log.info("Processing %s..." % filename)
        for line_no, line in enumerate(open_anything(filename)):
            parts = line.strip().split()
            if not parts:
                continue
            if len(parts) < 2:
                raise ValueError("line %d contains only a single ID" % line_no)
            if len(parts) < 3:
                parts.append(1.0)
            else:
                parts[2] = float(parts[2])

            id1, id2, weight = parts[:3]
            if weight < threshold:
                continue

            id1, id2 = idgen[id1], idgen[id2]
            if id1 == id2:
                continue
            if id1 > id2:
                id1, id2 = id2, id1

            adj_list[id1].add(id2)
            adj_list[id2].add(id1)

        names = idgen.values()

        bfs = BreadthFirstSearch(adj_list)
        not_seen = set(range(len(idgen)))
        while not_seen:
            component = list(bfs.run(not_seen.pop()))
            print("\t".join(names[idx] for idx in component))
            not_seen.difference_update(component)
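A self-contained version of the same connected-components pass, with a plain `deque`-based BFS standing in for the project's `BreadthFirstSearch` helper and string IDs instead of the integers produced by `UniqueIdGenerator`:

    from collections import defaultdict, deque

    # toy undirected graph with two components
    adj = defaultdict(set)
    for a, b in [("u", "v"), ("v", "w"), ("x", "y")]:
        adj[a].add(b)
        adj[b].add(a)

    not_seen = set(adj)
    while not_seen:
        queue = deque([not_seen.pop()])
        component = set(queue)
        while queue:
            node = queue.popleft()
            for neighbor in adj[node] - component:
                component.add(neighbor)
                queue.append(neighbor)
        not_seen -= component
        print(sorted(component))   # e.g. ['u', 'v', 'w'], then ['x', 'y']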
Example no. 29
    def run_real(self):
        """Runs the application"""
        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        if self.options.interpro_file:
            self.log.info("Loading known InterPro IDs from %s..." %
                          self.options.interpro_file)
            self.interpro = InterPro.FromFile(self.options.interpro_file)
        else:
            self.interpro = InterPro()

        if self.options.gene_id_file:
            self.log.info("Loading sequence IDs from %s..." %
                          self.options.gene_id_file)
            self.valid_sequence_ids = set()
            for line in open_anything(self.options.gene_id_file):
                self.valid_sequence_ids.add(line.strip())
        else:
            self.valid_sequence_ids = complementerset()

        if self.options.exclusions_log_file:
            self.log.info("Logging excluded sequences to %s." %
                          self.options.exclusions_log_file)
            self.exclusion_log = open(self.options.exclusions_log_file, "a+")
        else:
            self.exclusion_log = None

        self.ignored = set()
        for ignored_source in self.options.ignored:
            parts = ignored_source.split()
            self.ignored.update(parts)

        if not self.args:
            self.args = ["-"]
        if len(self.args) > 1:
            self.error("Only one input file may be given")

        self.process_infile(self.args[0])
Example no. 30
    def FromFile(cls, filename):
        """Constructs this object from an InterPro parent-child mapping file,
        pointed to by the given filename. Both the tree and the ID-name mapping
        will be built from the same file.
        """
        result = cls()
        path_to_root = []

        for line in open_anything(filename):
            line = line.strip()
            dash_count = 0
            while line[dash_count] == "-":
                dash_count += 1
            if dash_count % 2 != 0:
                raise ValueError("dash count in InterPro file not even")

            line = line[dash_count:]
            parts = line.split("::")
            interpro_id, aliases = parts[0], parts[2:]

            level = dash_count // 2 + 1
            if level <= len(path_to_root):
                path_to_root = path_to_root[:level]
                path_to_root[-1] = interpro_id
            else:
                path_to_root.append(interpro_id)
                if level != len(path_to_root):
                    raise ValueError("tree depth increased by more than "
                                     "one between two lines")
            if len(path_to_root) > 1:
                result.tree[interpro_id] = path_to_root[-2]

            for alias in aliases:
                if alias[0:4] == "PTHR" and ":SF" in alias:
                    alias = alias[0:alias.index(":SF")]
                result.mapping[alias] = interpro_id

        return result
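To make the dash-depth convention concrete, a minimal sketch of how the leading dashes map to tree levels (the IDs and names are made up):

    # made-up parent-child lines: two leading dashes per tree level
    for line in ["IPR000001::name one::",
                 "--IPR000100::name two::",
                 "----IPR000200::name three::"]:
        dash_count = len(line) - len(line.lstrip("-"))
        level = dash_count // 2 + 1
        print(level, line.lstrip("-").split("::")[0])
    # 1 IPR000001
    # 2 IPR000100
    # 3 IPR000200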
Example no. 31
    def __init__(self, file_handle):
        """Creates an OBO parser that reads the given file-like object.
        If you want to create a parser that reads an OBO file, do this:

          >>> import gfam.go.obo
          >>> parser = gfam.go.obo.Parser(open("gene_ontology.1_2.obo"))

        Only the headers are read when creating the parser. You can
        access these right after construction as follows:

          >>> parser.headers["format-version"]
          ['1.2']

        To read the stanzas in the file, you must iterate over the
        parser as if it were a list. The iterator yields `Stanza`
        objects.
        """
        self.file_handle = open_anything(file_handle)
        self.line_re = re.compile(r"\s*(?P<tag>[^:]+):\s*(?P<value>.*)")
        self.lineno = 0
        self.headers = {}
        self._extra_line = None
        self._read_headers()
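A quick illustration of the tag/value regex the parser compiles above, applied to a typical OBO header line:

    import re

    line_re = re.compile(r"\s*(?P<tag>[^:]+):\s*(?P<value>.*)")
    match = line_re.match("format-version: 1.2")
    print(match.group("tag"), "->", match.group("value"))
    # format-version -> 1.2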
Example no. 32
    def process_file(self, filename):
        """Processes the input file with the given filename"""
        self.log.info("Processing %s...", filename)
        infile = open_anything(filename)
        neis = defaultdict(set)
        for line_no, line in enumerate(infile):
            parts = line.strip().split()
            if not parts:
                continue
            if len(parts) < 2:
                raise ValueError("line %d contains only a single ID" % line_no)
            neis[parts[0]].add(parts[1])
            neis[parts[1]].add(parts[0])

        if self.options.add_loops:
            for k, vals in neis.items():
                vals.add(k)

        all_ids = sorted(neis.keys())
        # lens = dict((id, len(neis1)) for id, neis1 in enumerate(neis))
        for id1 in all_ids:
            neis1 = neis[id1]
            len1 = float(len(neis1))
            if self.options.only_linked:
                others = sorted(neis1)
            else:
                others = all_ids
            for id2 in others:
                if id2 < id1:
                    continue
                neis2 = neis[id2]
                isect = len(neis2.intersection(neis1))
                sim = isect / (len1+len(neis2)-isect)
                if sim < self.options.min_similarity:
                    continue
                print("%s\t%s\t%.8f" % (id1, id2, sim))
Example no. 33
    def process_file(self, filename):
        """Processes the input file with the given filename"""
        self.log.info("Processing %s..." % filename)
        infile = open_anything(filename)
        neis = defaultdict(set)
        for line_no, line in enumerate(infile):
            parts = line.strip().split()
            if not parts:
                continue
            if len(parts) < 2:
                raise ValueError("line %d contains only a single ID" % line_no)
            neis[parts[0]].add(parts[1])
            neis[parts[1]].add(parts[0])

        if self.options.add_loops:
            for k, v in neis.items():
                v.add(k)

        all_ids = sorted(neis.keys())
        # lens = dict((id, len(neis1)) for id, neis1 in enumerate(neis))
        for id1 in all_ids:
            neis1 = neis[id1]
            len1 = float(len(neis1))
            if self.options.only_linked:
                others = sorted(neis1)
            else:
                others = all_ids
            for id2 in others:
                if id2 < id1:
                    continue
                neis2 = neis[id2]
                isect = len(neis2.intersection(neis1))
                sim = isect / (len1+len(neis2)-isect)
                if sim < self.options.min_similarity:
                    continue
                print "%s\t%s\t%.8f" % (id1, id2, sim)
Example no. 34
 def load_sequences_from_file(self, fname):
     """Loads the sequences from the given file. The file must
     be in FASTA format. You are allowed to pass file pointers
     or names of gzipped/bzipped files here."""
     return self.load_sequences(fasta.Parser(open_anything(fname)))
Example no. 35
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s",
                      self.options.confidence, self.options.correction)

        if self.options.arch_file:
            arch_file_name = self.options.arch_file
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            arch_file = open(arch_file_name, "w")

        confidence = self.options.confidence
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")

        overrep = OverrepresentationAnalyser(self.go_tree,
                                             self.go_mapping,
                                             confidence=confidence,
                                             min_count=self.options.min_size,
                                             correction=self.options.correction)
        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple([x for x in prts if x])
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                if self.options.arch_file:
                    arch_file.write("{}\n".format(parts[3]))  # architecture
                    for term, p_value in cache[arch]:
                        line = "  %.4f: %s (%s)\n" % (p_value, term.id,
                                                      term.name)
                        arch_file.write(line)
                    arch_file.write("\n")
            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print("  %.4f: %s (%s)" % (p_value, term.id, term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info("%d sequences have no overrepresented annots. :(",
                          num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(",
                          num_no_domains)

        if self.options.arch_file:
            arch_file.close()
            if self.options.ignore:
                # we filter the file with the significance value
                filterer = ResultFileFilter(arch_file_name)
                filterer.filter(self.options.arch_file,
                                confidence=self.options.confidence)
Example no. 36
    def process_sequences_file(self, seq_file):
        """Processes the sequences one by one, extracting all the pieces into
        an output fasta file"""
        self.log.info("Processing fasta file %s...", seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)

        ids_to_process = set(self.parts.keys())

        writer = fasta.FastWriter(sys.stdout)
        if self.output_file is not None:
            output_fd = open(self.output_file, "w")
            writer_file = fasta.FastWriter(output_fd)

        for seq in parser:
            seq_id = seq.id
            if seq_id not in self.parts:
                if self.options.try_alternative_splicing:
                    # strip a ".1" suffix explicitly; rstrip(".1") would
                    # strip any run of trailing '.' and '1' characters
                    seq_id = seq_id.strip()
                    if seq_id.endswith(".1"):
                        seq_id = seq_id[:-2]
                    if seq_id not in self.parts:
                        continue
                else:
                    continue

            sequence = seq.seq
            length_seq = len(sequence)
            ids_to_process.remove(seq_id)

            for left, right in self.parts[seq_id]:

                if left < 0:
                    left = length_seq + left + 1
                if right < 0:
                    right = length_seq + right + 1

                right = min(right, length_seq)
                # just in case...

                if left > right:
                    # again, just in case
                    self.log.warning(
                        "Problem with fragment of %s, "
                        "the right part is smaller than "
                        "the left", seq_id)
                    continue

                new_record = None

                if left == 1 and right == length_seq:
                    new_record = seq.fragment(not self.options.keep_ids)
                else:
                    if not self.options.keep_ids:
                        new_id = "%s:%d-%d" % (seq_id, left, right)
                    else:
                        new_id = seq_id
                    new_record = SeqRecord(sequence[(left - 1):right],
                                           id=new_id,
                                           name=seq.name,
                                           description="")
                writer.write(new_record)
                if self.output_file is not None:
                    writer_file.write(new_record)

        if self.output_file is not None:
            output_fd.close()

        if ids_to_process:
            self.log.fatal(
                "The following identifiers of sequences (%s) were "
                "found in the fragments file, but not in the "
                "fasta file", ",".join(ids_to_process))
            return 1
        return 0
Example no. 37
    def process_sequences_file(self, seq_file):
        """Processes the sequences one by one, extracting all the pieces into
        an output fasta file"""
        self.log.info("Processing fasta file %s..." %seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser, 
            self.options.sequence_id_regexp)

        ids_to_process = set(self.parts.keys())

        writer = fasta.FastWriter(sys.stdout)
        if self.output_file is not None:
            output_fd = open(self.output_file, "w")
            writer_file = fasta.FastWriter(output_fd)

        for seq in parser:
            seq_id = seq.id
            if seq_id not in self.parts:
                if self.options.try_alternative_splicing:
                    # strip a ".1" suffix explicitly; rstrip(".1") would
                    # strip any run of trailing '.' and '1' characters
                    seq_id = seq_id.strip()
                    if seq_id.endswith(".1"):
                        seq_id = seq_id[:-2]
                    if seq_id not in self.parts:
                        continue
                else:
                    continue
 
            sequence = seq.seq
            length_seq = len(sequence)
            ids_to_process.remove(seq_id)

            for left, right in self.parts[seq_id]:

                if left < 0:
                    left = length_seq + left + 1
                if right < 0:
                    right = length_seq + right + 1

                right = min(right, length_seq)
                # just in case...

                if left > right:
                    # again, just in case
                    self.log.warning("Problem with fragment of %s, "
                        "the right part is smaller than the left" % seq_id)
                    continue

                new_record = None

                if left == 1 and right == length_seq:
                    new_record = seq.fragment(not self.options.keep_ids)
                else:
                    if not self.options.keep_ids:
                        new_id = "%s:%d-%d" % (seq_id, left, right)
                    else:
                        new_id = seq_id
                    new_record = SeqRecord(sequence[(left-1):right],
                            id=new_id, name=seq.name, description="")
                writer.write(new_record)
                if self.output_file is not None:
                    writer_file.write(new_record)

        if self.output_file is not None:
            output_fd.close()

        if ids_to_process:
            self.log.fatal("The following identifiers of sequences (%s) were "
                    "found in the fragments file, but not in the fasta file"
                    % ",".join(ids_to_process))
            return 1
        return 0
Example no. 38
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s",
                      self.options.confidence, self.options.correction)

        if self.options.arch_file:
            arch_file_name = self.options.arch_file
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            arch_file = open(arch_file_name, "w")

        confidence = self.options.confidence
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")

        overrep = OverrepresentationAnalyser(
            self.go_tree,
            self.go_mapping,
            confidence=confidence,
            min_count=self.options.min_size,
            correction=self.options.correction)
        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple([x for x in prts if x])
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                if self.options.arch_file:
                    arch_file.write("{}\n".format(parts[3]))  # architecture
                    for term, p_value in cache[arch]:
                        line = "  %.4f: %s (%s)\n" % (p_value, term.id,
                                                      term.name)
                        arch_file.write(line)
                    arch_file.write("\n")
            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print("  %.4f: %s (%s)" % (p_value, term.id, term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info("%d sequences have no overrepresented annots. :(",
                          num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(",
                          num_no_domains)

        if self.options.arch_file:
            arch_file.close()
            if self.options.ignore:
                # we filter the file with the significance value
                filterer = ResultFileFilter(arch_file_name)
                filterer.filter(self.options.arch_file,
                                confidence=self.options.confidence)
Example no. 39
 def process_file(self, filename, filter):
     """Processes the given file using the given `filter`."""
     self.log.info("Processing %s..." % filename)
     for line in self.process_lines(open_anything(filename), filter):
         sys.stdout.write(line)
Example no. 40
 def __init__(self, filename):
     self._fp = open_anything(filename)