def load_slice_file(self, slice_file):
    """Parses the slice file and stores the requested fragments in
    ``self.parts``.

    Every non-blank line is split on whitespace. The first field is the
    fragment ID; an optional second field is the left limit and an
    optional third field is the right limit (both parsed with ``int``).
    A missing left limit defaults to 1 and a missing right limit to -1.
    Lines with more than three fields keep both defaults.

    ``self.parts`` is replaced by a fresh ``defaultdict(list)`` mapping
    each ID to a list of two-element ``array.array('i')`` objects
    ``[left, right]``. Entries where either limit is zero are skipped
    with a warning.
    """
    self.log.info("Loading slices from %s...", slice_file)
    self.parts = defaultdict(list)
    for raw_line in open_anything(slice_file):
        fields = raw_line.strip().split()
        if not fields:
            continue

        fragment_id = fields[0]
        num_fields = len(fields)

        # Defaults cover the case where no limits are given at all
        start, end = 1, -1
        if num_fields == 3:
            # Both limits are given
            start, end = int(fields[1]), int(fields[2])
        elif num_fields == 2:
            # Only the left limit is given
            start = int(fields[1])

        if start != 0 and end != 0:
            self.parts[fragment_id].append(array.array('i', [start, end]))
            continue

        self.log.warning(
            "Ignoring fragment ID: %s, "
            "requested start position is zero", fragment_id)
def _read_goa_file(self, goa_file):
    """Reads the GOA file and returns a dict mapping each protein ID to
    a set of strings: the up-propagated GO terms (as returned by
    ``self.go_tree.ancestors``) of an accepted annotation.

    Lines starting with ``!`` or ``#`` are skipped. Only annotations
    whose evidence code (7th tab-separated column) is in
    ``self.valid_ev_codes`` are kept.

    Side effects: ``self.go_terms_gs`` is set to the union of all the
    terms seen, and ``self.go_terms_gs_ont`` maps each ontology in
    ``self.terms_per_ontology`` to the subset of those terms that belong
    to it. One summary line per ontology is printed to standard output.
    """
    d = {}
    self.go_terms_gs = set()
    for line in open_anything(goa_file):
        # Skip both kinds of comment lines. The previous condition
        # (``not startswith("!") or startswith("#")``) let "#" lines
        # through to the column parser.
        if line.startswith(("!", "#")):
            continue

        # split line, obtain protein_id, go_term, and ev_code
        fields = line.split("\t")
        prot_id, goterm, evcode = fields[1], fields[4], fields[6]
        if evcode not in self.valid_ev_codes:
            continue

        # Materialise the terms: on Python 3 ``map`` is a one-shot
        # iterator, so storing it and then feeding it to ``update``
        # would leave an exhausted iterator in the result.
        terms = set(map(str, self.go_tree.ancestors(goterm)))
        # NOTE(review): a later line for the same protein replaces the
        # earlier terms instead of extending them -- confirm whether the
        # annotations of a protein should be accumulated.
        d[prot_id] = terms
        self.go_terms_gs.update(terms)

    self.go_terms_gs_ont = {}
    for ontology, terms in self.terms_per_ontology.items():
        gs_terms_o = self.go_terms_gs & terms
        self.go_terms_gs_ont[ontology] = gs_terms_o
        print("# of Terms in {} in GOA file: {}".format(
            ontology, len(gs_terms_o)))
    return d
def load_slice_file(self, slice_file):
    """Reads the slice file into ``self.parts``, a ``defaultdict`` of
    lists keyed by fragment ID.

    Each non-blank line holds an ID, optionally followed by a left limit
    and then a right limit. The limits default to 1 and -1; they are
    only read when the line has exactly two or three fields. Every
    accepted line appends ``array.array('i', [left, right])`` to the
    list of its ID. Lines where one of the limits is zero are skipped
    with a warning.
    """
    self.log.info("Loading slices from %s..." % slice_file)
    self.parts = defaultdict(list)
    for entry in open_anything(slice_file):
        tokens = entry.strip().split()
        if not tokens:
            continue

        name = tokens[0]
        # [left, right]; these defaults apply when a limit is missing
        bounds = [1, -1]
        if len(tokens) in (2, 3):
            bounds[0] = int(tokens[1])
        if len(tokens) == 3:
            bounds[1] = int(tokens[2])

        if 0 in bounds:
            self.log.warning("Ignoring fragment ID: %s, "
                             "requested start position is zero" % name)
        else:
            self.parts[name].append(array.array('i', bounds))
def download_interpro(self, url):
    """Copies the InterPro ID-name mapping found at `url` to the
    standard output, line by line and without any modification.
    """
    self.log.info("Downloading InterPro names from %s..." % url)
    source = open_anything(url)
    for row in source:
        sys.stdout.write(row)
def process_file(self, filename):
    """Processes the given input file.

    The file is read through ``fasta.Parser``, the sequence IDs are
    passed through ``fasta.regexp_remapper`` using
    ``self.options.sequence_id_regexp``, and the resulting ID of every
    sequence is printed on its own line.
    """
    self.log.info("Processing %s...", filename)
    parser = fasta.Parser(open_anything(filename))
    parser = fasta.regexp_remapper(parser,
                                   self.options.sequence_id_regexp)
    for seq in parser:
        # Function-call form: the bare ``print seq.id`` statement is a
        # syntax error on Python 3, while this prints the same thing on
        # both Python 2 and Python 3.
        print(seq.id)
def process_file(self, filename):
    """Prints the (remapped) ID of every sequence in the input file.

    The file is parsed with ``fasta.Parser`` and the records are passed
    through ``fasta.regexp_remapper`` with
    ``self.options.sequence_id_regexp`` before their IDs are printed,
    one per line.
    """
    self.log.info("Processing %s..." % filename)
    records = fasta.Parser(open_anything(filename))
    id_regexp = self.options.sequence_id_regexp
    for record in fasta.regexp_remapper(records, id_regexp):
        print(record.id)
def load(self, filename):
    """Loads ID-name assignments from a simple tab-separated flat file.

    Each line is stripped and cut at its first tab character: the text
    before the tab is the ID and everything after it is the name, which
    is stored in ``self.names``. Lines not containing any tab characters
    are silently ignored."""
    for row in open_anything(filename):
        key, tab, value = row.strip().partition("\t")
        if tab:
            self.names[key] = value
def process_file(self, slice_file):
    """Processes the given slice file and writes the requested slices
    to the standard output in FASTA format.

    Each non-blank line contains a sequence ID, optionally followed by
    a start and an end position (1-based, inclusive; negative values
    count from the end of the sequence). A missing end position means
    the end of the sequence; a line with only an ID yields the whole
    sequence under its original ID.

    Sequences are looked up in ``self.seqs``. If the ID is unknown and
    ``self.options.try_alternative_splicing`` is set, ``ID.1`` is tried
    as well. IDs that are still unknown are skipped with a warning when
    ``self.options.ignore_unknown`` is set; otherwise the error is
    logged and the method returns 1. Requests whose start or end
    position is zero are skipped with a warning.

    Unless ``self.options.keep_ids`` is set, sliced records are named
    ``ID:start-end``.

    Returns 1 on an unknown sequence ID (see above), ``None`` otherwise.
    """
    self.log.info("Processing input file: %s...", slice_file)
    writer = fasta.Writer(sys.stdout)
    for line in open_anything(slice_file):
        parts = line.strip().split()
        if not parts:
            continue

        seq_id, record = parts[0], None
        try:
            record = self.seqs[seq_id]
        except KeyError:
            if self.options.try_alternative_splicing:
                try:
                    record = self.seqs[seq_id+".1"]
                except KeyError:
                    pass

        if record is None:
            if self.options.ignore_unknown:
                self.log.warning("Ignoring unknown sequence ID: %s",
                                 seq_id)
                continue
            self.log.fatal("Unknown sequence ID in input file: %s",
                           seq_id)
            return 1

        if len(parts) == 1:
            start, end = 1, len(record.seq)
            new_id = record.id
        else:
            start = int(parts[1])
            if len(parts) == 2:
                end = len(record.seq)
            else:
                end = int(parts[2])

            # Positions are 1-based, so zero is invalid. Actually skip
            # the entry: previously the warning was logged but the
            # record was still sliced (with a bogus range) and written.
            if start == 0:
                self.log.warning("Ignoring sequence ID: %s, "
                                 "requested start position is zero",
                                 seq_id)
                continue
            if end == 0:
                self.log.warning("Ignoring sequence ID: %s, "
                                 "requested end position is zero",
                                 seq_id)
                continue

            # Negative positions are counted from the end
            if start < 0:
                start = len(record.seq) + start + 1
            if end < 0:
                end = len(record.seq) + end + 1

            if not self.options.keep_ids:
                new_id = "%s:%d-%d" % (record.id, start, end)
            else:
                new_id = seq_id

        new_record = SeqRecord(record.seq[(start-1):end],
                               id=new_id, name=record.name,
                               description="")
        writer.write(new_record)
def process_sequences_file(self, fname):
    """Stores the length of every sequence of the FASTA file `fname` in
    ``self.seq_ids_to_length``, keyed by sequence ID.

    The IDs are the ones produced by ``fasta.regexp_remapper`` with
    ``self.sequence_id_regexp``. The mapping is reset to an empty dict
    before loading.
    """
    self.log.info("Loading sequences from %s..." % fname)
    lengths = self.seq_ids_to_length = {}
    records = fasta.Parser(open_anything(fname))
    records = fasta.regexp_remapper(records, self.sequence_id_regexp)
    for record in records:
        lengths[record.id] = len(record.seq)
def load_sequences(self, seq_file):
    """Reads the FASTA file `seq_file` and stores its records in
    ``self.seqs``, a dict keyed by sequence ID.

    The IDs are remapped with ``fasta.regexp_remapper`` using
    ``self.options.sequence_id_regexp``. If an ID occurs more than once,
    the last record wins."""
    self.log.info("Loading sequences from %s..." % seq_file)
    records = fasta.regexp_remapper(
        fasta.Parser(open_anything(seq_file)),
        self.options.sequence_id_regexp)
    self.seqs = {record.id: record for record in records}
def download_superfamily(self, url):
    """Downloads the most recent mappings from SCOP sunids to human
    readable names, derives the Superfamily IDs from the SCOP sunids,
    and prints the mapping to the standard output.

    The most recent version of SCOP is identified by applying a regexp
    to the HTML output of the given page. The regexp assumes that the
    most recent description file is linked on the given page using an
    ``<a>`` tag with ``href`` equal to ``dir.des.scop.txt_X.XX``, where
    ``X.XX`` is the version number. Versions are compared component by
    component as integers, so ``1.100`` is newer than ``1.75``.

    If such an identification fails, this method will return without
    printing anything but a warning on the logging stream.

    Each output line is ``SSF<sunid>``, a tab and the description, for
    every ``sf`` entry of the description file that has a description.
    """
    self.log.info("Downloading SCOP page from %s..." % url)
    contents = open_anything(url).read()

    des_link_regexp = re.compile(
        r"<a href=\"dir.des.scop.txt_([0-9.]+)\">", re.IGNORECASE)
    max_version, max_version_nosplit = None, None
    for match in des_link_regexp.finditer(contents):
        # Empty components (e.g. from a trailing dot) are dropped so
        # that they cannot make int() fail.
        version = [int(comp) for comp in match.group(1).split(".")
                   if comp]
        if not version:
            continue
        # Explicit None check: ordering a list against None raises
        # TypeError on Python 3.
        if max_version is None or version > max_version:
            max_version = version
            max_version_nosplit = match.group(1)

    version = max_version_nosplit
    if version is None:
        self.log.warning("Cannot infer the most recent version of SCOP, "
                         "skipping Superfamily IDs")
        return

    self.log.info("Most recent SCOP version is assumed to be %s" % version)

    url = "%sdir.des.scop.txt_%s" % (url, version)
    for line in open_anything(url):
        if line[0] == '#':
            continue
        parts = line.split("\t", 4)
        # Lines with fewer than five columns cannot describe a
        # superfamily; skip them instead of failing with IndexError.
        if len(parts) < 5 or parts[1] != "sf" or not parts[4]:
            continue
        # parts[4] is the tail of the line and is written as it is; no
        # line terminator is appended here.
        sys.stdout.write("SSF%s\t%s" % (parts[0], parts[4]))
def download_superfamily(self, url):
    """Downloads the most recent mappings from SCOP sunids to human
    readable names, derives the Superfamily IDs from the SCOP sunids,
    and prints the mapping to the standard output.

    The most recent version of SCOP is identified by applying a regexp
    to the HTML output of the given page. The regexp assumes that the
    most recent description file is linked on the given page using an
    ``<a>`` tag with ``href`` equal to ``dir.des.scop.txt_X.XX``, where
    ``X.XX`` is the version number. Versions are compared component by
    component as integers.

    If such an identification fails, this method will return without
    printing anything but a warning on the logging stream.
    """
    self.log.info("Downloading SCOP page from %s..." % url)
    contents = open_anything(url).read()

    des_link_regexp = re.compile(
        r"<a href=\"dir.des.scop.txt_([0-9.]+)\">", re.IGNORECASE)

    newest = None           # newest version as a list of ints
    newest_text = None      # the same version exactly as it was linked
    for match in des_link_regexp.finditer(contents):
        version_text = match.group(1)
        # Skip empty components so that int() cannot fail on them
        candidate = [int(comp) for comp in version_text.split(".")
                     if comp]
        if not candidate:
            continue
        # ``candidate > None`` raises TypeError on Python 3, so the
        # first match has to be accepted explicitly.
        if newest is None or candidate > newest:
            newest, newest_text = candidate, version_text

    version = newest_text
    if version is None:
        self.log.warning("Cannot infer the most recent version of SCOP, "
                         "skipping Superfamily IDs")
        return

    self.log.info("Most recent SCOP version is assumed to be %s" % version)

    url = "%sdir.des.scop.txt_%s" % (url, version)
    for line in open_anything(url):
        if line[0] == '#':
            continue
        parts = line.split("\t", 4)
        # Guard against short lines, which used to raise IndexError
        if len(parts) < 5 or parts[1] != "sf" or not parts[4]:
            continue
        # parts[4] is the tail of the line and is written unchanged
        sys.stdout.write("SSF%s\t%s" % (parts[0], parts[4]))
def download_pfam(self, url):
    """Prints the PFam ID-name mapping found at `url` to the standard
    output.

    Lines whose first character is ``#`` are skipped. Every other line
    is split at its first two tabs; when a non-empty third field exists,
    the first and the third fields are written, separated by a tab. The
    third field is written as it is, without appending a newline.
    """
    self.log.info("Downloading PFam names from %s..." % url)
    for row in open_anything(url):
        if row[0] != "#":
            fields = row.split("\t", 2)
            if len(fields) >= 3 and fields[2]:
                sys.stdout.write("%s\t%s" % (fields[0], fields[2]))
def download_smart(self, url):
    """Prints the Smart ID-name mapping found at `url` to the standard
    output.

    Each line is split at its first three tabs. Lines with fewer than
    three fields or with an empty third field are skipped, and so is the
    line whose second and third fields are ``ACC`` and ``DEFINITION``.
    For every remaining line the second and third fields are written,
    separated by a tab and followed by a newline.
    """
    self.log.info("Downloading Smart names from %s..." % url)
    for row in open_anything(url):
        fields = row.split("\t", 3)
        if len(fields) < 3 or not fields[2]:
            continue
        accession, definition = fields[1], fields[2]
        if (accession, definition) == ("ACC", "DEFINITION"):
            continue
        sys.stdout.write("%s\t%s\n" % (accession, definition))
def process_file(self, input_file):
    """Prints the GO terms assigned to each sequence of the given
    domain architecture file.

    Each line is tab-separated: the first column is the gene ID and the
    fourth column is the architecture, a list of domains delimited by
    ``;``, ``{`` or ``}``. The terms of an architecture are the terms
    that ``self.go_mapping.get_left`` returns for its domains, minus
    every term that appears beyond the first element of a path returned
    by ``self.go_tree.paths_to_root``; they are sorted by the number of
    items ``self.go_mapping.get_right`` returns for them. Results are
    computed once per distinct architecture.

    Sequences whose architecture is ``NO_ASSIGNMENT`` are only counted.
    Summary counts are logged at the end.
    """
    terms_by_arch = {}
    unlabelled = 0
    domainless = 0
    seq_count = 0

    for row in open_anything(input_file):
        columns = row.strip().split("\t")
        gene_id = columns[0]
        pieces = columns[3].replace("{", ";").replace("}", ";").split(";")
        arch = tuple([piece for piece in pieces if piece])

        seq_count += 1
        if arch == ("NO_ASSIGNMENT", ):
            domainless += 1
            unlabelled += 1
            continue

        labels = terms_by_arch.get(arch)
        if labels is None:
            candidates = set()
            for domain in arch:
                candidates.update(self.go_mapping.get_left(domain, []))
            # Drop every term found further along a path to the root
            for path in self.go_tree.paths_to_root(*list(candidates)):
                candidates.difference_update(path[1:])
            labels = sorted(
                candidates,
                key=lambda x: len(self.go_mapping.get_right(x, [])))
            terms_by_arch[arch] = labels

        print(gene_id)
        for term in labels:
            print(" %s (%s)" % (term.id, term.name))
        print()

        if not labels:
            unlabelled += 1

    self.log.info("Total number of sequences processed: %d", seq_count)
    if unlabelled:
        self.log.info(
            "Could not assign functional label "
            "to %d sequences :(", unlabelled)
    if domainless:
        self.log.info("%d sequences have no domains at all :(",
                      domainless)
def __init__(self, file_handle):
    """Creates an annotation file parser for `file_handle`.

    `file_handle` is handed to ``open_anything`` and the result is
    stored in ``self.file_handle``, so both file-like objects and
    filenames are accepted. If a filename ends in ``.gz``, the file is
    assumed to contain gzipped data and it will be unzipped on the fly.
    Example::

        >>> import gfam.go as go
        >>> parser = go.AnnotationFile("gene_association.sgd.gz")

    To read the annotations in the file, you must iterate over the
    parser as if it were a list. The iterator yields `Annotation`
    objects.
    """
    self.lineno = 0
    self.file_handle = open_anything(file_handle)
def process_file(self, input_file):
    """Processes the given input file that contains the domain
    architectures and prints the GO terms assigned to each sequence.

    The gene ID is taken from the first tab-separated column and the
    architecture from the fourth one (domains delimited by ``;``, ``{``
    or ``}``). For each distinct architecture the candidate terms come
    from ``self.go_mapping.get_left``; terms occurring beyond the first
    element of any path given by ``self.go_tree.paths_to_root`` are
    removed and the rest is sorted by the number of items that
    ``self.go_mapping.get_right`` returns.

    ``NO_ASSIGNMENT`` architectures are counted but not printed. The
    totals are logged at the end.
    """
    cache = {}
    stats = {"total": 0, "no_annotations": 0, "no_domains": 0}

    for line in open_anything(input_file):
        parts = line.strip().split("\t")
        gene_id = parts[0]
        raw_arch = parts[3].replace("{", ";").replace("}", ";")
        arch = tuple(filter(None, raw_arch.split(";")))

        stats["total"] += 1
        if arch == ("NO_ASSIGNMENT", ):
            stats["no_domains"] += 1
            stats["no_annotations"] += 1
            continue

        try:
            terms = cache[arch]
        except KeyError:
            selected = set()
            for domain in arch:
                selected.update(self.go_mapping.get_left(domain, []))
            # Remove the terms that lie further along a path to the root
            for path in self.go_tree.paths_to_root(*list(selected)):
                selected.difference_update(path[1:])
            terms = sorted(
                selected,
                key=lambda x: len(self.go_mapping.get_right(x, [])))
            cache[arch] = terms

        print(gene_id)
        for term in terms:
            print(" %s (%s)" % (term.id, term.name))
        print()

        if not terms:
            stats["no_annotations"] += 1

    self.log.info("Total number of sequences processed: %d",
                  stats["total"])
    if stats["no_annotations"]:
        self.log.info("Could not assign functional label "
                      "to %d sequences :(", stats["no_annotations"])
    if stats["no_domains"]:
        self.log.info("%d sequences have no domains at all :(",
                      stats["no_domains"])
def process_sequences_file_old(self, fname):
    """Old in-memory loader of the sequence lengths.

    Reads the FASTA file `fname`, remaps the IDs with
    ``self.sequence_id_regexp`` and stores a plain dict mapping each
    sequence ID to its length in ``self.seq_ids_to_length``. Progress
    is logged for the first sequence and after every million sequences.
    """
    self.log.info("Loading sequences from %s..." % fname)
    records = fasta.Parser(open_anything(fname))
    records = fasta.regexp_remapper(records, self.sequence_id_regexp)

    lengths = {}
    for count, record in enumerate(records):
        lengths[record.id] = len(record.seq)
        if count % 1000000 == 0:
            self.log.info("Read {} seqs".format(count))
    self.log.info("...loaded")
    self.seq_ids_to_length = lengths
def separate_sequences(self, table, sequences_file):
    """Separates the fasta sequence file in individual fasta files, one
    per cluster.

    `table` maps cluster names to iterables of sequence IDs and
    `sequences_file` is the FASTA file that contains the sequences. For
    each cluster, a file called ``<cluster name>.faa`` is written into
    ``self.sequences_dir``.

    Raises ``KeyError`` if a cluster refers to a sequence ID that is not
    in `sequences_file`.
    """
    reader = fasta.Parser(open_anything(sequences_file))
    seqs = {seq.id: seq for seq in reader}

    for cluster_name, cluster_seqs in table.items():
        output_file_name = self.sequences_dir + os.path.sep + cluster_name
        # The context manager closes the file even if a lookup or a
        # write fails; previously the handle leaked in that case.
        with open(output_file_name + ".faa", "w") as output_fd:
            writer = fasta.Writer(output_fd)
            for sequence in cluster_seqs:
                obj = SeqRecord(seqs[sequence].seq, sequence, "", "")
                writer.write(obj)
def read_goa_file(self, goa_file, ev_codes):
    """Reads the GOA file into a ``bidict`` whose left side holds the
    protein IDs and associates them with GO terms.

    Lines starting with ``!`` or ``#`` are skipped. The protein ID, the
    GO term ID and the evidence code are taken from the 2nd, 5th and 7th
    tab-separated columns; an annotation is stored only when its
    evidence code is in `ev_codes`. Term IDs are resolved with
    ``self.go_tree.lookup``.
    """
    mapping = bidict()
    for record in open_anything(goa_file):
        if record.startswith(("!", "#")):
            continue
        # Only the first seven columns are needed
        columns = record.split("\t", 7)
        prot_id, goterm, evcode = columns[1], columns[4], columns[6]
        if evcode not in ev_codes:
            continue
        mapping.add_left(prot_id, self.go_tree.lookup(goterm))

    loaded = str(mapping.len_left())
    self.log.info("GOA file read. " + loaded + " proteins loaded")
    return mapping
def from_file(cls, filename, tree):
    """Builds a mapping from the mapping file `filename`.

    The format of this file should be identical to the official
    ``interpro2go`` file provided by the Gene Ontology project. Lines
    whose first character is ``!`` and lines that do not match the
    expected pattern are skipped. `tree` is a Gene Ontology tree object
    (see `gfam.go.Tree`); its ``lookup`` method is used to turn GO IDs
    into terms."""
    pattern = re.compile("InterPro:([A-Z0-9]+) .* > .* ; (GO:[0-9]+)")
    mapping = cls()
    for row in open_anything(filename):
        if row[0] != '!':
            found = pattern.match(row)
            if found:
                interpro_id, go_id = found.groups()
                mapping.add_annotation(interpro_id, tree.lookup(go_id))
    return mapping
def read_goa_file(self, goa_file, ev_codes):
    """Reads the GOA file and returns a ``bidict`` that associates each
    protein ID (left side) with its GO terms.

    Comment lines (starting with ``!`` or ``#``) are ignored. The
    columns used are the 2nd (protein ID), the 5th (GO term ID) and the
    7th (evidence code); annotations whose evidence code is not in
    `ev_codes` are dropped. GO term IDs are resolved through
    ``self.go_tree.lookup``.
    """
    result = bidict()
    annotation_lines = (row for row in open_anything(goa_file)
                        if not row.startswith(("!", "#")))
    for row in annotation_lines:
        # split line, obtain protein_id, go_term, and ev_code
        cells = row.split("\t", 7)
        prot_id, goterm, evcode = cells[1], cells[4], cells[6]
        if evcode in ev_codes:
            result.add_left(prot_id, self.go_tree.lookup(goterm))

    num_proteins = str(result.len_left())
    self.log.info("GOA file read. %s proteins loaded", num_proteins)
    return result
def process_file(self, input_file):
    """Processes the given input file that contains the domain
    architectures.

    Each line is tab-separated; the first column is the gene ID and the
    third column is the architecture, a ``;``-separated list of domains.
    Every distinct architecture is tested once with an
    ``OverrepresentationAnalyser`` configured from ``self.options``
    (``confidence``, ``min_size``, ``correction``). For each sequence
    the gene ID is printed, followed by one line per ``(term, p-value)``
    pair and a blank line.

    Sequences whose architecture is ``NO_ASSIGNMENT`` are only counted.
    Summary counts are logged at the end.
    """
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  self.options.confidence, self.options.correction)

    overrep = OverrepresentationAnalyser(
        self.go_tree, self.go_mapping,
        confidence=self.options.confidence,
        min_count=self.options.min_size,
        correction=self.options.correction)

    cache = {}
    num_no_annotations = 0
    num_no_domains = 0
    total_seqs = 0

    for line in open_anything(input_file):
        parts = line.strip().split("\t")
        gene_id, arch = parts[0], tuple(parts[2].split(";"))

        total_seqs += 1
        if arch == ("NO_ASSIGNMENT", ):
            num_no_domains += 1
            num_no_annotations += 1
            continue

        if arch not in cache:
            cache[arch] = overrep.test_group(arch)
        results = cache[arch]

        # Single-argument print calls give the same output on Python 2
        # and Python 3; the former print statements were Python 2 only.
        print(gene_id)
        for term, p_value in results:
            print(" %.4f: %s (%s)" % (p_value, term.id, term.name))
        print("")

        if len(results) == 0:
            num_no_annotations += 1

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        self.log.info("%d sequences have no overrepresented annotations :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)
def run_real(self):
    """Runs the application.

    Three steps are performed: (1) the set of valid sequence IDs and
    the total sequence length are loaded from
    ``self.options.sequences_file`` if it is given, otherwise every ID
    is accepted and the total length is ``None``; (2) the accepted
    sources are derived from ``self.options.include_sources`` and
    ``self.options.exclude_sources``; (3) every input file in
    ``self.args`` (``-`` if there is none) is processed with a fresh
    output formatter.
    """
    # Step 1: valid sequence IDs
    sequences_file = self.options.sequences_file
    if not sequences_file:
        self.valid_sequence_ids = complementerset()
        self.total_sequence_length = None
    else:
        self.log.info("Loading sequences from %s..." % sequences_file)
        self.total_sequence_length = 0
        self.valid_sequence_ids = set()
        records = fasta.Parser(open_anything(self.options.sequences_file))
        records = fasta.regexp_remapper(records,
                                        self.options.sequence_id_regexp)
        for record in records:
            self.valid_sequence_ids.add(record.id)
            self.total_sequence_length += len(record.seq)

    # Step 2: sources to be taken into account
    if self.options.include_sources:
        self.sources = set(self.options.include_sources)
    else:
        self.sources = complementerset()
    self.sources.difference_update(self.options.exclude_sources)

    if isinstance(self.sources, complementerset):
        excluded = ", ".join(self.sources.iterexcluded())
        self.log.info("Ignored sources: %s" % excluded)
    else:
        accepted = ", ".join(self.sources)
        self.log.info("Accepted sources: %s" % accepted)

    # Step 3: process the input files one by one
    if not self.args:
        self.args = ["-"]

    for infile in self.args:
        # Every input file gets its own output formatter
        if self.options.print_totals:
            formatter_class = GenomeLevelOutputFormatter
        else:
            formatter_class = SequenceLevelOutputFormatter
        self.output_formatter = formatter_class(self)

        self.process_infile(infile)
        self.output_formatter.finish()
def process_sequences_file(self, fname):
    """Loads the length of every sequence of the FASTA file `fname`
    into ``self.seq_ids_to_length``.

    In this version we use `shelve` to save memory: the pairs (protein
    accession, length) are stored in a temporary database whose path is
    kept in ``self.filename_shelve``. See `process_sequences_file_old`
    for the old version.
    """
    self.log.info("Loading sequences from {}...".format(fname))
    parser = fasta.Parser(open_anything(fname))
    parser = fasta.regexp_remapper(parser, self.sequence_id_regexp)

    # NOTE(review): the path is fixed and lives in the shared temporary
    # directory, so two concurrent runs would use the same database.
    self.filename_shelve = os.path.join(tempfile.gettempdir(),
                                        "shelve_file")
    # flag="n" always starts from an empty database. With the default
    # flag an existing file is reopened, so the entries written by a
    # previous run would still be part of the mapping.
    self.seq_ids_to_length = shelve.open(self.filename_shelve, flag="n")

    for i, seq in enumerate(parser):
        self.seq_ids_to_length[seq.id] = len(seq.seq)
        if i % 1000000 == 0:
            self.log.info("Read {} seqs".format(i))
            self.seq_ids_to_length.sync()
    # Flush whatever was written after the last periodic sync
    self.seq_ids_to_length.sync()
    self.log.info("...loaded")
def read_low_complexity_regions(self, file_name):
    """Reads low complexity regions from `file_name` into
    ``self.low_complexity_regions``, a ``defaultdict`` mapping protein
    IDs to lists of ``(left, right)`` integer tuples.

    A line starting with ``>`` begins a new protein; its ID is the
    first whitespace-separated token without the leading ``>``. When
    ``self.sequence_id_regexp`` is set, matches of that regexp in the
    ID are replaced by their ``id`` group. Every other non-blank line
    must consist of exactly three whitespace-separated fields, of which
    the first and the last are the limits of a region that belongs to
    the current protein.
    """
    self.low_complexity_regions = defaultdict(list)
    current_prot_id = ""

    regexp = None
    if self.sequence_id_regexp:
        import re
        regexp = re.compile(self.sequence_id_regexp)

    for raw in open_anything(file_name):
        entry = raw.strip()
        if not entry:
            continue

        if entry[0] != ">":
            # Region line; the middle field is not used
            (left, _, right) = entry.split()
            region = (int(left), int(right))
            self.low_complexity_regions[current_prot_id].append(region)
            continue

        # Header line: switch to the next protein
        current_prot_id = entry.split()[0][1:]
        if regexp is not None:
            current_prot_id = regexp.sub(r'\g<id>', current_prot_id)
def process_file(self, filename):
    """Processes the input file with the given filename and prints the
    connected components of the graph it describes.

    Each non-blank line contains two IDs and an optional weight
    (default: 1.0), separated by whitespace. Edges whose weight is
    below ``self.options.threshold`` and loop edges are ignored. Each
    component is printed on one line as a tab-separated list of IDs.

    Raises ``ValueError`` if a line contains only a single ID; the
    message contains the 1-based number of that line.
    """
    threshold = self.options.threshold
    adj_list = defaultdict(set)
    idgen = UniqueIdGenerator()

    self.log.info("Processing %s..." % filename)

    # Lines are counted from 1 so that the number reported in the error
    # message matches what an editor shows (it used to be 0-based).
    for line_no, line in enumerate(open_anything(filename), 1):
        parts = line.strip().split()
        if not parts:
            continue
        if len(parts) < 2:
            raise ValueError("line %d contains only a single ID" % line_no)

        weight = float(parts[2]) if len(parts) >= 3 else 1.0
        if weight < threshold:
            continue

        id1, id2 = idgen[parts[0]], idgen[parts[1]]
        if id1 == id2:
            continue
        # The edge is stored in both directions, so the order of the
        # two endpoints does not matter.
        adj_list[id1].add(id2)
        adj_list[id2].add(id1)

    names = idgen.values()
    bfs = BreadthFirstSearch(adj_list)
    not_seen = set(range(len(idgen)))
    while not_seen:
        component = list(bfs.run(not_seen.pop()))
        print("\t".join(names[idx] for idx in component))
        not_seen.difference_update(component)
def run_real(self):
    """Runs the application.

    Sets up the state that the processing of the input file relies on
    (maximum allowed overlap, InterPro data, valid sequence IDs,
    exclusion log, ignored sources) from ``self.options`` and then
    processes the single input file in ``self.args`` (``-`` when no
    file was given).
    """
    AssignmentOverlapChecker.max_overlap = self.options.max_overlap

    interpro_file = self.options.interpro_file
    if not interpro_file:
        self.interpro = InterPro()
    else:
        self.log.info("Loading known InterPro IDs from %s..." %
                      interpro_file)
        self.interpro = InterPro.FromFile(interpro_file)

    gene_id_file = self.options.gene_id_file
    if not gene_id_file:
        self.valid_sequence_ids = complementerset()
    else:
        self.log.info("Loading sequence IDs from %s..." % gene_id_file)
        self.valid_sequence_ids = set()
        self.valid_sequence_ids.update(
            row.strip() for row in open_anything(gene_id_file))

    exclusions_log_file = self.options.exclusions_log_file
    if not exclusions_log_file:
        self.exclusion_log = None
    else:
        self.log.info("Logging excluded sequences to %s." %
                      exclusions_log_file)
        self.exclusion_log = open(exclusions_log_file, "a+")

    # Each option value may hold several whitespace-separated sources
    self.ignored = set()
    for source_list in self.options.ignored:
        self.ignored.update(source_list.split())

    if not self.args:
        self.args = ["-"]
    if len(self.args) > 1:
        self.error("Only one input file may be given")

    self.process_infile(self.args[0])
def FromFile(cls, filename):
    """Constructs this object from an InterPro parent-child mapping
    file, pointed to by the given filename. Both the tree and the
    ID-name mapping will be built from the same file.

    Each line starts with an even number of dashes that encodes the
    depth of the entry (two dashes per level), followed by fields
    separated by ``::``. The first field is the InterPro ID; the fields
    from the third one onwards are treated as aliases and are stored in
    ``result.mapping``. The parent of every non-root entry is stored in
    ``result.tree``.

    Raises ``ValueError`` if the number of leading dashes is odd or if
    the depth grows by more than one level between two lines.
    """
    result = cls()
    # IDs on the path from the root to the entry that was read last
    path_to_root = []
    for line in open_anything(filename):
        line = line.strip()
        # Count the leading dashes.
        # NOTE(review): an empty line or a line made only of dashes
        # makes ``line[dash_count]`` raise IndexError -- confirm that
        # the input can never contain such lines.
        dash_count = 0
        while line[dash_count] == "-":
            dash_count += 1
        if dash_count % 2 != 0:
            raise ValueError("dash count in InterPro file not even")
        line = line[dash_count:]

        parts = line.split("::")
        # parts[1] is not used here
        interpro_id, aliases = parts[0], parts[2:]

        # Two dashes per level; entries without dashes are at level 1
        level = dash_count // 2 + 1
        if level <= len(path_to_root):
            # Same depth or shallower than the previous entry: cut the
            # path back and replace its last element
            path_to_root = path_to_root[:level]
            path_to_root[-1] = interpro_id
        else:
            # Deeper than the previous entry
            path_to_root.append(interpro_id)

        # After appending a single element the path can only match the
        # level if the depth grew by exactly one
        if level != len(path_to_root):
            raise ValueError("tree depth increased by more than "
                             "one between two lines")

        # The parent is the element just before the entry in the path
        if len(path_to_root) > 1:
            result.tree[interpro_id] = path_to_root[-2]

        for alias in aliases:
            # Aliases starting with PTHR are cut at ":SF" so that only
            # the part before it is used as the key
            if alias[0:4] == "PTHR" and ":SF" in alias:
                alias = alias[0:alias.index(":SF")]
            result.mapping[alias] = interpro_id
    return result
def __init__(self, file_handle):
    """Creates an OBO parser that reads `file_handle`.

    `file_handle` is passed through ``open_anything`` before use. If
    you want to create a parser that reads an OBO file, do this:

      >>> import gfam.go.obo
      >>> parser = gfam.go.obo.Parser(open("gene_ontology.1_2.obo"))

    Only the headers are read when creating the parser. You can access
    these right after construction as follows:

      >>> parser.headers["format-version"]
      ['1.2']

    To read the stanzas in the file, you must iterate over the parser
    as if it were a list. The iterator yields `Stanza` objects.
    """
    # Parser state
    self.lineno = 0
    self.headers = {}
    self._extra_line = None
    # Matches "tag: value" lines
    self.line_re = re.compile(r"\s*(?P<tag>[^:]+):\s*(?P<value>.*)")

    self.file_handle = open_anything(file_handle)
    # The headers are consumed right away; this must come last because
    # the state above has to exist before reading starts
    self._read_headers()
def _read_goa_file(self, goa_file):
    """Reads the GOA file and returns a dict mapping each protein ID to
    a set of strings: the up-propagated GO terms (as returned by
    ``self.go_tree.ancestors``) of an accepted annotation.

    Lines starting with ``!`` or ``#`` are skipped. Only annotations
    whose evidence code (7th tab-separated column) is in
    ``self.valid_ev_codes`` are kept.

    Side effects: ``self.go_terms_gs`` becomes the union of all the
    terms seen, ``self.go_terms_gs_ont`` maps each ontology in
    ``self.terms_per_ontology`` to the subset of those terms that belong
    to it, and one summary line per ontology is printed.
    """
    d = {}
    self.go_terms_gs = set()
    for line in open_anything(goa_file):
        # Skip both kinds of comment lines; the previous condition
        # (``not startswith("!") or startswith("#")``) let "#" lines
        # reach the column parser.
        if line.startswith(("!", "#")):
            continue

        # split line, obtain protein_id, go_term, and ev_code
        fields = line.split("\t")
        prot_id, goterm, evcode = fields[1], fields[4], fields[6]
        if evcode not in self.valid_ev_codes:
            continue

        # A real set, as promised by the docstring. A bare ``map``
        # result is a list on Python 2 and a one-shot iterator on
        # Python 3, where it would be exhausted by ``update`` below.
        terms = set(map(str, self.go_tree.ancestors(goterm)))
        # NOTE(review): a later line for the same protein replaces the
        # earlier terms instead of extending them -- confirm.
        d[prot_id] = terms
        self.go_terms_gs.update(terms)

    self.go_terms_gs_ont = {}
    for ontology, terms in self.terms_per_ontology.items():
        gs_terms_o = self.go_terms_gs & terms
        self.go_terms_gs_ont[ontology] = gs_terms_o
        # Joining with single spaces reproduces the output of the old
        # comma-separated print statement and is valid on Python 3 too.
        print(" ".join(["# of Terms in ", str(ontology),
                        " in GOA file: ", str(len(gs_terms_o))]))
    return d
def process_file(self, filename):
    """Processes the input file with the given filename and prints the
    Jaccard similarity of the neighbour sets of the IDs in it.

    Each non-blank line must contain at least two whitespace-separated
    IDs; the first two are treated as an undirected edge. If
    ``self.options.add_loops`` is set, every ID is also considered its
    own neighbour. Pairs are compared among linked IDs only when
    ``self.options.only_linked`` is set, otherwise among all IDs. Each
    pair ``id1 <= id2`` whose similarity is at least
    ``self.options.min_similarity`` is printed as ``id1``, ``id2`` and
    the similarity, separated by tabs.

    Raises ``ValueError`` if a line contains only a single ID; the
    message contains the 1-based number of that line.
    """
    self.log.info("Processing %s...", filename)

    infile = open_anything(filename)
    neis = defaultdict(set)
    # Count lines from 1 so that the number in the error message is the
    # real line number (it used to be 0-based).
    for line_no, line in enumerate(infile, 1):
        parts = line.strip().split()
        if not parts:
            continue
        if len(parts) < 2:
            raise ValueError("line %d contains only a single ID" % line_no)
        neis[parts[0]].add(parts[1])
        neis[parts[1]].add(parts[0])

    if self.options.add_loops:
        for k, vals in neis.items():
            vals.add(k)

    all_ids = sorted(neis)
    for id1 in all_ids:
        neis1 = neis[id1]
        # float() keeps the division below a true division everywhere
        len1 = float(len(neis1))
        if self.options.only_linked:
            others = sorted(neis1)
        else:
            others = all_ids
        for id2 in others:
            # Each unordered pair is reported only once
            if id2 < id1:
                continue
            neis2 = neis[id2]
            isect = len(neis2.intersection(neis1))
            # |A & B| / |A | B|
            sim = isect / (len1+len(neis2)-isect)
            if sim < self.options.min_similarity:
                continue
            print("%s\t%s\t%.8f" % (id1, id2, sim))
def process_file(self, filename):
    """Processes the input file with the given filename and prints the
    Jaccard similarity of the neighbour sets of the IDs in it.

    Each non-blank line must contain at least two whitespace-separated
    IDs; the first two are treated as an undirected edge. If
    ``self.options.add_loops`` is set, every ID is also considered its
    own neighbour. Pairs are compared among linked IDs only when
    ``self.options.only_linked`` is set, otherwise among all IDs. Each
    pair ``id1 <= id2`` whose similarity is at least
    ``self.options.min_similarity`` is printed as ``id1``, ``id2`` and
    the similarity, separated by tabs.

    Raises ``ValueError`` if a line contains only a single ID; the
    message contains the 1-based number of that line.
    """
    self.log.info("Processing %s...", filename)

    infile = open_anything(filename)
    neis = defaultdict(set)
    # Count lines from 1 so that the number in the error message is the
    # real line number (it used to be 0-based).
    for line_no, line in enumerate(infile, 1):
        parts = line.strip().split()
        if not parts:
            continue
        if len(parts) < 2:
            raise ValueError("line %d contains only a single ID" % line_no)
        neis[parts[0]].add(parts[1])
        neis[parts[1]].add(parts[0])

    if self.options.add_loops:
        # items() exists on both Python 2 and 3; iteritems() does not
        for k, v in neis.items():
            v.add(k)

    all_ids = sorted(neis)
    for id1 in all_ids:
        neis1 = neis[id1]
        # float() keeps the division below a true division on Python 2
        len1 = float(len(neis1))
        if self.options.only_linked:
            others = sorted(neis1)
        else:
            others = all_ids
        for id2 in others:
            # Each unordered pair is reported only once
            if id2 < id1:
                continue
            neis2 = neis[id2]
            isect = len(neis2.intersection(neis1))
            # |A & B| / |A | B|
            sim = isect / (len1+len(neis2)-isect)
            if sim < self.options.min_similarity:
                continue
            # Single-argument print call: same output on Python 2 and 3
            print("%s\t%s\t%.8f" % (id1, id2, sim))
def load_sequences_from_file(self, fname):
    """Loads the sequences from the file `fname`, which must be in
    FASTA format, and returns whatever ``self.load_sequences`` returns
    for them.

    `fname` is opened with ``open_anything``; you are allowed to pass
    file pointers or names of gzipped/bzipped files here."""
    handle = open_anything(fname)
    parser = fasta.Parser(handle)
    return self.load_sequences(parser)
def process_file(self, input_file):
    """Processes the given input file that contains the domain
    architectures.

    Each line is tab-separated; the first column is the gene ID and the
    fourth column is the architecture (domains delimited by ``;``,
    ``{`` or ``}``). Every distinct architecture is tested once with an
    ``OverrepresentationAnalyser``.

    Output:

    - If ``self.options.arch_file`` is set, the results of each distinct
      architecture are written to that file. With
      ``self.options.ignore`` set, the analysis runs with an infinite
      confidence threshold, the raw results go to
      ``<arch_file>_unfiltered`` and a ``ResultFileFilter`` then writes
      the entries passing ``self.options.confidence`` to ``arch_file``.
    - If ``self.options.results_by_protein`` is set, the results are
      also printed for each gene ID.

    Sequences whose architecture is ``NO_ASSIGNMENT`` are only counted.
    Summary counts are logged at the end.
    """
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  self.options.confidence, self.options.correction)

    confidence = self.options.confidence
    if self.options.ignore:
        confidence = float("inf")
        self.log.info("Ignored the significance value."
                      " We will filter results later.")

    overrep = OverrepresentationAnalyser(
        self.go_tree, self.go_mapping,
        confidence=confidence,
        min_count=self.options.min_size,
        correction=self.options.correction)

    arch_file = None
    arch_file_name = None
    if self.options.arch_file:
        arch_file_name = self.options.arch_file
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        arch_file = open(arch_file_name, "w")

    cache = {}
    num_no_annotations = 0
    num_no_domains = 0
    total_seqs = 0

    # try/finally makes sure that the architecture file is closed even
    # if parsing or the analysis fails half-way.
    try:
        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple([x for x in prts if x])

            total_seqs += 1
            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                # Each architecture is written only once, when it is
                # seen for the first time
                if arch_file is not None:
                    arch_file.write("{}\n".format(parts[3]))
                    for term, p_value in cache[arch]:
                        entry = " %.4f: %s (%s)\n" % (p_value, term.id,
                                                      term.name)
                        arch_file.write(entry)
                    arch_file.write("\n")

            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print(" %.4f: %s (%s)" % (p_value, term.id,
                                              term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1
    finally:
        if arch_file is not None:
            arch_file.close()

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        # The message used to be a string literal broken across two
        # physical lines
        self.log.info("%d sequences have no overrepresented annots. :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)

    if arch_file is not None and self.options.ignore:
        # we filter the file with the significance value
        filterer = ResultFileFilter(arch_file_name)
        filterer.filter(self.options.arch_file,
                        confidence=self.options.confidence)
def process_sequences_file(self, seq_file):
    """Processes the sequences one by one, extracting all the pieces
    into an output fasta file.

    For every sequence of `seq_file` whose ID is in ``self.parts``, each
    requested ``(left, right)`` fragment is written to the standard
    output and, if ``self.output_file`` is not ``None``, to that file as
    well. Positions are 1-based and inclusive; negative positions count
    from the end of the sequence and the right limit is clamped to the
    sequence length. If ``self.options.try_alternative_splicing`` is set
    and the ID is unknown, the ID without its ``.1`` suffix is tried.

    Unless ``self.options.keep_ids`` is set, fragments that do not cover
    the whole sequence are named ``ID:left-right``.

    Returns 1 if some IDs of ``self.parts`` were not found in the
    sequence file, 0 otherwise.
    """
    self.log.info("Processing fasta file %s...", seq_file)
    parser = fasta.Parser(open_anything(seq_file))
    parser = fasta.regexp_remapper(parser,
                                   self.options.sequence_id_regexp)

    ids_to_process = set(self.parts.keys())

    writer = fasta.FastWriter(sys.stdout)
    output_fd = None
    writer_file = None
    if self.output_file is not None:
        output_fd = open(self.output_file, "w")
        writer_file = fasta.FastWriter(output_fd)

    # try/finally closes the output file even if a sequence cannot be
    # processed.
    try:
        for seq in parser:
            seq_id = seq.id
            if seq_id not in self.parts:
                if not self.options.try_alternative_splicing:
                    continue
                # Remove the literal ".1" suffix only. rstrip(".1")
                # strips *any* run of trailing "." and "1" characters,
                # which mangled IDs such as "AT1G01011.1".
                seq_id = seq_id.strip()
                if seq_id.endswith(".1"):
                    seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue

            sequence = seq.seq
            length_seq = len(sequence)
            # discard() instead of remove(): the same ID may be reached
            # more than once (e.g. both "X" and "X.1" are present)
            ids_to_process.discard(seq_id)

            for left, right in self.parts[seq_id]:
                if left < 0:
                    left = length_seq + left + 1
                if right < 0:
                    right = length_seq + right + 1
                right = min(right, length_seq)  # just in case...
                if left > right:  # again, just in case
                    self.log.warning(
                        "Problem with fragment of %s, "
                        "the right part is smaller than "
                        "the left", seq_id)
                    continue

                if left == 1 and right == length_seq:
                    # The fragment is the whole sequence
                    new_record = seq.fragment(not self.options.keep_ids)
                else:
                    if not self.options.keep_ids:
                        new_id = "%s:%d-%d" % (seq_id, left, right)
                    else:
                        new_id = seq_id
                    new_record = SeqRecord(sequence[(left - 1):right],
                                           id=new_id,
                                           name=seq.name,
                                           description="")
                writer.write(new_record)
                if writer_file is not None:
                    writer_file.write(new_record)
    finally:
        if output_fd is not None:
            output_fd.close()

    if ids_to_process:
        # A space was missing between "were" and "found"
        self.log.fatal(
            "The following identifiers of sequences (%s) were "
            "found in the fragments file, but not in the "
            "fasta file ", ",".join(ids_to_process))
        return 1
    return 0
def process_sequences_file(self, seq_file):
    """Processes the sequences one by one, extracting all the pieces
    into an output fasta file.

    For every sequence of `seq_file` whose ID is in ``self.parts``, each
    requested ``(left, right)`` fragment is written to the standard
    output and, if ``self.output_file`` is not ``None``, to that file as
    well. Positions are 1-based and inclusive; negative positions count
    from the end of the sequence and the right limit is clamped to the
    sequence length. If ``self.options.try_alternative_splicing`` is set
    and the ID is unknown, the ID without its ``.1`` suffix is tried.

    Unless ``self.options.keep_ids`` is set, fragments that do not cover
    the whole sequence are named ``ID:left-right``.

    Returns 1 if some IDs of ``self.parts`` were not found in the
    sequence file, ``None`` otherwise.
    """
    self.log.info("Processing fasta file %s..." % seq_file)
    parser = fasta.Parser(open_anything(seq_file))
    parser = fasta.regexp_remapper(parser,
                                   self.options.sequence_id_regexp)

    ids_to_process = set(self.parts.keys())

    writer = fasta.FastWriter(sys.stdout)
    output_fd = None
    writer_file = None
    if self.output_file is not None:
        output_fd = open(self.output_file, "w")
        writer_file = fasta.FastWriter(output_fd)

    # try/finally closes the output file even if a sequence cannot be
    # processed.
    try:
        for seq in parser:
            seq_id = seq.id
            if seq_id not in self.parts:
                if not self.options.try_alternative_splicing:
                    continue
                # Remove the literal ".1" suffix only. rstrip(".1")
                # strips *any* run of trailing "." and "1" characters,
                # which mangled IDs such as "AT1G01011.1".
                seq_id = seq_id.strip()
                if seq_id.endswith(".1"):
                    seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue

            sequence = seq.seq
            length_seq = len(sequence)
            # discard() instead of remove(): the same ID may be reached
            # more than once (e.g. both "X" and "X.1" are present)
            ids_to_process.discard(seq_id)

            for left, right in self.parts[seq_id]:
                if left < 0:
                    left = length_seq + left + 1
                if right < 0:
                    right = length_seq + right + 1
                right = min(right, length_seq)  # just in case...
                if left > right:  # again, just in case
                    self.log.warning("Problem with fragment of %s, "
                                     "the right part is smaller than the left"
                                     % seq_id)
                    continue

                if left == 1 and right == length_seq:
                    # The fragment is the whole sequence
                    new_record = seq.fragment(not self.options.keep_ids)
                else:
                    if not self.options.keep_ids:
                        new_id = "%s:%d-%d" % (seq_id, left, right)
                    else:
                        new_id = seq_id
                    new_record = SeqRecord(sequence[(left-1):right],
                                           id=new_id, name=seq.name,
                                           description="")
                writer.write(new_record)
                if writer_file is not None:
                    writer_file.write(new_record)
    finally:
        if output_fd is not None:
            output_fd.close()

    if ids_to_process:
        # A space was missing between "were" and "found"
        self.log.fatal("The following identifiers of sequences (%s) were "
                       "found in the fragments file, but not in the fasta file"
                       % ",".join(ids_to_process))
        return 1
def process_file(self, input_file):
    """Processes the given input file that contains the domain
    architectures.

    The gene ID is read from the first tab-separated column and the
    architecture from the fourth one (domains delimited by ``;``, ``{``
    or ``}``). Each distinct architecture is tested once with an
    ``OverrepresentationAnalyser``.

    If ``self.options.arch_file`` is set, the results of each distinct
    architecture are written there; when ``self.options.ignore`` is
    also set, the analysis runs with an infinite confidence threshold,
    the raw results are written to ``<arch_file>_unfiltered`` and a
    ``ResultFileFilter`` finally produces ``arch_file`` using
    ``self.options.confidence``. If ``self.options.results_by_protein``
    is set, the results are printed for every gene ID as well.

    ``NO_ASSIGNMENT`` architectures are only counted; summary counts
    are logged at the end.
    """
    options = self.options
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  options.confidence, options.correction)

    confidence = options.confidence
    if options.ignore:
        confidence = float("inf")
        self.log.info("Ignored the significance value."
                      " We will filter results later.")

    overrep = OverrepresentationAnalyser(
        self.go_tree, self.go_mapping,
        confidence=confidence,
        min_count=options.min_size,
        correction=options.correction)

    arch_file = None
    arch_file_name = None
    if options.arch_file:
        arch_file_name = options.arch_file
        if options.ignore:
            arch_file_name += "_unfiltered"
        arch_file = open(arch_file_name, "w")

    cache = {}
    num_no_annotations = 0
    num_no_domains = 0
    total_seqs = 0

    # The architecture file must be closed even when something fails
    # while the input is being processed.
    try:
        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple([x for x in prts if x])

            total_seqs += 1
            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                # Written once per distinct architecture
                if arch_file is not None:
                    arch_file.write("{}\n".format(parts[3]))
                    arch_file.writelines(
                        " %.4f: %s (%s)\n" % (p_value, term.id, term.name)
                        for term, p_value in cache[arch])
                    arch_file.write("\n")

            if options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print(" %.4f: %s (%s)" % (p_value, term.id,
                                              term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1
    finally:
        if arch_file is not None:
            arch_file.close()

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        # The message used to be a string literal broken across two
        # physical lines
        self.log.info("%d sequences have no overrepresented annots. :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)

    if arch_file is not None and options.ignore:
        # we filter the file with the significance value
        filterer = ResultFileFilter(arch_file_name)
        filterer.filter(options.arch_file,
                        confidence=options.confidence)
def process_file(self, filename, filter):
    """Runs the lines of `filename` through ``self.process_lines``
    together with the given `filter` and writes every line it yields
    to the standard output."""
    self.log.info("Processing %s..." % filename)
    source = open_anything(filename)
    for output_line in self.process_lines(source, filter):
        sys.stdout.write(output_line)
def __init__(self, filename):
    """Opens `filename` with ``open_anything`` and keeps the result in
    ``self._fp``."""
    handle = open_anything(filename)
    self._fp = handle