def _transfer_from_same_file(self, goa, arch_file):
    """Transfer function between proteins that share an architecture.

    Every ``(architecture, proteins)`` pair produced by ``ArchReader`` for
    *arch_file* is handed to an ``OverrepresentationAnalyser`` built on
    *goa*.  The terms returned by ``test_group`` are written per
    architecture to ``options.results_by_arch`` and/or printed per protein
    on standard output, depending on which of the two options is set.

    Args:
        goa: annotation object passed to ``OverrepresentationAnalyser``;
            the keys of ``goa.left`` are treated as the annotated proteins.
        arch_file: architecture file passed to ``ArchReader``.
    """
    confidence = self.options.max_pvalue
    if self.options.ignore:
        confidence = float("inf")
        self.log.info("Ignored the significance value."
                      " We will filter results later.")
    ora = OverrepresentationAnalyser(self.go_tree, goa,
                                     confidence=confidence, min_count=1,
                                     correction=self.options.correction)
    # The option is a percentage; ArchReader receives it as a fraction.
    cov = self.options.minimum_coverage / 100.0
    self.log.info("Transferring function from same file. Min coverage=%s",
                  str(cov))
    all_annotated = frozenset(goa.left.keys())

    out = None
    if self.options.results_by_arch:
        arch_file_name = self.options.results_by_arch
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        out = open(arch_file_name, "w")

    term_fmt = " %.4f: %s (%s)"
    try:
        # for each architecture and its associated proteins...
        self.log.info("Arch file: %s", arch_file)
        for arch, prots in ArchReader(arch_file, cov):
            if not prots or arch == "NO_ASSIGNMENT":
                # if there is no annotation for proteins in the arch...
                # "\n" (not os.linesep) is used throughout: text-mode
                # streams already translate it to the platform separator.
                if self.options.results_by_protein:
                    print("{}\n".format("\n\n".join(prots)))
                if out is not None:
                    out.write("{}\n\n".format(arch))
                continue

            targets = set(prots)
            annotated_prots = targets & all_annotated
            lines = "\n".join(
                term_fmt % (p_value, term.id, term.name)
                for term, p_value in ora.test_group(targets)
            )
            if out is not None:
                out.write("{}\n{}\n\n".format(arch, lines))

            if self.options.results_by_protein:
                # Leave-one-out: combinations() of size n-1 drops the last
                # element first and the first element last, so pairing it
                # with reversed(prots) yields (everything-but-prot, prot).
                leave_one_out = i_zip(combinations(prots, len(prots) - 1),
                                      reversed(prots))
                for rest, prot in leave_one_out:
                    print(prot)
                    if prot in annotated_prots:
                        # An annotated protein must not vote for itself.
                        for term, p_value in ora.test_group(rest):
                            print(term_fmt % (p_value, term.id, term.name))
                    else:
                        print(lines)
                    print()
    finally:
        # Close the per-architecture file even if the analysis fails.
        if out is not None:
            out.close()

    if out is not None:
        self._filter_arch_file(arch_file_name, self.options.results_by_arch)
def _transfer_from_other_file(self, goa, arch_target, arch_source):
    """Transfer function from a separate source architecture file.

    ``test_group`` is run once for every architecture of *arch_source*
    (except ``NO_ASSIGNMENT``).  Those per-architecture results are then
    written to ``options.results_by_arch`` and/or printed for every
    protein read from *arch_target*, depending on the options set.

    Args:
        goa: annotation object passed to ``OverrepresentationAnalyser``.
        arch_target: architecture file whose proteins receive the results.
        arch_source: architecture file the results are computed from.
    """
    pvalue_cutoff = self.options.max_pvalue
    if self.options.ignore:
        pvalue_cutoff = float("inf")
        self.log.info(
            "Ignored the significance value. We will filter results later.")
    analyser = OverrepresentationAnalyser(
        self.go_tree, goa,
        confidence=pvalue_cutoff, min_count=1, correction='None')
    # The option is a percentage; ArchReader receives it as a fraction.
    coverage = self.options.minimum_coverage / 100.0

    # Results per source architecture, keyed by architecture.
    terms_by_arch = {
        source_arch: analyser.test_group(members)
        for source_arch, members in ArchReader(arch_source, coverage)
        if source_arch != "NO_ASSIGNMENT"
    }
    term_fmt = " %.4f: %s (%s)"

    if self.options.results_by_arch:
        arch_file_name = self.options.results_by_arch
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        with open(arch_file_name, "w") as handle:
            for source_arch in sorted(terms_by_arch):
                handle.write(source_arch + "\n")
                for term, p_value in terms_by_arch[source_arch]:
                    handle.write(term_fmt % (p_value, term.id, term.name))
                    handle.write("\n")
                # Blank line terminates the record of this architecture.
                handle.write("\n")

    if self.options.results_by_protein:
        for target_arch, members in ArchReader(arch_target, coverage):
            # Architectures unseen in the source simply get no terms.
            found = terms_by_arch.get(target_arch, ())
            for protein in members:
                print(protein)
                for term, p_value in found:
                    print(term_fmt % (p_value, term.id, term.name))
                print()

    if self.options.results_by_arch:
        self._filter_arch_file(arch_file_name, self.options.results_by_arch)
def _transfer_from_both(self, goa, arch_target, arch_source):
    """Transfer from both an external file and the same file.

    A single GOA object is used for both.  For every architecture of
    *arch_target*, its proteins are pooled with the proteins listed for
    the same architecture in *arch_source* (if any) and the pooled group
    is passed to ``test_group``.  Results are written per architecture to
    ``options.results_by_arch`` and/or printed per target protein,
    depending on the options set.

    Args:
        goa: annotation object passed to ``OverrepresentationAnalyser``;
            the keys of ``goa.left`` are treated as the annotated proteins.
        arch_target: architecture file whose proteins receive the results
            (and also contribute to the pooled group).
        arch_source: additional architecture file contributing proteins.
    """
    confidence = self.options.max_pvalue
    if self.options.ignore:
        confidence = float("inf")
        self.log.info("Ignored the significance value. "
                      "We will filter results later.")
    ora = OverrepresentationAnalyser(self.go_tree, goa,
                                     confidence=confidence, min_count=1,
                                     correction='None')
    # The option is a percentage; ArchReader receives it as a fraction.
    cov = self.options.minimum_coverage / 100.0
    self.log.info("Transferring function from both files."
                  " Min coverage=%s", str(cov))
    all_annotated = frozenset(goa.left.keys())  # all annotated proteins

    out = None
    if self.options.results_by_arch:
        arch_file_name = self.options.results_by_arch
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        out = open(arch_file_name, "w")

    term_fmt = " %.4f: %s (%s)"
    try:
        self.log.info("\t Source architecture: %s", arch_source)
        self.log.info("\t Target (and source, as well) architecture: %s",
                      arch_target)
        prots_per_arch = {
            arch: prots for arch, prots in ArchReader(arch_source, cov)
        }

        for arch, prots in ArchReader(arch_target, cov):
            other_prots = prots_per_arch.get(arch, ())
            targets = set(other_prots) | set(prots)
            annotated_prots = targets & all_annotated

            if not annotated_prots or arch == "NO_ASSIGNMENT":
                # if there is no annotation for proteins in the arch...
                # "\n" (not os.linesep) is used throughout: text-mode
                # streams already translate it to the platform separator.
                if self.options.results_by_protein:
                    print("\n\n".join(prots))
                    print()
                if out is not None:
                    out.write(arch + "\n\n")
                continue

            lines = "\n".join(
                term_fmt % (p_value, term.id, term.name)
                for term, p_value in ora.test_group(targets)
            )
            if out is not None:
                # Each record ends with a blank line, matching the
                # no-annotation branch above.
                out.write(arch + "\n" + lines + "\n\n")

            if self.options.results_by_protein:
                for prot in prots:
                    print(prot)
                    if prot in annotated_prots:
                        # An annotated protein must not vote for itself.
                        grp = targets - {prot}
                        for term, p_value in ora.test_group(grp):
                            print(term_fmt % (p_value, term.id, term.name))
                    else:
                        print(lines)
                    print()
    finally:
        # Close the per-architecture file even if the analysis fails.
        if out is not None:
            out.close()

    if out is not None:
        self._filter_arch_file(arch_file_name, self.options.results_by_arch)
def process_file(self, input_file):
    """Processes the given input file that contains the domain architectures.

    Each non-blank line is split on tabs: field 0 is used as the gene id
    and field 3 as the architecture string, whose parts are separated by
    ``;``, ``{`` or ``}``.  ``test_group`` is called once per distinct
    architecture and cached.  Results are printed per gene when
    ``options.results_by_protein`` is set and written once per distinct
    architecture when ``options.arch_file`` is set.  With
    ``options.ignore`` the analysis runs without a significance cutoff,
    the file is written as ``<arch_file>_unfiltered`` and is afterwards
    passed through ``ResultFileFilter`` to produce ``options.arch_file``.

    Args:
        input_file: input passed to ``open_anything`` and iterated line
            by line.

    Raises:
        IndexError: if a non-blank line has fewer than four tab-separated
            fields.
    """
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  self.options.confidence, self.options.correction)

    arch_file = None
    if self.options.arch_file:
        arch_file_name = self.options.arch_file
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        arch_file = open(arch_file_name, "w")

    try:
        confidence = self.options.confidence
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")
        overrep = OverrepresentationAnalyser(
            self.go_tree, self.go_mapping,
            confidence=confidence,
            min_count=self.options.min_size,
            correction=self.options.correction)

        cache = {}  # architecture tuple -> test_group() result
        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for raw_line in open_anything(input_file):
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            parts = stripped.split("\t")
            gene_id = parts[0]
            arch_field = parts[3]
            pieces = arch_field.replace("{", ";").replace("}", ";").split(";")
            arch = tuple(piece for piece in pieces if piece)
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                # Written on first sight only, so the file holds one
                # record per distinct architecture.
                if arch_file is not None:
                    arch_file.write("{}\n".format(arch_field))  # architecture
                    for term, p_value in cache[arch]:
                        arch_file.write(" %.4f: %s (%s)\n"
                                        % (p_value, term.id, term.name))
                    arch_file.write("\n")

            results = cache[arch]
            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in results:
                    print(" %.4f: %s (%s)" % (p_value, term.id, term.name))
                print()

            if not results:
                num_no_annotations += 1
    finally:
        # Close the architecture file even if the analysis fails.
        if arch_file is not None:
            arch_file.close()

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        self.log.info("%d sequences have no overrepresented annots. :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)

    if arch_file is not None and self.options.ignore:
        # we filter the file with the significance value
        filterer = ResultFileFilter(arch_file_name)
        filterer.filter(self.options.arch_file,
                        confidence=self.options.confidence)