def predict_orthologs(self, files, workdir, speciestree_file):
    """Run OMA on the given sequence files and record where its output lives.

    Steps performed:
      1. Regroup the input sequences into one FASTA file per species under
         ``<workdir>/DB`` (OMA requires one file per species).
      2. If a species tree is given, restrict it to the species present by
         calling the external ``OCR -m restrict_species_tree`` command.
      3. If the restricted tree holds a single species, skip OMA and write
         one cluster per gene with every gene pair marked ``Paralogs``.
      4. Otherwise generate ``parameters.drw`` with ``OMA -p``, patch the
         ``InputDataType`` / ``SpeciesTree`` entries, run ``OMA -n 7`` inside
         ``workdir`` and hand the result to ``self.parse_groups`` and
         ``self.parse_relations``.

    Args:
        files: sequence files, forwarded to
            ``Sequence.combineSequenceFilesBySpecies``.
        workdir: directory in which OMA is run and results are written.
        speciestree_file: path to a Newick species tree, or ``""`` for none.

    Options read from ``self.other_args``: ``species_separator``,
    ``species_index`` (required); ``use_cache``, ``species_prefix``,
    ``convertToAA``, ``seqtype`` (optional).

    Side effects:
        Sets ``self.clusters_filenames`` to ``[<workdir>/oma.clusters]`` and
        ``self.relations_filenames`` to ``[<workdir>/oma.relations]``.
    """
    from itertools import combinations

    # OMA itself is skipped when a previous output exists and caching is on.
    runOMA = True
    if (os.path.isfile(join(workdir, "Output/OrthologousGroups.txt"))
            and "use_cache" in self.other_args):
        print("Will use cached output files from oma")
        runOMA = False

    # Quite a few preprocessing steps are needed for OMA.
    # First, create the directories and files it requires.
    dbdir = join(workdir, "DB")
    if not os.path.exists(dbdir):
        os.mkdir(dbdir)

    # OMA requires one file per species, so split the input accordingly.
    seqs_by_species = Sequence.combineSequenceFilesBySpecies(
        files, self.other_args["species_separator"],
        int(self.other_args["species_index"]))

    if speciestree_file == "":
        print("OMA is unpredicatable without a species tree.")

    spprefix = ""
    if "species_prefix" in self.other_args:
        spprefix = self.other_args["species_prefix"]
    doAA = "convertToAA" in self.other_args

    species_names = []
    genes_list = []
    for key in seqs_by_species:
        outfile = join(dbdir, spprefix + key + ".fa")
        Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                        filename=outfile,
                                        name_suffix="",
                                        aligned=False,
                                        convertToAA=doAA,
                                        name_prefix=spprefix)
        species_names.append(key)
        genes_list.extend(seq.name for seq in seqs_by_species[key])
    species_list = ",".join(species_names)

    # If a species tree is used, OMA requires removing the species that do
    # not appear in the files; the external OCR tool does the restriction.
    # Stays None when no species tree was supplied.
    speciestree_newick_restricted = None
    if speciestree_file != "":
        oma_sptree_file = join(workdir, "oma_species_tree.nw")
        with open(speciestree_file, 'r') as myfile:
            speciestree_newick = myfile.read().replace('\n', '')

        cmd = ("OCR -m restrict_species_tree -l \"" + species_list
               + "\" -s \"" + speciestree_newick
               + "\" -o \"" + oma_sptree_file + "\"")
        print("EXEC " + cmd)
        os.system(cmd)

        with open(oma_sptree_file) as f:
            speciestree_newick_restricted = f.readline().replace("\n", "")

    # Special case: with only one species present, OMA makes an error.
    # In that case, 1 gene = 1 cluster and every pair of genes is paralogous.
    # The test relies on the restricted tree, so it is skipped (rather than
    # touching an unbound name) when no species tree was supplied.
    if (speciestree_newick_restricted is not None
            and "," not in speciestree_newick_restricted):
        print("Only one species found. Will make all genes paralogs.")
        self.clusters_filenames = [join(workdir, "oma.clusters")]
        self.relations_filenames = [join(workdir, "oma.relations")]

        write_clusters(self.clusters_filenames[0],
                       [[g] for g in genes_list])

        # Records are ";;"-separated, with no trailing separator.
        relations = [g1 + "\t" + g2 + "\t" + "Paralogs"
                     for g1, g2 in combinations(genes_list, 2)]
        with open(self.relations_filenames[0], 'w') as f:
            f.write(";;".join(relations))
        return

    input_type = "DNA"
    if "seqtype" in self.other_args and self.other_args["seqtype"] == "AA":
        print("Input type set to AA")
        input_type = self.other_args["seqtype"]

    # Now get into the OMA directory and execute it there.  The finally
    # clause guarantees the caller's working directory is restored even if
    # one of the steps below raises.
    cwd = os.getcwd()
    os.chdir(workdir)
    try:
        # Drop any stale config so that "OMA -p" writes a fresh default one.
        if os.path.isfile("parameters.drw"):
            os.remove("parameters.drw")
        os.system("OMA -p")  # this creates a default config file

        # Edit the config: set the input data type and give the species
        # tree restricted to the genes at hand.
        cfg_lines = []
        with open("parameters.drw") as f:
            for line in f:
                line = line.replace("\n", "")
                if line.startswith("InputDataType"):
                    line = "InputDataType := '" + input_type + "';"
                elif (line.startswith("SpeciesTree")
                      and speciestree_file != ""):
                    line = ("SpeciesTree := '"
                            + speciestree_newick_restricted + "';")
                cfg_lines.append(line + "\n")

        with open("parameters.drw", 'w') as f:
            f.write("".join(cfg_lines))
        print("Config edited")

        cmd = "OMA -n 7"
        print("EXEC " + cmd)
        if runOMA:
            os.system(cmd)
        else:
            print("Not really - using cache instead.")
    finally:
        os.chdir(cwd)

    # Now that the inference is done, translate the OMA output into our
    # format: an oma.clusters file and an oma.relations file.
    self.parse_groups(genes_list, workdir)
    self.parse_relations(genes_list, workdir)

    self.clusters_filenames = [join(workdir, "oma.clusters")]
    self.relations_filenames = [join(workdir, "oma.relations")]
def predict_orthologs(self, files, workdir, speciestree_file):
    """Cluster the input sequences with the OrthoMCL pipeline.

    Unless ``self.cached_clusters`` is already populated, the sequences are
    regrouped into one FASTA file per species under ``<workdir>/in``, the
    external ``orthomcl-pipeline`` script is run with ``<workdir>/out`` as
    its output directory, and ``<workdir>/out/groups/groups.txt`` is parsed
    into sets of gene names.  Every sequence that appears in no group
    becomes a singleton cluster.  The clusters are then stored in
    ``self.cached_clusters`` for later calls.

    Two files are written:
      * ``<workdir>/orthomcl.clusters`` via ``write_clusters``;
      * ``<workdir>/orthomcl.relations``: ``";;"``-separated records of the
        form ``gene1<TAB>gene2<TAB>relation``.  Genes sharing a cluster are
        ``Orthologs`` when their species differ and ``Paralogs`` otherwise;
        every remaining pair of input sequences is ``Paralogs``.

    Args:
        files: sequence files to read.
        workdir: directory receiving the intermediate and result files.
        speciestree_file: not used by this predictor.

    Options read from ``self.other_args``: ``species_separator``,
    ``species_index``, and optionally ``seqtype``.

    Side effects:
        Sets ``self.clusters_filenames``, ``self.relations_filenames`` and
        possibly ``self.cached_clusters``.
    """
    from itertools import combinations

    workdir_seqs = join(workdir, "in")
    workdir_out = join(workdir, "out")

    allseqs = []
    for seqfile in files:
        allseqs.extend(Sequence.readSequences(seqfile))

    if not self.cached_clusters:
        # One file per species, as the pipeline expects.
        seqs_by_species = Sequence.combineSequenceFilesBySpecies(
            files, self.other_args["species_separator"],
            int(self.other_args["species_index"]))

        isdna = True
        if ("seqtype" in self.other_args
                and self.other_args["seqtype"] == "AA"):
            print("Input type set to AA")
            isdna = False

        if not os.path.exists(workdir_seqs):
            os.mkdir(workdir_seqs)

        for key in seqs_by_species:
            outfile = join(workdir_seqs, key + ".fasta")
            # DNA input is translated to amino acids on output.
            Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                            filename=outfile,
                                            name_suffix="",
                                            aligned=False,
                                            convertToAA=isdna,
                                            name_prefix="")

        cmd = "/u/lafonman/src/orthomcl-pipeline/bin/orthomcl-pipeline -i " + workdir_seqs + " -o " + workdir_out + " -m /u/lafonman/src/orthomcl-pipeline/orthomcl.conf --nocompliant --yes"
        print("EXEC " + cmd)
        os.system(cmd)

        # Each non-empty line of groups.txt is parsed as
        # "<group>: <x>|<gene> <x>|<gene> ...": the text after the first
        # ':' is split on whitespace and the gene name is what follows '|'.
        seen_genes = set()
        clusters = []
        clfile = join(workdir_out, "groups/groups.txt")
        with open(clfile, 'r') as f:
            for line in f:
                line = line.replace("\n", "")
                if line != "":
                    members = line.split(":")[1].split()
                    cluster = {m.split("|")[1] for m in members}
                    seen_genes.update(cluster)
                    clusters.append(cluster)

        # Genes that were placed in no group become singleton clusters.
        for s in allseqs:
            if s.name not in seen_genes:
                clusters.append({s.name})

        self.cached_clusters = clusters
    else:
        print("USING CACHED CLUSTERS")
        clusters = self.cached_clusters

    # NOTE: clusters are written as-is; the restriction of clusters to the
    # current gene family that used to live here is disabled.
    self.clusters_filenames = [join(workdir, "orthomcl.clusters")]
    self.relations_filenames = [join(workdir, "orthomcl.relations")]

    write_clusters(self.clusters_filenames[0], clusters)

    def species_of(gene_name):
        # The species is one field of the gene name, selected by the
        # configured separator and index.
        return gene_name.split(self.other_args["species_separator"])[int(
            self.other_args["species_index"])]

    # Output relations.  Records are gathered in a list and joined once, so
    # that exactly one ";;" sits between consecutive records and none trails
    # the last one, whichever of the two loops produced it.
    relations = []
    seen_pairs = set()  # holds both orientations of every clustered pair
    for cluster in clusters:
        for c1, c2 in combinations(cluster, 2):
            if c1 == c2 or (c1, c2) in seen_pairs:
                continue
            seen_pairs.add((c1, c2))
            seen_pairs.add((c2, c1))
            if species_of(c1) != species_of(c2):
                relation = "Orthologs"
            else:
                relation = "Paralogs"
            relations.append(c1 + "\t" + c2 + "\t" + relation)

    # Every pair of sequences not sharing a cluster is reported as paralogs.
    for s1, s2 in combinations(allseqs, 2):
        if (s1.name, s2.name) not in seen_pairs:
            relations.append(s1.name + "\t" + s2.name + "\t" + "Paralogs")

    with open(self.relations_filenames[0], 'w') as f:
        f.write(";;".join(relations))