def load_from_alignment_file(self, filename):
    """Read the sequences stored in ``filename`` and attach them to this object.

    The file does not have to be an alignment.  The path is remembered in
    ``self.seq_file`` and the value returned by ``Sequence.readSequences``
    is stored in ``self.sequences``.
    """
    # Record the path first, then parse it.
    self.seq_file = filename
    parsed = Sequence.readSequences(filename)
    self.sequences = parsed
def load_scores(self, gene_family):
    """Align the family's sequences with FSA and store pairwise percent identity.

    The sequences of ``gene_family`` are written with
    ``Sequence.outputSequencesToFasta`` to ``<seq_file>.dealigned.fa``, the
    external ``fsa`` program is run on that file with its standard output
    captured in ``<seq_file>.fsa.fa``, and the result is read back and passed
    to ``Distances.getPairwisePctID``.  The returned value is stored in
    ``gene_family.scores``.

    An existing ``.fsa.fa`` file is reused only when ``"use_cache"`` is
    present in ``self.other_args`` and the file is larger than 10 bytes.

    Args:
        gene_family: object exposing ``seq_file`` (path string) and
            ``sequences``; its ``scores`` attribute is overwritten.

    Raises:
        OSError: if the ``fsa`` executable cannot be started or the output
            file cannot be opened for writing.
    """
    # Imported locally so this method does not rely on the file's import list.
    import subprocess

    dealignedfile = gene_family.seq_file + ".dealigned.fa"
    Sequence.outputSequencesToFasta(gene_family.sequences, dealignedfile)
    fsafile = gene_family.seq_file + ".fsa.fa"

    # This string is only logged; the command itself is run without a shell.
    cmd = "fsa " + dealignedfile + " > " + fsafile
    print("EXEC " + cmd)

    # A file of 10 bytes or less is not trusted as a cached alignment
    # (presumably the leftover of a failed earlier run -- confirm).
    cache_usable = (
        "use_cache" in self.other_args
        and os.path.isfile(fsafile)
        and os.path.getsize(fsafile) > 10
    )

    if cache_usable:
        print("Actually, file exists and we'll use it.")
    else:
        # Passing an argument list and redirecting stdout ourselves keeps
        # paths with spaces or shell metacharacters from breaking (or
        # altering) the command, which a concatenated shell string would not.
        with open(fsafile, "w") as out:
            status = subprocess.call(["fsa", dealignedfile], stdout=out)
        if status != 0:
            print("WARNING: fsa exited with status " + str(status))

    seqs = Sequence.readSequences(fsafile)
    gene_family.scores = Distances.getPairwisePctID(
        sequences=seqs, verbose=False, run_nw_algorithm=False)
# Here's an annoying problem: when there are multiple input files, some
# (distinct) genes will have the same name (this happens with simphy).
# So here we copy the alignment files, but rename the genes using the index
# of the file they come from.
newinfiles = ""
files_list = infiles.split(",")
newfile_to_old_file = {}  # path used from here on -> path originally given
if rename_genes:
    print("Copying alignment files, ensuring gene name uniqueness...")
    renamed_files = []
    for i, f in enumerate(files_list):
        # Split the extension off the base name so that only the trailing
        # extension is swapped.  str.replace(ext, ...) would also rewrite
        # earlier occurrences of the extension text and, when the file has no
        # extension (ext == ""), insert the suffix between every character.
        stem = os.path.splitext(os.path.basename(f))[0]
        newfile = join(workdir, stem + "_" + str(i) + ".fa")
        sequences = Sequence.readSequences(f)
        # str(i) is the per-file tag handed to the writer (presumably used as
        # a gene-name suffix -- confirm against Sequence.outputSequencesToFasta).
        Sequence.outputSequencesToFasta(sequences, newfile, str(i), True)
        renamed_files.append(newfile)
        newfile_to_old_file[newfile] = f
    newinfiles = ",".join(renamed_files)
    infiles = newinfiles
else:
    # No renaming requested: every file maps to itself.
    for f in files_list:
        newfile_to_old_file[f] = f
#############################################################################################
def predict_orthologs(self, files, workdir, speciestree_file):
    """Cluster genes with OrthoMCL and write the clusters and pairwise relations.

    Unless ``self.cached_clusters`` is already non-empty, the sequences of
    ``files`` are regrouped into one FASTA file per species under
    ``<workdir>/in``, the OrthoMCL pipeline is run with its output in
    ``<workdir>/out``, and ``<workdir>/out/groups/groups.txt`` is parsed into
    clusters (sets of gene names).  Every sequence whose name appears in no
    group becomes a singleton cluster.  The clusters are stored in
    ``self.cached_clusters`` and reused by later calls.

    Two files are then written and their paths recorded in
    ``self.clusters_filenames`` and ``self.relations_filenames``:

    * ``<workdir>/orthomcl.clusters`` -- produced by ``write_clusters``.
    * ``<workdir>/orthomcl.relations`` -- records of the form
      ``gene1<TAB>gene2<TAB>relation`` joined by ``;;``.  Two genes of the
      same cluster are ``Orthologs`` when their species differ and
      ``Paralogs`` otherwise; every other pair of input sequences is
      reported as ``Paralogs``.  The species of a gene is obtained by
      splitting its name on ``other_args["species_separator"]`` and taking
      element ``other_args["species_index"]``.

    Args:
        files: sequence of sequence-file paths (iterated more than once).
        workdir: directory receiving the intermediate and output files.
        speciestree_file: not used by this method.

    Raises:
        OSError: if the OrthoMCL pipeline cannot be started or a file cannot
            be opened.
    """
    # Imported locally so this method does not rely on the file's import list.
    import itertools
    import subprocess

    def species_of(gene_name):
        # Looked up lazily so the settings are only required when a pair of
        # genes actually has to be classified.
        separator = self.other_args["species_separator"]
        index = int(self.other_args["species_index"])
        return gene_name.split(separator)[index]

    workdir_seqs = join(workdir, "in")
    workdir_out = join(workdir, "out")

    allseqs = []
    for f in files:
        allseqs.extend(Sequence.readSequences(f))

    if not self.cached_clusters:
        # one file per species, see OMA comments above
        seqs_by_species = Sequence.combineSequenceFilesBySpecies(
            files, self.other_args["species_separator"],
            int(self.other_args["species_index"]))

        isdna = True
        if ("seqtype" in self.other_args
                and self.other_args["seqtype"] == "AA"):
            print("Input type set to AA")
            isdna = False

        if not os.path.exists(workdir_seqs):
            os.makedirs(workdir_seqs)

        for key in seqs_by_species:
            outfile = join(workdir_seqs, key + ".fasta")
            Sequence.outputSequencesToFasta(
                sequences=seqs_by_species[key], filename=outfile,
                name_suffix="", aligned=False, convertToAA=isdna,
                name_prefix="")

        # Run the pipeline from an argument list (no shell) so directories
        # containing spaces or shell metacharacters are passed through intact.
        args = [
            "/u/lafonman/src/orthomcl-pipeline/bin/orthomcl-pipeline",
            "-i", workdir_seqs,
            "-o", workdir_out,
            "-m", "/u/lafonman/src/orthomcl-pipeline/orthomcl.conf",
            "--nocompliant",
            "--yes",
        ]
        cmd = " ".join(args)
        print("EXEC " + cmd)
        status = subprocess.call(args)
        if status != 0:
            print("WARNING: orthomcl-pipeline exited with status "
                  + str(status))

        seen_genes = set()
        clusters = []
        clfile = join(workdir_out, "groups/groups.txt")
        with open(clfile, "r") as groups:
            for line in groups:
                # Each non-blank line is parsed as
                # "<group id>: <tag>|<gene> <tag>|<gene> ...".
                line = line.strip()
                if not line:
                    continue
                members = line.split(":")[1].split()
                cluster = set(g.split("|")[1] for g in members)
                seen_genes.update(cluster)
                clusters.append(cluster)

        # Genes that OrthoMCL left out of every group get their own cluster.
        for s in allseqs:
            if s.name not in seen_genes:
                clusters.append({s.name})

        self.cached_clusters = clusters
    else:
        print("USING CACHED CLUSTERS")
        clusters = self.cached_clusters

    self.clusters_filenames = [join(workdir, "orthomcl.clusters")]
    self.relations_filenames = [join(workdir, "orthomcl.relations")]

    write_clusters(self.clusters_filenames[0], clusters)

    # Output relations.  Records are collected in a list and joined once, so
    # consecutive records are always separated by exactly one ";;" and no
    # separator is left dangling at the end of the file.
    relations = []
    seen_keys = set()
    for cluster in clusters:
        for c1, c2 in itertools.combinations(cluster, 2):
            if c1 == c2:
                continue
            key1 = c1 + ";;" + c2
            key2 = c2 + ";;" + c1
            if key1 in seen_keys or key2 in seen_keys:
                continue
            seen_keys.add(key1)
            seen_keys.add(key2)
            if species_of(c1) != species_of(c2):
                relation = "Orthologs"
            else:
                relation = "Paralogs"
            relations.append(c1 + "\t" + c2 + "\t" + relation)

    # Every remaining pair of input sequences (not clustered together).
    for s1, s2 in itertools.combinations(allseqs, 2):
        key = s1.name + ";;" + s2.name
        if key not in seen_keys:
            relations.append(s1.name + "\t" + s2.name + "\t" + "Paralogs")

    with open(self.relations_filenames[0], "w") as out:
        out.write(";;".join(relations))