def finish(self):
    """Store the outcome of the alignment-cleaning job.

    The trimmed alignment is read from ``clean.alg.fasta`` in the job
    directory of the first job.  Two outcomes are possible:

    * The number of sequences differs from ``self.size``: a warning is
      logged and the existing ``self.alg_fasta_file`` /
      ``self.alg_phylip_file`` values are registered under the
      ``clean_alg_fasta`` / ``clean_alg_phylip`` data types instead.
    * Otherwise the kept column indexes are parsed from the job's stdout
      (the line starting with ``#ColumnsMap``) and the alignment is stored
      in both fasta and relaxed interleaved phylip formats through
      ``AlgCleanerTask.store_data``.
    """
    main_job = self.jobs[0]
    fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
    alg = SeqGroup(fasta_path)

    if len(alg) != self.size:
        log.warning("Trimming was to aggressive and it tried"
                    " to remove one or more sequences."
                    " Alignment trimming will be disabled for this dataset.")
        self.clean_alg_fasta_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
        self.clean_alg_phylip_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
    else:
        # The context manager guarantees the stdout file is closed even if
        # parsing one of its lines raises.
        with open(main_job.stdout_file) as stdout:
            for line in stdout:
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    # Expected layout: "#ColumnsMap<TAB>i,j,k,...".  A real
                    # list is built so the stored value can be iterated more
                    # than once.  The last matching line wins.
                    kept_columns = [
                        int(col) for col in line.split("\t")[1].split(",")]

        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        # NOTE(review): if stdout holds no "#ColumnsMap" line, kept_columns
        # is never bound and the call below fails with UnboundLocalError
        # (behaviour unchanged) -- confirm whether that case can happen.
        AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
def switch_to_codon(alg_fasta_file, kept_columns=None):
    """Translate an amino-acid alignment back into a codon alignment.

    For every entry of the alignment in ``alg_fasta_file`` the nucleotide
    sequence is fetched with ``db.get_seq(name, "nt")`` and laid out codon
    by codon following the amino-acid columns: gap columns become ``---``,
    any other column consumes the next three nucleotides.

    ``kept_columns`` optionally restricts the output to the given
    amino-acid column indexes; when it is empty or ``None`` every column
    is kept.  Returns a new ``SeqGroup`` holding the codon sequences.
    """
    # An empty selection means "keep every column".
    wanted = set(map(int, kept_columns)) if kept_columns else []

    protein_alg = SeqGroup(alg_fasta_file)
    codon_alg = SeqGroup()

    for seqname, aaseq, _comments in protein_alg.iter_entries():
        nt_source = db.get_seq(seqname, "nt").upper()
        pieces = []
        offset = 0
        for column, residue in enumerate(aaseq):
            if residue in GAP_CHARS:
                triplet = "---"
            else:
                triplet = nt_source[offset:offset + 3]
                offset += 3

            # Nucleotides of dropped columns are still consumed above so the
            # reading frame stays in sync with the protein sequence.
            if wanted and column not in wanted:
                continue

            # We trust the sequence in DB; consistency should have been
            # checked during the start up.
            pieces.append(triplet)

        codon_alg.set_seq(seqname, "".join(pieces))

    return codon_alg
def finish(self):
    """Store the alignment found in the first job's stdout file.

    The alignment is loaded from ``self.jobs[0].stdout_file``, serialised
    as fasta and as relaxed interleaved phylip, and both strings are handed
    to ``AlgTask.store_data``.
    """
    aligner_job = self.jobs[0]
    alignment = SeqGroup(aligner_job.stdout_file)
    # Arguments are evaluated left to right: fasta first, then phylip.
    AlgTask.store_data(
        self,
        alignment.write(format="fasta"),
        alignment.write(format="iphylip_relaxed"),
    )
def finish(self):
    """Export the alignment produced by the third job.

    ``alg.fasta`` is read from the job directory of ``self.jobs[2]`` and
    written to ``self.alg_fasta_file`` (fasta) and ``self.alg_phylip_file``
    (relaxed interleaved phylip) before delegating to ``AlgTask.finish``.
    """
    last_stage = self.jobs[2]
    source_path = os.path.join(last_stage.jobdir, "alg.fasta")
    alignment = SeqGroup(source_path)

    alignment.write(outfile=self.alg_fasta_file, format="fasta")
    alignment.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")

    AlgTask.finish(self)
def finish(self):
    """Store the merged alignment and the list of input alignments.

    ``mcoffee.fasta`` is read from the first job's directory and serialised
    as fasta and relaxed interleaved phylip.  The paths of all entries in
    ``self.all_alg_files`` (joined to ``GLOBALS["input_dir"]``) are
    recorded, one per line, under the ``alg_list`` data type, and the two
    alignment strings are passed to ``AlgTask.store_data``.
    """
    result_path = os.path.join(self.jobs[0].jobdir, "mcoffee.fasta")
    merged = SeqGroup(result_path)
    fasta = merged.write(format="fasta")
    phylip = merged.write(format="iphylip_relaxed")

    # One absolute path per line, in the order of self.all_alg_files.
    alg_paths = []
    for aname in self.all_alg_files:
        alg_paths.append(pjoin(GLOBALS["input_dir"], aname))
    db.add_task_data(self.taskid, DATATYPES.alg_list, '\n'.join(alg_paths))

    AlgTask.store_data(self, fasta, phylip)
def finish(self):
    """Publish the final alignment of the combined workflow.

    Without post-trimming (``_alg_trimming`` is false in this task's
    configuration) the output is exactly what the last job produced, so its
    fasta/phylip data ids are looked up and re-registered for this task.
    With trimming enabled, the trimmed alignment of the last job is loaded
    and stored through ``AlgTask.store_data``.
    """
    trimming_enabled = self.conf[self.confname]["_alg_trimming"]

    if not trimming_enabled:
        # No post trimming: recycle the data ids of the last task.
        mcoffee_task = self.jobs[-1]
        fasta_id = db.get_dataid(mcoffee_task.taskid, DATATYPES.alg_fasta)
        phylip_id = db.get_dataid(mcoffee_task.taskid, DATATYPES.alg_phylip)
        db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
        db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
        return

    # Trimming happened after the alignment: save the trimmed output.
    trimmer = self.jobs[-1]
    trimmed = SeqGroup(pjoin(trimmer.jobdir, trimmer.alg_fasta_file))
    fasta = trimmed.write(format="fasta")
    phylip = trimmed.write(format="iphylip_relaxed")
    AlgTask.store_data(self, fasta, phylip)
def switch_to_codon(alg_fasta_file, kept_columns=None):
    """Return the codon alignment matching an amino-acid alignment.

    Each sequence of the alignment stored in ``alg_fasta_file`` is mapped
    onto its nucleotide sequence (``db.get_seq(name, "nt")``, upper-cased):
    gap characters expand to ``---`` and every other residue takes the next
    three nucleotides.  When ``kept_columns`` is non-empty only those
    amino-acid column indexes are emitted; otherwise all columns are.
    """
    if not kept_columns:
        # Empty filter: every column is emitted.
        column_filter = []
    else:
        column_filter = set(map(int, kept_columns))

    aa_alignment = SeqGroup(alg_fasta_file)
    nt_alignment = SeqGroup()

    for seqname, aaseq, _ in aa_alignment.iter_entries():
        dna = db.get_seq(seqname, "nt").upper()
        cursor = 0
        codons = []
        for index, symbol in enumerate(aaseq):
            is_gap = symbol in GAP_CHARS
            codon = "---" if is_gap else dna[cursor:cursor + 3]
            if not is_gap:
                # Only real residues consume nucleotides.
                cursor += 3

            if column_filter and index not in column_filter:
                continue

            # We trust the sequence in DB; consistency should have been
            # checked during the start up.
            codons.append(codon)

        nt_alignment.set_seq(seqname, "".join(codons))

    return nt_alignment
def finish(self):
    """Store the outcome of the alignment-cleaning job.

    The trimmed alignment is read from ``clean.alg.fasta`` in the job
    directory of the first job.  Two outcomes are possible:

    * The number of sequences differs from ``self.size``: a warning is
      logged and the existing ``self.alg_fasta_file`` /
      ``self.alg_phylip_file`` values are registered under the
      ``clean_alg_fasta`` / ``clean_alg_phylip`` data types instead.
    * Otherwise the kept column indexes are parsed from the job's stdout
      (the line starting with ``#ColumnsMap``) and the alignment is stored
      in both fasta and relaxed interleaved phylip formats through
      ``AlgCleanerTask.store_data``.
    """
    main_job = self.jobs[0]
    fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
    alg = SeqGroup(fasta_path)

    if len(alg) != self.size:
        log.warning("Trimming was to aggressive and it tried"
                    " to remove one or more sequences."
                    " Alignment trimming will be disabled for this dataset."
                    )
        self.clean_alg_fasta_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
        self.clean_alg_phylip_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
    else:
        # The context manager guarantees the stdout file is closed even if
        # parsing one of its lines raises.
        with open(main_job.stdout_file) as stdout:
            for line in stdout:
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    # Expected layout: "#ColumnsMap<TAB>i,j,k,...".  A real
                    # list is built so the stored value can be iterated more
                    # than once.  The last matching line wins.
                    kept_columns = [
                        int(col) for col in line.split("\t")[1].split(",")]

        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        # NOTE(review): if stdout holds no "#ColumnsMap" line, kept_columns
        # is never bound and the call below fails with UnboundLocalError
        # (behaviour unchanged) -- confirm whether that case can happen.
        AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
def finish(self):
    """Store the alignment written by the first job as ``alg.fasta``.

    The file is read from the first job's directory, serialised as fasta
    and as relaxed interleaved phylip, and both strings are passed to
    ``AlgTask.store_data``.
    """
    job_dir = self.jobs[0].jobdir
    alignment = SeqGroup(os.path.join(job_dir, "alg.fasta"))
    as_fasta = alignment.write(format="fasta")
    as_phylip = alignment.write(format="iphylip_relaxed")
    AlgTask.store_data(self, as_fasta, as_phylip)
# Report completion of this thread and dump its final results: the tree in
# plain and extended newick, plus the root node alignments when available.
log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
        threadname, past_threads[configid])

log.log(28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
        threadname, final_tree_file+".nw",
        final_tree_file+".nwx (newick extended)")
main_tree.write(outfile=final_tree_file+".nw")
main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                format_root_node=True)

if hasattr(main_tree, "alg_path"):
    log.log(28, "Writing root node alignment @@13:%s@@1:\n %s",
            threadname, final_tree_file+".fa")

    alg = SeqGroup(get_stored_data(main_tree.alg_path))
    # "with" closes the output file even if a name lookup or write fails.
    with open(final_tree_file+".fa", "w") as OUT:
        for name, seq, comments in alg:
            # Internal sequence ids are mapped back to their real names.
            realname = db.get_seq_name(name)
            print >>OUT, ">%s\n%s" %(realname, seq)

if hasattr(main_tree, "clean_alg_path"):
    log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
            threadname, final_tree_file+".trimmed.fa")

    alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
    # The visible code never closed this handle; "with" closes it here.
    with open(final_tree_file+".trimmed.fa", "w") as OUT:
        for name, seq, comments in alg:
            realname = db.get_seq_name(name)
            print >>OUT, ">%s\n%s" %(realname, seq)
def get_concatenated_alg(alg_filenames, models=None,
                         sp_field=0, sp_delimiter="_",
                         kill_thr=0.0, keep_species=set()):
    """Concatenate several fasta alignments into one, species by species.

    Parameters:
      alg_filenames: fasta alignment files to concatenate.
      models: optional model name per alignment (same length as
        ``alg_filenames``); defaults to ``"None"`` for every alignment.
      sp_field, sp_delimiter: passed to ``get_species_code`` to extract the
        species code from each real sequence name.
      kill_thr: a species is kept when the fraction of alignments that
        contain it is strictly greater than this value.
      keep_species: species always kept regardless of ``kill_thr``.  Only
        read here, never modified.

    Alignments are ordered by model name and then by their sorted sequence
    names; species missing from an alignment are padded with gaps.

    Returns a tuple ``(concat, partitions, sp2alg, valid_species,
    concat_alg_lengths)``: the concatenated ``SeqGroup``, one partition
    description string per model, a species -> alignments mapping, the list
    of kept species and the lengths of the ordered alignments.

    Raises:
      ValueError: ``models`` and ``alg_filenames`` differ in length.
      Exception: an alignment has unequal sequence lengths or a repeated
        species, or the concatenated result fails the final size checks.
    """
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"] * len(alg_filenames)
    elif len(models) != len(alg_filenames):
        raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and get the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)
    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # NOTE(review): alg.seqlength is only assigned once a sequence has
        # been seen; an alignment without sequences would fail later with
        # AttributeError (behaviour unchanged).
        # Change seq names to contain only species names
        for i, seq in alg.id2seq.items():
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter,
                                     field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            elif lenseq is None:
                # The first sequence fixes the width of this alignment.
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)   # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in sp2alg
                     if sp in keep_species or
                     len(sp2alg[sp]) / float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to kept)" %
             (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    # Same ordering as comparing by matrix first and by the sorted sequence
    # names second, expressed as a sort key.
    sorted_algs = sorted(
        alg_objects,
        key=lambda alg: (alg.matrix, sorted(alg.id2name.values())))
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]

    # Total number of columns contributed by each model.
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" % sp)
        for alg in sorted_algs:
            # Species missing from an alignment are filled with gaps.
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            concat.id2name[sp] = sp
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    # One partition per model, as 1-based inclusive column ranges.
    current_pos = 0
    partitions = []
    for model in sorted(model2size):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model + "_genes",
                                   current_pos + 1,
                                   current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in concat.id2seq.values()]
    if len(set(seq_sizes)) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of concatenated alg is not what expected")

    return concat, partitions, sp2alg, valid_species, concat_alg_lengths
# Announce and write the final tree of this thread in two flavours: plain
# newick (".nw") and extended newick (".nwx").
log.log(
    28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
    threadname, final_tree_file + ".nw",
    final_tree_file + ".nwx (newick extended)")
main_tree.write(outfile=final_tree_file + ".nw")
main_tree.write(outfile=final_tree_file + ".nwx", features=[],
                format_root_node=True)

# When the root node carries a stored alignment, dump it as fasta using the
# real sequence names returned by db.get_seq_name.
if hasattr(main_tree, "alg_path"):
    log.log(
        28, "Writing root node alignment @@13:%s@@1:\n %s",
        threadname, final_tree_file + ".fa")

    alg = SeqGroup(get_stored_data(main_tree.alg_path))
    OUT = open(final_tree_file + ".fa", "w")
    for name, seq, comments in alg:
        realname = db.get_seq_name(name)
        print >> OUT, ">%s\n%s" % (realname, seq)
    OUT.close()

# Same for the trimmed alignment of the root node, when present.
if hasattr(main_tree, "clean_alg_path"):
    log.log(
        28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
        threadname, final_tree_file + ".trimmed.fa")

    alg = SeqGroup(
        get_stored_data(main_tree.clean_alg_path))
    # NOTE(review): the statements that write to and close this handle lie
    # outside the visible fragment -- confirm it is closed there.
    OUT = open(final_tree_file + ".trimmed.fa", "w")
def get_concatenated_alg(alg_filenames, models=None,
                         sp_field=0, sp_delimiter="_",
                         kill_thr=0.0, keep_species=set()):
    """Concatenate several fasta alignments into one, species by species.

    Parameters:
      alg_filenames: fasta alignment files to concatenate.
      models: optional model name per alignment (same length as
        ``alg_filenames``); defaults to ``"None"`` for every alignment.
      sp_field, sp_delimiter: passed to ``get_species_code`` to extract the
        species code from each real sequence name.
      kill_thr: a species is kept when the fraction of alignments that
        contain it is strictly greater than this value.
      keep_species: species always kept regardless of ``kill_thr``.  Only
        read here, never modified.

    Alignments are ordered by model name and then by their sorted sequence
    names; species missing from an alignment are padded with gaps.

    Returns a tuple ``(concat, partitions, sp2alg, valid_species,
    concat_alg_lengths)``: the concatenated ``SeqGroup``, one partition
    description string per model, a species -> alignments mapping, the list
    of kept species and the lengths of the ordered alignments.

    Raises:
      ValueError: ``models`` and ``alg_filenames`` differ in length.
      Exception: an alignment has unequal sequence lengths or a repeated
        species, or the concatenated result fails the final size checks.
    """
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"]*len(alg_filenames)
    elif len(models) != len(alg_filenames):
        raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and get the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)
    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # NOTE(review): alg.seqlength is only assigned once a sequence has
        # been seen; an alignment without sequences would fail later with
        # AttributeError (behaviour unchanged).
        # Change seq names to contain only species names
        for i, seq in alg.id2seq.items():
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter,
                                     field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            elif lenseq is None:
                # The first sequence fixes the width of this alignment.
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)   # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in sp2alg
                     if sp in keep_species or
                     len(sp2alg[sp])/float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to kept)" %
             (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    # Same ordering as comparing by matrix first and by the sorted sequence
    # names second, expressed as a sort key.
    sorted_algs = sorted(
        alg_objects,
        key=lambda alg: (alg.matrix, sorted(alg.id2name.values())))
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]

    # Total number of columns contributed by each model.
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" %sp)
        for alg in sorted_algs:
            # Species missing from an alignment are filled with gaps.
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            concat.id2name[sp] = sp
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    # One partition per model, as 1-based inclusive column ranges.
    current_pos = 0
    partitions = []
    for model in sorted(model2size):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model+"_genes",
                                   current_pos + 1,
                                   current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in concat.id2seq.values()]
    if len(set(seq_sizes)) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of concatenated alg is not what expected")

    return concat, partitions, sp2alg, valid_species, concat_alg_lengths