def get_hmm_refalignment(self):
    """Trim the reference alignment to the columns used by the HMM profile.

    Parses self.refprofile (HMMER text profile), collects the original
    alignment position stored in field 6 of every match-state line, writes
    the column-reduced alignment to self.trimed and returns
    (self.trimed, number_of_columns_kept).
    """
    columns = []
    profile = open(self.refprofile)
    line = profile.readline()
    in_model = False
    while line != "":
        if line.startswith("//"):
            break
        if in_model:
            fields = line.strip().split()
            # field 6 of a match-state line: position in the source alignment
            columns.append(int(fields[5]))
            # each match state spans three profile lines; skip the other two
            line = profile.readline()
            line = profile.readline()
        elif line.startswith("HMM "):
            in_model = True
            # skip the header lines that follow the "HMM" marker
            line = profile.readline()
            line = profile.readline()
            line = profile.readline()
            line = profile.readline()
        line = profile.readline()
    profile.close()
    alignment = SeqGroup(self.refalign)
    out = open(self.trimed, "w")
    for entry in alignment.get_entries():
        out.write(">" + entry[0] + "\n")
        out.write("".join(entry[1][p - 1] for p in columns))
        out.write("\n")
    out.close()
    return self.trimed, len(columns)
def trim_refalign_hmm(refaln, hmmprofile):
    """Keep only the HMM match-state columns of alignment `refaln`.

    Reads `hmmprofile` (HMMER text format), records for each match state the
    original column index found in the sixth field of its line, writes the
    reduced alignment to "<refaln>.trimed.afa" and returns
    (output_filename, number_of_columns).
    """
    kept = []
    handle = open(hmmprofile)
    line = handle.readline()
    parsing = False
    while line != "":
        if line.startswith("//"):
            break
        if not parsing:
            if line.startswith("HMM "):
                parsing = True
                # consume the remaining header lines before the first state
                for _ in range(4):
                    line = handle.readline()
        else:
            parts = line.strip().split()
            # sixth field: source-alignment position of this match state
            kept.append(int(parts[5]))
            # skip the two companion lines of this state
            line = handle.readline()
            line = handle.readline()
        line = handle.readline()
    handle.close()
    source = SeqGroup(refaln)
    outname = refaln + ".trimed.afa"
    out = open(outname, "w")
    for name, seq in ((e[0], e[1]) for e in source.get_entries()):
        out.write(">" + name + "\n")
        out.write("".join(seq[p - 1] for p in kept))
        out.write("\n")
    out.close()
    return outname, len(kept)
def gentesting(ftaxa, fseq, fout, fold = 10):
    """Split taxa into `fold` random partitions for cross-validation.

    For each fold i, writes testing files ("<fout>i testing.tax"/".fa",
    unaligned) from fold i, and training files (".tax", aligned ".afa",
    unaligned ".fa") from all other folds.

    ftaxa: taxonomy file, one "<seqid> ..." line per taxon.
    fseq:  alignment containing every seqid.
    fout:  output file-name prefix.
    """
    with open(ftaxa) as ftax:
        lines = ftax.readlines()
    #seqs = SeqGroup(fseq, format='phylip_relaxed')
    seqs = SeqGroup(fseq)
    # list() so random.shuffle() also works on Python 3 (range() is lazy there)
    idx = list(range(len(lines)))
    random.seed(12345)  # fixed seed -> reproducible partitions
    random.shuffle(idx)
    numtaxa = len(lines)
    onefold = int(math.ceil(float(numtaxa) / fold))
    idx_list = []
    for i in range(fold):
        start = i * onefold
        end = min((i + 1) * onefold, numtaxa)
        if i == fold - 1:
            # last fold absorbs the remainder
            end = numtaxa
        idx_list.append(idx[start:end])
    for i in range(len(idx_list)):
        # testing set = fold i (gap-free sequences)
        with open(fout + repr(i + 1) + "testing.tax", "w") as f1, \
             open(fout + repr(i + 1) + "testing.fa", "w") as f2:
            for index in idx_list[i]:
                tax = lines[index]
                seqid = tax.split()[0]
                seqnogap = seqs.get_seq(seqid).replace("-", "")
                f1.write(tax)
                f2.write(">" + seqid + "\n")
                f2.write(seqnogap + "\n")
        # training set = every other fold (aligned and gap-free copies)
        with open(fout + repr(i + 1) + "training.tax", "w") as f1, \
             open(fout + repr(i + 1) + "training.afa", "w") as f2, \
             open(fout + repr(i + 1) + "training.fa", "w") as f3:
            for j in range(len(idx_list)):
                if i == j:
                    continue
                for index in idx_list[j]:
                    tax = lines[index]
                    seqid = tax.split()[0]
                    seq = seqs.get_seq(seqid)
                    seqnogap = seq.replace("-", "")
                    f1.write(tax)
                    f2.write(">" + seqid + "\n")
                    f2.write(seq + "\n")
                    f3.write(">" + seqid + "\n")
                    f3.write(seqnogap + "\n")
def gen_alignment3(seq_names=None, alignment=None):
    """Generate an alignment containing only the taxa in `seq_names`.

    seq_names: iterable of taxon names to extract (default: none).
    alignment: SeqGroup to pull sequences from (default: empty SeqGroup).
    Returns a new SeqGroup with the selected sequences.

    Fixed: the original used mutable defaults (`[]` and `SeqGroup()`), which
    are created once at definition time and shared across all calls.
    """
    if seq_names is None:
        seq_names = []
    if alignment is None:
        alignment = SeqGroup()
    newalign = SeqGroup()
    for taxa in seq_names:
        newalign.set_seq(taxa, alignment.get_seq(taxa))
    #newalign.write(outfile = outputfile)
    return newalign
def __init__(self, refaln, type=""):
    """Load the reference alignment and precompute ground-truth clusters.

    refaln: alignment file name.
    type:   "fasta" to parse as fasta; anything else -> relaxed phylip.
    """
    if type != "fasta":
        self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
    else:
        self.aln = SeqGroup(sequences=refaln)
    self.true_spe = {}
    self._get_truth()
    self._get_cluster_label()
def __init__(self, refaln, type = ""):
    """Parse the reference alignment and build the ground-truth bookkeeping.

    type == "fasta" selects fasta parsing; any other value selects the
    relaxed-phylip reader.
    """
    kwargs = {} if type == "fasta" else {"format": 'phylip_relaxed'}
    self.aln = SeqGroup(sequences=refaln, **kwargs)
    self.true_spe = {}
    self._get_truth()
    self._get_cluster_label()
def __write_algn(self, fullpath):
    """Write the nucleotide sequences of this (sub)tree as a PAML alignment."""
    group = SeqGroup()
    for node in self:
        nid = node.node_id
        nname = node.name
        # populate the SeqGroup lookup tables directly
        group.id2seq[nid] = node.nt_sequence
        group.id2name[nid] = nname
        group.name2id[nname] = nid
    group.write(outfile=fullpath, format='paml')
def __write_algn(self, fullpath):
    """Dump node nucleotide sequences to `fullpath` in PAML format."""
    out_group = SeqGroup()
    for node in self:
        # mirror the node into the SeqGroup's internal maps
        out_group.id2name[node.node_id] = node.name
        out_group.name2id[node.name] = node.node_id
        out_group.id2seq[node.node_id] = node.nt_sequence
    out_group.write(outfile=fullpath, format='paml')
def gen_alignment2(seq_names=None, alignment=None):
    """Generate an alignment for the taxa in `seq_names`.

    Names prefixed "*R*" are looked up without the prefix (injected reference
    sequences); the placeholder name "sister" is skipped entirely.
    Returns a new SeqGroup.

    Fixed: the original used mutable defaults (`[]` and `SeqGroup()`), which
    are evaluated once at definition time and shared between calls.
    """
    if seq_names is None:
        seq_names = []
    if alignment is None:
        alignment = SeqGroup()
    newalign = SeqGroup()
    for taxa in seq_names:
        if taxa.startswith("*R*"):
            # strip the reference marker before the lookup
            seq = alignment.get_seq(taxa[3:])
        elif taxa == "sister":
            continue
        else:
            seq = alignment.get_seq(taxa)
        newalign.set_seq(taxa, seq)
    #newalign.write(outfile = outputfile)
    return newalign
def pick_otu(spe_out, alignment):
    """Extract one representative sequence per species into "<alignment>.otu".

    spe_out:   species-delimitation output; the line following each line that
               starts with "Species" names the representative sequence.
    alignment: fasta alignment file containing those sequences.

    Fixed: file handles are now closed via `with` even if SeqGroup parsing or
    a sequence lookup raises (the original leaked the output handle).
    """
    with open(spe_out) as fin:
        lines = fin.readlines()
    aln = SeqGroup(sequences=alignment)
    with open(alignment + ".otu", "w") as fout:
        for i, line in enumerate(lines):
            if line.startswith("Species"):
                nline = lines[i + 1].strip()
                seq = aln.get_seq(nline)
                fout.write(">" + nline + "\n")
                fout.write(seq + "\n")
def pick_otu(spe_out, alignment):
    """Write one representative sequence per species to "<alignment>.otu".

    spe_out:   species-delimitation report; each "Species" line is followed by
               a line naming the representative sequence.
    alignment: fasta alignment containing those sequences.

    Fixed: handles are managed with `with` so they are closed on any error
    path (the original left the output file open if a lookup raised).
    """
    with open(spe_out) as fin:
        lines = fin.readlines()
    aln = SeqGroup(sequences=alignment)
    with open(alignment + ".otu", "w") as fout:
        for i, line in enumerate(lines):
            if line.startswith("Species"):
                nline = lines[i + 1].strip()
                seq = aln.get_seq(nline)
                fout.write(">" + nline + "\n")
                fout.write(seq + "\n")
def count_reads(nfolder, pref = "me_leaf_"):
    """Sum the read counts encoded in sequence names across all per-leaf
    alignment files matching "<nfolder><pref>*" and print the total.

    Entries named "root_ref" or prefixed "*R*" are reference sequences, not
    reads, and are skipped; for the rest the count is the integer after the
    last "*" in the name.  (Python 2: uses the print statement.)
    """
    cnt = 0
    naligns = glob.glob(nfolder + pref + "*")
    for aln in naligns:
        a = SeqGroup(sequences = aln)
        for ent in a.get_entries():
            name = ent[0]
            if name == "root_ref":
                pass
            elif name.startswith("*R*"):
                pass
            else:
                # read multiplicity is appended to the name as "...*<count>"
                numread = int(name.split("*")[-1])
                cnt = cnt + numread
    print cnt
def curator(refseq, reftax, method, output, testingtax=""):
    """Run the taxonomy mislabel check for every record in the testing set.

    refseq:     reference sequence file (loaded as a SeqGroup).
    reftax:     reference taxonomy, one "<seqid> <r1;r2;...>" line per taxon.
    method:     assignment method passed through to findmis().
    output:     output file handed to findmis().
    testingtax: optional taxonomy file of records to test; when empty, every
                reference record is tested against the rest.
    """
    def load_records(path):
        # each line: "<seq_id> <rank1;rank2;...>"
        records = []
        with open(path) as handle:
            for row in handle:
                parts = row.split()
                records.append([parts[0], parts[1].split(";")])
        return records

    seqs = SeqGroup(refseq)
    ranks = load_records(reftax)
    testings = load_records(testingtax) if testingtax != "" else ranks
    for test in testings:
        #refseq, reftax, name, old_tax, method, foutput
        ru, result_string = findmis(refseq=seqs, reftax=ranks, name=test[0],
                                    method=method, foutput=output)
        print(result_string)
def merge_alignment(aln1, aln2, fout, numsites):
    """Concatenate two fasta alignments of identical width into `fout`.

    Exits the program if either input is empty or if any sequence's stripped
    length differs from `numsites`.
    """
    first = SeqGroup(aln1)
    second = SeqGroup(aln2)
    if len(first) == 0 or len(second) == 0:
        print("No sequences aligned! ")
        sys.exit()
    with open(fout, "w") as out:
        for group in (first, second):
            for entry in group.iter_entries():
                name = entry[0]
                seq = entry[1]
                # every sequence must span exactly numsites columns
                if len(seq.strip()) != numsites:
                    print("Error in alignment ....")
                    sys.exit()
                out.write(">" + name + "\n" + seq + "\n")
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Attach sequences from `alignment` to the matching nodes of this tree.

    alignment may be a SeqGroup instance or anything SeqGroup() accepts
    (file name / string); extra kwargs are forwarded to SeqGroup.
    Each traversed node whose name is found gets a "sequence" feature;
    unmatched node names are collected (leaves trigger a stderr warning,
    internal nodes are silently recorded).  (Python 2 print syntax.)
    """
    missing_leaves = []
    missing_internal = []
    if type(alignment) == SeqGroup:
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)
    # sets the seq of
    for n in self.traverse():
        try:
            n.add_feature("sequence", alg.get_seq(n.name))
        except KeyError:
            if n.is_leaf():
                missing_leaves.append(n.name)
            else:
                missing_internal.append(n.name)
    if len(missing_leaves) > 0:
        print >>sys.stderr, \
            "Warnning: [%d] terminal nodes could not be found in the alignment." %\
            len(missing_leaves)
def link_to_alignment(self, alignment, alg_format="fasta"):
    """Attach sequences from `alignment` to matching nodes of this tree.

    alignment: a SeqGroup, or any input SeqGroup() can parse with alg_format.
    Adds a "sequence" feature to every node whose name has a sequence;
    missing leaf names produce a stderr warning, missing internal names are
    only collected.  (Python 2 print syntax.)
    """
    missing_leaves = []
    missing_internal = []
    if type(alignment) == SeqGroup:
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format)
    # sets the seq of
    for n in self.traverse():
        try:
            n.add_feature("sequence",alg.get_seq(n.name))
        except KeyError:
            if n.is_leaf():
                missing_leaves.append(n.name)
            else:
                missing_internal.append(n.name)
    if len(missing_leaves)>0:
        print >>sys.stderr, \
            "Warnning: [%d] terminal nodes could not be found in the alignment." %\
            len(missing_leaves)
def chimera_removal(nuseach, nalign, nout, chimeraout):
    """Split an alignment into chimeric and clean sequence files.

    nuseach:    uchime-style report; column 2 is the sequence name and the
                last column is the verdict ("Y"/"?" = chimeric).
    nalign:     input alignment.
    nout:       output file for clean sequences.
    chimeraout: output file for chimeric sequences.
    """
    align = SeqGroup(nalign)
    newalign = open(nout, "w")
    chalign = open(chimeraout, "w")
    fus = open(nuseach)
    lines = fus.readlines()
    fus.close()
    for line in lines:
        fields = line.split()
        verdict = fields[-1]
        sname = fields[1]
        # "Y" (chimera) and "?" (suspect) both go to the chimera file
        target = chalign if verdict in ("Y", "?") else newalign
        target.write(">" + sname + "\n")
        target.write(align.get_seq(sname) + "\n")
    newalign.close()
    chalign.close()
def raxml_g_after_epa(nfolder, nref_align, suf = "ifa", T = "2"):
    """For every "<nfolder>*.<suf>" alignment produced after EPA placement,
    run a RAxML constrained search and extract the subtree of the placed
    (non-reference) taxa into "<basename>.subtree".

    nfolder:    folder containing the per-edge alignments and their
                companion "<basename>.mttree" constraint trees.
    nref_align: reference alignment; its entry names define the reference
                taxa to exclude from the extracted subtree.
    T:          thread count forwarded to build_constrain_tree (string).

    Alignments that already have a ".subtree" file are skipped, so the run
    can be resumed.  Intermediate tree files are deleted on success.
    """
    align_orgin = SeqGroup(sequences = nref_align)
    ref_taxa = []
    for entr in align_orgin.get_entries():
        ref_taxa.append(entr[0])
    naligns = glob.glob(nfolder + "*." + suf)
    cnt = 0
    for aln in naligns:
        # progress indicator: index of the alignment being processed
        print(repr(cnt))
        cnt = cnt + 1
        if os.path.exists(aln.split(".")[0] + ".subtree"):
            pass
        else:
            mttree = aln.split(".")[0] + ".mttree"
            #raxml constrait search
            trename = build_constrain_tree(nsfin = aln, ntfin = mttree, nfout = "i"+repr(cnt), nfolder = nfolder, num_thread = T)
            #read in the fully resolved tree
            full_tree = Tree(trename, format=1)
            all_taxa = full_tree.get_leaf_names()
            # target taxa = placed reads, i.e. every leaf that is not a reference taxon
            target_taxa = []
            for taxa in all_taxa:
                if taxa in ref_taxa:
                    pass
                else:
                    target_taxa.append(taxa)
            #the place where the tree can be safely rooted
            ref_node = full_tree.get_leaves_by_name(ref_taxa[0])[0]
            #reroot
            full_tree.set_outgroup(ref_node)
            #find the common ancestor of the target taxa
            leafA = full_tree.get_leaves_by_name(target_taxa[0])[0]
            leaflist = []
            for n in target_taxa[1:]:
                leaflist.append(full_tree.get_leaves_by_name(n)[0])
            common = leafA.get_common_ancestor(leaflist)
            # detach the subtree and write it out (format 5: internal + leaf branches)
            common.up = None
            common.write(outfile= aln.split(".")[0] + ".subtree", format=5)
            os.remove(trename)
            os.remove(mttree)
def random_remove_taxa(falign, num_remove, num_repeat=1):
    """Create `num_repeat` copies of alignment `falign`, each with
    `num_remove` randomly chosen sequences removed.

    Returns the list of generated file names
    ("<falign>_<num_remove>_<k>.afa" for k = 1..num_repeat).

    Fixed: `range()` is wrapped in list() so random.shuffle() also works on
    Python 3, where range objects are immutable.
    """
    align = SeqGroup(sequences=falign)
    entrs = align.get_entries()
    index = list(range(len(entrs)))
    namel = []
    for i in range(num_repeat):
        newalign = SeqGroup()
        random.shuffle(index)
        # keep everything after the first num_remove shuffled indices
        for idx in index[num_remove:]:
            newalign.set_seq(entrs[idx][0], entrs[idx][1])
        outname = falign + "_" + repr(num_remove) + "_" + repr(i + 1) + ".afa"
        newalign.write(outfile=outname)
        namel.append(outname)
    return namel
def ngssize(fin, start = 0, end = 2428):
    """Trim every sequence in `fin` to its first half.

    Writes the aligned halves to "<fin>.trim.afa" and the gap-free halves to
    "<fin>.trim.fa"; returns the gap-free file name.

    NOTE(review): `start` and `end` are kept for interface compatibility but
    are currently unused -- the cut point is always len(sequence)/2.
    Fixed: floor division (//) so the slice index stays an int on Python 3
    (identical result on Python 2); output files are closed via `with`.
    """
    with open(fin + ".trim.afa", "w") as fout1, \
         open(fin + ".trim.fa", "w") as fout2:
        seqs = SeqGroup(fin)
        for seq in seqs:
            name = seq[0]
            sequence = seq[1]
            cut = len(sequence) // 2
            sequence_trim = sequence[0:cut]
            sequence_trim_nogap = sequence_trim.replace("-", "")
            fout1.write(">" + name + "\n")
            fout2.write(">" + name + "\n")
            fout1.write(sequence_trim + "\n")
            fout2.write(sequence_trim_nogap + "\n")
    return fin + ".trim.fa"
class ground_truth:
    """Ground-truth species clustering derived from sequence names.

    Sequence names are expected to start with a numeric species id followed
    by a dot ("<gid>.<rest>"); sequences sharing a gid form one true species.
    Provides NMI scoring of candidate clusterings against this truth.
    """
    def __init__(self, refaln, type = ""):
        # type == "fasta" parses refaln as fasta, otherwise relaxed phylip
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        self.true_spe = {}  # gid -> list of sequence names of that species
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        """Group sequence names by their leading species id."""
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            self.true_spe[gid] = []
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            group = self.true_spe[gid]
            group.append(name)
            self.true_spe[gid] = group

    def _get_cluster_label(self):
        """Build the reference label vector C0 (one int species id per sequence)."""
        self.seq_list = []
        self.seq_cid_list = []
        for entr in self.aln.get_entries():
            seq_name = entr[0]
            cid = int(seq_name.split(".")[0])
            self.seq_list.append(seq_name)
            self.seq_cid_list.append(cid)
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        """Sequence names in the order used by the label vectors."""
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        """Assign cluster id `newid` to every sequence present in `seq_list`.

        An empty `new_cid_list` is first initialised to -1 (unassigned);
        the updated list is returned (and also modified in place).
        """
        if len(new_cid_list) == 0:
            for i in range(len(self.seq_list)):
                new_cid_list.append(-1)
        for i in range(len(self.seq_list)):
            name = self.seq_list[i]
            if name in seq_list:
                new_cid_list[i] = newid
        return new_cid_list

    #Mutual information between two label vectors
    def mutual_info(self,x,y):
        N=float(len(x))
        I=0.0
        # eps keeps log2 defined when an intersection is empty
        eps = numpy.finfo(float).eps
        for l1 in numpy.unique(x):
            for l2 in numpy.unique(y):
                #Find the intersections
                l1_ids=nonzero(x==l1)[0]
                l2_ids=nonzero(y==l2)[0]
                pxy=(double(intersect1d(l1_ids,l2_ids).size)/N)+eps
                I+=pxy*log2(pxy/((l1_ids.size/N)*(l2_ids.size/N)))
        return I

    #Normalized mutual information: I(x,y) / mean(H(x), H(y))
    def nmi(self,x,y):
        N=x.size
        I=self.mutual_info(x,y)
        Hx=0
        for l1 in unique(x):
            l1_count=nonzero(x==l1)[0].size
            Hx+=-(double(l1_count)/N)*log2(double(l1_count)/N)
        Hy=0
        for l2 in unique(y):
            l2_count=nonzero(y==l2)[0].size
            Hy+=-(double(l2_count)/N)*log2(double(l2_count)/N)
        # both clusterings trivial (zero entropy): define NMI as 1
        if (Hx+Hy) == 0:
            return 1.0
        else:
            return I/((Hx+Hy)/2)

    def get_seq_list(self):
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        """NMI of a candidate label vector against the ground truth C0."""
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self,names):
        """True if `names` (ignoring *R* reference entries) equals one true species."""
        #*R* entries are injected reference sequences, not reads
        newnames = []
        for name in names:
            if name.startswith("*R*"):
                pass
            else:
                newnames.append(name)
        names_set = set(newnames)
        for key in self.true_spe.keys():
            sps = self.true_spe[key]
            sps_set = set(sps)
            if names_set == sps_set:
                return True
        return False

    def get_num_species(self):
        return len(self.true_spe.keys())
class ground_truth:
    """True species clustering recovered from sequence names.

    Names must begin with "<numeric_species_id>." -- all sequences with the
    same id belong to the same species.  Offers normalized mutual
    information (NMI) comparison between this truth and candidate labels.
    """
    def __init__(self, refaln, type=""):
        # "fasta" -> fasta reader; anything else -> relaxed phylip reader
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        self.true_spe = {}  # species id -> member sequence names
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        """Collect member sequence names for every species id."""
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            self.true_spe[gid] = []
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            group = self.true_spe[gid]
            group.append(name)
            self.true_spe[gid] = group

    def _get_cluster_label(self):
        """Derive the integer reference label vector C0 from the names."""
        self.seq_list = []
        self.seq_cid_list = []
        for entr in self.aln.get_entries():
            seq_name = entr[0]
            cid = int(seq_name.split(".")[0])
            self.seq_list.append(seq_name)
            self.seq_cid_list.append(cid)
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        """Sequence-name ordering matching the label vectors."""
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        """Mark every sequence in `seq_list` with cluster id `newid`.

        If `new_cid_list` is empty it is first filled with -1 entries;
        returns the (in-place) updated list.
        """
        if len(new_cid_list) == 0:
            for i in range(len(self.seq_list)):
                new_cid_list.append(-1)
        for i in range(len(self.seq_list)):
            name = self.seq_list[i]
            if name in seq_list:
                new_cid_list[i] = newid
        return new_cid_list

    #Mutual information of two label vectors
    def mutual_info(self, x, y):
        N = float(len(x))
        I = 0.0
        # eps guards log2 against empty intersections
        eps = numpy.finfo(float).eps
        for l1 in numpy.unique(x):
            for l2 in numpy.unique(y):
                #Find the intersections
                l1_ids = nonzero(x == l1)[0]
                l2_ids = nonzero(y == l2)[0]
                pxy = (double(intersect1d(l1_ids, l2_ids).size) / N) + eps
                I += pxy * log2(pxy / ((l1_ids.size / N) * (l2_ids.size / N)))
        return I

    #Normalized mutual information: I / mean of the two entropies
    def nmi(self, x, y):
        N = x.size
        I = self.mutual_info(x, y)
        Hx = 0
        for l1 in unique(x):
            l1_count = nonzero(x == l1)[0].size
            Hx += -(double(l1_count) / N) * log2(double(l1_count) / N)
        Hy = 0
        for l2 in unique(y):
            l2_count = nonzero(y == l2)[0].size
            Hy += -(double(l2_count) / N) * log2(double(l2_count) / N)
        # degenerate case: both labelings have zero entropy
        if (Hx + Hy) == 0:
            return 1.0
        else:
            return I / ((Hx + Hy) / 2)

    def get_seq_list(self):
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        """NMI between the candidate labels and the ground truth C0."""
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self, names):
        """True if `names` minus *R* reference entries matches a true species exactly."""
        #*R* marks injected reference sequences; ignore them
        newnames = []
        for name in names:
            if name.startswith("*R*"):
                pass
            else:
                newnames.append(name)
        names_set = set(newnames)
        for key in self.true_spe.keys():
            sps = self.true_spe[key]
            sps_set = set(sps)
            if names_set == sps_set:
                return True
        return False

    def get_num_species(self):
        return len(self.true_spe.keys())
def curator(refseq, reftax, method, output, testingtax=""):
    """Leave-one-out taxonomy curation with resume support (Python 2).

    refseq/reftax: reference sequences and taxonomy ("<id> <r1;r2;...>").
    method:        assignment method passed to findmis().
    output:        per-sequence result file; "<output>.ass" stores finished
                   assignments so an interrupted run can resume.
    testingtax:    multi-char value -> taxonomy file of records to test;
                   single-char value -> partition number 0-9 (test that tenth
                   of the reference); empty -> test every reference record.

    After the leave-one-out pass, sequences whose reported level is not
    "Species" are re-checked together into "<output>.final".
    """
    start_time = time.time()
    seqs = SeqGroup(refseq)
    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ele = [ll[0], ll[1].split(";")]
            ranks.append(ele)
    testings = []
    if len(testingtax) > 1:
        # a real file name: load the explicit testing set
        with open(testingtax) as fo:
            for line in fo:
                ll = line.split()
                ele = [ll[0], ll[1].split(";")]
                testings.append(ele)
    else:
        if testingtax:
            # single character: interpret as partition index 0-9
            partno = int(testingtax)
            partsize = int(len(ranks) / 10)
            p_start = partno * partsize
            if partno == 9:
                p_end = len(ranks)
            else:
                p_end = p_start + partsize
            testings = ranks[p_start:p_end]
        else:
            testings = ranks
    # basepath = os.path.dirname(os.path.abspath(__file__))
    basepath = "/home/kozlovay"
    tmpfolder = basepath + "/tmp/"
    # timestamped prefix keeps concurrent runs from colliding
    tmpprefix = tmpfolder + str(time.time())
    # prelim_output = output + ".prelim"
    assf = output + ".ass"
    old_tax = {}
    if os.path.isfile(assf):
        # resume: remember which sequences were already assigned
        with open(assf, "r") as fi:
            for line in fi:
                seqid, rest = line.strip().split("\t", 1)
                old_tax[seqid] = rest
        print "Found old file with %d assignments; will continue from there..." % len(old_tax)
    if True: #not os.path.isfile(output):
        with open(assf, "a") as fo:
            i = 0
            for test in testings:
                #refseq, reftax, name, old_tax, method, foutput
                i += 1
                if test[0] in old_tax:
                    continue
                tmp_fname = "%s.%d" % (tmpprefix, i)
                ru, result_string = findmis(refseq=seqs, reftax=ranks, name=[test[0]], method=method, foutput=output, tmpname=tmp_fname, refseq_fname=refseq)
                # print(result_string)
                fo.write(ru)
                # flush so a crash loses at most the current record
                fo.flush()
    # collect sequences whose assignment did not reach species level
    mis_sid = []
    with open(output, "r") as fmis:
        for line in fmis:
            toks = line.split("\t")
            sid = toks[0]
            lvl = toks[1]
            conf = float(toks[4])
            if lvl != "Species":
                mis_sid += [sid]
    print "Leave-one-out test found %d suspicious sequences; running final test to check them..." % len(mis_sid)
    final_output = output + ".final"
    tmp_fname = "%s.%s" % (tmpprefix, "fin")
    findmis(refseq=seqs, reftax=ranks, name=mis_sid, method=method, foutput=final_output, tmpname=tmp_fname, refseq_fname=refseq)
    elapsed_time = time.time() - start_time
    print "\nProcessed %d sequences in %.0f seconds.\n" % (len(testings), elapsed_time)
# Parse "contig:start-end" strings from the command line into
# [contig, start, end] triples; "contig:" with no positions selects the
# whole contig.  (Python 2 script-level code; relies on args/gene_db/taxid
# defined earlier in the file.)
regions = []
for reg in args.target_regions:
    try:
        contig, raw_pos = reg.split(':')
        if not raw_pos:
            start, end = None, None
        else:
            start, end = map(int, raw_pos.split('-'))
        regions.append([contig.strip(), start, end])
    except Exception:
        print >>sys.stderr, 'ERROR: Invalid contig region. Use contigid:start-end syntax\n'
        raise
args.target_regions = regions
# NOTE(review): matching gene records are only printed, not appended to
# args.target_regions -- confirm whether that is intended.
if args.target_genes:
    for g in gene_db.find({"sp":int(taxid), "n":{"$in": args.target_genes}}, {"c":1, "s":1, "e":1}):
        print g
if not args.target_regions:
    # If not regions requested, scan all contigs completely
    args.target_regions = [ [None, None, None] ]
if args.refseqs:
    # ete2 is imported lazily: only needed when reference sequences are given
    from ete2 import SeqGroup
    args.refseqs = SeqGroup(args.refseqs)
main(args)
# Demo script (Python 2 / ete2): fetch the best tree and clean alignment for
# gene Hsa0000001 from PhylomeDB, attach truncated sequences, and reconcile
# the gene tree with a hard-coded species tree.
from ete2 import PhyloTree, PhylomeDBConnector, SeqGroup
p = PhylomeDBConnector()
w,x, t = p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
# keep only the first 30 alignment columns of every sequence
for s in A.id2seq:
    A.id2seq[s]=A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
# root the gene tree at the Dictyostelium sequence before reconciling
t.set_outgroup(t&"Ddi0002240")
sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split an EPA jplace result into per-edge alignment files.

    nfin_place: jplace-style json with "placements" and an edge-numbered
                "tree" ("{N}" labels).
    nfin_aln:   alignment holding both query reads and reference sequences.
    nfout:      output file-name prefix.
    min_lw:     placements whose best likelihood weight is below this are
                written to "<nfout>.discard.placement.txt" instead.

    Reads placed on a terminal branch yield "<nfout>_leaf_N.lfa" (queries
    plus the "*R*"-prefixed reference leaf); reads on internal branches
    yield "<nfout>_inode_N.ifa", with singletons diverted to the picked-otus
    fasta and logged.
    """
    # remove stale outputs from a previous run (both are appended to below)
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists(nfout + "_inode_picked_otus.fasta"):
        os.remove(nfout + "_inode_picked_otus.fasta")
    jsondata = open(nfin_place)
    align_orgin = SeqGroup(sequences = nfin_aln)
    data = json.load(jsondata)
    placements = data["placements"]
    tree = data["tree"]
    # rewrite jplace "{N}" edge labels as NHX comments so ete exposes node.B
    ete_tree = tree.replace("{", "[&&NHX:B=")
    ete_tree = ete_tree.replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)
    """get refseq"""
    refseqset = []
    for leaf in leaves:
        refseqset.append(leaf.name)
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
    placemap = {}
    """find how many edges are used for placement"""
    for placement in placements:
        edges = placement["p"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            placemap[curredge] = placemap.get(curredge, [])
    """placement quality control"""
    discard_file = open(nfout+".discard.placement.txt", "w")
    """group taxa to edges"""
    for placement in placements:
        edges = placement["p"]
        taxa_names = placement["n"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            a = placemap[curredge]
            a.extend(taxa_names)
            placemap[curredge] = a
        else:
            # best placement too uncertain: record and drop
            discard_file.write(repr(taxa_names) + "\n")
    discard_file.close()
    groups = placemap.items()
    cnt_leaf = 0
    cnt_inode = 0
    """check each edge"""
    for i,item in enumerate(groups):
        seqset_name = item[0]
        seqset = item[1]
        """check if placed on leaf node and find the node being placed on"""
        flag = False
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                if node.is_leaf():
                    flag = True
                break
        """generate aligment"""
        if flag:
            """process leaf node placement"""
            cnt_leaf = cnt_leaf + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            place_seq = align_orgin.get_seq(place_node.name)
            newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
            newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode = cnt_inode + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            if len(newalign.get_entries()) < 2:
                # single read: route to the picked-OTU fasta and log it
                count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
                sp_log(sfout = logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                # NOTE(review): the disabled block below reused `seq` instead of
                # `seqe`; the live extract_placement() variant fixes this.
                #for entr in refali.get_entries():
                #    sname = entr[0]
                #    seqe = entr[1]
                #    newalign.set_seq(sname, seq)
                newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split an EPA jplace result into per-edge alignments and constraint trees.

    nfin_place: jplace-style json ("placements" + edge-numbered "tree").
    nfin_aln:   alignment holding query reads and reference sequences.
    nfout:      output file-name prefix.
    min_lw:     minimum best likelihood weight; weaker placements go to
                "<nfout>.discard.placement.txt".

    Leaf-edge placements produce "<nfout>_leaf_N.lfa" (queries plus
    outgroup "root_ref", optional "sister", and the "*R*" reference leaf).
    Internal-edge placements produce "<nfout>_inode_N.ifa" plus a matching
    "<nfout>_inode_N.mttree" multifurcating constraint tree; singleton
    placements are diverted to the picked-OTU fasta and logged.

    BUG FIX: when merging the reference alignment into an internal-node
    alignment, the original wrote the stale query variable `seq` for every
    reference entry instead of the entry's own sequence.
    """
    # remove stale outputs of a previous run (both are appended to below)
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists(nfout + "_inode_picked_otus.fasta"):
        os.remove(nfout + "_inode_picked_otus.fasta")
    jsondata = open(nfin_place)
    align_orgin = SeqGroup(sequences=nfin_aln)
    data = json.load(jsondata)
    jsondata.close()
    placements = data["placements"]
    tree = data["tree"]
    # rewrite jplace "{N}" edge labels as NHX comments so ete exposes node.B
    ete_tree = tree.replace("{", "[&&NHX:B=")
    ete_tree = ete_tree.replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)
    # reference taxa = every leaf of the reference tree
    refseqset = []
    for leaf in leaves:
        refseqset.append(leaf.name)
    refali = gen_alignment2(seq_names=refseqset, alignment=align_orgin)
    # edge id -> names of reads whose best placement hits that edge
    placemap = {}
    for placement in placements:
        edges = placement["p"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            placemap[curredge] = placemap.get(curredge, [])
    discard_file = open(nfout + ".discard.placement.txt", "w")
    for placement in placements:
        edges = placement["p"]
        taxa_names = placement["n"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            placemap[curredge].extend(taxa_names)
        else:
            # best placement too uncertain: record and drop
            discard_file.write(repr(taxa_names) + "\n")
    discard_file.close()
    groups = placemap.items()
    cnt_leaf = 0
    cnt_inode = 0
    for i, item in enumerate(groups):
        seqset_name = item[0]
        seqset = item[1]
        # locate the placed edge's node; flag marks a terminal branch
        flag = False
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                if node.is_leaf():
                    flag = True
                break
        # outgroup: the leaf furthest from the placement node
        fnode = place_node.get_farthest_node()[0]
        outgroup_name = fnode.name
        # closest leaf of the sister clade
        snode = place_node.get_sisters()[0]
        if not snode.is_leaf():
            snode = snode.get_closest_leaf()[0]
        sister_name = snode.name
        if flag:
            # placement on a terminal branch
            cnt_leaf = cnt_leaf + 1
            newalign = SeqGroup()
            for taxa in seqset:
                newalign.set_seq(taxa, align_orgin.get_seq(taxa))
            if len(newalign.get_entries()) < 2:
                # single read: add the sister too so the alignment has 4 taxa
                og_seq = align_orgin.get_seq(outgroup_name)
                sis_seq = align_orgin.get_seq(sister_name)
                newalign.set_seq("sister", sis_seq)
                newalign.set_seq("root_ref", og_seq)
                place_seq = align_orgin.get_seq(place_node.name)
                newalign.set_seq("*R*" + place_node.name, place_seq)
                newalign.write(outfile=nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
            else:
                og_seq = align_orgin.get_seq(outgroup_name)
                newalign.set_seq("root_ref", og_seq)
                place_seq = align_orgin.get_seq(place_node.name)
                newalign.set_seq("*R*" + place_node.name, place_seq)
                newalign.write(outfile=nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            # placement on an internal branch: build a multifurcating
            # constraint tree attaching the reads at edge {seqset_name}
            rep = re.compile(r"\{[0-9]*\}")
            multi_fcating = "("
            for seqname in seqset:
                multi_fcating = multi_fcating + seqname + ","
            multi_fcating = multi_fcating[:-1]
            multi_fcating = "{" + repr(seqset_name) + "}," + multi_fcating + ")"
            mtfc_tree = tree.replace("{" + repr(seqset_name) + "}", multi_fcating)
            # strip all remaining edge-number labels
            mtfc_tree = rep.sub("", mtfc_tree)
            cnt_inode = cnt_inode + 1
            newalign = SeqGroup()
            for taxa in seqset:
                newalign.set_seq(taxa, align_orgin.get_seq(taxa))
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align=newalign, outputfile=nfout + "_inode_picked_otus.fasta")
                sp_log(sfout=logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                # merge the full reference alignment into the query alignment
                # (fixed: use each entry's own sequence, not the stale `seq`)
                for entr in refali.get_entries():
                    newalign.set_seq(entr[0], entr[1])
                newalign.write(outfile=nfout + "_inode_" + repr(cnt_inode) + ".ifa")
                mtfc_out = open(nfout + "_inode_" + repr(cnt_inode) + ".mttree", "w")
                mtfc_out.write(mtfc_tree)
                mtfc_out.close()
def autotest(refseq, reftax, testingtax, tf = "/home/zhangje/GIT/tax_benchmark/script/tmp/"):
    """Benchmark uclust/rdp/blast taxonomy assignment against a testing set.

    For every record in `testingtax`, runs findmis() with each of the three
    methods.  Records whose truth has 8 fields carry a deliberately
    mislabeled rank (index in field 8): a method scores "corrected" when it
    recovers the true rank.  Other records score "unchanged" when the method
    reproduces the truth exactly.  Per-method outputs and a summary are
    written next to `testingtax`.

    BUG FIX: the blast "unchanged" counter was incremented on
    `result_uclust == truth` in the original, so it silently tracked uclust.
    """
    def load_records(path):
        # each line: "<seq_id> <rank1;rank2;...>"
        records = []
        with open(path) as fo:
            for line in fo:
                ll = line.split()
                records.append([ll[0], ll[1].split(";")])
        return records

    testings = load_records(testingtax)
    seqs = SeqGroup(refseq)
    ranks = load_records(reftax)
    num_corrected_uclust = 0
    num_unchanged_uclust = 0
    num_corrected_rdp = 0
    num_unchanged_rdp = 0
    num_corrected_blast = 0
    num_unchanged_blast = 0
    f_uclust = open(testingtax + ".uclust", "w")
    f_rdp = open(testingtax + ".rdp", "w")
    f_blast = open(testingtax + ".blast", "w")
    f_mis = open(testingtax + ".misb", "w")
    f_umis = open(testingtax + ".umisb", "w")
    for test in testings:
        ru, result_uclust = findmis(refseq=seqs, reftax=ranks, name=test[0], method="uclust", temfolder=tf)
        f_uclust.write(ru)
        rr, result_rdp = findmis(refseq=seqs, reftax=ranks, name=test[0], method="rdp", temfolder=tf)
        f_rdp.write(rr)
        rb, result_blast = findmis(refseq=seqs, reftax=ranks, name=test[0], method="blast", temfolder=tf)
        f_blast.write(rb)
        truth = test[1]
        if len(truth) == 8:
            # field 8 is the index of the deliberately mislabeled rank
            f_mis.write(test[0] + " " + rank2string(truth[0:-1]) + "\n")
            rank_nr = int(truth[7])
            if len(result_uclust) > rank_nr and result_uclust[rank_nr] == truth[rank_nr]:
                num_corrected_uclust = num_corrected_uclust + 1
            if len(result_rdp) > rank_nr and result_rdp[rank_nr] == truth[rank_nr]:
                num_corrected_rdp = num_corrected_rdp + 1
            if len(result_blast) > rank_nr and result_blast[rank_nr] == truth[rank_nr]:
                num_corrected_blast = num_corrected_blast + 1
        else:
            f_umis.write(test[0] + " " + rank2string(truth) + "\n")
            if result_uclust == truth:
                num_unchanged_uclust = num_unchanged_uclust + 1
            if result_rdp == truth:
                num_unchanged_rdp = num_unchanged_rdp + 1
            # fixed: compare the blast result, not the uclust result
            if result_blast == truth:
                num_unchanged_blast = num_unchanged_blast + 1
        print("truth:" + repr(truth))
        print("uclust:" + repr(result_uclust))
        print("rdp:" + repr(result_rdp))
        print("blast:" + repr(result_blast))
    f_uclust.close()
    f_rdp.close()
    f_blast.close()
    f_mis.close()
    f_umis.close()
    print("method corrected unchanged")
    print("uclust" + " " + repr(num_corrected_uclust) + " " + repr(num_unchanged_uclust))
    print("rdp" + " " + repr(num_corrected_rdp) + " " + repr(num_unchanged_rdp))
    print("blast" + " " + repr(num_corrected_blast) + " " + repr(num_unchanged_blast))
    with open(testingtax + ".results", "w") as fo:
        fo.write("method corrected unchanged \n")
        fo.write("uclust" + " " + repr(num_corrected_uclust) + " " + repr(num_unchanged_uclust) + "\n")
        fo.write("rdp" + " " + repr(num_corrected_rdp) + " " + repr(num_unchanged_rdp) + "\n")
        fo.write("blast" + " " + repr(num_corrected_blast) + " " + repr(num_unchanged_blast) + "\n")
def get_ref_alignment(self):
    """Build a SeqGroup from the (name, sequence) pairs in self.jdata["sequences"]."""
    alignment = SeqGroup()
    for record in self.jdata["sequences"]:
        alignment.set_seq(record[0], record[1])
    return alignment