indir, outdir = core.getOutdir(ins, "run_gblocks", starttime) filelist = os.listdir(indir) print core.getTime() + " | Creating main output directory..." os.system("mkdir " + outdir) logfilename = outdir + "run_gblocks.log" logfile = open(logfilename, "w") logfile.write("") logfile.close() core.logCheck( l, logfilename, "=======================================================================") core.logCheck(l, logfilename, "\t\t\tMasking alignments with GBlocks") core.logCheck(l, logfilename, "\t\t\t" + core.getDateTime()) if fileflag == 1: core.logCheck(l, logfilename, "INPUT | Masking alignment from file: " + ins) else: core.logCheck(l, logfilename, "INPUT | Masking alignments from all files in: " + indir) core.logCheck(l, logfilename, "INFO | GBlocks path set to: " + gb_path) core.logCheck(l, logfilename, "INFO | Sequence type set to: " + seqtype) if m == 1: core.logCheck( l, logfilename, "INFO | Only accepting alignments with < 20% of sequence masked (for tree making)." ) else: core.logCheck(l, logfilename,
# Command-line handling for count_pos.py: the script name plus an input path,
# optionally followed by a 0/1 flag that toggles per-sequence output.
# A bare invocation (len(sys.argv) == 1) must show the usage message too;
# otherwise the sys.argv[1] lookup below fails with an IndexError.
if len(sys.argv) not in [2, 3]:
    print("Usage:\t$ count_pos.py [input directory or filename] [1,0 to display individual file counts or not]")
    sys.exit()

ins = sys.argv[1]
disp_file = 0
if len(sys.argv) > 2:
    disp_file = sys.argv[2]
    if disp_file not in ["0", "1"]:
        # Anything other than "0"/"1" falls back to not printing file counts.
        print("Not printing file counts.")
        disp_file = 0
disp_file = int(disp_file)

print("=======================================================================")
print("\t\t\t" + core.getDateTime())
print("Counting the total number of positions (AAs or NTs) in:\t" + ins)

if os.path.isfile(ins):
    # Single-file input: sum the length of every sequence in the file.
    if disp_file == 1:
        print("----------")
        print("Sequence\tLength")
    inseqs = core.fastaGetDict(ins)
    tot_pos = 0
    for seq in inseqs:
        if disp_file == 1:
            print(seq + "\t" + str(len(inseqs[seq])))
        tot_pos = tot_pos + len(inseqs[seq])
    print("----------")
    print("Total sequences:\t" + str(len(inseqs)))
    print("Total positions:\t" + str(tot_pos))
else: fileflag = 0; indir, outdir = core.getOutdir(ins, "run_gblocks", starttime); filelist = os.listdir(indir); print core.getTime() + " | Creating main output directory..."; os.system("mkdir " + outdir); logfilename = outdir + "run_gblocks.log"; logfile = open(logfilename, "w"); logfile.write(""); logfile.close(); core.logCheck(l, logfilename, "======================================================================="); core.logCheck(l, logfilename, "\t\t\tMasking alignments with GBlocks"); core.logCheck(l, logfilename, "\t\t\t" + core.getDateTime()); if fileflag == 1: core.logCheck(l, logfilename, "INPUT | Masking alignment from file: " + ins); else: core.logCheck(l, logfilename, "INPUT | Masking alignments from all files in: " + indir); core.logCheck(l, logfilename, "INFO | GBlocks path set to: " + gb_path); core.logCheck(l, logfilename, "INFO | Sequence type set to: " + seqtype); if m == 1: core.logCheck(l, logfilename, "INFO | Only accepting alignments with < 20% of sequence masked (for tree making)."); else: core.logCheck(l, logfilename, "INFO | Using default GBlocks settings (stringent)."); if v == 1: core.logCheck(l, logfilename, "INFO | Printing all GBlocks output to the screen."); else: core.logCheck(l, logfilename, "INFO | Silent mode. Not printing GBlocks output to the screen."); core.logCheck(l, logfilename, "OUTPUT | An output directory has been created within the input directory called: " + outdir);
tid_to_eids = {} new_seqdict = {} for title in seqdict: tid = title.split("|")[1] eids = title.split("|")[2].split(";") tid_to_eids[tid] = eids return tid_to_eids ############################################################ transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt" ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab" core.PWS("# " + core.getDateTime() + " Reading selected transcripts for exome analysis: " + transcripts_file) mtid_to_rpid = {} first = True for line in open(transcripts_file): if line[0] == "#": continue if first: first = False continue # Skip comment and header lines. line = line.strip().split("\t") mtid_to_rpid[line[1]] = line[6] # Add the mouse transcript id to the transcripts dict
# Command-line handling for count_pos.py: the script name plus an input path,
# optionally followed by a 0/1 flag that toggles per-sequence output.
# The original guard also accepted len(sys.argv) == 1, which skipped the usage
# message and then crashed on sys.argv[1]; only 2 or 3 entries are valid.
if len(sys.argv) not in [2, 3]:
    print("Usage:\t$ count_pos.py [input directory or filename] [1,0 to display individual file counts or not]")
    sys.exit()

ins = sys.argv[1]
disp_file = 0
if len(sys.argv) > 2:
    disp_file = sys.argv[2]
    if disp_file not in ["0", "1"]:
        # Unrecognised flag values are treated as "do not print file counts".
        print("Not printing file counts.")
        disp_file = 0
disp_file = int(disp_file)

print("=======================================================================")
print("\t\t\t" + core.getDateTime())
print("Counting the total number of positions (AAs or NTs) in:\t" + ins)

if os.path.isfile(ins):
    # Single-file input: report each sequence length (optional) and the total.
    if disp_file == 1:
        print("----------")
        print("Sequence\tLength")
    inseqs = core.fastaGetDict(ins)
    tot_pos = 0
    for seq in inseqs:
        seq_len = len(inseqs[seq])
        if disp_file == 1:
            print(seq + "\t" + str(seq_len))
        tot_pos = tot_pos + seq_len
    print("----------")
    print("Total sequences:\t" + str(len(inseqs)))
    print("Total positions:\t" + str(tot_pos))
sys.exit(" * Error 3: An output directory must be defined with -o.") if os.path.isdir(args.outdir) and not args.overwrite: sys.exit( " * Error 4: Output directory (-o) already exists! Explicity specify --overwrite to overwrite it." ) if not os.path.isdir(args.outdir): os.system("mkdir " + args.outdir) args.outdir = os.path.abspath(args.outdir) # IO option error checking prequal_dir = False # Maybe add functionality for prequal filtered alignments later. core.PWS("# " + core.getDateTime() + " Starting back translation.") aa_files = [f for f in os.listdir(args.aa_dir) if f.endswith(".fa")] num_files = len(aa_files) # Read the AA alignment file names. counter = 0 for f in aa_files: if counter % 10 == 0: print(counter, "/", num_files) #print(counter) counter += 1 pid = f.split("-")[0].replace(".fa", "") #if pid != "ENSMUSP00000021056": # continue; # Get the protein id by splitting the file name by - and removing the extension.
dataset) #### exclude_samples = [] add_rat = False add_mouse = False rm_samples = False rmdir = "../03-Alignments/samples-to-rm/" # Job variables #### orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab" # The ortholog file between mouse and rat. core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " + orthfile) orth_tids = {} for line in open(orthfile): line = line.strip().split("\t") orth_tids[line[1]] = line[4] # Add the related gene ids to the orths dict. mouse_tids = set(list(orth_tids.keys())) rat_tids = set(list(orth_tids.values())) core.PWS("# " + core.getDateTime() + " Transcripts read: " + str(len(orth_tids))) core.PWS("# ----------------") # Read the list of selected transcripts from the master table, with mouse and rat IDs ####
if tid not in transcripts: transcripts[tid] = { 'coding-exons': 0, 'total-exons': 0 } transcripts[tid]['total-exons'] += 1 if coding == "TRUE": transcripts[tid]['coding-exons'] += 1 core.PWS("# Total transcripts read: " + str(len(transcripts))) core.PWS("# ----------------") # Reads the mouse and target annotation info. core.PWS("# " + core.getDateTime() + " Reading BLAST hits per transcript: " + blast_file) samples = {} for line in open(blast_file): line = line.strip().split(" ") #print(line); sample, tid = line[2], line[5] if tid == "NA": continue if sample not in samples: samples[sample] = {} if tid not in samples[sample]:
############################################################ import sys, os, core ############################################################ orthfile = "../02-Annotation-data/mouse-rat-orths-ens99.txt" transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt" blastfile = "../03-Alignments/blast/mm10-exon-to-rnor6/exon-to-exon-hits-bit.txt" mousefile = "../Reference-genomes/mm10/mm10-ens99-ids.tab" ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab" outfilename = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab" headers = ["mgid", "mtid", "mpid", "meid", "reid", "rpid", "rtid", "rgid"] core.PWS("# " + core.getDateTime() + " Reading mouse-rat one-to-one orthologs: " + orthfile) orths = {} # Dict to convert between mouse transcript IDs and rat protein IDs first = True for line in open(orthfile): if first: first = False continue # Skip the header line = line.strip().split("\t") if len(line) < 6: continue # If there are no orths, skip.
def convCheck(cur_c, c, number_specs, d, ins, outs):
    """Run replicate pairwise convergence/divergence scans over a directory of alignments.

    For every replicate from ``cur_c + 1`` through ``c``, ``number_specs``
    species are drawn without replacement from the values of the module-level
    ``all_specs`` dict. Each ordered pair of drawn species is used as the
    target pair and the remaining drawn species as the backgrounds. Every
    entry of ``ins`` whose name contains ".fa" is read with
    ``core.fastaGetDict`` and scanned column by column; matching sites are
    written as tab-delimited rows to ``outs + "_" + <replicate> + ".txt"``.

    Parameters:
        cur_c: Number of replicates already done; numbering starts at cur_c + 1.
        c: Replicate count to stop at (the loop runs while cur_c < c).
        number_specs: How many species to draw for each replicate.
        d: 0 reports sites where all targets share one state that no
            background has; 1 reports sites where the targets differ, the
            backgrounds all agree, and no target state occurs among the
            backgrounds. Any other value writes only the header and summary.
        ins: Directory containing the alignment files.
        outs: Output file name prefix.

    Returns:
        None. Results are written to the per-replicate output files and, for
        d == 0, each hit is also echoed to stdout.

    Besides ``all_specs`` this reads the module-level names ``indir``,
    ``ropt``, ``core`` and ``remGapMiss``.
    """
    init_c = cur_c + 1
    while cur_c < c:
        # Draw this replicate's species without replacement.
        spec_list = list(all_specs.values())
        rep_specs = []
        while len(rep_specs) < number_specs:
            r = random.choice(spec_list)
            rep_specs.append(r)
            spec_list.remove(r)

        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        numsites = 0
        totgenes = 0

        # The with-block guarantees the replicate file is flushed and closed
        # before the next replicate opens a new one.
        with open(outfilename, "w") as outfile:
            outfile.write("# ==============================================================================================\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Randomly choosing " + str(number_specs) + " species and performing " + str(c) + " replicate tests for convergence.\n")
            outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            # Terminate the species line so the separator starts on its own line.
            outfile.write("# Using species:\t" + ",".join(rep_specs) + "\n")
            outfile.write("# ---------------------------------------------\n")

            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")

            for align in os.listdir(ins):
                if align.find(".fa") == -1:
                    continue

                infilename = os.path.join(ins, align)
                gid = "_".join(align.split("_")[:2])
                chrome = align[align.find("chr"):align.find("chr") + 4]
                inseqs = core.fastaGetDict(infilename)

                # Species label per title (leading character dropped, newlines
                # removed); computed once per alignment instead of per site.
                spec_of = {}
                for title in inseqs:
                    spec_of[title] = title[1:].replace("\n", "")

                for t1 in rep_specs:
                    for t2 in rep_specs:
                        if t1 == t2:
                            continue
                        targets = [t1, t2]
                        backgrounds = [spec for spec in rep_specs if spec not in targets]

                        # Presence is judged by substring match on the title.
                        num_targets_present = 0
                        num_bg_present = 0
                        for title in inseqs:
                            if any(t in title for t in targets):
                                num_targets_present = num_targets_present + 1
                            if any(b in title for b in backgrounds):
                                num_bg_present = num_bg_present + 1

                        if not (num_targets_present == len(targets) and num_bg_present == len(backgrounds)):
                            continue

                        totgenes = totgenes + 1
                        seqlen = len(inseqs[list(inseqs.keys())[0]])

                        # Titles feeding each allele dict, kept in inseqs order.
                        t_titles = [title for title in inseqs if spec_of[title] in targets]
                        b_titles = [title for title in inseqs if spec_of[title] in backgrounds]
                        t_alleles = {}
                        b_alleles = {}

                        for x in range(seqlen):
                            for title in t_titles:
                                t_alleles[spec_of[title]] = inseqs[title][x]
                            for title in b_titles:
                                b_alleles[spec_of[title]] = inseqs[title][x]

                            # remGapMiss is defined elsewhere; presumably it
                            # drops gap/missing states -- confirm there.
                            t_final = remGapMiss(list(t_alleles.values()))
                            b_final = remGapMiss(list(b_alleles.values()))

                            if t_final == [] or b_final == []:
                                continue
                            # Every target and background must have kept a state.
                            if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                                continue

                            targets_identical = t_final.count(t_final[0]) == len(t_final)
                            if d == 0:
                                is_hit = targets_identical and t_final[0] not in b_final
                            elif d == 1:
                                is_hit = (not targets_identical
                                          and b_final.count(b_final[0]) == len(b_final)
                                          and not any(t in b_final for t in t_final))
                            else:
                                is_hit = False
                            if not is_hit:
                                continue

                            numsites = numsites + 1
                            if d == 0:
                                print(core.getTime() + " Convergent site found!")
                                print("Filename:\t\t" + align)
                                print("Chromosome:\t\t" + chrome)
                                print("Gene ID:\t\t" + gid)
                                print("Alignment length\t" + str(seqlen))
                                print("Target alleles:\t\t" + "".join(t_final))
                                print("Background alleles:\t" + "".join(b_final))
                                print("---------------")
                            outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                            outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write("# ==============================================================================================")
        cur_c = cur_c + 1

    # NOTE(review): ropt is not a parameter here; it is read from module
    # scope exactly as before -- confirm it is defined wherever this is called.
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
core.spacedOut("# --overwrite set:", pad) + "Overwriting previous files in output directory.", logfile) if args.count_only: core.PWS( core.spacedOut("# --count set:", pad) + "Will not output sequences.", logfile) core.PWS(core.spacedOut("# Log file:", pad) + log_file, logfile) core.PWS("# ----------------", logfile) args.seq_filter = args.seq_filter / 100 args.site_filter = args.site_filter / 100 ########################## # Filtering CDS aligns core.PWS("# " + core.getDateTime() + " Beginning filter.", logfile) rm_stop_codons, rm_gappy, rm_protein_gappy = [], [], [] pre_samples, pre_proteins = 0, 0 post_samples, post_proteins = 0, 0 aln_stats = [ "num seqs", "codon aln length", "avg nongap length", "uniq seqs", "ident seqs", "gappy seqs", "invariant sites", "stop codons", "percent sites with gap", "gappy sites" ] aln_headers = ["align"] + ["pre " + s for s in aln_stats] if not args.count_only: aln_headers += ["sites filtered"] + ["post " + s for s in aln_stats] core.PWS("\t".join(aln_headers), logfile) # The alignment global headers
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan a directory of alignments for convergent or divergent sites.

    Runs replicates ``cur_c + 1`` through ``c``. When ``ropt`` is non-zero
    each replicate discards the ``backgrounds`` argument and instead draws
    ``ropt`` distinct non-target species from the values of the module-level
    ``all_specs`` dict, writing to ``outs + "_" + <replicate> + ".txt"``;
    when ``ropt`` is 0 the given backgrounds are used and output goes to
    ``outs + ".txt"``. Every entry of ``ins`` whose name contains ".fa" is
    read with ``core.fastaGetDict`` and scanned column by column.

    Parameters:
        cur_c: Number of replicates already done; numbering starts at cur_c + 1.
        c: Replicate count to stop at (the loop runs while cur_c < c).
        ropt: Number of random background species per replicate, or 0 to use
            ``backgrounds`` as passed.
        targets: Target species labels, matched as substrings of the titles.
        backgrounds: Background species labels (ignored when ropt != 0).
        d: 0 reports sites where all targets share one state that no
            background has; 1 reports sites where the targets differ, the
            backgrounds all agree, and no target state occurs among the
            backgrounds. Any other value writes only the header and summary.
        ins: Directory containing the alignment files.
        outs: Output file name prefix.

    Returns:
        None. Results are written to the output file(s).

    Raises:
        ValueError: if ``ropt`` exceeds the number of distinct non-target
            species available (the selection loop could never finish).

    Besides ``all_specs`` this reads the module-level names ``indir``,
    ``core`` and ``remGapMiss``.
    """
    init_c = cur_c + 1
    species_pool = list(all_specs.values())

    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"

            # Rejection sampling below only terminates if enough distinct
            # non-target species exist, so check that up front.
            eligible = set(species_pool) - set(targets)
            if ropt > len(eligible):
                raise ValueError("Cannot choose " + str(ropt) + " background species: only " + str(len(eligible)) + " non-target species are available.")

            backgrounds = []
            while len(backgrounds) < ropt:
                chosenspec = random.choice(species_pool)
                if chosenspec not in targets and chosenspec not in backgrounds:
                    backgrounds.append(chosenspec)
        else:
            outfilename = outs + ".txt"

        numsites = 0
        totgenes = 0

        # The with-block guarantees the file is flushed and closed before the
        # next replicate starts.
        with open(outfilename, "w") as outfile:
            outfile.write("# ==============================================================================================\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n")
            if ropt != 0:
                # ropt is the number of backgrounds actually drawn above.
                outfile.write("# Randomly choosing " + str(ropt) + " background species and performing " + str(c) + " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Background species:\t\t" + ", ".join(backgrounds) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")

            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")

            for align in os.listdir(ins):
                if align.find(".fa") == -1:
                    continue

                # os.path.join works whether or not ins ends with a separator.
                infilename = os.path.join(ins, align)
                gid = "_".join(align.split("_")[:2])
                chrome = align[align.find("chr"):align.find("chr") + 4]
                inseqs = core.fastaGetDict(infilename)

                num_targets_present = 0
                num_bg_present = 0
                for title in inseqs:
                    if any(t in title for t in targets):
                        num_targets_present = num_targets_present + 1
                    if any(b in title for b in backgrounds):
                        num_bg_present = num_bg_present + 1

                if not (num_targets_present == len(targets) and num_bg_present == len(backgrounds)):
                    continue

                totgenes = totgenes + 1
                seqlen = len(inseqs[list(inseqs.keys())[0]])

                # (species, title) matches resolved once per alignment, in the
                # same order the per-site loops would visit them.
                t_pairs = [(t, title) for title in inseqs for t in targets if t in title]
                b_pairs = [(b, title) for title in inseqs for b in backgrounds if b in title]
                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    for t, title in t_pairs:
                        t_alleles[t] = inseqs[title][x]
                    for b, title in b_pairs:
                        b_alleles[b] = inseqs[title][x]

                    # remGapMiss is defined elsewhere; presumably it drops
                    # gap/missing states -- confirm there.
                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))

                    if t_final == [] or b_final == []:
                        continue
                    # Every target and background must have kept a state.
                    if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                        continue

                    targets_identical = t_final.count(t_final[0]) == len(t_final)
                    if d == 0:
                        is_hit = targets_identical and t_final[0] not in b_final
                    elif d == 1:
                        is_hit = (not targets_identical
                                  and b_final.count(b_final[0]) == len(b_final)
                                  and not any(t in b_final for t in t_final))
                    else:
                        is_hit = False
                    if not is_hit:
                        continue

                    numsites = numsites + 1
                    outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                    outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write("# ==============================================================================================")
        cur_c = cur_c + 1

    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
orthfile = "master-transcript-id-table.tab" # The ortholog file between mouse and rat. orth_tids = {} first = True for line in open(orthfile): if first: first = False continue line = line.strip().split("\t") orth_tids[line[1]] = line[4] # Add the related gene ids to the orths dict. mouse_tids = set(list(orth_tids.keys())) rat_tids = set(list(orth_tids.values())) core.PWS("# " + core.getDateTime() + " Orthologs read: " + str(len(orth_tids))) core.PWS("# ----------------") core.PWS("# " + core.getDateTime() + " Reading BLAST file: " + args.blastfile) query_hits = defaultdict(list) total_hits, query_ids, target_ids = 0, [], [] for line in open(args.blastfile): total_hits += 1 line = line.strip().split("\t") query_gid = line[0].split("|")[0] query_tids = set(line[0].split("|")[1].split(";")) query_eid = line[0].split("|")[2] target_gid = line[1].split("|")[0] target_tids = set(line[1].split("|")[1].split(";")) target_eid = line[1].split("|")[2] aln_len = int(line[3])
def convCheck(cur_c, c, number_specs, d, ins, outs):
    """Run replicate pairwise convergence/divergence scans over a directory of alignments.

    For every replicate from ``cur_c + 1`` through ``c``, ``number_specs``
    species are drawn without replacement from the values of the module-level
    ``all_specs`` dict. Each ordered pair of drawn species is used as the
    target pair and the remaining drawn species as the backgrounds. Every
    entry of ``ins`` whose name contains ".fa" is read with
    ``core.fastaGetDict`` and scanned column by column; matching sites are
    written as tab-delimited rows to ``outs + "_" + <replicate> + ".txt"``.

    Parameters:
        cur_c: Number of replicates already done; numbering starts at cur_c + 1.
        c: Replicate count to stop at (the loop runs while cur_c < c).
        number_specs: How many species to draw for each replicate.
        d: 0 reports sites where all targets share one state that no
            background has; 1 reports sites where the targets differ, the
            backgrounds all agree, and no target state occurs among the
            backgrounds. Any other value writes only the header and summary.
        ins: Directory containing the alignment files.
        outs: Output file name prefix.

    Returns:
        None. Results are written to the per-replicate output files and, for
        d == 0, each hit is also echoed to stdout.

    Besides ``all_specs`` this reads the module-level names ``indir``,
    ``ropt``, ``core`` and ``remGapMiss``.
    """
    init_c = cur_c + 1
    while cur_c < c:
        # Draw this replicate's species without replacement.
        spec_list = list(all_specs.values())
        rep_specs = []
        while len(rep_specs) < number_specs:
            r = random.choice(spec_list)
            rep_specs.append(r)
            spec_list.remove(r)

        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        numsites = 0
        totgenes = 0

        # Closing the file per replicate avoids leaking one handle per pass.
        with open(outfilename, "w") as outfile:
            outfile.write(
                "# ==============================================================================================\n"
            )
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Randomly choosing " + str(number_specs) +
                          " species and performing " + str(c) +
                          " replicate tests for convergence.\n")
            outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            # The species line needs its own newline; without it the
            # separator below is glued onto the end of the species list.
            outfile.write("# Using species:\t" + ",".join(rep_specs) + "\n")
            outfile.write("# ---------------------------------------------\n")

            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )

            for align in os.listdir(ins):
                if align.find(".fa") == -1:
                    continue

                infilename = os.path.join(ins, align)
                gid = "_".join(align.split("_")[:2])
                chrome = align[align.find("chr"):align.find("chr") + 4]
                inseqs = core.fastaGetDict(infilename)

                # Species label per title (leading character dropped, newlines
                # removed); computed once per alignment instead of per site.
                spec_of = {}
                for title in inseqs:
                    spec_of[title] = title[1:].replace("\n", "")

                for t1 in rep_specs:
                    for t2 in rep_specs:
                        if t1 == t2:
                            continue
                        targets = [t1, t2]
                        backgrounds = [
                            spec for spec in rep_specs if spec not in targets
                        ]

                        # Presence is judged by substring match on the title.
                        num_targets_present = 0
                        num_bg_present = 0
                        for title in inseqs:
                            if any(t in title for t in targets):
                                num_targets_present = num_targets_present + 1
                            if any(b in title for b in backgrounds):
                                num_bg_present = num_bg_present + 1

                        all_present = (num_targets_present == len(targets)
                                       and num_bg_present == len(backgrounds))
                        if not all_present:
                            continue

                        totgenes = totgenes + 1
                        seqlen = len(inseqs[list(inseqs.keys())[0]])

                        # Titles feeding each allele dict, kept in inseqs order.
                        t_titles = [
                            title for title in inseqs
                            if spec_of[title] in targets
                        ]
                        b_titles = [
                            title for title in inseqs
                            if spec_of[title] in backgrounds
                        ]
                        t_alleles = {}
                        b_alleles = {}

                        for x in range(seqlen):
                            for title in t_titles:
                                t_alleles[spec_of[title]] = inseqs[title][x]
                            for title in b_titles:
                                b_alleles[spec_of[title]] = inseqs[title][x]

                            # remGapMiss is defined elsewhere; presumably it
                            # drops gap/missing states -- confirm there.
                            t_final = remGapMiss(list(t_alleles.values()))
                            b_final = remGapMiss(list(b_alleles.values()))

                            if t_final == [] or b_final == []:
                                continue
                            # Every target and background must keep a state.
                            if (len(t_final) != len(targets)
                                    or len(b_final) != len(backgrounds)):
                                continue

                            targets_identical = (
                                t_final.count(t_final[0]) == len(t_final))
                            if d == 0:
                                is_hit = (targets_identical
                                          and t_final[0] not in b_final)
                            elif d == 1:
                                is_hit = (
                                    not targets_identical
                                    and b_final.count(b_final[0]) == len(b_final)
                                    and not any(t in b_final for t in t_final))
                            else:
                                is_hit = False
                            if not is_hit:
                                continue

                            numsites = numsites + 1
                            if d == 0:
                                print(core.getTime() + " Convergent site found!")
                                print("Filename:\t\t" + align)
                                print("Chromosome:\t\t" + chrome)
                                print("Gene ID:\t\t" + gid)
                                print("Alignment length\t" + str(seqlen))
                                print("Target alleles:\t\t" + "".join(t_final))
                                print("Background alleles:\t" + "".join(b_final))
                                print("---------------")
                            outline = "\t".join([
                                str(numsites),
                                ",".join(targets),
                                chrome,
                                gid,
                                str(seqlen),
                                str(x + 1),
                                "".join(t_final),
                                "".join(b_final),
                            ]) + "\n"
                            outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(
                "# =============================================================================================="
            )
        cur_c = cur_c + 1

    # NOTE(review): ropt is not a parameter here; it is read from module
    # scope exactly as before -- confirm it is defined wherever this is called.
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) +
              " complete.")
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan a directory of alignments for convergent or divergent sites.

    Runs replicates ``cur_c + 1`` through ``c``. When ``ropt`` is non-zero
    each replicate discards the ``backgrounds`` argument and instead draws
    ``ropt`` distinct non-target species from the values of the module-level
    ``all_specs`` dict, writing to ``outs + "_" + <replicate> + ".txt"``;
    when ``ropt`` is 0 the given backgrounds are used and output goes to
    ``outs + ".txt"``. Every entry of ``ins`` whose name contains ".fa" is
    read with ``core.fastaGetDict`` and scanned column by column.

    Parameters:
        cur_c: Number of replicates already done; numbering starts at cur_c + 1.
        c: Replicate count to stop at (the loop runs while cur_c < c).
        ropt: Number of random background species per replicate, or 0 to use
            ``backgrounds`` as passed.
        targets: Target species labels, matched as substrings of the titles.
        backgrounds: Background species labels (ignored when ropt != 0).
        d: 0 reports sites where all targets share one state that no
            background has; 1 reports sites where the targets differ, the
            backgrounds all agree, and no target state occurs among the
            backgrounds. Any other value writes only the header and summary.
        ins: Directory containing the alignment files.
        outs: Output file name prefix.

    Returns:
        None. Results are written to the output file(s).

    Raises:
        ValueError: if ``ropt`` exceeds the number of distinct non-target
            species available (the selection loop could never finish).

    Besides ``all_specs`` this reads the module-level names ``indir``,
    ``core`` and ``remGapMiss``.
    """
    init_c = cur_c + 1
    species_pool = list(all_specs.values())

    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"

            # Rejection sampling below only terminates if enough distinct
            # non-target species exist, so check that up front.
            eligible = set(species_pool) - set(targets)
            if ropt > len(eligible):
                raise ValueError("Cannot choose " + str(ropt) +
                                 " background species: only " +
                                 str(len(eligible)) +
                                 " non-target species are available.")

            backgrounds = []
            while len(backgrounds) < ropt:
                chosenspec = random.choice(species_pool)
                if chosenspec not in targets and chosenspec not in backgrounds:
                    backgrounds.append(chosenspec)
        else:
            outfilename = outs + ".txt"

        numsites = 0
        totgenes = 0

        # Closing the file per replicate avoids leaking one handle per pass.
        with open(outfilename, "w") as outfile:
            outfile.write(
                "# ==============================================================================================\n"
            )
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n")
            if ropt != 0:
                # Report ropt, the count actually drawn above, rather than an
                # undeclared module-level name.
                outfile.write("# Randomly choosing " + str(ropt) +
                              " background species and performing " + str(c) +
                              " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) +
                              "\n")
            outfile.write("# Background species:\t\t" + ", ".join(backgrounds) +
                          "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")

            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )

            for align in os.listdir(ins):
                if align.find(".fa") == -1:
                    continue

                # os.path.join works whether or not ins ends with a separator.
                infilename = os.path.join(ins, align)
                gid = "_".join(align.split("_")[:2])
                chrome = align[align.find("chr"):align.find("chr") + 4]
                inseqs = core.fastaGetDict(infilename)

                num_targets_present = 0
                num_bg_present = 0
                for title in inseqs:
                    if any(t in title for t in targets):
                        num_targets_present = num_targets_present + 1
                    if any(b in title for b in backgrounds):
                        num_bg_present = num_bg_present + 1

                all_present = (num_targets_present == len(targets)
                               and num_bg_present == len(backgrounds))
                if not all_present:
                    continue

                totgenes = totgenes + 1
                seqlen = len(inseqs[list(inseqs.keys())[0]])

                # (species, title) matches resolved once per alignment, in the
                # same order the per-site loops would visit them.
                t_pairs = [(t, title) for title in inseqs for t in targets
                           if t in title]
                b_pairs = [(b, title) for title in inseqs for b in backgrounds
                           if b in title]
                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    for t, title in t_pairs:
                        t_alleles[t] = inseqs[title][x]
                    for b, title in b_pairs:
                        b_alleles[b] = inseqs[title][x]

                    # remGapMiss is defined elsewhere; presumably it drops
                    # gap/missing states -- confirm there.
                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))

                    if t_final == [] or b_final == []:
                        continue
                    # Every target and background must have kept a state.
                    if (len(t_final) != len(targets)
                            or len(b_final) != len(backgrounds)):
                        continue

                    targets_identical = (
                        t_final.count(t_final[0]) == len(t_final))
                    if d == 0:
                        is_hit = targets_identical and t_final[0] not in b_final
                    elif d == 1:
                        is_hit = (not targets_identical
                                  and b_final.count(b_final[0]) == len(b_final)
                                  and not any(t in b_final for t in t_final))
                    else:
                        is_hit = False
                    if not is_hit:
                        continue

                    numsites = numsites + 1
                    outline = "\t".join([
                        str(numsites),
                        chrome,
                        gid,
                        str(seqlen),
                        str(x + 1),
                        "".join(t_final),
                        "".join(b_final),
                    ]) + "\n"
                    outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(
                "# =============================================================================================="
            )
        cur_c = cur_c + 1

    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) +
              " complete.")