예제 #1
0
    indir, outdir = core.getOutdir(ins, "run_gblocks", starttime)
    filelist = os.listdir(indir)

print core.getTime() + " | Creating main output directory..."
os.system("mkdir " + outdir)

logfilename = outdir + "run_gblocks.log"
logfile = open(logfilename, "w")
logfile.write("")
logfile.close()

core.logCheck(
    l, logfilename,
    "=======================================================================")
core.logCheck(l, logfilename, "\t\t\tMasking alignments with GBlocks")
core.logCheck(l, logfilename, "\t\t\t" + core.getDateTime())
if fileflag == 1:
    core.logCheck(l, logfilename,
                  "INPUT    | Masking alignment from file: " + ins)
else:
    core.logCheck(l, logfilename,
                  "INPUT    | Masking alignments from all files in: " + indir)
core.logCheck(l, logfilename, "INFO     | GBlocks path set to: " + gb_path)
core.logCheck(l, logfilename, "INFO     | Sequence type set to: " + seqtype)
if m == 1:
    core.logCheck(
        l, logfilename,
        "INFO     | Only accepting alignments with < 20% of sequence masked (for tree making)."
    )
else:
    core.logCheck(l, logfilename,
예제 #2
0
파일: count_pos.py 프로젝트: rtraborn/core
if len(sys.argv) not in [1, 2, 3]:
    print "Usage:\t$ count_pos.py [input directory or filename] [1,0 to display individual file counts or not]"
    sys.exit()

ins = sys.argv[1]
disp_file = 0
if len(sys.argv) > 2:
    disp_file = sys.argv[2]
if disp_file not in ["0", "1"]:
    print "Not printing file counts."
    disp_file = 0

disp_file = int(disp_file)

# Report header: banner, time stamp and the input being counted.
print "======================================================================="
print "\t\t\t" + core.getDateTime()
print "Counting the total number of positions (AAs or NTs) in:\t" + ins

# Single-file case: sum the lengths of all entries returned for the file.
if os.path.isfile(ins):
    if disp_file == 1:
        # Column header for the per-sequence listing.
        print "----------"
        print "Sequence\tLength"
    # fastaGetDict is iterated by key and indexed for a len()-able value;
    # presumably it maps sequence title -> sequence string (confirm in core).
    inseqs = core.fastaGetDict(ins)
    tot_pos = 0
    for seq in inseqs:
        if disp_file == 1:
            print seq + "\t" + str(len(inseqs[seq]))
        tot_pos = tot_pos + len(inseqs[seq])
    # Totals are printed regardless of the display flag.
    print "----------"
    print "Total sequences:\t" + str(len(inseqs))
    print "Total positions:\t" + str(tot_pos)
예제 #3
0
파일: run_gblocks.py 프로젝트: gwct/core
else:
	fileflag = 0;
	indir, outdir = core.getOutdir(ins, "run_gblocks", starttime);
	filelist = os.listdir(indir);

print core.getTime() + " | Creating main output directory...";
os.system("mkdir " + outdir);

logfilename = outdir + "run_gblocks.log";
logfile = open(logfilename, "w");
logfile.write("");
logfile.close();

core.logCheck(l, logfilename, "=======================================================================");
core.logCheck(l, logfilename, "\t\t\tMasking alignments with GBlocks");
core.logCheck(l, logfilename, "\t\t\t" + core.getDateTime());
if fileflag == 1:
	core.logCheck(l, logfilename, "INPUT    | Masking alignment from file: " + ins);
else:
	core.logCheck(l, logfilename, "INPUT    | Masking alignments from all files in: " + indir);
core.logCheck(l, logfilename, "INFO     | GBlocks path set to: " + gb_path);
core.logCheck(l, logfilename, "INFO     | Sequence type set to: " + seqtype);
if m == 1:
	core.logCheck(l, logfilename, "INFO     | Only accepting alignments with < 20% of sequence masked (for tree making).");
else:
	core.logCheck(l, logfilename, "INFO     | Using default GBlocks settings (stringent).");
if v == 1:
	core.logCheck(l, logfilename, "INFO     | Printing all GBlocks output to the screen.");
else:
	core.logCheck(l, logfilename, "INFO     | Silent mode. Not printing GBlocks output to the screen.");
core.logCheck(l, logfilename, "OUTPUT   | An output directory has been created within the input directory called: " + outdir);
    tid_to_eids = {}

    new_seqdict = {}
    for title in seqdict:
        tid = title.split("|")[1]
        eids = title.split("|")[2].split(";")
        tid_to_eids[tid] = eids
    return tid_to_eids


############################################################

# Input tables, given relative to the working directory.
transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"

core.PWS("# " + core.getDateTime() +
         " Reading selected transcripts for exome analysis: " +
         transcripts_file)
# Maps column 2 to column 7 of each data row (0-based fields 1 and 6).
# Going by the name, presumably mouse transcript ID -> rat protein ID;
# confirm against the table's header.
mtid_to_rpid = {}
first = True
for line in open(transcripts_file):
    if line[0] == "#":
        continue
    if first:
        # The first line that is not a comment is treated as the header.
        first = False
        continue
    # Skip comment and header lines.

    line = line.strip().split("\t")
    mtid_to_rpid[line[1]] = line[6]
    # Add the mouse transcript id to the transcripts dict
예제 #5
0
파일: count_pos.py 프로젝트: gwct/core
if len(sys.argv) not in [1,2,3]:
	print "Usage:\t$ count_pos.py [input directory or filename] [1,0 to display individual file counts or not]";
	sys.exit();

ins = sys.argv[1];
disp_file = 0;
if len(sys.argv) > 2:
	disp_file = sys.argv[2];
if disp_file not in ["0","1"]:
	print "Not printing file counts.";
	disp_file = 0;

disp_file = int(disp_file);

# Report header: banner, time stamp and the input being counted.
print "=======================================================================";
print "\t\t\t" + core.getDateTime();
print "Counting the total number of positions (AAs or NTs) in:\t" + ins;

# Single-file case: sum the lengths of all entries returned for the file.
if os.path.isfile(ins):
	if disp_file == 1:
		# Column header for the per-sequence listing.
		print "----------";
		print "Sequence\tLength";
	# fastaGetDict is iterated by key and indexed for a len()-able value;
	# presumably it maps sequence title -> sequence string (confirm in core).
	inseqs = core.fastaGetDict(ins);
	tot_pos = 0;
	for seq in inseqs:
		if disp_file == 1:
			print seq + "\t" + str(len(inseqs[seq]));
		tot_pos = tot_pos + len(inseqs[seq]);
	# Totals are printed regardless of the display flag.
	print "----------";
	print "Total sequences:\t" + str(len(inseqs));
	print "Total positions:\t" + str(tot_pos);
예제 #6
0
    sys.exit(" * Error 3: An output directory must be defined with -o.")

# Refuse to reuse an existing output directory unless --overwrite was given.
if os.path.isdir(args.outdir) and not args.overwrite:
    sys.exit(
        " * Error 4: Output directory (-o) already exists! Explicity specify --overwrite to overwrite it."
    )

if not os.path.isdir(args.outdir):
    # Create the directory through the os module rather than a shell command:
    # a string-built "mkdir <path>" breaks on paths containing spaces or shell
    # metacharacters and hides failures. makedirs also creates missing parent
    # directories and raises OSError if the directory cannot be created.
    os.makedirs(args.outdir)
args.outdir = os.path.abspath(args.outdir)
# IO option error checking

prequal_dir = False
# Maybe add functionality for prequal filtered alignments later.

core.PWS("# " + core.getDateTime() + " Starting back translation.")
aa_files = [f for f in os.listdir(args.aa_dir) if f.endswith(".fa")]
num_files = len(aa_files)
# Read the AA alignment file names.

counter = 0
for f in aa_files:
    if counter % 10 == 0:
        print(counter, "/", num_files)
    #print(counter)
    counter += 1

    pid = f.split("-")[0].replace(".fa", "")
    #if pid != "ENSMUSP00000021056":
    #    continue;
    # Get the protein id by splitting the file name by - and removing the extension.
예제 #7
0
          dataset)

####

# Job variables
exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"

####

# The ortholog file between mouse and rat.
orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " +
         orthfile)

# Read the list of selected transcripts from the master table, with mouse and
# rat IDs: every row maps its second field to its fifth field.
# NOTE(review): no header row is skipped here; confirm the table has none.
orth_tids = {}
for line in open(orthfile):
    line = line.strip().split("\t")
    orth_tids[line[1]] = line[4]

# Unique keys and unique values of the mapping.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
core.PWS("# " + core.getDateTime() + " Transcripts read: " +
         str(len(orth_tids)))
core.PWS("# ----------------")

####
예제 #8
0
    if tid not in transcripts:
        transcripts[tid] = {
            'coding-exons': 0,
            'total-exons': 0
        }

    transcripts[tid]['total-exons'] += 1
    if coding == "TRUE":
        transcripts[tid]['coding-exons'] += 1

core.PWS("# Total transcripts read: " + str(len(transcripts)))
core.PWS("# ----------------")
# Reads the mouse and target annotation info.

core.PWS("# " + core.getDateTime() + " Reading BLAST hits per transcript: " +
         blast_file)
samples = {}
for line in open(blast_file):
    line = line.strip().split(" ")
    #print(line);

    sample, tid = line[2], line[5]

    if tid == "NA":
        continue

    if sample not in samples:
        samples[sample] = {}

    if tid not in samples[sample]:
############################################################

import sys, os, core

############################################################

# Input tables and the output table, all given relative to the working
# directory. Only orthfile is read in the statements that follow directly.
orthfile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
blastfile = "../03-Alignments/blast/mm10-exon-to-rnor6/exon-to-exon-hits-bit.txt"
mousefile = "../Reference-genomes/mm10/mm10-ens99-ids.tab"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"
outfilename = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"

# Eight column labels; presumably the header of the table written to
# outfilename (confirm where the output is written).
headers = ["mgid", "mtid", "mpid", "meid", "reid", "rpid", "rtid", "rgid"]

core.PWS("# " + core.getDateTime() +
         " Reading mouse-rat one-to-one orthologs: " + orthfile)
orths = {}
# Dict to convert between mouse transcript IDs and rat protein IDs
# Flag used by the reading loop below to skip the first (header) line.
first = True
for line in open(orthfile):
    if first:
        first = False
        continue
    # Skip the header

    line = line.strip().split("\t")

    if len(line) < 6:
        continue
    # If there are no orths, skip.
예제 #10
0
def convCheck(cur_c, c, number_specs, d, ins, outs):
#	cur_c = 0;
	init_c = cur_c+1;
	while cur_c < c:
		#if c > 1:

		spec_list = all_specs.values();
		rep_specs = [];
		while len(rep_specs) < number_specs:
			r = random.choice(spec_list);
			rep_specs.append(r);
			spec_list.remove(r);

		outfilename = outs + "_" + str(cur_c+1) + ".txt";
		outfile = open(outfilename, "w");
		outfile.write("# ==============================================================================================\n");
		outfile.write("# \t\t\tConvergence testing\n");
		outfile.write("# \t\t\t" + core.getDateTime() + "\n");
		outfile.write("# Using alignments in:\t\t" + indir + "\n");
		outfile.write("# Randomly choosing " + str(number_specs) + " species and performing " + str(c) + " replicate tests for convergence.\n");
		outfile.write("# This is replicate number " + str(cur_c+1) + "\n");
		outfile.write("# Writing output to:\t\t\t" + outfilename + "\n");
		if d == 0:
			outfile.write("# Checking for convergent sites.\n");
		elif d == 1:
			outfile.write("# Checking for divergent sites.\n");
		outfile.write("# Using species:\t" + ",".join(rep_specs));
		outfile.write("# ---------------------------------------------\n");

		#sys.exit();
		#cur_c = cur_c + 1;
		#continue;
		aligns = os.listdir(ins);
		numbars = 0;
		donepercent = [];
		count = len(aligns);
		i = 0;
		numsites = 0;
		totgenes = 0;
		outfile.write("# " + core.getTime() + " Starting Scan...\n");
		outfile.write("# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n");
		for align in aligns:
			#numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
			i = i + 1;
			if align.find(".fa") == -1:
				continue;

			infilename = os.path.join(ins, align);
			gid = "_".join(align.split("_")[:2]);
			chrome = align[align.find("chr"):align.find("chr")+4]

			inseqs = core.fastaGetDict(infilename);

			for t1 in rep_specs:
				for t2 in rep_specs:
					if t1 == t2:
						continue;

					targets = [t1, t2];
					backgrounds = [spec for spec in rep_specs if spec not in targets];

					num_targets_present = 0;
					num_bg_present = 0;
					for title in inseqs:
						if any(t in title for t in targets):
							num_targets_present = num_targets_present + 1;
						if any(b in title for b in backgrounds):
							num_bg_present = num_bg_present + 1;

					if num_targets_present == len(targets) and num_bg_present == len(backgrounds):
						# print "The following gene has all target and background species and will be checked:\t\t" + gid;
						totgenes = totgenes + 1;

						seqlen = len(inseqs[inseqs.keys()[0]]);
						# print "Alignment length\t\t", seqlen;

						t_alleles = {};
						b_alleles = {};

						for x in xrange(len(inseqs[inseqs.keys()[0]])):
							for title in inseqs:
								cur_spec = title[1:].replace("\n","");
								if cur_spec in targets:
									t_alleles[cur_spec] = inseqs[title][x];
								if cur_spec in backgrounds:
									b_alleles[cur_spec] = inseqs[title][x];

							t_states = t_alleles.values();
							#t_gap = t_states.count("-");
							#t_missing = t_states.count("X");
							#t_stop = t_states.count("*");

							b_states = b_alleles.values();
							#b_gap = b_states.count("-");
							#b_missing = b_states.count("X");
							#b_stop = b_states.count("*");

							t_final = remGapMiss(t_states);
							b_final = remGapMiss(b_states);

							if t_final == [] or b_final == []:
								continue;

							if d == 0:
								if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) == len(t_final) and t_final[0] not in b_final:
									numsites = numsites + 1;
									print core.getTime() + " Convergent site found!";
									print "Filename:\t\t" + align;
									print "Chromosome:\t\t" + chrome;
									print "Gene ID:\t\t" + gid;
									print "Alignment length\t", seqlen;
									print "Target alleles:\t\t" + "".join(t_final);
									print "Background alleles:\t" + "".join(b_final);
									print "---------------";
									outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
									outfile.write(outline);

							elif d == 1:
								if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) != len(t_final) and b_final.count(b_final[0]) == len(b_final):
									if not any(t in b_final for t in t_final):
										numsites = numsites + 1;
										# print "\nDivergent site found!";
										# print "Filename:\t\t" + align;
										# print "Chromosome:\t\t" + chrome;
										# print "Gene ID:\t\t" + gid;
										# print "Alignment length\t", seqlen;
										# print t_final;
										# print b_final;
										outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
										outfile.write(outline);

		#pstring = "100.0% complete.";
		#sys.stderr.write('\b' * len(pstring) + pstring);
		outfile.write("\n# " + core.getTime() + " Done!\n");
		outfile.write("# Total sites found: " + str(numsites) + "\n");
		outfile.write("# Total genes checked: " + str(totgenes) + "\n");
		outfile.write("# ==============================================================================================");
		cur_c = cur_c + 1;
	if ropt != 0:
		print core.getTime() + " Replicates", init_c, "to", c, "complete.";
예제 #11
0
            core.spacedOut("# --overwrite set:", pad) +
            "Overwriting previous files in output directory.", logfile)
    if args.count_only:
        core.PWS(
            core.spacedOut("# --count set:", pad) +
            "Will not output sequences.", logfile)
    core.PWS(core.spacedOut("# Log file:", pad) + log_file, logfile)
    core.PWS("# ----------------", logfile)

    args.seq_filter = args.seq_filter / 100
    args.site_filter = args.site_filter / 100

    ##########################
    # Filtering CDS aligns

    core.PWS("# " + core.getDateTime() + " Beginning filter.", logfile)

    rm_stop_codons, rm_gappy, rm_protein_gappy = [], [], []
    pre_samples, pre_proteins = 0, 0
    post_samples, post_proteins = 0, 0

    aln_stats = [
        "num seqs", "codon aln length", "avg nongap length", "uniq seqs",
        "ident seqs", "gappy seqs", "invariant sites", "stop codons",
        "percent sites with gap", "gappy sites"
    ]
    aln_headers = ["align"] + ["pre " + s for s in aln_stats]
    if not args.count_only:
        aln_headers += ["sites filtered"] + ["post " + s for s in aln_stats]
    core.PWS("\t".join(aln_headers), logfile)
    # The alignment global headers
예제 #12
0
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
#	cur_c = 0;
	init_c = cur_c+1;
	while cur_c < c:
		#if c > 1:
		if ropt != 0:
			outfilename = outs + "_" + str(cur_c+1) + ".txt";
		else:
			outfilename = outs + ".txt";

		if ropt != 0:
			#backgrounds = ["hg19", "rheMac3", "calJac3", "mm10", "rn5", "vicPac2", "bosTau7", "canFam3", "loxAfr3", "papHam1", "monDom5"];
			backgrounds = [];
			cur_r = len(backgrounds);
			while cur_r < ropt:
				chosenspec = random.choice(all_specs.values());

				if chosenspec not in targets and chosenspec not in backgrounds:
					backgrounds.append(chosenspec);
					cur_r = cur_r + 1;

		outfile = open(outfilename, "w");

		outfile.write("# ==============================================================================================\n");
		outfile.write("# \t\t\tConvergence testing\n");
		outfile.write("# \t\t\t" + core.getDateTime() + "\n");
		outfile.write("# Using alignments in:\t\t" + indir + "\n");
		outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n");
		if ropt != 0:
			outfile.write("# Randomly choosing " + str(r) + " background species and performing " + str(c) + " replicate tests for convergence.\n");
			outfile.write("# This is replicate number " + str(cur_c+1) + "\n");
		outfile.write("# Background species:\t\t" + ", ".join(backgrounds) + "\n");
		outfile.write("# Writing output to:\t\t\t" + outfilename + "\n");
		if d == 0:
			outfile.write("# Checking for convergent sites.\n");
		elif d == 1:
			outfile.write("# Checking for divergent sites.\n");
		outfile.write("# ---------------------------------------------\n");
		#sys.exit();
		#cur_c = cur_c + 1;
		#continue;
		aligns = os.listdir(ins);

		numbars = 0;
		donepercent = [];
		count = len(aligns);
		i = 0;
		numsites = 0;
		totgenes = 0;
		outfile.write("# " + core.getTime() + " Starting Scan...\n");
		outfile.write("# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n");
		for align in aligns:
			#numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
			i = i + 1;

			if align.find(".fa") == -1:
				continue;

			#if i > 25:
			#	break;

			infilename = ins + align;
			#print align;
			gid = "_".join(align.split("_")[:2]);
			chrome = align[align.find("chr"):align.find("chr")+4]

			inseqs = core.fastaGetDict(infilename);

			num_targets_present = 0;
			num_bg_present = 0;
			for title in inseqs:
				if any(t in title for t in targets):
					num_targets_present = num_targets_present + 1;
				if any(b in title for b in backgrounds):
					num_bg_present = num_bg_present + 1;

			if num_targets_present == len(targets) and num_bg_present == len(backgrounds):
				#print "The following gene has all target and background species and will be checked:\t\t" + gid;
				totgenes = totgenes + 1;

				seqlen = len(inseqs[inseqs.keys()[0]]);
				#print "Alignment length\t\t", seqlen;

				t_alleles = {};
				b_alleles = {};

				for x in xrange(len(inseqs[inseqs.keys()[0]])):
					for title in inseqs:
						for t in targets:
							if t in title:
								t_alleles[t] = inseqs[title][x];
						for b in backgrounds:
							if b in title:
								b_alleles[b] = inseqs[title][x];

					t_states = t_alleles.values();

					t_gap = t_states.count("-");
					t_missing = t_states.count("X");
					t_stop = t_states.count("*");

					b_states = b_alleles.values();

					b_gap = b_states.count("-");
					b_missing = b_states.count("X");
					b_stop = b_states.count("*");

					t_final = remGapMiss(t_states);
					b_final = remGapMiss(b_states);

					#print t_alleles;
					#print t_states;
					#print t_gap;
					#print t_missing;
					#print t_stop;
					#print t_final;

					#print b_alleles;
					#print b_states;
					#print b_gap;
					#print b_missing;
					#print b_stop;
					#print b_final;

					if t_final == [] or b_final == []:
						continue;

					if d == 0:
						if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) == len(t_final) and t_final[0] not in b_final:
							numsites = numsites + 1;
							#print core.getTime() + " Convergent site found!";
							#print "Filename:\t\t" + align;
							#print "Chromosome:\t\t" + chrome;
							#print "Gene ID:\t\t" + gid;
							#print "Alignment length\t", seqlen;
							#print "Target alleles:\t\t" + "".join(t_final);
							#print "Background alleles:\t" + "".join(b_final);
							#print "---------------";
							outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
							outfile.write(outline);

							#sys.exit();

					elif d == 1:
						if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) != len(t_final) and b_final.count(b_final[0]) == len(b_final):
							if not any(t in b_final for t in t_final):
								numsites = numsites + 1;
								#print "\nDivergent site found!";
								#print "Filename:\t\t" + align;
								#print "Chromosome:\t\t" + chrome;
								#print "Gene ID:\t\t" + gid;
								#print "Alignment length\t", seqlen;
								#print t_final;
								#print b_final;
								outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
								outfile.write(outline);

		#pstring = "100.0% complete.";
		#sys.stderr.write('\b' * len(pstring) + pstring);
		outfile.write("\n# " + core.getTime() + " Done!\n");
		outfile.write("# Total sites found: " + str(numsites) + "\n");
		outfile.write("# Total genes checked: " + str(totgenes) + "\n");
		outfile.write("# ==============================================================================================");
		cur_c = cur_c + 1;
	if ropt != 0:
		print core.getTime() + " Replicates", init_c, "to", c, "complete.";
예제 #13
0
# The ortholog file between mouse and rat.
orthfile = "master-transcript-id-table.tab"

# Map the second field of every data row to its fifth field; the first line
# of the file is a header and is skipped.
orth_tids = {}
first = True
for line in open(orthfile):
    if first:
        first = False
        continue
    line = line.strip().split("\t")
    orth_tids[line[1]] = line[4]

# Unique keys and unique values of the mapping.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
core.PWS("# " + core.getDateTime() + " Orthologs read: " + str(len(orth_tids)))
core.PWS("# ----------------")

core.PWS("# " + core.getDateTime() + " Reading BLAST file: " + args.blastfile)
query_hits = defaultdict(list)
total_hits, query_ids, target_ids = 0, [], []
for line in open(args.blastfile):
    total_hits += 1
    line = line.strip().split("\t")
    query_gid = line[0].split("|")[0]
    query_tids = set(line[0].split("|")[1].split(";"))
    query_eid = line[0].split("|")[2]
    target_gid = line[1].split("|")[0]
    target_tids = set(line[1].split("|")[1].split(";"))
    target_eid = line[1].split("|")[2]
    aln_len = int(line[3])
예제 #14
0
def convCheck(cur_c, c, number_specs, d, ins, outs):
    #	cur_c = 0;
    init_c = cur_c + 1
    while cur_c < c:
        #if c > 1:

        spec_list = all_specs.values()
        rep_specs = []
        while len(rep_specs) < number_specs:
            r = random.choice(spec_list)
            rep_specs.append(r)
            spec_list.remove(r)

        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        outfile = open(outfilename, "w")
        outfile.write(
            "# ==============================================================================================\n"
        )
        outfile.write("# \t\t\tConvergence testing\n")
        outfile.write("# \t\t\t" + core.getDateTime() + "\n")
        outfile.write("# Using alignments in:\t\t" + indir + "\n")
        outfile.write("# Randomly choosing " + str(number_specs) +
                      " species and performing " + str(c) +
                      " replicate tests for convergence.\n")
        outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
        outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
        if d == 0:
            outfile.write("# Checking for convergent sites.\n")
        elif d == 1:
            outfile.write("# Checking for divergent sites.\n")
        outfile.write("# Using species:\t" + ",".join(rep_specs))
        outfile.write("# ---------------------------------------------\n")

        #sys.exit();
        #cur_c = cur_c + 1;
        #continue;
        aligns = os.listdir(ins)
        numbars = 0
        donepercent = []
        count = len(aligns)
        i = 0
        numsites = 0
        totgenes = 0
        outfile.write("# " + core.getTime() + " Starting Scan...\n")
        outfile.write(
            "# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
        )
        for align in aligns:
            #numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
            i = i + 1
            if align.find(".fa") == -1:
                continue

            infilename = os.path.join(ins, align)
            gid = "_".join(align.split("_")[:2])
            chrome = align[align.find("chr"):align.find("chr") + 4]

            inseqs = core.fastaGetDict(infilename)

            for t1 in rep_specs:
                for t2 in rep_specs:
                    if t1 == t2:
                        continue

                    targets = [t1, t2]
                    backgrounds = [
                        spec for spec in rep_specs if spec not in targets
                    ]

                    num_targets_present = 0
                    num_bg_present = 0
                    for title in inseqs:
                        if any(t in title for t in targets):
                            num_targets_present = num_targets_present + 1
                        if any(b in title for b in backgrounds):
                            num_bg_present = num_bg_present + 1

                    if num_targets_present == len(
                            targets) and num_bg_present == len(backgrounds):
                        # print "The following gene has all target and background species and will be checked:\t\t" + gid;
                        totgenes = totgenes + 1

                        seqlen = len(inseqs[inseqs.keys()[0]])
                        # print "Alignment length\t\t", seqlen;

                        t_alleles = {}
                        b_alleles = {}

                        for x in xrange(len(inseqs[inseqs.keys()[0]])):
                            for title in inseqs:
                                cur_spec = title[1:].replace("\n", "")
                                if cur_spec in targets:
                                    t_alleles[cur_spec] = inseqs[title][x]
                                if cur_spec in backgrounds:
                                    b_alleles[cur_spec] = inseqs[title][x]

                            t_states = t_alleles.values()
                            #t_gap = t_states.count("-");
                            #t_missing = t_states.count("X");
                            #t_stop = t_states.count("*");

                            b_states = b_alleles.values()
                            #b_gap = b_states.count("-");
                            #b_missing = b_states.count("X");
                            #b_stop = b_states.count("*");

                            t_final = remGapMiss(t_states)
                            b_final = remGapMiss(b_states)

                            if t_final == [] or b_final == []:
                                continue

                            if d == 0:
                                if len(t_final) == len(targets) and len(
                                        b_final
                                ) == len(backgrounds) and t_final.count(
                                        t_final[0]) == len(
                                            t_final
                                        ) and t_final[0] not in b_final:
                                    numsites = numsites + 1
                                    print core.getTime(
                                    ) + " Convergent site found!"
                                    print "Filename:\t\t" + align
                                    print "Chromosome:\t\t" + chrome
                                    print "Gene ID:\t\t" + gid
                                    print "Alignment length\t", seqlen
                                    print "Target alleles:\t\t" + "".join(
                                        t_final)
                                    print "Background alleles:\t" + "".join(
                                        b_final)
                                    print "---------------"
                                    outline = str(numsites) + "\t" + ",".join(
                                        targets
                                    ) + "\t" + chrome + "\t" + gid + "\t" + str(
                                        seqlen) + "\t" + str(
                                            x + 1) + "\t" + "".join(
                                                t_final) + "\t" + "".join(
                                                    b_final) + "\n"
                                    outfile.write(outline)

                            elif d == 1:
                                if len(t_final) == len(targets) and len(
                                        b_final
                                ) == len(backgrounds) and t_final.count(
                                        t_final[0]) != len(
                                            t_final) and b_final.count(
                                                b_final[0]) == len(b_final):
                                    if not any(t in b_final for t in t_final):
                                        numsites = numsites + 1
                                        # print "\nDivergent site found!";
                                        # print "Filename:\t\t" + align;
                                        # print "Chromosome:\t\t" + chrome;
                                        # print "Gene ID:\t\t" + gid;
                                        # print "Alignment length\t", seqlen;
                                        # print t_final;
                                        # print b_final;
                                        outline = str(
                                            numsites
                                        ) + "\t" + ",".join(
                                            targets
                                        ) + "\t" + chrome + "\t" + gid + "\t" + str(
                                            seqlen) + "\t" + str(
                                                x + 1) + "\t" + "".join(
                                                    t_final) + "\t" + "".join(
                                                        b_final) + "\n"
                                        outfile.write(outline)

        #pstring = "100.0% complete.";
        #sys.stderr.write('\b' * len(pstring) + pstring);
        outfile.write("\n# " + core.getTime() + " Done!\n")
        outfile.write("# Total sites found: " + str(numsites) + "\n")
        outfile.write("# Total genes checked: " + str(totgenes) + "\n")
        outfile.write(
            "# =============================================================================================="
        )
        cur_c = cur_c + 1
    if ropt != 0:
        print core.getTime() + " Replicates", init_c, "to", c, "complete."
예제 #15
0
def _convcheck_pick_backgrounds(num_wanted, targets):
    """Randomly draw `num_wanted` distinct, non-target species from `all_specs`.

    Raises ValueError up front when `all_specs` does not hold enough distinct
    non-target species; without this check the rejection loop below would
    spin forever.
    """
    pool = list(all_specs.values())

    eligible = []
    for spec in pool:
        if spec not in targets and spec not in eligible:
            eligible.append(spec)
    if len(eligible) < num_wanted:
        raise ValueError(
            "Cannot choose " + str(num_wanted) +
            " distinct background species: only " + str(len(eligible)) +
            " non-target species are available.")

    chosen = []
    while len(chosen) < num_wanted:
        candidate = random.choice(pool)
        if candidate not in targets and candidate not in chosen:
            chosen.append(candidate)
    return chosen


def _convcheck_all_same(states):
    """Return True when every entry of the non-empty sequence `states` is equal."""
    return states.count(states[0]) == len(states)


def _convcheck_site_matches(d, t_final, b_final, targets, backgrounds):
    """Decide whether one alignment column qualifies as a reportable site.

    Both allele collections must be non-empty. A column is only considered
    when every target and every background species contributed a usable
    allele (lengths match the species lists).
      d == 0: all targets share one allele that no background carries.
      d == 1: targets are not all identical, backgrounds are all identical,
              and no target allele occurs among the backgrounds.
    Any other `d` never matches.
    """
    if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
        return False
    if d == 0:
        return _convcheck_all_same(t_final) and t_final[0] not in b_final
    if d == 1:
        return (not _convcheck_all_same(t_final)
                and _convcheck_all_same(b_final)
                and not any(t in b_final for t in t_final))
    return False


def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan a directory of alignments for convergent or divergent sites.

    One output file is written per replicate; replicates run while
    `cur_c < c`.

    Parameters:
      cur_c       -- number of replicates already done; the next replicate is
                     numbered `cur_c + 1`.
      c           -- replicate number to stop at (loop runs while cur_c < c).
      ropt        -- number of background species to draw at random for each
                     replicate. 0 disables random selection: `backgrounds` is
                     used as given and output goes to `outs + ".txt"`
                     (overwritten on every iteration if more than one runs).
      targets     -- target species identifiers, matched as substrings of the
                     sequence titles.
      backgrounds -- background species identifiers (ignored and replaced by a
                     random draw when ropt != 0).
      d           -- 0 to report convergent sites, 1 to report divergent
                     sites; any other value reports no sites.
      ins         -- directory holding the alignment files; only file names
                     containing ".fa" are read.
      outs        -- output path prefix (".txt" or "_<replicate>.txt" is
                     appended).

    Returns None. Raises ValueError when ropt asks for more distinct
    non-target species than `all_specs` provides.

    Relies on the file-level names `core`, `all_specs`, `remGapMiss`,
    `random` and `os`.
    """
    rule = "# =============================================================================================="
    init_c = cur_c + 1

    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"
            backgrounds = _convcheck_pick_backgrounds(ropt, targets)
        else:
            outfilename = outs + ".txt"

        # The context manager closes (and flushes) the report even if the
        # scan raises part-way through.
        with open(outfilename, "w") as outfile:
            outfile.write(rule + "\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            # Report the directory and species count this call actually uses
            # (the parameters), not same-purpose module globals.
            outfile.write("# Using alignments in:\t\t" + ins + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) +
                          "\n")
            if ropt != 0:
                outfile.write("# Randomly choosing " + str(ropt) +
                              " background species and performing " + str(c) +
                              " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " +
                              str(cur_c + 1) + "\n")
            outfile.write("# Background species:\t\t" +
                          ", ".join(backgrounds) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")

            aligns = os.listdir(ins)

            numsites = 0
            totgenes = 0
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )

            for align in aligns:
                if ".fa" not in align:
                    continue

                # Gene ID: first two underscore-separated fields of the name.
                gid = "_".join(align.split("_")[:2])
                # Chromosome: the 4 characters starting at the first "chr".
                # NOTE(review): a name such as "chr10" would be cut to "chr1"
                # -- confirm the file naming scheme.
                chr_start = align.find("chr")
                chrome = align[chr_start:chr_start + 4]

                # os.path.join also works when `ins` has no trailing separator.
                inseqs = core.fastaGetDict(os.path.join(ins, align))

                # Which species identifier occurs in which sequence title.
                # This does not depend on the column, so compute it once per
                # alignment instead of once per position.
                target_hits = []
                background_hits = []
                num_targets_present = 0
                num_bg_present = 0
                for title in inseqs:
                    matched_t = [t for t in targets if t in title]
                    matched_b = [b for b in backgrounds if b in title]
                    if matched_t:
                        num_targets_present = num_targets_present + 1
                    if matched_b:
                        num_bg_present = num_bg_present + 1
                    target_hits.extend((t, title) for t in matched_t)
                    background_hits.extend((b, title) for b in matched_b)

                # Only genes where the number of matching titles equals the
                # number of target and background species are scanned.
                if (num_targets_present != len(targets)
                        or num_bg_present != len(backgrounds)):
                    continue
                totgenes = totgenes + 1

                # Alignment length is taken from the first sequence.
                seqlen = len(inseqs[next(iter(inseqs))])

                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    for t, title in target_hits:
                        t_alleles[t] = inseqs[title][x]
                    for b, title in background_hits:
                        b_alleles[b] = inseqs[title][x]

                    # remGapMiss is defined elsewhere in the file; presumably
                    # it drops gap/missing states -- verify there.
                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))

                    if not t_final or not b_final:
                        continue

                    if _convcheck_site_matches(d, t_final, b_final, targets,
                                               backgrounds):
                        numsites = numsites + 1
                        # Position is reported 1-based.
                        outfile.write("\t".join([
                            str(numsites),
                            chrome,
                            gid,
                            str(seqlen),
                            str(x + 1),
                            "".join(t_final),
                            "".join(b_final),
                        ]) + "\n")

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(rule)

        cur_c = cur_c + 1

    if ropt != 0:
        # A single pre-built string prints identically under Python 2 and 3.
        print(core.getTime() + " Replicates " + str(init_c) + " to " +
              str(c) + " complete.")