Exemplo n.º 1
    # NOTE(review): this excerpt begins inside an indented block whose header
    # is not visible, and several statements below appear to have lost lines
    # (orphaned argument lists, an unterminated call at the very end).
    # core.getOutdir supplies both the input and the output directory names
    # used for the rest of the run.
    indir, outdir = core.getOutdir(ins, "run_gblocks", starttime)
    filelist = os.listdir(indir)

# Create the output directory by shelling out to mkdir.
# NOTE(review): outdir is concatenated into the shell command unquoted.
print core.getTime() + " | Creating main output directory..."
os.system("mkdir " + outdir)

# The log file is opened for writing (truncating any previous log). outdir is
# concatenated directly, so it presumably ends with a path separator -- confirm.
logfilename = outdir + "run_gblocks.log"
logfile = open(logfilename, "w")

# NOTE(review): the next line looks like the argument tail of a
# core.logCheck(...) call whose opening line is missing.
    l, logfilename,
# Log the run banner: title and timestamp.
core.logCheck(l, logfilename, "\t\t\tMasking alignments with GBlocks")
core.logCheck(l, logfilename, "\t\t\t" + core.getDateTime())
# Log where the input comes from.
# NOTE(review): both messages currently sit under the same condition; the
# second ("all files in") presumably belonged to an else branch -- confirm.
if fileflag == 1:
    core.logCheck(l, logfilename,
                  "INPUT    | Masking alignment from file: " + ins)
    core.logCheck(l, logfilename,
                  "INPUT    | Masking alignments from all files in: " + indir)
# Log the GBlocks executable path and the sequence type in use.
core.logCheck(l, logfilename, "INFO     | GBlocks path set to: " + gb_path)
core.logCheck(l, logfilename, "INFO     | Sequence type set to: " + seqtype)
# When m is 1, an extra message about the 20% masking threshold is logged.
# NOTE(review): the call around the next two lines is incomplete as shown.
if m == 1:
        l, logfilename,
        "INFO     | Only accepting alignments with < 20% of sequence masked (for tree making)."
    core.logCheck(l, logfilename,
Exemplo n.º 2
# count_pos.py: report the total number of positions (AAs or NTs) in a FASTA
# input and, optionally, the length of every individual sequence.
#
# Arguments:
#   1. input directory or filename (required)
#   2. "1" or "0" to display individual counts or not (optional)

# The input path is mandatory, so only 2 or 3 argv entries are acceptable.
# Exit after printing the usage line; otherwise sys.argv[1] below would raise
# IndexError when the script is started without arguments.
if len(sys.argv) not in [2, 3]:
    print("Usage:\t$ count_pos.py [input directory or filename] [1,0 to display individual file counts or not]")
    sys.exit(1)

ins = sys.argv[1]
disp_file = 0
if len(sys.argv) > 2:
    disp_file = sys.argv[2]
# Anything other than the literal strings "0"/"1" (including the integer
# default used when the option is omitted) disables the per-sequence listing.
if disp_file not in ["0", "1"]:
    print("Not printing file counts.")
    disp_file = 0

disp_file = int(disp_file)

print("=======================================================================")
print("\t\t\t" + core.getDateTime())
print("Counting the total number of positions (AAs or NTs) in:\t" + ins)

if os.path.isfile(ins):
    if disp_file == 1:
        print("----------")
        print("Sequence\tLength")
    # core.fastaGetDict is used here as a mapping from sequence title to
    # sequence (only len() of each value is needed).
    inseqs = core.fastaGetDict(ins)
    tot_pos = 0
    for seq in inseqs:
        seq_len = len(inseqs[seq])
        if disp_file == 1:
            print(seq + "\t" + str(seq_len))
        tot_pos = tot_pos + seq_len
    print("----------")
    print("Total sequences:\t" + str(len(inseqs)))
    print("Total positions:\t" + str(tot_pos))
Exemplo n.º 7


# Job variables
exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"

# The ortholog file between mouse and rat.
orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"

# Read the list of selected transcripts from the master table, with mouse and rat IDs
# NOTE(review): the value logged here is taken to be the input path, matching
# the other "Reading ...: <file>" messages in this code base -- confirm.
core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " +
         orthfile)
orth_tids = {}
# Use a context manager so the table's file handle is closed after reading.
with open(orthfile) as orth_stream:
    for line in orth_stream:
        line = line.strip().split("\t")

        # Map tab-separated column 1 to column 4 of each row; the resulting
        # keys and values are treated as mouse and rat transcript IDs below.
        orth_tids[line[1]] = line[4]
mouse_tids = set(orth_tids.keys())
rat_tids = set(orth_tids.values())
# NOTE(review): the count logged here is taken to be the number of table
# entries read -- confirm.
core.PWS("# " + core.getDateTime() + " Transcripts read: " +
         str(len(orth_tids)))
core.PWS("# ----------------")

Exemplo n.º 8
    # NOTE(review): this excerpt starts inside a loop whose header is not
    # visible; tid and coding are set by code outside this view.
    # The first time a transcript ID is seen, start both exon counters at zero.
    if tid not in transcripts:
        transcripts[tid] = {
            'coding-exons': 0,
            'total-exons': 0
    # NOTE(review): the closing brace of the dict literal above appears to be
    # missing from this excerpt.

    # Every row counts towards the transcript's total exons; rows whose coding
    # field is the string "TRUE" also count as coding exons.
    transcripts[tid]['total-exons'] += 1
    if coding == "TRUE":
        transcripts[tid]['coding-exons'] += 1

core.PWS("# Total transcripts read: " + str(len(transcripts)))
core.PWS("# ----------------")
# Reads the mouse and target annotation info.

# NOTE(review): the call below is unterminated in this excerpt; its final
# operand is missing.
core.PWS("# " + core.getDateTime() + " Reading BLAST hits per transcript: " +
# samples is filled as a nested dict keyed first by sample name.
samples = {}
for line in open(blast_file):
    # Fields are separated by single spaces here (not tabs).
    line = line.strip().split(" ")

    # Field 2 holds the sample name and field 5 the transcript ID.
    sample, tid = line[2], line[5]

    # NOTE(review): the body of this check is missing; rows with an "NA"
    # transcript ID are presumably skipped -- confirm.
    if tid == "NA":

    if sample not in samples:
        samples[sample] = {}

    # NOTE(review): the excerpt ends before the body of this check.
    if tid not in samples[sample]:

import sys, os, core


# Input and output locations for building the mouse-rat master transcript table.
orthfile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
blastfile = "../03-Alignments/blast/mm10-exon-to-rnor6/exon-to-exon-hits-bit.txt"
mousefile = "../Reference-genomes/mm10/mm10-ens99-ids.tab"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"
outfilename = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"

# Column names of the master table.
headers = ["mgid", "mtid", "mpid", "meid", "reid", "rpid", "rtid", "rgid"]

core.PWS("# " + core.getDateTime() +
         " Reading mouse-rat one-to-one orthologs: " + orthfile)
# Dict to convert between mouse transcript IDs and rat protein IDs
orths = {}
first = True
for line in open(orthfile):
    # Skip the header
    if first:
        first = False
        continue

    line = line.strip().split("\t")

    # If there are no orths, skip.
    if len(line) < 6:
        continue
    # NOTE(review): the statements that fill orths from the remaining rows are
    # not part of this excerpt.
Exemplo n.º 10
def convCheck(cur_c, c, number_specs, d, ins, outs):
    """Run replicate convergence/divergence scans on randomly drawn species.

    For each replicate from ``cur_c + 1`` up to ``c``, ``number_specs``
    distinct species are drawn from the values of the module-level
    ``all_specs`` mapping. Every ordered pair of distinct drawn species is
    used in turn as the target pair, with the remaining drawn species as the
    background, and every alignment file in ``ins`` is scanned column by
    column. Hits are written to one output file per replicate.

    A column is reported when every target and every background species
    contributes a state that survives ``remGapMiss`` and:
      * ``d == 0`` (convergent): all targets share one state that no
        background species has;
      * ``d == 1`` (divergent): the targets do not all share one state, the
        backgrounds all share one state, and no target state occurs in the
        background.

    Args:
        cur_c: Number of replicates already done; numbering resumes at
            ``cur_c + 1``.
        c: Number of the last replicate to run.
        number_specs: How many distinct species to draw per replicate.
        d: 0 to look for convergent sites, 1 for divergent sites.
        ins: Directory with the alignment files; only file names containing
            ".fa" are read.
        outs: Output path prefix; replicate ``k`` goes to
            ``outs + "_" + str(k) + ".txt"``.

    Raises:
        ValueError: If ``number_specs`` is larger than the number of distinct
            species available, since the draw could then never finish.

    Depends on names defined elsewhere in the module: ``all_specs``,
    ``indir``, ``ropt``, ``remGapMiss``, ``core``, ``random`` and ``os``.
    """
    rule = "# =============================================================================================="
    init_c = cur_c + 1
    while cur_c < c:
        # list() keeps random.choice usable whether values() is a list or a view.
        spec_list = list(all_specs.values())
        if number_specs > len(set(spec_list)):
            raise ValueError("Cannot draw " + str(number_specs) + " distinct species from " + str(len(set(spec_list))) + " available.")

        # Rejection sampling: keep drawing until enough distinct species are held.
        rep_specs = []
        while len(rep_specs) < number_specs:
            r = random.choice(spec_list)
            if r not in rep_specs:
                rep_specs.append(r)

        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        # The context manager guarantees the replicate file is flushed and closed.
        with open(outfilename, "w") as outfile:
            outfile.write(rule + "\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Randomly choosing " + str(number_specs) + " species and performing " + str(c) + " replicate tests for convergence.\n")
            outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            # The trailing newline keeps the separator below on its own line.
            outfile.write("# Using species:\t" + ",".join(rep_specs) + "\n")
            outfile.write("# ---------------------------------------------\n")

            aligns = os.listdir(ins)
            numsites = 0
            totgenes = 0
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")
            for align in aligns:
                # Ignore anything in the directory that is not a FASTA file.
                if align.find(".fa") == -1:
                    continue

                infilename = os.path.join(ins, align)
                # Gene ID: the first two underscore-separated fields of the name.
                gid = "_".join(align.split("_")[:2])
                # Chromosome label: "chr" plus the single character after it.
                chr_start = align.find("chr")
                chrome = align[chr_start:chr_start + 4]

                inseqs = core.fastaGetDict(infilename)

                # NOTE(review): ordered pairs are enumerated, so each unordered
                # target pair is scanned twice (t1,t2 and t2,t1) -- confirm
                # this is intended.
                for t1 in rep_specs:
                    for t2 in rep_specs:
                        if t1 == t2:
                            continue

                        targets = [t1, t2]
                        backgrounds = [spec for spec in rep_specs if spec not in targets]

                        # Count titles mentioning a target / background species.
                        num_targets_present = 0
                        num_bg_present = 0
                        for title in inseqs:
                            if any(t in title for t in targets):
                                num_targets_present = num_targets_present + 1
                            if any(b in title for b in backgrounds):
                                num_bg_present = num_bg_present + 1

                        # Only scan alignments that contain every needed species.
                        if num_targets_present != len(targets) or num_bg_present != len(backgrounds):
                            continue

                        # Counts alignment/target-pair combinations scanned.
                        totgenes = totgenes + 1

                        # All sequences in an alignment are assumed equally long.
                        seqlen = len(inseqs[next(iter(inseqs))])

                        t_alleles = {}
                        b_alleles = {}

                        for x in range(seqlen):
                            for title in inseqs:
                                # Species name = title minus its first character
                                # (presumably ">") and any newline.
                                cur_spec = title[1:].replace("\n", "")
                                if cur_spec in targets:
                                    t_alleles[cur_spec] = inseqs[title][x]
                                if cur_spec in backgrounds:
                                    b_alleles[cur_spec] = inseqs[title][x]

                            # remGapMiss presumably drops gap/missing states --
                            # it is defined outside this block.
                            t_final = remGapMiss(list(t_alleles.values()))
                            b_final = remGapMiss(list(b_alleles.values()))

                            if t_final == [] or b_final == []:
                                continue

                            # Every species must contribute a usable state.
                            if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                                continue

                            targets_agree = t_final.count(t_final[0]) == len(t_final)

                            if d == 0:
                                if targets_agree and t_final[0] not in b_final:
                                    numsites = numsites + 1
                                    print(core.getTime() + " Convergent site found!")
                                    print("Filename:\t\t" + align)
                                    print("Chromosome:\t\t" + chrome)
                                    print("Gene ID:\t\t" + gid)
                                    print("Alignment length\t" + str(seqlen))
                                    print("Target alleles:\t\t" + "".join(t_final))
                                    print("Background alleles:\t" + "".join(b_final))
                                    print("---------------")
                                    outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                                    outfile.write(outline)

                            elif d == 1:
                                if not targets_agree and b_final.count(b_final[0]) == len(b_final):
                                    if not any(t in b_final for t in t_final):
                                        numsites = numsites + 1
                                        outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                                        outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(rule)
        cur_c = cur_c + 1
    # NOTE(review): ropt is not a parameter of this function; it is read from
    # module scope -- confirm it is defined there.
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
Exemplo n.º 12
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan alignments for convergent or divergent sites in the target species.

    Runs replicates ``cur_c + 1`` through ``c``. When ``ropt`` is non-zero,
    each replicate draws ``ropt`` distinct background species at random from
    the values of the module-level ``all_specs`` mapping (never a target) and
    writes to its own numbered file. When ``ropt`` is 0, the ``backgrounds``
    passed in are used and output goes to ``outs + ".txt"``.

    A column is reported when every target and every background species
    contributes a state that survives ``remGapMiss`` and:
      * ``d == 0`` (convergent): all targets share one state that no
        background species has;
      * ``d == 1`` (divergent): the targets do not all share one state, the
        backgrounds all share one state, and no target state occurs in the
        background.

    Args:
        cur_c: Number of replicates already done; numbering resumes at
            ``cur_c + 1``.
        c: Number of the last replicate to run.
        ropt: Number of random background species per replicate, or 0 to use
            ``backgrounds`` as given.
        targets: Species identifiers to test; matched as substrings of the
            sequence titles.
        backgrounds: Background species identifiers; replaced locally by a
            random draw when ``ropt`` is non-zero (the caller's list is not
            modified).
        d: 0 to look for convergent sites, 1 for divergent sites.
        ins: Directory with the alignment files; only file names containing
            ".fa" are read.
        outs: Output path prefix.

    Raises:
        ValueError: If ``ropt`` is larger than the number of distinct
            non-target species available, since the draw could then never
            finish.

    Depends on names defined elsewhere in the module: ``all_specs``,
    ``indir``, ``remGapMiss``, ``core``, ``random`` and ``os``.
    """
    rule = "# =============================================================================================="
    init_c = cur_c + 1
    while cur_c < c:
        if ropt != 0:
            # Random replicates each get their own numbered output file.
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"

            # list() keeps random.choice usable whether values() is a list or a view.
            candidates = list(all_specs.values())
            eligible = set(candidates) - set(targets)
            if ropt > len(eligible):
                raise ValueError("Cannot draw " + str(ropt) + " distinct background species from " + str(len(eligible)) + " available.")

            # Rejection sampling: redraw on targets and on repeats.
            backgrounds = []
            while len(backgrounds) < ropt:
                chosenspec = random.choice(candidates)
                if chosenspec not in targets and chosenspec not in backgrounds:
                    backgrounds.append(chosenspec)
        else:
            outfilename = outs + ".txt"

        # The context manager guarantees the output file is flushed and closed.
        with open(outfilename, "w") as outfile:
            outfile.write(rule + "\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n")
            if ropt != 0:
                outfile.write("# Randomly choosing " + str(ropt) + " background species and performing " + str(c) + " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Background species:\t\t" + ", ".join(backgrounds) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")

            aligns = os.listdir(ins)
            numsites = 0
            totgenes = 0
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")
            for align in aligns:
                # Ignore anything in the directory that is not a FASTA file.
                if align.find(".fa") == -1:
                    continue

                # os.path.join also works when ins has no trailing separator.
                infilename = os.path.join(ins, align)
                # Gene ID: the first two underscore-separated fields of the name.
                gid = "_".join(align.split("_")[:2])
                # Chromosome label: "chr" plus the single character after it.
                chr_start = align.find("chr")
                chrome = align[chr_start:chr_start + 4]

                inseqs = core.fastaGetDict(infilename)

                # Count titles mentioning a target / background species.
                num_targets_present = 0
                num_bg_present = 0
                for title in inseqs:
                    if any(t in title for t in targets):
                        num_targets_present = num_targets_present + 1
                    if any(b in title for b in backgrounds):
                        num_bg_present = num_bg_present + 1

                # Only scan alignments that contain every needed species.
                if num_targets_present != len(targets) or num_bg_present != len(backgrounds):
                    continue

                totgenes = totgenes + 1

                # All sequences in an alignment are assumed equally long.
                seqlen = len(inseqs[next(iter(inseqs))])

                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    # Collect the state of every target/background at column x.
                    for title in inseqs:
                        for t in targets:
                            if t in title:
                                t_alleles[t] = inseqs[title][x]
                        for b in backgrounds:
                            if b in title:
                                b_alleles[b] = inseqs[title][x]

                    # remGapMiss presumably drops gap/missing states -- it is
                    # defined outside this block.
                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))

                    if t_final == [] or b_final == []:
                        continue

                    # Every species must contribute a usable state.
                    if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                        continue

                    targets_agree = t_final.count(t_final[0]) == len(t_final)

                    if d == 0:
                        if targets_agree and t_final[0] not in b_final:
                            numsites = numsites + 1
                            outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                            outfile.write(outline)

                    elif d == 1:
                        if not targets_agree and b_final.count(b_final[0]) == len(b_final):
                            if not any(t in b_final for t in t_final):
                                numsites = numsites + 1
                                outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                                outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(rule)
        cur_c = cur_c + 1
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
