예제 #1
0
def run_filter(args):
    '''Filter SNPs in an allele-count file, writing surviving lines to args.out.

    Each input line is tab-delimited; columns 5+ hold per-pool allele counts.
    A SNP is dropped as soon as apply_filter() fails for any pool.
    '''
    sz_utils.check_if_files_exist(args.ac_file)
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    before, after = 0, 0  # number of SNPs before and after filtration
    with open(args.ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            before += 1
            fail = 0
            # test each pool's counts; stop at the first failure
            for counts in tmp_line[4:]:
                fail = apply_filter(counts, fail, args)
                if fail:
                    break
            if not fail:
                fOUT.write(line)
                after += 1
    # bug fix: never close sys.stdout -- closing it would break any
    # subsequent writes by this process
    if fOUT is not sys.stdout:
        fOUT.close()
    ColorText().info("Number of SNPs before filtering: %d\n" % (before),
                     "stderr")
    ColorText().info("Number of SNPs after filtering: %d\n" % (after),
                     "stderr")
예제 #2
0
def run_filter(args):
    '''Filter SNPs in an allele-count file, writing surviving lines to args.out.

    Columns 5+ of each tab-delimited line hold per-pool allele counts;
    a SNP is dropped as soon as apply_filter() fails for any pool.
    '''
    sz_utils.check_if_files_exist(args.ac_file)
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    before, after = 0, 0  # number of SNPs before and after filtration
    with open(args.ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            before += 1
            fail = 0
            # short-circuit on the first pool whose counts fail the filter
            for counts in tmp_line[4:]:
                fail = apply_filter(counts, fail, args)
                if fail:
                    break
            if not fail:
                fOUT.write(line)
                after += 1
    # bug fix: only close real files, never sys.stdout
    if fOUT is not sys.stdout:
        fOUT.close()
    ColorText().info("Number of SNPs before filtering: %d\n" % (before), "stderr")
    ColorText().info("Number of SNPs after filtering: %d\n" % (after), "stderr")
예제 #3
0
def run_overlap(args):
    ''' getting SNPs identified from both pools '''
    sz_utils.check_if_files_exist(args.file_a, args.file_b)

    # index every record of file_a by its integer position column
    records_a = collections.defaultdict(list)
    with open(args.file_a, 'r') as fA:
        for record in fA:
            fields = record.strip().split("\t")
            records_a[int(fields[1])] = fields
    ColorText().info(
        "[poolseq_tk]: %d SNPs parsed from %s\n" %
        (len(records_a), os.path.basename(args.file_a)), "stderr")

    sz_utils.make_dirs_if_necessary(args.out)

    # scan file_b; any position already indexed from file_a is an overlap
    num_overlapion = 0
    with open(args.out, 'w') as fOUT:
        with open(args.file_b, 'r') as fB:
            for record in fB:
                fields = record.strip().split("\t")
                position = int(fields[1])
                if position not in records_a:
                    continue
                num_overlapion += 1
                joined_a = "\t".join(records_a[position])
                joined_b = "\t".join(fields[-4:])
                fOUT.write("%s\t%s\n" % (joined_a, joined_b))
    ColorText().info(
        "[poolseq_tk]: %d SNPs identified from both pools\n" %
        (num_overlapion), "stderr")
예제 #4
0
def run_biallelic(args):
    '''Keep SNP sites whose pileup reads are (near-)biallelic.

    A site passes when fewer than 5%% of its read bases are neither the
    reference nor the alternative allele; passing sites are written to
    args.out as chr/pos/ref/alt.  Summary counts are printed to stdout.
    '''
    dPileups = syncPileups(args.pileups)

    sz_utils.make_dirs_if_necessary(args.out)
    fOUT = open(args.out, 'w')
    nZeroCov = 0   # sites with no covering reads
    nSNPsKept = 0  # sites passing the biallelic check
    nMulti = 0     # sites with >= 5% other (third-allele) bases
    # sorted(dict) iterates keys under both Python 2 and 3
    # (the original dict.iterkeys() is Python-2-only)
    for k in sorted(dPileups):
        chrom, pos = k
        ref_base = dPileups[k][0]
        alt_base = dPileups[k][1]
        reads_bases = dPileups[k][2]
        if len(reads_bases) > 0:
            # count both cases: pileup bases may be upper or lower case
            ref_count = (reads_bases.count(ref_base) +
                         reads_bases.count(ref_base.lower()))
            alt_count = (reads_bases.count(alt_base) +
                         reads_bases.count(alt_base.lower()))
            other_count = len(reads_bases) - ref_count - alt_count
            if float(other_count) / len(reads_bases) < 0.05:
                fOUT.write("%s\t%d\t%s\t%s\n" % (chrom, pos, ref_base, alt_base))
                nSNPsKept += 1
            else:
                nMulti += 1
        else:
            nZeroCov += 1
    fOUT.close()

    # print(x) with a single argument produces identical output
    # under Python 2 (parenthesized expression) and Python 3
    print(nSNPsKept)
    print(nMulti)
    print(nZeroCov)
예제 #5
0
def run_merge(args):
    '''Combine allele counts across replicates.

    For each position, the colon-separated counts in column 5 of every
    input file are summed element-wise; the first four columns of the
    first file seen provide the site annotation.
    '''
    allele_counts = collections.defaultdict(list)  # pos -> running count totals
    data = collections.defaultdict(list)           # pos -> first four columns
    for ac_file in args.acs:
        sz_utils.check_if_files_exist(ac_file)
        ColorText().info("[poolseq_tk] reading and updating allele counts from %s ..."
                         % (ac_file), "stderr")
        with open(ac_file) as fAC:
            for line in fAC:
                tmp_line = line.strip().split()
                pos = int(tmp_line[1])
                if pos not in data:
                    data[pos] = tmp_line[0:4]
                # list(map(...)) keeps real lists under both Python 2 and 3
                if pos not in allele_counts:
                    allele_counts[pos] = list(map(int, tmp_line[4].split(':')))
                else:
                    allele_counts[pos] = [a + b for a, b in
                                          zip(allele_counts[pos],
                                              map(int, tmp_line[4].split(':')))]
        ColorText().info(" [done]\n", "stderr")

    # output to file
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting to %s ..."
                     % (fOUT.name), "stderr")
    for pos in sorted(allele_counts):
        fOUT.write("%s\t%s\n" % ("\t".join(data[pos]),
                                 ":".join(map(str, allele_counts[pos]))))
    ColorText().info(" [done]\n", "stderr")
    # bug fix: the output file was never closed (risking unflushed data);
    # close it now, but never close sys.stdout
    if fOUT is not sys.stdout:
        fOUT.close()
예제 #6
0
def main():
    '''Command-line entry point: argv = [isnp, m1, m2, out].'''
    # pull the four positional arguments; indexing inside the generator
    # raises the same IndexError as direct subscripts when any is missing
    isnp, m1, m2, out = (sys.argv[i] for i in range(1, 5))

    sz_utils.make_dirs_if_necessary(out)
    run_collapse(isnp, m1, m2, out)
예제 #7
0
def run_prepVCF(args):
    '''Convert a test-result table to VCF-like output.

    Applies the optional "ratio" filter, annotates per-SNP Fst when an
    Fst file was given, and emits one genotype column per pool with the
    allele-count table (':' replaced by '-') in the Table field.
    '''
    sz_utils.check_if_files_exist(args.infile)

    # dfst: chromosome -> list of (pos, fst) entries, if provided
    dfst = collections.defaultdict(list)
    if args.ifst:
        dfst = getFst(args.ifst, dfst)

    dfilters = getFilters(args.filters)

    sz_utils.make_dirs_if_necessary(args.out)
    fOUT = open(args.out, 'w')
    outVCFHeaders(args.samples, fOUT)
    with open(args.infile, "r") as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chrom = tmp_line[0]  # renamed from 'chr' to avoid shadowing the builtin
            pos = int(tmp_line[1])
            refBase = tmp_line[2]
            altBase = tmp_line[3]
            pval = float(tmp_line[8])
            corrPval = float(tmp_line[10])
            ratio = float(tmp_line[-1])
            fst = -1.0  # sentinel: no Fst known for this site
            if chrom in dfst:
                for j in range(len(dfst[chrom])):
                    if pos == dfst[chrom][j][0]:
                        fst = float(dfst[chrom][j][1])
                        # each Fst entry is used once; drop it to shorten later scans
                        dfst[chrom].pop(j)
                        break
            if "ratio" in dfilters:
                # skip SNPs whose ratio is on the wrong side of the cutoff
                # (parentheses added: 'and' binds tighter than 'or')
                if ((dfilters["ratio"][0] == '<'
                     and ratio >= dfilters["ratio"][1])
                        or (dfilters["ratio"][0] == '>'
                            and ratio <= dfilters["ratio"][1])):
                    continue
            if "pval" in dfilters:  # fix later
                pass
            if "corrPval" in dfilters:  # fix later
                pass
            fOUT.write("%s\t%s\t.\t%s\t%s\t.\t.\t" %
                       (chrom, pos, refBase, altBase))
            if fst == -1.0:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f\t" %
                           (pval, corrPval, ratio))
            else:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f;fst=%.5f\t" %
                           (pval, corrPval, ratio, fst))
            fOUT.write("GT:Table")
            # one genotype column per pool (columns 5..-4 of the input line);
            # the unused poolIndex counter from the original was removed
            for pool_table in tmp_line[4:-3]:
                fOUT.write("\t./.:%s" % (pool_table.replace(':', '-')))
            fOUT.write("\n")
    fOUT.close()
예제 #8
0
def run_prepVCF(args):
    '''Convert a test-result table to VCF-like output.

    Applies the optional "ratio" filter, annotates per-SNP Fst when an
    Fst file was given, and emits one genotype column per pool with the
    allele-count table (':' replaced by '-') in the Table field.
    '''
    sz_utils.check_if_files_exist(args.infile)

    # dfst: chromosome -> list of (pos, fst) entries, if provided
    dfst = collections.defaultdict(list)
    if args.ifst:
        dfst = getFst(args.ifst, dfst)

    dfilters = getFilters(args.filters)

    sz_utils.make_dirs_if_necessary(args.out)
    fOUT = open(args.out, 'w')
    outVCFHeaders(args.samples, fOUT)
    with open(args.infile, "r") as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chrom = tmp_line[0]  # renamed from 'chr' to avoid shadowing the builtin
            pos = int(tmp_line[1])
            refBase = tmp_line[2]
            altBase = tmp_line[3]
            pval = float(tmp_line[8])
            corrPval = float(tmp_line[10])
            ratio = float(tmp_line[-1])
            fst = -1.0  # sentinel: no Fst known for this site
            if chrom in dfst:
                for j in range(len(dfst[chrom])):
                    if pos == dfst[chrom][j][0]:
                        fst = float(dfst[chrom][j][1])
                        # each Fst entry is used once; drop it to shorten later scans
                        dfst[chrom].pop(j)
                        break
            if "ratio" in dfilters:
                # parentheses added: 'and' binds tighter than 'or'
                if ((dfilters["ratio"][0] == '<' and ratio >= dfilters["ratio"][1]) or
                        (dfilters["ratio"][0] == '>' and ratio <= dfilters["ratio"][1])):
                    continue
            if "pval" in dfilters:  # fix later
                pass
            if "corrPval" in dfilters:  # fix later
                pass
            fOUT.write("%s\t%s\t.\t%s\t%s\t.\t.\t"
                       % (chrom, pos, refBase, altBase))
            if fst == -1.0:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f\t"
                           % (pval, corrPval, ratio))
            else:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f;fst=%.5f\t"
                           % (pval, corrPval, ratio, fst))
            fOUT.write("GT:Table")
            # one genotype column per pool (columns 5..-4 of the input line);
            # the unused poolIndex counter from the original was removed
            for pool_table in tmp_line[4:-3]:
                fOUT.write("\t./.:%s" % (pool_table.replace(':', '-')))
            fOUT.write("\n")
    fOUT.close()
예제 #9
0
def run_view(args):
    '''Write pileup records restricted to a given set of SNP positions.

    For each pileup line whose (chr, pos) appears in the SNP table, the
    read-bases string is re-parsed relative to ref/alt and a 5-column
    record is written to args.out.  The program exits if the pileup's
    reference base disagrees with the SNP table.
    '''
    check_if_files_exist(args.ipileup)
    make_dirs_if_necessary(args.out)

    # dSNPs maps (chr, pos) -> pair whose [0] is the expected ref base
    # and [1] the alternative base (per the comparisons below)
    dSNPs = getSNPs(args.isnp)
    fOUT = open(args.out, 'w')
    #	nRemoved = 0
    with open(args.ipileup, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            if (chr, pos) in dSNPs:
                cov = int(tmp_line[3])  # read coverage at this site
                ref_base = tmp_line[2].upper()
                alt_base = dSNPs[chr, pos][1]
                if ref_base == dSNPs[chr, pos][0]:
                    # only emit sites that are actually covered by reads
                    if cov > 0:
                        reads_bases = tmp_line[4]
                        reads_bases_parsed = parseReadsBases(
                            ref_base, alt_base, reads_bases)
                        fOUT.write(
                            "%s\t%d\t%s\t%s\t%s\n" %
                            (chr, pos, ref_base, alt_base, reads_bases_parsed))
#						reads_bases_parsed, nReadsBases, nRefBases, dMultiBases, dIndels = parseReadsBases(reads_bases, ref_base, alt_base)

# the following is a checkup on other alleles
# at this moment this checkup is inactive
# number of alleles (SNPs, Indels) other than the alternative allele
#					nMultiBases = sum(dMultiBases.values()) + sum(dIndels.values())
#					if (nReadsBases == nRefBases or
#						(nMultiBases)/float(nReadsBases) <= 0.05):
#						out.write("%s\t%d\t%s\t%s\t%s\n" %(chr, pos, ref_base, alt_base,
#														   reads_bases_parsed))
#					else:
#						nRemoved += 1
#						print pos, ref_base, alt_base, reads_bases, reads_bases_parsed
#						print dMultiBases
#						print dIndels
#						print
                else:
                    # pileup and SNP table disagree on the reference base: abort
                    sys.stderr.write("reference base not consistent\n")
                    sys.stderr.write(line)
                    sys.exit()
                # each SNP is handled at most once
                del dSNPs[chr, pos]
    fOUT.close()
예제 #10
0
def run_view(args):
	'''Write pileup records restricted to a given set of SNP positions.

	For each pileup line whose (chr, pos) appears in the SNP table, the
	read-bases string is re-parsed relative to ref/alt and a 5-column
	record is written to args.out.  The program exits if the pileup's
	reference base disagrees with the SNP table.
	'''
	check_if_files_exist(args.ipileup)
	make_dirs_if_necessary(args.out)

	# dSNPs maps (chr, pos) -> pair whose [0] is the expected ref base
	# and [1] the alternative base (per the comparisons below)
	dSNPs = getSNPs(args.isnp)
	fOUT = open(args.out, 'w')
#	nRemoved = 0
	with open(args.ipileup, 'r') as fIN:
		for line in fIN:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			if (chr, pos) in dSNPs:
				cov = int(tmp_line[3])  # read coverage at this site
				ref_base = tmp_line[2].upper()
				alt_base = dSNPs[chr, pos][1]
				if ref_base == dSNPs[chr, pos][0]:
					# only emit sites that are actually covered by reads
					if cov > 0:
						reads_bases = tmp_line[4]
						reads_bases_parsed = parseReadsBases(ref_base, alt_base, reads_bases)
						fOUT.write("%s\t%d\t%s\t%s\t%s\n" %(chr, pos, ref_base, alt_base,
														    reads_bases_parsed))
#						reads_bases_parsed, nReadsBases, nRefBases, dMultiBases, dIndels = parseReadsBases(reads_bases, ref_base, alt_base)

					# the following is a checkup on other alleles
					# at this moment this checkup is inactive
					# number of alleles (SNPs, Indels) other than the alternative allele
#					nMultiBases = sum(dMultiBases.values()) + sum(dIndels.values())
#					if (nReadsBases == nRefBases or
#						(nMultiBases)/float(nReadsBases) <= 0.05):
#						out.write("%s\t%d\t%s\t%s\t%s\n" %(chr, pos, ref_base, alt_base,
#														   reads_bases_parsed))
#					else:
#						nRemoved += 1
#						print pos, ref_base, alt_base, reads_bases, reads_bases_parsed
#						print dMultiBases
#						print dIndels
#						print
				else:
					# pileup and SNP table disagree on the reference base: abort
					sys.stderr.write("reference base not consistent\n")
					sys.stderr.write(line)
					sys.exit()
				# each SNP is handled at most once
				del dSNPs[chr, pos]
	fOUT.close()
예제 #11
0
def run_merge(args):
    '''Combine allele counts across replicates.

    For each position, the colon-separated counts in column 5 of every
    input file are summed element-wise; the first four columns of the
    first file seen provide the site annotation.
    '''
    allele_counts = collections.defaultdict(list)  # pos -> running count totals
    data = collections.defaultdict(list)           # pos -> first four columns
    for ac_file in args.acs:
        sz_utils.check_if_files_exist(ac_file)
        ColorText().info(
            "[poolseq_tk] reading and updating allele counts from %s ..." %
            (ac_file), "stderr")
        with open(ac_file) as fAC:
            for line in fAC:
                tmp_line = line.strip().split()
                pos = int(tmp_line[1])
                if pos not in data:
                    data[pos] = tmp_line[0:4]
                # list(map(...)) keeps real lists under both Python 2 and 3
                if pos not in allele_counts:
                    allele_counts[pos] = list(map(int, tmp_line[4].split(':')))
                else:
                    allele_counts[pos] = [
                        a + b for a, b in zip(allele_counts[pos],
                                              map(int, tmp_line[4].split(':')))
                    ]
        ColorText().info(" [done]\n", "stderr")

    # output to file
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting to %s ..." % (fOUT.name),
                     "stderr")
    for pos in sorted(allele_counts):
        fOUT.write(
            "%s\t%s\n" %
            ("\t".join(data[pos]), ":".join(map(str, allele_counts[pos]))))
    ColorText().info(" [done]\n", "stderr")
    # bug fix: the output file was never closed (risking unflushed data);
    # close it now, but never close sys.stdout
    if fOUT is not sys.stdout:
        fOUT.close()
예제 #12
0
def run_overlap(args):
    ''' getting SNPs identified from both pools '''
    sz_utils.check_if_files_exist(args.file_a, args.file_b)

    # build a position -> record lookup from file_a
    lookup = collections.defaultdict(list)
    with open(args.file_a, 'r') as fA:
        for row in fA:
            fields = row.strip().split("\t")
            lookup[int(fields[1])] = fields
    ColorText().info(
        "[poolseq_tk]: %d SNPs parsed from %s\n" %
        (len(lookup), os.path.basename(args.file_a)), "stderr")

    sz_utils.make_dirs_if_necessary(args.out)

    # every file_b position also present in file_a is an overlap
    num_overlapion = 0
    with open(args.out, 'w') as fOUT:
        with open(args.file_b, 'r') as fB:
            for row in fB:
                fields = row.strip().split("\t")
                position = int(fields[1])
                if position in lookup:
                    num_overlapion += 1
                    fOUT.write("%s\t%s\n" % ("\t".join(lookup[position]),
                                             "\t".join(fields[-4:])))
    ColorText().info(
        "[poolseq_tk]: %d SNPs identified from both pools\n" %
        (num_overlapion), "stderr")
예제 #13
0
def run_collapse(args):
	'''
		Given two pileup files of the same region, like 2l+ and 2la,
		collapse the pileups at each corresponding SNP
		Some SNPs are not reported in one or the other pileup file.
		A full list of SNP positions are required
	'''
	m1_base = os.path.basename(args.m1)
	m2_base = os.path.basename(args.m2)

	# first, getting the full list of SNPs
	# dSNPs: pos -> (ref base expected in m1, ref base expected in m2)
	dSNPs = get_SNPs(args.snps)

	# second, reading each of the pileup files
	# dM*: pos -> (observed ref base, read bases); offsets shift coordinates
	chr1, dM1 = read_mpileup(args.m1, args.offset1)
	chr2, dM2 = read_mpileup(args.m2, args.offset2)

	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m1_base, len(dM1)), "stderr")
	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m2_base, len(dM2)), "stderr")

	# open the output destination (stdout is used as-is, no close needed there)
	fOUT = None
	if args.out != sys.stdout:
		outdir = os.path.dirname(os.path.realpath(args.out))
		sz_utils.make_dirs_if_necessary(outdir)
		fOUT = open(args.out, 'w')
	else:
		fOUT = args.out
	ColorText().info("[poolseq_tk]: collapsing mpileups %s and %s ..."
					 %(m1_base, m2_base), "stderr")
	# four cases per SNP position: present in both pileups, in neither,
	# only in m1, or only in m2
	for pos in sorted(dSNPs.iterkeys()):
		reads_bases_collapsed = ""
		if pos in dM1 and pos in dM2:
			'''
				dSNPs[pos][0]: ref base of m1 pileup
				dSNPs[pos][1]: ref base of m2 pileup
				dM1[pos][0]: ref base of m1 pileup
				dM2[pos][1]: ref base of m2 pileup
			'''
			# NOTE(review): the last line of the note above presumably means
			# dM2[pos][0] (the observed m2 ref base), which is what the code uses
			if dSNPs[pos][0] == dM1[pos][0] and dSNPs[pos][1] == dM2[pos][0]:
				# concatenate both pileups' parsed bases, each parsed
				# relative to its own ref with the other's ref as alt
				reads_bases_collapsed = parseReadsBases(dM1[pos][0],
															 dM2[pos][0],
															 dM1[pos][1])
				reads_bases_collapsed += parseReadsBases(dM2[pos][0],
															 dM1[pos][0],
															 dM2[pos][1])
				fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
								%(chr1, chr2, pos,
								  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
			else:
				# this should bark if the same sites having different states
				ColorText().error("SNP position: %d %s %s\t\tMpileup position: %d %s %s\n"
								 %(pos, dSNPs[pos][0], dSNPs[pos][1],
								   pos, dM1[pos][0], dM2[pos][0]),
								   "stderr")
		# SNPs missed in both pileup files: emit the site without read bases
		elif pos not in dM1 and pos not in dM2:
			fOUT.write("%s/%s\t%d\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1]))
		# SNPs in m1 pileup file but not in m2
		elif pos in dM1 and pos not in dM2:
			reads_bases_collapsed = parseReadsBases(dM1[pos][0],
														 dSNPs[pos][1],
														 dM1[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
		# SNPs in m2 pileup file but not in m1
		elif pos not in dM1 and pos in dM2:
			reads_bases_collapsed = parseReadsBases(dM2[pos][0],
														 dSNPs[pos][0],
														 dM2[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
	ColorText().info(" [done]\n", "stderr")
	fOUT.close()
예제 #14
0
def run_fisher(args):
    ''' run Fisher's Exact test '''
    sz_utils.make_dirs_if_necessary(args.outp)
    sz_utils.check_if_files_exist(args.ac_file)
    # tables: per-SNP contingency tables built from the allele-count file
    tables = sz_utils._count2table(args.ac_file)[0]

    # fan the tables out to worker processes; each worker writes its
    # results to a temporary file whose path comes back on result_q
    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n",
                         "stderr")
        sys.exit()
    else:
        pvals, odds_ratios, log10_pvals = {}, {}, {}
        # drain one result file per worker; args.nproc doubles as the
        # countdown of workers left to collect (it is consumed here)
        while args.nproc:
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    # result columns: chr, pos, pval, odds_ratio, log10(pval)
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    log10_pval = tmp_line[4]
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
                    if (chr, pos) not in log10_pvals:
                        log10_pvals[chr, pos] = log10_pval
            # temp result files are merged, then deleted
            os.remove(file)
            #			pvals_split, odds_ratios_split = result_q.get()
            #			pvals.update(pvals_split)
            #			odds_ratios.update(odds_ratios_split)
            args.nproc -= 1
        ColorText().info(
            "[poolseq_tk]: Running Fisher's Exact tests successfully\n",
            "stderr")

        # correcting raw p-values and make QQ plots
        # multiple-testing correction via R's p.adjust (through rpy2)
        ColorText().info(
            "[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
            % (args.adj_method, args.adj_cutoff * 100), "stderr")
        # keys are sorted so padjust[i] lines up with the i-th sorted SNP below
        raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector,
                                         method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info(
            "[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
            % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        # .all gets every SNP; .fdrN only those passing the adjusted cutoff;
        # .fdrN.expect additionally requires the odds ratio in the expected direction
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".fisher.all"
        out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff *
                                                           100)
        with open(out_all, 'w') as fALL, \
          open(out_fdr, 'w') as fFDR, \
          open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals.iterkeys())):
                chr = k[0]
                pos = k[1]
                raw_pval = pvals[k]
                log_pval = log10_pvals[k]
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[k][1:3]),
                                                tables[k][3:], raw_pval,
                                                log_pval, padjust[i],
                                                odds_ratio)
                    if ((args.oddsr_direction == "greater"
                         and odds_ratios[k] > 1)
                            or (args.oddsr_direction == "less"
                                and odds_ratios[k] < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[k][1:3]),
                                                    tables[k][3:], raw_pval,
                                                    log_pval, padjust[i],
                                                    odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[k][1:3]),
                                            tables[k][3:], raw_pval, log_pval,
                                            padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finishes successfully\n",
                         "stderr")
예제 #15
0
def making_plot(args):
    ''' making Q-Q plot and Manhattan plot '''
    # install qqman package if not installed

    if not rpackages.isinstalled("qqman"):
        rutils = rpackages.importr('utils')
        rutils.chooseCRANmirror(ind=84)  # mirror index 84 hard-coded
        rutils.install_packages("qqman")

    # get pvalues
    ColorText().info("[poolseq_tk]: Extracting P-Values ... ", "stderr")
    # data[(chr, pos)] = 1-based index of the chromosome (for plotting);
    # pvals[(chr, pos)] = raw p-value from column 9 of the input
    data = collections.defaultdict()
    chrs = []
    pvals, adjust_pvals = {}, {}
    nchr = 0
    with open(args.input, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            if chr not in chrs:
                chrs.append(chr)
                nchr += 1
            data[chr, pos] = nchr
            pvals[chr, pos] = float(tmp_line[8])
    ColorText().info(" [done]\n", "stderr")

    # get FDR cutoff using BH if not provided through command line
    pcutoff = 0.0
    if not args.pcutoff:
        ColorText().info(
            "[poolseq_tk]: Getting p-value cutoff at FDR %d%%: " %
            (args.fdrlevel * 100), "stderr")
        pcutoff = sz_utils.getFDR_BH(pvals, args.fdrlevel)
        ColorText().info("%.5e\n" % (pcutoff), "stderr")
    else:
        pcutoff = args.pcutoff
        ColorText().info(
            "[poolseq_tk]: p-value cutoff provided: %.5e\n" % (pcutoff),
            "stderr")

    # get SNPs to highlight
    # entries are "chr_pos" strings built from the first two columns
    snps_to_highlight = []
    if args.highlight_snps:
        ColorText().info(
            "[poolseq_tk]: Getting SNPs to be highlighed in Manhattan plot ... ",
            "stderr")
        with open(args.highlight_snps, 'r') as fHIGHLIGHT:
            for line in fHIGHLIGHT:
                tmp_line = line.strip().split("\t")
                snps_to_highlight.append('_'.join(tmp_line[:2]))
    ColorText().info(" [done]\n", "stderr")

    # NOTE(review): if neither args.pdf nor args.png is set, out_qqplot and
    # out_manhattan are never assigned and the call below raises NameError --
    # confirm the argument parser forces one of the two flags
    if args.pdf:
        out_qqplot = args.outp + ".qqplot.pdf"
        out_manhattan = args.outp + ".manhattan.pdf"
    elif args.png:  # save to PNG probably wont work
        out_qqplot = args.outp + ".qqplot.png"
        out_manhattan = args.outp + ".manhattan.png"
    sz_utils.make_dirs_if_necessary(out_qqplot, out_manhattan)
    grdevices = rpackages.importr('grDevices')
    # sort keys so the vector order is deterministic
    raw_pvals_vector = robjects.FloatVector(
        [pvals[k] for k in sorted(pvals.iterkeys())])

    ColorText().info("[poolseq_tk]: Making Q-Q plot ...", "stderr")
    make_qqplots(grdevices, raw_pvals_vector, out_qqplot, args.qqtitle)
    ColorText().info(" [done]\n", "stderr")

    ColorText().info("[poolseq_tk]: Making Manhattan plot ...", "stderr")
    make_manhattan(grdevices, data, raw_pvals_vector, snps_to_highlight,
                   pcutoff, out_manhattan, args.mantitle, args.manx,
                   args.manxlim)
    ColorText().info(" [done]\n", "stderr")
예제 #16
0
def run_count(args):
    '''Count alleles at each SNP in the given pileup files.

    Builds, per (chr, pos), a record [ref, alt, ref1, alt1, ref2, alt2, ...]
    with one ref/alt count pair per pileup file, and writes a table with
    one colon-joined group per file.  Sites can optionally be restricted
    to a whitelist given via args.pos.
    '''
    # optional whitelist of (chr, pos) sites to count
    dPos = {}
    if args.pos:
        ColorText().info("[poolseq_tk] reading SNPs positions:", "stderr")
        with open(args.pos, 'r') as fPOS:
            for line in fPOS:
                tmp_line = line.strip().split("\t")
                chrom = tmp_line[0]
                pos = int(tmp_line[1])
                if (chrom, pos) not in dPos:
                    dPos[chrom, pos] = 1
        ColorText().info(" %d\n" % (len(dPos)), "stderr")
    else:
        ColorText().info(
            "[poolseq_tk] no SNP positions provided ... [skipped]\n", "stderr")

    ac = collections.defaultdict(tuple)
    for pileup in args.pileups:
        sz_utils.check_if_files_exist(pileup)
        nsnps = 0
        ColorText().info(
            "[poolseq_tk] counting alleles in %s:" %
            (os.path.basename(pileup)), "stderr")
        with open(pileup, 'r') as fMPILEUP:
            for line in fMPILEUP:
                nsnps += 1
                tmp_line = line.strip().split("\t")
                chrom = tmp_line[0]
                pos = int(tmp_line[1])
                # count the site when it is whitelisted, or when no
                # whitelist was supplied at all
                if (((chrom, pos) in dPos and args.pos)
                        or (len(dPos) == 0 and not args.pos)):
                    ref_base = tmp_line[2]
                    alt_base = tmp_line[3]
                    nRefAlleles, nAltAlleles = 0, 0
                    # lines without a read-bases column contribute zero counts
                    if len(tmp_line) == 5:
                        # pileup bases may be upper or lower case
                        nRefAlleles = (tmp_line[-1].count(ref_base) +
                                       tmp_line[-1].count(ref_base.lower()))
                        nAltAlleles = (tmp_line[-1].count(alt_base) +
                                       tmp_line[-1].count(alt_base.lower()))
                    if (chrom, pos) not in ac:
                        ac[chrom, pos] = [
                            ref_base, alt_base,
                            str(nRefAlleles),
                            str(nAltAlleles)
                        ]
                    else:
                        ac[chrom, pos] += [str(nRefAlleles), str(nAltAlleles)]
        ColorText().info(" %d SNPs parsed\n" % (nsnps), "stderr")

    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting allele counts to table ...",
                     "stderr")
    # sorted(dict) works under Python 2 and 3 (iterkeys() is Py2-only)
    for k in sorted(ac):
        chrom, pos = k
        i = 2
        # emit only sites covered in every pileup (two counts per file)
        if len(ac[k][i:]) == 2 * len(args.pileups):
            fOUT.write("%s\t%d\t%s" % (chrom, pos, "\t".join(ac[k][0:2])))
            while i <= len(ac[k]) - 4:
                fOUT.write("\t%s" % (":".join(ac[k][i:i + 4])))
                i += 4
            fOUT.write("\n")
    ColorText().info(" [done]\n", "stderr")
    # bug fix: only close a real output file, never sys.stdout
    if fOUT is not sys.stdout:
        fOUT.close()
예제 #17
0
def run_cmh(args):
	''' run Cochran-Mantel-Hasenzle test '''

	sz_utils.make_dirs_if_necessary(args.outp)
	allele_counts = {}
	pvals = {}
	tables = collections.defaultdict(list)
	ntests = 0
	# tables: per-SNP stratified contingency tables from the count file
	tables, ntables_per_snp = sz_utils._count2table(args.table_file)
	ColorText().info("[poolseq_tk]: %d tables prepared\n" %(len(tables)), "stderr")

	# fan the tables out to worker processes; each worker writes its
	# results to a temporary file whose path comes back on result_q
	task_q = mp.JoinableQueue()
	result_q = mp.Queue()
	create_procs(args.nproc,task_q, result_q, ntables_per_snp, args.outp)
	sz_utils._assign_tables(tables, task_q, args.nproc)

	# waiting for all tasks to be finished
	try:
		task_q.join()
	except KeyboardInterrupt:
		ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
		sys.exit()
	else:
		# merge results
		# drain one result file per worker; args.nproc doubles as the
		# countdown of workers left to collect (it is consumed here)
		pvals, odds_ratios = {}, {}
		while args.nproc:
			file = result_q.get()
			with open(file, 'r') as fIN:
				for line in fIN:
					# result columns: chr, pos, pval, odds_ratio
					tmp_line = line.strip().split("\t")
					chr = tmp_line[0]
					pos = int(tmp_line[1])
					pval = float(tmp_line[2])
					odds_ratio = float(tmp_line[3])
					if (chr, pos) not in pvals:
						pvals[chr, pos] = pval
					if (chr, pos) not in odds_ratios:
						odds_ratios[chr, pos] = odds_ratio
			# temp result files are merged, then deleted
			os.remove(file)
#			pvals_split, odds_ratios_split = result_q.get()
#			pvals.update(pvals_split)
#			odds_ratios.update(odds_ratios_split)
			args.nproc -= 1
		ColorText().info("[poolseq_tk]: Running CMH tests successfully\n", "stderr")

		# correcting raw p-values
		# multiple-testing correction via R's p.adjust (through rpy2);
		# keys are sorted so padjust[i] lines up with the i-th sorted SNP below
		ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
						 %(args.adj_method, args.adj_cutoff*100), "stderr")
		raw_pvals = [pvals[chr, pos] for chr, pos in sorted(pvals.iterkeys())]
		raw_pvals_vector = robjects.FloatVector(raw_pvals)
		padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
		ColorText().info(" [done]\n", "stderr")
		pcutoff = sz_utils.getFDR_BH(pvals, args.adj_cutoff)
		ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
						 %(pcutoff), "stderr")
		ColorText().info(" [done]\n", "stderr")

		# output p-values
		# .all gets every SNP; .fdrN only those passing the adjusted cutoff;
		# .fdrN.expect additionally requires the odds ratio in the expected direction
		ColorText().info("[poolseq_tk]: output to files ...", "stderr")
		out_all = args.outp + ".cmh.all"
		out_fdr = args.outp + ".cmh.fdr%d" %(args.adj_cutoff*100)
		out_expect = args.outp + ".cmh.fdr%d.expect" %(args.adj_cutoff*100)
		sz_utils.make_dirs_if_necessary(out_all, out_fdr)
		with open(out_all, 'w') as fALL, \
			 open(out_fdr, 'w') as fFDR, \
			 open(out_expect, 'w') as fEXPECT:
			for i, k in enumerate(sorted(pvals.iterkeys())):
				chr = k[0]
				pos = k[1]
				raw_pval = pvals[chr, pos]
				# -log10 transform; p == 0 maps to "Inf", the "Nan" case
				# is treated as p = 1 (no signal)
				log_pval = None
				if raw_pval == 0.0:
					log_pval = "Inf"
				elif raw_pval == "Nan":
					raw_pval = 1.0
					log_pval = 0.0
				else:
					log_pval = -1 * math.log10(raw_pval)
				odds_ratio = odds_ratios[k]
				if padjust[i] <= args.adj_cutoff:
					sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
					if ((args.oddsr_direction == "greater" and odds_ratios[chr, pos] > 1) or
						(args.oddsr_direction == "less" and odds_ratios[chr, pos] < 1)):
						sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
				sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
		ColorText().info(" [done]\n", "stderr")
		ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")