Example #1
def run_filter(args):
    sz_utils.check_if_files_exist(args.ac_file)
    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    before, after = 0, 0  # number of SNPs before and after filtering
    with open(args.ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            before += 1
            ref_base = tmp_line[2]
            alt_base = tmp_line[3]
            fail = 0
            for counts in tmp_line[4:]:
                fail = apply_filter(counts, fail, args)
                if fail:
                    break
            if not fail:
                fOUT.write(line)
                after += 1
    fOUT.close()
    ColorText().info("Number of SNPs before filtering: %d\n" % (before),
                     "stderr")
    ColorText().info("Number of SNPs after filtering: %d\n" % (after),
                     "stderr")
Example #2
def read_mpileup(mpileup_file, offset):
    ''' read certain columns of a pileup file into a dictionary of tuples '''
    ColorText().info("[poolseq_tk]: reading %s ..." % (mpileup_file), "stderr")
    mpileup_info = collections.defaultdict(tuple)
    chr = ""
    sz_utils.check_if_files_exist(mpileup_file)
    with open(mpileup_file, 'r') as fMPILEUP:
        for line in fMPILEUP:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            # key:   SNP position (int)
            # value: a tuple of two elements:
            #        1) ref base at that position
            #        2) read bases covering that position
            #           (recorded only when the coverage at that position > 0)
            # if int(tmp_line[3]) == 0:    # the fourth column: coverage at a position
            #     mpileup_info[int(tmp_line[1])+offset] = (tmp_line[2].upper(), "N/A")
            if int(tmp_line[3]) > 0:
                mpileup_info[int(tmp_line[1]) + offset] = (tmp_line[2].upper(),
                                                           tmp_line[4])
    ColorText().info(" [done]\n", "stderr")
    return chr, mpileup_info
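Each pileup line is tab-delimited: chromosome, 1-based position, reference base, coverage, then the read bases (and qualities). A minimal usage sketch of read_mpileup, assuming a hypothetical file name:

# given a line such as "2L<TAB>10042<TAB>A<TAB>12<TAB>.,.,TT.,..t," the call below
# returns ("2L", {10042 + offset: ("A", ".,.,TT.,..t,")});
# zero-coverage positions are skipped entirely
chrom, mpileup_info = read_mpileup("sample.mpileup", offset=0)
print(len(mpileup_info))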
Example #3
def run_overlap(args):
    ''' getting SNPs identified from both pools '''
    sz_utils.check_if_files_exist(args.file_a, args.file_b)

    snp_a = collections.defaultdict(list)
    with open(args.file_a, 'r') as fA:
        for line in fA:
            tmp_line = line.strip().split("\t")
            snp_a[int(tmp_line[1])] = tmp_line
    ColorText().info(
        "[poolseq_tk]: %d SNPs parsed from %s\n" %
        (len(snp_a), os.path.basename(args.file_a)), "stderr")

    sz_utils.make_dirs_if_necessary(args.out)

    num_overlap = 0
    with open(args.out, 'w') as fOUT:
        with open(args.file_b, 'r') as fB:
            for line in fB:
                tmp_line = line.strip().split("\t")
                if int(tmp_line[1]) in snp_a:
                    num_overlap += 1
                    fOUT.write("%s\t%s\n" % ("\t".join(snp_a[int(
                        tmp_line[1])]), "\t".join(tmp_line[-4:])))
    ColorText().info(
        "[poolseq_tk]: %d SNPs identified from both pools\n" %
        (num_overlap), "stderr")
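The overlap is a join on the position column: positions from file A are kept in a dict, and rows from file B at those positions are written out with the last four columns of B appended. A tiny sketch of the lookup with hypothetical rows:

snp_a = {10042: ["2L", "10042", "A", "T", "12:8"]}        # parsed from file A (hypothetical)
row_b = ["2L", "10042", "A", "T", "3:9", "0.5", "0.6", "ok"]   # a row from file B (hypothetical)
if int(row_b[1]) in snp_a:
    print("\t".join(snp_a[int(row_b[1])] + row_b[-4:]))   # the merged output line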
Example #4
def _count2table(ac_file, max_cov=100):
    ColorText().info(
        "[poolseq_tk]: reading counts and preparing 2*2 tables ...", "stderr")
    tables = collections.defaultdict(list)
    ntables_per_snp = 0
    with open(ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            if ntables_per_snp == 0:
                ntables_per_snp = len(tmp_line[4:])
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            base1 = tmp_line[2]
            base2 = tmp_line[3]
            tables[chr, pos] = [tmp_line[0], base1,
                                base2]  # chr, allele1, allele2
            for counts in tmp_line[4:]:
                tmp_counts = counts.split(':')
                if sum(map(int, tmp_counts[0:2])) <= max_cov:
                    tables[chr, pos] += tmp_counts[:2]
                if sum(map(int, tmp_counts[2:4])) <= max_cov:
                    tables[chr, pos] += tmp_counts[2:]
                # tables[chr, pos] += counts.split(':')    # counts
            if len(tables[chr, pos]) < len(tmp_line[4:]) * 4 + 3:
                del tables[chr, pos]
    ColorText().info(" [done]\n", "stderr")
    return tables, ntables_per_snp
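Each count column carries four colon-separated integers (ref and alt counts for the two pools of one replicate); halves whose coverage passes max_cov are appended to the per-SNP list. A worked sketch of that flattening for one hypothetical line:

# hypothetical allele-count line: 2L<TAB>10042<TAB>A<TAB>T<TAB>12:8:15:9
counts = "12:8:15:9".split(':')        # ref1, alt1, ref2, alt2 for one replicate
row = ["2L", "A", "T"] + counts        # what tables["2L", 10042] would hold
print(row)                             # ['2L', 'A', 'T', '12', '8', '15', '9']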
Example #5
def apply_filter(tmp_counts, fail, args):
    counts = map(int, tmp_counts.split(':'))
    if len(counts) < 2:
        ColorText().error("At least two counts (separated by a colon) are "
                          "required for column five\n")
        sys.exit(1)
    for i in range(len(counts)):
        if i == 0:
            if counts[i] < args.min_ref_ac:
                fail = 1
                break
        elif i == 1:
            if (counts[i] < args.min_alt_ac
                    or counts[i] + counts[i - 1] < args.min_cov):
                fail = 1
                break
        elif i == 2:
            if counts[i] < args.min_ref_ac:
                fail = 1
                break
        elif i == 3:
            if (counts[i] < args.min_alt_ac
                    or counts[i] + counts[i - 1] < args.min_cov):
                fail = 1
                break
    return fail
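apply_filter checks one colon-separated count column at a time and returns a 0/1 fail flag. A minimal sketch with hypothetical thresholds (the function itself relies on the Python 2 behaviour of map, as the rest of these examples do):

from argparse import Namespace

args = Namespace(min_ref_ac=2, min_alt_ac=2, min_cov=10)  # hypothetical thresholds
print(apply_filter("12:8", 0, args))   # 0: counts and coverage pass
print(apply_filter("1:8", 0, args))    # 1: reference-allele count below min_ref_ac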
Example #6
def create_procs(nproc, task_q, result_q, ntables_per_snp, outp):
	''' initialize processes '''
	ColorText().info("[poolseq_tk]: Initializing processes ...\n", "stderr")
	for _ in range(nproc):
		p = mp.Process(target=cmh_worker, args=(task_q, result_q, ntables_per_snp, outp))
		p.daemon = True
		p.start()
Example #7
def read_mpileup(mpileup_file, offset):
	''' read certain columns in a pileup file into a dictionary of tuple '''
	ColorText().info("[poolseq_tk]: reading %s ..." %(mpileup_file), "stderr")
	dMpileups = collections.defaultdict(tuple)
	chr = ""
	sz_utils.check_if_files_exist(mpileup_file)
	with open(mpileup_file, 'r') as fMPILEUP:
		for line in fMPILEUP:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			cov = int(tmp_line[3])
			ref_base = tmp_line[2].upper()
			if cov > 0:
				reads_bases = tmp_line[4]
				dMpileups[pos+offset] = (ref_base, reads_bases)
	ColorText().info(" [done]\n", "stderr")
	return chr, dMpileups
Example #8
    def addBlock(self, block):
        '''
        Add a block to the blockchain
        :param block: The block to add
        '''
        localBlock = block
        # Here we modify the attribute hashPreviousBlock of the block we want to
        # add to make it point to the previous block hash
        try:
            localBlock.hashPreviousBlock = self.listBlock[-1].blockHash

        except IndexError:
            pass

        if self.verify()[0]:
            self.listBlock.append(localBlock)
            print(ColorText.OkGreen("[VERIFICATION] - Pass"))
        else:
            print(ColorText.Fail("[VERIFICATION] - The block {} is not correct ! Cannot add the block {}".format(self.verify()[1], localBlock.blockIndex)))
Example #9
def cmh_worker(task_q, result_q, ntables_per_snp, outp):
	while True:
		try:
			table_part, nth_job = task_q.get()
			pvals, odds_ratios = {}, {}
			ColorText().info("[poolseq_tk]: %s running Cochran-Mantel-Haenszel test on %d tables ...\n"
							 %(mp.current_process().name, len(table_part)),
							 "stderr")
			tmpFile = outp + "." + mp.current_process().name + ".cmh"
			fOUT = open(tmpFile, 'w')
			nTests = 0
			for chr, pos in sorted(table_part.iterkeys()):
				array = []
				i = 3
				while i <= len(table_part[chr, pos])-4:
					if (i > 2 and sum(map(int, table_part[chr, pos][i:i+4])) >= 10 and
					   int(table_part[chr, pos][i])+int(table_part[chr, pos][i+1]) >= 5 and
					   int(table_part[chr, pos][i])+int(table_part[chr, pos][i+2]) >= 5 and
					   int(table_part[chr, pos][i+2])+int(table_part[chr, pos][i+3]) >= 5 and
					   int(table_part[chr, pos][i+1])+int(table_part[chr, pos][i+3]) >= 5):
						array += map(int, table_part[chr, pos][i:i+4])
					i += 4
				if len(array) == ntables_per_snp*4:
					dim_vector = robjects.IntVector([2, 2, ntables_per_snp])
					data = robjects.r['array'](robjects.IntVector(array), dim=dim_vector)
					rcmh = robjects.r['mantelhaen.test'](data, alternative='t')
					pvalue = rcmh[2][0]
					nTests += 1
					if math.isnan(pvalue):	# R's NaN comes back as float('nan'), not the string "NaN"
						pvalue = 1.0
					if pvalue == 0.0:
						fOUT.write("%s\t%d\t%.4g\t%.8f\tInf\n" %(chr, pos, float(pvalue), float(rcmh[4][0])))
					else:
						fOUT.write("%s\t%d\t%.8f\t%.8f\t%.8f\n" %(chr, pos, float(pvalue), float(rcmh[4][0]), -1*math.log10(pvalue)))
			fOUT.close()
			ColorText().info("[poolseq_tk]: %s ran %d tests\n"
							 %(mp.current_process().name, nTests),
							 "stderr")
			result_q.put(tmpFile)
#			result_q.put((pvals, odds_ratios))
		finally:
			task_q.task_done()
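The R call at the core of cmh_worker builds a 2x2xK array and runs mantelhaen.test through rpy2. A self-contained sketch with two made-up replicate tables, assuming rpy2 and R are installed (the result indexing mirrors the worker above):

import rpy2.robjects as robjects

counts = [12, 8, 15, 9, 20, 5, 18, 7]                   # ref/alt counts, two pools, two hypothetical replicates
dim_vector = robjects.IntVector([2, 2, 2])              # a 2x2 table per replicate, 2 strata
data = robjects.r['array'](robjects.IntVector(counts), dim=dim_vector)
rcmh = robjects.r['mantelhaen.test'](data, alternative='two.sided')
pvalue = rcmh[2][0]                                     # p-value
odds_ratio = rcmh[4][0]                                 # common odds-ratio estimate
print(pvalue, odds_ratio)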
Example #10
def run_merge(args):
    ''' combine allele counts across replicates '''
    allele_counts = collections.defaultdict(list)
    data = collections.defaultdict(list)
    for ac_file in args.acs:
        sz_utils.check_if_files_exist(ac_file)
        ColorText().info(
            "[poolseq_tk] reading and updating allele counts from %s ..." %
            (ac_file), "stderr")
        with open(ac_file) as fAC:
            for line in fAC:
                tmp_line = line.strip().split()
                pos = int(tmp_line[1])
                if pos not in data:
                    data[pos] = tmp_line[0:4]
                if pos not in allele_counts:
                    allele_counts[pos] = map(int, tmp_line[4].split(':'))
                else:
                    allele_counts[pos] = map(
                        sum,
                        zip(allele_counts[pos], map(int,
                                                    tmp_line[4].split(':'))))
        ColorText().info(" [done]\n", "stderr")

    # output to file
    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting to %s ..." % (fOUT.name),
                     "stderr")
    for pos in sorted(allele_counts.iterkeys()):
        fOUT.write(
            "%s\t%s\n" %
            ("\t".join(data[pos]), ":".join(map(str, allele_counts[pos]))))
    ColorText().info(" [done]\n", "stderr")
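The merge itself is an element-wise sum of the colon-separated counts, mirroring the map/zip update above. A minimal sketch:

counts_a = map(int, "12:8".split(':'))        # counts already accumulated for a position
counts_b = map(int, "3:5".split(':'))         # counts read from the next replicate file
merged = map(sum, zip(counts_a, counts_b))    # element-wise sum
print(":".join(map(str, merged)))             # 15:13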
Example #11
def getFDR_BH(dPvals, fdr_level):
    '''
    Use the BH procedure to calculate the p-value
    cutoff at a given FDR level
    '''
    lPvals = [dPvals[k] for k in dPvals.iterkeys()]
    ntests = len(lPvals)
    sort_lPvals = sorted(lPvals)
    for i in xrange(len(sort_lPvals)):
        if sort_lPvals[i] > (float(i + 1) / ntests) * fdr_level:
            #			print i, ntests, (float(i+1)/ntests)*fdr_level, sort_lPvals[i], sort_lPvals[i-1]
            if i == 0:
                return 0.00000000
            else:
                return sort_lPvals[i - 1]
    # no p-value exceeded its BH threshold, so a cutoff cannot be determined
    ColorText().error("[poolseq_tk] Fail to calculate pvalue cutoff\n")
    sys.exit()
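The cutoff returned is the last sorted p-value before the first one that exceeds its Benjamini-Hochberg threshold (i/m) * FDR. A small worked sketch with made-up p-values (Python 2, matching the iterkeys/xrange usage above):

dPvals = {1: 0.001, 2: 0.004, 3: 0.03, 4: 0.2, 5: 0.9}   # hypothetical p-values keyed by SNP
# sorted p-values:        0.001  0.004  0.03  0.2   0.9
# BH thresholds (i/m)*q:  0.01   0.02   0.03  0.04  0.05   (m = 5, q = 0.05)
# 0.2 is the first p-value above its threshold, so the cutoff is 0.03
print(getFDR_BH(dPvals, 0.05))                            # 0.03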
Example #12
def DisplayResultMerkle(listOriginalWords, listHash, finalHash):
    """
    Display the result of the Merkle tree
    :param listOriginalWords: List with the original words
    :param listHash: List of hashes
    :param finalHash: Final hash
    """
    print(
        ColorText.OkBlue("######### Merkle Tree #########\n\n") +
        ColorText.OkBlue("Original word") + " -> " + ColorText.OkGreen("Hash"))
    for i in range(len(listOriginalWords)):
        print(
            ColorText.OkBlue(listOriginalWords[i]) + " -> " +
            ColorText.OkGreen(listHash[i]))

    print(
        ColorText.Fail("\nFinal hash of the root : {}\n").format(finalHash[0]))
Example #13
    def __init__(self, complexity):
        self.listBlock = []
        self.complexity = complexity
        print(ColorText.OkBlue("######### Blockchain #########\n"))
Example #14
import colortext.ColorText as ColorText

if __name__ == '__main__':
    ctext = ColorText()

    text = ctext.colorstring("ColorText Python", color=ctext.HEADER)
    text += ctext.colorstring(" class", color=ctext.OKBLUE)

    print(text, '\n')

    print('https://github.com/bessavagner', '\n')

    ctext.cprint("This is a default usage of cprint function")

    print('\n')

    text = ctext.colorstring('affect', color=ctext.WARNING)

    ctext.cprint(f"Of course you can {text} the behavior"
                 f" of {ctext.colorstring('cprint', color=ctext.UNDERLINE)}"
                 f" and override its color.")

    ctext.cprint("But you can avoid it by passing ", endcolor=False, end='')
    ctext.cprint("endcolor=False ", color=ctext.OKBLUE, end='')
    ctext.cprint("as a parameter.")
Example #15
def run_collapse(isnp, m1, m2, out):
    '''
    Given two pileup files of the same region, like 2l+ and 2la,
    collapse the pileups at each corresponding SNP.
    Some SNPs are not reported in one or the other pileup file;
    a full list of SNP positions is required.
    '''
    m1_base = os.path.basename(m1)
    m2_base = os.path.basename(m2)

    # first, getting the full list of SNPs
    dSNPs = sz_utils.getSNPs(isnp)

    offset1 = 0
    offset2 = 20524057

    # second, reading each of the pileup files
    chr1, m1_info = read_mpileup(m1, offset1)
    chr2, m2_info = read_mpileup(m2, offset2)

    ColorText().info(
        "[poolseq_tk] %s: %d SNPs parsed\n" % (m1_base, len(m1_info)),
        "stderr")
    ColorText().info(
        "[poolseq_tk] %s: %d SNPs parsed\n" % (m2_base, len(m2_info)),
        "stderr")

    #	fOUT = None
    #	if args.out != sys.stdout:
    #		outdir = os.path.dirname(os.path.realpath(args.out))
    #		sz_utils.make_dirs_if_necessary(outdir)
    #		fOUT = open(args.out, 'w')
    #	else:
    #		fOUT = args.out
    fOUT = open(out, 'w')
    ColorText().info(
        "[poolseq_tk]: collapsing mpileups %s and %s ..." % (m1_base, m2_base),
        "stderr")
    for (chr, pos) in sorted(dSNPs.iterkeys()):
        k = (chr, pos)
        reads_bases_collapsed = ""
        refBase1, refBase2 = "", ""
        if pos in m1_info:
            if m1_info[pos][0] == dSNPs[k][0]:
                refBase1 = m1_info[pos][0]
            else:
                ColorText().error(
                    "SNP position: %d %s\t\tMpileup position: %d %s\n" %
                    (pos, dSNPs[k][0], pos, m1_info[pos][0]))
                sys.exit()
        else:
            refBase1 = ""
        if pos in m2_info:
            if m2_info[pos][0] == dSNPs[k][1]:
                refBase2 = m2_info[pos][0]
            else:
                ColorText().error(
                    "SNP position: %d %s\t\tMpileup position: %d %s\n" %
                    (pos, dSNPs[k][1], pos, m2_info[pos][0]))
                sys.exit()
        else:
            refBase2 = ""

        if refBase1 != "" and refBase2 != "":
            reads_bases_collapsed1, nReadsBases1, nRefBases1, dMultiBases1, dIndels1 = sz_utils.parseReadsBases(
                m1_info[pos][1], refBase1, refBase2)
            reads_bases_collapsed2, nReadsBases2, nRefBases2, dMultiBases2, dIndels2 = sz_utils.parseReadsBases(
                m2_info[pos][1], refBase2, refBase1)
            reads_bases_collapsed = reads_bases_collapsed1 + reads_bases_collapsed2
            nReadsBases = nReadsBases1 + nReadsBases2
            nRefBases = nRefBases1 + (nReadsBases2 - nRefBases2)
            dMultiBases = dict(dMultiBases1.items() + dMultiBases2.items())
            dIndels = dict(dIndels1.items() + dIndels2.items())
            nMultiBases = sum(dMultiBases.values()) + sum(dIndels.values())
            if (nReadsBases == nRefBases or nMultiBases <= 1):
                fOUT.write(
                    "%s\t%d\t%s\t%s\t%s\n" %
                    (chr, pos, refBase1, refBase2, reads_bases_collapsed))
        elif refBase1 == "" and refBase2 != "":
            reads_bases_collapsed2, nReadsBases2, nRefBases2, dMultiBases2, dIndels2 = sz_utils.parseReadsBases(
                m2_info[pos][1], refBase2, refBase1)
            nMultiBases = sum(dMultiBases2.values()) + sum(dIndels2.values())
            if (nReadsBases2 == nRefBases2 or nMultiBases <= 1):
                fOUT.write(
                    "%s\t%d\t%s\t%s\t%s\n" %
                    (chr, pos, dSNPs[k][0], refBase2, reads_bases_collapsed2))
        elif refBase1 != "" and refBase2 == "":
            reads_bases_collapsed1, nReadsBases1, nRefBases1, dMultiBases1, dIndels1 = sz_utils.parseReadsBases(
                m1_info[pos][1], refBase1, refBase2)
            nMultiBases = sum(dMultiBases1.values()) + sum(dIndels1.values())
            if (nReadsBases1 == nRefBases1 or nMultiBases <= 1):
                fOUT.write(
                    "%s\t%d\t%s\t%s\t%s\n" %
                    (chr, pos, refBase1, dSNPs[k][1], reads_bases_collapsed1))
    ColorText().info(" [done]\n", "stderr")
    fOUT.close()
Example #16
def fisher_worker(task_q, result_q, outp):
    while True:
        try:
            tables, nth_job = task_q.get()
            ColorText().info(
                "[poolseq_tk]: %s running Fisher's Exact test on %d tables ...\n"
                % (mp.current_process().name, len(tables)), "stderr")
            tmpFile = outp + "." + mp.current_process().name + ".fisher"
            fOUT = open(tmpFile, 'w')
            pvals_split, odds_ratios_split = {}, {}
            nTests = 0
            for k in sorted(tables.iterkeys()):
                oddsr = 0.0
                chr = k[0]
                pos = k[1]
                alt_base = tables[k][2]
                ref_base = tables[k][1]
                ref_ac1 = int(tables[k][3])
                alt_ac1 = int(tables[k][4])
                ref_ac2 = int(tables[k][5])
                alt_ac2 = int(tables[k][6])
                if (sum(map(int, tables[k][3:7])) >= 10
                        and alt_ac1 + ref_ac1 >= 5 and  # row subtotals
                        alt_ac2 + ref_ac2 >= 5 and
                        alt_ac1 + alt_ac2 >= 5 and  # column subtotals
                        ref_ac1 + ref_ac2 >= 5):
                    nTests += 1
                    if (ref_ac1 == 0 or ref_ac2 == 0 or
                            alt_ac1 == 0 or alt_ac2 == 0):
                        # add pseudo counts in case the odds ratio goes to Inf
                        ref_ac1 += 1
                        ref_ac2 += 1
                        alt_ac1 += 1
                        alt_ac2 += 1
                    data_vector = robjects.IntVector(
                        [ref_ac1, alt_ac1, ref_ac2, alt_ac2])
                    table = robjects.r['matrix'](data_vector, ncol=2)
                    rfisher = robjects.r['fisher.test'](table, alternative='t')
                    #					pvals_split[pos] = float(rfisher[0][0])
                    #					if (ref_ac1 == 0 or ref_ac2 == 0 or
                    #						alt_ac1 == 0 or alt_ac2 == 0):
                    #						oddsr = (float(ref_ac1+1)/(alt_ac1+1))/(float(ref_ac2+1)/(alt_ac2+1))
                    #					else:
                    pvalue = float(rfisher[0][0])
                    oddsr = rfisher[2][0]
                    #					odds_ratios_split[pos] = oddsr
                    if pvalue == 0.0:
                        fOUT.write("%s\t%d\t%.4g\t%.8f\tInf\n" %
                                   (chr, pos, pvalue, oddsr))
                    elif pvalue == 1.0:
                        fOUT.write("%s\t%d\t%.4g\t%.8f\t0.00000000\n" %
                                   (chr, pos, pvalue, oddsr))
                    else:
                        fOUT.write(
                            "%s\t%d\t%.8f\t%.8f\t%.8f\n" %
                            (chr, pos, pvalue, oddsr, -1 * math.log10(pvalue)))
            fOUT.close()
            ColorText().info(
                "[poolseq_tk]: %s ran %d tests\n" %
                (mp.current_process().name, nTests), "stderr")
            result_q.put(tmpFile)
#			result_q.put((pvals_split, odds_ratios_split))
        finally:
            task_q.task_done()
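The R interaction in fisher_worker reduces to a 2x2 matrix and a fisher.test call through rpy2. A self-contained sketch with made-up counts, assuming rpy2 and R are installed (the result indexing mirrors the worker above):

import rpy2.robjects as robjects

ref_ac1, alt_ac1, ref_ac2, alt_ac2 = 12, 8, 15, 9          # hypothetical counts for two pools
data_vector = robjects.IntVector([ref_ac1, alt_ac1, ref_ac2, alt_ac2])
table = robjects.r['matrix'](data_vector, ncol=2)
rfisher = robjects.r['fisher.test'](table, alternative='two.sided')
pvalue = float(rfisher[0][0])                              # p-value
odds_ratio = rfisher[2][0]                                 # odds-ratio estimate
print(pvalue, odds_ratio)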
Example #17
def run_fisher(args):
    ''' run Fisher's Exact test '''
    sz_utils.make_dirs_if_necessary(args.outp)
    sz_utils.check_if_files_exist(args.ac_file)
    tables = sz_utils._count2table(args.ac_file)[0]

    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n",
                         "stderr")
        sys.exit()
    else:
        pvals, odds_ratios, log10_pvals = {}, {}, {}
        while args.nproc:
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    log10_pval = tmp_line[4]
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
                    if (chr, pos) not in log10_pvals:
                        log10_pvals[chr, pos] = log10_pval
            os.remove(file)
            #			pvals_split, odds_ratios_split = result_q.get()
            #			pvals.update(pvals_split)
            #			odds_ratios.update(odds_ratios_split)
            args.nproc -= 1
        ColorText().info(
            "[poolseq_tk]: Running Fisher's Exact tests successfully\n",
            "stderr")

        # correcting raw p-values
        ColorText().info(
            "[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
            % (args.adj_method, args.adj_cutoff * 100), "stderr")
        raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector,
                                         method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info(
            "[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
            % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".fisher.all"
        out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff *
                                                           100)
        with open(out_all, 'w') as fALL, \
          open(out_fdr, 'w') as fFDR, \
          open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals.iterkeys())):
                chr = k[0]
                pos = k[1]
                raw_pval = pvals[k]
                log_pval = log10_pvals[k]
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[k][1:3]),
                                                tables[k][3:], raw_pval,
                                                log_pval, padjust[i],
                                                odds_ratio)
                    if ((args.oddsr_direction == "greater"
                         and odds_ratios[k] > 1)
                            or (args.oddsr_direction == "less"
                                and odds_ratios[k] < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[k][1:3]),
                                                    tables[k][3:], raw_pval,
                                                    log_pval, padjust[i],
                                                    odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[k][1:3]),
                                            tables[k][3:], raw_pval, log_pval,
                                            padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finishes successfully\n",
                         "stderr")
Example #18
def run_count(args):
    ''' Counting alleles at each SNP in the given pileup files '''

    dPos = {}
    if args.pos:
        ColorText().info("[poolseq_tk] reading SNPs positions:", "stderr")
        with open(args.pos, 'r') as fPOS:
            for line in fPOS:
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (chr, pos) not in dPos:
                    dPos[chr, pos] = 1
        ColorText().info(" %d\n" % (len(dPos)), "stderr")
    else:
        ColorText().info(
            "[poolseq_tk] no SNP positions provided ... [skipped]\n", "stderr")

    ac = collections.defaultdict(tuple)
    for pileup in args.pileups:
        sz_utils.check_if_files_exist(pileup)
        nsnps = 0
        ColorText().info(
            "[poolseq_tk] counting alleles in %s:" %
            (os.path.basename(pileup)), "stderr")
        with open(pileup, 'r') as fMPILEUP:
            for line in fMPILEUP:
                nsnps += 1
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (((chr, pos) in dPos and args.pos)
                        or (len(dPos) == 0 and not args.pos)):
                    ref_base = tmp_line[2]
                    alt_base = tmp_line[3]
                    nRefAlleles, nAltAlleles = 0, 0
                    if len(tmp_line) == 5:
                        nRefAlleles = tmp_line[-1].count(ref_base) + \
                             tmp_line[-1].count(ref_base.lower())
                        nAltAlleles = tmp_line[-1].count(alt_base) + \
                             tmp_line[-1].count(alt_base.lower())
                    if (chr, pos) not in ac:
                        ac[chr, pos] = [
                            ref_base, alt_base,
                            str(nRefAlleles),
                            str(nAltAlleles)
                        ]
                    else:
                        ac[chr, pos] += [str(nRefAlleles), str(nAltAlleles)]
        ColorText().info(" %d SNPs parsed\n" % (nsnps), "stderr")

    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting allele counts to table ...",
                     "stderr")
    for k in sorted(ac.iterkeys()):
        chr = k[0]
        pos = k[1]
        i = 2
        if len(ac[k][i:]) == 2 * len(args.pileups):
            fOUT.write("%s\t%d\t%s" % (chr, pos, "\t".join(ac[k][0:2])))
            while i <= len(ac[k]) - 4:
                fOUT.write("\t%s" % (":".join(ac[k][i:i + 4])))
                i += 4
            fOUT.write("\n")
    ColorText().info(" [done]\n", "stderr")
    fOUT.close()
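The allele counting itself is plain substring counting over the read-bases column, matching each allele in both upper and lower case. A small sketch of that core step with a hypothetical read-bases string:

reads_bases = "AAaTTt"                         # hypothetical read bases at one SNP
ref_base, alt_base = "A", "T"
nRefAlleles = reads_bases.count(ref_base) + reads_bases.count(ref_base.lower())
nAltAlleles = reads_bases.count(alt_base) + reads_bases.count(alt_base.lower())
print("%d ref, %d alt" % (nRefAlleles, nAltAlleles))   # 3 ref, 3 alt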
Example #19
def making_plot(args):
    ''' making Q-Q plot and Manhattan plot '''
    # install qqman package if not installed

    if not rpackages.isinstalled("qqman"):
        rutils = rpackages.importr('utils')
        rutils.chooseCRANmirror(ind=84)
        rutils.install_packages("qqman")

    # get pvalues
    ColorText().info("[poolseq_tk]: Extracting P-Values ... ", "stderr")
    data = collections.defaultdict()
    chrs = []
    pvals, adjust_pvals = {}, {}
    nchr = 0
    with open(args.input, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            if chr not in chrs:
                chrs.append(chr)
                nchr += 1
            data[chr, pos] = nchr
            pvals[chr, pos] = float(tmp_line[8])
    ColorText().info(" [done]\n", "stderr")

    # get FDR cutoff using BH if not provided through command line
    pcutoff = 0.0
    if not args.pcutoff:
        ColorText().info(
            "[poolseq_tk]: Getting p-value cutoff at FDR %d%%: " %
            (args.fdrlevel * 100), "stderr")
        pcutoff = sz_utils.getFDR_BH(pvals, args.fdrlevel)
        ColorText().info("%.5e\n" % (pcutoff), "stderr")
    else:
        pcutoff = args.pcutoff
        ColorText().info(
            "[poolseq_tk]: p-value cutoff provided: %.5e\n" % (pcutoff),
            "stderr")

    # get SNPs to highlight
    snps_to_highlight = []
    if args.highlight_snps:
        ColorText().info(
            "[poolseq_tk]: Getting SNPs to be highlighted in Manhattan plot ... ",
            "stderr")
        with open(args.highlight_snps, 'r') as fHIGHLIGHT:
            for line in fHIGHLIGHT:
                tmp_line = line.strip().split("\t")
                snps_to_highlight.append('_'.join(tmp_line[:2]))
    ColorText().info(" [done]\n", "stderr")

    if args.pdf:
        out_qqplot = args.outp + ".qqplot.pdf"
        out_manhattan = args.outp + ".manhattan.pdf"
    elif args.png:  # saving to PNG probably won't work
        out_qqplot = args.outp + ".qqplot.png"
        out_manhattan = args.outp + ".manhattan.png"
    sz_utils.make_dirs_if_necessary(out_qqplot, out_manhattan)
    grdevices = rpackages.importr('grDevices')
    raw_pvals_vector = robjects.FloatVector(
        [pvals[k] for k in sorted(pvals.iterkeys())])

    ColorText().info("[poolseq_tk]: Making Q-Q plot ...", "stderr")
    make_qqplots(grdevices, raw_pvals_vector, out_qqplot, args.qqtitle)
    ColorText().info(" [done]\n", "stderr")

    ColorText().info("[poolseq_tk]: Making Manhattan plot ...", "stderr")
    make_manhattan(grdevices, data, raw_pvals_vector, snps_to_highlight,
                   pcutoff, out_manhattan, args.mantitle, args.manx,
                   args.manxlim)
    ColorText().info(" [done]\n", "stderr")
Example #20
def run_cmh(args):
	''' run Cochran-Mantel-Haenszel test '''

	sz_utils.make_dirs_if_necessary(args.outp)
	allele_counts = {}
	pvals = {}
	tables = collections.defaultdict(list)
	ntests = 0
	tables, ntables_per_snp = sz_utils._count2table(args.table_file)
	ColorText().info("[poolseq_tk]: %d tables prepared\n" %(len(tables)), "stderr")

	task_q = mp.JoinableQueue()
	result_q = mp.Queue()
	create_procs(args.nproc,task_q, result_q, ntables_per_snp, args.outp)
	sz_utils._assign_tables(tables, task_q, args.nproc)

	# waiting for all tasks to be finished
	try:
		task_q.join()
	except KeyboardInterrupt:
		ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
		sys.exit()
	else:
		# merge results
		pvals, odds_ratios = {}, {}
		while args.nproc:
			file = result_q.get()
			with open(file, 'r') as fIN:
				for line in fIN:
					tmp_line = line.strip().split("\t")
					chr = tmp_line[0]
					pos = int(tmp_line[1])
					pval = float(tmp_line[2])
					odds_ratio = float(tmp_line[3])
					if (chr, pos) not in pvals:
						pvals[chr, pos] = pval
					if (chr, pos) not in odds_ratios:
						odds_ratios[chr, pos] = odds_ratio
			os.remove(file)
#			pvals_split, odds_ratios_split = result_q.get()
#			pvals.update(pvals_split)
#			odds_ratios.update(odds_ratios_split)
			args.nproc -= 1
		ColorText().info("[poolseq_tk]: Running CMH tests successfully\n", "stderr")

		# correcting raw p-values
		ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
						 %(args.adj_method, args.adj_cutoff*100), "stderr")
		raw_pvals = [pvals[chr, pos] for chr, pos in sorted(pvals.iterkeys())]
		raw_pvals_vector = robjects.FloatVector(raw_pvals)
		padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
		ColorText().info(" [done]\n", "stderr")
		pcutoff = sz_utils.getFDR_BH(pvals, args.adj_cutoff)
		ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
						 %(pcutoff), "stderr")
		ColorText().info(" [done]\n", "stderr")

		# output p-values
		ColorText().info("[poolseq_tk]: output to files ...", "stderr")
		out_all = args.outp + ".cmh.all"
		out_fdr = args.outp + ".cmh.fdr%d" %(args.adj_cutoff*100)
		out_expect = args.outp + ".cmh.fdr%d.expect" %(args.adj_cutoff*100)
		sz_utils.make_dirs_if_necessary(out_all, out_fdr)
		with open(out_all, 'w') as fALL, \
			 open(out_fdr, 'w') as fFDR, \
			 open(out_expect, 'w') as fEXPECT:
			for i, k in enumerate(sorted(pvals.iterkeys())):
				chr = k[0]
				pos = k[1]
				raw_pval = pvals[chr, pos]
				log_pval = None
				if raw_pval == 0.0:
					log_pval = "Inf"
				elif math.isnan(raw_pval):	# NaN p-values arrive as float('nan'), not a string
					raw_pval = 1.0
					log_pval = 0.0
				else:
					log_pval = -1 * math.log10(raw_pval)
				odds_ratio = odds_ratios[k]
				if padjust[i] <= args.adj_cutoff:
					sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
					if ((args.oddsr_direction == "greater" and odds_ratios[chr, pos] > 1) or
						(args.oddsr_direction == "less" and odds_ratios[chr, pos] < 1)):
						sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
				sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio)
		ColorText().info(" [done]\n", "stderr")
		ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")
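Both run_cmh and run_fisher delegate multiple-testing correction to R's p.adjust through rpy2. A minimal sketch of that call with made-up p-values, assuming rpy2 and R are installed and using the BH method as an example:

import rpy2.robjects as robjects

raw_pvals = [0.001, 0.04, 0.2, 0.0005]                     # hypothetical raw p-values
padjust = robjects.r['p.adjust'](robjects.FloatVector(raw_pvals), method="BH")
print(list(padjust))                                       # adjusted p-values, same order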
Example #21
def run_collapse(args):
	'''
		Given two pileup files of the same region, like 2l+ and 2la,
		collapse the pileups at each corresponding SNP
		Some SNPs are not reported in one or the other pileup file.
		A full list of SNP positions is required
	'''
	m1_base = os.path.basename(args.m1)
	m2_base = os.path.basename(args.m2)

	# first, getting the full list of SNPs
	dSNPs = get_SNPs(args.snps)

	# second, reading each of the pileup files
	chr1, dM1 = read_mpileup(args.m1, args.offset1)
	chr2, dM2 = read_mpileup(args.m2, args.offset2)

	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m1_base, len(dM1)), "stderr")
	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m2_base, len(dM2)), "stderr")

	fOUT = None
	if args.out != sys.stdout:
		outdir = os.path.dirname(os.path.realpath(args.out))
		sz_utils.make_dirs_if_necessary(outdir)
		fOUT = open(args.out, 'w')
	else:
		fOUT = args.out
	ColorText().info("[poolseq_tk]: collapsing mpileups %s and %s ..."
					 %(m1_base, m2_base), "stderr")
	for pos in sorted(dSNPs.iterkeys()):
		reads_bases_collapsed = ""
		if pos in dM1 and pos in dM2:
			'''
				dSNPs[pos][0]: ref base of m1 pileup
				dSNPs[pos][1]: ref base of m2 pileup
				dM1[pos][0]: ref base of m1 pileup
				dM2[pos][0]: ref base of m2 pileup
			'''
			if dSNPs[pos][0] == dM1[pos][0] and dSNPs[pos][1] == dM2[pos][0]:
				reads_bases_collapsed = parseReadsBases(dM1[pos][0],
															 dM2[pos][0],
															 dM1[pos][1])
				reads_bases_collapsed += parseReadsBases(dM2[pos][0],
															 dM1[pos][0],
															 dM2[pos][1])
				fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
								%(chr1, chr2, pos,
								  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
			else:
				# this should bark if the same site has different states
				ColorText().error("SNP position: %d %s %s\t\tMpileup position: %d %s %s\n"
								 %(pos, dSNPs[pos][0], dSNPs[pos][1],
								   pos, dM1[pos][0], dM2[pos][0]),
								   "stderr")
		# SNPs missing from both pileup files
		elif pos not in dM1 and pos not in dM2:
			fOUT.write("%s/%s\t%d\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1]))
		# SNPs in m1 pileup file but not in m2
		elif pos in dM1 and pos not in dM2:
			reads_bases_collapsed = parseReadsBases(dM1[pos][0],
														 dSNPs[pos][1],
														 dM1[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
		# SNPs in m2 pileup file but not in m1
		elif pos not in dM1 and pos in dM2:
			reads_bases_collapsed = parseReadsBases(dM2[pos][0],
														 dSNPs[pos][0],
														 dM2[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
							%(chr1, chr2, pos,
							  dSNPs[pos][0], dSNPs[pos][1], reads_bases_collapsed))
	ColorText().info(" [done]\n", "stderr")
	fOUT.close()
Example #22
def check_if_files_exist(*files):
    for file in files:
        if not os.path.exists(file):
            ColorText().error("\n[poolseq_tk] ERROR: cannot find file %s\n" %
                              (os.path.realpath(file)))
            sys.exit(1)