# Example no. 1
# 0
def main():
    """Command-line entry point: call genotypes chromosome by chromosome.

    Parses the command line, reads the sample ids and SNP positions of the
    array file, and then, for every selected chromosome:

    1. writes the VCF header through ``chiamante_io.vcfWriter``;
    2. splits the sorted positions of that chromosome into chunks of roughly
       ``-chunksize`` positions;
    3. runs ``process_chunk`` on each chunk, one after the other, with the
       per-chunk output placed in a time-stamped working directory;
    4. appends the per-chunk ``.vcf.gz`` / ``.log`` files to the final
       output files and removes the working directory (kept with --debug).

    Note that ``-nprocess`` is parsed and reported but the chunks are
    processed sequentially in this function.

    Exits with status 1 (``SystemExit``) on invalid argument combinations.
    """
    import sys  # local import: only needed for the non-zero exit codes below

    parser = argparse.ArgumentParser(
        description="Chiamante v"
        + version
        + "\nCalls genotypes for Illumina Bead chips (possibly augmented with sequence genotype likelihoods for improved accuracy)"
    )

    parser.add_argument("arrayfilename", metavar="arrayfilename", type=str, help="the micro array data")

    # OPTIONAL
    parser.add_argument("-output", metavar="output.vcf.gz", default="", type=str, help="output file")
    parser.add_argument(
        "-chromosome",
        dest="chromosome",
        action="store",
        default=None,
        type=str,
        help="Only call loci on this chromosome",
    )
    parser.add_argument("--nohwe", dest="nohwe", action="store_true", help="")
    parser.add_argument(
        "-seq",
        dest="seqfilename",
        metavar="genotype_likelihoods.vcf.gz",
        default=None,
        type=str,
        help="the genotype likelihoods from sequence data",
    )
    parser.add_argument(
        "-pop",
        dest="pop",
        action="store",
        default="",
        type=str,
        help="text file containing space delimited population of each sample eg. YRI YRI CEU CHB...",
    )
    parser.add_argument(
        "-hwe_filter",
        dest="hwe_filter",
        action="store",
        default=5,
        type=float,
        help="-log10 HWE filter (values greater than this will be flagged in the FILTER field of the vcf",
    )
    parser.add_argument(
        "-aqual_filter",
        dest="aqual_filter",
        action="store",
        default=10,
        type=float,
        help="AQUAL filter (values lower than this will be flagged in the FILTER field of the vcf",
    )
    parser.add_argument(
        "-squal_filter",
        dest="squal_filter",
        action="store",
        default=10,
        type=float,
        help="SQUAL filter (values lower than this will be flagged in the FILTER field of the vcf",
    )

    # EXPERIMENTAL/DEPRECATED - there be dragons here.
    parser.add_argument(
        "-snpinfo",
        dest="snpinfo",
        metavar="snpinfo.txt(.gz)",
        action="store",
        default=None,
        type=str,
        help=argparse.SUPPRESS,
    )  # "SNP information file - required if you are using Broad or Illuminus style input")
    parser.add_argument("-pp_filter", dest="pp_filter", action="store", default=0.9, type=float, help=argparse.SUPPRESS)
    parser.add_argument(
        "-pos", dest="positionfile", action="store", default=None, type=str, help=argparse.SUPPRESS
    )  # 'optional list of genomic positions to call at (default: everything in arrayfile)')
    parser.add_argument("--broad", dest="broad", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--illuminus", dest="illuminus", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument(
        "-sf", dest="sf", action="store", default=1.0 / 3.0, type=float, help=argparse.SUPPRESS
    )  # 'sequence fail likelihood (as a rule dont touch this)')
    parser.add_argument(
        "-af", dest="af", action="store", default=0.005, type=float, help=argparse.SUPPRESS
    )  # 'array fail likelihood (as a rule dont touch this)')
    parser.add_argument(
        "-nprocess",
        dest="nprocess",
        action="store",
        default=1,
        type=int,
        help="Number of processes to use for genotyping calling (set this to the number of avaiable CPU cores)",
    )
    parser.add_argument("-thisjob", dest="thisjob", action="store", default=0, type=int, help=argparse.SUPPRESS)
    parser.add_argument(
        "-niteration", dest="niteration", metavar="30", action="store", default=30, type=int, help=argparse.SUPPRESS
    )  # 'Maximum number of ECM iterations')
    parser.add_argument(
        "-nsubiteration",
        dest="nsubiteration",
        metavar="30",
        action="store",
        default=30,
        type=int,
        help=argparse.SUPPRESS,
    )  # 'Maximum number of ECM iterations per SNP')
    parser.add_argument(
        "-sigfig", dest="sigfig", metavar="3", action="store", default=3, type=int, help="signifciant figures in output"
    )
    parser.add_argument(
        "-tolerance", dest="tolerance", action="store", default=0.1, type=float, help=argparse.SUPPRESS
    )  # 'convergence tolerance'
    parser.add_argument("-chunksize", dest="chunksize", action="store", default=10000, type=int, help=argparse.SUPPRESS)
    # "-f" silences the missing-chromosome warning further down, and that
    # warning tells the user to "Run with -f".  With a mandatory value
    # (type=bool alone) a bare "-f" was rejected by argparse; nargs="?" with
    # const=True accepts the bare flag and still accepts the old "-f VALUE".
    parser.add_argument(
        "-f", dest="f", action="store", nargs="?", const=True, default=False, type=bool, help=argparse.SUPPRESS
    )
    parser.add_argument("--vcfin", dest="vcfin", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--debug", dest="debug", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--plot", dest="plot", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--GL", dest="gl", action="store_true", help=argparse.SUPPRESS)  # generate GLs

    # Every print below passes ONE pre-formatted string, so the output is
    # identical under the Python 2 print statement and the print() function.
    print("Chiamante version %s" % (version,))

    args = parser.parse_args()
    # Extra settings carried on the namespace; process_chunk reads
    # args.renormalise from the per-chunk copy made below.
    args.arrayonly = False
    args.renormalise = True

    buf = 500 * 2 ** 20  # buffer for concatenating files

    if args.debug:
        print("DEBUG MODE!")

    # Only chromosomes 1..22 are accepted.
    chromosomes = set(str(val) for val in range(1, 23))
    if args.chromosome is not None:
        if args.chromosome not in chromosomes:
            print("Invalid chromosome specified %s" % (args.chromosome,))
            sys.exit(1)
        chromosomes = set([args.chromosome])
    if args.chromosome is None and args.seqfilename is not None and not args.f:
        print(
            "WARNING: you have specified sequence likelihoods but without the -chrom argument.\n This is no problem, but sequence data is usually split between chromosomes.\nMaybe you meant to use the -chrom argument.\nRun with -f to ignore this warning.\nExiting..."
        )
        sys.exit(1)

    if args.output == "":
        # Default output prefix: array file name up to its first dot.
        args.output = args.arrayfilename.split(".")[0] + "_chiamante"
    ids = chiamante_io.get_arrayids(args)

    if args.positionfile is None:
        print("Parsing %s ..." % (args.arrayfilename,))
        positions = chiamante_io.get_positions(args)
    elif args.chromosome is None:
        print("You need to specificy a chromosome if positions file is specified")
        sys.exit(1)
    else:
        # Whitespace separated integer positions, all on the requested chromosome.
        tmp = np.array(chiamante_io.ropen(args.positionfile).read().split(), int)
        positions = [np.array([args.chromosome] * len(tmp)), tmp]

    print("%s SNPs and %s samples in %s" % (len(positions[0]), len(ids), args.arrayfilename))

    if args.pop != "":
        print("Reading populations from %s" % (args.pop,))
        # The result is not used in this function; the call is kept so the
        # population file is still read and parsed up front.
        poptable, pop = chiamante_io.setup_populations(chiamante_io.ropen(args.pop).read().split(), True)
    else:
        print("No population information provide I am assuming all samples are from the same population")

    start_time0 = time.time()
    # Restrict to chromosomes that actually occur in the position list.
    chromosomes = chromosomes.intersection(set(positions[0].tolist()))
    if args.seqfilename is None:
        seqfilename = "None"
    else:
        seqfilename = args.seqfilename
        # Opened only so that a bad path fails before any calling starts.
        # NOTE(review): the returned handle is discarded without an explicit
        # close; its type is defined in chiamante_io.
        chiamante_io.ropen(seqfilename)

    for chrom in chromosomes:
        if len(chromosomes) == 1 and ("chr" + str(chrom)) in args.output:
            outfilename = args.output
        else:
            outfilename = args.output + "_chr" + str(chrom)
        outfile = chiamante_io.vcfWriter(
            outfilename,
            ids,
            header=[
                "fileformat=VCFv4.0",
                "source=chiamante_v" + version,
                "arrayfile=" + args.arrayfilename,
                "seqfile=" + seqfilename,
                "timestamp=" + str(datetime.datetime.now()),
                "FILTER=<ID=a"
                + str(args.aqual_filter)
                + ',Description="Array data quality below'
                + str(args.aqual_filter)
                + '">',
                "FILTER=<ID=s"
                + str(args.squal_filter)
                + ',Description="Sequence data quality below'
                + str(args.squal_filter)
                + '">',
                "FILTER=<ID=hwe"
                + str(args.hwe_filter)
                + ',Description="-log10 HWE exact test p-value above'
                + str(args.hwe_filter)
                + '">',
                'FILTER=<ID=NOT_POLY,Description="loci does not appear to be polymorphic in this sample">',
                'INFO=<ID=HWE,Number=1,Type=Float,Description="-log10 HWE exact test p-value">',
                'INFO=<ID=AQUAL,Number=1,Type=Integer,Description="phred scaled array failure rate">',
                'INFO=<ID=SQUAL,Number=1,Type=Integer,Description="phred scaled sequence failure rate">',
                'FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
                'FORMAT=<ID=GP,Number=3,Type=Float,Description="Genotype probabilities">',
            ],
        )
        # Drop the writer before the same file is re-opened in append mode
        # below (presumably this finalises the header - see vcfWriter).
        del outfile

        positions_on_this_chrom = positions[1][positions[0] == str(chrom)]
        positions_on_this_chrom.sort()
        npos = len(positions_on_this_chrom)
        nchunk = max(1, int(round(float(npos) / float(args.chunksize))))
        neach = max(1, int(round(float(npos) / nchunk)))
        # Chunk starts are multiples of neach; at most nchunk of them are used
        # and the last chunk always runs to npos.  The previous slicing took
        # nchunk consecutive boundary pairs, which dropped the trailing
        # positions whenever npos / nchunk had been rounded down.
        starts = list(range(0, npos, neach))[:nchunk]
        bounds = starts + [npos]
        new_positions = [positions_on_this_chrom[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
        nchunk = len(new_positions)

        start_time1 = time.time()
        print("Calling genotypes for chromosome %s using %s processes..." % (chrom, args.nprocess))
        timestamp = str(datetime.datetime.now()).replace(" ", "_")
        workdir = args.output + ".working_" + timestamp
        try:
            os.mkdir(workdir)
        except OSError:
            # An already existing directory is fine; anything else would only
            # resurface later as a confusing failure inside process_chunk.
            if not os.path.isdir(workdir):
                raise
        workdir += "/"
        print("Chromosome %s split into %s chunks" % (chrom, len(new_positions)))

        for chunk, pos in enumerate(new_positions):
            if args.debug:
                print("Processing chromosome %s Chunk %s ..." % (chrom, chunk))
            # Each chunk gets its own copy of the arguments with the
            # chromosome and a chunk-specific output prefix filled in.
            tmpargs = deepcopy(args)
            tmpargs.chrom = str(chrom)
            tmpargs.output = workdir + "_" + str(chrom) + "_" + str(chunk)
            process_chunk(tmpargs, pos)

        print("Chromosome %s took %s  seconds" % (chrom, time.time() - start_time1))

        print("Calling finished.  Writing output...")
        # Append the per-chunk files, in chunk order, to the final outputs.
        with open(outfilename + ".vcf.gz", "ab", buf) as merged_vcf:
            with open(args.output + "_" + str(chrom) + ".log", "wb") as merged_log:
                for chunk in range(nchunk):
                    postfix = "_" + str(chrom) + "_" + str(chunk)
                    with open(workdir + postfix + ".log", "rb") as chunk_log:
                        merged_log.write(chunk_log.read())
                    with open(workdir + postfix + ".vcf.gz", "rb") as chunk_vcf:
                        shutil.copyfileobj(chunk_vcf, merged_vcf, buf)

        if not args.debug:
            shutil.rmtree(workdir, ignore_errors=True)

    print("Took  %s  seconds" % (time.time() - start_time0,))
# Example no. 2
# 0
def process_chunk(args, positions=None, stdout=None):
    """Call genotypes for one chunk of positions.

    Writes ``<args.output>.vcf.gz`` (through ``chiamante_io.vcfWriter``) and
    ``<args.output>.log``.

    Parameters:
        args: argparse-style namespace.  Besides the command-line options it
            must carry ``chrom`` (chromosome label of this chunk) and
            ``renormalise``.
        positions: positions to call; passed on to
            ``chiamante_io.array_setup``.  NOTE(review): how ``None`` is
            interpreted is up to ``array_setup`` - confirm there.
        stdout: optional file name.  When given and ``args.debug`` is false,
            ``sys.stdout`` is redirected to that file for the duration of the
            call and restored afterwards.

    Raises:
        SystemExit: with status 1 if the HWE test fails for a SNP.
    """
    # Invalid floating point operations become exceptions instead of NaNs.
    np.seterr(invalid="raise")
    # Cap the address space of this process at 8 GiB.
    ngig = 8
    resource.setrlimit(resource.RLIMIT_AS, (ngig * pow(2, 30), ngig * pow(2, 30)))

    # Every print passes ONE pre-formatted string, so the output is identical
    # under the Python 2 print statement and the print() function.
    if positions is not None:
        print("Processing Chr %s : %s - %s ..." % (args.chrom, min(positions), max(positions)))
    else:
        # The default positions=None used to crash in min() right here.
        print("Processing Chr %s ..." % (args.chrom,))

    original_stdout = sys.stdout
    redirected = None
    logfile = None
    try:
        if not args.debug and stdout is not None:
            redirected = open(stdout, "w")
            sys.stdout = redirected
        if args.output == "":
            args.output = args.arrayfilename.split(".")[0] + "_chiamante"
        chiamante_io.checkfiledoesnotexist(args.output + ".vcf.gz")
        logfile = open(args.output + ".log", "wb")
        _call_chunk_genotypes(args, positions, logfile)
    finally:
        # Flush and close the log so the caller can read it back right away,
        # and undo the stdout redirection.
        if logfile is not None:
            logfile.close()
        if redirected is not None:
            sys.stdout = original_stdout
            redirected.close()


def _call_chunk_genotypes(args, positions, logfile):
    """Run the iterative calling passes for one chunk and write its VCF rows."""
    if args.plot:
        import plot_functions as plot

    sorted_positions, ids, pop, snpinfo, array_chromosomes, array_positions, signal = chiamante_io.array_setup(
        args, positions
    )

    out = chiamante_io.vcfWriter(args.output, ids=None, header=None, significant_figures=args.sigfig)

    # Missing intensities become 0, values below 8 are shifted up by 8, and
    # everything is then moved to the log2 scale.
    signal[np.isnan(signal)] = 0
    signal[signal < 8] += 8
    signal = np.log2(signal)

    if args.renormalise:  # if some samples have intensities with mean >4 SD away, just apply simple normalisation
        # presumably signal is (snp, sample, channel), so x holds per-sample
        # mean intensities - TODO confirm against chiamante_io.array_setup
        x = signal.mean(0)
        ii = np.abs((x - x.mean(0)) / x.std(0)).max(1) > 4
        if ii.sum() > 0:
            # Re-centre each flagged sample on the mean of the other samples.
            mu = x[np.logical_not(ii)].mean(0)
            for i in np.where(ii)[0]:
                signal[:, i] = (signal[:, i] - signal[:, i].mean(0)) + mu
                signal[signal < 0] = 0.0

    nsnp = signal.shape[0]
    nsample = signal.shape[1]
    nsubsample = len(signal)

    if args.seqfilename is not None:
        doseq = True
        seqlik = chiamante_io.get_sequence_likelihoods(
            args.seqfilename, array_chromosomes, sorted_positions, ids, snpinfo, logfile
        )
    else:
        doseq = False
        seqlik = {}

    # One index array per population label.
    pops = set(pop)
    popidx = [np.where(pop == popid)[0] for popid in pops]
    start, priors = chiamante_init.setup_omni25(
        3,
        len(np.unique(pop)),
        sigma_mu_multiplier=1.0,
        rafprior=[1.01, 1.01],
        sigma_multiplier=10,
        hwe_var=-1,
        illumina=True,
        s0_multiplier=1,
        nocorrelation=False,
    )

    nit = args.niteration

    # Builtin float is used because the np.float alias no longer exists in
    # current NumPy releases; the resulting dtype is the same.
    gprob = np.empty((nsample, 4), float)
    new_means = np.empty((nsubsample, 6), float)
    old_means = np.ones((nsubsample, 6), float)
    parms = {}
    if doseq:
        # Any one likelihood entry is handed over when allocating the work
        # arrays; list() keeps the first-key lookup valid on Python 3 too.
        first_key = list(seqlik.keys())[0]
        working_arrays = chiamante_ecm.chiamante_work_arrays(nsample, 1.0 / 3.0, seqlik[first_key]["lik"])
    else:
        working_arrays = chiamante_ecm.chiamante_work_arrays(nsample)
    i = 0

    start_time = time.time()

    # Passes 0 .. nit-1 run one sub-iteration per SNP and collect the cluster
    # means; pass nit lets every SNP converge and writes the VCF rows.
    while i <= nit:
        print("Iteration %s" % (i,))
        snpnumber = 0
        for k, pos in enumerate(sorted_positions):  # LOOP OVER EVERY SNP

            if k % 1000 == 0:
                print("Iteration %s SNP %s Position %s" % (i, k, pos))

            for snp in snpinfo[pos]:

                # snp[9] selects this SNP's intensities in signal.
                x1 = signal[snp[9]]
                if pos in seqlik:
                    x2 = seqlik[pos]["lik"]
                else:
                    x2 = None

                if i == 0:
                    start["mu"] = chiamante_init.get_mustart(x1, illumina=True)
                else:
                    # Resume from the parameters stored on the previous pass.
                    start = parms[snp[2]]

                if i < nit:
                    subiterations = 1
                else:
                    subiterations = args.nsubiteration  # on the last inter-SNP iteration we let each loci converge

                chiamante = chiamante_ecm.chiamante_mainloop(
                    x1,
                    x2,
                    priors,
                    start,
                    seqfaildens=args.sf,
                    arrfaildens=args.af,
                    popidx=popidx,
                    working=working_arrays,
                    df=[5, 5, 5],
                    niteration=subiterations,
                    hwe_prior=not args.nohwe,
                    tolerance=0.01,
                    C=False,
                    genotype_likelihoods=args.gl,
                )

                if i == nit:  # ON THE LAST ITERATION WE WRITE OUT TO THE VCF
                    infostring = ""
                    filterstring = ""
                    if pos in seqlik:
                        flip = seqlik[pos]["flip"]
                    else:
                        if snp[8] == "ALT:REF":
                            flip = True
                        else:
                            flip = False

                    chiamante_ecm.corrected_posteriors(working_arrays, gprob, doseq)

                    # FILTER and INFO parameters
                    try:
                        hwe = chiamante_statfunc.hwe_test(gprob[:, :3], popidx)
                    except Exception as err:
                        print("ERROR!\nExiting...\nPlease report to [email protected]")
                        # Keep the cause visible and exit with a failure status.
                        sys.stderr.write("hwe_test failed at position %s: %r\n" % (pos, err))
                        sys.exit(1)
                    aqual = -10 * np.log10(chiamante["parameters"]["eta_array"])
                    infostring += "HWE=%.4f" % hwe + ";AQUAL=" + str(int(aqual))
                    if doseq:
                        # x2 is an array when likelihoods exist, so this must
                        # be an identity test: "!= None" compares elementwise
                        # on current NumPy and cannot be used in an if.
                        if x2 is not None:
                            squal = -10 * np.log10(chiamante["parameters"]["eta_seq"])
                            infostring += ";SQUAL=" + str(int(squal))
                            if squal < args.squal_filter:
                                filterstring += ";s" + str(args.squal_filter)
                        else:
                            infostring += ";SQUAL=."

                    if aqual < args.aqual_filter:
                        filterstring += ";a" + str(args.aqual_filter)
                    if hwe > args.hwe_filter:
                        filterstring += ";hwe" + str(args.hwe_filter)
                    if gprob.max(0)[1] < args.pp_filter:
                        filterstring += ";NOT_POLY"
                    if len(filterstring) > 0:
                        filterstring = filterstring[1:]  # drop the leading ";"
                    else:
                        filterstring = "PASS"

                    if flip:
                        # Swap the two homozygous columns and the alleles.
                        posterior_probs = gprob[:, [2, 1, 0, 3]]
                        ref = snp[4]
                        alt = snp[3]
                        if args.gl:
                            gl = chiamante["gl"][:, [2, 1, 0]]

                    else:
                        posterior_probs = gprob
                        ref = snp[3]
                        alt = snp[4]
                        if args.gl:
                            gl = chiamante["gl"]

                    if infostring == "":
                        infostring = "."

                    if args.gl:
                        data = {"GP": posterior_probs, "GL": gl}
                    else:
                        data = {"GP": posterior_probs}

                    out.writerow(
                        snp[0], snp[1], snp[2], ref, alt, data=data, info=infostring, filterstring=filterstring
                    )
                    if args.plot:
                        plot.debug_plot_snp(
                            x1,
                            "figs/" + str(pos) + "_" + str(i),
                            x2,
                            chiamante["gprobs"],
                            gprob,
                            snp[3],
                            snp[4],
                            snp[2],
                            priors,
                            start,
                            chiamante["parameters"],
                            flip,
                            2,
                        )

                else:
                    parms[snp[2]] = chiamante["parameters"]  # store parameters for next pass
                    for j in range(3):
                        new_means[snpnumber, (j * 2) : ((j + 1) * 2)] = chiamante["parameters"]["mu"][j]
                    snpnumber += 1

        # CALCULATE PRIORS FOR MU_0 AND SIGMA_MU
        # NOTE(review): the convergence test below uses the signed maximum of
        # (previous means - current means), so a large negative change does
        # not block convergence - confirm whether an absolute value was meant.
        old_means -= new_means
        print(old_means.max())
        if i == nit:
            i += 1
        elif i > 0 and old_means.max() < args.tolerance:
            print("Converged.")
            i = nit  # jump straight to the final, output-writing pass
        else:
            i += 1
            if nsnp > 1:
                chiamante_init.build_priors(new_means, priors)
            # Swap buffers: this pass's means become the reference for the next.
            old_means, new_means = new_means, old_means

    # args.chrom (set per chunk by the caller) is reported here, matching the
    # start-up message; args.chromosome is None when no -chromosome was given.
    print(
        "\nChr %s : %s - %s finished in %s  seconds"
        % (args.chrom, min(sorted_positions), max(sorted_positions), time.time() - start_time)
    )