def merge_snps_and_indels(step, jobs_id=None):
    """Write and submit the sbatch script that merges SNP and INDEL calls.

    The generated script runs GATK CombineVariants on CONFIG["popSNPs"] and
    CONFIG["popINDELs"], restricted to CONFIG["intervals"], using the
    PRIORITIZE genotype merge option with priority "snps,indels".

    step    -- name of the sub-folder of the current working directory that
               is created for this step; also used as the sbatch name and
               as the base name of the script file
    jobs_id -- passed unchanged as second argument to submit_jobs

    Returns the value returned by submit_jobs, or None when the step folder
    already exists (the step is then assumed to be done and nothing is
    written or submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    #create the folder structure
    os.mkdir(step_dir)

    sbatch_command = sbatch_header(sbatch_name=step, cwd=step_dir)
    # The merged VCF goes into the folder created above. A fixed
    # "01_merge_snp_indels" folder name would point to a directory this
    # function never creates whenever step is named differently.
    output = os.path.join(step_dir, "SGP_joincalled.snp.indels.vcf.gz")
    sbatch_command += "".join([
        "java -Xmx250g -jar {} -T CombineVariants  \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16  \\\n",
        "--variant:snps {} \\\n".format(CONFIG["popSNPs"]),
        "--variant:indels {} \\\n".format(CONFIG["popINDELs"]),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \\\n".format(output),
        "-genotypeMergeOptions PRIORITIZE \\\n",
        "-priority snps,indels \\\n",
        "\n",
    ])

    sbatch_file = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_file, "w") as MERGE:
        MERGE.write(sbatch_command)

    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    return slurm_jobs_id
示例#2
0
def select(step, vcf_in, vcf_out, options, jobs_id=None):
    """Write and submit an sbatch script running GATK SelectVariants.

    step    -- sub-folder of the current working directory created for this
               step; also the sbatch name and the script base name
    vcf_in  -- value given to the -V argument
    vcf_out -- file name of the output VCF, placed inside the step folder
    options -- iterable of extra command-line fragments, each appended on
               its own continuation line
    jobs_id -- passed unchanged as second argument to submit_jobs

    Returns the value returned by submit_jobs, or None when the step folder
    already exists (nothing is written or submitted in that case).
    """
    workdir = os.path.join(os.getcwd(), step)
    if os.path.isdir(workdir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(workdir)

    script = sbatch_header(sbatch_name=step, cwd=workdir)
    fixed_part = [
        "java -Xmx250g -jar {} -T SelectVariants  \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16  \\\n",
        "-V {} \\\n".format(vcf_in),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \\\n".format(os.path.join(workdir, vcf_out)),
    ]
    extra_part = ["{} \\\n".format(option) for option in options]
    script += "".join(fixed_part + extra_part) + "\n"

    script_path = os.path.join(workdir, "{}.sbatch".format(step))
    with open(script_path, "w") as handle:
        handle.write(script)

    return submit_jobs([script_path], jobs_id)
示例#3
0
def merge_snps_and_indels(step, jobs_id=None):
    """Create the step folder, write the CombineVariants sbatch and submit it.

    The script merges CONFIG["popSNPs"] (tagged "snps") and
    CONFIG["popINDELs"] (tagged "indels") over CONFIG["intervals"] with
    GATK CombineVariants, using -genotypeMergeOptions PRIORITIZE and
    -priority snps,indels.

    step    -- folder (relative to the current working directory) created
               for this step; also the sbatch name and script base name
    jobs_id -- forwarded as second argument to submit_jobs

    Returns what submit_jobs returns. Returns None without doing anything
    else when the step folder is already present.
    """
    cwd = os.getcwd()
    target_dir = os.path.join(cwd, step)
    if os.path.isdir(target_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    #create the folder structure
    os.mkdir(target_dir)

    sbatch_command = sbatch_header(sbatch_name=step, cwd=target_dir)
    sbatch_command += "java -Xmx250g -jar {} -T CombineVariants  \\\n".format(
        CONFIG["GATK"])
    sbatch_command += "-R {} \\\n".format(CONFIG["reference"])
    sbatch_command += "-nt 16  \\\n"
    sbatch_command += "--variant:snps {} \\\n".format(CONFIG["popSNPs"])
    sbatch_command += "--variant:indels {} \\\n".format(CONFIG["popINDELs"])
    sbatch_command += "-L {} \\\n".format(CONFIG["intervals"])
    # Write the result next to the sbatch script, inside the folder that was
    # just created, instead of a hard-coded "01_merge_snp_indels" folder that
    # only exists when step happens to carry that exact name.
    output = os.path.join(target_dir, "SGP_joincalled.snp.indels.vcf.gz")
    sbatch_command += "-o {} \\\n".format(output)
    sbatch_command += "-genotypeMergeOptions PRIORITIZE \\\n"
    sbatch_command += "-priority snps,indels \\\n"
    sbatch_command += "\n"

    sbatch_path = os.path.join(target_dir, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as MERGE:
        MERGE.write(sbatch_command)

    return submit_jobs([sbatch_path], jobs_id)
示例#4
0
def runPCA(folder, output, VCF, populations, jobs_id=None):
    """Write and submit an sbatch script that runs a PCA on a VCF file.

    The script chains three commands:
      1. vcftools converts the gzipped VCF to PLINK tped files,
      2. CONFIG["PLINK"] is run with --pca on those files,
      3. the CONFIG["PCA_to_plink"] python script is run on the resulting
         .eigenvec file together with the populations, and its standard
         output is redirected to <output>_PCA.pop.eigenvec.

    folder      -- sub-folder of the current working directory created for
                   this step; also used as the sbatch name
    output      -- prefix of every file produced inside the folder
    VCF         -- path given to vcftools --gzvcf
    populations -- iterable of values appended after --populations
    jobs_id     -- passed unchanged as second argument to submit_jobs

    Returns the value returned by submit_jobs, like the other submit
    helpers, or None when the folder already exists (nothing is written or
    submitted in that case).
    """
    output_folder = os.path.join(os.getcwd(), folder)
    if os.path.isdir(output_folder):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(folder))
        return
    #create the folder structure
    os.mkdir(output_folder)

    sbatch_command = sbatch_header(sbatch_name=folder, cwd=output_folder)
    #create tped
    sbatch_command += "vcftools --gzvcf {} --plink-tped --out {}/{} \n".format(
        VCF, output_folder, output)
    #run plink on this set
    sbatch_command += "{} -tfile {}/{}  --pca --out {}/{}_PCA \n".format(
        CONFIG["PLINK"], output_folder, output, output_folder, output)
    #create PCA table with population
    sbatch_command += "python {} --pca {}/{}_PCA.eigenvec --populations ".format(
        CONFIG["PCA_to_plink"], output_folder, output)
    sbatch_command += "".join(
        " {} ".format(population) for population in populations)
    sbatch_command += " > {}/{}_PCA.pop.eigenvec \n".format(
        output_folder, output)
    sbatch_command += "\n"

    sbatch_file = os.path.join(output_folder, "00_runPCA.sbatch")
    with open(sbatch_file, "w") as PCA:
        PCA.write(sbatch_command)
    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    # The job ids used to be computed and then dropped; hand them back so a
    # caller can chain further steps on this one.
    return slurm_jobs_id
示例#5
0
def main(args):
    """Drive the joint-calling workflow stage by stage.

    Reads args.configuration, args.resume and args.mixed_positions. After
    the configuration check, the stages run in this order: CombineGVCFs (or
    CombineGVCFs_resume), GenotypeGVCFs, CatVariants, then either VQSR
    (mixed positions) or SelectVariants, VariantRecalibrator and
    ApplyRecalibration. Each stage function returns sbatch files which are
    handed to submit_jobs unless CONFIG["dry_run"] is set; in a dry run the
    stage functions are still called but nothing is submitted.
    """
    # NOTE(review): the loaded value is never read in this function while
    # everything below uses CONFIG -- confirm that loading populates CONFIG.
    config = conf.load_yaml_config(args.configuration)
    if not check_configuration():
        sys.exit(
            "ERROR: configuration file was malformed, please edit it and retry"
        )

    def submit_unless_dry_run(stage_files, previous_jobs):
        # Submit stage_files after previous_jobs; in a dry run keep the
        # previous job ids untouched.
        if CONFIG["dry_run"]:
            return previous_jobs
        return submit_jobs(stage_files, previous_jobs)

    # 00_samples.txt records the samples being analysed and guards against
    # deleting an analysis by mistake.
    samples_path = "00_samples.txt"
    if args.resume:
        if os.path.exists(samples_path):
            sys.exit(
                "ERROR: -- resume  specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... "
            )
    else:
        with open(samples_path, "w") as samples_handle:
            for sample in CONFIG["samples_JC"]:
                samples_handle.write("{}\n".format(sample))

    ## IMPORTANT: samples_JC contains samples to be JointCalled.
    ######### START JOINT CALLING OF THE VARIANTS ####################
    ##### https://www.broadinstitute.org/gatk/guide/article?id=3893
    ##################################################################
    # join batches of samples: only the last/extra batches when resuming,
    # everything otherwise
    if args.resume:
        combine_files = CombineGVCFs_resume()
    else:
        combine_files = CombineGVCFs()
    slurm_jobs_id = None
    if not CONFIG["dry_run"]:
        slurm_jobs_id = submit_jobs(combine_files)

    # GenotypeGVCF step
    slurm_jobs_id = submit_unless_dry_run(GenotypeGVCFs(), slurm_jobs_id)
    # merge the per-chromosome results into a single one
    slurm_jobs_id = submit_unless_dry_run(CatVariants(), slurm_jobs_id)

    # VQSR
    if args.mixed_positions:
        slurm_jobs_id = submit_unless_dry_run(VQSR(), slurm_jobs_id)
    else:
        # select variants first, then recalibrate, then apply
        slurm_jobs_id = submit_unless_dry_run(SelectVariants(), slurm_jobs_id)
        slurm_jobs_id = submit_unless_dry_run(VariantRecalibrator(),
                                              slurm_jobs_id)
        slurm_jobs_id = submit_unless_dry_run(ApplyRecalibration(),
                                              slurm_jobs_id)
def main(args):
    """Run the joint-calling pipeline described by the configuration file.

    args must provide configuration, resume and mixed_positions. The
    function stops with an error when check_configuration() fails, or when
    resuming while 00_samples.txt exists. Otherwise it calls the stage
    functions in order and, unless CONFIG["dry_run"] is set, submits the
    sbatch files each of them returns through submit_jobs, passing along
    the job ids obtained from the previous submission.
    """
    # NOTE(review): this local is not used afterwards; the code relies on
    # CONFIG instead -- verify that load_yaml_config fills it in.
    config = conf.load_yaml_config(args.configuration)
    configuration_ok = check_configuration()
    if not configuration_ok:
        sys.exit("ERROR: configuration file was malformed, please edit it and retry")

    # keep the list of analysed samples in a file
    resuming = args.resume
    if resuming and os.path.exists("00_samples.txt"):
        sys.exit("ERROR: -- resume  specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
    if not resuming:
        # creating 00_samples.txt protects against deleting an analysis by mistake
        with open("00_samples.txt", "w") as samples_out:
            samples_out.writelines(
                "{}\n".format(sample) for sample in CONFIG["samples_JC"])

    ## IMPORTANT: samples_JC contains samples to be JointCalled.
    ######### START JOINT CALLING OF THE VARIANTS ####################
    ##### https://www.broadinstitute.org/gatk/guide/article?id=3893
    ##################################################################
    # when resuming only the last batch (plus extra ones) is recomputed,
    # otherwise everything starts from scratch
    combine_stage = CombineGVCFs_resume if resuming else CombineGVCFs
    sbatch_files = combine_stage()
    slurm_jobs_id = None
    if not CONFIG["dry_run"]:
        slurm_jobs_id = submit_jobs(sbatch_files)

    # genotype the combined gVCFs, then merge the chromosomes into one file
    for stage in (GenotypeGVCFs, CatVariants):
        sbatch_files = stage()
        if not CONFIG["dry_run"]:
            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)

    # finally perform VQSR, either in one go or as select/recalibrate/apply
    if args.mixed_positions:
        closing_stages = (VQSR,)
    else:
        closing_stages = (SelectVariants, VariantRecalibrator,
                          ApplyRecalibration)
    for stage in closing_stages:
        sbatch_files = stage()
        if not CONFIG["dry_run"]:
            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
示例#7
0
def merge_with_1KGP(step, vcf_one, vcf_two, jobs_id):
    """Write and submit an sbatch script that merges two VCFs and keeps the
    intersection.

    The script first runs GATK CombineVariants on vcf_one (tagged "SGP") and
    vcf_two (tagged "1KGP"), writing 1KGP_SGP.vcf.gz in the step folder,
    then runs GATK SelectVariants on that file with
    -select 'set == "Intersection"', writing 1KGP_SGP.intersection.vcf.gz.

    step    -- sub-folder of the current working directory created for this
               step; also the sbatch name and the script base name
    jobs_id -- passed unchanged as second argument to submit_jobs

    Returns the value returned by submit_jobs, or None when the step folder
    already exists (nothing is written or submitted in that case).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(step_dir)

    merged_vcf = os.path.join(step_dir, "1KGP_SGP.vcf.gz")
    script = sbatch_header(sbatch_name=step, cwd=step_dir)
    script += "".join([
        # first command: merge the two call sets
        "java -Xmx250g -jar {} -T CombineVariants  \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16  \\\n",
        "--variant:SGP {} \\\n".format(vcf_one),
        "--variant:1KGP {} \\\n".format(vcf_two),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \n".format(merged_vcf),
        "\n",
        # second command: keep only the records present in both sets
        "java -Xmx250g -jar {} -T SelectVariants  \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16  \\\n",
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-V {} \\\n".format(merged_vcf),
        "-select \'set == \"Intersection\"\' \\\n",
        "-o {} \\\n".format(
            os.path.join(step_dir, "1KGP_SGP.intersection.vcf.gz")),
        "\n",
    ])

    script_path = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(script_path, "w") as handle:
        handle.write(script)

    return submit_jobs([script_path], jobs_id)
def select_EU_samples(step, vcf_all, jobs_id):
    """Write and submit an sbatch script that extracts a sample subset.

    The script runs GATK SelectVariants on vcf_all over CONFIG["intervals"],
    passing CONFIG["EU_samples"] to -sf, and writes EU_1KGP_SGP.vcf.gz in
    the step folder.

    step    -- sub-folder of the current working directory created for this
               step; also the sbatch name and the script base name
    vcf_all -- value given to the -V argument
    jobs_id -- passed unchanged as second argument to submit_jobs

    Returns the value returned by submit_jobs, or None when the step folder
    already exists (nothing is written or submitted in that case).
    """
    workdir = os.path.join(os.getcwd(), step)
    if os.path.isdir(workdir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(workdir)

    header = sbatch_header(sbatch_name=step, cwd=workdir)
    arguments = (
        "java -Xmx250g -jar {} -T SelectVariants  \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16  \\\n",
        "-V {} \\\n".format(vcf_all),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \\\n".format(os.path.join(workdir, "EU_1KGP_SGP.vcf.gz")),
        "-sf  {}\\\n".format(CONFIG["EU_samples"]),
        "\n",
    )
    script = header + "".join(arguments)

    script_path = os.path.join(workdir, "{}.sbatch".format(step))
    with open(script_path, "w") as handle:
        handle.write(script)
    return submit_jobs([script_path], jobs_id)
def merge_with_1KGP(step, vcf_one, vcf_two, jobs_id):
    """Submit a two-command sbatch: combine two VCFs, then select the
    intersection.

    Command one is GATK CombineVariants with vcf_one as --variant:SGP and
    vcf_two as --variant:1KGP; its output is 1KGP_SGP.vcf.gz in the step
    folder. Command two is GATK SelectVariants on that output with
    -select 'set == "Intersection"'; its output is
    1KGP_SGP.intersection.vcf.gz in the same folder.

    step    -- folder created under the current working directory; also
               the sbatch name and the script base name
    jobs_id -- forwarded as second argument to submit_jobs

    Returns what submit_jobs returns; returns None right away when the step
    folder is already present.
    """
    base = os.getcwd()
    if os.path.isdir(os.path.join(base, step)):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(os.path.join(base, step))

    header = sbatch_header(sbatch_name=step, cwd=os.path.join(base, step))

    gatk_call = "java -Xmx250g -jar {} -T {}  \\\n"
    combined = os.path.join(base, step, "1KGP_SGP.vcf.gz")
    combine_cmd = (
        gatk_call.format(CONFIG["GATK"], "CombineVariants")
        + "-R {} \\\n".format(CONFIG["reference"])
        + "-nt 16  \\\n"
        + "--variant:SGP {} \\\n".format(vcf_one)
        + "--variant:1KGP {} \\\n".format(vcf_two)
        + "-L {} \\\n".format(CONFIG["intervals"])
        + "-o {} \n".format(combined)
    )

    intersection = os.path.join(base, step, "1KGP_SGP.intersection.vcf.gz")
    select_cmd = (
        gatk_call.format(CONFIG["GATK"], "SelectVariants")
        + "-R {} \\\n".format(CONFIG["reference"])
        + "-nt 16  \\\n"
        + "-L {} \\\n".format(CONFIG["intervals"])
        + "-V {} \\\n".format(combined)
        + "-select \'set == \"Intersection\"\' \\\n"
        + "-o {} \\\n".format(intersection)
    )

    # one empty line separates the two commands and another ends the script
    sbatch_text = header + combine_cmd + "\n" + select_cmd + "\n"

    sbatch_path = os.path.join(base, step, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as INTERSECT:
        INTERSECT.write(sbatch_text)

    jobs = submit_jobs([sbatch_path], jobs_id)
    return jobs
def runPCA(folder, output, VCF, populations, jobs_id=None):
    """Create the PCA folder, write 00_runPCA.sbatch into it and submit it.

    The script converts VCF to PLINK tped files with vcftools, runs
    CONFIG["PLINK"] with --pca on them, and finally runs the
    CONFIG["PCA_to_plink"] python script on the .eigenvec file with the
    given populations, redirecting its output to <output>_PCA.pop.eigenvec.

    folder      -- folder created under the current working directory; also
                   the sbatch name
    output      -- prefix used for all files written inside the folder
    VCF         -- gzipped VCF handed to vcftools --gzvcf
    populations -- iterable of values listed after --populations
    jobs_id     -- forwarded as second argument to submit_jobs

    Returns what submit_jobs returns, in line with the other submit helpers.
    Returns None right away when the folder is already present.
    """
    cwd = os.getcwd()
    output_folder = os.path.join(cwd, folder)
    if os.path.isdir(output_folder):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(folder))
        return
    #create the folder structure
    os.mkdir(output_folder)

    population_args = "".join(" {} ".format(population) for population in populations)
    sbatch_command = sbatch_header(sbatch_name=folder, cwd=output_folder)
    #create tped
    sbatch_command += "vcftools --gzvcf {} --plink-tped --out {}/{} \n".format(VCF, output_folder, output)
    #run plink on this set
    sbatch_command += "{} -tfile {}/{}  --pca --out {}/{}_PCA \n".format(CONFIG["PLINK"], output_folder, output, output_folder, output)
    #create PCA table with population
    sbatch_command += "python {} --pca {}/{}_PCA.eigenvec --populations ".format(CONFIG["PCA_to_plink"], output_folder, output)
    sbatch_command += population_args
    sbatch_command += " > {}/{}_PCA.pop.eigenvec \n".format(output_folder, output)
    sbatch_command += "\n"

    sbatch_path = os.path.join(output_folder, "00_runPCA.sbatch")
    with open(sbatch_path, "w") as PCA:
        PCA.write(sbatch_command)
    # Return the submitted job ids instead of discarding them, so follow-up
    # steps can be chained on this job.
    return submit_jobs([sbatch_path], jobs_id)