Exemplo n.º 1
0
    def run(self):
        """Run GATK4 Mutect2 on the tumor/normal input pair.

        The command writes ``<prefix>.vcf`` and ``<prefix>.bam``, where
        ``<prefix>`` is the output path with ``.vcf`` removed.  When
        ``config.bed_file_path`` is set it is passed via ``--intervals``.
        After a dry run the output path is touched.
        """
        valid_path(self.output().path, check_ofile=1)

        # Optional interval restriction taken from the configured BED file.
        interval_opt = (" --intervals %s" % config.bed_file_path
                        if config.bed_file_path else '')

        # NOTE(review): the template passes --all-site-pls twice (once bare,
        # once with "true"); kept verbatim -- confirm GATK accepts this.
        template = "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_normal} -normal {N_name} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam {extra_str} "
        fields = {
            "gatk4": config.gatk_pro,
            "REF": config.REF_file_path,
            # Not referenced by the template; still read from config as before.
            "cosmic": config.cos_snp,
            "db_snp": config.db_snp,
            "input_normal": self.input()["normal"].path,
            "input_tumor": self.input()["tumor"].path,
            "N_name": self.infodict_N["SampleID"],
            "T_name": self.infodict_T["SampleID"],
            "prefix": self.output().path.replace('.vcf', ""),
            "extra_str": interval_opt,
        }
        run_cmd(template.format(**fields),
                dry_run=self.dry_run,
                log_file=self.infodict_N.get("log_path", None))

        if not self.dry_run:
            return
        # After a dry run, touch the output path; this call uses dry_run=False.
        run_cmd("touch %s" % self.output().path,
                dry_run=False,
                log_file=self.get_log_path())
Exemplo n.º 2
0
 def run(self):
     """Pipe the input VCF through ``vt decompose -s`` and ``vt normalize``.

     The normalized result is redirected into the output path.
     """
     template = "{vt} decompose -s {input_vcf} | {vt} normalize -r {REF} - > {vt_vcf}"
     fields = {
         "vt": config.vt_pro,
         "input_vcf": self.input().path,
         "REF": config.REF_file_path,
         "vt_vcf": self.output().path,
     }
     run_cmd(template.format(**fields), dry_run=self.dry_run)
Exemplo n.º 3
0
    def run(self):
        """Run GATK 3.6 MuTect2 on a single sample in artifact-detection mode.

        ``infodict["Somatic"]`` must be ``"N"`` (normal only) or ``"T"``
        (tumor only); tumor-only samples additionally get ``--tumor_lod 4``.
        After a dry run the output path is touched.

        Raises:
            Exception: if the Somatic value is neither ``"N"`` nor ``"T"``.
        """
        valid_path(self.output().path, check_ofile=1)
        sample_bam = self.input().path

        somatic_type = self.infodict["Somatic"]
        if somatic_type not in ("N", "T"):
            raise Exception("Unknown values of Somatic columns (like '%s' )" %
                            somatic_type)
        # Only tumor-only samples get the extra LOD option.
        lod_opt = ' --tumor_lod 4' if somatic_type == "T" else ''

        # Both normal and tumor samples are passed as --input_file:tumor.
        template = '''java {java_option} -jar {gatk} -T MuTect2 --allSitePLs --artifact_detection_mode -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_f} --out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log {extra_str}'''
        fields = {
            "gatk": config.gatkv36_path,
            "java_option": config.java_option,
            "REF": config.REF_file_path,
            "cosmic": config.cos_snp,
            "db_snp": config.db_snp,
            "input_f": sample_bam,
            "output_f": self.output().path,
            "prefix": self.output().path.replace('.vcf', ''),
            "extra_str": lod_opt,
        }
        run_cmd(template.format(**fields),
                dry_run=self.dry_run,
                log_file=self.get_log_path())

        if not self.dry_run:
            return
        # After a dry run, touch the output path; this call uses dry_run=False.
        run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 4
0
 def run(self):
     """Merge the indel and SNP input VCFs with GATK4 ``MergeVcfs``."""
     valid_path(self.output().path, check_ofile=1)
     template = """{gatk4} MergeVcfs --java-options "-Xmx4g" -R {REF} --INPUT {input_indel} --INPUT {input_snp} --OUTPUT {output_f}"""
     fields = {
         "gatk4": config.gatk_pro,
         "REF": config.REF_file_path,
         "input_indel": self.input()["indel"].path,
         "input_snp": self.input()["snp"].path,
         "output_f": self.output().path,
     }
     run_cmd(template.format(**fields),
             dry_run=self.dry_run,
             log_file=self.get_log_path())
Exemplo n.º 5
0
 def run(self):
     """Combine the indel and SNP input VCFs with GATK 3.6 ``CombineVariants``.

     After a dry run the output path is touched.
     """
     valid_path(self.output().path, check_ofile=1)
     template = "java -Xmx4g -jar {gatk} -T CombineVariants -R {REF} --variant:indel {input_indel} --variant:snp {input_snp} --interval_padding 25 --out {output_f} --setKey set --genotypemergeoption UNSORTED"
     fields = {
         "gatk": config.gatkv36_path,
         "REF": config.REF_file_path,
         "input_indel": self.input()["indel"].path,
         "input_snp": self.input()["snp"].path,
         "output_f": self.output().path,
     }
     run_cmd(template.format(**fields),
             dry_run=self.dry_run,
             log_file=self.get_log_path())

     if not self.dry_run:
         return
     # After a dry run, touch the output path; this call uses dry_run=False.
     run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 6
0
    def run(self):
        """Run ``add_per_info_into_csv.py`` once per annovar CSV input.

        Each ``X.csv`` gets a sibling ``X_added_cov.csv`` as the script's
        output; the tumor and normal BAM paths are passed to every call.
        """
        normal_bam = self.input()["normal"].path
        tumor_bam = self.input()["tumor"].path

        csv_paths = [target.path for target in self.input()["annovar"]]
        for src_csv in csv_paths:
            dst_csv = src_csv.replace('.csv', '_added_cov.csv')
            run_cmd(
                f"python3 {project_root_path}/api/add_per_info_into_csv.py -i {src_csv} -o {dst_csv} -tb {tumor_bam} -nb {normal_bam}",
                log_file=self.get_log_path(),
                dry_run=self.dry_run)
Exemplo n.º 7
0
 def run(self):
     """Load each input VCF into its paired gemini database and annotate it.

     Outputs and inputs are paired positionally.  The output of the
     ``gemini annotate`` step is appended to a ``.log`` file that sits next
     to the database (``.db`` replaced by ``.log``).
     """
     # The template is shared by every (output, input) pair.
     template = """{gemini} load --cores {threads} -t VEP -v {vep_output_vcf_gz} {Output_db}; \
     {gemini} annotate -f {vep_output_vcf_gz} -a extract \
     -c SAD,SAF,AF,AD,BaseQRankSum,FS,MQRankSum,ReadPosRankSum,SOR \
     -t text,float,float,text,float,float,float,float,float \
     -o list,list,list,list,mean,mean,mean,mean,mean {Output_db} >> {gemini_log} 2>&1"""
     for db_target, vcf_target in zip(self.output(), self.input()):
         fields = {
             "gemini": config.gemini_pro,
             "threads": config.gemini_thread,
             "vep_output_vcf_gz": vcf_target.path,
             "Output_db": db_target.path,
             "gemini_log": db_target.path.replace('.db', ".log"),
         }
         run_cmd(template.format(**fields), dry_run=self.dry_run)
Exemplo n.º 8
0
 def run(self):
     """Run PCGR (``pcgr.py``) on the ``.vt.vcf`` sibling of the input VCF.

     The report is written into the directory that holds the output path,
     using the ``pcgr`` conda environment and the ``grch37`` assembly.
     """
     vt_vcf = self.input().path.replace('.vcf', '.vt.vcf')
     report_dir = dirname(self.output().path)
     sample_label = self.infodict["source_name"]
     valid_path(report_dir, check_odir=1)

     # todo: convert ref_path to grch37??
     template = "source activate pcgr; python3 {pcgr_dir}/pcgr.py --input_vcf {input_vcf} {pcgr_dir} {output_dir} grch37 {toml_config} {source_name} --no-docker --force_overwrite"
     fields = {
         "pcgr_dir": config.pcgr_dir,
         "input_vcf": vt_vcf,
         "output_dir": report_dir,
         "toml_config": config.pcgr_toml_file,
         "source_name": sample_label,
     }
     run_cmd(template.format(**fields),
             dry_run=self.dry_run,
             log_file=self.get_log_path())
Exemplo n.º 9
0
    def run(self):
        """Call germline variants with GATK4 ``HaplotypeCaller``.

        When ``config.bed_file_path`` is set, calling is restricted to it
        via ``--intervals``.  An unset path (empty string or ``None``) adds
        no interval option.
        """
        valid_path(self.output().path, check_ofile=1)

        # Truthiness test (as in the Mutect2 tasks): a None bed path must not
        # be rendered as the literal "--intervals None".
        if config.bed_file_path:
            extra_str = " --intervals {}".format(config.bed_file_path)
        else:
            extra_str = ""
        cmdline = "{gatk4} HaplotypeCaller --java-options '-Xmx30g' --native-pair-hmm-threads 30 --reference {ref} --input {input} --genotyping-mode DISCOVERY --dbsnp {dbsnp} -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {output} {extra_str}".format(
            ref=config.REF_file_path,
            input=self.input().path,
            dbsnp=config.db_snp,
            output=self.output().path,
            extra_str=extra_str,
            gatk4=config.gatk_pro)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Exemplo n.º 10
0
    def run(self):
        """Extract SNPs or indels from the input VCF with GATK4 ``SelectVariants``.

        ``self.object_type`` picks the variant class: ``"snp"`` selects
        ``SNP`` and ``"indel"`` selects ``INDEL``.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            selecttype = "SNP"
        elif self.object_type == "indel":
            selecttype = "INDEL"
        else:
            # Name the offending value so a misconfigured task is diagnosable.
            raise Exception(
                "Unknown object_type {!r}; expected 'snp' or 'indel'".format(
                    self.object_type))

        cmdline = "{gatk4} SelectVariants --java-options '-Xmx4g' -R {REF} -V {input_f} -select-type {selecttype} -O {output_f}".format(
            gatk4=config.gatk_pro,
            REF=config.REF_file_path,
            input_f=self.input().path,
            output_f=self.output().path,
            selecttype=selecttype)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Exemplo n.º 11
0
    def run(self):
        """Hard-filter SNPs or indels with GATK4 ``VariantFiltration``.

        ``self.object_type`` (``"snp"`` or ``"indel"``) selects the filter
        expression; the filter is named ``my_<object_type>_filter``.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
        elif self.object_type == "indel":
            filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
        else:
            # Name the offending value so a misconfigured task is diagnosable.
            raise Exception(
                "Unknown object_type {!r}; expected 'snp' or 'indel'".format(
                    self.object_type))

        cmdline = """{gatk4} VariantFiltration --java-options '-Xmx4g' -R {REF} -V {input_f} --filter-expression "{filterExpression}" --filter-name \"my_{object_type}_filter\" -O {output_f}""".format(
            gatk4=config.gatk_pro,
            REF=config.REF_file_path,
            input_f=self.input().path,
            output_f=self.output().path,
            filterExpression=filterExpression,
            object_type=self.object_type)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Exemplo n.º 12
0
 def run(self):
     """Hard-filter SNPs or indels with GATK 3.6 ``VariantFiltration``.

     ``self.object_type`` (``"snp"`` or ``"indel"``) selects the filter
     expression; the filter is named ``my_<object_type>_filter``.  After a
     dry run the output path is touched.

     Raises:
         Exception: if ``self.object_type`` is neither ``"snp"`` nor
             ``"indel"``.
     """
     valid_path(self.output().path, check_ofile=1)
     if self.object_type == "snp":
         filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
     elif self.object_type == "indel":
         filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
     else:
         # Name the offending value so a misconfigured task is diagnosable.
         raise Exception(
             "Unknown object_type {!r}; expected 'snp' or 'indel'".format(
                 self.object_type))
     cmdline = """java -Xmx4g -jar {gatk} -T VariantFiltration -R {REF} -V {input_f} --filterExpression "{filterExpression}" --filterName \"my_{object_type}_filter\" -o {output_f}""".format(
         gatk=config.gatkv36_path,
         REF=config.REF_file_path,
         input_f=self.input().path,
         output_f=self.output().path,
         filterExpression=filterExpression,
         object_type=self.object_type)
     run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
     if self.dry_run:
         # After a dry run, touch the output path; this call uses dry_run=False.
         run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 13
0
    def run(self):
        """Extract SNPs or indels from the input VCF with GATK 3.6 ``SelectVariants``.

        ``self.object_type`` picks the variant class: ``"snp"`` selects
        ``SNP`` and ``"indel"`` selects ``INDEL``.  After a dry run the
        output path is touched.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            selecttype = "SNP"
        elif self.object_type == "indel":
            selecttype = "INDEL"
        else:
            # Name the offending value so a misconfigured task is diagnosable.
            raise Exception(
                "Unknown object_type {!r}; expected 'snp' or 'indel'".format(
                    self.object_type))

        cmdline = "java -Xmx4g -jar {gatk} -T SelectVariants -R {REF} -V {input_f} -selectType {selecttype} -o {output_f}".format(
            gatk=config.gatkv36_path,
            REF=config.REF_file_path,
            input_f=self.input().path,
            output_f=self.output().path,
            selecttype=selecttype)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # After a dry run, touch the output path; this call uses dry_run=False.
            run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 14
0
    def run(self):
        """Call germline variants with GATK 3.6 ``HaplotypeCaller``.

        When ``config.bed_file_path`` is set, calling is restricted to it
        via ``-L``.  An unset path (empty string or ``None``) adds no
        interval option.  After a dry run the output path is touched.
        """
        valid_path(self.output().path, check_ofile=1)

        # Truthiness test (as in the Mutect2 tasks): a None bed path must not
        # be rendered as the literal "-L None".
        if config.bed_file_path:
            extra_str = "-L %s" % config.bed_file_path
        else:
            extra_str = ''
        cmdline = "java -Xmx4g -jar {gatk} -T HaplotypeCaller -nct {gatk_thread} -R {REF} -I {input} {extra_str} --genotyping_mode DISCOVERY --dbsnp {db_snp} -stand_call_conf 10 -stand_emit_conf 5 -A AlleleBalance -A Coverage -A FisherStrand -o {output_f}".format(
            gatk=config.gatkv36_path,
            gatk_thread=config.gatk_thread,
            REF=config.REF_file_path,
            input=self.input().path,
            extra_str=extra_str,
            db_snp=config.db_snp,
            output_f=self.output().path)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # After a dry run, touch the output path; this call uses dry_run=False.
            run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 15
0
    def run(self):
        """Run GATK 3.6 MuTect2 on the tumor/normal input pair.

        The BAM output and the tool log are written next to the output VCF
        as ``<prefix>.bam`` and ``<prefix>.log``, where ``<prefix>`` is the
        output path with ``.vcf`` removed.  After a dry run the output path
        is touched.
        """
        valid_path(self.output().path, check_ofile=1)

        template = '''java {java_option} -jar {gatk} -T MuTect2 --allSitePLs -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:normal {input_normal} --input_file:tumor {input_tumor} --out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log'''
        fields = {
            "prefix": self.output().path.replace('.vcf', ""),
            "input_normal": self.input()["normal"].path,
            "input_tumor": self.input()["tumor"].path,
            "gatk": config.gatkv36_path,
            "java_option": config.java_option,
            "REF": config.REF_file_path,
            "cosmic": config.cos_snp,
            "db_snp": config.db_snp,
            "output_f": self.output().path,
        }
        run_cmd(template.format(**fields),
                dry_run=self.dry_run,
                log_file=self.get_log_path())

        if not self.dry_run:
            return
        # After a dry run, touch the output path; this call uses dry_run=False.
        run_cmd("touch %s" % self.output().path, dry_run=False)
Exemplo n.º 16
0
    def run(self):
        """Merge the paired and tumor-only VCFs, then normalize the result with vt.

        Unless this is a dry run, the filtered CSV (second element of the
        ``"filtered"`` input) is converted to a BED file with ``csv2bed``
        and passed to ``merge_two_vcf`` together with both VCFs.  The merged
        output is then piped through ``vt decompose``/``vt normalize`` into
        a ``.vt.vcf`` sibling of the output path.
        """
        from post_pipelines_analysis.extracted_pos_from_vcf import merge_two_vcf
        from special_fun.csv2bed import csv2bed

        # The VCFs sit next to the '.av' inputs under a '.vcf' name.
        paired_vcf = self.input()["pair_vcf"].path.replace('.av', '.vcf')
        tumor_only_vcf = self.input()["tumor_vcf"].path.replace('.av', '.vcf')
        filtered_csv = self.input()["filtered"][1].path

        if not self.dry_run:
            merge_two_vcf(paired_vcf,
                          tumor_only_vcf,
                          csv2bed(filtered_csv,
                                  filtered_csv.replace('.csv', '.bed')),
                          self.output().path,
                          log_file=self.get_log_path())

        template = "{vt} decompose -s {input_vcf} | {vt} normalize -r {REF_path} - > {output_vcf}"
        fields = {
            "vt": config.vt_pro,
            "input_vcf": self.output().path,
            "REF_path": config.REF_file_path,
            "output_vcf": self.output().path.replace('.vcf', '.vt.vcf'),
        }
        run_cmd(template.format(**fields),
                dry_run=self.dry_run,
                log_file=self.get_log_path())
Exemplo n.º 17
0
    def run(self):
        """Run GATK4 Mutect2 on a single (normal-only or tumor-only) sample.

        ``infodict["Somatic"]`` must be ``"N"`` or ``"T"``.  When
        ``config.bed_file_path`` is set, calling is restricted to it via
        ``--intervals``.  The command writes ``<prefix>.vcf`` and
        ``<prefix>.bam``, where ``<prefix>`` is the output path with ``.vcf``
        removed.  After a dry run the output path is touched.

        Raises:
            Exception: if the Somatic value is neither ``"N"`` nor ``"T"``.
        """
        valid_path(self.output().path, check_ofile=1)
        input_f = self.input().path
        sample_name = self.infodict["SampleID"]

        somatic_type = self.infodict["Somatic"]
        if somatic_type not in ("N", "T"):
            raise Exception("Unknown values of Somatic columns (like '%s' )" %
                            somatic_type)

        # The interval option used to be computed, then overwritten by the
        # Somatic branches, and was never placed in the command; it is now
        # actually passed, as in the paired Mutect2 task.
        if config.bed_file_path:
            extra_str = " --intervals %s" % config.bed_file_path
        else:
            extra_str = ""
        # NOTE(review): the old tumor-only branch set ' --tumor_lod 4', but it
        # never reached the command.  It is left out here because that looks
        # like the GATK3 spelling -- confirm the intended GATK4 option before
        # adding it.

        cmdline = "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam{extra_str}".format(
            gatk4=config.gatk_pro,
            REF=config.REF_file_path,
            db_snp=config.db_snp,
            input_tumor=input_f,
            prefix=self.output().path.replace('.vcf', ''),
            T_name=sample_name,
            extra_str=extra_str)
        run_cmd(cmdline,
                dry_run=self.dry_run,
                log_file=self.infodict.get("log_path", None))
        if self.dry_run:
            # After a dry run, touch the output path; this call uses dry_run=False.
            run_cmd("touch %s" % self.output().path,
                    dry_run=False,
                    log_file=self.get_log_path())
Exemplo n.º 18
0
def prepare_vcf(vcf_path, log_file=sys.stdout):
    """Make sure a bgzip-compressed, bcftools-indexed copy of a VCF exists.

    A path not ending in ``.gz`` is compressed to ``<vcf_path>.gz`` and
    indexed.  A ``.gz`` path is indexed only when no ``.csi`` file sits
    next to it.

    Args:
        vcf_path: path of the plain or already-compressed VCF.
        log_file: forwarded to ``run_cmd`` for every command.

    Returns:
        The path of the compressed VCF.
    """
    index_template = '%s index %s'

    if vcf_path.endswith('.gz'):
        # Already compressed: build the index only if it is missing.
        if not os.path.isfile(vcf_path + '.csi'):
            run_cmd(index_template % (bcftools_path, vcf_path),
                    log_file=log_file)
        return vcf_path

    compress_cmd = '{bgzip} -c {vcf_p} > {vcf_p}.gz'.format(
        bgzip=config.bgzip_pro, vcf_p=vcf_path)
    run_cmd(compress_cmd, log_file=log_file)
    compressed_path = vcf_path + '.gz'
    run_cmd(index_template % (bcftools_path, compressed_path),
            log_file=log_file)
    return compressed_path
Exemplo n.º 19
0
def merge_two_vcf(pair_vcf, single_vcf, bed, output_vcf, log_file=sys.stdout):
    """Concatenate the BED-restricted records of two VCFs into ``output_vcf``.

    Each input is compressed/indexed with ``prepare_vcf``, restricted to
    ``bed`` with every sample named in its header excluded
    (``bcftools view -R ... -s ^<samples>``), and sorted.  The two
    intermediates are concatenated with duplicates removed
    (``bcftools concat -a -d all``), ``##SAMPLE=<ID=`` header lines are
    dropped, and the result is written to ``output_vcf`` and then
    compressed/indexed.  The intermediate ``<output_vcf>1*``, ``2*`` and
    ``3*`` files are removed at the end.

    Args:
        pair_vcf: path of the paired VCF (plain or ``.gz``).
        single_vcf: path of the single-sample VCF (plain or ``.gz``).
        bed: regions file passed to ``bcftools view -R``.
        output_vcf: path of the merged, uncompressed VCF to write.
        log_file: forwarded to ``run_cmd``/``prepare_vcf``.
    """

    def _sample_names(vcf):
        # Comma-joined sample columns (10th onward) of the '#C...' header
        # line.  The pipe is closed explicitly instead of being leaked.
        with os.popen("zgrep '^#C' %s | cut -f 10-" % vcf) as pipe:
            header = pipe.read()
        return header.replace('\n', '').replace('\t', ',')

    # Use the compressed paths prepare_vcf reports rather than rebuilding
    # them with str.replace('.gz', ''), which also strips '.gz' from
    # directory names or other parts of the path.
    pair_gz = prepare_vcf(pair_vcf, log_file=log_file)
    single_gz = prepare_vcf(single_vcf, log_file=log_file)
    pair_sample_name = _sample_names(pair_vcf)
    single_sample_name = _sample_names(single_vcf)

    formatted_line = '''{bcftools_path} view {vcf} -R {bed} -s ^{SM} --force-samples | {bcftools_path} sort > {output}; '''
    cmdline1 = formatted_line.format(vcf=pair_gz,
                                     bcftools_path=bcftools_path,
                                     bed=bed,
                                     output=output_vcf + '1',
                                     SM=pair_sample_name)
    cmdline2 = formatted_line.format(vcf=single_gz,
                                     bcftools_path=bcftools_path,
                                     bed=bed,
                                     output=output_vcf + '2',
                                     SM=single_sample_name)
    run_cmd(cmdline1, log_file=log_file)
    run_cmd(cmdline2, log_file=log_file)

    part1_gz = prepare_vcf(output_vcf + '1', log_file=log_file)
    part2_gz = prepare_vcf(output_vcf + '2', log_file=log_file)

    formatted_line2 = """{bcftools} concat {o1} {o2} -a -d all > {output}""".format(
        bcftools=bcftools_path,
        o1=part1_gz,
        o2=part2_gz,
        output=output_vcf + '3')
    run_cmd(formatted_line2, log_file=log_file)

    # Copy the concatenated VCF, dropping the ##SAMPLE=<ID= header lines.
    with open(output_vcf + '3', "r") as fr, open(output_vcf, 'w') as f1:
        f1.writelines(row for row in fr
                      if not row.startswith("##SAMPLE=<ID="))
    prepare_vcf(output_vcf, log_file=log_file)
    run_cmd('rm {o}1* ;rm {o}2* ; rm {o}3* '.format(o=output_vcf))
Exemplo n.º 20
0
    def run(self):
        """Annotate the ``.vt.vcf`` input with VEP, then bgzip and tabix-index it.

        VEP writes a ``.vep.vcf`` sibling of the input and appends its
        output to a ``.vep.log`` sibling.  The annotated VCF is compressed
        into the output path, which is then indexed with ``tabix -p vcf``.
        """
        vt_vcf = self.input().path
        vep_vcf = vt_vcf.replace('.vt.vcf', '.vep.vcf')
        vep_gz = self.output().path

        vep_cmd = """{vep} -i {vt_vcf} -o {vep_output_vcf} --vcf --cache --merged --fasta {REF} --sift b --polyphen b --symbol --numbers --biotype \
        --total_length --canonical --ccds --gene_phenotype --uniprot --assembly GRCh37 \
        --force_overwrite --offline --domains --regulatory --protein --tsl --variant_class --fork {threads} --force \
        --no_stats >> {vep_log} 2>&1""".format(
            vep=config.vep_pro,
            vt_vcf=vt_vcf,
            REF=config.REF_file_path,
            vep_output_vcf=vep_vcf,
            threads=config.vep_thread,
            vep_log=vt_vcf.replace('.vt.vcf', '.vep.log'))
        run_cmd(vep_cmd, dry_run=self.dry_run)

        bgzip_cmd = '{bgzip} -c {vep_output_vcf} > {vep_output_vcf_gz}'.format(
            bgzip=config.bgzip_pro,
            vep_output_vcf=vep_vcf,
            vep_output_vcf_gz=vep_gz)
        run_cmd(bgzip_cmd, dry_run=self.dry_run)

        tabix_cmd = '{tabix} -p vcf {vep_output_vcf_gz}'.format(
            tabix=config.tabix_pro, vep_output_vcf_gz=vep_gz)
        run_cmd(tabix_cmd, dry_run=self.dry_run)
Exemplo n.º 21
0
def germline_filter(indir, odir, tab):
    """Run ``api/var_filters.py`` with the given input dir, table and output dir.

    Args:
        indir: value passed to the script's ``-i`` option.
        odir: value passed to the script's ``-o`` option.
        tab: value passed to the script's ``--tab`` option.
    """
    script = f"{project_root_path}/api/var_filters.py"
    command = " ".join([
        "python3", script,
        "-i", f"{indir}",
        "--tab", f"{tab}",
        "-o", f"{odir}",
    ])
    run_cmd(command, dry_run=False)
Exemplo n.º 22
0
def test_somatic_gatk4(odir):
    """Run the ``somatic_gatk4`` workflow on the bundled somatic test set.

    Args:
        odir: output directory; the command log goes to
            ``<odir>/cmd_log.txt``.
    """
    command = " ".join([
        "python3", f"{project_root_path}/luigi_pipelines/main.py",
        "workflow",
        "--tab", f"{project_root_path}/test_set/somatic/data_input.tsv",
        "--odir", f"{odir}",
        "--analysis-type", "somatic_gatk4",
        "--workers", "5",
        "--log-path", f"{odir}/cmd_log.txt",
    ])
    run_cmd(command, dry_run=False)