Пример #1
0
    def run(self):
        """Call variants on one unpaired sample with GATK3 MuTect2.

        ``self.infodict["Somatic"]`` decides the trailing option: ``"N"``
        (normal only) adds nothing, ``"T"`` (tumor only) adds
        ``--tumor_lod 4``. In both cases the BAM is handed over as
        ``--input_file:tumor``. When ``self.dry_run`` is set, a
        ``touch <output>`` command is additionally issued with
        ``dry_run=False``.

        Raises:
            Exception: if the ``Somatic`` value is neither ``"N"`` nor ``"T"``.
        """
        vcf_path = self.output().path
        valid_path(vcf_path, check_ofile=1)
        bam_path = self.input().path

        sample_kind = self.infodict["Somatic"]
        if sample_kind not in ("N", "T"):
            raise Exception("Unknown values of Somatic columns (like '%s' )" %
                            sample_kind)
        # Only tumor-only samples get an extra option; normal-only gets none.
        tail_opts = ' --tumor_lod 4' if sample_kind == "T" else ''

        # Normal and tumor samples alike are passed via --input_file:tumor.
        template = ("java {java_option} -jar {gatk} -T MuTect2"
                    " --allSitePLs --artifact_detection_mode"
                    " -R {REF} --cosmic {cosmic} --dbsnp {db_snp}"
                    " --input_file:tumor {input_f}"
                    " --out {output_f} --bamOutput {prefix}.bam"
                    " --log_to_file {prefix}.log {extra_str}")
        command = template.format(
            gatk=config.gatkv36_path,
            java_option=config.java_option,
            REF=config.REF_file_path,
            cosmic=config.cos_snp,
            db_snp=config.db_snp,
            input_f=bam_path,
            output_f=vcf_path,
            prefix=vcf_path.replace('.vcf', ''),
            extra_str=tail_opts)
        run_cmd(command, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % vcf_path, dry_run=False)
Пример #2
0
    def run(self):
        """Call somatic variants on a tumor/normal pair with GATK4 Mutect2.

        Sample names come from the ``SampleID`` entries of ``infodict_N`` and
        ``infodict_T``. If ``config.bed_file_path`` is truthy the call is
        limited with ``--intervals``. The main command logs to the normal
        sample's ``log_path`` entry (``None`` if absent); the dry-run
        ``touch`` logs to ``self.get_log_path()``.
        """
        vcf_path = self.output().path
        valid_path(vcf_path, check_ofile=1)
        out_prefix = vcf_path.replace('.vcf', "")

        bams = self.input()
        normal_bam = bams["normal"].path
        tumor_bam = bams["tumor"].path

        bed = config.bed_file_path
        interval_opt = " --intervals %s" % bed if bed else ''

        normal_id = self.infodict_N["SampleID"]
        tumor_id = self.infodict_T["SampleID"]

        template = (
            "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20"
            " --reference {REF}"
            " -I {input_normal} -normal {N_name}"
            " -I {input_tumor} -tumor {T_name}"
            " --dbsnp {db_snp} --seconds-between-progress-updates 60"
            " --all-site-pls -stand-call-conf 10"
            " -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality"
            " -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest"
            " -A ReadPosRankSumTest -A ChromosomeCounts"
            " --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam"
            " {extra_str} ")
        # `cosmic` is supplied although the template has no such placeholder.
        command = template.format(
            REF=config.REF_file_path,
            cosmic=config.cos_snp,
            db_snp=config.db_snp,
            input_tumor=tumor_bam,
            input_normal=normal_bam,
            gatk4=config.gatk_pro,
            N_name=normal_id,
            T_name=tumor_id,
            prefix=out_prefix,
            extra_str=interval_opt)
        run_cmd(command,
                dry_run=self.dry_run,
                log_file=self.infodict_N.get("log_path", None))
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % vcf_path,
                    dry_run=False,
                    log_file=self.get_log_path())
Пример #3
0
 def output(self):
     """Return the two CSV targets written into the filtered directory.

     The directory is ``config.filtered_dir`` formatted with the task's
     ``odir`` and ``source_name``; the targets are
     ``<source_name>_except_AF_depth.csv`` and
     ``<source_name>_except_AF_depth_PASS.csv``, in that order.
     """
     base_dir = self.infodict["odir"]
     sample = self.infodict["source_name"]
     target_dir = config.filtered_dir.format(path=base_dir, PairN=sample)
     valid_path(target_dir, check_odir=1)
     suffixes = ('_except_AF_depth.csv', '_except_AF_depth_PASS.csv')
     return [luigi.LocalTarget(join(target_dir, sample + suffix))
             for suffix in suffixes]
Пример #4
0
    def output(self):
        """Return the ``<source_name>_final.vcf`` target in the filtered directory.

        The directory is ``config.filtered_dir`` formatted with the task's
        ``odir`` and ``source_name`` and is passed through ``valid_path``
        (``check_odir=1``) before the target is built.
        """
        info = self.infodict
        base_dir = info["odir"]
        sample = info["source_name"]

        target_dir = config.filtered_dir.format(path=base_dir, PairN=sample)
        valid_path(target_dir, check_odir=1)
        return luigi.LocalTarget(join(target_dir, sample + '_final.vcf'))
Пример #5
0
 def output(self):
     """Return the PCGR HTML report target.

     The report lives in a ``pcgr_output`` directory next to the input file
     and is named ``<source_name>.pcgr_acmg.grch37.html``.
     """
     report_dir = join(dirname(self.input().path), "pcgr_output")
     valid_path(report_dir, check_odir=1)
     report_name = "{}.pcgr_acmg.grch37.html".format(
         self.infodict["source_name"])
     # TODO: convert ref_path to grch37??
     return luigi.LocalTarget(join(report_dir, report_name))
Пример #6
0
 def run(self):
     """Merge the indel and SNP VCFs into one file with GATK4 MergeVcfs.

     The two inputs are read from ``self.input()["indel"]`` and
     ``self.input()["snp"]``; the indel file is listed first.
     """
     merged_vcf = self.output().path
     valid_path(merged_vcf, check_ofile=1)
     template = ('{gatk4} MergeVcfs --java-options "-Xmx4g" -R {REF}'
                 ' --INPUT {input_indel} --INPUT {input_snp}'
                 ' --OUTPUT {output_f}')
     sources = self.input()
     command = template.format(
         gatk4=config.gatk_pro,
         REF=config.REF_file_path,
         input_indel=sources["indel"].path,
         input_snp=sources["snp"].path,
         output_f=merged_vcf)
     run_cmd(command, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #7
0
 def run(self):
     """Combine the indel and SNP VCFs with GATK3 CombineVariants.

     Inputs come from ``self.input()["indel"]`` and ``self.input()["snp"]``.
     When ``self.dry_run`` is set, a ``touch <output>`` command is
     additionally issued with ``dry_run=False``.
     """
     combined_vcf = self.output().path
     valid_path(combined_vcf, check_ofile=1)
     template = ("java -Xmx4g -jar {gatk} -T CombineVariants -R {REF}"
                 " --variant:indel {input_indel} --variant:snp {input_snp}"
                 " --interval_padding 25 --out {output_f}"
                 " --setKey set --genotypemergeoption UNSORTED")
     sources = self.input()
     command = template.format(
         gatk=config.gatkv36_path,
         REF=config.REF_file_path,
         input_indel=sources["indel"].path,
         input_snp=sources["snp"].path,
         output_f=combined_vcf)
     run_cmd(command, dry_run=self.dry_run, log_file=self.get_log_path())
     if self.dry_run:
         # Issue a touch on the output path with dry_run disabled.
         run_cmd("touch %s" % combined_vcf, dry_run=False)
Пример #8
0
 def run(self):
     """Run ``pcgr.py`` (grch37, no docker) on the ``.vt.vcf`` sibling of the input.

     The input path has ``.vcf`` replaced by ``.vt.vcf``; the report is
     written to the directory that holds ``self.output()``. The command
     first activates the ``pcgr`` environment via ``source activate``.
     """
     vt_vcf = self.input().path.replace('.vcf', '.vt.vcf')
     report_dir = dirname(self.output().path)
     sample = self.infodict["source_name"]
     valid_path(report_dir, check_odir=1)
     # TODO: convert ref_path to grch37??
     template = ("source activate pcgr;"
                 " python3 {pcgr_dir}/pcgr.py --input_vcf {input_vcf}"
                 " {pcgr_dir} {output_dir} grch37 {toml_config} {source_name}"
                 " --no-docker --force_overwrite")
     command = template.format(
         pcgr_dir=config.pcgr_dir,
         input_vcf=vt_vcf,
         output_dir=report_dir,
         toml_config=config.pcgr_toml_file,
         source_name=sample)
     run_cmd(command, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #9
0
    def run(self):
        """Call germline variants on the input BAM with GATK4 HaplotypeCaller.

        If ``config.bed_file_path`` is set, calling is restricted to it with
        ``--intervals``; otherwise no interval option is passed.
        """
        valid_path(self.output().path, check_ofile=1)

        # Truthiness (not `!= ''`) so that an unset path given as None is
        # treated like the empty string instead of yielding
        # "--intervals None"; this matches the Mutect2 tasks.
        if config.bed_file_path:
            extra_str = " --intervals {}".format(config.bed_file_path)
        else:
            extra_str = ""
        cmdline = (
            "{gatk4} HaplotypeCaller --java-options '-Xmx30g'"
            " --native-pair-hmm-threads 30"
            " --reference {ref} --input {input}"
            " --genotyping-mode DISCOVERY --dbsnp {dbsnp} -stand-call-conf 10"
            " -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality"
            " -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest"
            " -A ReadPosRankSumTest -A ChromosomeCounts"
            " --all-site-pls true --output {output} {extra_str}").format(
                ref=config.REF_file_path,
                input=self.input().path,
                dbsnp=config.db_snp,
                output=self.output().path,
                extra_str=extra_str,
                gatk4=config.gatk_pro)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #10
0
    def run(self):
        """Extract SNPs or indels from the input VCF with GATK4 SelectVariants.

        ``self.object_type`` picks the ``-select-type`` value: ``"snp"`` maps
        to ``SNP`` and ``"indel"`` to ``INDEL``.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            selecttype = "SNP"
        elif self.object_type == "indel":
            selecttype = "INDEL"
        else:
            # Same exception type as before, but now naming the bad value so
            # a failing task can be diagnosed from the log.
            raise Exception(
                "Unknown object_type %r (expected 'snp' or 'indel')" %
                (self.object_type,))

        cmdline = ("{gatk4} SelectVariants --java-options '-Xmx4g'"
                   " -R {REF} -V {input_f}"
                   " -select-type {selecttype} -O {output_f}").format(
                       gatk4=config.gatk_pro,
                       REF=config.REF_file_path,
                       input_f=self.input().path,
                       output_f=self.output().path,
                       selecttype=selecttype)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #11
0
    def run(self):
        """Hard-filter the input VCF with GATK4 VariantFiltration.

        ``self.object_type`` (``"snp"`` or ``"indel"``) selects the filter
        expression; records are tagged with the filter name
        ``my_<object_type>_filter``.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
        elif self.object_type == "indel":
            filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
        else:
            # Same exception type as before, but now naming the bad value so
            # a failing task can be diagnosed from the log.
            raise Exception(
                "Unknown object_type %r (expected 'snp' or 'indel')" %
                (self.object_type,))

        cmdline = ("{gatk4} VariantFiltration --java-options '-Xmx4g'"
                   ' -R {REF} -V {input_f}'
                   ' --filter-expression "{filterExpression}"'
                   ' --filter-name "my_{object_type}_filter"'
                   ' -O {output_f}').format(
                       gatk4=config.gatk_pro,
                       REF=config.REF_file_path,
                       input_f=self.input().path,
                       output_f=self.output().path,
                       filterExpression=filterExpression,
                       object_type=self.object_type)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #12
0
    def run(self):
        """Extract SNPs or indels from the input VCF with GATK3 SelectVariants.

        ``self.object_type`` picks the ``-selectType`` value: ``"snp"`` maps
        to ``SNP`` and ``"indel"`` to ``INDEL``. When ``self.dry_run`` is
        set, a ``touch <output>`` command is additionally issued with
        ``dry_run=False``.

        Raises:
            Exception: if ``self.object_type`` is neither ``"snp"`` nor
                ``"indel"``.
        """
        valid_path(self.output().path, check_ofile=1)
        if self.object_type == "snp":
            selecttype = "SNP"
        elif self.object_type == "indel":
            selecttype = "INDEL"
        else:
            # Same exception type as before, but now naming the bad value so
            # a failing task can be diagnosed from the log.
            raise Exception(
                "Unknown object_type %r (expected 'snp' or 'indel')" %
                (self.object_type,))

        cmdline = ("java -Xmx4g -jar {gatk} -T SelectVariants"
                   " -R {REF} -V {input_f}"
                   " -selectType {selecttype} -o {output_f}").format(
                       gatk=config.gatkv36_path,
                       REF=config.REF_file_path,
                       input_f=self.input().path,
                       output_f=self.output().path,
                       selecttype=selecttype)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #13
0
 def run(self):
     """Hard-filter the input VCF with GATK3 VariantFiltration.

     ``self.object_type`` (``"snp"`` or ``"indel"``) selects the filter
     expression; records are tagged with the filter name
     ``my_<object_type>_filter``. When ``self.dry_run`` is set, a
     ``touch <output>`` command is additionally issued with
     ``dry_run=False``.

     Raises:
         Exception: if ``self.object_type`` is neither ``"snp"`` nor
             ``"indel"``.
     """
     valid_path(self.output().path, check_ofile=1)
     if self.object_type == "snp":
         filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
     elif self.object_type == "indel":
         filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
     else:
         # Same exception type as before, but now naming the bad value so
         # a failing task can be diagnosed from the log.
         raise Exception(
             "Unknown object_type %r (expected 'snp' or 'indel')" %
             (self.object_type,))
     cmdline = ('java -Xmx4g -jar {gatk} -T VariantFiltration'
                ' -R {REF} -V {input_f}'
                ' --filterExpression "{filterExpression}"'
                ' --filterName "my_{object_type}_filter"'
                ' -o {output_f}').format(
                    gatk=config.gatkv36_path,
                    REF=config.REF_file_path,
                    input_f=self.input().path,
                    output_f=self.output().path,
                    filterExpression=filterExpression,
                    object_type=self.object_type)
     run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
     if self.dry_run:
         # Issue a touch on the output path with dry_run disabled.
         run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #14
0
    def run(self):
        """Call germline variants on the input BAM with GATK3 HaplotypeCaller.

        If ``config.bed_file_path`` is set, calling is restricted to it with
        ``-L``; otherwise no interval option is passed. When
        ``self.dry_run`` is set, a ``touch <output>`` command is additionally
        issued with ``dry_run=False``.
        """
        valid_path(self.output().path, check_ofile=1)

        # Truthiness (not `!= ''`) so that an unset path given as None is
        # treated like the empty string instead of yielding "-L None"; this
        # matches the Mutect2 tasks.
        if config.bed_file_path:
            extra_str = "-L %s" % config.bed_file_path
        else:
            extra_str = ''
        cmdline = (
            "java -Xmx4g -jar {gatk} -T HaplotypeCaller -nct {gatk_thread}"
            " -R {REF} -I {input} {extra_str}"
            " --genotyping_mode DISCOVERY --dbsnp {db_snp}"
            " -stand_call_conf 10 -stand_emit_conf 5"
            " -A AlleleBalance -A Coverage -A FisherStrand"
            " -o {output_f}").format(
                gatk=config.gatkv36_path,
                gatk_thread=config.gatk_thread,
                REF=config.REF_file_path,
                input=self.input().path,
                extra_str=extra_str,
                db_snp=config.db_snp,
                output_f=self.output().path)
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #15
0
    def run(self):
        """Call somatic variants on a tumor/normal pair with GATK3 MuTect2.

        The BAMs are read from ``self.input()["normal"]`` and
        ``self.input()["tumor"]``. The companion ``.bam`` and ``.log`` files
        share the output path with ``.vcf`` removed. When ``self.dry_run``
        is set, a ``touch <output>`` command is additionally issued with
        ``dry_run=False``.
        """
        vcf_path = self.output().path
        valid_path(vcf_path, check_ofile=1)
        out_prefix = vcf_path.replace('.vcf', "")

        bams = self.input()
        normal_bam = bams["normal"].path
        tumor_bam = bams["tumor"].path

        template = ("java {java_option} -jar {gatk} -T MuTect2 --allSitePLs"
                    " -R {REF} --cosmic {cosmic} --dbsnp {db_snp}"
                    " --input_file:normal {input_normal}"
                    " --input_file:tumor {input_tumor}"
                    " --out {output_f} --bamOutput {prefix}.bam"
                    " --log_to_file {prefix}.log")
        command = template.format(
            gatk=config.gatkv36_path,
            java_option=config.java_option,
            REF=config.REF_file_path,
            cosmic=config.cos_snp,
            db_snp=config.db_snp,
            input_tumor=tumor_bam,
            input_normal=normal_bam,
            prefix=out_prefix,
            output_f=vcf_path)
        run_cmd(command, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % vcf_path, dry_run=False)
Пример #16
0
    def run(self):
        """Call variants on one unpaired sample with GATK4 Mutect2.

        The sample BAM is passed with ``-I`` and its ``SampleID`` with
        ``-tumor``, whatever the ``Somatic`` flag says. If
        ``config.bed_file_path`` is set, calling is restricted to it with
        ``--intervals``. The main command logs to the sample's ``log_path``
        entry (``None`` if absent); the dry-run ``touch`` logs to
        ``self.get_log_path()``.

        Raises:
            Exception: if ``self.infodict["Somatic"]`` is neither ``"N"``
                nor ``"T"``.
        """
        valid_path(self.output().path, check_ofile=1)
        input_f = self.input().path
        sample_name = self.infodict["SampleID"]

        # Interval restriction. Previously this string was built and then
        # discarded (overwritten below and absent from the template), so the
        # BED file was silently ignored for single-sample calls.
        if config.bed_file_path:
            extra_str = " --intervals %s" % config.bed_file_path
        else:
            extra_str = ""

        somatic_type = self.infodict["Somatic"]
        if somatic_type not in ("N", "T"):
            raise Exception("Unknown values of Somatic columns (like '%s' )" %
                            somatic_type)
        # NOTE(review): for tumor-only ("T") samples the old code assigned
        # ' --tumor_lod 4' but never passed it to the command. That spelling
        # is the GATK3 MuTect2 option; it is deliberately still not passed
        # here -- confirm the intended GATK4 equivalent before enabling it.

        cmdline = (
            "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20"
            " --reference {REF}"
            " -I {input_tumor} -tumor {T_name}"
            " --dbsnp {db_snp} --seconds-between-progress-updates 60"
            " --all-site-pls -stand-call-conf 10"
            " -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality"
            " -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest"
            " -A ReadPosRankSumTest -A ChromosomeCounts"
            " --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam"
            "{extra_str}").format(
                gatk4=config.gatk_pro,
                REF=config.REF_file_path,
                db_snp=config.db_snp,
                input_tumor=input_f,
                prefix=self.output().path.replace('.vcf', ''),
                T_name=sample_name,
                extra_str=extra_str)
        run_cmd(cmdline,
                dry_run=self.dry_run,
                log_file=self.infodict.get("log_path", None))
        if self.dry_run:
            # Issue a touch on the output path with dry_run disabled.
            run_cmd("touch %s" % self.output().path,
                    dry_run=False,
                    log_file=self.get_log_path())
Пример #17
0
def add_per_info(result_csvs, output_csvs, tumor_bam, normal_bam, bed_file):
    '''
    Annotate variant tables with per-site values read from the BAM files.

    For every ``(result_csv, output_csv)`` pair the input table is loaded and
    the columns named in the module-level ``added_col`` are initialised to 0.
    Each row is then either marked ``'Off target'`` in all of those columns
    (no position of ``Start..End`` lies inside a BED interval) or filled with
    the three values returned by ``parse_bam``: the tumor BAM fills the last
    three ``added_col`` entries, the normal BAM the first three.

    :param result_csvs: input CSV paths; every table needs the columns
        ``Ref``, ``Alt``, ``Chr``, ``Start`` and ``End``.
    :param output_csvs: output CSV paths, paired by position with
        ``result_csvs``.
    :param tumor_bam: tumor BAM path; a falsy value skips the tumor columns.
    :param normal_bam: normal BAM path; a falsy value skips the normal
        columns.
    :param bed_file: tab-separated BED file without header; columns 1 and 2
        are used as interval start and (exclusive) end.
    :return: None. The annotated tables are written to ``output_csvs``.
    '''
    fasta = config.REF_file_path
    print('{:#^40}'.format('Start Whole project...'))
    t1 = time.time()
    bed_df = pd.read_csv(bed_file, sep='\t', header=None)

    # Every position covered by some BED interval, kept in a set so that the
    # per-row on-target check below is a hash lookup instead of a scan over
    # all positions.
    # NOTE(review): only positions are collected -- the chromosome column of
    # the BED file is never consulted, so a variant counts as on target when
    # its position is covered on *any* chromosome. Confirm this is intended.
    target_positions = set()
    for _idx in tqdm(range(bed_df.shape[0])):
        target_positions.update(range(bed_df.iloc[_idx, 1],
                                      bed_df.iloc[_idx, 2]))

    tb = nb = ref_ = None
    try:
        # A BAM is only opened when a path was given, matching the
        # `if tumor_bam` / `if normal_bam` semantics used per row.
        if tumor_bam:
            tb = pysam.AlignmentFile(tumor_bam)
        if normal_bam:
            nb = pysam.AlignmentFile(normal_bam)
        ref_ = pysam.FastaFile(fasta)

        for result_csv, output_csv in zip(result_csvs, output_csvs):
            # realpath() alone does not expand a leading "~"; expand it
            # first so home-relative paths resolve instead of failing.
            result_csv = os.path.realpath(os.path.expanduser(result_csv))
            output_csv = os.path.realpath(os.path.expanduser(output_csv))
            ori_csv = pd.read_csv(result_csv, index_col=None)
            t2 = time.time()
            print('{:#^40}'.format('Loaded/Inited all required file...... Using %d ' % (t2 - t1)))
            print('{:#^40}'.format('Star Iteration.......'))

            for col in added_col:
                # Initialise (or reset) each annotation column.
                ori_csv.loc[:, col] = 0

            for _index, row in tqdm(ori_csv.iterrows(),
                                    total=ori_csv.shape[0]):
                Ref = row['Ref']
                Alt = row['Alt']
                Chr = row['Chr']
                Pos = int(row['Start'])
                End = int(row['End'])
                # Off target: none of Start..End (inclusive) is covered by a
                # BED interval. A partial overlap (e.g. a deletion reaching
                # into an interval) still counts as on target.
                if target_positions.isdisjoint(range(Pos, End + 1)):
                    ori_csv.loc[_index, added_col] = 'Off target'
                    continue
                if tb is not None:
                    mut_cov, mut_per, ref_cov = parse_bam(tb, Chr, Pos, End, Ref, Alt, ref_)
                    ori_csv.loc[_index, added_col[-3:]] = mut_cov, mut_per, ref_cov
                if nb is not None:
                    mut_cov, mut_per, ref_cov = parse_bam(nb, Chr, Pos, End, Ref, Alt, ref_)
                    ori_csv.loc[_index, added_col[:3]] = mut_cov, mut_per, ref_cov
            print('{:#^40}'.format('Almost Completing. Iteration used %d.' % (time.time() - t2)))
            # NOTE: despite this message no rows are filtered here; an
            # earlier tumor-vs-normal percentage filter was disabled.
            print('{:#^40}'.format('filtering all unconvinced snp/indel.'))

            valid_path(output_csv, check_ofile=1)
            with open(output_csv, 'w') as f1:
                ori_csv.to_csv(f1, index=False)
    finally:
        # Release the BAM/FASTA handles even if a table fails midway.
        for handle in (tb, nb, ref_):
            if handle is not None:
                handle.close()