Пример #1
0
 def copy_bam_files(self):
     """Copy the pipeline BAM, its index and the Picard target BED into the align dir.

     Sets ``self._final_bam`` to ``<align_dir>/<sample>_final.bam``. When
     ``md5sum_check`` accepts that file together with its ``.md5`` companion
     nothing is copied. Otherwise the source files are chosen by
     ``self._pipeline`` ('901' = DNA, '902' = RNA), copied with
     ``self.copy_files``, a TDF is generated for the final BAM and
     ``run_command_md5sum`` is called for it.

     An unknown pipeline code is logged through ``log_error`` and the process
     exits with status 1.
     """
     sample = self._sample_name
     self._final_bam = join(self._align_dir, '{0}_final.bam'.format(sample))
     final_bam_md5 = '{0}.md5'.format(self._final_bam)

     if md5sum_check(self._final_bam, final_bam_md5):
         log_progress(__modname__, 'Copy the BAM file to output directory already finished', f=self._log_file)
         return

     log_progress(__modname__, 'Copy the BAM file to output directory', f=self._log_file)

     # pipeline code -> (intermediate sub-directory, BAM name template, BED name)
     # 901: DNA, 902: RNA
     layouts = {
         '901': ('DNA_IntermediateFiles', '{0}_realigned.bam', 'DNA_PicardTarget.bed'),
         '902': ('RNA_IntermediateFiles', '{0}.bam', 'RNA_PicardTarget.bed'),
     }
     layout = layouts.get(self._pipeline)
     if layout is None:
         log_error(__modname__, 'Unknown pipeline code {0} for TST170 pipeline'.format(self._pipeline), f=self._log_file)
         sys.exit(1)

     intermediate_dir, bam_template, bed_name = layout
     source_dir = join(self._tst170_dir, intermediate_dir, 'Alignment')
     source_bam = join(source_dir, bam_template.format(sample))

     # BAM first, then its index (same path with a '.bai' suffix), then the BED.
     self.copy_files(source_bam, self._final_bam)
     self.copy_files('{0}.bai'.format(source_bam), '{0}.bai'.format(self._final_bam))
     self.copy_files(join(source_dir, bed_name), join(self._align_dir, bed_name))

     self.generate_tdf_file(self._final_bam)
     run_command_md5sum(__modname__, self._log_file, self._final_bam, final_bam_md5)
     log_progress(__modname__, 'Copy the BAM file finished', f=self._log_file)
Пример #2
0
 def vcf_post_processing(self, input_file, refined_vcf):
     """Produce ``refined_vcf`` from ``input_file`` using ``vt``.

     The step is skipped when ``md5sum_check`` accepts ``refined_vcf`` and its
     ``.md5`` companion; the vt version is logged either way. Otherwise any
     existing ``refined_vcf`` is deleted and a two-stage command list
     (``vt normalize -r <hg19> <input>`` followed by ``vt decompose -s -``) is
     handed to ``run_command_pipe_file_handle`` with ``refined_vcf`` as the
     target written in mode 'w'.
     """
     md5_path = '{0}.md5'.format(refined_vcf)
     already_done = md5sum_check(refined_vcf, md5_path)

     if already_done:
         banner = 'VCF post processing already finished!!!'
     else:
         banner = 'VCF post processing start'
     log_progress(__modname__, banner, f=self._log_file)
     log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
     if already_done:
         return

     # Drop a stale result before regenerating it.
     if os.path.exists(refined_vcf):
         os.remove(refined_vcf)

     normalize_stage = '{0} normalize -r {1} {2}'.format(
         self._sw['vt'], self._sw['hg19'], input_file)
     decompose_stage = '{0} decompose -s -'.format(self._sw['vt'])
     run_command_pipe_file_handle(__modname__,
                                  [normalize_stage, decompose_stage],
                                  self._log_file, 'w', refined_vcf)
     run_command_md5sum(__modname__, self._log_file, refined_vcf, md5_path)
     log_progress(__modname__,
                  'VCF post processing finished',
                  f=self._log_file)
Пример #3
0
 def pileup_depth(self, pileup_depth):
     """Write per-position depth for ``self._final_bam`` to ``pileup_depth``.

     Skipped when ``md5sum_check`` accepts ``pileup_depth`` and its ``.md5``
     companion; the samtools version is logged in both cases. Otherwise a
     ``samtools depth`` command restricted to ``self._target_bed`` is passed
     to ``run_command_file_handle`` with ``pileup_depth`` as the target
     written in mode 'w', and ``run_command_md5sum`` is called afterwards.
     """
     checksum_path = '%s.md5' % pileup_depth
     already_done = md5sum_check(pileup_depth, checksum_path)

     if already_done:
         banner = 'Get Pileup Depth already finished!!!'
     else:
         banner = 'Get Pileup Depth start'
     log_progress(__modname__, banner, f=self._log_file)
     log_version(__modname__, self._sw['samtools_ver'], f=self._log_file)
     if already_done:
         return

     depth_cmd = [self._sw['samtools'], 'depth']
     depth_cmd += ['-a', '-q', '0', '-Q', '1', '-d', '1000000']
     depth_cmd += ['-b', self._target_bed]
     depth_cmd += ['--reference', self._sw['hg19']]
     depth_cmd.append(self._final_bam)

     run_command_file_handle(__modname__, depth_cmd, self._log_file, 'w',
                             pileup_depth)
     run_command_md5sum(__modname__, self._log_file, pileup_depth,
                        checksum_path)
     log_progress(__modname__, 'Get Pileup Depth finished', f=self._log_file)
Пример #4
0
    def low_confidence_annotation(self, input_file, lowconf_vcf):
        """Run the two low-confidence annotation scripts to build ``lowconf_vcf``.

        Skipped when ``md5sum_check`` accepts ``lowconf_vcf`` and its ``.md5``
        companion. Otherwise stale outputs are removed, the
        ``ngb_lowconf_homopolyx`` script is run on ``input_file`` writing an
        intermediate ``<sample>_lowconf.homopolyx`` file in the variant
        directory, and the ``ngb_lowconf_repeatcnt`` script is run on that
        intermediate file writing ``lowconf_vcf``.
        """
        homopolyx_out = join(
            self._variant_dir,
            '{0}_lowconf.homopolyx'.format(self._sample_name))
        md5_path = '{0}.md5'.format(lowconf_vcf)

        if md5sum_check(lowconf_vcf, md5_path):
            log_progress(__modname__,
                         'Low confidence annotation already finished!!!',
                         f=self._log_file)
            return

        log_progress(__modname__,
                     'Low confidence annotation start',
                     f=self._log_file)

        # Clear leftovers of an earlier, incomplete run.
        for stale in (homopolyx_out, lowconf_vcf):
            if os.path.exists(stale):
                os.remove(stale)

        reference = self._sw['hg19']

        homopolyx_cmd = ['python', self._sw['ngb_lowconf_homopolyx']]
        homopolyx_cmd += ['-p', '5', '-r', reference]
        homopolyx_cmd += ['-o', homopolyx_out, input_file]
        run_command(__modname__, homopolyx_cmd, self._log_file)

        repeatcnt_cmd = ['python', self._sw['ngb_lowconf_repeatcnt']]
        repeatcnt_cmd += ['-r', reference]
        repeatcnt_cmd += ['-o', lowconf_vcf, homopolyx_out]
        run_command(__modname__, repeatcnt_cmd, self._log_file)

        run_command_md5sum(__modname__, self._log_file, lowconf_vcf, md5_path)
        log_progress(__modname__,
                     'Low confidence annotation finished',
                     f=self._log_file)
Пример #5
0
 def remove_reference_info(self, input_file, remove_ref_vcf):
     """Build ``remove_ref_vcf`` from ``input_file`` without reference-only lines.

     Skipped when ``md5sum_check`` accepts ``remove_ref_vcf`` and its ``.md5``
     companion; the vcftools and vt versions are logged either way.
     Otherwise any existing ``remove_ref_vcf`` is deleted and a four-stage
     command list is handed to ``run_command_pipe_file_handle`` with
     ``remove_ref_vcf`` as the target written in mode 'w':

     1. ``vcftools --vcf <input> --recode --stdout``
     2. ``grep -v`` on the pattern ``0[/|]0``
     3. ``grep -v`` on the pattern ``\\.[/|]\\.``
     4. ``vt sort -``
     """
     remove_ref_vcf_md5 = '{0}.md5'.format(remove_ref_vcf)
     # remove only reference...
     already_done = md5sum_check(remove_ref_vcf, remove_ref_vcf_md5)

     if already_done:
         banner = 'Remove only reference in VCF already finished!!!'
     else:
         banner = 'Remove only reference in VCF start'
     log_progress(__modname__, banner, f=self._log_file)
     log_version(__modname__, self._sw['vcftools_ver'], f=self._log_file)
     log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
     if already_done:
         return

     if os.path.exists(remove_ref_vcf):
         os.remove(remove_ref_vcf)

     exec_cmd = [
         '{0} --vcf {1} --recode --stdout'.format(self._sw['vcftools'],
                                                  input_file),
         'grep -v "0[/|]0"',
         # Raw string: "\." is not a valid Python escape, so the plain
         # literal triggers an invalid-escape warning on modern interpreters.
         # The text handed to grep is unchanged.
         r'grep -v "\.[/|]\."',
         '{0} sort -'.format(self._sw['vt']),
     ]
     run_command_pipe_file_handle(__modname__, exec_cmd, self._log_file,
                                  'w', remove_ref_vcf)
     run_command_md5sum(__modname__, self._log_file, remove_ref_vcf,
                        remove_ref_vcf_md5)
     log_progress(__modname__,
                  'Remove only reference in VCF finished',
                  f=self._log_file)
Пример #6
0
    def run(self, summary_file, mapq_file, stat_json_file, flag):
        """Merge two tab-separated stat files into a QC JSON document.

        Skipped when ``md5sum_check`` accepts ``stat_json_file`` and its
        ``.md5`` companion. Otherwise:

        * ``self._cutoff_uniformity05`` is set to 5 for ``flag == "solid"`` or
          10 for ``flag == "blood"``; any other flag leaves it untouched.
        * Every line of ``summary_file`` and then ``mapq_file`` that splits
          into exactly two tab-separated fields becomes a ``stat_data`` entry
          (spaces in the key replaced by underscores; later entries override
          earlier ones).
        * ``self.workflow(stat_data)`` is stored under ``"qc_data"`` and
          dumped to ``stat_json_file``, then ``run_command_md5sum`` is called.

        Any exception while parsing or writing is logged via ``log_error`` and
        the process exits with status 1.
        """
        json_md5 = "{0}.md5".format(stat_json_file)
        if md5sum_check(stat_json_file, json_md5):
            log_progress(__modname__,
                         "Analysis Statistics already finished",
                         f=self._log_file)
            return

        log_progress(__modname__,
                     "Analysis Statistics start",
                     f=self._log_file)

        if flag == "solid":
            self._cutoff_uniformity05 = 5
        elif flag == "blood":
            self._cutoff_uniformity05 = 10

        # (path, error-message template) in the order the files are merged.
        inputs = (
            (summary_file, "Parsing stat summary file error: {0}"),
            (mapq_file, "Parsing mapping quality file error: {0}"),
        )
        stat_data = {}
        for path, error_template in inputs:
            try:
                with open(path, "r") as handle:
                    rows = handle.readlines()
                for row in rows:
                    fields = row.replace("\n", "").split("\t")
                    if len(fields) != 2:
                        continue
                    key, value = fields
                    stat_data[key.replace(" ", "_")] = value
            except Exception as ex:
                log_error(__modname__,
                          error_template.format(ex),
                          f=self._log_file)
                sys.exit(1)

        json_data = {"qc_data": self.workflow(stat_data)}
        try:
            with open(stat_json_file, "w") as json_handle:
                json.dump(json_data,
                          json_handle,
                          ensure_ascii=False,
                          sort_keys=True,
                          indent=2)
            run_command_md5sum(__modname__, self._log_file, stat_json_file,
                               json_md5)
        except Exception as ex:
            log_error(__modname__, "{0}".format(ex), f=self._log_file)
            sys.exit(1)

        log_progress(__modname__,
                     "Analysis Statistics finished",
                     f=self._log_file)
0
 def workflow(self):
     """Convert the CNV VCF into a TSV table plus a gene/fold-change list.

     Skipped when ``md5sum_check`` accepts both ``self._cnv_tsv`` and
     ``self._cnv_fc_stat`` with their ``.md5`` companions. Otherwise both
     outputs are removed and rebuilt from ``self._cnv_vcf``:

     * ``self._cnv_tsv`` gets a header line and one row per record whose
       first ALT is not ``None``.
     * ``self._cnv_fc_stat`` gets one ``<INFO/ANT>\\t<FC>`` line for every
       record, regardless of ALT.

     The alternate-allele column is the first ALT with its surrounding angle
     brackets stripped (``<DUP>`` -> ``DUP``, ``<DEL>`` -> ``DEL``). Any
     other ALT is written the same way instead of silently reusing the
     previous record's value (or raising ``NameError`` on the first record).

     All three files are handled by context managers so they are closed even
     when a record fails to parse. The fold-change file is opened once; as
     a result it exists (empty) even when the VCF holds no records.
     """
     tsv_md5 = '{0}.md5'.format(self._cnv_tsv)
     fc_stat_md5 = '{0}.md5'.format(self._cnv_fc_stat)
     if md5sum_check(self._cnv_tsv, tsv_md5) and md5sum_check(
             self._cnv_fc_stat, fc_stat_md5):
         log_progress(__modname__,
                      'CNV TSV file generation already finished!!!',
                      f=self._log_file)
         return

     log_progress(__modname__,
                  'CNV TSV file generation start',
                  f=self._log_file)
     for stale in (self._cnv_tsv, self._cnv_fc_stat):
         if os.path.exists(stale):
             os.remove(stale)

     header = 'chromosome\tstart\tend\treference\talternate\tSV_type\tgene\tfold_change\n'
     row_format = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'

     with open(self._cnv_vcf, 'r') as vcf_handle:
         # Build the reader before creating the outputs, as the original did.
         vcf_reader = vcf.Reader(vcf_handle)
         with open(self._cnv_tsv, 'w') as cnv_tsv, \
                 open(self._cnv_fc_stat, 'w') as fc_stat:
             cnv_tsv.write(header)
             for record in vcf_reader:
                 first_alt = record.ALT[0]
                 if first_alt is not None:
                     # Symbolic alleles look like '<DUP>'; keep the bare name.
                     allele_alternate = str(first_alt).strip('<>')
                     cnv_tsv.write(row_format.format(
                         record.CHROM, int(record.POS),
                         int(record.INFO['END']), record.REF,
                         allele_alternate, record.INFO['SVTYPE'],
                         record.INFO['ANT'],
                         float(record.samples[0]['FC'])))
                 fc_stat.write('{0}\t{1}\n'.format(record.INFO['ANT'],
                                                   record.samples[0]['FC']))

     run_command_md5sum(__modname__, self._log_file, self._cnv_tsv,
                        tsv_md5)
     run_command_md5sum(__modname__, self._log_file, self._cnv_fc_stat,
                        fc_stat_md5)
     log_progress(__modname__,
                  'CNV TSV file generation finished',
                  f=self._log_file)
Пример #8
0
 def run_hered_qc_report(self):
     """Write the status marker and generate the QC report PDF if needed.

     ``self._status_log_file`` is always overwritten with the
     'QC Report Generation' status and progress value 95. The report at
     ``<output_dir>/data/stat/<sample>.pdf`` is regenerated through
     ``Her_QC_Report(...).run()`` unless ``md5sum_check`` accepts it together
     with its ``.md5`` companion.
     """
     with open(self._status_log_file, 'w') as status_handle:
         status_handle.write('[STATUS] QC Report Generation\n[PROGRESS] 95')

     pdf_name = "{0}.pdf".format(self._sample_name)
     qc_report_file = join(self._output_dir, "data", "stat", pdf_name)
     report_md5 = '{0}.md5'.format(qc_report_file)

     if md5sum_check(qc_report_file, report_md5):
         log_progress(__modname__, 'QC Report Generation already finished', f=self._log_file)
         return

     log_progress(__modname__, 'Run QC Report Generation', f=self._log_file)
     Her_QC_Report(
         self._sample_name,
         self._output_dir,
         self._fastq_r1,
         self._fastq_r2,
         self._pipeline,
         self._pipeline_name,
         self._platform,
         self._sample_source,
         self._run_name,
         self._log_file,
     ).run()
     run_command_md5sum(__modname__, self._log_file, qc_report_file, report_md5)
     log_progress(__modname__, 'QC Report Generation finished', f=self._log_file)
Пример #9
0
    def run_snpEff(self, input_file, output_file):
        """Annotate ``input_file`` with snpEff and keep only annotated records.

        Skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
        companion; the snpEff version is logged either way. Otherwise stale
        outputs are removed, the snpEff ``ann`` command (database
        ``hg19ngb``) is passed to ``run_command_file_handle`` targeting
        ``<sample>_snpeff_tmp.vcf`` in the variant directory, and a
        ``bcftools view -i 'INFO/ANN!="."'`` command over that temporary file
        is passed the same way targeting ``output_file``.
        """
        tmp_vcf = join(self._variant_dir,
                       '{0}_snpeff_tmp.vcf'.format(self._sample_name))
        md5_path = '{0}.md5'.format(output_file)
        already_done = md5sum_check(output_file, md5_path)

        if already_done:
            banner = 'snpEff gene annotation already finished!!!'
        else:
            banner = 'snpEff gene annotation start'
        log_progress(__modname__, banner, f=self._log_file)
        log_version(__modname__, self._sw['snpeff_ver'], f=self._log_file)
        if already_done:
            return

        for stale in (tmp_vcf, output_file):
            if os.path.exists(stale):
                os.remove(stale)

        # Effect categories excluded via repeated '-no <category>' flags.
        excluded_effects = (
            'INTERGENIC',
            'INTERGENIC_CONSERVED',
            'INTRAGENIC',
            'RARE_AMINO_ACID',
            'TRANSCRIPT',
            'TRANSCRIPT_DELETED',
            'REGULATION',
            'NEXT_PROT',
            'PROTEIN_STRUCTURAL_INTERACTION_LOCUS',
            'PROTEIN_PROTEIN_INTERACTION_LOCUS',
        )
        annotate_cmd = [
            self._sw['java'],
            '-Xmx4g',
            '-XX:ParallelGCThreads={0}'.format(self._pe_core),
            '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
            '-jar',
            self._sw['snpeff'],
            'ann',
            'hg19ngb',
            '-no-downstream',
            '-no-upstream',
            '-noStats',
        ]
        for effect in excluded_effects:
            annotate_cmd.extend(['-no', effect])
        annotate_cmd.append(input_file)
        run_command_file_handle(__modname__, annotate_cmd, self._log_file,
                                'w', tmp_vcf)

        filter_cmd = [self._sw['bcftools'], 'view', '-i', 'INFO/ANN!="."']
        filter_cmd.append(tmp_vcf)
        run_command_file_handle(__modname__, filter_cmd, self._log_file,
                                'w', output_file)

        run_command_md5sum(__modname__, self._log_file, output_file, md5_path)
        log_progress(__modname__,
                     'snpEff gene annotation finished',
                     f=self._log_file)
Пример #10
0
    def run_dbnsfp_annotation(self, input_file, output_file):
        """Annotate ``input_file`` with dbNSFP fields and post-process the result.

        Skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
        companion; the dbNSFP database version is logged either way.
        Otherwise a SnpSift ``dbnsfp`` command is passed to
        ``run_command_file_handle`` targeting ``<sample>_dbnsfp_tmp.vcf`` in
        the variant directory, the ``ngb_transcript_dbNSFP`` script is run on
        that file writing ``output_file``, and finally an ``rm -rf`` command
        for the sample tmp directory contents is issued.
        """
        tmp_vcf = join(self._variant_dir,
                       '{0}_dbnsfp_tmp.vcf'.format(self._sample_name))
        md5_path = '{0}.md5'.format(output_file)
        already_done = md5sum_check(output_file, md5_path)

        if already_done:
            banner = 'dbNSFP annotation already finished!!!'
        else:
            banner = 'dbNSFP annotation start'
        log_progress(__modname__, banner, f=self._log_file)
        log_version(__modname__, self._sw['dbnsfp_db_ver'], f=self._log_file)
        if already_done:
            return

        # Joined into the single comma-separated value expected after '-f'.
        dbnsfp_fields = ','.join((
            'aapos',
            'aapos_SIFT',
            'aapos_FATHMM',
            'Uniprot_acc',
            'Interpro_domain',
            'SIFT_pred',
            'SIFT_score',
            'LRT_pred',
            'MutationTaster_pred',
            'MutationTaster_score',
            'GERP++_NR',
            'GERP++_RS',
            'phastCons100way_vertebrate',
            'MutationAssessor_pred',
            'FATHMM_pred',
            'PROVEAN_pred',
            'MetaSVM_pred',
            'Polyphen2_HDIV_pred',
            'Polyphen2_HDIV_score',
            'Polyphen2_HVAR_pred',
            'Polyphen2_HVAR_score',
            'CADD_phred',
        ))
        snpsift_cmd = [
            self._sw['java'],
            '-Xmx4g',
            '-XX:ParallelGCThreads={0}'.format(self._pe_core),
            '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
            '-jar',
            self._sw['snpsift'],
            'dbnsfp',
            '-f',
            dbnsfp_fields,
            '-db',
            self._sw['dbnsfp_db'],
            input_file,
        ]
        run_command_file_handle(__modname__, snpsift_cmd, self._log_file,
                                'w', tmp_vcf)

        transcript_cmd = ['python', self._sw['ngb_transcript_dbNSFP']]
        transcript_cmd += ['-o', output_file, tmp_vcf]
        run_command(__modname__, transcript_cmd, self._log_file)
        run_command_md5sum(__modname__, self._log_file, output_file, md5_path)

        # NOTE(review): the '*' is only expanded if run_command goes through a
        # shell; with a plain argv exec this removes nothing - confirm.
        cleanup_cmd = ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)]
        run_command(__modname__, cleanup_cmd, self._log_file)

        log_progress(__modname__,
                     'dbNSFP annotation finished',
                     f=self._log_file)
Пример #11
0
 def add_type_to_vcf(self, input_file, output_file):
     """Run the ``ngb_add_vcfinfo`` script on ``input_file`` to write ``output_file``.

     Skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
     companion. Otherwise any existing ``output_file`` is deleted first and
     ``run_command_md5sum`` is called once the script has run.
     """
     md5_path = '{0}.md5'.format(output_file)
     if md5sum_check(output_file, md5_path):
         log_progress(__modname__,
                      'Add TYPE info already finished!!!',
                      f=self._log_file)
         return

     log_progress(__modname__, 'Add TYPE info start', f=self._log_file)
     if os.path.exists(output_file):
         os.remove(output_file)

     add_info_cmd = ['python', self._sw['ngb_add_vcfinfo']]
     add_info_cmd += ['-o', output_file, input_file]
     run_command(__modname__, add_info_cmd, self._log_file)
     run_command_md5sum(__modname__, self._log_file, output_file, md5_path)
     log_progress(__modname__, 'Add TYPE info finished', f=self._log_file)
Пример #12
0
 def generate_tdf_file(self, final_bam):
     """Create ``<final_bam>.tdf`` with ``igvtools count`` (genome 'hg19').

     Skipped when ``md5sum_check`` accepts the TDF and its ``.md5`` companion;
     the igvtools version is logged either way. Otherwise any existing TDF is
     deleted, the command is run and ``run_command_md5sum`` is called.
     """
     tdf_path = '{0}.tdf'.format(final_bam)
     tdf_md5 = '{0}.md5'.format(tdf_path)
     already_done = md5sum_check(tdf_path, tdf_md5)

     if already_done:
         banner = 'TDF file generation already finished!!!'
     else:
         banner = 'TDF file generation start'
     log_progress(__modname__, banner, f=self._log_file)
     log_version(__modname__, self._sw['igvtools_ver'], f=self._log_file)
     if already_done:
         return

     if os.path.exists(tdf_path):
         os.remove(tdf_path)

     count_cmd = [self._sw['igvtools'], 'count', final_bam, tdf_path, 'hg19']
     run_command(__modname__, count_cmd, self._log_file)
     run_command_md5sum(__modname__, self._log_file, tdf_path, tdf_md5)
     log_progress(__modname__, 'TDF file generation finished', f=self._log_file)
Пример #13
0
 def add_hgvs(self, input_file, output_file):
     """Run the ``ngb_add_HGVS`` script on ``input_file`` to write ``output_file``.

     The script receives ``self._sw['mutect2_bed']`` through ``-d``. The step
     is skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
     companion; otherwise any existing ``output_file`` is deleted first and
     ``run_command_md5sum`` is called after the script has run.
     """
     md5_path = '{0}.md5'.format(output_file)
     if md5sum_check(output_file, md5_path):
         log_progress(__modname__,
                      'Add HGVS info and variant type already finished!!!',
                      f=self._log_file)
         return

     log_progress(__modname__,
                  'Add HGVS info and variant type start',
                  f=self._log_file)
     if os.path.exists(output_file):
         os.remove(output_file)

     hgvs_cmd = ['python', self._sw['ngb_add_HGVS']]
     hgvs_cmd += ['-d', self._sw['mutect2_bed']]
     hgvs_cmd += ['-o', output_file, input_file]
     run_command(__modname__, hgvs_cmd, self._log_file)
     run_command_md5sum(__modname__, self._log_file, output_file, md5_path)
     log_progress(__modname__,
                  'Add HGVS info and variant type finished',
                  f=self._log_file)
Пример #14
0
 def plot_generation(self):
     """Render ``self._cnv_plot`` by running the CNV plot R script.

     ``Rscript`` is invoked with ``self._cnv_plot_script`` followed by the
     sample name, ``self._cnv_fc_stat`` and ``self._cnv_plot``. The step is
     skipped when ``md5sum_check`` accepts the plot and its ``.md5``
     companion; otherwise any existing plot is deleted first and
     ``run_command_md5sum`` is called afterwards.
     """
     plot_path = self._cnv_plot
     plot_md5 = '{0}.md5'.format(plot_path)

     if md5sum_check(plot_path, plot_md5):
         log_progress(__modname__,
                      'CNV plot generation already finished!!!',
                      f=self._log_file)
         return

     log_progress(__modname__, 'CNV plot generation start', f=self._log_file)
     if os.path.exists(plot_path):
         os.remove(plot_path)

     rscript_cmd = ['Rscript', self._cnv_plot_script]
     rscript_cmd += [self._sample_name, self._cnv_fc_stat, plot_path]
     run_command(__modname__, rscript_cmd, self._log_file)
     run_command_md5sum(__modname__, self._log_file, plot_path, plot_md5)
     log_progress(__modname__,
                  'CNV plot generation finished',
                  f=self._log_file)
Пример #15
0
    def sort_target_bed(self):
        """Write a sorted copy of the Picard target BED to ``self._target_bed``.

        The source is ``DNA_PicardTarget.bed`` for pipeline '901' and
        ``RNA_PicardTarget.bed`` for every other pipeline code, both taken
        from ``self._align_dir``. A ``cat <bed>`` / ``sort -k1V,1 -k2n,2``
        command list is handed to ``run_command_pipe_file_handle`` with
        ``self._target_bed`` as the target written in mode 'w'. The step is
        skipped when ``md5sum_check`` accepts the target and its ``.md5``
        companion.
        """
        sorted_bed = self._target_bed
        sorted_bed_md5 = '%s.md5' % sorted_bed

        if md5sum_check(sorted_bed, sorted_bed_md5):
            log_progress(__modname__,
                         'Target BED sort already finished!!!',
                         f=self._log_file)
            return

        log_progress(__modname__, 'Target BED sort start', f=self._log_file)

        is_dna = self._pipeline == '901'
        bed_name = 'DNA_PicardTarget.bed' if is_dna else 'RNA_PicardTarget.bed'
        source_bed = join(self._align_dir, bed_name)

        stages = ['cat %s' % source_bed, 'sort -k1V,1 -k2n,2']
        run_command_pipe_file_handle(__modname__, stages, self._log_file,
                                     'w', self._target_bed)
        run_command_md5sum(__modname__, self._log_file, self._target_bed,
                           sorted_bed_md5)
        log_progress(__modname__,
                     'Target BED sort finished',
                     f=self._log_file)
Пример #16
0
 def run_annotation(self, db_name, target_vcf, target_vcf_ver, _info, _name,
                    input_file, output_file):
     """Annotate ``input_file`` against ``target_vcf`` with SnpSift ``annotate``.

     ``db_name`` is only used in log messages and ``target_vcf_ver`` is
     logged through ``log_version``. ``-info <_info>`` and ``-name <_name>``
     are added to the command only when the respective value differs from
     the empty string. The command is passed to ``run_command_file_handle``
     with ``output_file`` as the target written in mode 'w'; afterwards
     ``run_command_md5sum`` is called and an ``rm -rf`` command for the
     sample tmp directory contents is issued.

     Skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
     companion.
     """
     md5_path = '{0}.md5'.format(output_file)
     already_done = md5sum_check(output_file, md5_path)

     if already_done:
         banner = '{0} annotation already finished!!!'.format(db_name)
     else:
         banner = '{0} annotation start'.format(db_name)
     log_progress(__modname__, banner, f=self._log_file)
     log_version(__modname__, target_vcf_ver, f=self._log_file)
     if already_done:
         return

     annotate_cmd = [
         self._sw['java'],
         '-Xmx4g',
         '-XX:ParallelGCThreads={0}'.format(self._pe_core),
         '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
         '-jar',
         self._sw['snpsift'],
         'annotate',
     ]
     # Optional switches, emitted in this order when a value is supplied.
     for switch, value in (('-info', _info), ('-name', _name)):
         if value != '':
             annotate_cmd += [switch, value]
     annotate_cmd += [target_vcf, input_file]

     run_command_file_handle(__modname__, annotate_cmd, self._log_file, 'w',
                             output_file)
     run_command_md5sum(__modname__, self._log_file, output_file, md5_path)

     # NOTE(review): the '*' is only expanded if run_command goes through a
     # shell; with a plain argv exec this removes nothing - confirm.
     cleanup_cmd = ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)]
     run_command(__modname__, cleanup_cmd, self._log_file)

     log_progress(__modname__,
                  '{0} annotation finished'.format(db_name),
                  f=self._log_file)
Пример #17
0
    def run_clinvar_annotation(self, input_file, output_file):
        """Run the two ClinVar annotation scripts to build ``output_file``.

        Skipped when ``md5sum_check`` accepts ``output_file`` and its ``.md5``
        companion; the ClinVar version is logged either way. Otherwise the
        ``ngb_anno_clinvar`` script is run on ``input_file`` writing
        ``<sample>_clinvar_tmp.vcf`` in the variant directory, and the
        ``ngb_clinvar_variation`` script is run on that temporary file
        writing ``output_file``.
        """
        tmp_vcf = join(self._variant_dir,
                       '{0}_clinvar_tmp.vcf'.format(self._sample_name))
        md5_path = '{0}.md5'.format(output_file)
        already_done = md5sum_check(output_file, md5_path)

        if already_done:
            banner = 'ClinVar annotation already finished!!!'
        else:
            banner = 'ClinVar annotation start'
        log_progress(__modname__, banner, f=self._log_file)
        log_version(__modname__, self._sw['ngb_clinvar_ver'], f=self._log_file)
        if already_done:
            return

        annotate_cmd = ['python', self._sw['ngb_anno_clinvar']]
        annotate_cmd += ['--dbfile', self._sw['ngb_clinvar_db']]
        annotate_cmd += ['--infoVCF', self._sw['clinvar_compact_header']]
        annotate_cmd += ['--inVCF', input_file]
        annotate_cmd += ['--outVCF', tmp_vcf]
        run_command(__modname__, annotate_cmd, self._log_file)

        variation_cmd = ['python', self._sw['ngb_clinvar_variation']]
        variation_cmd += ['-d', self._sw['ngb_clinvar_ref']]
        variation_cmd += ['-o', output_file, tmp_vcf]
        run_command(__modname__, variation_cmd, self._log_file)

        run_command_md5sum(__modname__, self._log_file, output_file, md5_path)
        log_progress(__modname__,
                     'ClinVar annotation finished',
                     f=self._log_file)
Пример #18
0
    def run(self):
        """Run the full annotation chain from ``self._raw_vcf`` to ``self._final_vcf``.

        Every step reads the previous step's output; all intermediate files
        are ``<variant_dir>/<sample>_<suffix>.vcf``. Order: reference removal,
        TYPE info, vt post-processing, low-confidence annotation, snpEff,
        HGVS, then the SnpSift database annotations (dbSNP all, dbSNP common,
        1KG, esp6500, ExAC, KOEXID, KRGDB, gnomAD, COSMIC), ClinVar and
        dbNSFP. The dbNSFP result is copied over ``self._final_vcf`` and
        ``run_command_md5sum`` is called for the copy.
        """
        def variant_path(suffix):
            # <variant_dir>/<sample>_<suffix>.vcf
            return join(self._variant_dir,
                        '{0}_{1}.vcf'.format(self._sample_name, suffix))

        remove_ref_vcf = variant_path('remove_ref')
        add_type_vcf = variant_path('add_type')
        refined_vcf = variant_path('refined')
        lowconf_vcf = variant_path('lowconf')
        snpeff_vcf = variant_path('snpeff')
        hgvs_vcf = variant_path('HGVS')

        gnomad_info = 'AF,AF_AFR,AF_AMR,AF_ASJ,AF_EAS,AF_FIN,AF_NFE,AF_OTH,AF_SAS,AF_Male,AF_Female,AF_raw,AF_POPMAX,AF_AMR_Male,AF_FIN_Female,AF_FIN_Male,AF_AFR_Male,AF_SAS_Male,AF_OTH_Male,AF_NFE_Female,AF_EAS_Female,AF_EAS_Male,AF_SAS_Female,AF_AFR_Female,AF_AMR_Female,AF_ASJ_Male,AF_ASJ_Female,AF_OTH_Female,AF_NFE_Male'

        # (label, _sw key of the DB VCF, _sw key of its version,
        #  -info value, -name prefix, output path) in execution order.
        # The _sw keys are resolved only when the step actually runs.
        annotation_steps = [
            ('dbSNP All', 'dbsnp_vcf', 'dbsnp_vcf_ver',
             '', 'dbsnp_all_', variant_path('dbsnp')),
            ('dbSNP common', 'dbsnp_common_vcf', 'dbsnp_vcf_ver',
             '', 'dbsnp_common_', variant_path('dbsnp_common')),
            ('1KG', 'KG_vcf', 'KG_vcf_ver',
             'AF,EAS_AF,EUR_AF,AFR_AF,AMR_AF,SAS_AF', 'G1000_',
             variant_path('1kg')),
            ('esp6500', 'esp6500_vcf', 'esp6500_vcf_ver',
             'MAF', 'esp6500_', variant_path('esp6500')),
            ('ExAC', 'exac_vcf', 'exac_vcf_ver',
             'AF', 'ExAC_', variant_path('exac')),
            ('KOEXID', 'koexid_vcf', 'koexid_vcf_ver',
             '', '', variant_path('koexid')),
            ('KRGDB', 'krgdb_vcf', 'krgdb_vcf_ver',
             '', '', variant_path('krgdb')),
            ('gnomAD', 'gnomad_vcf', 'gnomad_vcf_ver',
             gnomad_info, 'gnomAD_', variant_path('gnomad')),
            ('COSMIC', 'cosmic_vcf', 'cosmic_vcf_ver',
             '', '', variant_path('cosmic')),
        ]
        clinvar_vcf = variant_path('clinvar')
        dbnsfp_vcf = variant_path('dbnsfp')

        # Pre-annotation refinement steps.
        self.remove_reference_info(self._raw_vcf, remove_ref_vcf)
        self.add_type_to_vcf(remove_ref_vcf, add_type_vcf)
        self.vcf_post_processing(add_type_vcf, refined_vcf)
        self.low_confidence_annotation(refined_vcf, lowconf_vcf)
        self.run_snpEff(lowconf_vcf, snpeff_vcf)
        self.add_hgvs(snpeff_vcf, hgvs_vcf)

        # Database annotations, each consuming the previous output.
        current_vcf = hgvs_vcf
        for label, vcf_key, ver_key, info, prefix, annotated in annotation_steps:
            self.run_annotation(label, self._sw[vcf_key], self._sw[ver_key],
                                info, prefix, current_vcf, annotated)
            current_vcf = annotated

        # ClinVar
        self.run_clinvar_annotation(current_vcf, clinvar_vcf)

        # dbNSFP
        self.run_dbnsfp_annotation(clinvar_vcf, dbnsfp_vcf)

        # copy to final vcf
        final_vcf = self._final_vcf
        if os.path.exists(final_vcf):
            os.remove(final_vcf)
        shutil.copyfile(dbnsfp_vcf, self._final_vcf)
        run_command_md5sum(__modname__, self._log_file, self._final_vcf,
                           '{0}.md5'.format(self._final_vcf))
Пример #19
0
    def workflow(self):
        """Convert the annotated VCF into line-delimited JSON.

        Every record of ``self._vcf_file`` that carries an ``ANN`` INFO field
        is turned into a nested dictionary (allele, build, coordinates, gene,
        in-silico predictions, filter flags, OncoKB, ClinVar, population
        frequencies, variant details, classifier grade and COSMIC).  The
        dictionaries are stored under the key
        ``GRCh37-<chrom>-<pos>-<ref>-<alt>`` (a later record with the same key
        replaces an earlier one), sorted case-insensitively by that key and
        appended to ``self._json_file`` as one JSON object per line.  Finally
        ``run_command_md5sum`` is invoked for the JSON file and
        ``self._md5_file``.
        """
        # Ordered (code, label) pairs; they are applied sequentially, exactly
        # in this order, to the flattened prediction string.
        deleterious_tolerated = (("D", "Deleterious"), ("T", "Tolerated"))
        lrt_labels = (("D", "Deleterious"), ("N", "Neutral"),
                      ("U", "Unknown"))
        # NOTE(review): "netural" looks like a typo for "neutral"; it is kept
        # unchanged because consumers of the JSON may already match on it.
        mutation_assessor_labels = (("H", "high"), ("M", "medium"),
                                    ("L", "low"), ("N", "netural"))
        mutation_taster_labels = (
            ("A", "disease causing automatic"),
            ("D", "disease causing"),
            ("N", "polymorphism (probably harmless)"),
            ("P", "polymorphism automatic (known to be harmless)"),
        )

        # FILTER id -> (flag key, human readable description).
        filter_flags = {
            'q20': ('low_quality_score', 'Quality score less than 20'),
            'LowDP': (
                'low_DP',
                'Low coverage, therefore no genotype called. Note different depth thresholds for variant and reference calls'
            ),
            'SB': ('high_strand_bias', 'Variant strand bias too high'),
            'LowVariantFreq': ('low_variant_frequency',
                               'Variant frequency less than 0.0260'),
            'R8': ('indel_repeat', 'Indel repeat greater than or equal to 8'),
            'LowAQ': ('low_AQ',
                      'Variant artifact adjusted quality score less than 10'),
            'LowVarSupport': ('low_variant_support',
                              'Variant is supported by too few reads'),
        }

        # Classification text -> (radar, grade).
        classifier_grades = {
            "pathogenic": ("5", "A"),
            "likely pathogenic": ("4", "B"),
            "uncertain significance": ("3", "C"),
            "likely benign": ("2", "D"),
            "benign": ("1", "E"),
        }

        # OncoKB output key -> INFO key; underscores become spaces.
        oncokb_fields = (
            ("action_cancer", "oncokb_action_cancer"),
            ("action_drugs", "oncokb_action_drugs"),
            ("action_level", "oncokb_action_level"),
            ("action_pmid", "oncokb_action_pmid"),
            ("oncogenicity", "oncokb_oncogenicity"),
            ("oncokb_hgvsp", "oncokb_hgvsp"),
        )

        # Holds the lazily opened reference FASTA (at most one element) so it
        # is opened once instead of once per record, and can be closed below.
        opened_fasta = []

        def strip_list_syntax(value):
            """Return ``str(value)`` without quotes and square brackets."""
            return str(value).replace("'", "").replace("[", "").replace(
                "]", "")

        def first_as_str(info, key):
            """Return ``str(info[key][0])``, or '' when *key* is absent."""
            if key in info:
                return str(info[key][0])
            return ''

        def decode_prediction(info, key, code_labels):
            """Expand one-letter prediction codes of INFO field *key*.

            Values are joined with '|'.  ``None`` entries are dropped
            *before* the codes are expanded; expanding first would rewrite
            the ``N`` of ``None`` (e.g. into ``Neutralone``) for every table
            that maps ``N``.
            """
            if key not in info:
                return ''
            text = strip_list_syntax(info[key]).replace(", ", "|").replace(
                "None", "")
            for code, label in code_labels:
                text = text.replace(code, label)
            return text

        def fetch_reference(chrom, start, end):
            """Fetch reference bases, opening the FASTA on first use."""
            if not opened_fasta:
                opened_fasta.append(pysam.FastaFile(self._reference))
            return opened_fasta[0].fetch(chrom, start=start, end=end)

        def build_contents(record_id, record):
            """Return ``(variant_name, contents)`` for one VCF record."""
            info = record.INFO
            vcf_contents = {'id': record_id}

            ### allele
            allele = self.get_allele(record)
            vcf_contents['allele'] = allele

            ### build
            vcf_contents['build'] = {'ref_genome': 'GRCh37/hg19'}

            ### genomic coordinate
            vcf_contents['genomic_coordinate'] = {
                'chromosome': record.CHROM,
                'g.pos': record.POS,
            }

            ### gene
            vcf_contents['gene'] = self.get_gene_info(record, allele)

            ### in silico prediction
            in_silico_prediction = {}
            in_silico_prediction['FATHMM_prediction'] = {
                'prediction': decode_prediction(info, "dbNSFP_FATHMM_pred",
                                                deleterious_tolerated),
            }
            in_silico_prediction["GERP++"] = {
                'RS_score': first_as_str(info, "dbNSFP_GERP___RS"),
                'NR_score': first_as_str(info, "dbNSFP_GERP___NR"),
            }
            in_silico_prediction["LRT_prediction"] = {
                'prediction': decode_prediction(info, "dbNSFP_LRT_pred",
                                                lrt_labels),
            }
            in_silico_prediction["MutationAssessor_prediction"] = {
                'prediction': decode_prediction(
                    info, "dbNSFP_MutationAssessor_pred",
                    mutation_assessor_labels),
            }

            # PolyPhen2 (no textual prediction is exported for it)
            polyphen2_score, polyphen2_text, polyphen2_radar = self.polyphen2(
                info)
            in_silico_prediction['PolyPhen2'] = {
                'score': str(polyphen2_score),
                'radar': str(polyphen2_radar),
                'text': str(polyphen2_text),
                'prediction': '',
            }

            # SIFT
            sift_score, sift_text, sift_radar = self.sift(info)
            in_silico_prediction["SIFT"] = {
                'score': str(sift_score),
                'radar': str(sift_radar),
                'text': str(sift_text),
                'prediction': decode_prediction(info, "dbNSFP_SIFT_pred",
                                                deleterious_tolerated),
            }

            # Mutation Taster (its score is not exported)
            _mt_score, mt_text, mt_radar = self.mutationtaster(info)
            in_silico_prediction["mt"] = {
                'radar': str(mt_radar),
                'text': str(mt_text),
                'prediction': decode_prediction(
                    info, "dbNSFP_MutationTaster_pred",
                    mutation_taster_labels),
            }
            vcf_contents['in_silico_prediction'] = in_silico_prediction

            ### Variant Flag
            flag = {}
            # check sequencing error
            if "LOW_CONFIDENCE" in info:
                flag["low_confidence"] = str(info["LOW_CONFIDENCE"][0])
            # PyVCF reports a FILTER column of '.' as None rather than as an
            # empty list, so guard before iterating.
            for _filter in record.FILTER or []:
                if _filter in filter_flags:
                    flag_key, description = filter_flags[_filter]
                    flag[flag_key] = description
            vcf_contents["flag"] = flag

            # oncokb
            oncokb = {}
            for out_key, info_key in oncokb_fields:
                oncokb[out_key] = first_as_str(info, info_key).replace(
                    "_", " ")
            oncokb["oncokb_tx"] = first_as_str(info, "oncokb_tx")
            vcf_contents["Precision Oncology Knowledge Base"] = oncokb

            # clinical (ClinVar)
            clinical = {}
            if 'ngb_cv_rcv_acc' in info:
                rcv_accessions = strip_list_syntax(
                    info["ngb_cv_rcv_acc"]).strip().split(',')
                clinical['clinvar_accession'] = ",".join(
                    rcv_accessions).strip()
                clinical['total_rcv'] = len(rcv_accessions)

            clinical['radar'] = str(self.clinvar_clinical_significance(info))

            if 'ngb_cv_rcv_sig_reviewstatus' in info:
                clinical['review_status'] = strip_list_syntax(
                    info["ngb_cv_rcv_sig_reviewstatus"]).replace(
                        "0x2C", "").replace("_", " ").strip()
            else:
                clinical['review_status'] = "None"

            if 'ngb_cv_rcv_sig_description' in info:
                clinical['classification'] = strip_list_syntax(
                    info["ngb_cv_rcv_sig_description"]).strip()

            if "ngb_cv_trait_preferred_name" in info:
                clinical["trait_preferred_name"] = str(
                    info["ngb_cv_trait_preferred_name"][0]).replace(
                        "0x2C", "").replace("_", " ")

            if "ngb_cv_trait_link_omim" in info:
                clinical["omim"] = str(
                    info["ngb_cv_trait_link_omim"][0]).replace("-", "")

            vcf_contents['clinical'] = clinical

            ### population frequencies
            population = {}

            # 1000 genomes
            population['1000_genomes'] = {
                "African": self.g1000_pop_freq("AFR", record),
                "American": self.g1000_pop_freq("AMR", record),
                "East_Asian": self.g1000_pop_freq("EAS", record),
                "European": self.g1000_pop_freq("EUR", record),
                "South_Asian": self.g1000_pop_freq("SAS", record),
                "ALL": self.g1000_pop_freq("ALL", record),
            }

            # ESP 6500
            population['ESP6500'] = {
                "ALL": self.esp6500_pop_freq("ALL", record),
                "AA": self.esp6500_pop_freq("AA", record),
                "EA": self.esp6500_pop_freq("EA", record),
            }

            # ExAC population frequency
            population["ExAC"] = {"ALL": self.exac_pop_freq("ALL", record)}

            # KoEXID (the raw INFO value is stringified, not its first item)
            koexid_af = ""
            if "KoEXID_AF" in info:
                koexid_af = str(info["KoEXID_AF"])
            population["Korean Exome Information Database"] = {
                "ALL": {
                    "code": "ALL",
                    "population": "All KoEXID",
                    "allele_frequency": koexid_af,
                },
            }

            # KRGDB
            population["Korean Reference Genome DB"] = {
                "ALL": {
                    "code": "ALL",
                    "population": "All KRGDB",
                    "allele_frequency": first_as_str(info, "KRGDB1100_AF"),
                },
            }

            # gnomAD
            population["gnomAD"] = {
                "Finnish": self.gnomad_pop_freq("FIN", record),
                "South Asian": self.gnomad_pop_freq("SAS", record),
                "Admixed American": self.gnomad_pop_freq("AMR", record),
                "Non-Finnish European": self.gnomad_pop_freq("NFE", record),
                "East Asian": self.gnomad_pop_freq("EAS", record),
                "Others": self.gnomad_pop_freq("OTH", record),
                "African/African American": self.gnomad_pop_freq(
                    "AFR", record),
                "Ashkenazi Jewish": self.gnomad_pop_freq("ASJ", record),
                "ALL": self.gnomad_pop_freq("ALL", record),
            }

            vcf_contents['population_frequency'] = population

            ### variant information
            if "TYPE" in info:
                allele_types = strip_list_syntax(info["TYPE"]).replace(
                    " ", "").strip().split(",")
                allele_type_of_allele = str(allele_types[0])
            else:
                allele_type_of_allele = ""

            ref_len = len(allele['reference'])
            alt_len = len(allele['alternate'])
            if allele_type_of_allele == "ins":
                variant_size = ref_len - 1
            else:
                variant_size = ref_len - alt_len

            variant = {}
            if "dbsnp_all_RS" in info:
                variant["rs_id"] = "rs%s" % (str(info['dbsnp_all_RS']))
            else:
                variant["rs_id"] = ""

            variant['start'] = int(record.POS)
            variant['stop'] = int(record.POS) + variant_size
            variant['exac_format'] = "{0}-{1}-{2}-{3}".format(
                str(record.CHROM).replace('chr', ''), record.POS,
                allele['reference'], allele['alternate'])
            variant['type'] = allele_type_of_allele

            # 22 bp flanks on either side of the variant.  The left start is
            # clamped at 0 because a negative start is rejected by pysam for
            # variants lying within 22 bp of the contig start.
            chrom = str(record.CHROM)
            left_flank_start = max(0, variant['start'] - 23)
            right_flank_end = variant['stop'] + 22
            variant['left_22_bp'] = fetch_reference(chrom, left_flank_start,
                                                    variant['start'] - 1)
            variant['right_22_bp'] = fetch_reference(chrom, variant['stop'],
                                                     right_flank_end)

            variant["variant_type"] = first_as_str(info, "Variant_type")
            vcf_contents["variant_information"] = variant

            ### variant classifier
            if 'ngb_cv_rcv_sig_description' in info:
                prediction = clinical['classification']
            else:
                prediction = "uncertain significance"
            variant_classifier = {'result': prediction}
            # Classifications outside the five known labels get no
            # radar/grade entries.
            if prediction in classifier_grades:
                radar, grade = classifier_grades[prediction]
                variant_classifier['radar'] = radar
                variant_classifier['grade'] = grade
            vcf_contents['variant_classifier'] = variant_classifier

            ### COSMIC
            cosmic_anno = {
                'cosmic_cnt': "",
                'cosmic_id': "",
                'cosmic_occurrence': "",
            }
            if 'COSMIC_CNT' in info:
                cosmic_anno['cosmic_cnt'] = strip_list_syntax(
                    info['COSMIC_CNT']).strip()
            if 'COSMIC_ID' in info:
                cosmic_anno['cosmic_id'] = strip_list_syntax(
                    info['COSMIC_ID']).strip()
            if 'COSMIC_OCCURENCE' in info:
                cosmic_anno['cosmic_occurrence'] = strip_list_syntax(
                    info['COSMIC_OCCURENCE']).strip()
            vcf_contents[
                'Catalogue Of Somatic Mutations In Cancer'] = cosmic_anno

            variant_name = "GRCh37-%s-%s-%s-%s" % (
                record.CHROM, variant['start'], allele['reference'],
                allele['alternate'])
            return variant_name, vcf_contents

        vcf_record = {}
        try:
            # The context manager closes the VCF handle once parsing is done.
            with open(self._vcf_file, 'r') as vcf_handle:
                for i, record in enumerate(vcf.Reader(vcf_handle)):
                    # Only annotated records are exported; ids still follow
                    # the 1-based position of the record in the VCF.
                    if 'ANN' not in record.INFO:
                        continue
                    variant_name, vcf_contents = build_contents(i + 1, record)
                    vcf_record[variant_name] = vcf_contents
        finally:
            for fasta_handle in opened_fasta:
                fasta_handle.close()

        json_lines = [
            json.dumps({key: vcf_record[key]}, sort_keys=True) + '\n'
            for key in sorted(vcf_record.keys(), key=str.lower)
        ]
        # Append mode is kept from the original; the file is only touched
        # when there is something to write, and is opened once rather than
        # once per variant.
        if json_lines:
            with open(self._json_file, 'a') as f:
                f.writelines(json_lines)

        run_command_md5sum(__modname__, self._log_file, self._json_file,
                           self._md5_file)