예제 #1
0
    def reclassify_mutations(self):
        """Score candidate SSNVs (and indels, if supplied) against the TiN estimate.

        Adds two columns to ``self.SSNVs``:

        * ``p_somatic_given_TiN`` -- numerator / denominator of the Bayes-rule
          expression built from the SSNV model's ``p_somatic``,
          ``p_TiN_given_S`` and ``p_TiN_given_G``.  With
          ``self.input.weighted_classification`` set, the terms are summed over
          every TiN bin whose ``self.joint_posterior`` mass exceeds 0.001;
          otherwise only the bin ``self.TiN_int`` is used.
        * ``p_outlier`` -- ``rv_normal_af.cdf`` evaluated at the expected normal
          allele fraction (``tumor_f * CN_ratio[:, TiN_int]``) plus 0.01.

        Unless ``self.TiN_int`` is 0, rows whose ``p_somatic_given_TiN`` exceeds
        ``self.threshold`` (and, when ``self.use_outlier_threshold`` is set,
        whose ``p_outlier`` is at least 0.01) get ``judgement = 'KEEP'``.

        When ``self.input.indel_file`` is not the string ``'None'`` the indel
        table is stored on ``self.indels``.  If it contains no missing values it
        is scored the same way with a fresh ``dssnv.model`` and passing rows get
        ``filter = 'PASS'``; a table with missing values is stored unscored.
        """
        ssnv_model = self.ssnv_based_model
        # calculate p(Somatic | given joint TiN estimate)
        # NOTE(review): the explicit ``== True`` is kept on purpose; the flag is
        # populated outside this method and may not be a real bool -- confirm
        # before simplifying to a truthiness test.
        if self.input.weighted_classification == True:
            numerator = np.zeros(len(ssnv_model.p_TiN_given_S))
            denominator = np.zeros(len(ssnv_model.p_TiN_given_S))
            for idx, p in enumerate(self.joint_posterior):
                # bins with negligible posterior mass are skipped
                if p > 0.001:
                    num_iter = (p * ssnv_model.p_somatic * ssnv_model.p_TiN_given_S[:, idx])
                    numerator = numerator + num_iter
                    denom_iter = num_iter + p * (np.array(
                        [1 - ssnv_model.p_somatic] * np.nan_to_num(
                            ssnv_model.p_TiN_given_G[:, idx])))
                    denominator = denominator + denom_iter

        else:
            numerator = ssnv_model.p_somatic * np.expand_dims(
                ssnv_model.p_TiN_given_S[:, self.TiN_int], 1)
            denominator = numerator + np.array(
                [1 - ssnv_model.p_somatic] * np.expand_dims(
                    np.nan_to_num(ssnv_model.p_TiN_given_G[:, self.TiN_int]), 1))
        # 0/0 sites come out as NaN from the division and are reported as 0
        self.SSNVs.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(np.true_divide(numerator, denominator))
        # expected normal allele fraction given TiN and tau
        af_n_given_TiN = np.multiply(ssnv_model.tumor_f, ssnv_model.CN_ratio[:, self.TiN_int])
        # probability of normal allele fraction less than or equal to predicted fraction
        self.SSNVs.loc[:, 'p_outlier'] = ssnv_model.rv_normal_af.cdf(af_n_given_TiN + 0.01)
        if self.TiN_int == 0:
            print('Estimated 0 TiN no SSNVs will be recovered outputing deTiN statistics for each site')
        else:
            rescued = self.SSNVs['p_somatic_given_TiN'] > self.threshold
            if self.use_outlier_threshold:
                # drop outlier mutations: require p_outlier of at least 0.01
                rescued = np.logical_and(rescued, self.SSNVs['p_outlier'] >= 0.01)
            # single .loc assignment: chained ``df[col][mask] = ...`` may write to a
            # temporary and is a no-op under pandas copy-on-write
            self.SSNVs.loc[rescued, 'judgement'] = 'KEEP'
        if self.input.indel_file != 'None':
            if self.input.indel_table.isnull().values.sum() == 0:
                indel_model = dssnv.model(self.input.indel_table, self.input.mutation_prior, self.input.resolution)
                indel_model.generate_conditional_ps()
                self.indels = self.input.indel_table
                numerator = indel_model.p_somatic * np.expand_dims(indel_model.p_TiN_given_S[:, self.TiN_int], 1)
                denominator = numerator + np.array(
                    [1 - indel_model.p_somatic] * np.expand_dims(np.nan_to_num(
                        indel_model.p_TiN_given_G[:, self.TiN_int]), 1))
                af_n_given_TiN = np.multiply(indel_model.tumor_f, indel_model.CN_ratio[:, self.TiN_int])
                self.indels.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(np.true_divide(numerator, denominator))
                # NOTE(review): unlike the SSNV path no 0.01 offset is added here -- confirm intended
                self.indels.loc[:, 'p_outlier'] = indel_model.rv_normal_af.cdf(af_n_given_TiN)
                if self.TiN_int == 0:
                    print('Estimated 0 TiN no indels will be recovered outputing deTiN statistics for each site')
                else:
                    rescued = self.indels['p_somatic_given_TiN'] > self.threshold
                    if self.use_outlier_threshold:
                        # drop outlier indels: require p_outlier of at least 0.01
                        rescued = np.logical_and(rescued, self.indels['p_outlier'] >= 0.01)
                    self.indels.loc[rescued, 'filter'] = 'PASS'
            else:
                # table has missing values: expose it unscored
                self.indels = self.input.indel_table
예제 #2
0
파일: deTiN.py 프로젝트: amarotaylor/deTiN
 def reclassify_mutations(self):
     """Score candidate SSNVs (and indels, if supplied) at the estimated TiN bin.

     Adds ``p_somatic_given_TiN`` (Bayes-rule ratio built from the SSNV
     model's ``p_somatic``, ``p_TiN_given_S`` and ``p_TiN_given_G`` at column
     ``self.TiN_int``) and ``p_outlier`` (``rv_normal_af.cdf`` of the expected
     normal allele fraction plus 0.01) to ``self.SSNVs``.  Rows whose
     ``p_somatic_given_TiN`` exceeds ``self.threshold`` -- and, when
     ``self.use_outlier_threshold`` is set, whose ``p_outlier`` is at least
     0.01 -- get ``judgement = 'KEEP'``.

     When ``self.input.indel_file`` is not the string ``'None'`` the indel
     table is scored the same way with a fresh ``dssnv.model``, stored on
     ``self.indels`` and passing rows get ``filter = 'PASS'``.
     """
     ssnv_model = self.ssnv_based_model
     # calculate p(Somatic | given joint TiN estimate)
     numerator = ssnv_model.p_somatic * ssnv_model.p_TiN_given_S[:, self.TiN_int]
     denominator = numerator + np.array(
         [1 - ssnv_model.p_somatic] * np.nan_to_num(
             ssnv_model.p_TiN_given_G[:, self.TiN_int]))
     # 0/0 sites come out as NaN from the division and are reported as 0
     self.SSNVs.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
         np.true_divide(numerator, denominator))
     # expected normal allele fraction given TiN and tau
     af_n_given_TiN = np.multiply(
         ssnv_model.tumor_f, ssnv_model.CN_ratio[:, self.TiN_int])
     # probability of normal allele fraction less than or equal to predicted fraction
     self.SSNVs.loc[:, 'p_outlier'] = ssnv_model.rv_normal_af.cdf(
         af_n_given_TiN + 0.01)
     rescued = self.SSNVs['p_somatic_given_TiN'] > self.threshold
     if self.use_outlier_threshold:
         # drop outlier mutations: require p_outlier of at least 0.01
         rescued = np.logical_and(rescued, self.SSNVs['p_outlier'] >= 0.01)
     # single .loc assignment: chained ``df[col][mask] = ...`` may write to a
     # temporary and is a no-op under pandas copy-on-write
     self.SSNVs.loc[rescued, 'judgement'] = 'KEEP'
     if self.input.indel_file != 'None':
         print('reclassifying indels')
         indel_model = dssnv.model(self.input.indel_table,
                                   self.input.mutation_prior)
         indel_model.generate_conditional_ps()
         self.indels = self.input.indel_table
         numerator = indel_model.p_somatic * indel_model.p_TiN_given_S[:, self.TiN_int]
         denominator = numerator + np.array(
             [1 - indel_model.p_somatic] *
             np.nan_to_num(indel_model.p_TiN_given_G[:, self.TiN_int]))
         af_n_given_TiN = np.multiply(indel_model.tumor_f,
                                      indel_model.CN_ratio[:, self.TiN_int])
         self.indels.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
             np.true_divide(numerator, denominator))
         # NOTE(review): unlike the SSNV path no 0.01 offset is added here -- confirm intended
         self.indels.loc[:, 'p_outlier'] = indel_model.rv_normal_af.cdf(
             af_n_given_TiN)
         rescued = self.indels['p_somatic_given_TiN'] > self.threshold
         if self.use_outlier_threshold:
             # drop outlier indels: require p_outlier of at least 0.01
             rescued = np.logical_and(rescued, self.indels['p_outlier'] >= 0.01)
         self.indels.loc[rescued, 'filter'] = 'PASS'
예제 #3
0
파일: deTiN.py 프로젝트: hurrialice/deTiN
def _fit_ascna_model(di):
    """Return the aSCNA-based model for ``di``; its TiN is NaN when no aSCNA is usable.

    Stores ``aSCNA_hets`` and, when hets remain, ``aSCNA_segs`` /
    ``convergent_segs`` on ``di`` as a side effect.  Inference is only run
    when the segment table, the balanced hets and the identified aSCNA
    segments are all non-empty.
    """
    # identify aSCNAs and filter hets
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        if len(di.aSCNA_hets) > 0:
            di.aSCNA_segs, di.convergent_segs = du.identify_aSCNAs(
                di.seg_table, di.aSCNA_hets, di.aSCNA_thresh,
                di.ascna_SNP_number_filter, di.aSCNA_variance_threshold)
            if len(di.aSCNA_segs) > 0:
                ascna_based_model = dascna.model(di.aSCNA_segs,
                                                 di.aSCNA_hets,
                                                 di.resolution)
                ascna_based_model.perform_inference()
                return ascna_based_model
    # nothing usable: build the model on the raw tables and mark TiN as unknown
    ascna_based_model = dascna.model(di.seg_table, di.het_table,
                                     di.resolution)
    ascna_based_model.TiN = np.nan
    return ascna_based_model


def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs and SCNAS.
        (2) Performing variant re-classification using bayes rule.

        Parses the command line, builds the SSNV- and aSCNA-based models from
        whichever of the mutation / copy-number inputs were supplied (the
        string 'NULL' marks a missing one; the process exits when both are
        missing), combines them, reclassifies mutations and writes the
        reclassified SSNVs, optional indels and aSCNAs, plots, the TiN
        estimate, its CI and the number of SSNVs added under the output path.
    """

    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')
    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=False,
                        default='NULL')
    parser.add_argument(
        '--cn_data_path',
        help='Path to copy number data.'
        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
        required=False,
        default='NULL')
    parser.add_argument(
        '--tumor_het_data_path',
        help=
        'Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=False,
        default='NULL')
    parser.add_argument(
        '--normal_het_data_path',
        help=
        'Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=False,
        default='NULL')
    parser.add_argument(
        '--exac_data_path',
        help=
        'Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False,
                        default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False,
                        default='None')
    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False,
                        default='.')
    # model related parameters
    parser.add_argument(
        '--mutation_prior',
        help=
        'prior expected ratio of somatic mutations to rare germline events',
        required=False,
        default=0.05)
    parser.add_argument(
        '--aSCNA_threshold',
        help='minor allele fraction threshold for calling aSCNAs.',
        required=False,
        default=0.1)
    parser.add_argument(
        '--TiN_prior',
        help=
        'expected frequency of TiN contamination in sequencing setting used for model selection',
        required=False,
        default=0.5)
    parser.add_argument(
        '--use_outlier_removal',
        help=
        'remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
        required=False,
        default=True)
    parser.add_argument(
        '--resolution',
        help=
        'number of TiN bins to consider default = 101 corresponds to 0.01 TiN levels',
        required=False,
        default=101)
    parser.add_argument(
        '--weighted_classification',
        help='integrate variant classification over all values of TiN',
        required=False,
        default=False)
    parser.add_argument(
        '--ascna_probe_number_filter',
        help='number of probes to require for an aSCNA to be considered',
        required=False,
        default=200)
    parser.add_argument(
        '--ascna_SNP_number_filter',
        help='number of probes to require for an aSCNA to be considered',
        required=False,
        default=20)
    parser.add_argument(
        '--coverage_threshold',
        help='number of reads required to use a site for TiN estimation',
        required=False,
        default=15)
    parser.add_argument(
        '--SSNV_af_threshold',
        help='fraction of alternate alleles required for site to be used '
        'for SSNV TiN estimation',
        required=False,
        default=.2)
    parser.add_argument(
        '--aSCNA_variance_threshold',
        help=
        'variance of segment allele shift tolerated before removing segment '
        'as artifact',
        required=False,
        default=0.025)
    parser.add_argument(
        '--cancer_hot_spots',
        help=
        'Optional BED file of cancer hot spot mutations which the user has a stronger prior on being somatic e.g. BRAF v600E mutations.'
        'The format of this file is Chromosome\tPosition\tProbability. Note this will override the mutation prior at these locations',
        required=False,
        default='NA')
    parser.add_argument('--only_ascnas',
                        help='only use ascna data for TiN estimation',
                        required=False,
                        action='store_true')
    args = parser.parse_args()
    if args.cn_data_path == 'NULL' and args.mutation_data_path == 'NULL':
        print('One of CN data or SSNV data are required.')
        sys.exit()
    elif args.cn_data_path == 'NULL':
        # SSNV-only run: no copy-number data, so the aSCNA model carries no estimate
        di = input(args)
        di.read_and_preprocess_SSNVs()

        di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                      di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")

        ssnv_based_model = dssnv.model(di.candidates,
                                       di.mutation_prior,
                                       di.resolution,
                                       di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED,
                                       skew=di.skew)
        ssnv_based_model.perform_inference()
        ascna_based_model = dascna.model(di.seg_table, di.het_table,
                                         di.resolution)
        ascna_based_model.TiN = np.nan

    elif args.mutation_data_path == 'NULL':
        # aSCNA-only run: a single all-empty placeholder row stands in for the SSNVs
        di = input(args)
        di.read_and_preprocess_aSCNAs()
        di.candidates = pd.DataFrame(
            index=[0],
            columns=[
                'contig', 'position', 'ref_allele', 'alt_allele', 'tumor_name',
                'normal_name', 't_alt_count', 't_ref_count', 'n_alt_count',
                'n_ref_count', 'failure_reasons', 'judgement',
                'genomic_coord_x', 'f_acs', 'tau'
            ])
        # previously left undefined on this path, which made the
        # "SSNVs added" computation below raise NameError
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior,
                                       di.resolution, di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.TiN = np.nan
        ascna_based_model = _fit_ascna_model(di)
    else:
        di = input(args)
        di.read_and_preprocess_data()
        # identify candidate mutations based on MuTect flags.
        # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
        di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                      di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        # generate SSNV based model using candidate sites
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior,
                                       di.resolution, di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.perform_inference()
        if di.only_ascnas == True:
            # discard the SSNV estimate so only the aSCNA model contributes
            ssnv_based_model.TiN = np.nan
            print('Only using aSCNA data')
        ascna_based_model = _fit_ascna_model(di)

    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    if len(do.SSNVs) > 1:
        do.reclassify_mutations()
        do.SSNVs.drop('Chromosome', axis=1, inplace=True)
    n_calls_post = np.sum(do.SSNVs['judgement'] == "KEEP")
    n_calls_added = n_calls_post - n_calls_pre
    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # every output file shares this "<output_path>/<output_name>" prefix
    out_prefix = do.input.output_path + '/' + do.input.output_name
    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt',
                    sep='\t',
                    index=None)

    if not di.indel_file == 'None':
        # NOTE(review): assumes reclassify_mutations() ran and left a
        # 'Chromosome' column on do.indels -- confirm for the short-SSNV path
        do.indels.drop('Chromosome', axis=1, inplace=True)
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt',
                         sep='\t',
                         index=None)
    # write plots
    if not np.isnan(ascna_based_model.TiN):
        do.ascna_based_model.segs[
            'Chromosome'] = do.ascna_based_model.segs['Chromosome'] + 1
        do.ascna_based_model.segs.to_csv(path_or_buf=out_prefix +
                                         '.deTiN_aSCNAs.txt',
                                         sep='\t',
                                         index=None)
        du.plot_kmeans_info(ascna_based_model, do.input.output_path,
                            do.input.output_name)
        du.plot_TiN_models(do)
        du.plot_aSCNA_het_data(do)
    if not np.isnan(ssnv_based_model.TiN):
        du.plot_SSNVs(do)
    # write TiN and CIs; context managers close each file even if a write fails
    with open(out_prefix + '.TiN_estimate.txt', 'w') as handle:
        handle.write('%s' % (do.TiN))

    with open(out_prefix + '.TiN_estimate_CI.txt', 'w') as handle:
        handle.write('%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))

    with open(out_prefix + '.number_of_SSNVs_added.txt', 'w') as handle:
        handle.write('%s\n' % int(n_calls_added))
예제 #4
0
파일: deTiN.py 프로젝트: amarotaylor/deTiN
def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs and SCNAS.
        (2) Performing variant re-classification using bayes rule.

        Parses the command line, builds the SSNV-based model and (when the
        segment table is non-empty) the aSCNA-based model, combines them,
        reclassifies mutations and writes the reclassified SSNVs, optional
        indels, plots, the TiN estimate and its CI under the output path.
    """

    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')
    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=True)
    parser.add_argument(
        '--cn_data_path',
        help='Path to copy number data.'
        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
        required=True)
    parser.add_argument(
        '--tumor_het_data_path',
        help=
        'Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=True)
    parser.add_argument(
        '--normal_het_data_path',
        help=
        'Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=True)
    parser.add_argument(
        '--exac_data_path',
        help=
        'Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False,
                        default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False,
                        default='None')
    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False,
                        default='.')
    # model related parameters
    parser.add_argument(
        '--mutation_prior',
        help=
        'prior expected ratio of somatic mutations to rare germline events',
        required=False,
        default=0.15)
    parser.add_argument(
        '--aSCNA_threshold',
        help='minor allele fraction threshold for calling aSCNAs.',
        required=False,
        default=0.1)
    parser.add_argument(
        '--TiN_prior',
        help=
        'expected frequency of TiN contamination in sequencing setting used for model selection',
        required=False,
        default=0.5)
    parser.add_argument(
        '--use_outlier_removal',
        help=
        'remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
        required=False,
        default=True)

    args = parser.parse_args()
    di = input(args)
    di.read_and_preprocess_data()

    # identify candidate mutations based on MuTect flags.
    # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
    di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                  di.exac_db_file)

    # generate SSNV based model using candidate sites
    ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior)
    ssnv_based_model.perform_inference()

    # identify aSCNAs and filter hets
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        di.aSCNA_segs = du.identify_aSCNAs(di.seg_table, di.aSCNA_hets,
                                           di.aSCNA_thresh)
        # generate aSCNA based model
        ascna_based_model = dascna.model(di.aSCNA_segs, di.aSCNA_hets)
        ascna_based_model.perform_inference()
    else:
        # no segments: build the model on the raw tables and mark TiN as unknown
        ascna_based_model = dascna.model(di.seg_table, di.het_table)
        ascna_based_model.TiN = np.nan
    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    do.reclassify_mutations()
    do.SSNVs.drop('Chromosome', axis=1, inplace=True)

    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # every output file shares this "<output_path>/<output_name>" prefix
    out_prefix = do.input.output_path + '/' + do.input.output_name
    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt',
                    sep='\t',
                    index=None)
    if not di.indel_file == 'None':
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt',
                         sep='\t',
                         index=None)
    # write plots
    if not np.isnan(ascna_based_model.TiN):
        du.plot_kmeans_info(ascna_based_model, do.input.output_path,
                            do.input.output_name)
    du.plot_TiN_models(do)
    du.plot_SSNVs(do)
    # write TiN and CIs; context managers close each file even if a write fails
    with open(out_prefix + '.TiN_estimate.txt', 'w') as handle:
        handle.write('%s' % (do.TiN))

    with open(out_prefix + '.TiN_estimate_CI.txt', 'w') as handle:
        handle.write('%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))