예제 #1
0
def _baf_fit_svtype(metrics, labeler, name, svtype, features,
                    cutoff_features):
    """Fit BAF cutoffs for a single SV type and dump its train/test tables.

    Selects the rows of ``metrics`` with the given ``svtype`` and
    ``svsize >= 5000`` as the testable set, narrows that to a trainable
    subset, labels it, writes both tables as tab-separated files named
    ``{name}_{svtype}_trainable.txt`` / ``{name}_{svtype}_testable.txt``,
    and returns whatever ``rf_classify`` returns for them.

    NOTE(review): ``rf_classify`` is defined elsewhere in this module; the
    calling code treats its return value as a DataFrame of cutoffs.
    """
    testable = metrics.loc[(metrics.svtype == svtype)
                           & (metrics.svsize >= 5000)]

    # Training rows: poor-region coverage under 0.3, not on an allosome,
    # and not flagged as outlier-specific.
    train_mask = ((testable.poor_region_cov < 0.3)
                  & ~testable.chrom.isin(ALLOSOMES)
                  & ~testable.is_outlier_specific)

    # Take an explicit copy before adding the label column; assigning into
    # a slice of a slice raises pandas' SettingWithCopyWarning.
    trainable = testable.loc[train_mask].copy()
    trainable['label'] = labeler.label(trainable)

    trainable.to_csv('{0}_{1}_trainable.txt'.format(name, svtype),
                     index=False,
                     sep='\t')
    testable.to_csv('{0}_{1}_testable.txt'.format(name, svtype),
                    index=False,
                    sep='\t')

    return rf_classify(metrics, trainable, testable, features, labeler,
                       cutoff_features, name)


def adjudicate_BAF(metrics, labeler, name):
    """Derive BAF cutoffs for deletions and duplications of at least 5 kb.

    Deletions are classified on ``BAF_snp_ratio`` / ``BAF_del_loglik`` and
    duplications on ``BAF_KS_stat`` / ``BAF_KS_log_pval``; deletions are
    always processed first.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Per-variant metrics. The columns read here are ``svtype``,
        ``svsize``, ``poor_region_cov``, ``chrom`` and
        ``is_outlier_specific``; the feature columns are consumed by
        ``rf_classify``.
    labeler : object
        Must provide ``label(df)``, whose result is stored as the ``label``
        column of the training table.
    name : str
        Prefix for every output file; also passed as the final positional
        argument of ``rf_classify``.

    Returns
    -------
    pandas.DataFrame
        DEL and DUP cutoffs concatenated, with ``svtype``, ``test``,
        ``max_svsize``, ``min_svsize`` and ``algtype`` columns added.

    Side effects
    ------------
    Writes ``{name}_DEL_trainable.txt``, ``{name}_DEL_testable.txt``,
    ``{name}_DUP_trainable.txt``, ``{name}_DUP_testable.txt`` and
    ``{name}_cutoffs.txt`` (all tab-separated) to the working directory.
    """
    del_cutoffs = _baf_fit_svtype(
        metrics, labeler, name, 'DEL',
        ['BAF_snp_ratio', 'BAF_del_loglik'],
        {'indep': ['BAF_snp_ratio'], 'dep': ['BAF_del_loglik']})

    dup_cutoffs = _baf_fit_svtype(
        metrics, labeler, name, 'DUP',
        ['BAF_KS_stat', 'BAF_KS_log_pval'],
        {'indep': ['BAF_KS_stat'], 'dep': ['BAF_KS_log_pval']})

    # Combine cutoffs
    del_cutoffs['svtype'] = 'DEL'
    dup_cutoffs['svtype'] = 'DUP'
    # reset_index() is called without drop=True, so the pre-concat index is
    # retained as a column in the combined table.
    cutoffs = pd.concat([del_cutoffs, dup_cutoffs]).reset_index()

    # The file is written before the columns below are added, so it does
    # not contain test / max_svsize / min_svsize / algtype.
    cutoffs.to_csv('{0}_cutoffs.txt'.format(name), index=False, sep='\t')

    cutoffs['test'] = 'BAF'
    cutoffs['max_svsize'] = np.nan
    cutoffs['min_svsize'] = 5000
    cutoffs['algtype'] = 'any'

    return cutoffs
예제 #2
0
def adjudicate_SR1(metrics):
    """Derive SR1 cutoffs from all variants whose name lacks ``_depth_``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Per-variant metrics. The columns read here are ``name``,
        ``poor_region_cov``, ``chrom`` and ``is_outlier_specific``; the
        feature columns ``SR_sum_log_pval`` and ``SR_sum_bg_frac`` are
        consumed by ``rf_classify``.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``rf_classify`` with ``test='SR1'``,
        ``svtype='CNV'`` and ``algtype='PESR'`` columns added.

    Side effects
    ------------
    Writes ``SR1_trainable.txt`` and ``SR1_cutoffs.txt`` (tab-separated) to
    the working directory.

    NOTE(review): ``rf_classify`` is defined elsewhere in this module; it
    is called with ``'SR1_prob'`` as its final argument.
    """
    features = ['SR_sum_log_pval', 'SR_sum_bg_frac']
    cutoff_features = {'indep': ['SR_sum_log_pval'],
                       'dep': ['SR_sum_bg_frac']}
    labeler = labelers.SR1TrainingLabeler()

    testable = metrics.loc[~metrics.name.str.contains('_depth_')]

    # Training rows: poor-region coverage under 0.3, not on an allosome,
    # and not flagged as outlier-specific.
    train_mask = ((testable.poor_region_cov < 0.3)
                  & ~testable.chrom.isin(ALLOSOMES)
                  & ~testable.is_outlier_specific)

    # Take an explicit copy before adding the label column; assigning into
    # a slice of a slice raises pandas' SettingWithCopyWarning.
    trainable = testable.loc[train_mask].copy()
    trainable['label'] = labeler.label(trainable)
    trainable.to_csv('SR1_trainable.txt', index=False, sep='\t')

    cutoffs = rf_classify(metrics, trainable, testable, features, labeler,
                          cutoff_features, 'SR1_prob')

    # The file is written before the columns below are added, so it does
    # not contain test / svtype / algtype.
    cutoffs.to_csv('SR1_cutoffs.txt', index=False, sep='\t')
    cutoffs['test'] = 'SR1'
    cutoffs['svtype'] = 'CNV'
    cutoffs['algtype'] = 'PESR'

    return cutoffs
예제 #3
0
def _rd_fit_subset(metrics, testable, labeler, features, cutoff_features,
                   train_min_svsize, trainable_path, testable_path=None,
                   **rf_kwargs):
    """Label, dump and classify one RD subset; return ``rf_classify``'s result.

    ``testable`` is written to ``testable_path`` when one is given. The
    training table is the part of ``testable`` with
    ``svsize >= train_min_svsize`` that also passes the shared training
    filters; it is labelled and written to ``trainable_path``. Extra
    keyword arguments are forwarded unchanged to ``rf_classify``.
    """
    if testable_path is not None:
        testable.to_csv(testable_path, index=False, sep='\t')

    # Training rows: large enough, poor-region coverage under 0.3, not on
    # an allosome, and not flagged as outlier-specific.
    train_mask = ((testable.svsize >= train_min_svsize)
                  & (testable.poor_region_cov < 0.3)
                  & ~testable.chrom.isin(ALLOSOMES)
                  & ~testable.is_outlier_specific)

    # Take an explicit copy before adding the label column; assigning into
    # a slice of a slice raises pandas' SettingWithCopyWarning.
    trainable = testable.loc[train_mask].copy()
    trainable['label'] = labeler.label(trainable)
    trainable.to_csv(trainable_path, index=False, sep='\t')

    return rf_classify(metrics, trainable, testable, features, labeler,
                       cutoff_features, 'RD_prob', **rf_kwargs)


def _rd_tag_cutoffs(cutoffs, algtype, max_svsize, min_svsize, svtype):
    """Add the subset-describing columns to ``cutoffs`` in place; return it."""
    cutoffs['algtype'] = algtype
    cutoffs['max_svsize'] = max_svsize
    cutoffs['min_svsize'] = min_svsize
    cutoffs['svtype'] = svtype
    return cutoffs


def adjudicate_RD(metrics):
    """Derive RD cutoffs for four variant subsets and combine them.

    The subsets are processed in this order:

    1. names without ``_depth_``, ``svsize >= 1000`` (trained on >= 5000)
    2. names without ``_depth_``, ``svsize < 1000`` (trained on >= 100)
    3. names with ``_depth_`` and ``svtype == 'DEL'`` (trained on >= 5000)
    4. names with ``_depth_`` and ``svtype == 'DUP'`` (trained on >= 5000)

    Subsets 3 and 4 use no dependent cutoff feature and call
    ``rf_classify`` with ``clean_cutoffs=True``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Per-variant metrics. The columns read here are ``name``,
        ``svsize``, ``svtype``, ``poor_region_cov``, ``chrom``,
        ``is_outlier_specific`` and ``RD_prob``. It is modified in place:
        rows whose name contains ``_depth_``, with ``svsize < 5000`` and
        ``RD_prob >= 0.5``, have ``RD_prob`` set to 0.499.
        NOTE(review): ``RD_prob`` is presumably populated by
        ``rf_classify`` (defined elsewhere) -- confirm.

    Returns
    -------
    pandas.DataFrame
        The four cutoff tables concatenated, each tagged with ``algtype``,
        ``max_svsize``, ``min_svsize`` and ``svtype``, plus ``test='RD'``.

    Side effects
    ------------
    Writes ``RD_pesr_gt5kb_testable.txt``, ``RD_pesr_gt5kb_trainable.txt``,
    ``RD_pesr_lt5kb_testable.txt``, ``RD_pesr_lt5kb_trainable.txt``,
    ``RD_depth_DEL_trainable.txt``, ``RD_depth_DUP_trainable.txt`` and
    ``RD_cutoffs.txt`` (tab-separated) to the working directory.
    """
    features = ["RD_Median_Separation", "RD_log_pval", "RD_log_2ndMaxP"]
    pesr_cutoff_features = {
        'indep': ['RD_log_pval', 'RD_Median_Separation'],
        'dep': ['RD_log_2ndMaxP']
    }
    depth_cutoff_features = {
        'indep': ['RD_log_pval', 'RD_Median_Separation'],
        'dep': []
    }
    labeler = labelers.RDTrainingLabeler()
    cutoff_dfs = []

    # PE/SR >= 1 kb. The output files are named "gt5kb" although the
    # testable set starts at 1 kb; the names are kept as they are.
    testable = metrics.loc[~metrics.name.str.contains('_depth_')
                           & (metrics.svsize >= 1000)]
    cutoffs = _rd_fit_subset(metrics, testable, labeler, features,
                             pesr_cutoff_features, 5000,
                             'RD_pesr_gt5kb_trainable.txt',
                             testable_path='RD_pesr_gt5kb_testable.txt')
    cutoff_dfs.append(_rd_tag_cutoffs(cutoffs, 'PESR', np.nan, 1000, 'CNV'))

    # PE/SR < 1 kb. The output files are named "lt5kb" although the
    # testable set is below 1 kb; the names are kept as they are.
    testable = metrics.loc[~metrics.name.str.contains('_depth_')
                           & (metrics.svsize < 1000)]
    cutoffs = _rd_fit_subset(metrics, testable, labeler, features,
                             pesr_cutoff_features, 100,
                             'RD_pesr_lt5kb_trainable.txt',
                             testable_path='RD_pesr_lt5kb_testable.txt')
    cutoff_dfs.append(_rd_tag_cutoffs(cutoffs, 'PESR', 1000, 0, 'CNV'))

    # Depth dels (no testable table is written for depth subsets)
    testable = metrics.loc[metrics.name.str.contains('_depth_')
                           & (metrics.svtype == 'DEL')]
    cutoffs = _rd_fit_subset(metrics, testable, labeler, features,
                             depth_cutoff_features, 5000,
                             'RD_depth_DEL_trainable.txt',
                             clean_cutoffs=True)
    cutoff_dfs.append(_rd_tag_cutoffs(cutoffs, 'Depth', np.nan, 5000, 'DEL'))

    # Depth dups
    testable = metrics.loc[metrics.name.str.contains('_depth_')
                           & (metrics.svtype == 'DUP')]
    cutoffs = _rd_fit_subset(metrics, testable, labeler, features,
                             depth_cutoff_features, 5000,
                             'RD_depth_DUP_trainable.txt',
                             clean_cutoffs=True)
    cutoff_dfs.append(_rd_tag_cutoffs(cutoffs, 'Depth', np.nan, 5000, 'DUP'))

    # Fail depth-only below 5 kb: push any passing probability just under
    # the 0.5 threshold.
    metrics.loc[metrics.name.str.contains('_depth_') & (metrics.svsize < 5000)
                & (metrics.RD_prob >= 0.5), 'RD_prob'] = 0.499

    cutoffs = pd.concat(cutoff_dfs)
    # The file is written before 'test' is added, so it lacks that column.
    cutoffs.to_csv('RD_cutoffs.txt', index=False, sep='\t')
    cutoffs['test'] = 'RD'

    return cutoffs