Example #1
# excerpt from a classification script; config_file, model, lvl, and
# child_id are defined earlier in the original script
import numpy
import yaml
import joblib
import ped

with open(config_file, 'r') as f:
    cfg = yaml.safe_load(f)
min_DP = cfg['min_DP']
var_type = cfg['variant_type']
vcf_pat = cfg['vcf_pattern']
ped_file = cfg['ped_file']
known_vars = cfg['known_variants']
output_dir = cfg['output_directory']
test_set_pat = output_dir + '/%s'
m_pkl = joblib.load(model)
list_of_features = m_pkl['features']
if lvl == 0:
    m_pkl['extra_col_names'] = []
    m_pkl['y_name'] = []

myped = ped.Ped(ped_file)
myped.addTestFile(field='ind_id', file_pat=test_set_pat)
myped.ped.dropna(subset=['test'], inplace=True)
myped.ped.reset_index(inplace=True)

test_labels = numpy.array([], dtype=int)
pred_labels = numpy.array([], dtype=int)
test_var_id = numpy.array([], dtype=str)
test_alleles = numpy.array([], dtype=str)
pred_prob = numpy.array([], dtype=float)
dp_offspring = numpy.array([], dtype=int)
dp_father = numpy.array([], dtype=int)
dp_mother = numpy.array([], dtype=int)

for i, row in myped.ped.iterrows():
    if row['ind_id'] != child_id:
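The snippet above reads six settings from a YAML config. A minimal sketch of such a file, written from Python with placeholder values (the real paths and thresholds are project-specific and not shown in the snippet):

import yaml

# placeholder values only; the keys mirror those read by the snippet above
cfg_sketch = {
    'min_DP': 10,
    'variant_type': 'SNP',
    'vcf_pattern': '/path/to/vcf/%s.vcf.gz',
    'ped_file': '/path/to/cohort.ped',
    'known_variants': '/path/to/known_variants.txt',
    'output_directory': '/path/to/output',
}

with open('config_sketch.yml', 'w') as f:
    yaml.safe_dump(cfg_sketch, f, default_flow_style=False)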
Example #2
import ped
import sys

infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
myped = ped.Ped(infile_ped, ['collection'])
myped.addBam(
    file_pat='/mnt/scratch/asalomatov/data/SSC/wes/bam/%s.realigned.recal.bam')
myped.ped.dropna(subset=['bam'], inplace=True)
myped.ped.shape
myped.addBai(
    file_pat=
    '/mnt/scratch/asalomatov/data/SSC/wes/bam/%s.realigned.recal.bam.bai')
myped.ped.dropna(subset=['bai'], inplace=True)
myped.ped.shape
myped.ped.head()

probands = myped.getAllProbands()
siblings = list(myped.ped.ind_id[myped.ped.ind_id.str.contains('s1')])
print 'probands ', len(probands)
print 'siblings ', len(siblings)

out_dir = '/mnt/scratch/asalomatov/data/SSC/wes/dnm_files/bam'
for p in probands + siblings:
    fa = myped.getChildsFather(p)
    mo = myped.getChildsMother(p)
    df = myped.ped[['ind_id', 'bam']][myped.ped.ind_id.isin([fa, mo, p])]
    print df.shape
    if df.shape[0] != 3:
        continue
    # NOTE: to_csv is not actually called here (no arguments are passed);
    # see the sketch after this example for one way to write the trio table
    df.to_csv
    print df
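The df.to_csv at the end of the loop is left without arguments. A minimal helper that writes such a child/father/mother BAM table to a per-trio file, assuming a hypothetical file name under out_dir:

import os

def write_trio_bams(df, child_id, out_dir):
    """Write a child/father/mother BAM table to CSV; the file name is an
    assumption, not taken from the original script."""
    out_file = os.path.join(out_dir, '%s_trio_bams.csv' % child_id)
    df.to_csv(out_file, index=False)
    return out_file

# e.g. inside the loop above: write_trio_bams(df, p, out_dir)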
Example #3
# calcMetr, getDiff, and cols_to_output are module-level helpers defined
# elsewhere in the original module
import os
import datetime
import yaml
import pandas
import ped
import func

def summarizeMutations(infile,
                       outp_dir,
                       config_file,
                       exac_anno='/mnt/scratch/asalomatov/data/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt'):
    with open(config_file, 'r') as f:
        cfg = yaml.safe_load(f)

    ped_file = cfg['ped_file']
    myped = ped.Ped(cfg['ped_file'])
    myped.addVcf(file_pat=cfg['vcf_pattern'])
    myped.ped.dropna(subset=['vcf'], inplace=True)
    myped.ped.reset_index(inplace=True)

    #kv_vcf = pandas.read_csv('/mnt/scratch/asalomatov/data/columbia/feature_sets/known/all_known.txt', sep='\t')
    #kv_vcf = kv_vcf[['ind_id','CHROM', 'POS', 'REF_offspring', 'ALT_base_offspring', 'status', 'descr', 'DP_offspring', 'DP_father', 'DP_mother']]
    #kv_vcf = kv_vcf[kv_vcf.descr.isin(['after'])]
    #kv_vcf['var_id'] = kv_vcf.ind_id.astype(str)+'_'+kv_vcf.CHROM.astype(str)+'_'+kv_vcf.POS.astype(str)


    #effects_of_interest = effects_loss_of_func + '|' + effect_damaging_missense + '|' + effect_synon


    exac = pandas.read_table(exac_anno)
    vn = pandas.read_table(infile)
    vn.columns = vn.columns.str.translate(None, '#')
    print vn.shape
    vn.ix[:, 'gene'] = vn['ANN[*].GENE']
    vn = vn.merge(
        exac[[u'syn_z', u'mis_z', u'lof_z', u'pLI', u'pRec', u'pNull', u'gene']],
        on='gene', how='left')
    print vn.shape
    #sys.exit(1)
    vn['v_id'] = vn.ind_id + '_' +\
                 vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str) + '_' +\
                 vn['ANN[*].GENE'] + '_' +\
                 vn['ANN[*].EFFECT'] + '_' +\
                 vn['ANN[*].IMPACT']
    vn['var_id'] = vn.ind_id + '_' +\
                 vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str)
    vn['chr_pos'] = vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str)


    #vn = vn.merge(kv_vcf[['var_id', 'status']], on='var_id', how='left')
    #print vn.shape

    vn = vn[~vn.v_id.duplicated()]
    # stats before any filtering
    print '\ndeduped and annotated vars, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'deduped and annotated vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='deduped metrics')
    vn_full = vn


    var_freq = vn.groupby('chr_pos').apply(lambda x: len(x['ind_id'].unique()))
    var_freq_2 = var_freq[var_freq > cfg['max_cohort_freq']]
    vn = vn[~vn.chr_pos.isin(var_freq_2.index)]
    print '\ncohort freq vars, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'cohort freq vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='cohort_freq')
    vn_diff =  getDiff(vn_full, vn, msg='cohort_freq')



    vn.ix[:, 'effect_cat'] = None
    vn.ix[vn['ANN[*].EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_synon'])), 'effect_cat'] = 'syn' 
    vn.ix[vn['ANN[*].EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_dmgmis'])), 'effect_cat'] = 'mis' 
    vn.ix[vn['ANN[*].EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_lof'])), 'effect_cat'] = 'lof' 
    print vn.shape
    vn_full = vn
    vn = vn.dropna(subset=['effect_cat'], axis=0)
    print vn.shape
    #vn = vn[vn['ANN[*].EFFECT'].str.contains(effects_of_interest)] 

    print '\neffects of interest vars, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'effects of interest vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='effects metrics')
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='effects')])


    vn.ix[vn.dbNSFP_1000Gp3_AF.isin(['.']), 'dbNSFP_1000Gp3_AF'] = '0'
    vn.ix[vn.dbNSFP_ExAC_AF.isin(['.']), 'dbNSFP_ExAC_AF'] = '0'
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace('ZZZ', '0')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace('ZZZ', '0')
    # escape the dots so they are matched literally (str.replace treats the
    # pattern as a regex by default)
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace(r',\.', ',0')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace(r',\.', ',0')
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace(r'\.,', '0,')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace(r'\.,', '0,')
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.apply(lambda x: min(map(float, x.split(','))))
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.apply(lambda x: min(map(float, x.split(','))))
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.astype(float)
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.astype(float)

    vn_full = vn

    vn = vn[(vn.dbNSFP_1000Gp3_AF < cfg['population_AF']) &
            (vn.dbNSFP_ExAC_AF < cfg['population_AF'])]
    print '\nAF vars, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'AF vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='AF metrics')
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='pop_freq')])

    vn_full = vn
    vn = vn[vn['ANN[*].BIOTYPE'].str.contains('|'.join(cfg['snpeff']['biotype']))]

    print '\nprotein coding vars, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'protein coding vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='protein coding metrics')
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='protein')])

    vn_full = vn
    allele_frac = vn.alt_DP.astype(float)/vn.DP
    vn = vn[(allele_frac > cfg['alt_allele_frac_range'][0]) & (allele_frac < cfg['alt_allele_frac_range'][1])]

    print '\nallele fraction, pred_labels value_counts:'
    print vn.pred_labels.value_counts()
    print 'allele fraction vars, test_labels value_counts:'
    print vn.status.value_counts()
    calcMetr(vn, msg='allele fraction metrics')
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='allele_frac')])

    vn = vn.replace('ZZZ', '.')
    vn['FILTER'] = vn.apply(func.getFieldFromVCF, args=(myped,), axis=1)
    vn = vn[~vn.FILTER.isnull()]



    c_missense = vn['effect_cat'] == 'mis'
    c_lof = vn['effect_cat'] == 'lof'
    c_syn = vn['effect_cat'] == 'syn'

    c_metaSVM_D = vn.dbNSFP_MetaSVM_pred.str.contains('|'.join(cfg['db_nsfp']['metaSVM_pred']))
    c_metaSVM_null = vn.dbNSFP_MetaSVM_pred.isin(['ZZZ', '.'])

    c_cadd_null = vn.dbNSFP_CADD_phred.isin(['ZZZ', '.'])
    c_cadd_D = vn.dbNSFP_CADD_phred[~c_cadd_null].apply(
        lambda x: min(map(float, x.split(',')))) >= cfg['db_nsfp']['cadd_phred']
    c_cadd_15 = vn.dbNSFP_CADD_phred[~c_cadd_null].apply(
        lambda x: min(map(float, x.split(',')))) >= cfg['db_nsfp']['combined']['cadd_phred']

    c_poly_HVAR_null = vn.dbNSFP_Polyphen2_HVAR_pred.isin(['ZZZ', '.'])
    # NOTE: the HDIV conditions below also read the HVAR column;
    # dbNSFP_Polyphen2_HDIV_pred may have been intended
    c_poly_HDIV_null = vn.dbNSFP_Polyphen2_HVAR_pred.isin(['ZZZ', '.'])
    c_poly_HVAR_D = vn.dbNSFP_Polyphen2_HVAR_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['polyphen2_pred']))
    c_poly_HDIV_D = vn.dbNSFP_Polyphen2_HVAR_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['polyphen2_pred']))

    c_sift_null = vn.dbNSFP_SIFT_pred.isin(['ZZZ', '.'])
    c_sift_D = vn.dbNSFP_SIFT_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['sift_pred']))
    c_new = (vn.pred_labels == 1) & (~vn.status.isin(['Y']))

    c_dmg_miss = c_metaSVM_D | c_cadd_D | ((c_poly_HDIV_D | c_poly_HVAR_D) & c_sift_D & c_cadd_15)
    vn_full = vn[c_missense]
    vn_mis = vn[c_dmg_miss & c_missense]
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn_mis, msg='dmg_miss')])

    c_impact_lof = vn['ANN[*].IMPACT'].str.contains(
        '|'.join(cfg['snpeff']['impact_lof']))

    vn_full = vn[c_lof]
    vn_lof = vn[c_lof & c_impact_lof]
    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn_lof, msg='impact_lof')])

    vn_syn = vn[c_syn]
    print vn.shape

    c_FN = (vn.pred_labels == 0) & vn.status.isin(['Y'])
    vn_FN = vn[cols_to_output][c_FN]
    #vn_FN = vn_FN[~vn_FN.v_id.duplicated()]

    c_TP = (vn.pred_labels == 1) & vn.status.isin(['Y'])
    vn_TP = vn[cols_to_output][c_TP]
    #vn_TP = vn_TP[~vn_TP.v_id.duplicated()]
    
    var_type = cfg['variant_type']
    outp_suffix = '{:%Y-%m-%d_%H-%M-%S-%f}'.format(datetime.datetime.now())

    def writeVariants(df, cols_to_output, var_type, prefix, suffix, outp_dir):
        if df.empty:
            print('%s is empty' % prefix)
            return None
        df[cols_to_output].to_csv(
            os.path.join(outp_dir,
                         '_'.join([prefix, var_type, suffix]) + '.csv'),
            index=False)
    writeVariants(vn, cols_to_output[:-2], var_type, 'ALL', outp_suffix, outp_dir)
    writeVariants(vn_FN, cols_to_output[:-2], var_type, 'FN', outp_suffix, outp_dir)
    writeVariants(vn_TP, cols_to_output[:-2], var_type, 'TP', outp_suffix, outp_dir)
    writeVariants(vn_mis, cols_to_output[:-2], var_type, 'MIS', outp_suffix, outp_dir)
    writeVariants(vn_lof, cols_to_output[:-2], var_type, 'LOF', outp_suffix, outp_dir)
    writeVariants(vn_syn, cols_to_output[:-2], var_type, 'SYN', outp_suffix, outp_dir)
    writeVariants(vn_diff, cols_to_output[:-2]+['step'], var_type, 'DIFF', outp_suffix, outp_dir)
        
#    vn_TP[cols_to_output[:-2]].to_csv(
#        os.path.join(outp_dir, 'true_pos_snp' + outp_suffix + '.csv'), index=False)
#    vn_mis[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'dmg_missense' + outp_suffix + '.csv'), index=False)
#    vn_lof[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'lof' + outp_suffix + '.csv'), index=False)
#    vn_syn[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False)
#    vn_diff[cols_to_output[:-2] + ['step']].to_csv(
#        os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False)

    cfg['predictions_file'] = infile

    with open(os.path.join(outp_dir,'cfg' + outp_suffix + '.yml'), 'w') as f:
        yaml.dump(cfg, f, default_flow_style=False)
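A hypothetical invocation of summarizeMutations; the paths are placeholders, and the config file must define the keys read above (ped_file, vcf_pattern, max_cohort_freq, population_AF, alt_allele_frac_range, variant_type, plus the snpeff and db_nsfp sub-sections):

# all paths below are placeholders, not files from the original project
summarizeMutations(
    infile='predicted_denovo_annotated.tsv',
    outp_dir='/tmp/denovo_summary',
    config_file='summarize_config.yml')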
Example #4
# cols_to_output and extra_cols are module-level lists defined elsewhere
# in the original module
import os
import sys
import yaml
import pandas
import ped
import func

def summarizeMutations(infile,
                       infile_vep,
                       prefix,
                       outp_dir,
                       config_file,
                       exac_anno='/mnt/xfs1/scratch/asalomatov/data/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt',
                       asd_gene_prob_anno='/mnt/xfs1/scratch/asalomatov/data/gene-scores/asd_gene_prediction_olga.csv',
                       ios_anno='/mnt/xfs1/scratch/asalomatov/data/gene-scores/ioss_lgd_rvis.scores.csv',
                       sfari_scores='/mnt/xfs1/scratch/asalomatov/data/SFARI/gene-score-only.csv'):
    with open(config_file, 'r') as f:
        cfg = yaml.safe_load(f)

    ped_file = cfg['ped_file']
    ped_file_extended = cfg['ped_file_extended']
    # populate ped DF
    if ped_file and ped_file_extended:
        sys.exit('only one of ped_file, ped_file_extended may be non-empty')
    if ped_file:
        myped = ped.Ped(ped_file)
        myped.addVcf(file_pat=cfg['vcf_pattern'])
        myped.ped.dropna(subset=['vcf'], inplace=True)
        myped.ped.reset_index(inplace=True)
    elif ped_file_extended:
        myped = ped.Ped(ped_file_extended, ['bam', 'vcf'])
    else:
        sys.exit('ped_file or ped_file_extended must be defined')


    #kv_vcf = pandas.read_csv('/mnt/xfs1/scratch/asalomatov/data/columbia/feature_sets/known/all_known.txt', sep='\t')
    #kv_vcf = kv_vcf[['ind_id','CHROM', 'POS', 'REF_offspring', 'ALT_base_offspring', 'status', 'descr', 'DP_offspring', 'DP_father', 'DP_mother']]
    #kv_vcf = kv_vcf[kv_vcf.descr.isin(['after'])]
    #kv_vcf['var_id'] = kv_vcf.ind_id.astype(str)+'_'+kv_vcf.CHROM.astype(str)+'_'+kv_vcf.POS.astype(str)
    #effects_of_interest = effects_loss_of_func + '|' + effect_damaging_missense + '|' + effect_synon
    exac = pandas.read_table(exac_anno)
    sfari_scores_df = pandas.read_csv(sfari_scores)
    vn = pandas.read_table(infile)
    vn.columns = vn.columns.str.translate(None, '#')
    vn.columns = [i.replace('[*]', '') for i in vn.columns]
    # read vep
    vep = func.readVcfToDF(infile_vep)
    vep = vep.merge(vep.apply(lambda row: func.vepVar2vcfVar(row, cfg['genome_ref']), axis=1),
                    right_index=True, left_index=True)

    vn.ix[:, 'gene'] = vn['ANN.GENE']
    vn = vn.merge(
        exac[[u'syn_z', u'syn_z_rank', u'syn_z_perc_rank',
              u'mis_z', u'mis_z_rank', u'mis_z_perc_rank',
              u'lof_z', u'lof_z_rank', u'lof_z_perc_rank',
              u'pLI', u'pLI_rank', u'pLI_perc_rank',
              u'pRec', u'pRec_rank', u'pRec_perc_rank',
              u'pNull', u'pNull_rank', u'pNull_perc_rank',
              u'gene']],
        on='gene', how='left')
    vn = vn.merge(
        sfari_scores_df,
        on='gene', how='left')
    print(vn.shape)
    vn['v_id'] = vn.ind_id.astype(str) + '_' +\
                 vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str) + '_' +\
                 vn['ANN.GENE'] # + '_' +\
                 # vn['ANN[*].FEATUREID']
                 # vn['ANN[*].EFFECT'] + '_' +\
                 # vn['ANN[*].IMPACT']
    vn['var_id'] = vn.ind_id.astype(str) + '_' +\
                 vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str)
    vn['chr_pos'] = vn['CHROM'].astype(str) + '_' +\
                 vn.POS.astype(str)
    vn['chr_pos_allel_tr'] = vn['CHROM'].astype(str) + '_' +\
                             vn.POS.astype(str) + '_' +\
                             vn.REF.astype(str) + '_' +\
                             vn.ALT.astype(str) + '_' +\
                             vn['ANN.FEATUREID'].astype(str)

    # if VEP anno is refseq based, translate to ensembl
    if vep.Feature.str.startswith('NM').sum() > 0 or\
                vep.Feature.str.startswith('XM').sum() > 0:
        ens_refseq = pandas.read_csv(cfg['ens_refseq'])
        vep['vep_transcript'] = vep.Feature.apply(
                lambda i: i.split('.')[0])
        vep = vep.merge(ens_refseq[['enst', 'nm']],
                        how='left',
                        left_on='vep_transcript',
                        right_on='nm')
    else:
        vep['enst'] = vep.Feature
    vep['chr_pos'] = vep['CHROM'].astype(str) + '_' +\
                             vep.POS.astype(str)
    vep['chr_pos_allel'] = vep['CHROM'].astype(str) + '_' +\
                             vep.POS.astype(str) + '_' +\
                             vep.REF.astype(str) + '_' +\
                             vep.ALT.astype(str)
    # join all transcripts
    vep_by_var = vep.groupby('chr_pos').apply(func.mergeFieldsForVariant).to_frame()
    vep_by_var.reset_index(inplace=True)
    vep_by_var.columns = ['chr_pos', 'HGVSc;Exon;Intron;HGVSp']
    vep['chr_pos_allel_tr'] = vep['CHROM'].astype(str) + '_' +\
                             vep.POS.astype(str) + '_' +\
                             vep.REF.astype(str) + '_' +\
                             vep.ALT.astype(str) + '_' +\
                             vep.enst.astype(str)
    print(vn.chr_pos_allel_tr)
    print(vep.chr_pos_allel_tr)
    print('vn dim before merging with vep:')
    print(vn.shape)
    vn = vn.merge(vep, how='left',
                  left_on='chr_pos_allel_tr',
                  right_on='chr_pos_allel_tr',
                  suffixes=['', '_vep'])
    vn = vn.merge(vep_by_var, how='left',
                  left_on='chr_pos',
                  right_on='chr_pos',
                  suffixes=['', '_vep'])
    # vn = vn.merge(kv_vcf[['var_id', 'status']], on='var_id', how='left')
    print('vn dim after merging with vep:')
    print(vn.shape)
    print('before dedup')
    print(vn.shape)
    vn_dups = vn[vn.v_id.duplicated()]
    vn = vn[~vn.v_id.duplicated()]
    print('after dedup')
    print(vn.shape)

#    vn_all = vn
# stats before any filtering
#    print('\ndeduped and annotated vars, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('deduped and annotated vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='deduped metrics')
#    vn_full = vn

    var_freq = vn.groupby('chr_pos').apply(lambda x: len(x['ind_id'].unique()))
    c_cohort_freq = var_freq > cfg['max_cohort_freq']
    var_freq_2 = var_freq[c_cohort_freq]
    # vn = vn[~vn.chr_pos.isin(var_freq_2.index)]
    vn['c_cohort_freq'] = ~vn.chr_pos.isin(var_freq_2.index)
#    print('\ncohort freq vars, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('cohort freq vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='cohort_freq')
#    vn_diff =  getDiff(vn_full, vn, msg='cohort_freq')

    vn.ix[:, 'effect_cat'] = 'other'
    vn.ix[vn['ANN.EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_synon'])), 'effect_cat'] = 'syn'
    vn.ix[vn['ANN.EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_dmgmis'])), 'effect_cat'] = 'mis'
    vn.ix[vn['ANN.EFFECT'].str.contains(
        '|'.join(cfg['snpeff']['effect_lof'])), 'effect_cat'] = 'lof'
    vn['c_effect_cat'] = ~vn.effect_cat.isin(['other'])
#    print(vn.shape)
#    vn_full = vn
#    vn = vn.dropna(subset=['effect_cat'], axis=0)
#    print(vn.shape)

#    print('\neffects of interest vars, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('effects of interest vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='effects metrics')
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='effects')])

    vn.ix[vn.dbNSFP_1000Gp3_AF.isin(['.']), 'dbNSFP_1000Gp3_AF'] = '0'
    vn.ix[vn.dbNSFP_ExAC_AF.isin(['.']), 'dbNSFP_ExAC_AF'] = '0'
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace('ZZZ', '0')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace('ZZZ', '0')
    # escape the dots so they are matched literally, not as regex wildcards
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace(r',\.', ',0')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace(r',\.', ',0')
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.str.replace(r'\.,', '0,')
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.str.replace(r'\.,', '0,')
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.apply(lambda x: min(map(float, x.split(','))))
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.apply(lambda x: min(map(float, x.split(','))))
    vn.ix[:, 'dbNSFP_1000Gp3_AF'] = vn.dbNSFP_1000Gp3_AF.astype(float)
    vn.ix[:, 'dbNSFP_ExAC_AF'] = vn.dbNSFP_ExAC_AF.astype(float)

#    vn_full = vn

    vn['c_pop_freq'] = (vn.dbNSFP_1000Gp3_AF < cfg['population_AF']) &\
                       (vn.dbNSFP_ExAC_AF < cfg['population_AF'])
#    vn = vn[(vn.dbNSFP_1000Gp3_AF < cfg['population_AF']) &
#            (vn.dbNSFP_ExAC_AF < cfg['population_AF'])]
#    print('\nAF vars, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('AF vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='AF metrics')
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='pop_freq')])

#    vn_full = vn

    vn['c_biotype'] = vn['ANN.BIOTYPE'].str.contains(
        '|'.join(cfg['snpeff']['biotype']))
#    vn = vn[vn['ANN.BIOTYPE'].str.contains('|'.join(cfg['snpeff']['biotype']))]

#    print('\nprotein coding vars, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('protein coding vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='protein coding metrics')
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='protein')])

#    vn_full = vn
    vn['allele_frac'] = None # vn.alt_DP.astype(float)/vn.DP
    vn['c_allele_frac'] = None # (vn.allele_frac > cfg['alt_allele_frac_range'][0]) &\
                        #  (vn.allele_frac < cfg['alt_allele_frac_range'][1])
#    vn = vn[(allele_frac > cfg['alt_allele_frac_range'][0]) & (allele_frac < cfg['alt_allele_frac_range'][1])]

#    print('\nallele fraction, pred_labels value_counts:')
#    print(vn.pred_labels.value_counts())
#    print('allel fraction vars, test_labels value_counts:')
#    print(vn.status.value_counts())
#    calcMetr(vn, msg='all fraction metrics')
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn, msg='allele_frac')])

    vn = vn.replace('ZZZ', '.')
#    if vn.empty:
#        print('No de novo mutation of interest')
#        return 0
    print('vn shape')
    print(vn.shape)
    vn['FILTER'] = None # vn.apply(func.getFieldFromVCF, args=(myped,), axis=1)
#    vn = vn[~vn.FILTER.isnull()]
    print('vn shape in vcf')
    print(vn.shape)

    non_coding_vars = ['intron_variant', 'downstream_gene_variant',
                       'upstream_gene_variant', 'sequence_feature',
                       '5_prime_UTR_variant', '3_prime_UTR_variant']
    vn['coding_var'] = True
    for i in non_coding_vars:
        vn.ix[vn['ANN.EFFECT'] == i, 'coding_var'] = False

    c_missense = vn['effect_cat'] == 'mis'
    c_lof = vn['effect_cat'] == 'lof'
    c_syn = vn['effect_cat'] == 'syn'
    c_other = vn['effect_cat'] == 'other'
    vn['c_missense'] = c_missense
    vn['c_lof'] = c_lof
    vn['c_syn'] = c_syn

    c_M_CAP_D = vn.dbNSFP_M_CAP_pred.str.contains(
        '|'.join(cfg['db_nsfp']['M_CAP_pred']))
    c_metaSVM_D = vn.dbNSFP_MetaSVM_pred.str.contains(
        '|'.join(cfg['db_nsfp']['metaSVM_pred']))
#    c_metaSVM_null = vn.dbNSFP_MetaSVM_pred.isin(['ZZZ', '.'])

    c_cadd_null = vn.dbNSFP_CADD_phred.isin(['ZZZ', '.'])
    vn.ix[c_cadd_null, 'dbNSFP_CADD_phred'] = 0
    vn.ix[:, 'dbNSFP_CADD_phred'] = vn.dbNSFP_CADD_phred.astype(
        str).str.replace(r',\.', ',0')
    vn.ix[:, 'dbNSFP_CADD_phred'] = vn.dbNSFP_CADD_phred.astype(
        str).str.replace(r'\.,', '0,')
    c_cadd_D = vn.dbNSFP_CADD_phred.astype(str).apply(
        lambda x: max(map(float, x.split(',')))) >=\
        cfg['db_nsfp']['cadd_phred']
    c_cadd_15 = vn.dbNSFP_CADD_phred.astype(str).apply(
        lambda x: max(map(float, x.split(',')))) >=\
        cfg['db_nsfp']['combined']['cadd_phred']

    c_poly_HVAR_D = vn.dbNSFP_Polyphen2_HVAR_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['polyphen2_pred']))
    # NOTE: as in the previous example, the HDIV condition reads the HVAR
    # column; dbNSFP_Polyphen2_HDIV_pred may have been intended
    c_poly_HDIV_D = vn.dbNSFP_Polyphen2_HVAR_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['polyphen2_pred']))

#    c_sift_null = vn.dbNSFP_SIFT_pred.isin(['ZZZ', '.'])
    c_sift_D = vn.dbNSFP_SIFT_pred.str.contains(
        '|'.join(cfg['db_nsfp']['combined']['sift_pred']))
#   c_new = (vn.pred_labels == 1) & (~vn.status.isin(['Y']))

    c_dmg_miss = c_M_CAP_D | c_metaSVM_D | c_cadd_D |\
                 ((c_poly_HDIV_D | c_poly_HVAR_D) & c_sift_D & c_cadd_15)
    print('N dmg_mis w/o M_CAP %s' % sum(c_metaSVM_D | c_cadd_D |
                                         ((c_poly_HDIV_D | c_poly_HVAR_D) &
                                          c_sift_D & c_cadd_15)))
    print('N dmg_mis w M_CAP %s' % sum(c_M_CAP_D | c_metaSVM_D | c_cadd_D |
                                       ((c_poly_HDIV_D | c_poly_HVAR_D) &
                                        c_sift_D & c_cadd_15)))
    vn['c_dmg_miss'] = c_dmg_miss
    vn['c_dmg_miss_woMCAP'] = c_metaSVM_D | c_cadd_D |\
                              ((c_poly_HDIV_D | c_poly_HVAR_D) &
                               c_sift_D & c_cadd_15)

#   vn_full = vn[c_missense]
    c_impact_lof = vn['ANN.IMPACT'].str.contains(
        '|'.join(cfg['snpeff']['impact_lof']))
    vn['c_impact_lof'] = c_impact_lof

    c_all_denovo = vn.c_cohort_freq & vn.c_pop_freq
    c_prev = vn.c_cohort_freq &\
             vn.c_pop_freq
#             vn.c_allele_frac
             # vn.c_effect_cat &\
#             vn.c_biotype &\
    print('sum(c_prev)')
    print(sum(c_prev))
    c_spark_genes = vn['ANN.GENE'].str.contains(
        '|'.join(cfg['snpeff']['genes']))
    vn['c_spark_genes'] = c_spark_genes
    vn_mis = vn[c_dmg_miss & c_missense & c_prev]
    vn_mis_clinical = vn[c_dmg_miss & c_missense & c_prev & c_spark_genes]
    print('shape vn_mis')
    print(vn_mis.shape)
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn_mis, msg='dmg_miss')])
#    vn_full = vn[c_lof]
    vn_lof = vn[c_lof & c_impact_lof & c_prev]
    vn_lof_clinical = vn[c_lof & c_impact_lof & c_prev & c_spark_genes]
#    vn_diff = pandas.concat([vn_diff, getDiff(vn_full, vn_lof, msg='impact_lof')])
    vn_lof_dmis_clinical = pandas.concat([vn_lof_clinical, vn_mis_clinical])

    vn_syn = vn[c_syn & c_prev]
    vn_syn_clinical = vn[c_syn & c_prev & c_spark_genes]

    vn_other = vn[c_other & c_prev]
    vn_other_clinical = vn[c_other & c_prev & c_spark_genes]
#    print(vn.shape)

    c_FN = (vn.pred_labels == 0) & vn.status.isin(['Y'])
    vn_FN = vn[cols_to_output][c_FN]
    #vn_FN = vn_FN[~vn_FN.v_id.duplicated()]

    c_TP = (vn.pred_labels == 1) & vn.status.isin(['Y'])
    vn_TP = vn[cols_to_output][c_TP]
    #vn_TP = vn_TP[~vn_TP.v_id.duplicated()]

    var_type = cfg['variant_type']
    outp_suffix = '' # '{:%Y-%m-%d_%H-%M-%S-%f}'.format(datetime.datetime.now())

    def writeVariants(df, cols_to_output, var_type, prefix, suffix, outp_dir):
        if df.empty:
            print('%s is empty' % prefix)
            return None
        df = df[cols_to_output]
        df.columns = map(lambda i: i[2:] if i.startswith('c_') else i,
                         df.columns)
        df.to_csv(os.path.join(outp_dir,
                               '_'.join([prefix,
                                         var_type,
                                         suffix]) + '.csv'),
                  index=False)

    writeVariants(vn[c_all_denovo], cols_to_output + extra_cols, var_type,
                  prefix, 'ALL_DENOVO', outp_dir)
    writeVariants(vn[vn.c_biotype], cols_to_output + extra_cols, var_type,
                  prefix, 'ALL_DENOVO_CODING', outp_dir)
    writeVariants(vn_FN, cols_to_output + extra_cols, var_type, prefix,
                  'FN', outp_dir)
    writeVariants(vn_TP, cols_to_output + extra_cols, var_type, prefix,
                  'TP', outp_dir)
    writeVariants(vn_mis, cols_to_output + extra_cols, var_type, prefix,
                  'MIS', outp_dir)
    writeVariants(vn_lof, cols_to_output + extra_cols, var_type, prefix,
                  'LOF', outp_dir)
    writeVariants(vn_syn, cols_to_output + extra_cols, var_type, prefix,
                  'SYN', outp_dir)
    writeVariants(vn_other, cols_to_output + extra_cols, var_type, prefix,
                  'OTHER', outp_dir)
    writeVariants(vn_mis_clinical, cols_to_output + extra_cols, var_type,
                  prefix + '_MIS', 'clinical', outp_dir)
    writeVariants(vn_lof_clinical, cols_to_output + extra_cols, var_type,
                  prefix + '_LOF', 'clinical', outp_dir)
    writeVariants(vn_lof_dmis_clinical, cols_to_output + extra_cols, var_type,
                  prefix + '_LOF_DMIS', 'clinical', outp_dir)
    writeVariants(vn_syn_clinical, cols_to_output + extra_cols, var_type,
                  prefix + '_SYN', 'clinical', outp_dir)
    writeVariants(vn_other_clinical, cols_to_output + extra_cols,
                  var_type, prefix + '_OTHER', 'clinical', outp_dir)


#    writeVariants(vn_diff, cols_to_output[:-2]+['step'], var_type,
#                  prefix + '_DIFF', outp_suffix, outp_dir)

#    vn_TP[cols_to_output[:-2]].to_csv(
#        os.path.join(outp_dir, 'true_pos_snp' + outp_suffix + '.csv'), index=False)
#    vn_mis[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'dmg_missense' + outp_suffix + '.csv'), index=False)
#    vn_lof[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'lof' + outp_suffix + '.csv'), index=False)
#    vn_syn[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False)
#    vn_diff[cols_to_output[:-2] + ['step']].to_csv(
#        os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False)

    cfg['predictions_file'] = infile

    with open(os.path.join(outp_dir, 'cfg' + outp_suffix + '.yml'), 'w') as f:
        yaml.dump(cfg, f, default_flow_style=False)
    return vn[c_all_denovo]
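Both versions of summarizeMutations repeat the same normalization of the dbNSFP allele-frequency columns. A sketch of that logic factored into a helper, assuming missing values are encoded as '.' or 'ZZZ' and multi-allelic entries are comma-separated:

import pandas

def clean_af_column(s):
    """Normalize a dbNSFP AF column to floats: treat '.' and 'ZZZ' as 0 and
    keep the minimum over comma-separated per-allele values."""
    s = s.astype(str).str.replace('ZZZ', '0', regex=False)
    s = s.str.replace(r',\.', ',0', regex=True)
    s = s.str.replace(r'\.,', '0,', regex=True)
    s = s.replace('.', '0')
    return s.apply(lambda x: min(float(v) for v in x.split(',')))

# e.g. vn['dbNSFP_ExAC_AF'] = clean_af_column(vn['dbNSFP_ExAC_AF'])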
Example #5
                lambda i: get_spID(i, labID2spID))
        for clr in callers:
            ped_by_btch = {}
            for batch_dir in batch_dirs:
                if sf_seq_center == 'bay':
                    if batch_dir == 'b1-2':
                        path_toPed = '/mnt/xfs1/scratch/asalomatov/data/SPARK/ped/spark_%s.ped' % clr
                    else:
                        path_toPed = '/mnt/xfs1/scratch/asalomatov/data/SPARK/ped/spark_spID_%s_ext_%s.ped' % (
                            batch_dir, clr)
                elif sf_seq_center == 'reg':
                    path_toPed = '/mnt/ceph/users/asalomatov/regeneron_spark_pilot/ped/spark_%s_ext_%s.ped' % (
                        batch_dir, clr)
                else:
                    sys.exit('unknown seq_center')
                myped = ped.Ped(path_toPed, ['bam', 'vcf'])
                ped_by_btch[batch_dir] = myped
                # check if variants are present in vcf files
            other_calls['in_vcf_' + clr] = other_calls.apply(
                lambda row: isInVcf(row, ped_by_btch[row['batch']]), axis=1)
            other_calls['is_dn_' + clr] = other_calls.apply(
                lambda row: isDeNovo(row, clr), axis=1)

        other_calls['inVCF'] = isInVcfSNP(other_calls)
        other_calls['isDeNovo'] = isDeNovoSNP(other_calls)
        other_calls.to_csv(os.path.join(
            sf_calls_dir, 'other_calls_' + '_'.join(batch_dirs) + '.csv'),
                           index=False)

        sys.exit('done, annotate other calls')
Example #6
    genome_b = 'hg19'
elif args.genome_build in [38]:
    genome_b = 'genome.fa'
else:
    sys.exit('Unknown genome build, use 19, 37, or 38')
func.runInShell('mkdir -p ' + output_dir)

igv_inp = pandas.read_csv(input_file, dtype=str)
#igv_inp['centers'] = 'SF'
#igv_inp = pandas.read_table(input_file, dtype=str)
#for reg
if args.genome_build in [38]:
    igv_inp.SP_id = igv_inp.ind_id
    igv_inp.lab_id = igv_inp.ind_id

myped = ped.Ped(args.ped_file, ['bam', 'vcf'])
#myped = ped.Ped(args.ped_file, ['batch', 'bam'])


tmpl1 = """

new
genome %(genome_b)s
snapshotDirectory %(output_dir)s
load %(sample_bam)s
goto %(chr_pos)s
snapshot %(sample_snapshot_name)s

"""

tmpl3 = """
Example #7
    builds 19, 37, 38 are supported.')
    sys.exit('Only builds 19, 37, 38 are supported')

logging.info('Using %s as pipeline config file' % incl_make)

# create output dirs
func.makeDir(output_dir)
# create temp dir 
tmp_dir = tempfile.mkdtemp()
logging.info('working dir is %s' % tmp_dir)
# get path to script
script_name = os.path.abspath(
    pkg_resources.resource_filename('variants', 'vep_snpeff_for_vcf.sh'))

# populate ped DF
myped = ped.Ped(ped_file_extended, cfg['ped_extra_clmns'])
f = features.Features(myped, None)

# trio has to be complete with no file missing
if not f.initTrioFor(child_id):
    logging.critical('\nfailed to initialize trio for ' + child_id)
    sys.exit(1)

sys.stdout.write('\ninitialized trio for ' + child_id)
sys.stdout.write('\n')
logging.info('father and mother: ' +
             ' '.join([f.father_id, f.mother_id]))
# extract just the trio, drop non variant loci
# annotate, and filter based on AF
vrs = variants.Variants(f.sample_vcf, f.family_id)
vrs.readVcfToDF(sample_list=[f.sample_id, f.father_id, f.mother_id])
Example #8
import variants
import ped
import func
import features_vcf as fv
import pandas as pd
import numpy as np
import os
import sklearn

# reload after editing the modules (Python 2 interactive session)
variants = reload(variants)
func = reload(func)
ped = reload(ped)

###SSC
infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
myped = ped.Ped(infile_ped, ['collection'])
myped.getParents(11006)
#myped.addVcf()
myped.addVcf(file_pat = '/mnt/scratch/asalomatov/data/SSC/vcf/raw/%s.family.vqsr.sorted.vcf.gz')
myped.ped.head()

###Columbia
infile_ped = '/mnt/scratch/asalomatov/data/columbia/pcgc_ped.txt'
myped = ped.Ped(infile_ped, [])
myped.getParents('1-00034')
myped.getFather('1-00034')
# myped.  (incomplete attribute access left over from the interactive session)
myped.ped.head()
myped.ped.shape
myped.addVcf(file_pat = '/mnt/scratch/asalomatov/data/columbia/vcf/deco/%s_%s-02_%s-01.annotated-deco.vcf.gz')
sum(myped.ped.vcf.notnull())
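Building on the Ped methods shown in these examples, a small sketch that collects each proband's trio and their VCF paths into one table; getAllProbands, getChildsFather, and getChildsMother appear in Example #2, and the 'vcf' column is added by addVcf above:

import pandas as pd

def trio_vcf_table(myped):
    """Return one row per proband with child/father/mother ids and VCFs."""
    rows = []
    for child in myped.getAllProbands():
        fa = myped.getChildsFather(child)
        mo = myped.getChildsMother(child)
        sub = myped.ped[myped.ped.ind_id.isin([child, fa, mo])]
        if sub.shape[0] != 3 or sub.vcf.isnull().any():
            continue  # skip incomplete trios
        vcf_by_id = dict(zip(sub.ind_id, sub.vcf))
        rows.append({'child': child, 'father': fa, 'mother': mo,
                     'child_vcf': vcf_by_id[child],
                     'father_vcf': vcf_by_id[fa],
                     'mother_vcf': vcf_by_id[mo]})
    return pd.DataFrame(rows)

# e.g. trios = trio_vcf_table(myped)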