def annotateVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
                 vcfdir=bsmutils.get_bsmdir() + '/results/calls/'):
    '''
    Run annotateVCF on every sample of a VCF list file

    Arguments
    vcflistpath: path to a headerless, tab-separated file whose two columns
        are read as 'sample' and 'file'
    vcfdir: directory whose 'filtered' subdirectory holds the input VCFs and
        whose 'annotated' subdirectory is passed on as the target directory

    Value: a list holding the return value of annotateVCF for each sample,
        in the order of the list file
    '''
    samples = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'],
                          index_col='sample')
    sep = os.path.sep
    results = []
    for sample in samples.index:
        source_vcf = vcfdir + sep + 'filtered' + sep + samples.loc[sample, 'file']
        # the trailing separator is kept on the target directory
        annotated_dir = vcfdir + sep + 'annotated' + sep
        results.append(annotateVCF(invcf=source_vcf, sample=sample,
                                   targetdir=annotated_dir))
    return results
def annotateVCF(invcf=bsmutils.get_bsmdir() + '/results/calls/filtered/MSSM_106_brain.ploidy_50.filtered.vcf',
                sample='MSSM_106_NeuN_pl',
                targetdir=bsmutils.get_bsmdir() + '/results/calls/annotated/'):
    '''
    Run the annotate-vcf-bsm script on a single VCF

    Arguments
    invcf: path to the input VCF
    sample: sample name, given to the script as its last argument
    targetdir: directory given to the script through its -t option

    Value: the subprocess.CompletedProcess of the run with stdout and stderr
        captured; the exit status is not checked here
    '''
    executable = bsmutils.get_bsmdir() + '/src/annotate-vcf-bsm'
    return subprocess.run([executable, '-t', targetdir, invcf, sample],
                          capture_output=True)
def get_multi_annotations(annotlist,
                          vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs-Chess-Walsh.tsv',
                          annotdirpath=bsmutils.get_bsmdir() + '/results/2020-09-07-annotations',
                          na_values={}, simplecolumns=True):
    '''
    Read several annotation types for all samples into one DataFrame

    Arguments
    annotlist: the annotation types; each is read from <annotdirpath>/<sample>/<type>.txt
    vcflistpath: path to a headerless, tab-separated file of samples and VCF files
    annotdirpath: directory containing one subdirectory per sample
    na_values: dictionary of annotation type -> NA values for the reader;
        types missing from it get an empty list
    simplecolumns: passed on to read_TXT_per_annotation

    Value: the per-type tables (samples stacked row-wise) concatenated column-wise
    '''
    vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'],
                          index_col='sample')
    sample_pattern = '((MSSM|PITT)_[0-9]+)_(NeuN_pl|NeuN_mn|muscle)'

    def read_one(sample, annotyp):
        # one sample, one annotation type; None when reading raises ValueError
        tsvpath = annotdirpath + os.path.sep + sample + os.path.sep + annotyp + '.txt'
        indivID = re.sub(sample_pattern, 'CMC_\\1', sample)
        if re.match(sample_pattern, sample):
            tissue = re.sub(sample_pattern, '\\3', sample)  # Chess data
        else:
            tissue = 'frontal cortex'  # Walsh data
        na_val = na_values.get(annotyp, [])
        try:
            table = read_TXT_per_annotation(tsvpath, indivID, tissue,
                                            simplecolumns=simplecolumns,
                                            na_values=na_val)
            return annotation_duplicates(table, sep=':')
        except ValueError:
            return None

    per_type = []
    for annotyp in annotlist:
        try:
            stacked = pd.concat([read_one(s, annotyp) for s in vcflist.index],
                                axis=0)
        except ValueError:
            # raised e.g. when no sample yielded a table for this type
            stacked = None
        per_type.append(stacked)
    return pd.concat(per_type, axis=1)
def create_colsdict():
    '''
    Create input dictionary for regularize_categ_cols

    Value: a dictionary mapping a column name to a list of its values; the
        'regbuild_Epigenome' list is read from the regbuild-epigenomes file,
        one whitespace-stripped entry per line
    '''
    epigenomes_path = bsmutils.get_bsmdir() + \
        '/results/2020-09-07-annotations/regbuild-epigenomes'
    with open(epigenomes_path) as f:
        regbuild_epigenomes = [line.strip() for line in f]
    return {
        # order reflecting severity of effect
        'sift_Prediction': [
            'Deleterious', 'Deleterious - Low Confidence',
            'Tolerated', 'Tolerated - Low Confidence',
        ],
        # order reflecting increasing frequency of categories in the data set
        'encode_Feature Type Class': [
            'Polymerase', 'Open Chromatin', 'Transcription Factor', 'Histone',
        ],
        'ensembl_Predicted Function': [
            'intronic (splice_site)', 'coding', 'intronic', '5utr', '3utr',
            '5upstream', '3downstream', 'non-coding intronic', 'non-coding',
        ],
        'regbuild_Epigenome': regbuild_epigenomes,
        'structvar_Type': ['complex', 'loss', 'gain'],
    }
def load_data(picklepath=bsmutils.get_bsmdir() + '/results/2020-09-07-annotations/annotated-calls.p'):
    '''
    Load annotated calls from pickle file

    Arguments
    picklepath: path to the pickle file; unpickling executes code embedded
        in the file, so only trusted files should be loaded

    Value: the unpickled object
    '''
    with open(picklepath, 'rb') as picklefile:
        return pickle.load(picklefile)
def readVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
             vcfdir=bsmutils.get_bsmdir() + '/results/calls/', clean=True):
    '''
    Reads the calls/records of several VCFs into rows of a single DataFrame

    Arguments
    vcflistpath: path to file listing all VCFs
    vcfdir: the directory of the VCFs; files are read from its 'annotated'
        subdirectory
    clean: whether to remove redundant & degenerate columns

    Value:
    calls: a pandas DataFrame
    '''
    vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'],
                          index_col='sample')
    vcflist['filepath'] = [vcfdir + os.sep + 'annotated' + os.sep + fname
                           for fname in vcflist['file']]
    frames = list(map(readVCF, vcflist['filepath']))
    calls = pd.concat(frames, axis=0)
    if not clean:
        return calls
    return clean_calls(calls, dropna=True, dropdegenerate=True,
                       dropredundant=True)
def do_annot(annotlist=annotlist, na_values=na_values, colsdict=create_colsdict(),
             fpath=bsmutils.get_bsmdir() + '/results/2020-09-07-annotations/annot.p',
             calls=individuals.get_datasets()):
    '''
    Main function: read SNPnexus annotations for the full Chess and Walsh datasets

    The result is cached: if fpath exists it is unpickled and returned,
    otherwise the annotations are read with get_multi_annotations and then
    pickled to fpath.

    Arguments
    annotlist: annotation types, passed on to get_multi_annotations
    na_values: NA values per annotation type, passed on to get_multi_annotations
    colsdict: not used by this function (kept for interface compatibility)
    fpath: path of the pickle cache file
    calls: not used by this function (kept for interface compatibility)

    Value: the annotations as returned by get_multi_annotations or as
        unpickled from fpath
    '''
    if os.path.exists(fpath):
        print('loading annot DataFrame from', fpath)
        # NOTE: unpickling runs code stored in the file; fpath must be trusted
        with open(fpath, 'rb') as f:
            return pickle.load(f)
    vcflistpath = bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs-Chess-Walsh.tsv'
    annotdirpath = bsmutils.get_bsmdir() + '/results/2020-09-07-annotations'
    annot = get_multi_annotations(annotlist, vcflistpath, annotdirpath, na_values)
    # the context manager closes (and thereby flushes) the cache file; the
    # previous bare open() left the handle to the garbage collector
    with open(fpath, 'wb') as f:
        pickle.dump(annot, f)
    return annot
def read_annotlist(annotpath=bsmutils.get_bsmdir() + '/tables/VCF-HC.annotations',
                   withFORMAT=False):
    '''
    Reads a file containing list of annotations in VCFs into a list.

    Parameters
    annotpath: the path to the aforementioned file, one annotation per line
    withFORMAT: if False (default) the annotations starting with FORMAT are
        omitted

    Value: the list of annotations
    '''
    with open(annotpath) as f:
        # drop the newline characters
        annotations = [line.replace('\n', '') for line in f]
    if withFORMAT:
        return annotations
    return [a for a in annotations if not a.startswith('FORMAT')]
def read_clinical(ancestry=True):
    '''
    Read the clinical metadata table (CMC_Human_clinical_metadata.csv)

    If the file is missing at cmc_clinical_path it is first fetched from
    Synapse (entity syn2279441) into the resources directory.

    Arguments
    ancestry: if True, the ancestry table at cmc_ancestry_path (without its
        'Genotyping_Sample_ID' and 'Cluster' columns) is appended column-wise

    Value: a DataFrame indexed by 'Individual ID'
    '''
    if os.path.exists(cmc_clinical_path):
        fpath = cmc_clinical_path
    else:
        # imported lazily: only needed when the file must be downloaded
        import synapseclient
        syn = synapseclient.login()
        entity = syn.get('syn2279441',
                         downloadLocation=bsmutils.get_bsmdir() + '/resources/',
                         ifcollision='overwrite.local')
        fpath = entity.path
    clinical = pd.read_csv(fpath, index_col='Individual ID')
    if not ancestry:
        return clinical
    ancestry_table = pd.read_csv(cmc_ancestry_path, sep='\t',
                                 index_col='Individual_ID')
    ancestry_table = ancestry_table.drop(columns=['Genotyping_Sample_ID', 'Cluster'])
    return pd.concat([clinical, ancestry_table], axis=1)
targetdir=bsmutils.get_bsmdir() + '/results/calls/annotated/'): ''' Some help would be nice ''' script = bsmutils.get_bsmdir() + '/src/annotate-vcf-bsm' cmd = [script, '-t', targetdir, invcf, sample] p = subprocess.run(cmd, capture_output=True) return(p) def annotateVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv', vcfdir=bsmutils.get_bsmdir() + '/results/calls/'): vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'], index_col='sample') def helper(sample): invcf = vcfdir + os.path.sep + 'filtered' + os.path.sep + vcflist.loc[sample, 'file'] targetdir = vcfdir + os.path.sep + 'annotated' + os.path.sep val = annotateVCF(invcf=invcf, sample=sample, targetdir=targetdir) return(val) pp = [helper(y) for y in vcflist.index] return(pp) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('-d', '--dir', help='main VCF directory (bsm/results/calls/)', default=bsmutils.get_bsmdir() + '/results/calls/') parser.add_argument('-l', '--vcflist', help='list of samples and VCF files (bsm/results/calls/filtered-vcfs.tsv)', default=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv') args = parser.parse_args() annotateVCFs(vcflistpath=args.vcflist, vcfdir=args.dir) readVCFs(vcflistpath=args.vcflist, vcfdir=args.dir)
import scipy.stats import numpy as np import pandas as pd import os.path from bsmcalls import readVCF from bsmcalls import preprocessing import bsmutils cmc_clinical_synid = 'syn2279441' cmc_clinical_path = bsmutils.get_bsmdir( ) + '/resources/CMC_Human_clinical_metadata.csv' cmc_ancestry_path = bsmutils.get_bsmdir( ) + '/resources/cmc-ancestry/CMC_MSSM-Penn-Pitt_DNA_GENOTYPE_ANCESTRY_GemTools.tsv' walsh_gsub_path = bsmutils.get_bsmdir( ) + '/resources/walsh-manifests/genomics_subject02_template_WalshParkASD-corr.csv' walsh_vcfs_path = bsmutils.get_bsmdir( ) + '/results/calls/filtered-vcfs-Walsh.tsv' chess_vcfs_path = bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv' v1 = [ 'AF', 'ALT', 'BaseQRankSum', 'DP', 'FILTER/PASS', 'FS', 'GWASpval', 'REF', 'ReadPosRankSum', 'SOR', 'VQSLOD', 'chromatinState_DLPFC', 'culprit', 'szdbCNVcount' ] v2 = ['Dx', 'AntipsychAtyp', 'AntipsychTyp', 'Institution', 'EV.3'] def read_clinical(ancestry=True): # CMC_Human_clinical_metadata.csv if not os.path.exists(cmc_clinical_path): import synapseclient
def get_geneset(df=None, col='Gene(s) tagged'):
    '''
    Collect the set of genes listed in a column of comma separated gene names

    Arguments
    df: a DataFrame with a column named col; if None (default) the table
        is read from resources/CLOZUK/supp-table-4.csv (skipping 7 rows)
        when the function is called rather than at import time
    col: name of the column whose cells hold gene names separated by ', '

    Value: the set of gene names; empty if the column has no non-missing cell
    '''
    if df is None:
        df = pd.read_csv(
            bsmutils.get_bsmdir() + '/resources/CLOZUK/supp-table-4.csv',
            skiprows=7)
    # honour col (it used to be ignored in favour of the hard-coded default)
    gene_lists = df[col].str.split(', ').dropna()
    # flattening with a comprehension also covers the empty case, where
    # summing the lists would give 0 instead of a list
    return {gene for genes in gene_lists for gene in genes}
import pandas as pd import numpy as np import bsmutils roadmap_rna_bname = bsmutils.get_bsmdir( ) + '/resources/roadmap-epigenomics/rna/expression/57epigenomes.' proteinatlas_rna_bname = bsmutils.get_bsmdir( ) + '/resources/proteinatlas/expression/tissue_category_rna_brain_' def read_roadmap_rna(kind='RPKM', sampledict={ 'E071': 'BRN.HIPP.MID', 'E082': 'BRN.FET.F' }, suffix=False): if suffix: sampledict = dict( zip(sampledict.keys(), [x + '_' + kind for x in sampledict.values()])) fpath = roadmap_rna_bname + kind + '.pc' df = pd.read_csv(fpath, sep='\t', index_col=0, usecols=sampledict.keys()) df = df.rename(sampledict, axis=1) return (df) def read_roadmap_rna_RPKM_N(sampledict={ 'E071': 'BRN.HIPP.MID', 'E082': 'BRN.FET.F' }): l = [read_roadmap_rna(k, sampledict, suffix=True) for k in ['RPKM', 'N']]