from util import Timer

'''
This script is the baseline that we later compare to.
It loops over genes and performs score-tests, using pre-computed indicator variables (0/1)
'''

# Covariates / phenotype loader; every path and column name comes from the
# Snakemake rule that runs this script.
covariatesloader = CovariatesLoaderCSV(
    snakemake.params.phenotype,
    snakemake.input.covariates_tsv,
    snakemake.params.covariate_column_names,
    sep='\t',
    path_to_phenotypes=snakemake.input.phenotypes_tsv,
)

# Burden loaders, one for the LOF inputs and one for the missense inputs.
bloader_lof = BurdenLoaderHDF5(
    snakemake.input.h5_lof,
    snakemake.input.iid_lof,
    snakemake.input.gid_lof,
)
bloader_missense = BurdenLoaderHDF5(
    snakemake.input.h5_missense,
    snakemake.input.iid_missense,
    snakemake.input.gid_missense,
)

# Hand both burden loaders the individual IDs reported by the covariates
# loader, so that all three use the same individuals in the same order.
bloader_lof.update_individuals(covariatesloader.get_iids())
bloader_missense.update_individuals(covariatesloader.get_iids())

# Gene names to iterate over: union of the IDs known to either burden loader.
genes = np.union1d(bloader_lof.get_vids(), bloader_missense.get_vids())
# Wrap a bare string so that later iteration yields gene names, not characters.
# NOTE(review): np.union1d returns an ndarray, so this guard looks like it can
# never trigger - confirm before removing it.
genes = [genes] if isinstance(genes, str) else genes

# Inputs for the null model.
# NOTE(review): the meaning of the 'NoK' argument is defined by
# CovariatesLoaderCSV and is not visible here - confirm against that class.
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
regions.columns = ['chrom', 'start', 'end', 'name'] # discard all genes for which we don't have annotations regions['gene'] = regions.name.str.split('_', expand=True)[0] regions.set_index('gene', inplace=True) genes = intersect_ids(np.unique(regions.index.values), np.unique(eveploader.pos_df.gene)) regions = regions.loc[genes].reset_index() regions = regions.sort_values(['chrom','start','end'])[['chrom','start','end','name','gene']] # set up the variant loader (missense variants) for the chromosome plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) plinkloader.update_variants(eveploader.get_vids()) plinkloader.update_individuals(covariatesloader.get_iids()) # set up the protein LOF burden loader bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) bloader_lof.update_individuals(covariatesloader.get_iids()) # set up local collapsing collapser = LocalCollapsing(distance_threshold=1.) # set up the missense genotype + vep loading function def get_missense(interval): try: V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0]) except KeyError: raise GotNone if V1.index.empty: raise GotNone