def run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
        pheno_filename, anno_filename, prsFile, minimum_test_samples=10,
        relatedness_score=0.95, skipAutosomeFiltering=False, snps_filename=None,
        feature_filename=None, snp_feature_filename=None, selection='all',
        covariates_filename=None, kinship_filename=None,
        sample_mapping_filename=None, feature_variant_covariate_filename=None):
    """Load the PRS-QTL inputs and intersect samples, features and risk scores.

    Every file is read through ``qtl_loader_utils``; the tables are then
    reduced to the samples, features and score identifiers they share.

    Args:
        pheno_filename: Phenotype table; its columns are treated as sample ids
            and its index as feature ids.
        anno_filename: Feature annotation; the code reads the columns
            ``chromosome``, ``start`` and ``end``.
        prsFile: Risk-score table; its columns are matched against the ``iid``
            column of the sample mapping.
        minimum_test_samples: Minimum number of retained samples; below this
            the function prints a message and exits.
        relatedness_score: Threshold passed to ``get_unique_genetic_samples``;
            ``None`` skips that step.
        skipAutosomeFiltering: When false, the annotation is restricted to
            chromosomes named "1" .. "22".
        snps_filename: Optional filter on score identifiers (index is used).
        feature_filename: Optional filter on features (index is used).
        snp_feature_filename: Optional table with ``feature`` and ``snp_id``
            columns restricting both.
        selection: ``'all'``, a chromosome name, or
            ``<chr>:<start>-<stop>``.
        covariates_filename: Optional covariate table indexed by sample.
        kinship_filename: Optional kinship matrix indexed by individual id.
        sample_mapping_filename: Optional sample-to-individual mapping.
        feature_variant_covariate_filename: Optional table with ``feature``
            and ``snp_id`` columns whose identifiers are loaded as well.

    Returns:
        A list, in this order: phenotype_df, kinship_df, covariate_df,
        sample2individual_df, annotation_df, snp_filter_df,
        snp_feature_filter_df, genetically_unique_individuals,
        minimum_test_samples, feature_list, risk_df, chromosome,
        selectionStart, selectionEnd, feature_variant_covariate_df.

    Raises:
        SystemExit: On a malformed ``selection``, duplicated feature ids in the
            annotation, no scores being read, or too few samples.
    """
    # --- Parse the region selection -------------------------------------
    selectionStart = None
    selectionEnd = None
    if ":" in selection:
        parts = selection.split(":")
        if "-" not in parts[1]:
            print("No correct sub selection.")
            print("Given in: "+selection)
            print("Expected format: (chr number):(start location)-(stop location)")
            sys.exit()
        chromosome = parts[0]
        bounds = parts[1].split("-")
        selectionStart = int(bounds[0])
        selectionEnd = int(bounds[1])
    else:
        chromosome = selection

    # --- Load phenotype and annotation ----------------------------------
    phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename)
    annotation_df = qtl_loader_utils.get_annotation_df(anno_filename)
    # Labels are normalised to str so that all later intersections compare
    # like with like.
    phenotype_df.columns = phenotype_df.columns.astype("str")
    phenotype_df.index = phenotype_df.index.astype("str")
    annotation_df.columns = annotation_df.columns.astype("str")
    annotation_df.index = annotation_df.index.astype("str")

    if annotation_df.shape[0] != annotation_df.groupby(annotation_df.index).first().shape[0]:
        print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file")
        sys.exit()

    # --- Restrict the annotation to the requested chromosome / window ---
    if chromosome != 'all':
        in_chunk = (annotation_df['chromosome'] == chromosome).values
        if selectionStart is not None:
            lowest = min(selectionStart, selectionEnd)
            highest = max(selectionStart, selectionEnd)
            # A feature belongs to the window when its midpoint falls in
            # [lowest, highest).
            midpoint = ((annotation_df["start"] + annotation_df["end"]) / 2).values
            in_chunk = in_chunk & (midpoint >= lowest) & (midpoint < highest)
        # Boolean selection keeps the annotation's own row order.
        annotation_df = annotation_df.loc[in_chunk]

    # To be able to read scores from a large file, the features are subset to
    # the chunk first and only the identifiers relevant to them are loaded.
    snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename)
    feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename)
    snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename)
    feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename)

    # --- First-stage filtering and risk-score loading -------------------
    if snp_feature_filter_df is not None or snp_filter_df is not None:
        if snp_feature_filter_df is not None:
            if feature_filter_df is not None:
                annotation_df = annotation_df.loc[annotation_df.index.isin(feature_filter_df.index)]
            selected_features = list(
                set(snp_feature_filter_df['feature'].values).intersection(annotation_df.index.values))
            snp_feature_filter_df = snp_feature_filter_df.loc[
                snp_feature_filter_df['feature'].isin(selected_features)]
            relSnps = snp_feature_filter_df['snp_id'].values
            if snp_filter_df is not None:
                # Materialise as a list: numpy cannot treat a Python set as
                # an array of identifiers.
                relSnps = list(set(snp_filter_df.index).intersection(relSnps))
        else:
            # Only an identifier filter was given, so every feature left in
            # the (chunk-restricted) annotation is of interest.
            selected_features = list(annotation_df.index)
            relSnps = snp_filter_df.index

        if feature_variant_covariate_df is not None:
            feature_variant_covariate_df = feature_variant_covariate_df.loc[
                feature_variant_covariate_df['feature'].isin(selected_features)]
            relSnps = np.union1d(relSnps, feature_variant_covariate_df["snp_id"].values)
        relSnps = np.unique(relSnps)

        risk_df = qtl_loader_utils.get_grs_subset_df(prsFile, relSnps)
        if risk_df is None:
            print("No variants selected during SNP reading.")
            sys.exit()
        # Drop rows that are duplicated in both identifier and values.
        risk_df = risk_df.assign(SnpId=risk_df.index.values)
        risk_df = risk_df.drop_duplicates(keep='first')
        risk_df = risk_df.drop(['SnpId'], axis='columns')
        # Drop rows that are missing for every individual.
        risk_df = risk_df.loc[risk_df.isnull().sum(axis=1) != risk_df.shape[1], ]
    else:
        risk_df = qtl_loader_utils.get_phenotype_df(prsFile)

    print("Intersecting data.")
    risk_df = risk_df.astype(float)

    # --- Sample <-> individual mapping ----------------------------------
    sample2individual_df = qtl_loader_utils.get_samplemapping_df(
        sample_mapping_filename, list(phenotype_df.columns), 'sample')
    sample2individual_df.index = sample2individual_df.index.astype('str')
    sample2individual_df = sample2individual_df.astype('str')
    sample2individual_df['sample'] = sample2individual_df.index
    sample2individual_df = sample2individual_df.drop_duplicates()

    # Filter the linking table first, against each data source in turn.
    # Subset linking to relevant genotypes.
    orgSize = sample2individual_df.shape[0]
    genotype_ids = set(map(str, risk_df.columns))  # built once, O(1) lookups
    sample2individual_df = sample2individual_df.loc[
        sample2individual_df['iid'].map(lambda iid: iid in genotype_ids), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.")

    # Subset linking to relevant phenotypes.
    sample2individual_df = sample2individual_df.loc[
        np.intersect1d(sample2individual_df.index, phenotype_df.columns), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the phenotype file.")

    # Subset linking vs kinship.
    kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename)
    if kinship_df is not None:
        # Filter the linking table only; the risk scores are left untouched.
        kinship_ids = set(map(str, kinship_df.index))
        sample2individual_df = sample2individual_df[
            sample2individual_df['iid'].map(lambda iid: iid in kinship_ids)]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the kinship file.")

    # Subset linking vs covariates.
    covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename)
    if covariate_df is not None:
        # Add an intercept column unless some column already consists
        # entirely of ones.
        if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]:
            covariate_df.insert(0, 'ones', np.ones(covariate_df.shape[0]))
        sample2individual_df = sample2individual_df.loc[
            sample2individual_df.index.isin(covariate_df.index), :]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the covariate file.")

    print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0]))
    if sample2individual_df.shape[0] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data.")
        sys.exit()

    # --- Filter the actual data tables ----------------------------------
    # Phenotypes: annotated features only, retained samples only.
    phenotype_df = phenotype_df.loc[
        phenotype_df.index.isin(annotation_df.index), sample2individual_df.index.values]

    # Kinship: individuals that are still linked to a sample.
    genetically_unique_individuals = None
    if kinship_df is not None:
        shared_ids = np.intersect1d(kinship_df.index, sample2individual_df['iid'])
        kinship_df = kinship_df.loc[shared_ids, shared_ids]
        if relatedness_score is not None:
            genetically_unique_individuals = get_unique_genetic_samples(kinship_df, relatedness_score)

    # NOTE: covariate_df is returned without being subset to the retained
    # samples; this matches the previous behaviour of this function.

    # Features named in the feature filter.
    if feature_filter_df is not None:
        phenotype_df = phenotype_df.loc[phenotype_df.index.isin(feature_filter_df.index), :]

    # Features named in the combined feature/identifier filter.
    if snp_feature_filter_df is not None:
        phenotype_df = phenotype_df.loc[
            phenotype_df.index.isin(snp_feature_filter_df['feature'].values), :]
        if feature_filter_df is not None:
            snp_feature_filter_df = snp_feature_filter_df.loc[
                snp_feature_filter_df['feature'].isin(phenotype_df.index)]

    # Scores named in the identifier filter.
    if snp_filter_df is not None:
        risk_df = risk_df.loc[risk_df.index.isin(snp_filter_df.index), :]

    # Scores named in the combined feature/identifier filter.
    if snp_feature_filter_df is not None:
        risk_df = risk_df.loc[
            risk_df.index.isin(np.unique(snp_feature_filter_df['snp_id'])), :]

    # Keep only features on chromosomes "1".."22" unless told otherwise.
    if not skipAutosomeFiltering:
        autosomes = set(map(str, range(1, 23)))
        annotation_df = annotation_df[
            annotation_df['chromosome'].map(lambda chrom: chrom in autosomes)]

    # Annotation order is kept so the feature list is reproducible.
    phenotype_features = set(phenotype_df.index)
    feature_list = [feature for feature in annotation_df.index if feature in phenotype_features]

    print("Number of features to be tested: " + str(len(feature_list)))
    print("Total number of variants to be considered, before variante QC and feature intersection: " + str(risk_df.shape[0]))

    if phenotype_df.shape[1] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data, for current number of covariates.")
        sys.exit()

    return [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df,
            snp_filter_df, snp_feature_filter_df, genetically_unique_individuals,
            minimum_test_samples, feature_list, risk_df, chromosome, selectionStart,
            selectionEnd, feature_variant_covariate_df]
def run_structLMM_QTL_analysis_load_intersect_phenotype_environments_covariates_kinship_sample_mapping(
        pheno_filename, anno_filename, env_filename, geno_prefix, plinkGenotype,
        cis_mode=True, association_mode=True, skipAutosomeFiltering=False,
        minimum_test_samples=10, relatedness_score=0.95, snps_filename=None,
        feature_filename=None, snp_feature_filename=None, selection='all',
        covariates_filename=None, kinship_filename=None, sample_mapping_filename=None,
        extended_anno_filename=None, feature_variant_covariate_filename=None):
    """Load the structLMM QTL inputs and intersect samples, features and variants.

    Every file is read through ``qtl_loader_utils`` (the extended annotation
    through ``pd.read_csv``); the tables are then reduced to the samples,
    features and variants they share.

    Args:
        pheno_filename: Phenotype table; its columns are treated as sample ids
            and its index as feature ids.
        anno_filename: Feature annotation; the code reads the columns
            ``chromosome``, ``start`` and ``end``.
        env_filename: Environment table indexed by sample.
        geno_prefix: Genotype path prefix handed to
            ``qtl_loader_utils.get_genotype_data``.
        plinkGenotype: Must be true; the remainder of the function needs the
            ``bim``/``fam``/``bed`` objects that are only loaded in that case.
        cis_mode: When true, features on chromosomes absent from ``bim`` are
            removed from the annotation; when false a warning is printed if
            ``bim`` covers fewer than 22 chromosomes.
        association_mode: Accepted for interface compatibility; not used here.
        skipAutosomeFiltering: When false, the annotation is restricted to
            chromosomes named "1" .. "22".
        minimum_test_samples: Minimum number of retained samples; below this
            the function prints a message and exits.
        relatedness_score: Threshold passed to ``get_unique_genetic_samples``;
            ``None`` skips that step.
        snps_filename: Optional variant filter (index is used).
        feature_filename: Optional feature filter (index is used). If loading
            it fails, the value itself is used as the list of feature ids.
        snp_feature_filename: Optional table with ``feature`` and ``snp_id``
            columns restricting both.
        selection: ``'all'``, a chromosome name, or
            ``<chr>:<start>-<stop>``.
        covariates_filename: Optional covariate table indexed by sample.
        kinship_filename: Optional kinship matrix indexed by individual id.
        sample_mapping_filename: Optional sample-to-individual mapping.
        extended_anno_filename: Optional tab-separated annotation that is
            concatenated with the main annotation.
        feature_variant_covariate_filename: Optional table loaded and returned
            unchanged.

    Returns:
        A list, in this order: phenotype_df, kinship_df, covariate_df,
        environment_df, sample2individual_df, complete_annotation_df,
        annotation_df, snp_filter_df, snp_feature_filter_df,
        genetically_unique_individuals, minimum_test_samples, feature_list,
        bim, fam, bed, chromosome, selectionStart, selectionEnd,
        feature_variant_covariate_df.

    Raises:
        SystemExit: On a malformed ``selection``, duplicated feature ids in the
            annotation, or too few samples.
        NotImplementedError: When ``plinkGenotype`` is false.
    """
    # --- Parse the region selection -------------------------------------
    selectionStart = None
    selectionEnd = None
    if ":" in selection:
        parts = selection.split(":")
        if "-" not in parts[1]:
            print("No correct sub selection.")
            print("Given in: "+selection)
            print("Expected format: (chr number):(start location)-(stop location)")
            sys.exit()
        chromosome = parts[0]
        bounds = parts[1].split("-")
        selectionStart = int(bounds[0])
        selectionEnd = int(bounds[1])
    else:
        chromosome = selection

    # --- Load phenotype, annotation and genotypes -----------------------
    phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename)
    annotation_df = qtl_loader_utils.get_annotation_df(anno_filename)

    if plinkGenotype:
        bim, fam, bed = qtl_loader_utils.get_genotype_data(geno_prefix)
        # Recode chromosome names to numbers: X -> 23, Y -> 24, XY -> 25,
        # MT -> 26, in both the annotation and the requested chromosome.
        annotation_df.replace(['X', 'Y', 'XY', 'MT'], ['23', '24', '25', '26'], inplace=True)
        chromosome = {'X': '23', 'Y': '24', 'XY': '25', 'MT': '26'}.get(chromosome, chromosome)
    else:
        geno_prefix += '.bgen'
        print(geno_prefix)
        # bim/fam/bed are only produced by the PLINK branch and everything
        # below depends on them; fail here with a clear message rather than
        # with a NameError further down.
        raise NotImplementedError(
            "Only PLINK genotypes are supported by this loader "
            "(bim/fam/bed are required); got: " + geno_prefix)

    print("Intersecting data.")

    if annotation_df.shape[0] != annotation_df.groupby(annotation_df.index).first().shape[0]:
        print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file")
        sys.exit()

    # --- Sample <-> individual mapping ----------------------------------
    sample2individual_df = qtl_loader_utils.get_samplemapping_df(
        sample_mapping_filename, list(phenotype_df.columns), 'sample')
    sample2individual_df['sample'] = sample2individual_df.index
    sample2individual_df = sample2individual_df.drop_duplicates()

    # Filter the linking table first, against each data source in turn.
    # Subset linking to relevant genotypes.
    orgSize = sample2individual_df.shape[0]
    genotype_ids = set(map(str, fam.index))  # built once, O(1) lookups
    sample2individual_df = sample2individual_df.loc[
        sample2individual_df['iid'].map(lambda iid: iid in genotype_ids), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.")

    # Subset linking to relevant phenotypes.
    sample2individual_df = sample2individual_df.loc[
        np.intersect1d(sample2individual_df.index, phenotype_df.columns), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the phenotype file.")

    # Subset linking vs kinship.
    kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename)
    if kinship_df is not None:
        # Filter the linking table only; the genotypes are left untouched.
        kinship_ids = set(map(str, kinship_df.index))
        sample2individual_df = sample2individual_df[
            sample2individual_df['iid'].map(lambda iid: iid in kinship_ids)]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the kinship file.")

    # Subset linking vs covariates.
    covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename)
    if covariate_df is not None:
        # Add an intercept column unless some column already consists
        # entirely of ones.
        if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]:
            covariate_df.insert(0, 'ones', np.ones(covariate_df.shape[0]))
        sample2individual_df = sample2individual_df.loc[
            sample2individual_df.index.isin(covariate_df.index), :]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the covariate file.")

    # Subset linking vs environments.
    environment_df = qtl_loader_utils.get_env_df(env_filename)
    if np.nansum(environment_df == 1, 0).max() < environment_df.shape[0]:
        environment_df.insert(0, 'ones', np.ones(environment_df.shape[0]))
    sample2individual_df = sample2individual_df.loc[
        sample2individual_df.index.isin(environment_df.index), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the environment file.")

    print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0]))
    if sample2individual_df.shape[0] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data.")
        sys.exit()

    # --- Filter the actual data tables ----------------------------------
    # Phenotypes: annotated features only, retained samples only.
    phenotype_df = phenotype_df.loc[
        phenotype_df.index.isin(annotation_df.index), sample2individual_df.index.values]

    # Kinship: individuals that are still linked to a sample.
    genetically_unique_individuals = None
    if kinship_df is not None:
        shared_ids = np.intersect1d(kinship_df.index, sample2individual_df['iid'])
        kinship_df = kinship_df.loc[shared_ids, shared_ids]
        # Same guard as the PRS loader in this file: no threshold, no
        # relatedness pruning.
        if relatedness_score is not None:
            genetically_unique_individuals = get_unique_genetic_samples(kinship_df, relatedness_score)

    # Covariates: retained samples only.
    if covariate_df is not None:
        covariate_df = covariate_df.loc[
            np.intersect1d(covariate_df.index, sample2individual_df.index.values), :]

    snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename)
    try:
        feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename)
    except Exception:
        if feature_filename is None:
            # Nothing to fall back on; surface the loader's own error.
            raise
        # Fallback: use the argument itself as the feature ids
        # (presumably a list of ids was passed instead of a path).
        feature_filter_df = pd.DataFrame(index=feature_filename)

    # Features named in the feature filter. Ids missing from the phenotype
    # table are skipped rather than raising KeyError.
    if feature_filter_df is not None:
        wanted = feature_filter_df.index
        phenotype_df = phenotype_df.loc[wanted[wanted.isin(phenotype_df.index)], :]

    # Features named in the combined feature/variant filter.
    if snp_feature_filter_df is not None:
        wanted = pd.Index(np.unique(snp_feature_filter_df['feature']))
        phenotype_df = phenotype_df.loc[wanted[wanted.isin(phenotype_df.index)], :]

    if (not cis_mode) and len(set(bim['chrom'])) < 22:
        print("Warning, running a trans-analysis on snp data from less than 22 chromosomes.\nTo merge data later the permutation P-values need to be written out.")

    if cis_mode:
        # Remove features on chromosomes that have no variants anyway.
        annotation_df = annotation_df[np.in1d(annotation_df['chromosome'], list(set(bim['chrom'])))]

    # Variants named in the variant filter.
    snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename)
    if snp_filter_df is not None:
        toSelect = set(snp_filter_df.index).intersection(set(bim['snp']))
        bim = bim.loc[bim['snp'].isin(toSelect)]

    # Variants named in the combined feature/variant filter.
    if snp_feature_filter_df is not None:
        toSelect = set(np.unique(snp_feature_filter_df['snp_id'])).intersection(set(bim['snp']))
        bim = bim.loc[bim['snp'].isin(toSelect)]

    # Keep only features on chromosomes "1".."22" unless told otherwise.
    if not skipAutosomeFiltering:
        autosomes = set(map(str, range(1, 23)))
        annotation_df = annotation_df[
            annotation_df['chromosome'].map(lambda chrom: chrom in autosomes)]

    # --- Determine the features to be tested ----------------------------
    if chromosome == 'all':
        candidates = annotation_df.index
    elif selectionStart is not None:
        lowest = min(selectionStart, selectionEnd)
        highest = max(selectionStart, selectionEnd)
        # A feature belongs to the window when its midpoint falls in
        # [lowest, highest). Computed on the side so annotation_df is not
        # modified.
        midpoint = ((annotation_df["start"] + annotation_df["end"]) / 2).values
        in_chunk = ((annotation_df['chromosome'] == chromosome).values
                    & (midpoint >= lowest) & (midpoint < highest))
        candidates = annotation_df.index[in_chunk]
    else:
        candidates = annotation_df[annotation_df['chromosome'] == chromosome].index
    # Annotation order is kept so the feature list is reproducible.
    phenotype_features = set(phenotype_df.index)
    feature_list = [feature for feature in candidates if feature in phenotype_features]

    print("Number of features to be tested: " + str(len(feature_list)))
    print("Total number of variants to be considered, before variante QC and feature intersection: " + str(bim.shape[0]))

    if phenotype_df.shape[1] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data, for current number of covariates.")
        sys.exit()

    if extended_anno_filename is not None:
        complete_annotation_df = pd.read_csv(extended_anno_filename, sep='\t', index_col=0)
        # The feature id is copied into a column so drop_duplicates() also
        # compares ids, not just locations.
        # NOTE(review): the helper column stays on the returned annotation_df,
        # as before; confirm whether callers rely on it before removing.
        annotation_df['index'] = annotation_df.index
        complete_annotation_df['index'] = complete_annotation_df.index
        complete_annotation_df = pd.concat([annotation_df, complete_annotation_df]).drop_duplicates()
        del complete_annotation_df['index']
    else:
        complete_annotation_df = annotation_df

    feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename)

    return [phenotype_df, kinship_df, covariate_df, environment_df, sample2individual_df,
            complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df,
            genetically_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed,
            chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]