def _stats_dataframe(self):
    """Assemble per-test likelihood-ratio statistics into a DataFrame.

    One row per association test, holding the log marginal likelihoods of
    the null (H0) and the two alternative hypotheses (H1, H2), the degrees
    of freedom of each nested comparison, the scale estimates, and the LRT
    p-values ``pv10`` (H1 vs H0), ``pv20`` (H2 vs H0) and ``pv21``
    (H2 vs H1).

    Returns
    -------
    pandas.DataFrame
        Statistics table; when no H1 environments are present
        (``self._envs0`` empty) the H2-only table is returned instead.
    """
    from pandas import DataFrame

    if len(self._envs0) == 0:
        # BUG FIX: the original returned the bound method object itself
        # (missing call parentheses) instead of the DataFrame it builds.
        return self._stats_dataframe_h2_only()

    stats = []
    for i, test in enumerate(self._tests):
        dof10 = test.h1.candidate_effsizes.size
        dof20 = test.h2.candidate_effsizes.size
        dof21 = dof20 - dof10
        stats.append([
            i,
            self._h0.lml,
            test.h1.lml,
            test.h2.lml,
            dof10,
            dof20,
            dof21,
            test.h1.scale,
            test.h2.scale,
        ])
    columns = [
        "test",
        "lml0",
        "lml1",
        "lml2",
        "dof10",
        "dof20",
        "dof21",
        "scale1",
        "scale2",
    ]
    stats = DataFrame(stats, columns=columns)
    # Likelihood-ratio p-values for each nested model comparison.
    stats["pv10"] = lrt_pvalues(stats["lml0"], stats["lml1"], stats["dof10"])
    stats["pv20"] = lrt_pvalues(stats["lml0"], stats["lml2"], stats["dof20"])
    stats["pv21"] = lrt_pvalues(stats["lml1"], stats["lml2"], stats["dof21"])
    return stats
def _stats_dataframe(self):
    """Collect H1/H2 likelihood-ratio statistics for every test.

    Builds one row per test containing the null log marginal likelihood,
    the H1 and H2 log marginal likelihoods and scales, the degrees of
    freedom of the three nested comparisons, and the derived LRT p-values
    ``pv10``, ``pv20`` and ``pv21``.
    """
    from pandas import DataFrame

    rows = []
    for idx, test in enumerate(self._tests):
        df10 = test["h1"]["candidate_effsizes"].size
        df20 = test["h2"]["candidate_effsizes"].size
        rows.append([
            idx,
            self._h0.lml,
            test["h1"]["lml"],
            test["h2"]["lml"],
            df10,
            df20,
            df20 - df10,
            test["h1"]["scale"],
            test["h2"]["scale"],
        ])

    header = [
        "test", "lml0", "lml1", "lml2",
        "dof10", "dof20", "dof21",
        "scale1", "scale2",
    ]
    table = DataFrame(rows, columns=header)
    # Likelihood-ratio p-values for each nested model comparison.
    table["pv10"] = lrt_pvalues(table["lml0"], table["lml1"], table["dof10"])
    table["pv20"] = lrt_pvalues(table["lml0"], table["lml2"], table["dof20"])
    table["pv21"] = lrt_pvalues(table["lml1"], table["lml2"], table["dof21"])
    return table
def _stats_dataframe_h2_only(self):
    """Per-test statistics when only the H2 alternative is available.

    Returns a DataFrame with, per test, the null and H2 log marginal
    likelihoods, the H2 degrees of freedom and scale, and the LRT
    p-value ``pv20`` (H2 vs H0).
    """
    from pandas import DataFrame

    rows = [
        [idx, self._h0.lml, test.h2.lml, test.h2.candidate_effsizes.size, test.h2.scale]
        for idx, test in enumerate(self._tests)
    ]
    table = DataFrame(rows, columns=["test", "lml0", "lml2", "dof20", "scale2"])
    table["pv20"] = lrt_pvalues(table["lml0"], table["lml2"], table["dof20"])
    return table
def variant_pvalues(self):
    r"""Variant p-values.

    Returns
    -------
    array_like
        Association significance between variant and phenotype, as a
        named ("pv") array shaped like the alternative-model LMLs.
    """
    from xarray import zeros_like

    # Allocate an array mirroring the alternative LMLs, then fill it with
    # the likelihood-ratio-test p-values against the null model.
    pvalues = zeros_like(self._alt_lmls)
    pvalues.name = "pv"
    pvalues[:] = lrt_pvalues(self.null_lml, self._alt_lmls.values)
    return pvalues
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype,
                     output_dir, window_size=250000, min_maf=0.05, min_hwe_P=0.001,
                     min_call_rate=0.95, blocksize=1000, cis_mode=True,
                     skipAutosomeFiltering=False, gaussianize_method=None,
                     minimum_test_samples=10, seed=np.random.randint(40000),
                     n_perm=0, write_permutations=False, relatedness_score=0.95,
                     feature_variant_covariate_filename=None, snps_filename=None,
                     feature_filename=None, snp_feature_filename=None,
                     genetic_range='all', covariates_filename=None,
                     kinship_filename=None, sample_mapping_filename=None,
                     extended_anno_filename=None, regressCovariatesUpfront=False):
    """Core function to take input and run QTL tests on a given chromosome.

    Loads phenotypes, genotypes (plink or bgen), covariates and kinship,
    performs per-feature sample/SNP QC, fits a (mixed) LMM null model per
    feature and fast-scans all selected SNPs, optionally running permutations
    for empirical p-value calibration. Results are streamed to HDF5 writers
    in ``output_dir`` and SNP/feature metadata are written as tab-separated
    text files. Returns nothing; all output is via files.

    NOTE(review): the ``seed`` default is evaluated once at import time, so
    all calls that omit it share one random seed — verify this is intended.
    NOTE(review): ``Imputer`` is assumed to be the (deprecated) sklearn
    mean-imputer imported at file level — confirm against the imports.
    """
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    # Minimal margin a genotype-probability call must exceed chance by,
    # otherwise the genotype is treated as missing (NaN).
    minimumProbabilityStep = 0.1
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    # Load and intersect all inputs (phenotype, kinship, covariates, sample
    # mapping, annotation, SNP filters, genotype handles) in one helper call.
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, complete_annotation_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen,
     chromosome, selectionStart, selectionEnd, feature_variant_covariate_df] = \
        utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
            pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix,
            plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering=skipAutosomeFiltering,
            minimum_test_samples=minimum_test_samples, relatedness_score=relatedness_score,
            snps_filename=snps_filename, feature_filename=feature_filename,
            snp_feature_filename=snp_feature_filename, selection=genetic_range,
            covariates_filename=covariates_filename, kinship_filename=kinship_filename,
            sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename,
            feature_variant_covariate_filename=feature_variant_covariate_filename)
    # A kinship matrix makes this a mixed model; otherwise delta is fixed to 1.
    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list is None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()
    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(output_dir + '/qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)
    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            # NOTE(review): tmp_unique_individuals is only bound when a prior
            # feature contained missing samples; this restore can NameError on
            # the first feature — confirm intended control flow.
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        # Select candidate SNPs (cis window around the feature, or trans).
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(data=bed[snpQuery_cov['i'].values, :].compute().transpose(),
                                                  index=fam.index, columns=snpQuery_cov['snp'])
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print('Warning, during the regression of SNPs we assume ploidy 2.')
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(1).astype(float)
                                snp_df_dosage_t[(np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < (1 + minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] * 2) + geno["probs"][:, 1]
                                snp_df_dosage_t[np.amax(geno["probs"][:, :3], 1) < ((1 / 3) + minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t, index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber + 1
                        # BUG FIX: the transposed dosage table was assigned back
                        # to snp_cov_df_t and never to snp_cov_df, so the bgen
                        # covariate SNPs were silently dropped.
                        snp_cov_df = snp_cov_df_t.transpose()
        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]
        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature'] == feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]
        if len(snpQuery) == 0:
            print("Feature: " + feature_id + " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
            sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]
            if (contains_missing_samples):
                # Missing samples change the sample set, so the cached unique
                # individuals (and QS below) must be recomputed for this feature.
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids, individual_ids], relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            else:
                #If no missing samples we can use the previous SNP Qc information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(lambda x: x not in list(map(str, fail_qc_snps_all)))]
            if phenotype_ds.empty or len(geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id + " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            print('For feature: ' + str(currentFeatureNumber) + '/' + str(len(feature_list)) + ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.')
            if (n_perm != 0):
                # BUG FIX: np.float was removed in NumPy 1.24; the builtin
                # float is the documented replacement for this alias.
                bestPermutationPval = np.ones((n_perm), dtype=float)
            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids, individual_ids].index == sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &
                    (all(phenotype_ds.index == covariate_df.loc[sample2individual_feature['sample'], :].index) if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b; better readability of the code
                '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids, individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    # No covariates supplied: use an intercept-only design.
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    # Append (mean-imputed) variant covariates to the design.
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df_tmp.index = sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = snp_cov_df_tmp.index
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate((cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print('There is an issue in mapping phenotypes vs covariates and/or kinship')
                sys.exit()
            phenotype = utils.force_normal_distribution(phenotype_ds.values, method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values
            #Prepare LMM
            phenotype = phenotype.astype(float)
            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                # No kinship: fix delta so the model reduces to plain LM.
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                # Residualize the phenotype against the non-intercept covariates
                # once, then refit with the intercept column only.
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)
            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            countChunker = 0
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker = countChunker + 1
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values
                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if (plinkGenotype):
                    snp_df = pd.DataFrame(data=bed[snp_idxs, :].compute().transpose(), index=fam.index, columns=snp_names)
                else:
                    snp_df_dosage = pd.DataFrame(np.nan, index=fam.index, columns=snp_names)
                    snp_df = pd.DataFrame(np.nan, index=fam.index, columns=snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs:
                        geno = bgen["genotype"][snpId].compute()
                        # BUG FIX: `&` binds tighter than comparisons, so the
                        # original `min() > 1 & max() < 3` parsed as
                        # `min() > (1 & max()) < 3` and accepted ploidy >= 3.
                        # Parenthesize to test diploidy as intended.
                        if ((geno["ploidy"].min() > 1) & (geno["ploidy"].max() < 3)):
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(1).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:, :2], axis=1) - 1) + np.abs(np.argmax(geno["probs"][:, 2:4], axis=1) - 1)).astype(float)
                                naId = (np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < (1 + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else:
                                snp_df_dosage_t = ((geno["probs"][:, 0] * 2) + geno["probs"][:, 1]).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:, :3], axis=1) - 2)).astype(float)
                                naId = np.amax(geno["probs"][:, :3], 1) < ((1 / 3) + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:, snp_names[rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber + 1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids, :]
                snp_df = snp_df.loc[individual_ids, :]
                # Drop duplicated SNP columns (keep only columns appearing once).
                snp_df = snp_df.loc[:, np.unique(snp_df.columns)[np.unique(snp_df.columns, return_counts=1)[1] == 1]]
                #SNP QC.
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:, snp_df.columns[~snp_df.columns.isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:, snp_df.columns[~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snps_to_test_df.iloc[np.unique(snps_to_test_df.index, return_index=1)[1]].loc[geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snps_to_test_df, min_call_rate, min_maf, min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns.isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snp_df.iloc[np.unique(snp_df.index, return_index=1)[1]].loc[geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns.isin(passed_snp_names)]]
                # Accumulate per-SNP QC metrics (call rate, MAF, HWE p-value).
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat([snpQcInfo_t, maf.reindex(snpQcInfo_t.index)], axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat([snpQcInfo_t, hweP.reindex(snpQcInfo_t.index)], axis=1)
                call_rate = None
                maf = None
                hweP = None
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort=False)
                ##First process SNPQc than check if we can continue.
                if len(snp_df.columns) == 0:
                    continue
                elif (not plinkGenotype):
                    snp_df_dosage = snp_df_dosage.loc[:, np.unique(snp_df.columns)]
                #We could make use of relatedness when imputing. And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df), index=snp_df.index, columns=snp_df.columns)
                if (not plinkGenotype):
                    snp_df_dosage = pd.DataFrame(fill_NaN.fit_transform(snp_df_dosage), index=snp_df_dosage.index, columns=snp_df_dosage.columns)
                ##No more snp_matrix_DF > snp_df
                #test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or
                        not all(snp_df.index == sample2individual_feature.loc[phenotype_ds.index]['iid'])):
                    print('There is an issue in mapping phenotypes and genotypes')
                    sys.exit()
                G = snp_df.values
                if (not plinkGenotype):
                    G = snp_df_dosage.values
                G = G.astype(float)
                G_index = snp_df.columns
                # Fast scan of all SNPs in this block against the null model.
                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)
                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)), columns=['feature_id', 'snp_id', 'p_value', 'beta', 'beta_se', 'empirical_feature_p_value'])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0
                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1
                    if (write_permutations):
                        perm_df = pd.DataFrame(index=range(len(G_index)), columns=['snp_id'] + ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score is not None):
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df, kinship_df.loc[individual_ids, individual_ids], len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df_dosage, kinship_df.loc[individual_ids, individual_ids], len(currentNperm))
                        else:
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes(snp_df, len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes(snp_df_dosage, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp, verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    # Track, per permutation, the smallest p-value over all SNPs.
                    for relevantOutput in utils.chunker(pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' + str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(perm_df, feature_id)
            #This we need to change in the written file.
            if (n_perm > 1 and data_written):
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(feature_id, bestPermutationPval, cis_mode)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
            if not data_written:
                fail_qc_features.append(feature_id)
            else:
                n_samples.append(phenotype_ds.size)
                n_e_samples.append(len(geneticaly_unique_individuals))
            if contains_missing_samples:
                # Restore the cached decomposition/sample set for the next feature.
                QS = QS_tmp
                geneticaly_unique_individuals = tmp_unique_individuals
                del QS_tmp
                del tmp_unique_individuals
                if snpQcInfo is not None:
                    snpQcInfo.index.name = "snp_id"
                    snpQcInfo.to_csv(output_dir + '/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id), sep='\t')
            else:
                if (snpQcInfo is not None and snpQcInfoMain is not None):
                    snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False)
                elif snpQcInfo is not None:
                    snpQcInfoMain = snpQcInfo.copy(deep=True)
    output_writer.close()
    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)  #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs
    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()
    if snpQcInfoMain is not None:
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)
        # Column count depends on which QC metrics were produced.
        if (snp_df.shape[1] == 5):
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate']
        elif (snp_df.shape[1] == 6):
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf']
        else:
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf', 'hwe_p']
    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}.txt'.format(chromosome), sep='\t')
def run_PrsQtl_analysis(pheno_filename, anno_filename, prsFile, output_dir, min_call_rate=0.95,
                        blocksize=1000, skipAutosomeFiltering=False, gaussianize_method=None,
                        minimum_test_samples=10, seed=np.random.randint(40000), n_perm=0,
                        write_permutations=False, relatedness_score=None,
                        feature_variant_covariate_filename=None, snps_filename=None,
                        feature_filename=None, snp_feature_filename=None, genetic_range='all',
                        covariates_filename=None, kinship_filename=None,
                        sample_mapping_filename=None, regressCovariatesUpfront=False):
    '''Core function to take input and run QTL tests on a given chromosome.

    Runs genetic-risk-score (GRS/PRS) QTL association tests: for every feature
    (phenotype) it fits a null (linear mixed) model, scans all risk scores in
    blocks of ``blocksize``, optionally runs ``n_perm`` permutations for an
    empirical p-value correction, and writes results to HDF5 plus SNP/feature
    metadata text files under ``output_dir``.

    Side effects: creates/removes files in ``output_dir``; calls ``sys.exit()``
    when inputs are inconsistent or nothing could be tested.

    NOTE(review): the default for ``seed`` is evaluated once at module import,
    so repeated calls in one process reuse the same seed unless the caller
    passes one explicitly.
    '''
    # NOTE(review): `Imputer` comes from the file-level imports (the sklearn class of
    # this name was replaced by SimpleImputer) — confirm which implementation is bound.
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)

    # Load and intersect all inputs (phenotypes, covariates, kinship, annotation, PRS).
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list,
     risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df] =\
        utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
            pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile,
            skipAutosomeFiltering=skipAutosomeFiltering, minimum_test_samples=minimum_test_samples,
            relatedness_score=relatedness_score, snps_filename=snps_filename,
            feature_filename=feature_filename, snp_feature_filename=snp_feature_filename,
            selection=genetic_range, covariates_filename=covariates_filename,
            kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename,
            feature_variant_covariate_filename=feature_variant_covariate_filename)

    # A mixed model is fitted only when a kinship matrix is available.
    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    # BUGFIX: was `feature_list == None` — use an identity test for None.
    if feature_list is None or len(feature_list) == 0:
        print('No features to be tested.')
        sys.exit()

    # Open output files.
    qtl_loader_utils.ensure_dir(output_dir)
    if selectionStart is not None:
        output_writer = qtl_output.hdf5_writer(output_dir + '/qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if write_permutations:
        if selectionStart is not None:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    # Bookkeeping across features: tested SNP ids, failed features, and per-feature stats.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            # BUGFIX: the original restored `tmp_unique_individuals` here, but that name
            # is undefined until a feature with missing samples has been processed
            # (NameError). Nothing has been overwritten at this point, so no restore
            # is needed.
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None
        # Optional per-feature variant covariates: pull the matching risk-score columns.
        if feature_variant_covariate_df is not None:
            if feature_id in feature_variant_covariate_df['feature'].values:
                covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature'] == feature_id]
                if any(i in risk_df.index.values for i in covariateSnp):
                    snp_cov_df = risk_df.loc[risk_df.index.map(lambda x: x in list(covariateSnp)), :].transpose()
        # Restrict the query to the global SNP filter and the per-feature SNP filter.
        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(set(snp_filter_df.index).intersection(set(snpQuery)))
        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature'] == feature_id])).intersection(set(snpQuery)))
        if len(snpQuery) == 0:
            print("Feature: " + feature_id + " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        phenotype_ds = phenotype_df.loc[feature_id]
        contains_missing_samples = any(~np.isfinite(phenotype_ds))
        if contains_missing_samples:
            print('Feature: ' + feature_id + ' contains missing data.')
            phenotype_ds.dropna(inplace=True)
            na_containing_features = na_containing_features + 1
        # Select indices for the relevant individuals in the genotype matrix.
        # These are not unique and must NOT be used to access phenotype/covariate data.
        individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
        sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]
        if contains_missing_samples:
            # Samples were dropped: recompute the genetically-unique set for this
            # feature and stash the full-sample set for restoration afterwards.
            tmp_unique_individuals = geneticaly_unique_individuals
            if (kinship_df is not None) and (relatedness_score is not None):
                geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids, individual_ids], relatedness_score)
            else:
                geneticaly_unique_individuals = individual_ids
        if phenotype_ds.empty or len(geneticaly_unique_individuals) < minimum_test_samples:
            print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            if contains_missing_samples:
                geneticaly_unique_individuals = tmp_unique_individuals
            continue
        elif np.var(phenotype_ds.values) == 0:
            print("Feature: " + feature_id + " has no variance in selected individuals.")
            fail_qc_features.append(feature_id)
            if contains_missing_samples:
                geneticaly_unique_individuals = tmp_unique_individuals
            continue
        print('For feature: ' + str(currentFeatureNumber) + '/' + str(len(feature_list)) + ' (' + feature_id + '): ' + str(len(snpQuery)) + ' risk scores will be tested.\n Please stand by.')
        if n_perm != 0:
            # BUGFIX: `np.float` was removed in NumPy 1.24; use the builtin float dtype.
            bestPermutationPval = np.ones((n_perm), dtype=float)
        # Prepare the LMM; verify covariates, kinship, SNPs and phenotype share sample order.
        if ((all(kinship_df.loc[individual_ids, individual_ids].index == sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &
                (all(phenotype_ds.index == covariate_df.loc[sample2individual_feature['sample'], :].index) if covariate_df is not None else True)):
            if kinship_df is not None:
                kinship_mat = kinship_df.loc[individual_ids, individual_ids].values
                kinship_mat = kinship_mat.astype(float)
                # GOWER normalization of the kinship matrix.
                kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
                # Decompose once and reuse; redo (stashing the full-sample QS)
                # when this feature dropped samples.
                if QS is None and not contains_missing_samples:
                    QS = economic_qs(kinship_mat)
                elif contains_missing_samples:
                    QS_tmp = QS
                    QS = economic_qs(kinship_mat)
            if kinship_df is None:
                # No kinship: identity covariance of matching size.
                K = np.eye(len(phenotype_ds.index))
                if QS is None and not contains_missing_samples:
                    QS = economic_qs(K)
                elif contains_missing_samples:
                    QS_tmp = QS
                    QS = economic_qs(K)
            cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values if covariate_df is not None else None
            if covariate_df is None:
                # Intercept-only design when no covariates were supplied.
                cov_matrix = np.ones((len(individual_ids), 1))
            if snp_cov_df is not None:
                # Mean-impute the SNP covariates and append them to the covariate matrix.
                snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp))
                snp_cov_df.index = sample2individual_feature['sample']
                snp_cov_df.columns = snp_cov_df_tmp.columns
                cov_matrix = np.concatenate((cov_matrix, snp_cov_df.values), 1)
                snp_cov_df_tmp = None
                snp_cov_df = None
            cov_matrix = cov_matrix.astype(float)
        else:
            print('There is an issue in mapping phenotypes vs covariates and/or kinship')
            sys.exit()
        # Optionally gaussianize the phenotype, then fit the null model once per feature.
        phenotype = utils.force_normal_distribution(phenotype_ds.values, method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values
        phenotype = phenotype.astype(float)
        lmm = LMM(phenotype, cov_matrix, QS)
        if not mixed:
            # No kinship: pin delta so the model reduces to plain linear regression.
            lmm.delta = 1
            lmm.fix('delta')
        lmm.fit(verbose=False)
        if regressCovariatesUpfront:
            # Residualize the phenotype on the non-intercept covariates and refit
            # with the intercept column only.
            phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
            cov_matrix_corrected = cov_matrix[:, 0]
            lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
            lmm.fit(verbose=False)
        null_lml = lmm.lml()
        flmm = lmm.get_fast_scanner()
        for snpGroup in utils.chunker(snpQuery, blocksize):
            # Fix the seed at the start of each chunk so all permutations are based
            # on the same random first split.
            np.random.seed(seed)
            snp_names = snpGroup
            tested_snp_names.extend(snp_names)
            snp_matrix_DF = risk_df.loc[snp_names, individual_ids].transpose()
            # GRS variance QC: drop all-missing and zero-variance score columns.
            snp_matrix_DF = snp_matrix_DF.loc[:, snp_matrix_DF.isna().sum(axis=0) != snp_matrix_DF.shape[0], ]
            snp_matrix_DF = snp_matrix_DF.loc[:, (np.nanstd(snp_matrix_DF, axis=0) > 0)]
            # Verify genotypes and phenotypes are in the same sample order.
            if (len(snp_matrix_DF.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or not all(snp_matrix_DF.index == sample2individual_feature.loc[phenotype_ds.index]['iid'])):
                print('There is an issue in mapping phenotypes and genotypes')
                sys.exit()
            # Per-column call rate; columns below min_call_rate are dropped, the
            # remainder mean-imputed.
            call_rate = 1 - snp_matrix_DF.isnull().sum() / len(snp_matrix_DF.index)
            if snpQcInfo is None and call_rate is not None:
                snpQcInfo = call_rate
            elif call_rate is not None:
                snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
            selection = call_rate > min_call_rate
            snp_matrix_DF = snp_matrix_DF.loc[:, list(snp_matrix_DF.columns[selection])]
            if snp_matrix_DF.shape[1] == 0:
                continue
            snp_matrix_DF = pd.DataFrame(fill_NaN.fit_transform(snp_matrix_DF), index=snp_matrix_DF.index, columns=snp_matrix_DF.columns)
            G = snp_matrix_DF.values
            G = G.astype(float)
            G_index = snp_matrix_DF.columns
            # Scan all scores in this chunk against the null model.
            alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
            var_pvalues = lrt_pvalues(null_lml, alt_lmls)
            var_effsizes_se = effsizes_se(effsizes, var_pvalues)
            # Collect this chunk's results.
            temp_df = pd.DataFrame(index=range(len(G_index)), columns=['feature_id', 'snp_id', 'p_value', 'beta', 'beta_se', 'empirical_feature_p_value'])
            temp_df['snp_id'] = G_index
            temp_df['feature_id'] = feature_id
            temp_df['beta'] = np.asarray(effsizes)
            temp_df['p_value'] = np.asarray(var_pvalues)
            temp_df['beta_se'] = np.asarray(var_effsizes_se)
            # Dummy value; replaced later in the written file by the p-value correction.
            temp_df['empirical_feature_p_value'] = -1.0
            if n_perm != 0:
                pValueBuffer = []
                totalSnpsToBeTested = (G.shape[1] * n_perm)
                # Batch permutations so each fast_scan call stays near `blocksize` columns.
                permutationStepSize = np.floor(n_perm / (totalSnpsToBeTested / blocksize))
                if permutationStepSize > n_perm:
                    permutationStepSize = n_perm
                elif permutationStepSize < 1:
                    permutationStepSize = 1
                if write_permutations:
                    perm_df = pd.DataFrame(index=range(len(G_index)), columns=['snp_id'] + ['permutation_' + str(x) for x in range(n_perm)])
                    perm_df['snp_id'] = G_index
                for currentNperm in utils.chunker(list(range(1, n_perm + 1)), permutationStepSize):
                    if (kinship_df is not None) and (relatedness_score is not None):
                        temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_matrix_DF, kinship_df.loc[individual_ids, individual_ids], len(currentNperm))
                    else:
                        temp = utils.get_shuffeld_genotypes(snp_matrix_DF, len(currentNperm))
                    temp = temp.astype(float)
                    alt_lmls_p, effsizes_p = flmm.fast_scan(temp, verbose=False)
                    var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                    pValueBuffer.extend(np.asarray(var_pvalues_p))
                if not (len(pValueBuffer) == totalSnpsToBeTested):
                    print('Error in blocking logic for permutations.')
                    sys.exit()
                # Track, per permutation, the smallest p-value seen across all chunks.
                perm = 0
                for relevantOutput in utils.chunker(pValueBuffer, G.shape[1]):
                    if write_permutations:
                        perm_df['permutation_' + str(perm)] = relevantOutput
                    if bestPermutationPval[perm] > min(relevantOutput):
                        bestPermutationPval[perm] = min(relevantOutput)
                    perm = perm + 1
            if not temp_df.empty:
                data_written = True
                output_writer.add_result_df(temp_df)
                if write_permutations:
                    permutation_writer.add_permutation_results_df(perm_df, feature_id)
        # Post-process this feature: empirical p-value correction and bookkeeping.
        if n_perm > 1 and data_written:
            alpha_para, beta_para = output_writer.apply_pval_correction(feature_id, bestPermutationPval, False)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            # Restore the full-sample decomposition and unique-individual set; write
            # this feature's QC separately (its call rates use a reduced sample set).
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            snpQcInfo = snpQcInfo.to_frame(name="call_rate")
            snpQcInfo.index.name = "snp_id"
            snpQcInfo.to_csv(output_dir + '/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id), sep='\t')
            del QS_tmp
            del tmp_unique_individuals
        else:
            if snpQcInfo is not None and snpQcInfoMain is not None:
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)
    output_writer.close()
    if write_permutations:
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if (len(feature_list) - len(fail_qc_features)) == 0:
        # Nothing was tested: remove the (empty) HDF5 result file and bail out.
        time.sleep(15)  # Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
        if selectionStart is not None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    # Write SNP and feature (annotation) metadata to file.
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    # Risk scores have no genomic location; placeholders keep the column layout.
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if snpQcInfoMain is not None:
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)
    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if n_perm > 1:
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if selectionStart is not None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}.txt'.format(chromosome), sep='\t')