def compute_deflation_ratios(self, sumstats_prefix, sumstats_prefix_chr, pve, N, mean_l2, M_base):

    #read diagGRM files
    df_diagGRM = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'diagGRM.gz', 'diagGRM', 'diagGRM', join_axis=0, index_col=['fid', 'iid'], use_tqdm=False)
    df_diagGRM = df_diagGRM.groupby(['fid', 'iid']).sum()

    #compute trace ratio
    deflate_columns = df_diagGRM.columns.str.startswith('diag_G_deflate')
    nodeflate_columns = df_diagGRM.columns.str.startswith('diag_G_nodeflate')
    sum_diag_deflate = df_diagGRM.loc[:, deflate_columns].sum(axis=0).values
    sum_diag_nodeflate = df_diagGRM.loc[:, nodeflate_columns].sum(axis=0).values
    trace_ratios = sum_diag_nodeflate / sum_diag_deflate

    #compute deflation ratios
    if len(pve) == 0:
        if np.isclose(trace_ratios[0], 1):
            deflation_ratio = 1.0
        else:
            raise ValueError('trace deflation reported, but no pve values found!')
    else:
        var_diag_deflate = df_diagGRM.loc[:, deflate_columns].iloc[:, 0].var(ddof=0) / M_base**2
        var_diag_nodeflate = df_diagGRM.loc[:, nodeflate_columns].iloc[:, 0].var(ddof=0) / M_base**2
        trace_G = sum_diag_nodeflate[0] / M_base
        deflation_ratio = self.compute_PCS_deflation_ratio(pve, N, M_base, mean_l2, trace_G, var_diag_nodeflate, var_diag_deflate)

    return trace_ratios, deflation_ratio
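# Illustrative sketch (not part of the pipeline): the trace ratio computed above is just the
# ratio of column sums of the GRM diagonals, with and without PC deflation. The values below
# are hypothetical:
#
#   >>> import pandas as pd
#   >>> idx = pd.MultiIndex.from_tuples([('f1', 'i1'), ('f2', 'i2')], names=['fid', 'iid'])
#   >>> df = pd.DataFrame({'diag_G_deflate': [0.8, 1.0], 'diag_G_nodeflate': [1.0, 1.2]}, index=idx)
#   >>> round(float(df['diag_G_nodeflate'].sum() / df['diag_G_deflate'].sum()), 4)
#   1.2222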
def read_Gty_files(self, args, sumstats_prefix, sumstats_prefix_chr, category_names, N, mean_Q):

    #read otherstats files
    df_otherstats_list = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'otherstats', 'otherstats', 'otherstats', join_axis=None, use_tqdm=False, allow_duplicates=True)

    #sum M2 (sum of squares of annotations) of each annotation
    M_annot_sumstats2 = np.zeros(len(category_names))
    for df_otherstats_chr in df_otherstats_list:
        for c_i, c in enumerate(category_names):
            if len(category_names) == 1:
                df_M_annot_sumstats2 = df_otherstats_chr.loc[df_otherstats_chr['Property'].str.startswith('M2_'), 'Value']
            else:
                df_M_annot_sumstats2 = df_otherstats_chr.query('Property == "M2_%s"'%(c))['Value']
            if df_M_annot_sumstats2.shape[0] == 0:
                raise ValueError('M2_%s not found in otherstats file'%(c))
            if df_M_annot_sumstats2.shape[0] > 1:
                raise ValueError('Multiple M2_%s values found in otherstats file'%(c))
            M_annot_sumstats2[c_i] += df_M_annot_sumstats2.values[0]

    #load PCGC Gty files and aggregate them (across chromosomes)
    if args.no_Gty:
        df_Gty = self.create_synthetic_Gty(N, mean_Q, category_names)
    else:
        df_Gty = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'Gty.gz', 'Gty', 'Gty', join_axis=0, index_col=['fid', 'iid'], use_tqdm=False, allow_duplicates=True)
        df_Gty = np.sqrt((df_Gty**2).groupby(['fid', 'iid']).sum())

    #synchronize df_Gty columns to match category_names
    if len(category_names) > 1:
        columns_intersect = df_Gty.columns.intersection(category_names)
        if len(columns_intersect) < len(category_names):
            raise ValueError('Gty files and prodr2 files must have the same annotations')
        if len(columns_intersect) < df_Gty.shape[1]:
            logging.warning('Gty file has unused annotations')
        if not np.all(df_Gty.columns == category_names):
            df_Gty = df_Gty.loc[:, category_names]

    #normalize Gty by the total number of SNPs in the genome
    for anno_i in range(df_Gty.shape[1]):
        df_Gty.iloc[:, anno_i] /= np.sqrt(M_annot_sumstats2[anno_i])

    M_base = M_annot_sumstats2[0]
    return df_Gty, M_base
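# Illustrative sketch (not part of the pipeline): per-chromosome Gty values for the same
# individual are combined via a root-sum-of-squares, since the contributions are summed on
# the squared scale. Hypothetical values for one individual seen on two chromosomes:
#
#   >>> import numpy as np, pandas as pd
#   >>> idx = pd.MultiIndex.from_tuples([('f1', 'i1'), ('f1', 'i1')], names=['fid', 'iid'])
#   >>> gty = pd.DataFrame({'base': [3.0, 4.0]}, index=idx)
#   >>> float(np.sqrt((gty**2).groupby(['fid', 'iid']).sum()).iloc[0, 0])
#   5.0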
def read_sumstats(self, args, sumstats_prefix, sumstats_prefix_chr):

    #load summary statistics
    if args.he:
        try:
            df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, '', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)
        except IOError:
            df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'sumstats.gz', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)
    else:
        df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'sumstats.gz', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)

    #transform the Z column if the sumstats weren't created especially for PCGC
    if args.he:
        if 'pcgc_sumstat' not in df_sumstats.columns:
            if 'Z' not in df_sumstats.columns:
                raise ValueError('cannot find a Z column in summary statistics file')
            else:
                df_sumstats['pcgc_sumstat'] = df_sumstats['Z'] * np.sqrt(df_sumstats['N'])

    #if HE regression is used, create default naive values for PCGC-relevant fields
    if args.he:
        var_t = 0
        mean_Q = 1
        pve = np.array([])
        N = df_sumstats['N'].mean()

    #otherwise, load PCGC otherstats files
    else:
        df_otherstats_list = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'otherstats', 'otherstats', 'otherstats', join_axis=None, use_tqdm=False, allow_duplicates=True)
        df_otherstats = df_otherstats_list[0]
        var_t = df_otherstats.query('Property == "var_t"')['Value'].values[0]
        pve = df_otherstats.loc[df_otherstats['Property'].str.startswith('pve'), 'Value'].values
        mean_Q = df_otherstats.query('Property == "mean_Q"')['Value'].values[0]
        N = df_otherstats.query('Property == "N"')['Value'].values[0]

    #filter out SNPs with very large summary statistics
    if args.chisq_max is None:
        chisq_max = max(0.001 * df_sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max
    df_sumstats['Z2'] = df_sumstats['pcgc_sumstat']**2 / df_sumstats['N'] / mean_Q
    is_large_z = (df_sumstats['Z2'] > chisq_max)
    if is_large_z.any():
        logging.warning('Removing %d summary statistics with Z^2 > %0.1f'%(is_large_z.sum(), chisq_max))
        df_sumstats = df_sumstats.loc[~is_large_z]

    return df_sumstats, var_t, pve, mean_Q, N
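# Illustrative sketch (not part of the pipeline): the default chi^2 filtering threshold is
# max(0.001*max(N), 80), so it only grows beyond 80 once the largest sample size exceeds
# 80,000. For example, with a hypothetical max(N) of 500,000:
#
#   >>> max(0.001 * 500000, 80)
#   500.0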
def load_annotations(self, anno, anno_chr, sync_prefix):

    #extract SNP names
    if self.genetic_format == 'plink':
        index_snpnames = self.bfile['df_bim'].index
    elif self.genetic_format == 'bgen':
        raise NotImplementedError()
    else:
        raise ValueError('illegal option')

    #load annotations (or create baseline annotation only)
    if (anno is None) and (anno_chr is None):
        df_annotations = pd.DataFrame(np.ones(len(index_snpnames)), index=index_snpnames, columns=['base'])
    else:
        df_annotations = pcgc_utils.load_dfs(anno, anno_chr, 'annot.gz', 'annot', 'annot', index_col='SNP')
        df_annotations.drop(columns=['CHR', 'CM', 'BP'], inplace=True)
        if not np.allclose(df_annotations.iloc[:, 0], 1):
            raise ValueError('The first annotation must be the base annotation (all SNPs must have value 1.0)')

        #apply min_annot correction to ensure no negative values
        category_names = df_annotations.columns
        if sync_prefix is None:
            raise ValueError('--annot and --annot-chr must be used together with --sync')
        df_sync = pd.read_table(sync_prefix + 'sync', index_col='Category')
        if df_sync.shape[0] != len(category_names) or not np.all(df_sync.index == category_names):
            raise ValueError('Annotations in sync file do not match those in annotations/prodr2 files')
        min_annot = df_sync['min_annot'].values
        df_annotations -= min_annot

    #remove annotations for unused SNPs
    is_same = (df_annotations.shape[0] == len(index_snpnames)) and (df_annotations.index == index_snpnames).all()
    if not is_same:
        has_anno = index_snpnames.isin(df_annotations.index).all()
        if not has_anno:
            raise ValueError('not all SNPs have annotations')
        df_annotations = df_annotations.loc[index_snpnames]

    #save the df in a class member
    self.df_annotations_noneg = df_annotations
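# Illustrative sketch (not part of the pipeline): the min_annot correction subtracts each
# annotation's (non-positive) minimum value, so continuous annotations become non-negative
# while the base annotation is left untouched. Hypothetical values:
#
#   >>> import numpy as np, pandas as pd
#   >>> df = pd.DataFrame({'base': [1.0, 1.0], 'cont': [-0.5, 2.0]})
#   >>> min_annot = np.minimum(df.min(axis=0).values, 0)  # array([ 0. , -0.5])
#   >>> (df - min_annot)['cont'].tolist()
#   [0.0, 2.5]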
def load_all_snp_indices(self, args):
    index_list = []

    if args.annot is not None or args.annot_chr is not None:
        df_annot_index = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
        index_list.append(df_annot_index.index)
        if args.fit_intercept:
            df_l2_index = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'l2.ldscore.gz', 'l2.ldscore', 'annot', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
            index_list.append(df_l2_index.index)

    if args.w_ld is not None or args.w_ld_chr is not None:
        df_w_ld_index = pcgc_utils.load_dfs(args.w_ld, args.w_ld_chr, 'l2.ldscore.gz', 'l2.ldscore', 'w-ld', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
        index_list.append(df_w_ld_index.index)

    if len(index_list) == 0:
        index_intersect = None
    elif len(index_list) == 1:
        index_intersect = index_list[0]
    else:
        index_intersect = reduce(lambda i1, i2: i1.intersection(i2), index_list)

    if index_intersect is not None:
        assert not index_intersect.duplicated().any()
    return index_intersect
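# Illustrative sketch (not part of the pipeline): the intersection above folds pairwise
# Index.intersection calls over the list; note that reduce must be imported from functools
# under Python 3 (assumed to happen at the top of this module):
#
#   >>> from functools import reduce
#   >>> import pandas as pd
#   >>> idx_list = [pd.Index(['rs1', 'rs2', 'rs3']), pd.Index(['rs2', 'rs3', 'rs4'])]
#   >>> reduce(lambda i1, i2: i1.intersection(i2), idx_list).tolist()
#   ['rs2', 'rs3']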
def load_prodr2(self, args):
    df_prodr2 = pcgc_utils.load_dfs(args.prodr2, args.prodr2_chr, 'prodr2', 'prodr2', 'prodr2', use_tqdm=False)
    df_prodr2.index.name = 'Category'
    df_prodr2 = df_prodr2.groupby(by=['Category']).sum()
    assert df_prodr2.shape[0] == len(df_prodr2.columns.intersection(df_prodr2.index))
    df_prodr2 = df_prodr2.loc[df_prodr2.columns, df_prodr2.columns]
    assert (df_prodr2.columns == df_prodr2.index).all()
    if args.annot is None and args.annot_chr is None:
        if df_prodr2.shape[1] > 1:
            logging.warning('Using only the first annotation in prodr2 file!!!')
        df_prodr2 = df_prodr2.iloc[:1, :1]
    return df_prodr2
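# Illustrative sketch (not part of the pipeline): per-chromosome prodr2 blocks are stacked
# row-wise, so summing within each Category and reindexing rows by the column order yields a
# single aligned (symmetric) matrix. Hypothetical two-annotation blocks:
#
#   >>> import pandas as pd
#   >>> blocks = pd.DataFrame([[1.0, 0.5], [0.5, 2.0], [2.0, 1.0], [1.0, 3.0]],
#   ...                       index=pd.Index(['base', 'coding', 'base', 'coding'], name='Category'),
#   ...                       columns=['base', 'coding'])
#   >>> summed = blocks.groupby(by=['Category']).sum()
#   >>> summed.loc[summed.columns, summed.columns].values.tolist()
#   [[3.0, 1.5], [1.5, 5.0]]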
def compute_r2_prod(args):

    #read bim/snp
    array_snps = ps.PlinkBIMFile(args.bfile + '.bim')
    snpnames = array_snps.df['SNP']

    #read annotations file
    if args.annot is None and args.annot_chr is None:
        df_annotations = pd.DataFrame(np.ones(snpnames.shape[0]), index=snpnames, columns=['base'])
    else:
        df_annotations = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', join_axis=0, index_col='SNP')
        df_annotations.drop(columns=['CHR', 'CM', 'BP'], inplace=True)

        #apply min_annot correction to ensure no negative values
        category_names = df_annotations.columns
        if args.sync is None:
            raise ValueError('--annot must be used together with --sync')
        df_sync = pd.read_table(args.sync + 'sync', index_col='Category')
        if df_sync.shape[0] != len(category_names) or not np.all(df_sync.index == category_names):
            raise ValueError('Annotations in sync file do not match those in annotations/prodr2 files')
        min_annot = df_sync['min_annot'].values
        df_annotations -= min_annot

    #mark which SNPs to keep
    is_good_snp = np.ones(len(snpnames), dtype=bool)
    if args.exclude is not None:
        df_exclude = pd.read_table(args.exclude, header=None).iloc[:, 0]
        is_good_snp = is_good_snp & (~snpnames.isin(df_exclude))
        logging.info('Excluding %d SNPs'%(np.sum(snpnames.isin(df_exclude))))
    if args.extract is not None:
        df_extract = pd.read_table(args.extract, header=None).iloc[:, 0]
        is_good_snp = is_good_snp & (snpnames.isin(df_extract))
        logging.info('Extracting %d SNPs'%(np.sum(snpnames.isin(df_extract))))
    if args.ld_all:
        keep_snps = None
        is_r2_snp = is_good_snp
    else:
        keep_snps = np.where(is_good_snp)[0]
        is_r2_snp = np.ones(len(keep_snps), dtype=bool)
        snpnames = snpnames.iloc[keep_snps]

    #keep only annotations of SNPs in plink file
    if not snpnames.isin(df_annotations.index).all():
        raise ValueError('not all SNPs have annotations')
    df_annotations = df_annotations.loc[snpnames]
    logging.info('Computing r^2 products for %d SNPs'%(len(snpnames)))

    #find #individuals in bfile
    df_fam = pd.read_table(args.bfile + '.fam', header=None)
    n = df_fam.shape[0]

    #read plink file
    keep_indivs = None
    mafMin = None
    logging.info('Loading SNP file...')
    geno_array = ldscore_r2.PlinkBEDFile(args.bfile + '.bed', n, array_snps, is_r2_snp, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin)

    #compute r2_prod_table
    logging.info('Computing r2 prod...')
    coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
    block_left = ldscore_r2.getBlockLefts(coords, args.ld_wind_cm)
    if block_left[-1] == 0:
        raise ValueError('Only a single block selected - this is probably a mistake')
    t0 = time.time()
    geno_array._currentSNP = 0
    r2prod_table = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=df_annotations.values)

    df_r2prod_table = pd.DataFrame(r2prod_table, index=df_annotations.columns, columns=df_annotations.columns)
    return df_r2prod_table
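# Illustrative sketch (not part of the pipeline): assuming ldscore_r2.getBlockLefts follows
# the LDSC convention, block_left[j] is the index of the leftmost SNP within the window of
# SNP j, so a final entry of 0 means the whole chromosome fell into a single block. For
# hypothetical CM coordinates [0.0, 0.5, 1.2] and a 1.0 CM window, the expected result is:
#
#   block_left = [0, 0, 1]   # SNP 2 is more than 1.0 CM away from SNP 0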
def __init__(self, args):

    #read phenotypes file
    df_pheno = self.read_pheno_file(args)

    #read covariates
    if args.covar is not None:
        df_covar = pd.read_table(args.covar, delim_whitespace=True)
        df_covar = self.add_fid_iid_index(df_covar)

        #merge individuals across phenotypes and covariates
        index_intersect = df_pheno.index.intersection(df_covar.index)
        if len(index_intersect) < len(df_pheno.index):
            if len(index_intersect) == 0:
                raise ValueError('no individuals have both phenotype and covariates data')
            df_pheno = df_pheno.loc[index_intersect]
            df_covar = df_covar.loc[index_intersect]
        logging.info('%d individuals have both phenotypes and covariates data'%(df_covar.shape[0]))

    #read MAFs file if it exists
    if args.frqfile is not None or args.frqfile_chr is not None:
        df_maf = pcgc_utils.load_dfs(args.frqfile, args.frqfile_chr, 'frq', 'frq', 'frqfile', index_col='SNP')
    else:
        df_maf = None
        logging.warning('MAF file not provided! We will use the in-sample MAF estimates (this is strongly discouraged!)')

    #read bfile or bgen file
    if args.bfile is not None:
        assert args.bgen is None, '--bfile and --bgen cannot both be specified'
        self.bfile, df_pheno, df_maf, self.num_snps, self.sample_size = self.read_plink(args, df_pheno, df_maf)
        self.genetic_format = 'plink'
    elif args.bgen is not None:
        assert args.bfile is None, '--bfile and --bgen cannot both be specified'
        raise NotImplementedError('bgen functionality not yet implemented')
        self.genetic_format = 'bgen'
    else:
        raise ValueError('either --bfile or --bgen must be specified')

    #save MAFs
    if df_maf is not None:
        maf_col = self.find_df_column(df_maf, ['MAF', 'FRQ', 'FREQ', 'A1Freq'], 'MAF')
        self.mafs = df_maf[maf_col]
    else:
        self.mafs = None

    #extract relevant covariates and compute the Cholesky factorization of the hat matrix
    if args.covar is None:
        C = None
        self.C_regress = None
        self.L_CTC = None
        covars_regress_cols = []
    else:
        #reorder individuals to make sure that the covariates match the other files
        is_same = (df_covar.shape[0] == df_pheno.shape[0]) and (df_covar.index == df_pheno.index).all()
        if not is_same:
            df_covar = df_covar.loc[df_pheno.index]
        C = df_covar.iloc[:, 2:]

        #extract relevant covariates
        if args.covars_regress is None:
            self.C_regress = None
            self.L_CTC = None
            covars_regress_cols = []
        else:
            assert args.covar is not None
            covars_regress_cols = args.covars_regress.split(',')
            for c in covars_regress_cols:
                if c not in df_covar.columns:
                    raise ValueError('%s is not in the covariates file'%(c))
            self.C_regress = df_covar[covars_regress_cols].values

            #compute the Cholesky factorization of the hat matrix
            self.L_CTC = la.cho_factor(self.C_regress.T.dot(self.C_regress))

    #load pve file if it exists
    if args.pve is None:
        if args.covars_regress is not None:
            raise ValueError('--covars_regress must be used with --pve')
        self.pve = []
    else:
        df_pve = pd.read_table(args.pve, header=None)
        if df_pve.shape[1] > 1:
            raise ValueError('pve file must include only a single column')
        if df_pve.shape[0] < len(covars_regress_cols):
            raise ValueError('There are fewer pve values than covariates to regress')
        df_pve.columns = ['pve']
        if df_pve.shape[0] > len(covars_regress_cols):
            logging.warning('There are more pve values than covariates to regress. Using only top %d pve values'%(len(covars_regress_cols)))
            df_pve = df_pve.sort_values('pve', ascending=False).head(len(covars_regress_cols))
        self.pve = df_pve.iloc[:, 0].values

    #transform phenotypes to 0/1
    y = df_pheno.iloc[:, -1]
    num_pheno = len(np.unique(y))
    if num_pheno == 1:
        raise ValueError('only one phenotype value found!')
    elif num_pheno > 2:
        raise ValueError('phenotype file must include only cases and controls')
    y = (y > y.mean()).astype(int).values

    #compute PCGC statistics
    P = y.mean()
    u0, u1, ty, self.var_t = self.compute_pcgc_stats(args.prev, P, y, C)
    y_norm = (y - P) / np.sqrt(P * (1 - P))
    self.z_coeff = y_norm * (u0 + u1)
    self.mean_Q = np.mean((u0 + u1)**2)
    self.prev = args.prev

    #load annotations
    self.load_annotations(args.annot, args.annot_chr, args.sync)
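# Illustrative sketch (not part of the pipeline): y_norm standardizes the 0/1 phenotype to
# zero mean and unit variance, e.g. with a balanced case/control sample (P = 0.5):
#
#   >>> import numpy as np
#   >>> y = np.array([0, 1, 1, 0])
#   >>> P = y.mean()
#   >>> ((y - P) / np.sqrt(P * (1 - P))).tolist()
#   [-1.0, 1.0, 1.0, -1.0]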
def pcgc_sync2(args):

    #define the list of annotation file names
    if args.annot is None and args.annot_chr is None:
        raise ValueError('you must use either --annot or --annot-chr')
    if args.annot is not None and args.annot_chr is not None:
        raise ValueError('you cannot use both --annot and --annot-chr')
    if args.annot is not None:
        annot_fname_list = [args.annot + 'annot.gz']
    else:
        annot_fname_list = [args.annot_chr + '%d.annot.gz'%(chr_num) for chr_num in range(1, 23)]

    #round 1: collect min_annot fields
    logging.info('Computing annotation minimum values')
    min_annot_list = []
    for annot_fname in tqdm(annot_fname_list, disable=len(annot_fname_list)==1):
        min_annot_list.append(collect_annotations_chr_info(annot_fname).values[:, 0])

    #compute min_annot across all annotations
    min_annot = np.min(np.array(min_annot_list), axis=0)
    min_annot[min_annot > 0] = 0

    #collect MAFs
    df_frq = pcgc_utils.load_dfs(args.frqfile, args.frqfile_chr, 'frq', 'frq', 'frqfile', join_axis=0, index_col='SNP', usecols=['SNP', 'MAF'])
    df_frq = df_frq['MAF']

    #round 2: collect all fields
    overlap_matrix = 0
    overlap_matrix_common = 0
    df_sync_list = []
    logging.info('Collecting annotation details')
    for annot_fname in tqdm(annot_fname_list, disable=len(annot_fname_list)==1):
        df_sync, df_overlap, df_overlap_common = collect_annotations_chr_info(annot_fname, df_frq, min_annot)
        df_sync_list.append(df_sync)
        overlap_matrix += df_overlap.values
        overlap_matrix_common += df_overlap_common.values

    #create df_overlap and df_overlap_common
    df_overlap = pd.DataFrame(overlap_matrix, index=df_overlap.index, columns=df_overlap.columns)
    df_overlap_common = pd.DataFrame(overlap_matrix_common, index=df_overlap_common.index, columns=df_overlap_common.columns)

    #group df_sync results
    func_dict = {}
    func_dict['min_annot'] = np.min
    func_dict['is_continuous'] = np.any
    func_dict['M'] = np.sum
    func_dict['M2'] = np.sum
    func_dict['M_noneg'] = np.sum
    func_dict['M2_noneg'] = np.sum
    func_dict['M_5_50'] = np.sum
    func_dict['M2_5_50'] = np.sum
    func_dict['M_5_50_noneg'] = np.sum
    func_dict['M2_5_50_noneg'] = np.sum
    assert np.all([c in func_dict for c in df_sync.columns])
    df_sync_concat = pd.concat(df_sync_list, axis=0)
    df_sync = df_sync_concat.groupby(df_sync_concat.index).agg(func_dict)
    df_sync = df_sync.loc[df_sync_list[0].index]

    #add df index names
    df_sync.index.name = 'Category'
    df_overlap.index.name = 'Category'
    df_overlap_common.index.name = 'Category'

    return df_sync, df_overlap, df_overlap_common
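# Illustrative sketch (not part of the pipeline): the per-chromosome sync tables are
# concatenated and then aggregated per Category with a field-appropriate reducer (sums for
# SNP counts, min for min_annot, any for the continuity flag). Hypothetical values:
#
#   >>> import numpy as np, pandas as pd
#   >>> chr1 = pd.DataFrame({'M': [10], 'min_annot': [-0.2], 'is_continuous': [True]}, index=['cont'])
#   >>> chr2 = pd.DataFrame({'M': [15], 'min_annot': [0.0], 'is_continuous': [True]}, index=['cont'])
#   >>> both = pd.concat([chr1, chr2], axis=0)
#   >>> agg = both.groupby(both.index).agg({'M': np.sum, 'min_annot': np.min, 'is_continuous': np.any})
#   >>> int(agg.loc['cont', 'M']), float(agg.loc['cont', 'min_annot'])
#   (25, -0.2)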
def load_annotations_data(self, args, df_prodr2, index_intersect):

    if args.annot is None and args.annot_chr is None:
        #create relevant data for a single annotation
        if args.M is None:
            raise ValueError('--M must be used when not using --annot or --annot-chr')
        if args.not_M_5_50 is not None:
            raise ValueError('--not-M-5-50 cannot be used without using --annot or --annot-chr')
        if args.fit_intercept:
            raise ValueError('--fit-intercept cannot be used without using --annot or --annot-chr')
        M_annot = np.ones(1) * args.M
        df_annotations_sumstats_noneg = pd.DataFrame(np.ones(len(index_intersect)), index=index_intersect, columns=['base'])
        df_sync = pd.DataFrame(index=['base'])
        df_sync['min_annot'] = 0
        df_sync['M2_5_50'] = args.M
        df_sync['is_continuous'] = False
        df_overlap = pd.DataFrame(index=['base'])
        df_overlap['base'] = args.M
        df_l2 = None
        df_w_ld = None

    else:
        #load M_annot
        if args.not_M_5_50:
            M_suffix = 'l2.M'
        else:
            M_suffix = 'l2.M_5_50'
        df_M_annot = pcgc_utils.load_dfs(args.annot, args.annot_chr, M_suffix, 'M', 'annot', header=None, use_tqdm=False)
        M_annot = df_M_annot.sum(axis=0).values
        if M_annot.shape[0] != df_prodr2.shape[1]:
            raise ValueError('.M files have a different number of columns than .prodr2 files')

        #read df_sync and overlap matrix
        if args.sync is None:
            raise ValueError('--sync not provided')
        df_sync = pd.read_table(args.sync + 'sync', index_col='Category')
        overlap_suffix = 'overlap' if args.not_M_5_50 else 'overlap_5_50'
        df_overlap = pd.read_table(args.sync + overlap_suffix, sep=r'\s+', index_col='Category')
        if df_sync.shape[0] != df_prodr2.shape[1] or not np.all(df_sync.index == df_prodr2.columns):
            raise ValueError('sync and prodr2 files must have the same annotations')
        if df_overlap.shape[0] != df_prodr2.shape[1] or not np.all(df_overlap.index == df_prodr2.columns):
            raise ValueError('overlap_matrix and prodr2 files must have the same annotations')

        #read SNP data files
        df_annotations_sumstats_noneg = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', index_col='SNP', index_intersect=index_intersect, use_tqdm=True)
        df_annotations_sumstats_noneg.drop(columns=['CHR', 'CM', 'BP'], inplace=True)
        if df_annotations_sumstats_noneg.shape[1] != df_prodr2.shape[1] or not np.all(df_annotations_sumstats_noneg.columns == df_prodr2.columns):
            raise ValueError('annotation and prodr2 files must have the same annotations')
        df_annotations_sumstats_noneg -= df_sync['min_annot'].values
        df_list = [df_annotations_sumstats_noneg]
        if args.fit_intercept:
            df_l2 = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'l2.ldscore.gz', 'l2.ldscore', 'annot', index_col='SNP', usecols=['SNP', 'baseL2'], index_intersect=index_intersect)
            df_w_ld = pcgc_utils.load_dfs(args.w_ld, args.w_ld_chr, 'l2.ldscore.gz', 'l2.ldscore', 'w-ld', index_col='SNP', usecols=['SNP', 'L2'], index_intersect=index_intersect)
            df_l2 = df_l2['baseL2']
            df_w_ld = df_w_ld['L2']
            df_list += [df_l2, df_w_ld]
        else:
            df_l2, df_w_ld = None, None

        #make sure that all the dfs are perfectly aligned
        for df_i, df in enumerate(df_list):
            assert not df.index.duplicated().any()
            index_intersect_df = index_intersect.intersection(df.index)
            if len(index_intersect_df) < len(index_intersect):
                raise ValueError("not all SNPs found in the annotation or LD score files - this shouldn't happen")
            if not (df.index == index_intersect).all():
                df_list[df_i] = df.loc[index_intersect]
        df_annotations_sumstats_noneg = df_list[0]
        if args.fit_intercept:
            df_l2, df_w_ld = df_list[1], df_list[2]

    #restrict annotations if requested
    if args.keep_anno is not None or args.remove_anno is not None:

        #find the set of annotations to select
        category_names = df_prodr2.columns
        remove_anno = set([] if (args.remove_anno is None) else args.remove_anno.split(','))
        keep_anno = set(category_names if (args.keep_anno is None) else args.keep_anno.split(','))
        if len(keep_anno.intersection(set(category_names))) < len(keep_anno):
            raise ValueError('--keep-anno includes non-existing annotations')
        if len(remove_anno.intersection(set(category_names))) < len(remove_anno):
            raise ValueError('--remove-anno includes non-existing annotations')

        #keep only the selected annotations
        anno_arr = [c for c in category_names if ((c in keep_anno) and (c not in remove_anno))]
        if len(anno_arr) < len(category_names):
            selected_anno = np.isin(category_names, anno_arr)
            M_annot = M_annot[selected_anno]
            df_prodr2 = df_prodr2.loc[anno_arr, anno_arr]
            df_annotations_sumstats_noneg = df_annotations_sumstats_noneg[anno_arr]
            df_sync = df_sync.loc[anno_arr]
            df_overlap = df_overlap.loc[anno_arr, anno_arr]
        logging.info('%d annotations remained after applying --keep-anno and --remove-anno'%(df_prodr2.shape[1]))

    return M_annot, df_prodr2, df_annotations_sumstats_noneg, df_sync, df_overlap, df_l2, df_w_ld
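# Illustrative sketch (not part of the pipeline): --keep-anno and --remove-anno reduce to a
# simple set filter over the annotation names, preserving the original column order.
# Hypothetical names:
#
#   >>> category_names = ['base', 'coding', 'enhancer']
#   >>> keep_anno, remove_anno = set(category_names), {'enhancer'}
#   >>> [c for c in category_names if (c in keep_anno) and (c not in remove_anno)]
#   ['base', 'coding']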