def polyloc_partitions(self, args):
    self.load_posterior_betas(args)
    self.partition_snps_to_bins(args, use_ridge=False)

    #add another partition for all SNPs not in the posterior file
    df_bim_list = []
    for chr_num in range(1, 23):
        df_bim_chr = pd.read_table(args.bfile_chr + '%d.bim' % (chr_num),
                                   sep=r'\s+',
                                   names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'],
                                   header=None)
        df_bim_list.append(df_bim_chr)
    df_bim = pd.concat(df_bim_list, axis=0)
    df_bim = set_snpid_index(df_bim)
    self.df_bins = set_snpid_index(self.df_bins)

    #make sure that all variants in the posterior file are also in the plink files
    if np.any(~self.df_bins.index.isin(df_bim.index)):
        raise ValueError('Found variants in posterior file that are not found in the plink files')

    #add a new bin for SNPs that are not found in the posterior file (if there are any)
    if df_bim.shape[0] > self.df_bins.shape[0]:
        new_snps = df_bim.index[~df_bim.index.isin(self.df_bins.index)]
        df_bins_new = df_bim.loc[new_snps, SNP_COLUMNS].copy()
        for colname in self.df_bins.drop(columns=SNP_COLUMNS).columns:
            df_bins_new[colname] = False
        new_colname = 'snpvar_bin%d' % (df_bins_new.shape[1] - len(SNP_COLUMNS) + 1)
        self.df_bins[new_colname] = False
        df_bins_new[new_colname] = True
        self.df_bins = pd.concat([self.df_bins, df_bins_new], axis=0)

    #save the bins to disk
    self.save_bins_to_disk(args)

    #save the bin sizes to disk
    df_binsize = pd.DataFrame(index=np.arange(1, self.df_bins.shape[1] - len(SNP_COLUMNS) + 1))
    df_binsize.index.name = 'BIN'
    df_binsize['BIN_SIZE'] = [self.df_bins[c].sum()
                              for c in self.df_bins.drop(columns=SNP_COLUMNS).columns]  #saves memory
    df_binsize.to_csv(args.output_prefix + '.binsize', sep='\t', index=True)
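#set_snpid_index is used throughout this listing but is not defined here. The function below is
#a minimal stand-in inferred from its call sites (df, copy=..., allow_duplicates=...): it builds
#a CHR.BP.allele.allele index with the alleles in a canonical order, so the same variant matches
#across files even when A1/A2 are swapped. The real implementation may differ (e.g. in how
#indels or strand flips are handled); treat this as a sketch, not the source.
import pandas as pd

def set_snpid_index_sketch(df, copy=False, allow_duplicates=False):
    if copy:
        df = df.copy()
    a1 = df['A1'].astype(str)
    a2 = df['A2'].astype(str)
    first = a1.where(a1 <= a2, a2)    #canonical (alphabetical) allele order
    second = a2.where(a1 <= a2, a1)
    df.index = (df['CHR'].astype(int).astype(str) + '.' +
                df['BP'].astype(int).astype(str) + '.' + first + '.' + second)
    df.index.name = 'snpid'
    if not allow_duplicates and df.index.duplicated().any():
        raise ValueError('duplicate variants found')
    return df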
def compute_ld_scores(self, args):
    #define the range of chromosomes to iterate over
    if args.chr is None:
        chr_range = range(1, 23)
    else:
        chr_range = range(args.chr, args.chr + 1)

    #iterate over chromosomes and compute LD-scores
    for chr_num in tqdm(chr_range, disable=(len(chr_range) == 1)):

        #load or extract the bins for the current chromosome
        try:
            df_bins_chr = self.df_bins.query('CHR==%d' % (chr_num))
        except AttributeError:
            df_bins_chr = self.load_bins_chr(args, chr_num)

        #compute LD-scores for this chromosome
        if args.ld_ukb:
            if args.ld_dir is None:
                ld_dir = tempfile.mkdtemp()
            else:
                ld_dir = args.ld_dir
            df_bins_chr = set_snpid_index(df_bins_chr)
            df_ldscores_chr = compute_ldscores_chr(df_bins_chr, ld_dir)
        elif args.bfile_chr is not None:
            df_ldscores_chr = self.compute_ldscores_plink_chr(args, chr_num, df_bins_chr)
        else:
            raise ValueError('no LDscore computation method specified')

        #save the LD-scores to disk
        ldscores_output_file = get_file_name(args, 'ldscores', chr_num, verify_exists=False)
        df_ldscores_chr.to_parquet(ldscores_output_file, index=False)
def get_bcor_meta(bcor_obj):
    df_ld_snps = bcor_obj.getMeta()
    df_ld_snps.rename(columns={'rsid': 'SNP', 'position': 'BP', 'chromosome': 'CHR',
                               'allele1': 'A1', 'allele2': 'A2'},
                      inplace=True, errors='raise')
    #np.int was removed in NumPy 1.24; use the builtin int instead
    df_ld_snps['CHR'] = df_ld_snps['CHR'].astype(int)
    df_ld_snps['BP'] = df_ld_snps['BP'].astype(int)
    df_ld_snps = set_snpid_index(df_ld_snps)
    return df_ld_snps
def read_annot(annot_file):
    try:
        df_annot = pd.read_parquet(annot_file)
    except (ArrowIOError, ArrowInvalid):
        df_annot = pd.read_table(annot_file, sep=r'\s+')

    #make sure all SNP identifier columns are present
    for col in ['CHR', 'SNP', 'BP', 'A1', 'A2']:
        assert col in df_annot.columns, 'missing column %s' % (col)

    #make sure all annotation columns are numeric
    for c in df_annot.columns:
        if c in META_COLUMNS:
            continue
        if not is_numeric_dtype(df_annot[c]):
            raise ValueError('Annotation %s does not have numeric values' % (c))

    df_annot = set_snpid_index(df_annot)
    return df_annot
def load_ld_matrix(ld_dir, ld_prefix):
    #load the SNPs metadata
    gz_file = os.path.join(ld_dir, '%s.gz' % (ld_prefix))
    try:
        df_ld_snps = pd.read_table(gz_file, sep=r'\s+')
    except (ArrowIOError, ArrowInvalid):
        raise IOError('Corrupt file downloaded')
    df_ld_snps.rename(columns={'rsid': 'SNP', 'chromosome': 'CHR', 'position': 'BP',
                               'allele1': 'A1', 'allele2': 'A2'},
                      inplace=True, errors='ignore')
    for col in ['SNP', 'CHR', 'BP', 'A1', 'A2']:
        assert col in df_ld_snps.columns, 'missing column %s' % (col)
    df_ld_snps = set_snpid_index(df_ld_snps)

    #load the LD matrix
    npz_file = os.path.join(ld_dir, '%s.npz' % (ld_prefix))
    logging.info('Loading LD from file %s' % (npz_file))
    t0 = time.time()
    try:
        R = sparse.load_npz(npz_file).toarray()
        R += R.T  #the npz file stores a single triangle; restore the full symmetric matrix
    except ValueError:
        raise IOError('Corrupt file downloaded')
    logging.info('Done in %0.2f seconds' % (time.time() - t0))

    #create df_R and return it
    df_R = pd.DataFrame(R, index=df_ld_snps.index, columns=df_ld_snps.index)
    return df_R
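#The R += R.T line above assumes the .npz file stores only one triangle of the correlation
#matrix, with the diagonal stored as 0.5 so that adding the transpose yields 1.0 on the
#diagonal. That storage convention is an assumption inferred from the code, not confirmed by
#it; a self-contained round-trip demo under that assumption:
import os
import tempfile
import numpy as np
from scipy import sparse

npz_file = os.path.join(tempfile.mkdtemp(), 'ld_demo.npz')
R_full = np.array([[1.0, 0.3, 0.1],
                   [0.3, 1.0, 0.2],
                   [0.1, 0.2, 1.0]])
R_tri = np.triu(R_full)          #keep only the upper triangle
np.fill_diagonal(R_tri, 0.5)     #halve the diagonal before saving
sparse.save_npz(npz_file, sparse.csr_matrix(R_tri))
R = sparse.load_npz(npz_file).toarray()
R += R.T                         #restore the full symmetric matrix
assert np.allclose(R, R_full)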
t0 = time.time()
try:
    df_snps = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
    df_snps = pd.read_table(args.sumstats, sep=r'\s+')
for col in ['A1', 'A2', 'CHR', 'BP']:
    if col not in df_snps.columns:
        raise ValueError('missing column %s' % (col))

#set index
df_snps = set_snpid_index(df_snps)
logging.info('Done in %0.2f seconds' % (time.time() - t0))

#make sure there aren't any duplicated SNPs
if np.any(df_snps.index.duplicated()):
    raise ValueError('duplicate SNPs found - please make sure there aren\'t any duplicate SNPs in your sumstats file')

#read df_meta
logging.info('Loading meta-analyzed per-SNP-h2 files...')
t0 = time.time()
script_dir = os.path.dirname(os.path.realpath(__file__))
df_meta1 = pd.read_parquet(os.path.join(script_dir, 'snpvar_meta.chr1_7.parquet'))
df_meta2 = pd.read_parquet(
def compute_ldscores(args):
    #read bim/snp
    array_snps = parse.PlinkBIMFile(args.bfile + '.bim')
    df_bim = array_snps.df
    if len(df_bim['CHR'].unique()) > 1:
        raise ValueError('plink file includes multiple chromosomes. Please specify a plink file with a single chromosome')
    df_bim = set_snpid_index(df_bim)

    #read annotations
    keep_snps = None
    if args.annot is not None:
        try:
            df_annot = pd.read_parquet(args.annot)
        except (ArrowIOError, ArrowInvalid):
            df_annot = pd.read_table(args.annot, sep=r'\s+')

        #remove annotations of SNPs that are not in the .bim file
        df_annot = set_snpid_index(df_annot)
        df_annot = df_annot.loc[df_annot.index.isin(df_bim.index)]

        #make sure that all SNPs have annotations
        if np.any(~df_bim.index.isin(df_annot.index)):
            error_msg = 'Not all SNPs have annotation values'
            if args.allow_missing:
                is_good_snp = df_bim.index.isin(df_annot.index)
                if not np.any(is_good_snp):
                    raise ValueError('No SNPs have annotations')
                keep_snps = np.where(is_good_snp)[0]
                logging.warning(error_msg)
                logging.warning('Keeping only %d/%d SNPs that have annotations' % (is_good_snp.sum(), len(is_good_snp)))
            else:
                raise ValueError(error_msg + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')

        #make sure that all of the annotations are numeric
        for c in df_annot.columns:
            if c in SNP_COLUMNS:
                continue
            if not is_numeric_dtype(df_annot[c]):
                raise ValueError('Annotation %s does not have numeric values' % (c))

    #find the number of individuals in the bfile
    fam_file = args.bfile + '.fam'
    df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep=r'\s+')
    n = df_fam.shape[0]

    #find keep_indivs
    if args.keep is None:
        keep_indivs = None
    else:
        array_indivs = parse.PlinkFAMFile(args.bfile + '.fam')
        keep_indivs = __filter__(args.keep, 'individuals', 'include', array_indivs)
        logging.info('after applying --keep, %d individuals remain' % (len(keep_indivs)))

    #read plink file
    bed_file = args.bfile + '.bed'
    geno_array = ldscore.PlinkBEDFile(bed_file, n, array_snps,
                                      keep_snps=keep_snps,
                                      keep_indivs=keep_indivs,
                                      mafMin=None)

    #remove omitted SNPs from df_bim
    if len(geno_array.kept_snps) < df_bim.shape[0]:
        assert np.all(np.array(geno_array.kept_snps) == np.sort(np.array(geno_array.kept_snps)))
        assert geno_array.kept_snps[-1] < df_bim.shape[0]
        df_bim = df_bim.iloc[geno_array.kept_snps]

    #rearrange annotations to match the order of SNPs in the plink file
    if args.annot is not None:
        assert df_annot.shape[0] >= df_bim.shape[0]
        if (df_annot.shape[0] > df_bim.shape[0]) or np.any(df_annot.index != df_bim.index):
            assert np.all(df_bim.index.isin(df_annot.index))
            df_annot = df_annot.loc[df_bim.index]

    #determine block widths
    num_wind_args = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
    if np.sum(num_wind_args) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')
    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.arange(geno_array.m)
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(df_bim['BP'])
        if len(np.unique(coords)) == 1:
            raise ValueError('bim file has no basepair data --- please use a different ld-wind option')
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(df_bim['CM'])
        if len(np.unique(coords)) == 1:
            raise ValueError('bim file has no CM data --- please use a different ld-wind option')

    #compute LD-scores
    block_left = ldscore.getBlockLefts(coords, max_dist)
    if block_left[len(block_left) - 1] == 0:
        raise ValueError('Only a single block selected - this is probably a mistake')
    t0 = time.time()
    geno_array._currentSNP = 0
    annot_values = (None if args.annot is None else df_annot.drop(columns=SNP_COLUMNS).values)
    ldscores = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=annot_values)

    #create an ldscores df
    if args.annot is None:
        df_ldscores = pd.DataFrame(ldscores, columns=['base'])
    else:
        df_ldscores = pd.DataFrame(ldscores, columns=df_annot.drop(columns=SNP_COLUMNS).columns)

    #add SNP identifier columns
    for c in SNP_COLUMNS:
        df_ldscores[c] = df_bim[c].values
    df_ldscores = df_ldscores[SNP_COLUMNS + list(df_ldscores.drop(columns=SNP_COLUMNS).columns)]
    return df_ldscores
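#compute_ldscores expects an argparse-style namespace; the attribute names below are exactly
#the ones referenced in the function body (bfile, annot, allow_missing, keep, ld_wind_snps,
#ld_wind_kb, ld_wind_cm, chunk_size). A hypothetical invocation, with placeholder file paths:
from argparse import Namespace

args = Namespace(bfile='mydata.chr1',            #plink prefix containing a single chromosome
                 annot='annotations.chr1.parquet',
                 allow_missing=False,
                 keep=None,                      #optional file listing individuals to keep
                 ld_wind_snps=None,
                 ld_wind_kb=None,
                 ld_wind_cm=1.0,                 #exactly one --ld-wind option must be set
                 chunk_size=50)
df_ldscores = compute_ldscores(args)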
try:
    df_snps = pd.read_parquet(args.pips)
except (ArrowIOError, ArrowInvalid):
    df_snps = pd.read_table(args.pips, sep=r'\s+')
for col in ['A1', 'A2', 'CHR', 'BP', 'PIP']:
    if col not in df_snps.columns:
        raise ValueError('missing column %s' % (col))

#set index
df_snps = set_snpid_index(df_snps)

#restrict to SNPs with a large PIP
df_snps = df_snps.query('PIP >= %s' % (args.pip_cutoff))
if df_snps.shape[0] == 0:
    raise ValueError('No SNPs with PIP >= %s found' % (args.pip_cutoff))

#read df_annot
logging.info('Loading annotations file...')
t0 = time.time()
try:
    df_annot = pd.read_parquet(args.annot)
except (ArrowIOError, ArrowInvalid):
    df_annot = pd.read_table(args.annot, sep=r'\s+')
df_annot = set_snpid_index(df_annot)
logging.info('Done in %0.2f seconds' % (time.time() - t0))
def compute_ldscores_plink_chr(self, args, chr_num, df_bins_chr):
    #read bim/snp
    bim_file = get_file_name(args, 'bim', chr_num)
    array_snps = parse.PlinkBIMFile(bim_file)
    df_bim = array_snps.df
    df_bim = set_snpid_index(df_bim)

    #remove bins of SNPs that are not in the .bim file
    df_bins_chr = set_snpid_index(df_bins_chr)
    df_bins_chr = df_bins_chr.loc[df_bins_chr.index.isin(df_bim.index)]

    #make sure that all SNPs have a bin
    keep_snps = None
    if np.any(~df_bim.index.isin(df_bins_chr.index)):
        error_msg = 'Not all SNPs were assigned a bin (meaning some SNPs are not in the annotation files)'
        if args.allow_missing:
            is_good_snp = df_bim.index.isin(df_bins_chr.index)
            if not np.any(is_good_snp):
                raise ValueError('No SNPs in chromosome %d have annotations' % (chr_num))
            keep_snps = np.where(is_good_snp)[0]
            logging.warning(error_msg)
            logging.warning('Keeping only %d/%d SNPs in chromosome %d that have annotations'
                            % (is_good_snp.sum(), len(is_good_snp), chr_num))
        else:
            raise ValueError(error_msg + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')

    #find the number of individuals in the bfile
    fam_file = get_file_name(args, 'fam', chr_num)
    df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep=r'\s+')
    n = df_fam.shape[0]

    #find keep_indivs
    if args.keep is None:
        keep_indivs = None
    else:
        #use the per-chromosome fam file (args.bfile is not defined for per-chromosome runs)
        array_indivs = parse.PlinkFAMFile(fam_file)
        keep_indivs = __filter__(args.keep, 'individuals', 'include', array_indivs)
        logging.info('after applying --keep, %d individuals remain' % (len(keep_indivs)))

    #read plink file
    logging.info('Loading SNP file...')
    bed_file = get_file_name(args, 'bed', chr_num)
    geno_array = ldscore.PlinkBEDFile(bed_file, n, array_snps,
                                      keep_snps=keep_snps,
                                      keep_indivs=keep_indivs,
                                      mafMin=None)

    #remove omitted SNPs from df_bim
    if len(geno_array.kept_snps) != df_bim.shape[0]:
        assert np.all(np.array(geno_array.kept_snps) == np.sort(np.array(geno_array.kept_snps)))
        assert geno_array.kept_snps[-1] < df_bim.shape[0]
        df_bim = df_bim.iloc[geno_array.kept_snps]

    #rearrange bins to match the order of SNPs in the plink file
    assert df_bins_chr.shape[0] >= df_bim.shape[0]
    if (df_bins_chr.shape[0] > df_bim.shape[0]) or np.any(df_bins_chr.index != df_bim.index):
        assert np.all(df_bim.index.isin(df_bins_chr.index))
        df_bins_chr = df_bins_chr.loc[df_bim.index]

    #determine block widths
    num_wind_args = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
    if np.sum(num_wind_args) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')
    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.arange(geno_array.m)
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(df_bim['BP'])
        if len(np.unique(coords)) == 1:
            raise ValueError('bim file has no basepair data --- please use a different ld-wind option')
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(df_bim['CM'])
        if len(np.unique(coords)) == 1:
            raise ValueError('bim file has no CM data --- please use a different ld-wind option')

    #compute LD-scores
    block_left = ldscore.getBlockLefts(coords, max_dist)
    if block_left[len(block_left) - 1] == 0:
        raise ValueError('Only a single block selected - this is probably a mistake')
    t0 = time.time()
    geno_array._currentSNP = 0
    logging.info('Computing LD scores for chromosome %d' % (chr_num))
    ldscores = geno_array.ldScoreVarBlocks(block_left, args.chunk_size,
                                           annot=df_bins_chr.drop(columns=SNP_COLUMNS).values)

    #create an ldscores df
    df_ldscores = pd.DataFrame(ldscores,
                               index=df_bins_chr.index,
                               columns=df_bins_chr.drop(columns=SNP_COLUMNS).columns)
    df_ldscores = pd.concat((df_bins_chr[SNP_COLUMNS], df_ldscores), axis=1)
    return df_ldscores
def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
    if constrain_range:
        logging.info('Saving constrained SNP variances to disk')
    else:
        logging.info('Saving SNP variances to disk')

    #determine which df_snpvar to use
    if use_ridge:
        df_snpvar = self.df_snpvar_ridge
    else:
        df_snpvar = self.df_snpvar

    #constrain the ratio between the largest and smallest snp-var
    if constrain_range:
        df_snpvar = df_snpvar.copy()
        h2_total = df_snpvar['SNPVAR'].sum()
        min_snpvar = df_snpvar['SNPVAR'].max() / args.q
        df_snpvar.loc[df_snpvar['SNPVAR'] < min_snpvar, 'SNPVAR'] = min_snpvar
        df_snpvar['SNPVAR'] *= h2_total / df_snpvar['SNPVAR'].sum()
        assert np.isclose(df_snpvar['SNPVAR'].sum(), h2_total)

    #merge snpvar with sumstats
    try:
        df_sumstats = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_sumstats = pd.read_table(args.sumstats, sep=r'\s+')
    df_sumstats.drop(columns=['SNP'], errors='ignore', inplace=True)
    for col in ['CHR', 'BP', 'A1', 'A2']:
        if col not in df_sumstats.columns:
            raise ValueError('sumstats file has a missing column: %s' % (col))
    df_snpvar = set_snpid_index(df_snpvar, copy=True)
    df_sumstats = set_snpid_index(df_sumstats)
    snpvar_cols = df_snpvar.columns.copy()
    df_snpvar.drop(columns=['CHR', 'BP', 'A1', 'A2'], inplace=True)
    df_snpvar = df_snpvar.merge(df_sumstats, left_index=True, right_index=True)
    df_snpvar = df_snpvar[list(snpvar_cols) +
                          [c for c in df_sumstats.columns if c not in list(snpvar_cols)]]
    if df_snpvar.shape[0] < df_sumstats.shape[0]:
        error_message = 'not all SNPs in the sumstats file are also in the annotations file'
        if args.allow_missing:
            logging.warning(error_message + '. Keeping %d/%d SNPs' % (df_snpvar.shape[0], df_sumstats.shape[0]))
        else:
            raise ValueError(error_message + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')

    #iterate over chromosomes
    for chr_num in tqdm(range(1, 23)):

        #define output file name
        output_fname = 'snpvar'
        if use_ridge:
            output_fname += '_ridge'
        if constrain_range:
            output_fname += '_constrained'
        snpvar_chr_file = get_file_name(args, output_fname, chr_num, verify_exists=False)

        #save snpvar to file
        df_snpvar_chr = df_snpvar.query('CHR==%d' % (chr_num))
        df_snpvar_chr.to_csv(snpvar_chr_file, index=False, sep='\t', compression='gzip', float_format='%0.4e')
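#The constrain_range block above caps the ratio between the largest and smallest per-SNP
#variance at args.q: values below max/q are floored, and everything is rescaled so the total
#heritability is unchanged (rescaling preserves the ratio). A small numeric illustration with
#made-up values:
import numpy as np
import pandas as pd

snpvar = pd.Series([1e-8, 4e-5, 1e-4])
q = 100.0
h2_total = snpvar.sum()
min_snpvar = snpvar.max() / q                 #floor at 1e-6
snpvar[snpvar < min_snpvar] = min_snpvar
snpvar *= h2_total / snpvar.sum()             #rescale to preserve total h2
assert np.isclose(snpvar.sum(), h2_total)
assert snpvar.max() / snpvar.min() <= q * 1.000001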
def main(args):
    #read sumstats file
    try:
        df_sumstats = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_sumstats = pd.read_table(args.sumstats, sep=r'\s+')

    #compute p-values if needed
    if args.pvalue_cutoff is not None:
        df_sumstats['P'] = stats.chi2(1).sf(df_sumstats['Z']**2)

    #read regions file
    df_regions = pd.read_table(args.regions_file)
    if args.chr is not None:
        df_regions = df_regions.query('CHR==%d' % (args.chr))
        if df_regions.shape[0] == 0:
            raise ValueError('no regions found in chromosome %d' % (args.chr))
    df_regions = df_regions.loc[df_regions.apply(
        lambda r: np.any((df_sumstats['CHR'] == r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))),
        axis=1)]

    #aggregate outputs
    df_sumstats_list = []
    logging.info('Aggregating results...')
    for _, r in tqdm(df_regions.iterrows()):
        chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']

        #apply p-value filter if needed
        if args.pvalue_cutoff is not None:
            df_sumstats_r = df_sumstats.query('CHR==%d & %d <= BP <= %d' % (chr_num, start, end))
            if np.all(df_sumstats_r['P'] > args.pvalue_cutoff):
                continue

        output_file_r = '%s.chr%s.%s_%s.gz' % (args.out_prefix, chr_num, start, end)
        if not os.path.exists(output_file_r):
            err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist' % (chr_num, start, end)
            if args.allow_missing_jobs:
                logging.warning(err_msg)
                continue
            else:
                raise IOError(err_msg + '.\nTo override this error, please provide the flag --allow-missing-jobs')
        df_sumstats_r = pd.read_table(output_file_r)

        #mark distance from center
        middle = (start + end) // 2
        df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] - middle)
        df_sumstats_list.append(df_sumstats_r)

    if len(df_sumstats_list) == 0:
        raise ValueError('no output files found')

    #keep only the most central result for each SNP
    df_sumstats = pd.concat(df_sumstats_list, axis=0)
    df_sumstats.sort_values('DISTANCE_FROM_CENTER', inplace=True, ascending=True)
    df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
    df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
    del df_sumstats['DISTANCE_FROM_CENTER']
    df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)

    #write output file
    if args.adjust_beta_freq:
        df_sumstats['BETA_MEAN'] /= np.sqrt(2 * df_sumstats['MAF'] * (1 - df_sumstats['MAF']))
        df_sumstats['BETA_SD'] /= np.sqrt(2 * df_sumstats['MAF'] * (1 - df_sumstats['MAF']))
    df_sumstats.to_csv(args.out, sep='\t', index=False)
    logging.info('Wrote aggregated results to %s' % (args.out))
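#The adjust_beta_freq block above divides by sqrt(2*MAF*(1-MAF)), the standard conversion from
#effect sizes on the standardized-genotype scale to the per-allele scale: under Hardy-Weinberg
#equilibrium a SNP with allele frequency p has genotype variance 2p(1-p). A worked example:
import numpy as np

maf = 0.2
beta_std = 0.05                                          #effect per SD of genotype
beta_per_allele = beta_std / np.sqrt(2 * maf * (1 - maf))
#2*0.2*0.8 = 0.32, sqrt(0.32) ~= 0.566, so beta_per_allele ~= 0.0884
assert np.isclose(beta_per_allele, 0.05 / np.sqrt(0.32))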
def compute_prs_for_file(args, plink_file, df_betas, temp_dir, ranges_file=None, keep_file=None):
    #read the bim file
    plink_file_prefix = plink_file[:plink_file.rfind('.')]
    df_bim = pd.read_csv(plink_file_prefix + '.bim',
                         header=None,
                         names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'],
                         sep=r'\s+')
    df_bim = set_snpid_index(df_bim)

    #keep only relevant SNPs
    df_betas = df_betas.merge(df_bim[['SNP']], left_index=True, right_index=True, suffixes=('_betas', '_bim'))
    if df_betas.shape[0] == 0:
        raise ValueError('No betas found for SNPs in plink file %s' % (plink_file_prefix))

    #create temp files
    betas_file = os.path.join(temp_dir, next(tempfile._get_candidate_names()))
    outfile_prs_temp = os.path.join(temp_dir, next(tempfile._get_candidate_names()))

    #save the betas to a file
    df_betas[['SNP_bim', 'A1', 'BETA']].to_csv(betas_file, header=False, index=False, sep='\t', float_format='%0.8e')

    #run plink
    plink_exe = args.plink_exe if (args.plink_exe is not None) else args.plink2_exe
    plink_cmd = '%s --allow-no-sex --extract %s --out %s --memory %d --threads %d' % (
        plink_exe, betas_file, outfile_prs_temp, args.memory * 1024, args.threads)
    if plink_file.endswith('.pgen'):
        plink_cmd += ' --bpfile %s --score %s cols=scoresums' % (plink_file_prefix, betas_file)
    elif plink_file.endswith('.bed'):
        plink_cmd += ' --bfile %s --score %s sum' % (plink_file_prefix, betas_file)
    else:
        raise ValueError('neither --bed nor --pgen specified')
    if ranges_file is not None:
        scores_file = os.path.join(temp_dir, next(tempfile._get_candidate_names()))
        df_betas[['SNP_bim', 'score']].drop_duplicates('SNP_bim').to_csv(scores_file, sep='\t', header=False, index=False)
        plink_cmd += ' --q-score-range %s %s' % (ranges_file, scores_file)
    if keep_file is not None:
        plink_cmd += ' --keep %s' % (keep_file)
    os.system(plink_cmd)

    #read results
    if ranges_file is None:
        if plink_file.endswith('.bed'):
            df_prs = pd.read_csv(outfile_prs_temp + '.profile', sep=r'\s+')
        elif plink_file.endswith('.pgen'):
            df_prs = pd.read_csv(outfile_prs_temp + '.sscore', sep=r'\s+')
            df_prs.rename(columns={'#IID': 'IID', 'SCORE1_SUM': 'SCORESUM'}, inplace=True)
            df_prs['FID'] = df_prs['IID']
        else:
            raise ValueError('neither --bed nor --pgen specified')
        df_prs.set_index('IID', inplace=True, drop=True)
        if np.any(df_prs.index.duplicated()):
            raise ValueError('duplicated iids found in %s' % (plink_file_prefix))
    else:
        df_prs = None
        if plink_file.endswith('.bed'):
            jk_files = glob(outfile_prs_temp + '.*.profile')
        elif plink_file.endswith('.pgen'):
            jk_files = glob(outfile_prs_temp + '.*.sscore')
        else:
            raise ValueError('neither --bed nor --pgen specified')
        for jk_file in jk_files:
            df_jk = pd.read_csv(jk_file, sep=r'\s+')
            df_jk.rename(columns={'#IID': 'IID', 'SCORE1_SUM': 'SCORESUM'}, inplace=True)
            df_jk['FID'] = df_jk['IID']
            df_jk.set_index('IID', inplace=True, drop=True)
            if np.any(df_jk.index.duplicated()):
                raise ValueError('duplicated iids found in %s' % (plink_file_prefix))
            jk_file_basename = os.path.basename(jk_file)
            block_name = jk_file_basename.split('.')[-2]
            block_num = int(block_name[5:])
            scoresum_colname = 'SCORESUM.jk%d' % (block_num)
            df_jk.rename(columns={'SCORESUM': scoresum_colname}, inplace=True)
            if df_prs is None:
                df_prs = df_jk
                df_prs['SCORESUM'] = 0
            else:
                assert np.all(df_jk.index == df_prs.index)
                df_prs[scoresum_colname] = df_jk[scoresum_colname]
            df_prs['SCORESUM'] += df_jk[scoresum_colname]
    if df_prs is None:
        raise ValueError('The following plink command failed:\n%s' % (plink_cmd))
    return df_prs
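#In the ranges_file branch above, each plink output file corresponds to one jackknife block,
#and df_prs ends up with one SCORESUM.jk<b> column per block plus their total in SCORESUM. A
#leave-one-block-out PRS (the usual block-jackknife pseudo-sample) can then be formed as total
#minus block. This is an illustrative snippet, not part of the source:
import pandas as pd

df_prs = pd.DataFrame({'SCORESUM.jk1': [0.1, 0.4],
                       'SCORESUM.jk2': [0.2, 0.1]},
                      index=['iid1', 'iid2'])
df_prs['SCORESUM'] = df_prs['SCORESUM.jk1'] + df_prs['SCORESUM.jk2']
loo_block1 = df_prs['SCORESUM'] - df_prs['SCORESUM.jk1']   #PRS excluding block 1
print(loo_block1)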
def load_betas_files(betas_file, verbose=True):
    if verbose:
        logging.info('Loading betas file %s...' % (betas_file))
    t0 = time.time()
    try:
        df_betas = pd.read_parquet(betas_file)
        if len(df_betas.index.names) > 1:
            df_betas.reset_index(inplace=True)
    except (ArrowIOError, ArrowInvalid):
        if betas_file.endswith('.parquet'):
            raise IOError('corrupt parquet file: %s' % (betas_file))
        df_betas = pd.read_csv(betas_file, sep=r'\s+')
    if verbose:
        logging.info('done in %0.2f seconds' % (time.time() - t0))

    #rename columns if needed
    df_betas.rename(columns={'sid': 'SNP', 'nt1': 'A1', 'nt2': 'A2', 'BETA_MEAN': 'BETA',
                             'ldpred_inf_beta': 'BETA', 'chrom': 'CHR', 'Chrom': 'CHR', 'pos': 'BP'},
                    inplace=True, errors='ignore')
    if not is_numeric_dtype(df_betas['CHR']):
        if df_betas['CHR'].str.startswith('chrom_').all():
            df_betas['CHR'] = df_betas['CHR'].str[6:].astype(int)
        else:
            raise ValueError('unknown CHR format')
    df_betas.rename(columns={'BETA_joint': 'BETA', 'ALLELE1': 'A1', 'ALLELE0': 'A2',
                             'beta_mean': 'BETA', 'MAF_BOLT': 'A1Frq', 'Name': 'SNP',
                             'A1Effect': 'BETA', 'Chrom': 'CHR', 'Position': 'BP', 'beta': 'BETA'},
                    inplace=True, errors='ignore')

    #create index
    df_betas = set_snpid_index(df_betas)

    #subset SNPs according to extract file (note: this reads the module-level args object)
    if args.extract is not None:
        df_extract = pd.read_csv(args.extract, header=None).squeeze('columns')
        df_betas = df_betas.loc[df_betas['SNP'].isin(df_extract)]
        if df_betas.shape[0] == 0:
            raise ValueError('No SNPs remained after applying --extract')
        if verbose:
            logging.info('#SNPs after --extract: %s' % (df_betas.shape[0]))
    return df_betas
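#A minimal usage sketch for load_betas_files (illustrative, not from the source): it assumes a
#whitespace-delimited betas file whose columns map onto SNP/CHR/BP/A1/A2/BETA after the renames
#above. Because the function consults the module-level args object, args.extract must be set;
#here a stand-in Namespace is used:
import os
import tempfile
from argparse import Namespace

args = Namespace(extract=None)   #stand-in for the script's module-level args
tmp = os.path.join(tempfile.mkdtemp(), 'betas.txt')
with open(tmp, 'w') as f:
    f.write('SNP\tCHR\tBP\tA1\tA2\tBETA\n')
    f.write('rs1\t1\t12345\tA\tG\t0.01\n')
df_betas = load_betas_files(tmp)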