from os.path import dirname, join, realpath

from numpy import dtype, nan
from numpy.testing import assert_array_equal, assert_equal
from pandas_plink import read_plink1_bin


def test_read_plink1_bin():
    datafiles = join(dirname(realpath(__file__)), "data_files")
    file_prefix = join(datafiles, "data")
    bim = file_prefix + ".bim"
    bed = file_prefix + ".bed"
    fam = file_prefix + ".fam"

    G = read_plink1_bin(bed, bim, fam, verbose=False)
    assert_equal(G.data.dtype, dtype("float64"))

    snp = G.where((G.chrom == "1") & (G.pos == 72515), drop=True)["snp"].values
    assert_array_equal(snp, ["rs4030300"])

    shape = G.where(G.chrom == "1", drop=True).shape
    assert_array_equal(shape, [3, 10])

    shape = G.where(G.chrom == "2", drop=True).shape
    assert_array_equal(shape, [3, 0])

    g = G.where((G.fid == "Sample_2") & (G.iid == "Sample_2"), drop=True)
    assert_array_equal(g["trait"].values, -9)

    arr = [
        [2.0, 2.0, nan, nan, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0],
        [2.0, 1.0, nan, nan, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0],
        [1.0, 2.0, nan, 1.0, 2.0, 2.0, 0.0, 2.0, 2.0, 2.0],
    ]
    assert_array_equal(G, arr)
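# Context for the assertions above: read_plink1_bin returns a single 2-D
# xarray.DataArray (samples x variants) whose coordinates carry the .fam and
# .bim metadata, which is what selections such as G.chrom, G.pos, and G.fid
# operate on. A minimal inspection sketch (file names are placeholders):
from pandas_plink import read_plink1_bin

G = read_plink1_bin("data.bed", "data.bim", "data.fam", verbose=False)
print(G.dims)           # ('sample', 'variant')
print(G.sample.values)  # individual IDs taken from the .fam file
print(G.chrom.values)   # chromosome codes taken from the .bim file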
from pandas_plink import read_plink1_bin


def read_genotype(geno_prefix):
    try:
        G = read_plink1_bin(geno_prefix + '.bed', geno_prefix + '.bim',
                            geno_prefix + '.fam', ref='a0', verbose=False)
    except Exception:
        return None
    return G
def test_read_plink1_bin_wildcard():
    datafiles = join(dirname(realpath(__file__)), "data_files")
    bed_files = join(datafiles, "chr*.bed")
    G = read_plink1_bin(bed_files, verbose=False)
    # The bare .values access forces the lazy chromosome-11 block to be read.
    G.where(G.chrom == "11", drop=True).values
    assert_equal(G.where(G.chrom == "11", drop=True).shape, (14, 779))
    assert_equal(G.where(G.chrom == "12", drop=True).shape, (14, 473))
    x = [[0.00, 0.00], [0.00, 1.00]]
    assert_equal(G.where(G.chrom == "11", drop=True).values[:2, :2], x)
import numpy as np
from pandas_plink import read_plink1_bin


def ReadPlink(plink_file, bim, dtype=np.float32):
    Genotype = read_plink1_bin(plink_file + ".bed", plink_file + ".bim",
                               plink_file + ".fam", verbose=False)
    Genotype = Genotype.where(
        Genotype.snp.isin(Genotype.snp.values[bim['index'].values]), drop=True)
    G_geno = Genotype.values
    # Fill missing calls *before* casting to int8: once the array is integer,
    # np.isnan can no longer flag them and the NaNs would be silently corrupted
    # (the original cast first, which made the fill below a no-op).
    G_geno[np.isnan(G_geno)] = 2
    G_geno = G_geno.astype(np.int8)
    G_geno = 2 - G_geno
    return G_geno.astype(dtype)
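# Why the fill-before-cast order above matters: integer arrays cannot hold
# NaN, so casting first silently corrupts missing calls and makes the
# np.isnan check a no-op. Illustrative values only:
import numpy as np

g = np.array([0.0, 1.0, np.nan])
bad = g.astype(np.int8)        # the NaN becomes an arbitrary integer here
print(np.isnan(bad).any())     # False -- the missing call is untraceable
g[np.isnan(g)] = 2             # fill first...
print(2 - g.astype(np.int8))   # ...then cast and flip safely: [2 1 0]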
import argparse as ap
import os

import pandas_plink as pdp

# make_pseudohap is defined elsewhere in the source module.


def main(args):
    # Parse arguments
    argp = ap.ArgumentParser(description="")
    argp.add_argument("--bfile", required=True,
                      help="stem of the PLINK binary file name that contains data on "
                           "both reference individuals and those to be dropped")
    args = argp.parse_args(args)

    bedfile = pdp.read_plink1_bin(str(args.bfile) + ".bed")
    nInds = bedfile.shape[0]
    outped = str(args.bfile) + "_pseudoHap.ped"
    for ind_idx in range(nInds):
        make_pseudohap(x=ind_idx, bedfile=bedfile, outped=outped)

    os.system("awk '{print $1, $2, $3, $4}' " + str(args.bfile) + ".bim > "
              + str(args.bfile) + "_pseudoHap.map")
    args.bfile = str(args.bfile) + "_pseudoHap"
    os.system("plink1.9 --file " + str(args.bfile) + " --make-bed --out " + str(args.bfile))
    os.system("rm " + str(args.bfile) + ".ped")
    os.system("rm " + str(args.bfile) + ".map")
    return 0
def load_raw_bed(self):
    # numpy (np) and pandas (pd) are imported at module level in the source.
    from pandas_plink import read_plink1_bin

    print('WARNING: THESE FUNCTIONS ARE VERY TIME CONSUMING. IT MIGHT TAKE UP TO 48 h')
    # Reading parts of the bed values:
    # https://github.com/limix/pandas-plink/blob/master/doc/usage.rst
    # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

    # Loading files
    # os.chdir(path_input)
    G = read_plink1_bin(self.path_raw + "Peds_CIO_merged_qc_data.bed",
                        bim=None, fam=None, verbose=False)
    samples = G.sample.values
    variants = G.variant.values
    s, v = len(samples), len(variants)
    print('Original shape: ', s, v)  # Shape: 454 6726287

    cadd = self.raw_cadd()

    # Saving samples output
    np.save(self.path + 'samples', samples)

    # Make sure CADD and the variants are in the same order (very important).
    cadd['variants_cat'] = pd.Categorical(cadd['variants'], categories=variants, ordered=True)
    cadd_sort = cadd.sort_values(by=['variants_cat'])
    cadd_sort.reset_index(inplace=True)
    if np.equal(cadd_sort.variants, variants).sum() == len(variants):
        print('CADD and variants are in the same order')
        del cadd
    else:
        print('ERROR: CADD and variants are in DIFFERENT order')
    cadd_sort.fillna(value={'CADD_PHRED': 0}, inplace=True)

    # First prune: if 0 in one and 1 in another: 2; if 1 and 0: 0; if 0 and 0: 1.
    # Takes 48 hours to finish.
    data_d, data_s, variants_, cadd_ = self.filteringSNPS(
        variants, cadd_sort.CADD_PHRED.values, samples, G, 'f1')

    # Final prune, same encoding as above.
    data_d, data_s, variants_, cadd_ = self.filteringSNPS(
        np.array(variants_), np.array(cadd_), samples, G, '', 0.2)

    fixing_erros()  # sic -- helper defined elsewhere in the source
    adding_known_snps_back(G, samples, variants_, cadd_, cadd_sort, data_d, data_s)
def test_read_plink1_bin_wildcard_not_found():
    datafiles = join(dirname(realpath(__file__)), "data_files")

    bed_files = join(datafiles, "chrr*.bed")
    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, "chr11.bim", verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    bim_files = join(datafiles, "chrr*.bim")
    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, bim_files, verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    bim_files = join(datafiles, "chr*.bim")
    fam_files = join(datafiles, "chr*.fam")
    with pytest.warns(UserWarning):
        read_plink1_bin(bed_files, bim_files, fam_files, verbose=True)
import numpy as np
import pandas as pd
from pandas_plink import read_plink1_bin


def read_bed(path, type_pos):
    """Read the .bed, .bim and .fam files.

    Parameters
    ----------
    path : string
        Path to the .bed, .bim and .fam files without the extension.
        They all need to be in the same folder.
    type_pos : integer
        Equal to 1 to use Morgans, equal to 2 to use base-pair coordinates.

    Returns
    -------
    gen_A : pandas.DataFrame
        DataFrame with all the genotypes in the additive model.
    pos : pandas.Series
        Series with the position of each SNP (in Morgans or base pairs,
        depending on type_pos).
    chrom : pandas.Series
        Series with the chromosome number of each SNP.
    """
    data = read_plink1_bin(path + ".bed", path + ".bim", path + ".fam", verbose=False)
    print("read OK")
    gen_A = pd.DataFrame(data=np.transpose(data.values), index=data.variant.snp.values)
    print("gen_A ok")
    if type_pos == 1:
        pos = pd.Series(np.transpose(data.variant.cm.values), index=data.variant.snp.values)
    elif type_pos == 2:
        pos = pd.Series(np.transpose(data.variant.pos.values), index=data.variant.snp.values)
    print("pos ok")
    chrom = pd.Series(np.transpose(data.variant.chrom.values), index=data.variant.snp.values)
    print("chrom ok")
    return (gen_A, pos, chrom)
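# Hypothetical call ("study" is a made-up prefix for files study.bed/.bim/.fam):
gen_A, pos, chrom = read_bed("study", type_pos=2)  # 2 = base-pair coordinates
print(gen_A.shape)  # variants x samples: the genotype matrix is transposed
print(pos.head())   # positions indexed by SNP identifier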
def read_genotypes(self, genotype_files, ld_block_files, standardize=True):
    """
    Read the genotype files
    :return:
    """
    if self.verbose:
        print("> Reading genotype files...")

    if not iterable(ld_block_files):
        ld_block_files = [ld_block_files]

    self.n_per_snp = {}
    self.genotypes = {}
    self.bed_files = {}

    for i, (bfile, ldb_file) in tqdm(
            enumerate(zip_longest(genotype_files, ld_block_files)),
            disable=not self.verbose):

        # Read plink file:
        try:
            gt_ac = read_plink1_bin(bfile + ".bed", ref="a0", verbose=False)
        except ValueError:
            gt_ac = read_plink1_bin(bfile, ref="a0", verbose=False)
        except Exception as e:
            self.genotypes = None
            self.sample_ids = None
            raise e

        gt_ac = gt_ac.set_index(variant='snp')

        # Filter individuals:
        if self.keep_individuals is not None:
            common_samples = pd.DataFrame({'Sample': gt_ac.sample.values}).merge(
                pd.DataFrame({'Sample': self.keep_individuals},
                             dtype=type(gt_ac.sample.values[0]))
            )['Sample'].values
            gt_ac = gt_ac.sel(sample=common_samples)

        # Filter SNPs:
        if self.keep_snps is not None:
            common_snps = pd.DataFrame({'SNP': gt_ac.variant.values}).merge(
                pd.DataFrame({'SNP': self.keep_snps})
            )['SNP'].values
            gt_ac = gt_ac.sel(variant=common_snps)

        # Obtain information about the current chromosome:
        chr_id, (chr_n, chr_p) = int(gt_ac.chrom.values[0]), gt_ac.shape

        # Assign the number of samples per SNP; this accounts for missing data:
        self.n_per_snp[chr_id] = (
            gt_ac.shape[0] - gt_ac.isnull().sum(axis=0).compute().values
        )

        maf = gt_ac.sum(axis=0) / (2. * self.n_per_snp[chr_id])
        # maf = np.round(np.where(maf > .5, 1. - maf, maf), 6)
        gt_ac = gt_ac.assign_coords({"MAF": ("variant", maf)})

        # Standardize genotype matrix:
        if standardize:
            gt_ac = (gt_ac - gt_ac.mean(axis=0)) / gt_ac.std(axis=0)
        self.standardized_genotype = standardize
        gt_ac = gt_ac.fillna(0.)

        # Add filename to the bedfiles dictionary:
        self.bed_files[chr_id] = bfile

        if i == 0:
            self.sample_ids = gt_ac.sample.values

        # TODO: Harmonize the code given the updated keys (using chrom_id now).
        self.genotypes[chr_id] = {'CHR': chr_id, 'G': gt_ac}

        # If an LD block file is provided, then read it, match SNPs with their
        # corresponding blocks, and create a list of SNP coordinates per block:
        if ldb_file is not None:
            # Read LD block file:
            ldb_df = pd.read_csv(ldb_file, delim_whitespace=True)

            # Create a SNP dataframe with BP position:
            snp_df = pd.DataFrame({'pos': gt_ac.pos.values})

            # Assign each SNP its appropriate block ID:
            snp_df['block_id'] = snp_df['pos'].apply(
                lambda pos: ldb_df.loc[(pos >= ldb_df['start'])
                                       & (pos < ldb_df['stop'])].index[0])

            ld_blocks = []
            for b_idx in range(len(ldb_df)):
                ld_blocks.append(
                    da.array(snp_df.loc[snp_df['block_id'] == b_idx].index.tolist()))

            # Key by chromosome ID, consistent with the assignment above
            # (the original indexed self.genotypes by the loop counter `i`).
            self.genotypes[chr_id]['LD Blocks'] = ld_blocks
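# The LD-block logic above only requires a whitespace-delimited file with
# 'start' and 'stop' columns (inferred from the column accesses; the actual
# files in the source project may carry more fields). A hypothetical sketch,
# with "blocks.txt" as a made-up file name:
import pandas as pd

# start     stop
# 10583     1892607
# 1892607   3582736
ldb_df = pd.read_csv("blocks.txt", delim_whitespace=True)

def block_of(pos):
    # Index of the first block whose [start, stop) interval contains `pos`.
    return ldb_df.loc[(pos >= ldb_df["start"]) & (pos < ldb_df["stop"])].index[0]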
from pandas_plink import read_plink1_bin


def read_plink(bed_file):
    snp_info = read_plink1_bin(bed_file + ".bed", bed_file + ".bim",
                               bed_file + ".fam", verbose=False)
    return snp_info.values
from pathlib import Path
from typing import Optional, Union

import dask.array as da
from pandas_plink import read_plink, read_plink1_bin


def load_plink_array(path_to_plink_files: Optional[Union[str, Path]] = None,
                     bed: Optional[Union[str, Path]] = None,
                     bim: Optional[Union[str, Path]] = None,
                     fam: Optional[Union[str, Path]] = None,
                     transpose: bool = False) -> da.core.Array:
    """Gathers a plink array from the possible input formats.

    Requires one of the following parameter configurations to be satisfied
    to load an array:
        - path_to_plink_files is specified and no other parameters are specified
        - bed AND bim AND fam are specified and no other parameters are specified

    Parameters
    ----------
    path_to_plink_files : path_like, optional
        Assuming the bim, fam, bed files are in the following format
            </path/to/data.bim>
            </path/to/data.fam>
            </path/to/data.bed>
        then path_to_plink_files would be '/path/to/data'
    bed : path_like, optional
        '/path/to/data.bed'
    bim : path_like, optional
        '/path/to/data.bim'
    fam : path_like, optional
        '/path/to/data.fam'
    transpose : bool
        Whether the array is stored/loaded in transposed format.
        If A is stored/loaded as A.T but SVD(A) is desired, set this flag to True.

    Returns
    -------
    array : dask.array.core.Array
    """
    if path_to_plink_files is not None and not all([bed, bim, fam]):
        (_, _, G) = read_plink(path_to_plink_files)
        array = G
    elif all(p is not None for p in [bed, bim, fam]) and not path_to_plink_files:
        G = read_plink1_bin(bed, bim, fam)
        array = G.data
    else:
        raise ValueError(
            'Uninterpretable input.'
            ' Please specify path_to_plink_files xor (bed and bim and fam).')

    try:
        array = da.from_array(array)
    except AttributeError:
        raise ValueError('Uninterpretable array.')
    except ValueError:
        pass

    if len(array.shape) != 2:
        raise ValueError("Must be a 2-D array")

    if transpose:
        array = array.T

    return array
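# Hypothetical usage of the loader above ("mydata" is a made-up prefix).
# Note that the two code paths appear to load the matrix in opposite
# orientations (read_plink's bed array is variants x samples, while
# read_plink1_bin yields samples x variants), which is what the transpose
# flag lets the caller reconcile:
arr = load_plink_array(path_to_plink_files="mydata")
# or, equivalently, with explicit paths:
arr = load_plink_array(bed="mydata.bed", bim="mydata.bim", fam="mydata.fam")
print(arr.shape)  # a 2-D dask array, computed lazily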
# Fragment from a larger MPI script: `args`, `rank`, and `size` (mpi4py),
# torch, numpy's genfromtxt, dask.array as da, and read_plink1_bin are all
# set up earlier in the original file.
if args.gpu:  # assumed flag name -- the original fragment begins mid-conditional
    TType = torch.cuda.FloatTensor
else:
    if args.double:
        TType = torch.DoubleTensor
    else:
        TType = torch.FloatTensor

if args.nosubnormal:
    torch.set_flush_denormal(True)
    # floatlib.set_ftz()
    # floatlib.set_daz()

# n = int(args.cols); p = int(args.rows)
bedfile = "/shared/ukbiobank_filtered/filtered_200k.bed"
famfile = "/shared/ukbiobank_filtered/filtered_200k.2.fam"
G = read_plink1_bin(bedfile, fam=famfile, verbose=False)

n = G.shape[0]
p_pheno = 11
p = G.shape[1] + 6

# Split the columns evenly across MPI ranks.
start_ind = (p // size) * rank
end_ind = (p // size) * (rank + 1)

pheno = genfromtxt("/shared/ukbiobank_filtered/ukb_short.filtered.200k.tab",
                   skip_header=1)

if rank != size - 1:
    X_chunk = G[:, start_ind:end_ind].data.compute()
else:
    # The last rank takes the remaining genotype columns plus six zero-padded
    # columns; its last 11 columns are then filled with the phenotype columns.
    X_chunk = da.hstack([G[:, start_ind:].data, da.zeros((n, 6))]).compute()
    X_chunk[:, -11:] = pheno[:, 1:p_pheno + 1]
import numpy as np
import pandas as pd
from pandas_plink import read_plink1_bin

# match_y_to_x (an index-matching helper) is defined elsewhere in the source module.


def load_genotype_from_bedfile(bedfile, indiv_list, snplist_to_exclude,
                               chromosome=None, load_first_n_samples=None,
                               missing_rate_cutoff=0.5, return_snp=False,
                               standardize=True):
    G = read_plink1_bin(bedfile, verbose=False)

    if chromosome is not None:
        chr_str = G.chrom[0].values.tolist()
        if 'chr' in chr_str:
            chromosome = 'chr' + str(chromosome)
        else:
            chromosome = str(chromosome)
        G = G.where(G.chrom == chromosome, drop=True)

    df_geno_indiv = pd.DataFrame({'indiv': G.sample.to_series().tolist()})
    df_geno_indiv['idx'] = [i for i in range(df_geno_indiv.shape[0])]

    if indiv_list is None:
        indiv_list = G.sample.to_series().tolist()
        if load_first_n_samples is not None:
            indiv_list = indiv_list[:load_first_n_samples]
    df_target_indiv = pd.DataFrame({'indiv': indiv_list})
    df_geno_indiv = pd.merge(df_geno_indiv, df_target_indiv,
                             on='indiv').sort_values(by=['idx'])
    if df_geno_indiv.shape[0] != len(indiv_list):
        raise ValueError('There are input individuals that do not appear in the BED file.')
    query_indiv_list = df_geno_indiv.indiv.tolist()

    snpid = G.variant.variant.to_series().to_list()
    snpid = np.array([s.split('_')[1] for s in snpid])
    if return_snp is True:
        a0 = G.variant.a0.to_series().to_numpy()
        a1 = G.variant.a1.to_series().to_numpy()
        chrom = G.variant.chrom.to_series().to_numpy()

    geno = G.sel(sample=query_indiv_list).values

    # Re-order to match the target indiv_list.
    geno = geno[match_y_to_x(np.array(query_indiv_list), np.array(indiv_list)), :]

    # Filter out unwanted SNPs.
    geno = geno[:, ~np.isin(snpid, snplist_to_exclude)]
    if return_snp is True:
        a0 = a0[~np.isin(snpid, snplist_to_exclude)]
        a1 = a1[~np.isin(snpid, snplist_to_exclude)]
        chrom = chrom[~np.isin(snpid, snplist_to_exclude)]
    snpid = snpid[~np.isin(snpid, snplist_to_exclude)]

    # Filter out genotypes with a high missing rate.
    missing_rate = np.isnan(geno).mean(axis=0)
    geno = geno[:, missing_rate < missing_rate_cutoff]
    if return_snp is True:
        snpid = snpid[missing_rate < missing_rate_cutoff]
        a0 = a0[missing_rate < missing_rate_cutoff]
        a1 = a1[missing_rate < missing_rate_cutoff]
        chrom = chrom[missing_rate < missing_rate_cutoff]
    maf = np.nanmean(geno, axis=0) / 2

    # Impute missing genotype values with twice the allele frequency.
    miss_x, miss_y = np.where(np.isnan(geno))
    geno[(miss_x, miss_y)] = maf[miss_y] * 2
    var_geno = 2 * maf * (1 - maf)

    # Keep only genotypes with variance != 0.
    to_keep = var_geno != 0
    geno = geno[:, to_keep]
    if return_snp is True:
        snpid = snpid[to_keep]
        a0 = a0[to_keep]
        a1 = a1[to_keep]
        chrom = chrom[to_keep]
    maf = maf[to_keep]
    var_geno = var_geno[to_keep]

    if standardize is True:
        geno = (geno - 2 * maf) / np.sqrt(var_geno)

    if return_snp is True:
        return geno, indiv_list, np.sqrt(var_geno), (snpid.tolist(), a0.tolist(),
                                                     a1.tolist(), chrom.tolist())
    else:
        return geno, indiv_list, np.sqrt(var_geno)
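# Hypothetical invocation of the loader above (file name made up; recall that
# match_y_to_x comes from the source project):
geno, indivs, geno_sd, (snps, a0, a1, chroms) = load_genotype_from_bedfile(
    "cohort.bed",
    indiv_list=None,        # take every sample in the .fam file
    snplist_to_exclude=[],  # a list/array; np.isin does not handle sets well
    chromosome=22,
    return_snp=True,
)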
import pandas_plink as pp
from dask.diagnostics import ProgressBar

# Convert the per-chromosome training BED files (chromosomes 1-22) to zarr stores.
for i in range(1, 23):
    G = pp.read_plink1_bin("PATH_TO_BED_TRAIN" + str(i) + '.bed')
    # Note: any missing calls (NaN) do not survive the int8 cast.
    G = G.astype('int8')
    G = G.to_dataset()
    with ProgressBar():
        # Assign the computed result; the original discarded it, which forced
        # to_zarr below to recompute the whole graph.
        G = G.compute()
    with ProgressBar():
        G.to_zarr('PATH_TO_TRAINING_DATA' + str(i))
import pandas_plink as pp
from dask.diagnostics import ProgressBar

# Same conversion for chromosomes 1-23 (23 presumably encoding X in this dataset).
for i in range(1, 24):
    G = pp.read_plink1_bin("PATH_TO_CHROMOSOME_DATA_" + str(i) + '.bed')
    G = G.astype('int8')
    G = G.to_dataset()
    with ProgressBar():
        G = G.compute()
    with ProgressBar():
        G.to_zarr('PATH_TO_ZARR_' + str(i))
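# A minimal read-back sketch for the zarr stores written above. pandas-plink
# names the genotype DataArray "genotype", so to_dataset() stores it under
# that key (the path is the same placeholder used in the loop):
import xarray as xr

ds = xr.open_zarr("PATH_TO_ZARR_1")  # lazy, dask-backed Dataset
G = ds["genotype"]
print(G.shape)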
def plink_inputs(self):
    # Initializing some variables
    plink_exec = genoml.dependencies.check_plink()
    impute_type = self.impute_type
    addit_df = self.addit_df
    pheno_df = self.pheno_df

    outfile_h5 = self.run_prefix + ".dataForML.h5"
    pheno_df.to_hdf(outfile_h5, key='pheno', mode='w')

    if (self.geno_path != None):
        if (self.skip_prune == "no"):
            # Set the bashes
            bash1a = f"{plink_exec} --bfile " + self.geno_path + \
                " --indep-pairwise 1000 50 " + self.r2
            bash1b = f"{plink_exec} --bfile " + self.geno_path + " --extract " + \
                self.run_prefix + ".p_threshold_variants.tab" + \
                " --indep-pairwise 1000 50 " + self.r2
            # may want to consider outputting temp_genos to dir in run_prefix
            bash2 = f"{plink_exec} --bfile " + self.geno_path + \
                " --extract plink.prune.in --make-bed --out temp_genos"
            bash3 = "cut -f 2,5 temp_genos.bim > " + \
                self.run_prefix + ".variants_and_alleles.tab"
            bash4 = "rm plink.log"
            bash5 = "rm plink.prune.*"
            # bash6 = "rm " + self.run_prefix + ".log"

            # Set the bash command groups
            cmds_a = [bash1a, bash2, bash3, bash4, bash5]
            cmds_b = [bash1b, bash2, bash3, bash4, bash5]

            if (self.gwas_path != None) & (self.geno_path != None):
                p_thresh = self.p_gwas
                gwas_df_reduced = self.gwas_df[['SNP', 'p']]
                snps_to_keep = gwas_df_reduced.loc[(gwas_df_reduced['p'] <= p_thresh)]
                outfile = self.run_prefix + ".p_threshold_variants.tab"
                snps_to_keep.to_csv(outfile, index=False, sep="\t")
                print(f"Your candidate variant list prior to pruning is right here: {outfile}.")

            if (self.gwas_path == None) & (self.geno_path != None):
                print(
                    f"A list of pruned variants and the allele being counted in the dosages "
                    f"(usually the minor allele) can be found here: "
                    f"{self.run_prefix}.variants_and_alleles.tab")
                for cmd in cmds_a:
                    subprocess.run(cmd, shell=True)

            if (self.gwas_path != None) & (self.geno_path != None):
                print(
                    f"A list of pruned variants and the allele being counted in the dosages "
                    f"(usually the minor allele) can be found here: "
                    f"{self.run_prefix}.variants_and_alleles.tab")
                for cmd in cmds_b:
                    subprocess.run(cmd, shell=True)

        if (self.skip_prune == "yes"):
            bash1a = f"{plink_exec} --bfile " + self.geno_path
            bash1b = f"{plink_exec} --bfile " + self.geno_path + " --extract " + \
                self.run_prefix + ".p_threshold_variants.tab" + \
                " --make-bed --out temp_genos"
            # may want to consider outputting temp_genos to dir in run_prefix
            bash2 = f"{plink_exec} --bfile " + self.geno_path + " --make-bed --out temp_genos"
            bash3 = "cut -f 2,5 temp_genos.bim > " + \
                self.run_prefix + ".variants_and_alleles.tab"
            bash4 = "rm plink.log"

            # Set the bash command groups
            cmds_a = [bash1a, bash2, bash3, bash4]
            cmds_b = [bash1b, bash3, bash4]

            if (self.gwas_path != None) & (self.geno_path != None):
                p_thresh = self.p_gwas
                gwas_df_reduced = self.gwas_df[['SNP', 'p']]
                snps_to_keep = gwas_df_reduced.loc[(gwas_df_reduced['p'] <= p_thresh)]
                outfile = self.run_prefix + ".p_threshold_variants.tab"
                snps_to_keep.to_csv(outfile, index=False, sep="\t")
                print(f"Your candidate variant list is right here: {outfile}.")

            if (self.gwas_path == None) & (self.geno_path != None):
                print(
                    f"A list of variants and the allele being counted in the dosages "
                    f"(usually the minor allele) can be found here: "
                    f"{self.run_prefix}.variants_and_alleles.tab")
                for cmd in cmds_a:
                    subprocess.run(cmd, shell=True)

            if (self.gwas_path != None) & (self.geno_path != None):
                print(
                    f"A list of variants and the allele being counted in the dosages "
                    f"(usually the minor allele) can be found here: "
                    f"{self.run_prefix}.variants_and_alleles.tab")
                for cmd in cmds_b:
                    subprocess.run(cmd, shell=True)

    if (self.geno_path != None):
        g = read_plink1_bin('temp_genos.bed')
        g_pruned = g.drop(['fid', 'father', 'mother', 'gender', 'trait',
                           'chrom', 'cm', 'pos', 'a1'])
        g_pruned = g_pruned.set_index({'sample': 'iid', 'variant': 'snp'})
        # Note: this cast discards NaN missing calls, so any missing genotypes
        # become arbitrary integers before the imputation step below.
        g_pruned.values = g_pruned.values.astype('int')

        # Swap the pandas-plink genotype coding to match the .raw format.
        # For example, assuming C is the minor allele, plink .raw labels
        # homozygous minor as 2 and homozygous major as 0:
        #   A A -> 0
        #   A C -> 1
        #   C C -> 2
        #   0 0 -> NA
        # whereas read_plink1_bin flips these, with homozygous minor = 0 and
        # homozygous major = 2:
        #   A A -> 2
        #   A C -> 1
        #   C C -> 0
        #   0 0 -> NA
        two_idx = (g_pruned.values == 2)
        zero_idx = (g_pruned.values == 0)
        g_pruned.values[two_idx] = 0
        g_pruned.values[zero_idx] = 2

        g_pd = g_pruned.to_pandas()
        g_pd.reset_index(inplace=True)
        raw_df = g_pd.rename(columns={'sample': 'ID'})
        # del raw_df.index.name
        # del raw_df.columns.name

        # Now, remove temp_genos
        bash_rm_temp = "rm temp_genos.*"
        print(bash_rm_temp)
        subprocess.run(bash_rm_temp, shell=True)

    # Check the impute flag and execute.
    # Currently only mean and median are supported.
    impute_list = ["mean", "median"]

    if (self.geno_path != None):
        if impute_type not in impute_list:
            return "The 2 types of imputation currently supported are 'mean' and 'median'"
        elif impute_type.lower() == "mean":
            raw_df = raw_df.fillna(raw_df.mean())
        elif impute_type.lower() == "median":
            raw_df = raw_df.fillna(raw_df.median())

        print("")
        print(
            f"You have just imputed your genotype features, covering up NAs with the column "
            f"{impute_type} so that analyses don't crash due to missing data.")
        print("Now your genotype features might look a little better (showing the first few "
              "lines of the left-most and right-most columns)...")
        print("#" * 70)
        print(raw_df.describe())
        print("#" * 70)
        print("")

    # Checking the imputation of non-genotype features
    if (self.addit_path != None):
        if impute_type not in impute_list:
            return "The 2 types of imputation currently supported are 'mean' and 'median'"
        elif impute_type.lower() == "mean":
            addit_df = addit_df.fillna(addit_df.mean())
        elif impute_type.lower() == "median":
            addit_df = addit_df.fillna(addit_df.median())

        print("")
        print(
            f"You have just imputed your non-genotype features, covering up NAs with the "
            f"column {impute_type} so that analyses don't crash due to missing data.")
        print("Now your non-genotype features might look a little better (showing the first "
              "few lines of the left-most and right-most columns)...")
        print("#" * 70)
        print(addit_df.describe())
        print("#" * 70)
        print("")

        # Remove the ID column
        cols = list(addit_df.columns)
        cols.remove('ID')
        addit_df[cols]

        # Z-scale the features
        print(f"Now Z-scaling your non-genotype features...")

        # Remove any columns with a standard deviation of zero
        print(f"Removing any columns that have a standard deviation of 0 prior to Z-scaling...")

        if any(addit_df.std() == 0.0):
            print("")
            print(f"Looks like there's at least one column with a standard deviation of 0. "
                  f"Let's remove that for you...")
            print("")
            addit_keep = addit_df.drop(
                addit_df.std()[addit_df.std() == 0.0].index.values, axis=1)
            addit_keep_list = list(addit_keep.columns.values)

            addit_df = addit_df[addit_keep_list]

            addit_keep_list.remove('ID')

            removed_list = np.setdiff1d(cols, addit_keep_list)
            for removed_column in range(len(removed_list)):
                print(f"The column {removed_list[removed_column]} was removed")

            cols = addit_keep_list
            print("")

        for col in cols:
            if (addit_df[col].min() == 0.0) and (addit_df[col].max() == 1.0):
                print(col, "is likely a binary indicator or a proportion and will not be "
                      "scaled; just add 1 to all the values of this variable and rerun to "
                      "flag this column to be scaled.")
            else:
                addit_df[col] = (addit_df[col] - addit_df[col].mean()) / \
                    addit_df[col].std(ddof=0)

        print("")
        print("You have just Z-scaled your non-genotype features, putting everything on a "
              "numeric scale similar to genotypes.")
        print("Now your non-genotype features might look a little closer to zero (showing "
              "the first few lines of the left-most and right-most columns)...")
        print("#" * 70)
        print(addit_df.describe())
        print("#" * 70)

    # Saving out the proper HDF5 file
    if (self.geno_path != None):
        raw_df.to_hdf(outfile_h5, key='geno')

    if (self.addit_path != None):
        addit_df.to_hdf(outfile_h5, key='addit')

    if (self.geno_path != None) & (self.addit_path != None):
        pheno = pd.read_hdf(outfile_h5, key="pheno")
        geno = pd.read_hdf(outfile_h5, key="geno")
        addit = pd.read_hdf(outfile_h5, key="addit")
        temp = pd.merge(pheno, addit, on='ID', how='inner')
        merged = pd.merge(temp, geno, on='ID', how='inner')

    if (self.geno_path != None) & (self.addit_path == None):
        pheno = pd.read_hdf(outfile_h5, key="pheno")
        geno = pd.read_hdf(outfile_h5, key="geno")
        merged = pd.merge(pheno, geno, on='ID', how='inner')

    if (self.geno_path == None) & (self.addit_path != None):
        pheno = pd.read_hdf(outfile_h5, key="pheno")
        addit = pd.read_hdf(outfile_h5, key="addit")
        merged = pd.merge(pheno, addit, on='ID', how='inner')

    # Checking the reference column names flag
    # If this step comes after harmonize, a .txt file with columns to keep
    # should have been produced. This is a list of column names from the
    # reference dataset that the test dataset was harmonized against. We want
    # to compare apples to apples, so we will only keep the column names that match.
    if (self.refColsHarmonize != None):
        print("")
        print(
            f"Looks like you are munging after the harmonization step. Great! We will keep "
            f"the columns generated from your reference dataset from that harmonize step "
            f"that was exported to this file: {self.refColsHarmonize}")
        print("")
        with open(self.refColsHarmonize, 'r') as refCols_file:
            ref_column_names_list = refCols_file.read().splitlines()

        # Keep the reference columns from the test dataset if found in test data
        matching_cols = merged[np.intersect1d(merged.columns, ref_column_names_list)]

        # Make a list of final features that will be included in the model.
        # This will be used again when re-munging the reference dataset.
        matching_cols_list = matching_cols.columns.values.tolist()

        # Save out the final list
        intersecting_cols_outfile = self.run_prefix + ".finalHarmonizedCols_toKeep.txt"
        with open(intersecting_cols_outfile, 'w') as filehandle:
            for col in matching_cols_list:
                filehandle.write('%s\n' % col)

        print(
            f"A final list of harmonized columns between your reference and test dataset "
            f"has been generated here: {intersecting_cols_outfile}")
        print(f"Use this to re-train your reference dataset in order to move on to testing.")

        # Replace the dataframe variable with the matching options
        merged = matching_cols

    self.merged = merged
    merged.to_hdf(outfile_h5, key='dataForML')

    features_list = merged.columns.values.tolist()
    features_listpath = self.run_prefix + ".list_features.txt"
    with open(features_listpath, 'w') as f:
        for feature in features_list:
            f.write("%s\n" % feature)

    print(
        f"An updated list of {len(features_list)} features, including ID and PHENO, that "
        f"is in your munged dataForML.h5 file can be found here: {features_listpath}")
    print("")
    print(f"Your .dataForML file that has been fully munged can be found here: {outfile_h5}")

    return merged
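# The masked 0<->2 swap in plink_inputs above is equivalent to the arithmetic
# flip `2 - g` used by ReadPlink earlier in this collection: heterozygotes
# stay at 1 and the recode only exchanges the two homozygote codes. A quick
# check with illustrative values:
import numpy as np

g = np.array([0.0, 1.0, 2.0, np.nan])
swapped = g.copy()
two_idx, zero_idx = (g == 2), (g == 0)
swapped[two_idx] = 0
swapped[zero_idx] = 2
print(swapped)  # [ 2.  1.  0. nan]
print(2 - g)    # [ 2.  1.  0. nan] -- same result; NaN propagates either way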