import math
import os

from pysnptools.snpreader import Bed, Pheno

# getData, perm, and varRes are project-local helpers (not part of pysnptools).
def divideData(filename, direct, num=5, mph=3, delet=True):
    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count
    # Shuffle the individuals, then cut them into `num` roughly equal slices.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]
    div = [int(math.ceil(i * n / float(num))) for i in range(num + 1)]
    varEsts = []
    for i in range(num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]
        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())
        varEsts.append(varRes(fileTemp, direct))
        if delet:
            os.system("rm " + direct + "/tempFile_" + str(i) + "*")
    return varEsts
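# A minimal, self-contained sketch of the same split-and-write pattern using
# only public pysnptools calls. The toydata paths are placeholders and
# `tempdir` is assumed to exist; getData/perm/varRes above are project-local
# helpers that this sketch does not reproduce.
import numpy as np
from pysnptools.snpreader import Bed, Pheno

sFil = Bed("examples/toydata", count_A1=False)
yFil = Pheno("examples/toydata.phe")
reOrd = np.random.permutation(sFil.iid_count)
half = len(reOrd) // 2
for i, rows in enumerate([reOrd[:half], reOrd[half:]]):
    # Fancy-indexing a reader is lazy; .read() materializes just that slice.
    Bed.write("tempdir/piece_{0}.bed".format(i), sFil[rows, :].read(), count_A1=False)
    Pheno.write("tempdir/piece_{0}.phen".format(i), yFil[rows, :].read())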
def test_write_x_x_cpp(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata")
    for order in ['C', 'F']:
        for dtype in [np.float32, np.float64]:
            snpdata = snpreader.read(order=order, dtype=dtype)
            snpdata.val[-1, 0] = float("NAN")
            output = "tempdir/toydata.{0}{1}.cpp".format(order, "32" if dtype == np.float32 else "64")
            create_directory_if_necessary(output)
            Bed.write(output, snpdata)
            snpdata2 = Bed(output).read()
            np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def test_write_x_x_cpp(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata")
    for order in ['C', 'F']:
        for dtype in [np.float32, np.float64]:
            snpdata = snpreader.read(order=order, dtype=dtype)
            snpdata.val[-1, 0] = float("NAN")
            output = "tempdir/toydata.{0}{1}.cpp".format(order, "32" if dtype == np.float32 else "64")
            create_directory_if_necessary(output)
            Bed.write(snpdata, output)  # note the data-first argument order used by older pysnptools releases
            snpdata2 = Bed(output).read()
            assert TestLoader.is_same(snpdata, snpdata2)  #!!!define an equality method on snpdata?
def too_slow_test_write_bedbig(self):
    iid_count = 100000
    sid_count = 50000
    from pysnptools.snpreader.snpdata import SnpData  #!!! promote one level up in namespace
    iid = np.array([[str(i), str(i)] for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i, i, i] for i in range(sid_count)])
    np.random.seed(0)  # the original assigned to np.random.seed, which silently does nothing
    snpdata = SnpData(iid, sid, pos, np.zeros((iid_count, sid_count)))  # older SnpData signature: pos before val
    #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
    output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
    create_directory_if_necessary(output)
    Bed.write(snpdata, output)
    snpdata2 = Bed(output).read()
    assert TestLoader.is_same(snpdata, snpdata2)  #!!!define an equality method on snpdata?
def too_slow_test_write_bedbig(self):
    iid_count = 100000
    sid_count = 50000
    from pysnptools.snpreader import SnpData
    iid = np.array([[str(i), str(i)] for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i, i, i] for i in range(sid_count)])
    np.random.seed(0)
    snpdata = SnpData(iid, sid, np.zeros((iid_count, sid_count)), pos=pos)
    #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
    output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, count_A1=False)
    snpdata2 = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def write(path, storage, snpdata, count_A1=True, updater=None):
    file_list = [
        SnpReader._name_of_other_file(path, remove_suffix="bed", add_suffix=new_suffix)
        for new_suffix in ["bim", "fam", "bed"]
    ]  # 'bed' should be last
    with _multiopen(
            lambda file_name: storage.open_write(file_name, updater=updater),
            file_list) as local_file_name_list:
        Bed.write(local_file_name_list[-1], snpdata, count_A1=count_A1)
    return _Distributed1Bed(path, storage)
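# For comparison, a hedged sketch of the one-call public API that the helper
# above decomposes into per-file writes; DistributedBed.write accepts a plain
# directory path, as the snpsA fixture further below also shows ("all.bed"
# and "tempdir/distributed" are placeholders):
from pysnptools.snpreader import Bed, DistributedBed

distributed = DistributedBed.write("tempdir/distributed", Bed("all.bed", count_A1=False))
print(distributed.iid_count, distributed.sid_count)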
def test_write_bed_f64cpp_5_python(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata")
    iid_index = 5
    logging.info("iid={0}".format(iid_index))
    #if snpreader.iid_count % 4 == 0: # divisible by 4 isn't a good test
    #    snpreader = snpreader[0:-1,:]
    #assert snpreader.iid_count % 4 != 0
    snpdata = snpreader[0:iid_index, :].read(order='F', dtype=np.float64)
    if snpdata.iid_count > 0:
        snpdata.val[-1, 0] = float("NAN")
    output = "tempdir/toydata.F64python.{0}".format(iid_index)
    create_directory_if_necessary(output)
    Bed.write(snpdata, output, force_python_only=True)
    snpdata2 = Bed(output).read()
    assert TestLoader.is_same(snpdata, snpdata2)  #!!!define an equality method on snpdata?
def test_write_bed_f64cpp_5_python(self): snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=False) iid_index = 5 logging.info("iid={0}".format(iid_index)) #if snpreader.iid_count % 4 == 0: # divisible by 4 isn't a good test # snpreader = snpreader[0:-1,:] #assert snpreader.iid_count % 4 != 0 snpdata = snpreader[0:iid_index,:].read(order='F',dtype=np.float64) if snpdata.iid_count > 0: snpdata.val[-1,0] = float("NAN") output = "tempdir/toydata.F64python.{0}".format(iid_index) create_directory_if_necessary(output) Bed.write(output,snpdata, force_python_only=True) snpdata2 = Bed(output,count_A1=False).read() np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def test_write_x_x_cpp(self):
    for count_A1 in [False, True]:
        snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=count_A1)
        for order in ['C', 'F']:
            for dtype in [np.float32, np.float64]:
                snpdata = snpreader.read(order=order, dtype=dtype)
                snpdata.val[-1, 0] = float("NAN")
                output = "tempdir/toydata.{0}{1}.cpp".format(
                    order, "32" if dtype == np.float32 else "64")
                create_directory_if_necessary(output)
                Bed.write(output, snpdata, count_A1=count_A1)
                snpdata2 = Bed(output, count_A1=count_A1).read()
                np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
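# The same write/read round trip outside a test harness, as a minimal sketch
# (the toydata path is a placeholder):
import os
import numpy as np
from pysnptools.snpreader import Bed

os.makedirs("tempdir", exist_ok=True)
snpdata = Bed("examples/toydata", count_A1=False).read(order='F', dtype=np.float64)
snpdata.val[-1, 0] = np.nan  # missing genotypes survive the round trip
Bed.write("tempdir/roundtrip.bed", snpdata, count_A1=False)
snpdata2 = Bed("tempdir/roundtrip.bed", count_A1=False).read()
np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)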
def test_write_bed_f64cpp_5_python(self): snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False) iid_index = 5 logging.info("iid={0}".format(iid_index)) #if snpreader.iid_count % 4 == 0: # divisible by 4 isn't a good test # snpreader = snpreader[0:-1,:] #assert snpreader.iid_count % 4 != 0 snpdata = snpreader[0:iid_index, :].read(order='F', dtype=np.float64) if snpdata.iid_count > 0: snpdata.val[-1, 0] = float("NAN") output = "tempdir/toydata.F64python.{0}".format(iid_index) create_directory_if_necessary(output) Bed.write(output, snpdata, force_python_only=True) snpdata2 = Bed(output, count_A1=False).read() np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
import logging

import numpy as np
from tqdm import tqdm


def shuffle_bed(bed_file):
    """
    Shuffle the genotypes of individuals SNP-by-SNP.

    :param bed_file: the prefix for the plink binary file
    :return: 1 on success, 0 if pysnptools is unavailable
    """
    try:
        from pysnptools.snpreader import Bed
    except Exception as e:
        print(e)
        return 0
    logging.info('Read the plink file')  # the original called logging.INFO, which is an int, not a function
    data = Bed(bed_file, count_A1=False).read()
    num_snp = data.val.shape[1]
    logging.info("Start shuffling the genotypes SNP-by-SNP")
    for i in tqdm(range(num_snp)):
        np.random.shuffle(data.val[:, i])
    logging.info('Write the shuffled plink file')
    Bed.write(bed_file + '_shuffle', data, count_A1=False)
    return 1
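# Hedged usage sketch: given mydata.bed/.bim/.fam, this writes
# mydata_shuffle.bed/.bim/.fam with every SNP column independently permuted
# across individuals ("mydata" is a placeholder prefix):
if shuffle_bed("mydata"):
    print("wrote mydata_shuffle.*")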
from pysnptools.snpreader import Bed, SnpData


def gen_Test_Bed(filename, n0, n1, m):
    """Write a toy Bed file with n1 cases (all genotypes 2.0), n0 controls
    (all genotypes 0.0), and m SNPs, then mark case/control status in the
    sixth column of the .fam file."""
    n = n0 + n1
    iid = [["fam_" + str(i), "iid_" + str(i)] for i in range(0, n)]
    sid = ["snp_" + str(i) for i in range(0, m)]
    X = [[2.0 for i in range(0, m)] for i in range(0, n1)]
    X.extend([[0.0 for i in range(0, m)] for i in range(0, n0)])
    dat = SnpData(iid=iid, sid=sid, val=X)
    Bed.write(filename, dat)
    # Rewrite the .fam file so the first n1 individuals are cases ("2")
    # and the rest are controls ("1").
    fil = open(filename + ".fam")
    lines = fil.readlines()
    fil.close()
    fil = open(filename + ".fam", "w")
    for i in range(0, len(lines)):
        l = lines[i]
        s = l.strip().split()
        if i < n1:
            s[5] = "2"
        else:
            s[5] = "1"
        l = " ".join(s) + "\n"
        fil.write(l)
    fil.close()
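# Hedged usage sketch: 30 controls (n0), 20 cases (n1), 10 SNPs; the first 20
# rows of the rewritten .fam carry phenotype "2" ("tempdir/testbed" is a
# placeholder prefix and the directory must already exist):
gen_Test_Bed("tempdir/testbed", 30, 20, 10)
print(Bed("tempdir/testbed").read().val.shape)  # (50, 10)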
def snpsA(seed, iid_count, sid_count, use_distributed):
    import numpy as np
    from pysnptools.snpreader import Bed
    from pysnptools.snpreader import DistributedBed
    from pysnptools.snpreader import SnpGen

    chrom_count = 10
    global cache_top  # the name used below ('top_cache' in the original looks like a typo)
    if use_distributed:
        test_snp_path = (
            cache_top / f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}_db")
    else:
        test_snp_path = (
            cache_top / f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}.bed")
    count_A1 = False
    if not test_snp_path.exists():
        snpgen = SnpGen(
            seed=seed,
            iid_count=iid_count,
            sid_count=sid_count,
            chrom_count=chrom_count,
            block_size=1000,
        )
        if use_distributed:
            test_snps = DistributedBed.write(str(test_snp_path), snpgen)
        else:
            test_snps = Bed.write(str(test_snp_path),
                                  snpgen.read(dtype="float32"),
                                  count_A1=count_A1)
    else:
        if use_distributed:
            test_snps = DistributedBed(str(test_snp_path))
        else:
            test_snps = Bed(str(test_snp_path), count_A1=count_A1)
    from pysnptools.snpreader import SnpData

    np.random.seed(seed)
    pheno = SnpData(
        iid=test_snps.iid,
        sid=["pheno"],
        val=np.random.randn(test_snps.iid_count, 1) * 3 + 2,
    )
    covar = SnpData(
        iid=test_snps.iid,
        sid=["covar1", "covar2"],
        val=np.random.randn(test_snps.iid_count, 2) * 2 - 3,
    )
    return test_snps, pheno, covar
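# Hedged usage sketch (assumes `cache_top` is a pathlib.Path defined at module
# scope, as the fixture's global statement implies):
test_snps, pheno, covar = snpsA(seed=0, iid_count=100, sid_count=500, use_distributed=False)
print(test_snps.sid_count, pheno.val.shape, covar.val.shape)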
sid_count_max = 5765294
sid_batch_size = 50
sid_batch_count = -(sid_count // -sid_batch_size)          # ceiling division
sid_batch_count_max = -(sid_count_max // -sid_batch_size)  # ceiling division
snpgen = SnpGen(seed=234, iid_count=iid_count, sid_count=sid_count_max)
for batch_index in range(sid_batch_count):
    sid_index_start = batch_index * sid_batch_size
    sid_index_end = (batch_index + 1) * sid_batch_size  # what about rounding
    filename = r"d:\deldir\rand\fakeukC{0}x{1}-{2}.bed".format(
        iid_count, sid_index_start, sid_index_end
    )
    if not os.path.exists(filename):
        Bed.write(
            filename + ".temp", snpgen[:, sid_index_start:sid_index_end].read()
        )
        os.rename(filename + ".temp", filename)

if False:
    from pysnptools.snpreader import Pheno, Bed

    filename = r"m:\deldir\New folder (4)\all_chr.maf0.001.N300.bed"
    iid_count = 300
    iid = [["0", "iid_{0}".format(iid_index)] for iid_index in range(iid_count)]
    bed = Bed(filename, iid=iid, count_A1=False)
    print(bed.iid_count)

if False:
    from pysnptools.util import example_file
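# The negative floor-division trick above is ceiling division; a quick check
# (illustration only):
print(-(101 // -50))  # 3 -> 101 SNPs need 3 batches of 50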
        val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
    val = val[iid_index, :]  # reorder or trim any extra allocation
    if not SnpReader._array_properties_are_ok(val, order, dtype):
        val = val.copy(order=order)
    self._close_bed()
    return val


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    if True:
        from pysnptools.util import example_file

        pheno_fn = example_file("pysnptools/examples/toydata.phe")

    if False:
        from pysnptools.snpreader import Pheno, Bed
        import pysnptools.util as pstutil
        import os

        print(os.getcwd())
        snpdata = Pheno('../examples/toydata.phe').read()  # Read data from Pheno format
        pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed")
        Bed.write("tempdir/toydata.5chrom.bed", snpdata, count_A1=False)  # Write data in Bed format

    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS)
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
# print('Time for permutation GWAS:' + str(time.time() - time_permut_0) + 's')

# # Permutations

# In[8]:

# Shuffling ALLELES by VARIANT
for i in range(NUMBER_OF_PERMUTATIONS):
    time_permut_0 = time.time()
    # Python works a little differently than R here: np.random.shuffle
    # modifies the input array in place!
    np.random.shuffle(mysnpdata.val)
    Bed.write('VariantsPermuted', mysnpdata)
    copyfile(VARIANTS_TO_TEST + '.bim', 'VariantsPermuted.bim')
    tmp_shuffled_df = single_snp('VariantsPermuted',
                                 PHENOTYPE_DATA,
                                 # cache_file='Outputs/Fast-Lmm-Cache/Gwas-Permutations-Cache'+str(i)
                                 cache_file='Outputs/Fast-Lmm-Cache/Gwas-Permutations-Cache.npz',
                                 leave_out_one_chrom=False,
                                 )
    tmp_shuffled_df['Full ID'] = tmp_shuffled_df['Chr'].astype('str') + '_' + tmp_shuffled_df['ChrPos'].astype('str')
    # sorting the new df to match the original
    tmp_shuffled_df = tmp_shuffled_df[['Full ID', 'SNP', 'PValue']]
    tmp_shuffled_df = tmp_shuffled_df.rename(columns={'Full ID': 'Full IDShuffled' + str(i + 1),
                                                      'PValue': 'PValueShuffled' + str(i + 1)})

snpdata = mysnpdata.val
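# Note (illustration, not from the notebook): on a 2-D array,
# np.random.shuffle permutes whole rows (here, individuals) and leaves each
# row intact; the per-SNP loop in shuffle_bed earlier instead permutes within
# each column independently.
import numpy as np

a = np.arange(12).reshape(4, 3)
np.random.shuffle(a)  # rows reordered, row contents unchanged
print(a)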
    header=0,
)  # (fragment: tail of the pd.read_csv call that loads the principal components into pcs_df)
fam_df = pd.read_csv(args.bfile + ".fam",
                     delim_whitespace=True,
                     usecols=[0, 1],
                     names=["FID", "IID"])
merged_df = pd.merge(fam_df, pcs_df, on="IID").set_index("IID")
pcs = merged_df.ix[fam_df.ix[:, 1], 2:]  # .ix is removed in modern pandas; .loc/.iloc replace it
Q, R = np.linalg.qr(pcs)
print("reading dataset")
dataset = Bed(args.bfile).read().standardize()
dataset.standardize()
import pdb
pdb.set_trace()
Bed.write("temp", dataset)
print("projecting data")
X_Q = Q.T.dot(dataset.val)
print("unprojecting")
X_rr = Q.dot(X_Q)
print("subtracting out population structure")
X = dataset.val - X_rr
print("writing")
newbed = SnpData(dataset.iid, dataset.sid, X, pos=dataset.pos)
Bed.write("temp.bed", newbed)
data_file = r'd:\OneDrive\programs\epiCornell\syndata.bed'
if False:
    from pysnptools.snpreader import SnpData
    import numpy as np

    bed1 = Bed("../../tests/datasets/synth/all")
    print(bed1.iid_count, bed1.sid_count, bed1.iid_count * bed1.sid_count)
    # goal: 1500 individuals x 27000 SNPs
    snpdata1 = bed1.read()
    iid = bed1.iid
    sid = ['sid{0}'.format(i) for i in range(27000)]
    val = np.tile(snpdata1.val, (3, 6))[:, :27000].copy()
    #snpdata = Pheno('pysnptools/examples/toydata.phe').read() # Read data from Pheno format
    snpdata2 = SnpData(iid, sid, val)
    print(snpdata2.iid_count, snpdata2.sid_count, snpdata2.iid_count * snpdata2.sid_count)
    Bed.write(data_file, snpdata2, count_A1=False)  # filename-first order; count_A1 requires the newer API
    synbed = Bed(data_file)
    print(synbed.iid_count, synbed.sid_count, synbed.iid_count * synbed.sid_count)

    part_count = 1000
    part_list = list(split_on_sids(synbed, part_count))
    pairs00 = _Pairs(part_list[0])
    from fastlmm.association import single_snp

    pheno_fn = r"d:\OneDrive\programs\epiCornell\pheno.txt"
    cov_fn = r"d:\OneDrive\programs\epiCornell\cov.txt"
    results_df = single_snp(pairs00, K0=synbed, pheno=pheno_fn, covar=cov_fn,
                            leave_out_one_chrom=False, count_A1=True)

if False:
    for i, synbed_part_i in enumerate(synbed_part_list):
        val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 4] = np.nan
        val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 8] = 1
        val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 12] = byteThree
        bytes = np.mod(bytes, 4)
        val[0::4, SNPsIndex:SNPsIndex + 1] = byteZero
        val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 1] = np.nan
        val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 2] = 1
        val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
    val = val[iid_index_out, :]  # reorder or trim any extra allocation
    #!!LATER this can fail because the trim statement above messes up the order
    #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!
    self._close_bed()
    return val


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil

    snpdata = Pheno('../examples/toydata.phe').read()  # Read data from Pheno format
    pstutil.create_directory_if_necessary("tempdir/toydata.bed")
    Bed.write("tempdir/toydata.bed", snpdata, count_A1=False)  # Write data in Bed format

    import doctest
    doctest.testmod()
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
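# A standalone illustration (my own sketch, not pysnptools internals) of the
# 2-bit PLINK .bed encoding that the vectorized thresholds above decode: each
# byte packs four samples, low-order bit pair first, with 0b00/0b01/0b10/0b11
# meaning hom-first-allele/missing/het/hom-second-allele. One convention is
# shown below; pysnptools swaps the 0.0 and 2.0 values depending on count_A1.
import numpy as np

def unpack_plink_byte(b):
    codes = [(b >> shift) & 0b11 for shift in (0, 2, 4, 6)]
    lookup = {0: 0.0, 1: np.nan, 2: 1.0, 3: 2.0}
    return [lookup[c] for c in codes]

print(unpack_plink_byte(0b11100100))  # [0.0, nan, 1.0, 2.0]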
print(phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos)
# Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]]
phenodata = phenoreader.read()
print(phenodata.val)
# [[ 4.85339514e-01]
#  [ -2.07698457e-01]
#  [ 1.49090841e+00]
#  [ -1.21289967e+00]
#  ...

# Write the first 10 iids and sids of the Bed data into Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it to Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a snpdata on the fly and write to Bed
snpdata1 = SnpData(iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
                   sid=['snp1', 'snp2'],
                   val=[[0, 1], [2, 1], [1, np.nan]])
Bed.write("deleteme1.bed", snpdata1)

# Pheno is slow because it's txt. Bed format can only hold 0,1,2,missing.
# Use SnpNpz for the fastest read/write times and smallest file size
from pysnptools.snpreader import SnpNpz
SnpNpz.write("deleteme1010.snp.npz", snpdata1010)

# Use SnpHdf5 for random-access reads, good speed and size, and compatibility outside Python
from pysnptools.snpreader import SnpHdf5
SnpHdf5.write("deleteme1010.snp.hdf5", snpdata1010)
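# Reading those alternative formats back is symmetric; a quick hedged sketch:
from pysnptools.snpreader import SnpNpz, SnpHdf5

print(SnpNpz("deleteme1010.snp.npz").read().val.shape)    # (10, 10)
print(SnpHdf5("deleteme1010.snp.hdf5").read().val.shape)  # (10, 10)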