Exemplo n.º 1
0
    def divideData(self, filename, num=5, mph=3, delet=True):
        """Evaluate ``self.VarCalc.RealVar`` on ``num`` random partitions of the samples.

        The PLINK fileset ``filename`` (opened with ``Bed``) and its ``.fam``
        file (opened with ``Pheno``) are shuffled with one shared permutation,
        cut into ``num`` contiguous slices, and each slice is standardized and
        handed to ``self.VarCalc.RealVar``.

        Args:
            filename: Path given to ``Bed``; ``filename + ".fam"`` is given to ``Pheno``.
            num: Number of partitions.
            mph: Unused; kept so existing callers keep working.
            delet: Unused; kept so existing callers keep working.

        Returns:
            A list with one ``RealVar`` result per partition.
        """
        print("Estimating heritability using " + str(num) + " components")
        sFil = Bed(filename)
        yFil = Pheno(filename + ".fam")
        n = sFil.iid_count

        # The same permutation is applied to both readers so that genotype
        # rows and phenotype rows stay aligned.
        reOrd = perm(n)
        yFil = yFil[reOrd, :]
        sFil = sFil[reOrd, :]

        y = yFil.read().val[:, 3]

        # Partition boundaries: div[i] .. div[i + 1] is the i-th slice.
        div = [int(math.ceil(i * n / float(num))) for i in range(num + 1)]

        varEsts = []
        for i in range(num):
            print("For component " + str(i))
            lo, hi = div[i], div[i + 1]
            # Slice the *permuted* reader. The previous code sliced self.BED,
            # whose rows are in file order and therefore did not correspond
            # to the permuted phenotype values in ``y``.
            Xtemp = sFil[lo:hi, :].read().standardize().val
            ytemp = y[lo:hi]
            varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))

        return varEsts
Exemplo n.º 2
0
    def test_two(self):  #!!! rather a big test case
        """Run ``single_snp_all_plus_select`` on the chromosome-5 SNPs.

        The call is made once with ``GB_goal=None`` and once with
        ``GB_goal=2``; each result is compared against the "two" reference.
        """
        from pysnptools.util.mapreduce1.runner import Local, LocalMultiProc
        logging.info("TestSingleSnpAllPlusSelect test_two")

        # input files
        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        # partition snps on chr5 vs rest
        all_snps = Bed(bed_fn, count_A1=False)
        on_test_chrom = all_snps.pos[:, 0] == 5
        chr5_snps = all_snps[:, on_test_chrom]

        # other runner names: "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        make_runner = mf_to_runner_function("lmpl")
        runner = make_runner(20)

        out_fn = self.file_name("two")
        for gb_goal in (None, 2):
            k_values = [int(k) for k in np.logspace(0, 7, base=2, num=7)]
            results = single_snp_all_plus_select(test_snps=chr5_snps,
                                                 G=bed_fn,
                                                 pheno=pheno_fn,
                                                 covar=cov_fn,
                                                 k_list=k_values,
                                                 n_folds=7,
                                                 seed=42,
                                                 do_plot=False,
                                                 GB_goal=gb_goal,
                                                 output_file_name=out_fn,
                                                 runner=runner,
                                                 count_A1=False)
            logging.info(results.head())
            self.compare_files(results, "two")
Exemplo n.º 3
0
    def test_intersection(self):
        """Check the types produced by intersecting and slicing a ``SnpKernel``."""

        from pysnptools.standardizer import Unit
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.snpreader import Pheno
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        kernel = SnpKernel(Bed(bed_path, count_A1=False), stdizer.Identity())

        # To test intersection we remove an iid from pheno
        full_pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
        pheno = full_pheno[1:, :]

        # SnpKernel is special because it standardizes AFTER intersecting.
        intersected, pheno = intersect_apply([kernel, pheno])
        wraps_snp_subset = isinstance(intersected.snpreader, _SnpSubset)
        assert wraps_snp_subset and not isinstance(intersected, _KernelSubset)

        # What happens with fancy selection?
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
Exemplo n.º 4
0
 def __init__(self,args):
     """Select SNPs from ``args.bfile`` and compute scores for them.

     SNPs are kept when their allele frequency lies strictly between
     ``args.maf`` and ``1 - args.maf``. They are optionally restricted to
     the open interval ``(args.from_bp, args.to_bp)`` on ``pos[:, 2]`` and
     to the identifiers listed one per line in ``args.extract``.

     Args:
         args: Options object; this method reads ``window_type``, ``bfile``,
             ``maf``, ``from_bp``, ``to_bp`` and ``extract`` and forwards the
             whole object to ``get_allele_frequency``, ``get_windows`` and
             ``compute``.

     Raises:
         ValueError: If ``args.window_type`` is neither ``'KBP'`` nor ``'SNP'``.
     """
     if args.window_type not in ['KBP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile,count_A1=False)
     af1 = self.get_allele_frequency(bed_1,args)
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1>args.maf)&(af1<1-args.maf)
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         in_range = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         snps_1 = snps_1&in_range
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         # Close the file deterministically instead of leaking the handle.
         with open(args.extract,'r') as extract_file:
             keep = np.array([line.strip() for line in extract_file])
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     # Sorted so that the selected SNPs stay in file order.
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
     pos = bed_1.pos[bed_1_index]
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af = af1[bed_1_index]
     # No per-individual annotation file is loaded; ``compute`` receives None.
     a1 = None
     self.af = af
     self.M = len(bed_1_index)
     self.windows = self.get_windows(pos,args)
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1,bed_1_index,af,a1,args)
Exemplo n.º 5
0
    def test_notebook(self):
        """Run ``single_snp_all_plus_select`` on the chromosome-5 SNPs with ``GB_goal=2``.

        The result is compared against the "notebook" reference.
        """
        do_plot = False
        mf_name = "lmp"  #"local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        runner = mf_to_runner_function(mf_name)(4)
        output_file_name = self.file_name("notebook")

        # The log line previously named test_one (copy-paste slip).
        logging.info("TestSingleSnpAllPlusSelect test_notebook")
        # define file names
        snp_reader = Bed(self.pythonpath + "/tests/datasets/synth/all",
                         count_A1=False)
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        # NOTE(review): cov_fn is built but never passed to the call below.
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        # find the chr5 SNPs
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == 5]

        #select the 2nd kernel and run GWAS
        results = single_snp_all_plus_select(test_snps=test_snps,
                                             G=snp_reader,
                                             pheno=pheno_fn,
                                             GB_goal=2,
                                             do_plot=do_plot,
                                             output_file_name=output_file_name,
                                             runner=runner,
                                             count_A1=False)

        self.compare_files(results, "notebook")
Exemplo n.º 6
0
def merge_ld(bfile, ld_dir):
    """Assemble the per-part matrices in ``ld_dir`` and save a pseudo-inverse.

    ``part.info`` in ``ld_dir`` lists one ``start-end`` row range and one
    ``start-end`` column range per part; part ``k`` (1-based) is loaded from
    ``part_k.npy``. The assembled matrix is scaled by the square roots of its
    diagonal on both axes, pseudo-inverted with ``linalg.pinvh``, and the
    inverse and its rank are written to ``inv_ld.npy`` and ``rank.txt``.
    """

    def _bounds(span):
        # "a-b" -> [a, b]
        return [int(piece) for piece in span.split('-')]

    snp_num = Bed(bfile, count_A1=False).col_count
    cov = np.zeros([snp_num, snp_num])

    part_info = pd.read_table(join(ld_dir, 'part.info'),
                              header=None,
                              sep='\t',
                              names=['row', 'col'])

    for part_i in part_info.index:
        row_start, row_end = _bounds(part_info['row'][part_i])
        col_start, col_end = _bounds(part_info['col'][part_i])
        part_path = join(ld_dir, 'part_{}.npy'.format(part_i + 1))
        cov[row_start:row_end, col_start:col_end] = np.load(part_path)

    # Scale rows, then columns, by the square root of the diagonal.
    stddev = np.sqrt(np.diag(cov))
    cov /= stddev[:, None]
    cov /= stddev[None, :]
    inv_cov, rank = linalg.pinvh(cov, return_rank=True)

    np.save(join(ld_dir, 'inv_ld.npy'), inv_cov)
    with open(join(ld_dir, 'rank.txt'), 'w') as rank_file:
        rank_file.write(str(rank))
Exemplo n.º 7
0
    def test_one(self):
        """Run ``epistasis`` on two overlapping SNP lists and check the result.

        The returned columns are compared against the "one" reference, then
        written to a second file and compared against "one" again.
        """
        logging.info("TestEpistasis test_one")
        from pysnptools.snpreader import Bed
        snps = Bed(self.bedbase, count_A1=False)

        out_fn = self.file_name("one")
        frame = epistasis(snps,
                          self.phen_fn,
                          G0=snps,
                          covar=self.cov_fn,
                          sid_list_0=snps.sid[:10],  # first 10 snps
                          sid_list_1=snps.sid[5:15],  # skip 5 snps, use next 10
                          output_file_name=out_fn,
                          count_A1=False)
        sid0 = np.array(frame['SNP0'])
        sid1 = np.array(frame['SNP1'])
        pvalue_list = np.array(frame['PValue'])

        # Check the output file
        self.compare_files(sid0, sid1, pvalue_list, "one")

        # Check the values returned
        out_fn_again = self.file_name("one_again")
        write(sid0, sid1, pvalue_list, out_fn_again)
        self.compare_files(sid0, sid1, pvalue_list, "one")
Exemplo n.º 8
0
    def test_linreg(self):
        """Compare ``single_snp`` (identity kernel) and ``single_snp_linreg``.

        Both results are checked against the "linreg" reference.
        """
        logging.info("TestSingleSnp test_linreg")
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_fn = self.file_name("linreg")

        lmm_frame = single_snp(test_snps=snps[:, :10],
                               pheno=pheno_fn,
                               mixing=0,
                               leave_out_one_chrom=False,
                               G0=KernelIdentity(iid=snps.iid),
                               covar=covar_fn,
                               output_file_name=out_fn,
                               count_A1=False)
        # Keep only the columns that are compared against the reference.
        wanted = ['sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue']
        lmm_frame = lmm_frame[wanted]
        self.compare_files(lmm_frame, "linreg")

        linreg_frame = single_snp_linreg(test_snps=snps[:, :10],
                                         pheno=pheno_fn,
                                         covar=covar_fn,
                                         output_file_name=out_fn)
        self.compare_files(linreg_frame, "linreg")
Exemplo n.º 9
0
    def test_gb_goal(self):
        """Run ``single_snp`` with ``GB_goal`` 0 and .12; both must match "one"."""
        logging.info("TestSingleSnp test_gb_goal")
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn

        # (output tag, GB_goal) for each run, in the original order
        for tag, gb_goal in (("gb_goal", 0), ("gb_goal2", .12)):
            out_fn = self.file_name(tag)
            frame = single_snp(test_snps=snps[:, :10],
                               pheno=pheno_fn,
                               mixing=0,
                               leave_out_one_chrom=False,
                               G0=snps,
                               covar=covar_fn,
                               GB_goal=gb_goal,
                               output_file_name=out_fn,
                               count_A1=False)

            self.compare_files(frame, "one")
Exemplo n.º 10
0
 def __init__(self, prefix, case_file):
     """Load ``<prefix>.bed`` and split its individuals into cases and controls.

     Cases are the first two columns of ``case_file``; every other
     individual of the bed file is treated as a control. The case
     individual ids are printed at the end.
     """
     self.prefix = prefix
     self.case_file = case_file

     reader = Bed(f"{prefix}.bed", count_A1=False)
     self.snpreader = reader
     if reader.pos.dtype != 'int64':
         reader.pos[:,0] = np.vectorize(replace)(reader.pos[:,0])
     # Column 1 is overwritten with a combined chromosome/base-pair key.
     reader.pos[:,1] = reader.pos[:,0] * 100000000000 + reader.pos[:,2]
     self.snpdata = reader.read()
     print('SNP data loaded.')

     # Per-SNP columns
     self.chr_list = list(set(reader.pos[:,0]))
     self.Chr = reader.pos[:,0]
     self.Position =  reader.pos[:,1]
     self.bp =  reader.pos[:,2]
     self.SNPID = reader.sid

     # Case / control bookkeeping
     self.case = np.loadtxt(case_file, dtype=reader.iid.dtype)[:,:2]
     self.case_list = list(self.case)
     self.all_list = [tuple(pair) for pair in reader.iid]
     self.caseset = {tuple(pair) for pair in self.case}
     self.control_list = [
         list(pair) for pair in self.all_list if pair not in self.caseset
     ]

     # Counts
     self.numSNP = reader.sid_count
     self.numSample = len(self.all_list)
     self.numCase = len(self.case_list)
     self.numControl = len(self.control_list)

     case_rows = reader.iid_to_index(self.case)
     self.case_geno = self.snpdata.val[case_rows]

     # Second id column of each case, decoded for printing.
     self.case_list_print = '\n'.join(
         entry[1].decode('utf-8') for entry in self.case_list)
     print('Case individuals are: \n')
     print(self.case_list_print)
     print('\n')
Exemplo n.º 11
0
    def test_leave_one_out_with_prekernels(self):
        """Run ``single_snp`` with one precomputed kernel per chromosome.

        The result is compared against the "one_looc" reference.
        """
        logging.info(
            "TestSingleSnpLeaveOutOneChrom test_leave_one_out_with_prekernels")
        from pysnptools.kernelstandardizer import DiagKtoN
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn

        def kernel_without(chrom):
            # Create a kernel from the SNPs not used in testing
            rest = snps[:, snps.pos[:, 0] != chrom]
            raw = rest.read_kernel(standardizer=Unit(), block_size=500)
            # improves the kernel numerically by making its diagonal sum to iid_count
            return raw.standardize(DiagKtoN())

        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
            chrom_to_kernel = {
                chrom: kernel_without(chrom)
                for chrom in np.unique(snps.pos[:, 0])
            }

        out_fn = self.file_name("one_looc_prekernel")
        frame = single_snp(snps,
                           pheno_fn,
                           covar=covar_fn,
                           K0=chrom_to_kernel,
                           output_file_name=out_fn,
                           count_A1=False)

        self.compare_files(frame, "one_looc")
    def too_slow_test_notebook(self):
        """Run the notebook scenario with a ``LocalMultiProc`` runner.

        The runner uses one process per CPU; the result is compared against
        the "notebook" reference.
        """
        runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)
        out_fn = self.file_name("notebook")

        logging.info("TestSingleSnpAllPlusSelect test_notebook")
        # define file names
        all_snps = Bed(self.pythonpath + "/tests/datasets/synth/all.bed",
                       count_A1=False)
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        # find the chr5 SNPs
        is_chr5 = all_snps.pos[:, 0] == 5
        chr5_snps = all_snps[:, is_chr5]

        # select the 2nd kernel and run GWAS
        results = single_snp_all_plus_select(test_snps=chr5_snps,
                                             G=all_snps,
                                             pheno=pheno_fn,
                                             GB_goal=2,
                                             do_plot=False,
                                             output_file_name=out_fn,
                                             runner=runner,
                                             count_A1=False)

        self.compare_files(results, "notebook")
Exemplo n.º 13
0
 def test_snp_dist2(self):
     """Convert a Bed reader with ``as_dist`` and pass it to the exists-checker."""
     logging.info("in test_snp_dist2")
     bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
     dist_reader = Bed(bed_path, count_A1=False).as_dist(max_weight=2)
     # The string is not used; the call checks that str() works.
     s = str(dist_reader)
     _fortesting_JustCheckExists().input(dist_reader)
Exemplo n.º 14
0
	def __init__(self,filename,snpfile="",params="",n0=-1,n1=-1):
		"""Open a PLINK fileset and its phenotype column, optionally subsampled.

		Args:
			filename: Path given to ``Bed``; ``filename + ".fam"`` is given to ``Pheno``.
			snpfile: Optional text file with one SNP id per line. When empty,
				all SNP ids of the bed file are used.
			params: Stored on ``self.params`` until ``setUp`` has run, then
				reset to ``""``.
			n0: When positive, keep at most ``n0`` individuals whose shifted
				phenotype is 0.0, followed by the slice ``[:n1]`` of those
				whose shifted phenotype is 1.0.
			n1: See ``n0``.

		Exits the process with status 1 if the SNP list cannot be loaded.
		"""
		self.BED = Bed(filename)
		self.pheno = Pheno(filename + ".fam")
		# Column 3 of the phenotype values, shifted down by one.
		self.y = self.pheno.read().val[:, 3]
		self.y = self.y - 1.0
		self.params = params
		n = len(self.y)

		if n0 > 0:
			print("Initiate with n0")
			I0 = [i for i in range(0, n) if self.y[i] == 0.0]
			I0 = I0[:n0]
			I1 = [i for i in range(0, n) if self.y[i] == 1.0]
			# NOTE(review): with the default n1=-1 this slice drops the last
			# matching individual -- confirm that is intended.
			I1 = I1[:n1]
			I0.extend(I1)
			self.y = self.y[I0]
			self.BED = self.BED[I0, :]

		try:
			if len(snpfile) > 0:
				# ``with`` closes the file even if reading fails.
				with open(snpfile) as fil:
					self.snps = [l.strip() for l in fil]
			else:
				self.snps = self.BED.sid
		except Exception:
			# A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
			print("Error loading SNPs!")
			# Non-zero status so callers can detect the failure.
			sys.exit(1)
		self.setUp()
		self.n = len(self.y)
		print("Number of individuals: " + str(self.n))
		self.Cov = []
		self.params = ""
Exemplo n.º 15
0
    def readFiles(self):
        """Load genotypes, phenotypes and marker names for ``self.fileName``.

        For ``self.fileType == 'plink'`` the genotypes come from
        ``<fileName>.bed`` and the phenotypes from ``self.famReader``. For
        ``'csv'`` they come from ``<fileName>.geno.csv`` and
        ``<fileName>.pheno.csv``; marker names are read from
        ``<fileName>.marker.csv`` and fall back to ``'geno 1'``, ``'geno 2'``,
        ... when that file cannot be loaded.

        Returns:
            A tuple ``(X, y, Xname)``: the imputed genotype matrix and the
            phenotype vector, both restricted to rows whose phenotype is not
            NaN, plus the marker names.
        """
        print('Reading Data ...')
        X = None
        y = None
        Xname = None
        if self.fileType == 'plink':
            from pysnptools.snpreader import Bed
            snpreader = Bed(self.fileName + '.bed')
            snpdata = snpreader.read()
            X = snpdata.val
            Xname = snpdata.sid

            # Phenotypes are read with the class's own .fam reader.
            y = self.famReader(self.fileName + ".fam")

        if self.fileType == 'csv':
            X = np.loadtxt(self.fileName + '.geno.csv', delimiter=',')
            y = np.loadtxt(self.fileName + '.pheno.csv', delimiter=',')
            try:
                Xname = np.loadtxt(self.fileName + '.marker.csv',
                                   delimiter=',')
            except Exception:
                # Missing or unparsable marker file: use generic names.
                Xname = ['geno ' + str(i + 1) for i in range(X.shape[1])]
        if self.imputationFlag:
            X = self.imputation(X)
            # ``True - bool_array`` raises TypeError on current NumPy;
            # ``~`` is the supported logical negation.
            keep = ~np.isnan(y)
            return X[keep, :], y[keep], Xname
        else:
            X = self.simpleImputation(X)
            # NaN != NaN, so this keeps exactly the non-NaN phenotypes.
            keep = (y == y)
            return X[keep, :], y[keep], Xname
Exemplo n.º 16
0
    def test_G0_has_reader(self):
        """Give the SNP reader as ``G0`` and then as ``G1``; both must match "one"."""
        logging.info("TestSingleSnp test_G0_has_reader")
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_fn = self.file_name("G0_has_reader")

        # All weight on G0, which is the SNP reader.
        via_g0 = single_snp(test_snps=snps[:, :10],
                            pheno=pheno_fn,
                            G0=snps,
                            leave_out_one_chrom=False,
                            covar=covar_fn,
                            mixing=0,
                            output_file_name=out_fn,
                            count_A1=False)
        self.compare_files(via_g0, "one")

        # All weight on G1, which is the SNP reader; G0 is an identity kernel.
        via_g1 = single_snp(test_snps=snps[:, :10],
                            pheno=pheno_fn,
                            G0=KernelIdentity(snps.iid),
                            G1=snps,
                            leave_out_one_chrom=False,
                            covar=covar_fn,
                            mixing=1,
                            output_file_name=out_fn,
                            count_A1=False)
        self.compare_files(via_g1, "one")
Exemplo n.º 17
0
    def test_linreg(self):
        """Run ``single_snp_linreg`` twice and compare both results to "linreg".

        The first result is reduced to the reference columns before the
        comparison; the second is compared as returned.
        """
        logging.info("TestSingleSnpLinReg test_linreg")
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_fn = self.file_name("linreg")

        def run_linreg():
            return single_snp_linreg(test_snps=snps[:, :10],
                                     pheno=pheno_fn,
                                     covar=covar_fn,
                                     output_file_name=out_fn,
                                     count_A1=False)

        wanted = ['sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue']
        first = run_linreg()[wanted]
        self.compare_files(first, "linreg")

        second = run_linreg()
        self.compare_files(second, "linreg")
Exemplo n.º 18
0
    def test_file_cache(self):
        """Run the same ``single_snp`` call twice with a shared ``cache_file``.

        Any existing cache file is removed first. Both results are compared
        against the "G1" reference.
        """
        logging.info("TestSingleSnp test_file_cache")
        snps = Bed(self.bedbase, count_A1=False)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn

        out_fn = self.file_name("G1")
        cache_fn = self.file_name("cache_file") + ".npz"
        if os.path.exists(cache_fn):
            os.remove(cache_fn)

        # Two identical passes over the same cache file.
        for _ in range(2):
            frame = single_snp(test_snps=snps[:, :10],
                               pheno=pheno_fn,
                               G0=snps[:, 10:100],
                               leave_out_one_chrom=False,
                               covar=covar_fn,
                               G1=snps[:, 100:200],
                               mixing=.5,
                               output_file_name=out_fn,
                               cache_file=cache_fn,
                               count_A1=False)
            self.compare_files(frame, "G1")
Exemplo n.º 19
0
def cut_ld(bfile, chunk_size, ld_dir):
    """Write ``part.info``, the list of blocks of the SNP-by-SNP matrix.

    The SNPs of ``bfile`` are cut into chunks of ``chunk_size``. Every
    (row chunk, column chunk) pair gets one tab-separated line of the form
    ``row_start-row_end<TAB>col_start-col_end``, in row-major order.
    """
    snp_num = Bed(bfile, count_A1=False).col_count
    # number of chunks the snps divide into
    chunk_num = int(np.ceil(float(snp_num) / chunk_size))

    def _span(chunk_i):
        # "start-end" text for the chunk with this index
        return '{}-{}'.format(chunk_i * chunk_size, (chunk_i + 1) * chunk_size)

    # one entry per part of the covariance matrix that needs to be computed
    row_list = []
    col_list = []
    for flat_i in range(chunk_num * chunk_num):
        row_i, col_i = divmod(flat_i, chunk_num)
        row_list.append(_span(row_i))
        col_list.append(_span(col_i))

    table = pd.DataFrame({'row': row_list, 'col': col_list},
                         columns=['row', 'col'])
    table.to_csv(join(ld_dir, 'part.info'), index=False, header=False, sep='\t')
Exemplo n.º 20
0
    def test_some_std(self):
        """Check that several ways of computing a Unit-standardized kernel agree.

        The kernel read from ``self.snpdata`` is compared with a second read
        of the same data and with a block-wise read from the toydata Bed
        file. The method also calls ``str()`` on a ``SnpData`` copy before
        and after standardizing, and on each standardizer.
        """
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,
                           sid=self.snpdata.sid,
                           pos=self.snpdata.pos,
                           val=np.array(self.snpdata.val))
        s = str(snpdata2)
        snpdata2.standardize()
        s = str(snpdata2)

        snpreader = Bed(self.currentFolder + "/examples/toydata",
                        count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        # Use the NumPy dtypes directly: the scipy-namespace aliases
        # (sp.float64 / sp.float32) are deprecated re-exports of these.
        for dtype in [np.float64, np.float32]:
            for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
                x2 = x[:, ::2]
                x2b = np.array(x2)
                #LATER what's this about? It doesn't do non-contiguous?
                # The contiguous / non-contiguous comparison of
                # std.standardize(x2b) and std.standardize(x2) is disabled,
                # so this loop currently only builds the inputs.
        logging.info("done")
Exemplo n.º 21
0
    def setUpClass(self):
        """Load the mouse data set and store read-only arrays for the tests.

        Sets ``snp_fn``, ``pheno_fn``, ``G`` (Unit-standardized genotypes),
        ``y`` (first phenotype column) and ``G_cov`` (a column of ones).
        """
        here = os.path.dirname(os.path.realpath(__file__))
        self.snp_fn = here + "/../../tests/datasets/mouse/alldata"
        self.pheno_fn = here + "/../../tests/datasets/mouse/pheno_10_causals.txt"

        # load data
        reader = Bed(self.snp_fn)
        pheno = pstpheno.loadOnePhen(self.pheno_fn)

        # intersect sample ids
        reader, pheno = pysnptools.util.intersect_apply([reader, pheno])

        # standardized genotypes, frozen
        genotypes = reader.read(order='C').val
        genotypes = stdizer.Unit().standardize(genotypes)
        genotypes.flags.writeable = False
        self.G = genotypes

        # first phenotype column, frozen
        phenotype = pheno['vals'][:, 0]
        self.y = phenotype
        self.y.flags.writeable = False

        # no covariate file is loaded; use a single column of ones
        ones = np.ones((len(self.y), 1))
        ones.flags.writeable = False
        self.G_cov = ones
Exemplo n.º 22
0
 def test_write_x_x_cpp(self):
     """Round-trip toydata through ``Bed.write`` for each option combination.

     Covers both ``count_A1`` settings, both memory orders and both float
     widths. One value is set to NaN before writing.
     """
     bed_path = self.currentFolder + "/examples/toydata"
     for count_A1 in (False, True):
         reader = Bed(bed_path, count_A1=count_A1)
         for order in ('C', 'F'):
             for dtype in (np.float32, np.float64):
                 original = reader.read(order=order, dtype=dtype)
                 original.val[-1, 0] = float("NAN")

                 width = "32" if dtype == np.float32 else "64"
                 output = "tempdir/toydata.{0}{1}.cpp".format(order, width)
                 create_directory_if_necessary(output)

                 Bed.write(output, original, count_A1=count_A1)
                 reloaded = Bed(output, count_A1=count_A1).read()
                 np.testing.assert_array_almost_equal(original.val,
                                                      reloaded.val,
                                                      decimal=10)
Exemplo n.º 23
0
    def test1(self):
        """Check ``SnpGen`` against itself, a reference file and cached reads.

        Two reads of the same columns must agree with each other and with
        ``snpgen.bed``. Two generators sharing one ``cache_file`` must then
        reproduce every tenth row of that data.
        """
        logging.info("in TestSnpGen test1")
        seed = 0
        columns = [0, 1, 200, 2200, 10]

        snpgen = SnpGen(seed=seed,
                        iid_count=1000,
                        sid_count=1000 * 1000,
                        block_size=1000)
        snpdata = snpgen[:, columns].read()
        snpdata2 = snpgen[:, columns].read()
        assert (snpdata.allclose(snpdata2))

        from pysnptools.snpreader import Bed
        here = os.path.dirname(os.path.realpath(__file__))
        ref = Bed(here + '/../../tests/datasets/snpgen.bed',
                  count_A1=False).read()
        assert (snpdata.allclose(ref, equal_nan=True))

        # Start without a cache file.
        cache_file = 'tempdir/cache_file_test1.npz'
        if os.path.exists(cache_file):
            os.remove(cache_file)

        snpgen3 = SnpGen(seed=seed,
                         iid_count=1000,
                         sid_count=1000 * 1000,
                         block_size=1000,
                         cache_file=cache_file)
        snpdata3 = snpgen3[::10, columns].read()
        assert (snpdata3.allclose(snpdata2[::10, :].read()))

        # Same settings again, built after the first cached generator was used.
        snpgen4 = SnpGen(seed=seed,
                         iid_count=1000,
                         sid_count=1000 * 1000,
                         block_size=1000,
                         cache_file=cache_file)
        snpdata4 = snpgen4[::10, columns].read()
        assert (snpdata4.allclose(snpdata2[::10, :].read()))
Exemplo n.º 24
0
    def __init__(self,
                 path_or_bed,
                 blocksize,
                 LOCO_chrom_id=None,
                 forcelowrank=False):
        """Constructor.

        Args:
            path_or_bed: Either a path given to ``Bed`` (opened with
                ``count_A1=True``) or an existing ``SnpReader`` instance.
            blocksize: Stored on ``self.blocksize``.
            LOCO_chrom_id: Forwarded to ``self._get_LOCO_SNV_indices`` to
                determine ``self.variants_to_include``.
            forcelowrank: Stored on ``self.forcelowrank``; only for testing
                purposes.

        Raises:
            AssertionError: If ``path_or_bed`` is neither a ``str`` nor a
                ``SnpReader``.
        """
        self.forcelowrank = forcelowrank  # only for testing purposes!

        if isinstance(path_or_bed, str):
            self.bed = Bed(path_or_bed, count_A1=True)
        else:
            assert isinstance(
                path_or_bed, SnpReader
            ), 'path_or_bed must either be a path to a bed-file, or an instance of SnpReader.'
            # Previously the reader was validated but never stored, so the
            # next statement failed with AttributeError on ``self.bed``.
            self.bed = path_or_bed

        self.bed.pos[:, 0] = self.bed.pos[:, 0].astype(
            'str')  # chromosome should be str, stored positions are 1-based
        # Table of (fid, iid) pairs, indexed by the individual id as str.
        self.iid_fid = pd.DataFrame(self.bed.iid,
                                    index=self.bed.iid[:, 1].astype(str),
                                    columns=['fid', 'iid'])

        self.variants_to_include = self._get_LOCO_SNV_indices(LOCO_chrom_id)

        self.blocksize = blocksize

        # Not computed here; all start out empty.
        self.nb_ind = None
        self.nb_SNVs_unf = None
        self.G0 = None
        self.K0 = None
        self.nb_SNVs_f = None
        self.samples_overlapped = False
Exemplo n.º 25
0
    def test_three(self):  #!!! rather a big test case
        """Run ``single_snp_all_plus_select`` on the whole synth data set.

        The result is compared against the "three" reference.
        """
        from pysnptools.util.mapreduce1.runner import Local, LocalMultiProc
        logging.info("TestSingleSnpAllPlusSelect test_three")

        bed_path = self.pythonpath + "/tests/datasets/synth/all.bed"
        all_snps = Bed(bed_path, count_A1=False)
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        # other runner names: "local", "coreP", "nodeP", "socketP", "nodeE"
        make_runner = mf_to_runner_function("lmp")
        runner = make_runner(4)

        out_fn = self.file_name("three")
        k_values = [int(k) for k in np.logspace(0, 7, base=2, num=7)]
        results = single_snp_all_plus_select(test_snps=all_snps,
                                             pheno=pheno_fn,
                                             covar=cov_fn,
                                             k_list=k_values,
                                             n_folds=7,
                                             seed=42,
                                             do_plot=False,
                                             GB_goal=2,
                                             output_file_name=out_fn,
                                             runner=runner,
                                             count_A1=False)
        logging.info(results)
        self.compare_files(results, "three")
Exemplo n.º 26
0
 def test_pheno1(self):
     """Generate a phenotype from gen2.bed and compare it with the stored one."""
     from pysnptools.snpreader import Bed, SnpData, SnpNpz
     data_dir = self.currentFolder
     some_snp_data = Bed(data_dir + "/../../tests/datasets/generate/gen2.bed",
                         count_A1=False).read()

     generated = _generate_phenotype(some_snp_data,
                                     10,
                                     genetic_var=.5,
                                     noise_var=.5,
                                     seed=5)
     gen_snpdata = SnpData(iid=some_snp_data.iid,
                           sid=["pheno"],
                           val=generated.reshape(-1, 1))

     # The reference file was produced with SnpNpz.write on gen_snpdata.
     ref_snpdata = SnpNpz(
         data_dir + "/../../tests/datasets/generate/pheno1.snp.npz").read()
     assert gen_snpdata == ref_snpdata
Exemplo n.º 27
0
 def _snp_fixup(snp_input, iid_source_if_none=None):
     if isinstance(snp_input, str):
         return Bed(snp_input)
     elif snp_input is None:
         return iid_source_if_none[:, 0:0]  #return snpreader with no snps
     else:
         return snp_input
Exemplo n.º 28
0
 def hess_h2g(self):
     """Compute one h2g estimate per simulation and save them to ``h2g_hess.txt``.

     Each simulation uses its own column of the sampled individual and SNP
     indices, its own phenotype row and its own ``./ld/ld_f<i>.txt`` matrix.
     The mean, variance and median of the estimates (ignoring NaN) are
     printed, followed by a variance computed from ``self.h2g``, the number
     of individuals and the number of SNPs.
     """
     reader = Bed(self.bfile, count_A1=False)
     indv_idx = np.loadtxt("./sample/indv_idx.txt",
                           delimiter=",",
                           dtype=int)
     snp_idx = np.loadtxt("./sample/snp_idx.txt", delimiter=",", dtype=int)
     phe = np.load("./phe/phe_gene_std.npy")

     n = self.num_indv
     p = self.num_snp
     h2g_est = np.zeros(self.num_sim)

     for sim_i in range(self.num_sim):
         rows = indv_idx[:, sim_i]
         cols = snp_idx[:, sim_i]
         geno_val = reader[rows, cols].read().val

         # per-SNP frequency, then genotypes centred and scaled by it
         freq = np.sum(geno_val, axis=0) / (2 * self.num_indv)
         X = (geno_val - 2 * freq) / np.sqrt(2 * freq * (1 - freq))

         y = phe[sim_i, :]
         beta_est = np.matmul(np.transpose(X), y) / n

         V = np.loadtxt('./ld/ld_f{}.txt'.format(sim_i), delimiter=",")
         V_pinv = np.linalg.pinv(V)
         q = np.trace(np.matmul(V_pinv, V))

         quad_form = np.linalg.multi_dot([beta_est, V_pinv, beta_est])
         h2g_est[sim_i] = (n * quad_form - q) / (n - q)

     np.savetxt("h2g_hess.txt", h2g_est, delimiter=",")
     print("HESS:")
     print(np.nanmean(h2g_est), np.nanvar(h2g_est), np.nanmedian(h2g_est))
     var = (n / (n - p))**2 * (2 * p * (
         (1 - self.h2g) / n) + 4 * self.h2g) * ((1 - self.h2g) / n)
     print(var)
Exemplo n.º 29
0
 def test_snp_kernel2(self):
     """Build a Beta-standardized ``SnpKernel`` and pass it to the exists-checker."""
     logging.info("in test_snp_kernel2")
     bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
     reader = Bed(bed_path, count_A1=False)
     kernel = SnpKernel(reader, standardizer=stdizer.Beta(1, 25))
     # The string is not used; the call checks that str() works.
     s = str(kernel)
     _fortesting_JustCheckExists().input(kernel)
Exemplo n.º 30
0
    def test_file_cache(self):
        """Run ``single_snp`` with explicit G0/G1, then with only the cache file.

        Any existing cache file is removed first. The second call passes
        ``G0=None`` and ``G1=None`` with the same ``cache_file``. Both
        results are compared against the "G1" reference.
        """
        logging.info("TestSingleSnp test_file_cache")
        from pysnptools.snpreader import Bed
        snps = Bed(self.bedbase)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn

        out_fn = self.file_name("G1")
        cache_fn = self.file_name("cache_file") + ".npz"
        if os.path.exists(cache_fn):
            os.remove(cache_fn)

        with_readers = single_snp(test_snps=snps[:, :10],
                                  pheno=pheno_fn,
                                  G0=snps[:, 10:100],
                                  covar=covar_fn,
                                  G1=snps[:, 100:200],
                                  mixing=.5,
                                  output_file_name=out_fn,
                                  cache_file=cache_fn)
        self.compare_files(with_readers, "G1")

        cache_only = single_snp(test_snps=snps[:, :10],
                                pheno=pheno_fn,
                                G0=None,
                                covar=covar_fn,
                                G1=None,
                                mixing=.5,
                                output_file_name=out_fn,
                                cache_file=cache_fn)
        self.compare_files(cache_only, "G1")