Example #1
    def test_c_reader_npz(self):
        distreader = DistNpz(self.currentFolder + "/../examples/toydata10.dist.npz")
        distdata = distreader.read(order='F',force_python_only=False)
        snp_c = distdata.val
        
        self.assertEqual(np.float64, snp_c.dtype)
        self.assertTrue(np.allclose(self.dist_values[:,:10], snp_c, rtol=1e-05, atol=1e-05))

        distreader1 = DistNpz(self.currentFolder + "/../examples/toydata10.dist.npz")
        distreader2 = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")[:,:10]
        self.assertTrue(np.allclose(distreader1.read().val, distreader2.read().val, rtol=1e-05, atol=1e-05))


        distdata.val[1,2] = np.NaN # Inject a missing value to test writing and reading missing values
        output = "tempdir/distreader/toydata10.dist.npz"
        create_directory_if_necessary(output)
        DistNpz.write(output,distdata)
        snpdata2 = DistNpz(output).read()
        np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)

        snpdata3 = distdata[:,0:0].read() #create distdata with no sids
        output = "tempdir/distreader/toydata0.dist.npz"
        DistNpz.write(output,snpdata3)
        snpdata4 = DistNpz(output).read()
        assert snpdata3 == snpdata4
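
Across these examples, create_directory_if_necessary is called on an output file path so that the parent directories exist before a write; with isfile=False the path itself is treated as the directory to create, and robust=True (see Example #6) tolerates races when many tasks create the same directory at once. A minimal sketch of the call patterns, using only what the examples on this page show:

    from pysnptools.util import create_directory_if_necessary

    # Default (isfile=True): the argument is a file path; create its parent
    # directories so the subsequent write cannot fail on a missing folder.
    create_directory_if_necessary("tempdir/distreader/toydata10.dist.npz")

    # isfile=False: the argument itself is the directory to create.
    create_directory_if_necessary("tempdir/distreader", isfile=False)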
Example #2
    def test_str(self):
        logging.info("TestLmmTrain test_str")

        G0_train = self.pythonpath + "/tests/datasets/synth/all"
        covariate_train = None
        pheno_train = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train,count_A1=False)
        filename = self.tempout_dir + "/model_str.flm.p"
        pstutil.create_directory_if_necessary(filename)

        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)
                
        # predict on the same data used for training
        G0_test = G0_train
        covariate_test = covariate_train

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("str")
        Dat.write(output_file,predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"str")
Example #3
    def test_twoK(self):
        logging.info("TestLmmTrain test_twoK")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, K1_train=G0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_one.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("one")
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one")
Example #4
    def test_one(self):
        logging.info("test_one")
        import filecmp

        azure_path = "/chrom1/azurecopypy/test/one/little.txt"
        local_path0 = self._temp_dir() + "/little0.txt"
        pstutil.create_directory_if_necessary(local_path0)
        shutil.copy(os.path.realpath(__file__), local_path0)
        local_path = self._temp_dir() + "/little.txt"

        if self.container.file_exists(azure_path):
            self.container.remove(azure_path)
        assert not self.container.file_exists(azure_path)
        self.container.upload(local_path0, azure_path)
        assert self.container.file_exists(azure_path)
        self.container.download(azure_path, local_path, as_needed=True)
        self.container.download(
            azure_path, local_path, as_needed=True
        )  #Manually testing: see that it doesn't download again
        assert self.container.getmdate(
            azure_path) == datetime.datetime.utcfromtimestamp(
                os.path.getmtime(local_path)).replace(tzinfo=pytz.utc)
        assert filecmp.cmp(local_path0, local_path)
        self.container.remove(azure_path)
        self.container.rmtree('/'.join(azure_path.split('/')[:3]))
        os.remove(local_path)
Example #5
    def _simple_open_write(self, simple_file_name, size=0, updater=None):
        import pysnptools.util as pstutil
        logging.info("open_write('{0}',size={1})".format(
            simple_file_name, size))

        #Register the file name in the directory
        file_name = self.directory + "/" + simple_file_name
        if os.path.exists(
                file_name
        ):  # This is OK only if it is a directory containing no files (so the file itself doesn't exist)
            assert not os.path.isfile(
                file_name), "Can't open a file for write if it already exists."
            assert not self._at_least_one(
                self.walk(simple_file_name)
            ), "Can't open a file for write if a directory with files already has the same name ({0},{1})".format(
                self, simple_file_name)
            shutil.rmtree(file_name)
        else:
            pstutil.create_directory_if_necessary(file_name, isfile=True)

        yield file_name

        logging.info("close('{0}')".format(simple_file_name))
        assert os.path.exists(
            file_name
        ), "File doesn't exist in LocalCache. File is '{0}'".format(file_name)
Example #6
def _run_one_task(original_distributable, taskindex, taskcount, workdirectory):
    '''
    Does a fraction of the work (e.g. 1 of every 1000 work items) and then saves the results to a single file.
    If taskindex == taskcount, does the reduce step.
    '''

    if not 0 < taskcount: raise Exception("Expect taskcount to be positive")
    if not (0 <= taskindex <= taskcount):
        raise Exception(
            "Expect taskindex to be between 0 and taskcount (both inclusive)"
        )

    shaped_distributable = _shape_to_desired_workcount(original_distributable,
                                                       taskcount)

    if shaped_distributable.work_count != taskcount:
        raise Exception("Assert: expect workcount == taskcount")

    pstutil.create_directory_if_necessary(workdirectory,
                                          isfile=False,
                                          robust=True)

    if (taskindex < taskcount):
        doMainWorkForOneIndex(shaped_distributable, taskcount, taskindex,
                              workdirectory)
        return None
    else:
        result_sequence = work_sequence_from_disk(workdirectory, taskcount)
        return shaped_distributable.reduce(result_sequence)
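
A minimal driver sketch (not from the source) for the contract the docstring states: taskindex values 0 through taskcount - 1 each do a share of the work, and one final call with taskindex == taskcount performs the reduce; distributable stands in for any distributable object:

    taskcount = 4
    workdirectory = "tempdir/demo_run"
    for taskindex in range(taskcount):                                   # the map steps
        _run_one_task(distributable, taskindex, taskcount, workdirectory)
    result = _run_one_task(distributable, taskcount, taskcount, workdirectory)  # the reduce step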
Example #7
    def _fasttwoK(self,force_low_rank,GB_goal):

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        G1_train = SnpData(iid=G0_train.iid,sid=[item+"_1" for item in G0_train.sid],val=G0_train.read().val,pos=G0_train.pos,name="Different SNP names for {0}".format(G0_train))
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        logging.info("force_low_rank = {0}".format(force_low_rank))
        fastlmm1 = FastLMM(force_low_rank=force_low_rank,GB_goal=GB_goal).fit(K0_train=G0_train, K1_train=G1_train, X=covariate_train, y=pheno_train, mixing=.1)

        filename = self.tempout_dir + "/model_fasttwoK.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)
                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        G1_test = SnpData(iid=G0_test.iid,sid=[item+"_1" for item in G0_test.sid],val=G0_test.read().val,pos=G0_test.pos,name="Different SNP names for {0}".format(G0_test))
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G1_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("fasttwoK"+("_force_low" if force_low_rank else "")+("GB{0}".format(GB_goal) if GB_goal is not None else ""))
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one")
Example #8
File: test.py Project: hyacz/PySnpTools
    def test_c_reader_dat(self):
        snpreader = Dat(self.currentFolder + "/examples/toydata.dat")[:, ::100]
        _fortesting_JustCheckExists().input(snpreader)

        snpdata1 = snpreader.read()
        self.assertEqual(np.float64, snpdata1.val.dtype)
        self.assertTrue(
            np.allclose(self.snps[:, ::100],
                        snpdata1.val,
                        rtol=1e-05,
                        atol=1e-05))

        snpdata1.val[
            1,
            2] = np.NaN  # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.dat"
        create_directory_if_necessary(output)
        Dat.write(output, snpdata1)
        snpdata2 = Dat(output).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata2.val,
                                             decimal=10)

        snpdata3 = snpdata1[:, 0:0].read()  #create snpdata with no sids
        output = "tempdir/snpreader/toydata3.dat"
        Dat.write(output, snpdata3)
        snpdata4 = Dat(output).read()
        assert snpdata3 == snpdata4
Example #9
    def run(self, distributable):
        if self.temp_dir is not None:
            tempdir = self.temp_dir
        else:
            tempdir = os.path.join(self.run_dir, distributable.tempdirectory)
        tempdir = os.path.realpath(tempdir)

        with patch.dict('os.environ',
                        {'MKL_NUM_THREADS': str(self.mkl_num_threads)}
                        if self.mkl_num_threads is not None else {}) as _:
            if self.taskindex != self.taskcount:
                _JustCheckExists().input(distributable)
                return _run_one_task(distributable,
                                     self.taskindex,
                                     self.taskcount,
                                     tempdir,
                                     weights=self.weights,
                                     environ=self.environ)
            else:
                result = _run_one_task(distributable,
                                       self.taskindex,
                                       self.taskcount,
                                       tempdir,
                                       weights=self.weights,
                                       environ=self.environ)
                if self.result_file is not None:
                    create_directory_if_necessary(self.result_file)
                    with open(self.result_file, mode='wb') as f:
                        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

                return result
Example #10
    def save_hashdown(self, filename):
        """
        Save a Hashdown object to a json file.

        :param filename: name of file to save to.
        :type filename: string

        >>> from pysnptools.util.filecache import Hashdown
        >>> file_to_hash= {'pysnptools/examples/toydata.5chrom.bed': '766f55aa716bc7bc97cad4de41a50ec3',
        ...                'pysnptools/examples/toydata.5chrom.bim': '6a07f96e521f9a86df7bfd7814eebcd6',
        ...                'pysnptools/examples/toydata.5chrom.fam': 'f4eb01f67e0738d4865fad2014af8537'}
        >>> hashdown1 = Hashdown('https://github.com/fastlmm/PySnpTools/raw/cf248cbf762516540470d693532590a77c76fba2',
        ...                      file_to_hash=file_to_hash)
        >>> hashdown1.save_hashdown('tempdir/demo.hashdown.json')
        >>> hashdown2 = Hashdown.load_hashdown('tempdir/demo.hashdown.json')
        >>> hashdown2.file_exists('pysnptools/examples/toydata.5chrom.bed')
        True
        >>> hashdown2.load('pysnptools/examples/toydata.5chrom.fam').split('\\n')[0]
        'per0 per0 0 0 2 0.408848'

        """
        pstutil.create_directory_if_necessary(filename)
        dict0 = dict(self.__dict__)
        del dict0["directory"]
        del dict0["_relative_directory"]
        del dict0["allow_unknown_files"]
        del dict0["trust_local_files"]
        with open(filename, "w") as json_file:
            json.dump(dict0, json_file)
Example #11
    def test1(self):
        logging.info("in TestPstMemMap test1")
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        filename2 = "tempdir/tiny.pst.memmap"
        pstutil.create_directory_if_necessary(filename2)
        pstreader2 = PstMemMap.empty(row=['a','b','c'],col=['y','z'],filename=filename2,row_property=['A','B','C'],order="F",dtype=np.float64)
        assert isinstance(pstreader2.val,np.memmap)
        pstreader2.val[:,:] = [[1,2],[3,4],[np.nan,6]]
        assert np.array_equal(pstreader2[[0],[0]].read(view_ok=True).val,np.array([[1.]]))
        pstreader2.flush()
        assert isinstance(pstreader2.val,np.memmap)
        assert np.array_equal(pstreader2[[0],[0]].read(view_ok=True).val,np.array([[1.]]))
        pstreader2.flush()

        pstreader3 = PstMemMap(filename2)
        assert np.array_equal(pstreader3[[0],[0]].read(view_ok=True).val,np.array([[1.]]))
        assert isinstance(pstreader3.val,np.memmap)

        pstreader = PstMemMap('../examples/tiny.pst.memmap')
        assert pstreader.row_count == 3
        assert pstreader.col_count == 2
        assert isinstance(pstreader.val,np.memmap)

        pstdata = pstreader.read(view_ok=True)
        assert isinstance(pstdata.val,np.memmap)
        os.chdir(old_dir)
Example #12
    def test1(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        filename2 = "tempdir/tiny.snp.memmap"
        pstutil.create_directory_if_necessary(filename2)
        snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']],
                                     sid=['snp334', 'snp349', 'snp921'],
                                     filename=filename2,
                                     order="F",
                                     dtype=np.float64)
        assert isinstance(snpreader2.val, np.memmap)
        snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()
        assert isinstance(snpreader2.val, np.memmap)
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()

        snpreader3 = SnpMemMap(filename2)
        assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        assert isinstance(snpreader3.val, np.memmap)

        logging.info("in TestSnpMemMap test1")
        snpreader = SnpMemMap('../examples/tiny.snp.memmap')
        assert snpreader.iid_count == 2
        assert snpreader.sid_count == 3
        assert isinstance(snpreader.val, np.memmap)

        snpdata = snpreader.read(view_ok=True)
        assert isinstance(snpdata.val, np.memmap)
        os.chdir(old_dir)
Example #13
    def test_kernel_one(self):
        logging.info("TestLmmTrain test_kernel_one")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        K0_train = SnpKernel(self.snpreader_whole[train_idx,:],standardizer=Unit())
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]
        assert np.array_equal(K0_train.iid,covariate_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        assert np.array_equal(K0_train.iid,pheno_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_kernel_one.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("kernel_one")
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one") #Expect same results as SNPs "one"
Example #14
 def setUpClass(self):
     from pysnptools.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))
     self.bedbase = os.path.join(self.pythonpath, 'tests/datasets/all_chr.maf0.001.N300')
     self.phen_fn = os.path.join(self.pythonpath, 'tests/datasets/phenSynthFrom22.23.N300.randcidorder.txt')
     self.cov_fn = os.path.join(self.pythonpath,  'tests/datasets/all_chr.maf0.001.covariates.N300.txt')
Example #15
    def test_c_reader_pheno(self):
        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        snpdata1.val[1,0] = np.NaN # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe",missing="")
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)


        dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe",missing="",vectorize=True)
        assert len(dict['vals'].shape)==1, "test 1-d array of values"
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        snpdata4 = Pheno(None,iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
        snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
Example #16
File: hpc.py Project: fastlmm/PySnpTools
    def run(self, distributable):
        # Check that the local machine has python path set
        localpythonpath = os.environ.get("PYTHONPATH")#!!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
        if localpythonpath is None: raise Exception("Expect local machine to have 'pythonpath' set")

        remotepythoninstall = self.check_remote_pythoninstall()

        remotewd, run_dir_abs, run_dir_rel, nodelocalwd = self.create_run_dir()
        pstutil.create_directory_if_necessary(os.path.join(remotewd, distributable.tempdirectory), isfile=False) #create temp directory now so that cluster tasks won't try to create it many times at once
        result_remote = os.path.join(run_dir_abs,"result.p")

        self.copy_python_settings(run_dir_abs)

        inputOutputCopier = HPCCopier(remotewd,skipinput=self.skipinputcopy) #Create the object that copies input and output files to where they are needed

        inputOutputCopier.input(distributable) # copy of the input files to where they are needed (i.e. the cluster)

        remotepythonpath = self.FindOrCreateRemotePythonPath(localpythonpath, run_dir_abs)

        batfilename_rel = self.create_bat_file(distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, nodelocalwd, distributable)

        self.submit_to_cluster(batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel, nodelocalwd)

        inputOutputCopier.output(distributable) # copy the output file from where they were created (i.e. the cluster) to the local computer

        assert os.path.exists(result_remote), "The HPC job produced no result (and, thus, likely failed)"
        with open(result_remote, mode='rb') as f:
            result = pickle.load(f)

        #logging.info('Done: HPC runner is running a distributable. Returns {0}'.format(result))
        return result
Example #17
 def cmktest_writes(self):
     #===================================
     #    Defining sub functions
     #===================================
     def _oned_int(c):
         return range(c)
     def _oned_str(c):
         return [str(i).encode('ascii') for i in range(c)]
     def _twooned_int(c):
         return [[i] for i in range(c)]
     def _twooned_str(c):
         return [[str(i).encode('ascii')] for i in range(c)]
     def _twotwod_int(c):
         return [[i,i] for i in range(c)]
     def _twotwod_str(c):
         return [[str(i).encode('ascii'),b"hello"] for i in range(c)]
     def _none(c):
         return None
     def _zero(c):
         return np.empty([c,0])
     #===================================
     #    Starting main function
     #===================================
     logging.info("starting 'test_writes'")
     np.random.seed(0)
     output_template = "tempdir/pstreader/writes.{0}.{1}"
     create_directory_if_necessary(output_template.format(0,"npz"))
     i = 0
     for row_count in [5,2,1,0]:
         for col_count in [4,2,1,0]:
             val = np.random.normal(.5,2,size=(row_count,col_count))
             for row_or_col_gen in [_oned_int,_oned_str,_twooned_int,_twooned_str,_twotwod_int,_twotwod_str]:
                 row = row_or_col_gen(row_count)
                 col = row_or_col_gen(col_count)
                 for prop_gen in [_oned_int,_oned_str,_twooned_int,_twooned_str,_twotwod_int,_twotwod_str,_none,_zero]:
                     row_prop = prop_gen(row_count)
                     col_prop = prop_gen(col_count)
                     pstdata = PstData(row,col,val,row_prop,col_prop,str(i))
                     for the_class,suffix in [(PstNpz,"npz"),(PstHdf5,"hdf5")]:
                         filename = output_template.format(i,suffix)
                         logging.info(filename)
                         i += 1
                         the_class.write(filename,pstdata)
                         for subsetter in [None, sp.s_[::2,::3]]:
                             reader = the_class(filename)
                             _fortesting_JustCheckExists().input(reader)
                             subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
                             readdata = subreader.read(order='C')
                             expected = pstdata if subsetter is None else pstdata[subsetter[0],subsetter[1]].read()
                             assert np.array_equal(readdata.val,expected.val)
                             assert np.array_equal(readdata.row,expected.row)
                             assert np.array_equal(readdata.col,expected.col)
                             assert np.array_equal(readdata.row_property,expected.row_property)
                             assert np.array_equal(readdata.col_property,expected.col_property)
                         try:
                             os.remove(filename)
                         except:
                             pass
     logging.info("done with 'test_writes'")
Example #18
 def test_writes(self):
     #===================================
     #    Defining sub functions
     #===================================
     def _oned_int(c):
         return list(range(c))
     def _oned_str(c):
         return [str(i) for i in range(c)]
     def _twooned_int(c):
         return [[i] for i in range(c)]
     def _twooned_str(c):
         return [[str(i)] for i in range(c)]
     def _twotwod_int(c):
         return [[i,i] for i in range(c)]
     def _twotwod_str(c):
         return [[str(i),"hello"] for i in range(c)]
     def _none(c):
         return None
     def _zero(c):
         return np.empty([c,0])
     #===================================
     #    Starting main function
     #===================================
     logging.info("starting 'test_writes'")
     np.random.seed(0)
     output_template = "tempdir/pstreader/writes.{0}.{1}"
     create_directory_if_necessary(output_template.format(0,"npz"))
     i = 0
     for row_count in [5,2,1,0]:
         for col_count in [4,2,1,0]:
             val = np.random.normal(.5,2,size=(row_count,col_count))
             for row_or_col_gen in [_oned_int,_oned_str,_twooned_int,_twooned_str,_twotwod_int,_twotwod_str]:
                 row = row_or_col_gen(row_count)
                 col = row_or_col_gen(col_count)
                 for prop_gen in [_oned_int,_oned_str,_twooned_int,_twooned_str,_twotwod_int,_twotwod_str,_none,_zero]:
                     row_prop = prop_gen(row_count)
                     col_prop = prop_gen(col_count)
                     pstdata = PstData(row,col,val,row_prop,col_prop,str(i))
                     for the_class,suffix in [(PstNpz,"npz"),(PstHdf5,"hdf5")]:
                         filename = output_template.format(i,suffix)
                         logging.info(filename)
                         i += 1
                         the_class.write(filename,pstdata)
                         for subsetter in [None, sp.s_[::2,::3]]:
                             reader = the_class(filename)
                             _fortesting_JustCheckExists().input(reader)
                             subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
                             readdata = subreader.read(order='C')
                             expected = pstdata if subsetter is None else pstdata[subsetter[0],subsetter[1]].read()
                             assert np.array_equal(readdata.val,expected.val)
                             assert np.array_equal(readdata.row,expected.row)
                             assert np.array_equal(readdata.col,expected.col)
                             assert np.array_equal(readdata.row_property,expected.row_property)
                             assert np.array_equal(readdata.col_property,expected.col_property)
                         try:
                             os.remove(filename)
                         except:
                             pass
     logging.info("done with 'test_writes'")
Example #19
 def reduce(self, result_sequence):
     '''
     '''
     for i, pcs in result_sequence:
         out_fn = self.create_out_fn(self.cache_prefix, i)
         pstutil.create_directory_if_necessary(out_fn)
         save(out_fn, pcs)
     return None
Example #20
    def setUpClass(self):
        from pysnptools.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))

        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False)
        self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")
Example #21
    def test3(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        filename = "tempdir/x.pst.memmap"
        pstutil.create_directory_if_necessary(filename)

        a = PstMemMap.empty(row=['a','b','c'],col=['y','z'],filename=filename,row_property=['A','B','C'],order="F",dtype=np.float64)
        pstdata = a.read(order='C',view_ok=True)
        os.chdir(old_dir)
Example #22
    def test_read1(self):

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        file_from = "../examples/example.bgen"
        file_to = "temp/example.bgen"
        pstutil.create_directory_if_necessary(file_to)
        if os.path.exists(file_to + ".metadata"):
            os.remove(file_to + ".metadata")
        meta = open_bgen._metadata_path_from_filename(file_to,
                                                      samples_filepath=None)
        if os.path.exists(meta):
            os.remove(meta)
        shutil.copy(file_from, file_to)

        for loop_index in range(2):
            bgen = Bgen(file_to)
            assert np.array_equal(bgen.iid[0], ["0", "sample_001"])
            assert bgen.sid[0] == "SNPID_2,RSID_2"

            # Use the bgen_sample_id for both parts of iid
            def iid_dup(bgen_sample_id):
                return (bgen_sample_id, bgen_sample_id)

            iid_function = iid_dup
            bgen = Bgen(file_to, iid_function=iid_function, sid_function="id")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "SNPID_2"

            bgen = Bgen(file_to,
                        iid_function=iid_function,
                        sid_function="rsid")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "RSID_2"

            sid_function = lambda id, rsid: "{0},{1}".format(id, rsid)
            bgen = Bgen(file_to, iid_function, sid_function=sid_function)
            assert bgen.sid[0] == "SNPID_2,RSID_2"

        metafile = bgen._open_bgen._metadata_path_from_filename(
            file_to, samples_filepath=None)
        del bgen
        os.remove(metafile)
        sid_function = lambda id, rsid: "{0},{1}".format(id, rsid)
        bgen = Bgen(file_to, iid_function, sid_function=sid_function)
        assert bgen.sid[0] == "SNPID_2,RSID_2"

        metafile = bgen._open_bgen._metadata_path_from_filename(
            file_to, samples_filepath=None)
        del bgen
        os.remove(metafile)
        bgen = Bgen(file_to, iid_function, sid_function="rsid")
        assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
        assert bgen.sid[0] == "RSID_2"

        os.chdir(old_dir)
Example #23
 def test_c_reader_dense(self):
     snpdata1 = self.snpdata[:,::100].read()
     snpdata1.val[1,2] = np.NaN # Inject a missing value to test writing and reading missing values
     output = "tempdir/snpreader/toydata.dense.txt"
     create_directory_if_necessary(output)
     Dense.write(output, snpdata1)
     snpreader = Dense(output)
     _fortesting_JustCheckExists().input(snpreader)
     snpdata2 = snpreader.read()
     np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)
Example #24
 def create_distributablep(self, distributable, run_dir_abs, run_dir_rel):
     logging.info('Hadoop runner is pickling distributable')
     distributablep_filename_rel = os.path.join(run_dir_rel,
                                                "distributable.p")
     #distributablep_filename_abs = os.path.join(run_dir_abs, "distributable.p")
     pstutil.create_directory_if_necessary(distributablep_filename_rel)
     with open(distributablep_filename_rel, mode='wb') as f:
         pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)
     logging.info('Done: Hadoop runner is pickling distributable')
     return distributablep_filename_rel
Example #25
    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().standardize().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory,
                                           "cache_file.npz")
            if os.path.exists(
                    self.cache_file
            ):  # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(
                self.cache_file))
            pstutil.create_directory_if_necessary(self.cache_file)
            np.savez(
                self.cache_file, lmm.U, lmm.S
            )  #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so will need to normalized G so that its K's diagonal would sum to iid_count

            # As per the paper, we optimized delta with REML=True, but
            # we will later optimize beta and find log likelihood with ML (REML=False)
            result = lmm.find_log_delta(
                REML=True,
                sid_count=self.G0.sid_count,
                min_log_delta=self.min_log_delta,
                max_log_delta=self.max_log_delta
            )  #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(
            self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))
Example #26
 def test_write_x_x_cpp(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata")
     for order in ['C','F']:
         for dtype in [np.float32,np.float64]:
             snpdata = snpreader.read(order=order,dtype=dtype)
             snpdata.val[-1,0] = float("NAN")
             output = "tempdir/toydata.{0}{1}.cpp".format(order,"32" if dtype==np.float32 else "64")
             create_directory_if_necessary(output)
             Bed.write(snpdata, output)
             snpdata2 = Bed(output).read()
             assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
Example #27
 def test_write_x_x_cpp(self):
     distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
     for order in ['C','F']:
         for dtype in [np.float32,np.float64]:
             distdata = distreader.read(order=order,dtype=dtype)
             distdata.val[-1,0] = float("NAN")
             output = "tempdir/toydata.{0}{1}.cpp.dist.npz".format(order,"32" if dtype==np.float32 else "64")
             create_directory_if_necessary(output)
             DistNpz.write(output, distdata)
             snpdata2 = DistNpz(output).read()
             np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)
Example #28
 def test_write_x_x_cpp(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata")
     for order in ['C','F']:
         for dtype in [np.float32,np.float64]:
             snpdata = snpreader.read(order=order,dtype=dtype)
             snpdata.val[-1,0] = float("NAN")
             output = "tempdir/toydata.{0}{1}.cpp".format(order,"32" if dtype==np.float32 else "64")
             create_directory_if_necessary(output)
             Bed.write(output, snpdata)
             snpdata2 = Bed(output).read()
             np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
Example #29
    def _create_directory(local):
        import pysnptools.util as pstutil  #put here to avoid recursive nesting

        if os.path.exists(local):
            if os.path.isfile(local):
                os.remove(local)
            else:
                shutil.rmtree(local)
        directory_name = os.path.dirname(local)
        if os.path.exists(directory_name) and os.path.isfile(directory_name):
            os.remove(directory_name)
        pstutil.create_directory_if_necessary(local, isfile=True)
Example #30
 def test_npz(self):
     logging.info("in test_npz")
     snpreader = Bed(self.currentFolder + "/../examples/toydata",count_A1=False)
     kerneldata1 = snpreader.read_kernel(standardizer=stdizer.Unit())
     s = str(kerneldata1)
     output = "tempdir/kernelreader/toydata.kernel.npz"
     create_directory_if_necessary(output)
     KernelNpz.write(output,kerneldata1)
     kernelreader2 = KernelNpz(output)
     kerneldata2 = kernelreader2.read()
     np.testing.assert_array_almost_equal(kerneldata1.val, kerneldata2.val, decimal=10)
     logging.info("done with test")
Example #31
    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame
Example #32
 def test_npz(self):
     logging.info("in test_npz")
     distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
     snpdata1 = distreader.as_snp(max_weight=1.0).read()
     s = str(snpdata1)
     output = "tempdir/distreader/toydata.snp.npz"
     create_directory_if_necessary(output)
     SnpNpz.write(output,snpdata1)
     snpreader2 = SnpNpz(output)
     snpdata2 = snpreader2.read()
     np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)
     logging.info("done with test")
Example #34
File: test.py Project: hyacz/PySnpTools
    def test_c_reader_pheno(self):
        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        snpdata1.val[
            1,
            0] = np.NaN  # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata2.val,
                                             decimal=10)

        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        dict = pstpheno.loadOnePhen(self.currentFolder +
                                    "/examples/toydata.phe",
                                    missing="")
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        dict = pstpheno.loadOnePhen(self.currentFolder +
                                    "/examples/toydata.phe",
                                    missing="",
                                    vectorize=True)
        assert len(dict['vals'].shape) == 1, "test 1-d array of values"
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder +
                         "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata5.val,
                                             decimal=10)
        snpdata6 = Pheno(self.currentFolder +
                         "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata6.val,
                                             decimal=10)
Example #35
File: hpc.py Project: fastlmm/PySnpTools
 def FindOrCreateRemotePythonPath(self, localpythonpath, run_dir_abs):
     if self.remote_python_parent is None:
         remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)
     else:
         pstutil.create_directory_if_necessary(self.remote_python_parent,isfile=False)
         path_list = []
         for rel in os.listdir(self.remote_python_parent):
             path_list.append(os.path.join(self.remote_python_parent,rel))
         remotepythonpath = ";".join(path_list)
         if self.update_remote_python_parent:
             remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)
     
     return remotepythonpath
Example #36
 def setUpClass(self):
     from pysnptools.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(
         os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                      "..", ".."))
     self.bedbase = os.path.join(
         self.pythonpath,
         'fastlmm/feature_selection/examples/toydata.5chrom')
     self.phen_fn = os.path.join(
         self.pythonpath, 'fastlmm/feature_selection/examples/toydata.phe')
     self.cov_fn = os.path.join(
         self.pythonpath, 'fastlmm/feature_selection/examples/toydata.cov')
Example #37
    def test1(self):
        from pysnptools.snpreader import Bed, SnpMemMap
        from pysnptools.util import example_file  # Download and return local file name

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        filename2 = "tempdir/tiny.snp.memmap"
        pstutil.create_directory_if_necessary(filename2)
        snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']],
                                     sid=['snp334', 'snp349', 'snp921'],
                                     filename=filename2,
                                     order="F",
                                     dtype=np.float64)
        assert isinstance(snpreader2.val, np.memmap)
        snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()
        assert isinstance(snpreader2.val, np.memmap)
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()

        snpreader3 = SnpMemMap(filename2)
        assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        assert isinstance(snpreader3.val, np.memmap)

        logging.info("in TestSnpMemMap test1")
        snpreader = SnpMemMap('tempdir/tiny.snp.memmap')
        assert snpreader.iid_count == 2
        assert snpreader.sid_count == 3
        assert isinstance(snpreader.val, np.memmap)

        snpdata = snpreader.read(view_ok=True)
        assert isinstance(snpdata.val, np.memmap)

        bed_file = example_file("pysnptools/examples/toydata.5chrom.*",
                                "*.bed")
        bed = Bed(bed_file)
        pstutil.create_directory_if_necessary(
            "tempdir/toydata.5chrom.snp.memmap"
        )  #LATER should we just promise to create directories?
        SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap",
                        bed)  # Write bed in SnpMemMap format
        SnpMemMap.write(
            "tempdir/toydata.5chromsnpdata.snp.memmap",
            bed[:, ::2].read())  # Write snpdata in SnpMemMap format

        os.chdir(old_dir)
Example #38
 def setUpClass(self):
     from pysnptools.util import create_directory_if_necessary
     import fastlmm as fastlmm
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(
         os.path.join(os.path.dirname(os.path.realpath(fastlmm.__file__)),
                      ".."))
     self.bed = Bed(os.path.join(self.pythonpath,
                                 'tests/datasets/synth/all.bed'),
                    count_A1=True)[:, ::10]
     self.phen_fn = os.path.join(
         self.pythonpath, 'tests/datasets/synth/pheno_10_causals.txt')
     self.cov_fn = os.path.join(self.pythonpath,
                                'tests/datasets/synth/cov.txt')
Example #39
File: hpc.py Project: fastlmm/PySnpTools
 def output(self,item):
     if isinstance(item, str):
         itemnorm = os.path.normpath(item)
         pstutil.create_directory_if_necessary(itemnorm)
         remote_file_name = os.path.join(self.remotewd,itemnorm)
         local_dir_name,ignore = os.path.split(itemnorm)
         assert os.path.exists(remote_file_name), "Don't see expected file '{0}'. Did the HPC job fail?".format(remote_file_name)
         #xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local dir instead of the local file so that xcopy won't ask 'file or dir?'
         xcopycommand = "xcopy /d /c /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local 
         logging.info(xcopycommand)
         rc = os.system(xcopycommand)
         if rc!=0: logging.info("xcopy cmd failed with return value={0}, from cmd {1}".format(rc,xcopycommand))
     elif hasattr(item,"copyoutputs"):
         item.copyoutputs(self)
Example #40
 def too_slow_test_write_bedbig(self):
     iid_count = 100000
     sid_count = 50000
     from pysnptools.snpreader.snpdata import SnpData #!!! promote one level up in namespace
     iid = np.array([[str(i),str(i)] for i in xrange(iid_count)])
     sid = np.array(["sid_{0}".format(i) for i in xrange(sid_count)])
     pos = np.array([[i,i,i] for i in xrange(sid_count)])
     np.random.seed(0)
     snpdata = SnpData(iid,sid,pos,np.zeros((iid_count,sid_count))) #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
     create_directory_if_necessary(output)
     Bed.write(snpdata, output)
     snpdata2 = Bed(output).read()
     assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
Example #41
 def too_slow_test_write_bedbig(self):
     iid_count = 100000
     sid_count = 50000
     from pysnptools.snpreader import SnpData
     iid = np.array([[str(i),str(i)] for i in range(iid_count)])
     sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
     pos = np.array([[i,i,i] for i in range(sid_count)])
     np.random.seed(0)
     snpdata = SnpData(iid,sid,np.zeros((iid_count,sid_count)),pos=pos) #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
     create_directory_if_necessary(output)
     Bed.write(output, snpdata, count_A1=False)
     snpdata2 = Bed(output,count_A1=False).read()
     np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
Example #42
 def test_write_bed_f64cpp_5_python(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
     iid_index = 5
     logging.info("iid={0}".format(iid_index))
     #if snpreader.iid_count % 4 == 0: # divisible by 4 isn't a good test
     #    snpreader = snpreader[0:-1,:]
     #assert snpreader.iid_count % 4 != 0
     snpdata = snpreader[0:iid_index,:].read(order='F',dtype=np.float64)
     if snpdata.iid_count > 0:
         snpdata.val[-1,0] = float("NAN")
     output = "tempdir/toydata.F64python.{0}".format(iid_index)
     create_directory_if_necessary(output)
     Bed.write(output,snpdata, force_python_only=True)
     snpdata2 = Bed(output,count_A1=False).read()
     np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
Example #43
 def test_write_bed_f64cpp_5_python(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata")
     iid_index = 5
     logging.info("iid={0}".format(iid_index))
     #if snpreader.iid_count % 4 == 0: # divisible by 4 isn't a good test
     #    snpreader = snpreader[0:-1,:]
     #assert snpreader.iid_count % 4 != 0
     snpdata = snpreader[0:iid_index,:].read(order='F',dtype=np.float64)
     if snpdata.iid_count > 0:
         snpdata.val[-1,0] = float("NAN")
     output = "tempdir/toydata.F64python.{0}".format(iid_index)
     create_directory_if_necessary(output)
     Bed.write(snpdata, output,force_python_only=True)
     snpdata2 = Bed(output).read()
     assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
Example #44
 def test_npz(self):
     logging.info("in test_npz")
     snpreader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",
                     count_A1=False)
     kerneldata1 = snpreader.read_kernel(standardizer=stdizer.Unit())
     s = str(kerneldata1)
     output = "tempdir/kernelreader/toydata.kernel.npz"
     create_directory_if_necessary(output)
     KernelNpz.write(output, kerneldata1)
     kernelreader2 = KernelNpz(output)
     kerneldata2 = kernelreader2.read()
     np.testing.assert_array_almost_equal(kerneldata1.val,
                                          kerneldata2.val,
                                          decimal=10)
     logging.info("done with test")
Example #45
 def test_write_distnpz_f64cpp_0(self):
     distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
     iid_index = 0
     logging.info("iid={0}".format(iid_index))
     #if distreader.iid_count % 4 == 0: # divisible by 4 isn't a good test
     #    distreader = distreader[0:-1,:]
     #assert distreader.iid_count % 4 != 0
     distdata = distreader[0:iid_index,:].read(order='F',dtype=np.float64)
     if distdata.iid_count > 0:
         distdata.val[-1,0] = float("NAN")
     output = "tempdir/toydata.F64cpp.{0}.dist.npz".format(iid_index)
     create_directory_if_necessary(output)
     DistNpz.write(output, distdata )
     snpdata2 = DistNpz(output).read()
     np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)
Example #46
    def test_lr(self):
        import matplotlib.pyplot as plt
        import pylab


        logging.info("TestLmmTrain test_lr")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        covariate_train3.val = np.array([[float(num)] for num in range(covariate_train3.iid_count)])
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        np.random.seed(0)
        pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1)

        ##Plot training x and y
        #pylab.plot(covariate_train3.val, pheno_train3.val,".")
        #pylab.show()

        for force_full_rank,force_low_rank in [(True,False),(False,True)]:
            #Learn model, save, load
            fastlmm3x = FastLMM(force_full_rank=force_full_rank,force_low_rank=force_low_rank,GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
            filename = self.tempout_dir + "/model_lr.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(fastlmm3x, filename) 
            fastlmm3 = joblib.load(filename)


            #Predict with model (test on train)
            predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train
            output_file = self.file_name("lr")
            Dat.write(output_file,predicted_pheno)

            ## Plot training x and y, and training x with predicted y
            #do_plot = True 
            #if do_plot:
            #    pylab.plot(covariate_train3.val, pheno_train3.val,covariate_train3.val,predicted_pheno.val,".")
            #    pylab.show()

            #    # Plot y and predicted y (test on train)
            #    pheno_actual = pheno_train3.val[:,0]
            #    pylab.plot(pheno_actual,predicted_pheno.val,".")
            #    pylab.show()


            self.compare_files(predicted_pheno,"lr")
Example #47
    def test_c_reader_ped(self):
        if False: #Too slow for routine testing
            snpdata1 = Ped(self.currentFolder + "/examples/toydata.ped")[::25,::1000].read()
            self.assertEqual(np.float64, snpdata1.val.dtype)
            TestPySnpTools.assert_match_012_210(self.snpdata[::25,::1000].read(),snpdata1)
        else:
            snpdata1 = self.snpdata[::25,::1000].read()

        output = "tempdir/snpreader/toydata.ped"
        create_directory_if_necessary(output)

        snpdata1.val[1,2] = np.NaN # Inject a missing value to test writing and reading missing values
        Ped.write(output, snpdata1)
        snpreader = Ped(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)
        snpdata2 = snpreader.read()
        TestPySnpTools.assert_match_012_210(snpdata1,snpdata2)
Example #48
    def cmktest_big_npz(self):
        logging.info("in test_big_npz")
        n = 1000
        pstdata = PstData(row=range(n-1),col=range(n+1),val=np.zeros([n-1,n+1]))
        output = "tempdir/pstreader/big.npz"
        create_directory_if_necessary(output)
        PstNpz.write(output,pstdata)
        pstnpz = PstNpz(output)
        pstdata1 = pstnpz[::2,::4].read()
        pstdata2 = pstnpz.read(order='A')
        assert pstdata2.val.flags['C_CONTIGUOUS']

        pstdata = PstData(row=range(n-1),col=range(n+1),val=np.zeros([n-1,n+1],order='F'))
        PstNpz.write(output,pstdata)
        pstnpz = PstNpz(output)
        pstdata2 = pstnpz.read(order='A')
        assert pstdata2.val.flags['F_CONTIGUOUS']

        print("done")
Example #49
    def test_c_reader_dat(self):
        snpreader = Dat(self.currentFolder + "/examples/toydata.dat")[:,::100]
        _fortesting_JustCheckExists().input(snpreader)

        snpdata1 = snpreader.read()
        self.assertEqual(np.float64, snpdata1.val.dtype)
        self.assertTrue(np.allclose(self.snps[:,::100], snpdata1.val, rtol=1e-05, atol=1e-05))

        snpdata1.val[1,2] = np.NaN # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.dat"
        create_directory_if_necessary(output)
        Dat.write(output,snpdata1)
        snpdata2 = Dat(output).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        snpdata3 = snpdata1[:,0:0].read() #create snpdata with no sids
        output = "tempdir/snpreader/toydata3.dat"
        Dat.write(output,snpdata3)
        snpdata4 = Dat(output).read()
        assert snpdata3 == snpdata4
    def test_kernel(self):
        logging.info("TestLmmTrain test_kernel")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # Show it using the snps
        K0_train = self.snpreader_whole[train_idx,:].read_kernel(Unit())
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        pheno_train3.val = self.snpreader_whole[train_idx,0:1].read().val*2
        assert np.array_equal(K0_train.iid,covariate_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        assert np.array_equal(K0_train.iid,pheno_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
        #pylab.show()

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_snps.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename) 
        fastlmm3 = joblib.load(filename)


        #Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=K0_train, X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("kernel")
        Dat.write(output_file,predicted_pheno)

        #### Plot training x and y, and training x with predicted y
        #pylab.plot(self.snpreader_whole[train_idx,0:1].read().val[:,0], pheno_train3.val,".",self.snpreader_whole[train_idx,0:1].read().val[:,0],predicted_pheno.val,".")
        #pylab.show()

        #### Plot y and predicted y (test on train)
        #pheno_actual = pheno_train3.val[:,0]
        #pylab.plot(pheno_actual,predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"snps") #"kernel" and "snps" test cases should give the same results
    def test_snps(self):
        logging.info("TestLmmTrain test_snps")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # Show it using the snps
        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        pheno_train3.val = G0_train[:,0:1].read().val*2

        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
        #pylab.show()

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_snps.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename) 
        fastlmm3 = joblib.load(filename)


        #Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("snps")
        Dat.write(output_file,predicted_pheno)

        ### Plot training x and y, and training x with predicted y
        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val,".",G0_train[:,0:1].read().val[:,0],predicted_pheno.val,".")
        #pylab.show()

        ### Plot y and predicted y (test on train)
        #pheno_actual = pheno_train3.val[:,0]
        #pylab.plot(pheno_actual,predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"snps")
    def test_str2(self):
        logging.info("TestLmmTrain test_str2")


        #Standardize train and test together
        whole_kernel = self.snpreader_whole.read_kernel(Unit())

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        K0_train_filename = self.tempout_dir + "/model_str2.kernel.npz"
        pstutil.create_directory_if_necessary(K0_train_filename)
        from pysnptools.kernelreader import KernelNpz
        KernelNpz.write(K0_train_filename,whole_kernel[train_idx].read(order='A',view_ok=True))

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train_filename, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_str2.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=whole_kernel[:,test_idx].read(order='A',view_ok=True), X=covariate_test,count_A1=False)

        output_file = self.file_name("str2")
        Dat.write(output_file,predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]
        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"str2")
    def test_lmm(self):
        do_plot = False
        iid_count = 500
        seed = 0


        import pylab
        logging.info("TestLmmTrain test_lmm")

        iid = [["cid{0}P{1}".format(iid_index,iid_index//250)]*2 for iid_index in xrange(iid_count)]
        train_idx = np.r_[10:iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids


        #Every person is 100% related to everyone in one of 5 families
        K0a = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index0 in xrange(iid_count):
            for iid_index1 in xrange(iid_count):
                K0a.val[iid_index0,iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
                if iid_index1 < iid_index0:
                    assert K0a.val[iid_index0,iid_index1] == K0a.val[iid_index1,iid_index0]

        #every person lives on a line from 0 to 1
        # They are related to every other person as a function of distance on the line
        np.random.seed(seed)
        home = np.random.random([iid_count])
        K0b = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index in xrange(iid_count):
            K0b.val[iid_index,:] = 1 - np.abs(home-home[iid_index])**.1

        #make covar just numbers 0,1,...
        covar = SnpData(iid=iid,sid=["x"],val=np.array([[float(num)] for num in xrange(iid_count)]))
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()

        for name, h2, K0 in [("clones", 1, K0a),("line_world",.75,K0b)]:

            sigma2x = 100
            varg = sigma2x * h2
            vare = sigma2x * (1-h2)

            #######################################################################
            #make pheno  # pheno = 2*covar + 100 + normal(0,1)*sqrt(vare) + normal(0,K)*sqrt(varg)
            #######################################################################
            #np.random.multivariate_normal is sensitive to the number of MKL threads, so we temporarily pin MKL_NUM_THREADS to 1 for reproducibility.
            if 'MKL_NUM_THREADS' in os.environ:
                mkl_num_thread = os.environ['MKL_NUM_THREADS']
            else:
                mkl_num_thread = None
            os.environ['MKL_NUM_THREADS'] = '1'
            np.random.seed(seed)
            p1 = covar.val * 2.0 + 100
            p2 = np.random.normal(size=covar.val.shape)*np.sqrt(vare)
            p3 = (np.random.multivariate_normal(np.zeros(iid_count),K0.val)*np.sqrt(varg)).reshape(-1,1)
            if mkl_num_thread is not None:
                os.environ['MKL_NUM_THREADS'] = mkl_num_thread
            else:
                del os.environ['MKL_NUM_THREADS']
            pheno = SnpData(iid=iid,sid=["pheno0"],val= p1 + p2 + p3)

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle(name + ": Plot training x and y, testing x and y")
                pylab.show()

            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
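            # The appended column of ones is the intercept term for ordinary least squares.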
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
                nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term
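            # Under ML (REML=False), substituting sigma2 = r2/N into the residual term 0.5/sigma2*r2 yields the constant N*0.5 used above.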

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name + ": real linear regression: actual to prediction")
                pylab.show()

            for factor in [1,100,.02]:
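                # Predictions should be invariant to an overall rescaling of the kernel (the
                # fitted h2/delta absorb the scale), so every factor must reproduce the same
                # reference files below.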
                K0 = K0.read()
                K0.val *= factor

                K0_train = K0[train_idx]
                K0_whole_test = K0[:,test_idx]

                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                v2 = np.var(p2)
                v3 = np.var(p3)
                logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3/(v2+v3), fastlmmx.h2raw))
                
                
                filename = self.tempout_dir + "/model_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                    output_file = self.file_name("lmma_"+name)
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar_pheno.row,sid=covar_pheno.col[:,1],val=covar_pheno.val) #kludge to write kernel to text format
                    output_file = self.file_name("lmma.cov_"+name)
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar_pheno.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lmma_"+name)
                    self.compare_files(covar2,"lmma.cov_"+name)

                    predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:,0], X=covariate_train[0,:],count_A1=False) #test on train #0
                    assert np.abs(predicted_pheno0.val[0,0] - predicted_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                    assert np.abs(covar_pheno0.val[0,0] - covar_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"


                #Predict with model (test on test)
                predicted_phenoB, covar_phenoB  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
                output_file = self.file_name("lmmb_"+name)
                Dat.write(output_file,predicted_phenoB)
                covar2 = SnpData(iid=covar_phenoB.row,sid=covar_phenoB.col[:,1],val=covar_phenoB.val) #kludge to write kernel to text format
                output_file = self.file_name("lmmb.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar_phenoB.val))
                predicted = predicted_phenoB.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_phenoB,"lmmb_"+name)
                self.compare_files(covar2,"lmmb.cov_"+name)

                predicted_phenoB0, covar_phenoB0  = fastlmm.predict(K0_whole_test=K0_whole_test[:,0], X=covariate_test[0,:],count_A1=False) #test on a single test case
                assert np.abs(predicted_phenoB0.val[0,0] - predicted_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_phenoB0.val[0,0] - covar_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

                #Predict with model test on some train and some test
                some_idx = range(covar.iid_count)
                some_idx.remove(train_idx[0])
                some_idx.remove(test_idx[0])
                covariate_some = covar[some_idx,:]
                K0_whole_some = K0[:,some_idx]
                predicted_phenoC, covar_phenoC  = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some,count_A1=False)
                for idxC, iidC in enumerate(predicted_phenoC.iid):
                    meanC = predicted_phenoC.val[idxC]
                    varC = covar_phenoC.val[idxC,idxC]
                    if iidC in predicted_pheno.iid:
                        predicted_pheno_ref = predicted_pheno
                        covar_pheno_ref = covar_pheno
                    else:
                        assert iidC in predicted_phenoB.iid
                        predicted_pheno_ref = predicted_phenoB
                        covar_pheno_ref = covar_phenoB
                    idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                    mean_ref = predicted_pheno_ref.val[idx_ref]
                    var_ref = covar_pheno_ref.val[idx_ref,idx_ref]
                    assert np.abs(meanC - mean_ref) < 1e-6
                    assert np.abs(varC - var_ref) < 1e-6
Example #54
def _internal_single(G0_standardized, test_snps, pheno,covar, G1_standardized,
                 mixing, #!!test mixing and G1
                 h2, log_delta,
                 cache_file):

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
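    # h2 and log_delta parameterize the same ratio: with delta = sigma_e^2/sigma_g^2,
    # h2 = sigma_g^2/(sigma_g^2+sigma_e^2) = 1/(1+delta) = 1/(1+exp(log_delta)).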
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.hstack((covar['vals'],np.ones((test_snps.iid_count, 1))))  #We always add 1's to the end.
    y =  pheno['vals']

    from pysnptools.standardizer import DiagKtoN

    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count,G0_standardized.sid_count+G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val,G1_standardized_val,mixing)
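            # _mix fills G with the two standardized SNP blocks, scaled so that G.dot(G.T)
            # blends the K0 and K1 kernels according to 'mixing'.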
        
        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)


    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write


    beta = res['beta']
        
    chi2stats = beta*beta/res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-3)[:,0] #lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)


    items = [
        ('SNP', snps_read.sid),
        ('Chr', snps_read.pos[:,0]), 
        ('GenDist', snps_read.pos[:,1]),
        ('ChrPos', snps_read.pos[:,2]), 
        ('PValue', p_values),
        ('SnpWeight', beta[:,0]),
        ('SnpWeightSE', np.sqrt(res['variance_beta'][:,0])),
        ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:,0])),
        ('Nullh2', np.zeros((snps_read.sid_count)) + h2)
    ]
    frame = pd.DataFrame.from_items(items)

    return frame
Example #55
def _internal_single(K0, test_snps, pheno, covar, K1,
                 mixing, h2, log_delta,
                 cache_file, force_full_rank, force_low_rank,
                 output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory

    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values 

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write
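            # The cache stores arr_0=lmm.U, arr_1=lmm.S, arr_2=[h2, mixing], matching the np.load branch above.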

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -=interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up)
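    # Note: -(a // -b) is ceiling division for non-negative integers, i.e. ceil(sid_count / block_size) without floating point.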

    # We define three closures, that is, functions defined inside a function so that the inner functions have access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1: logging.info("single_snp: Working on part {0} of {1}".format(work_index,work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        
        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-3)[:,0] #lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start,end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2] 
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(xrange(work_count),
                       mapper=mapper_closure,reducer=reducer_closure,
                       input_files=[test_snps],output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
def heritability_spatial_correction(G_kernel, spatial_coor, spatial_iid, alpha_list, alpha_power, pheno, 
                     map_function = map, cache_folder=None, 
                     jackknife_count=500, permute_plus_count=10000, permute_times_count=10000, seed=0,
                     just_testing=False,  always_remote=False, allow_gxe2 = True
                     ):
    """
    Function measuring heritability with correction for spatial location.

    :param G_kernel: A kernel that tells the genetic similarity between all pairs of individuals. The kernel can be given 
      explicitly, for example with a :class:`.KernelData`. The kernel can also be given implicitly by providing a set of
      SNPs or the name of a BED file.
    :type G_kernel: a :class:`.KernelReader`, :class:`.SnpReader` or a string

    :param spatial_coor: The position of each individual given by two coordinates. Any units are allowed, but the two values
       must be compatible so that distance can be determined via Pythagoras' theorem. (So, longitude and latitude should
       not be used unless the locations are near the Equator.) 
    :type spatial_coor: an iid_count x 2 array

    :param spatial_iid: An ndarray of the iids. Each iid is an ndarray of two strings (a family ID and a case ID) that identifies an individual.
    :type spatial_iid: array of strings with shape [iid_count,2]

    :param alpha_list: a list of numbers to search to find the best alpha, which is the similarity scale. The similarity of two individuals
      is here defined as exp(-(distance_between/alpha)**alpha_power). If the closest individuals are 100 units apart and the farthest
      individuals are 4e6 units apart, a reasonable alpha_list might be: [int(v) for v in np.logspace(np.log10(100),np.log10(1e10), 100)]
      The function reports the alphas chosen. If an extreme alpha is picked, change alpha_list to cover more range.
    :type alpha_list: list of numbers

    :param alpha_power: 2 (a good choice) means that similarity goes with area. 1 means with distance.
    :type alpha_power: number

    :param pheno: The target value(s) to predict. It can be a file name readable via :class:`SnpReader.Pheno` or any :class:`.SnpReader`.
    :type pheno: a :class:`.SnpReader` or string

    :param cache_folder: (default 'None') The name of a directory in which to save intermediate results. If 'None', then no intermediate results are saved.
    :type cache_folder: a string

    :param map_function: (default 'map') A function with the same inputs and functionality as Python's 'map' function.
       Can be used to run 'heritability_spatial_correction' on a cluster.
    :type map_function: a function

    :param jackknife_count: (default 500) The number of jackknife groups to use when calculating standard errors (SE). Changing to a small number, 2, 
       speeds up calculation at the cost of unusable SEs.
    :type jackknife_count: number

    :param permute_plus_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, 
       speeds up calculation at the cost of unusable P values.
    :type permute_plus_count: number

    :param permute_times_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, 
       speeds up calculation at the cost of unusable P values.
    :type permute_times_count: number

    :param seed: (default 0) The random seed used by jackknifing and permutation.
    :type seed: number

    :param just_testing: (default False) If true, skips actual LMM-related search and calculation.
    :type just_testing: bool

    :rtype: Pandas dataframe with one row per phenotype. Columns include "h2uncorr", "h2corr", etc.

    """

    ######################
    # Prepare the inputs
    ######################

    from fastlmm.inference.fastlmm_predictor import _kernel_fixup, _pheno_fixup
    G_kernel = _kernel_fixup(G_kernel, iid_if_none=None, standardizer=Unit())  # Create a kernel from an in-memory kernel, some snps, or a text file.
    pheno = _pheno_fixup(pheno,iid_if_none=G_kernel.iid, missing='NA') # Create phenotype data from in-memory data or a text file.

    if cache_folder is not None:
        pstutil.create_directory_if_necessary(cache_folder,isfile=False)

    
    jackknife_seed = seed or 1954692566L
    permute_plus_seed = seed or 2372373100L
    permute_times_seed = seed or 2574440128L

    ######################
    # Find 'alpha', the scale for distance
    ######################

    # create the alpha table (unless it is already there)
    alpha_table_fn = "{0}/alpha_table.{1}.txt".format(cache_folder,pheno.sid_count) # create a name for the alpha_table cache file
    if cache_folder is not None and os.path.exists(alpha_table_fn):
        alpha_table = pd.read_csv(alpha_table_fn, delimiter = '\t',index_col=False, comment=None)
    else:
        # create the list of arguments to run    
        arg_list = []   
        for phen_target in pheno.sid:
            pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target
            for alpha in alpha_list:
                            #pheno, G_kernel, spatial_coor, spatial_iid, alpha,     alpha_power,  (jackknife_index, jackknife_count, jackknife_seed),
                arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (-1,     0,     None),  
                             # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2,               a2
                               (-1,     0,     None),                                       (-1,     0,     None),                                          just_testing, False,     True and allow_gxe2,   None)
                arg_list.append(arg_tuple)

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list)
        return_list = [line for line in return_list if line is not None] #Remove 'None' results
        alpha_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(alpha_table,False,alpha_table_fn)

    # read the alpha table and find the best values
    grouped = alpha_table.groupby("phen")
    alpha_dict = {}
    for phen, phen_table in grouped:
        best_index_corr = phen_table['nLLcorr'].idxmin() # with Pandas, this returns the index in the parent table, not the group table
        best_index_gxe2 = phen_table['nLL_gxe2'].idxmin() if allow_gxe2 else 0
        alpha_corr = alpha_table.iloc[best_index_corr]['alpha']
        alpha_gxe2 = alpha_table.iloc[best_index_gxe2]['alpha']
        alpha_dict[phen] = alpha_corr, alpha_gxe2
    logging.info(alpha_dict)


    ######################
    # Use jackknifing to compute h2uncorr, SE, h2corr, SE, e2, SE, gxe2, SE
    ######################

    jackknife_count_actual = min(jackknife_count,G_kernel.iid_count)

    # Set up the run and do it (unless it has already been run)
    jackknife_table_fn = "{0}/jackknife.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual)
    if cache_folder is not None and os.path.exists(jackknife_table_fn):
        jackknife_table = pd.read_csv(jackknife_table_fn, delimiter = '\t',index_col=False, comment=None)
    else:
        arg_list = []
        for phen_target in pheno.sid:
            pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
            alpha_set = set([alpha_corr, alpha_gxe2]) #If these are the same, then only need to do half the work
            for alpha in alpha_set:
                logging.debug(alpha)
                do_uncorr = (alpha == alpha_corr)
                do_gxe2   = (alpha == alpha_gxe2) and allow_gxe2
                for jackknife in range(-1, jackknife_count_actual):
                               # pheno, G_kernel, spatial_coor, spatial_iid, alpha,     alpha_power, (jackknife_index, jackknife_count,         jackknife_seed),
                    arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife,       jackknife_count_actual,  jackknife_seed),
                                    # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                                    (-1,0,None),                                                 (-1,0,None),                                                    just_testing, do_uncorr, do_gxe2, None)
                    arg_list.append(arg_tuple)    

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list)
        return_list = [line for line in return_list if line is not None] #Remove 'None' results
        jackknife_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(jackknife_table, False, jackknife_table_fn)


    # get the real (that is, unjackknifed) values    
    jackknife_table["diff"] = jackknife_table.h2uncorr-jackknife_table.h2corr # Compute the diff = h2uncorr-h2corr column
    results_both = jackknife_table[jackknife_table.jackknife_index==-1]  # Create a table of the real (non-jackknifed) results for both alphas (which may be the same)
    del results_both["jackknife_index"]
    results_corr = results_both[results_both.alpha == [alpha_dict[phen][0] for phen in results_both.phen]] #Create version for g+e's alpha
    results_gxe2 = results_both[results_both.alpha == [alpha_dict[phen][1] for phen in results_both.phen]] #Create version for gxe's alpha
    #remove unwanted columns
    for delcol in ["a2_gxe2","gxe2","nLL_gxe2","permute_plus_count","permute_plus_index","permute_plus_seed","permute_times_count","permute_times_index","permute_times_seed","jackknife_count","jackknife_seed"]:
        del results_corr[delcol]
    for delcol in ["a2","e2","h2corr","h2uncorr","nLLcorr","nLLuncorr","diff","permute_plus_count","permute_plus_index","permute_plus_seed","permute_times_count","permute_times_index","permute_times_seed","jackknife_count","jackknife_seed"]:
        del results_gxe2[delcol]

    #Use a pivottable to compute the jackknifed SE's
    corr_rows = np.logical_and(jackknife_table.jackknife_index!=-1,jackknife_table.alpha==[alpha_dict[phen][0] for phen in jackknife_table.phen])
    jk_table_corr = pd.pivot_table(jackknife_table[corr_rows], values=['h2uncorr','h2corr','diff','e2'], index=['phen'], columns=[], aggfunc=np.std)
    jk_table_corr["h2uncorr SE"] = jk_table_corr["h2uncorr"] * np.sqrt(jackknife_count_actual-1)
    jk_table_corr["h2corr SE"] = jk_table_corr["h2corr"] * np.sqrt(jackknife_count_actual-1)
    jk_table_corr["diff SE"] = jk_table_corr["diff"] * np.sqrt(jackknife_count_actual-1)
    jk_table_corr["e2 SE"] = jk_table_corr["e2"] * np.sqrt(jackknife_count_actual-1)
    del jk_table_corr["h2uncorr"]
    del jk_table_corr["h2corr"]
    del jk_table_corr["diff"]
    del jk_table_corr["e2"]
    gxe2_rows = np.logical_and(jackknife_table.jackknife_index!=-1,jackknife_table.alpha==[alpha_dict[phen][1] for phen in jackknife_table.phen])
    jk_table_gxe2 = pd.pivot_table(jackknife_table[gxe2_rows], values=['gxe2'], index=['phen'], columns=[], aggfunc=np.std)
    jk_table_gxe2["gxe2 SE"] = jk_table_gxe2["gxe2"] * np.sqrt(jackknife_count_actual-1)
    del jk_table_gxe2["gxe2"]

    #Join the SE's to the main results table
    results_corr = results_corr.join(jk_table_corr, on='phen')
    results_gxe2 = results_gxe2.join(jk_table_gxe2, on='phen')

    #compute pValue columns
    results_corr["P (diff=0)"] = stats.t.sf(results_corr["diff"]/results_corr["diff SE"],df=jackknife_count_actual-1)*2 #two sided
    results_corr["from SE, one-sided, P (e2=0)"] = stats.t.sf(results_corr["e2"]/results_corr["e2 SE"],df=jackknife_count_actual-1)
    results_gxe2["from SE, one-sided, P (gxe2=0)"] = stats.t.sf(results_gxe2["gxe2"]/results_gxe2["gxe2 SE"],df=jackknife_count_actual-1)   #one sided

    if cache_folder is not None:
        _write_csv(results_corr, False, "{0}/jackknife_corr_summary.{1}.jackknife{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual))
        _write_csv(results_gxe2, False, "{0}/jackknife_gxe2_summary.{1}.jackknife{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual))


    ######################
    # compute p(e2=0) via permutation
    ######################

    permplus_table_fn = "{0}/permutation.GPlusE.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, permute_plus_count)
    if cache_folder is not None and os.path.exists(permplus_table_fn):
        permplus_table = pd.read_csv(permplus_table_fn, delimiter = '\t',index_col=False, comment=None)
    else:
        arg_list = []
        for phen_target in pheno.sid:
            pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
        for permute_plus_index in range(-1,permute_plus_count):
                       # pheno, G_kernel, spatial_coor, spatial_iid, alpha,          alpha_power, (jackknife_index, jackknife_count, jackknife_seed),
            arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_corr, alpha_power, (-1,0,None),
                         # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                         (permute_plus_index, permute_plus_count,permute_plus_seed),       (-1,0,None),                                                    just_testing, False,    False,    None)
                arg_list.append(arg_tuple)

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list)
        return_list = [line for line in return_list if line is not None] #Remove 'None' results
        permplus_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(permplus_table, False, permplus_table_fn)


    #Create a table of the real nLL for each pheno
    real_result_permplus = permplus_table[permplus_table.permute_plus_index==-1][['phen','nLLcorr']]
    real_result_permplus.rename(columns={'nLLcorr':'nLLcorr_real'},inplace=True)
    real_result_permplus.set_index(['phen'],inplace=True)

    # Create a table of the permutation runs and add the real nLL to each row
    perm_table = permplus_table[permplus_table.permute_plus_index!=-1]
    result = perm_table.join(real_result_permplus, on='phen')
    result['P(e2)'] = [1.0 if b else 0.0 for b in result.nLLcorr <= result.nLLcorr_real] # create a column showing where the permutation is better than (or as good as) the real run
    # Use a pivot table to find the fraction of times the permutation is better
    pivot_table_plus = pd.pivot_table(result, values=['P(e2)'], index=['phen'], columns=[], aggfunc=np.mean)
    if cache_folder is not None:
        summary_permplus_table_fn = "{0}/summary.permutation.GPlusE.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, permute_plus_count)
        _write_csv(pivot_table_plus, True, summary_permplus_table_fn)

    ################################################
    # compute p(gxe2=0) via permutation
    ################################################

    #Only process phenos for which gxe2 is not 0
    nonzero = set(results_gxe2[results_gxe2.gxe2 !=0].phen)
    permtimes_phenotypes = set(pheno.sid) & nonzero #intersection
    permtimes_table_list = []
    for phen_target in permtimes_phenotypes:
        permtimes_table_fn = "{0}/permutation.GxE/{1}.count{2}.txt".format(cache_folder, phen_target, permute_times_count)

        if cache_folder is not None and os.path.exists(permtimes_table_fn):
            permtime_results = pd.read_csv(permtimes_table_fn, delimiter = '\t',index_col=False, comment=None)
        else:
            arg_list = []
            pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
            a2 = float(permplus_table[permplus_table.phen==phen_target][permplus_table.permute_plus_index == -1]['a2'])
            for permute_index in range(-1,permute_times_count):
                           # pheno, G_kernel, spatial_coor, spatial_iid, alpha,          alpha_power, (permute_index, permute_count, permute_seed),
                arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_gxe2, alpha_power, (-1,0,None),
                             # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                            (-1,0,None),                                                    (permute_index, permute_times_count,permute_times_seed),        just_testing, False,     allow_gxe2,    a2)
                arg_list.append(arg_tuple)    

            # Run "run_line" on each set of arguments and save to file
            return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list)
            return_list = [line for line in return_list if line is not None] #Remove 'None' results
            permtime_results = pd.DataFrame(return_list)
            if cache_folder is not None:
                pstutil.create_directory_if_necessary(permtimes_table_fn)
                _write_csv(permtime_results,False,permtimes_table_fn)
        permtimes_table_list.append(permtime_results)

    if permtimes_table_list: #not empty
        permtimes_table = pd.concat(permtimes_table_list)
        logging.info(permtimes_table.head())

        #Create a table of the real nLL for each pheno
        real_result_permtimes = permtimes_table[permtimes_table.permute_times_index==-1][['phen','nLL_gxe2']]
        real_result_permtimes.rename(columns={'nLL_gxe2':'nLL_gxe2_real'},inplace=True)
        real_result_permtimes.set_index(['phen'],inplace=True)

        # Create a table of the permutation runs and add the real nLL to each row
        summary_permtimes_table_fn = "{0}/summary.permutation.GxE.{1}.count{2}.txt".format(cache_folder,len(permtimes_phenotypes), permute_times_count)

        perm_table = permtimes_table[permtimes_table.permute_times_index!=-1]
        resultx = perm_table.join(real_result_permtimes, on='phen')
        resultx['P(gxe2)'] = [1.0 if b else 0.0 for b in resultx.nLL_gxe2 <= resultx.nLL_gxe2_real] # create a column showing where the permutation is better than (or as good as) the real run
        # Use a pivot table to find the fraction of times the permutation is better
        pivot_table_times = pd.pivot_table(resultx, values=['P(gxe2)'], index=['phen'], columns=[], aggfunc=np.mean)
        if cache_folder is not None:
            _write_csv(pivot_table_times,True,summary_permtimes_table_fn)


    #######################
    # Create final table of results by combining the summary tables
    #######################

    #Rename some columns
    results_corr.rename(columns={"h2uncorr SE":"SE (h2uncorr)","h2corr SE":"SE (h2corr)","e2 SE":"SE (e2)"}, inplace=True)

    #Rename some columns and join results
    results_gxe2.rename(columns={"alpha":"alpha_gxe2","gxe2 SE":"SE (gxe2)"}, inplace=True)
    del results_gxe2['alpha_power']
    results_gxe2.set_index(["phen"],inplace=True)
    final0 = results_corr.join(results_gxe2, on='phen')

    #Rename some columns and join results
    pivot_table_plus.rename(columns={"P(e2)":"P(e2=0)"}, inplace=True)
    final1 = final0.join(pivot_table_plus, on='phen')

    #Rename some columns and join results
    if permtimes_table_list: #not empty
        pivot_table_times.rename(columns={"P(gxe2)":"P(gxe2=0)"}, inplace=True)
        final2 = final1.join(pivot_table_times, on='phen')
    else:
        final2 = final1.copy()
        final2["P(gxe2=0)"] = np.nan

    #Rename 'phen' and select final columns
    final2.rename(columns={"phen":"phenotype"}, inplace=True)
    final3 = final2[["phenotype","h2uncorr","SE (h2uncorr)","h2corr","SE (h2corr)","P (diff=0)","e2","SE (e2)","P(e2=0)","alpha","alpha_gxe2","gxe2","SE (gxe2)","P(gxe2=0)"]].copy()

    #Sort the phenotypes case-insensitively
    final3['lower'] = [pheno_one.lower() for pheno_one in final3.phenotype]
    final3.sort_values(['lower'],inplace=True)
    del final3['lower']

    if cache_folder is not None:
        summary_final_table_fn = "{0}/summary.final.{1}.{2}.{3}.{4}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual,permute_plus_count,permute_times_count)
        _write_csv(final3,False,summary_final_table_fn)
    
    return final3
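
# A minimal, hypothetical sketch (not part of the fastlmm API; the name
# spatial_similarity_sketch is made up for illustration) of the spatial similarity
# described in heritability_spatial_correction's docstring:
# similarity(i,j) = exp(-(distance_ij/alpha)**alpha_power).
import numpy as np

def spatial_similarity_sketch(spatial_coor, alpha, alpha_power):
    # pairwise Euclidean distances between the iid_count x 2 coordinates
    diff = spatial_coor[:, np.newaxis, :] - spatial_coor[np.newaxis, :, :]
    distance = np.sqrt((diff ** 2).sum(-1))
    return np.exp(-(distance / alpha) ** alpha_power)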
Example #57
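                # (Fragment of a PLINK .bed reader: each byte packs four individuals'
                #  genotypes, 2 bits each. The masks below appear to decode those codes to
                #  0/1/2/np.nan, with byteZero/byteThree holding the two homozygous values.)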
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=4]=np.nan
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=8]=1
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=12]=byteThree
                bytes=np.mod(bytes,4)
                val[0::4,SNPsIndex:SNPsIndex+1]=byteZero
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=1]=np.nan
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=2]=1
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=3]=byteThree
            val = val[iid_index_out,:] #reorder or trim any extra allocation


            #!!LATER this can fail because the trim statement above messes up the order
            #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!
            self._close_bed()

        return val


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil
    snpdata = Pheno('../examples/toydata.phe').read()         # Read data from Pheno format
    pstutil.create_directory_if_necessary("tempdir/toydata.bed")
    Bed.write("tempdir/toydata.bed",snpdata,count_A1=False)   # Write data in Bed format

    import doctest
    doctest.testmod()
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
    def test_lr_real(self):
        do_plot = False

        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
            nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name,K0_train,K0_whole_test in [("Identity Kernel",None,None)]:

            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                
                
            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename) 
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(K0_whole_test=K0_train, X=covariate_train) #test on train
                output_file = self.file_name("lr_reala_"+name)
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno,"lr2a_"+first_name)
                self.compare_files(covar2,"lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar  = model.predict(K0_whole_test=K0_whole_test, X=covariate_test) #test on test
            output_file = self.file_name("lr_realb_"+name)
            Dat.write(output_file,predicted_pheno)
            covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_"+name)
            Dat.write(output_file,covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()
                ## Plot y and predicted y (test on train)
                #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                #pylab.suptitle(name+": test on test: true target to prediction")
                #pylab.show()

            self.compare_files(predicted_pheno,"lr2b_"+first_name)
            self.compare_files(covar2,"lr2b.cov_"+first_name)
    def test_lr_as_lmm(self):
            do_plot = False
            #later why does this test case generate two intersect info messages instead of just one?
            import pylab
            logging.info("TestLmmTrain test_lr_as_lmm")

            ###############################################################
            # Create a linear data set with just a little noise
            ###############################################################

            train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
            test_idx  = np.r_[0:10] # the first 10 iids

            #make covar just numbers 0,1,...
            covar = self.covariate_whole.read()
            covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
            covar._name = 'np.array([[float(num)] for num in xrange(covar.iid_count)])'
            covariate_train = covar[train_idx,:].read()
            covariate_test = covar[test_idx,:].read()


            #make pheno  # pheno = 2*covar+100+normal(0,1)*10
            pheno = self.pheno_whole.read()
            np.random.seed(0)
            pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle("Plot training x and y, testing x and y")
                pylab.show()

            ###############################################################
            # Show that linear regression does a good job predicting
            ###############################################################

            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
                nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle("real linear regression: actual to prediction")
                pylab.show()

            ###############################################################
            # Use LMM as LR and apply test on train
            ###############################################################
            for force_full_rank in [True, False]:
                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2,force_full_rank=force_full_rank).fit(K0_train=covariate_train, X=None, y=pheno_train)
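                # Using the single covariate as K0 makes the LMM's kernel rank-1, so the model
                # behaves like linear regression whether the kernel is handled full-rank or low-rank.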
                
                
                filename = self.tempout_dir + "/model_lr_as_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_train, X=None,count_A1=False) #test on train
                    output_file = self.file_name("lr_as_lmma_")
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                    output_file = self.file_name("lr_as_lmma.cov_")
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle("test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lr_as_lmma_")
                    self.compare_files(covar2,"lr_as_lmma.cov_")

                ###############################################################
                # Use LMM as LR and apply test on test
                ###############################################################

                #Predict with model (test on test)
                predicted_pheno, covar  = fastlmm.predict(K0_whole_test=covariate_test, X=None,count_A1=False) #test on test
                output_file = self.file_name("lr_as_lmmb_")
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_as_lmmb.cov_")
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle("test on test: test X to true target (green) and prediction (red)")
                    pylab.show()
                    ## Plot y and predicted y (test on train)
                    #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                    #pylab.suptitle(name+": test on test: true target to prediction")
                    #pylab.show()

                self.compare_files(predicted_pheno,"lr_as_lmmb_")
                self.compare_files(covar2,"lr_as_lmmb.cov_")