Example No. 1
    def readFiles(self):
        print('Reading Data ...')
        X = None
        y = None
        Xname = None
        if self.fileType == 'plink':
            from pysnptools.snpreader import Bed
            snpreader = Bed(self.fileName+'.bed')
            snpdata = snpreader.read()
            X = snpdata.val
            Xname = snpdata.sid

            # from pysnptools.snpreader import Pheno
            # phenoreader = Pheno(self.fileName+".fam")
            # phenodata = phenoreader.read()
            # y = phenodata.val[:,-1]
            y = self.famReader(self.fileName+".fam")

        if self.fileType == 'csv':
            X = np.loadtxt(self.fileName+'.geno.csv', delimiter=',')
            y = np.loadtxt(self.fileName+'.pheno.csv', delimiter=',')
            try:
                Xname = np.loadtxt(self.fileName+'.marker.csv', delimiter=',')
            except Exception:
                Xname = ['geno ' + str(i+1) for i in range(X.shape[1])]
        if self.imputationFlag:
            X = self.imputation(X)
        else:
            X = self.simpleImputation(X)
        keep = ~np.isnan(y)  # keep only samples with an observed phenotype
        return X[keep, :], y[keep], Xname
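For quick reference, the plink branch above reduces to this standalone pattern (a minimal sketch; 'toydata' is a placeholder basename for a .bed/.bim/.fam trio):

    from pysnptools.snpreader import Bed

    snpdata = Bed('toydata.bed').read()  # load all genotypes into memory
    X, Xname = snpdata.val, snpdata.sid  # iid-by-sid value matrix and SNP ids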
Example No. 2
 def setUpClass(self):
     self.currentFolder = os.path.dirname(os.path.realpath(__file__))
     #TODO: get data set with NANs!
     snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
     self.pheno_fn = self.currentFolder + "/examples/toydata.phe"
     self.snpdata = snpreader.read(order='F',force_python_only=True)
     self.snps = self.snpdata.val
Example No. 3
    def setUpClass(self):
        currentFolder = os.path.dirname(os.path.realpath(__file__))
        self.snp_fn = currentFolder + "/../../tests/datasets/mouse/alldata"
        self.pheno_fn = currentFolder + "/../../tests/datasets/mouse/pheno_10_causals.txt"
        #self.cov_fn = currentFolder + "/examples/toydata.cov"

        # load data
        ###################################################################
        snp_reader = Bed(self.snp_fn)
        pheno = pstpheno.loadOnePhen(self.pheno_fn)
        #cov = pstpheno.loadPhen(self.cov_fn)
        
        # intersect sample ids
        snp_reader, pheno = pysnptools.util.intersect_apply([snp_reader, pheno])
        
        self.G = snp_reader.read(order='C').val
        self.G = stdizer.Unit().standardize(self.G)
        self.G.flags.writeable = False
        self.y = pheno['vals'][:,0]
        self.y.flags.writeable = False

        # load pcs
        #self.G_cov = cov['vals']
        self.G_cov = np.ones((len(self.y), 1))
        self.G_cov.flags.writeable = False
Example No. 4
def getData(filename):
    mph = 3  # column of the phenotype values to use
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    X = sFil.read().standardize().val
    y = yFil.read().val[:, mph]
    return [y, sFil]
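Here the phenotype comes from the dataset's own .fam file, so its rows are already aligned with the genotypes. With a separate phenotype file the sample ids should be intersected first, as other examples on this page do (a sketch; 'mydata' and 'mypheno.txt' are placeholders):

    import pysnptools.util as pstutil
    from pysnptools.snpreader import Bed, Pheno

    # keep only the samples both files share, in matching order
    snp_reader, pheno_reader = pstutil.intersect_apply(
        [Bed('mydata'), Pheno('mypheno.txt')])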
Example No. 5
 def test_roundtrip(self):
     max_weight = 2
     snpreader1 = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",count_A1=True)
     snpdata1 = snpreader1.read()
     distreader1 = snpreader1.as_dist(max_weight)
     snpreader2 = distreader1.as_snp(max_weight)
     assert snpdata1.allclose(snpreader2.read(),equal_nan=True)
     snpdata1.val[0,0] = np.nan
     assert snpdata1.allclose(snpdata1.as_dist(max_weight).as_snp(max_weight).read(),equal_nan=True)
Example No. 6
def main():
    """
    example that compares output to fastlmmc
    """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom.bed"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn,
                      phen_fn,
                      snp_pos_sim,
                      snp_pos_test,
                      delta,
                      REML=REML,
                      excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv),
                                         np.log(gwas_c.p_values),
                                         decimal=3)

    simple_manhattan_plot(pv)
Example No. 7
 def read_plink(self, fn_plink = None):
     """
     plink reader
     """
     PL = Bed(fn_plink)
     PLOB = PL.read()
     self.GT = PLOB.val
     self.POS = PLOB.pos[:,[0,1]]
     self.SID = PLOB.iid[:,1]
     self.isNormalised = False
Example No. 8
 def test_hdf5_case3(self):
     snpreader1 = SnpHdf5(self.currentFolder +
                          "/examples/toydata.snpmajor.snp.hdf5")[::2, :]
     snpreader2 = Bed(self.currentFolder + "/examples/toydata",
                      count_A1=False)[::2, :]
     self.assertTrue(
         np.allclose(snpreader1.read().val,
                     snpreader2.read().val,
                     rtol=1e-05,
                     atol=1e-05))
Example No. 9
    def gen_and_compare(self, output_file, **kwargs):
        from pysnptools.snpreader import Bed

        gen_snpdata = snp_gen(**kwargs)
        #pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
        #Bed.write(gen_snpdata, self.currentFolder + "/tempdir/" + output_file)  #comment out
        bed = Bed(self.currentFolder + "/../../tests/datasets/generate/" + output_file,count_A1=False)
        ref_snpdata = bed.read()
        assert gen_snpdata == ref_snpdata, "Failure on "+output_file
        return gen_snpdata
Example No. 10
 def test_write_x_x_cpp(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata")
     for order in ['C','F']:
         for dtype in [np.float32,np.float64]:
             snpdata = snpreader.read(order=order,dtype=dtype)
             snpdata.val[-1,0] = float("NAN")
             output = "tempdir/toydata.{0}{1}.cpp".format(order,"32" if dtype==np.float32 else "64")
             create_directory_if_necessary(output)
             Bed.write(snpdata, output)
             snpdata2 = Bed(output).read()
             assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
Example No. 11
    def factory(snpreader, num_snps_in_memory, standardizer, blocksize):
        if isinstance(snpreader, str):
            snpreader = Bed(snpreader)

        if num_snps_in_memory >= snpreader.sid_count:
            in_memory = InMemory(snpreader.read(order='C').standardize(standardizer), standardizer, blocksize)
            in_memory._snpreader.val.flags.writeable = False
            in_memory._val = in_memory._snpreader.val
            return in_memory
        else:
            return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)
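A hypothetical call for orientation (InMemory and FromDisk come from the surrounding project, not from pysnptools; 'mydata' is a placeholder basename):

    import pysnptools.standardizer as stdizer

    cached = factory('mydata', num_snps_in_memory=1000000,
                     standardizer=stdizer.Unit(), blocksize=1000)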
Example No. 12
 def test_write_x_x_cpp(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata")
     for order in ['C','F']:
         for dtype in [np.float32,np.float64]:
             snpdata = snpreader.read(order=order,dtype=dtype)
             snpdata.val[-1,0] = float("NAN")
             output = "tempdir/toydata.{0}{1}.cpp".format(order,"32" if dtype==np.float32 else "64")
             create_directory_if_necessary(output)
             Bed.write(output, snpdata)
             snpdata2 = Bed(output).read()
             np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
Example No. 13
def process_data(input_path, output_path, name):
    snpreader = Bed(os.path.join(input_path, name))
    data = snpreader.read()
    values = data.val
    preproc_vals = pysnp_genpreproc(values)
    assert not np.any(np.isnan(preproc_vals))
    saved = os.path.join(output_path, name + ".h5py")
    path, keys = h5_save(path=saved, data_obj={name:preproc_vals}, dt='f')
    return {'n_subjects':data.iid_count, 'subject_ids':data.iid,
            'n_snps':data.sid_count, 'snp_ids':data.sid,
            'data_preprocessed_location': {'path':path, 'key':keys}}
Example No. 14
    def factory(snpreader, num_snps_in_memory, standardizer, blocksize,count_A1=None):
        if isinstance(snpreader, str):
            snpreader = Bed(snpreader,count_A1=count_A1)

        if num_snps_in_memory >= snpreader.sid_count:
            in_memory = InMemory(snpreader.read(order='C').standardize(standardizer), standardizer, blocksize)
            in_memory._snpreader.val.flags.writeable = False
            in_memory._val = in_memory._snpreader.val
            return in_memory
        else:
            return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)
Example No. 15
 def test_subset_view(self):
     snpreader2 = Bed(self.currentFolder + "/examples/toydata",count_A1=False)[:,:]
     result = snpreader2.read(view_ok=True)
     self.assertFalse(snpreader2 is result)
     result2 = result[:,:].read()
     self.assertFalse(sp.may_share_memory(result2.val,result.val))
     result3 = result[:,:].read(view_ok=True)
     self.assertTrue(sp.may_share_memory(result3.val,result.val))
     result4 = result3.read()
     self.assertFalse(sp.may_share_memory(result4.val,result3.val))
     result5 = result4.read(view_ok=True)
     self.assertTrue(sp.may_share_memory(result4.val,result5.val))
Example No. 16
def main(args):
    print('reading seed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices,2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        # find first typed snp after query snp
        snp_bp = data.col_property[i,2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices)-1

        n1, n2 = np.where(X[:,i] == 1)[0]
        if (X[n1,typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(
                X[n1,typed_snps_indices],
                X[n2,typed_snps_indices],
                typed_i)
        typed_ir -= 1

        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - \
                data.col_property[il, 1]
        ibd = (np.mean(X[n1,il:ir] == X[n2,il:ir]) > 0.99)
        return cM, int(ibd)

    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
            # total=10):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
Example No. 17
def cal_kin_val(bed_file, small_val=0.001):
    snp_on_disk = Bed(bed_file, count_A1=False)
    snp_mat = snp_on_disk.read().val
    freq = np.sum(snp_mat, axis=0) / (2 * snp_on_disk.iid_count)
    freq.shape = (1, snp_on_disk.sid_count)
    snp_mat = snp_mat - 2*freq
    scale = 2 * freq * (1 - freq)
    scale = np.sum(scale)
    kin = np.dot(snp_mat,snp_mat.T)/scale
    kin_diag = np.diag(kin)
    kin_diag = kin_diag + kin_diag * small_val
    np.fill_diagonal(kin, kin_diag)
    return kin
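This builds a VanRaden-style genomic relationship matrix, with small_val slightly inflating the diagonal so the matrix stays well-conditioned for inversion. A minimal usage sketch ('mydata' is a placeholder basename):

    import numpy as np

    kin = cal_kin_val('mydata', small_val=0.001)
    assert np.allclose(kin, kin.T)  # a kinship matrix is symmetric
    print(kin.shape)                # (iid_count, iid_count)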
Example No. 18
def main():
    """
    example that compares output to fastlmmc
    """


    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5
    
    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)    

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])
    
    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:,0]
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18,0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)
    
    simple_manhattan_plot(pv)
Example No. 19
    def test_SNC(self):
        logging.info("TestSNC")
        test_snps = self.bedbase
        pheno = pstpheno.loadOnePhen(self.phen_fn,vectorize=True)
        covar = pstpheno.loadPhen(self.cov_fn)
        bed = Bed(test_snps, count_A1=False)
        snc = bed.read()
        snc.val[:,2] = [0] * snc.iid_count # make SNP #2 have constant values (aka a SNC)

        output_file_name = self.file_name("snc")

        frame = single_snp(test_snps=snc[:,:10], pheno=pheno, G0=snc, mixing=0,leave_out_one_chrom=False,
                                  covar=covar, output_file_name=output_file_name,count_A1=False
                                  )
        self.compare_files(frame,"snc")
Example No. 20
 def test_write_x_x_cpp(self):
     for count_A1 in [False, True]:
         snpreader = Bed(self.currentFolder + "/examples/toydata",
                         count_A1=count_A1)
         for order in ['C', 'F']:
             for dtype in [np.float32, np.float64]:
                 snpdata = snpreader.read(order=order, dtype=dtype)
                 snpdata.val[-1, 0] = float("NAN")
                 output = "tempdir/toydata.{0}{1}.cpp".format(
                     order, "32" if dtype == np.float32 else "64")
                 create_directory_if_necessary(output)
                 Bed.write(output, snpdata, count_A1=count_A1)
                 snpdata2 = Bed(output, count_A1=count_A1).read()
                 np.testing.assert_array_almost_equal(snpdata.val,
                                                      snpdata2.val,
                                                      decimal=10)
Example No. 21
def load_plink_bed_bim_fam_dataset(path_dataset,
                                   snp_ids=None,
                                   subject_ids=None,
                                   count_A1=True):
    """
    Load a Plink bed/bim/fam dataset as a SnpData instance. Optionally a
    specific list of snps or subjects can be extracted to avoid loading
    everything in memory.

    Parameters
    ----------
    path_dataset: str
        Path to the Plink bed/bim/fam dataset, with or without .bed extension.
    snp_ids: list/set of str, default None
        Snps that should be extracted if available in the dataset.
        By default None, all snps are loaded.
    subject_ids: list of str, default None
        Subjects that should be extracted if available in the dataset.
        By default None, all subjects are loaded.
    count_A1: bool, default True
        Genotypes are provided as allele counts, A1 if True else A2.

    Returns
    -------
    snp_data: pysnptools object
        PLINK data loaded by the 'pysnptools' library.
    """

    # Load the metadata, without loading the genotypes
    snp_data = Bed(path_dataset, count_A1=count_A1)

    # If requested, filter on snp ids
    if snp_ids is not None:
        snp_ids = set(snp_ids)
        snp_bool_indexes = [(s in snp_ids) for s in snp_data.sid]
        snp_data = snp_data[:, snp_bool_indexes]

    # If requested, filter on subject ids
    if subject_ids is not None:
        subject_ids = set(subject_ids)
        subject_bool_indexes = [(s in subject_ids) for s in snp_data.iid[:, 1]]
        snp_data = snp_data[subject_bool_indexes, :]

    # Load the genotypes from the Plink dataset
    snp_data = snp_data.read()

    return snp_data
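A usage sketch under assumed inputs (the basename and ids below are placeholders):

    snp_data = load_plink_bed_bim_fam_dataset('mydata',
                                              snp_ids=['rs123', 'rs456'],
                                              subject_ids=['subj01', 'subj02'])
    print(snp_data.val.shape)  # (subjects kept, snps kept)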
Example No. 22
def genPheno(filename="../thinFam",
             per=.5,
             savename="fakePheno.txt",
             c=2.0,
             num=5):
    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    print(m)
    print(n)
    I = [rand.randint(0, m - 1) for i in range(0, num)]
    SNP = [[D[j][i] for j in range(0, n)] for i in I]
    #p0=n*peir/sum([c**i*len([j for j in SNP if j==float(i)]) for i in range(0,3)])
    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)

    SNP = [[max(i, 0.0) for i in s] for s in SNP]
    for i in range(0, num):
        for j in range(0, n):
            if not SNP[i][j] in [1.0, 0.0, 2.0]:
                SNP[i][j] = 0.0
    print([list(set(s)) for s in SNP])
    lst = [sum([SNP[j][i] for j in range(0, num)]) for i in range(0, n)]
    # print(lst)
    print(sum(
        [c**(sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)]))
    p0 = n * per / sum(
        [c**(sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)])
    print(p0)
    y = [
        float(
            rand.uniform(0, 1) < p0 *
            c**sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)
    ]
    if len(savename) == 0:
        return y
    fil = open(savename, "w")
    for i in y:
        fil.write(str(i) + "\n")
    fil.close()
Example No. 23
    def test_SNC(self):
        logging.info("TestSNC")
        test_snps = self.bedbase
        pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        covar = pstpheno.loadPhen(self.cov_fn)
        bed = Bed(test_snps, count_A1=False)
        snc = bed.read()
        snc.val[:, 2] = 0  # make SNP #2 have constant values (aka a SNC)

        output_file_name = self.file_name("snc")

        frame = single_snp(test_snps=snc[:, :10],
                           pheno=pheno,
                           G0=snc,
                           mixing=0,
                           leave_out_one_chrom=False,
                           covar=covar,
                           output_file_name=output_file_name,
                           count_A1=False)
        self.compare_files(frame, "snc")
Example No. 24
    def test_SNC(self):
        logging.info("TestSNC")
        from pysnptools.snpreader import Bed
        test_snps = self.bedbase
        pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        covar = pstpheno.loadPhen(self.cov_fn)
        bed = Bed(test_snps)
        snc = bed.read()
        snc.val[:, 2] = [
            0
        ] * snc.iid_count  # make SNP #2 have constant values (aka a SNC)

        output_file_name = self.file_name("snc")

        frame = single_snp(test_snps=snc[:, :10],
                           pheno=pheno,
                           G0=snc,
                           mixing=0,
                           covar=covar,
                           output_file_name=output_file_name)
        self.compare_files(frame, "snc")
Example No. 25
def cluster_data(snpreader):
    """
    compute hierarchical clustering of snp data set in bed_fn
    """

    if isinstance(snpreader,str):
        snpreader = Bed(snpreader)
    G = snpreader.read().standardize().val

    # Generate distance matrix
    from sklearn.metrics.pairwise import euclidean_distances
    D = euclidean_distances(G, G)

    # Compute and plot first dendrogram.
    fig = pylab.figure(figsize=(8,8))
    ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
    Y = fc.linkage(D, method='average') #method="centroid" is cubic!
    Z1 = sch.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
    #Y = sch.linkage(D, method='single')
    Z2 = sch.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
    idx1 = Z1['leaves']
    #dx2 = Z2['leaves']
    D = D[idx1,:]
    D = D[:,idx1]
    axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    pylab.show()
Example No. 26
        start=end

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil

    data_file = r'd:\OneDrive\programs\epiCornell\syndata.bed'  # raw string keeps the backslashes literal
    if False:
        from pysnptools.snpreader import SnpData
        import numpy as np
        bed1 = Bed("../../tests/datasets/synth/all")
        print(bed1.iid_count, bed1.sid_count, bed1.iid_count * bed1.sid_count)
        #goal 1500 individuals x 27000 SNP
        snpdata1 = bed1.read()
        iid = bed1.iid
        sid = ['sid{0}'.format(i) for i in range(27000)]
        val = np.tile(snpdata1.val,(3,6))[:,:27000].copy()
        #snpdata = Pheno('pysnptools/examples/toydata.phe').read()         # Read data from Pheno format
        snpdata2 = SnpData(iid, sid, val)
        print(snpdata2.iid_count, snpdata2.sid_count, snpdata2.iid_count * snpdata2.sid_count)
        Bed.write(data_file, snpdata2, count_A1=False)  # filename comes first in Bed.write

    synbed = Bed(data_file)
    print(synbed.iid_count, synbed.sid_count, synbed.iid_count * synbed.sid_count)

    part_count = 1000
    part_list = list(split_on_sids(synbed,part_count))

    pairs00 = _Pairs(part_list[0])
Example No. 27
        V_stds = np.std(V[:, 1:n_V], axis=0)
        V[:, 1:n_V] = zscore(V[:, 1:n_V], axis=0)
    else:
        V = np.ones((int(y.shape[0]), 1))
        n_V = 1
        V_names = np.array(['Intercept'])
    n_pars = n_X + n_V + 1
    print(str(n_pars) + ' parameters in model')

    ### Read genotypes ###
    test_chr = Bed(args.genofile)
    # select subset to test
    if args.whole_chr:
        sid = test_chr.sid
        pos = test_chr.pos
        test_chr = test_chr.read()
    else:
        sid = test_chr.sid[args.start:args.end]
        pos = test_chr.pos[args.start:args.end]
        test_chr = test_chr[:, args.start:args.end].read()
    genotypes = test_chr.val
    # Get genotype matrix
    if genotypes.ndim == 1:
        chr_length = 1
        genotypes = genotypes.reshape(genotypes.shape[0], 1)
    else:
        chr_length = genotypes.shape[1]
    print('Number of test loci: ' + str(genotypes.shape[1]))
    print('Genotypes for '+str(genotypes.shape[0])+' individuals read')
    # Get sample ids
    geno_id_dict = id_dict_make(np.array(test_chr.iid))
Example No. 28
def gma_univariate_eigen_lt_gwas(y,
                                 xmat,
                                 bed_file,
                                 out_file=None,
                                 init=None,
                                 maxiter=100,
                                 cc=1.0e-8):

    # kinship
    print('Build the kinship matrix')
    starttime = datetime.datetime.now()
    num_id = max(y.shape)
    snp_on_disk = Bed(bed_file, count_A1=False)
    snp_mat = snp_on_disk.read().val
    freq = np.sum(snp_mat, axis=0) / (2 * snp_on_disk.iid_count)
    freq.shape = (1, snp_on_disk.sid_count)
    snp_mat = snp_mat - 2 * freq
    scale = 2 * freq * (1 - freq)
    scale = np.sum(scale)
    kin = np.dot(snp_mat, snp_mat.T) / scale

    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    print('Eigen decomposition')
    starttime = datetime.datetime.now()
    kin_eigen_val, kin_eigen_vec = linalg.eigh(kin)
    kin_eigen_val = kin_eigen_val.reshape(len(kin_eigen_val), 1)
    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')
    y = np.dot(kin_eigen_vec.T, y)
    xmat = np.dot(kin_eigen_vec.T, xmat)
    if init is not None:
        var = np.array(init)
    else:
        var = np.array([1.0, 1.0])

    fd_mat = np.zeros(2)
    ai_mat = np.zeros((2, 2))
    em_mat = np.zeros((2, 2))
    ### estimate the variance components of the null model

    print('Estimate variances')
    starttime = datetime.datetime.now()
    for i in range(maxiter):
        print('Start the iteration:', i + 1)
        vmat = 1.0 / (kin_eigen_val * var[0] + var[1])
        vx = np.multiply(vmat, xmat)
        xvx = np.dot(xmat.T, vx)
        xvx = np.linalg.inv(xvx)

        # py
        xvy = np.dot(vx.T, y)
        y_xb = y - np.dot(xmat, np.dot(xvx, xvy))
        py = np.multiply(vmat, y_xb)

        # add_py p_add_py
        add_py = np.multiply(kin_eigen_val, py)
        xvy = np.dot(vx.T, add_py)
        y_xb = add_py - np.dot(xmat, np.dot(xvx, xvy))
        p_add_py = np.multiply(vmat, y_xb)

        # res_py p_res_py
        res_py = py.copy()
        xvy = np.dot(vx.T, res_py)
        y_xb = res_py - np.dot(xmat, np.dot(xvx, xvy))
        p_res_py = np.multiply(vmat, y_xb)

        # fd
        tr_vd = np.sum(np.multiply(vmat, kin_eigen_val))
        xvdvx = np.dot(xmat.T, vmat * kin_eigen_val * vx)
        tr_2d = np.sum(np.multiply(xvdvx, xvx))
        ypvpy = np.sum(np.dot(py.T, add_py))
        fd_mat[0] = 0.5 * (-tr_vd + tr_2d + ypvpy)

        tr_vd = np.sum(vmat)
        xvdvx = np.dot(xmat.T, vmat * vx)
        tr_2d = np.sum(np.multiply(xvdvx, xvx))
        ypvpy = np.sum(np.dot(py.T, res_py))
        fd_mat[1] = 0.5 * (-tr_vd + tr_2d + ypvpy)

        # AI
        ai_mat[0, 0] = np.sum(np.dot(add_py.T, p_add_py))
        ai_mat[0, 1] = ai_mat[1, 0] = np.sum(np.dot(add_py.T, p_res_py))
        ai_mat[1, 1] = np.sum(np.dot(res_py.T, p_res_py))
        ai_mat = 0.5 * ai_mat

        # EM
        em_mat[0, 0] = num_id / (var[0] * var[0])
        em_mat[1, 1] = num_id / (var[1] * var[1])

        print("FD:", fd_mat)
        print("AI:", ai_mat)
        print("EM:", em_mat)

        for j in range(0, 51):
            gamma = j * 0.02
            wemai_mat = (1 - gamma) * ai_mat + gamma * em_mat
            delta = np.dot(linalg.inv(wemai_mat), fd_mat)
            var_update = var + delta
            if min(var_update) > 0:
                print('EM weight value:', gamma)
                break

        print('Updated variances:', var_update)

        # Convergence criteria
        cc_val = np.sum(pow(delta, 2)) / np.sum(pow(var_update, 2))
        cc_val = np.sqrt(cc_val)
        var = var_update.copy()

        print("CC: ", cc_val)
        if cc_val < cc:
            break

    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    # GWAS

    print('Start GWAS')
    starttime = datetime.datetime.now()
    vmat = 1.0 / (kin_eigen_val * var[0] + var[1])
    vx = np.multiply(vmat, xmat)
    xvx = np.dot(xmat.T, vx)
    xvx = np.linalg.inv(xvx)

    # py
    xvy = np.dot(vx.T, y)
    y_xb = y - np.dot(xmat, np.dot(xvx, xvy))
    py = np.multiply(vmat, y_xb)

    snp_mat = np.dot(kin_eigen_vec.T, snp_mat)
    # SNP effects
    chi_vec = []
    p_vec = []
    eff_vec = np.dot(snp_mat.T, py) * var[0]
    eff_vec = eff_vec[:, -1]
    for i in range(snp_on_disk.sid_count):
        snpi = snp_mat[:, i:(i + 1)]
        snp_var1 = np.sum(snpi * vmat * snpi)
        snp_var2 = np.dot(snpi.T, vx)
        snp_var2 = snp_var2.dot(xvx).dot(snp_var2.T)
        snp_var = (snp_var1 + np.sum(snp_var2)) * var[0] * var[0]
        chi_val = eff_vec[i] * eff_vec[i] / snp_var
        p_val = chi2.sf(chi_val, 1)
        chi_vec.append(chi_val)
        p_vec.append(p_val)

    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    res_df = snp_info.iloc[:, [0, 1, 3, 4, 5]]
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'eff_val'] = eff_vec
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec

    if out_file is not None:
        try:
            res_df.to_csv(out_file, sep=' ', index=False)
        except Exception as e:
            print(e)
            print('Failed to write the result!')
            exit()
Example No. 29
class _Epistasis(object):  # implements IDistributable

    def __init__(self,test_snps,pheno,G0, G1=None, mixing=0.0, covar=None,sid_list_0=None,sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self._ran_once = False

        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0=G0
        self.G1_or_none=G1
        self.mixing=mixing
        self.external_log_delta=log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1={5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps,self.pheno,self.covar,self.sid_list_0,self.sid_list_1,
                 self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta, self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
        self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + len(self.intersect)*len(self.just_sid_1) + len(self.intersect) * (len(self.intersect)-1)//2
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply([self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]) #should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None

        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True,missing='NaN')

        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar,missing='NaN')

        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only if no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'],np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1] 


        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"

        self._ran_once = True
        

 #start of IDistributable interface--------------------------------------
    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count



    def work_sequence(self):
        self._run_once()

        return self.work_sequence_range(0,self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()

        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])

        for sid0_list, sid1_list in self.pair_block_sequence_range(start,end):
            yield lambda lmm=lmm,sid0_list=sid0_list,sid1_list=sid1_list : self.do_work(lmm,sid0_list,sid1_list)  # the 'lmm=lmm,...' is needed to work around Python's late-binding closures

    def reduce(self, result_sequence):
        #doesn't need "run_once()"

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)

        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))


    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    #optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):
        #Doesn't need run_once
        return self._str


    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)

        copier.input(self.pheno)
        copier.input(self.covar)

        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)

        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self,copier):
        #Doesn't need run_once
        copier.output(self.output_file_or_none)

 #end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den): #!!move to utils?
        return -(-num//den) #The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats.
    
    def pair_block_sequence_range(self,block_start,block_end):
        self._run_once()
        assert 0 <= block_start and block_start <= block_end and block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index+1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start,end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index+1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    #If start == end, then returns without yielding anything 
    def pair_sequence_range(self,start,end):
        self._run_once()
        assert 0 <= start and start <= end and end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"


    def pair_sequence_with_start(self,start):
        self._run_once()

        skip_ref = [start]

        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"


    def combo_distinct(self, distinct__list0, distinct__list1, skip_ref):
        row_count = len(distinct__list0)
        col_count = len(distinct__list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >=0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >=0, "real assert"

        for row_index in range(row_start, row_count):
            sid0 = distinct__list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start, col_count):
                sid1 = distinct__list1[col_index]
                yield sid0, sid1

    def combo_same(self, list, skip_ref):
        count = len(list)
        full_size = count * (count + 1) // 2
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >=0, "real assert"
            return

        row_start = int((-1 + 2*count - np.sqrt(1 - 4*count + 4*count**2 - 8*skip_ref[0]))/2)
        skip_ref[0] = skip_ref[0] - (count*row_start - (row_start*(1 + row_start))//2)
        assert skip_ref[0] >=0, "real assert"

        for row_index in range(row_start, count):
            sid0 = list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start + 1 + row_index, count):
                sid1 = list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1



    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file): # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so will need to normalized G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta  ) #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))


    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike')
            )
        #!!Is this the only way to set types in a dataframe?
        dataframe['Chr0'] = dataframe['Chr0'].astype(float)
        dataframe['GenDist0'] = dataframe['GenDist0'].astype(float)
        dataframe['ChrPos0'] = dataframe['ChrPos0'].astype(float)
        dataframe['Chr1'] = dataframe['Chr1'].astype(float)
        dataframe['GenDist1'] = dataframe['GenDist1'].astype(float)
        dataframe['ChrPos1'] = dataframe['ChrPos1'].astype(float)
        dataframe['PValue'] = dataframe['PValue'].astype(float)
        dataframe['NullLogLike'] = dataframe['NullLogLike'].astype(float)
        dataframe['AltLogLike'] = dataframe['AltLogLike'].astype(float)


        #This is some of the code for a different way that reads and dot-products 50% more, but does less copying. Seems about the same speed
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:,sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        products = snps_read.val[:,sid0_index_list] * snps_read.val[:,sid1_index_list] # in the products matrix, each column i is the elementwise product of sid i in each list
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if (k<N):
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index]) #index to product
            index_list = index_list + len(sid_union_index_list) #Shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index,sid1_index]),index_list)) # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1] #Shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]),index_list)) #indexes of the covar

            index_list_less_product = index_list[:-1] #index to everything but the product

            #Null -- the two additive SNPs
            lmm.X = X[:,index_list_less_product]
            lmm.UX = UX[:,index_list_less_product]
            if (k<N):
                lmm.UUX = UUX[:,index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            #Alt -- now with the product feature
            lmm.X = X[:,index_list]
            lmm.UX = UX[:,index_list]
            if (k<N):
                lmm.UUX = UUX[:,index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0,sid1,ll_null,ll_alt,pvalue))

            dataframe.iloc[pair_index] = [
                 sid0, snps_read.pos[sid0_index,0],  snps_read.pos[sid0_index,1], snps_read.pos[sid0_index,2],
                 sid1, snps_read.pos[sid1_index,0],  snps_read.pos[sid1_index,1], snps_read.pos[sid1_index,2],
                 pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count,self.do_pair_time-start))

        return dataframe
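The _pair_count formula in set_sid_sets counts every unordered pair exactly once across the three disjoint sid groups; a quick sanity check with toy sets (the names are illustrative):

    just_0, inter, just_1 = {'a'}, {'b', 'c'}, {'d'}
    pair_count = (len(just_0) * len(inter) + len(just_0) * len(just_1)
                  + len(inter) * len(just_1)
                  + len(inter) * (len(inter) - 1) // 2)
    print(pair_count)  # 2 + 1 + 2 + 1 = 6 distinct pairs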
Example No. 30
class _Epistasis(object):  # implements IDistributable

    def __init__(self,test_snps,pheno,G0, G1=None, mixing=0.0, covar=None,sid_list_0=None,sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0=G0
        self.G1_or_none=G1
        self.mixing=mixing
        self.external_log_delta=log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._ran_once = False
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1={5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps,self.pheno,self.covar,self.sid_list_0,self.sid_list_1,
                 self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta, self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
        self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + len(self.intersect)*len(self.just_sid_1) + len(self.intersect) * (len(self.intersect)-1)//2
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply([self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]) #should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None

        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True) #!! what about missing=-9?

        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar)#!! what about missing=-9?

        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only if no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'],np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1] 


        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"

        self._ran_once = True
        

 #start of IDistributable interface--------------------------------------
    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count



    def work_sequence(self):
        self._run_once()

        return self.work_sequence_range(0,self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()

        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])

        for sid0_list, sid1_list in self.pair_block_sequence_range(start,end):
            yield lambda lmm=lmm,sid0_list=sid0_list,sid1_list=sid1_list : self.do_work(lmm,sid0_list,sid1_list)  # the 'lmm=lmm,...' is needed to work around Python's late-binding closures

    def reduce(self, result_sequence):
        #doesn't need "run_once()"

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True) # DataFrame.sort was removed in newer pandas
        frame.index = np.arange(len(frame))

        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)

        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))


    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    #optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):
        #Doesn't need run_once
        return self._str


    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)

        copier.input(self.pheno)
        copier.input(self.covar)

        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)

        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self,copier):
        #Doesn't need run_once
        copier.output(self.output_file_or_none)

 #end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den): #!!move to utils?
        return -(-num//den) #The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats.
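        # e.g. div_ceil(7, 2) == 4 because -(-7 // 2) == -(-4) == 4, while 7 // 2 == 3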
    
    def pair_block_sequence_range(self,block_start,block_end):
        self._run_once()
        assert 0 <= block_start <= block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index+1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start,end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index+1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    #If start == end, then returns without yielding anything 
    def pair_sequence_range(self,start,end):
        self._run_once()
        assert 0 <= start <= end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"


    def pair_sequence_with_start(self,start):
        self._run_once()

        skip_ref = [start]

        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"


    def combo_distinct(self, distinct_list0, distinct_list1, skip_ref):
        row_count = len(distinct_list0)
        col_count = len(distinct_list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >=0, "real assert"
            return

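        # Consume whole rows from the skip count first; the remainder is the
        # starting column within row_start (cleared after the first row).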
        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >=0, "real assert"

        for row_index in xrange(row_start, row_count):
            sid0 = distinct_list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in xrange(col_start, col_count):
                sid1 = distinct_list1[col_index]
                yield sid0, sid1

    def combo_same(self, sid_list, skip_ref):
        count = len(sid_list)
        full_size = count * (count - 1) // 2 # strict upper triangle: unordered pairs of distinct sids, matching _pair_count
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >=0, "real assert"
            return

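        # Rows 0..r-1 of the strict upper triangle contain count*r - r*(r+1)//2 pairs,
        # so the first partially-skipped row solves r^2 - (2*count-1)*r + 2*skip = 0;
        # the expression below is the floor of the smaller root.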
        row_start = int((-1 + 2*count - np.sqrt(1 - 4*count + 4*count**2 - 8*skip_ref[0]))/2)
        skip_ref[0] = skip_ref[0] - (count*row_start - (row_start*(1 + row_start))//2)
        assert skip_ref[0] >=0, "real assert"

        for row_index in xrange(row_start, count):
            sid0 = sid_list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in xrange(col_start + 1 + row_index, count):
                sid1 = sid_list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1



    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file): # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so will need to normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta  ) #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

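        # find_log_delta works on a per-SNP-normalized kernel; the internal kernel
        # K = G0 G0^T is unnormalized, so delta is rescaled by sid_count here.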
        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))


    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike')
            )
        #!!Is this the only way to set types in a dataframe?
        dataframe['Chr0'] = dataframe['Chr0'].astype(float)
        dataframe['GenDist0'] = dataframe['GenDist0'].astype(float)
        dataframe['ChrPos0'] = dataframe['ChrPos0'].astype(float)
        dataframe['Chr1'] = dataframe['Chr1'].astype(float)
        dataframe['GenDist1'] = dataframe['GenDist1'].astype(float)
        dataframe['ChrPos1'] = dataframe['ChrPos1'].astype(float)
        dataframe['PValue'] = dataframe['PValue'].astype(float)
        dataframe['NullLogLike'] = dataframe['NullLogLike'].astype(float)
        dataframe['AltLogLike'] = dataframe['AltLogLike'].astype(float)


        #Alternative approach (kept for reference): reads and dot-products ~50% more but copies less; measured about the same speed
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:,sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        products = snps_read.val[:,sid0_index_list] * snps_read.val[:,sid1_index_list] # in the products matrix, each column i is the elementwise product of sid i in each list
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if (k<N):
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None
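        # When the kernel is low rank (k < N), nLLeval also needs the part of X
        # orthogonal to the span of U (UUX = X - U U^T X); at full rank it is unused.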

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index]) #index to product
            index_list = index_list + len(sid_union_index_list) #Shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index,sid1_index]),index_list)) # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1] #Shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]),index_list)) #indexes of the covar

            index_list_less_product = index_list[:-1] #index to everything but the product

            #Null -- the two additive SNPs
            lmm.X = X[:,index_list_less_product]
            lmm.UX = UX[:,index_list_less_product]
            if (k<N):
                lmm.UUX = UUX[:,index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            #Alt -- now with the product feature
            lmm.X = X[:,index_list]
            lmm.UX = UX[:,index_list]
            if (k<N):
                lmm.UUX = UUX[:,index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

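            # Likelihood-ratio test of the interaction term:
            # 2*(ll_alt - ll_null) is asymptotically chi-squared with 1 degree of freedom.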
            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0,sid1,ll_null,ll_alt,pvalue))

            dataframe.iloc[pair_index] = [
                 sid0, snps_read.pos[sid0_index,0],  snps_read.pos[sid0_index,1], snps_read.pos[sid0_index,2],
                 sid1, snps_read.pos[sid1_index,0],  snps_read.pos[sid1_index,1], snps_read.pos[sid1_index,2],
                 pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count,self.do_pair_time-start))

        return dataframe
Exemplo n.º 38
0
from pysnptools.snpreader import Bed
snpreader = Bed("all.bed") # creates a reader; no SNP data is loaded yet

# Find out about iids and sids
print snpreader.iid_count
print snpreader.sid_count
print snpreader.iid[:3]
print snpreader.sid[:3]
#500
#5000
#[['cid0P0' 'cid0P0']
# ['cid1P0' 'cid1P0']
# ['cid2P0' 'cid2P0']]
#['snp625_m0_.03m1_.07' 'snp1750_m0_.02m1_.04' 'snp0_m0_.37m1_.24']

#Read all the SNP data in to memory
snpdata = snpreader.read()
#What is snpdata?
# SnpData(Bed("all.bed"))

#What do the iids and sids of snpdata look like?
print snpdata.iid_count, snpdata.sid_count
print snpdata.iid[:3]
print snpdata.sid[:3]
# The same.

# print the SNP data
print snpdata.val
#[[ 2.  2.  1. ...,  2.  1.  2.]
# [ 2.  2.  1. ...,  2.  0.  2.]
# [ 2.  2.  1. ...,  1.  1.  1.]
# ...,
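
# A possible extra step (not in the original snippet): readers can be sliced
# before reading, so only the requested window is loaded from disk, e.g.
#   snpdata_subset = snpreader[:10, 100:200].read()  # first 10 iids x 100 sids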
Exemplo n.º 39
0
def imputation_test(
        chromosomes,
        imputed_prefix='outputs/parent_imputed_chr',
        expected_prefix="../UKBioRDE_revision/data/tmp/filtered_ukb_chr",
        start=None,
        end=None):
    #Data files for chromosome i should be named in this fashion: "prefix{i}"
    chromosomes_expected_genes_o = []
    chromosomes_expected_genes_pm = []
    chromosomes_imputed_genes_o = []
    chromosomes_imputed_genes_pm = []
    for chromosome in chromosomes:
        with h5py.File(imputed_prefix + str(chromosome) + ".hdf5", 'r') as f:
            gts = np.array(f["imputed_par_gts"])
            fids = np.array(f["families"]).astype(str)
            parental_status = np.array(f["parental_status"])
            ped_array = np.array(f["pedigree"]).astype(str)
            ped = pd.DataFrame(ped_array[1:], columns=ped_array[0])
        expected = Bed(expected_prefix + str(chromosome) + ".bed",
                       count_A1=True)
        if start is not None and end is not None:
            expected_gts = expected[:, start:end].read().val
        else:
            expected_gts = expected.read().val
        expected_ids = expected.iid
        iid_to_bed_index = {
            i: index
            for index, i in enumerate(expected_ids[:, 1])
        }
        #fids of control families start with _
        #these have the prefix _*_
        index_of_families_in_imputation = {
            fid: index
            for index, fid in enumerate(fids)
        }
        # no parent control starts with _o_
        # only has father control starts with _p_
        # only has mother control starts with _m_
        control_o_families = list({
            row["FID"][3:]
            for index, row in ped.iterrows() if row["FID"].startswith("_o_")
        })
        #for each family select id of the parents
        parent_ids = ped.groupby("FID").agg({
            'FATHER_ID':
            lambda x:
            ([a for a in list(x) if a in ped["IID"].tolist()] + [None])[0],
            'MOTHER_ID':
            lambda x:
            ([a for a in list(x) if a in ped["IID"].tolist()] + [None])[0],
        })
        parents_of_control_o_families = parent_ids.loc[control_o_families]
        mother_indexes_control_o = [
            iid_to_bed_index[parents_of_control_o_families.loc[i, "MOTHER_ID"]]
            for i in control_o_families
        ]
        father_indexes_control_o = [
            iid_to_bed_index[parents_of_control_o_families.loc[i, "FATHER_ID"]]
            for i in control_o_families
        ]
        expected_parent_gts_control_o = (
            expected_gts[mother_indexes_control_o, :] +
            expected_gts[father_indexes_control_o, :]) / 2
        expected_genes_o = expected_parent_gts_control_o.reshape((1, -1))
        index_of_control_families_in_imputation_o = [
            index_of_families_in_imputation["_o_" + i]
            for i in control_o_families
        ]
        imputed_genes_o = gts[
            index_of_control_families_in_imputation_o, :].reshape((1, -1))
        mask_o = ~(np.isnan(expected_genes_o) | np.isnan(imputed_genes_o))
        expected_genes_o = expected_genes_o[mask_o]
        imputed_genes_o = imputed_genes_o[mask_o]
        control_p = list({
            row["FID"][3:]
            for index, row in ped.iterrows() if row["FID"].startswith("_p_")
        })
        control_m = list({
            row["FID"][3:]
            for index, row in ped.iterrows() if row["FID"].startswith("_m_")
        })
        control_pm_families = control_p + control_m
        parent_of_control_m = parent_ids.loc[control_m]
        parent_of_control_p = parent_ids.loc[control_p]
        father_indexes_control_m = [
            iid_to_bed_index[parent_of_control_m.loc[i, "FATHER_ID"]]
            for i in control_m
        ]
        mother_indexes_control_p = [
            iid_to_bed_index[parent_of_control_p.loc[i, "MOTHER_ID"]]
            for i in control_p
        ]
        expected_parent_gts_control_pm = expected_gts[
            mother_indexes_control_p + father_indexes_control_m, :]
        expected_genes_pm = expected_parent_gts_control_pm.reshape((1, -1))
        index_of_control_families_in_imputation_pm = [
            index_of_families_in_imputation["_p_" + i] for i in control_p
        ] + [index_of_families_in_imputation["_m_" + i] for i in control_m]
        imputed_genes_pm = gts[
            index_of_control_families_in_imputation_pm, :].reshape((1, -1))
        mask_pm = ~(np.isnan(expected_genes_pm) | np.isnan(imputed_genes_pm))
        expected_genes_pm = expected_genes_pm[mask_pm]
        imputed_genes_pm = imputed_genes_pm[mask_pm]
        chromosomes_expected_genes_o.append(expected_genes_o)
        chromosomes_expected_genes_pm.append(expected_genes_pm)
        chromosomes_imputed_genes_o.append(imputed_genes_o)
        chromosomes_imputed_genes_pm.append(imputed_genes_pm)

    whole_expected_genes_o = np.concatenate(chromosomes_expected_genes_o)
    whole_imputed_genes_o = np.concatenate(chromosomes_imputed_genes_o)
    whole_expected_genes_pm = np.concatenate(chromosomes_expected_genes_pm)
    whole_imputed_genes_pm = np.concatenate(chromosomes_imputed_genes_pm)

    covs_o = np.cov(whole_expected_genes_o, whole_imputed_genes_o)
    coef_o = covs_o[0, 1] / covs_o[1, 1]
    residual_var_o = np.var(whole_expected_genes_o -
                            coef_o * whole_imputed_genes_o)
    s2_o = residual_var_o / (len(control_o_families) * 22 * 2 * covs_o[1, 1])
    z_o = (1 - coef_o) / np.sqrt(s2_o)
    q_o = norm.cdf(z_o)
    p_value_o = min(q_o, 1 - q_o)
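    # coef_o is the slope of regressing expected on imputed genotypes (Cov/Var);
    # z_o tests the null hypothesis that the slope equals 1.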

    covs_pm = np.cov(whole_expected_genes_pm, whole_imputed_genes_pm)
    coef_pm = covs_pm[0, 1] / covs_pm[1, 1]
    residual_var_pm = np.var(whole_expected_genes_pm -
                             coef_pm * whole_imputed_genes_pm)
    s2_pm = residual_var_pm / (len(control_pm_families) * 22 * 2 *
                               covs_pm[1, 1])
    z_pm = (1 - coef_pm) / np.sqrt(s2_pm)
    q_pm = norm.cdf(z_pm)
    p_value_pm = min(q_pm, 1 - q_pm)
    print(covs_pm, coef_pm, z_pm, p_value_pm)

    #TODO compute z correctly(find the correct sd)
    return (coef_o, coef_pm), (z_o, z_pm), (p_value_o, p_value_pm)
Exemplo n.º 40
0
 def test_c_reader_bed_count_A1(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata",
                     count_A1=True)
     snpdata = snpreader.read()
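     # count_A1=True counts A1 alleles, the mirror of the count_A1=False coding,
     # so 2 - val converts back for comparison by the shared checker.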
     snpdata.val = 2 - snpdata.val
     self.c_reader(snpdata)
Exemplo n.º 41
0
from __future__ import print_function
import numpy as np
from pysnptools.snpreader import Bed

data_dir = '/groups/price/hilary/ibd/data'
bedfile = data_dir+'/1000G.EUR.QC.22'
outfile = bedfile+'.f2snps'

bed = Bed(bedfile)
x = bed.read()
b = np.array([sum(x.val[:,i]) in [2,976] and 1 in x.val[:,i] for i in range(len(x.sid))])
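# Keep 'f2' SNPs: allele count 2 carried as two heterozygotes ('1 in' rules out a
# single homozygote); 976 presumably catches the flipped coding, i.e. 2*N - 2.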
f2snps = x.sid[b]
print('\n'.join(f2snps), file = open(outfile,'w'))
Exemplo n.º 42
0

# In[5]:


test_stat = pd.read_csv('Outputs/Fast-Lmm-Cache/Test-Stat-Cache.txt', header=None)
test_stat = test_stat.replace('[\[\] ]', '', regex=True)
test_stat = pd.to_numeric(test_stat[0])

results_df['Full ID'] = results_df['Chr'].astype('str') + '_' + results_df['ChrPos'].astype('str')
results_df = pd.concat([results_df[['Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue']], test_stat],
                       axis = 1)
results_df.columns = ['Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue', 'F-test statistic']

mybed = Bed(VARIANTS_TO_TEST + '.bed')
mysnpdata = mybed.read()

print 'Time (s): ' + str(time.clock()-start)


# In[6]:


pheno = _pheno_fixup(PHENOTYPE_DATA, count_A1=None).read()
pheno = pheno.val[np.searchsorted(pheno.iid[:,1], mysnpdata.iid[:,1])]
snpdata = mysnpdata.val
diff = range(snpdata.shape[1])
maf = range(snpdata.shape[1])
n_alleles = range(snpdata.shape[1])
mean_major = range(snpdata.shape[1])
for i in range(snpdata.shape[1]):
Exemplo n.º 43
0
__author__ = 'Haohan Wang'

from pysnptools.snpreader import Bed
import numpy as np

snp_on_disk = Bed('../data/ANDI.bed', count_A1=False)

snps = snp_on_disk.read()

np.save('../result/sampleID', snps.iid)

sid = snps.sid

markers = [line.strip() for line in open('../commonData/markers.txt')]

mdic = {}
for m in markers:
    mdic[m] = 0

idx = []
for i in range(len(sid)):
    if sid[i] in mdic:
        idx.append(i)
idx = np.array(idx)
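
# Equivalent vectorized selection (assuming a numpy with np.in1d):
# idx = np.flatnonzero(np.in1d(sid, markers))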

data = snps.val[:, idx]

# print data.shape

# print snps.sid
Exemplo n.º 44
0
class LeaveTwoChrOutSimulation():

    def __init__(self, snp_fn, out_prefix):


        self.force_recompute = False

        #self.base_path = base_path
        self.snp_fn = snp_fn

        from pysnptools.snpreader import Bed
        self.snp_reader = Bed(snp_fn)
        
        self.eigen_fn = self.snp_fn + "_pcs.pickle"

        self.out_prefix = out_prefix


    def precompute_pca(self):
        """
        compute pcs
        """

        logging.info("computing PCA on train set")
        t0 = time.time()
        
        if not os.path.isfile(self.eigen_fn) or self.force_recompute:

            G = self.snp_reader.read(order='C').standardize().val
            G.flags.writeable = False
            chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(self.snp_reader.pos)

            G_train = G.take(rest_idx, axis=1)
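            # Leave-two-chr-out: PCs are fit only on chromosomes other than 1 and 2,
            # keeping them out of sample for the held-out test chromosomes.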

            from sklearn.decomposition import PCA
            pca = PCA()
            pcs = pca.fit_transform(G_train)

            logging.info("saving eigendecomp to file %s" % self.eigen_fn)
            
            eig_dec = {"pcs": pcs}
            save(self.eigen_fn, eig_dec)


            logging.info("time taken for pc computation: " + str(time.time()-t0))
        else:
            logging.info("pc file already exists: %s" % (self.eigen_fn))


    def run(self, methods, num_causal, num_repeats, num_pcs, description, runner, seed=None, plot_fn=None):
        
        
        self.precompute_pca()

        input_files = [self.snp_fn + ext for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
        input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal, num_pcs, seed, sim_id) for sim_id in range(num_repeats)]
        output_list = distributed_map.d_map(semisynth_simulations.compute_core, input_args, runner, input_files=input_files)

        ############################################
        results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
        reduced_results_fn = results_fn.replace("runs", "reduced.runs")

        save(results_fn, output_list)

        
        methods = output_list[0][0].keys()
        arg_list = [(method, results_fn) for method in methods]

        #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
        reduce_runner = Local()
        combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list, reduce_runner, input_files=[results_fn])
        
        save(reduced_results_fn, combine_output)
        title = "%i causal, %i repeats" % (num_causal, num_repeats)
        visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)

        return combine_output
Exemplo n.º 45
0
    def test_generate_and_regress(self):
        #requires plink
        number_of_snps = 1000
        min_f = 0.05
        number_of_families = 100
        filename = "outputs/tmp/generated"
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        start = 0
        interval = 0
        #generating population
        os.system('python example/simulate_pop.py ' + str(number_of_snps) +
                  ' ' + str(min_f) + ' ' + str(number_of_families) +
                  ' 0 0 0 "outputs/tmp/generated"')
        # Adding header to the pedigree file
        os.system(
            'echo -e "FID IID FATHER_ID MOTHER_ID\n$(cat outputs/tmp/generated_fams.ped)" > outputs/tmp/generated_fams.ped'
        )
        #convert the generated data to a bed file
        os.system(
            'plink/plink --noweb --file outputs/tmp/generated --make-bed --out outputs/tmp/generated'
        )
        columns = ["FID", "IID", "FATHER_ID", "MOTHER_ID", "sex", "phenotype"
                   ] + ["genotype_" + str(i) for i in range(number_of_snps)]
        ped = pd.read_csv("outputs/tmp/generated.ped",
                          sep=" ",
                          header=None,
                          names=columns)
        ped = ped[["FID", "IID", "FATHER_ID", "MOTHER_ID"]].astype(str)
        only_remove_father_ids = [
            str(i) + "_P" for i in range(number_of_families // 4)
        ]
        only_remove_mother_ids = [
            str(i) + "_M"
            for i in range(number_of_families // 4, number_of_families // 2)
        ]
        remove_both_parents_ids = [
            str(i) + "_M"
            for i in range(number_of_families // 2, number_of_families)
        ] + [
            str(i) + "_P"
            for i in range(number_of_families // 2, number_of_families)
        ]
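        # Control design: fathers are removed from the first quarter of families,
        # mothers from the second quarter, and both parents from the second half.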
        parents = ped[ped["IID"].str.endswith("_P")
                      | ped["IID"].str.endswith("_M")]
        sibs = ped[~ped["IID"].isin(only_remove_father_ids +
                                    only_remove_mother_ids +
                                    remove_both_parents_ids)]
        sibs.to_csv("outputs/tmp/generated_sibs.ped", sep=" ")
        parents.to_csv("outputs/tmp/generated_parents.ped", sep=" ")
        with open("outputs/tmp/generated_sibs.txt", "w") as f:
            for i, j in sibs[["FID", "IID"]].values.tolist():
                f.write(str(i) + " " + str(j) + "\n")

        with open("outputs/tmp/generated_parents.txt", "w") as f:
            for i, j in ped[["FID", "IID"]].values.tolist():
                if j.endswith("_P") or j.endswith("_M"):
                    f.write(str(i) + " " + str(j) + "\n")
        #writing sibs only
        os.system(
            'plink/plink --noweb --bfile outputs/tmp/generated --keep outputs/tmp/generated_sibs.txt --make-bed --out outputs/tmp/generated_sibs'
        )
        #writing parents only
        os.system(
            'plink/plink --noweb --bfile outputs/tmp/generated --keep outputs/tmp/generated_parents.txt --make-bed --out outputs/tmp/generated_parents'
        )
        ibd = pd.read_csv("outputs/tmp/generated.segments.gz", sep="\t")
        sibships, iid_to_bed_index, gts, ibd, pos, chromosomes, hdf5_output_dict = prepare_data(
            sibs, "outputs/tmp/generated_sibs", ibd)
        gts = gts.astype(float)
        pos = pos.astype(int)
        imputed_fids, imputed_par_gts = impute(sibships,
                                               iid_to_bed_index,
                                               gts,
                                               ibd,
                                               pos,
                                               hdf5_output_dict,
                                               str(chromosomes),
                                               threads=2)
        expected_parents = Bed("outputs/tmp/generated_parents.bed",
                               count_A1=True)
        expected_parents_gts = expected_parents.read().val
        expected_parents_ids = expected_parents.iid
        father = expected_parents_gts[[
            bool(i % 2) for i in range(2 * number_of_families)
        ]]
        father = father[[int(t) for t in imputed_fids], :]
        mother = expected_parents_gts[[
            not bool(i % 2) for i in range(2 * number_of_families)
        ]]
        mother = mother[[int(t) for t in imputed_fids], :]
        expected_parents = np.zeros(imputed_par_gts.shape)
        no_parent = ~sibships["has_father"] & ~sibships["has_mother"]
        only_mother = ~sibships["has_father"] & sibships["has_mother"]
        only_father = ~sibships["has_mother"] & sibships["has_father"]
        expected_parents[no_parent] = (mother[no_parent] +
                                       father[no_parent]) / 2
        expected_parents[only_mother] = mother[only_mother]
        expected_parents[only_father] = father[only_father]
        expected_genotypes = expected_parents.reshape((1, -1))
        imputed_genotypes = imputed_par_gts.reshape((1, -1))
        covs = np.cov(expected_genotypes, imputed_genotypes)
        coef = covs[0, 1] / covs[1, 1]
        residual_var = np.var(expected_genotypes - coef * imputed_genotypes)
        s2 = residual_var / (number_of_snps * covs[1, 1])
        #TODO should I divide by number_of_snps*covs[1,1]*number_of_families?
        z = (1 - coef) / np.sqrt(s2)
        q = norm.cdf(z)
        p_val = min(q, 1 - q)
        self.assertGreaterEqual(p_val, 0.01)
Exemplo n.º 46
0
 def test_p_reader_bed_count_A1(self):
     snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=True)
     snpdata = snpreader.read(force_python_only=True)
     snpdata.val = 2 - snpdata.val
     self.c_reader(snpdata)
Exemplo n.º 48
0
 def test_hdf5_case3(self):
     snpreader1 = SnpHdf5(self.currentFolder + "/examples/toydata.snpmajor.snp.hdf5")[::2,:]
     snpreader2 = Bed(self.currentFolder + "/examples/toydata",count_A1=False)[::2,:]
     self.assertTrue(np.allclose(snpreader1.read().val, snpreader2.read().val, rtol=1e-05, atol=1e-05))
Exemplo n.º 50
0
if args.indf is not None:
	assert (args.e != 0), "Specify number of eigenvectors used to estimate allele frequencies!"

# Parse Beagle file
if args.plink is None:
	print "Parsing Beagle file"
	likeMatrix = pd.read_csv(str(args.beagle), sep="\t", engine="c", header=0, usecols=range(3, 3 + 3*args.n), dtype=np.float32, compression="gzip")
	likeMatrix = likeMatrix.values.T # .as_matrix() was removed in newer pandas
else:
	chunk_N = int(np.ceil(float(args.n)/args.threads))
	chunks = [i * chunk_N for i in xrange(args.threads)]
	print "Parsing PLINK files"
	from pysnptools.snpreader import Bed # Import Microsoft Genomics PLINK reader
	snpClass = Bed(args.plink, count_A1=True)
	pos = np.copy(snpClass.sid)
	snpFile = snpClass.read(dtype=np.float32) # Read PLINK files into memory
	f = np.nanmean(snpFile.val, axis=0, dtype=np.float64)/2
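	# Genotypes are 0/1/2 copies of the counted allele, so the per-SNP mean
	# divided by 2 estimates the allele frequency (NaNs ignored).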
	likeMatrix = np.zeros((3*args.n, snpFile.val.shape[1]), dtype=np.float32)
	print "Converting PLINK files into genotype likelihood matrix"

	# Multithreading
	threads = [threading.Thread(target=convertPlink, args=(likeMatrix, snpFile.val, chunk, chunk_N, args.epsilon)) for chunk in chunks]
	for thread in threads:
		thread.start()
	for thread in threads:
		thread.join()

	del snpClass, snpFile

##### Estimate population allele frequencies #####
if args.plink is None: