Example #1
    def test_match_cpp(self):
        '''
        match
            FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

        '''
        logging.info("TestSingleSnp test_match_cpp")
        snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
        pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
        sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
        test_idx = snps.sid_to_index(test_sid)

        for G0,G1 in [(snps[:,sim_idx],KernelIdentity(snps.iid)),(KernelIdentity(snps.iid),snps[:,sim_idx])]:
            frame_h2 = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,h2=.5,leave_out_one_chrom=False,count_A1=False)
            frame_log_delta = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,log_delta=0,leave_out_one_chrom=False,count_A1=False)
            for frame in [frame_h2, frame_log_delta]:
                referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
                reference = pd.read_table(referenceOutfile,sep="\t") # We've manually removed all comments and blank lines from this file
                assert len(frame) == len(reference)
                for _, row in reference.iterrows():
                    sid = row.SNP
                    pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                    reldiff = abs(row.Pvalue - pvalue)/row.Pvalue
                assert reldiff < .035, "'{0}' pvalue differs too much: {1} -- {2} vs {3}".format(sid,reldiff,row.Pvalue,pvalue)
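
The test above leans on pysnptools' lazy access pattern: Bed() only indexes the files on disk, sid_to_index() maps SNP ids to column positions, and slicing produces a view that is not read until .read() is called. A minimal sketch of that pattern (reusing the dataset path from the test):

    from pysnptools.snpreader import Bed

    snps = Bed("tests/datasets/selecttest/snps", count_A1=False)  # metadata only; genotypes stay on disk
    idx = snps.sid_to_index(["snp26250_m0_.19m1_.19"])            # SNP ids -> column indices
    subset = snps[:, idx]                                         # lazy view over the selected columns
    values = subset.read().val                                    # NumPy array, shape (iid_count, len(idx))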
Example #2
def main(args):
    print('reading seed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices,2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        # find first typed snp after query snp
        snp_bp = data.col_property[i,2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices)-1

        n1, n2 = np.where(X[:,i] == 1)[0]
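        # NB: assumes exactly two individuals are heterozygous at the seed SNP; the unpacking above fails otherwise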
        if (X[n1,typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(
                X[n1,typed_snps_indices],
                X[n2,typed_snps_indices],
                typed_i)
        typed_ir -= 1

        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - \
                data.col_property[il, 1]
        ibd = (np.mean(X[n1,il:ir] == X[n2,il:ir]) > 0.99)
        return cM, int(ibd)

    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
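
fis.find_boundaries is project-specific and not shown in this listing. Judging from its call above (it receives the two carriers' typed genotypes plus a start index, and the caller treats the right bound as exclusive), a hedged sketch of its likely contract is:

    def find_boundaries(g1, g2, start):
        # g1, g2: 0/1/2 genotype vectors at the typed SNPs for the two carriers.
        # IBS is broken where the pair is opposite-homozygous, i.e. (g1 - g2)**2 == 4.
        opposite = (g1 - g2) ** 2 == 4
        left = start
        while left > 0 and not opposite[left - 1]:
            left -= 1
        right = start
        while right < len(g1) - 1 and not opposite[right + 1]:
            right += 1
        return left, right + 1  # exclusive right bound, matching the caller's typed_ir -= 1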
Example #3
 def __init__(self,args):
     if args.window_type not in ['BP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile) #
     af1 = self.get_allele_frequency(bed_1,args) #
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1>args.maf)&(af1<1-args.maf) #
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract,'r')])
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) #
     pos = bed_1.pos[bed_1_index] #
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af = af1[bed_1_index] #
     if args.afile is not None:
         a1 =  pd.read_table(args.afile,header=None,sep='\s*',
                             names=['id1','id2','theta'])
     else:
         a1 = None
     self.af = af
     self.M = len(bed_1_index) #
     self.windows = self.get_windows(pos,args) #
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1,bed_1_index,af,a1,args) #
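
get_allele_frequency is defined elsewhere in this class. A minimal version consistent with its use here (per-SNP allele frequency of a 0/1/2 genotype matrix, read in blocks to bound memory; the block size is a hypothetical choice and args is unused in this sketch) could look like:

    def get_allele_frequency(self, bed, args):
        af = np.zeros(bed.sid_count)
        block = 10000  # hypothetical block size
        for start in range(0, bed.sid_count, block):
            val = bed[:, start:start + block].read().val  # genotypes coded 0/1/2, NaN when missing
            af[start:start + block] = np.nanmean(val, axis=0) / 2.0
        return af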
Example #4
 def __init__(self,args):
     if args.window_type not in ['KBP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile,count_A1=False) #
     af1 = self.get_allele_frequency(bed_1,args) #
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1>args.maf)&(af1<1-args.maf) #
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract,'r')])
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) #
     pos = bed_1.pos[bed_1_index] #
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af = af1[bed_1_index] #
     # if args.afile is not None:
     #     a1 =  pd.read_table(args.afile,header=None,sep='\s*',
     #                         names=['id1','id2','theta'])
     # else:
     a1 = None
     self.af = af
     self.M = len(bed_1_index) #
     self.windows = self.get_windows(pos,args) #
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1,bed_1_index,af,a1,args) #
Example #5
 def __init__(self, args):
     if args.window_type not in ['KBP', 'SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile, count_A1=False)  #
     af1 = self.get_allele_frequency(bed_1, args)  #
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)  #
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     # Omit SNPs with NA values for h2weight
     if args.h2weight:
         data = pd.read_table(args.bfile + '.h2weight',
                              header=None,
                              names=['SNP', 'h2weight'],
                              index_col=False)
         if (len(data['SNP']) != len(bed_1.sid)
                 or not (data['SNP'] == bed_1.sid).all()):
             raise ValueError(
                 'SNPs disagree between bed/bim/fam and h2weight files')
         h2weight = data['h2weight']
         snps_1 = snps_1 & ~h2weight.isnull().values
         print(np.sum(snps_1),
               "SNPs in file 1 after extracting non-NA h2weight")
         del (data)
     if (args.from_bp is not None) and (args.to_bp is not None):
         k = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] <
                                                 args.to_bp)
         snps_1 = snps_1 & k
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract, 'r')])
         snps_to_use = np.intersect1d(snps_to_use, keep)
         print(len(snps_to_use), "SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))  #
     pos = bed_1.pos[bed_1_index]  #
     bim_1 = pd.read_table(
         bed_1.filename + '.bim',
         header=None,
         names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
     af = af1[bed_1_index]  #
     # if args.afile is not None:
     #     a1 =  pd.read_table(args.afile,header=None,sep='\s*',
     #                         names=['id1','id2','theta'])
     # else:
     a1 = None
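     # h2weight is bound only if args.h2weight was set above; NameError selects the no-weight default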
     try:
         h2weight = h2weight[bed_1_index].values
     except NameError:
         h2weight = None
     self.af = af
     self.M = len(bed_1_index)  #
     self.windows = self.get_windows(pos, args)  #
     self.chr = pos[:, 0]
     self.pos = pos[:, 2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1, bed_1_index, af, a1, h2weight,
                                args)  #
Example #6
    def test_match_cpp(self):
        '''
        match
            FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

        '''
        logging.info("TestEpistasis test_match_cpp")
        from pysnptools.snpreader import Bed
        snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
        pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
        sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
        test_idx = snps.sid_to_index(test_sid)

        frame = epistasis(snps[:,test_idx], pheno,covar=covar, G0 = snps[:,sim_idx],log_delta=0)
        sid0,sid1,pvalue_list =np.array(frame['SNP0']),np.array(frame['SNP1']),np.array(frame['PValue'])

        referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")

        import pandas as pd
        table = pd.read_table(referenceOutfile,sep="\t") # We've manually removed all comments and blank lines from this file
        assert len(pvalue_list) == len(table)
        for row in table.iterrows():
            snp0cpp,snp1cpp,pvaluecpp,i1,i2 = row[1]
            found = False
            for i in range(len(pvalue_list)):
                pvaluepy = pvalue_list[i]
                snp0py = sid0[i]
                snp1py = sid1[i]
                if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                    found = True
                    diff = abs(pvaluecpp - pvaluepy)/pvaluecpp
                    assert diff < .035, "'{0}' '{1}' pvalues differ too much: {4} -- {2} vs {3}".format(snp0cpp,snp1cpp,pvaluecpp,pvaluepy,diff)
                    break
            assert found
Example #7
class _Epistasis(object) : #implements IDistributable

    def __init__(self,test_snps,pheno,G0, G1=None, mixing=0.0, covar=None,sid_list_0=None,sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0=G0
        self.G1_or_none=G1
        self.mixing=mixing
        self.external_log_delta=log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._ran_once = False
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1{5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps,self.pheno,self.covar,self.sid_list_0,self.sid_list_1,
                 self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta, self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
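        # pair count = |just0|*|intersect| + |just0|*|just1| + |intersect|*|just1| + C(|intersect|, 2)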
        self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + len(self.intersect)*len(self.just_sid_1) + len(self.intersect) * (len(self.intersect)-1)//2
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply([self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]) #should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None

        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True) #!! what about missing=-9?

        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar)#!! what about missing=-9?

        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only if no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'],np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1] 


        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"

        self._ran_once = True
        

 #start of IDistributable interface--------------------------------------
    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count



    def work_sequence(self):
        self._run_once()

        return self.work_sequence_range(0,self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()

        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])

        for sid0_list, sid1_list in self.pair_block_sequence_range(start,end):
            yield lambda lmm=lmm,sid0_list=sid0_list,sid1_list=sid1_list : self.do_work(lmm,sid0_list,sid1_list)  # the 'lmm=lmm,...' defaults are needed because Python closures bind loop variables late

    def reduce(self, result_sequence):
        #doesn't need "run_once()"

        frame = pd.concat(result_sequence)
        frame.sort("PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)

        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))


    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    #optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):
        #Doesn't need run_once
        return self._str


    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)

        copier.input(self.pheno)
        copier.input(self.covar)

        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)

        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self,copier):
        #Doesn't need run_once
        copier.output(self.output_file_or_none)

 #end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den): #!!move to utils?
        return -(-num//den) #The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats.
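        # e.g. div_ceil(7, 2) == 4: -(-7 // 2) == -(-4) == 4, since floor division rounds -3.5 down to -4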
    
    def pair_block_sequence_range(self,block_start,block_end):
        self._run_once()
        assert 0 <= block_start <= block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index+1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start,end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index+1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    #If start == end, then returns without yielding anything 
    def pair_sequence_range(self,start,end):
        self._run_once()
        assert 0 <= start <= end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"


    def pair_sequence_with_start(self,start):
        self._run_once()

        skip_ref = [start]

        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"


    def combo_distinct(self, distinct__list0, distinct__list1, skip_ref):
        row_count = len(distinct__list0)
        col_count = len(distinct__list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >=0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >=0, "real assert"

        for row_index in range(row_start, row_count):
            sid0 = distinct__list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start, col_count):
                sid1 = distinct__list1[col_index]
                yield sid0, sid1

    def combo_same(self, list, skip_ref):
        count = len(list)
        full_size = count * (count + 1) // 2
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >=0, "real assert"
            return

        row_start = int((-1 + 2*count - np.sqrt(1 - 4*count + 4*count**2 - 8*skip_ref[0]))/2)
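        # row_start inverts the running triangular offset: the largest r with count*r - r*(r+1)//2 <= skip_ref[0]
        # (the smaller root of r**2 - (2*count - 1)*r + 2*skip = 0, solved by the quadratic formula above)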
        skip_ref[0] = skip_ref[0] - (count*row_start - (row_start*(1 + row_start))//2)
        assert skip_ref[0] >=0, "real assert"

        for row_index in range(row_start, count):
            sid0 = list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start + 1 + row_index, count):
                sid1 = list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1



    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file): # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so will need to normalized G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta  ) #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))


    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike')
            )
        #!!Is this the only way to set types in a dataframe?
        dataframe['Chr0'] = dataframe['Chr0'].astype(float)
        dataframe['GenDist0'] = dataframe['GenDist0'].astype(float)
        dataframe['ChrPos0'] = dataframe['ChrPos0'].astype(float)
        dataframe['Chr1'] = dataframe['Chr1'].astype(float)
        dataframe['GenDist1'] = dataframe['GenDist1'].astype(float)
        dataframe['ChrPos1'] = dataframe['ChrPos1'].astype(float)
        dataframe['PValue'] = dataframe['PValue'].astype(float)
        dataframe['NullLogLike'] = dataframe['NullLogLike'].astype(float)
        dataframe['AltLogLike'] = dataframe['AltLogLike'].astype(float)


        #This is some of the code for a different way that reads and dot-products 50% more, but does less copying. Seems about the same speed
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:,sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        products = snps_read.val[:,sid0_index_list] * snps_read.val[:,sid1_index_list] # in the products matrix, each column i is the elementwise product of sid i in each list
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if (k<N):
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index]) #index to product
            index_list = index_list + len(sid_union_index_list) #Shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index,sid1_index]),index_list)) # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1] #Shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]),index_list)) #indexes of the covar

            index_list_less_product = index_list[:-1] #index to everything but the product

            #Null -- the two additive SNPs
            lmm.X = X[:,index_list_less_product]
            lmm.UX = UX[:,index_list_less_product]
            if (k<N):
                lmm.UUX = UUX[:,index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            #Alt -- now with the product feature
            lmm.X = X[:,index_list]
            lmm.UX = UX[:,index_list]
            if (k<N):
                lmm.UUX = UUX[:,index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0,sid1,ll_null,ll_alt,pvalue))

            dataframe.iloc[pair_index] = [
                 sid0, snps_read.pos[sid0_index,0],  snps_read.pos[sid0_index,1], snps_read.pos[sid0_index,2],
                 sid1, snps_read.pos[sid1_index,0],  snps_read.pos[sid1_index,1], snps_read.pos[sid1_index,2],
                 pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count,self.do_pair_time-start))

        return dataframe
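
Everything needed to run the class serially is already defined above. A hedged driver sketch using only the methods in this listing (file names hypothetical):

    ep = _Epistasis(test_snps="snps", pheno="pheno.txt", G0="snps",
                    covar="covariate.txt", log_delta=0, output_file="topsnps.pairs.txt")
    ep.fill_in_cache_file()                            # cache U and S, resolve the internal delta
    results = (work() for work in ep.work_sequence())  # each item is a closure returning a DataFrame
    frame = ep.reduce(results)                         # concatenated, sorted by PValue, written to output_file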
Example #8
 def __init__(self,args):
     if args.window_type not in ['BP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile1) #
     bed_2 = Bed(args.bfile2)
     af1 = self.get_allele_frequency(bed_1,args) #
     af2 = self.get_allele_frequency(bed_2,args)
     print(len(af1), "SNPs in file 1")
     print(len(af2), "SNPs in file 2")
     snps_1 = (af1>args.maf)&(af1<1-args.maf) #
     snps_2 = (af2>args.maf)&(af2<1-args.maf)
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     print(np.sum(snps_2), "SNPs in file 2 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k1 = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         k2 = (bed_2.pos[:,2]>args.from_bp)&(bed_2.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k1
         snps_2 = snps_2&k2
     snps_to_use = np.intersect1d(bed_1.sid[snps_1],bed_2.sid[snps_2])
     print(len(snps_to_use),"SNPs common in both populations")
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract,'r')])
         print(len(keep),"SNPs to extract")
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) #
     bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
     if not args.no_align:
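         # align_alleles presumably returns a +/-1 vector per shared SNP (sign flipped where the two .bim files disagree on allele coding), plus the possibly pruned index arrays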
         alignment,bed_1_index,bed_2_index =\
             self.align_alleles(bed_1,bed_1_index,af1,bed_2,bed_2_index,af2)
     else:
         alignment = np.ones(len(bed_1_index))
     pos = bed_1.pos[bed_1_index] #
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af1 = af1[bed_1_index] #
     af2 = af2[bed_2_index]
     if args.afile1 is not None:
         a1 =  pd.read_table(args.afile1,header=None,sep='\s*',
                             names=['id1','id2','theta'])
     else:
         a1 = None
     if args.afile2 is not None:
         a2 =  pd.read_table(args.afile2,header=None,sep='\s*',
                             names=['id1','id2','theta'])
     else:
         a2 = None
     self.af1 = af1 #
     self.af2 = af2
     self.M = len(bed_1_index) #
     self.N = (bed_1.iid_count, bed_2.iid_count) #
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.windows = self.get_windows(pos,args) #
     self.scores1 = self.compute(bed_1,bed_1_index,af1,a1,args)
     self.scores2 = self.compute(bed_2,bed_2_index,af2,a2,args) #
     self.scoresX = self.compute2(bed_1,bed_1_index,bed_2,bed_2_index,
                                  alignment,a1,a2,args) #
Example #9
    def test_match_cpp(self):
        '''
        match
            FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

        '''
        logging.info("TestSingleSnp test_match_cpp")
        snps = Bed(
            os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
        pheno = os.path.join(self.pythonpath,
                             "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath,
                             "tests/datasets/selecttest/covariate.txt")
        sim_sid = [
            "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28",
            "snp63751_m0_.23m1_.23", "snp48753_m0_.4m1_.4",
            "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
            "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
            "snp11253_m0_.2m1_.2", "snp86250_m0_.33m1_.33",
            "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
            "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19",
            "snp67501_m0_.15m1_.15", "snp63750_m0_.28m1_.28",
            "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
            "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37",
            "snp15002_m0_.11m1_.11", "snp3751_m0_.34m1_.34",
            "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
            "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11",
            "isnp23145_m0_.2m1_.2", "snp60001_m0_.39m1_.39",
            "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
            "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13"
        ]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = [
            "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23",
            "snp82500_m0_.28m1_.28", "snp48753_m0_.4m1_.4",
            "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
            "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
            "snp86250_m0_.33m1_.33", "snp15002_m0_.11m1_.11",
            "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
            "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2",
            "snp67501_m0_.15m1_.15", "snp3753_m0_.23m1_.23",
            "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
            "snp30002_m0_.25m1_.25"
        ]
        test_idx = snps.sid_to_index(test_sid)

        for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)),
                       (KernelIdentity(snps.iid), snps[:, sim_idx])]:
            frame_h2 = single_snp(test_snps=snps[:, test_idx],
                                  pheno=pheno,
                                  G0=G0,
                                  G1=G1,
                                  covar=covar,
                                  h2=.5,
                                  leave_out_one_chrom=False)
            frame_log_delta = single_snp(test_snps=snps[:, test_idx],
                                         pheno=pheno,
                                         G0=G0,
                                         G1=G1,
                                         covar=covar,
                                         log_delta=0,
                                         leave_out_one_chrom=False)
            for frame in [frame_h2, frame_log_delta]:
                referenceOutfile = TestFeatureSelection.reference_file(
                    "single_snp/topsnps.single.txt")
                reference = pd.read_table(
                    referenceOutfile, sep="\t"
                )  # We've manually removed all comments and blank lines from this file
                assert len(frame) == len(reference)
                for _, row in reference.iterrows():
                    sid = row.SNP
                    pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                    reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                    assert reldiff < .035, "'{0}' pvalue differs too much: {1} -- {2} vs {3}".format(
                        sid, reldiff, row.Pvalue, pvalue)
Example #10
def unbalance_condition_longwas_trans(
        data_file,
        id,
        tpoint,
        trait,
        bed_file,
        kin_file,
        var_com,
        condition_snp,
        snp_lst=None,
        tfix=None,
        fix=None,
        forder=3,
        aorder=3,
        porder=3,
        na_method='omit',
        prefix_outfile='unbalance_condition_longwas_trans'):
    """
    Longitudinal GWAS for unbalanced data, treating the SNP as a time-varied random effect and
    conditioning on a given SNP.
    :param data_file: the data file. The first row holds the variate names, each of which must begin
    with a letter. Class variates begin with a capital letter; covariates (continuous variates) begin
    with a lowercase letter.
    :param id: a class variate name indicating the individual id column in the data file.
    :param tpoint: a covariate name indicating the time point column in the data file.
    :param trait: a variate name indicating the analyzed trait column in the data file.
    :param bed_file: the prefix for the plink binary files.
    :param kin_file: the file holding the genomic relationship matrix. It can be produced by the
    gmat.gmatrix.agmat function with agmat(bed_file, inv=True, small_val=0.001, out_fmt='id_id_val').
    :param var_com: the variance parameters estimated by the gmat.longwas.unbalance.unbalance_varcom function.
    :param condition_snp: the SNP to condition on.
    :param snp_lst: the list of SNP indexes to test. Default is None (test all SNPs).
    :param tfix: a class variate name for the time-varied fixed effect. Default is None. Only one
    time-varied fixed effect can be included in the current version.
    :param fix: an expression for the time-independent fixed effects. Default is None. Example:
    fix = "Sex + age + Season".
    :param forder: the order of Legendre polynomials for the time-varied fixed effect. Default is 3.
    :param aorder: the order of Legendre polynomials for the additive genetic effect. Default is 3.
    :param porder: the order of Legendre polynomials for the permanent environment effect. Default is 3.
    :param na_method: how to deal with missing values. Default is 'omit', which deletes rows containing
    missing values; 'include' fills missing values from adjacent rows.
    :param prefix_outfile: the prefix for the output file. Default is 'unbalance_condition_longwas_trans'.
    :return: a pandas data frame holding the test results.
    """
    logging.info('################################')
    logging.info('###Prepare the related matrix###')
    logging.info('################################')
    if var_com.shape[0] != aorder * (aorder + 1) / 2 + aorder + 1 + porder * (
            porder + 1) / 2 + porder + 1 + 1:
        logging.info('ERROR: Variances do not match the data, please check')
        exit()
    logging.info('***Read the data file***')
    logging.info('Data file: ' + data_file)
    data_df = pd.read_csv(data_file, sep='\s+', header=0)
    logging.info('NA method: ' + na_method)
    if na_method == 'omit':
        data_df = data_df.dropna()
    elif na_method == 'include':
        data_df = data_df.fillna(method='ffill')
        data_df = data_df.fillna(method='bfill')
    else:
        logging.info('na_method does not exist: ' + na_method)
        exit()
    col_names = data_df.columns
    logging.info('The column names of data file: ' + ' '.join(list(col_names)))
    logging.info(
        'Note: Variates beginning with a capital letter are converted into factors.'
    )
    class_vec = []
    for val in col_names:
        if not val[0].isalpha():
            logging.info(
                "The first character of column names must be a letter!")
            exit()
        if val[0] == val.capitalize()[0]:
            class_vec.append(val)
            data_df[val] = data_df[val].astype('str')
        else:
            try:
                data_df[val] = data_df[val].astype('float')
            except Exception as e:
                logging.info(e)
                logging.info(val + " may contain string, please check!")
                exit()
    logging.info('Individual column: ' + id)
    if id not in col_names:
        logging.info(id + ' is not in the data file, please check!')
        exit()
    if id not in class_vec:
        logging.info('The initial letter of {} should be capital'.format(id))
        exit()
    id_order = []
    id_arr = list(data_df[id])
    id_order.append(id_arr[0])
    for i in range(1, len(id_arr)):
        if id_arr[i] != id_arr[i - 1]:
            id_order.append(id_arr[i])
    id_in_data = set(data_df[id])
    if len(id_in_data) - len(id_order) != 0:
        logging.info('The data is not sorted by individual ID!')
        exit()
    logging.info('Time points column: ' + tpoint)
    if tpoint not in col_names:
        logging.info(tpoint + ' is not in the data file, please check!')
        exit()
    if tpoint in class_vec:
        logging.info(
            'The initial letter of {} should be lowercase'.format(tpoint))
        exit()
    logging.info('Trait column: ' + trait)
    if trait not in col_names:
        logging.info(trait + ' is not in the data file, please check!')
        exit()
    if trait in class_vec:
        logging.info(
            'The initial letter of {} should be lowercase'.format(trait))
        exit()
    logging.info('Code factor variables of the data file: ' +
                 ' '.join(list(class_vec)))
    code_val = {}
    code_dct = dct_2D()
    for val in class_vec:
        code_val[val] = 0
        temp = []
        for i in range(data_df.shape[0]):
            if data_df[val][i] not in code_dct[val]:
                code_val[val] += 1
                code_dct[val][data_df[val][i]] = str(code_val[val])
            temp.append(code_dct[val][data_df[val][i]])
        data_df[val] = np.array(temp)
    for val in class_vec:
        data_df[val] = data_df[val].astype('int')
    logging.info('***Build the design matrix for fixed effect***')
    logging.info('Time dependent fixed effect: ' + str(tfix))
    leg_fix = leg(data_df[tpoint], forder)
    if tfix is None:
        xmat_t = np.concatenate(leg_fix, axis=1)
        xmat_t = csr_matrix(xmat_t)
    else:
        if tfix not in class_vec:
            logging.info(tfix + ' is not the class variate')
            exit()
        row = np.array(range(data_df.shape[0]))
        col = np.array(data_df[tfix]) - 1
        val = np.array([1.0] * data_df.shape[0])
        tfix_mat = csr_matrix((val, (row, col)))
        xmat_t = []
        for i in range(len(leg_fix)):
            xmat_t.append(tfix_mat.multiply(leg_fix[i]))
        xmat_t = hstack(xmat_t)
        del row, col, val
        gc.collect()
    logging.info('Time independent fixed effect: ' + str(fix))
    xmat_nt = None
    if fix is None:
        xmat_nt = None
    else:
        try:
            fix_exp = ''
            vec = fix.split('+')
            for i in vec:
                val = i.strip()
                if val in class_vec:
                    fix_exp += 'C(' + val + ')'
                else:
                    fix_exp += val
            xmat_nt = dmatrix(fix_exp, data_df)
            logging.info('The expression for fixed effect: ' + fix_exp)
        except Exception as e:
            logging.info('{}: Check the fixed effect expression.'.format(e))
            exit()
        xmat_nt = csr_matrix(xmat_nt[:, 1:])
    xmat = hstack([xmat_t, xmat_nt])
    xmat = xmat.toarray()
    max_id = max(data_df[id]) + 1
    tmin = min(data_df[tpoint])
    tmax = max(data_df[tpoint])
    leg_lst = []  # Legendre polynomials for the time-dependent fixed SNP effect, saved per individual
    for i in range(1, max_id):
        leg_lst.append(
            leg_mt(data_df[data_df[id] == i][tpoint], tmax, tmin, forder))
    tpoint_vec = sorted(set(data_df[tpoint]))
    leg_tpoint_mat = leg_mt(np.array(tpoint_vec), tmax, tmin, aorder)
    leg_tpoint_accum = np.sum(leg_tpoint_mat, axis=0)
    logging.info('***Read the kinship matrix***')
    logging.info('Kinship file: ' + kin_file)
    with open(kin_file) as fin:
        row = []
        col = []
        kin = []
        id_in_kin = {}
        for line in fin:
            arr = line.split()
            id_in_kin[arr[0]] = 1
            id_in_kin[arr[1]] = 1
            if arr[0] not in code_dct[id]:
                logging.info(arr[0] + ' is not in the kinship inversion file!')
                exit()
            if arr[1] not in code_dct[id]:
                logging.info(arr[1] + ' is not in the kinship inversion file!')
                exit()
            row.append(int(code_dct[id][arr[0]]))
            col.append(int(code_dct[id][arr[1]]))
            kin.append(float(arr[2]))
    id_not_in_kin = list(set(code_dct[id].keys()) - set(id_in_kin.keys()))
    if len(id_not_in_kin) != 0:
        logging.info(
            'The ID: {} in the data file is not in the kinship file!'.format(
                ' '.join(id_not_in_kin)))
        exit()
    kin = csr_matrix(
        (np.array(kin), (np.array(row) - 1, np.array(col) - 1))).toarray()
    kin = np.add(kin, kin.T)
    kin[np.diag_indices_from(kin)] = 0.5 * np.diag(kin)
    del row, col
    gc.collect()
    logging.info('***Build the design matrix for random effect***')
    logging.info('Legendre order for additive effects: ' + str(aorder))
    leg_add = leg(data_df[tpoint], aorder)
    row = np.array(range(data_df.shape[0]))
    col = np.array(data_df[id]) - 1
    val = np.array([1.0] * data_df.shape[0])
    add_mat = csr_matrix((val, (row, col)),
                         shape=(data_df.shape[0], kin.shape[0]))
    zmat_add = []
    for i in range(len(leg_add)):
        zmat_add.append(add_mat.multiply(leg_add[i]))
    logging.info('Legendre order for permanent environmental effect: ' +
                 str(porder))
    leg_per = leg(data_df[tpoint], porder)
    per_mat = csr_matrix((val, (row, col)))
    zmat_per = []
    for i in range(len(leg_per)):
        zmat_per.append((per_mat.multiply(leg_per[i])))
    del row, col, val
    gc.collect()
    zmat = [zmat_add, zmat_per]
    y = data_df[trait].values.reshape(data_df.shape[0], 1)
    # kin_inv = [kin_inv, sparse.eye(max(data_df[id]), format="csr")]
    logging.info('***Prepare the merged Z matrix***')
    eff_ind = [[0, xmat.shape[1]]]  # the index for all effects [start end]
    zmat_con_lst = []  # combined random matrix
    for i in range(len(zmat)):
        temp = [eff_ind[i][-1]]
        zmat_con_lst.append(hstack(zmat[i]))
        for j in range(len(zmat[i])):
            temp.append(temp[-1] + zmat[i][j].shape[1])
        eff_ind.append(temp)
    logging.info(
        '***Calculate the phenotypic (co)variance and P = V(-1) - V(-1)X[X(T)V(-1)X](-1)X(T)V(-1)***'
    )
    add_cov = var_com.loc[var_com.loc[:, 'vari'] == 1, :]
    row = np.array(add_cov['varij']) - 1
    col = np.array(add_cov['varik']) - 1
    val = add_cov['var_val']
    add_cov = csr_matrix((val, (row, col))).toarray()
    add_cov = add_cov + np.tril(add_cov, k=-1).T
    per_cov = var_com.loc[var_com.loc[:, 'vari'] == 2, :]
    row = np.array(per_cov['varij']) - 1
    col = np.array(per_cov['varik']) - 1
    val = per_cov['var_val']
    per_cov = csr_matrix((val, (row, col))).toarray()
    per_cov = per_cov + np.tril(per_cov, k=-1).T
    res_var = np.array(var_com['var_val'])[-1]
    vmat = zmat_con_lst[0].dot((zmat_con_lst[0].dot(np.kron(add_cov, kin))).T)
    one_id = sparse.eye(zmat_con_lst[1].shape[1] // per_cov.shape[0])
    vmat = vmat + zmat_con_lst[1].dot(
        (zmat_con_lst[1].dot(sparse.kron(per_cov, one_id))).T)
    vmat_diag = np.diag(vmat) + res_var
    np.fill_diagonal(vmat, vmat_diag)
    vmat = linalg.inv(vmat)
    fam_df = pd.read_csv(bed_file + '.fam', sep='\s+', header=None)
    id_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    id_order_index = []
    for i in id_order:
        id_order_index.append(id_geno.index(i))
    snp_on_disk = Bed(bed_file, count_A1=False)
    condition_snp_index = snp_on_disk.sid_to_index([condition_snp])[0]
    condition_snp_val = snp_on_disk[:, condition_snp_index].read().val
    condition_snp_val = condition_snp_val[id_order_index, 0]
    snp_condition = list(
        map(lambda x, y: x * y, leg_lst, list(condition_snp_val)))
    snp_condition = np.concatenate(snp_condition, axis=0)
    xmat = np.concatenate((xmat, snp_condition), axis=1)
    vxmat = np.dot(vmat, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = linalg.inv(xvxmat)
    pmat = vmat - reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    logging.info('***Read the snp data***')
    # snp_mat = read_plink(bed_file)
    snp_on_disk = Bed(bed_file, count_A1=False)
    num_id = snp_on_disk.iid_count
    num_snp = snp_on_disk.sid_count
    logging.info("There are {:d} individuals and {:d} SNPs.".format(
        num_id, num_snp))
    """
    fam_df = pd.read_csv(bed_file + '.fam', sep='\s+', header=None)
    id_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    id_order_index = []
    for i in id_order:
        id_order_index.append(id_geno.index(i))
    """
    if snp_lst is None:
        snp_lst = range(num_snp)
    snp_lst = list(snp_lst)
    if min(snp_lst) < 0 or max(snp_lst) >= num_snp:
        logging.info('The value in the snp list should be >= {} and < {}'.format(
            0, num_snp))
        exit()
    snp_mat = snp_on_disk[:, snp_lst].read().val
    if np.any(np.isnan(snp_mat)):
        logging.info('Missing genotypes are imputed with random genotypes.')
    snp_mat = snp_mat[id_order_index, :]
    # snp_mat = snp_mat[:, snp_lst]
    logging.info(
        '#####################################################################'
    )
    logging.info(
        '###Start the random regression longitudinal GWAS for unbalanced data###'
    )
    logging.info(
        '#####################################################################'
    )
    qpmat = zmat_con_lst[0].T.dot(pmat)
    qpqmat = zmat_con_lst[0].T.dot(qpmat.T)
    qpymat = np.dot(qpmat, y)
    chi_df = add_cov.shape[1]
    eff_vec = []
    chi_vec = []
    p_vec = []
    p_min_vec = []
    p_accum_vec = []
    for i in tqdm(range(snp_mat.shape[1])):
        snpi = np.kron(add_cov, snp_mat[:, i:(i + 1)].T)
        snpi_eff = np.dot(snpi, qpymat)
        snpi_var = reduce(np.dot, [snpi, qpqmat, snpi.T])
        chi_val = np.sum(
            reduce(np.dot,
                   [snpi_eff.T, linalg.inv(snpi_var), snpi_eff]))
        p_val = chi2.sf(chi_val, chi_df)
        eff_vec.append(snpi_eff[:, -1])
        chi_vec.append(chi_val)
        p_vec.append(p_val)
        p_tpoint_vec = []
        for k in range(leg_tpoint_mat.shape[0]):
            eff_tpoint = np.sum(np.dot(leg_tpoint_mat[k, :], snpi_eff))
            eff_var_tpoint = np.sum(
                np.dot(leg_tpoint_mat[k, :],
                       np.dot(snpi_var, leg_tpoint_mat[k, :])))
            chi_tpoint = eff_tpoint * eff_tpoint / eff_var_tpoint
            p_tpoint = chi2.sf(chi_tpoint, 1)
            p_tpoint_vec.append(p_tpoint)
        p_min_vec.append(min(p_tpoint_vec))
        eff_accum = np.sum(np.dot(leg_tpoint_accum, snpi_eff))
        eff_var_accum = np.sum(
            np.dot(leg_tpoint_accum, np.dot(snpi_var, leg_tpoint_accum)))
        chi_accum = eff_accum * eff_accum / eff_var_accum
        p_accum = chi2.sf(chi_accum, 1)
        p_accum_vec.append(p_accum)
    logging.info('Finish association analysis')
    logging.info('***Output***')
    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep='\s+', header=None)
    res_df = snp_info.iloc[snp_lst, [0, 1, 3, 4, 5]]
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'order'] = snp_lst
    res_df = res_df.iloc[:, [5, 0, 1, 2, 3, 4]]
    eff_vec = np.array(eff_vec)
    for i in range(eff_vec.shape[1]):
        col_ind = 'eff' + str(i)
        res_df.loc[:, col_ind] = eff_vec[:, i]
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec
    res_df.loc[:, 'p_min'] = p_min_vec
    res_df.loc[:, 'p_accum'] = p_accum_vec
    out_file = prefix_outfile + '.res'
    res_df.to_csv(out_file, sep=' ', index=False)
    return res_df
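
A hedged call sketch (paths and column names hypothetical; var_com must come from a matching gmat.longwas.unbalance.unbalance_varcom run on the same data):

    res_df = unbalance_condition_longwas_trans(
        data_file="pheno_long.txt",  # sorted by individual; capitalized columns are treated as factors
        id="ID", tpoint="day", trait="weight",
        bed_file="geno",             # geno.bed / geno.bim / geno.fam
        kin_file="geno.giv",         # id_id_val output of gmat.gmatrix.agmat(..., inv=True)
        var_com=var_com,             # estimates from unbalance_varcom
        condition_snp="rs123",       # hypothetical id of the SNP to condition on
    )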
Example #11
class _Epistasis(object) : #implements IDistributable

    def __init__(self,test_snps,pheno,G0, G1=None, mixing=0.0, covar=None,sid_list_0=None,sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self._ran_once = False

        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0=G0
        self.G1_or_none=G1
        self.mixing=mixing
        self.external_log_delta=log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1{5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps,self.pheno,self.covar,self.sid_list_0,self.sid_list_1,
                 self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta, self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
        self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + len(self.intersect)*len(self.just_sid_1) + len(self.intersect) * (len(self.intersect)-1)//2
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply([self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]) #should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None

        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True,missing='NaN')

        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar,missing='NaN')

        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add the ones column only if there are no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'],np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1] 


        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"

        self._ran_once = True
        

 #start of IDistributable interface--------------------------------------
    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count



    def work_sequence(self):
        self._run_once()

        return self.work_sequence_range(0,self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()

        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])

        for sid0_list, sid1_list in self.pair_block_sequence_range(start,end):
            yield lambda lmm=lmm,sid0_list=sid0_list,sid1_list=sid1_list : self.do_work(lmm,sid0_list,sid1_list)  # the 'lmm=lmm,...' defaults are needed because Python closures bind loop variables late

    def reduce(self, result_sequence):
        #doesn't need "run_once()"

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)

        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))


    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    #optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):
        #Doesn't need run_once
        return self._str


    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)

        copier.input(self.pheno)
        copier.input(self.covar)

        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)

        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self,copier):
        #Doesn't need run_once
        copier.output(self.output_file_or_none)

 #end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den): #!!move to utils?
        return -(-num//den) #The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats.
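        # e.g. div_ceil(7, 3) == -(-7 // 3) == 3, whereas plain 7 // 3 == 2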
    
    def pair_block_sequence_range(self,block_start,block_end):
        self._run_once()
        assert 0 <= block_start <= block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index+1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start,end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index+1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    #If start == end, then returns without yielding anything 
    def pair_sequence_range(self,start,end):
        self._run_once()
        assert 0 <= start <= end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"


    def pair_sequence_with_start(self,start):
        self._run_once()

        skip_ref = [start]
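        # a one-element list so the nested generators below can decrement the
        # shared skip count in place while fast-forwarding past 'start' pairs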

        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"


    def combo_distinct(self, distinct_list0, distinct_list1, skip_ref):
        row_count = len(distinct_list0)
        col_count = len(distinct_list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >= 0, "real assert"

        for row_index in range(row_start, row_count):
            sid0 = distinct_list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start, col_count):
                sid1 = distinct_list1[col_index]
                yield sid0, sid1

    def combo_same(self, item_list, skip_ref): # renamed from 'list' to avoid shadowing the builtin
        count = len(item_list)
        full_size = count * (count - 1) // 2 # strictly-upper-triangular pair count, matching _pair_count
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = int((-1 + 2*count - np.sqrt(1 - 4*count + 4*count**2 - 8*skip_ref[0]))/2)
        skip_ref[0] = skip_ref[0] - (count*row_start - (row_start*(1 + row_start))//2)
        assert skip_ref[0] >= 0, "real assert"
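        # rows 0..row_start-1 hold count*row_start - row_start*(row_start+1)//2 pairs
        # in total; the quadratic formula above inverts that triangular count to find
        # the first row that still contains the requested start position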

        for row_index in range(row_start, count):
            sid0 = item_list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start + 1 + row_index, count):
                sid1 = item_list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1



    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
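            # fill_in_cache_file saved the arrays positionally with np.savez(cache_file, U, S),
            # so they load back under the default names 'arr_0' and 'arr_1'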
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file): # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so will need to normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta  ) #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
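        # find_log_delta searched on a normalized scale (note its sid_count argument),
        # so scaling by sid_count converts the external log-delta to the delta used internally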
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))


    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike')
            )
        #!!Is this the only way to set types in a dataframe?
        for col in ('Chr0', 'GenDist0', 'ChrPos0', 'Chr1', 'GenDist1',
                    'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike'):
            dataframe[col] = dataframe[col].astype(float) # np.float was removed from NumPy; the builtin behaves the same here


        #This is some of the code for an alternative approach that reads and dot-products 50% more but copies less; it measured about the same speed
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:,sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        products = snps_read.val[:,sid0_index_list] * snps_read.val[:,sid1_index_list] # column i of products is the elementwise product of the i-th sid from each list
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if (k<N):
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index]) #index to product
            index_list = index_list + len(sid_union_index_list) #Shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index,sid1_index]),index_list)) # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1] #Shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]),index_list)) #indexes of the covar

            index_list_less_product = index_list[:-1] #index to everything but the product

            #Null -- the two additive SNPs
            lmm.X = X[:,index_list_less_product]
            lmm.UX = UX[:,index_list_less_product]
            if (k<N):
                lmm.UUX = UUX[:,index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            #Alt -- now with the product feature
            lmm.X = X[:,index_list]
            lmm.UX = UX[:,index_list]
            if (k<N):
                lmm.UUX = UUX[:,index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0,sid1,ll_null,ll_alt,pvalue))

            dataframe.iloc[pair_index] = [
                 sid0, snps_read.pos[sid0_index,0],  snps_read.pos[sid0_index,1], snps_read.pos[sid0_index,2],
                 sid1, snps_read.pos[sid1_index,0],  snps_read.pos[sid1_index,1], snps_read.pos[sid1_index,2],
                 pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count,self.do_pair_time-start))

        return dataframe
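Each pair in do_work is scored with a 1-df likelihood-ratio test: the null model carries the two SNPs as additive terms, and the alternative adds their elementwise product. A minimal sketch of the same test with ordinary least squares standing in for the LMM (synthetic data; all names here are illustrative):

import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

rng = np.random.RandomState(0)
n = 500
snp0 = rng.binomial(2, 0.3, n).astype(float)
snp1 = rng.binomial(2, 0.4, n).astype(float)
y = 0.2*snp0 + 0.1*snp1 + 0.3*snp0*snp1 + rng.randn(n)

covar = np.ones((n, 1))                        # intercept only
X_null = np.column_stack((covar, snp0, snp1))  # additive model
X_alt = np.column_stack((X_null, snp0*snp1))   # plus the interaction term

ll_null = sm.OLS(y, X_null).fit().llf
ll_alt = sm.OLS(y, X_alt).fit().llf
pvalue = stats.chi2.sf(2.0*(ll_alt - ll_null), 1)  # 1-df LRT, as in do_work
print(pvalue)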
Example No. 12
 def __init__(self,args):
     if args.window_type not in ['KBP','SNP']:
         raise ValueError('Window type not supported')
     # Open files
     bed_1 = Bed(args.bfile1,count_A1=False) #
     bed_2 = Bed(args.bfile2,count_A1=False)
     # Get indel locations, if any
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     bim_2=pd.read_table(bed_2.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     is_indel_1 = np.array([(len(str(a1))>1)|(len(str(a2))>1)  for a1,a2 in bim_1[['a1','a2']].values])
     is_indel_2 = np.array([(len(str(a1))>1)|(len(str(a2))>1)  for a1,a2 in bim_2[['a1','a2']].values])
     # Make sure two SNPs don't have the same position
     is_duplicated_bp_1=bim_1.pos_bp.duplicated()
     is_duplicated_bp_2=bim_2.pos_bp.duplicated()
     # Make sure two SNPs don't have the same ID
     is_duplicated_id_1=bim_1.id.duplicated()
     is_duplicated_id_2=bim_2.id.duplicated()
     # Get allele frequencies
     af1 = self.get_allele_frequency(bed_1,args) #
     af2 = self.get_allele_frequency(bed_2,args)
     print(len(af1), "Variants in file 1")
     print(len(af2), "Variants in file 2")
     # Get good SNPs
     snps_1 = (af1>args.maf)&(af1<1-args.maf)&(~is_indel_1)&(~is_duplicated_bp_1)&(~is_duplicated_id_1) #
     snps_2 = (af2>args.maf)&(af2<1-args.maf)&(~is_indel_2)&(~is_duplicated_bp_2)&(~is_duplicated_id_2)
     print(np.sum(snps_1), "SNPs in file 1 after MAF and indel filter")
     print(np.sum(snps_2), "SNPs in file 2 after MAF and indel filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k1 = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         k2 = (bed_2.pos[:,2]>args.from_bp)&(bed_2.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k1
         snps_2 = snps_2&k2
     snps_to_use = np.intersect1d(bed_1.sid[snps_1.values],bed_2.sid[snps_2.values])
     print(len(snps_to_use),"SNPs common in both populations")
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract,'r')])
         print(len(keep),"SNPs to extract")
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) #
     bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
     if not args.no_align:
         alignment,bed_1_index,bed_2_index =\
             self.align_alleles(bed_1,bed_1_index,af1,bed_2,bed_2_index,af2)
     else:
         alignment = np.ones(len(bed_1_index))
     pos = bed_1.pos[bed_1_index] #
     af1 = af1[bed_1_index] #
     af2 = af2[bed_2_index]
     # if args.afile1 is not None:
     #     a1 =  pd.read_table(args.afile,header=None,sep='\s*',
     #                         names=['id1','id2','theta'])
     # else:
     a1 = None
     # if args.afile2 is not None:
     #     a2 =  pd.read_table(args.afile,header=None,sep='\s*',
     #                         names=['id1','id2','theta'])
     # else:
     a2 = None
     self.af1 = af1 #
     self.af2 = af2
     self.M = len(bed_1_index) #
     self.N = (bed_1.iid_count, bed_2.iid_count) #
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].iloc[bed_1_index]
     self.A2 = bim_1['a2'].iloc[bed_1_index]
     self.windows = self.get_windows(pos,args) #
     self.scores1 = self.compute(bed_1,bed_1_index,af1,a1,args)
     self.scores2 = self.compute(bed_2,bed_2_index,af2,a2,args) #
     self.scoresX = self.compute2(bed_1,bed_1_index,bed_2,bed_2_index,
                                  alignment,a1,a2,args) #
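get_allele_frequency is not shown in this snippet, but the MAF filter it feeds is simple: with genotypes coded 0/1/2, the allele frequency is the column mean divided by two, and a SNP is kept when that frequency lies strictly between maf and 1-maf. A small sketch under those assumptions:

import numpy as np

rng = np.random.RandomState(1)
G = rng.binomial(2, 0.05, size=(100, 6)).astype(float)  # synthetic 0/1/2 genotypes
af = np.nanmean(G, axis=0) / 2.0   # per-SNP allele frequency (NaN = missing)
maf = 0.01
keep = (af > maf) & (af < 1 - maf) # same shape of filter as snps_1 / snps_2
print(af)
print(keep)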
Example No. 13
class fit(object):
    def __init__(self,args):
        self.bed = Bed(args.bfile) #
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile,header=None)
            self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid))
            self.ncov = self.cov.shape[1] # + constant
        else:
            self.cov = np.ones((self.N,1))
            self.ncov = 1 # Constant
        if args.phenofile is not None:
            Y = pd.read_table(args.phenofile,header=None,na_values='-9')
        else:
            try:
                Y = pd.read_table(args.bfile+'.pheno',header=None,na_values='-9')
            except IOError:
                print("Phenotype file not found.")
                exit(1)
        self.Y = ju._reorder(Y,self.bed.iid)
        af = ju.get_allele_frequency(self.bed,args) #
        snps = (af>args.maf)&(af<1-args.maf) #
        if (args.from_bp is not None) and (args.to_bp is not None):
            k = (self.bed.pos[:,2]>args.from_bp)&(self.bed.pos[:,2]<args.to_bp)
            snps = snps&k
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            keep = np.array([l.strip() for l in open(args.extract,'r')])
            snps_to_use = np.intersect1d(snps_to_use,keep)
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use)) #
        pos = self.bed.pos[self.bed_index] #
        bim=pd.read_table(self.bed.filename+'.bim',header=None,
                          names=['chm','id','pos_mb','pos_bp','a1','a2'])
        self.af = af[self.bed_index] #
        self.M = len(self.bed_index) #
        self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type)
        self.pos = pos[:,2]
        self.chr = pos[:,0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.logistic = False
        self.chimin = stats.chi2.ppf(1-args.minp,2)
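        # the 2-df chi-square value whose tail probability is args.minp; compute_joint
        # only fits the joint model when the predicted statistic exceeds this threshold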

        # Fit null
        if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1):
            self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0)
            self.logistic = True
        else:
            self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0)
        if self.ncov > 1:
            self.cov = sm.add_constant(self.null.fittedvalues)
        self.marg_res, self.joint_res = self.compute(args)

    def compute(self,args):
        t=time()
        marg_res = []
        joint_res = []
        Z = []
        windex = 0
        li,ri = self.windows[windex]
        nstr = np.max((args.SNPs_to_read,ri-li))
        offset = li
        G = self.bed[:,self.bed_index[li:(li+nstr)]].read().val
        G = ju._impute_missing(G) # replace missing with mean
        self.compute_marg(marg_res,Z,G,li,args)
        A = ju._norm_data(G)
        while ri < offset+nstr:
            st = li-offset
            fi = ri-offset
            # All correlations of SNP j with SNPs in its window
            R = np.dot(np.atleast_2d(A[:,st]/self.N),A[:,(st+1):fi]).flatten()
            Zl = Z[li]
            Zr = np.array(Z[(li+1):ri])
            # Use marginal Z-scores and R to compute expected joint chi2s
            ChiP = (1/(1-R**2))*(Zl**2+Zr**2-2*R*Zl*Zr)
            ChiP[R**2 < args.r2min] = -1
            self.compute_joint(joint_res,G,ChiP,offset,li,ri,args)
            windex += 1
            li,ri = self.windows[windex]
        for i in range(offset+nstr,self.M,nstr):
            sys.stdout.flush()
            sys.stdout.write("SNP: %d, %f\r" % (i, time()-t))
            Gn = self.bed[:,self.bed_index[i:(i+nstr)]].read().val
            Gn = ju._impute_missing(Gn)
            An = ju._norm_data(Gn)
            self.compute_marg(marg_res,Z,Gn,i,args)
            G = np.hstack((G,Gn))
            A = np.hstack((A,An))
            if G.shape[1] > args.SNPs_to_store:
                G = G[:,nstr:]
                A = A[:,nstr:]
                offset += nstr
            while ri < i+nstr:
                st = li-offset
                fi = ri-offset
                # All correlations of SNP j with SNPs in its window
                R = np.dot(np.atleast_2d(A[:,st]/self.N),A[:,(st+1):fi]).flatten()
                Zl = Z[li]
                Zr = np.array(Z[(li+1):ri])
                ChiP = (1/(1-R**2))*(Zl**2+Zr**2-2*R*Zl*Zr)
                ChiP[R**2 < args.r2min] = -1
                self.compute_joint(joint_res,G,ChiP,offset,li,ri,args)
                try:
                    windex += 1
                    li,ri = self.windows[windex]
                except IndexError:
                    break
        marg_res = pd.DataFrame(marg_res)
        joint_res = pd.DataFrame(joint_res)
        return marg_res, joint_res

    def compute_joint(self,joint_res,G,ChiP,offset,li,ri,args):
        st = li-offset
        fi = ri-offset
        snp1 = G[:,st]
        for i,snp2 in enumerate(G[:,(st+1):fi].T):
            if ChiP[i] > self.chimin:
                X = np.hstack((self.cov,snp1.reshape((len(snp1),1)),
                               snp2.reshape((len(snp2),1))))
                if self.logistic:
                    joint = sm.Logit(self.Y,X).fit(disp=0)
                else:
                    joint = sm.OLS(self.Y,X).fit(disp=0)
                joint_b1 = joint.params[-2]
                joint_b2 = joint.params[-1]
                joint_or1 = np.exp(joint_b1)
                joint_or2 = np.exp(joint_b2)
                joint_se1 = joint.bse[-2]
                joint_se2 = joint.bse[-1]
                joint_p1 = joint.pvalues[-2]
                joint_p2 = joint.pvalues[-1]
                joint_t1 = joint.tvalues[-2]
                joint_t2 = joint.tvalues[-1]
                joint_Chi2 = 2*(joint.llf - self.null.llf)
                pv = stats.chi2.sf(joint_Chi2,2)
                joint_res.append([self.chr[li],self.id[li],self.id[li+1+i],
                                  self.pos[li],self.pos[li+1+i],joint_b1,joint_se1,
                                  joint_or1,joint_t1,joint_p1,joint_b2,joint_se2,
                                  joint_or2,joint_t2,joint_p2,joint_Chi2,pv]) # snp2 is column st+1+i of G, i.e. global SNP li+1+i
            else:
                continue

    def compute_marg(self,marg_res,Z,G,offset,args):
        for i,snp in enumerate(G.T):
            X = np.hstack((self.cov,snp.reshape((len(snp),1))))
            if self.logistic:
                marg = sm.Logit(self.Y,X).fit(disp=0)
            else:
                marg = sm.OLS(self.Y,X).fit(disp=0)
            marg_b = marg.params[-1]
            marg_or = np.exp(marg_b)
            marg_se = marg.bse[-1]
            marg_p = marg.pvalues[-1]
            marg_t = marg.tvalues[-1]
            marg_Chi2 = 2*(marg.llf - self.null.llf)
            pv = stats.chi2.sf(marg_Chi2,1)
            marg_res.append([self.chr[offset+i],self.id[offset+i],
                             self.pos[offset+i],marg_b, marg_se, marg_or, marg_t,
                             marg_p,marg_Chi2,pv])
            Z.append(marg_b/marg_se)
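The screening step in compute rests on one identity: given two marginal Z-scores Zl, Zr and the genotype correlation R, the expected joint 2-df chi-square is (Zl**2 + Zr**2 - 2*R*Zl*Zr)/(1 - R**2). A standalone sketch with made-up numbers:

from scipy import stats

Zl, Zr, R = 3.1, 2.4, 0.35  # marginal Z-scores and genotype correlation
chi_joint = (Zl**2 + Zr**2 - 2*R*Zl*Zr) / (1 - R**2)
print(chi_joint, stats.chi2.sf(chi_joint, 2))  # compared against chimin's 2-df scale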
Example No. 14
 def __init__(self, args):
     if args.window_type not in ['BP', 'SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile1)  #
     bed_2 = Bed(args.bfile2)
     af1 = self.get_allele_frequency(bed_1, args)  #
     af2 = self.get_allele_frequency(bed_2, args)
     print(len(af1), "SNPs in file 1")
     print(len(af2), "SNPs in file 2")
     snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)  #
     snps_2 = (af2 > args.maf) & (af2 < 1 - args.maf)
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     print(np.sum(snps_2), "SNPs in file 2 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k1 = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] <
                                                  args.to_bp)
         k2 = (bed_2.pos[:, 2] > args.from_bp) & (bed_2.pos[:, 2] <
                                                  args.to_bp)
         snps_1 = snps_1 & k1
         snps_2 = snps_2 & k2
     snps_to_use = np.intersect1d(bed_1.sid[snps_1], bed_2.sid[snps_2])
     print(len(snps_to_use), "SNPs common in both populations")
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract, 'r')])
         print(len(keep), "SNPs to extract")
         snps_to_use = np.intersect1d(snps_to_use, keep)
         print(len(snps_to_use), "SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))  #
     bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
     if not args.no_align:
         alignment,bed_1_index,bed_2_index =\
             self.align_alleles(bed_1,bed_1_index,af1,bed_2,bed_2_index,af2)
     else:
         alignment = np.ones(len(bed_1_index))
     pos = bed_1.pos[bed_1_index]  #
     bim_1 = pd.read_table(
         bed_1.filename + '.bim',
         header=None,
         names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
     af1 = af1[bed_1_index]  #
     af2 = af2[bed_2_index]
     if args.afile1 is not None:
         a1 = pd.read_table(args.afile1,
                            header=None,
                            sep=r'\s+',
                            names=['id1', 'id2', 'theta'])
     else:
         a1 = None
     if args.afile2 is not None:
         a2 = pd.read_table(args.afile2,
                            header=None,
                            sep=r'\s+',
                            names=['id1', 'id2', 'theta'])
     else:
         a2 = None
     self.af1 = af1  #
     self.af2 = af2
     self.M = len(bed_1_index)  #
     self.N = (bed_1.iid_count, bed_2.iid_count)  #
     self.chr = pos[:, 0]
     self.pos = pos[:, 2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.windows = self.get_windows(pos, args)  #
     self.scores1 = self.compute(bed_1, bed_1_index, af1, a1, args)
     self.scores2 = self.compute(bed_2, bed_2_index, af2, a2, args)  #
     self.scoresX = self.compute2(bed_1, bed_1_index, bed_2, bed_2_index,
                                  alignment, a1, a2, args)  #
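align_alleles is not shown here; the alignment vector it returns is used multiplicatively on standardized genotypes, so one plausible ingredient (an assumption, not the snippet's actual logic) is a sign flip wherever the two .bim files record A1 and A2 swapped for the same SNP:

import numpy as np

a1_file1 = np.array(['A', 'C', 'G'])
a1_file2 = np.array(['A', 'G', 'G'])  # middle SNP has A1/A2 swapped (hypothetical)
alignment = np.where(a1_file1 == a1_file2, 1.0, -1.0)
print(alignment)  # [ 1. -1.  1.]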
Example No. 15
class sample(object):
    def __init__(self,args):
        self.bed = Bed(args.bfile) #
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile,header=None)
            self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid))
            self.ncov = self.cov.shape[1] # + constant
        else:
            self.cov = np.ones((self.N,1))
            self.ncov = 1 # Constant
        af = ju.get_allele_frequency(self.bed,args) #
        snps = (af>args.maf)&(af<1-args.maf) #
        if (args.from_bp is not None) and (args.to_bp is not None):
            k = (self.bed.pos[:,2]>args.from_bp)&(self.bed.pos[:,2]<args.to_bp)
            snps = snps&k
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            keep = np.array([l.strip() for l in open(args.extract,'r')])
            snps_to_use = np.intersect1d(snps_to_use,keep)
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use)) #
        pos = self.bed.pos[self.bed_index] #
        bim=pd.read_table(self.bed.filename+'.bim',header=None,
                          names=['chm','id','pos_mb','pos_bp','a1','a2'])
        self.af = af[self.bed_index] #
        self.M = len(self.bed_index) #
        self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type)
        self.sample_windows = ju.get_windows(pos,self.M,args.sample_window_size,
                                             args.sample_window_type)
        self.pos = pos[:,2]
        self.chr = pos[:,0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.numSamples = args.numSamples
        self.JMaxStats, self.ZMaxStats = self.sample(args)
        self.JMinP = stats.chi2.sf(self.JMaxStats,2)
        self.ZMinP = stats.chi2.sf(self.ZMaxStats**2,1)
        self.minP = np.minimum(self.JMinP,self.ZMinP)

    def sample(self,args):
        t=time()
        nz = 0
        ZMaxStats = np.zeros((self.numSamples,1))
        JMaxStats = np.zeros((self.numSamples,1))
        windex = 0
        sli,sri = self.sample_windows[windex]
        tli,tri = self.windows[windex]
        nstr = np.max((args.SNPs_to_read,sri-sli))
        offset = sli
        G = self.bed[:,self.bed_index[sli:(sli+nstr)]].read().val
        G = ju._impute_missing(G)
        A = ju._norm_data(G)
        # Sample Z-scores and do joint tests of first window
        R = np.dot(A[:,sli:sri].T/self.N,A[:,sli:sri])
        Z=np.random.multivariate_normal(np.zeros((R.shape[0])),R,args.numSamples)
        nz += R.shape[0]
        zli,zri = sli,sri # position of Z relative to full genotype
        gli,gri = zli,zri # position of Z relative to genotype in memory
        Rp = R[(tli+1):tri,0]
        to_test = Rp**2 > args.r2min
        Rp = Rp[to_test]
        Zl = np.atleast_2d(Z[:,0]).T
        Zr = np.array(Z[:,1:(tri-tli)])[:,to_test]
        ChiP = (1/(1-Rp**2))*(Zl**2+Zr**2-2*Rp*Zl*Zr)
        ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats,abs(Z))).max(1)).T
        # ZMaxStats = np.maximum(ZMaxStats,abs(Z.max(1)))
        # JMaxStats = np.maximum(JMaxStats,ChiP.max(1))
        JMaxStats = np.atleast_2d(np.hstack((JMaxStats,ChiP)).max(1)).T
        # Slide through genotype in memory
        while True:
            windex += 1
            sli,sri = self.sample_windows[windex]
            tli,tri = self.windows[windex]
            if sri >= offset+nstr: break
            tst,tfi,sst,sfi = np.array([tli,tri,sli,sri])-offset
            #print sli, sri, zli, zri, gli, gri, Z.shape[1]
            if zli < sli: # drop zli..sli and update indices
                Z = Z[:,(sli-zli):]
                zli,gli = sli,sst
            if zri < sri: # marginal sample everything from zri..sri
                S = A[:,gli:gri] # G that overlaps Z
                Sn = A[:,gri:sri] # G about to have Z scores sampled
                r12 = S.T.dot(Sn)/self.N
                r11 = Sn.T.dot(Sn)/self.N
                Zn = self.sample_func(Z,S,Sn,r11,r12,args)
                Z = np.hstack((Z,Zn))
                nz += (sri-gri)
                zri,gri = sri,sfi
            ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats,abs(Zn))).max(1)).T
            # ZMaxStats = np.maximum(ZMaxStats,abs(Zn.flatten()))
            # All correlations of SNP tli with SNPs in its window.
            # These may already have been computed and could be reused with some
            # cleverness, but it's a fast calculation anyway.
            if sri-sli > 1:
                R = np.dot(np.atleast_2d(A[:,tst]/self.N),
                           A[:,(tst+1):tfi]).flatten()
                to_test = R**2 > args.r2min
                R = R[to_test]
                Zl = np.atleast_2d(Z[:,0]).T
                Zr = np.array(Z[:,1:(sri-sli)])[:,to_test]
                ChiP = (1/(1-R**2))*(Zl**2+Zr**2-2*R*Zl*Zr)
                JMaxStats = np.atleast_2d(np.hstack((JMaxStats,ChiP)).max(1)).T
                #JMaxStats = np.maximum(JMaxStats,ChiP.max(1))
        for i in range(offset+nstr,self.M,nstr):
            sys.stdout.flush()
            sys.stdout.write("SNP: %d, %f\r" % (i, time()-t))
            Gn = self.bed[:,self.bed_index[i:(i+nstr)]].read().val
            Gn = ju._impute_missing(Gn)
            An = ju._norm_data(Gn)
            G = np.hstack((G,Gn))
            A = np.hstack((A,An))
            if G.shape[1] > args.SNPs_to_store:
                G = G[:,nstr:]
                A = A[:,nstr:]
                offset += nstr
                gli -= nstr
                gri -= nstr
            while sri < i+nstr:
                tst,tfi,sst,sfi = np.array([tli,tri,sli,sri])-offset
                if zli < sli: # drop zli..sli and update indices
                    Z = Z[:,(sli-zli):]
                    zli,gli = sli,sst
                if zri < sri: # marginal sample everything from zri..sri
                    S = A[:,gli:gri] # G that overlaps Z
                    Sn = A[:,gri:sfi] # G about to have Z scores sampled
                    r12 = S.T.dot(Sn)/self.N
                    r11 = Sn.T.dot(Sn)/self.N
                    Zn = self.sample_func(Z,S,Sn,r11,r12,args)
                    Z = np.hstack((Z,Zn))
                    nz += (sfi-gri)
                    zri,gri = sri,sfi
                ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats,abs(Zn))).max(1)).T
                # ZMaxStats = np.maximum(ZMaxStats,abs(Zn.flatten()))
                if sri-sli > 1:
                    R = np.dot(np.atleast_2d(A[:,tst]/self.N),
                               A[:,(tst+1):tfi]).flatten()
                    to_test = R**2 > args.r2min
                    R = R[to_test]
                    Zl = np.atleast_2d(Z[:,0]).T
                    Zr = np.array(Z[:,1:(sri-sli)])[:,to_test]
                    ChiP = (1/(1-R**2))*(Zl**2+Zr**2-2*R*Zl*Zr)
                    #JMaxStats = np.maximum(JMaxStats,ChiP.max(1))
                    JMaxStats = np.atleast_2d(np.hstack((JMaxStats,ChiP)).max(1)).T
                try:
                    windex += 1
                    sli,sri = self.sample_windows[windex]
                    tli,tri = self.windows[windex]
                except IndexError:
                    break
        # print "HERE:", nz
        return JMaxStats.flatten(), ZMaxStats.flatten()

    def sample_func(self,Z,S,Sn,r11,r12,args):
        S22IS12 = sp.linalg.lstsq(S,Sn,cond=1e-8)[0]
        muC = Z.dot(S22IS12)
        SigC = r11-r12.T.dot(S22IS12)
        Zn = np.random.multivariate_normal(np.zeros((SigC.shape[0])),
                                              SigC,size=args.numSamples)
        Zn += muC
        return Zn
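sample_func draws the new Z-scores conditionally on those already sampled, which is the standard conditional multivariate-normal identity; the lstsq call obtains the S11^-1 S12 factor directly from the genotype matrices. A minimal sketch of the same identity with explicit covariance blocks:

import numpy as np

rng = np.random.RandomState(0)
# Z ~ N(0, Sigma); block 1 = already-sampled scores, block 2 = the new score
Sigma = np.array([[1.0, 0.6, 0.3],
                  [0.6, 1.0, 0.5],
                  [0.3, 0.5, 1.0]])
S11, S12, S22 = Sigma[:2, :2], Sigma[:2, 2:], Sigma[2:, 2:]

z_old = rng.multivariate_normal(np.zeros(2), S11)
W = np.linalg.solve(S11, S12)  # S11^-1 S12
mu_c = z_old.dot(W)            # conditional mean, cf. muC
Sig_c = S22 - S12.T.dot(W)     # conditional covariance, cf. SigC
z_new = rng.multivariate_normal(mu_c, Sig_c)
print(z_new)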