示例#1
0
文件: hase.py 项目: urmovosa/hase
        mapper.load_flip(args.mapper)
        mapper.load(args.mapper)

        phen = Reader('phenotype')
        phen.start(args.phenotype[0])

        gen = Reader('genotype')
        gen.start(args.genotype[0],
                  hdf5=args.hdf5,
                  study_name=args.study_name[0],
                  ID=False)

        e = Encoder(args.out)
        e.study_name = args.study_name[0]

        row_index, ids = study_indexes(phenotype=phen.folder._data,
                                       genotype=gen.folder._data)
        with Timer() as t:

            e.matrix(len(ids), save=True)
            N_snps_read = 0
            while True:
                with Timer() as t_gen:
                    genotype = gen.get_next()
                    if isinstance(genotype, type(None)):
                        break

                    flip = mapper.flip[
                        args.study_name[0]][N_snps_read:N_snps_read +
                                            genotype.shape[0]]
                    N_snps_read += genotype.shape[0]
                    flip_index = (flip == -1)
示例#2
0
def haseregression(phen,
                   gen,
                   cov,
                   mapper,
                   Analyser,
                   maf,
                   intercept=True,
                   interaction=None):
    """Run the HASE regression for every genotype chunk x phenotype chunk.

    For each genotype chunk the SNPs are optionally filtered by allele
    frequency, then every phenotype chunk is regressed against them and the
    resulting statistics are handed to ``Analyser.save_result``.

    Args:
        phen: phenotype reader. Uses ``get_next(index=...)``, ``permutation``,
            ``folder.processed`` and ``folder._data`` (``names``, ``start``,
            ``finish``).
        gen: iterable of genotype readers; each ``folder._data`` is passed to
            ``study_indexes`` and the readers themselves to
            ``merge_genotype``.
        cov: covariates reader; read once via ``get_next(index=...)``.
        mapper: ``None``, or an object providing ``n_keys``, ``cluster``,
            ``get()``, ``chunk_pop()`` and ``node``.
        Analyser: result sink. ``rsid``, ``MAF``, ``t_stat``, ``SE`` (and,
            depending on mode, ``cluster``, ``chunk``, ``node``,
            ``permutation``) are set on it before ``save_result`` is called.
        maf: frequency threshold; ``0`` disables the filter.
        intercept: forwarded to ``A_covariates``, ``B_covariates`` and
            ``A_tests``.
        interaction: accepted for interface compatibility; not used here.

    Returns:
        None. Results are written through ``Analyser``.
    """
    g = tuple([i.folder._data for i in gen])

    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=g,
                                   covariates=cov.folder._data)

    # Counters behind the 'Read {}, processed {}, total {}' progress line.
    if mapper is not None:
        SNP = [0, 0, mapper.n_keys]
    else:
        SNP = [0, 0, 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    break
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys
        else:
            SNPs_index = None

        with Timer() as t:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print('time to read and merge genotype {}s'.format(t.secs))
        gc.collect()
        if genotype is None:
            print('All genotype processed!')
            break
        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]

        if mapper is None:
            # Without a mapper, rows are identified by their position in
            # the current chunk.
            Analyser.rsid = np.arange(genotype.shape[0])

        MAF = np.mean(genotype, axis=1) / 2

        if maf != 0:
            # Keep rows strictly inside (maf, 1 - maf); rows at exactly 0.5
            # are dropped as well.
            maf_mask = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[maf_mask, :]
            Analyser.MAF = MAF[maf_mask]
            Analyser.rsid = Analyser.rsid[maf_mask]

            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue
        else:
            Analyser.MAF = MAF

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])

            if phenotype is None:
                # Rewind the phenotype reader for the next genotype chunk.
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                # In-place shuffle along the first axis.
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)

            # NOTE(review): a_test / a_inv depend only on covariates and
            # genotype, not on the phenotype chunk; they could probably be
            # hoisted out of this loop if A_tests / A_inverse are pure and
            # HASE does not modify a_inv -- confirm before changing.
            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)

            N_con = a_inv.shape[1] - 1
            DF = phenotype.shape[0] - a_inv.shape[1]

            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)
            print('Read {}, processed {}, total {}'.format(
                SNP[0], SNP[1], SNP[2]))
            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True
            Analyser.save_result(
                phen.folder._data.names[phen.folder._data.start:
                                        phen.folder._data.finish])

            # Drop references to the large per-chunk arrays before the next
            # iteration allocates new ones.
            Analyser.t_stat = None
            del b4, C, b_cov, a_inv, a_test, t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(
            os.path.join(Analyser.out,
                         str(Analyser.node) + '_node_RSID.npy'),
            Analyser.rsid_dic)
示例#3
0
文件: pard.py 项目: roshchupkin/hase
def partial_derivatives(save_path=None, COV=None, PHEN=None, GEN=None,
                        MAP=None, MAF=None, R2=None, B4_flag=False,
                        study_name=None, intercept=True):
    """Compute and save the per-study partial-derivative arrays.

    Writes the following ``.npy`` files into ``save_path``, each prefixed
    with ``study_name``: ``_a_cov``, ``_b_cov``, ``_C``, ``_a_test``,
    ``_metadata`` and, when ``B4_flag`` stays enabled, ``_b4``.

    Args:
        save_path: directory the ``.npy`` files are written to.
        COV: covariates reader (``get_next``, ``folder._data.get_names``).
        PHEN: phenotype reader (``get_next``, ``folder.files``,
            ``folder.processed``, ``folder._data.get_names``).
        GEN: genotype reader (``get_next``, ``folder._data``).
        MAP: object whose ``flip`` array is sliced per genotype chunk;
            column 0 is used.
        MAF: unused in this function.
        R2: unused in this function.
        B4_flag: also compute and save ``B4(phenotype, genotype)``. Turned
            off automatically when the phenotype folder holds more than one
            data file.
        study_name: prefix for output file names and covariate names.
        intercept: forwarded to ``A_covariates``, ``B_covariates`` and
            ``A_tests``; also adds a ``<study_name>_intercept`` name.

    Returns:
        None.
    """
    row_index, ids = study_indexes(phenotype=PHEN.folder._data,
                                   genotype=GEN.folder._data,
                                   covariates=COV.folder._data)

    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata = {
        'id': ids,
        'MAF': [],
        'filter': [],
        'names': [],  # TODO (low) change to cov_names
        'phenotype': [],
    }
    b_cov = []
    C = []
    a_test = []
    b4 = []

    covariates = COV.get_next(index=row_index[2])
    if intercept:
        metadata['names'].append(study_name + '_intercept')
    metadata['names'] += [
        study_name + '_' + name for name in COV.folder._data.get_names()
    ]

    a_cov = A_covariates(covariates, intercept=intercept)
    np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)

    with Timer() as t_phen:
        while True:
            phenotype = PHEN.get_next(index=row_index[1])
            if phenotype is None:
                break
            b_cov.append(
                B_covariates(covariates, phenotype, intercept=intercept))
            C.append(C_matrix(phenotype))

        # Recorded once, after all chunks are read. Appending inside the
        # loop added the whole get_names() list once per chunk.
        # NOTE(review): assumes get_names() returns every phenotype name,
        # not just the current chunk's -- confirm.
        metadata['phenotype'] += list(PHEN.folder._data.get_names())

        b_cov = np.concatenate(b_cov, axis=1)
        C = np.concatenate(C, axis=0)
        np.save(os.path.join(save_path, study_name + '_b_cov.npy'), b_cov)
        np.save(os.path.join(save_path, study_name + '_C.npy'), C)

    print('Time to PD phenotype {} is {} s'.format(C.shape, t_phen.secs))

    N_snps_read = 0
    while True:
        with Timer() as t_gen:
            genotype = GEN.get_next()
            if genotype is None:
                # All genotype chunks consumed: persist what was collected.
                np.save(os.path.join(save_path, study_name + '_a_test.npy'),
                        np.concatenate(a_test))
                np.save(os.path.join(save_path, study_name + '_metadata.npy'),
                        metadata)
                if B4_flag:
                    b4 = np.concatenate(b4, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b4.npy'),
                            b4)
                break

            flip = MAP.flip[N_snps_read:N_snps_read + genotype.shape[0], 0]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            # Per row: value x becomes 2 - x where flip == -1 and is left
            # as x where flip == 1.
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]
            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)

            # TODO (low) add interaction
            a_test.append(A_tests(covariates, genotype, intercept=intercept))

            if B4_flag:
                # Works only when all phenotypes are in one chunk; if not,
                # do not use this option (it would use too much disk space
                # anyway).
                data_files = [
                    f for f in PHEN.folder.files if f != 'info_dic.npy'
                ]
                if len(data_files) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    continue
                PHEN.folder.processed = 0
                if phenotype is None:
                    # The phenotype loop above ended on None; re-read the
                    # single chunk once and reuse it afterwards.
                    phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))

        print('Time to PD genotype {} is {} s'.format(genotype.shape,
                                                      t_gen.secs))
示例#4
0
def haseregression(phen, gen, cov, mapper, Analyser, maf, intercept=True):
    """Run the HASE regression for every genotype chunk x phenotype chunk.

    Each genotype chunk is optionally filtered by allele frequency; every
    phenotype chunk is then regressed against it and the statistics are
    passed on through ``Analyser.save_result``.

    Args:
        phen: phenotype reader (``get_next``, ``permutation``,
            ``folder.processed``, ``folder._data.names/start/finish``).
        gen: iterable of genotype readers, handed to ``merge_genotype``.
        cov: covariates reader; read once via ``get_next``.
        mapper: ``None``, or an object providing ``n_keys``, ``cluster``,
            ``get()``, ``chunk_pop()`` and ``node``.
        Analyser: result sink; ``rsid``, ``MAF``, ``t_stat``, ``SE`` and the
            cluster / permutation flags are set on it before
            ``save_result`` is called.
        maf: frequency threshold; ``0`` disables the filter.
        intercept: forwarded to ``A_covariates``, ``B_covariates`` and
            ``A_tests``.

    Returns:
        None. Results are written through ``Analyser``.
    """
    g = tuple([i.folder._data for i in gen])

    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=g,
                                   covariates=cov.folder._data)

    # Counters behind the 'Read {}, processed {}, total {}' progress line.
    if mapper is not None:
        SNP = [0, 0, mapper.n_keys]
    else:
        SNP = [0, 0, 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    break
                print(ch)
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys
        else:
            SNPs_index = None

        with Timer() as t:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print('time to read and merge genotype {}s'.format(t.secs))
        gc.collect()
        if genotype is None:
            print('All genotype processed!')
            break
        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]

        if mapper is None:
            # Without a mapper, rows are identified by their position in
            # the current chunk.
            Analyser.rsid = np.arange(genotype.shape[0])

        MAF = np.mean(genotype, axis=1) / 2

        if maf != 0:
            # Keep rows strictly inside (maf, 1 - maf); rows at exactly 0.5
            # are dropped as well.
            keep = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[keep, :]
            Analyser.MAF = MAF[keep]
            Analyser.rsid = Analyser.rsid[keep]

            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue
        else:
            Analyser.MAF = MAF

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])

            if phenotype is None:
                # Rewind the phenotype reader for the next genotype chunk.
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                # In-place shuffle along the first axis.
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)
            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)

            N_con = a_inv.shape[1] - 1
            DF = phenotype.shape[0] - a_inv.shape[1]

            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)
            print('Read {}, processed {}, total {}'.format(
                SNP[0], SNP[1], SNP[2]))
            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True
            Analyser.save_result(
                phen.folder._data.names[phen.folder._data.start:
                                        phen.folder._data.finish])

            # Drop references to the large per-chunk arrays before the next
            # iteration allocates new ones.
            Analyser.t_stat = None
            del b4, C, b_cov, a_inv, a_test, t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(
            os.path.join(Analyser.out,
                         str(Analyser.node) + '_node_RSID.npy'),
            Analyser.rsid_dic)
示例#5
0
def partial_derivatives(save_path=None,
                        COV=None,
                        PHEN=None,
                        GEN=None,
                        MAP=None,
                        MAF=None,
                        R2=None,
                        B4_flag=False,
                        study_name=None,
                        intercept=True):
    """Compute and save the per-study partial-derivative arrays.

    In single-machine mode (``MAP.cluster == 'n'``) every output is written
    as ``<study_name>_<part>.npy`` in ``save_path``. In cluster mode
    (``MAP.cluster == 'y'``) the covariate / phenotype parts are produced
    only when ``MAP.node[1] == 1``; each node processes its own share of the
    genotype files and writes ``node_<k>_<study_name>_<part>.npy``; the node
    for which ``MAP.node[1] == MAP.node[0]`` finally calls ``merge_PD``.

    Args:
        save_path: directory the ``.npy`` files are written to.
        COV: covariates reader (``get_next``, ``folder._data.get_names``).
        PHEN: phenotype reader (``get_next``, ``folder.files``,
            ``folder.processed``, ``folder._data.get_names``).
        GEN: genotype reader (``get_next``, ``folder.files``,
            ``folder.path``, ``folder.name``, ``folder.read``,
            ``folder.get_info``).
        MAP: mapper providing ``cluster``, ``node`` and ``flip`` (indexed by
            ``GEN.folder.name``).
        MAF: unused in this function.
        R2: unused in this function.
        B4_flag: also compute and save ``B4(phenotype, genotype)``. Turned
            off automatically when the phenotype folder holds more than one
            data file.
        study_name: prefix for output file names and covariate names.
        intercept: forwarded to ``A_covariates``, ``B_covariates`` and
            ``A_tests``; also adds a ``<study_name>_intercept`` name.

    Returns:
        None.
    """
    row_index, ids = study_indexes(phenotype=PHEN.folder._data,
                                   genotype=GEN.folder._data,
                                   covariates=COV.folder._data)

    metadata = {}

    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata['id'] = ids
    metadata['MAF'] = []
    metadata['filter'] = []
    metadata['names'] = []  # TODO (low) change to cov_names
    metadata['phenotype'] = []
    b_cov = []
    C = []
    a_test = []
    b4 = []

    covariates = COV.get_next(index=row_index[2])

    if MAP.cluster == 'n' or MAP.node[1] == 1:
        if intercept:
            metadata['names'].append(study_name + '_intercept')
        metadata['names'] = metadata['names'] + [
            study_name + '_' + name
            for name in COV.folder._data.get_names()
        ]

        a_cov = A_covariates(covariates, intercept=intercept)
        np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)

        with Timer() as t_phen:

            while True:
                phenotype = PHEN.get_next(index=row_index[1])
                if phenotype is None:
                    b_cov = np.concatenate(b_cov, axis=1)
                    C = np.concatenate(C, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b_cov.npy'),
                            b_cov)
                    np.save(os.path.join(save_path, study_name + '_C.npy'), C)
                    break
                b_cov.append(
                    B_covariates(covariates, phenotype, intercept=intercept))
                C.append(C_matrix(phenotype))

            # Names are collected once, after the loop: get_names() is not
            # sliced per chunk the way get_next() slices the phenotype, and
            # appending it for every chunk raised an exception with larger
            # inputs (> 1000 samples).
            metadata['phenotype'] = metadata['phenotype'] + list(
                PHEN.folder._data.get_names())

        print('Time to PD phenotype {} is {} s'.format(
            np.array(C).shape, t_phen.secs))

    if MAP.cluster == 'y':
        # Genotype files are named '<number>_<study_name>.h5'; the numbers
        # 0..f_max are split into MAP.node[0] groups and this node takes
        # group MAP.node[1] (1-based).
        f_max = np.max([int(f.split('_')[0]) for f in GEN.folder.files])
        node_groups = np.array_split(range(f_max + 1), MAP.node[0])

        # Reversed so that pop() yields the files in ascending order.
        files2read = [
            '{}_{}.h5'.format(number, study_name)
            for number in node_groups[MAP.node[1] - 1]
        ][::-1]

        # Files belonging to the nodes before this one; their row counts
        # give this node's starting offset into MAP.flip.
        filesdone = []
        for node in range(MAP.node[1] - 1):
            filesdone = filesdone + [
                '{}_{}.h5'.format(number, study_name)
                for number in node_groups[node]
            ]

        N_snps_read = 0
        for done_name in filesdone:
            done_path = os.path.join(GEN.folder.path, 'genotype', done_name)
            N_snps_read += GEN.folder.get_info(done_path)['shape'][0]
    else:
        N_snps_read = 0

    while True:
        with Timer() as t_gen:
            if MAP.cluster == 'y':
                if len(files2read) != 0:
                    gen_path = os.path.join(GEN.folder.path, 'genotype',
                                            files2read.pop())
                    genotype = GEN.folder.read(gen_path)
                else:
                    genotype = None
            else:
                genotype = GEN.get_next()

            if genotype is None:
                # All genotype chunks consumed: persist what was collected.
                if MAP.cluster == 'y':
                    node_prefix = 'node_{}_'.format(MAP.node[1]) + study_name
                    np.save(
                        os.path.join(save_path, node_prefix + '_a_test.npy'),
                        np.concatenate(a_test).astype(np.float64))
                    np.save(
                        os.path.join(save_path,
                                     node_prefix + '_metadata.npy'), metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(save_path, node_prefix + '_b4.npy'),
                            b4.astype(np.float64))
                    if MAP.node[1] == MAP.node[0]:
                        merge_PD(save_path, MAP.node[0], study_name)
                else:
                    np.save(
                        os.path.join(save_path, study_name + '_a_test.npy'),
                        np.concatenate(a_test))
                    np.save(
                        os.path.join(save_path, study_name + '_metadata.npy'),
                        metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(save_path, study_name + '_b4.npy'),
                            b4)
                break

            flip = MAP.flip[GEN.folder.name][N_snps_read:N_snps_read +
                                             genotype.shape[0]]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            # Per row: value x becomes 2 - x where flip == -1 and is left
            # as x where flip == 1.
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]
            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)

            # TODO (low) add interaction
            a_test.append(A_tests(covariates, genotype, intercept=intercept))

            if B4_flag:
                # Works only when all phenotypes are in one chunk; if not,
                # do not use this option (it would use too much disk space
                # anyway).
                data_files = [
                    f for f in PHEN.folder.files if f != 'info_dic.npy'
                ]
                if len(data_files) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    continue
                # Rewind and re-read the single phenotype chunk.
                PHEN.folder.processed = 0
                phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))

        print('Time to PD genotype {} is {} s'.format(genotype.shape,
                                                      t_gen.secs))
示例#6
0
文件: hase.py 项目: roshchupkin/hase
		mapper.genotype_names=args.study_name
		mapper.chunk_size=MAPPER_CHUNK_SIZE
		mapper.reference_name=args.ref_name
		mapper.load_flip(args.mapper)
		mapper.load(args.mapper)

		phen=Reader('phenotype')
		phen.start(args.phenotype[0])

		gen=Reader('genotype')
		gen.start(args.genotype[0], hdf5=args.hdf5, study_name=args.study_name[0], ID=False)

		e=Encoder(args.out)
		e.study_name=args.study_name[0]

		row_index, ids =  study_indexes(phenotype=phen.folder._data,genotype=gen.folder._data)
		with Timer() as t:

			e.matrix(len(ids),save=True)
			N_snps_read=0
			while True:
				with Timer() as t_gen:
					genotype = gen.get_next()
					if isinstance(genotype, type(None)):
						break

					flip=mapper.flip[N_snps_read:N_snps_read+genotype.shape[0],0]
					N_snps_read+=genotype.shape[0]
					flip_index=(flip==-1)
					genotype=np.apply_along_axis(lambda x: flip*(x-2*flip_index) ,0,genotype)
					genotype=genotype[:,row_index[0]]