# These snippets assume NumPy plus HASE's helper classes and functions
# (Reader, Encoder, Timer, study_indexes, the Mapper, merge_genotype and the
# A_*/B_*/C_matrix/B4/HASE kernels from the hdgwas package).
import gc
import os

import numpy as np

mapper.load_flip(args.mapper)
mapper.load(args.mapper)

phen = Reader('phenotype')
phen.start(args.phenotype[0])

gen = Reader('genotype')
gen.start(args.genotype[0], hdf5=args.hdf5,
          study_name=args.study_name[0], ID=False)

e = Encoder(args.out)
e.study_name = args.study_name[0]

row_index, ids = study_indexes(phenotype=phen.folder._data,
                               genotype=gen.folder._data)
with Timer() as t:
    e.matrix(len(ids), save=True)

N_snps_read = 0
while True:
    with Timer() as t_gen:
        genotype = gen.get_next()
        if genotype is None:
            break
        # Per-study flip vector: -1 marks SNPs whose alleles are swapped
        # relative to the reference.
        flip = mapper.flip[args.study_name[0]][N_snps_read:
                                               N_snps_read + genotype.shape[0]]
        N_snps_read += genotype.shape[0]
        flip_index = (flip == -1)
        # Recode flipped SNPs as 2 - dosage, leave the rest unchanged.
        genotype = np.apply_along_axis(
            lambda x: flip * (x - 2 * flip_index), 0, genotype)
        genotype = genotype[:, row_index[0]]
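
# --- Illustration (not part of HASE) -----------------------------------------
# A minimal sketch of what the flip transform above does, assuming flip entries
# are +1 (keep) or -1 (swap alleles), as the mapper produces them. All names
# here are demo-only.
dosages = np.array([[0., 1., 2.],     # SNP kept as-is (flip == +1)
                    [0., 1., 2.]])    # SNP with swapped alleles (flip == -1)
flip_demo = np.array([1, -1])
flip_idx = (flip_demo == -1)
recoded = np.apply_along_axis(lambda x: flip_demo * (x - 2 * flip_idx),
                              0, dosages)
print(recoded)   # [[0. 1. 2.] [2. 1. 0.]] -- the flipped SNP becomes 2 - dosage
# ------------------------------------------------------------------------------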
def haseregression(phen, gen, cov, mapper, Analyser, maf,
                   intercept=True, interaction=None):

    g = tuple([i.folder._data for i in gen])

    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=g,
                                   covariates=cov.folder._data)
    if mapper is not None:
        SNP = [0, 0, mapper.n_keys]   # [read, processed, total]
    else:
        SNP = [0, 0, 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    SNPs_index = None
                    break
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys
        else:
            SNPs_index = None

        with Timer() as t:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print('time to read and merge genotype {}s'.format(t.secs))
        gc.collect()

        if genotype is None:
            print('All genotype processed!')
            break

        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]
        if mapper is None:
            Analyser.rsid = np.array(range(genotype.shape[0]))

        MAF = np.mean(genotype, axis=1) / 2
        STD = np.std(genotype, axis=1)   # currently unused

        if maf != 0:
            # Keep SNPs with maf < MAF < 1 - maf; 0.5 is excluded as well.
            filter = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[filter, :]
            Analyser.MAF = MAF[filter]
            Analyser.rsid = Analyser.rsid[filter]

            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue
        else:
            Analyser.MAF = MAF

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])
            if phenotype is None:
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)

            if interaction is not None:
                pass   # interaction terms not implemented yet

            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)
            N_con = a_inv.shape[1] - 1
            DF = phenotype.shape[0] - a_inv.shape[1]

            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)

            print('Read {}, processed {}, total {}'.format(SNP[0], SNP[1], SNP[2]))

            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True
            Analyser.save_result(
                phen.folder._data.names[phen.folder._data.start:
                                        phen.folder._data.finish])

            t_stat = None
            Analyser.t_stat = None
            del b4
            del C
            del b_cov
            del a_inv
            del a_test
            del t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(os.path.join(Analyser.out,
                             str(Analyser.node) + '_node_RSID.npy'),
                Analyser.rsid_dic)
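
# --- Illustration (not part of HASE) -----------------------------------------
# The A_*/B_*/C_matrix helpers used by haseregression precompute pieces of the
# classic OLS summaries X'X, X'y and y'y, from which t-statistics follow without
# revisiting row-level data. A minimal sketch of that algebra, assuming a single
# SNP column appended to the covariates; the mapping of these variables onto the
# HASE kernels is our reading of the function names, not their exact internals.
rng = np.random.default_rng(0)
n = 500
X = np.column_stack([np.ones(n),                    # intercept
                     rng.normal(size=n),            # covariate
                     rng.integers(0, 3, size=n)])   # SNP dosage
y = X @ np.array([0.5, 1.0, 0.2]) + rng.normal(size=n)

A = X.T @ X                      # roles of A_covariates / A_tests (blocks of X'X)
B = X.T @ y                      # roles of B_covariates / B4 (X'y)
C = y @ y                        # role of C_matrix (y'y)
A_inv = np.linalg.inv(A)
beta = A_inv @ B                 # OLS estimates
DF = n - A.shape[1]
sigma2 = (C - B @ beta) / DF     # residual variance from summaries alone
t_stat = beta / np.sqrt(sigma2 * np.diag(A_inv))
# ------------------------------------------------------------------------------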
def partial_derivatives(save_path=None, COV=None, PHEN=None, GEN=None,
                        MAP=None, MAF=None, R2=None, B4_flag=False,
                        study_name=None, intercept=True):

    row_index, ids = study_indexes(phenotype=PHEN.folder._data,
                                   genotype=GEN.folder._data,
                                   covariates=COV.folder._data)

    metadata = {}
    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata['id'] = ids
    metadata['MAF'] = []
    metadata['filter'] = []
    metadata['names'] = []   # TODO (low) change to cov_names
    metadata['phenotype'] = []

    b_cov = []
    C = []
    a_test = []
    b4 = []

    covariates = COV.get_next(index=row_index[2])

    if intercept:
        metadata['names'].append(study_name + '_intercept')
    metadata['names'] = metadata['names'] + [
        study_name + '_' + i for i in COV.folder._data.get_names()]

    a_cov = A_covariates(covariates, intercept=intercept)
    np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)

    with Timer() as t_phen:
        while True:
            phenotype = PHEN.get_next(index=row_index[1])
            if phenotype is None:
                b_cov = np.concatenate(b_cov, axis=1)
                C = np.concatenate(C, axis=0)
                np.save(os.path.join(save_path, study_name + '_b_cov.npy'), b_cov)
                np.save(os.path.join(save_path, study_name + '_C.npy'), C)
                break
            metadata['phenotype'] = metadata['phenotype'] + list(
                PHEN.folder._data.get_names())
            b_cov.append(B_covariates(covariates, phenotype, intercept=intercept))
            C.append(C_matrix(phenotype))

    print('Time to PD phenotype {} is {} s'.format(np.array(C).shape, t_phen.secs))

    N_snps_read = 0
    while True:
        with Timer() as t_gen:
            genotype = GEN.get_next()
            if genotype is None:
                np.save(os.path.join(save_path, study_name + '_a_test.npy'),
                        np.concatenate(a_test))
                np.save(os.path.join(save_path, study_name + '_metadata.npy'),
                        metadata)
                if B4_flag:
                    b4 = np.concatenate(b4, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b4.npy'), b4)
                break

            flip = MAP.flip[N_snps_read:N_snps_read + genotype.shape[0], 0]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]

            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)
            # TODO (low) add interaction

            a_test.append(A_tests(covariates, genotype, intercept=intercept))

            if B4_flag:
                # Works only when all phenotypes fit in one chunk; if not, do not
                # use this option! It would use too much disk space anyway.
                if len([f for f in PHEN.folder.files if f != 'info_dic.npy']) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    continue
                PHEN.folder.processed = 0
                if phenotype is None:
                    phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))

        print('Time to PD genotype {} is {} s'.format(genotype.shape, t_gen.secs))
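
# --- Illustration (not part of HASE) -----------------------------------------
# A hypothetical downstream load of the partial-derivative files written above;
# the file names mirror the np.save calls in partial_derivatives, and save_path /
# study_name stand in for the real arguments.
a_cov_pd = np.load(os.path.join(save_path, study_name + '_a_cov.npy'))
b_cov_pd = np.load(os.path.join(save_path, study_name + '_b_cov.npy'))
C_pd = np.load(os.path.join(save_path, study_name + '_C.npy'))
meta = np.load(os.path.join(save_path, study_name + '_metadata.npy'),
               allow_pickle=True).item()   # the dict is stored as a 0-d object array
print(a_cov_pd.shape, b_cov_pd.shape, C_pd.shape, len(meta['MAF']))
# ------------------------------------------------------------------------------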
def haseregression(phen, gen, cov, mapper, Analyser, maf, intercept=True):

    g = tuple([i.folder._data for i in gen])

    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=g,
                                   covariates=cov.folder._data)
    if mapper is not None:
        SNP = [0, 0, mapper.n_keys]
    else:
        SNP = [0, 0, 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    SNPs_index = None
                    break
                print(ch)
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys
        else:
            SNPs_index = None

        with Timer() as t:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print('time to read and merge genotype {}s'.format(t.secs))
        gc.collect()

        if genotype is None:
            print('All genotype processed!')
            break

        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]
        if mapper is None:
            Analyser.rsid = np.array(range(genotype.shape[0]))

        MAF = np.mean(genotype, axis=1) / 2
        if maf != 0:
            filter = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[filter, :]
            Analyser.MAF = MAF[filter]
            Analyser.rsid = Analyser.rsid[filter]

            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue
        else:
            Analyser.MAF = MAF

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])
            if phenotype is None:
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)
            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)
            N_con = a_inv.shape[1] - 1
            DF = phenotype.shape[0] - a_inv.shape[1]

            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)

            print('Read {}, processed {}, total {}'.format(SNP[0], SNP[1], SNP[2]))

            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True
            Analyser.save_result(
                phen.folder._data.names[phen.folder._data.start:
                                        phen.folder._data.finish])

            t_stat = None
            Analyser.t_stat = None
            del b4
            del C
            del b_cov
            del a_inv
            del a_test
            del t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(os.path.join(Analyser.out,
                             str(Analyser.node) + '_node_RSID.npy'),
                Analyser.rsid_dic)
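
# --- Illustration (not part of HASE) -----------------------------------------
# What the MAF filter above keeps: frequencies strictly between maf and 1 - maf,
# with SNPs at exactly 0.5 dropped as well (presumably because allele flips
# cannot be resolved from frequency there). Demo values only.
MAF_demo = np.array([0.001, 0.05, 0.5, 0.95, 0.999])
maf_demo = 0.01
keep = (MAF_demo > maf_demo) & (MAF_demo < 1 - maf_demo) & (MAF_demo != 0.5)
print(keep)   # [False  True False  True False]
# ------------------------------------------------------------------------------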
def partial_derivatives(save_path=None, COV=None, PHEN=None, GEN=None,
                        MAP=None, MAF=None, R2=None, B4_flag=False,
                        study_name=None, intercept=True):

    row_index, ids = study_indexes(phenotype=PHEN.folder._data,
                                   genotype=GEN.folder._data,
                                   covariates=COV.folder._data)

    metadata = {}
    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata['id'] = ids
    metadata['MAF'] = []
    metadata['filter'] = []
    metadata['names'] = []   # TODO (low) change to cov_names
    metadata['phenotype'] = []

    b_cov = []
    C = []
    a_test = []
    b4 = []

    covariates = COV.get_next(index=row_index[2])

    # On a cluster, only the first node computes the covariate and phenotype
    # partial derivatives; they do not depend on the genotype chunks.
    if MAP.cluster == 'n' or MAP.node[1] == 1:
        if intercept:
            metadata['names'].append(study_name + '_intercept')
        metadata['names'] = metadata['names'] + [
            study_name + '_' + i for i in COV.folder._data.get_names()]

        a_cov = A_covariates(covariates, intercept=intercept)
        np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)

        with Timer() as t_phen:
            while True:
                phenotype = PHEN.get_next(index=row_index[1])
                if phenotype is None:
                    b_cov = np.concatenate(b_cov, axis=1)
                    C = np.concatenate(C, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b_cov.npy'),
                            b_cov)
                    np.save(os.path.join(save_path, study_name + '_C.npy'), C)
                    break
                b_cov.append(B_covariates(covariates, phenotype,
                                          intercept=intercept))
                C.append(C_matrix(phenotype))

        # Collected here, outside the loop, because get_names() is not sliced
        # per chunk the way PHEN.get_next(...) slices the phenotype; appending
        # inside the loop raised an exception for larger sample counts (> 1000).
        metadata['phenotype'] = metadata['phenotype'] + list(
            PHEN.folder._data.get_names())

        print('Time to PD phenotype {} is {} s'.format(np.array(C).shape,
                                                       t_phen.secs))

    if MAP.cluster == 'y':
        # Split the genotype chunk files across nodes: this node reads its own
        # share (files2read) and counts the SNPs handled by earlier nodes
        # (filesdone) so that indexing into the flip vector stays correct.
        f_max = np.max([int(f.split('_')[0]) for f in GEN.folder.files])
        files2read = [
            '{}_{}.h5'.format(i, study_name)
            for i in np.array_split(range(f_max + 1),
                                    MAP.node[0])[MAP.node[1] - 1]][::-1]

        filesdone = []
        for i in range(MAP.node[1] - 1):
            filesdone = filesdone + [
                '{}_{}.h5'.format(j, study_name)
                for j in np.array_split(range(f_max + 1), MAP.node[0])[i]]

        N_snps_read = 0
        for f in filesdone:
            file = os.path.join(GEN.folder.path, 'genotype', f)
            N_snps_read += GEN.folder.get_info(file)['shape'][0]
    else:
        N_snps_read = 0

    while True:
        with Timer() as t_gen:
            if MAP.cluster == 'y':
                if len(files2read) != 0:
                    file = os.path.join(GEN.folder.path, 'genotype',
                                        files2read.pop())
                    genotype = GEN.folder.read(file)
                else:
                    genotype = None
            else:
                genotype = GEN.get_next()

            if genotype is None:
                if MAP.cluster == 'y':
                    np.save(os.path.join(save_path,
                                         'node_{}_'.format(MAP.node[1]) +
                                         study_name + '_a_test.npy'),
                            np.concatenate(a_test).astype(np.float64))
                    np.save(os.path.join(save_path,
                                         'node_{}_'.format(MAP.node[1]) +
                                         study_name + '_metadata.npy'),
                            metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(os.path.join(save_path,
                                             'node_{}_'.format(MAP.node[1]) +
                                             study_name + '_b4.npy'),
                                b4.astype(np.float64))
                    if MAP.node[1] == MAP.node[0]:
                        merge_PD(save_path, MAP.node[0], study_name)
                else:
                    np.save(os.path.join(save_path, study_name + '_a_test.npy'),
                            np.concatenate(a_test))
                    np.save(os.path.join(save_path,
                                         study_name + '_metadata.npy'),
                            metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(os.path.join(save_path, study_name + '_b4.npy'),
                                b4)
                break

            flip = MAP.flip[GEN.folder.name][N_snps_read:
                                             N_snps_read + genotype.shape[0]]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]

            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)
            # TODO (low) add interaction

            a_test.append(A_tests(covariates, genotype, intercept=intercept))

            if B4_flag:
                # Works only when all phenotypes fit in one chunk; if not, do not
                # use this option! It would use too much disk space anyway.
                if len([f for f in PHEN.folder.files if f != 'info_dic.npy']) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    continue
                PHEN.folder.processed = 0
                phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))

        print('Time to PD genotype {} is {} s'.format(genotype.shape,
                                                      t_gen.secs))
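
# --- Illustration (not part of HASE) -----------------------------------------
# How the cluster branch above assigns genotype chunk files to nodes: chunks
# 0..f_max are split near-evenly over MAP.node[0] nodes with np.array_split.
# 'mystudy' is a made-up study name.
f_max_demo, n_nodes = 9, 3
for node in range(1, n_nodes + 1):
    chunk_files = ['{}_{}.h5'.format(i, 'mystudy')
                   for i in np.array_split(range(f_max_demo + 1), n_nodes)[node - 1]]
    print(node, chunk_files)
# 1 ['0_mystudy.h5', '1_mystudy.h5', '2_mystudy.h5', '3_mystudy.h5']
# 2 ['4_mystudy.h5', '5_mystudy.h5', '6_mystudy.h5']
# 3 ['7_mystudy.h5', '8_mystudy.h5', '9_mystudy.h5']
# ------------------------------------------------------------------------------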
mapper.genotype_names = args.study_name
mapper.chunk_size = MAPPER_CHUNK_SIZE
mapper.reference_name = args.ref_name
mapper.load_flip(args.mapper)
mapper.load(args.mapper)

phen = Reader('phenotype')
phen.start(args.phenotype[0])

gen = Reader('genotype')
gen.start(args.genotype[0], hdf5=args.hdf5,
          study_name=args.study_name[0], ID=False)

e = Encoder(args.out)
e.study_name = args.study_name[0]

row_index, ids = study_indexes(phenotype=phen.folder._data,
                               genotype=gen.folder._data)
with Timer() as t:
    e.matrix(len(ids), save=True)

N_snps_read = 0
while True:
    with Timer() as t_gen:
        genotype = gen.get_next()
        if genotype is None:
            break
        flip = mapper.flip[N_snps_read:N_snps_read + genotype.shape[0], 0]
        N_snps_read += genotype.shape[0]
        flip_index = (flip == -1)
        genotype = np.apply_along_axis(
            lambda x: flip * (x - 2 * flip_index), 0, genotype)
        genotype = genotype[:, row_index[0]]
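
# --- Illustration (not part of HASE) -----------------------------------------
# One way to see why an encoding matrix can preserve association results, which
# is the property the Encoder relies on: for any invertible n x n matrix F over
# the shared samples, parties can exchange A @ F and inv(F) @ B while every
# cross-product the regression needs stays exact. This sketch is our reading of
# the idea, not the Encoder's actual internals; all names are demo-only.
rng = np.random.default_rng(1)
n = 100                               # stands in for len(ids)
F = rng.normal(size=(n, n))           # random matrix; almost surely invertible
A = rng.integers(0, 3, size=(5, n))   # genotype chunk: SNPs x samples
B = rng.normal(size=(n, 2))           # phenotypes: samples x traits
np.testing.assert_allclose((A @ F) @ (np.linalg.inv(F) @ B), A @ B, atol=1e-8)
# ------------------------------------------------------------------------------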