def main(): #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) p_dict = parse_parameters() #Use the same LD file as LDpred local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['ld_prefix'], p_dict['ld_radius']) print """ Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set. If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. """ if not os.path.isfile(local_ld_dict_file): df = h5py.File(p_dict['coord']) chrom_ld_scores_dict = {} chrom_ld_dict = {} chrom_ref_ld_mats = {} ld_score_sum = 0 num_snps = 0 print 'Calculating LD information w. radius %d'% p_dict['ld_radius'] cord_data_g = df['cord_data'] for chrom_str in cord_data_g.keys(): print 'Working on %s'%chrom_str g = cord_data_g[chrom_str] if 'raw_snps_ref' in g.keys(): raw_snps = g['raw_snps_ref'][...] snp_stds = g['snp_stds_ref'][...] snp_means = g['snp_means_ref'][...] n_snps = len(raw_snps) snp_means.shape = (n_snps,1) snp_stds.shape = (n_snps,1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32') ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)} ld_score_sum += sp.sum(ld_scores) num_snps += n_snps avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict} print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score'] ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats} with gzip.open(local_ld_dict_file, 'wb') as f: cPickle.dump(ld_dict, f, protocol=2) print 'LD information is now pickled.' else: print 'Loading LD information from file: %s'%local_ld_dict_file with gzip.open(local_ld_dict_file, 'r') as f: ld_dict = cPickle.load(f) ldpred_inf_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ld_radius=p_dict['ld_radius'], ld_dict = ld_dict, n=p_dict['N'], h2=p_dict['H2'], verbose=False)
def main(): p_dict = parse_parameters() #Use the same LD file as LDpred local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['ld_prefix'], p_dict['ld_radius']) print """ Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set. If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. """ if not os.path.isfile(local_ld_dict_file): df = h5py.File(p_dict['coord']) chrom_ld_scores_dict = {} chrom_ld_dict = {} chrom_ref_ld_mats = {} ld_score_sum = 0 num_snps = 0 print 'Calculating LD information w. radius %d'% p_dict['ld_radius'] cord_data_g = df['cord_data'] for chrom_str in cord_data_g.keys(): print 'Working on %s'%chrom_str g = cord_data_g[chrom_str] if 'raw_snps_ref' in g.keys(): raw_snps = g['raw_snps_ref'][...] snp_stds = g['snp_stds_ref'][...] snp_means = g['snp_means_ref'][...] n_snps = len(raw_snps) snp_means.shape = (n_snps,1) snp_stds.shape = (n_snps,1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32') ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)} ld_score_sum += sp.sum(ld_scores) num_snps += n_snps avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict} print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score'] ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats} with gzip.open(local_ld_dict_file, 'wb') as f: cPickle.dump(ld_dict, f, protocol=2) print 'LD information is now pickled.' else: print 'Loading LD information from file: %s'%local_ld_dict_file with gzip.open(local_ld_dict_file, 'r') as f: ld_dict = cPickle.load(f) ldpred_inf_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ld_radius=p_dict['ld_radius'], ld_dict = ld_dict, n=p_dict['N'], h2=p_dict['H2'], verbose=False)
def main(): p_dict = parse_parameters() local_ld_dict_file = '%s_ldradius%d.pickled.gz' % ( p_dict['local_ld_file_prefix'], p_dict['ld_radius']) print """ Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set. If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. """ if not os.path.isfile(local_ld_dict_file): df = h5py.File(p_dict['coord']) chrom_ld_scores_dict = {} chrom_ld_dict = {} chrom_ref_ld_mats = {} if p_dict['gm_ld_radius'] is not None: chrom_ld_boundaries = {} ld_score_sum = 0 num_snps = 0 print 'Calculating LD information w. radius %d' % p_dict['ld_radius'] cord_data_g = df['cord_data'] for chrom_str in cord_data_g.keys(): print 'Working on %s' % chrom_str g = cord_data_g[chrom_str] if 'raw_snps_ref' in g.keys(): raw_snps = g['raw_snps_ref'][...] snp_stds = g['snp_stds_ref'][...] snp_means = g['snp_means_ref'][...] #Filter monomorphic SNPs ok_snps_filter = snp_stds > 0 ok_snps_filter = ok_snps_filter.flatten() raw_snps = raw_snps[ok_snps_filter] snp_means = snp_means[ok_snps_filter] snp_stds = snp_stds[ok_snps_filter] n_snps = len(raw_snps) snp_means.shape = (n_snps, 1) snp_stds.shape = (n_snps, 1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32') assert snps.shape == raw_snps.shape, 'Array Shape mismatch' if p_dict['gm_ld_radius'] is not None: assert 'genetic_map' in g.keys(), 'Genetic map is missing.' gm = g['genetic_map'][...] ret_dict = ld.get_LDpred_ld_tables( snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius']) chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries'] else: ret_dict = ld.get_LDpred_ld_tables( snps, ld_radius=p_dict['ld_radius'], ld_window_size=2 * p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = { 'ld_scores': ld_scores, 'avg_ld_score': sp.mean(ld_scores) } ld_score_sum += sp.sum(ld_scores) num_snps += n_snps avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = { 'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict': chrom_ld_scores_dict } print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict[ 'avg_gw_ld_score'] ld_dict = { 'ld_scores_dict': ld_scores_dict, 'chrom_ld_dict': chrom_ld_dict, 'chrom_ref_ld_mats': chrom_ref_ld_mats } if p_dict['gm_ld_radius'] is not None: ld_dict['chrom_ld_boundaries'] = chrom_ld_boundaries f = gzip.open(local_ld_dict_file, 'wb') cPickle.dump(ld_dict, f, protocol=2) f.close() print 'LD information is now pickled.' else: print 'Loading LD information from file: %s' % local_ld_dict_file f = gzip.open(local_ld_dict_file, 'r') ld_dict = cPickle.load(f) f.close() ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'], ld_dict=ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False)
raw_snps = raw_snps[ok_snps_filter] snp_means = snp_means[ok_snps_filter] snp_stds = snp_stds[ok_snps_filter] n_snps = len(raw_snps) snp_means.shape = (n_snps,1) snp_stds.shape = (n_snps,1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32') assert snps.shape==raw_snps.shape, 'Array Shape mismatch' if p_dict['gm_ld_radius'] is not None: assert 'genetic_map' in g.keys(), 'Genetic map is missing.' gm = g['genetic_map'][...] ret_dict = ld.get_LDpred_ld_tables(snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius']) chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries'] else: ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)} ld_score_sum += sp.sum(ld_scores) num_snps += n_snps avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict} print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score'] ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats}
def main(): p_dict = parse_parameters() # - start wallace # local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['local_ld_file_prefix'], p_dict['ld_radius']) local_ld_dict_file = p_dict['local_ld_file'] # - end wallace print """ Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set. If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. """ # wallace: # Generate the local_ld_file file. if not os.path.isfile(local_ld_dict_file): # - start wallace, should not run into this point in this file. print 'ERROR: can not find LD file, please run "LDpred.getLocalLDFile.CHR.Wallace.V1.py" to get them!' sys.exit(-1) # - end wallace df = h5py.File(p_dict['coord']) chrom_ld_scores_dict = {} chrom_ld_dict = {} chrom_ref_ld_mats = {} if p_dict['gm_ld_radius'] is not None: chrom_ld_boundaries = {} ld_score_sum = 0 num_snps = 0 print 'Calculating LD information w. radius %d' % p_dict['ld_radius'] cord_data_g = df['cord_data'] for chrom_str in cord_data_g.keys(): print 'Working on %s' % chrom_str g = cord_data_g[chrom_str] if 'raw_snps_ref' in g.keys(): raw_snps = g['raw_snps_ref'][...] snp_stds = g['snp_stds_ref'][...] snp_means = g['snp_means_ref'][...] #Filter monomorphic SNPs ok_snps_filter = snp_stds > 0 ok_snps_filter = ok_snps_filter.flatten() raw_snps = raw_snps[ok_snps_filter] snp_means = snp_means[ok_snps_filter] snp_stds = snp_stds[ok_snps_filter] n_snps = len(raw_snps) snp_means.shape = (n_snps, 1) snp_stds.shape = (n_snps, 1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32') assert snps.shape == raw_snps.shape, 'Array Shape mismatch' if p_dict['gm_ld_radius'] is not None: assert 'genetic_map' in g.keys(), 'Genetic map is missing.' gm = g['genetic_map'][...] ret_dict = ld.get_LDpred_ld_tables( snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius']) chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries'] else: ret_dict = ld.get_LDpred_ld_tables( snps, ld_radius=p_dict['ld_radius'], ld_window_size=2 * p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = { 'ld_scores': ld_scores, 'avg_ld_score': sp.mean(ld_scores) } ld_score_sum += sp.sum(ld_scores) num_snps += n_snps # - start Wallace --- # gather data for estimate heritability # ref ldpred_genomewide section: betas = g['betas'][...] n_betas = len(betas) # sum_beta2s += sp.sum(betas ** 2) #WRITE OUT CHROMOSOME LEVEL data. with open(local_ld_dict_file + '_byFileCache' + '.txt', 'w') as f: f.write( chrom_str + ': ld_scores\t%f\tn_snps\t%d\ttotal_beta_square\t%f\tn_betas\t%d\n' % (sp.sum(ld_scores), n_snps, sp.sum(betas**2), n_betas)) # - end Wallace --- avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = { 'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict': chrom_ld_scores_dict } print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict[ 'avg_gw_ld_score'] # This part is dumpped to disk # Global values: ld_scores_dict # Chromosome wise values: chrom_ld_dict, chrom_ref_ld_mats. ld_dict = { 'ld_scores_dict': ld_scores_dict, 'chrom_ld_dict': chrom_ld_dict, 'chrom_ref_ld_mats': chrom_ref_ld_mats } if p_dict['gm_ld_radius'] is not None: ld_dict['chrom_ld_boundaries'] = chrom_ld_boundaries f = gzip.open(local_ld_dict_file, 'wb') cPickle.dump(ld_dict, f, protocol=2) f.close() print 'LD information is now pickled.' else: print 'Loading LD information from file: %s' % local_ld_dict_file f = gzip.open(local_ld_dict_file, 'r') ld_dict = cPickle.load(f) f.close() # - start wallace # ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'], # ld_dict = ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False) ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'], ld_dict=ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False, local_ld_dict_file=local_ld_dict_file)
def main(): p_dict = parse_parameters() local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['local_ld_file_prefix'], p_dict['ld_radius']) print """ Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set. If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. """ if not os.path.isfile(local_ld_dict_file): df = h5py.File(p_dict['coord']) chrom_ld_scores_dict = {} chrom_ld_dict = {} chrom_ref_ld_mats = {} if p_dict['gm_ld_radius'] is not None: chrom_ld_boundaries={} ld_score_sum = 0 num_snps = 0 print 'Calculating LD information w. radius %d'% p_dict['ld_radius'] cord_data_g = df['cord_data'] for chrom_str in cord_data_g.keys(): print 'Working on %s'%chrom_str g = cord_data_g[chrom_str] if 'raw_snps_ref' in g.keys(): raw_snps = g['raw_snps_ref'][...] snp_stds = g['snp_stds_ref'][...] snp_means = g['snp_means_ref'][...] #Filter monomorphic SNPs ok_snps_filter = snp_stds>0 ok_snps_filter = ok_snps_filter.flatten() raw_snps = raw_snps[ok_snps_filter] snp_means = snp_means[ok_snps_filter] snp_stds = snp_stds[ok_snps_filter] n_snps = len(raw_snps) snp_means.shape = (n_snps,1) snp_stds.shape = (n_snps,1) # Normalize SNPs.. snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32') assert snps.shape==raw_snps.shape, 'Array Shape mismatch' if p_dict['gm_ld_radius'] is not None: assert 'genetic_map' in g.keys(), 'Genetic map is missing.' gm = g['genetic_map'][...] ret_dict = ld.get_LDpred_ld_tables(snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius']) chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries'] else: ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius']) chrom_ld_dict[chrom_str] = ret_dict['ld_dict'] chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices'] ld_scores = ret_dict['ld_scores'] chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)} ld_score_sum += sp.sum(ld_scores) num_snps += n_snps avg_gw_ld_score = ld_score_sum / float(num_snps) ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict} print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score'] ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats} if p_dict['gm_ld_radius'] is not None: ld_dict['chrom_ld_boundaries']=chrom_ld_boundaries f = gzip.open(local_ld_dict_file, 'wb') cPickle.dump(ld_dict, f, protocol=2) f.close() print 'LD information is now pickled.' else: print 'Loading LD information from file: %s'%local_ld_dict_file f = gzip.open(local_ld_dict_file, 'r') ld_dict = cPickle.load(f) f.close() ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'], ld_dict = ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False)