def save_montage(NIFTI, ANAT, ONAME, SGN):
    """
    Draw one panel per feature (last axis) of the image NIFTI on top of the
    anatomical image ANAT and save the resulting figure to ONAME.

    SGN selects the colormap (cmaps[SGN + 1]); voxels whose sign equals
    negative(SGN) are zeroed before plotting.  Features whose largest
    absolute value does not exceed thr + 0.2 get no panel.
    """
    nifti = load_image(NIFTI)
    anat = load_image(ANAT)

    # Shared colour range so that all panels are comparable.
    intensity_range = dict(vmax=nifti.get_data().max(),
                           vmin=nifti.get_data().min())
    mcmap = cmaps[SGN + 1]

    # Grid layout: y columns, x rows.
    num_features = nifti.shape[-1]
    y = max([1, int(round(sqrt(num_features / 3)))])
    x = int(ceil(num_features / y) + 1)

    rc('font', **{'size': 8})
    figure(figsize=[iscale * y, iscale * x / 3])
    subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.99,
                    wspace=0.1, hspace=0)

    for feat_i in range(num_features):
        data = nifti.get_data()[:, :, :, feat_i]
        # Drop the voxels of the sign we are not interested in.
        data[sign(data) == negative(SGN)] = 0
        if not (max(abs(data.flatten())) > thr + 0.2):
            continue

        ax = subplot(x, y, feat_i + 1)
        # Cut through the voxel holding the maximum value.
        peak = np.unravel_index(argmax(data), data.shape)
        cuts = coord_transform(peak[0], peak[1], peak[2], xyz_affine(nifti))
        plot_map(data, xyz_affine(nifti),
                 anat=anat.get_data(), anat_affine=xyz_affine(anat),
                 black_bg=True, threshold=thr, cut_coords=cuts,
                 annotate=False, axes=ax, cmap=mcmap, draw_cross=False,
                 **intensity_range)
        text(0., 0.95, str(feat_i), transform=ax.transAxes,
             horizontalalignment='center', color=(1, 1, 1))

    savefig(ONAME, facecolor=(0, 0, 0))
def calc_ibd_kinship(snps, dtype='single', scaled=True):
    """
    Calculate a kinship matrix as the average, over SNPs, of the outer
    product of the standardized genotype vectors.

    snps: sequence of SNPs, each a sequence with one genotype per individual.
    dtype: dtype of the accumulated kinship matrix.
    scaled: if True, the matrix is passed through scale_k() before returning.

    Returns an (individuals x individuals) array.

    Every SNP is centered and divided by its standard deviation, so a
    monomorphic SNP (std == 0) yields NaNs and trips the assertion below.
    """
    num_snps = len(snps)
    n_indivs = len(snps[0])
    k_mat = sp.zeros((n_indivs, n_indivs), dtype=dtype)

    # Work through the SNPs in chunks of n_indivs SNPs at a time.
    for chunk_i, start in enumerate(range(0, num_snps, n_indivs)):
        # Rows are individuals, columns are the SNPs of this chunk.
        snps_array = sp.array(snps[start:start + n_indivs]).T
        norm_snps_array = (snps_array - sp.mean(snps_array, 0)) / sp.std(snps_array, 0)
        # Boolean arrays must not be negated with sp.negative (recent numpy
        # raises a TypeError), hence the explicit any/not.
        assert not sp.any(sp.isnan(norm_snps_array)), 'WTF?'

        # (individuals x SNPs) . (SNPs x individuals)
        k_mat += sp.dot(norm_snps_array, norm_snps_array.T)

        progress = 100.0 * min(1, ((chunk_i + 1.0) * n_indivs) / num_snps)
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % progress)
        sys.stdout.flush()

    k_mat = k_mat / float(num_snps)
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
def calc_ibd_kinship(snps, dtype='single', scaled=True):
    """
    Calculate a kinship matrix as the average, over SNPs, of the outer
    product of the standardized genotype vectors.

    snps: sequence of SNPs, each a sequence with one genotype per individual.
    dtype: dtype of the accumulated kinship matrix.
    scaled: if True, the matrix is passed through scale_k() before returning.

    Returns an (individuals x individuals) array.

    Every SNP is centered and divided by its standard deviation, so a
    monomorphic SNP (std == 0) yields NaNs and trips the assertion below.
    """
    num_snps = len(snps)
    n_indivs = len(snps[0])
    k_mat = sp.zeros((n_indivs, n_indivs), dtype=dtype)

    # Work through the SNPs in chunks of n_indivs SNPs at a time.
    for chunk_i, start in enumerate(range(0, num_snps, n_indivs)):
        # Rows are individuals, columns are the SNPs of this chunk.
        snps_array = sp.array(snps[start:start + n_indivs]).T
        norm_snps_array = (snps_array - sp.mean(snps_array, 0)) / sp.std(snps_array, 0)
        # Boolean arrays must not be negated with sp.negative (recent numpy
        # raises a TypeError), hence the explicit any/not.
        assert not sp.any(sp.isnan(norm_snps_array)), 'WTF?'

        # (individuals x SNPs) . (SNPs x individuals)
        k_mat += sp.dot(norm_snps_array, norm_snps_array.T)

        progress = 100.0 * min(1, ((chunk_i + 1.0) * n_indivs) / num_snps)
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % progress)
        sys.stdout.flush()

    k_mat = k_mat / float(num_snps)
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
def parse_1KG_snp_info(input_file='/project/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5' ,
                       out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_SNP_INFO_EUR_MAF0.05.hdf5',
                       filter_ambiguous=True, maf_thres=0.05):
    """
    Write a SNP info file (IDs, positions, European MAFs, ref/alt
    nucleotides) for chromosomes 1-22, one HDF5 group per chromosome.

    Only individuals whose continent is 'EUR' are used.  SNPs are dropped if
    they are multi-allelic, if either nucleotide is not in ok_nts, or if the
    minor allele frequency among Europeans is below maf_thres.

    input_file: HDF5 genotype file, opened read-only.
    out_file: HDF5 file receiving the groups 'chr1' .. 'chr22'.  It is opened
        in append mode, so creating a group that already exists fails.
    filter_ambiguous: currently not used by this function.
    maf_thres: minimum minor allele frequency a SNP needs to be kept.

    NOTE(review): genotypes are assumed to be allele counts in {0, 1, 2}
    (the original range check allowed values up to 2), so the allele
    frequency is the count sum divided by 2 * (number of individuals).
    Please confirm against the input file.
    """
    print('Generating a SNP info file')
    with h5py.File(input_file, 'r') as ih5f, h5py.File(out_file, 'a') as oh5f:
        num_indivs = len(ih5f['indivs']['continent'])
        eur_filter = ih5f['indivs']['continent'][...] == 'EUR'
        num_eur_indivs = sp.sum(eur_filter)
        print('Number of European individuals: %d \nTotal number of individuals: %d'
              % (num_eur_indivs, num_indivs))

        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom
            variants = ih5f[chrom_str]['variants']

            print('Loading SNPs and data')
            snps = sp.array(ih5f[chrom_str]['calldata']['snps'][...], dtype='int8')
            print('Excluding non-European individuals')
            snps = snps[:, eur_filter]

            print('Loading other SNP information')
            snp_ids = variants['ID'][...]
            positions = variants['POS'][...]
            print('Loading NTs')
            ref_nts = variants['REF'][...]
            alt_nts = variants['ALT'][...]

            print('Filtering multi-allelic SNPs')
            # logical_not rather than negative: the latter is rejected for
            # boolean arrays by recent numpy versions.
            multi_allelic_filter = sp.logical_not(variants['MULTI_ALLELIC'][...])
            snps, ref_nts, alt_nts, snp_ids, positions = [
                a[multi_allelic_filter]
                for a in (snps, ref_nts, alt_nts, snp_ids, positions)]

            print('Filter SNPs with missing NT information')
            nt_filter = sp.in1d(ref_nts, ok_nts)
            nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
            if sp.sum(nt_filter) < len(nt_filter):
                snps, ref_nts, alt_nts, snp_ids, positions = [
                    a[nt_filter]
                    for a in (snps, ref_nts, alt_nts, snp_ids, positions)]

            print('Filtering SNPs with MAF < %s' % (maf_thres,))
            # Float division by the number of chromosomes (2 per individual);
            # dividing by the number of individuals gave the mean allele
            # count (0..2), not a frequency.
            afs = sp.sum(snps, axis=1) / (2.0 * num_eur_indivs)
            assert sp.all(0 <= afs) and sp.all(afs <= 1), 'AF is out of range'
            mafs = sp.minimum(afs, 1 - afs)
            # Keep-mask: True for the SNPs that pass the threshold.
            maf_filter = mafs >= maf_thres
            snps, ref_nts, alt_nts, snp_ids, positions, mafs = [
                a[maf_filter]
                for a in (snps, ref_nts, alt_nts, snp_ids, positions, mafs)]

            g = oh5f.create_group(chrom_str)
            g.create_dataset('sids', data=snp_ids)
            g.create_dataset('positions', data=positions)
            g.create_dataset('eur_mafs', data=mafs)
            g.create_dataset('ref', data=ref_nts)
            g.create_dataset('alt', data=alt_nts)
            oh5f.flush()
def _add_kinship_block(k_mat, counts_mat, strain_idx, norm_snps):
    """
    Add norm_snps^T . norm_snps to the (strain_idx x strain_idx) sub-block of
    k_mat, and the number of SNPs to the same sub-block of counts_mat.
    Both matrices are updated in place.
    """
    # Fancy indexing copies, so the rows are updated and written back.
    k_rows = k_mat[strain_idx]
    k_rows[:, strain_idx] += sp.dot(norm_snps.T, norm_snps)
    k_mat[strain_idx] = k_rows

    count_rows = counts_mat[strain_idx]
    count_rows[:, strain_idx] += len(norm_snps)
    counts_mat[strain_idx] = count_rows


def get_kinships(snps_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/new_snps.HDF5',
                 plot_figures = True,
                 figure_dir = 'C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode',
                 fig_id = 'all',
                 min_maf = 0.1,
                 max_strain_num=200):
    """
    Calculates kinship matrices between strains from all SNPs, codon SNPs,
    synonymous codon SNPs and non-synonymous codon SNPs.

    Every group of snps_file with fewer than max_strain_num strains
    contributes the cross-product of its normalized SNPs (those with
    MAF > min_maf) to the entries of the strains it contains.  Each matrix is
    finally divided element-wise by the number of SNPs that contributed to
    each entry; entries of strain pairs without any shared SNP are 0/0.

    If plot_figures is True, plot_dirty_PCA is called for each matrix with
    file names built from fig_id, written to figure_dir.

    Returns a dict with the four kinship matrices ('K_*'), the four count
    matrices ('counts_mat_*') and the sorted strain list ('strains').
    """
    with h5py.File(snps_file, 'r') as h5f:
        gene_groups = list(h5f.keys())

        # First pass: collect the strains of all groups that will be used.
        all_strains = set()
        for gg in gene_groups:
            strains = h5f[gg]['strains'][...]
            if len(strains) < max_strain_num:
                all_strains.update(strains)
        num_strains = len(all_strains)
        print('Found %d "distinct" strains' % num_strains)

        ordered_strains = sorted(all_strains)
        strain_index = pd.Index(ordered_strains)
        mat_shape = (num_strains, num_strains)
        K_snps = sp.zeros(mat_shape)
        counts_mat_snps = sp.zeros(mat_shape)
        K_codon_snps = sp.zeros(mat_shape)
        counts_mat_codon_snps = sp.zeros(mat_shape)
        K_nonsyn_snps = sp.zeros(mat_shape)
        counts_mat_nonsyn_snps = sp.zeros(mat_shape)
        K_syn_snps = sp.zeros(mat_shape)
        counts_mat_syn_snps = sp.zeros(mat_shape)

        # Second pass: accumulate the cross-products group by group.
        for i, gg in enumerate(gene_groups):
            if i % 100 == 0:
                print('Working on gene nr. %d' % i)
            data_g = h5f[gg]
            strains = data_g['strains'][...]
            if len(strains) >= max_strain_num:
                continue
            # Positions of this group's strains in the global ordering.
            strain_mask = strain_index.get_indexer(strains)

            snps = data_g['norm_snps'][...]
            freqs = data_g['freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            snps = snps[mafs > min_maf]
            if len(snps) == 0:
                continue
            _add_kinship_block(K_snps, counts_mat_snps, strain_mask, snps)

            codon_snps = data_g['norm_codon_snps'][...]
            if len(codon_snps) == 0:
                continue
            freqs = data_g['codon_snp_freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            maf_mask = mafs > min_maf
            codon_snps = codon_snps[maf_mask]
            is_synonimous_snp = data_g['is_synonimous_snp'][...]
            is_synonimous_snp = is_synonimous_snp[maf_mask]
            if len(codon_snps) == 0:
                continue
            _add_kinship_block(K_codon_snps, counts_mat_codon_snps,
                               strain_mask, codon_snps)

            if sp.sum(is_synonimous_snp) > 0:
                _add_kinship_block(K_syn_snps, counts_mat_syn_snps,
                                   strain_mask, codon_snps[is_synonimous_snp])

            # logical_not rather than negative: the latter is rejected for
            # boolean arrays by recent numpy versions.
            is_nonsynonimous_snp = sp.logical_not(is_synonimous_snp)
            if sp.sum(is_nonsynonimous_snp) > 0:
                _add_kinship_block(K_nonsyn_snps, counts_mat_nonsyn_snps,
                                   strain_mask, codon_snps[is_nonsynonimous_snp])

    # Element-wise division by the number of contributing SNPs.
    K_snps = K_snps / counts_mat_snps
    K_codon_snps = K_codon_snps / counts_mat_codon_snps
    K_syn_snps = K_syn_snps / counts_mat_syn_snps
    K_nonsyn_snps = K_nonsyn_snps / counts_mat_nonsyn_snps

    if plot_figures:
        plot_dirty_PCA(K_snps, figure_fn='PCA34_all_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_all_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='All SNPs')
        plot_dirty_PCA(K_codon_snps, figure_fn='PCA34_codon_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_codon_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Codon SNPs')
        plot_dirty_PCA(K_syn_snps, figure_fn='PCA34_syn_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_syn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Synonymous SNPs')
        plot_dirty_PCA(K_nonsyn_snps, figure_fn='PCA_34nonsyn_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_nonsyn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Non-Synonymous SNPs')

    # Each line reports the count matrix of its own SNP class.
    print('Average number of SNPs: %0.2f.' % sp.mean(counts_mat_snps))
    print('Average number of codon SNPs: %0.2f.' % sp.mean(counts_mat_codon_snps))
    print('Average number of synonymous SNPs: %0.2f.' % sp.mean(counts_mat_syn_snps))
    print('Average number of non-synonymous SNPs: %0.2f.' % sp.mean(counts_mat_nonsyn_snps))

    return {'K_snps': K_snps,
            'K_codon_snps': K_codon_snps,
            'counts_mat_snps': counts_mat_snps,
            'counts_mat_codon_snps': counts_mat_codon_snps,
            'K_syn_snps': K_syn_snps,
            'K_nonsyn_snps': K_nonsyn_snps,
            'counts_mat_syn_snps': counts_mat_syn_snps,
            'counts_mat_nonsyn_snps': counts_mat_nonsyn_snps,
            'strains': ordered_strains}
def leave_k_out_blup(num_repeats=20, num_cvs=5,
                     genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
                     k_thres=0.5):
    """
    Repeated cross-validated gBLUP prediction of the CEGS phenotypes.

    For each of num_repeats repetitions, each phenotype and each environment,
    the individuals are randomly split into validation chunks of
    n // num_cvs + 1 individuals.  For every chunk a linear mixed model with
    the kinship as random effect is fitted on the remaining individuals, and
    the validation phenotypes are predicted from the cross kinship.

    The squared correlation, correlation, observed and predicted values and
    the average pseudo-heritability are stored in an HDF5 file, under
    'repl_<i>/<phenotype>/<env>'.  The file is opened in append mode, so
    writing into a file that already holds these groups fails.

    num_repeats: number of repetitions of the whole cross-validation.
    num_cvs: determines the validation chunk size (see above).
    genotype_file: currently not used by this function.
    k_thres: kinship threshold handed to the genotype/phenotype coordination.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                print('%s %s' % (phenotype, env))

                # Load data..
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                assert not sp.any(sp.isnan(snps)), 'WTF?'
                K = kinship.calc_ibd_kinship(snps)
                print('\nKinship calculated')
                assert not sp.any(sp.isnan(K)), 'WTF?'
                n = len(Y_means)

                # Create the CV sets of prediction and validation data
                cv_chunk_size = n // num_cvs + 1
                ordering = sp.random.permutation(n)
                a = sp.arange(n)
                osb_ys = []
                pred_ys = []
                p_herits = []
                for i in range(0, n, cv_chunk_size):
                    end_i = min(n, i + cv_chunk_size)
                    validation_filter = sp.in1d(a, ordering[i:end_i])
                    # logical_not rather than negative: the latter is
                    # rejected for boolean arrays by recent numpy versions.
                    training_filter = sp.logical_not(validation_filter)
                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    # Calc. kinship
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]

                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)
                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                    osb_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)

                corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
                print('Correlation: %s' % (corr,))
                r2 = corr ** 2
                print('R2: %s' % (r2,))
                mean_herit = sp.mean(p_herits)
                print('Avg. heritability: %s' % (mean_herit,))
                env_dict[env] = {'R2': r2, 'obs_y': osb_ys, 'pred_y': pred_ys,
                                 'corr': corr, 'avg_herit': mean_herit}
            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Explicit append mode: newer h5py versions open read-only by default.
    with h5py.File(res_hdf5_file, 'a') as h5f:
        for rep_i in range(num_repeats):
            res_dict = rep_dict[rep_i]
            rep_g = h5f.create_group('repl_%d' % rep_i)
            for phenotype in phenotypes:
                phen_g = rep_g.create_group(phenotype)
                for env in envs:
                    d = res_dict[phenotype][env]
                    env_g = phen_g.create_group(env)
                    env_g.create_dataset('R2', data=[d['R2']])
                    env_g.create_dataset('corr', data=[d['corr']])
                    env_g.create_dataset('obs_y', data=d['obs_y'])
                    env_g.create_dataset('pred_y', data=d['pred_y'])
                    env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def _filter_cegs_individuals(ind_filter, gt_ids, Y_means, Y_medians, rep_count, snps):
    """
    Apply the boolean individual filter to all per-individual arrays and to
    the columns of the SNP matrix; returns them in the order they were given.
    """
    return (gt_ids[ind_filter], Y_means[ind_filter], Y_medians[ind_filter],
            rep_count[ind_filter], snps[:, ind_filter])


def coordinate_cegs_genotype_phenotype(
        phen_dict, phenotype='Protein', env='mated', k_thres=0.8,
        ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'
):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.

    Steps, in order: keep the individuals selected by the phenotype's
    'ind_filter'; drop individuals with a missing rate of at least
    ind_missing_thres; drop a fixed list of "bad" genotypes; drop SNPs with a
    missing rate of at least snp_missing_thres; impute the remaining missing
    values with the SNP mean; drop SNPs with MAF not above maf_thres; drop
    individuals whose kinship with a later individual exceeds k_thres (and,
    if any were dropped, filter on MAF once more).

    Returns a dict with 'Y_means', 'Y_medians', 'rep_count', 'gt_ids',
    'positions' and 'snps' (SNPs x individuals).
    """
    p_dict = phen_dict[phenotype][env]
    phen_ind_filter = p_dict['ind_filter']

    # Everything needed from the file is read up front so it can be closed.
    with h5py.File(genotype_file, 'r') as gh5f:
        print('Loading SNPs')
        snps = sp.array(gh5f['gt'][...], dtype='single')
        snps = snps[:, phen_ind_filter]
        positions = gh5f['pos'][...]
        gt_ids = gh5f['gt_ids'][phen_ind_filter]
    Y_means = p_dict['Y_means'][phen_ind_filter]
    Y_medians = p_dict['Y_medians'][phen_ind_filter]
    rep_count = p_dict['rep_count'][phen_ind_filter]

    m, n = snps.shape
    print('Loaded %d SNPs for %d individuals' % (m, n))

    print('Filtering individuals with missing rates >%0.2f' % ind_missing_thres)
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat, 0) / float(m)
    ind_filter = ind_missing_rates < ind_missing_thres
    # logical_not rather than negative: the latter is rejected for boolean
    # arrays by recent numpy versions.
    print('Filtered %d individuals due to high missing rates' % sp.sum(
        sp.logical_not(ind_filter)))
    gt_ids, Y_means, Y_medians, rep_count, snps = _filter_cegs_individuals(
        ind_filter, gt_ids, Y_means, Y_medians, rep_count, snps)

    print('Now removing "bad" genotypes.')
    bad_genotypes = [
        'Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591',
        'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 'Raleigh_336',
        'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799',
        'Raleigh_821', 'Raleigh_822', 'Raleigh_884', 'Raleigh_335'
    ]
    ind_filter = sp.logical_not(sp.in1d(gt_ids, bad_genotypes))
    gt_ids, Y_means, Y_medians, rep_count, snps = _filter_cegs_individuals(
        ind_filter, gt_ids, Y_means, Y_medians, rep_count, snps)
    print('Removed %d "bad" genotypes' % sp.sum(sp.logical_not(ind_filter)))

    n = snps.shape[1]

    print('Filtering SNPs with missing rate >%0.2f' % snp_missing_thres)
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat, 1) / float(n)
    snps_filter = snp_missing_rates < snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print('Filtered %d SNPs due to high missing rate' % sp.sum(
        sp.logical_not(snps_filter)))

    print('Now imputing (w mean)')
    missing_mat = sp.isnan(snps)
    ok_counts = n - sp.sum(missing_mat, 1)
    # Zero the missing entries so the row sums only cover observed values.
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    for i, snp_mean in enumerate(snp_means):
        snps[i, missing_mat[i]] = snp_mean

    print('And filtering SNPs with MAF<%0.2f' % maf_thres)
    snp_means = sp.mean(snps, 1)
    snp_mafs = sp.minimum(snp_means, 1 - snp_means)
    snps_filter = snp_mafs > maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print('Filtered %d SNPs with low MAFs' % sp.sum(sp.logical_not(snps_filter)))

    print('Filtering based on kinship w threshold: %s' % (k_thres,))
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print('\nKinship calculated')
    # An individual is dropped if it is too related to any later individual.
    K_ind_filter = [not sp.any(K[i, i + 1:n] > k_thres) for i in range(n)]
    if sum(K_ind_filter) == n:
        print('No individuals were filtered based on kinship..')
    else:
        print('Filtering %d individuals based on kinship.' % (
            n - sum(K_ind_filter)))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids, Y_means, Y_medians, rep_count, snps = _filter_cegs_individuals(
            K_ind_filter, gt_ids, Y_means, Y_medians, rep_count, snps)

        # Removing individuals changes the allele frequencies.
        print('Again filtering SNPs with MAF<%0.2f' % maf_thres)
        snp_means = sp.mean(snps, 1)
        snp_mafs = sp.minimum(snp_means, 1 - snp_means)
        snps_filter = snp_mafs > maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print('Filtered %d additional SNPs with low MAFs' % sp.sum(
            sp.logical_not(snps_filter)))

    print('All filtering done.')
    m, n = snps.shape
    print('In all there are %d SNPs remaining, for %d individuals.' % (m, n))
    ret_dict = {
        'Y_means': Y_means,
        'Y_medians': Y_medians,
        'rep_count': rep_count,
        'gt_ids': gt_ids,
        'positions': positions,
        'snps': snps
    }
    return ret_dict
def obj(z):
    """
    Objective value for z: the negated trapezoidal-rule area under the
    points that z2xy(z) returns.
    """
    xs, ys = z2xy(z)
    return sp.negative(sp.integrate.trapz(ys, x=xs))
def gen_unrelated_eur_1k_data(
        input_file='/home/bjarni/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
        out_file='/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
        maf_thres=0.01, max_relatedness=0.05, K_thinning_frac=0.1,
        debug=False):
    """
    Store the genotypes of a set of mutually unrelated European individuals.

    Pass 1 computes a kinship matrix between the individuals whose continent
    is 'EUR', from a random fraction (K_thinning_frac) of the bi-allelic SNPs
    of chromosomes 1-22 with valid nucleotides and a standard deviation above
    sqrt(2 * maf_thres * (1 - maf_thres)).  Individuals are then visited in
    order, and every later individual whose kinship with a kept individual
    exceeds max_relatedness is removed.

    Pass 2 writes, per chromosome, the bi-allelic, valid-nucleotide,
    polymorphic SNPs of the retained individuals to out_file (which is
    overwritten), together with SNP means, standard deviations, IDs,
    positions and nucleotides.

    debug: if True, a kinship matrix is recomputed per chromosome from the
        retained individuals and an Exception is raised when its average
        diagonal is not close to 1 or an off-diagonal value reaches
        1.5 * max_relatedness.
    """
    with h5py.File(input_file, 'r') as h5f:
        num_indivs = len(h5f['indivs']['continent'])
        eur_filter = h5f['indivs']['continent'][...] == 'EUR'
        num_eur_indivs = sp.sum(eur_filter)
        print('Number of European individuals: %d' % num_eur_indivs)

        K = sp.zeros((num_eur_indivs, num_eur_indivs), dtype='float64')
        num_snps = 0
        std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))

        print('Calculating kinship')
        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_g = h5f['chr%d' % chrom]

            print('Loading SNPs and data')
            snps = sp.array(chrom_g['calldata']['snps'][...], dtype='int8')
            print('Loading NTs')
            ref_nts = chrom_g['variants']['REF'][...]
            alt_nts = chrom_g['variants']['ALT'][...]

            print('Filtering multi-allelic SNPs')
            # logical_not rather than negative: the latter is rejected for
            # boolean arrays by recent numpy versions.
            multi_allelic_filter = sp.logical_not(
                chrom_g['variants']['MULTI_ALLELIC'][...])
            snps = snps[multi_allelic_filter]
            ref_nts = ref_nts[multi_allelic_filter]
            alt_nts = alt_nts[multi_allelic_filter]

            if K_thinning_frac < 1:
                print('Thinning SNPs for kinship calculation')
                thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                snps = snps[thinning_filter]
                alt_nts = alt_nts[thinning_filter]
                ref_nts = ref_nts[thinning_filter]

            print('Filter SNPs with missing NT information')
            nt_filter = sp.in1d(ref_nts, ok_nts)
            nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
            if sp.sum(nt_filter) < len(nt_filter):
                snps = snps[nt_filter]

            print('Filtering non-European individuals')
            snps = snps[:, eur_filter]

            print('Filtering SNPs with MAF < %s' % (maf_thres,))
            snp_stds = sp.std(snps, 1)
            maf_filter = snp_stds.flatten() > std_thres
            snps = snps[maf_filter]
            snp_stds = snp_stds[maf_filter]

            print('%d SNPs remaining after all filtering steps.' % len(snps))

            print('Normalizing SNPs')
            snp_means = sp.mean(snps, 1)
            norm_snps = (snps - snp_means[sp.newaxis].T) / snp_stds[sp.newaxis].T

            print('Updating kinship')
            K += sp.dot(norm_snps.T, norm_snps)
            num_snps += len(norm_snps)
            # Standardized SNPs contribute exactly 1 per individual on average.
            assert sp.isclose(
                sp.sum(sp.diag(K)) / (num_snps * num_eur_indivs), 1.0)

        K = K / float(num_snps)
        print('Kinship calculation done using %d SNPs\n' % num_snps)

        # Filter individuals
        print('Filtering individuals')
        keep_indiv_set = set(range(num_eur_indivs))
        for i in range(num_eur_indivs):
            if i not in keep_indiv_set:
                continue
            for j in range(i + 1, num_eur_indivs):
                if K[i, j] > max_relatedness:
                    keep_indiv_set.discard(j)
        keep_indivs = sorted(keep_indiv_set)
        print('Retained %d individuals\n' % len(keep_indivs))

        # Checking that everything is ok!
        K_ok = K[keep_indivs]
        K_ok = K_ok[:, keep_indivs]
        assert (K_ok - sp.tril(K_ok)).max() < max_relatedness

        # Map the kept European indices back to indices over all individuals.
        indiv_filter = sp.zeros(num_indivs, dtype='bool')
        indiv_filter[(sp.arange(num_indivs)[eur_filter])[keep_indivs]] = 1
        assert sp.sum(indiv_filter) == len(keep_indivs)

        # Store in new file
        print('Now storing data.')
        with h5py.File(out_file, 'w') as oh5f:
            indiv_ids = h5f['indivs']['indiv_ids'][indiv_filter]
            oh5f.create_dataset('indiv_ids', data=indiv_ids)
            for chrom in range(1, 23):
                print('Working on Chromosome %d' % chrom)
                chrom_str = 'chr%d' % chrom
                chrom_g = h5f[chrom_str]

                print('Loading SNPs and data')
                snps = sp.array(chrom_g['calldata']['snps'][...], dtype='int8')
                snp_ids = chrom_g['variants']['ID'][...]
                positions = chrom_g['variants']['POS'][...]
                print('Loading NTs')
                ref_nts = chrom_g['variants']['REF'][...]
                alt_nts = chrom_g['variants']['ALT'][...]

                print('Filtering multi-allelic SNPs')
                multi_allelic_filter = sp.logical_not(
                    chrom_g['variants']['MULTI_ALLELIC'][...])
                snps = snps[multi_allelic_filter]
                ref_nts = ref_nts[multi_allelic_filter]
                alt_nts = alt_nts[multi_allelic_filter]
                positions = positions[multi_allelic_filter]
                snp_ids = snp_ids[multi_allelic_filter]

                print('Filter individuals')
                snps = snps[:, indiv_filter]

                print('Filter SNPs with missing NT information')
                nt_filter = sp.in1d(ref_nts, ok_nts)
                nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
                if sp.sum(nt_filter) < len(nt_filter):
                    snps = snps[nt_filter]
                    ref_nts = ref_nts[nt_filter]
                    alt_nts = alt_nts[nt_filter]
                    positions = positions[nt_filter]
                    snp_ids = snp_ids[nt_filter]

                print('filter monomorphic SNPs')
                snp_stds = sp.std(snps, 1)
                mono_morph_filter = snp_stds > 0
                snps = snps[mono_morph_filter]
                ref_nts = ref_nts[mono_morph_filter]
                alt_nts = alt_nts[mono_morph_filter]
                positions = positions[mono_morph_filter]
                snp_ids = snp_ids[mono_morph_filter]
                snp_stds = snp_stds[mono_morph_filter]
                snp_means = sp.mean(snps, 1)

                if debug:
                    # Without thinning, all SNPs of the chromosome are used.
                    k_snps = snps
                    k_snp_stds = snp_stds
                    if K_thinning_frac < 1:
                        print('Thinning SNPs for kinship calculation')
                        thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                        k_snps = k_snps[thinning_filter]
                        k_snp_stds = k_snp_stds[thinning_filter]

                    print('Filtering SNPs with MAF < %s' % (maf_thres,))
                    maf_filter = k_snp_stds.flatten() > std_thres
                    k_snps = k_snps[maf_filter]
                    k_snp_stds = k_snp_stds[maf_filter]
                    # Per-SNP means (axis 1), matching the per-SNP stds.
                    k_snp_means = sp.mean(k_snps, 1)

                    print('Verifying that the Kinship makes sense')
                    norm_snps = (k_snps - k_snp_means[sp.newaxis].T) / k_snp_stds[sp.newaxis].T
                    # Normalize by this chromosome's SNP count; the matrix
                    # only covers the retained individuals.
                    K_check = sp.dot(norm_snps.T, norm_snps) / float(len(norm_snps))
                    diag_ok = sp.isclose(
                        sp.sum(sp.diag(K_check)) / len(keep_indivs), 1.0)
                    if diag_ok and (K_check - sp.tril(K_check)).max() < (max_relatedness * 1.5):
                        print('It looks OK!')
                    else:
                        raise Exception('Kinship looks wrong?')

                # One (ref, alt) row per SNP.
                nts = sp.column_stack((ref_nts, alt_nts))

                print('Writing to disk')
                cg = oh5f.create_group(chrom_str)
                cg.create_dataset('snps', data=snps)
                cg.create_dataset('snp_means', data=snp_means[sp.newaxis].T)
                cg.create_dataset('snp_stds', data=snp_stds[sp.newaxis].T)
                cg.create_dataset('snp_ids', data=snp_ids)
                cg.create_dataset('positions', data=positions)
                cg.create_dataset('nts', data=nts)
                oh5f.flush()
                print('Done writing to disk')

    print('Done')
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """
    Cross-validated gBLUP prediction of the CEGS phenotypes.

    For each phenotype and each environment the individuals are randomly
    split into validation chunks of n // num_cvs + 1 individuals.  For every
    chunk a linear mixed model with the kinship as random effect is fitted on
    the remaining individuals, and the validation phenotypes are predicted
    from the cross kinship.

    The squared correlation, correlation, observed and predicted values and
    the average pseudo-heritability are stored in an HDF5 file, under
    '<phenotype>/<env>'.  The file is opened in append mode, so writing into
    a file that already holds these groups fails.

    num_cvs: determines the validation chunk size (see above).
    genotype_file: currently not used by this function.
    k_thres: kinship threshold handed to the genotype/phenotype coordination.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print('%s %s' % (phenotype, env))

            # Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env, k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            assert not sp.any(sp.isnan(snps)), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print('\nKinship calculated')
            assert not sp.any(sp.isnan(K)), 'WTF?'
            n = len(Y_means)

            # Create the CV sets of prediction and validation data
            cv_chunk_size = n // num_cvs + 1
            ordering = sp.random.permutation(n)
            a = sp.arange(n)
            osb_ys = []
            pred_ys = []
            p_herits = []
            for i in range(0, n, cv_chunk_size):
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                # logical_not rather than negative: the latter is rejected
                # for boolean arrays by recent numpy versions.
                training_filter = sp.logical_not(validation_filter)
                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                # Calc. kinship
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]

                # Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                # Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                osb_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)

            corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
            print('Correlation: %s' % (corr,))
            r2 = corr ** 2
            print('R2: %s' % (r2,))
            mean_herit = sp.mean(p_herits)
            print('Avg. heritability: %s' % (mean_herit,))
            env_dict[env] = {
                'R2': r2,
                'obs_y': osb_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit
            }
        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Explicit append mode: newer h5py versions open read-only by default.
    with h5py.File(res_hdf5_file, 'a') as h5f:
        for phenotype in phenotypes:
            phen_g = h5f.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2', data=[d['R2']])
                env_g.create_dataset('corr', data=[d['corr']])
                env_g.create_dataset('obs_y', data=d['obs_y'])
                env_g.create_dataset('pred_y', data=d['pred_y'])
                env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def gen_sfs_plots(
        snps_hdf5_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/called_snps.hdf5',
        fig_dir='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/',
        filter_pop=None):
    """
    Collect the minor allele frequencies (MAFs) of the codon SNPs of all
    gene groups in snps_hdf5_file, for all, synonymous and non-synonymous
    SNPs.  Groups whose 'codon_snps' dataset has at most one element are
    skipped.

    filter_pop: if given, only strains whose 'genospecies' entry in the
        population map (parse_pop_map) equals filter_pop are used, and the
        three MAF lists are saved as CSV files whose names start with
        argv[1].  If None, all strains are used, the MAFs are saved to
        'total_2.csv' and three histograms are saved in fig_dir.
    """
    pop = parse_pop_map()

    syn_mafs = []
    nonsyn_mafs = []
    all_mafs = []
    with h5py.File(snps_hdf5_file, 'r') as h5f:
        gene_groups = sorted(h5f.keys())
        for i, gg in enumerate(gene_groups):
            if i % 100 == 0:
                print('%d: Gene %s' % (i, gg))
            g = h5f[gg]
            if g['codon_snps'].size <= 1:
                continue

            # Rows are SNPs, columns are individuals.
            codon_snps = g['codon_snps'][...]
            if filter_pop is not None:
                # Reduce the columns to the strains of the chosen genospecies.
                indiv_filter = sp.array(
                    [pop[s]['genospecies'] == filter_pop for s in g['strains']],
                    dtype='bool')
                codon_snps = codon_snps[:, indiv_filter]

            freqs = sp.mean(codon_snps, 1)
            mafs = sp.minimum(freqs, 1 - freqs)
            is_synonimous_snp = g['is_synonimous_snp'][...]
            syn_mafs.extend(mafs[is_synonimous_snp])
            # logical_not rather than negative: the latter is rejected for
            # boolean arrays by recent numpy versions.
            nonsyn_mafs.extend(mafs[sp.logical_not(is_synonimous_snp)])
            all_mafs.extend(mafs)

    if filter_pop is not None:
        # NOTE(review): the output names come from the first command line
        # argument, not from filter_pop -- confirm this is intended.
        output_file = "%s.csv" % (str(argv[1]))
        np.savetxt(output_file, all_mafs, delimiter=',')
        output_file = "%ssyn_mafs.csv" % (str(argv[1]))
        np.savetxt(output_file, syn_mafs, delimiter=',')
        output_file = "%snon_syn_mafs.csv" % (str(argv[1]))
        np.savetxt(output_file, nonsyn_mafs, delimiter=',')
    else:
        output_file = "total_2.csv"
        np.savetxt(output_file, all_mafs, delimiter=',')

        pylab.clf()
        pylab.hist(all_mafs, bins=50)
        pylab.title('SFS (all binary codon SNPs)')
        pylab.savefig(fig_dir + '/sfs_all.png')

        pylab.clf()
        pylab.hist(nonsyn_mafs, bins=50)
        pylab.title('SFS (non-synonimous SNPs)')
        pylab.savefig(fig_dir + '/sfs_non_syn.png')

        pylab.clf()
        pylab.hist(syn_mafs, bins=50)
        pylab.title('SFS (synonimous SNPs)')
        pylab.savefig(fig_dir + '/sfs_syn.png')
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf=0.01):
    """
    Coordinate PLINK genotypes with GWAS summary statistics, one chromosome
    at a time, and store the coordinated data in the HDF5 file.

    Assumes plink BED files.  Imputes missing genotypes (presumably inside
    _parse_plink_snps_; that helper is not visible here).

    genotype_file: PLINK file prefix passed to plinkfile.PlinkFile.
    hdf5_file: open, writable HDF5 file that already holds a 'sum_stats'
        group with one 'chrom_<n>' group per chromosome (datasets read:
        'sids', 'nts', 'betas', 'log_odds', 'ps' and optionally 'freqs').
        Datasets 'fids', 'iids' (and 'y' when phenotypes are present) and a
        'cord_data' group are created in it.
    genetic_map_dir: optional path prefix holding
        'chr<n>.interpolated_genetic_map.gz' files.
    check_mafs: when True and the summary statistics carry 'freqs', SNPs
        whose frequency disagrees with the genotypes by more than 0.15 are
        dropped.
    min_maf: SNPs with genotype frequency outside (min_maf, 1 - min_maf)
        are dropped.

    Nothing is returned; the accumulated risk scores and correlation lists
    are not used further within this function.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            # Best effort: a chromosome missing from the summary statistics
            # is reported and skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs, so that the shared SNPs line up pairwise in the
        # two index lists.
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)
        g_indices = [g_i for g_i in g_order if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in ss_order if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        # Looked up once instead of once per SNP inside the loop.
        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in zip(g_indices, ss_indices):

            # Is the nucleotide ambiguous?
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            ss_nt = ss_nts[ss_i]
            # Are the nucleotides the same?
            flip_nts = False
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Opposite strand nucleotides
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if flip_nts:
                    # Alleles are swapped: flip the sign of the effects and
                    # mirror the frequency.
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if has_ss_freqs:
                        ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print('raw_snps.shape= %s' % (raw_snps.shape,))

        # Mean and standard deviation derived from the frequencies rather
        # than measured on raw_snps.
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # Filter freq_discrepancy_snps.  logical_not instead of
                # negative(): NumPy no longer allows arithmetic negation of
                # boolean arrays.
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            print('%d SNPs with MAF < %0.3f were filtered' % (
                n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        # Polygenic score from the raw genotypes and the log odds.
        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print('Normalizing SNPs')
            # Column vectors so the subtraction/division broadcasts per SNP.
            snp_means.shape = (len(raw_snps), 1)
            snp_stds.shape = (len(raw_snps), 1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape == raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print('PRS correlation for chromosome %d was %0.4f' % (chrom, corr))
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print('Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (
                chrom, rb_corr))

        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = []
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        # NOTE(review): this stores the first column (the
                        # value matched against the SNP IDs), not a map
                        # position -- confirm that is intended.
                        genetic_map.append(l[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
def grasp_callback(my_grasp):
    """
    Turn the first grasp marker into a stamped pose, transform it from the
    camera frame to "/map" and publish it on my_grasp_pub.

    my_grasp: message with a `markers` list.  Only markers[0] is used: its
        points[0] is the pose position and the vector from points[1] to
        points[0] defines the orientation.

    The orientation is the shortest-arc quaternion rotating the x axis
    onto that vector.  Messages without a usable marker are logged and
    ignored.
    """
    if not my_grasp.markers or len(my_grasp.markers[0].points) < 2:
        rospy.logwarn("grasp_callback: marker with two points expected; ignoring message")
        return

    marker = my_grasp.markers[0]
    p_head = marker.points[0]
    p_tail = marker.points[1]

    ## Convert to quaternion
    u = [1, 0, 0]
    direction = asarray([p_head.x - p_tail.x,
                         p_head.y - p_tail.y,
                         p_head.z - p_tail.z], dtype=float)
    length = linalg.norm(direction)
    if length == 0:
        # Coincident points carry no direction; dividing would yield NaNs.
        rospy.logwarn("grasp_callback: zero-length grasp direction; ignoring message")
        return
    v = direction / length

    if array_equal(u, v):
        # Already aligned with the x axis: identity rotation.
        q_w, q_x, q_y, q_z = 1, 0, 0, 0
    elif array_equal(u, negative(v)):
        # Exactly opposite: half a turn about the z axis.
        q_w, q_x, q_y, q_z = 0, 0, 0, 1
    else:
        # Half-vector construction: w = u . (u + v), xyz = u x (u + v).
        half = [u[0] + v[0], u[1] + v[1], u[2] + v[2]]
        q_w = dot(u, half)
        q_x, q_y, q_z = cross(u, half)

    norm = math.sqrt(q_x * q_x + q_y * q_y + q_z * q_z + q_w * q_w)
    if norm == 0:
        norm = 1

    my_pose = geometry_msgs.msg.PoseStamped()
    my_pose.header.stamp = marker.header.stamp
    my_pose.header.frame_id = "/xtion_rgb_optical_frame"
    my_pose.pose.position.x = p_head.x
    my_pose.pose.position.y = p_head.y
    my_pose.pose.position.z = p_head.z
    my_pose.pose.orientation.x = float(q_x / norm)
    my_pose.pose.orientation.y = float(q_y / norm)
    my_pose.pose.orientation.z = float(q_z / norm)
    my_pose.pose.orientation.w = float(q_w / norm)

    # transformPose looks the transform up at the pose's own stamp, so that
    # is the time to wait for.
    listener.waitForTransform("/map", "/xtion_rgb_optical_frame",
                              my_pose.header.stamp, rospy.Duration(1.0))
    pose_target_trans = listener.transformPose("/map", my_pose)
    my_grasp_pub.publish(pose_target_trans)
def gen_sfs_plots(snps_hdf5_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/called_snps.hdf5',
                  fig_dir='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/',
                  filter_pop=None):
    """
    Collect minor allele frequencies (MAFs) of codon SNPs and write the
    site frequency spectrum (SFS) data, optionally for one genospecies only.

    snps_hdf5_file: HDF5 file with one group per gene.  Each group is read
        for 'codon_snps' (indexed here as SNPs x strains), 'strains' and
        'is_synonimous_snp'.
    fig_dir: directory for the histogram PNGs (only used when filter_pop
        is None).
    filter_pop: genospecies label.  When given, only strains whose
        parse_pop_map() entry has this 'genospecies' are used, and the MAFs
        are written to CSV files prefixed with argv[1] instead of plotted.

    Returns nothing; the results are the files written.
    """
    # Strain -> metadata mapping (the rhizobium population map).
    pop = parse_pop_map()

    syn_mafs = []
    nonsyn_mafs = []
    all_mafs = []

    # Only reading: open read-only and make sure the file gets closed.
    h5f = h5py.File(snps_hdf5_file, 'r')
    try:
        for i, gg in enumerate(sorted(h5f.keys())):
            if i % 100 == 0:
                print('%d: Gene %s' % (i, gg))
            g = h5f[gg]
            if g['codon_snps'].size <= 1:
                continue

            codon_snps = g['codon_snps'][...]
            if filter_pop is not None:
                strains = g['strains']
                indiv_filter = sp.zeros(len(strains), dtype=bool)
                for s_i, s in enumerate(strains):
                    if pop[s]['genospecies'] == filter_pop:
                        indiv_filter[s_i] = True
                if not indiv_filter.any():
                    # No strain of this genospecies carries the gene; the
                    # mean over zero individuals would only add NaNs.
                    continue
                # Reduce the columns (individuals) to the genospecies.
                codon_snps = codon_snps[:, indiv_filter]

            # After the transpose rows are individuals, so the mean over
            # axis 0 is the per-SNP allele frequency.
            freqs = sp.mean(sp.transpose(codon_snps), 0)
            mafs = sp.minimum(freqs, 1 - freqs)
            is_synonimous_snp = g['is_synonimous_snp'][...]
            syn_mafs.extend(mafs[is_synonimous_snp])
            # logical_not instead of negative(): NumPy no longer allows
            # arithmetic negation of boolean arrays.
            nonsyn_mafs.extend(mafs[sp.logical_not(is_synonimous_snp)])
            all_mafs.extend(mafs)
    finally:
        h5f.close()

    if filter_pop is not None:
        # The output prefix comes from the first command line argument.
        prefix = str(argv[1])
        np.savetxt("%s.csv" % prefix, all_mafs, delimiter=',')
        np.savetxt("%ssyn_mafs.csv" % prefix, syn_mafs, delimiter=',')
        np.savetxt("%snon_syn_mafs.csv" % prefix, nonsyn_mafs, delimiter=',')
    else:
        np.savetxt("total_2.csv", all_mafs, delimiter=',')
        histograms = [
            (all_mafs, 'SFS (all binary codon SNPs)', '/sfs_all.png'),
            (nonsyn_mafs, 'SFS (non-synonimous SNPs)', '/sfs_non_syn.png'),
            (syn_mafs, 'SFS (synonimous SNPs)', '/sfs_syn.png'),
        ]
        for values, title, fig_name in histograms:
            pylab.clf()
            pylab.hist(values, bins=50)
            pylab.title(title)
            pylab.savefig(fig_dir + fig_name)
def get_kinships(snps_file='/project/NChain/faststorage/rhizobium/ld/new_snps.hdf5',
                 plot_figures=False,
                 figure_dir='/project/NChain/faststorage/rhizobium/ld/figures',
                 fig_id='all',
                 min_maf=0.1,
                 max_strain_num=200):
    """
    Calculates kinship matrices between strains from per-gene SNP data.

    Four matrices are accumulated over all gene groups: one from all SNPs
    ('norm_snps') and three from codon SNPs ('norm_codon_snps': all,
    synonymous, non-synonymous).  Each is the sum of snps.T * snps over
    the genes, divided element-wise by the number of SNPs that contributed
    to each strain pair.

    snps_file: HDF5 file with one group per gene, each holding 'strains',
        'norm_snps', 'freqs', 'norm_codon_snps', 'codon_snp_freqs' and
        'is_synonimous_snp'.
    plot_figures: when True, plot_dirty_PCA is called for each matrix.
    figure_dir, fig_id: forwarded to / embedded in the figure file names.
    min_maf: SNPs with minor allele frequency <= min_maf are dropped.
    max_strain_num: gene groups with this many strains or more are skipped.

    Returns a dict with the four kinship matrices, the four count
    matrices and the ordered strain list ('strains').  Strain pairs that
    never co-occur have a zero count, so their kinship entry is 0/0 (NaN).
    """

    def _accumulate(k_mat, counts_mat, strain_mask, snp_block):
        # Adds one gene's contribution to the sub-matrix of its strains.
        # Fancy indexing returns a copy, so the selected rows are updated
        # and then written back.
        k_rows = k_mat[strain_mask]
        k_rows[:, strain_mask] += sp.dot(snp_block.T, snp_block)
        k_mat[strain_mask] = k_rows
        count_rows = counts_mat[strain_mask]
        count_rows[:, strain_mask] += len(snp_block)
        counts_mat[strain_mask] = count_rows

    # Only reading: open read-only and make sure the file gets closed.
    h5f = h5py.File(snps_file, 'r')
    try:
        gene_groups = list(h5f.keys())

        # First pass: collect every strain seen in an admissible gene group.
        all_strains = set()
        for gg in gene_groups:
            strains = h5f[gg]['strains'][...]
            if len(strains) < max_strain_num:
                all_strains.update(strains)
        num_strains = len(all_strains)
        print('Found %d "distinct" strains' % num_strains)

        ordered_strains = sorted(all_strains)
        strain_index = pd.Index(ordered_strains)
        K_snps = sp.zeros((num_strains, num_strains))
        counts_mat_snps = sp.zeros((num_strains, num_strains))
        K_codon_snps = sp.zeros((num_strains, num_strains))
        counts_mat_codon_snps = sp.zeros((num_strains, num_strains))
        K_nonsyn_snps = sp.zeros((num_strains, num_strains))
        counts_mat_nonsyn_snps = sp.zeros((num_strains, num_strains))
        K_syn_snps = sp.zeros((num_strains, num_strains))
        counts_mat_syn_snps = sp.zeros((num_strains, num_strains))

        # Second pass: accumulate the cross products gene by gene.
        for i, gg in enumerate(gene_groups):
            if i % 100 == 0:
                print('Working on gene nr. %d' % i)
            data_g = h5f[gg]
            strains = data_g['strains'][...]
            if len(strains) >= max_strain_num:
                continue
            # Positions of this gene's strains in the global matrices.
            strain_mask = strain_index.get_indexer(strains)

            snps = data_g['norm_snps'][...]
            freqs = data_g['freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            snps = snps[mafs > min_maf]
            if len(snps) == 0:
                # NOTE: as in the original, a gene without common SNPs is
                # skipped entirely, codon SNPs included.
                continue
            _accumulate(K_snps, counts_mat_snps, strain_mask, snps)

            codon_snps = data_g['norm_codon_snps'][...]
            if len(codon_snps) == 0:
                continue
            freqs = data_g['codon_snp_freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            maf_mask = mafs > min_maf
            codon_snps = codon_snps[maf_mask]
            is_synonimous_snp = data_g['is_synonimous_snp'][...]
            is_synonimous_snp = is_synonimous_snp[maf_mask]
            if len(codon_snps) == 0:
                continue

            _accumulate(K_codon_snps, counts_mat_codon_snps, strain_mask,
                        codon_snps)

            if sp.sum(is_synonimous_snp) > 0:
                _accumulate(K_syn_snps, counts_mat_syn_snps, strain_mask,
                            codon_snps[is_synonimous_snp])

            # logical_not instead of negative(): NumPy no longer allows
            # arithmetic negation of boolean arrays.
            is_nonsynonimous_snp = sp.logical_not(is_synonimous_snp)
            if sp.sum(is_nonsynonimous_snp) > 0:
                _accumulate(K_nonsyn_snps, counts_mat_nonsyn_snps,
                            strain_mask, codon_snps[is_nonsynonimous_snp])
    finally:
        h5f.close()

    # Element-wise divisions.
    K_snps = K_snps / counts_mat_snps
    K_codon_snps = K_codon_snps / counts_mat_codon_snps
    K_syn_snps = K_syn_snps / counts_mat_syn_snps
    K_nonsyn_snps = K_nonsyn_snps / counts_mat_nonsyn_snps

    if plot_figures:
        plot_dirty_PCA(K_snps, figure_fn='PCA_all_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_all_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='All SNPs')
        plot_dirty_PCA(K_codon_snps, figure_fn='PCA_codon_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_codon_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Codon SNPs')
        plot_dirty_PCA(K_syn_snps, figure_fn='PCA_syn_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_syn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Synonymous SNPs')
        plot_dirty_PCA(K_nonsyn_snps, figure_fn='PCA_nonsyn_snps_%s.pdf' % fig_id,
                       k_figure_fn='K_nonsyn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains,
                       title='Non-Synonymous SNPs')

    # Each line reports its own count matrix (previously all four lines
    # printed counts_mat_snps, three of them labelled "codon SNPs").
    print('Average number of SNPs: %0.2f.' % sp.mean(counts_mat_snps))
    print('Average number of codon SNPs: %0.2f.' % sp.mean(counts_mat_codon_snps))
    print('Average number of synonymous SNPs: %0.2f.' % sp.mean(counts_mat_syn_snps))
    print('Average number of non-synonymous SNPs: %0.2f.' % sp.mean(counts_mat_nonsyn_snps))

    return {'K_snps': K_snps,
            'K_codon_snps': K_codon_snps,
            'counts_mat_snps': counts_mat_snps,
            'counts_mat_codon_snps': counts_mat_codon_snps,
            'K_syn_snps': K_syn_snps,
            'K_nonsyn_snps': K_nonsyn_snps,
            'counts_mat_syn_snps': counts_mat_syn_snps,
            'counts_mat_nonsyn_snps': counts_mat_nonsyn_snps,
            'strains': ordered_strains}
def coordinate_cegs_genotype_phenotype(phen_dict, phenotype='Protein', env='mated', k_thres=0.8,
                                       ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1,
                                       genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.

    Filtering steps, in order: individuals selected by the phenotype's
    'ind_filter'; individuals with a missing rate >= ind_missing_thres; a
    fixed list of "bad" genotypes; SNPs with a missing rate >=
    snp_missing_thres; mean imputation of the remaining missing values;
    SNPs with MAF <= maf_thres; individuals with a kinship above k_thres to
    a later individual (followed by a second MAF filter).

    phen_dict: phen_dict[phenotype][env] must hold 'ind_filter', 'Y_means',
        'Y_medians' and 'rep_count', all indexed per individual.
    genotype_file: HDF5 file with datasets 'gt' (indexed here as SNPs x
        individuals, NaN = missing), 'pos' and 'gt_ids'.

    Returns a dict with the coordinated 'Y_means', 'Y_medians',
    'rep_count', 'gt_ids', 'positions' and 'snps'.
    """
    p_dict = phen_dict[phenotype][env]

    print('Loading SNPs')
    # Everything needed is read into memory up front so that the file can
    # be closed right away.
    gh5f = h5py.File(genotype_file, 'r')
    try:
        snps = sp.array(gh5f['gt'][...], dtype='single')
        positions = gh5f['pos'][...]
        gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]
    finally:
        gh5f.close()
    snps = snps[:, p_dict['ind_filter']]
    m, n = snps.shape
    print('Loaded %d SNPs for %d individuals' % (m, n))

    print('Filtering individuals with missing rates >%0.2f' % ind_missing_thres)
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat, 0) / float(m)
    ind_filter = ind_missing_rates < ind_missing_thres
    snps = snps[:, ind_filter]
    n = sp.sum(ind_filter)
    # logical_not instead of negative() throughout: NumPy no longer allows
    # arithmetic negation of boolean arrays.
    print('Filtered %d individuals due to high missing rates' % sp.sum(sp.logical_not(ind_filter)))

    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count = p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]

    print('Now removing "bad" genotypes.')
    bad_genotypes = ['Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591', 'Raleigh_398',
                     'Raleigh_138', 'Raleigh_208', 'Raleigh_336', 'Raleigh_370', 'Raleigh_373',
                     'Raleigh_374', 'Raleigh_799', 'Raleigh_821', 'Raleigh_822', 'Raleigh_884',
                     'Raleigh_335']
    ind_filter = sp.logical_not(sp.in1d(gt_ids, bad_genotypes))
    gt_ids = gt_ids[ind_filter]
    Y_means = Y_means[ind_filter]
    Y_medians = Y_medians[ind_filter]
    rep_count = rep_count[ind_filter]
    snps = snps[:, ind_filter]
    print('Removed %d "bad" genotypes' % sp.sum(sp.logical_not(ind_filter)))

    n = len(snps[0])
    print('Filtering SNPs with missing rate >%0.2f' % snp_missing_thres)
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat, 1) / float(n)
    snps_filter = snp_missing_rates < snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    m = sp.sum(snps_filter)
    print('Filtered %d SNPs due to high missing rate' % sp.sum(sp.logical_not(snps_filter)))

    print('Now imputing (w mean)')
    missing_mat = sp.isnan(snps)
    ok_counts = n - sp.sum(missing_mat, 1)
    # Zero the missing entries so they do not poison the row sums.
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    # Fill every missing entry with the mean of its SNP (row).  Boolean
    # mask assignment and nonzero() both run in row-major order, so the
    # values line up.
    missing_rows = missing_mat.nonzero()[0]
    snps[missing_mat] = snp_means[missing_rows]

    print('And filtering SNPs with MAF<%0.2f' % maf_thres)
    snp_means = sp.mean(snps, 1)
    snp_mafs = sp.minimum(snp_means, 1 - snp_means)
    snps_filter = snp_mafs > maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print('Filtered %d SNPs with low MAFs' % sp.sum(sp.logical_not(snps_filter)))

    print('Filtering based on kinship w threshold: %s' % (k_thres,))
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print('\nKinship calculated')
    # An individual is dropped when it is too closely related to any
    # individual that comes after it, so one of each related pair is kept.
    K_ind_filter = [not sp.any(K[i, i + 1:n] > k_thres) for i in range(n)]
    if sum(K_ind_filter) == n:
        print('No individuals were filtered based on kinship..')
    else:
        print('Filtering %d individuals based on kinship.' % (n - sum(K_ind_filter)))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means = Y_means[K_ind_filter]
        Y_medians = Y_medians[K_ind_filter]
        rep_count = rep_count[K_ind_filter]
        snps = snps[:, K_ind_filter]

        # Removing individuals shifts the allele frequencies.
        print('Again filtering SNPs with MAF<%0.2f' % maf_thres)
        snp_means = sp.mean(snps, 1)
        snp_mafs = sp.minimum(snp_means, 1 - snp_means)
        snps_filter = snp_mafs > maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print('Filtered %d additional SNPs with low MAFs' % sp.sum(sp.logical_not(snps_filter)))

    print('All filtering done.')
    m, n = snps.shape
    print('In all there are %d SNPs remaining, for %d individuals.' % (m, n))
    ret_dict = {'Y_means': Y_means, 'Y_medians': Y_medians, 'rep_count': rep_count,
                'gt_ids': gt_ids, 'positions': positions, 'snps': snps}
    return ret_dict
def parse_cegs_drosophila_phenotypes(phenotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/allphenotypes_5.0_cleaned.tab.reps.hdf5',
                                     fig_dir='/Users/bjarnivilhjalmsson/data/tmp'):
    """
    Parser for CEGS Drosophila phenotype data

    For every phenotype in the file, and for both the 'mated' and the
    'virgin' environment, the replicate values are reduced to one mean and
    one median per individual, histograms of both are saved, and the
    mated/virgin correlations are computed.

    phenotype_file: HDF5 file with one group per phenotype holding
        'Y_mated', 'Z_mated', 'Y_virgin' and 'Z_virgin'.  Y is a vector of
        replicate values (NaN = missing); Z is indexed here as replicates x
        individuals and is searched for the entry equal to 1 in each row.
    fig_dir: directory for the histogram PNGs.

    Returns a dict: phen_dict[phen]['mated' | 'virgin'] holds 'Y_means',
    'rep_count', 'ind_filter' and 'Y_medians'; phen_dict[phen]['corrs']
    holds the 'means' and 'medians' correlations.
    """
    import pylab

    def _summarize(phen, env, Y, Z):
        # Reduces the replicates of one phenotype/environment to
        # per-individual statistics and plots their histograms.
        # logical_not instead of negative(): NumPy no longer allows
        # arithmetic negation of boolean arrays.
        sample_filter = sp.logical_not(sp.isnan(Y))
        Y_ok = Y[sample_filter]
        Z_ok = Z[sample_filter]
        Ys_sum = sp.dot(Y_ok, Z_ok)
        rep_count = sp.dot(sp.ones(len(Y_ok)), Z_ok)
        Y_means = Ys_sum / rep_count

        # Now calculate medians by iteration.  Only the non-missing
        # replicates are used, so that medians and means are based on the
        # same samples.  The number of individuals comes from Z instead of
        # a hard-coded 216.
        num_individs = Z.shape[1]
        phen_vals_list = [[] for _ in range(num_individs)]
        for y_val, z_row in zip(Y_ok, Z_ok):
            ind_i = sp.where(1 == z_row)[0][0]
            phen_vals_list[ind_i].append(y_val)
        medians = sp.zeros(num_individs)
        for ind_i, pl in enumerate(phen_vals_list):
            if len(pl) > 0:
                medians[ind_i] = sp.median(pl)
            else:
                medians[ind_i] = sp.nan

        # Individuals without any valid replicate have a 0/0 (NaN) mean.
        ind_filter = sp.logical_not(sp.isnan(Y_means))
        if phen == 'Triglyceride':
            ind_filter = (Y_means > 0) * ind_filter

        print('Plotting phenotype histograms for %s, %s' % (phen, env))
        pylab.hist(Y_means[ind_filter])
        pylab.savefig('%s/cegs_hist_%s_%s_means.png' % (fig_dir, phen, env))
        pylab.clf()
        pylab.hist(medians[ind_filter])
        pylab.savefig('%s/cegs_hist_%s_%s_medians.png' % (fig_dir, phen, env))
        pylab.clf()

        return {'Y_means': Y_means, 'rep_count': rep_count,
                'ind_filter': ind_filter, 'Y_medians': medians}

    # Load phenotypes...
    ph5f = h5py.File(phenotype_file, 'r')
    try:
        # Now take the median and mean of all values for all individuals.
        phen_dict = {}
        for phen in ph5f.keys():
            # First mated
            mated = _summarize(phen, 'mated', ph5f[phen]['Y_mated'][...],
                               ph5f[phen]['Z_mated'][...])
            # Then virgin
            virgin = _summarize(phen, 'virgin', ph5f[phen]['Y_virgin'][...],
                                ph5f[phen]['Z_virgin'][...])
            phen_dict[phen] = {'mated': mated, 'virgin': virgin}

            # Correlate over the individuals that pass both filters, so
            # the two vectors have equal length and pair up the same
            # individuals.
            both = mated['ind_filter'] * virgin['ind_filter']
            means_corr = sp.corrcoef(mated['Y_means'][both],
                                     virgin['Y_means'][both])[0, 1]
            medians_corr = sp.corrcoef(mated['Y_medians'][both],
                                       virgin['Y_medians'][both])[0, 1]
            print('Correlation between mated and virgin flies, means: %0.2f, medians: %0.2f' % (means_corr, medians_corr))
            phen_dict[phen]['corrs'] = {'means': means_corr, 'medians': medians_corr}
    finally:
        ph5f.close()

    return phen_dict
def grasp_callback(my_grasp):
    """
    Turn the first grasp marker into a pose goal in "/world", publish it,
    plan a motion to it, and then exit.

    my_grasp: message with a `markers` list.  Only markers[0] is used: its
        points[1] is the pose position and the vector from points[1] to
        points[0] defines the orientation.

    The orientation is the shortest-arc quaternion rotating the x axis
    onto that vector.  The pose is broadcast as the tf frame
    "/grasping_target" (child of "/camera_rgb_optical_frame"), looked up
    again relative to "/world", published on my_grasp_pub and set as pose
    target of `group` for end effector link "my_eef".  Only planning is
    done; the robot is not moved.  Messages without a usable marker are
    logged and ignored.
    """
    if not my_grasp.markers or len(my_grasp.markers[0].points) < 2:
        rospy.logwarn("grasp_callback: marker with two points expected; ignoring message")
        return

    marker = my_grasp.markers[0]
    p_head = marker.points[0]
    p_tail = marker.points[1]

    ## Planning to a Pose goal
    pose_target = geometry_msgs.msg.PoseStamped()
    # Keep a real time stamp in the header; the seconds are only extracted
    # for printing.
    pose_target.header.stamp = marker.header.stamp
    pose_target.header.frame_id = "/camera_rgb_optical_frame"
    pose_target.pose.position.x = p_tail.x
    pose_target.pose.position.y = p_tail.y
    pose_target.pose.position.z = p_tail.z

    ## Convert to quaternion
    u = [1, 0, 0]
    direction = asarray([p_head.x - p_tail.x,
                         p_head.y - p_tail.y,
                         p_head.z - p_tail.z], dtype=float)
    length = linalg.norm(direction)
    if length == 0:
        # Coincident points carry no direction; dividing would yield NaNs.
        rospy.logwarn("grasp_callback: zero-length grasp direction; ignoring message")
        return
    v = direction / length

    if array_equal(u, v):
        # Already aligned with the x axis: identity rotation.
        q_w, q_x, q_y, q_z = 1, 0, 0, 0
    elif array_equal(u, negative(v)):
        # Exactly opposite: half a turn about the z axis.
        q_w, q_x, q_y, q_z = 0, 0, 0, 1
    else:
        # Half-vector construction: w = u . (u + v), xyz = u x (u + v).
        half = [u[0] + v[0], u[1] + v[1], u[2] + v[2]]
        q_w = dot(u, half)
        q_x, q_y, q_z = cross(u, half)

    norm = math.sqrt(q_x * q_x + q_y * q_y + q_z * q_z + q_w * q_w)
    if norm == 0:
        norm = 1
    pose_target.pose.orientation.x = float(q_x / norm)
    pose_target.pose.orientation.y = float(q_y / norm)
    pose_target.pose.orientation.z = float(q_z / norm)
    pose_target.pose.orientation.w = float(q_w / norm)

    print("Timestamp: %d." % pose_target.header.stamp.secs)
    print("Position X: %f." % pose_target.pose.position.x)
    print("Position Y: %f." % pose_target.pose.position.y)
    print("Position Z: %f." % pose_target.pose.position.z)
    print("Orientation X: %f." % pose_target.pose.orientation.x)
    print("Orientation Y: %f." % pose_target.pose.orientation.y)
    print("Orientation Z: %f." % pose_target.pose.orientation.z)
    print("Orientation W: %f." % pose_target.pose.orientation.w)

    ## Broadcast transform
    br = tf.TransformBroadcaster()
    br.sendTransform(
        (pose_target.pose.position.x, pose_target.pose.position.y,
         pose_target.pose.position.z),
        (pose_target.pose.orientation.x, pose_target.pose.orientation.y,
         pose_target.pose.orientation.z, pose_target.pose.orientation.w),
        marker.header.stamp, "/grasping_target", "/camera_rgb_optical_frame")

    ## Listening to transform
    # The broadcast has to reach the listener first; looking the frame up
    # immediately can fail because it is not known yet.
    listener.waitForTransform('/world', '/grasping_target', rospy.Time(0),
                              rospy.Duration(1.0))
    (trans, rot) = listener.lookupTransform('/world', '/grasping_target',
                                            rospy.Time(0))

    ## Build new pose
    pose_target_trans = geometry_msgs.msg.PoseStamped()
    pose_target_trans.header.frame_id = "/world"
    pose_target_trans.header.stamp = marker.header.stamp
    pose_target_trans.pose.position.x = trans[0]
    pose_target_trans.pose.position.y = trans[1]
    pose_target_trans.pose.position.z = trans[2]
    pose_target_trans.pose.orientation.x = rot[0]
    pose_target_trans.pose.orientation.y = rot[1]
    pose_target_trans.pose.orientation.z = rot[2]
    pose_target_trans.pose.orientation.w = rot[3]

    print("NEW POSITION.")
    print("Position X: %f." % pose_target_trans.pose.position.x)
    print("Position Y: %f." % pose_target_trans.pose.position.y)
    print("Position Z: %f." % pose_target_trans.pose.position.z)
    print("Orientation X: %f." % pose_target_trans.pose.orientation.x)
    print("Orientation Y: %f." % pose_target_trans.pose.orientation.y)
    print("Orientation Z: %f." % pose_target_trans.pose.orientation.z)
    print("Orientation W: %f." % pose_target_trans.pose.orientation.w)

    my_grasp_pub.publish(pose_target_trans)
    group.set_pose_target(pose_target_trans.pose, end_effector_link="my_eef")

    ## Now, we call the planner to compute the plan.  Note that we are just
    ## planning, not asking move_group to actually move the robot.
    plan1 = group.plan()

    # Uncomment below line when working with a real robot
    # group.go(wait=True)

    print("============ DONE PLANNING ============")

    ## Sleep to give Rviz time to visualize the plan, then drop the pose
    ## target.  Both steps used to sit after sys.exit() and never ran.
    rospy.sleep(5)
    group.clear_pose_targets()

    # NOTE(review): sys.exit() inside a subscriber callback raises
    # SystemExit in the callback's thread; confirm whether
    # rospy.signal_shutdown() is what is actually wanted here.
    sys.exit("DONE PLANNING")
def coordinate_genot_ss(genotype_file=None, hdf5_file=None, genetic_map_dir=None, check_mafs=False, min_maf=0.01):
    """
    Coordinate PLINK genotypes with the summary statistics held in an HDF5 file.

    Assumes plink BED files.  Genotypes are read through
    ``_parse_plink_snps_``; any treatment of missing genotypes happens there
    and is not visible in this function.

    For every chromosome present in the genotype file that also has a
    ``sum_stats/chrom_<n>`` group in ``hdf5_file``:

    * SNPs are matched on SNP ID (IDs must be unique in both sources);
    * SNPs whose genotype nucleotide pair is in ``ambig_nts`` or contains a
      nucleotide outside ``valid_nts`` are dropped;
    * when the summary-statistic alleles are the genotype alleles swapped
      (on either strand), ``betas`` and ``log_odds`` are negated and the
      summary-statistic frequency is replaced by ``1 - freq``; SNPs that
      match in neither orientation are dropped;
    * the remaining SNPs are sorted by position, optionally filtered on
      frequency discrepancy (``check_mafs``) and always filtered on
      ``min_maf``;
    * the result is written to ``cord_data/chrom_<n>`` in ``hdf5_file``.

    Parameters:
        genotype_file: passed to ``plinkfile.PlinkFile`` and
            ``_parse_plink_snps_``.
        hdf5_file: open, writable HDF5 file object that already contains a
            ``sum_stats`` group.  Receives ``y`` (only when more than one
            distinct phenotype value is found), ``fids``, ``iids`` and a new
            ``cord_data`` group.
        genetic_map_dir: optional prefix that is string-concatenated with
            ``'chr<n>.interpolated_genetic_map.gz'``, so it must end with a
            path separator.
        check_mafs: when true and the summary statistics carry ``freqs``,
            drop SNPs where ``|ss_freq - (1 - genotype_freq)| > 0.15``.
        min_maf: SNPs with genotype frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.

    Returns:
        None.  All output goes to ``hdf5_file`` and stdout.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # Accumulated across chromosomes; not read again inside this function.
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []
    tot_num_non_matching_nts = 0

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    chromosomes = sp.unique([locus.chromosome for locus in loci])
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome without summary statistics is skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        assert len(set(g_sids)) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        assert len(set(ss_sids)) == len(ss_sids), 'Some duplicates?'

        # Keep the shared SNPs, both listed in SNP-ID order.  Because IDs are
        # unique on both sides, the two index lists line up pairwise.
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)
        g_indices = [g_i for g_i in sp.argsort(g_sids) if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in sp.argsort(ss_sids) if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_g = []
        ok_ss = []
        for g_i, ss_i in it.izip(g_indices, ss_indices):
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # Is the nucleotide ambiguous?
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            ss_nt = ss_nts[ss_i]
            # Genotype alleles as read from the opposite strand.
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            # Are the nucleotides the same (on either strand)?
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                flip_nts = ((g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or
                            (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]))
                if not flip_nts:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                # Alleles are swapped: reverse the effect direction.
                betas[ss_i] = -betas[ss_i]
                log_odds[ss_i] = -log_odds[ss_i]
                if has_ss_freqs:
                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_g.append(g_i)
            ok_ss.append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position.  The index collections stay Python lists so
        # that an empty selection still indexes cleanly.
        positions = sp.array(chrom_d['positions'])[ok_g]
        order = sp.argsort(positions)
        ok_g = list(sp.array(ok_g)[order])
        ok_ss = list(sp.array(ok_ss)[order])
        positions = positions[order]

        # Parse SNPs: pinpoint where the SNPs are in the file.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_g]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print('raw_snps.shape= %s' % (raw_snps.shape,))

        # Means and standard deviations are derived from the frequencies
        # rather than measured on raw_snps.
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_ss]
        log_odds = log_odds[ok_ss]
        ps = ssg['ps'][...][ok_ss]
        nts = sp.array(ok_nts)[order]
        sids = ss_sids[ok_ss]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_ss]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # sp.negative() rejects boolean arrays on NumPy >= 1.13;
                # logical_not is the supported way to invert a mask.
                keep = sp.logical_not(freq_discrepancy_snp)
                (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts,
                 sids, betas, log_odds) = [
                    arr[keep] for arr in (raw_snps, snp_stds, snp_means, freqs, ps,
                                          positions, nts, sids, betas, log_odds)]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) & (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts,
             sids, betas, log_odds) = [
                arr[maf_filter] for arr in (raw_snps, snp_stds, snp_means, freqs, ps,
                                            positions, nts, sids, betas, log_odds)]
            print('%d SNPs with MAF < %0.3f were filtered' % (n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        # Score built from the raw genotypes and the log odds.
        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print('Normalizing SNPs')
            n_kept = len(raw_snps)
            snps = (raw_snps - snp_means.reshape(n_kept, 1)) / snp_stds.reshape(n_kept, 1)
            assert snps.shape == raw_snps.shape, 'Aha!'
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print('PRS correlation for chromosome %d was %0.4f' % (chrom, corr))
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print('Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (chrom, rb_corr))

        if genetic_map_dir is not None:
            # Collects the first column of every map line whose first column
            # is one of the retained SNP IDs.
            sid_set = set(sids)
            genetic_map = []
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    fields = line.split()
                    if fields[0] in sid_set:
                        genetic_map.append(fields[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)

        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None, reference_genotype_file=None, hdf5_file=None,
                                     genetic_map_dir=None, check_mafs=False, min_maf=0.01):
    """
    Coordinate validation genotypes, LD-reference genotypes and summary statistics.

    NOTE(review): a second definition with the same name appears later in
    this file and replaces this one at import time.

    For every chromosome present in the validation genotype file that also
    has a ``sum_stats/chrom_<n>`` group in ``hdf5_file``:

    * SNP IDs common to all three sources are collected and ordered by their
      position in the validation genotype file;
    * SNPs whose validation nucleotide pair is in ``ambig_nts`` or contains a
      nucleotide outside ``valid_nts`` are dropped;
    * SNPs whose validation and reference nucleotides disagree (on both
      strands) are dropped; when only the summary-statistic alleles are
      swapped, ``betas`` and ``log_odds`` are negated and the
      summary-statistic frequency is replaced by ``1 - freq``;
    * the remaining SNPs are optionally filtered on frequency discrepancy
      (``check_mafs``) and always filtered on ``min_maf`` (using the
      validation frequencies);
    * the result is written to ``cord_data/chrom_<n>`` in ``hdf5_file``.

    Parameters:
        genotype_file: validation genotypes; passed to
            ``plinkfile.PlinkFile`` and ``_parse_plink_snps_``.
        reference_genotype_file: LD-reference genotypes; handled the same way.
        hdf5_file: open, writable HDF5 file object that already contains a
            ``sum_stats`` group and does not yet contain ``iids``.  Receives
            ``y`` (only when more than one distinct phenotype value is
            found), ``fids``, ``iids`` and a new ``cord_data`` group.
        genetic_map_dir: optional prefix that is string-concatenated with
            ``'chr<n>.interpolated_genetic_map.gz'``, so it must end with a
            path separator.
        check_mafs: when true and the summary statistics carry ``freqs``,
            drop SNPs where ``|ss_freq - (1 - validation_freq)| > 0.15``.
        min_maf: SNPs with validation frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.

    Returns:
        None.  All output goes to ``hdf5_file`` and stdout.
    """
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' % (genotype_file, reference_genotype_file))
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # Figure out chromosomes and positions.
    print('Parsing validation genotype bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    chromosomes = sp.unique([locus.chromosome for locus in loci])
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print('Parsing LD reference genotype bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert 'iids' not in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Accumulated across chromosomes; not read again inside this function.
    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome without summary statistics is skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print('Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))

        # SNP ID -> index within each source.
        ss_sid_dict = {sid: i for i, sid in enumerate(ss_sids)}
        g_sid_dict = {sid: i for i, sid in enumerate(g_sids)}
        rg_sid_dict = {sid: i for i, sid in enumerate(rg_sids)}

        # Order the common SNPs by their validation-file positions.
        g_snp_map = [g_sid_dict[sid] for sid in common_sids]
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order].tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps, in the same order.
        rg_snp_map = [rg_sid_dict[sid] for sid in common_sids]
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        # Element-wise nucleotide matches, halved (two nucleotides per SNP).
        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print('Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_g = []
        ok_rg = []
        ok_ss = []
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            rg_nt = rg_nts[rg_i]
            ss_nt = ss_nts[ss_i]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # Is the nucleotide ambiguous.
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # Validation alleles as read from the opposite strand.
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            matches_ss = sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)
            matches_rg = sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt)

            if not (matches_ss and matches_rg):
                if not matches_rg:
                    # Validation and reference genotypes disagree: drop.
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # Try flipping the SS nt
                flip_nts = ((g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or
                            (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]))
                if not flip_nts:
                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" %
                          (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                # Alleles are swapped: reverse the effect direction.
                betas[ss_i] = -betas[ss_i]
                log_odds[ss_i] = -log_odds[ss_i]
                if has_ss_freqs:
                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_g.append(g_i)
            ok_rg.append(rg_i)
            ok_ss.append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)
        print('%d SNPs were retained on chromosome %d.' % (len(ok_g), chrom))

        # The maps were already ordered by position above, so no re-sort here.
        positions = sp.array(chrom_d['positions'])[ok_g]

        # Now parse SNPs: pinpoint where the SNPs are in each file.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_g]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])[ok_rg]
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)

        # Means and standard deviations are derived from the frequencies.
        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_ss]
        log_odds = log_odds[ok_ss]
        ps = ssg['ps'][...][ok_ss]
        nts = sp.array(ok_nts)
        sids = ss_sids[ok_ss]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_ss]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp))

                # sp.negative() rejects boolean arrays on NumPy >= 1.13;
                # logical_not is the supported way to invert a mask.
                keep = sp.logical_not(freq_discrepancy_snp)
                (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref,
                 snp_means_ref, freqs, freqs_ref, ps, positions, nts, sids,
                 betas, log_odds) = [
                    arr[keep] for arr in (raw_snps, snp_stds, snp_means, raw_ref_snps,
                                          snp_stds_ref, snp_means_ref, freqs, freqs_ref,
                                          ps, positions, nts, sids, betas, log_odds)]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) & (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref,
             snp_means_ref, freqs, freqs_ref, ps, positions, nts, sids,
             betas, log_odds) = [
                arr[maf_filter] for arr in (raw_snps, snp_stds, snp_means, raw_ref_snps,
                                            snp_stds_ref, snp_means_ref, freqs, freqs_ref,
                                            ps, positions, nts, sids, betas, log_odds)]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            # sid_set was previously read here without ever being defined,
            # which raised NameError whenever a genetic map was requested.
            sid_set = set(sids)
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    fields = line.split()
                    if fields[0] in sid_set:
                        genetic_map.append(fields[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)

        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None, reference_genotype_file=None, hdf5_file=None,
                                     genetic_map_dir=None, check_mafs=False, min_maf=0.01):
    """
    Coordinate validation genotypes, LD-reference genotypes and summary statistics.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition that is in effect after import.

    For every chromosome present in the validation genotype file that also
    has a ``sum_stats/chrom_<n>`` group in ``hdf5_file``:

    * SNP IDs common to all three sources are collected and ordered by their
      position in the validation genotype file;
    * SNPs whose validation nucleotide pair is in ``ambig_nts`` or contains a
      nucleotide outside ``valid_nts`` are dropped;
    * SNPs whose validation and reference nucleotides disagree (on both
      strands) are dropped; when only the summary-statistic alleles are
      swapped, ``betas`` and ``log_odds`` are negated and the
      summary-statistic frequency is replaced by ``1 - freq``;
    * the remaining SNPs are optionally filtered on frequency discrepancy
      (``check_mafs``) and always filtered on ``min_maf`` (using the
      validation frequencies);
    * the result is written to ``cord_data/chrom_<n>`` in ``hdf5_file``.

    Parameters:
        genotype_file: validation genotypes; passed to
            ``plinkfile.PlinkFile`` and ``_parse_plink_snps_``.
        reference_genotype_file: LD-reference genotypes; handled the same way.
        hdf5_file: open, writable HDF5 file object that already contains a
            ``sum_stats`` group and does not yet contain ``iids``.  Receives
            ``y`` (only when more than one distinct phenotype value is
            found), ``fids``, ``iids`` and a new ``cord_data`` group.
        genetic_map_dir: optional prefix that is string-concatenated with
            ``'chr<n>.interpolated_genetic_map.gz'``, so it must end with a
            path separator.
        check_mafs: when true and the summary statistics carry ``freqs``,
            drop SNPs where ``|ss_freq - (1 - validation_freq)| > 0.15``.
        min_maf: SNPs with validation frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.

    Returns:
        None.  All output goes to ``hdf5_file`` and stdout.
    """
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' % (genotype_file, reference_genotype_file))
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # Figure out chromosomes and positions.
    print('Parsing validation genotype bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    chromosomes = sp.unique([locus.chromosome for locus in loci])
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print('Parsing LD reference genotype bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert 'iids' not in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Accumulated across chromosomes; not read again inside this function.
    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome without summary statistics is skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print('Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))

        # SNP ID -> index within each source.
        ss_sid_dict = {sid: i for i, sid in enumerate(ss_sids)}
        g_sid_dict = {sid: i for i, sid in enumerate(g_sids)}
        rg_sid_dict = {sid: i for i, sid in enumerate(rg_sids)}

        # Order the common SNPs by their validation-file positions.
        g_snp_map = [g_sid_dict[sid] for sid in common_sids]
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order].tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps, in the same order.
        rg_snp_map = [rg_sid_dict[sid] for sid in common_sids]
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        # Element-wise nucleotide matches, halved (two nucleotides per SNP).
        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print('Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_g = []
        ok_rg = []
        ok_ss = []
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            rg_nt = rg_nts[rg_i]
            ss_nt = ss_nts[ss_i]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # Is the nucleotide ambiguous.
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # Validation alleles as read from the opposite strand.
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            matches_ss = sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)
            matches_rg = sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt)

            if not (matches_ss and matches_rg):
                if not matches_rg:
                    # Validation and reference genotypes disagree: drop.
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # Try flipping the SS nt
                flip_nts = ((g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or
                            (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]))
                if not flip_nts:
                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" %
                          (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                # Alleles are swapped: reverse the effect direction.
                betas[ss_i] = -betas[ss_i]
                log_odds[ss_i] = -log_odds[ss_i]
                if has_ss_freqs:
                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_g.append(g_i)
            ok_rg.append(rg_i)
            ok_ss.append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)
        print('%d SNPs were retained on chromosome %d.' % (len(ok_g), chrom))

        # The maps were already ordered by position above, so no re-sort here.
        positions = sp.array(chrom_d['positions'])[ok_g]

        # Now parse SNPs: pinpoint where the SNPs are in each file.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_g]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])[ok_rg]
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)

        # Means and standard deviations are derived from the frequencies.
        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_ss]
        log_odds = log_odds[ok_ss]
        ps = ssg['ps'][...][ok_ss]
        nts = sp.array(ok_nts)
        sids = ss_sids[ok_ss]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_ss]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp))

                # sp.negative() rejects boolean arrays on NumPy >= 1.13;
                # logical_not is the supported way to invert a mask.
                keep = sp.logical_not(freq_discrepancy_snp)
                (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref,
                 snp_means_ref, freqs, freqs_ref, ps, positions, nts, sids,
                 betas, log_odds) = [
                    arr[keep] for arr in (raw_snps, snp_stds, snp_means, raw_ref_snps,
                                          snp_stds_ref, snp_means_ref, freqs, freqs_ref,
                                          ps, positions, nts, sids, betas, log_odds)]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) & (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref,
             snp_means_ref, freqs, freqs_ref, ps, positions, nts, sids,
             betas, log_odds) = [
                arr[maf_filter] for arr in (raw_snps, snp_stds, snp_means, raw_ref_snps,
                                            snp_stds_ref, snp_means_ref, freqs, freqs_ref,
                                            ps, positions, nts, sids, betas, log_odds)]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            # sid_set was previously read here without ever being defined,
            # which raised NameError whenever a genetic map was requested.
            sid_set = set(sids)
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    fields = line.split()
                    if fields[0] in sid_set:
                        genetic_map.append(fields[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)

        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
def parse_1KG_snp_info(
        input_file='/project/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
        out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_SNP_INFO_EUR_MAF0.05.hdf5',
        filter_ambiguous=True,
        maf_thres=0.05):
    """
    Write a per-chromosome SNP info file for the European 1K-genomes samples.

    For chromosomes 1-22 the genotypes of the individuals whose
    ``indivs/continent`` equals ``'EUR'`` are loaded from ``input_file``.
    SNPs flagged ``MULTI_ALLELIC`` are dropped, then SNPs whose REF or ALT
    nucleotide is not in the module-level ``ok_nts``, then SNPs whose minor
    allele frequency is below ``maf_thres``.  For every chromosome a group
    ``chr<n>`` with datasets ``sids``, ``positions``, ``eur_mafs``, ``ref``
    and ``alt`` is created in ``out_file``.

    Parameters:
        input_file: path of the source HDF5 file (opened read-only).
        out_file: path of the HDF5 file to write (opened in append mode, so
            re-running against an existing file fails when a ``chr<n>``
            group already exists).
        filter_ambiguous: accepted for interface compatibility; not used by
            this function.
        maf_thres: minimum minor allele frequency a SNP must have to be kept.

    Returns:
        None.
    """
    print('Generating a SNP info file')
    # Explicit modes: the h5py default mode differs between major versions.
    with h5py.File(input_file, 'r') as ih5f, h5py.File(out_file, 'a') as oh5f:
        num_indivs = len(ih5f['indivs']['continent'])
        eur_filter = ih5f['indivs']['continent'][...] == 'EUR'
        num_eur_indivs = sp.sum(eur_filter)
        print('Number of European individuals: %d \nTotal number of individuals: %d' % (num_eur_indivs, num_indivs))

        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom
            variants = ih5f[chrom_str]['variants']

            print('Loading SNPs and data')
            snps = sp.array(ih5f[chrom_str]['calldata']['snps'][...], dtype='int8')
            print('Excluding non-European individuals')
            snps = snps[:, eur_filter]

            print("Loading other SNP information")
            snp_ids = variants['ID'][...]
            positions = variants['POS'][...]
            print('Loading NTs')
            ref_nts = variants['REF'][...]
            alt_nts = variants['ALT'][...]

            print('Filtering multi-allelic SNPs')
            # sp.negative() rejects boolean arrays on NumPy >= 1.13;
            # logical_not is the supported way to invert a mask.
            keep = sp.logical_not(variants['MULTI_ALLELIC'][...])
            snps, ref_nts, alt_nts, snp_ids, positions = [
                arr[keep] for arr in (snps, ref_nts, alt_nts, snp_ids, positions)]

            print('Filter SNPs with missing NT information')
            keep = sp.in1d(ref_nts, ok_nts) & sp.in1d(alt_nts, ok_nts)
            if not sp.all(keep):
                snps, ref_nts, alt_nts, snp_ids, positions = [
                    arr[keep] for arr in (snps, ref_nts, alt_nts, snp_ids, positions)]

            print('Filtering SNPs with MAF < %s' % maf_thres)
            mean_dosage = sp.sum(snps, axis=1) / float(num_eur_indivs)
            assert sp.all(0.0 <= mean_dosage) and sp.all(mean_dosage <= 2.0), 'AF is out of range'
            # NOTE(review): the [0, 2] range check above implies genotypes are
            # coded as 0/1/2 allele counts, so the allele frequency is half
            # the mean dosage -- confirm against the input file's encoding.
            afs = mean_dosage / 2.0
            mafs = sp.minimum(afs, 1.0 - afs)
            # Keep SNPs at or above the threshold; the previous mask
            # (mafs < maf_thres) retained exactly the SNPs meant to be removed.
            keep = mafs >= maf_thres
            ref_nts, alt_nts, snp_ids, positions, mafs = [
                arr[keep] for arr in (ref_nts, alt_nts, snp_ids, positions, mafs)]

            g = oh5f.create_group(chrom_str)
            g.create_dataset('sids', data=snp_ids)
            g.create_dataset('positions', data=positions)
            g.create_dataset('eur_mafs', data=mafs)
            g.create_dataset('ref', data=ref_nts)
            g.create_dataset('alt', data=alt_nts)
            oh5f.flush()
def coordinate_ss(genotype_file=None, ssfformat=None, hdf5_file=None, outfile=None,
                  genetic_map_dir=None, check_mafs=False, min_maf=0.01,
                  skip_coordination=False, keep_all=False, skip_ambiguous=False):
    """
    Coordinate summary statistics with the SNPs of a plink BED genotype file.

    Assumes plink BED files.  For every chromosome found in the genotype
    file, the SNPs shared with the summary statistics are matched by SNP ID,
    their nucleotides are compared (flipping the sign of the effect when the
    alleles are swapped), and the surviving effects are filtered by allele
    frequency.

    Parameters:
        genotype_file: passed to `plinkfile.PlinkFile` and to
            `_parse_plink_snps_freqs_`.
        ssfformat: when "LDSCORE" or "STANDARD_FUNCT" the 'ld_score' dataset
            of each chromosome group is read and filtered as well.
        hdf5_file: mapping with a 'sum_stats' entry that holds one group
            'chrom_<N>' per chromosome with the datasets 'sids', 'nts',
            'betas', 'log_odds' and optionally 'freqs' / 'ld_score'.
        outfile, genetic_map_dir, keep_all: accepted but not used here.
        check_mafs: if True (and the summary statistics have 'freqs'), SNPs
            whose frequency differs by more than 0.15 between the two data
            sets are dropped.
        min_maf: SNPs with a genotype frequency outside
            (min_maf, 1 - min_maf) are dropped.
        skip_coordination: if True, no nucleotide checks are performed.
        skip_ambiguous: if True, the exclusion of SNPs listed in `ambig_nts`
            is skipped (i.e. ambiguous SNPs are kept).

    Returns:
        dict mapping 'chrom_<N>' to {'betas': ..., 'log_odds': ...} for the
        retained SNPs, ordered by position.

    NOTE(review): the original docstring also said "Imputes missing
    genotypes"; nothing in this function does that, so it presumably refers
    to the genotype parsing helper -- confirm.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        samples = plinkf.get_samples()
        # Figure out chromosomes and positions by looking at SNPs.
        loci = plinkf.get_loci()
    finally:
        plinkf.close()

    # The phenotypes are only inspected to report what kind of data was found.
    Y = [s.phenotype for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
    else:
        print('Found quantitative phenotype values')

    ssf = hdf5_file['sum_stats']
    ssf_dict = {}
    use_ld_score = ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT"

    chromosomes = sp.unique([l.chromosome for l in loci])
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        assert len(set(g_sids)) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        assert len(set(ss_sids)) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs.  Both lists contain the same (unique) IDs in the
        # same sorted order, so they can be walked in lockstep below.
        g_indices = [g_i for g_i in sp.argsort(g_sids) if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in sp.argsort(ss_sids) if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        if use_ld_score:
            ld_score = ssg['ld_score'][...]
        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        # Track allele flips.  It is indexed by position in the summary
        # statistics, so it needs one entry per summary-statistic SNP.
        # NOTE(review): the flips are recorded but not stored anywhere yet.
        ss_flips = sp.ones(len(ss_sids))
        assert not sp.any(sp.isnan(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in zip(g_indices, ss_indices):
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            if not skip_coordination:
                # Is the nucleotide ambiguous?
                if not skip_ambiguous:
                    if tuple(g_nt) in ambig_nts:
                        num_ambig_nts += 1
                        continue

                if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                    num_non_matching_nts += 1
                    continue

                ss_nt = ss_nts[ss_i]
                # Are the nucleotides the same (on either strand)?
                os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Same alleles but in swapped order => flip the effect.
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                        os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        ss_flips[ss_i] = -1
                        if has_ss_freqs:
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        num_non_matching_nts += 1
                        continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Pinpoint where the SNPs are in the genotype file and get their
        # frequencies.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_indices['g']]
        freqs = _parse_plink_snps_freqs_(genotype_file, snp_indices)

        # NOTE(review): sids and ld_score are kept aligned with betas below
        # but, as in the original, are not part of the stored result.
        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        sids = ss_sids[ok_indices['ss']]
        if use_ld_score:
            ld_score = ld_score[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample'
                      % sp.sum(freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # Filter freq_discrepancy_snps (logical_not: NumPy refuses
                # to negate boolean arrays arithmetically).
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                freqs = freqs[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if use_ld_score:
                    ld_score = ld_score[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if use_ld_score:
                ld_score = ld_score[maf_filter]
            print('%d SNPs with MAF < %0.3f were filtered' % (n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        # The per-chromosome entry has to be created before it can be filled
        # (indexing the empty dict raised a KeyError).
        ssf_dict[chr_str] = {'betas': betas, 'log_odds': log_odds}

    return ssf_dict
def gen_unrelated_eur_1k_data(input_file='/home/bjarni/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
                              out_file='/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
                              maf_thres=0.01, max_relatedness=0.05, K_thinning_frac=0.1, debug=False):
    """
    Store the genotypes of a set of unrelated European 1K genomes individuals.

    The function works in two passes over chromosomes 1-22 of `input_file`:

    1. A kinship matrix for the individuals labelled 'EUR' is estimated from
       a random fraction (`K_thinning_frac`) of the bi-allelic SNPs whose
       genotype standard deviation exceeds sqrt(2 * maf * (1 - maf)).
       Individuals are then visited in order and, for every individual still
       kept, all later individuals with a kinship above `max_relatedness`
       are dropped.
    2. For the retained individuals the bi-allelic, polymorphic SNPs with
       REF/ALT nucleotides in the module-level `ok_nts` are written to
       `out_file` (which is overwritten), one group 'chr<N>' per chromosome
       with the datasets 'snps', 'snp_means', 'snp_stds', 'snp_ids',
       'positions' and 'nts', plus a top-level 'indiv_ids' dataset.

    Parameters:
        input_file: path of the 1K genomes HDF5 file to read.
        out_file: path of the HDF5 file to create.
        maf_thres: MAF threshold used for the kinship SNPs.
        max_relatedness: largest kinship allowed between retained individuals.
        K_thinning_frac: fraction of SNPs sampled for the kinship estimate;
            values >= 1 disable the thinning.
        debug: if True, a per-chromosome kinship among the retained
            individuals is recomputed while writing and an Exception is
            raised if it looks inconsistent.
    """
    with h5py.File(input_file, 'r') as h5f:
        continents = h5f['indivs']['continent'][...]
        num_indivs = len(continents)
        eur_filter = continents == 'EUR'
        num_eur_indivs = sp.sum(eur_filter)
        print('Number of European individuals: %d' % num_eur_indivs)
        K = sp.zeros((num_eur_indivs, num_eur_indivs), dtype='single')
        num_snps = 0
        # Standard deviation of a 0/1/2 genotype with allele frequency maf_thres.
        std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))

        print('Calculating kinship')
        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom
            variants = h5f[chrom_str]['variants']

            print('Loading SNPs and data')
            snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')

            print('Loading NTs')
            ref_nts = variants['REF'][...]
            alt_nts = variants['ALT'][...]

            print('Filtering multi-allelic SNPs')
            # logical_not: NumPy refuses to negate boolean arrays arithmetically.
            multi_allelic_filter = sp.logical_not(variants['MULTI_ALLELIC'][...])
            snps = snps[multi_allelic_filter]
            ref_nts = ref_nts[multi_allelic_filter]
            alt_nts = alt_nts[multi_allelic_filter]

            if K_thinning_frac < 1:
                print('Thinning SNPs for kinship calculation')
                thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                snps = snps[thinning_filter]
                alt_nts = alt_nts[thinning_filter]
                ref_nts = ref_nts[thinning_filter]

            print('Filter SNPs with missing NT information')
            nt_filter = sp.in1d(ref_nts, ok_nts) & sp.in1d(alt_nts, ok_nts)
            if sp.sum(nt_filter) < len(nt_filter):
                snps = snps[nt_filter]

            print('Filtering non-European individuals')
            snps = snps[:, eur_filter]

            print('Filtering SNPs with MAF < %s' % maf_thres)
            snp_stds = sp.std(snps, 1)
            maf_filter = snp_stds.flatten() > std_thres
            snps = snps[maf_filter]
            snp_stds = snp_stds[maf_filter]

            print('%d SNPs remaining after all filtering steps.' % len(snps))

            print('Normalizing SNPs')
            snp_means = sp.mean(snps, 1)
            norm_snps = (snps - snp_means[sp.newaxis].T) / snp_stds[sp.newaxis].T

            print('Updating kinship')
            K += sp.dot(norm_snps.T, norm_snps)
            num_snps += len(norm_snps)
            # Every normalized SNP has unit variance, so the average diagonal
            # of the accumulated matrix must equal the number of SNPs.
            assert sp.isclose(sp.sum(sp.diag(K)) / (num_snps * num_eur_indivs), 1.0)

        K = K / float(num_snps)
        print('Kinship calculation done using %d SNPs\n' % num_snps)

        # Filter individuals
        print('Filtering individuals')
        keep_indiv_set = set(range(num_eur_indivs))
        for i in range(num_eur_indivs):
            if i in keep_indiv_set:
                related = sp.nonzero(K[i, i + 1:] > max_relatedness)[0] + (i + 1)
                keep_indiv_set.difference_update(related.tolist())
        keep_indivs = sorted(keep_indiv_set)
        print('Retained %d individuals\n' % len(keep_indivs))

        # Checking that everything is ok!
        K_ok = K[keep_indivs]
        K_ok = K_ok[:, keep_indivs]
        assert (K_ok - sp.tril(K_ok)).max() <= max_relatedness

        indiv_filter = sp.zeros(num_indivs, dtype=bool)
        indiv_filter[(sp.arange(num_indivs)[eur_filter])[keep_indivs]] = True
        assert sp.sum(indiv_filter) == len(keep_indivs)

        # Store in new file
        print('Now storing data.')
        with h5py.File(out_file, 'w') as oh5f:
            indiv_ids = h5f['indivs']['indiv_ids'][indiv_filter]
            oh5f.create_dataset('indiv_ids', data=indiv_ids)
            for chrom in range(1, 23):
                print('Working on Chromosome %d' % chrom)
                chrom_str = 'chr%d' % chrom
                variants = h5f[chrom_str]['variants']

                print('Loading SNPs and data')
                snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')
                snp_ids = variants['ID'][...]
                positions = variants['POS'][...]

                print('Loading NTs')
                ref_nts = variants['REF'][...]
                alt_nts = variants['ALT'][...]

                print('Filtering multi-allelic SNPs')
                multi_allelic_filter = sp.logical_not(variants['MULTI_ALLELIC'][...])
                snps = snps[multi_allelic_filter]
                ref_nts = ref_nts[multi_allelic_filter]
                alt_nts = alt_nts[multi_allelic_filter]
                positions = positions[multi_allelic_filter]
                snp_ids = snp_ids[multi_allelic_filter]

                print('Filter individuals')
                snps = snps[:, indiv_filter]

                print('Filter SNPs with missing NT information')
                nt_filter = sp.in1d(ref_nts, ok_nts) & sp.in1d(alt_nts, ok_nts)
                if sp.sum(nt_filter) < len(nt_filter):
                    snps = snps[nt_filter]
                    ref_nts = ref_nts[nt_filter]
                    alt_nts = alt_nts[nt_filter]
                    positions = positions[nt_filter]
                    snp_ids = snp_ids[nt_filter]

                print('filter monomorphic SNPs')
                snp_stds = sp.std(snps, 1)
                mono_morph_filter = snp_stds > 0
                snps = snps[mono_morph_filter]
                ref_nts = ref_nts[mono_morph_filter]
                alt_nts = alt_nts[mono_morph_filter]
                positions = positions[mono_morph_filter]
                snp_ids = snp_ids[mono_morph_filter]
                snp_stds = snp_stds[mono_morph_filter]
                snp_means = sp.mean(snps, 1)

                if debug:
                    if K_thinning_frac < 1:
                        print('Thinning SNPs for kinship calculation')
                        thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                        k_snps = snps[thinning_filter]
                        k_snp_stds = snp_stds[thinning_filter]
                    else:
                        # Without thinning every SNP takes part in the check.
                        k_snps = snps
                        k_snp_stds = snp_stds

                    print('Filtering SNPs with MAF < %s' % maf_thres)
                    maf_filter = k_snp_stds.flatten() > std_thres
                    k_snps = k_snps[maf_filter]
                    k_snp_stds = k_snp_stds[maf_filter]
                    # Per-SNP means (axis 1), matching the per-SNP stds.
                    k_snp_means = sp.mean(k_snps, 1)

                    print('Verifying that the Kinship makes sense')
                    norm_snps = (k_snps - k_snp_means[sp.newaxis].T) / k_snp_stds[sp.newaxis].T
                    # Use chromosome-local values: the check is about this
                    # chromosome and the retained individuals only.
                    num_k_snps = len(norm_snps)
                    chrom_K = sp.dot(norm_snps.T, norm_snps) / float(num_k_snps)
                    diag_ok = sp.isclose(sp.sum(sp.diag(chrom_K)) / len(keep_indivs), 1.0)
                    max_rel = (chrom_K - sp.tril(chrom_K)).max()
                    if diag_ok and max_rel < (max_relatedness * 1.5):
                        print('It looks OK!')
                    else:
                        raise Exception('Kinship looks wrong?')

                nts = sp.array([[nt1, nt2] for nt1, nt2 in zip(ref_nts, alt_nts)])

                print('Writing to disk')
                cg = oh5f.create_group(chrom_str)
                cg.create_dataset('snps', data=snps)
                cg.create_dataset('snp_means', data=snp_means[sp.newaxis].T)
                cg.create_dataset('snp_stds', data=snp_stds[sp.newaxis].T)
                cg.create_dataset('snp_ids', data=snp_ids)
                cg.create_dataset('positions', data=positions)
                cg.create_dataset('nts', data=nts)
                oh5f.flush()
                print('Done writing to disk')

    print('Done')
def _summarize_cegs_replicates_(Y, Z, positive_only=False):
    """
    Collapse replicate measurements into per-individual means and medians.

    Y holds one value per replicate (NaN marks a missing measurement) and
    Z is the replicate-by-individual incidence matrix: row i contains a 1
    in the column of the individual that replicate i belongs to.

    Returns a dict with 'Y_means', 'rep_count', 'Y_medians' (one entry per
    column of Z; NaN where an individual has no observed replicate) and
    'ind_filter', a boolean mask of the individuals with a usable mean
    (additionally requiring a positive mean when `positive_only` is set).
    """
    # Missing replicates are excluded from both the means and the medians.
    sample_filter = sp.logical_not(sp.isnan(Y))
    Y_obs = Y[sample_filter]
    Z_obs = Z[sample_filter]

    Ys_sum = sp.dot(Y_obs, Z_obs)
    rep_count = sp.dot(sp.ones(len(Y_obs)), Z_obs)
    Y_means = Ys_sum / rep_count

    # Medians need the individual values, so group them by individual.
    phen_vals_list = [[] for _ in range(Z.shape[1])]
    for y_val, z_row in zip(Y_obs, Z_obs):
        ind_i = sp.where(1 == z_row)[0][0]
        phen_vals_list[ind_i].append(y_val)
    medians = sp.array([sp.median(pl) if len(pl) > 0 else sp.nan
                        for pl in phen_vals_list], dtype=float)

    ind_filter = sp.logical_not(sp.isnan(Y_means))
    if positive_only:
        ind_filter = (Y_means > 0) * ind_filter

    return {'Y_means': Y_means, 'rep_count': rep_count,
            'ind_filter': ind_filter, 'Y_medians': medians}


def parse_cegs_drosophila_phenotypes(
        phenotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/allphenotypes_5.0_cleaned.tab.reps.hdf5',
        plot_dir='/Users/bjarnivilhjalmsson/data/tmp'):
    """
    Parser for CEGS Drosophila phenotype data.

    For every phenotype group in `phenotype_file` the replicate values of
    the mated ('Y_mated' / 'Z_mated') and virgin ('Y_virgin' / 'Z_virgin')
    flies are collapsed into per-individual means and medians, histograms of
    both are saved as PNG files in `plot_dir`, and the correlation between
    mated and virgin flies is computed over the individuals that pass the
    filter in both conditions.  For the 'Triglyceride' phenotype only
    individuals with a positive mean are kept.

    Returns a dict: phenotype name -> {'mated': summary, 'virgin': summary,
    'corrs': {'means': r, 'medians': r}}, where each summary has the keys
    'Y_means', 'rep_count', 'ind_filter' and 'Y_medians'.
    """
    import pylab

    phen_dict = {}
    ph5f = h5py.File(phenotype_file, 'r')
    try:
        for phen in ph5f.keys():
            positive_only = phen == 'Triglyceride'
            summaries = {}
            for status in ('mated', 'virgin'):
                summary = _summarize_cegs_replicates_(
                    ph5f[phen]['Y_%s' % status][...],
                    ph5f[phen]['Z_%s' % status][...],
                    positive_only=positive_only)
                summaries[status] = summary

                print('Plotting phenotype histograms for %s, %s' % (phen, status))
                for stat_name, stat_key in (('means', 'Y_means'), ('medians', 'Y_medians')):
                    pylab.hist(summary[stat_key][summary['ind_filter']])
                    pylab.savefig('%s/cegs_hist_%s_%s_%s.png'
                                  % (plot_dir, phen, status, stat_name))
                    pylab.clf()

            # Correlate the same individuals in both conditions: the two
            # per-condition filters can differ, which would misalign (or
            # mismatch the lengths of) the compared vectors.
            joint_filter = summaries['mated']['ind_filter'] * summaries['virgin']['ind_filter']
            means_corr = sp.corrcoef(summaries['mated']['Y_means'][joint_filter],
                                     summaries['virgin']['Y_means'][joint_filter])[0, 1]
            medians_corr = sp.corrcoef(summaries['mated']['Y_medians'][joint_filter],
                                       summaries['virgin']['Y_medians'][joint_filter])[0, 1]
            print('Correlation between mated and virgin flies, means: %0.2f, medians: %0.2f'
                  % (means_corr, medians_corr))

            phen_dict[phen] = {
                'mated': summaries['mated'],
                'virgin': summaries['virgin'],
                'corrs': {'means': means_corr, 'medians': medians_corr},
            }
    finally:
        ph5f.close()

    return phen_dict