def work_item2(pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
               xxx_todo_changeme, xxx_todo_changeme1, xxx_todo_changeme2,
               just_testing, do_uncorr, do_gxe2, a2):
    """Run one unit of the spatially-corrected heritability analysis.

    For a single phenotype, builds an environment kernel from the spatial
    coordinates, aligns it with ``G_kernel`` and ``pheno``, optionally applies
    a jackknife subset and/or permutations, standardizes everything, and then
    fits up to three ``LMM`` models (G alone, G mixed with E, and G+E mixed
    with GxE).

    Parameters
    ----------
    pheno
        Phenotype reader; must expose exactly one sid. Rows whose value is NaN
        are dropped before anything else is done.
    G_kernel
        Kernel reader that is intersected with the environment kernel and the
        phenotype.
    spatial_coor
        Either the coordinate values handed to ``spatial_similarity``, or a
        ``str`` path to a space-delimited table with columns ``id``,
        ``south_new`` and ``east_new`` (rows with missing values are dropped).
    spatial_iid
        iids matching ``spatial_coor``. Must be ``None`` when ``spatial_coor``
        is a ``str``; it is then built from the table's ``id`` column.
    alpha, alpha_power
        Forwarded to ``spatial_similarity`` (``alpha_power`` as ``power``) and
        echoed in the result.
    xxx_todo_changeme
        Tuple ``(jackknife_index, jackknife_count, jackknife_seed)``. A
        negative index disables the jackknife.
    xxx_todo_changeme1
        Tuple ``(permute_plus_index, permute_plus_count, permute_plus_seed)``.
        A negative index disables the permutation of the E kernel.
    xxx_todo_changeme2
        Tuple ``(permute_times_index, permute_times_count,
        permute_times_seed)``. A negative index disables the permutation of
        the GxE values.
    just_testing
        When true, the ``LMM`` objects are still set up but the searches
        (``findH2`` / ``findA2``) are skipped and placeholder values are used.
    do_uncorr
        Whether to fit the G-only model; otherwise its outputs are NaN.
    do_gxe2
        Whether to fit the GxE model; otherwise its outputs are NaN.
    a2
        Mixing weight between G and E. When ``None`` it is searched for (or set
        to 0.5 under ``just_testing``); when given, the G+E fit is skipped and
        its outputs are NaN.

    Returns
    -------
    dict
        The estimates (``h2uncorr``, ``h2corr``, ``e2``, ``a2``, ``gxe2``, ...),
        their negative log likelihoods, and an echo of ``alpha``,
        ``alpha_power``, the phenotype name and the jackknife/permutation
        settings.
    """
    jackknife_index, jackknife_count, jackknife_seed = xxx_todo_changeme
    permute_plus_index, permute_plus_count, permute_plus_seed = xxx_todo_changeme1
    permute_times_index, permute_times_count, permute_times_seed = xxx_todo_changeme2

    # Every seed is reduced modulo this value before it reaches an RNG.
    seed_modulus = 4294967295

    # --- Spatial input given as a file name: load ids and coordinates from it.
    if isinstance(spatial_coor, str):
        assert spatial_iid is None, "if spatial_coor is a str, then spatial_iid should be None"
        gps_table = pd.read_csv(spatial_coor, delimiter=" ").dropna()
        id_column = gps_table["id"].values
        # Each id is used for both halves of the iid pair.
        spatial_iid = np.array([(one_id, one_id) for one_id in id_column])
        spatial_coor = gps_table[["south_new", "east_new"]].values

    # --- Drop individuals whose phenotype is missing.
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    first_column = pheno.val[:, 0]
    # NaN never compares equal to itself, so this mask is False exactly on NaN.
    is_present = first_column == first_column
    pheno = pheno[is_present, :]

    # --- Environment: turn the spatial information into a KernelData.
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    # --- Intersect, then jackknife / permute, then standardize (in that order,
    #     because standardization depends on the final set of iids).
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        splitter = model_selection.KFold(
            n_splits=jackknife_count,
            shuffle=True,
            random_state=jackknife_seed % seed_modulus,
        )
        folds = splitter.split(list(range(G_kernel.iid_count)))
        # Keep the first index array of the selected fold; the second is unused.
        kept_rows, _ = _nth(folds, jackknife_index)
        pheno = pheno[kept_rows, :]
        G_kernel = G_kernel[kept_rows]
        E_kernel = E_kernel[kept_rows]

    if permute_plus_index >= 0:
        # Shuffle the values but keep the iids in place (shuffling both would
        # cancel out). The permutation index is folded into the seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % seed_modulus)
        shuffled_rows = np.arange(G_kernel.iid_count)
        np.random.shuffle(shuffled_rows)
        permuted_E = E_kernel[shuffled_rows].read()
        E_kernel = KernelData(
            iid=E_kernel.iid,
            val=permuted_E.val,
            name="permutation {0}".format(permute_plus_index),
        )

    pheno = pheno.read().standardize()  # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    sample_count = G_kernel.iid_count

    # --- h2uncorr: best mixing weight of pure random noise and G_kernel.
    if do_uncorr:
        logging.info("Find best h2 for G_kernel")
        model_g = LMM()
        model_g.setK(K0=G_kernel.val)
        model_g.setX(np.ones([sample_count, 1]))  # just a bias column
        model_g.sety(pheno.val[:, 0])
        if just_testing:
            h2uncorr, nLLuncorr = 0, 0
        else:
            fit_g = model_g.findH2()
            h2uncorr = fit_g["h2"]
            nLLuncorr = fit_g["nLL"]
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(
            h2uncorr, nLLuncorr))
    else:
        h2uncorr, nLLuncorr = np.nan, np.nan

    # --- a2: best mixing of G_kernel and E_kernel (only when not supplied).
    if a2 is not None:
        h2corr, e2, nLLcorr, h2corr_raw = np.nan, np.nan, np.nan, np.nan
    else:
        logging.info("Find best mixing for G_kernel and E_kernel")
        model_ge = LMM()
        model_ge.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        model_ge.setX(np.ones([sample_count, 1]))  # just a bias column
        model_ge.sety(pheno.val[:, 0])
        if just_testing:
            h2corr, e2, a2, nLLcorr, h2corr_raw = 0, 0, .5, 0, 0
        else:
            fit_ge = model_ge.findA2()
            h2 = fit_ge["h2"]
            a2 = fit_ge["a2"]
            nLLcorr = fit_ge["nLL"]
            # Split the fitted h2 between G and E according to a2.
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
            h2corr_raw = h2
        logging.info(
            "G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3} (h2corr_raw:{4})"
            .format(h2corr, e2, a2, nLLcorr, h2corr_raw))

    # --- a2_gxe2: best mixing of the G+E kernel and the GxE kernel.
    if do_gxe2:
        # G+E kernel: weighted combination according to a2. It is not
        # standardized again because both inputs already are.
        g_weight = 1 - a2
        mixed_val = g_weight * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(
            iid=G_kernel.iid,
            val=mixed_val,
            name="{0} G + {1} E".format(1 - a2, a2),
        )

        logging.info("Find best mixing for GxE and GplusE_kernel")
        # '*' on the value arrays is element-wise multiplication.
        product_val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # Shuffle the values but not the iids (doing both would cancel out).
            np.random.seed(
                (permute_times_seed + permute_times_index) % seed_modulus)
            shuffled_rows = np.arange(sample_count)
            np.random.shuffle(shuffled_rows)
            product_val = pstutil.sub_matrix(
                product_val, shuffled_rows, shuffled_rows)
        GxE_kernel = KernelData(iid=G_kernel.iid, val=product_val, name="GxE")
        GxE_kernel = GxE_kernel.standardize()

        model_gxe = LMM()
        model_gxe.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        model_gxe.setX(np.ones([sample_count, 1]))  # just a bias column
        model_gxe.sety(pheno.val[:, 0])
        if just_testing:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        else:
            fit_gxe = model_gxe.findA2()
            gxe2 = fit_gxe["h2"]
            a2_gxe2 = fit_gxe["a2"]
            nLL_gxe2 = fit_gxe["nLL"]
            gxe2 *= a2_gxe2
        logging.info(
            "G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".
            format(gxe2, a2_gxe2, nLL_gxe2))
    else:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan

    # --- Collect the results.
    phen_name = np.array(pheno.sid, dtype='str')[0]
    ret = {
        "h2uncorr": h2uncorr,
        "nLLuncorr": nLLuncorr,
        "h2corr": h2corr,
        "h2corr_raw": h2corr_raw,
        "e2": e2,
        "a2": a2,
        "nLLcorr": nLLcorr,
        "gxe2": gxe2,
        "a2_gxe2": a2_gxe2,
        "nLL_gxe2": nLL_gxe2,
        "alpha": alpha,
        "alpha_power": alpha_power,
        "phen": phen_name,
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed,
    }
    logging.info("run_line: {0}".format(ret))
    return ret
def generate_and_analyze(seed, N, do_shuffle, just_testing=True,
                         map_function=None, cache_folder=None):
    """Simulate SNPs, positions and a phenotype, then analyze them.

    Generates ``N`` individuals with ``snp_gen``, places each one around one
    of two spatial centers, builds a phenotype as the sum of a genetic, a
    noise and a location component, and passes everything to
    ``heritability_spatial_correction``.

    Parameters
    ----------
    seed
        Seed for ``snp_gen`` and (directly or XOR-ed with a constant) for each
        of the NumPy random draws.
    N
        Number of individuals to generate (``iid_count`` of ``snp_gen``).
    do_shuffle
        When true, the rows of the location component are permuted before it
        is added to the phenotype.
    just_testing
        Forwarded to ``heritability_spatial_correction``; when true only the
        single generating ``alpha`` is tried instead of the full grid.
    map_function, cache_folder
        Forwarded unchanged to ``heritability_spatial_correction``.

    Returns
    -------
    The value returned by ``heritability_spatial_correction`` (it is also
    logged).
    """
    # Generate SNPs
    snpdata = snp_gen(fst=.1, dfr=0, iid_count=N, sid_count=1000,
                      chr_count=10, label_with_pop=True, seed=seed)
    K_causal = snpdata.read_kernel(Unit()).standardize()

    # Generate geo-spatial locations and K_loc
    distance_between_centers = 2500000
    sd = distance_between_centers / 4.
    spatial_iid = snpdata.iid
    # Two centers at the same y, one distance_between_centers apart in x.
    center_dict = {
        "0": (distance_between_centers * 0.5, distance_between_centers),
        "1": (distance_between_centers * 1.5, distance_between_centers),
    }
    # presumably iid_item[0] is the population label produced by
    # label_with_pop=True -- confirm against snp_gen
    centers = np.array(
        [center_dict[iid_item[0]] for iid_item in spatial_iid])

    np.random.seed(seed)
    logging.info("Generating positions for seed {0}".format(seed))
    unit_offsets = np.random.multivariate_normal(
        [0, 0], [[1, 0], [0, 1]], size=len(centers))
    spatial_coor = SnpData(
        iid=snpdata.iid,
        sid=["x", "y"],
        val=centers + unit_offsets * sd,
        parent_string="'spatial_coor_gen_original'",
    )
    alpha = distance_between_centers
    spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
    K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

    # Generate phenotype
    iid = K_causal.iid
    iid_count = K_causal.iid_count
    zero_mean = np.zeros(iid_count)

    np.random.seed(seed)
    causal_val = np.random.multivariate_normal(zero_mean, K_causal.val)
    pheno_causal = SnpData(iid=iid, sid=["causal"],
                           val=causal_val.reshape(-1, 1),
                           parent_string="causal")

    np.random.seed(seed ^ 998372)
    noise_val = np.random.normal(size=iid_count)
    pheno_noise = SnpData(iid=iid, sid=["noise"],
                          val=noise_val.reshape(-1, 1),
                          parent_string="noise")

    np.random.seed(seed ^ 12230302)
    loc_val = np.random.multivariate_normal(np.zeros(iid_count), K_loc.val)
    pheno_loc_original = SnpData(iid=iid, sid=["loc_original"],
                                 val=loc_val.reshape(-1, 1),
                                 parent_string="loc_original")

    if not do_shuffle:
        pheno_loc = pheno_loc_original
    else:
        idx = np.arange(iid_count)
        np.random.seed(seed)
        np.random.shuffle(idx)
        # A view is enough here: the indexing below produces fresh memory.
        pheno_loc = pheno_loc_original.read(view_ok=True)
        pheno_loc.val = pheno_loc.val[idx, :]

    combined_val = pheno_causal.val + pheno_noise.val + pheno_loc.val
    pheno = SnpData(iid=iid, sid=["pheno_all"], val=combined_val)

    # Analyze data
    alpha_grid = np.logspace(np.log10(100), np.log10(1e10), 100)
    alpha_list = [int(v) for v in alpha_grid]
    alphas_to_try = [alpha] if just_testing else alpha_list
    dataframe = heritability_spatial_correction(
        snpdata,
        spatial_coor.val,
        spatial_iid,
        alpha_list=alphas_to_try,
        pheno=pheno,
        alpha_power=2,
        jackknife_count=0,
        permute_plus_count=0,
        permute_times_count=0,
        just_testing=just_testing,
        map_function=map_function,
        cache_folder=cache_folder,
    )
    logging.info(dataframe)
    return dataframe
if do_plot: import matplotlib.pyplot as plt color_dict = {"0": "r", "1": "b", "2": "g"} colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]] plt.axis('equal') plt.scatter(spatial_coor_gen_original.val[:, 0], spatial_coor_gen_original.val[:, 1], c=colors) plt.show() from fastlmm.association.heritability_spatial_correction import spatial_similarity from pysnptools.kernelreader import KernelData alpha = distance_between_centers spatial_val = spatial_similarity(spatial_coor_gen_original.val, alpha, power=2) K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize() if do_plot: pylab.suptitle("$K_{loc}$") pylab.imshow(K_loc.val, cmap=pylab.gray(), vmin=0, vmax=1) pylab.show() from pysnptools.snpreader import SnpData iid = K_causal.iid iid_count = K_causal.iid_count np.random.seed(seed) pheno_causal = SnpData(iid=iid, sid=["causal"],