예제 #1
0
def work_item2(pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
               xxx_todo_changeme, xxx_todo_changeme1, xxx_todo_changeme2,
               just_testing, do_uncorr, do_gxe2, a2):

    #########################################
    # Load GPS info from filename if that's the way it is given
    ########################################
    (jackknife_index, jackknife_count, jackknife_seed) = xxx_todo_changeme
    (permute_plus_index, permute_plus_count,
     permute_plus_seed) = xxx_todo_changeme1
    (permute_times_index, permute_times_count,
     permute_times_seed) = xxx_todo_changeme2
    if isinstance(spatial_coor, str):
        assert spatial_iid is None, "if spatial_coor is a str, then spatial_iid should be None"
        gps_table = pd.read_csv(spatial_coor, delimiter=" ").dropna()
        spatial_iid = np.array([(v, v) for v in gps_table["id"].values])
        spatial_coor = gps_table[["south_new", "east_new"]].values

    #########################################
    # Remove any missing values from pheno
    ########################################
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.
                  val[:, 0], :]  #Excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info info a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = model_selection.KFold(n_splits=jackknife_count,
                                       shuffle=True,
                                       random_state=jackknife_seed %
                                       4294967295).split(
                                           list(range(G_kernel.iid_count)))
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        #We shuffle the val, but not the iid, because that would cancel out.
        #Integrate the permute_plus_index into the random.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(
            iid=E_kernel.iid,
            val=E_kernel_temp.val,
            name="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()  # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize(
    )  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize(
    )  # defaults to DiagKtoN standardize

    #########################################
    # find h2uncoor, the best mixing weight of pure random noise and G_kernel
    #########################################

    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(
            h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################

    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
            h2corr_raw = h2
        else:
            h2corr, e2, a2, nLLcorr, h2corr_raw = 0, 0, .5, 0, 0
        logging.info(
            "G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3} (h2corr_raw:{4})"
            .format(h2corr, e2, a2, nLLcorr, h2corr_raw))
    else:
        h2corr, e2, nLLcorr, h2corr_raw = np.nan, np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################

    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        #Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid,
                                   val=val,
                                   name="{0} G + {1} E".format(1 - a2, a2))
        #Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create GxE Kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")

        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            #We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed(
                (permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)

        GxE_kernel = KernelData(
            iid=G_kernel.iid, val=val, name="GxE"
        )  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info(
            "G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".
            format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################

    ret = {
        "h2uncorr": h2uncorr,
        "nLLuncorr": nLLuncorr,
        "h2corr": h2corr,
        "h2corr_raw": h2corr_raw,
        "e2": e2,
        "a2": a2,
        "nLLcorr": nLLcorr,
        "gxe2": gxe2,
        "a2_gxe2": a2_gxe2,
        "nLL_gxe2": nLL_gxe2,
        "alpha": alpha,
        "alpha_power": alpha_power,
        "phen": np.array(pheno.sid, dtype='str')[0],
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed
    }

    logging.info("run_line: {0}".format(ret))
    return ret
예제 #2
0
    def generate_and_analyze(seed,
                             N,
                             do_shuffle,
                             just_testing=True,
                             map_function=None,
                             cache_folder=None):

        #Generate SNPs
        snpdata = snp_gen(fst=.1,
                          dfr=0,
                          iid_count=N,
                          sid_count=1000,
                          chr_count=10,
                          label_with_pop=True,
                          seed=seed)
        K_causal = snpdata.read_kernel(Unit()).standardize()

        #Generate geo-spatial locations and K_loc
        distance_between_centers = 2500000
        x0 = distance_between_centers * 0.5
        x1 = distance_between_centers * 1.5
        y0 = distance_between_centers
        y1 = distance_between_centers
        sd = distance_between_centers / 4.

        spatial_iid = snpdata.iid
        center_dict = {"0": (x0, y0), "1": (x1, y1)}
        centers = np.array(
            [center_dict[iid_item[0]] for iid_item in spatial_iid])
        np.random.seed(seed)
        logging.info("Generating positions for seed {0}".format(seed))
        spatial_coor = SnpData(
            iid=snpdata.iid,
            sid=["x", "y"],
            val=centers + np.random.multivariate_normal(
                [0, 0], [[1, 0], [0, 1]], size=len(centers)) * sd,
            parent_string="'spatial_coor_gen_original'")
        alpha = distance_between_centers
        spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
        K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

        #Generate phenotype
        iid = K_causal.iid
        iid_count = K_causal.iid_count
        np.random.seed(seed)
        pheno_causal = SnpData(iid=iid,
                               sid=["causal"],
                               val=np.random.multivariate_normal(
                                   np.zeros(iid_count),
                                   K_causal.val).reshape(-1, 1),
                               parent_string="causal")
        np.random.seed(seed ^ 998372)
        pheno_noise = SnpData(iid=iid,
                              sid=["noise"],
                              val=np.random.normal(size=iid_count).reshape(
                                  -1, 1),
                              parent_string="noise")
        np.random.seed(seed ^ 12230302)
        pheno_loc_original = SnpData(iid=iid,
                                     sid=["loc_original"],
                                     val=np.random.multivariate_normal(
                                         np.zeros(iid_count),
                                         K_loc.val).reshape(-1, 1),
                                     parent_string="loc_original")

        if do_shuffle:
            idx = np.arange(iid_count)
            np.random.seed(seed)
            np.random.shuffle(idx)
            pheno_loc = pheno_loc_original.read(
                view_ok=True
            )  #don't need to copy, because the next line will be fresh memory
            pheno_loc.val = pheno_loc.val[idx, :]
        else:
            pheno_loc = pheno_loc_original

        pheno = SnpData(iid=iid,
                        sid=["pheno_all"],
                        val=pheno_causal.val + pheno_noise.val + pheno_loc.val)

        #Analyze data
        alpha_list = [
            int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)
        ]
        dataframe = heritability_spatial_correction(
            snpdata,
            spatial_coor.val,
            spatial_iid,
            alpha_list=[alpha] if just_testing else alpha_list,
            pheno=pheno,
            alpha_power=2,
            jackknife_count=0,
            permute_plus_count=0,
            permute_times_count=0,
            just_testing=just_testing,
            map_function=map_function,
            cache_folder=cache_folder)

        logging.info(dataframe)
        return dataframe
예제 #3
0
    if do_plot:
        import matplotlib.pyplot as plt
        color_dict = {"0": "r", "1": "b", "2": "g"}
        colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]]
        plt.axis('equal')
        plt.scatter(spatial_coor_gen_original.val[:, 0],
                    spatial_coor_gen_original.val[:, 1],
                    c=colors)
        plt.show()

    from fastlmm.association.heritability_spatial_correction import spatial_similarity
    from pysnptools.kernelreader import KernelData

    alpha = distance_between_centers
    spatial_val = spatial_similarity(spatial_coor_gen_original.val,
                                     alpha,
                                     power=2)
    K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

    if do_plot:
        pylab.suptitle("$K_{loc}$")
        pylab.imshow(K_loc.val, cmap=pylab.gray(), vmin=0, vmax=1)
        pylab.show()

    from pysnptools.snpreader import SnpData

    iid = K_causal.iid
    iid_count = K_causal.iid_count
    np.random.seed(seed)
    pheno_causal = SnpData(iid=iid,
                           sid=["causal"],