Example #1
def run_script(input_dir, output_dir):
    """
    Run the commandline script for MFSDA.

    Args:
        input_dir (str): full path to the data folder
        output_dir (str): full path to the output folder
    """

    """+++++++++++++++++++++++++++++++++++"""
    """Step 1. load dataset """
    print("loading data ......")
    print("+++++++Read the surface shape data+++++++")
    shape_file_name = input_dir + "aligned_shapes.mat"
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    n, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))
    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = input_dir + "template.mat"
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]
    print("+++++++Read the design matrix+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data_tmp = np.loadtxt(design_data_file_name)
    if len(design_data_tmp.shape) == 1:
        # a single covariate loads as a 1-D array; promote it to an n x 1 matrix
        design_data = np.reshape(design_data_tmp, (design_data_tmp.shape[0], 1))
    else:
        design_data = design_data_tmp

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is " + str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""
    gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta = mfsda.run_stats(y_design, coord_mat, design_data, var_type)

    """+++++++++++++++++++++++++++++++++++"""
    """Step3. Save all the results"""
    gpvals_file_name = output_dir + "global_pvalue.txt"
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = output_dir + "local_pvalue_fdr.txt"
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = output_dir + "cluster_pvalue.txt"
    np.savetxt(clu_pvals_file_name, clu_pvals)
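
A minimal way to invoke this script, under stated assumptions: the module name MFSDA_run below is hypothetical, and only the file names come from the code above. Note that run_script builds file paths by plain string concatenation, so both directory arguments must end with a path separator.

import os

from MFSDA_run import run_script  # hypothetical module name; adjust to the actual file

# Directories must end with a separator because run_script concatenates with "+".
input_dir = os.path.join("data", "mfsda") + os.sep     # holds aligned_shapes.mat, template.mat, design_data.txt, var_type.txt
output_dir = os.path.join("results", "mfsda") + os.sep  # receives global_pvalue.txt, local_pvalue_fdr.txt, cluster_pvalue.txt
run_script(input_dir, output_dir)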
Example #2
def run_stats(y_design, coord_mat, design_data, var_type):
    """
    Run the statistical analysis for MFSDA: local linear kernel smoothing,
    kernel smoothing of the residual functions (eta), and hypothesis testing.

    Args:
        y_design: n x l x m array of surface shape data
        coord_mat: coordinate matrix of the template
        design_data: raw covariate matrix
        var_type: covariate type vector

    Returns:
        gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
    """
    n, l, m = y_design.shape

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))
    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print(
        "+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++"
    )
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d) = 0 vs. beta_pj(d) != 0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    areas = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # cluster-forming threshold on the -log10(p) scale, i.e. p <= 0.01

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) +
              " is zero or not...")
        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)
        """ Generate random samples and calculate the corresponding statistics and pvalues """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is " +
              str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    return gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
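
A self-contained sketch of the local-testing step above on synthetic statistics: chi-square p-values with m degrees of freedom, Benjamini-Hochberg correction, and the supra-threshold area that feeds bstrp_pvalue. Two assumptions: fdrcorrection is the current statsmodels name for fdrcorrection0, and stats.chi2.sf(x, m) is used in place of 1 - cdf (same value, numerically safer in the far tail).

import numpy as np
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection  # current name for fdrcorrection0

rng = np.random.default_rng(0)
m = 3                                  # degrees of freedom, one per shape coordinate
lstat = rng.chisquare(m, size=1000)    # stand-in local Wald statistics under H0
lpvals = stats.chi2.sf(lstat, m)       # sf == 1 - cdf, safer in the tail
lpvals_fdr = fdrcorrection(lpvals)[1]  # BH-adjusted local p-values

thres = 2                              # -log10 scale: p <= 0.01
area = int(np.sum(-np.log10(lpvals) >= thres))
print(area)                            # size of the supra-threshold region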
Example #3
def run_script(input_dir, output_dir):
    """
    Run the commandline script for MFSDA.

    Args:
        input_dir (str): full path to the data folder
        output_dir (str): full path to the output folder
    """

    """+++++++++++++++++++++++++++++++++++"""
    """Step 1. load dataset """
    print("loading data ......")
    print("+++++++Read the surface shape data+++++++")
    shape_file_name = input_dir + "aligned_shapes.mat"
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    n, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))
    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = input_dir + "template.mat"
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]
    print("+++++++Read the design matrix+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d) = 0 vs. beta_pj(d) != 0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p-1))
    lpvals_fdr = np.zeros((l, p-1))
    gpvals = np.zeros((1, p-1))
    clu_pvals = np.zeros((1, p-1))
    areas = np.zeros((1, p-1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # cluster-forming threshold on the -log10(p) scale, i.e. p <= 0.01

    for pp in range(p-1):
        print("Testing whether the covariate " + str(pp+1) + " is zero or not...")
        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp+1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        """ Generate random samples and calculate the corresponding statistics and pvalues """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign, gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp+1) + " is " + str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp+1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    """+++++++++++++++++++++++++++++++++++"""
    """Step3. Save all the results"""
    gpvals_file_name = output_dir + "global_pvalue.txt"
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = output_dir + "local_pvalue_fdr.txt"
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = output_dir + "cluster_pvalue.txt"
    np.savetxt(clu_pvals_file_name, clu_pvals)
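
One difference from Example #1: here design_data.txt is loaded without the 1-D guard, so a study with a single covariate would produce a 1-D array and break x_design.shape[1]. np.loadtxt's ndmin argument handles that case directly; a small demonstration, with an in-memory file standing in for design_data.txt:

import io

import numpy as np

one_covariate = io.StringIO("1.0\n2.0\n3.0\n")    # stand-in for design_data.txt
design_data = np.loadtxt(one_covariate, ndmin=2)  # ndmin=2 keeps an n x 1 column matrix
print(design_data.shape)                          # (3, 1), not (3,)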
Example #4
File: test.py  Project: BIG-S2/FGWAS
def run_script(input_dir, output_dir):
    """
    Run the commandline script for FGWAS.

    :param input_dir: (str) full path to the data folder
    :param output_dir: (str) full path to the output folder
    """
    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 0. load dataset """)
    print("+++++++Read the imaging data+++++++")
    img_file_name = input_dir + "img_data.mat"
    mat = loadmat(img_file_name)
    img_data = mat['img_data']
    n, l, m = img_data.shape
    img_data = np.log10(img_data)  # log transformation on the imaging response
    print("The matrix dimension of image data is " + str(img_data.shape))
    print("+++++++Read the imaging coordinate data+++++++")
    coord_file_name = input_dir + "coord_data.txt"
    coord_data = np.loadtxt(coord_file_name)
    # d = coord_data.shape[1]
    print("The matrix dimension of coordinate data is " +
          str(coord_data.shape))
    print("+++++++Read the SNP data+++++++")
    snp_file_name = input_dir + "snp_data.txt"
    snp_data = np.loadtxt(snp_file_name)
    # g = snp_data.shape[1]
    print("The matrix dimension of original snp data is " +
          str(snp_data.shape))
    print("+++++++Read the covariate data+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    # design_data = design_data[:, np.arange(5)]
    print("The matrix dimension of covariate data is " +
          str(design_data.shape))

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    # read the image size
    img_size_file_name = input_dir + "img_size.txt"
    img_size = np.loadtxt(img_size_file_name)
    # read the image index of non-background region
    img_idx_file_name = input_dir + "img_idx.txt"
    img_idx = np.loadtxt(img_idx_file_name)

    print("+++++++++Matrix preparing and Data preprocessing++++++++")
    print(
        "+++++++Construct the imaging response, design, coordinate matrix: normalization+++++++"
    )
    x_design, y_design, coord_data = read_x(img_data, coord_data, design_data,
                                            var_type)
    p = x_design.shape[1]
    print("The dimension of normalized design matrix is " +
          str(x_design.shape))
    print("+++++++Preprocess SNP: filtering+++++++")
    max_num = np.zeros(shape=(3, snp_data.shape[1]))
    for i in range(3):
        bw = np.zeros(snp_data.shape)
        bw[snp_data == i] = 1
        max_num[i, :] = np.sum(bw, axis=0)
    max_num_idx = np.argmax(max_num, axis=0)
    indx = np.where(snp_data < 0)
    for i in range(len(indx[1])):
        snp_data[indx[0][i], indx[1][i]] = max_num_idx[indx[1][i]]

    min_maf = 0.05  # threshold for MAF
    maf = np.sum(snp_data, axis=0) / (2 * n)
    temp_idx = np.where(maf > 0.5)
    maf[temp_idx] = 1 - maf[temp_idx]
    rm_snp_index = np.where(maf <= min_maf)
    snp = np.delete(snp_data, rm_snp_index, axis=1)
    print("There are " + str(snp.shape[1]) + " snps with MAF>0.05.")
    """+++++++++++++++++++++++++++++++++++"""
    print(
        """ Step 1. Fit the multivariate varying coefficient model (MVCM) """)
    start_1 = time.time()
    # find the optimal bandwidth
    h_opt, hat_mat = bw_rt(coord_data, x_design, y_design)
    print("the optimal bandwidth by Scott's Rule is ", h_opt)
    qr_smy_mat, esig_eta, smy_design, resy_design, efit_eta = mvcm(
        coord_data, y_design, h_opt, hat_mat)
    end_1 = time.time()
    print("Elapsed time in Step 1 is ", end_1 - start_1)
    # print(esig_eta)
    # print(qr_smy_mat)
    for mii in range(m):
        res_mii = resy_design[:, :, mii] - efit_eta[:, :, mii]
        print("The bound of the residual is [" + str(np.min(res_mii)) + ", " +
              str(np.max(res_mii)) + "]")
        # res_img = np.reshape(np.mean(res_mii, axis=0), (int(img_size[0]), int(img_size[1])))
        # res_img_file_name = output_dir + "residual_%d.txt" % mii
        # np.savetxt(res_img_file_name, res_img)
    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 2. Global sure independence screening (GSIS) """)
    start_2 = time.time()
    g_num = 1000  # number of top candidate snps
    g_pv_log10 = gsis(snp, qr_smy_mat, hat_mat)[0]
    g_pv_log10_file_name = output_dir + "g_pv_log10.txt"
    np.savetxt(g_pv_log10_file_name, g_pv_log10)
    snp_pv = 10**(-g_pv_log10)
    top_snp_idx = np.argsort(-g_pv_log10)
    top_snp = snp[:, top_snp_idx[0:g_num]]

    end_2 = time.time()

    print("Elapsed time in Step 2 is ", end_2 - start_2)
Example #5
def run_script(input_dir, output_dir):
    """
    Run the commandline script for FGWAS.

    :param input_dir: (str) full path to the data folder
    :param output_dir: (str) full path to the output folder
    """

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 0. load dataset """)
    print("+++++++Read the imaging data+++++++")
    img_file_name = input_dir + "img_data.mat"
    mat = loadmat(img_file_name)
    img_data = mat['img_data']
    if len(img_data.shape) == 2:
        img_data = img_data.reshape(1, img_data.shape[0], img_data.shape[1])
    m, n, n_v = img_data.shape
    y_design = np.log10(img_data)  # log transformation on response
    print("The matrix dimension of image data is " + str(img_data.shape))
    print("+++++++Read the imaging coordinate data+++++++")
    coord_file_name = input_dir + "coord_data.txt"
    coord_data = np.loadtxt(coord_file_name)
    print("The matrix dimension of coordinate data is " + str(coord_data.shape))
    print("+++++++Read the SNP data+++++++")
    snp_file_name = input_dir + "snp_data.txt"
    snp_data = np.loadtxt(snp_file_name)
    print("The matrix dimension of original snp data is " + str(snp_data.shape))
    print("+++++++Read the covariate data+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    print("The matrix dimension of covariate data is " + str(design_data.shape))

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    var_type = np.array([int(i) for i in var_type])

    print("+++++++++Matrix preparing and Data preprocessing++++++++")
    print("+++++++Construct the imaging response, design, coordinate matrix: normalization+++++++")
    x_design, coord_data = read_x(coord_data, design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of normalized design matrix is " + str(x_design.shape))
    print("+++++++Preprocess SNP: filtering+++++++")
    max_num = np.zeros(shape=(3, snp_data.shape[1]))
    for i in range(3):
        bw = np.zeros(snp_data.shape)
        bw[snp_data == i] = 1
        max_num[i, :] = np.sum(bw, axis=0)
    max_num_idx = np.argmax(max_num, axis=0)
    indx = np.where(snp_data < 0)
    for i in range(len(indx[1])):
        snp_data[indx[0][i], indx[1][i]] = max_num_idx[indx[1][i]]

    min_maf = 0.05  # threshold for MAF
    maf = np.sum(snp_data, axis=0) / (2 * n)
    temp_idx = np.where(maf > 0.5)
    maf[temp_idx] = 1 - maf[temp_idx]
    rm_snp_index = np.where(maf <= min_maf)
    snp = np.delete(snp_data, rm_snp_index, axis=1)
    g = snp.shape[1]
    print("There are " + str(snp.shape[1]) + " snps with MAF>0.05.")

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 1. Fit the multivariate varying coefficient model (MVCM) under H0 """)
    start_1 = time.time()
    # find the optimal bandwidth
    h_opt, hat_mat = bw_rt(coord_data, x_design, y_design)
    print("the optimal bandwidth by Scott's Rule is ", h_opt)
    qr_smy_mat, esig_eta, smy_design, resy_design, efit_eta = mvcm(coord_data, y_design, h_opt, hat_mat)
    end_1 = time.time()
    print("Elapsed time in Step 1 is ", end_1 - start_1)
    for mii in range(m):
        res_mii = resy_design[mii, :, :]-efit_eta[mii, :, :]
        print("The bound of the residual is [" + str(np.min(res_mii)) + ", " + str(np.max(res_mii)) + "]")

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 2. Global sure independence screening (GSIS) """)
    start_2 = time.time()
    g_num = 1000  # number of top candidate snps
    g_pv_log10, g_stat = gsis(snp, qr_smy_mat, hat_mat)
    snp_pv = 10 ** (-g_pv_log10)
    top_snp_idx = np.argsort(-g_pv_log10)
    top_snp = snp[:, top_snp_idx[0:g_num]]
    snp_info_file = input_dir + "snp_info.map"
    with open(snp_info_file, 'r') as fd:
        snp_info = np.loadtxt(fd, delimiter='\t', dtype=bytes).astype(str)
    snp_chr_tp = np.delete(snp_info[:, 0], rm_snp_index)
    snp_chr = np.array([int(i) for i in snp_chr_tp])
    snp_name = np.delete(snp_info[:, 1], rm_snp_index)
    snp_bp_tp = np.delete(snp_info[:, 3], rm_snp_index)
    snp_bp = np.array([int(i) for i in snp_bp_tp])
    gsis_all = np.vstack((snp_chr, snp_bp, snp_pv)).T  # input for plotting Manhattan plot
    top_snp_chr = snp_chr[top_snp_idx[0:g_num]]
    top_snp_name = snp_name[top_snp_idx[0:g_num]]
    top_snp_bp = snp_bp[top_snp_idx[0:g_num]]
    top_snp_pv_log10 = g_pv_log10[top_snp_idx[0:g_num]]
    gsis_top = np.vstack((top_snp_name, top_snp_chr, top_snp_bp, top_snp_pv_log10)).T  # top SNP GSIS results
    gsis_all_file_name = output_dir + "GSIS_all.txt"
    np.savetxt(gsis_all_file_name, gsis_all, delimiter="\t", fmt="%d %d %f")
    gsis_top_file_name = output_dir + "GSIS_top.txt"
    np.savetxt(gsis_top_file_name, gsis_top, delimiter="\t", fmt="%s", comments='',
               header="SNP\tCHR\tBP\tP")
    end_2 = time.time()
    print("Elapsed time in Step 2 is ", end_2 - start_2)

    # save results in temp folder for next step
    start_3 = time.time()
    temp_dir = output_dir + "/temp/"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    data_dim = np.array([n, n_v, m, p, g, g_num])
    data_dim_file_name = temp_dir + "data_dim.mat"
    savemat(data_dim_file_name, mdict={'data_dim': data_dim})
    all_snp_file_name = temp_dir + "snp.mat"
    savemat(all_snp_file_name, mdict={'snp': snp})
    top_snp_file_name = temp_dir + "top_snp.mat"
    savemat(top_snp_file_name, mdict={'top_snp': top_snp})
    y_design_file_name = temp_dir + "y_design.mat"
    savemat(y_design_file_name, mdict={'y_design': y_design})
    resy_design_file_name = temp_dir + "resy_design.mat"
    savemat(resy_design_file_name, mdict={'resy_design': resy_design})
    efit_eta_file_name = temp_dir + "efit_eta.mat"
    savemat(efit_eta_file_name, mdict={'efit_eta': efit_eta})
    esig_eta_file_name = temp_dir + "esig_eta.mat"
    savemat(esig_eta_file_name, mdict={'esig_eta': esig_eta})
    hat_mat_file_name = temp_dir + "hat_mat.mat"
    savemat(hat_mat_file_name, mdict={'hat_mat': hat_mat})
    end_3 = time.time()
    print("Elapsed time in saving temp results is ", end_3 - start_3)