Exemplo n.º 1
0
def test_localreg_realistic():
    """Compare localreg output with stored reference values.

    Ten irregularly spaced samples are fitted with the Epanechnikov kernel
    and evaluated at x0 = [2, 3].  The cases cover polynomial degrees 0-2 at
    width 1, degree 2 at width 2, and degree 2 with ``frac=0.5``.
    """
    x = np.array([
        -6.89438, 7.94300378, 5.5221823, 9.77749217, -0.35979986, 2.01456739,
        4.80691814, 3.22260756, -7.12156073, -8.69959441
    ])
    y = np.array([
        -1.74962299, -8.55733072, 8.56537608, 1.79095858, 4.43380336,
        -14.63365203, 5.41264117, 9.69660297, -13.85424098, 0.42264531
    ])
    x0 = np.array([2., 3.])

    # Each entry: (extra keyword arguments for localreg, expected fit at x0).
    cases = [
        # Testing all orders
        ({"degree": 0, "width": 1}, [-14.63365203, 8.9780852]),
        ({"degree": 1, "width": 1}, [-14.5487543, 5.21322664]),
        ({"degree": 2, "width": 1}, [-14.4523815, 3.77134959]),
        # Testing width
        ({"degree": 2, "width": 2}, [-14.80997735, 7.00785276]),
        # Testing frac
        ({"degree": 2, "frac": 0.5}, [-6.21823369, 4.33953829]),
    ]

    for options, expected in cases:
        fitted = localreg(x, y, x0, kernel=epanechnikov, **options)
        assert np.allclose(fitted, expected, rtol=1e-3)
Exemplo n.º 2
0
def test_localreg_narrow_kernel(caplog):
    """Expect NaN and exactly one log record for a too-narrow kernel.

    The kernel width (0.4) is smaller than the distance from the evaluation
    point (0.5) to its nearest samples (0 and 1).
    """
    samples_x = np.array([0., 1., 2.])
    samples_y = np.array([0., 1., 2.])
    query = np.array([0.5])

    fitted = localreg(samples_x,
                      samples_y,
                      query,
                      degree=2,
                      kernel=epanechnikov,
                      width=0.4)

    nan_mask = np.isnan(fitted)
    assert nan_mask[0]
    assert len(caplog.records) == 1
def plot_nucleosome_occupancy_sample(fig_format="png"):
    """Plot 16 consecutive 2000-row windows of a nucleosome BED file.

    Each window (rows 20000 + 2000*i .. 20000 + 2000*(i+1)) gets its own
    subplot in a 4x4 grid: a scatter of column 2 against column 1 plus a
    local quadratic regression curve (tricube kernel, width 10).  When the
    local regression raises ``np.linalg.LinAlgError`` the subplot falls back
    to ``sns.regplot``.  The figure is saved into ``figure_dir``.

    :param str fig_format: extension (and hence format) of the saved figure
    """
    mkdir(figure_dir)
    input_fp = os.path.join(data_dir, "nucleosome_positioning",
                            "GSM1194220_H1.bed")
    df = pd.read_csv(input_fp, sep="\t", header=None).values
    NROW = 4
    NCOL = 4
    # subplots takes (rows, cols); figsize is (width, height), so the width
    # scales with the number of columns and the height with the rows.
    fig, axs = plt.subplots(NROW,
                            NCOL,
                            figsize=(NCOL * EACH_SUB_FIG_SIZE,
                                     NROW * EACH_SUB_FIG_SIZE))
    fig_fp = os.path.join(
        figure_dir, "nucleosome_occupancy_of_part_of_chr1.%s" % fig_format)
    for rid in range(NROW * NCOL):
        row = rid // NCOL
        col = rid % NCOL
        ax = axs[row][col]
        start = 20000 + rid * 2000
        end = 20000 + (rid + 1) * 2000
        # The ``np.float`` alias was removed in NumPy 1.24; the builtin
        # ``float`` is the same dtype.
        loci = df[start:end, 1].astype(float)
        oc = df[start:end, 2].astype(float)
        try:
            y2 = localreg(loci, oc, degree=2, kernel=tricube, width=10)
            ax.scatter(loci, oc, s=8, color="blue")
            ax.plot(loci, y2, "k-", linewidth=2)
        except np.linalg.LinAlgError:
            # Local fit was singular: fall back to a plain linear regression.
            sns.regplot(x=loci,
                        y=oc,
                        ax=ax,
                        scatter_kws={
                            's': 8,
                            'color': "blue"
                        })
        # NOTE(review): the x-limits are the row indices of the window, not
        # the values in ``loci`` -- confirm that these coincide in the data.
        ax.set_xlim(start, end)
        ax.set_ylim(0, 600)
        ax.set_title("nucleos occupancy of chr1: %d:%d" % (start, end),
                     fontsize=14)

    plt.savefig(fig_fp, dpi=300, bbox_inches='tight', pad_inches=0.1)
Exemplo n.º 4
0
def plot_local_regression_and_RD(fig_format="png"):
    """Plot the "-Rd.bed" curves of every region/CGI combination.

    One 2x2 figure is written per entry of ``REGION_LABELS``: rows are the
    two region variants ("Non-<label>" and "<label>"), columns are the
    entries of ``CGI_LABELS``.  Each subplot shows column 1 of the file
    against column 0 as a scatter, overlaid with a local quadratic
    regression (tricube kernel, width 100).  If the local regression raises
    ``np.linalg.LinAlgError`` the subplot falls back to ``sns.regplot``.

    :param str fig_format: extension (and hence format) of the saved figures
    """
    K_RD = [1]  # , 0
    N_COL = N_ROW = 2
    cm = plt.get_cmap('gist_rainbow')
    mkdir(figure_dir)
    for km in K_RD:
        # km == 0 selects the files carrying the "-methy" infix.
        correlation_type = '' if km else '-methy'
        for region_label in REGION_LABELS:
            input_dir = os.path.join(data_dir, "Q1_CGI_density", region_label)

            REGIONS = ["Non-" + region_label, region_label]

            fig, axs = plt.subplots(N_ROW,
                                    N_COL,
                                    figsize=(N_COL * EACH_SUB_FIG_SIZE,
                                             N_ROW * EACH_SUB_FIG_SIZE))
            fig_fp = os.path.join(figure_dir,
                                  "%s.%s" % (region_label, fig_format))
            for iid, region in enumerate(REGIONS):
                for jid, cgi in enumerate(CGI_LABELS):
                    correlation_fp = os.path.join(
                        input_dir,
                        region + "-" + cgi + correlation_type + "-Rd.bed")
                    file_label = region + " " + cgi
                    # Running subplot index, used only to pick a colour.
                    idx = iid * len(REGIONS) + jid
                    ax = axs[iid][jid]
                    RD_df = pd.read_csv(correlation_fp, sep="\t",
                                        header=None).values
                    x = RD_df[:, 0]
                    y = RD_df[:, 1]
                    color = cm(1. * idx / (N_COL * N_ROW))
                    try:
                        y2 = localreg(x,
                                      y,
                                      degree=2,
                                      kernel=tricube,
                                      width=100)
                        ax.scatter(x, y, s=8, color=color, label=file_label)
                        ax.plot(x, y2, "k-", linewidth=2)
                    except np.linalg.LinAlgError:
                        # Singular local fit: fall back to linear regression.
                        sns.regplot(x=x,
                                    y=y,
                                    ax=ax,
                                    scatter_kws={
                                        's': 8,
                                        'color': color
                                    })
                    ax.set_xticks(range(0, D_MAX + 1, 200))
                    ax.set_xlim(0, D_MAX)
                    ax.set_ylim(0, 1.0)
                    ax.set_title(file_label, fontsize=14)
            # Save once, after every subplot is drawn; saving inside the row
            # loop only rewrote the same file with a half-finished figure.
            plt.savefig(fig_fp,
                        dpi=300,
                        bbox_inches='tight',
                        pad_inches=0.1)
def plot_local_regression_and_RD_within_vs_across(fig_format="png"):
    """Compare "only-within" and "both-within-across" Rd curves per ratio.

    A 2x4 grid is drawn with one subplot per entry of ``RATIOS``.  Each
    subplot shows both data sets as scatters (column 1 against column 0)
    with their local quadratic regressions (tricube kernel, width 100); on
    ``np.linalg.LinAlgError`` both fall back to ``sns.regplot``.  The figure
    is saved into ``figure_dir``.

    :param str fig_format: extension (and hence format) of the saved figure
    """
    K_RD = [1]  # , 0
    N_COL = 4
    N_ROW = 2
    cm = plt.get_cmap('gist_rainbow')
    mkdir(figure_dir)
    threshold_dir = os.path.join(data_dir,
                                 "CGI_identified_with_different_thereshold")
    for km in K_RD:
        # km == 0 selects the files carrying the "-methy" infix.
        correlation_type = '' if km else '-methy'
        fig, axs = plt.subplots(N_ROW,
                                N_COL,
                                figsize=(N_COL * EACH_SUB_FIG_SIZE,
                                         N_ROW * EACH_SUB_FIG_SIZE))
        fig_fp = os.path.join(
            figure_dir, "CGI_RATIO_Within-Across-5000bp.%s" % fig_format)
        for rid, ratio in enumerate(RATIOS):
            file_stem = "CGI_%s_K_intersected" % ratio + correlation_type
            within_correlation_fp = os.path.join(
                threshold_dir, file_stem + "-only-within-Rd.bed")
            across_correlation_fp = os.path.join(
                threshold_dir, file_stem + "-both-within-across-Rd.bed")
            file_label = "Obs/Expect " + str(RATIO_LABELS[rid])
            ax = axs[rid // N_COL][rid % N_COL]
            within_RD = pd.read_csv(within_correlation_fp,
                                    sep="\t",
                                    header=None).values
            across_RD = pd.read_csv(across_correlation_fp,
                                    sep="\t",
                                    header=None).values
            x1 = within_RD[:, 0]
            y1 = within_RD[:, 1]

            x2 = across_RD[:, 0]
            y2 = across_RD[:, 1]

            color = cm(1. * rid / len(RATIOS))
            try:
                yy1 = localreg(x1, y1, degree=2, kernel=tricube, width=100)
                yy2 = localreg(x2, y2, degree=2, kernel=tricube, width=100)
                ax.scatter(x1, y1, s=3, color=color, label="within")
                # This series comes from the across file; it used to be
                # mislabelled "within".
                ax.scatter(x2, y2, s=1.5, color="black", label="across")
                ax.plot(x2, yy2, "w-", linewidth=1.5)
                ax.plot(x1, yy1, "k-", linewidth=1.5)
            except np.linalg.LinAlgError:
                # Singular local fit: fall back to linear regression.
                sns.regplot(x=x1,
                            y=y1,
                            ax=ax,
                            scatter_kws={
                                's': 8,
                                'color': color
                            })
                sns.regplot(x=x2,
                            y=y2,
                            ax=ax,
                            scatter_kws={
                                's': 8,
                                'color': "black"
                            })
            ax.set_xticks(range(0, D_MAX + 1, 1000))
            ax.set_xlim(0, D_MAX)
            ax.set_ylim(0, 1.0)
            ax.set_title(file_label, fontsize=14)
        plt.savefig(fig_fp, dpi=300, bbox_inches='tight', pad_inches=0.1)
def plot_local_regression_and_RD(fig_format="png"):
    """Overlay the "only-within" Rd curves of all CGI ratios on one axes.

    For every entry of ``RATIOS`` the matching file is read, column 1 is
    scattered against column 0 and a local quadratic regression (tricube
    kernel, width 100) is drawn on top; on ``np.linalg.LinAlgError`` the
    series falls back to ``sns.regplot``.  The figure is saved into
    ``figure_dir``.

    :param str fig_format: extension (and hence format) of the saved figure
    """
    K_RD = [1]  # , 0
    N_COL = N_ROW = 1
    # Font size of the tick labels on both axes.
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=12)
    cm = plt.get_cmap('gist_rainbow')
    mkdir(figure_dir)
    threshold_dir = os.path.join(data_dir,
                                 "CGI_identified_with_different_thereshold")
    for km in K_RD:
        # km == 0 selects the files carrying the "-methy" infix.
        correlation_type = '' if km else '-methy'
        fig, ax = plt.subplots(N_ROW,
                               N_COL,
                               figsize=((N_COL * EACH_SUB_FIG_SIZE),
                                        N_ROW * EACH_SUB_FIG_SIZE))
        fig_fp = os.path.join(
            figure_dir, "CGI_RATIO_FROM-06-13-%d.%s" % (D_MAX, fig_format))
        n_ratios = len(RATIOS)
        for rid, ratio in enumerate(RATIOS):
            file_name = ("CGI_%s_K_intersected" % ratio + correlation_type +
                         "-only-within-Rd.bed")
            correlation_fp = os.path.join(threshold_dir, file_name)
            file_label = "Obs/Exp>" + str(RATIO_LABELS[rid])

            table = pd.read_csv(correlation_fp, sep="\t", header=None).values
            x, y = table[:, 0], table[:, 1]
            color = cm(1. * rid / n_ratios)
            try:
                fitted = localreg(x, y, degree=2, kernel=tricube, width=100)
                ax.scatter(x, y, s=8, color=color, label=file_label)
                ax.plot(x,
                        fitted,
                        color='black',
                        linestyle='solid',
                        linewidth=2)
            except np.linalg.LinAlgError:
                # Singular local fit: fall back to linear regression.
                sns.regplot(x=x,
                            y=y,
                            ax=ax,
                            scatter_kws={
                                's': 8,
                                'color': color
                            })

            # Axis cosmetics are re-applied after every series, as before.
            ax.set_xticks(range(0, D_MAX + 1, 200))
            ax.set_xlim(0, D_MAX)
            ax.set_ylim(0, 1.0)
            ax.set_xlabel("Genomic Distance(bp)",
                          fontsize=18,
                          fontweight='bold')
            ax.set_ylabel("Pearson Correlation",
                          fontsize=18,
                          fontweight='bold')
            ax.legend(loc='best')

        plt.savefig(fig_fp, dpi=300)
def smooth(x, y, xgrid):
    """ Smooth one resample (drawn with replacement) of (x, y) onto xgrid

    The resampled points are fitted with a local-linear regression
    (triangular kernel, width 19.08094) evaluated at the resampled x values;
    the result is then linearly interpolated (and extrapolated where needed)
    at ``xgrid``.

    :param (np.array)   x: x-axis data points
    :param np.array     y: y-axis data points
    :param np.array     xgrid: interpolation points
    :return np.array: approximate/smoothed y-values
    """
    n_points = len(x)
    # Draw as many indices as there are points, with replacement.
    picked = np.random.choice(n_points, n_points, replace=True)
    x_boot, y_boot = x[picked], y[picked]

    fitted = localreg(x_boot,
                      y_boot,
                      x0=None,
                      degree=1,
                      kernel=triangular,
                      width=19.08094)

    interpolator = scipy.interpolate.interp1d(x_boot,
                                              fitted,
                                              fill_value='extrapolate')
    return interpolator(xgrid)
def plot_whole_landscape(fig_format="png"):
    """Draw one large overview figure per region group.

    For every group in ``regions`` a grid with one row per file label and
    18 columns (see ``COL_Labels``) is produced and saved under
    ``<BASE_DIR>/figures/Whole_Landscape``:

    * columns 0-2: scatter of column 1 against column 0 of the "K_Rd" file
      with a local quadratic regression (tricube kernel, width 100);
      columns 1 and 2 colour the points by columns 2 and 3 of that file;
    * column 5 (``COL_INDEXS`` entry 0): 2D histogram of bed column 3
      against log10 of bed column 4;
    * all other columns: 1D histogram of the bed column given by
      ``COL_INDEXS``, clipped at the column's x-limit.

    :param str fig_format: extension (and hence format) of the saved figures
    """
    regions = ["Genomic_Regions", "Histone_Modification", "ChromHMM",
               "TFBS"]  #
    RD_DIRNAME = "Whole_Landscape"
    cm = plt.get_cmap('gist_rainbow')
    FIG_DIR = os.path.join(BASE_DIR, "figures")
    NBIN = 30
    # Colour-scale limits for the coloured scatters in columns 1 and 2.
    vmins = [0, 120]
    vmaxs = [0.15, 220]
    N_COL = 18
    COL_Labels = [
        "Corr", "Corr with DNase", "Corr with Nuc Occ", "Hist of k",
        "Hist of methylation", "Hist2D of k/methy", "DNase Peak",
        "Nucleosome Occupancy", "H3k4me1", "H3k4me3", "H3k9me3", "H3k9ac",
        "H3k27ac", "H3k27me3", "H3k36me3", "H4k20me", "CTCF", "p300"
    ]
    # Column of the bed file shown in each figure column; 0 marks the
    # columns that are not a plain 1D histogram (scatters and the 2D hist).
    COL_INDEXS = [
        0, 0, 0, 4, 3, 0, -2, -1, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ]
    MP = 50  # Max peak value
    # Per-column upper axis limits (the lower limit is always 0).
    xlims = [
        D_MAX, D_MAX, D_MAX, 10, 1., 1, 0.15, 220, MP, MP, MP, MP, MP, MP, MP,
        MP, MP, MP
    ]
    ylims = [
        1, 1, 1, 0.6, 16, 1, 70, 0.05, 0.25, 0.2, 0.14, 0.06, .1, .12, .25,
        .20, 0.05, .2
    ]
    for REGION in regions:
        fig_dir = os.path.join(FIG_DIR, RD_DIRNAME)
        mkdir(fig_dir)
        # Input paths are relative to the current working directory.
        out_rd_corr_dir = os.path.join("../data/K_region_intersect", REGION,
                                       "K_Rd")
        file_paths = [
            os.path.join(out_rd_corr_dir, "%s.bed" % file_name)
            for file_name in FILE_ORDERED_NAMES[REGION]
        ]
        file_labels = FILE_LABELS[REGION]
        bed_fps = [
            os.path.join("../data/K_region_intersect", REGION,
                         "%s.bed" % file_name)
            for file_name in FILE_ORDERED_NAMES[REGION]
        ]
        N_ROW = len(file_labels)

        fig_fp = os.path.join(fig_dir, "%s.%s" % (REGION, fig_format))
        fig, axs = plt.subplots(N_ROW,
                                N_COL,
                                figsize=(N_COL * EACH_SUB_FIG_SIZE,
                                         N_ROW * EACH_SUB_FIG_SIZE))
        for row in range(N_ROW):
            # Both files of this row are read once and reused by all columns.
            RD_df = pd.read_csv(file_paths[row], sep="\t", header=None).values
            bed_df = pd.read_csv(bed_fps[row], sep="\t", header=None).values
            for col in range(N_COL):
                # Progress output: one line per subplot.
                print("%s, %s" % (file_labels[row], COL_Labels[col]))
                ax = axs[row][col]
                xlim = xlims[col]
                ylim = ylims[col]
                if col <= 2:
                    # Scatter columns: distance (x) against correlation (y).
                    x = RD_df[:, 0]
                    y = RD_df[:, 1]
                    try:
                        y2 = localreg(x,
                                      y,
                                      degree=2,
                                      kernel=tricube,
                                      width=100)
                        if col != 0:
                            # Colour the points by an extra column of the
                            # Rd file and attach a colour bar.
                            z = RD_df[:, 1 + col]
                            sc = ax.scatter(x,
                                            y,
                                            s=8,
                                            c=z,
                                            label=file_labels[row],
                                            cmap=cm,
                                            vmin=vmins[col - 1],
                                            vmax=vmaxs[col - 1])
                            fig.colorbar(sc, ax=ax)
                        else:
                            ax.scatter(x,
                                       y,
                                       s=2,
                                       color=cm(1. * row / N_ROW),
                                       label=file_labels[row])
                        ax.plot(x, y2, "k-", linewidth=2)
                    except np.linalg.LinAlgError as e:
                        # Singular local fit: fall back to linear regression.
                        sns.regplot(
                            x=x,
                            y=y,
                            ax=ax,
                            scatter_kws={
                                's': 8,
                                'color': cm(1. * row / N_ROW)
                            })  # , line_kws ={'color':'black', "lw": 2}
                    ax.set_xticks(range(0, D_MAX + 1, 200))
                    ax.set_xlim(0, xlim)
                    ax.set_ylim(0, ylim)
                    ax.set_title(COL_Labels[col], fontsize=18)
                    if col == 0:
                        ax.set_ylabel(file_labels[row], fontsize=20)
                    if row == N_ROW - 1:
                        ax.set_xlabel("Genomic Distance(bp)")
                else:
                    col_ind_in_bed = COL_INDEXS[col]
                    if col_ind_in_bed != 0:
                        # 1D histogram of one bed column; "." entries are
                        # dropped and values above xlim are clipped to xlim.
                        vals = bed_df[:, col_ind_in_bed]
                        vals = vals[vals != "."].astype(float)
                        vals[vals > xlim] = xlim
                        _ = ax.hist(vals,
                                    bins=NBIN,
                                    density=True,
                                    color=cm(1. * col / N_COL),
                                    edgecolor='black',
                                    alpha=0.5,
                                    linewidth=0.5)
                        ax.set_xlim([0, xlim])
                        ax.set_ylim(0, ylim)
                        ax.set_title(COL_Labels[col], fontsize=18)
                    else:
                        # 2D histogram of bed column 3 against log10 of bed
                        # column 4.
                        ks = bed_df[:, 4].astype(float)
                        methys = bed_df[:, 3].astype(float)
                        # Replace non-positive values so log10 is defined.
                        ks[ks <= 0] = 0.001
                        ks = np.log10(ks)
                        h = ax.hist2d(methys,
                                      ks,
                                      bins=(NBIN, NBIN),
                                      density=True,
                                      vmin=0,
                                      vmax=5,
                                      cmap="viridis")

                        fig.colorbar(h[3], ax=ax)
                        # NOTE(review): this branch only runs for col > 2, so
                        # the condition below is never true and the y-label
                        # is never set -- confirm the intended condition.
                        if col == 0:
                            ax.set_ylabel('log10(K)')
                        if row == N_ROW - 1:
                            ax.set_xlabel('Methylation level')
                        ax.set_ylim(-1, 1)
                        ax.set_xlim(0, 1)
                        ax.set_xticks([0.2 * i for i in range(6)])
        plt.savefig(fig_fp, dpi=300, bbox_inches='tight', pad_inches=0.1)
Exemplo n.º 9
0
    #     p_upper[0] = 12
    #     p_init[0]  = 9.11
    #     # p_lower[2] = 1
    #     # p_upper[2] = 2
    #     # p_init[2] = 1.5
    #     p_upper[3] = 3

    # if np.abs(l-2e-3)<1e-4:
    #     p_lower[1] = 0
    #     p_upper[1] = 1e-6
    #     p_init[1] = 0
    #     p_lower[3] = -1

    if inspect or fit_of_fit:
        zeta_ = np.linspace(0, lambd, 5000)
        g_ = localreg(zeta, g, zeta_, kernel=gaussian, width=0.1, degree=2)

    bounded = ""
    model = partial(additive_model, lambd)
    model = lambda zeta, C, A, alpha, delta: additive_model(lambd, zeta, C, A, alpha, 0, 1, 1, delta)
    if fit_of_fit:
        zeta_fit = zeta_
        g_fit = g_
    else:
        zeta_fit = zeta
        g_fit = g

    zeta_bnd = 5
    frac_bnd = 0.5
    inds = np.where(np.logical_or(zeta_fit<zeta_bnd, zeta_fit>lambd-zeta_bnd))[0]
    n_bnd = len(inds)
def rdd_plot(data, sbins, bw, k, calc_points, dependant_var):
    """Draw a regression-discontinuity plot around "margin_1" == 0.

    Bin means of ``dependant_var`` are shown as a scatter.  On each side of
    zero a local-linear fit (triangular kernel) is drawn, together with a
    band of +/- 1.96 standard deviations around the mean of ``k`` resampled
    curves produced by ``smooth``.

    :param (df)         data: df that contains "margin_1" and parameter dependant_var
    :param (int)        sbins: length of bin
    :param (int)        bw: bandwidth for local regression
    :param (int)        k: iterations of resampling by bootstrapping
    :param (int)        calc_points: points where to calculate smoothed value
    :param (str)        dependant_var: name of dependant variable in df
    :return:            plot
    """
    binned = bin_fct(data, sbins)
    bin_means = binned.groupby(binned["bin"]).mean()[dependant_var]

    # Mid-points of the bins covering [-30, 30).
    bin_starts = range(-30, 30, sbins)
    bin_centres = bin_starts - np.mod(bin_starts, sbins) + sbins / 2

    scatter_df = pd.DataFrame(
        [bin_centres, bin_means],
        index=["vic_marg", "rank_imp"],
    ).transpose()

    # Split at zero; observations with margin exactly 0 belong to neither side.
    left = binned.loc[binned["margin_1"] < 0].sort_values(by=["margin_1"])
    right = binned.loc[binned["margin_1"] > 0].sort_values(by=["margin_1"])

    y_left = np.asarray(left[dependant_var])
    y_right = np.asarray(right[dependant_var])
    x_left = np.asarray(left["margin_1"])
    x_right = np.asarray(right["margin_1"])

    # Evaluate the fit at every ``calc_points``-th observation only.
    eval_left = x_left[::calc_points]
    eval_right = x_right[::calc_points]

    fit_left = localreg(x_left,
                        y_left,
                        x0=eval_left,
                        degree=1,
                        kernel=triangular,
                        width=bw)
    fit_right = localreg(x_right,
                         y_right,
                         x0=eval_right,
                         degree=1,
                         kernel=triangular,
                         width=bw)

    grid_left = np.linspace(-30, 0, 50)
    grid_right = np.linspace(0, 30, 50)

    # One column per resampled curve.
    curves_left = np.stack(
        [smooth(x_left, y_left, grid_left) for _ in range(k)]).T
    curves_right = np.stack(
        [smooth(x_right, y_right, grid_right) for _ in range(k)]).T

    bands = []
    for grid, curves in ((grid_left, curves_left),
                         (grid_right, curves_right)):
        centre = np.nanmean(curves, axis=1)
        spread = np.nanstd(curves, axis=1, ddof=0)
        bands.append((grid, centre - 1.96 * spread, centre + 1.96 * spread))

    fig, ax0 = plt.subplots(1, 1, figsize=(12, 8), tight_layout=True)

    for grid, band_low, band_high in bands:
        plt.fill_between(grid, band_low, band_high, alpha=0.25)

    plt.axvline(0, linewidth=0.4, color='r')
    ax0.grid(True)
    ax0.scatter(scatter_df["vic_marg"], scatter_df["rank_imp"])
    ax0.plot(eval_left, fit_left)
    ax0.plot(eval_right, fit_right)
    plt.xlabel("% Margin of Victory")
    plt.ylabel("Average Rank Improvment")
    ax0.axis([-30, 30, -6, 6])
    plt.show()
Exemplo n.º 11
0
# Noisy samples of sin(x^2) on [0, L] and the grid the fit is evaluated on.
x = np.linspace(0, L, 5000)
x0 = np.linspace(0, L, 500)
yf = np.sin(x**2)
y = yf + 0.5 * np.random.randn(*x.shape)

plt.plot(x, yf, label='$\\sin(x^2)$')
plt.plot(x, y, '+', markersize=0.2, color='black')

# Refit N resamples (drawn with replacement) to get a pointwise band.
N = 200
yarr = np.zeros((N, len(x0)))

for n in range(N):
    ind = np.random.randint(0, len(x), len(x))
    xb = x[ind]
    yb = y[ind]
    yarr[n, :] = localreg(xb, yb, x0, degree=2, kernel=tricube, width=0.4)
    # plt.plot(x0, yarr[n,:], linewidth=0.5, color='black')

# Pointwise 2.5 % and 97.5 % percentiles of the resampled fits.
lower = np.percentile(yarr, 2.5, axis=0)
upper = np.percentile(yarr, 97.5, axis=0)
plt.fill_between(x0, lower, upper, color='gray', alpha=0.5)

# Fit on the full data set.
y0 = localreg(x, y, x0, degree=2, kernel=tricube, width=0.4)
plt.plot(x0, y0, label='Local regression')

# ym = np.average(yarr, axis=0)
# plt.plot(x0, ym, '--', label='Bootstrapped local regression')

plt.title('Locally Weighted Polynomial Regression')
plt.xlabel('x')
# The y-axis label was set with a second xlabel() call, which overwrote 'x'.
plt.ylabel('y')
Exemplo n.º 12
0
# Report the sizes of the two parts of the data set.
print('linear dim:' + str(semi_data.linear_dim))
print('varying dim:' + str(semi_data.varying_dim))

# I is passed to both lae calls below; its meaning is defined by laepy and is
# not visible here.
I = 5
# NOTE(review): la_design is not used in the visible part of this script.
la_design = lae.la_design(semi_data.varying, I)
varying_hat, linear_hat, index_design = lae.full_estimate(semi_data, I)

print('constant coefficients:')
print(linear_hat)
"""
further estimate via local polynomial regression 
"""
# Smooth column 1 of the varying-coefficient estimate over the index design
# with a local-linear fit (Epanechnikov kernel, width 0.3).
target = varying_hat[:, 1]
index = index_design
a_hat = localreg(index, target, degree=1, kernel=epanechnikov, width=0.3)

plt.style.use('ggplot')
plt.plot(index, a_hat, label='Local average + Local linear')
"""example for varying coefficient model"""

import numpy as np
from numpy import pi, sin, cos, exp, sqrt, std
from laepy import lae
from localreg import *
import matplotlib.pyplot as plt

# n rows and d columns for x below; snr (presumably a signal-to-noise ratio --
# TODO confirm) is not used in the visible part of this script.
n, d, snr = 500, 2, 5
# n x d matrix of standard-normal draws.
x = np.random.normal(0, 1, [n, d])
# n draws from the uniform distribution on [0, 1).
u = np.random.uniform(0, 1, n)
# Two functions of u stacked as rows: sin(60 u) and 4 u (1 - u).
coef = np.array([sin(60 * u), 4 * u * np.subtract(1, u)])
Exemplo n.º 13
0
def plot_local_regression_and_RD_separately(max_d, fig_format="png"):
    """Plot every file of each region group in its own subplot.

    For each group in ``regions`` and each code in ``K_RD`` a figure with
    five columns is written to ``<BASE_DIR>/figures/Rd``.  Each subplot shows
    column 1 of the file against column 0 as a scatter with a local
    quadratic regression (tricube kernel, width 100); on
    ``np.linalg.LinAlgError`` it falls back to ``sns.regplot``.

    :param int max_d: upper x-axis limit; ticks are placed every 200
    :param str fig_format: extension (and hence format) of the saved figures
    """
    regions = ["Genomic_Regions", "Histone_Modification", "ChromHMM",
               "TFBS"]  #
    K_RD = [3]  # 0, 1, 2
    N_COL = 5
    # Sub-directory per code; any other code reads from "Methy_Rd".
    inter_names = {1: "K_Rd", 2: "f_Rd", 3: "DNase_Rd"}
    cm = plt.get_cmap('gist_rainbow')
    FIG_DIR = os.path.join(BASE_DIR, "figures")
    # Font size of the tick labels on both axes.
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=12)

    for REGION in regions:
        fig_dir = os.path.join(FIG_DIR, "Rd")
        mkdir(fig_dir)
        for km in K_RD:
            inter_name = inter_names.get(km, "Methy_Rd")

            out_rd_corr_dir = os.path.join("../data/K_region_intersect",
                                           REGION, inter_name)
            file_paths = [
                os.path.join(out_rd_corr_dir, "%s.bed" % file_name)
                for file_name in FILE_ORDERED_NAMES[REGION]
            ]
            file_labels = FILE_LABELS[REGION]
            N_FILES = len(file_labels)
            N_ROW = int(math.ceil((N_FILES) / N_COL))

            fig_fp = os.path.join(
                fig_dir, "%s_%s.%s" % (REGION, inter_name, fig_format))
            panel = EACH_SUB_FIG_SIZE - 1
            fig, axs = plt.subplots(N_ROW,
                                    N_COL,
                                    figsize=(N_COL * panel, N_ROW * panel))
            for j in range(N_FILES):
                row, col = divmod(j, N_COL)
                # With a single row, subplots returns a 1-D array of axes.
                ax = axs[col] if N_ROW == 1 else axs[row][col]
                label = file_labels[j]

                table = pd.read_csv(file_paths[j], sep="\t",
                                    header=None).values
                x, y = table[:, 0], table[:, 1]
                try:
                    fitted = localreg(x,
                                      y,
                                      degree=2,
                                      kernel=tricube,
                                      width=100)
                    # "Genome" gets blue points and a white curve; every
                    # other file a colour-map colour and a black curve.
                    if label == "Genome":
                        point_color, line_fmt = "blue", "w-"
                    else:
                        point_color, line_fmt = cm(1. * j / N_FILES), "k-"
                    ax.scatter(x, y, s=8, label=label, color=point_color)
                    ax.plot(x, fitted, line_fmt, linewidth=2)
                except np.linalg.LinAlgError:
                    # Singular local fit: fall back to linear regression.
                    sns.regplot(x=x,
                                y=y,
                                ax=ax,
                                scatter_kws={
                                    's': 8,
                                    'color': cm(1. * j / N_FILES)
                                })
                ax.set_xticks(range(0, max_d + 1, 200))
                ax.set_xlim(0, max_d)
                ax.set_ylim(0, 1.0)
                ax.set_title(label, fontsize=18)
            plt.savefig(fig_fp, dpi=300, bbox_inches='tight', pad_inches=0.1)
Exemplo n.º 14
0
def plot_local_regression_and_RD(max_d, fig_format="png"):
    """Plot the Rd files of each region group, one figure per group.

    All codes in ``K_RD`` are drawn into the same five-column figure, which
    is saved to ``<BASE_DIR>/figures/Rd``.  Each subplot shows column 1 of a
    file against column 0 as a scatter with a white local quadratic
    regression curve (tricube kernel, width 100); on
    ``np.linalg.LinAlgError`` it falls back to ``sns.regplot``.

    :param int max_d: upper x-axis limit; ticks are placed every 200
    :param str fig_format: extension (and hence format) of the saved figures
    """
    regions = [
        "Genomic_Regions", "Histone_Modification", "ChromHMM", "TFBS"
    ]
    K_RD = [3]  # , 0

    N_COL = 5
    # code -> (data sub-directory, point colour, legend label)
    series_by_code = {
        1: ("K_Rd", "red", "k"),
        2: ("f_Rd", "green", "f"),
    }
    # NOTE(review): every other code, including the 3 used in K_RD above,
    # falls back to this entry -- confirm that "Methy_Rd" is intended.
    default_series = ("Methy_Rd", "blue", "methy")
    cm = plt.get_cmap('gist_rainbow')
    FIG_DIR = os.path.join(BASE_DIR, "figures")

    fig_dir = os.path.join(FIG_DIR, "Rd")
    mkdir(fig_dir)
    for REGION in regions:
        fig_fp = os.path.join(fig_dir, "%s.%s" % (REGION, fig_format))
        fig, axs = None, None
        for km in K_RD:
            inter_name, c, label = series_by_code.get(km, default_series)

            out_rd_corr_dir = os.path.join("../data/K_region_intersect",
                                           REGION, inter_name)
            file_paths = [
                os.path.join(out_rd_corr_dir, "%s.bed" % file_name)
                for file_name in FILE_ORDERED_NAMES[REGION]
            ]
            file_labels = FILE_LABELS[REGION]
            N_FILES = len(file_labels)
            N_ROW = int(math.ceil((N_FILES) / N_COL))
            # The figure is created once per region and shared by all codes.
            if fig is None:
                fig, axs = plt.subplots(
                    N_ROW,
                    N_COL,
                    figsize=(N_COL * EACH_SUB_FIG_SIZE,
                             N_ROW * (EACH_SUB_FIG_SIZE - 1)))
            for j, file_path in enumerate(file_paths[:N_FILES]):
                row, col = divmod(j, N_COL)
                # With a single row, subplots returns a 1-D array of axes.
                ax = axs[col] if N_ROW == 1 else axs[row][col]

                table = pd.read_csv(file_path, sep="\t", header=None).values
                x, y = table[:, 0], table[:, 1]
                try:
                    fitted = localreg(x,
                                      y,
                                      degree=2,
                                      kernel=tricube,
                                      width=100)
                    ax.scatter(x, y, s=8, label=label, color=c)
                    ax.plot(x, fitted, "w-", linewidth=2)
                except np.linalg.LinAlgError:
                    # Singular local fit: fall back to linear regression.
                    sns.regplot(x=x,
                                y=y,
                                ax=ax,
                                scatter_kws={
                                    's': 8,
                                    'color': c
                                },
                                line_kws={
                                    'color': 'white',
                                    "lw": 2
                                })
                ax.set_xticks(range(0, max_d + 1, 200))
                ax.set_xlim(0, max_d)
                ax.set_ylim(0, 1.0)
                ax.set_title(file_labels[j], fontsize=16)
                # The legend is only added to the first subplot while the
                # code-2 series is drawn.
                if km == 2 and j == 0:
                    ax.legend()
        plt.savefig(fig_fp, dpi=300)  #, bbox_inches='tight', pad_inches=0.1
Exemplo n.º 15
0
import numpy as np
import matplotlib.pyplot as plt
from localreg import *

# Fixed seed so the noisy sample is the same on every run.
np.random.seed(1234)
x = np.linspace(1.5, 5, 2000)
yf = np.sin(x * x)
y = yf + 0.5 * np.random.randn(*x.shape)

# Same data, kernel and width; only the polynomial degree differs.
y0 = localreg(x, y, degree=0, kernel=rbf.tricube, width=0.3)
y1 = localreg(x, y, degree=1, kernel=rbf.tricube, width=0.3)
y2 = localreg(x, y, degree=2, kernel=rbf.tricube, width=0.3)

plt.plot(x, y, '+', markersize=0.6, color='gray')
# Raw string: "\s" in a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions; the label text is unchanged.
plt.plot(x, yf, label=r'Ground truth ($\sin(x^2)$)')
plt.plot(x, y0, label='Moving average')
plt.plot(x, y1, label='Local linear regression')
plt.plot(x, y2, label='Local quadratic regression')
plt.legend()
plt.show()