Пример #1
0
    def plot_sample_density_against_true_density(self,
                                                 X,
                                                 figsize=(10, 7),
                                                 bandwidth=0.2):
        """Overlay a kernel density estimate of ``X`` on ``self.normalised``.

        A new figure is opened and two curves are drawn over the fixed grid
        ``np.arange(-10, 10, 0.01)``: the exponentiated KDE scores of the
        sample (label ``'kde'``) and the values of ``self.normalised`` on the
        same grid (label ``'true'``, dashed red).

        :param X: array (N, 1)
            sample to estimate the density from, possibly generated from
            self.sample; it is reshaped to a single column before fitting
        :param figsize: tuple
            size of figure
        :param bandwidth:
            bandwidth parameter passed to sklearn.KernelDensity
        """
        plt.figure(figsize=figsize)
        grid = np.arange(-10, 10, 0.01)
        grid_column = grid.reshape(-1, 1)

        # Fit on the sample, then score the evaluation grid; scores are
        # exponentiated before plotting.
        fitted = kd(bandwidth=bandwidth).fit(X.reshape(-1, 1))
        log_scores = fitted.score_samples(grid_column)
        plt.plot(grid, np.exp(log_scores), label='kde')

        # Reference curve evaluated on the very same grid.
        plt.plot(grid, self.normalised(grid), label='true', c='r',
                 linestyle='--')

        plt.legend()
        plt.grid()
Пример #2
0
def kdepred(xcoords, ycoords):
    """Pick the input point with the highest kernel-density score.

    The ``(x, y)`` pairs are fitted with a gaussian kernel density estimator
    (bandwidth 0.2) and scored against that same estimator; the coordinates
    of the first best-scoring pair are returned.  If any step of that fails,
    the coordinate-wise mean of the inputs is returned instead.

    :param xcoords: indexable sequence of x coordinates
    :param ycoords: indexable sequence of y coordinates, paired with xcoords
    :return: tuple ``(x, y)``
    """
    try:
        points = list(zip(xcoords, ycoords))
        model = kd(kernel='gaussian', bandwidth=0.2).fit(points)  ## bandwidth
        scores = model.score_samples(points)
        # Position of the first score that equals the maximum.
        best = int(np.nonzero(scores == np.max(scores))[0][0])
        peak = (xcoords[best], ycoords[best])
    except Exception:
        # Best-effort fallback: centroid of the input coordinates.
        peak = (np.mean(xcoords), np.mean(ycoords))
    return peak
Пример #3
0
    def train_1D(self, data):
        """
        Train the novelty detector by fitting kernel density estimators.

        When the detector has more than one node, ``data`` is transposed and
        one estimator is fitted per row of the transposed array, each with its
        own bandwidth ``bw[i]``.  When there is exactly one node, a single
        estimator is fitted on ``data`` reshaped to ``[timeSamples, nodes]``.
        The fitted estimator(s) are stored as a list on the instance.

        Parameters
        ----------
        data: numpy array
            training data

        Returns
        -------
        None

        """

        bw = self.__bw
        nodes = self.__nodes
        timeSamples = self.__timeSamples
        if nodes != 1:
            # Transpose so that each pattern is one vector in the time domain.
            X = np.transpose(data)
            # Build the parzen network: one estimator per pattern, each with
            # its individual bandwidth.  Every pattern is reshaped to the
            # [time_samples, 1] column layout expected for 1D fitting (see the
            # scikit-learn kernel density documentation).  The bandwidth is
            # passed by keyword because scikit-learn's KernelDensity takes
            # keyword-only constructor arguments in current releases.
            self.__parzenNetwork = [
                kd(bandwidth=bw[i]).fit(np.reshape(e, [len(e), 1]))
                for i, e in enumerate(X)
            ]
        # This is the case when there is only ONE pattern useful for
        # estimating the PDF.  It is not tested for now, as it is not used.
        else:  #pragma: no cover
            X = np.reshape(data, [timeSamples, nodes])
            parzen = kd(bandwidth=bw)  # create the kernel density object
            parzen.fit(X)
            # Not really a "network": a single PDF wrapped in a list so both
            # branches store the same kind of container.
            self.__parzenNetwork = [parzen]
Пример #4
0
# Directory holding the input tables (hard-coded absolute path).
smiles_wd = '/Users/nbaya/Documents/lab/smiles/data/'

# Settings that determine which MAF table is loaded below.
phen = '50_irnt'
n_top_loci = 100
ld_window = int(1000e3)
block_mhc = True
random_betas = False  # not referenced anywhere in this section
# File-name suffix assembled from the settings above; ld_window is rendered
# in units of 1e3 (kb) and ".block_mhc" is appended only when block_mhc is set.
suffix = f'.top{n_top_loci}loci.ldwindow{int(ld_window/1e3)}kb{".block_mhc" if block_mhc else ""}'

# Load the gzip-compressed, tab-separated table
# <smiles_wd><phen>.maf<suffix>.tsv.gz.
maf = pd.read_csv(smiles_wd + phen + '.maf' + suffix + '.tsv.gz',
                  compression='gzip',
                  sep='\t')

# minor_AF values of the rows flagged with sim_truebeta == 1.
toploci_maf = maf[maf.sim_truebeta == 1].minor_AF.values

# Gaussian KDE (bandwidth 0.06) fitted on those values as a single column,
# then evaluated on 1000 evenly spaced points in [0, 0.5].
kde = kd(bandwidth=0.06, kernel='gaussian').fit(toploci_maf[:, np.newaxis])
x_plot = np.linspace(0, 0.5, 1000)[:, np.newaxis]
log_dens = kde.score_samples(x_plot)
pdf = np.exp(log_dens)  # scores are exponentiated to obtain the density
plt.plot(x_plot, pdf)
plt.xlim([0, 0.5])

# seaborn KDE of the same sim_truebeta == 1 subset, clipped to [0, 0.5].
sns.kdeplot(maf[maf.sim_truebeta == 1].minor_AF, clip=[0, 0.5])
plt.xlim([0, 0.5])
plt.title('MAF of top loci')

# seaborn KDE of minor_AF over every row of the table.
# NOTE(review): no new figure is opened between these plots, so when run as a
# plain script they presumably land on the same axes and this title replaces
# the previous one - confirm that is intended.
sns.kdeplot(maf.minor_AF, clip=[0, 0.5])
plt.xlim([0, 0.5])
plt.title(f'MAF of all {len(maf)} SNPs')

# Running sum of the density scaled by 1 / (2 * len(pdf)).
# NOTE(review): this looks like a Riemann-sum CDF that uses 0.5 / len(pdf) as
# the grid step; the actual linspace step is 0.5 / (len(pdf) - 1) - confirm
# the approximation is acceptable.
cdf = (np.cumsum(pdf) / len(pdf) / 2)
Пример #5
0
def visualize_es(initial_design, init=None):
    """
    Visualize one-step of ES.

    Six two-panel figures are produced in sequence (top panel: objective and
    GP, bottom panel: statistics of the sampled minimizers):

    1. GP fit on the initial dataset, no samples, uniform prior for p_min
    2. One GP sample with its minimum marked, histogram of minimizers
    3. Ten GP samples, histogram of minimizers
    4. 200 GP samples, histogram of minimizers
    5. Kernel density estimate derived from the 200 sampled minimizers
    6. Same as 5, with the maximum of the density marked as the next
       configuration to be evaluated

    Each figure is saved as ``es_<n>`` when the module-level ``TOGGLE_PRINT``
    is truthy and shown interactively otherwise.  The module-level
    ``bounds['acq_y']`` entry is overwritten for every figure and keeps the
    last value, ``(0.0, 1.0)``, after the call.

    :param initial_design: Method for initializing the GP, choice between 'uniform', 'random', and 'presentation'
    :param init: Number of datapoints to initialize GP with.
    :return: None
    """

    boplot.set_rcparams(**{'figure.figsize': (22, 11)})

    # Initial setup
    # -------------------------------------------

    logging.debug("Visualizing ES with initial design {} and init {}".format(
        initial_design, init))
    # Initialize dummy dataset
    x, y = initialize_dataset(initial_design=initial_design, init=init)
    logging.debug(
        "Initialized dataset with:\nsamples {0}\nObservations {1}".format(
            x, y))

    # Fit GP to the currently available dataset
    gp = GPR(kernel=Matern())
    logging.debug("Fitting GP to\nx: {}\ny:{}".format(x, y))
    gp.fit(x, y)  # fit the model

    histogram_precision = 20
    X_ = boplot.get_plot_domain(precision=histogram_precision)
    nbins = X_.shape[0]
    logging.info("Creating histograms with {} bins".format(nbins))
    bin_range = (bounds['x'][0], bounds['x'][1] + 1 / histogram_precision)

    # Shared labels and titles
    gp_title = r"Visualization of $\mathcal{G}^t$"
    pmin_ylabel = r'$p_{min}$'
    frequency_title = r'Frequency of $\lambda=\hat{\lambda}^*$'
    # Raw string: "\h" and "\l" are not valid escape sequences in a normal
    # string literal; the resulting text is unchanged.
    density_title = r"$\hat{P}(\lambda=\lambda^*)$"

    # -------------------------------------------

    def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False):
        """Plot GP samples on ax1 and a histogram of their minimizers on ax2.

        Does nothing and returns None when nsamples is zero; otherwise returns
        whatever ``ax2.hist`` returns.
        """
        if not nsamples:
            return
        seed2 = 1256
        seed3 = 65

        mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3)
        boplot.plot_gp_samples(mu=mu,
                               nsamples=nsamples,
                               precision=histogram_precision,
                               custom_x=X_,
                               show_min=show_min,
                               ax=ax1,
                               seed=seed2)
        # Location of the minimum of every sample.
        data_h = X_[np.argmin(mu, axis=0), 0]
        logging.info("Shape of data_h is {}".format(data_h.shape))
        logging.debug("data_h is: {}".format(data_h))

        bins = ax2.hist(data_h,
                        bins=nbins,
                        range=bin_range,
                        density=return_pdf,
                        color='lightgreen',
                        edgecolor='black',
                        alpha=0.0 if return_pdf else 1.0)

        return bins

    def _new_figure(acq_ylim, show_gp):
        """Open a two-panel figure with the objective and observations drawn."""
        bounds['acq_y'] = acq_ylim

        _, (ax1, ax2) = plt.subplots(2, 1, squeeze=True)
        ax1.set_xlim(bounds['x'])
        ax1.set_ylim(bounds['gp_y'])
        ax2.set_xlim(bounds['x'])
        ax2.set_ylim(bounds['acq_y'])
        ax1.grid()
        ax2.grid()

        boplot.plot_objective_function(ax=ax1)
        if show_gp:
            boplot.plot_gp(model=gp,
                           confidence_intervals=[3.0],
                           ax=ax1,
                           custom_x=x)
        boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1)
        return ax1, ax2

    def _finish_figure(ax1, ax2, ax1_title, ax2_ylabel, ax2_title, filename,
                       with_legend=False):
        """Label both panels, then save or show the current figure."""
        if with_legend:
            ax1.legend().set_zorder(20)
        ax1.set_xlabel(labels['xlabel'])
        ax1.set_ylabel(labels['gp_ylabel'])
        ax1.set_title(ax1_title, loc='left')

        ax2.set_xlabel(labels['xlabel'])
        ax2.set_ylabel(ax2_ylabel)
        ax2.set_title(ax2_title, loc='left')

        plt.tight_layout()
        if TOGGLE_PRINT:
            plt.savefig(filename)
        else:
            plt.show()

    def _histogram_figure(nsamples, acq_ylim, ax1_title, filename,
                          show_min=False, with_legend=False):
        """Draw one 'GP samples + histogram of minimizers' figure."""
        ax1, ax2 = _new_figure(acq_ylim=acq_ylim, show_gp=False)
        draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=show_min)
        _finish_figure(ax1, ax2,
                       ax1_title=ax1_title,
                       ax2_ylabel=r'Frequency',
                       ax2_title=frequency_title,
                       filename=filename,
                       with_legend=with_legend)

    def _estimate_minimizer_density():
        """Return (xplot, ys): a KDE of 200 sampled minimizers on the plot domain."""
        nsamples = 200
        seed3 = 65

        mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3)
        data_h = X_[np.argmin(mu, axis=0), 0]

        kde = kd(kernel='gaussian', bandwidth=0.75).fit(data_h.reshape(-1, 1))
        xplot = boplot.get_plot_domain()
        ys = np.exp(kde.score_samples(xplot))
        return xplot, ys

    def _plot_density(ax2, xplot, ys):
        """Draw the density curve and shade the area beneath it."""
        ax2.plot(xplot, ys, color='green', lw=2.)
        ax2.fill_between(xplot[:, 0], ax2.get_ylim()[0], ys,
                         color='lightgreen')

    # 1. Show GP fit on initial dataset, 0 samples, uniform prior
    # -------------------------------------------

    ax1, ax2 = _new_figure(acq_ylim=(0.0, 1.0), show_gp=True)

    # With zero samples this call returns immediately without drawing.
    draw_samples(nsamples=0, ax1=ax1, ax2=ax2, show_min=True)

    # Plot uniform prior for p_min
    xplot = boplot.get_plot_domain()
    ylims = ax2.get_ylim()
    xlims = ax2.get_xlim()
    yupper = [(ylims[1] - ylims[0]) / (xlims[1] - xlims[0])] * xplot.shape[0]
    ax2.plot(xplot[:, 0], yupper, color='green', linewidth=2.0)
    ax2.fill_between(xplot[:, 0], ylims[0], yupper, color='lightgreen')

    _finish_figure(ax1, ax2,
                   ax1_title=gp_title,
                   ax2_ylabel=pmin_ylabel,
                   ax2_title=r'$p_{min}=P(\lambda=\lambda^*)$',
                   filename='es_1',
                   with_legend=True)

    # 2. Show GP fit on initial dataset, 1 sample, histogram
    # -------------------------------------------

    _histogram_figure(nsamples=1,
                      acq_ylim=(0.0, 5.0),
                      ax1_title=r"One sample from $\mathcal{G}^t$",
                      filename='es_2',
                      show_min=True,
                      with_legend=True)

    # 3. Show GP fit on initial dataset, 10 samples, histogram
    # -------------------------------------------

    _histogram_figure(nsamples=10,
                      acq_ylim=(0.0, 10.0),
                      ax1_title=r"Ten samples from $\mathcal{G}^t$",
                      filename='es_3')

    # 4. Show GP fit on initial dataset, 200 samples, histogram
    # -------------------------------------------

    _histogram_figure(nsamples=200,
                      acq_ylim=(0.0, 20.0),
                      ax1_title=r"200 samples from $\mathcal{G}^t$",
                      filename='es_4')

    # 5. Show PDF derived from the minimizers of 200 samples
    # -------------------------------------------

    ax1, ax2 = _new_figure(acq_ylim=(0.0, 1.0), show_gp=True)

    xplot, ys = _estimate_minimizer_density()
    _plot_density(ax2, xplot, ys)

    _finish_figure(ax1, ax2,
                   ax1_title=gp_title,
                   ax2_ylabel=pmin_ylabel,
                   ax2_title=density_title,
                   filename='es_5')

    # 6. Mark maximum of the PDF as next configuration to be evaluated
    # -------------------------------------------

    ax1, ax2 = _new_figure(acq_ylim=(0.0, 1.0), show_gp=True)

    xplot, ys = _estimate_minimizer_density()

    # Highlight the location of the density maximum on both panels.
    idx_umax = np.argmax(ys)
    boplot.highlight_configuration(x=xplot[idx_umax],
                                   label='',
                                   ax=ax1,
                                   disable_ticks=True)
    boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',
                           xy=(xplot[idx_umax], ax1.get_ylim()[0]),
                           ax=ax1,
                           align='top',
                           offset_param=1.5)
    boplot.highlight_configuration(x=xplot[idx_umax],
                                   label='',
                                   ax=ax2,
                                   disable_ticks=True)
    boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',
                           xy=(xplot[idx_umax], ys[idx_umax]),
                           ax=ax2,
                           align='top',
                           offset_param=1.0)

    _plot_density(ax2, xplot, ys)

    _finish_figure(ax1, ax2,
                   ax1_title=gp_title,
                   ax2_ylabel=pmin_ylabel,
                   ax2_title=density_title,
                   filename='es_6')
Пример #6
0
# Index into list_countries selecting the series plotted below.
j = 3

# Sort by index ascending, then smooth every column with a Gaussian-weighted
# rolling mean (window of 25 rows, std=10).
# NOTE(review): the "_ema" names suggest an exponential moving average, but
# the code computes a Gaussian-window rolling mean - confirm the naming.
df_wide_asc = df_wide.sort_index(ascending=True)
df_wide_ema = df_wide_asc.rolling(window=25, win_type='gaussian').mean(std=10)

# Figure 1: raw series (blue) against its smoothed version (red).
fig_ema = plt.figure()
axes = fig_ema.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(df_wide_asc.index.values, df_wide_asc[list_countries[j]], 'b')
axes.plot(df_wide_asc.index.values, df_wide_ema[list_countries[j]], 'r')
axes.set_xlabel('Date')
axes.set_ylabel('Value')
axes.set_title('NASDAQ OMX Global Index ' + '(' + list_countries[j] + ')')
axes.legend([list_countries[j] + '_raw', list_countries[j] + '_ema'],
            fontsize=9)
plt.grid()
plt.show()

# Fit a gaussian kernel density estimator (bandwidth 0.75) on dX and score the
# same samples. dX is defined outside this section.
# NOTE(review): if kd is scikit-learn's KernelDensity, score_samples returns
# log-density values; they are not exponentiated here - confirm intended.
density = kd(kernel='gaussian', bandwidth=0.75).fit(dX)
dX_kde = density.score_samples(dX)

# Figure 2: 30-bin histogram of dX with the scores drawn in red on the same
# axes.
# NOTE(review): the scores are plotted against their position in dX, while the
# histogram's x-axis is the value of dX, so the two share axes with different
# meanings. The 'Date'/'Value' labels look copied from the first figure -
# verify.
fig_kde = plt.figure()
axes = fig_kde.add_axes([0.1, 0.1, 0.8, 0.8])
axes.hist(dX, bins=30)
axes.plot(dX_kde, 'r')
axes.set_xlabel('Date')
axes.set_ylabel('Value')
axes.set_title('NASDAQ OMX Global Index ' + '(' + list_countries[j] + ')')
plt.grid()
plt.show()