def plot_sample_density_against_true_density(self, X, figsize=(10, 7), bandwidth=0.2):
    """Plot a kernel density estimate of the sample X over the true density.

    Both curves are evaluated on a fixed grid from -10 (inclusive) to 10
    (exclusive) in steps of 0.01 and drawn into a newly created figure.

    :param X: array (N, 1)
        X is a sample, possibly generated from self.sample
    :param figsize: tuple
        size of figure
    :param bandwidth:
        bandwidth parameter passed to sklearn.KernelDensity
    """
    plt.figure(figsize=figsize)

    grid = np.arange(-10, 10, 0.01)

    # The estimator works on column vectors, hence the (-1, 1) reshapes.
    estimator = kd(bandwidth=bandwidth).fit(X.reshape(-1, 1))
    log_density = estimator.score_samples(grid.reshape(-1, 1))

    # score_samples yields log-densities; exponentiate before plotting.
    plt.plot(grid, np.exp(log_density), label='kde')
    plt.plot(grid, self.normalised(grid), label='true', c='r', linestyle='--')

    plt.legend()
    plt.grid()
def kdepred(xcoords, ycoords):
    """Return the input point that scores highest under a Gaussian KDE.

    A Gaussian kernel density estimate (bandwidth 0.2) is fitted to the
    ``(x, y)`` pairs and every pair is scored under it. The coordinates of
    the best-scoring pair are returned (the first one on ties).

    If any step of that procedure raises, the coordinate-wise mean of the
    inputs is returned instead.

    :param xcoords: indexable sequence of x coordinates
    :param ycoords: indexable sequence of y coordinates, paired with xcoords
    :return: tuple ``(x, y)``
    """
    try:
        points = list(zip(xcoords, ycoords))
        estimator = kd(kernel='gaussian', bandwidth=0.2).fit(points)  ## bandwidth
        log_density = estimator.score_samples(points)
        # Index of the first sample attaining the maximum score.
        best = int(np.flatnonzero(log_density == log_density.max())[0])
        peak = (xcoords[best], ycoords[best])
    except Exception:
        # Best-effort fallback: use the centroid of the inputs.
        peak = (np.mean(xcoords), np.mean(ycoords))
    return peak
def train_1D(self, data):
    """
    Train the novelty detector by fitting kernel density estimators.

    When ``self.__nodes != 1`` the data is transposed and one estimator is
    fitted per row of the transposed data, each with its own bandwidth
    ``self.__bw[i]``. Otherwise the data is reshaped to
    ``[self.__timeSamples, self.__nodes]`` and a single estimator with
    bandwidth ``self.__bw`` is fitted. In both cases the fitted estimators
    are stored as a list in ``self.__parzenNetwork``.

    Parameters
    ----------
    data: numpy array
        training data

    Returns
    -------
    None
    """
    bw = self.__bw
    nodes = self.__nodes
    timeSamples = self.__timeSamples

    if nodes != 1:
        # Transpose so that each pattern is one vector in time domain.
        X = np.transpose(data)

        # Reshape elements into proper [time_samples, 1] shape for 1D: refer
        # to the kernel density estimation documentation of scikit-learn.
        elements = [np.reshape(e, [len(e), 1]) for e in X]

        # Build a parzen network: one kernel per element, each with its
        # individual bandwidth, fitted on that element. The bandwidth is
        # passed by keyword because the estimator's constructor arguments
        # are keyword-only in current scikit-learn releases.
        parzens = [kd(bandwidth=bw[i]).fit(e) for i, e in enumerate(elements)]

        self.__parzenNetwork = parzens

    # This is the case when there is only ONE pattern useful for estimating
    # the PDF. It is not tested for now, as it is not being used.
    else:  # pragma: no cover
        X = np.reshape(data, [timeSamples, nodes])
        parzen = kd(bandwidth=bw)  # create the kernel density object
        parzen.fit(X)
        # This is really not a "network" but only one PDF.
        self.__parzenNetwork = [parzen]
# Input location and run configuration; together these select the MAF table.
smiles_wd = '/Users/nbaya/Documents/lab/smiles/data/'
phen = '50_irnt'
n_top_loci = 100
ld_window = int(1000e3)
block_mhc = True
random_betas = False

# File-name suffix encoding the configuration above.
suffix = f'.top{n_top_loci}loci.ldwindow{int(ld_window/1e3)}kb{".block_mhc" if block_mhc else ""}'

maf = pd.read_csv(
    smiles_wd + phen + '.maf' + suffix + '.tsv.gz',
    compression='gzip',
    sep='\t',
)

# Minor allele frequencies of the rows flagged with sim_truebeta == 1.
toploci_maf = maf.loc[maf.sim_truebeta == 1, 'minor_AF'].values

# Gaussian KDE of those frequencies, evaluated on 1000 points in [0, 0.5].
kde = kd(bandwidth=0.06, kernel='gaussian').fit(toploci_maf.reshape(-1, 1))
x_plot = np.linspace(0, 0.5, 1000).reshape(-1, 1)
log_dens = kde.score_samples(x_plot)
pdf = np.exp(log_dens)  # score_samples returns log-densities
plt.plot(x_plot, pdf)
plt.xlim([0, 0.5])

# The same subset drawn with seaborn's KDE, clipped to [0, 0.5].
sns.kdeplot(maf.loc[maf.sim_truebeta == 1, 'minor_AF'], clip=[0, 0.5])
plt.xlim([0, 0.5])
plt.title('MAF of top loci')

# KDE over every SNP in the table.
sns.kdeplot(maf.minor_AF, clip=[0, 0.5])
plt.xlim([0, 0.5])
plt.title(f'MAF of all {len(maf)} SNPs')

# Running sum of the density scaled by 1 / (2 * number of grid points),
# i.e. roughly the grid spacing of x_plot.
cdf = pdf.cumsum() / pdf.size / 2
def visualize_es(initial_design, init=None):
    """
    Visualize one-step of ES as a sequence of six figures.

    Every figure has two stacked axes. The upper one receives the objective
    function, the observations and, depending on the step, the GP fit or
    samples drawn from the GP; the lower one receives the distribution of
    the locations at which the GP samples attain their minimum.

    1. GP fit on the initial dataset, no samples, flat prior for p_min
    2. One GP sample and the histogram of its minimiser
    3. Ten GP samples and the histogram of their minimisers
    4. 200 GP samples and the histogram of their minimisers
    5. Kernel density estimate of the minimisers of 200 GP samples
    6. Same density, with its maximum marked as the next configuration

    Each figure is written to ``es_<step>`` when the module-level
    ``TOGGLE_PRINT`` is truthy and shown interactively otherwise. The
    module-level ``bounds['acq_y']`` entry is overwritten for every figure.

    :param initial_design: Method for initializing the GP, choice between
        'uniform', 'random', and 'presentation'
    :param init: Number of datapoints to initialize GP with.
    :return: None
    """
    boplot.set_rcparams(**{'figure.figsize': (22, 11)})

    # Initial setup
    # -------------------------------------------
    logging.debug("Visualizing ES with initial design {} and init {}".format(
        initial_design, init))

    # Initialize dummy dataset
    x, y = initialize_dataset(initial_design=initial_design, init=init)
    logging.debug(
        "Initialized dataset with:\nsamples {0}\nObservations {1}".format(
            x, y))

    # Fit GP to the currently available dataset
    gp = GPR(kernel=Matern())
    logging.debug("Fitting GP to\nx: {}\ny:{}".format(x, y))
    gp.fit(x, y)  # fit the model

    histogram_precision = 20
    X_ = boplot.get_plot_domain(precision=histogram_precision)
    nbins = X_.shape[0]
    logging.info("Creating histograms with {} bins".format(nbins))
    bin_range = (bounds['x'][0], bounds['x'][1] + 1 / histogram_precision)

    seed2 = 1256  # forwarded to boplot.plot_gp_samples
    seed3 = 65  # random_state of every gp.sample_y call, so draws repeat

    # Raw strings throughout: the TeX markup contains backslashes that are
    # not valid Python escape sequences.
    gp_title = r"Visualization of $\mathcal{G}^t$"
    histogram_title = r'Frequency of $\lambda=\hat{\lambda}^*$'
    density_title = r"$\hat{P}(\lambda=\lambda^*)$"
    # -------------------------------------------

    def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False):
        """Plot GP samples on ax1 and a histogram of their minimisers on ax2."""
        if not nsamples:
            return
        mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3)
        boplot.plot_gp_samples(mu=mu, nsamples=nsamples,
                               precision=histogram_precision, custom_x=X_,
                               show_min=show_min, ax=ax1, seed=seed2)
        # Location in X_ at which each sampled function is smallest.
        data_h = X_[np.argmin(mu, axis=0), 0]
        logging.info("Shape of data_h is {}".format(data_h.shape))
        logging.debug("data_h is: {}".format(data_h))
        bins = ax2.hist(data_h, bins=nbins, range=bin_range,
                        density=return_pdf, color='lightgreen',
                        edgecolor='black',
                        alpha=0.0 if return_pdf else 1.0)
        return bins

    def start_figure(acq_ylim, show_gp):
        """Create the two-row figure and draw the content shared by all steps."""
        bounds['acq_y'] = acq_ylim
        fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True)
        ax1.set_xlim(bounds['x'])
        ax1.set_ylim(bounds['gp_y'])
        ax2.set_xlim(bounds['x'])
        ax2.set_ylim(bounds['acq_y'])
        ax1.grid()
        ax2.grid()
        boplot.plot_objective_function(ax=ax1)
        if show_gp:
            boplot.plot_gp(model=gp, confidence_intervals=[3.0], ax=ax1,
                           custom_x=x)
        boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1)
        return ax1, ax2

    def finish_figure(ax1, ax2, ax1_title, ax2_ylabel, ax2_title, filename):
        """Label both axes, then save or show the current figure."""
        ax1.set_xlabel(labels['xlabel'])
        ax1.set_ylabel(labels['gp_ylabel'])
        ax1.set_title(ax1_title, loc='left')
        ax2.set_xlabel(labels['xlabel'])
        ax2.set_ylabel(ax2_ylabel)
        ax2.set_title(ax2_title, loc='left')
        plt.tight_layout()
        if TOGGLE_PRINT:
            plt.savefig(filename)
        else:
            plt.show()

    def minimiser_density():
        """Return (grid, density) of a Gaussian KDE over 200 sampled minimisers."""
        mu = gp.sample_y(X=X_, n_samples=200, random_state=seed3)
        data_h = X_[np.argmin(mu, axis=0), 0]
        kde = kd(kernel='gaussian', bandwidth=0.75).fit(data_h.reshape(-1, 1))
        xplot = boplot.get_plot_domain()
        # score_samples returns log-densities.
        ys = np.exp(kde.score_samples(xplot))
        return xplot, ys

    # 1. Show GP fit on initial dataset, 0 samples, flat prior
    # -------------------------------------------
    ax1, ax2 = start_figure(acq_ylim=(0.0, 1.0), show_gp=True)

    # No samples are drawn in this step; instead a constant line of height
    # (y-range / x-range) of ax2 is drawn as the prior for p_min.
    xplot = boplot.get_plot_domain()
    ylims = ax2.get_ylim()
    xlims = ax2.get_xlim()
    yupper = [(ylims[1] - ylims[0]) / (xlims[1] - xlims[0])] * xplot.shape[0]
    ax2.plot(xplot[:, 0], yupper, color='green', linewidth=2.0)
    ax2.fill_between(xplot[:, 0], ylims[0], yupper, color='lightgreen')

    ax1.legend().set_zorder(20)
    finish_figure(ax1, ax2,
                  ax1_title=gp_title,
                  ax2_ylabel=r'$p_{min}$',
                  ax2_title=r'$p_{min}=P(\lambda=\lambda^*)$',
                  filename='es_1')

    # 2.-4. Show 1, 10 and 200 GP samples with the histogram of their minima
    # -------------------------------------------
    # (filename, nsamples, upper y-limit of histogram, mark minima, title);
    # only the single-sample figure marks the minimum and carries a legend.
    histogram_steps = (
        ('es_2', 1, 5.0, True, r"One sample from $\mathcal{G}^t$"),
        ('es_3', 10, 10.0, False, r"Ten samples from $\mathcal{G}^t$"),
        ('es_4', 200, 20.0, False, r"200 samples from $\mathcal{G}^t$"),
    )
    for filename, nsamples, acq_ymax, mark_min, ax1_title in histogram_steps:
        ax1, ax2 = start_figure(acq_ylim=(0.0, acq_ymax), show_gp=False)
        draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=mark_min)
        if mark_min:
            ax1.legend().set_zorder(20)
        finish_figure(ax1, ax2,
                      ax1_title=ax1_title,
                      ax2_ylabel=r'Frequency',
                      ax2_title=histogram_title,
                      filename=filename)

    # 5. Show PDF derived from the minimisers of 200 samples
    # 6. Mark maximum of the PDF as next configuration to be evaluated
    # -------------------------------------------
    for filename, mark_maximum in (('es_5', False), ('es_6', True)):
        ax1, ax2 = start_figure(acq_ylim=(0.0, 1.0), show_gp=True)
        xplot, ys = minimiser_density()

        if mark_maximum:
            idx_umax = np.argmax(ys)
            boplot.highlight_configuration(x=xplot[idx_umax], label='',
                                           ax=ax1, disable_ticks=True)
            boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',
                                   xy=(xplot[idx_umax], ax1.get_ylim()[0]),
                                   ax=ax1, align='top', offset_param=1.5)
            boplot.highlight_configuration(x=xplot[idx_umax], label='',
                                           ax=ax2, disable_ticks=True)
            boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',
                                   xy=(xplot[idx_umax], ys[idx_umax]),
                                   ax=ax2, align='top', offset_param=1.0)

        ax2.plot(xplot, ys, color='green', lw=2.)
        ax2.fill_between(xplot[:, 0], ax2.get_ylim()[0], ys,
                         color='lightgreen')
        finish_figure(ax1, ax2,
                      ax1_title=gp_title,
                      ax2_ylabel=r'$p_{min}$',
                      ax2_title=density_title,
                      filename=filename)
# Index into list_countries of the series shown in both figures.
j = 3

# Sort chronologically, then smooth with a 25-sample Gaussian-weighted
# rolling mean (std=10). NOTE: despite the "_ema" name this is not an
# exponential moving average.
df_wide_asc = df_wide.sort_index(ascending=True)
df_wide_ema = df_wide_asc.rolling(window=25, win_type='gaussian').mean(std=10)

# Figure 1: raw series (blue) against its smoothed version (red).
fig_ema = plt.figure()
axes = fig_ema.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(df_wide_asc.index.values, df_wide_asc[list_countries[j]], 'b')
axes.plot(df_wide_asc.index.values, df_wide_ema[list_countries[j]], 'r')
axes.set_xlabel('Date')
axes.set_ylabel('Value')
axes.set_title('NASDAQ OMX Global Index ' + '(' + list_countries[j] + ')')
axes.legend([list_countries[j] + '_raw', list_countries[j] + '_ema'],
            fontsize=9)
plt.grid()
plt.show()

# Gaussian KDE of dX; score_samples returns the LOG-density at each sample.
density = kd(kernel='gaussian', bandwidth=0.75).fit(dX)
dX_kde = density.score_samples(dX)

# Figure 2: density-normalised histogram of dX with the KDE on top.
# The log-densities are exponentiated and plotted against the sorted sample
# values so that the curve shares both axes with the histogram (plotting
# dX_kde directly would draw log-densities against the sample index).
# Assumes dX holds a single feature column, as the 1-D histogram implies.
dX_values = np.asarray(dX)
dX_order = np.argsort(dX_values[:, 0])

fig_kde = plt.figure()
axes = fig_kde.add_axes([0.1, 0.1, 0.8, 0.8])
axes.hist(dX, bins=30, density=True)
axes.plot(dX_values[dX_order, 0], np.exp(dX_kde[dX_order]), 'r')
axes.set_xlabel('Value')
axes.set_ylabel('Density')
axes.set_title('NASDAQ OMX Global Index ' + '(' + list_countries[j] + ')')
plt.grid()
plt.show()