def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert X.shape == samp.shape

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)

def npd_function_ica(X, n_sample=50, k=10, s=2000, verbose=False):
    N_samp = np.shape(X)[0]
    N_bins = np.shape(X)[1]
    d = 1
    scotts_b = N_samp**(-1. / (d + 4))

    ica = FastICA(whiten=False, max_iter=1000, tol=1e-2)
    S_ = ica.fit_transform(X)
    A_ = ica.mixing_

    Y_ref_ica_unmixed = np.zeros((n_sample, N_samp - 1, N_bins))
    Y_ref_ica = np.zeros((n_sample, N_samp - 1, N_bins))
    for i in range(n_sample):
        for j in range(N_bins):
            X_ica_ind = S_[:, j].reshape(-1, 1)  # shape is now (N_samp, 1)
            kde = KernelDensity(bandwidth=scotts_b, kernel='gaussian').fit(X_ica_ind)
            samps = kde.sample(N_samp - 1)  # N_samp - 1 samples for the jth k bin
            Y_ref_ica_unmixed[i, :, j] = samps.ravel()
        # apply the mixing matrix to undo the ICA transformation
        Y_ref_ica[i] = np.dot(Y_ref_ica_unmixed[i], A_.T)

    X_ref_ica_unmixed = np.zeros((N_samp, N_bins))
    for j in range(N_bins):
        X_ica_ind = S_[:, j].reshape(-1, 1)
        kde = KernelDensity(bandwidth=scotts_b, kernel='gaussian').fit(X_ica_ind)
        samps = kde.sample(N_samp)  # N_samp samples for the jth k bin
        X_ref_ica_unmixed[:, j] = samps.ravel()
    # apply the mixing matrix to undo the ICA transformation
    X_ref_ica = np.dot(X_ref_ica_unmixed, A_.T)

    kl_ref, kl_data = [], []
    for i in range(n_sample):
        if verbose:
            print(i)
        Y_ref_ica_samp = Y_ref_ica[i]
        kl_ref.append(kNNdiv_general(X_ref_ica, Y_ref_ica_samp,
                                     Knn=k, alpha=None, div_func='kl'))
        kl_data.append(kNNdiv_general(X, Y_ref_ica_samp,
                                      Knn=k, alpha=None, div_func='kl'))
    return kl_ref, kl_data

def get_probability(self, team1, team2):
    home_dist = np.array(self.teams[team1].get_sot_list() +
                         self.teams[team2].get_sota_list()).reshape(-1, 1)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': self.bandwidths}, cv=LeaveOneOut())
    grid.fit(home_dist)
    bandwidth = grid.best_params_["bandwidth"]
    home_kernel = KernelDensity(bandwidth=bandwidth, kernel="gaussian")
    home_kernel.fit(home_dist)

    away_dist = np.array(self.teams[team1].get_sota_list() +
                         self.teams[team2].get_sot_list()).reshape(-1, 1)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': self.bandwidths}, cv=LeaveOneOut())
    grid.fit(away_dist)
    bandwidth = grid.best_params_["bandwidth"]
    away_kernel = KernelDensity(bandwidth=bandwidth, kernel="gaussian")
    away_kernel.fit(away_dist)

    draw = 0
    home = 0
    away = 0
    for i in range(self.iterations):
        home_shots = home_kernel.sample()[0][0]
        away_shots = away_kernel.sample()[0][0]
        home_goals = np.round(home_shots * self.teams[team1].get_shot_conversion())
        away_goals = np.round(away_shots * self.teams[team2].get_shot_conversion())
        if home_goals == away_goals:
            draw += 1
        elif home_goals > away_goals:
            home += 1
        else:
            away += 1
    #print(home, draw, away)
    return home / self.iterations, draw / self.iterations, away / self.iterations

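# The snippet above selects a bandwidth by leave-one-out cross-validation and
# then refits a fresh KernelDensity with the winner. A minimal, self-contained
# sketch of that pattern follows; the helper name and the logspace grid are
# illustrative assumptions, not taken from the snippet.
import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity

def fit_kde_with_cv_bandwidth(values, bandwidths=np.logspace(-1, 1, 20)):
    # GridSearchCV scores each candidate bandwidth via KernelDensity.score
    # (total log-likelihood of the held-out point)
    values = np.asarray(values, dtype=float).reshape(-1, 1)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths}, cv=LeaveOneOut())
    grid.fit(values)
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=grid.best_params_['bandwidth'])
    return kde.fit(values)
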
def pval_calibrated_bandwidth(data, alpha_cal, null, I='auto', N_bootstrap=1000,
                              comm=MPI.COMM_WORLD, calibration_file=None):
    '''
        NB!: Test is only calibrated to correct level for alpha_cal.
    '''
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        lambda_alpha = load_lambda('bw_ad', null, alpha_cal, calibration_file)
    except KeyError:
        lambda_alpha = load_lambda('bw', null, alpha_cal, calibration_file)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: is_unimodal_kde(
        h_crit * lambda_alpha,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data),
        I)
    smaller_equal_crit_bandwidth = bootstrap(resamp_fun, N_bootstrap,
                                             dtype=np.bool_, comm=comm)
    return np.mean(~smaller_equal_crit_bandwidth)

def test_silverman_adaptive_resampling(data, alpha, I='auto',
                                       N_bootstrap_max=10000,
                                       comm=MPI.COMM_WORLD):
    data = comm.bcast(data)
    I = get_I(data, I)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data),
        I)
    try:
        return float(probability_above(
            resamp_fun, alpha, max_samp=N_bootstrap_max, comm=comm, batch=100,
            bound_significance=0.05, exception_at_max_samp=True, printing=False))
    except MaxSampExceededException:
        return alpha

def test_calibrated_bandwidth_adaptive_resampling(data, alpha, null, I='auto',
                                                  N_bootstrap_max=10000,
                                                  comm=MPI.COMM_WORLD,
                                                  calibration_file=None):
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        # lambda computed with adaptive probabilistic bisection search
        lambda_alpha = load_lambda('bw_ad', null, alpha, calibration_file)
    except KeyError:
        # lambda computed with probabilistic bisection search
        lambda_alpha = load_lambda('bw', null, alpha, calibration_file)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit * lambda_alpha,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data),
        I)
    try:
        return float(probability_above(
            resamp_fun, alpha, max_samp=N_bootstrap_max, comm=comm, batch=100,
            bound_significance=0.05, exception_at_max_samp=True, printing=False))
    except MaxSampExceededException:
        return alpha

class XSampleBW(XSample):
    def __init__(self, N, sampfun, comm=MPI.COMM_WORLD):
        super(XSampleBW, self).__init__(N, sampfun, comm)
        self.I = (-1.5, 1.5)  # avoiding spurious bumps in the tails
        self.h_crit = critical_bandwidth(self.data, self.I)
        #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
        self.var = np.var(self.data)
        self.kde_h_crit = KernelDensity(
            kernel='gaussian', bandwidth=self.h_crit).fit(self.data.reshape(-1, 1))

    @property
    def statistic(self):
        return self.h_crit

    def resampled_statistic_below_scaled_statistic(self, lambda_scale):
        '''
            P(h_{crit}^* <= lambda * h_{crit})
              = P(KDE(X^*, lambda * h_{crit}) is unimodal)
        '''
        return self.is_unimodal_resample(lambda_scale)

    def is_unimodal_resample(self, lambda_val):
        data = self.kde_h_crit.sample(self.N).reshape(-1) / np.sqrt(
            1 + self.h_crit**2 / self.var)
        #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
        return is_unimodal_kde(self.h_crit * lambda_val, data, self.I)

    def probability_of_unimodal_above(self, lambda_val, gamma):
        return self.prob_resampled_statistic_below_bound_above_gamma(lambda_val,
                                                                     gamma)

def augment_x_df(x_train, df, repeats=2, fit_col='Assets', seed=22, cutoff=2.5,
                 col_num=200):
    # Augments data with a random multiplicative constant on all present-value
    # columns. The distribution of this constant is that of e.g. the Assets
    # column (which is assets growth).
    aug_mask = list(map(lambda x: x[0] != 'p', df.columns[0:col_num]))
    ker_fit_data = df[fit_col].values
    ker_fit_data = ker_fit_data[(ker_fit_data > 0.5) * (ker_fit_data < 1.5)]
    ker_fit_data = ker_fit_data.reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(ker_fit_data)
    x_train_aug = np.repeat(x_train, repeats, axis=0)
    number = x_train_aug.shape[0]
    scale_rand = kde.sample(number, random_state=seed)
    # scale the masked (present-value) columns by the sampled growth factors
    x_train_aug[:, aug_mask] = x_train_aug[:, aug_mask] * scale_rand
    # zero out outliers in the augmented data before returning
    x_train_aug[np.abs(x_train_aug) > cutoff] = 0
    return x_train_aug

def copula_generate(X, generator=None, n=None):
    """ Generate using the copula trick.

        :param generator: Model to fit and sample from. KDE by default.
        :param n: Number of examples to generate. By default it is the number
                  of observations in X.
    """
    indexes = X.indexes
    columns = X.columns
    if generator is None:
        generator = KernelDensity()
    if n is None:
        n = X.shape[0]
    X_real = np.array(X)
    # X marginals to uniforms
    X = matrix_to_rank(X)
    # X uniforms to inverse gaussian CDF
    X = rank_matrix_to_inverse(X)
    # Fit generator
    generator.fit(X)
    # Generate artificial data by sampling from the generator
    X_artif = generator.sample(n)
    # Marginal retrofitting
    result = autopandas.AutoData(marginal_retrofit(X_artif, X_real))
    # Restore data frame index
    result.indexes = indexes
    result.columns = columns
    return result

def expand__(self):
    """ -Expand.
        An algorithm for expanding the bounds of an APR to improve its
        generalization ability. The objective is to estimate a kernel density
        for the chosen instances with rel_features. After we have our density
        function, we sample n points from it and take the epsilon percentiles
        of the sampled distribution.
    """
    self.mn_ = self.mn__pred.copy()
    self.mx_ = self.mx__pred.copy()
    for i in range(len(self.rel_features_)):
        kd = KernelDensity()
        # fit on one feature column: samples along axis 0, a single feature
        kd.fit(self.chosen[:, i].reshape(-1, 1))
        u = kd.sample(100, random_state=0)
        mn, mx = np.percentile(u, [self.epsilon * 100, (1 - self.epsilon) * 100])
        # if the bounds are outside the APR, we update the APR bounds
        if mn < self.mn_[self.rel_features_[i]]:
            self.mn_[self.rel_features_[i]] = mn
        if mx > self.mx_[self.rel_features_[i]]:
            self.mx_[self.rel_features_[i]] = mx

def ICA_loglikes_samples(arr, lst):
    S_ = lst[0]
    A_ = lst[1]
    N_samp = np.shape(arr)[0]
    N_bins = np.shape(arr)[1]
    d = 1
    scotts_b = N_samp**(-1. / (d + 4))
    X_ref_ica_unmixed = np.zeros((N_samp, N_bins))
    loglike_ica = np.zeros(N_samp)
    for j in range(N_bins):
        X_ica_ind = S_[:, j].reshape(-1, 1)
        kde = KernelDensity(bandwidth=scotts_b, kernel='gaussian').fit(X_ica_ind)
        samps = kde.sample(N_samp)
        X_ref_ica_unmixed[:, j] = samps.ravel()
        # accumulate per-bin log-likelihoods of the data under each
        # component KDE, following ICA_loglikes_samples_fitSA below
        loglike_ica += kde.score_samples(X_ica_ind)
    # apply the mixing matrix to undo the ICA transformation
    X_ref_ica = np.dot(X_ref_ica_unmixed, A_.T)
    return loglike_ica, X_ref_ica

def test_KernelDensity_sampling(n_samples=100, n_features=3):
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))

    bandwidth = 0.2

    for kernel in ["gaussian", "tophat"]:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == "tophat":
            assert np.all(dist < bandwidth)
        elif kernel == "gaussian":
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

def KDE_resample(x, y, N, bandwidth=0.5):
    """
    Resample features based on a kernel density approximation.

    Parameters
    ----------
    x : numpy.ndarray
        Feature array
    y : numpy.ndarray
        Label array
    N : int
        Total samples to simulate per class (original samples plus KDE draws)

    Returns
    -------
    newX : numpy.ndarray
        New feature array
    newy : numpy.ndarray
        New label array
    """
    uys = np.unique(y)
    newX = np.zeros((int(N * len(uys)), np.size(x, axis=1)))
    newy = np.zeros((int(N * len(uys)),))
    for i, uy in enumerate(uys):
        gind = np.where(y == uy)
        newX[i * N:i * N + len(gind[0]), :] = x[gind[0], :]
        newy[i * N:(i + 1) * N] = uy
        cx = x[gind[0], :]
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(cx)
        newX[i * N + len(gind[0]):(i + 1) * N] = kde.sample(n_samples=N - len(gind[0]))
    return newX, newy

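# Hypothetical usage of KDE_resample above on a toy two-class problem: each
# class block is topped up to N rows, the original points plus KDE draws.
# The toy data and N=50 are assumptions for illustration only.
import numpy as np
rng = np.random.RandomState(0)
x_toy = np.vstack([rng.randn(30, 2), rng.randn(20, 2) + 3.0])
y_toy = np.array([0] * 30 + [1] * 20)
newX, newy = KDE_resample(x_toy, y_toy, N=50, bandwidth=0.5)
assert newX.shape == (100, 2) and newy.shape == (100,)
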
def ICA_loglikes_samples_fitSA(arr):
    N_samp = np.shape(arr)[0]
    N_bins = np.shape(arr)[1]
    d = 1
    scotts_b = N_samp**(-1. / (d + 4))
    print('scotts_b = %.2f' % scotts_b)

    ica = FastICA(whiten=False, max_iter=1000, tol=1e-3)
    S_ = ica.fit_transform(arr)
    A_ = ica.mixing_
    #W_ica = ica.components_

    X_ref_ica_unmixed = np.zeros((N_samp, N_bins))
    loglike_ica = np.zeros(np.shape(arr)[0])
    for j in range(N_bins):
        X_ica_ind = S_[:, j].reshape(-1, 1)
        kde = KernelDensity(bandwidth=scotts_b, kernel='gaussian').fit(X_ica_ind)
        samps = kde.sample(N_samp)  # samples for the jth bin
        X_ref_ica_unmixed[:, j] = samps.ravel()
        log_dens = kde.score_samples(X_ica_ind)
        loglike_ica += log_dens
    # apply the mixing matrix to undo the ICA transformation;
    # the result should be akin to mock samples X^mock
    X_ref_ica = np.dot(X_ref_ica_unmixed, A_.T)
    return loglike_ica, X_ref_ica

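# Hedged usage sketch for ICA_loglikes_samples_fitSA: feed it an
# (N_samp, N_bins) array of summary statistics and get back per-realization
# log-likelihoods under the per-component KDEs plus one mock realization set.
# The toy array is an assumption; FastICA may emit a convergence warning on
# data this unstructured.
import numpy as np
arr_toy = np.random.RandomState(1).randn(256, 4)
loglike, X_mock = ICA_loglikes_samples_fitSA(arr_toy)
assert X_mock.shape == arr_toy.shape and loglike.shape == (256,)
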
def kde_estimator(X, y, random_state=None, kernel='gaussian'):
    n_classes = len(np.unique(y))
    lst_1 = [len(np.where(y == clase)[0]) for clase in range(n_classes)]
    lst_2 = [max(lst_1) - x for x in lst_1]
    X_res = np.array([]).reshape(0, X.shape[-1])
    y_res = np.array([])
    for i in range(n_classes):
        if lst_2[i] == 0:
            X_res = np.concatenate([
                X_res,
                X[np.where(y == i)],
            ])
            y_res = np.concatenate([
                y_res,
                y[np.where(y == i)],
            ])
        else:
            print("CLASS:", i)
            kde = KernelDensity(kernel=kernel, bandwidth=0.2).fit(X[np.where(y == i)])
            X_res = np.concatenate([
                X_res,
                X[np.where(y == i)],
                kde.sample(n_samples=lst_2[i], random_state=random_state),
            ])
            y_res = np.concatenate([
                y_res,
                y[np.where(y == i)],
                np.array([i for _ in range(lst_2[i])]),
            ])
    return X_res, y_res

class KDE():
    def __init__(self, **kwargs):
        """ Kernel Density Estimation (Parzen windows).
        """
        self.model = KernelDensity(**kwargs)
        self.columns = None
        self.indexes = None

    def fit(self, data, **kwargs):
        """ Train the generator with data.

            :param data: The training data.
        """
        self.columns = data.columns
        self.indexes = data.indexes
        self.model.fit(data, **kwargs)

    def sample(self, n=1, **kwargs):
        """ Sample from the trained KDE.

            :param n: Number of examples to sample.
        """
        if self.indexes is None:
            raise Exception('You first need to train the KDE before sampling. '
                            'Please use the fit method.')
        else:
            gen_data = self.model.sample(n, **kwargs)
            return autopandas.AutoData(gen_data, columns=self.columns,
                                       indexes=self.indexes)

def kde_fit_quantiles(rtquants, nsamples=1000, bw=.1):
    """ Takes quantile estimates and fits a cumulative density function;
    returns samples to pass to sns.kdeplot().
    """
    kdefit = KernelDensity(kernel='gaussian', bandwidth=bw).fit(rtquants)
    samples = kdefit.sample(n_samples=nsamples).flatten()
    return samples

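# Illustrative call, assuming quantile estimates of a response-time
# distribution; the input must be 2-D, hence the reshape.
import numpy as np
rtq = np.array([0.43, 0.51, 0.57, 0.63, 0.75]).reshape(-1, 1)
samples = kde_fit_quantiles(rtq, nsamples=500, bw=.1)
# samples can now be passed to sns.kdeplot(samples)
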
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan', 'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights have a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)

def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T
    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)
    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    # score() expects a 2-D array, hence atleast_2d for a single point
    percentile = np.sum(log_pdf < kde.score(np.atleast_2d(data_point))) / 10000.
    return percentile

def GetDensity(self, action='generate', samples=100, draws=None):
    """
    TODO: Check density calculations for multiple dimensions.

    Generate a density estimation of the positions at each time, or sample
    positions from the generated density at a specified time.

    Parameters
    ----------
    action : (string) - Options: 'generate', 'sample'.
        'generate' : Generate a density estimation using kernel density
            estimation and save it.
        'sample' : Generate a density estimation at the final time and both
            draw and return samples from it equal to the number of points in
            position.
    samples : (int) - The number of sample points in each dimension at which
        to measure the density. Total number of points is samples ** dimensions.
    draws : (int) - The number of points to draw from the density distribution;
        if None, draw a number of points equal to the number of points in
        Positions.

    Returns
    -------
    'generate'
        DensitySamples : (np.array) - An array of the positions of the points
            used to sample the density.
        Density : (np.array) - The value of the density evaluated at each
            point in DensitySamples.
    'sample'
        samples : (np.array) - An array of the samples drawn from the density
            generated from the positions at the final time.
    """
    if action == 'generate':
        # A list of sample arrays ranging from the min value to the max value
        # in each dimension.
        minmax = [np.linspace(np.amin(self.Positions[:, i, :]),
                              np.amax(self.Positions[:, i, :]), samples)
                  for i in range(self.Positions.shape[1])]
        self.DensitySamples = np.array(list(product(*minmax)))
        self.Density = np.zeros((self.DensitySamples.shape[0],
                                 self.Times.shape[0]))
        for i in range(self.Positions.shape[2]):
            bandwidth = 0.2 * np.mean(pdist(self.Positions[:, :, i]))
            KDE = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                                metric='euclidean')
            KDE.fit(self.Positions[:, :, i])
            self.Density[:, i] = np.exp(KDE.score_samples(self.DensitySamples))
    elif action == 'sample':
        bandwidth = min(pdist(self.Positions[-1, :][:, np.newaxis]))
        KDE = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                            metric='euclidean')
        KDE.fit(self.Positions[-1, :][:, np.newaxis])
        if draws is None:
            draws = self.Positions.shape[1]
        return KDE.sample(draws)

def generate_fit(x, y, n=100000, bandwidth=0.1, nbins=15, xmin=-1, xmax=2.5):
    data = np.vstack([x, y]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data)
    sample = kde.sample(n)
    x_fit, median, mean, std = calculate_median_profile(
        sample[:, 0], sample[:, 1], xmin=xmin, xmax=xmax, nbins=nbins)
    return sample, x_fit, median, mean, std

def downsample(self, X, n):
    # We've already fit()ted, but we're worried that our X is so large our
    # classifier will be too slow in practice. We can downsample by running
    # a kde on X and sampling from it (this will be slow, but happens only
    # once), and then using those points as the new X.
    if len(X) < n:
        return X
    kde = KernelDensity()
    kde.fit(X)
    return kde.sample(n)

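# The same downsampling idea as a standalone, self-contained sketch (an
# assumed helper, not part of the class above): fit a KDE on a large X and
# keep n synthetic points in its place.
import numpy as np
from sklearn.neighbors import KernelDensity

def kde_downsample(X, n, random_state=0):
    # nothing to do if X is already small enough
    if len(X) < n:
        return X
    kde = KernelDensity().fit(X)
    return kde.sample(n, random_state=random_state)
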
def pval_silverman(data, I='auto', N_bootstrap=1000, comm=MPI.COMM_WORLD):
    I = get_I(data, I)
    data = comm.bcast(data)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: is_unimodal_kde(
        h_crit,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data),
        I)
    smaller_equal_crit_bandwidth = bootstrap(resamp_fun, N_bootstrap,
                                             dtype=np.bool_, comm=comm)
    return np.mean(~smaller_equal_crit_bandwidth)

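# Why the 1/sqrt(1 + h**2/var) factor appears in the smoothed bootstrap above:
# a draw from a Gaussian KDE is a data point plus kernel noise, so its
# variance is roughly var + h**2; dividing by sqrt(1 + h**2/var) restores the
# variance of the original data. A quick numerical check on toy data:
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
x = rng.randn(500)
h = 0.5
kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(x.reshape(-1, 1))
draws = kde.sample(100000, random_state=0).ravel()
rescaled = draws / np.sqrt(1 + h**2 / np.var(x))
print(np.var(x), np.var(draws), np.var(rescaled))  # var(rescaled) ~ var(x)
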
def copula_generate(X):
    X_real = np.array(X)  # keep the original marginals for retrofitting
    print('X marginals to uniforms...')
    X = matrix_to_rank(X)
    print('X uniforms to inverse gaussian CDF...')
    X = rank_matrix_to_inverse(X)
    print('Gaussian kernel density estimation...')
    kernel = KernelDensity().fit(X)
    print('Generating artificial data by sampling from the KDE distribution...')
    X_artif = kernel.sample(X.shape[0])
    print('Marginal retrofitting...')
    return marginal_retrofit(X_artif, X_real)

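# The rank/inverse helpers used by the copula snippets are not shown in this
# collection. A minimal sketch of the standard trick with scipy follows; the
# _sketch names mark these as assumed implementations, not the originals:
# map each column to uniform ranks, then push through the inverse normal CDF.
import numpy as np
from scipy.stats import norm, rankdata

def matrix_to_rank_sketch(X):
    # ranks scaled into (0, 1) so norm.ppf stays finite
    n = X.shape[0]
    return np.column_stack([rankdata(X[:, j]) / (n + 1)
                            for j in range(X.shape[1])])

def rank_matrix_to_inverse_sketch(U):
    # uniform marginals -> standard normal marginals
    return norm.ppf(U)
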
class KDEDist(object):
    def __init__(self, bw, kernel='gaussian'):
        self._bw = bw
        self._kernel = kernel
        self._kd = KernelDensity(bandwidth=bw, kernel=kernel)
        self._samples = None

    @staticmethod
    def bw_range(x, n=3):
        max_pwr = 2
        h_opt = np.std(x) * (4. / (3. * len(x)))**0.2
        pwrs = np.concatenate([np.linspace(-max_pwr, 0, n + 1),
                               np.linspace(0, max_pwr, n + 1)[1:]])
        return h_opt * 2**pwrs

    @property
    def name(self):
        return 'KDE({}, {:.5f})'.format(self._kernel, self._bw)

    @property
    def samples(self):
        if self._samples is None:
            self._samples = self.rvs(100000)
        return self._samples

    def dist(self):
        return self

    def fit(self, x):
        self._kd.fit(np.reshape(x, (len(x), 1)))
        return self

    def logpdf(self, x):
        return self._kd.score_samples(np.reshape(x, (len(x), 1)))

    def rvs(self, n):
        return self._kd.sample(n).reshape(n)

    def stats(self, moments='mv'):
        out = []
        if 'm' in moments:
            out.append(np.array([np.mean(self.samples)]))
        if 'v' in moments:
            out.append(np.array([np.var(self.samples)]))
        if 's' in moments:
            out.append(np.array([skew(self.samples)]))
        if 'k' in moments:
            out.append(np.array([kurtosis(self.samples)]))
        return tuple(out)

    def ppf(self, q):
        return np.percentile(self.samples, q)

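# Hypothetical usage of KDEDist: pick a bandwidth from bw_range, fit on 1-D
# data, then treat the object like a frozen scipy distribution (rvs, logpdf,
# ppf). The toy data and the grid index are assumptions for illustration.
import numpy as np
x = np.random.RandomState(0).randn(200)
bw = KDEDist.bw_range(x)[3]   # middle entry of the 7-value candidate grid
dist = KDEDist(bw).fit(x)     # fit() returns self, so chaining works
draws = dist.rvs(1000)
median = dist.ppf(50)         # ppf takes percentile units (0-100) here
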
def generate_samples(X, size=100):
    '''
    Generate new samples from the same distribution as the original data.

    :param X: the original data
    :param size: number of new samples
    :return: data: newly sampled data
    '''
    # kernel density estimation (0.01: bandwidth of the kernel)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01)
    kde.fit(X)  # fit the kernel density model on the data
    data = kde.sample(size)  # generate new random samples from the model
    return kde, data

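# Example call, assuming X holds rows of feature vectors; generate_samples
# returns both the fitted estimator and the synthetic rows.
import numpy as np
X_toy = np.random.RandomState(0).randn(50, 3)
kde, synthetic = generate_samples(X_toy, size=200)
assert synthetic.shape == (200, 3)
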
def montecarlo():
    """
    Run a Monte Carlo simulation. Reads parameters from the Excel sheet
    Params and performs price simulations over one year. Finally, plots the
    simulation results and adds the plot to the sheet.
    :return: None
    """
    # Get the Excel work book
    wb = xw.Book.caller()
    # Get params
    ticker, start_date, end_date = get_params(book=wb, sheet_name="Params")
    # Get adj closes
    closes = get_adj_closes(ticker, start_date, end_date)
    # Calculate simple daily returns
    ret = closes.pct_change().dropna()
    # Estimate density with Gaussian kernels
    kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(ret)
    # Returns simulation
    n_days, n_sim = 252, 100000
    d_range = pd.date_range(start=closes.index[-1] + pd.Timedelta(days=1),
                            periods=n_days)
    ret_sim = pd.DataFrame(
        data=kde.sample(n_samples=n_days * n_sim).reshape((n_days, n_sim)),
        index=d_range)
    # To prices
    closes_sim = (closes.iloc[-1].values[0]) * (1 + ret_sim).cumprod()
    # Get 5% - 95% percentile bands
    band_5 = pd.DataFrame(data={'5% band': np.percentile(closes_sim, 5, axis=1)},
                          index=d_range)
    band_95 = pd.DataFrame(data={'95% band': np.percentile(closes_sim, 95, axis=1)},
                           index=d_range)
    # Plot past prices, bands and price scenarios
    fig = plt.figure(figsize=(6, 4))
    plt.plot(closes.iloc[-100:], label='Historical Adj Close')
    plt.plot(band_5, label='5% Percentile Band')
    plt.plot(band_95, label='95% Percentile Band')
    plt.plot(closes_sim.sample(10, axis=1), label='Price Scenarios')
    plt.xlabel('Time')
    plt.ylabel('Price')
    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
    # Add the plot to the sheet
    sheet = wb.sheets['Params']
    sheet.pictures.add(fig, name='Montecarlo Simulation', update=True,
                       left=sheet.range('B9').left, top=sheet.range('B9').top)

def get_numerical_signature(values, S):
    '''
    Learns a distribution of the values, then generates a sample of size S.
    '''
    # Transform data to a numpy column vector
    Xnumpy = np.asarray(values)
    X = Xnumpy.reshape(-1, 1)
    # Learn kernel
    kde = KernelDensity(kernel=C.kd["kernel"],
                        bandwidth=C.kd["bandwidth"]).fit(X)
    # Draw all S samples in one call rather than S one-sample calls
    sig_v = kde.sample(S).ravel().tolist()
    return sig_v

class colorKDE(object):
    def __init__(self, data=np.array([])):
        self.data = data

    def runKDE(self, bandwidth=0.2, use_opt=False):
        '''
        Generate the KDE and run with the given bandwidth.
        If use_opt is specified, runCVSearch must have been run already.
        '''
        if use_opt:
            self.kde = KernelDensity(bandwidth=self.optimal_bandwidth)
        else:
            self.kde = KernelDensity(bandwidth=bandwidth)
        self.kde.fit(self.data)

    def runCVSearch(self, search_range=np.linspace(0.01, 1.0, 50), folds=20):
        self.grid = GridSearchCV(KernelDensity(), {'bandwidth': search_range},
                                 cv=folds)
        self.grid.fit(self.data)
        self.optimal_bandwidth = self.grid.best_params_['bandwidth']
        print('Optimal bandwidth: ' + str(self.optimal_bandwidth))

    def score_samples(self, x):
        '''
        Replicate score_samples functionality so both saves can be treated
        the same.
        '''
        return self.kde.score_samples(x)

    def sample(self, n_samples):
        '''
        Replicate sample functionality so both saves can be treated the same.
        '''
        return self.kde.sample(n_samples=n_samples)

    def save(self, filename, full=True):
        '''
        Save the current state of the object.
        If full is False, only save self.kde.
        '''
        if full:
            # save the entire object, including data
            pickle.dump(self, open(filename, 'wb'), protocol=-1)
        else:
            # only save the .kde object
            pickle.dump(self.kde, open(filename, 'wb'), protocol=-1)

def generate_data_ae(generator, encoder, org_data, n):
    encoded = encoder.predict(org_data)
    # fit a KDE in the latent space and draw n latent vectors
    kde = KernelDensity().fit(encoded)
    input_data = kde.sample(n)
    # decode the latent samples back to the data space
    generated_data = generator.predict(input_data)
    new_data = pd.DataFrame(data=generated_data, columns=list(org_data.columns))
    return new_data

class XSampleFMBW(XSampleBW):
    def __init__(self, N, comm=MPI.COMM_SELF):
        self.comm = comm
        self.rank = self.comm.Get_rank()
        self.I = (-1.5, a + 1)  # CHECK: Is appropriate bound? OK.
        self.lamtol = 0
        self.mtol = mtol
        self.N = N
        if self.rank == 0:
            N1 = binom.rvs(N, 2.0 / 3)
            #print "N1 = {}".format(N1)
            N2 = N - N1
            data = np.hstack([np.random.randn(N1), np.random.randn(N2) + a])
        else:
            data = None
        data = self.comm.bcast(data)
        self.data = data
        self.var = np.var(data)
        self.h_crit = fisher_marron_critical_bandwidth(data, self.lamtol,
                                                       self.mtol, self.I)
        #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
        self.kde_h_crit = KernelDensity(
            kernel='gaussian', bandwidth=self.h_crit).fit(data.reshape(-1, 1))

    def is_unimodal_resample(self, lambda_val):
        data = self.kde_h_crit.sample(self.N).reshape(-1) / np.sqrt(
            1 + self.h_crit**2 / self.var)
        #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
        return is_unimodal_kde_fm(self.h_crit * lambda_val, data,
                                  self.lamtol, self.mtol, self.I)

    def probability_of_unimodal_above(self, lambda_val, gamma):
        '''
            G_n(lambda) = P(h_crit^* / h_crit <= lambda)
                        = P(h_crit^* <= lambda * h_crit)
                        = P(KDE(X^*, lambda * h_crit) is unimodal)
        '''
        # print "bootstrapping 1000 samples at rank {}:".format(self.rank)
        # smaller_equal_crit_bandwidth = bootstrap(
        #     lambda: self.is_unimodal_resample(lambda_val), 1000, dtype=np.bool_)
        # pval = np.mean(~smaller_equal_crit_bandwidth)
        # print "result at rank {}: pval = {}".format(self.rank, pval)+"\n"+"-"*20
        return probability_above(lambda: self.is_unimodal_resample(lambda_val),
                                 gamma, max_samp=20000, comm=self.comm, batch=20)

class ManoDatasetC(Dataset):
    def __init__(self, base_path, transform, train_indices):
        self.transform = transform
        mano_path = os.path.join(base_path, '%s_mano.json' % 'training')
        mano_list = json_load(mano_path)
        mano_array = np.array(mano_list).squeeze(1)
        mano_poses = mano_array[..., :51]
        mano_poses = mano_poses[train_indices]
        self.kde = KernelDensity(bandwidth=0.15, kernel='gaussian')
        self.kde.fit(mano_poses)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mano_layer = ManoLayer(mano_root='mano/models', use_pca=False,
                                    ncomps=45, flat_hand_mean=False)
        self.mano_layer.to(self.device)

    def __len__(self):
        return 32560

    def __getitem__(self, idx):
        sample = self.kde.sample()
        pose = sample[..., :48]
        shape_start = sample[..., 48:]
        shape = np.ones([1, 10])
        shape[..., :3] = shape_start
        x = {'p': pose, 's': shape}
        x = self.transform(x)
        hand_verts, hand_joints = self.mano_layer(x['p'], x['s'])
        batch_size = hand_joints.shape[0]
        hand_joints = hand_joints.reshape([batch_size, 63])
        sample = {
            'hand_joints': torch.squeeze(hand_joints),
            'hand_verts': torch.squeeze(hand_verts),
            'poses': torch.squeeze(x['p']),
            'shapes': torch.squeeze(x['s']),
        }
        return sample

result, stats = get_standart_deviation(delta, PHAT_targets_valid[:, 0],
                                       method="full")
print(result)

full_set = np.hstack((PHAT_features_train, PHAT_targets_train))

# bring all magnitudes to redshift range
rescaled_set = np.copy(full_set)
rescaled_set[:, 0:-1] = rescaled_set[:, 0:-1]  # *feature_av
rescaled_set[:, -1] = rescaled_set[:, -1]

# Draw a sample set every time
kde = KernelDensity(bandwidth=0.001)
kde.fit(rescaled_set)

for i in range(500, 9000, 2000):
    aug_data = kde.sample(i)
    # aug_data = np.vstack((aug_data, full_set))

    # initialize predictor
    tree_para = {"min_samples_leaf": 5}
    clf = AdaBoostRegressor(DecisionTreeRegressor(**tree_para),
                            loss="exponential", n_estimators=20)
    # fit predictor
    clf.fit(aug_data[:, 0:-1], aug_data[:, -1])
    predicted_aug = clf.predict(PHAT_features_valid)

    # collect stats
    delta_aug = predicted_aug - PHAT_targets_valid[:, 0]
    feature_imp_aug = clf.feature_importances_
    result_aug, stats_aug = get_standart_deviation(delta_aug,
                                                   PHAT_targets_valid[:, 0],
                                                   method="full")

def sklearn_log_density(sample_points, evaluation_points):
    """
    Estimate the log probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation
    points.

    *sample_points* is an [n x m] matrix.

    *evaluation_points* is the set of points at which to evaluate the kde.

    Note: if any dimension has all points equal then the entire distribution
    is treated as a dirac distribution with infinite density at each point.
    This makes the entropy calculation better behaved (narrowing the
    distribution increases the entropy) but is not so useful in other
    contexts. Other packages will (correctly) ignore dimensions of width zero.
    """
    # Ugly hack warning: if *evaluation_points* is an integer, then sample
    # that many points from the kde and return the log density at each
    # sampled point. Since the code that uses this is looking only at
    # the mean log density, it doesn't need the sample points themselves.
    # This interface should be considered internal to the entropy module
    # and not used by outside functions. If you need it externally, then
    # restructure the api so that the function always returns both the
    # points and the density, as well as any other function (such as the
    # density function and the sister function scipy_stats_density) so
    # that all share the new interface.
    from sklearn.neighbors import KernelDensity

    # Standardize data so we can use spherical kernels and uniform bandwidth
    data, mu, sigma = standardize(sample_points)
    # Note that sigma will be zero for dimensions w_o where all points are equal.
    # With P(w) = P(w, w_o) / P(w_o | w) and P(w_o) = 1 for all points in
    # the set, then P(w) = P(w, w_o) and we can ignore the zero dimensions.
    # However, as another ugly hack, we want the differential entropy to go
    # to -inf as the distribution narrows, so pretend that P = 0 everywhere.
    # Uncomment the following line to return the sample probability instead.
    ## sigma[sigma == 0.] = 1.

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    kde.fit(data)

    if isinstance(evaluation_points, int):
        # Sample the requested number of points; generated points already
        # follow the sample distribution.
        points = kde.sample(evaluation_points)
    elif evaluation_points is not None:
        # Standardize evaluation points to match the sample distribution.
        # Note: for dimensions where all sample points are equal, sigma
        # has been artificially set equal to one. This means that the
        # evaluation points which do not match the sample value will
        # use the simple differences for the z-score rather than
        # pushing them out to plus/minus infinity.
        points = (evaluation_points - mu) / (sigma + (sigma == 0.))
    else:
        points = sample_points

    # Evaluate pdf, scaling the resulting density by sigma to correct the area.
    # If sigma is zero, return entropy as -inf; this seems to not be the
    # case for discrete distributions (consider Bernoulli with p=1, q=0,
    # => H = -p log p - q log q = 0), so need to do something else, both
    # for the kde and for the entropy calculation.
    with np.errstate(divide='ignore'):
        log_pdf = kde.score_samples(points) - np.sum(np.log(sigma))

    return log_pdf

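# Hedged usage sketch for sklearn_log_density, assuming the standardize
# helper it relies on is importable from the same entropy module: for a 2-D
# standard normal sample, the negative mean log density should sit near the
# true differential entropy, (d/2) * log(2*pi*e) ~ 2.84 nats for d = 2.
import numpy as np
points = np.random.RandomState(0).randn(1000, 2)
log_pdf = sklearn_log_density(points, evaluation_points=500)
print(-np.mean(log_pdf))  # roughly the 2-D Gaussian entropy, ~2.84 nats
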