def obj(y0):
    """Return the negated conditional density estimate at ``y0``.

    The joint density of (endog, exog, external) is evaluated at ``y0``
    stacked with ``x0z0`` and divided by the density of (exog, external)
    evaluated at ``x0z0``.  Both estimators are rebuilt from the arrays held
    on ``self`` on every call; ``self`` and ``x0z0`` are taken from the
    enclosing scope.

    NOTE(review): the sign flip presumably lets a minimiser search for the
    conditional mode -- confirm against the caller.
    """
    point = np.hstack([y0, x0z0])

    joint_kde = kernel_density.KDEMultivariate(
        data=np.hstack([self.endog, self.exog, self.external]),
        var_type=self._endogtype + self._exogtype + self._externaltype,
        bw='normal_reference',
    )
    marginal_kde = kernel_density.KDEMultivariate(
        data=np.hstack([self.exog, self.external]),
        var_type=self._exogtype + self._externaltype,
        bw='normal_reference',
    )

    joint_density = joint_kde.pdf(point)
    marginal_density = marginal_kde.pdf(x0z0)
    return -joint_density / marginal_density
def calc_next_params(domain, trials):
    """Propose the next parameter set to evaluate.

    While ``trials`` holds at most ``DEFAULT_N_MIN + 2`` entries the proposal
    is a random draw from ``domain``.  After that, the recorded trials are
    ordered by ascending ``train_y`` and split into the leading ("l") and
    trailing ("g") groups; a kernel density estimate is fitted to each and
    the proposal is whatever ``minimize`` returns for ``objective_function``
    given those two estimators.

    :param domain: search space; must provide ``random()``, ``fieldnames``,
        ``n_params`` and ``bounds``.
    :param trials: evaluated trials; must support ``len()`` and
        ``get_train_data()`` returning ``(train_x, train_y)``.
    :returns: mapping of each field name in ``domain.fieldnames`` to its
        proposed value.
    :rtype: dict
    """
    gamma = DEFAULT_GAMMA
    bw = DEFAULT_BW
    n_min = DEFAULT_N_MIN
    bw_weight = DEFAULT_BW_WEIGHT
    sampling_num = DEFAULT_SAMPLING_NUM

    n = len(trials)
    if n <= n_min + 2:
        # Too few observations to fit the density estimates: sample randomly.
        random_result = domain.random()
        return {
            fieldname: random_result[index]
            for index, fieldname in enumerate(domain.fieldnames)
        }

    train_x, train_y = trials.get_train_data()
    idx = np.argsort(train_y)
    # Each group keeps at least n_min rows, so the two groups may overlap
    # while the number of trials is still small.
    l_len = max(n_min, int(n * gamma))
    g_len = max(n_min, n - l_len)
    x_l = train_x[idx[:l_len], :]
    x_g = train_x[idx[-g_len:], :]

    # TODO: derive the per-parameter variable types from ``domain`` (for
    # example "ccccuuuuuooooo") instead of treating everything as continuous.
    # ref. http://www.statsmodels.org/dev/generated/
    #      statsmodels.nonparametric.kernel_density.KDEMultivariate.html
    v_types = 'c' * domain.n_params
    l_est = kde.KDEMultivariate(x_l, v_types, bw=bw)
    g_est = kde.KDEMultivariate(x_g, v_types, bw=bw)

    # Widen the fitted bandwidths and keep every component at or above a
    # small positive floor (vectorised; no per-element Python loop needed).
    wide_bw = np.maximum(l_est.bw * bw_weight, 1e-3 * bw_weight)

    minimize_result = minimize(
        fun=objective_function,
        x=x_l,
        sampling_num=sampling_num,
        bw=wide_bw,
        bounds=domain.bounds,
        args=(l_est, g_est),
    )
    return {
        fieldname: minimize_result[index]
        for index, fieldname in enumerate(domain.fieldnames)
    }
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    """Return the log density of ``targets``, optionally given ``constraints``.

    With no constraints an unconditional ``KDEMultivariate`` is fitted to the
    target columns; otherwise a ``KDEMultivariateConditional`` is fitted with
    the target columns as dependent and the constraint columns as
    independent variables.

    :param rowid: row identifier, forwarded to ``populate_constraints``.
    :param targets: dict mapping output variable to the value to evaluate.
    :param constraints: optional dict mapping output variable to the value
        conditioned on.
    :param inputs: must be empty or ``None``.
    :returns: ``np.log`` of the estimated (conditional) density.
    :raises ValueError: if there are no observations, ``inputs`` is
        non-empty, ``targets`` is empty, contains NaN or unknown variables,
        or shares a variable with ``constraints``.
    """
    if self.N == 0:
        raise ValueError('KDE requires at least one observation.')
    constraints = self.populate_constraints(rowid, targets, constraints)
    if inputs:
        raise ValueError('Prohibited inputs: %s' % (inputs, ))
    if not targets:
        raise ValueError('No targets: %s' % (targets, ))
    if any(np.isnan(v) for v in targets.values()):
        raise ValueError('Invalid nan values in targets: %s' % (targets, ))
    if any(q not in self.outputs for q in targets):
        raise ValueError('Unknown targets: %s' % (targets, ))
    if any(q in constraints for q in targets):
        raise ValueError('Duplicate variable: %s, %s' % (
            targets, constraints, ))

    # Materialise the dict views as lists: on Python 3 ``dict.keys()``
    # results cannot be joined with ``+`` and ``dict.values()`` is a view
    # rather than a sequence.  On Python 2 this is a harmless copy.
    target_values = list(targets.values())
    if not constraints:
        model = kernel_density.KDEMultivariate(
            self._dataset(targets),
            self._stattypes(targets),
            bw=self._bw(targets),
        )
        pdf = model.pdf(target_values)
    else:
        n_targets = len(targets)
        # Target columns come first, constraint columns after them.
        full_members = self._dataset(
            list(targets.keys()) + list(constraints.keys()))
        model = kernel_density.KDEMultivariateConditional(
            full_members[:, :n_targets],
            full_members[:, n_targets:],
            self._stattypes(targets),
            self._stattypes(constraints),
            bw=np.concatenate((self._bw(targets), self._bw(constraints))),
        )
        pdf = model.pdf(target_values, list(constraints.values()))
    return np.log(pdf)
def transition(self, N=None):
    """Re-learn the kernel bandwidths from the current observations.

    Does nothing unless ``self.N`` is positive.  Otherwise a
    ``KDEMultivariate`` is fitted to the output columns with the ``'cv_ml'``
    bandwidth selector and the fitted bandwidths are stored on ``self.bw``
    as a plain list.

    ``N`` is accepted but not used.
    """
    if not self.N > 0:
        return
    estimator = kernel_density.KDEMultivariate(
        self._dataset(self.outputs),
        self._stattypes(self.outputs),
        bw='cv_ml',
    )
    self.bw = estimator.bw.tolist()
def execute(self):
    """Execute the link.

    Reads the column-index lists and the NaN-free data from the data store,
    assembles a single array laid out as unordered categorical, ordered
    categorical, then continuous columns, and writes the result to
    ``self.store_key``: the fitted kernel bandwidths, or the output of
    ``ut.kde_only_unordered_categorical`` when there are neither continuous
    nor ordered categorical columns.

    :returns: status code of execution
    :rtype: StatusCode
    """
    self.logger.debug('Now executing link: {link}.', link=self.name)

    ds = process_manager.service(DataStore)
    unordered_idx = ds['unordered_categorical_i']
    ordered_idx = ds['ordered_categorical_i']
    continuous_idx = ds['continuous_i']
    data = ds[self.data_no_nans_read_key]
    n_rows = len(data)

    # Concatenation needs every piece to have n_rows rows, so an empty
    # selection is replaced by an explicit (n_rows, 0) array.
    unordered_part = data[:, unordered_idx]
    if unordered_part.size == 0:
        unordered_part = np.empty(shape=(n_rows, 0))
    ordered_part = data[:, ordered_idx]
    if ordered_part.size == 0:
        ordered_part = np.empty(shape=(n_rows, 0))

    if self.do_pca:
        # The PCA output is used as-is (no empty-array substitution).
        continuous_part = ds[self.data_normalized_pca_read_key]
    else:
        continuous_part = ds[self.data_normalized_read_key]
        if continuous_part.size == 0:
            continuous_part = np.empty(shape=(n_rows, 0))

    d = np.concatenate((unordered_part, ordered_part, continuous_part), axis=1)

    var_type = (
        'u' * len(unordered_idx)
        + 'o' * len(ordered_idx)
        + 'c' * len(continuous_idx)
    )

    # NB: statsmodels uses normal reference for unordered categorical
    # variables as well!
    # NB: the bandwidths are determined on the normalized continuous data and
    # on the original categorical data.
    if len(continuous_idx) == 0 and len(ordered_idx) == 0:
        ds[self.store_key] = ut.kde_only_unordered_categorical(d)
        return StatusCode.Success

    estimator = kernel_density.KDEMultivariate(
        d, var_type=var_type, bw='normal_reference')
    ds[self.store_key] = estimator.bw
    return StatusCode.Success
def kde_statsmodels_m(x, x_grid, **kwargs):
    """Evaluate the CDF of a ``KDEMultivariate`` fit of ``x`` on ``x_grid``.

    The estimator uses the ``'normal_reference'`` bandwidth rule and a fixed
    ``var_type`` of ``'c'``.  Extra keyword arguments are accepted but
    ignored.

    NOTE(review): despite the "multivariate" name, the hard-coded ``'c'``
    appears to declare a single continuous variable -- confirm that callers
    only pass one-dimensional data.
    """
    estimator = kde.KDEMultivariate(x, var_type='c', bw='normal_reference')
    return estimator.cdf(x_grid)
def _plot_scaled(ax, density, label):
    """Plot ``density`` rescaled to a unit maximum and return the rescaled array.

    The values are reshaped to ``x.shape`` before plotting, and the title
    reports the l2 norm of the difference between ``p`` and the rescaled
    estimate.
    """
    scaled = density.reshape(x.shape) / density.max()
    # Raw string: the backslash in the LaTeX ``\ell`` is not a valid Python
    # escape sequence and triggers a warning in a plain string literal.
    title = r'%s ($\ell_2$ norm: %.3f)' % (
        label, np.linalg.norm((p - scaled).flat))
    plot(ax, scaled, title)
    return scaled


# KPDF Gaussian estimates at the remaining bandwidth choices.
w = _plot_scaled(
    axes[3], KPDF.MPDFGaussian(rvs, grid, bw_kpdf / 2), 'KPDF bw:kpdf/2')
w = _plot_scaled(
    axes[4], KPDF.MPDFGaussian(rvs, grid, bw_scott), 'KPDF bw:scott')
w = _plot_scaled(
    axes[5], KPDF.MPDFGaussian(rvs, grid, bw_scott / 2), 'KPDF bw:scott/2')

# statsmodels estimates with the 'cv_ml' and 'cv_ls' bandwidth selectors.
dens = smkde.KDEMultivariate(rvs, 'cc', bw='cv_ml')
print("SM bandwidth (cv_ml): " + repr(dens.bw))
w = _plot_scaled(axes[6], dens.pdf(grid), 'SM bw:CVML')

dens = smkde.KDEMultivariate(rvs, 'cc', bw='cv_ls')
print("SM bandwidth (cv_ls): " + repr(dens.bw))
w = _plot_scaled(axes[7], dens.pdf(grid), 'SM bw:CVLS')

plt.savefig('fig3.png')
plt.show()
def stats_kde(x, **kwargs):
    """Fit a KDE to ``x`` and evaluate its CDF and PDF on a unit-step grid.

    The grid is ``np.arange`` from the NaN-ignoring minimum of ``x`` up to
    (but excluding) its NaN-ignoring maximum.  The estimator uses the
    ``'normal_reference'`` bandwidth rule with ``var_type='c'``.  Extra
    keyword arguments are accepted but ignored.

    :returns: tuple ``(grid, cdf_on_grid, pdf_on_grid)``.
    """
    lower = np.nanmin(x)
    upper = np.nanmax(x)
    grid = np.arange(lower, upper)

    estimator = kde.KDEMultivariate(x, var_type='c', bw='normal_reference')
    cdf_values = estimator.cdf(grid)
    pdf_values = estimator.pdf(grid)
    return grid, cdf_values, pdf_values
print('\n')

#%%

# scikit-learn does not cover this case, so a multivariate alternative is
# needed: statsmodels' KDEMultivariate.
#
# https://www.statsmodels.org/stable/generated/
# statsmodels.nonparametric.kernel_density.KDEMultivariate.html
import statsmodels.nonparametric.kernel_density as statmKDE

# Longitude and latitude values of the asteroid
AST_LONG_LAT = ast_2020_jx1_df[['ECLIP_LONG_RAD', 'ECLIP_LAT_RAD']].values

# 2D multivariate KDE: both variables continuous, bandwidths from the
# normal-reference rule
DENS_MODEL = statmKDE.KDEMultivariate(
    data=AST_LONG_LAT,
    var_type='cc',
    bw='normal_reference',
)

#%%

# Report the bandwidth found for each variable
print(f'Bandwidth longitude in radians (normal ref.): {DENS_MODEL.bw[0]}')
print(f'Bandwidth latitude in radians (normal ref.): {DENS_MODEL.bw[1]}')
print('\n')

#%%

# Do other bandwidth-selection methods give different results?  Refit with
# the 'cv_ml' selector for comparison.
DENS_MODEL_TEMP = statmKDE.KDEMultivariate(
    data=AST_LONG_LAT,
    var_type='cc',
    bw='cv_ml',
)