def _demo_kde():
    """Plot kernel density estimates of a seeded two-mode Gaussian sample.

    Draws 100 points (30% around 0, 70% around 5), fits a single-class
    KernelDensityEstimate for each of three kernels with a rule-of-thumb
    bandwidth, and plots the estimates over the true mixture density.

    Return:
        int: always 0
    """
    n = 100
    np.random.seed(1)

    # Dummy data: the two modes are drawn one after the other and stacked
    # into a single column vector.
    low_mode = np.random.normal(0, 1, int(0.3 * n))
    high_mode = np.random.normal(5, 1, int(0.7 * n))
    x = np.concatenate((low_mode, high_mode))[:, np.newaxis]
    # Single-class case, so every sample is labelled 0.
    y = np.zeros(x.shape)

    # Part of the domain of the random variable on which densities are drawn.
    x_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    grid = x_plot[:, 0]

    # Density of the mixture the dummy data was sampled from.
    true_dens = 0.3 * norm(0, 1).pdf(grid) + 0.7 * norm(5, 1).pdf(grid)

    fig, ax = plt.subplots()
    ax.fill(grid, true_dens, fc='black', alpha=0.2,
            label='input distribution')

    # Rule of thumb for bandwidth selection.
    spread = min(np.std(x), iqr(x) / 1.34)
    bandwidth = 1.06 * spread * np.power(x.shape[0], -0.2)

    # Fit one estimator per kernel and overlay the resulting densities.
    for kernel in ('gaussian', 'tophat', 'epanechnikov'):
        kde = KernelDensityEstimate(kernel=kernel, bandwidth=bandwidth,
                                    num_cls=1)
        kde.fit(x, y)
        density = np.exp(kde.list_den_est[0].score_samples(x_plot))
        ax.plot(grid, density, '-', label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(n))
    ax.legend(loc='upper left')

    # Show the raw samples as jittered ticks just below the x axis.
    jitter = -0.005 - 0.01 * np.random.random(x.shape[0])
    ax.plot(x[:, 0], jitter, '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
    print('KDE Flows!')

    return 0
def train_pca_rda_kde_model(x, y, k_folds=10):
    """Train the channel-wise PCA -> RDA -> KDE likelihood model.

    The RDA parameters (lam, gam) are taken from ``cross_validation``, the
    PCA/RDA pipeline is then fit on all of the data, and a kernel density
    estimate fit on the cross-validated scores is appended as the last
    pipeline element.

    Args:
        x(ndarray[float]): C x N x k data array
        y(ndarray[int]): N x 1 observation (class) array
            N is number of samples
            k is dimensionality of features
            C is number of channels
        k_folds(int): Number of cross validation folds; used for the
            parameter search and for both AUC evaluations

    Return:
        tuple: ``(model, auc_cv)`` where ``model`` is the trained pipeline
            and ``auc_cv`` is the negated first value returned by
            ``cost_cross_validation_auc`` after lam/gam were assigned
    """
    # Pipeline is the model. It can be populated manually
    rda = RegularizedDiscriminantAnalysis()
    pca = ChannelWisePrincipalComponentAnalysis(var_tol=.1 ** 5,
                                                num_ch=x.shape[0])
    model = Pipeline()
    model.add(pca)
    model.add(rda)

    # Cross validate
    arg_cv = cross_validation(x, y, model=model, k_folds=k_folds)

    # Get the AUC before lam/gam are assigned on the RDA element. The scores
    # of this first pass are not needed, only the cost.
    tmp, _, _ = cost_cross_validation_auc(model, 1, x, y, arg_cv,
                                          k_folds=k_folds, split='uniform')
    auc_init = -tmp

    # Apply the cross validated parameters to the RDA element
    lam = arg_cv[0]
    gam = arg_cv[1]
    log.debug('Optimized val [lam:{}, gam:{}]'.format(lam, gam))
    model.pipeline[1].lam = lam
    model.pipeline[1].gam = gam
    tmp, sc_cv, y_cv = cost_cross_validation_auc(model, 1, x, y, arg_cv,
                                                 k_folds=k_folds,
                                                 split='uniform')
    auc_cv = -tmp

    # After finding cross validation scores do one more round to learn the
    # final RDA model
    model.fit(x, y)

    # Insert the density estimates to the model and train using the cross
    # validated scores to avoid over fitting. Observe that these scores are
    # not obtained using the final model.
    # NOTE(review): per the docstring x is C x N x k, so x.shape[0] is the
    # channel count, while the rule of thumb is usually stated with the
    # number of samples. Left unchanged; confirm this is intended.
    bandwidth = 1.06 * min(
        np.std(sc_cv), iqr(sc_cv) / 1.34) * np.power(x.shape[0], -0.2)
    model.add(KernelDensityEstimate(bandwidth=bandwidth))
    model.pipeline[-1].fit(sc_cv, y_cv)

    # Report AUC
    log.debug('AUC-i: {}, AUC-cv: {}'.format(auc_init, auc_cv))

    return model, auc_cv
def _demo_validate_data():
    """Train the PCA/RDA/KDE model on random data and plot train/test scores.

    Both classes are drawn from the same standard normal distribution.
    The figure shows the jittered training scores with the model's density
    estimates, then the scores of a second random set together with a KDE
    fit on those scores.
    """
    dim_x = 75
    num_x_p = 500
    num_x_n = 500
    num_ch = 20

    # Training set. The negative class is sized by num_x_n so that data and
    # labels stay aligned if the two class sizes are ever made different.
    x_p_train = np.asarray(
        [np.random.randn(num_x_p, dim_x) for _ in range(num_ch)])
    x_n_train = np.array(
        [np.random.randn(num_x_n, dim_x) for _ in range(num_ch)])
    y_p_train = [1] * num_x_p
    y_n_train = [0] * num_x_n

    x_train = np.concatenate((x_n_train, x_p_train), axis=1)
    y_train = np.concatenate((y_n_train, y_p_train), axis=0)

    # Shuffle samples (axis 1 of the data) together with their labels
    permutation = np.random.permutation(x_train.shape[1])
    x_train = x_train[:, permutation, :]
    y_train = y_train[permutation]

    model, _ = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]),
                         np.max(model.line_el[-1]), 1000)[:, np.newaxis]

    # Jittered training scores per class, drawn just below the x axis
    train_scores = model.line_el[2]
    neg_scores = train_scores[y_train == 0]
    pos_scores = train_scores[y_train == 1]
    ax.plot(neg_scores,
            -0.005 - 0.01 * np.random.random(neg_scores.shape[0]),
            'ro', label='class(-)')
    ax.plot(pos_scores,
            -0.005 - 0.01 * np.random.random(pos_scores.shape[0]),
            'go', label='class(+)')

    # Density estimates held by the model's KDE element
    for idx, den_est in enumerate(model.pipeline[2].list_den_est):
        log_dens = den_est.score_samples(x_plot)
        ax.plot(x_plot[:, 0], np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1), linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    x_p_test = np.asarray(
        [np.random.randn(num_x_p, dim_x) for _ in range(num_ch)])
    x_n_test = np.array(
        [np.random.randn(num_x_n, dim_x) for _ in range(num_ch)])
    y_p_test = [1] * num_x_p
    y_n_test = [0] * num_x_n

    x_test = np.concatenate((x_n_test, x_p_test), axis=1)
    y_test = np.concatenate((y_n_test, y_p_test), axis=0)

    permutation = np.random.permutation(x_test.shape[1])
    x_test = x_test[:, permutation, :]
    y_test = y_test[permutation]

    # The return value is not used; the test scores are read back from
    # model.line_el[2] below.
    model.transform(x_test)

    test_scores = model.line_el[2]
    neg_scores = test_scores[y_test == 0]
    pos_scores = test_scores[y_test == 1]
    ax.plot(neg_scores,
            -0.01 - 0.01 * np.random.random(neg_scores.shape[0]),
            'bo', label='t_class(-)')
    ax.plot(pos_scores,
            -0.01 - 0.01 * np.random.random(pos_scores.shape[0]),
            'ko', label='t_class(+)')

    # Rule-of-thumb bandwidth computed from the test scores themselves
    bandwidth = 1.06 * min(np.std(test_scores),
                           iqr(test_scores) / 1.34) * np.power(
        test_scores.shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(test_scores, y_test)

    # Iterate the test KDE's own estimators rather than the model's
    for idx, den_est in enumerate(test_kde.list_den_est):
        log_dens = den_est.score_samples(x_plot)
        ax.plot(x_plot[:, 0], np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1), linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    plt.show()
def _demo_validate_real_data():
    """Train the PCA/RDA/KDE model on recorded data and plot train/test scores.

    A data folder is obtained from ``load_experimental_data`` twice, once for
    training and once for testing. Each folder's ``rawdata.csv`` and
    ``triggers.txt`` are read, processed with ``sig_pro`` and reshaped into
    trials. The figure shows the training scores with the model's density
    estimates, then the test scores with a KDE fit on those scores.
    """
    ds_rate = 2
    channel_map = [1] * 16 + [0, 0, 1, 1, 0, 1, 1, 1, 0]
    mode = 'calibration'

    data_train_folder = load_experimental_data()

    raw_dat, _, _, _, fs = read_data_csv(
        data_train_folder + '/rawdata.csv')
    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    _, t_t_i, t_i = trigger_decoder(
        mode=mode, trigger_loc=data_train_folder + '/triggers.txt')
    x_train, y_train, _, _ = trial_reshaper(t_t_i, t_i, dat, mode=mode,
                                            fs=fs, k=ds_rate,
                                            channel_map=channel_map)

    # train_pca_rda_kde_model returns (model, auc_cv); only the model is
    # needed here.
    model, _ = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]),
                         np.max(model.line_el[-1]), 1000)[:, np.newaxis]

    # Jittered training scores per class, drawn just below the x axis
    train_scores = model.line_el[2]
    neg_scores = train_scores[y_train == 0]
    pos_scores = train_scores[y_train == 1]
    ax.plot(neg_scores,
            -0.005 - 0.01 * np.random.random(neg_scores.shape[0]),
            'ro', label='class(-)')
    ax.plot(pos_scores,
            -0.005 - 0.01 * np.random.random(pos_scores.shape[0]),
            'go', label='class(+)')

    # Density estimates held by the model's KDE element
    for idx, den_est in enumerate(model.pipeline[2].list_den_est):
        log_dens = den_est.score_samples(x_plot)
        ax.plot(x_plot[:, 0], np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1), linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    data_test_folder = load_experimental_data()

    raw_dat, _, _, _, fs = read_data_csv(
        data_test_folder + '/rawdata.csv')
    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    _, t_t_i, t_i = trigger_decoder(
        mode=mode, trigger_loc=data_test_folder + '/triggers.txt')
    x_test, y_test, _, _ = trial_reshaper(t_t_i, t_i, dat, mode=mode,
                                          fs=fs, k=ds_rate,
                                          channel_map=channel_map)

    # The return value is not used; the test scores are read back from
    # model.line_el[2] below.
    model.transform(x_test)

    test_scores = model.line_el[2]
    neg_scores = test_scores[y_test == 0]
    pos_scores = test_scores[y_test == 1]
    ax.plot(neg_scores,
            -0.01 - 0.01 * np.random.random(neg_scores.shape[0]),
            'bo', label='t_class(-)')
    ax.plot(pos_scores,
            -0.01 - 0.01 * np.random.random(pos_scores.shape[0]),
            'ko', label='t_class(+)')

    # Rule-of-thumb bandwidth computed from the test scores themselves
    bandwidth = 1.06 * min(np.std(test_scores),
                           iqr(test_scores) / 1.34) * np.power(
        test_scores.shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(test_scores, y_test)

    # Iterate the test KDE's own estimators rather than the model's
    for idx, den_est in enumerate(test_kde.list_den_est):
        log_dens = den_est.score_samples(x_plot)
        ax.plot(x_plot[:, 0], np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1), linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    plt.show()
def _demo_pipeline():
    """Animate the PCA/RDA/KDE pipeline on synthetic two-channel 2-D data.

    For each (gam, lam) pair on a 4 x 4 grid the pipeline is refit and the
    RDA prediction over the PCA output is redrawn together with the raw
    channels. On the first pair the PCA singular values with and without
    the variance tolerance are printed. A second figure then shows the
    scores and the per-class density estimates of the last fit.
    """
    dim_x = 2
    num_x_p = 200
    num_x_n = 200
    var_tol = .8
    num_ch = 2

    # One mixing matrix per channel and class
    mtx_p = [np.array([[1, 0], [0, 1]]), np.array([[1, 2], [2, 1]])]
    mtx_n = [np.array([[2, 0], [0, 2]]), np.array([[1, -2], [-2, 1]])]

    x_p = np.asarray([
        np.dot(np.random.randn(num_x_p, dim_x), mtx_p[i])
        for i in range(num_ch)
    ])
    # The negative class is sized by num_x_n so that data and labels stay
    # aligned if the two class sizes are ever made different.
    x_n = 3 + np.array([
        np.dot(np.random.randn(num_x_n, dim_x), mtx_n[i])
        for i in range(num_ch)
    ])
    y_p = [1] * num_x_p
    y_n = [0] * num_x_n

    x = np.concatenate((x_n, x_p), axis=1)
    y = np.concatenate((y_n, y_p), axis=0)

    # Shuffle samples (axis 1 of the data) together with their labels
    permutation = np.random.permutation(x.shape[1])
    x = x[:, permutation, :]
    y = y[permutation]

    # Select bandwidth of the gaussian kernel assuming data is also coming
    # from a gaussian distribution.
    # Ref: Silverman, Bernard W. Density estimation for statistics and data
    # analysis. Vol. 26. CRC press, 1986.
    # NOTE(review): this is computed from the raw data and x.shape[0] (the
    # first axis, which is num_ch here) rather than from the scores and the
    # sample count. Left unchanged; confirm this is intended.
    bandwidth = 1.06 * min(np.std(x),
                           iqr(x) / 1.34) * np.power(x.shape[0], -0.2)

    pca = ChannelWisePrincipalComponentAnalysis(num_ch=x.shape[0])
    rda = RegularizedDiscriminantAnalysis()
    kde = KernelDensityEstimate(bandwidth=bandwidth)

    model = Pipeline()
    model.add(pca)
    model.add(rda)
    model.add(kde)

    plt.ion()
    fig = plt.figure()
    ax = fig.add_subplot(212)
    ax_2 = fig.add_subplot(221)
    ax_3 = fig.add_subplot(222)

    for gam in [0, .3, .6, .9]:
        for lam in [0, .3, .6, .9]:
            model.pipeline[1].lam = lam
            model.pipeline[1].gam = gam

            if gam == 0 and lam == 0:
                # Only on the first parameter pair: report the PCA singular
                # values without and with the variance tolerance.
                model.pipeline[0].var_tol = 0
                model.fit(x, y)
                sv_init = [
                    ch_pca.singular_values_
                    for ch_pca in model.pipeline[0].list_pca
                ]

                model.pipeline[0].var_tol = var_tol
                model.fit(x, y)
                sv_final = [
                    ch_pca.singular_values_
                    for ch_pca in model.pipeline[0].list_pca
                ]
                print("Initial SV:{}".format(sv_init))
                print("-- using tolerance:{} -->".format(var_tol))
                print("Final SV:{}".format(sv_final))
                print("Init dim.:{} -> Final dim.:{}".format(
                    x.shape, model.line_el[1].shape))

            model.fit_transform(x, y)

            # Grid covering the first two columns of the PCA output, with
            # a margin of 1 on each side
            el = model.line_el[1]
            x_min, x_max = el[:, 0].min() - 1, el[:, 0].max() + 1
            y_min, y_max = el[:, 1].min() - 1, el[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                                 np.arange(y_min, y_max, 0.1))

            z = model.pipeline[1].predict(np.c_[xx.ravel(), yy.ravel()])
            z = z.reshape(xx.shape)

            ax.clear()
            ax_2.clear()
            ax_3.clear()

            # contourf takes no per-point colour/size arguments, so only
            # alpha is passed.
            ax.contourf(xx, yy, z, alpha=0.2)
            ax.scatter(el[y == 1, 0], el[y == 1, 1], c='r')
            ax.scatter(el[y == 0, 0], el[y == 0, 1], c='g')
            ax.set_title('after PCA')

            ax_2.scatter(x[0, y == 1, 0], x[0, y == 1, 1], c='r')
            ax_2.scatter(x[0, y == 0, 0], x[0, y == 0, 1], c='g')
            ax_3.scatter(x[1, y == 1, 0], x[1, y == 1, 1], c='r')
            ax_3.scatter(x[1, y == 0, 0], x[1, y == 0, 1], c='g')
            ax_2.set_title('1st dim')
            ax_3.set_title('2nd dim')
            fig.canvas.draw()
            time.sleep(.2)

    time.sleep(1)
    plt.ioff()

    fig_2, axn = plt.subplots()
    x_plot = np.linspace(np.min(model.line_el[-1]),
                         np.max(model.line_el[-1]), 1000)[:, np.newaxis]

    # Jittered scores per class, drawn just below the x axis
    scores = model.line_el[2]
    neg_scores = scores[y == 0]
    pos_scores = scores[y == 1]
    axn.plot(neg_scores,
             -0.005 - 0.01 * np.random.random(neg_scores.shape[0]),
             'ro', label='class(-)')
    axn.plot(pos_scores,
             -0.005 - 0.01 * np.random.random(pos_scores.shape[0]),
             'go', label='class(+)')

    for idx, den_est in enumerate(model.pipeline[2].list_den_est):
        log_dens = den_est.score_samples(x_plot)
        axn.plot(x_plot[:, 0], np.exp(log_dens),
                 'r-' * (idx == 0) + 'g--' * (idx == 1), linewidth=2.0)

    axn.legend(loc='upper right')
    plt.title('Likelihoods Given the Labels')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    fig_2.show()
    time.sleep(10)