class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    _predict_params = []

    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data, **kwargs):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        # Random 50/50 split: fit on one half, calibrate the threshold on the other
        idx = numpy.random.randint(2, size=len(data)).astype(bool)
        self.kde.fit(data[idx, :])
        self.training_score = self.kde.score_samples(data[~idx, :])
        self.direct_thresh = numpy.percentile(self.training_score, 100 - self.perc_keep)
        print('training', self.training_score.min(), self.training_score.mean(),
              self.training_score.max(), self.direct_thresh)

    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        res = (score < self.direct_thresh)
        print('test', self.score.min(), self.score.mean(), self.score.max())
        print(res.sum(), "of", len(self.score), 'outliers')
        # Map outliers to -1 and inliers to +1
        return res.astype(numpy.uint8) * -2 + 1

    def decision_function(self, data=None):
        return self.score
def kernel_estimation(test, train_n, train_p):
    relevance_score = []
    result_n = []
    result_p = []
    X_n = np.array(train_n)
    X_p = np.array(train_p)
    Y = np.array(test)
    # Bandwidths below were found offline, e.g. via a grid search:
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    # grid = GridSearchCV(KernelDensity(), params)
    # grid.fit(X_n)
    # print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    kde_n = KernelDensity(kernel='gaussian', bandwidth=0.999).fit(X_n)
    kde_p = KernelDensity(kernel='gaussian', bandwidth=4.772).fit(X_p)
    for i in range(len(Y)):
        # score_samples expects a 2-D array; take the scalar log-density directly
        result_n.append(np.exp(kde_n.score_samples(Y[i].reshape(1, -1))[0]))
        result_p.append(np.exp(kde_p.score_samples(Y[i].reshape(1, -1))[0]))
        if i % 1000 == 0:
            print(i)
    for i in range(len(result_n)):
        if result_n[i] == 0.0:
            # Guard against division by zero with a small floor value
            relevance_score.append(np.log(result_p[i] / 1.8404e-17 + 1))
        else:
            relevance_score.append(np.log(result_p[i] / result_n[i] + 1))
    return relevance_score
def kernel_pmi_func(df, x, y, i, b=1.0):
    x = np.array(df[x])
    y = np.array(df[y])
    x_y = np.stack((x, y), axis=-1)

    kde_x = KernelDensity(kernel='gaussian', bandwidth=b).fit(x[:, np.newaxis])
    kde_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(y[:, np.newaxis])
    kde_x_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(x_y)

    p_x = pd.Series(np.exp(kde_x.score_samples(x[:, np.newaxis])))
    p_y = pd.Series(np.exp(kde_y.score_samples(y[:, np.newaxis])))
    p_x_y = pd.Series(np.exp(kde_x_y.score_samples(x_y)))

    df['PMI_' + str(i)] = np.log(p_x_y / (p_x * p_y))
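# A minimal usage sketch for kernel_pmi_func above, assuming a toy DataFrame
# with two correlated columns; the column names and data are illustrative,
# not from the original code.
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
a = rng.normal(size=200)
df = pd.DataFrame({'a': a, 'b': a + 0.5 * rng.normal(size=200)})
kernel_pmi_func(df, 'a', 'b', i=0, b=0.5)  # adds a 'PMI_0' column to df
print(df['PMI_0'].head())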
def kernel_reg(df, bw=30, indepv="ARRTIME", kernel='gaussian'):
    '''
    Fit a KDE to the training split and score the test split, plotting the
    log-density against the observed wait times.

    Inputs:
        df: pandas dataframe
        indepv: (a list of) strings naming the independent variable(s)
        bw: (float) bandwidth
        kernel: kernel name passed to KernelDensity
    '''
    y = df["WAITTIME"]
    x = df[indepv]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                        random_state=42)
    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)
    # instantiate and fit the KDE model
    kde = KernelDensity(kernel=kernel, bandwidth=bw).fit(train)
    # score_samples returns the log of the probability density
    logprob = kde.score_samples(test)
    plt.scatter(y_test, logprob, label='%s, bw=%s' % (kernel, bw))
    plt.legend(loc=0)
    plt.show()
def kernel_fit_single(data, bw=None, min_size=20, kern='gaussian'):
    """ Gaussian fit to 1-D data """
    res = np.histogram(data.ravel(), bins='sqrt', density=True)
    std_data = data.std()
    if bw is None:
        bw = (data.ravel().shape[0] * (std_data + 2) / 4.)**(-1. / (std_data + 4))
    N_bins = res[1].shape[0]
    if N_bins < min_size:
        extra = 0.2  # extend the grid by +/- 20%
    else:
        extra = 0.0
    x_grid = np.linspace(res[1][0] - extra * abs(res[1][0]),
                         res[1][-1] + extra * abs(res[1][0]),
                         N_bins)
    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])
    pdf = np.exp(kde.score_samples(x_grid[:, None]))
    return pdf, x_grid
def kde_labeler(picks):
    if isinstance(picks, torch.Tensor):
        picks = picks.clone().cpu().data.numpy().astype(int)
    nums = np.array([x for x in range(0, 101)]).reshape(-1, 1)
    picks = picks.reshape(-1, 1)
    lower = np.percentile(picks, 25)
    upper = np.percentile(picks, 75)
    IQR = upper - lower
    std = picks.std()
    if std < 0.5:
        std = 1.0
        IQR = 1.0
    if IQR < 0.1:
        IQR = 0.1
    # Silverman's rule of thumb: 0.9 * min(std, IQR/1.349) * n**(-1/5)
    m = min(np.sqrt(std * std), IQR / 1.349)
    bandwidth = (0.9 * float(m)) / (float(pow(float(len(picks)), 0.2)))
    if bandwidth > 5:
        # TODO: Handle this in a manner not using print statements.
        # Maybe set a warning flag.
        print(
            f"Bandwidth too high! m: {m} std: {std} IQR: {IQR} bandwidth: {bandwidth}"
        )
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(picks)
    log_dens = kde.score_samples(nums)
    label = np.exp(log_dens)
    label = label / label.sum()
    return label
def kde():
    k, v = [], []
    score_kde, u = {}, 0
    result, frequency, lat, lon = ope_file(
        path="C:/Users/AMITY UNIVERSITY/Desktop/QWER/TEST_DATA")
    mlat, mlon, path, north, south, east, west, cout, od, centroid, countdiction, pathdistion, sqcout = pathlist(
        lat, lon)
    for key, value in centroid.items():
        print(value)
        if input():
            pass
        cou = 0
        for i in zip(result, frequency):
            print(haversine(value[0], value[1], i[0][0], i[0][1]))
            X = np.array([i[0][0], i[0][1]])
            kde = KernelDensity(kernel='gaussian',
                                bandwidth=0.3).fit([[i[0][0], i[0][1]]])
            cou += kde.score_samples([centroid[key]]) * i[1]
            print(cou)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        score_kde[key] = cou
        k.append(key)
        v.append(cou)
    li = colo(v)
    print(score_kde, k, v, li)
    return mlat, mlon, path, north, south, east, west, cout, od, centroid, countdiction, pathdistion, sqcout
class KDECluster:
    '''
    points is a vector of vectors [[],[]]
    '''

    def __init__(self, points, bw):
        # Fall back to a Gaussian kernel for very small clusters
        if len(points) < 5:
            self.kde_ = KernelDensity(kernel='gaussian', bandwidth=bw)
        else:
            self.kde_ = KernelDensity(kernel='epanechnikov',
                                      algorithm='ball_tree',
                                      bandwidth=bw,
                                      leaf_size=50)
        self.points_ = points
        self.kde_.fit(points)

    #..........................................................................
    def compare(self, cluster):
        scores_self = np.exp(self.kde_.score_samples(cluster.points_))
        scores_clus = np.exp(cluster.kde_.score_samples(self.points_))
        m_self = max(scores_self)
        m_clus = max(scores_clus)
        return max(m_clus, m_self)
def plot_kde(obj, lo, hi, true, test):
    obj_plot = np.linspace(lo, hi, 10000)[:, np.newaxis]
    avg_std = np.mean(std(obj))
    # Bandwidth estimated by Silverman's rule of thumb
    bandwidth = 1.06 * avg_std * len(obj)**-0.2
    plt.figure()
    # ax = plt.gca()
    for i in range(obj.shape[1]):
        a = obj[:, i][:, np.newaxis]
        kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                            algorithm='ball_tree')
        kde.fit(a)
        log_dens = kde.score_samples(obj_plot)
        plt.plot(obj_plot, np.exp(log_dens))
        # vline_color = next(ax._get_lines.prop_cycler)['color']
        # plt.axvline(np.mean(a), linestyle=':', color=vline_color, label='Update %i' % (i + 1))
    plt.axvline(np.mean(average(obj)), color='red', label='Mean of all predictions')
    plt.axvline(true, label='True value', linestyle='dashdot', color='black',
                linewidth=2)
    plt.ylabel('PDF')
    plt.xlabel('Cycle')
    plt.tight_layout()
    plt.legend()
def _importance_preprocess_uni(states, rewards, gradients, p_tar, p_gen):
    res = _create_episode_info()

    flat_states = [s for traj in states for s in traj]
    # TODO Pass in as args?
    kde = KernelDensity(kernel='gaussian', bandwidth=0.25)
    kde.fit(flat_states)

    # izip is Python 2 only; zip works in Python 3
    for ss, rs, gs, ps, qs in zip(states, rewards, gradients, p_tar, p_gen):
        state_probs = kde.score_samples(ss)
        traj_p = np.cumsum(ps)  # + np.mean(state_probs)
        traj_q = np.cumsum(qs) + state_probs
        traj_grads = np.cumsum(gs, axis=0)
        r_acc = np.cumsum(rs[::-1])[::-1]
        r_grad = (r_acc * traj_grads.T).T

        res.r_grads.extend(r_grad)
        res.traj_p_tar.extend(traj_p)
        res.traj_p_gen.extend(traj_q)
        res.traj_grads.extend(traj_grads)
        res.traj_r.extend(r_acc)

        # Used for estimating fisher
        res.act_grads.extend(gs)
        res.state_act_p_tar.extend(traj_p)
        res.state_act_p_gen.extend(traj_q)

    return res
def basic_properties(sequences, axess=None, labl=None, logscale=[False],
                     markr='.', clr='k', offset=0, alfa=0.8,
                     distir=[False, False, False, False],
                     bandwidths=[3, 0.1, 0.01, 1],
                     limits=[(1, 50), (0, 1), (0, 1), (1, 25)]):
    if axess is None:
        fig, axess = plt.subplots(3, len(sequences), False, False,
                                  squeeze=False, figsize=(len(sequences) * 3, 8))
        plt.subplots_adjust(left=0.12, bottom=0.05, right=0.95, top=0.94,
                            wspace=0.28, hspace=0.1)
    plt.subplots_adjust(left=0.45, bottom=0.05, right=0.95, top=0.94,
                        wspace=0.28, hspace=1.2)
    for i in range(0, len(sequences)):
        ax = axess[offset][i]
        seq = sequences[i]
        smax = max(seq)
        smin = min(seq)
        if distir[i] == 0:
            freqs, bin_edges = np.histogram(
                seq, smax + 1 if smax > 1 else 100,
                range=(0, smax + 1) if smax > 1 else (0, smax))
            bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
            vals = range(0, smax + 1) if smax > 1 else bin_centers
            freqs = freqs * 1.0 / sum(freqs)
            # remove zeros
            y = np.array(freqs)
            nz_indexes = np.nonzero(y)
            y = y[nz_indexes]
            x = np.array(vals)[nz_indexes]
            ax.plot(x, y, ':', label=labl, alpha=alfa, color=clr, marker='.')
        else:
            X = np.array(seq)
            X = [x for x in X if x >= limits[i][0] and x <= limits[i][1]]
            X = np.random.choice(X, size=min(10000, len(X)))
            X = X[:, np.newaxis]
            kde = KernelDensity(kernel='gaussian', bandwidth=bandwidths[i]).fit(X)
            X_plot = np.linspace(limits[i][0], limits[i][1], 1000)[:, np.newaxis]
            log_dens = kde.score_samples(X_plot)
            Y = np.exp(log_dens)
            if distir[i] == 2:
                # plot the CDF instead of the PDF
                Y = np.cumsum(Y)
            ax.plot(X_plot[:, 0], Y, '-', label=labl, alpha=alfa, color=clr,
                    markersize=2, marker='')
            verts = ([(limits[i][0] - 1e-6, 0)] + list(zip(X_plot[:, 0], Y))
                     + [(limits[i][1] + 1e-6, 0)])
            poly = Polygon(verts, facecolor=clr, alpha=alfa)
            ax.add_patch(poly)
        ax.set_xlim(limits[i][0], limits[i][1])
        if len(logscale) == len(sequences):
            if 'x' in logscale[i]:
                ax.set_xscale('log')
            if 'y' in logscale[i]:
                ax.set_yscale('log')
        if i < 3:
            ax.set_ylim(bottom=0.001)
    return axess
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
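# A quick way to exercise kde_sklearn; the data here is illustrative.
import numpy as np

x = np.random.normal(0, 1, 500)       # samples (illustrative)
x_grid = np.linspace(-4, 4, 200)      # evaluation grid
pdf = kde_sklearn(x, x_grid, bandwidth=0.3, kernel='gaussian')
print(np.trapz(pdf, x_grid))          # close to 1 when the grid covers the support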
def fitness(self, individual, markets):
    portfo_return = np.zeros(len(markets[0]))
    for j in range(len(markets[0])):
        portfo_return[j] = np.dot(np.array(individual), np.array(markets).T[j])
    ret = portfo_return
    X = ret[:, np.newaxis]
    X_plot = np.linspace(min(ret), max(ret), 200)[:, np.newaxis]
    kde = KernelDensity(kernel='gaussian', bandwidth=self.band_width).fit(X)
    log_dens = kde.score_samples(X_plot)
    pdf = np.exp(log_dens)
    sec_dev = np.diff(pdf, 2)
    qpot = [500.0]
    for i in range(1, len(pdf) - 1):
        if pdf[i] > 0.0001:
            jj = sec_dev[i - 1] / pdf[i]
        else:
            jj = 500
        qpot.append(jj)
    qpot.append(500)
    # dd = X_plot[argrelextrema(qpot, np.greater)]
    # risk = dd[dd > 0][0] - dd[dd < 0][-1]
    xx = []
    x = X_plot.reshape(len(qpot))
    for i in range(len(qpot)):
        if qpot[i] >= 499:
            xx.append(i)
    x_list = np.array(x)[xx]
    d_lim = x_list[x_list < 0][-1]
    u_lim = x_list[x_list > 0][0]
    return u_lim - d_lim
def fitKDE(obs, bWidth=0.25, kernel='gaussian', x=None):
    """
    Fit observations with Kernel Density Estimation.
    Snippet 2.2 Testing the Marchenko-Pastur Theorem

    Args:
        obs: series of observations to fit
        bWidth: kernel bandwidth
        kernel: kernel name passed to KernelDensity
        x: array of values on which the fitted KDE will be evaluated
    Returns:
        pd.Series with the estimated density, indexed by the evaluation points
    """
    # Fit kernel to a series of obs, and derive the probability of obs
    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)
    if x is None:
        x = np.unique(obs).reshape(-1, 1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)
    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())
    return pdf
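# fitKDE can be exercised, for instance, on the eigenvalue spectrum of a
# random correlation matrix; a sketch with illustrative data, not from the
# original code.
import numpy as np

eVals = np.linalg.eigvalsh(np.corrcoef(np.random.randn(100, 1000)))
pdf = fitKDE(eVals, bWidth=0.25)   # pandas Series indexed by the evaluation points
print(pdf.idxmax(), pdf.max())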
def xy_kde(xy, bandwidth, N_grid=100, levels=[0.8, 0.6, 0.4, 0.2]):
    x_edges = np.linspace(np.min(xy[:, 0]), np.max(xy[:, 0]), N_grid + 1)
    y_edges = np.linspace(np.min(xy[:, 1]), np.max(xy[:, 1]), N_grid + 1)
    x_centres = np.array([x_edges[b] + (x_edges[b + 1] - x_edges[b]) / 2
                          for b in range(N_grid)])
    y_centres = np.array([y_edges[b] + (y_edges[b + 1] - y_edges[b]) / 2
                          for b in range(N_grid)])
    x_grid, y_grid = np.meshgrid(x_centres, y_centres)
    xy_grid = np.array([np.ravel(x_grid), np.ravel(y_grid)]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid, N_grid))

    # this bit is taken from the corner_plot.py method
    Hflat = H.flatten()
    inds = np.argsort(Hflat)[::-1]
    Hflat = Hflat[inds]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        try:
            V[i] = Hflat[sm <= v0][-1]
        except IndexError:
            V[i] = Hflat[0]

    V = np.sort(V)
    return H, V, x_grid, y_grid
def fitKDE(lst_obs, flt_bandwidth=0.25, str_kernel="gaussian", lst_x_eval=None):
    """
    Fit a (given) Kernel Density Estimator to a series of observations
    and derive the probability of each observation.

    :param lst_obs: the list of observations
    :param flt_bandwidth: the bandwidth of the kernel
    :param str_kernel: the kernel to use
    :param lst_x_eval: array of values on which the fitted KDE will be evaluated
    :return: dtf_pdf: empirical pdf
    """
    # The list of observations lst_obs must be a 2-dimensional array
    if len(lst_obs.shape) == 1:
        lst_obs = lst_obs.reshape(-1, 1)

    # Initialize the KDE and fit it on the observations
    skl_kde = KernelDensity(bandwidth=flt_bandwidth, kernel=str_kernel).fit(lst_obs)

    # The list lst_x_eval must be a 2-dimensional array too.
    # If lst_x_eval is not provided, initialize it as the list of unique observations
    if lst_x_eval is None:
        lst_x_eval = np.unique(lst_obs).reshape(-1, 1)
    if len(lst_x_eval.shape) == 1:
        lst_x_eval = lst_x_eval.reshape(-1, 1)

    # Evaluate the log density model on the data (i.e., on lst_x_eval)
    lst_logProb = skl_kde.score_samples(X=lst_x_eval)

    # Return the evaluations as a pandas Series
    dtf_pdf = pd.Series(data=np.exp(lst_logProb), index=lst_x_eval.flatten())
    return dtf_pdf
def kernel_density_estimate(histogram, num_samples, bandwidth):
    kernel = 'gaussian'
    # kernel = 'epanechnikov'
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(histogram.reshape(-1, 1))
    s = np.linspace(0, num_samples, num_samples)
    e = kde.score_samples(s.reshape(-1, 1))
    prb = np.exp(e)
    # extrema of the probability density
    mi, ma = argrelextrema(prb, np.less)[0], argrelextrema(prb, np.greater)[0]
    return mi, ma, s, prb
def _density(self):
    d = list()
    h = dict()
    bw = self.np.size**(-1. / 5)
    kd = KernelDensity(kernel='gaussian', bandwidth=bw).fit(self.np.reshape(-1, 1))
    kd_vals = np.exp(kd.score_samples(self.np.reshape(-1, 1)))
    for i, x in enumerate(kd_vals):
        h[self.np[i]] = x
    for x in sorted(h):
        dp = h[x] * (0.4 / max(kd_vals))
        d.append([x, self.seriesId - dp, self.seriesId + dp])
    return {
        'id': str(self.seriesId),
        'name': self.seriesName,
        'type': 'areasplinerange',
        'enableMouseTracking': False,
        'marker': {
            'symbol': 'circle',
            'enabled': False
        },
        'color': 'Highcharts.getOptions().colors[' + str(self.seriesId) + ']',
        'data': d
    }
def kernel_density(self):
    pre_data = self.data
    start = np.array(pre_data)
    start_len = len(start)
    resolution = np.linspace(0, 1, num=10).tolist()
    pre_data = np.histogram(pre_data, bins=resolution)[0]
    pre_data = pre_data / max(pre_data)
    pre_data = np.array([int(i * 100) for i in pre_data.tolist()])
    initial_length = int(len(pre_data) * 2)  # 2 is an arbitrary good number to use
    a = pre_data.reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=2).fit(a)
    s = np.linspace(0, initial_length)
    e = kde.score_samples(s.reshape(-1, 1))
    lower_boundaries = argrelextrema(e, np.less)[0]
    minima = s[lower_boundaries]
    demodulated_index = [int((i / initial_length) * start_len) for i in minima]
    return start[np.array(demodulated_index)]
def plot_dos(phonons, bandwidth=.05, n_points=200, is_showing=True, input_fig=None):
    if input_fig is None:
        fig = plt.figure()
    else:
        fig = input_fig
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
        phonons.frequency.flatten(order='C').reshape(-1, 1))
    x = np.linspace(phonons.frequency.min(), phonons.frequency.max(), n_points)
    y = np.exp(kde.score_samples(x.reshape((-1, 1))))
    plt.plot(x, y)
    plt.fill_between(x, y, alpha=.2)
    plt.xlabel("$\\nu$ (THz)", fontsize=16)
    plt.ylabel('DOS', fontsize=16)
    plt.tick_params(axis='both', which='major', labelsize=16)
    plt.tick_params(axis='both', which='minor', labelsize=16)
    folder = get_folder_from_label(phonons, base_folder=DEFAULT_FOLDER)
    if not os.path.exists(folder):
        os.makedirs(folder)
    fig.savefig(folder + '/' + 'dos.png')
    if is_showing:
        plt.show()
    elif input_fig is None:
        plt.close()
    else:
        return fig
def kde_example():
    # ----------------------------------------------------------------------
    # Plot a 1D density example
    N = 100
    np.random.seed(1)
    # sizes must be integers; 0.3 * N would raise a TypeError in modern NumPy
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]

    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0])
                 + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))

    fig, ax = plt.subplots()
    ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2,
            label='input distribution')

    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(N))
    ax.legend(loc='upper left')
    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
def KDE(lst, dum):
    result = []
    lst2 = []
    lst.sort()
    if len(lst) > 1:
        # convert hh.mm values to decimal hours
        for i in range(len(lst)):
            f, w = math.modf(lst[i])
            lst2.append(round(w + (f / 0.6), 2))
        a = array(lst2).reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.45).fit(a)
        s = linspace(0, 24)
        e = kde.score_samples(s.reshape(-1, 1))
        # split the day at the local minima of the estimated density
        mi = argrelextrema(e, np.less)[0]
        mi = s[mi]
        if len(mi) > 0:
            for k in range(len(mi) + 1):
                if k == 0:
                    result.append(list(filter(lambda i: i['t'] < mi[k], dum)))
                elif k == len(mi):
                    result.append(list(filter(lambda i: i['t'] >= mi[k - 1], dum)))
                else:
                    result.append(list(filter(
                        lambda i: i['t'] >= mi[k - 1] and i['t'] < mi[k], dum)))
            return result
        else:
            return dum
    else:
        return 0
def entropy_Integral(X):
    '''Integral estimate using summation'''
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X[:, None])
    logprob = kde.score_samples(X[:, None])
    return -1 * np.average(logprob, weights=np.exp(logprob))
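# Note that the function above weights each log-density by the density itself.
# For comparison, the standard resubstitution estimator of differential
# entropy is the unweighted mean, H ~ -(1/N) * sum_i log p_hat(x_i);
# a minimal sketch, not from the original code:
import numpy as np
from sklearn.neighbors import KernelDensity

def entropy_resubstitution(X, bandwidth=0.2):
    # unweighted mean of the log-density at the sample points
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X[:, None])
    return -np.mean(kde.score_samples(X[:, None]))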
def sklearn_kde_plot(dataframe, choose_choice, topic_name, fold_num):
    N = dataframe.values.size
    X = dataframe.values[:, np.newaxis]
    # X_plot = np.linspace(min(dataframe.values), max(dataframe.values), num=500)[:, np.newaxis]
    X_plot = np.linspace(min(dataframe.values), 10, num=500)[:, np.newaxis]  # SET THIS
    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0])
                 + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))
    fig, ax = plt.subplots()
    # ax.fill(X_plot, true_dens, fc='black', alpha=0.2, label='input distribution')
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(X)  # 'tophat', 'epanechnikov'  # SET THIS
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
            label="kernel = '{0}'".format('gaussian'))
    ax.text(6, 0.38, "N={0} points".format(N))
    ax.legend(loc='upper right')
    ax.plot(X[:, 0], -0.005 - 0.005 * np.random.random(X.shape[0]), '+k')
    ax.set_xlim(0, 10)       # SET THIS
    ax.set_ylim(-0.02, 1.0)  # SET THIS
    ax.set_xlabel("Delta Follower")
    ax.set_ylabel("Density")
    plt.title('Density - ' + choose_choice + ' (' + topic_name + ', ' + fold_num + ')')
    plt.show()
    return
def density(self, x=None, kernel='gaussian', bandwidth=0.1, rtol=0.05, **kwargs):
    """Kernel density estimation.

    Parameters
    ----------
    x : ndarray
        Time points at which to evaluate density. If `x` is None,
        then a KernelDensity object is returned.
    kernel : str
    bandwidth : scalar
        Kernel bandwidth
    rtol, **kwargs : extra arguments for sklearn.neighbors.KernelDensity

    Returns
    -------
    ndarray or KernelDensity object

    """
    # sklearn.neighbors.kde is deprecated; import from sklearn.neighbors instead
    from sklearn.neighbors import KernelDensity

    # create density function
    # TODO test speed of KernelDensity - roll our own?
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, rtol=rtol,
                        **kwargs).fit(self._data[:, None])

    if x is not None:
        # evaluate density function
        x = np.array(x, copy=False)
        kde = np.exp(kde.score_samples(x[:, None]))

    return kde
def get_highest_concentration(self, remove=False):
    '''Returns the point in the unassigned set with the highest estimated
    concentration, using the Gaussian KDE estimator.

    args:
        remove: Boolean defining whether the point should be taken out of
            the unassigned list or not.

    returns:
        The point with the highest concentration.
    '''
    # copy to a list to preserve order through the KDE estimation
    temp_point_list = list(self._points)
    lat_lng = {
        'lat': [p.lat for p in temp_point_list],
        'lng': [p.lng for p in temp_point_list]
    }
    lat_lng = pd.DataFrame(lat_lng)
    kde = KernelDensity(bandwidth=0.2, metric='haversine').fit(lat_lng)
    scored = kde.score_samples(lat_lng)
    lat_lng = lat_lng.assign(density=scored)
    highest = lat_lng['density'].idxmax()
    highest = temp_point_list[highest]
    if remove:
        self._points.remove(highest)
    return highest
def plot_kde(data, ax, settings):
    try:
        from sklearn.neighbors import KernelDensity
    except ImportError:
        warnings.warn(
            "Cannot import sklearn.neighbors.KernelDensity. Cannot plot kernel density estimate."
        )
        return

    x = np.linspace(0, max(data), 200)

    if settings["plotting.kde_bandwidth"] is not None:
        bw = settings["plotting.kde_bandwidth"]
        kde = KernelDensity(kernel=settings["plotting.kde_kernel"],
                            bandwidth=bw).fit(data.values.reshape(-1, 1))
    else:
        # 10-fold cross-validation to pick the bandwidth
        grid = GridSearchCV(
            KernelDensity(kernel=settings["plotting.kde_kernel"]),
            {'bandwidth': np.linspace(math.radians(2), math.radians(30), 40)},
            cv=min(10, len(data)))
        try:
            grid.fit(data.values.reshape(-1, 1))
        except ValueError:
            return  # do not plot the kde if we do not have enough data points
        # print("Bandwidth = {}".format(grid.best_params_))
        kde = grid.best_estimator_

    ax.plot(x,
            np.exp(kde.score_samples(x.reshape(-1, 1))),
            label="kde",
            linewidth=settings["plotting.kde_linewidth"],
            color=settings["plotting.kde_color"])
def KDE(lst):
    result = []
    lst2 = []
    lst.sort()
    if len(lst) > 1:
        for i in range(len(lst)):
            f, w = math.modf(lst[i])
            lst2.append(round(w + (f / 0.6), 2))
        a = array(lst2).reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.45).fit(a)
        s = linspace(0, 24)
        e = kde.score_samples(s.reshape(-1, 1))
        mi = argrelextrema(e, np.less)[0]
        if len(mi) > 0:
            for i in range(len(mi) + 1):
                if i == 0:
                    temp = a[a < s[mi[i]]]
                elif i == len(mi):
                    temp = a[a >= s[mi[i - 1]]]
                else:
                    temp = a[(a >= s[mi[i - 1]]) * (a <= s[mi[i]])]
                if len(temp) > 1:
                    # use a separate index so the outer loop variable is not shadowed
                    for j in range(len(temp)):
                        f, w = math.modf(temp[j])
                        temp[j] = round(w + (f * 0.6), 2)
                result.append(temp)
            print(result)
        else:
            print(lst)
    else:
        print('no')
def cluster_density(coords, gridcoords, bw=0.004, CV_bw=False):
    """Compute KDE and return scores of grid points

    Parameters:
    ------------
    coords: np.array (n, 3)
        coordinates of points in cluster(!)
    gridcoords: np.array (n, 3)
        coordinates of the grid in which the density must be mapped
    bw: float
        bandwidth of the Gaussian kernel, hand tuned for now; it depends on
        the rectangular grid resolution, and also on the user: how important
        are 'lonely' neurons in the clusters?

    Returns:
    ----------
    den: scores of grid coords
    """
    if CV_bw:
        grid = GridSearchCV(KernelDensity(),
                            {'bandwidth': np.linspace(0.005, 0.03, 11)},
                            cv=25, verbose=0)
        grid.fit(coords)
        bw = grid.best_params_['bandwidth']
        print(f'Best bandwidth {bw}')
        return grid
    else:
        kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(coords)
        # den = np.exp(kde.score_samples(refcoords))  # return scores
        den = kde.score_samples(gridcoords)  # return log scores
        return den
def KL_div(approximate_target_samples, target_dist_parameters, true_target='Gamma'):
    """
    Calculate the approximate KL divergence between a true density function
    and an approximate one. The approximation is done via Gaussian KDE.
    """
    if true_target == 'Gamma':
        true_target_pdf = st.gamma.pdf(approximate_target_samples,
                                       a=target_dist_parameters[0],
                                       scale=1.0 / target_dist_parameters[1])
    elif true_target == 'Beta':
        true_target_pdf = st.beta.pdf(approximate_target_samples,
                                      a=target_dist_parameters[0],
                                      b=target_dist_parameters[1])

    # Now approximate the approximate_target_pdf
    T1_samples_for_kde = approximate_target_samples[:, np.newaxis]
    T1_kde = KernelDensity(kernel='gaussian').fit(T1_samples_for_kde)
    log_density = T1_kde.score_samples(T1_samples_for_kde)
    approximate_target_pdf = np.exp(log_density)

    # If the true_target_pdf has any 0 entries, the entropy will blow up.
    # Take these samples out.
    if np.any(true_target_pdf == 0):
        print('PDF of 0 detected at ' + str(len(np.where(true_target_pdf == 0))) + ' points.')
        print('Removing those points.')
    return entropy(approximate_target_pdf[true_target_pdf > 0],
                   true_target_pdf[true_target_pdf > 0])
def clustering(self):
    """ Create clusters by movie scores """
    # kernel density estimation
    values = self.df['score'].values.reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=3).fit(values)

    # find cluster min-max points
    s = np.linspace(650, 18000)
    e = kde.score_samples(s.reshape(-1, 1))
    mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]

    # concat min-max points
    points = np.concatenate((s[mi], s[ma]), axis=0)
    buckets = []
    for point in points:
        buckets.append(point)
    buckets = np.array(buckets)
    buckets.sort()

    # assign clusters
    self.df.loc[:, 'cluster'] = buckets.searchsorted(self.df.score)
def infer_from_contig2(df, contigs, contig_id, K=100000, K0=3000):
    # generate global KDE estimation
    C = df[(df['X1'] == contig_id) & (df['X2'] == contig_id)]
    inter = np.abs(C['P1'].values - C['P2'].values)
    kde = KernelDensity(kernel='gaussian', bandwidth=200).fit(inter.reshape(-1, 1))
    f = lambda x: kde.score_samples(x.reshape(-1, 1))

    # distant part: fit a + b * log(x) on [K0, K]
    x1 = np.logspace(np.log10(K0), np.log10(K), 500)
    p = lambda x, a, b: a + b * np.log(x)
    param1, cov = curve_fit(p, x1, f(x1))

    # proximal part: fit a degree-30 polynomial on [1, K0]
    degree = 30
    x0 = np.logspace(0, np.log10(K0), 500)
    param0 = np.polyfit(x0, f(x0), degree)

    # piecewise model: polynomial below K0, log fit up to K, constant beyond K
    P = (lambda x: np.where(
        x < K0,
        np.poly1d(param0)(x),
        np.where(x < K,
                 param1[0] + param1[1] * np.log(x),
                 param1[0] + param1[1] * np.log(K))
    ))
    return P, f
def KDE_tri(pred_point_cloud, bbox, text):
    '''use KDE to filter outliers in the predicted point cloud'''
    (h0, h1, w0, w1, d0, d1) = bbox
    X, Y, Z = np.mgrid[h0:h1, w0:w1, d0:d1]
    positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()])

    # 1. KDE
    start = time.time()
    kde = KernelDensity(kernel='epanechnikov', bandwidth=KDE_bandwidth).fit(pred_point_cloud.T)
    score = kde.score_samples(positions.T)
    score = score.reshape(h1 - h0, w1 - w0, d1 - d0)
    filtered_pred_point_cloud = np.where(score > KDE_log_prob_th)
    points_list = [filtered_pred_point_cloud[0] + h0,
                   filtered_pred_point_cloud[1] + w0,
                   filtered_pred_point_cloud[2] + d0]
    print('KDE filter done', time.time() - start)
    print('filtered_pred_point_cloud (', filtered_pred_point_cloud[0].shape[0], '* 3 )')
    text.write('filtered_pred_point_cloud: ' + str(filtered_pred_point_cloud[0].shape[0]) + ' * 3 \n')
    text.flush()

    # 2. Delaunay triangulation
    start = time.time()
    points = np.asarray(points_list).T
    tri = Delaunay(points)
    print('Delaunay triangulation done', time.time() - start)
    return points, tri
def kdewrap(indata, kernel):
    # 10-fold cross-validation to select the bandwidth
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(0.1, 1.0, 30)},
                        cv=10)
    grid.fit(indata[:, None])
    kde = KernelDensity(kernel=kernel,
                        bandwidth=grid.best_params_["bandwidth"]).fit(indata[:, np.newaxis])
    return kde.score_samples(indata[:, np.newaxis])
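# kdewrap usage -- a sketch on synthetic data; the grid search picks the
# bandwidth that maximizes the cross-validated log-likelihood, then the
# requested kernel is refit with it.
import numpy as np

logdens = kdewrap(np.random.randn(300), 'gaussian')
pdf = np.exp(logdens)   # density evaluated at the input points themselves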
class RegularizedKernelDensityEstimator(BaseEstimator):
    def __init__(self, bandwidth=1.0, regularization=1.0e-5):
        self.bandwidth = bandwidth
        self.regularization = regularization

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        height, width = self.shape
        self.uniform_density = -np.log(width * height)
        self.kde_constant = np.log(1 - self.regularization)
        self.uniform_constant = np.log(self.regularization)

    def fit(self, X):
        self.shape = X[0, 2:4]
        self.setup()
        self.kde.fit(X[:, 0:2])
        return self

    def score_samples(self, X):
        # Mixture of the KDE and a uniform density over the image plane:
        # log((1 - reg) * p_kde + reg * p_uniform), computed stably
        kde_logliks = self.kde.score_samples(X[:, :2])
        logliks = np.logaddexp(self.kde_constant + kde_logliks,
                               self.uniform_constant + self.uniform_density)
        return logliks

    def score(self, X):
        return np.sum(self.score_samples(X))
def simplify3(nk):
    result = []
    nk = np.array(nk)
    xk = nk / float(np.sum(nk))
    sdiv = 1000
    X_plot = np.linspace(0, len(xk), sdiv)[:, np.newaxis]
    # draw samples from the discrete distribution defined by xk
    custm = stats.rv_discrete(name='custm', a=0, b=7, values=(range(len(xk)), xk))
    yk = custm.rvs(size=100000)
    # gaussian KDE
    X = yk.reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.6).fit(X)
    log_dens = kde.score_samples(X_plot)
    mi, ma = argrelextrema(log_dens, np.less)[0], argrelextrema(log_dens, np.greater)[0]
    # map extrema indices from the plot grid back to bin indices
    mi = np.rint(mi * float(len(xk)) / float(sdiv))
    ma = np.rint(ma * float(len(xk)) / float(sdiv))
    start = 0
    # average nk between consecutive density minima
    for i in mi:
        i = int(i)
        if start != i:
            val = np.average(nk[start:i])
            for j in range(start, i):
                result.append(val)
            start = i
    val = np.average(nk[start:])
    for j in range(start, len(nk)):
        result.append(val)
    return np.array(result)
def projected_density_gauss(pos, centre, fov, ncells):
    """
    Input:
        pos: particle positions
        centre: centre of sub-&halo
        fov: field-of-view
        ncells: number of grid cells
    """
    # Note: `centre` was used in the body but missing from the original
    # signature; it is added here so the function is self-contained.
    pos = pos - centre
    _indx = np.logical_and(np.abs(pos[:, 0]) < 0.5 * fov,
                           np.abs(pos[:, 1]) < 0.5 * fov)
    pos = pos[_indx, :]

    n = 1024 * 1024
    h = (4 * np.std(pos[:, :2])**5 / (3 * n))**(1 / 5)
    # TODO: plot this faulty situation
    kde_skl = KernelDensity(bandwidth=h, kernel='gaussian', algorithm='ball_tree')
    xx, yy = np.mgrid[min(pos[:, 0]):max(pos[:, 0]):complex(ncells),
                      min(pos[:, 1]):max(pos[:, 1]):complex(ncells)]
    xy_sample = np.vstack([xx.ravel(), yy.ravel()]).T
    kde_skl.fit(pos[:, :2])
    sigma = np.exp(kde_skl.score_samples(xy_sample))
    sigma = sigma.reshape(xx.shape)
    return sigma, h
def pdf(data: list):
    # hist, bin = np.histogram(data, bins=50)
    # return hist
    kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit([[x] for x in data])
    b = [[x] for x in np.linspace(min(data), max(data), 100)]
    a = np.exp(kde.score_samples(b))
    return a
def find_max_density_point(point_list):
    point_list, _ = remove_nan(point_list)
    if point_list.shape[0] == 0:
        return [float('nan'), float('nan'), float('nan')]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(point_list)
    prob_list = kde.score_samples(point_list)
    max_point = point_list[np.argmax(prob_list)]
    return max_point
def createfeatmat(N):
    grid = getgridcoords(N).T
    featmat = np.zeros((len(vals), N ** 2))
    for i in range(len(vals)):
        m = np.array([vals[i][0], vals[i][1]]).T
        k = KernelDensity(bandwidth=0.5 / (N - 1), kernel="gaussian")
        k.fit(m)
        featmat[i, :] = k.score_samples(grid)
    return featmat
def histLine(axes, data, minmax, color):
    (xmin, xmax) = minmax
    data = data.reshape(-1, 1)
    kde = KernelDensity(bandwidth=(xmax - xmin) / 100.0).fit(data)
    x = np.linspace(xmin, xmax, 100).reshape(-1, 1)
    log_density = kde.score_samples(x)
    density = np.exp(log_density)
    axes.plot(x, density, color=color)
def estimate_distribution(samples, h=0.1, n_points=100):
    kde = KernelDensity(bandwidth=h)
    min_xs = min(samples)
    max_xs = max(samples)
    samples = samples[:, np.newaxis]
    kde.fit(samples)
    xs = np.linspace(min_xs, max_xs, n_points)
    ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
    print(xs.shape, ys.shape, sum(ys))
    return xs, ys
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]

    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]

    def fit(self, data, **kwargs):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        self.kde.fit(data)
        self.training_score = self.kde.score_samples(data)
        self.direct_thresh = numpy.percentile(self.training_score, 10)

    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        # map outliers (below the 10th-percentile threshold) to -1, inliers to +1
        return (score < self.direct_thresh).astype(numpy.int32) * -2 + 1

    def decision_function(self, data):
        return self.score
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    N = np.trapz(np.exp(log_pdf), x_grid)
    return np.exp(log_pdf) / N
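# The np.trapz renormalization in this variant forces the discretized curve to
# integrate to exactly 1 over x_grid, which matters when the grid clips the
# tails. A quick check, with illustrative data:
import numpy as np

x = np.random.randn(400)
x_grid = np.linspace(-2, 2, 200)   # deliberately clips the tails
pdf = kde_sklearn(x, x_grid, bandwidth=0.3)
print(np.trapz(pdf, x_grid))       # ~1.0 by construction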
def fit(self, X, y):
    a = np.zeros((24, 7))
    hours = np.copy(X[:, 1])
    weekdays = np.copy(X[:, 2])
    hours = 23 * normalize(hours)
    weekdays = 6 * normalize(weekdays)

    if self.strategy == 'mean':
        counts = a.copy()
        for i, row in enumerate(zip(hours, weekdays)):
            hour = int(row[0])
            day = int(row[1])
            counts[hour, day] += 1
            a[hour, day] += y[i]
        counts[counts == 0] = 1
        self._model = a / counts

    elif self.strategy in ('median', 'kernel'):
        # this is a 3d array
        groups = [[[] for i in range(7)] for j in range(24)]
        for i, row in enumerate(zip(hours, weekdays)):
            hour = int(row[0])
            day = int(row[1])
            groups[hour][day].append(y[i])

        if self.strategy == 'median':
            for i, j in np.ndindex((24, 7)):
                if groups[i][j]:
                    a[i, j] = np.median(groups[i][j])
                else:
                    a[i, j] = np.nan

        elif self.strategy == 'kernel':
            # the kernel method computes a kernel density for each of the
            # bins and determines the most probable value (a 'mode' of sorts)
            grid = np.linspace(np.min(y), np.max(y), 1000)[:, np.newaxis]
            for i, j in np.ndindex((24, 7)):
                if groups[i][j]:
                    npgroups = np.array(groups[i][j])[np.newaxis]
                    kernel = KernelDensity(kernel='gaussian',
                                           bandwidth=0.2).fit(npgroups.T)
                    density = kernel.score_samples(grid)
                    dmax = np.max(density)
                    imax = np.where(density == dmax)
                    a[i, j] = grid[imax, 0]
                else:
                    a[i, j] = np.nan
        self._model = a
        # smooth the model here if there are nans
    return self
def densityEst(a, x, p, knn=1, Mode='G'):
    """
    Density estimation, currently supporting one-dimensional data.

    There are two modes of operation:
        knn==0: use a fixed bandwidth.
        knn==1 (default): use the k nearest neighbors.
    Two kernel types are supported:
        Mode=='T' for triangular.
        Mode=='G' (default) for Gaussian.

    a is a vector of samples.
    p is the parameter of the model (the bandwidth when knn=0, the number
    of neighbors otherwise).
    x is the points of estimation.
    """
    N = len(x)
    x.resize(N, 1)
    l = len(a)
    a = num.array(a)
    a.resize(l, 1)
    if knn == 0:
        try:
            from sklearn.neighbors import KernelDensity
        except ImportError:
            print('Error: please install the sklearn package...')
            return
        if Mode == 'T':
            S = 'linear'
        elif Mode == 'G':
            S = 'gaussian'
        else:
            print('Currently only G (gaussian) and T (triangular) Modes are supported')
            return
        kde = KernelDensity(kernel=S, bandwidth=p).fit(a)
        return (x, num.exp(kde.score_samples(x)))
    elif knn == 1:
        try:
            from sklearn.neighbors import NearestNeighbors
        except ImportError:
            print('Error: please install the sklearn package...')
            return
        neigh = NearestNeighbors(n_neighbors=p)
        neigh.fit(a)
        dist, index = neigh.kneighbors(x)
        H = dist[:, -1]
        est = [0.0] * N
        for i, point_v in enumerate(x):
            point = point_v[0]
            h = H[i]
            est[i] = sum(kernel((a - point) / h, Mode)) / (l * h)
        return (x, est)
    else:
        print('knn must be 0 or 1')
        return
def kernel_pmi_func(df, x, y, i, b=1.0):
    x = np.array(df[x])
    y = np.array(df[y])
    x_y = np.stack((x, y), axis=-1)

    kde_x = KernelDensity(kernel='gaussian', bandwidth=b).fit(x[:, np.newaxis])
    kde_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(y[:, np.newaxis])
    kde_x_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(x_y)

    p_x = np.exp(kde_x.score_samples(x[:, np.newaxis]))
    p_y = np.exp(kde_y.score_samples(y[:, np.newaxis]))
    p_x_y = np.exp(kde_x_y.score_samples(x_y))

    # variant of the function above: return the PMI values directly
    # instead of writing them into a 'PMI_<i>' column
    vals = np.log(p_x_y / (p_x * p_y))
    return vals
def plot_stan_trc(dftrc):
    """
    Create simple plots of parameter distributions and traces from
    output of pystan sampling. Emulates pymc traceplots.
    """
    fig, ax2d = plt.subplots(nrows=dftrc.shape[1], ncols=2,
                             figsize=(14, 1.8 * dftrc.shape[1]),
                             facecolor='0.99', edgecolor='k')
    fig.suptitle('Distributions and traceplots for {} samples'.format(
        dftrc.shape[0]), fontsize=14)
    fig.subplots_adjust(wspace=0.2, hspace=0.5)

    # create density and traceplot, per parameter coeff
    for i, (ax1d, col) in enumerate(zip(ax2d, dftrc.columns)):
        samples = dftrc[col].values
        scale = (10**np.round(np.log10(samples.max() - samples.min()))) / 20
        kde = KernelDensity(bandwidth=scale).fit(samples.reshape(-1, 1))
        x = np.linspace(samples.min(), samples.max(), 100).reshape(-1, 1)
        y = np.exp(kde.score_samples(x))
        clr = sns.color_palette()[0]

        # density plot
        ax1d[0].plot(x, y, color=clr, linewidth=1.4)
        ax1d[0].vlines(np.percentile(samples, [2.5, 97.5]), ymin=0,
                       ymax=y.max() * 1.1, alpha=1, linestyles='dotted',
                       colors=clr, linewidth=1.2)
        mn = np.mean(samples)
        ax1d[0].vlines(mn, ymin=0, ymax=y.max() * 1.1, alpha=1,
                       colors='r', linewidth=1.2)
        ax1d[0].annotate('{:.2f}'.format(mn), xy=(mn, 0), xycoords='data',
                         xytext=(5, 10), textcoords='offset points',
                         rotation=90, va='bottom', fontsize='large',
                         color='#AA0022')
        ax1d[0].set_title('{}'.format(col), fontdict={'fontsize': 14})

        # traceplot
        ax1d[1].plot(np.arange(len(samples)), samples, alpha=0.2, color=clr,
                     linestyle='solid', marker=',', markerfacecolor=clr,
                     markersize=10)
        ax1d[1].hlines(np.percentile(samples, [2.5, 97.5]), xmin=0,
                       xmax=len(samples), alpha=1, linestyles='dotted',
                       colors=clr)
        ax1d[1].hlines(np.mean(samples), xmin=0, xmax=len(samples), alpha=1,
                       colors='r')

        _ = [ax1d[j].axes.grid(True, linestyle='-', color='lightgrey')
             for j in range(2)]

    plt.subplots_adjust(top=0.94)
    plt.show()
def chart_by_time():
    weekday_amrush = [sum(traj[1:]) for traj in trajs_with_time
                      if traj[0].weekday() not in [5, 6] and traj[0].hour in [7, 8, 9]]
    weekday_pmrush = [sum(traj[1:]) for traj in trajs_with_time
                      if traj[0].weekday() not in [5, 6] and traj[0].hour in [17, 18, 19]]
    weekday_midday = [sum(traj[1:]) for traj in trajs_with_time
                      if traj[0].weekday() not in [5, 6] and traj[0].hour in [10, 11, 12, 13, 14, 15, 16]]
    weekday_night = [sum(traj[1:]) for traj in trajs_with_time
                     if traj[0].weekday() not in [5, 6] and traj[0].hour in [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6]]
    weekend = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() in [5, 6]]

    weekday_amrush_avg = sum(weekday_amrush) / float(len(weekday_amrush))
    weekday_pmrush_avg = sum(weekday_pmrush) / float(len(weekday_pmrush))
    weekday_midday_avg = sum(weekday_midday) / float(len(weekday_midday))
    weekday_night_avg = sum(weekday_night) / float(len(weekday_night))
    weekend_avg = sum(weekend) / float(len(weekend))
    print("weekday_amrush_avg: ", weekday_amrush_avg,
          "weekday_pmrush_avg: ", weekday_pmrush_avg,
          "weekday_midday_avg: ", weekday_midday_avg,
          "weekday_night_avg: ", weekday_night_avg,
          "weekend_avg: ", weekend_avg)

    all_times = weekday_amrush + weekday_pmrush + weekday_midday + weekday_night + weekend
    x = np.linspace(min(all_times), max(all_times), 100).reshape(-1, 1)

    kde_weekday_amrush = KernelDensity(bandwidth=70).fit(np.array(weekday_amrush).reshape(-1, 1))
    density_weekday_amrush = np.exp(kde_weekday_amrush.score_samples(x))
    kde_weekday_pmrush = KernelDensity(bandwidth=70).fit(np.array(weekday_pmrush).reshape(-1, 1))
    density_weekday_pmrush = np.exp(kde_weekday_pmrush.score_samples(x))
    kde_weekday_midday = KernelDensity(bandwidth=70).fit(np.array(weekday_midday).reshape(-1, 1))
    density_weekday_midday = np.exp(kde_weekday_midday.score_samples(x))
    kde_weekday_night = KernelDensity(bandwidth=70).fit(np.array(weekday_night).reshape(-1, 1))
    density_weekday_night = np.exp(kde_weekday_night.score_samples(x))
    kde_weekend = KernelDensity(bandwidth=70).fit(np.array(weekend).reshape(-1, 1))
    density_weekend = np.exp(kde_weekend.score_samples(x))

    plt.plot(x, density_weekday_amrush, 'r')
    plt.plot(x, density_weekday_pmrush, 'y')
    plt.plot(x, density_weekday_midday, 'g')
    plt.plot(x, density_weekday_night, 'b')
    plt.plot(x, density_weekend, 'm')
    plt.xlabel("Time start to endpoint")
    plt.ylabel("Density")
    plt.show()
def pda_single(synth_data, data, bandwidth=.1):
    # synth_data = np.log(np.abs(synth_data))[:, np.newaxis]
    # data_log = np.log(np.abs(data))[:, np.newaxis]
    synth_data = synth_data[:, np.newaxis]
    data = data[:, np.newaxis]
    if bandwidth == 'silverman':
        # Silverman's rule of thumb: .9 * min(std, IQR/1.34) * n**(-1/5)
        lower, upper = scoreatpercentile(synth_data, [25, 75])
        iqr = upper - lower
        sd = np.std(synth_data)
        bandwidth = .9 * min(sd, iqr / 1.34) * len(data)**(-1. / 5)
    kde = KernelDensity(kernel='epanechnikov', bandwidth=bandwidth).fit(synth_data)
    return kde.score_samples(data)
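# pda_single usage -- a sketch with synthetic data; passing
# bandwidth='silverman' triggers the rule-of-thumb computed from the synthetic
# sample (the function also needs scoreatpercentile from scipy.stats in scope).
import numpy as np

synth = np.random.randn(5000)   # simulated data (illustrative)
obs = np.random.randn(200)      # observed data (illustrative)
loglik = pda_single(synth, obs, bandwidth='silverman')
print(loglik.sum())             # summed log-likelihood of the observations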
def chart_by_day():
    # On average, trips on the weekend take less time than trips on weekdays:
    # 1337 sec versus 1446 sec.
    weekend_times = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() in [5, 6]]
    weekday_times = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5, 6]]
    weekend = sum(weekend_times) / float(len(weekend_times))
    weekday = sum(weekday_times) / float(len(weekday_times))
    print("weekend: ", weekend, "weekday: ", weekday)

    x = np.linspace(min(weekend_times + weekday_times),
                    max(weekend_times + weekday_times), 100).reshape(-1, 1)
    kde_weekend = KernelDensity(bandwidth=100).fit(np.array(weekend_times).reshape(-1, 1))
    density_weekend = np.exp(kde_weekend.score_samples(x))
    kde_weekday = KernelDensity(bandwidth=100).fit(np.array(weekday_times).reshape(-1, 1))
    density_weekday = np.exp(kde_weekday.score_samples(x))

    plt.plot(x, density_weekend, 'r')
    plt.plot(x, density_weekday, 'b')
    plt.xlabel("Time start to Grand Ave: red: weekend, blue: weekday")
    plt.ylabel("Density")
    plt.show()
def find_centroid(data, bandwidth=0.003, iter_num=6, halfwidth=0.02):
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data)
    grid = 10
    position = np.array([0, 0])
    for i in range(iter_num):
        # evaluate the density on a 20x20 grid around the current position,
        # then zoom in on the maximum
        low = position - halfwidth
        high = position + halfwidth
        X, Y = np.mgrid[low[0]:high[0]:20j, low[1]:high[1]:20j]
        positions = np.vstack([X.ravel(), Y.ravel()]).T
        img = kde.score_samples(positions)
        position = positions[np.argmax(img)]
        halfwidth = halfwidth * 2. / (grid - 1.)
    return position
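# find_centroid usage -- a sketch on illustrative 2-D data; each iteration
# re-evaluates the KDE on a grid around the current peak and shrinks the window.
import numpy as np

data = np.random.normal(loc=[0.01, -0.005], scale=0.01, size=(1000, 2))
print(find_centroid(data))   # should land near (0.01, -0.005)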
def test_density_plot():
    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    N = 20
    # sizes must be integers; 0.3 * N would raise a TypeError in modern NumPy
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
    print(np.shape(X))
    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    print(np.shape(X_plot))
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax[0, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    ax[0, 0].text(-3.5, 0.31, "Gaussian Kernel Density")
    ax[0, 0].plot(X[:, 0], np.zeros(X.shape[0]) - 0.01, '+k')
    plt.show()
def makeKDE(m):
    m = m[(m > -100) & (m < 100)]
    m = m[:, np.newaxis]  # training data
    l = len(m)
    sigma = np.std(m)
    # rule-of-thumb bandwidth for a Gaussian kernel
    kdebw = (1. * 4 / 3 * sigma**5 / l)**(1. / 5.)
    # define X_plot before the try block so it exists on the failure path too
    X_plot = np.linspace(rng[0], rng[1], 1000)[:, np.newaxis]
    try:
        kde = KernelDensity(kernel='gaussian', bandwidth=kdebw).fit(m)
        log_dens = kde.score_samples(X_plot)
        log_dens_exp = np.exp(log_dens)
        KDE_mag = float(X_plot[np.argmax(log_dens_exp)])
    except ValueError:
        log_dens_exp = np.ones(len(X_plot[:, 0])) * -99.99
        KDE_mag, sigma = -99.99, -99.99
    return X_plot, log_dens_exp, KDE_mag, sigma
def plot_2d(i1, i2):
    plt.clf()
    xy = np.vstack([output[:, i1], output[:, i2]]).T
    kde = KernelDensity(kernel='tophat', bandwidth=0.01, leaf_size=10).fit(xy)
    z = kde.score_samples(xy)
    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = output[idx, i1], output[idx, i2], z[idx]
    plt.xlabel('Component %i' % i1)
    plt.ylabel('Component %i' % i2)
    plt.scatter(x, y, c=z, s=10, edgecolor='')
    plt.savefig('ML_data/%s_pca_%s_%s.png' % (name, i1, i2))
def simplify_data2(x, y, size):
    avg = []
    result = []
    kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(x)
    s = np.linspace(0, size, len(x))
    e = kde.score_samples(s.reshape(-1, 1))
    mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]
    start = 0
    # average x between consecutive density minima
    for i in mi:
        val = np.average(x[start:i])
        for j in range(start, i):
            result.append(val)
        start = i
    val = np.average(x[start:])
    for j in range(start, len(x)):
        result.append(val)
    print(mi)
    print(ma)
    plt.plot(s, x.reshape(1, -1)[0])
    plt.plot(s, result)
    plt.show()
def doKDEBasedPlot(dataSamples, targetName, featureName, doSave):
    if doSkipDraw:
        return
    doSave = doSaveGlobal

    fields = targetName.split(":")
    targetJustName = fields[len(fields) - 1]
    if targetJustName.find("PAPI_") != -1:
        targetJustName = targetJustName[5:]

    fields = featureName.split(":")
    featureName = fields[len(fields) - 1]

    # appName may also be e.g. "Linpack", "Matrix Multiplication",
    # "Sparse Matrix Vector Multiplication", "Black Scholes" or "PageRank"
    appName = "FFmpeg"
    if os.getenv("APPNAME") is not None:
        appName = os.getenv("APPNAME")
    print("\nappName =", appName, "\n")

    dumpDir = codeDir + "/gold_histograms/"
    if appName == "Sparse Matrix Vector Multiplication":
        dumpDir = dumpDir + "SPARSE_MATRIX_MUL"
    if appName == "Linpack":
        dumpDir = dumpDir + "LINPACK"
    if appName == "Matrix Multiplication":
        dumpDir = dumpDir + "MATRIX_MUL"
    if appName == "FFmpeg":
        dumpDir = dumpDir + "FFmpeg"
    if appName == "PageRank":
        dumpDir = dumpDir + "PAGE_RANK"
    if appName == "Black Scholes":
        dumpDir = dumpDir + "BLACKSCHOLES"
    if appName == "LULESH":
        dumpDir = dumpDir + "LULESH"
    if appName == "CoMD":
        dumpDir = dumpDir + "CoMD"

    if appName == "Sparse Matrix Vector Multiplication":
        dataSamples = [x / 10.0 for x in dataSamples]

    baseName = "errHisto_" + getGlobalObject("baseModuleName") + ":" + targetName + "_" + featureName
    fileName1 = baseName + "_kde.png"
    fileName2 = baseName + "_tight_kde.png"
    fileName3 = baseName + "_kde.eps"
    fileName4 = baseName + "_tight_kde.eps"
    saveFileName1 = os.path.join(dumpDir, fileName1)
    saveFileName2 = os.path.join(dumpDir, fileName2)
    saveFileName3 = os.path.join(dumpDir, fileName3)
    saveFileName4 = os.path.join(dumpDir, fileName4)

    # ttl = "Error histogram: \n" + targetName + " for " + featureName
    ttl = getGlobalObject("baseModuleName") + " - Obs:" + targetJustName

    dataSamples = np.array(dataSamples)
    # The bandwidth could instead be cross-validated:
    # grid = GridSearchCV(KernelDensity(kernel='gaussian'),
    #                     {'bandwidth': np.linspace(0.1, 1.0, 5)}, cv=10)
    # grid.fit(dataSamples[:, None]); kde = grid.best_estimator_
    kde = KernelDensity(kernel='gaussian', bandwidth=0.17).fit(dataSamples[:, None])

    x_grid = np.linspace(-4.0, 4.0, 5000)
    pdf = kde.score_samples(x_grid[:, None])

    fig, ax = plt.subplots()
    ax.fill(x_grid, pdf, fc='g', alpha=0.75)
    # ax.hist(dataSamples, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)

    plt.title(ttl, fontsize=20)
    plt.xlabel('% Error in prediction', fontsize=20)
    plt.ylabel('Density', fontsize=20)
    plt.yticks(np.linspace(y[0], y[len(y) - 2], 5))  # note: `y` comes from module scope
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    if doSave == False:
        plt.show()
    else:
        print("SaveFileName = " + saveFileName1)
        plt.savefig(saveFileName1, format='png', dpi=800)
        plt.savefig(saveFileName2, format='png', dpi=800, bbox_inches='tight')
        plt.savefig(saveFileName3, format='eps', dpi=800)
        plt.savefig(saveFileName4, format='eps', dpi=800, bbox_inches='tight')
        plt.close()
# PCA
pca = PCA(n_components=20)
pca.fit(msa_vectors[1000:])
a_samps_pca = pca.transform(msa_vectors[1000:])
b_samps_pca = pca.transform(msa_vectors[:1000])
print(a_samps_pca.shape)

# KDE
# for bw in [.01, .1, 1., 10.]:
for bw in [1.]:
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(a_samps_pca)
    print(bw, kde.score(b_samps_pca))
    densities = kde.score_samples(b_samps_pca)

# scale densities to between 0 and 1
min_density = np.min(densities)
densities = densities - min_density + 1.
weights = np.reciprocal(densities)
max_weights = np.max(weights)
weights = weights / max_weights
print(np.max(weights))
print(np.mean(weights))
print(np.min(weights))