def __init__(self, summaryfile=None, inwave=None, indata=None, inerrs=None,
             inmask=None, smooth=None):
    """Store a spectrum, interpolate over masked samples and smooth it.

    Args:
        summaryfile: If truthy, handed to ``self.open_summary`` after the
            array arguments have been stored (presumably it loads the
            arrays from disk -- confirm against ``open_summary``).
        inwave: Wavelength grid, stored as ``self.wave``.
        indata: Data values, stored as ``self.data``. Masked entries are
            overwritten with NaN in place.
        inerrs: Uncertainties, stored as ``self.errs``.
        inmask: Boolean array; ``True`` marks samples to blank out.
        smooth: ``'ll'`` smooths the interpolated data with a kernel
            regression of fixed bandwidth 10, ``'box'`` applies a 3-sample
            running mean to the data, anything else leaves the data as is.

    Sets ``nonnanidx``, ``interp``, ``interr`` and ``smoothed`` and finally
    calls ``self._build_plot()``.
    """
    self.data = indata
    self.wave = inwave
    self.errs = inerrs
    self.mask = inmask
    if summaryfile:
        self.open_summary(summaryfile)

    # Interpolate masked areas
    self.data[self.mask] = np.nan
    self.nonnanidx = np.where(~self.mask)[0]
    good_wave = self.wave[self.nonnanidx]
    self.interp = np.interp(self.wave, good_wave, self.data[self.nonnanidx])
    self.interr = np.interp(self.wave, good_wave, self.errs[self.nonnanidx])

    if smooth == 'll':
        lle = KernelReg(self.interp, self.wave, 'c', bw=[10])
        self.smoothed = lle.fit()[0]
    elif smooth == 'box':
        # The result used to be dropped, leaving ``self.smoothed`` unset.
        # mode='same' keeps the output aligned with ``self.wave``; the float
        # kernel avoids integer division under Python 2.
        self.smoothed = np.convolve(self.data, np.ones(3) / 3, mode='same')
    else:
        self.smoothed = self.data
    self._build_plot()
def dataSmoothing3(changes):
    """Smooth a sequence with kernel regression, then resample it.

    The values in ``changes`` are regressed on their 1-based position.
    Series shorter than 20 points are evaluated on a grid three times
    denser than the input; longer ones on the original positions. The
    smoothed curve is passed to ``dataResampling``.

    Returns:
        tuple: ``(resampled_values, r_squared)`` where ``r_squared`` is the
        fit quality reported by the kernel regression.
    """
    n_obs = len(changes)
    periods = np.linspace(1, n_obs, num=n_obs, endpoint=True)
    model = KernelReg(np.array(changes), periods, 'c')
    r_fit = model.r_squared()

    if n_obs < 20:
        grid = np.linspace(1, n_obs, num=3 * n_obs, endpoint=True)
    else:
        grid = periods
    smoothed, _ = model.fit(grid)

    return dataResampling(smoothed), r_fit
class NadarayaWatsonUNLR(UnivariateNonlinearRegressor):
    """Univariate kernel regressor with a fixed, user-chosen bandwidth.

    NOTE(review): ``KernelReg`` is built without ``reg_type``, so the
    statsmodels default applies -- confirm that this is the estimator the
    class name promises.
    """

    kernel: KernelReg
    bandwidth: float

    def __init__(
        self,
        bandwidth: float = 0.25,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ):
        """Instantiates a kernel regression model.

        Args:
            bandwidth: affects the scale on which to locally average samples
            random_state: random state which effects sample bootstrapping
        """
        super().__init__(random_state)
        self.bandwidth = bandwidth

    def _fit_univariate(self, x: np.ndarray, y: np.ndarray, w: Optional[np.ndarray]) -> None:
        """Build the kernel model, resampling by weight first when ``w`` is given."""
        if w is not None:
            x, y = self.weighted_resampler(x, y, w)
        self.kernel = KernelReg(
            endog=y,
            exog=x,
            var_type="c",
            bw=[self.bandwidth],
        )

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the fitted conditional mean at ``x``."""
        mean, _ = self.kernel.fit(x)
        return mean

    def derivative(self, x: np.ndarray) -> np.ndarray:
        """Return the flattened marginal effects at ``x``."""
        _, marginal_effects = self.kernel.fit(x)
        return marginal_effects.ravel()
def integrated_calibration_index_mod(y, p):
    """Calibration index variant that uses a local regression.

    Regresses ``y`` on ``p`` with a local-linear kernel regression and
    returns the mean absolute error between ``y`` and the fitted values.

    TODO: statsmodels.nonparametric.kernel_regression.KernelReg is very
    slow. Would it be better to implement this in C++ or similar?

    NOTE(review): ``p`` is declared as an ordered discrete variable
    (``var_type='o'``) -- confirm this is intended.
    """
    calibration_curve = KernelReg(endog=y, exog=p, reg_type='ll', var_type='o')
    fitted, _ = calibration_curve.fit()
    return mean_absolute_error(y, fitted)
def get_fitted_values(week):
    """Predict streams for ranks 1..200 with a mixed-type kernel regression.

    ``week`` selects the rows of ``week_f_show`` whose last ``s_streams``
    value is used as the continuous regressor for every predicted rank.

    Returns:
        The fitted values for ranks 1 to 200.
    """
    # Build the working frame the model operates on (first column dropped).
    working_df = pd.read_csv(get_paths()[1]+"all_spotify.csv")
    working_df = working_df.drop(working_df.columns[[0]], axis=1)

    # Run the regression.
    streams = np.array(working_df["streams"].tolist())
    ranks = np.array(working_df["rank"].tolist())
    s_streams = np.array(working_df["s_streams"].tolist())

    n_obs = len(streams)
    # Bandwidth of the continuous variable: standard deviation * n^(-1/5).
    b_c = (np.var(s_streams)) ** 0.5 * (n_obs ** (-1/5))
    print(b_c)
    # Bandwidth of the ordered discrete variable: n^(-2/5).
    b_o = n_obs ** (-2/5)
    print(b_o)

    model = KernelReg(streams, [ranks, s_streams], var_type="oc",
                      reg_type="ll", bw=[b_o, b_c])

    week_rows = working_df[working_df["week_f_show"] == week]
    last_week_sstreams = week_rows["s_streams"].values[-1]

    rank_grid = list(range(1, 201))
    sstreams_grid = [last_week_sstreams] * 200
    return model.fit([rank_grid, sstreams_grid])[0]
def FWHM(wave, pertdata, mode='data', imin=False, ll_bw='cv_ls'):
    """Measure the full width at half maximum of every row of ``pertdata``.

    Args:
        wave: Abscissa shared by all rows.
        pertdata: 2-D array; each row is one profile.
        mode: ``'data'`` uses the rows as given; ``'ll'`` or ``'lc'`` first
            smooths each row with a kernel regression of that type.
        imin: When true, also return peak position, ``1 - max`` and the
            smoothed rows.
        ll_bw: Bandwidth (or selection method) passed to the regression.

    Returns:
        The widths, or ``(widths, peak_positions, 1 - max, smoothed_rows)``
        when ``imin`` is true.
    """
    widths, depths, peak_positions, smoothed_rows = [], [], [], []
    smooth_first = mode in ('ll', 'lc')

    for row in tqdm(range(pertdata.shape[0])):
        profile = pertdata[row, :]
        if smooth_first:
            lle = KernelReg(profile, wave, 'c', reg_type=mode, bw=ll_bw)
            profile, _ = lle.fit()
            smoothed_rows.append(profile)
            print('LLE bandwidth: ', lle.bw[0], end="\r")

        # Resample on a fine grid so the half-maximum crossing is resolved.
        fine_wave = np.linspace(wave.min(), wave.max(), 1000)
        fine_profile = np.interp(fine_wave, wave, profile)
        above_half = np.where(fine_profile > fine_profile.max() / 2)[0]
        first, last = above_half.min(), above_half.max()
        widths.append(fine_wave[last] - fine_wave[first])

        if imin:
            depths.append(1 - profile.max())
            peak_positions.append(fine_wave[fine_profile.argmax()])

    if not imin:
        return np.array(widths)
    return (np.array(widths), np.array(peak_positions), np.array(depths),
            np.array(smoothed_rows))
def kreg_demo1(hs=None, fast=True, fun='hisj'):
    """Compare KRegression to KernelReg from statsmodels.nonparametric

    Plots local-constant (p=0) and local-linear (p=1) KRegression fits, the
    noisy data, the true model and the statsmodels fit on one figure.

    Examples
    --------
    >>> kreg_demo1()
    """
    n_obs = 100
    # Fixed noise sample, originally drawn as
    # np.random.normal(loc=0, scale=0.075, size=(n_obs,))
    noise = np.array([
        -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525,
        0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784,
        -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097,
        0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183,
        -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959,
        0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057,
        -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139,
        0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877,
        0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899,
        -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498,
        0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802,
        0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446,
        -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111,
        -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788,
        -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828,
        -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477,
        -0.12458953, -0.11692081, 0.0413047, -0.09292439, -0.07042327,
        0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406,
        -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619,
        -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065])

    x = np.linspace(0, 1, n_obs)
    va_1 = 0.3 ** 2
    va_2 = 0.7 ** 2
    # True model: sum of two Gaussian bumps centred at 0 and 1.
    y0 = np.exp(-x ** 2 / (2 * va_1)) + 1.3 * np.exp(-(x - 1) ** 2 / (2 * va_2))
    y = y0 + noise

    kernel = Kernel('gauss', fun=fun)
    hopt = kernel.hisj(x)
    kreg = KRegression(x, y, p=0, hs=hs, kernel=kernel,
                       xmin=-2 * hopt, xmax=1 + 2 * hopt)
    if fast:
        # NOTE(review): this sets ``__call__`` on the instance; confirm that
        # ``kreg(...)`` below actually dispatches to it.
        kreg.__call__ = kreg.eval_grid_fast

    fit_p0 = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    plt.figure(0)
    fit_p0.plot(label='p=0')

    kreg.p = 1
    fit_p1 = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    fit_p1.plot(label='p=1')
    # print(fit_p1.data)
    plt.plot(x, y, '.', label='data')
    plt.plot(x, y0, 'k', label='True model')

    from statsmodels.nonparametric.kernel_regression import KernelReg
    sm_model = KernelReg(y, x, 'c')
    sm_fit = sm_model.fit(x)
    plt.plot(x, sm_fit[0], 'm', label='statsmodel')

    plt.legend()
def smooth_xy(x, y):
    """Smooth ``y`` over ``x`` with a local-constant kernel regression.

    Both inputs are squeezed first. A fixed bandwidth of 0.01 is used.

    Returns:
        tuple: ``(x, y_smooth)`` with the squeezed ``x`` and the fitted
        values evaluated at those same points.
    """
    x = np.squeeze(x)
    y = np.squeeze(y)
    # Pass the bandwidth to the constructor. Without ``bw`` the constructor
    # runs a cross-validated bandwidth search, and that result used to be
    # overwritten on the very next line.
    kernel_reg = KernelReg(y, x, var_type='c', reg_type='lc', bw=[.01])
    y_smooth = kernel_reg.fit(x)[0]
    return x, y_smooth
def fit(self, X, y, variable_types=None):
    """Build a local-linear kernel regression of ``y`` on ``X``.

    Args:
        X: Regressors; ``X.shape`` is recorded and, when ``variable_types``
            is given, ``X.columns`` is used to look up each column's type.
        y: Response; ``y.shape`` is recorded.
        variable_types: Optional mapping from column name to a KernelReg
            variable-type character. When omitted or empty every column is
            treated as continuous (``'c'``).

    Returns:
        self, with the model stored in ``self.model``.
    """
    # ``None`` replaces the former mutable ``{}`` default; both are falsy,
    # so callers that omit the argument see the same behaviour.
    self.X_shape = X.shape
    self.y_shape = y.shape
    if variable_types:
        variable_type_string = "".join(variable_types[col] for col in X.columns)
    else:
        variable_type_string = "c" * X.shape[1]
    self.model = KernelReg(y, X, variable_type_string, reg_type="ll")
    return self
def fit(self, X, y, variable_types=None):
    """Build a local-linear kernel regression of ``y`` on ``X``.

    Args:
        X: Regressors; ``X.shape`` is recorded and, when ``variable_types``
            is given, ``X.columns`` is used to look up each column's type.
        y: Response; ``y.shape`` is recorded.
        variable_types: Optional mapping from column name to a KernelReg
            variable-type character. When omitted or empty every column is
            treated as continuous (``'c'``).

    Returns:
        self, with the model stored in ``self.model``.
    """
    # ``None`` replaces the former mutable ``{}`` default; both are falsy,
    # so callers that omit the argument see the same behaviour.
    self.X_shape = X.shape
    self.y_shape = y.shape
    if variable_types:
        variable_type_string = ''.join(variable_types[col] for col in X.columns)
    else:
        variable_type_string = 'c' * X.shape[1]
    self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
    return self
def pred_from_loess(self, train_x, train_y, x_to_pred):
    """Fit a kernel regression on the training data and predict new points.

    The regression treats ``train_x`` as one continuous variable and uses
    ``self.bandwidth`` as its fixed bandwidth.

    Returns:
        The fitted conditional mean at ``x_to_pred``.
    """
    smoother = KernelReg(
        endog=train_y,
        exog=train_x,
        var_type='c',
        bw=[self.bandwidth],
    )
    predicted, _ = smoother.fit(x_to_pred)
    return predicted
def __init__(self, x, y, yerr=None):
    """Smooth ``y(x)`` by kernel regression and wrap the result in a spline.

    Sets ``self.spline`` (constant extrapolation outside the data) and
    ``self.rms``, the root mean square of the spline over 1000 evenly
    spaced points spanning ``x``. ``yerr`` is accepted but not used.
    """
    smoother = KernelReg([y], [x], var_type='c', reg_type='ll')
    smoothed, _ = smoother.fit(x)
    # Non-finite smoothed values receive zero weight in the spline fit.
    weights = np.isfinite(smoothed)
    self.spline = interp.UnivariateSpline(x, smoothed, w=weights, ext='const')

    # calculate RMS and normalize to stop normalization drifting
    n_grid = 1000
    grid = np.linspace(np.min(x), np.max(x), n_grid)
    model = self.spline(grid)
    self.rms = np.sqrt(np.sum(model ** 2) / n_grid)
def find_extrema(s, bw='cv_ls'):
    """Locate local extrema of a price series via kernel smoothing.

    Input:
        s: prices as pd.Series
        bw: bandwidth as str or array like

    Returns (in this order):
        extrema: extrema of prices as pd.Series
        prices: prices with 0-based index as pd.Series
        smoothed_extrema: extrema of smoothed prices as pd.Series
        smoothed_prices: kernel-regression smoothed prices as pd.Series
    """
    # Work on a copy with a 0-based integer index so the index itself can
    # serve as the regressor.
    frame = s.copy().reset_index()
    frame.columns = ['date', 'price']
    prices = frame['price']

    kr = KernelReg([prices.values], [prices.index.to_numpy()],
                   var_type='c', bw=bw)
    fitted = kr.fit([prices.index])

    # Use smoothed prices to determine local minima and maxima
    smooth_prices = pd.Series(data=fitted[0], index=prices.index)
    smooth_local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    smooth_local_min = argrelextrema(smooth_prices.values, np.less)[0]
    turning_points = np.sort(
        np.concatenate([smooth_local_max, smooth_local_min]))
    smooth_extrema = smooth_prices.loc[turning_points]

    # For every smoothed extremum away from the edges, take the extreme raw
    # price in the window [i - 2, i + 2).
    last_allowed = len(prices) - 1
    price_local_max_dt = [
        prices.iloc[i - 2:i + 2].idxmax()
        for i in smooth_local_max if 1 < i < last_allowed
    ]
    price_local_min_dt = [
        prices.iloc[i - 2:i + 2].idxmin()
        for i in smooth_local_min if 1 < i < last_allowed
    ]

    maxima = pd.Series(prices.loc[price_local_max_dt])
    minima = pd.Series(prices.loc[price_local_min_dt])
    extrema = pd.concat([maxima, minima]).sort_index()

    # Return series for each with bar as index
    return extrema, prices, smooth_extrema, smooth_prices
def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d, safety_check=False):
    """Build the surface models from a set of 3-D points.

    Args:
        f: Accepted for interface compatibility; ``self.f`` is always
            replaced by the kernel regression fitted below.
        f2: Callable stored as ``self.f2``.
        pts3d: Sequence of ``(x, y, z)`` points.
        left_pts, right_pts, oldpts3d: Stored unchanged.
        safety_check: Stored as ``self.safety_check``.

    Also loads the camera matrix from
    ``../calibration_data/camera_matrix.p`` and fits a 2-neighbour
    classifier on ``(x, y) -> ceil(z * 1e6)``.
    """
    self.f2 = f2
    self.safety_check = safety_check
    self.pts3d = np.matrix(pts3d)
    self.minimum = np.min(self.pts3d[:, 2])
    self.maximum = np.max(self.pts3d[:, 2])
    self.oldpts3d = oldpts3d
    self.left_pts = left_pts
    self.right_pts = right_pts

    # ``with`` closes the calibration file even if unpickling fails.
    with open("../calibration_data/camera_matrix.p", "rb") as calibration_file:
        self.cmat = pickle.load(calibration_file)

    pts2d = [pt[:2] for pt in pts3d]
    ptsz = [np.ceil(pt[2] * 1000000) for pt in pts3d]
    self.neigh = KNeighborsClassifier(n_neighbors=2)
    self.neigh.fit(pts2d, ptsz)

    # An Rbf interpolant used to be built here and was overwritten by the
    # kernel regression right away; that dead fit has been removed.
    pts3d = np.array(pts3d).T
    # Debug output, written so it prints the same under Python 2 and 3.
    print(pts3d.shape)
    print("%s %s" % (pts3d[:2, :].shape, pts3d[2, :].shape))
    self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')
def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
    """
    We want to calculate the causal effect of X and Y through
    back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
    for some admissable set of control variables, Z.  First we
    calculate the conditional density P(Y|X,Z), then the density
    P(Z).  We find the support of Z so we can properly sum over
    it later.  variable_types are a dictionary with the column name
    pointing to an element of set(['o', 'u', 'c']), for 'ordered',
    'unordered discrete', or 'continuous'.

    ``admissable_set`` defaults to an empty list. ``expectation`` enables
    the conditional-expectation regression. ``density`` is accepted but
    not read here.

    Raises:
        ValueError: if no variable types are given and inference yields
            none.
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    admissable_set = [] if admissable_set is None else admissable_set
    conditional_density_vars = causes + admissable_set
    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars

    # Larger problems use the efficient (sub-sampled) bandwidth estimation.
    if len(X) > 300 or max(len(causes + admissable_set), len(effects + admissable_set)) >= 3:
        self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)

    if variable_types:
        self.variable_types = variable_types
    else:
        self.variable_types = self.__infer_variable_types(X)
    if not self.variable_types:
        raise ValueError("variable_types must be provided: "
                         "no variable types could be inferred from X")

    # Always derive the type lists from ``self.variable_types``. They used
    # to be defined only when ``variable_types`` was passed, and the
    # bandwidth test dereferenced the raw (possibly None) argument.
    dep_type = [self.variable_types[var] for var in effects]
    indep_type = [self.variable_types[var] for var in conditional_density_vars]
    density_types = [self.variable_types[var] for var in admissable_set]

    if 'c' not in self.variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'

    if admissable_set:
        self.density = KDEMultivariate(X[admissable_set],
                                       var_type=''.join(density_types),
                                       bw=bw,
                                       defaults=self.defaults)

    self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                          exog=X[conditional_density_vars],
                                                          dep_type=''.join(dep_type),
                                                          indep_type=''.join(indep_type),
                                                          bw=bw,
                                                          defaults=self.defaults)
    if expectation:
        self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')

    self.support = self.__get_support(X)

    self.discrete_variables = [variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
    self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
    self.continuous_variables = [variable for variable, var_type in self.variable_types.items() if var_type == 'c']
    self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
class local_stack:
    """Kernel regression over all-continuous features with fit/predict methods."""

    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        """Build the kernel regression; one continuous type per column."""
        _, n_features = X_train.shape
        self.kernel = KernelReg(y_train, X_train, var_type='c' * n_features)

    def predict(self, X):
        """Return the fitted conditional mean at ``X``."""
        fitted = self.kernel.fit(X)
        return fitted[0]
class LocalRegression:
    """Single-variable kernel regression with a chainable ``fit``."""

    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        """Build the regression on the training data and return ``self``."""
        # By default, this function will do a local linear regression
        model = KernelReg(y_train, X_train, var_type='c')
        self.regression = model
        return self

    def predict(self, X_test):
        """Return the fitted conditional mean at ``X_test``."""
        fitted = self.regression.fit(X_test)
        return fitted[0]
def fitData(X, y, method):
    """Fit one of three models to ``(X, y)`` and return its predictions.

    ``method`` selects the model:
      * ``"simple-lr"``    -- ordinary linear regression, predicted at ``X``;
      * ``"nonpara-lr"``   -- kernel ridge with a linear kernel, predicted
        at ``X``;
      * ``"nonpara-poly"`` -- local-linear kernel regression, evaluated at
        the integers 0..599 as a column vector.

    Any other value returns ``None``.
    """
    if method == "simple-lr":
        return LinearRegression().fit(X, y).predict(X)

    if method == "nonpara-lr":
        return KernelRidge(kernel='linear').fit(X, y).predict(X)

    if method == "nonpara-poly":
        smoother = KernelReg(endog=y, exog=X, var_type='c', reg_type='ll')
        grid = np.reshape(range(600), (-1, 1))
        fitted, _ = smoother.fit(grid)
        return fitted

    return None
def compute_arrival_rate(volume, duration, strikes):
    """Return the log ratio of kernel-smoothed volume to smoothed duration.

    ``volume.sum()`` and ``duration.sum()`` are placed side by side under
    the keys 'Volume' and 'Duration'. For each of the two columns and each
    'Half-spread' group a local-constant kernel regression is built and
    evaluated at ``strikes``.

    NOTE(review): assumes the summed inputs carry a 'Half-spread' index
    level and that the remaining index is the regressor -- confirm against
    the callers.
    """
    volume_duration = pd.concat([volume.sum(), duration.sum()],
                                keys=['Volume', 'Duration'], axis=1)
    # One KernelReg per (column, half-spread group): the group's values are
    # the response, its remaining index the single continuous regressor.
    volume_duration_kernel = volume_duration.apply(
        lambda vd: vd.groupby('Half-spread').apply(lambda d: KernelReg(
            d.xs(d.name, level='Half-spread'),
            d.xs(d.name, level='Half-spread').index, 'c', 'lc')))
    # Evaluate every stored regression at the requested strikes.
    arrival_rate = volume_duration_kernel.apply(
        lambda vd: vd.groupby('Half-spread').apply(lambda k: pd.Series(
            k.xs(k.name).fit(strikes)[0], strikes)))
    return np.log(arrival_rate['Volume'] / arrival_rate['Duration'])
def apply_code(self, mqb, ctx):
    """Emit pattern-based predictions for every configured duration.

    ``ctx['iter_count']`` is incremented on each call. A duration is only
    evaluated once the counter exceeds ``duration * self.TIMES_IN_WINDOW``.
    For such a duration the recent ``close_mid`` values are smoothed and
    their extrema obtained from ``self.find_max_min``:

      * five extrema, rising peaks (e1 < e3 < e5), second trough above the
        first, last price above e5 -> value ``1``;
      * six extrema, rising peaks and falling troughs (e6 < e4 < e2), last
        price below e6 -> value ``-1``.

    In both cases e1, e3, e5 must lie above every trough.

    Returns:
        list of ``{'duration': ..., 'value': ...}`` dicts (possibly empty).
    """
    predictions = []
    ctx['iter_count'] += 1
    for duration in self.DURATIONS:
        if ctx['iter_count'] <= duration * self.TIMES_IN_WINDOW:
            continue

        close_mid_values = mqb['close_mid'].last_with_duration(
            self.TIMES_IN_WINDOW, duration)
        indexes = linspace(1., len(close_mid_values), len(close_mid_values))
        prices = pd.Series(index=indexes, data=close_mid_values)
        # One continuous regressor takes exactly one bandwidth; the former
        # ``[1.8, 1]`` supplied a stray second entry.
        kr = KernelReg([prices.values], [prices.index.values],
                       var_type='c', bw=[1.8])
        max_mins = self.find_max_min(prices, kr)

        n_extrema = max_mins.shape[0]
        if n_extrema == 5:
            e1, e2, e3, e4, e5 = [max_mins.iloc[i] for i in range(5)]
            peaks_above_troughs = all(
                peak > trough for peak in (e1, e3, e5) for trough in (e2, e4))
            if (peaks_above_troughs and e5 > e3 > e1 and e2 < e4
                    and close_mid_values[-1] > e5):
                predictions.append({'duration': duration, 'value': 1})
        elif n_extrema == 6:
            e1, e2, e3, e4, e5, e6 = [max_mins.iloc[i] for i in range(6)]
            # Formerly contained ``e1 > 6`` (typo for ``e1 > e6``) and a
            # duplicated ``e1 > e4`` where ``e3 > e4`` belongs.
            peaks_above_troughs = all(
                peak > trough
                for peak in (e1, e3, e5) for trough in (e2, e4, e6))
            if (peaks_above_troughs and e1 < e3 < e5 and e6 < e4 < e2
                    and close_mid_values[-1] < e6):
                predictions.append({'duration': duration, 'value': -1})
    return predictions
def find_max_min(prices):
    """
    Get min and max of a series consisting of prices

    The prices are smoothed with a kernel regression (bandwidth 1.8) over
    their 1-based position. For each extremum of the smoothed curve away
    from the edges, the extreme raw price in the window ``[i - 2, i + 2)``
    is selected.

    Returns:
        pd.Series of the selected prices, indexed by ``day_num`` (the
        0-based position of each selected price in ``prices``).

    NOTE(review): relies on ``prices`` having an unnamed index so that
    ``reset_index`` yields a column called 'index' -- confirm with callers.
    """
    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[1.8])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    # idxmax/idxmin return index *labels*, which is what ``.loc`` below
    # needs. ``argmax``/``argmin`` return positions inside the 4-element
    # window in current pandas and would select the wrong rows.
    last_allowed = len(prices) - 1
    price_local_max_dt = [
        prices.iloc[i - 2:i + 2].idxmax()
        for i in local_max if 1 < i < last_allowed
    ]
    price_local_min_dt = [
        prices.iloc[i - 2:i + 2].idxmin()
        for i in local_min if 1 < i < last_allowed
    ]

    # Name a copy rather than renaming the caller's series in place.
    named = prices.copy()
    named.name = 'price'
    maxima = pd.DataFrame(named.loc[price_local_max_dt])
    minima = pd.DataFrame(named.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]

    # Translate the selected labels into 0-based positions.
    p = named.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price
    return max_min
def calc_smooth(prices: pd.Series, *, bw: Union[np.ndarray, str] = 'cv_ls', a: float = None, use_array: bool = True) -> Union[pd.Series, np.ndarray]:
    """Return the price data after kernel-regression smoothing.

    NOTE(review): the original docstring called this a Nadaraya-Watson
    estimate, but the regression is built with ``reg_type='ll'`` --
    confirm which estimator is intended.

    Args:
        prices (pd.Series): Price data.
        bw (Union[np.ndarray,str]): Either a user-specified bandwidth or
            the method for bandwidth selection. Defaults to cv_ls.
        a (float, optional): The scaling ratio mentioned in the paper; when
            given, the bandwidth is multiplied by it. Defaults to None.
        use_array (bool, optional): True returns an ndarray, False returns
            a pd.Series indexed like ``prices``. Defaults to True.

    Returns:
        Union[pd.Series,np.ndarray]

    Raises:
        ValueError: if ``prices`` is not a pd.Series.
    """
    if not isinstance(prices, pd.Series):
        raise ValueError('prices必须为pd.Series')

    positions = np.arange(len(prices))
    kr = KernelReg(prices.values, positions,
                   var_type='c', reg_type='ll', bw=bw)
    if a is not None:
        kr.bw = a * kr.bw  # the paper uses 0.3 * h

    smoothed = kr.fit(positions)[0]
    if use_array:
        return smoothed
    return pd.Series(data=smoothed, index=prices.index)
def estimator_nw(data, est_kwargs=None, **kwargs):
    """Fit a kernel regression on the training split and predict both splits.

    Args:
        data: Nested mapping with ``data['x'][split]`` and
            ``data['y'][split]`` for the splits ``'Train'`` and ``'Test'``.
        est_kwargs: Optional dict; ``'reg_type'`` selects the estimator
            (``'lc'`` local constant by default, ``'ll'`` for local linear).
        **kwargs: Accepted and ignored.

    Returns:
        tuple: ``(betahat, prob, mrgeff)`` -- an empty array (the model has
        no coefficients) and two dicts keyed by split holding the fitted
        means and the marginal effects.
    """
    from statsmodels.nonparametric.kernel_regression import KernelReg
    # http://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kernel_density.EstimatorSettings.html
    from statsmodels.nonparametric.kernel_regression import EstimatorSettings

    # ``None`` replaces the former mutable ``{}`` default.
    est_kwargs = {} if est_kwargs is None else est_kwargs

    k = len(data['x']['Train'].T)  # number of regressors
    # Default is local constant (Nadaraya-Watson); 'll' gives local linear.
    reg_type = est_kwargs.get('reg_type', 'lc')

    # Estimate model
    nw = KernelReg(
        data['y']['Train'],
        data['x']['Train'],
        var_type='c' * k,  # all regressors continuous
        reg_type=reg_type,
        bw='aic',  # AIC (Hurvich) bandwidth; 'cv_ls' is least-squares CV
        defaults=EstimatorSettings(
            n_jobs=1,  # no parallelism
            efficient=True,
            randomize=True,  # bandwidth estimation on random subsamples
            n_res=25,  # number of resamples
            n_sub=50,  # size of each subsample
        ),
    )
    betahat = np.array([])  # nonparametric: no coefficients

    # Extract results
    prob, mrgeff = {}, {}
    for split in ('Train', 'Test'):
        prob[split], mrgeff[split] = nw.fit(data_predict=data['x'][split])
    return betahat, prob, mrgeff
def find_max_min(prices):
    """Return the local minima and maxima of a price series.

    The prices are smoothed with a kernel regression (bandwidth 2) over
    their 1-based position. For each extremum of the smoothed curve away
    from the edges, the extreme raw price in the window ``[i - 2, i + 2)``
    is selected.

    Returns:
        pd.Series of the selected prices, indexed by ``day_num`` (the
        0-based position of each selected price in ``prices``).

    NOTE(review): relies on ``prices`` having an unnamed index so that
    ``reset_index`` yields a column called 'index' -- confirm with callers.
    """
    prices_ = prices.copy()
    prices_.index = linspace(1., len(prices_), len(prices_))
    # A small bandwidth captures local structure, a large one the global
    # trend. ``bw`` may be a user-specified bandwidth or a selection method:
    # 'cv_ls' (least-squares cross-validation, the default) or 'aic'
    # (AIC Hurvich bandwidth estimation).
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[2])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    # idxmax/idxmin return index *labels*, which is what ``.loc`` below
    # needs. ``argmax``/``argmin`` return positions inside the 4-element
    # window in current pandas and would select the wrong rows.
    last_allowed = len(prices) - 1
    price_local_max_dt = [
        prices.iloc[i - 2:i + 2].idxmax()
        for i in local_max if 1 < i < last_allowed
    ]
    price_local_min_dt = [
        prices.iloc[i - 2:i + 2].idxmin()
        for i in local_min if 1 < i < last_allowed
    ]

    # Name a copy rather than renaming the caller's series in place.
    named = prices.copy()
    named.name = 'price'
    maxima = pd.DataFrame(named.loc[price_local_max_dt])
    minima = pd.DataFrame(named.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]

    # Translate the selected labels into 0-based positions.
    p = named.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price
    return max_min
class KernelModelWrapper(object):
    """Wrapper exposing a local-linear kernel regression through fit/predict."""

    def __init__(self):
        self.model = None
        self.variable_types = {}
        self.X_shape = None
        self.y_shape = None

    def fit(self, X, y, variable_types=None):
        """Build the kernel regression of ``y`` on ``X`` and return ``self``.

        ``variable_types`` optionally maps each column of ``X`` to a
        KernelReg variable-type character; when omitted or empty, every
        column is treated as continuous (``'c'``).
        """
        # ``None`` replaces the former mutable ``{}`` default; both are
        # falsy, so existing callers behave the same.
        self.X_shape = X.shape
        self.y_shape = y.shape
        if variable_types:
            variable_type_string = ''.join(variable_types[col] for col in X.columns)
        else:
            variable_type_string = 'c' * X.shape[1]
        self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
        return self

    def predict(self, X):
        """Return the fitted mean at ``X``.

        Raises:
            Exception: if ``X.shape`` differs from the shape seen by ``fit``.
        """
        if X.shape != self.X_shape:
            raise Exception("Expected shape {}, received {}".format(self.X_shape, X.shape))
        return self.model.fit(X)[0]
class KernelModelWrapper(object):
    """Wrapper exposing a local-linear kernel regression through fit/predict."""

    def __init__(self):
        self.model = None
        self.variable_types = {}
        self.X_shape = None
        self.y_shape = None

    def fit(self, X, y, variable_types=None):
        """Build the kernel regression of ``y`` on ``X`` and return ``self``.

        ``variable_types`` optionally maps each column of ``X`` to a
        KernelReg variable-type character; when omitted or empty, every
        column is treated as continuous (``'c'``).
        """
        # ``None`` replaces the former mutable ``{}`` default; both are
        # falsy, so existing callers behave the same.
        self.X_shape = X.shape
        self.y_shape = y.shape
        if variable_types:
            variable_type_string = ''.join(
                variable_types[col] for col in X.columns)
        else:
            variable_type_string = 'c' * X.shape[1]
        self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
        return self

    def predict(self, X):
        """Return the fitted mean at ``X``.

        Raises:
            Exception: if ``X.shape`` differs from the shape seen by ``fit``.
        """
        if X.shape != self.X_shape:
            raise Exception("Expected shape {}, received {}".format(
                self.X_shape, X.shape))
        return self.model.fit(X)[0]
class Surface:
    """Height model z = f(x, y) fitted to a cloud of 3-D points."""

    def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d, safety_check=False):
        """Build the surface models from a set of 3-D points.

        ``f`` is accepted for interface compatibility; ``self.f`` is always
        replaced by the kernel regression fitted here. The camera matrix is
        loaded from ``../calibration_data/camera_matrix.p`` and a
        2-neighbour classifier is fitted on ``(x, y) -> ceil(z * 1e6)``.
        """
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:, 2])
        self.maximum = np.max(self.pts3d[:, 2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts

        # ``with`` closes the calibration file even if unpickling fails.
        with open("../calibration_data/camera_matrix.p", "rb") as calibration_file:
            self.cmat = pickle.load(calibration_file)

        pts2d = [pt[:2] for pt in pts3d]
        ptsz = [np.ceil(pt[2] * 1000000) for pt in pts3d]
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)

        # An Rbf interpolant used to be built here and was overwritten by
        # the kernel regression right away; that dead fit has been removed.
        pts3d = np.array(pts3d).T
        # Debug output, written so it prints the same under Python 2 and 3.
        print(pts3d.shape)
        print("%s %s" % (pts3d[:2, :].shape, pts3d[2, :].shape))
        self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')

    def leftpixels_to_rframe(self, x, y):
        """Map left-image pixel ``(x, y)`` through ``self.cmat``.

        Two independent least-squares lines (x -> x, y -> y) are fitted
        from ``self.left_pts`` to ``self.oldpts3d``; the height comes from
        ``self.f2``. Returns ``self.cmat`` applied to the homogeneous point.
        """
        left_pts = self.left_pts
        pts3d = self.oldpts3d
        xin = np.array([a[0] for a in left_pts])
        bias = np.ones(len(xin))
        yin = np.array([a[1] for a in left_pts])

        xout = np.array([a[0] for a in pts3d])
        yout = np.array([a[1] for a in pts3d])

        A = np.vstack([xin, bias]).T
        m1, c1 = np.linalg.lstsq(A, xout)[0]

        A = np.vstack([yin, bias]).T
        m2, c2 = np.linalg.lstsq(A, yout)[0]

        xnew = m1 * x + c1
        ynew = m2 * y + c2
        cpoint = np.matrix([(xnew, ynew, self.f2(xnew, ynew))])
        pt = np.ones(4)
        pt[:3] = cpoint
        pred = self.cmat * np.matrix(pt).T
        return pred

    def query(self, x, y):
        """Return ``(x, y, z)`` with z predicted by the kernel regression.

        With ``safety_check`` enabled, a prediction more than 0.02 outside
        the observed z range is replaced by the nearest-neighbour estimate.
        """
        temp = self.f.fit(np.array((x, y)))[0][0]
        if not self.safety_check:
            return (x, y, temp)
        if temp < self.minimum - 0.02:
            temp = self.query_knn(x, y)[2]
        elif temp > self.maximum + 0.02:
            temp = self.query_knn(x, y)[2]
        print("asdf %s" % (temp,))
        return (x, y, temp)

    def query_knn(self, x, y):
        """Return ``(x, y, z)`` with z taken from the neighbour classifier."""
        return (x, y, (self.neigh.predict([[x, y]]) / 1000000.0)[0])

    def visualize(self):
        """Set up a figure and an evaluation grid over the fitted points."""
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pts3d = np.matrix(self.pts3d)
        f = self.f
        a, b = np.ravel(np.min(pts3d, axis=0)), np.ravel(np.max(pts3d, axis=0))
        extra_range = 0.0
        # xnew = np.arange(a[0] - extra_range,b[0] + extra_range,0.0001)
        # ynew = np.arange(a[1] - extra_range,b[1] + extra_range,0.0001)
        X, Y = np.mgrid[a[0] + .05:b[0] - .05:100j, a[1]:b[1]:100j]
# Kernel-smooth each ROI's values over event length, one bootstrap at a time,
# then save the smoothed curves and the bandwidths that produced them.
fairK = np.array((3, 5, 9, 15, 20, 25, 30, 35, 40, 45))
event_lengths = durs_run1_new / fairK
unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()
test_x = np.linspace(min(x), max(x), num=100)

n_rois = len(ROI_data)
smooth_wva = np.zeros((len(unique_event_lengths), n_rois, nBoots))
opt_bw_holder = np.zeros((nBoots, n_rois))

for ROI in range(n_rois):
    for b in range(nBoots):
        y = ROI_data[ROI][:, :, b].ravel()
        # First pass: let KernelReg choose its bandwidth.
        # NOTE(review): the chosen bandwidth is divided by the number of
        # ROIs before being reused -- confirm this scaling is intended.
        KR = KernelReg(y, x, var_type='c')
        opt_bw = 0 + KR.bw / n_rois
        opt_bw_holder[b, ROI] = opt_bw
        # Second pass: refit with that bandwidth and evaluate at the
        # unique event lengths.
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_bandwidths', smooth_wva)
np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_optimal_bandwidth', opt_bw_holder)
class CausalEffect(object):
    """Back-door adjusted causal effect estimated with kernel densities."""

    def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.

        ``admissable_set`` defaults to an empty list. ``expectation``
        enables the conditional-expectation regression. ``density`` is
        accepted but not read here.

        Raises:
            ValueError: if no variable types are given and inference
                yields none.
        """
        # ``None`` replaces the former shared mutable ``[]`` default.
        admissable_set = [] if admissable_set is None else admissable_set
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
        else:
            self.variable_types = self.__infer_variable_types(X)
        if not self.variable_types:
            raise ValueError("variable_types must be provided: "
                             "no variable types could be inferred from X")

        # Always derive the type lists from ``self.variable_types``. They
        # used to be defined only when ``variable_types`` was passed, and
        # the bandwidth test dereferenced the raw (possibly None) argument.
        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [self.variable_types[var] for var in conditional_density_vars]
        density_types = [self.variable_types[var] for var in admissable_set]

        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                              exog=X[conditional_density_vars],
                                                              dep_type=''.join(dep_type),
                                                              indep_type=''.join(indep_type),
                                                              bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                     X[conditional_density_vars].values,
                                                     ''.join(indep_type),
                                                     bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [variable for variable, var_type in self.variable_types.items() if var_type == 'c']
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {variable: (X[variable].min(), X[variable].max()) for variable in X.columns}
        variable_bandwidths = {variable: bw for variable, bw in zip(self.effects + self.conditional_density_vars, self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                # Pad continuous ranges by ten bandwidths on each side.
                lower_support = data_support[variable][0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        """Integrand P(Y|X,Z) * P(Z) for the density estimate."""
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({k: [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes + self.effects, args)})
        conditional = self.conditional_density.pdf(exog_predict=data[self.conditional_density_vars].values[0],
                                                   endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        """Integrand E[Y|X,Z] * P(Z) for the expected-value estimate."""
        data = pd.DataFrame({k: [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [range(*(int(self.support[variable][0]), int(self.support[variable][1]) + 1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function, continuous_Z_ranges, args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(exog_predict=exog_predictors,
                                                               endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function, continuous_Z_ranges, args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes], endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [range(*(int(self.support[variable][0]), int(self.support[variable][1]) + 1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.expectation_integration_function, continuous_Z_ranges, args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(data_predict=exog_predictors.values)[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.expectation_integration_function, continuous_Z_ranges, args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.kernel_regression import KernelReg

# Demo: recover a quartic curve from noisy samples with kernel regression
# and plot the samples, the fitted curve and the true curve together.

# 400 sorted abscissae drawn uniformly from [-2, 8).
x = np.sort(np.random.rand(400) * 10 - 2)

# Noise-free quartic that the regression is expected to follow.
y_clean = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * (x) + 14

# Observations: the clean curve plus uniform noise in [-25, 25).
noise = (np.random.rand(len(x)) - 0.5) * 50
y = y_clean + noise

# One continuous ('c') regressor; fit() is evaluated at the training points.
reg = KernelReg(endog=y, exog=x, var_type='c')
mean, mfx = reg.fit()

plt.figure()
plt.scatter(x, y)                      # noisy samples
plt.plot(x, mean, color="red")         # kernel-regression estimate
plt.plot(x, y_clean, color="green")    # ground truth
plt.show()
class Surface:
    """Height model z = f(x, y) fitted to a set of 3-D points.

    Two estimators are built from the same points:

    * ``self.f``     -- a ``KernelReg`` with two continuous regressors
                        (x, y) and z as the response; used by ``query``.
    * ``self.neigh`` -- a 2-nearest-neighbour classifier on (x, y) whose
                        labels are z scaled by 1e6 and rounded up; used by
                        ``query_knn`` as a fallback.
    """

    def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d, safety_check=False):
        """Fit the surface estimators and load the camera matrix.

        Args:
            f: accepted for interface compatibility only; the attribute
                ``self.f`` is always set to the KernelReg fitted below.
            f2: callable ``f2(x, y) -> z`` used by ``leftpixels_to_rframe``.
            pts3d: sequence of (x, y, z) points the surface is fitted to.
            left_pts: points whose first two coordinates are regressed
                against ``oldpts3d`` in ``leftpixels_to_rframe``.
            right_pts: stored on the instance; not read by this class.
            oldpts3d: points paired index-by-index with ``left_pts``.
            safety_check: when true, ``query`` replaces out-of-range
                predictions with the nearest-neighbour estimate.

        Raises:
            IOError/OSError: if the camera-matrix pickle cannot be opened.
        """
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:, 2])
        self.maximum = np.max(self.pts3d[:, 2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts

        # NOTE: pickle must only be used on trusted files; this path is a
        # local calibration artifact.  The context manager guarantees the
        # handle is closed even if unpickling fails.
        with open("../calibration_data/camera_matrix.p", "rb") as cmat_file:
            self.cmat = pickle.load(cmat_file)

        # KNN fallback: labels are z * 1e6 rounded up, undone in query_knn.
        pts2d = [pt[:2] for pt in pts3d]
        ptsz = [np.ceil(pt[2] * 1000000) for pt in pts3d]
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)

        # The former scipy Rbf fit was overwritten by the KernelReg on the
        # very next assignment, so it is no longer computed.
        pts3d = np.array(pts3d).T
        print(pts3d.shape)
        print("%s %s" % (pts3d[:2, :].shape, pts3d[2, :].shape))
        self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')

    def leftpixels_to_rframe(self, x, y):
        """Map a left-image point through two 1-D linear fits and ``cmat``.

        A least-squares line is fitted from the first coordinate of
        ``left_pts`` to the first coordinate of ``oldpts3d``, and likewise
        for the second coordinate.  The mapped (x, y) is lifted to 3-D with
        ``self.f2`` and multiplied, in homogeneous form, by ``self.cmat``.

        Returns:
            The matrix product ``self.cmat * [x', y', f2(x', y'), 1]^T``.
        """
        left_pts = self.left_pts
        pts3d = self.oldpts3d

        xin = np.array([a[0] for a in left_pts])
        yin = np.array([a[1] for a in left_pts])
        xout = np.array([a[0] for a in pts3d])
        yout = np.array([a[1] for a in pts3d])
        bias = np.ones(len(xin))

        # Independent fits: out = m * in + c for each axis.
        m1, c1 = np.linalg.lstsq(np.vstack([xin, bias]).T, xout)[0]
        m2, c2 = np.linalg.lstsq(np.vstack([yin, bias]).T, yout)[0]

        xnew = m1 * x + c1
        ynew = m2 * y + c2

        cpoint = np.matrix([(xnew, ynew, self.f2(xnew, ynew))])
        pt = np.ones(4)
        # Flatten the 1x3 matrix explicitly before filling the slice.
        pt[:3] = np.ravel(cpoint)
        return self.cmat * np.matrix(pt).T

    def query(self, x, y):
        """Return ``(x, y, z)`` with z predicted by the kernel regression.

        With ``safety_check`` enabled, a prediction more than 0.02 outside
        the [minimum, maximum] z-range of the fitted points is replaced by
        the nearest-neighbour estimate.
        """
        temp = self.f.fit(np.array((x, y)))[0][0]
        if not self.safety_check:
            return (x, y, temp)
        if temp < self.minimum - 0.02 or temp > self.maximum + 0.02:
            temp = self.query_knn(x, y)[2]
        # NOTE(review): debug trace kept from the original; its exact
        # position relative to the branch above could not be confirmed.
        print('asdf %s' % (temp,))
        return (x, y, temp)

    def query_knn(self, x, y):
        """Return ``(x, y, z)`` with z taken from the KNN classifier."""
        return (x, y, (self.neigh.predict([[x, y]]) / 1000000.0)[0])

    def visualize(self):
        """Create a figure and a 100x100 evaluation grid over the points.

        NOTE(review): only the setup is present here -- nothing is drawn
        or returned; the method looks unfinished.
        """
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pts3d = np.matrix(self.pts3d)
        f = self.f
        # Per-axis minimum and maximum of the fitted points.
        a, b = np.ravel(np.min(pts3d, axis=0)), np.ravel(np.max(pts3d, axis=0))
        extra_range = 0.0
        # The x-range is shrunk by 0.05 at both ends; y uses the full range.
        X, Y = np.mgrid[a[0] + .05 :b[0] - .05 :100j, a[1]:b[1]:100j]
def fit(self, X_train, y_train):
    """Build a kernel regression of ``y_train`` on ``X_train``.

    ``X_train`` must expose a two-element ``shape``; its second entry gives
    the number of regressors, each of which is declared continuous ('c').
    The model is stored on ``self.kernel``; nothing is returned.
    """
    _, n_features = X_train.shape
    all_continuous = 'c' * n_features
    self.kernel = KernelReg(endog=y_train, exog=X_train, var_type=all_continuous)
# Kernel-smooth each ROI's values against event length, once per bootstrap
# sample, using a single bandwidth shared by all ROIs within the sample.
event_lengths = durs_run1_new / fairK
unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()

ROI_data = [a1_data, AG_data, prec_data, mpfc_data]
nROI = len(ROI_data)

# Fine grid spanning the observed event lengths, used to locate the peak.
test_x = np.linspace(min(x), max(x), num=100)
smooth_wva = np.zeros((len(unique_event_lengths), nROI, nBoots))

for b in range(nBoots):
    # Flattened response of every ROI for this bootstrap sample.
    boot_y = [roi_vals[:, :, b].ravel() for roi_vals in ROI_data]

    # Shared bandwidth: mean of the bandwidths each ROI's regression
    # selects on its own.
    opt_bw = 0
    for y in boot_y:
        opt_bw += KernelReg(y, x, var_type='c').bw / nROI

    max_wva = np.zeros(nROI)
    for ROI, y in enumerate(boot_y):
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        # Index of the peak of the smoothed curve on the fine grid.
        max_wva[ROI] = np.argmax(KR.fit(test_x)[0])
        # Smoothed curve evaluated at the distinct event lengths.
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(datadir + 'smooth_wva_split_merge_01_a1_prec_AG_bilmPFC', smooth_wva)
# Scatter four (second, tweet-count) series, fit one kernel regression per
# series and overlay the fitted curves.
x2, y2 = xax2, tweetatsec2
x3, y3 = xax3, tweetatsec3
x4, y4 = xax4, tweetatsec4

pyplot.xlabel('Second')
pyplot.ylabel('Total tweet')

# (abscissa, ordinate, colour) for each series, in drawing order.
_series = (
    (x, y, 'cyan'),
    (x2, y2, 'red'),
    (x3, y3, 'blue'),
    (x4, y4, 'green'),
)

for _xs, _ys, _colour in _series:
    pyplot.scatter(_xs, _ys, color=_colour)

# The regressor is declared as ordered discrete ('o') for every series.
kr, kr2, kr3, kr4 = [KernelReg(_ys, _xs, 'o') for _xs, _ys, _colour in _series]

for _xs, _ys, _colour in _series:
    pyplot.plot(_xs, _ys, '+')

# Each fit() returns a pair; both members are kept under the *_pred and
# *_std names.
(y_pred, y_std), (y2_pred, y2_std), (y3_pred, y3_std), (y4_pred, y4_std) = [
    _model.fit(_entry[0]) for _model, _entry in zip((kr, kr2, kr3, kr4), _series)
]

pyplot.plot(x, y_pred, 'cyan', label='twitter')
pyplot.plot(x2, y2_pred, 'red', label='facebook')
def fit(self, X_train, y_train):
    """Create the kernel regression for the training data.

    The regressor is declared continuous ('c') and every other KernelReg
    option is left at its default, which the original author describes as
    a local linear regression.

    Returns:
        ``self``, so calls can be chained.
    """
    model = KernelReg(endog=y_train, exog=X_train, var_type='c')
    self.regression = model
    return self
def __init__(self, X, causes, effects, admissable_set=[], variable_types=None, expectation=False, density=True):
    """
    We want to calculate the causal effect of X on Y through back-door
    adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) for some admissable
    set of control variables, Z.  First we calculate the conditional
    density P(Y|X,Z), then the density P(Z).  We find the support of Z so
    we can properly sum over it later.

    Args:
        X: data set indexed by column name (``X[list_of_columns]``).
        causes: list of cause column names.
        effects: list of effect column names.
        admissable_set: control-variable column names (Z); may be empty,
            in which case no density over Z is estimated.
        variable_types: dictionary mapping column name to an element of
            set(['o', 'u', 'c']), for 'ordered', 'unordered discrete', or
            'continuous'.  When omitted or empty the types are obtained
            from ``self.__infer_variable_types(X)``.
        expectation: when true, also fit a KernelReg for E[Y|X,Z].
        density: accepted for interface compatibility; not read here.
    """
    # Copy so the shared default list is never aliased onto the instance.
    admissable_set = list(admissable_set)
    conditional_density_vars = causes + admissable_set

    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars

    if variable_types:
        self.variable_types = variable_types
    else:
        self.variable_types = self.__infer_variable_types(X)

    # Every lookup below uses the resolved mapping.  Previously the inferred
    # branch left these lists undefined and then called .values() on None.
    resolved_types = self.variable_types
    dep_type = [resolved_types[var] for var in effects]
    indep_type = [resolved_types[var] for var in conditional_density_vars]
    density_types = [resolved_types[var] for var in admissable_set]

    # Bandwidth rule: 'cv_ml' when no variable is continuous, otherwise
    # 'normal_reference'.
    if 'c' not in resolved_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'

    if admissable_set:
        # P(Z)
        self.density = KDEMultivariate(X[admissable_set],
                                       var_type=''.join(density_types),
                                       bw=bw)

    # P(Y|X,Z)
    self.conditional_density = KDEMultivariateConditional(
        endog=X[effects],
        exog=X[conditional_density_vars],
        dep_type=''.join(dep_type),
        indep_type=''.join(indep_type),
        bw=bw)

    if expectation:
        # E[Y|X,Z]
        self.conditional_expectation = KernelReg(
            X[effects].values,
            X[conditional_density_vars].values,
            ''.join(indep_type),
            bw='cv_ls')

    self.support = self.__get_support(X)

    # Split the control variables by type for the later sum / integral.
    self.discrete_variables = [
        variable for variable, var_type in self.variable_types.items()
        if var_type in ['o', 'u']
    ]
    self.discrete_Z = list(
        set(self.discrete_variables).intersection(set(admissable_set)))
    self.continuous_variables = [
        variable for variable, var_type in self.variable_types.items()
        if var_type == 'c'
    ]
    self.continuous_Z = list(
        set(self.continuous_variables).intersection(set(admissable_set)))
def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d, safety_check=False):
    """Fit the surface estimators and load the camera matrix.

    Args:
        f: accepted for interface compatibility only; the attribute
            ``self.f`` is always set to the KernelReg fitted below.
        f2: stored on the instance as ``self.f2``.
        pts3d: sequence of (x, y, z) points the surface is fitted to.
        left_pts, right_pts, oldpts3d: stored on the instance unchanged.
        safety_check: stored on the instance as ``self.safety_check``.

    Raises:
        IOError/OSError: if the camera-matrix pickle cannot be opened.
    """
    self.f2 = f2
    self.safety_check = safety_check
    self.pts3d = np.matrix(pts3d)
    self.minimum = np.min(self.pts3d[:, 2])
    self.maximum = np.max(self.pts3d[:, 2])
    self.oldpts3d = oldpts3d
    self.left_pts = left_pts
    self.right_pts = right_pts

    # NOTE: pickle must only be used on trusted files; this path is a local
    # calibration artifact.  The context manager guarantees the handle is
    # closed even if unpickling fails.
    with open("../calibration_data/camera_matrix.p", "rb") as cmat_file:
        self.cmat = pickle.load(cmat_file)

    # 2-nearest-neighbour model on (x, y); labels are z * 1e6 rounded up.
    pts2d = [pt[:2] for pt in pts3d]
    ptsz = [np.ceil(pt[2] * 1000000) for pt in pts3d]
    self.neigh = KNeighborsClassifier(n_neighbors=2)
    self.neigh.fit(pts2d, ptsz)

    # The former scipy Rbf fit was overwritten by the KernelReg on the very
    # next assignment, so it is no longer computed.
    pts3d = np.array(pts3d).T
    print(pts3d.shape)
    print("%s %s" % (pts3d[:2, :].shape, pts3d[2, :].shape))
    # z as the response, (x, y) as two continuous ('cc') regressors.
    self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')