Example #1
File: nonlinear.py  Project: mikss/pr3
class NadarayaWatsonUNLR(UnivariateNonlinearRegressor):
    kernel: KernelReg
    bandwidth: float

    def __init__(
        self,
        bandwidth: float = 0.25,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ):
        """Instantiates a kernel regression model.

        Args:
            bandwidth: affects the scale on which to locally average samples
            random_state: random state which affects sample bootstrapping
        """
        super().__init__(random_state)
        self.bandwidth = bandwidth

    def _fit_univariate(self, x: np.ndarray, y: np.ndarray, w: Optional[np.ndarray]) -> None:
        if w is not None:
            x, y = self.weighted_resampler(x, y, w)
        self.kernel = KernelReg(endog=y, exog=x, var_type="c", bw=[self.bandwidth])

    def predict(self, x: np.ndarray) -> np.ndarray:
        return self.kernel.fit(x)[0]

    def derivative(self, x: np.ndarray) -> np.ndarray:
        return self.kernel.fit(x)[1].ravel()
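
For context, a minimal stand-alone sketch (not from the mikss/pr3 project) of the same fit on synthetic data; the 0.25 bandwidth mirrors the wrapper's default:

# Hedged sketch: the same KernelReg call on synthetic data.
import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(0, 1, 200))
y = np.sin(2 * np.pi * x) + rng.normal(scale=0.2, size=200)

kernel = KernelReg(endog=y, exog=x, var_type="c", bw=[0.25])
y_hat, dy_dx = kernel.fit(x)  # conditional mean and marginal effects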
Example #2
def FWHM(wave, pertdata, mode='data', imin=False, ll_bw='cv_ls'):
    """ Mode can be data, ll, lc
    """
    fwhms = []
    imins = []
    mvels = []
    LLEs = []
    for i in tqdm(range(pertdata.shape[0])):
        data = pertdata[i, :]
        if mode in ['ll', 'lc']:
            lle = KernelReg(data, wave, 'c', reg_type=mode, bw=ll_bw)
            data = lle.fit()[0]
            LLEs.append(data)
            print('LLE bandwidth: ', lle.bw[0], end="\r")
        iplwave = np.linspace(wave.min(), wave.max(), 1000)
        ipldata = np.interp(iplwave, wave, data)
        iplidx = np.where(ipldata > ipldata.max() / 2)[0]
        vmin, vmax = iplidx.min(), iplidx.max()
        fwhms.append(iplwave[vmax] - iplwave[vmin])
        if imin:
            imins.append(1 - data.max())
            mvels.append(iplwave[ipldata.argmax()])
    if imin:
        return np.array(fwhms), np.array(mvels), np.array(imins), np.array(
            LLEs)
    return np.array(fwhms)
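
A quick sanity check (a sketch, not project code) of the half-maximum width logic above, on a synthetic Gaussian profile whose analytic FWHM is 2*sqrt(2*ln 2)*sigma:

# Synthetic Gaussian line profile; the measured width should be ~2.355*sigma.
import numpy as np

wave = np.linspace(-5, 5, 200)
sigma = 1.0
profile = np.exp(-wave**2 / (2 * sigma**2))

iplwave = np.linspace(wave.min(), wave.max(), 1000)
ipldata = np.interp(iplwave, wave, profile)
iplidx = np.where(ipldata > ipldata.max() / 2)[0]
fwhm = iplwave[iplidx.max()] - iplwave[iplidx.min()]
print(fwhm, 2 * np.sqrt(2 * np.log(2)) * sigma)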
Example #3
 def __init__(self,
              summaryfile=None,
              inwave=None,
              indata=None,
              inerrs=None,
              inmask=None,
              smooth=None):
     self.data = indata
     self.wave = inwave
     self.errs = inerrs
     self.mask = inmask
     if summaryfile:
         self.open_summary(summaryfile)
         # Interpolate masked areas
         self.data[self.mask] = np.nan
         self.nonnanidx = np.where(~self.mask)[0]
         self.interp = np.interp(self.wave, self.wave[self.nonnanidx],
                                 self.data[self.nonnanidx])
         self.interr = np.interp(self.wave, self.wave[self.nonnanidx],
                                 self.errs[self.nonnanidx])
     if smooth == 'll':
         lle = KernelReg(self.interp, self.wave, 'c', bw=[10])
         mean, marg = lle.fit()
         del marg
         self.smoothed = mean
     elif smooth == 'box':
         # assign the boxcar-smoothed result; mode='same' keeps the length
         self.smoothed = np.convolve(self.data, np.array([1, 1, 1]) / 3,
                                     mode='same')
     else:
         self.smoothed = self.data
     self._build_plot()
Example #4
def dataSmoothing3(changes):
    length = len(changes)
    x = np.linspace(1, length, num=length, endpoint=True)
    y = np.array(changes)
    kr = KernelReg(y, x, 'c')
    r_fit = kr.r_squared()
    #plt.figure(1)
    #plt.subplot(131)
    #plt.plot(x, y, 'go-')
    #plt.title("Original",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    if length < 20:
        x1 = np.linspace(1, length, num=3 * length, endpoint=True)
    else:
        x1 = x
    y_pred, y_mfx = kr.fit(x1)  # conditional mean and marginal effects
    #plt.subplot(132)
    #plt.plot(x1, y_pred,'bo-')
    #plt.title("Smoothing",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    #plt.show()
    ynew = dataResampling(y_pred)
    xnew = np.linspace(1, 20, 20, endpoint=False)
    #plt.subplot(133)
    #plt.plot(xnew, ynew,'ro-')
    #plt.title("Resampling",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    #plt.show()
    return ynew, r_fit
Example #5
def integrated_calibration_index_mod(y, p):
    """
    local reg 使うバージョン
    TOOD: statsmodels.nonparametric.kernel_regression.KernReg がとても遅い. C++とかで実装したほうが良いのでは?
    """
    ll = KernelReg(endog=y, exog=p, reg_type='ll', var_type='o')
    return mean_absolute_error(y, ll.fit()[0])
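
A hedged usage sketch on synthetic data, assuming the function above is in scope and that mean_absolute_error comes from sklearn.metrics:

import numpy as np

rng = np.random.default_rng(1)
p = rng.uniform(0.05, 0.95, 300)               # predicted probabilities
y = (rng.uniform(size=300) < p).astype(float)  # binary outcomes

ici = integrated_calibration_index_mod(y, p)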
Example #6
def get_fitted_values(week):
    # week - selects for which week's s_spotify values to take s_streams

    # build the working_df the model will operate on
    working_df = pd.read_csv(get_paths()[1]+"all_spotify.csv")
    working_df = working_df.drop(working_df.columns[[0]], axis=1)

    # run the regression
    y = np.array(list(working_df["streams"]))
    x_r = np.array(list(working_df["rank"]))
    x_s = np.array(list(working_df["s_streams"]))

    # compute continuous variable bandwidth (rule of thumb)
    var_cont = (np.var(x_s))**0.5
    b_c = var_cont*(len(y)**(-1/5))
    print(b_c)

    # compute ordered discrete variable bandwidth
    b_o = len(y)**(-2/5)
    print(b_o)


    reg_new = KernelReg(y, [x_r, x_s], var_type="oc", reg_type = "ll", bw = [b_o, b_c]) 
    
    df_of_needed_week = working_df[working_df["week_f_show"] == week]
    last_week_sstreams = df_of_needed_week["s_streams"][-1:].values[0]
    fit_values = reg_new.fit([[i for i in range(1,201)],[last_week_sstreams for h in range(1,201) ]])[0]
    
    return fit_values
Example #7
File: demo.py  Project: morbult/pywafo
def kreg_demo1(hs=None, fast=True, fun='hisj'):
    """Compare KRegression to KernelReg from statsmodels.nonparametric

    Examples
    --------
    >>> kreg_demo1()
    """
    N = 100
    # ei = np.random.normal(loc=0, scale=0.075, size=(N,))
    ei = np.array([
        -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525,
        0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784,
        -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097,
        0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183,
        -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959,
        0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057,
        -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139,
        0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877,
        0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899,
        -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498,
        0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802,
        0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446,
        -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111,
        -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788,
        -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828,
        -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477,
        -0.12458953, -0.11692081, 0.0413047, -0.09292439, -0.07042327,
        0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406,
        -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619,
        -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065])
    x = np.linspace(0, 1, N)

    va_1 = 0.3 ** 2
    va_2 = 0.7 ** 2
    y0 = np.exp(-x ** 2 / (2 * va_1)) + 1.3 * np.exp(-(x - 1) ** 2 / (2 * va_2))
    y = y0 + ei
    kernel = Kernel('gauss', fun=fun)
    hopt = kernel.hisj(x)
    kreg = KRegression(
        x, y, p=0, hs=hs, kernel=kernel, xmin=-2 * hopt, xmax=1 + 2 * hopt)
    if fast:
        kreg.__call__ = kreg.eval_grid_fast

    f = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    plt.figure(0)
    f.plot(label='p=0')

    kreg.p = 1
    f1 = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    f1.plot(label='p=1')
    # print(f1.data)
    plt.plot(x, y, '.', label='data')
    plt.plot(x, y0, 'k', label='True model')
    from statsmodels.nonparametric.kernel_regression import KernelReg
    kreg2 = KernelReg(y, x, ('c'))
    y2 = kreg2.fit(x)
    plt.plot(x, y2[0], 'm', label='statsmodel')

    plt.legend()
Example #8
def smooth_xy(x, y):
    x = np.squeeze(x)
    y = np.squeeze(y)
    #v = lowess(y, x, frac=.05)
    kernel_reg = KernelReg(y, x, var_type='c', reg_type='lc')
    kernel_reg.bw = np.asarray([.01])
    y = kernel_reg.fit(x)[0]
    return x, y
Example #9
class local_stack:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        N, p = X_train.shape
        self.kernel = KernelReg(y_train, X_train, var_type=p * 'c')

    def predict(self, X):
        return self.kernel.fit(X)[0]
Example #10
    def pred_from_loess(self, train_x, train_y, x_to_pred):
        """
    	Trains simple loess regression and returns predictions
    	"""
        kr_model = KernelReg(endog=train_y,
                             exog=train_x,
                             var_type='c',
                             bw=[self.bandwidth])

        return kr_model.fit(x_to_pred)[0]
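
A small sketch (assumptions: synthetic data, a fixed bandwidth of 0.5) contrasting the fixed bandwidth used above with statsmodels' least-squares cross-validation default:

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.default_rng(3)
train_x = np.sort(rng.uniform(0, 10, 150))
train_y = np.log1p(train_x) + rng.normal(scale=0.1, size=150)

fixed = KernelReg(endog=train_y, exog=train_x, var_type='c', bw=[0.5])
cv = KernelReg(endog=train_y, exog=train_x, var_type='c', bw='cv_ls')
print(fixed.bw, cv.bw)  # cv.bw is selected by cross-validation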
Example #11
 def __init__(self, x, y, yerr=None):
     reg = KernelReg([y], [x], var_type='c', reg_type='ll')
     vals = reg.fit(x)[0]
     self.spline = interp.UnivariateSpline(x,
                                           vals,
                                           w=np.isfinite(vals),
                                           ext='const')
     # calculate RMS and normalize to stop normalization drifting
     xs = np.linspace(np.min(x), np.max(x), 1000)
     ys = self.spline(xs)
     self.rms = np.sqrt(np.sum(ys**2) / 1000)
Example #12
class LocalRegression:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        # By default, this function will do a local linear regression
        self.regression = KernelReg(y_train, X_train, var_type='c')
        return self

    def predict(self, X_test):
        return self.regression.fit(X_test)[0]
Example #13
def calc_smooth(prices: pd.Series, *, bw: Union[np.ndarray, str] = 'cv_ls', a: float = None, use_array: bool = True) -> Union[pd.Series, np.ndarray]:
    """Compute the Nadaraya-Watson kernel-smoothed price series

    Args:
        prices (pd.Series): price data
        bw (Union[np.ndarray, str]): Either a user-specified bandwidth or the method for bandwidth selection. Defaults to 'cv_ls'.
        a (float, optional): the bandwidth-scaling fraction described in the paper. Defaults to None.
        use_array (bool, optional): return an ndarray if True, a pd.Series if False. Defaults to True.

    Returns:
        Union[pd.Series, np.ndarray]
    """

    if not isinstance(prices, pd.Series):
        raise ValueError('prices must be a pd.Series')

    idx = np.arange(len(prices))

    kr = KernelReg(prices.values, idx,
                   var_type='c', reg_type='ll', bw=bw)

    if a is None:
        f = kr.fit(idx)[0]
    else:
        kr.bw = a * kr.bw  # the paper uses 0.3 * h
        f = kr.fit(idx)[0]

    if use_array:
        return f
    else:
        return pd.Series(data=f, index=prices.index)
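
A hypothetical call, assuming calc_smooth above is in scope along with numpy and pandas:

import numpy as np
import pandas as pd

prices = pd.Series(100 + np.cumsum(np.random.default_rng(2).normal(size=250)))
smoothed = calc_smooth(prices, a=0.3, use_array=False)  # 0.3 * h, per the comment above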
Example #14
def find_extrema(s, bw='cv_ls'):
    """
    Input:
        s: prices as pd.series
        bw: bandwidth as str or array-like
    Returns:
        extrema: extrema of prices as pd.series
        prices: prices with 0-based index as pd.series
        smooth_extrema: extrema of smoothed_prices as pd.series
        smooth_prices: smoothed prices using kernel regression as pd.series
    """
    # Copy series so we can replace index and perform non-parametric
    # kernel regression.
    prices = s.copy()
    prices = prices.reset_index()
    prices.columns = ['date', 'price']
    prices = prices['price']

    kr = KernelReg([prices.values], [prices.index.to_numpy()],
                   var_type='c',
                   bw=bw)
    f = kr.fit([prices.index])

    # Use smoothed prices to determine local minima and maxima
    smooth_prices = pd.Series(data=f[0], index=prices.index)
    smooth_local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    smooth_local_min = argrelextrema(smooth_prices.values, np.less)[0]
    local_max_min = np.sort(
        np.concatenate([smooth_local_max, smooth_local_min]))
    smooth_extrema = smooth_prices.loc[local_max_min]

    # Iterate over extrema arrays returning datetime of passed
    # prices array. Uses idxmax and idxmin to window for local extrema.
    price_local_max_dt = []
    for i in smooth_local_max:
        if (i > 1) and (i < len(prices) - 1):
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in smooth_local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    maxima = pd.Series(prices.loc[price_local_max_dt])
    minima = pd.Series(prices.loc[price_local_min_dt])
    extrema = pd.concat([maxima, minima]).sort_index()

    # Return series for each with bar as index
    return extrema, prices, smooth_extrema, smooth_prices
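
A hypothetical call, assuming find_extrema above is in scope; close is a synthetic date-indexed price series:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=120, freq='D')
close = pd.Series(100 + np.cumsum(np.random.default_rng(4).normal(size=120)),
                  index=idx)
extrema, prices, smooth_extrema, smooth_prices = find_extrema(close, bw=[1.5])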
Example #15
class KernelModelWrapper(object):
    def __init__(self):
        self.model = None
        self.variable_types = {}
        self.X_shape = None
        self.y_shape = None

    def fit(self, X, y, variable_types={}):
        self.X_shape = X.shape
        self.y_shape = y.shape
        if variable_types:
            variable_type_string = ''.join([variable_types[col] for col in X.columns])
            self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
        else:
            self.model = KernelReg(y, X, 'c' * X.shape[1], reg_type='ll')
        return self

    def predict(self, X):
        if X.shape != self.X_shape:
            raise Exception("Expected shape {}, received {}".format(self.X_shape, X.shape))
        return self.model.fit(X)[0]
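
Hypothetical usage with a small DataFrame, assuming the class above is in scope; the column-to-type mapping follows its variable_types convention:

import numpy as np
import pandas as pd

rng = np.random.default_rng(8)
X = pd.DataFrame({'a': rng.normal(size=100), 'b': rng.normal(size=100)})
y = X['a'] * 2 + X['b'] ** 2 + rng.normal(scale=0.1, size=100)

model = KernelModelWrapper().fit(X, y, variable_types={'a': 'c', 'b': 'c'})
preds = model.predict(X)  # predict() requires the training shape here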
Example #16
def find_max_min(prices):
    """
    Get min and max of a series consisting of prices
    """

    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    kr = KernelReg([prices_.values], [prices_.index.values],
                   var_type='c',
                   bw=[1.8])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            # idxmax returns the index label needed by .loc below
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price

    return max_min
Example #17
class KernelModelWrapper(object):
    def __init__(self):
        self.model = None
        self.variable_types = {}
        self.X_shape = None
        self.y_shape = None

    def fit(self, X, y, variable_types={}):
        self.X_shape = X.shape
        self.y_shape = y.shape
        if variable_types:
            variable_type_string = ''.join(
                [variable_types[col] for col in X.columns])
            self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
        else:
            self.model = KernelReg(y, X, 'c' * X.shape[1], reg_type='ll')
        return self

    def predict(self, X):
        if X.shape != self.X_shape:
            raise Exception("Expected shape {}, received {}".format(
                self.X_shape, X.shape))
        return self.model.fit(X)[0]
Example #18
def find_max_min(prices):
    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    #kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[1.8, 1])
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[2])  # a small bw captures local structure, a large one captures global structure!
    # Either a user-specified bandwidth or the method for bandwidth selection.
    # If a string, valid values are 'cv_ls' (least-squares cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
    # Default is 'cv_ls'.
    f = kr.fit([prices_.index.values])

    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]
    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            # idxmax returns the index label needed by .loc below
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price

    return max_min
Example #19
def estimator_nw(data, est_kwargs={}, **kwargs):
    from statsmodels.nonparametric.kernel_regression import KernelReg
    #http://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kernel_density.EstimatorSettings.html
    from statsmodels.nonparametric.kernel_regression import EstimatorSettings
    k = len(data['x']['Train'].T)
    #    n = len(data['x']['Train'])

    if 'reg_type' in est_kwargs.keys():
        reg_type = est_kwargs['reg_type']  # allows for locally linear estimation
    else:
        reg_type = 'lc'  # default is local constant (Nadaraya-Watson)

    #Estimate model
    nw = KernelReg(
        data['y']['Train'],
        data['x']['Train'],  #Fits regression
        var_type='c' * k,  #Continuous variables
        reg_type=reg_type,
        bw='aic',  # AIC Hurvich bandwidth selection; use 'cv_ls' for least-squares cross-validation
        defaults=EstimatorSettings(
            n_jobs=1,  #No parallel
            efficient=True,
            randomize=True,  #bw estimation random subsampling
            n_res=25,  #Number of resamples
            n_sub=50,  # Size of samples 
        ),
    )
    betahat = np.array([])  #NP does not have coefficients

    # Extract results
    prob, mrgeff = {}, {}
    for split in ('Train', 'Test'):
        prob[split], mrgeff[split] = nw.fit(data_predict=data['x'][split])

    return betahat, prob, mrgeff
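
A sketch of the nested dict layout estimator_nw expects ('x'/'y' keys, each with 'Train'/'Test' splits); the data here are synthetic:

import numpy as np

rng = np.random.default_rng(5)
X = rng.normal(size=(120, 2))
y = X @ np.array([1.0, -0.5]) + rng.normal(scale=0.1, size=120)
data = {'x': {'Train': X[:80], 'Test': X[80:]},
        'y': {'Train': y[:80], 'Test': y[80:]}}
betahat, prob, mrgeff = estimator_nw(data, est_kwargs={'reg_type': 'll'})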
Example #20
class Surface:

    def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d, safety_check=False):
        self.f = f
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:,2])
        self.maximum = np.max(self.pts3d[:,2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts
        pts2d = []
        ptsz = []
        f3 = open("../calibration_data/camera_matrix.p", "rb")
        self.cmat = pickle.load(f3)
        f3.close()
        
        for pt in pts3d:
            pts2d.append(pt[:2])
            ptsz.append(np.ceil(pt[2] * 1000000))
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)
        self.f = scipy.interpolate.Rbf(np.matrix(pts3d)[:,0].ravel(), np.matrix(pts3d)[:,1].ravel(), np.matrix(pts3d)[:,2].ravel(), function='linear', epsilon=.1)
        pts3d = np.array(pts3d).T
        print(pts3d.shape)
        print(pts3d[:2,:].shape, pts3d[2,:].shape)
        self.f = KernelReg(pts3d[2,:], pts3d[:2,:], 'cc')

    def leftpixels_to_rframe(self, x, y):
        surf = self.f2
        left_pts = self.left_pts
        right_pts = self.right_pts
        pts3d = self.oldpts3d
        xin = np.array([a[0] for a in left_pts])
        bias = np.ones(len(xin))
        yin = np.array([a[1] for a in left_pts])

        xout = np.array([a[0] for a in pts3d])
        yout = np.array([a[1] for a in pts3d])

        A = np.vstack([xin, bias]).T
        m1, c1 = np.linalg.lstsq(A, xout)[0]

        A = np.vstack([yin, bias]).T
        m2, c2 = np.linalg.lstsq(A, yout)[0]

        xnew = m1 * x + c1
        ynew = m2 * y + c2
        cpoint = np.matrix([(xnew, ynew, self.f2(xnew, ynew))])
        pt = np.ones(4)
        pt[:3] = cpoint
        pred = self.cmat * np.matrix(pt).T
        return pred

    def query(self, x, y):
        temp = self.f.fit(np.array((x, y)))[0][0]
        if not self.safety_check:
            return (x, y, temp)
        if temp < self.minimum - 0.02:
            temp = self.query_knn(x, y)[2]
        elif temp > self.maximum + 0.02:
            temp = self.query_knn(x, y)[2]
        print('asdf', temp)
        return (x, y, temp)

    def query_knn(self, x, y):
        return (x, y, (self.neigh.predict([[x, y]]) / 1000000.0)[0])

    def visualize(self):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pts3d = np.matrix(self.pts3d)
        f = self.f
        a, b =  np.ravel(np.min(pts3d, axis=0)), np.ravel(np.max(pts3d, axis=0))
        extra_range = 0.0
#         xnew = np.arange(a[0] - extra_range,b[0] + extra_range,0.0001)
#         ynew = np.arange(a[1] - extra_range,b[1] + extra_range,0.0001)
        X, Y = np.mgrid[a[0] + .05 :b[0] - .05 :100j, a[1]:b[1]:100j]
Example #21
fairK = np.array((3, 5, 9, 15, 20, 25, 30, 35, 40, 45))

event_lengths = durs_run1_new / fairK

unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()

test_x = np.linspace(min(x), max(x), num=100)
smooth_wva = np.zeros((len(unique_event_lengths), len(ROI_data), nBoots))

opt_bw_holder = np.zeros((nBoots, len(ROI_data)))

for ROI in range(len(ROI_data)):
    for b in range(nBoots):
        opt_bw = 0
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c')
        opt_bw += KR.bw / len(ROI_data)
        opt_bw_holder[b, ROI] = opt_bw
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_bandwidths', smooth_wva)
np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_optimal_bandwidth', opt_bw_holder)
Example #22
class CausalEffect(object):
    def __init__(self, X, causes, effects, admissable_set=[], variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) 
        for some admissable set of control variables, Z.  First we 
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars
        
        if variable_types:
            self.variable_types = variable_types
            dep_type      = [variable_types[var] for var in effects]
            indep_type    = [variable_types[var] for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'


        if admissable_set:            
            self.density = KDEMultivariate(X[admissable_set], 
                                  var_type=''.join(density_types),
                                  bw=bw)
        
        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                         exog=X[conditional_density_vars],
                                                         dep_type=''.join(dep_type),
                                                         indep_type=''.join(indep_type),
                                                         bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')

        self.support = self.__get_support(X)
        
        self.discrete_variables = [ variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [ variable for variable, var_type in self.variable_types.items() if var_type == 'c' ]
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
       
 
    def __infer_variable_types(self,X):
        """
        fill this in later.
        """
        pass
       
 
    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = { variable : (X[variable].min(), X[variable].max()) for variable in X.columns}
        variable_bandwidths = { variable : bw for variable, bw in zip(self.effects + self.conditional_density_vars, self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

        
    def integration_function(self,*args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes + self.effects, args)})
        conditional = self.conditional_density.pdf(exog_predict=data[self.conditional_density_vars].values[0], 
                                                   endog_predict=data[self.effects].values[0]) 
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def expectation_integration_function(self, *args):
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(exog_predict=exog_predictors, 
                                                               endog_predict=x[self.effects]) 
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],endog_predict=x[self.effects])

       
 
    def expected_value( self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.expectation_integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(data_predict=exog_predictors.values)[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.expectation_integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
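
A tiny discrete check of the adjustment formula in the docstring, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z ), with a hypothetical joint distribution over binary Z, X, Y (not tied to the class above):

import numpy as np

P = np.array([[[0.20, 0.05], [0.05, 0.10]],   # z=0: P(z, x, y)
              [[0.05, 0.10], [0.10, 0.35]]])  # z=1
Pz = P.sum(axis=(1, 2))
P_y_given_xz = P / P.sum(axis=2, keepdims=True)
p_do = sum(P_y_given_xz[z, 1, 1] * Pz[z] for z in range(2))  # P(Y=1|do(X=1))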
Example #23
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.kernel_regression import KernelReg

x = np.sort(np.random.rand(400) * 10 - 2)
y = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * (x) + 14 + (
    (np.random.rand(len(x)) - 0.5) * 50)
y_clean = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * (x) + 14

reg = KernelReg(y, x, 'c')
[mean, mfx] = reg.fit()

plt.figure()
plt.scatter(x, y)
plt.plot(x, mean, color="red")
plt.plot(x, y_clean, color="green")
plt.show()
Example #24
    def model(self):

        #Time the modelling (time.clock() was removed in Python 3.8)
        start_time = time.perf_counter()

        #Extract dependent and independent variables
        y = self.df['impl_volatility'].values
        x = self.df[['strike_price', 'stock', 'T', 'riskfree']].values

        #Activate efficient bandwidth selection
        if self.bandwidth is None:
            self.efficient = True
            self.bandwidth = 'cv_ls'
            print('No predetermined bandwidth selected. Optimizing the bandwidth.')

        #Bandwidth defined by Scott D.W.
        elif self.bandwidth == 'bw_scott':
            self.bandwidth = bw_scott(x)
            #self.bandwidth = self.bandwidth*()
            print('Selected bandwidth: ', self.bandwidth)

        #Bandwidth defined by Silverman B.W.
        elif self.bandwidth == 'bw_silverman':
            self.bandwidth = bw_silverman(x)
            print('Selected bandwidth: ', self.bandwidth)

        #Otherwise use the user-supplied bandwidth array
        else:
            pass

        #Optimize the bandwidth selection if no other bandwidth selection method is defined.
        #See more here on their github page
        #https://github.com/statsmodels/statsmodels/blob/master/statsmodels/nonparametric/_kernel_base.py
        defaults = EstimatorSettings(efficient=self.efficient,
                                     randomize=False,
                                     n_sub=50,
                                     n_res=50,
                                     n_jobs=0,
                                     return_only_bw=True)

        #Preprocess the data for faster computation
        x = preprocessing.normalize(x)

        #Split the data into training and testing data for in- and out-of-sample testing
        xtrain, xtest, ytrain, ytest = train_test_split(x, y)

        #Define the regressor, with continuous variables and the bandwidth selection
        reg = KernelReg(endog=ytrain,
                        exog=xtrain,
                        var_type='cccc',
                        bw=self.bandwidth,
                        defaults=defaults)

        #Fit the data onto the test data to get a out of sample prediction
        pred = reg.fit(xtest)[0]

        #Get the results from the test in the form of RMSE and in- and out-of-sample R^2
        print('RMSE: ', np.sqrt(mean_squared_error(ytest, pred)))
        print('Out of Sample R^2 :', r2_score(ytest, pred))
        #print ('In sample ' , reg.r_squared())

        #Print the computing time
        print('Estimation time: ', time.perf_counter() - start_time, "seconds")

        return reg
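
For reference, a hedged sketch of the rule-of-thumb bandwidths the branches above select, using bw_scott and bw_silverman from statsmodels.nonparametric.bandwidths:

import numpy as np
from statsmodels.nonparametric.bandwidths import bw_scott, bw_silverman

x = np.random.default_rng(6).normal(size=500)
print(bw_scott(x), bw_silverman(x))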
Example #25
                     eigen_solver="auto",
                     tol=1e-9,
                     max_iter=3000,
                     n_jobs=-1)
    feature_coords = kpca.fit_transform((sim_mat**2) * -0.5)

    landfalls = np.array([float(h.made_landfall) for h in hurricane_list])

    inds = np.argsort(feature_coords[:, 0])

    feature_coords_sorted = feature_coords[inds]
    landfalls_sorted = landfalls[inds]

    vartypes = ''.join('c' * target_dim)
    reg = KernelReg(landfalls_sorted, feature_coords_sorted, vartypes)
    [mean, mfx] = reg.fit()

    # plt.figure()
    # plt.scatter(feature_coords_sorted[:,0], landfalls_sorted, color="green")
    # plt.plot(feature_coords_sorted[:,0], mean, color="red")
    # plt.show()

    cv_feature_coords = kpca.transform((data_matrix**2) * -0.5)
    # print cv_feature_coords
    [cv_mean, cv_mfx] = reg.fit(cv_feature_coords)
    # print cv_mean

    cv_predicted = np.zeros(m)
    cv_high_prob = np.zeros(m) + 0.5
    num_high_prob = 0
    thresh = 0.05
Example #26
event_lengths = durs_run1_new/fairK

unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()

ROI_data = [a1_data, AG_data, prec_data, mpfc_data]
#ROI_data = [a1_data,AG_data,prec_data]

test_x = np.linspace(min(x), max(x), num=100)
smooth_wva = np.zeros((len(unique_event_lengths), len(ROI_data), nBoots))

for b in range(nBoots):
    # Optimize bandwidth
    opt_bw = 0
    for ROI in range(len(ROI_data)):
        y = ROI_data[ROI][:,:,b].ravel()
        KR = KernelReg(y,x,var_type='c')
        opt_bw += KR.bw/len(ROI_data)

    max_wva = np.zeros(len(ROI_data))
    for ROI in range(len(ROI_data)):
        y = ROI_data[ROI][:,:,b].ravel()
        KR = KernelReg(y,x,var_type='c', bw=opt_bw)
        max_wva[ROI] = np.argmax(KR.fit(test_x)[0])  # Find peak on fine grid
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(datadir + 'smooth_wva_split_merge_01_a1_prec_AG_bilmPFC',smooth_wva)


Example #27
class Surface:
    def __init__(self,
                 f,
                 f2,
                 pts3d,
                 left_pts,
                 right_pts,
                 oldpts3d,
                 safety_check=False):
        self.f = f
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:, 2])
        self.maximum = np.max(self.pts3d[:, 2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts
        pts2d = []
        ptsz = []
        f3 = open("../calibration_data/camera_matrix.p", "rb")
        self.cmat = pickle.load(f3)
        f3.close()

        for pt in pts3d:
            pts2d.append(pt[:2])
            ptsz.append(np.ceil(pt[2] * 1000000))
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)
        self.f = scipy.interpolate.Rbf(np.matrix(pts3d)[:, 0].ravel(),
                                       np.matrix(pts3d)[:, 1].ravel(),
                                       np.matrix(pts3d)[:, 2].ravel(),
                                       function='linear',
                                       epsilon=.1)
        pts3d = np.array(pts3d).T
        print(pts3d.shape)
        print(pts3d[:2, :].shape, pts3d[2, :].shape)
        self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')

    def leftpixels_to_rframe(self, x, y):
        surf = self.f2
        left_pts = self.left_pts
        right_pts = self.right_pts
        pts3d = self.oldpts3d
        xin = np.array([a[0] for a in left_pts])
        bias = np.ones(len(xin))
        yin = np.array([a[1] for a in left_pts])

        xout = np.array([a[0] for a in pts3d])
        yout = np.array([a[1] for a in pts3d])

        A = np.vstack([xin, bias]).T
        m1, c1 = np.linalg.lstsq(A, xout)[0]

        A = np.vstack([yin, bias]).T
        m2, c2 = np.linalg.lstsq(A, yout)[0]

        xnew = m1 * x + c1
        ynew = m2 * y + c2
        cpoint = np.matrix([(xnew, ynew, self.f2(xnew, ynew))])
        pt = np.ones(4)
        pt[:3] = cpoint
        pred = self.cmat * np.matrix(pt).T
        return pred

    def query(self, x, y):
        temp = self.f.fit(np.array((x, y)))[0][0]
        if not self.safety_check:
            return (x, y, temp)
        if temp < self.minimum - 0.02:
            temp = self.query_knn(x, y)[2]
        elif temp > self.maximum + 0.02:
            temp = self.query_knn(x, y)[2]
        print('asdf', temp)
        return (x, y, temp)

    def query_knn(self, x, y):
        return (x, y, (self.neigh.predict([[x, y]]) / 1000000.0)[0])

    def visualize(self):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pts3d = np.matrix(self.pts3d)
        f = self.f
        a, b = np.ravel(np.min(pts3d, axis=0)), np.ravel(np.max(pts3d, axis=0))
        extra_range = 0.0
        #         xnew = np.arange(a[0] - extra_range,b[0] + extra_range,0.0001)
        #         ynew = np.arange(a[1] - extra_range,b[1] + extra_range,0.0001)
        X, Y = np.mgrid[a[0] + .05:b[0] - .05:100j, a[1]:b[1]:100j]
Example #28
x4 = xax4
y4 = tweetatsec4
pyplot.xlabel('Second')
pyplot.ylabel('Total tweet')


pyplot.scatter(x,y,color='cyan')
pyplot.scatter(x2,y2,color='red')
pyplot.scatter(x3,y3,color='blue')
pyplot.scatter(x4,y4,color='green')

kr = KernelReg(y, x, 'o')
kr2 = KernelReg(y2, x2, 'o')
kr3 = KernelReg(y3, x3, 'o')
kr4 = KernelReg(y4, x4, 'o')
pyplot.plot(x, y, '+')
pyplot.plot(x2,y2,'+')
pyplot.plot(x3,y3,'+')
pyplot.plot(x4,y4,'+')

y_pred, y_mfx = kr.fit(x)  # conditional mean and marginal effects
y2_pred, y2_mfx = kr2.fit(x2)
y3_pred, y3_mfx = kr3.fit(x3)
y4_pred, y4_mfx = kr4.fit(x4)

pyplot.plot(x, y_pred,'cyan',label='twitter')
pyplot.plot(x2,y2_pred,'red',label='facebook')
pyplot.plot(x3,y3_pred,'blue',label='instagram')
pyplot.plot(x4,y4_pred,'green',label='tumblr')
pyplot.legend(loc='upper right')
pyplot.show()
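
A minimal contrast (a sketch on synthetic counts) of the ordered ('o') variable type used above versus the continuous ('c') type:

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

x = np.arange(60)
y = np.cumsum(np.random.default_rng(7).poisson(2, size=60)).astype(float)
fit_o = KernelReg(y, x, 'o').fit(x)[0]  # ordered-variable kernel
fit_c = KernelReg(y, x, 'c').fit(x)[0]  # continuous (Gaussian) kernel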
Example #29
# compute average max wva across songs
mean_max_wva = np.mean(max_wvas)

# computing average event lengths using song durations divided by number of events
durs_run1_new = durs_run1[:, np.newaxis]

event_lengths = durs_run1_new / K_set

unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()

test_x = np.linspace(min(x), max(x), num=100)

y = ROI_WvA.ravel()
KR = KernelReg(y, x, var_type='c')
KR_w_bw = KernelReg(y, x, var_type='c', bw=KR.bw)
smooth_wva = KR_w_bw.fit(unique_event_lengths)[0]
max_wva = np.max(smooth_wva)

# compute roi's preferred event length in seconds
ROI_pref_sec = unique_event_lengths[np.argmax(smooth_wva)]

inputs = [ROI_WvA, smooth_wva, max_wva, mean_max_wva, ROI_pref_sec]
dct = {}

for i, j in zip(dict_names, inputs):
    dct.setdefault(i, []).append(j)

np.save(savedir + 'parcel' + roiNum + '_wva_data', dct)
Example #30
class CausalEffect(object):
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) 
        for some admissable set of control variables, Z.  First we 
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [
                variable_types[var] for var in conditional_density_vars
            ]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {
            variable: (X[variable].min(), X[variable].max())
            for variable in X.columns
        }
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(
                self.effects +
                self.conditional_density_vars, self.conditional_density.bw)
        }
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][
                    0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][
                    1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes +
                self.effects, args)
        })
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(self.continuous_Z + self.discrete_Z +
                            self.causes, args)
        })
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                         int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],
                                                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                         int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges,
                        args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(
                        data_predict=exog_predictors.values
                    )[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.expectation_integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
Example #31
def get_regularized_params(
    model_parameters,
    genes,
    genes_step1,
    genes_log10_gmean_step1,
    genes_log10_gmean,
    cell_attr,
    umi,
    batch_var=None,
    bw_adjust=3,
    gmean_eps=1,
    theta_regularization="od_factor",
    exclude_poisson=False,
    poisson_genes=None,
    method="theta_ml",
):
    model_parameters = model_parameters.copy()

    model_parameters_fit = pd.DataFrame(
        npy.nan, index=genes, columns=model_parameters.columns
    )

    """
    exog_predict = genes_log10_gmean#.values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1#.values
        bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)#.values)
        reg = KernelReg(endog=endog, exog=exog_fit, var_type="c", reg_type="ll", bw=bw)
        model_parameters_fit[column] = reg.fit(exog_predict)[0]

    """
    x_points_df = pd.DataFrame({"gene_log10_gmean": genes_log10_gmean})
    x_points_df["min_gene_log10_gmean_step1"] = genes_log10_gmean_step1.min()

    x_points_df["x_points"] = npy.nanmax(x_points_df, axis=1)
    x_points_df["max_gene_log10_gmean_step1"] = npy.nanmax(genes_log10_gmean_step1)
    x_points_df["x_points"] = x_points_df[
        ["x_points", "max_gene_log10_gmean_step1"]
    ].min(1)
    x_points = x_points_df["x_points"].values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1  # .values
        if method == "glgmp":
            bw = bw_SJr(genes_log10_gmean_step1, bw_adjust=bw_adjust)  # .values)
            params = ksmooth(genes_log10_gmean, genes_log10_gmean_step1, endog, bw[0])
            index = model_parameters_fit.index.values[params["order"] - 1]
            model_parameters_fit.loc[index, column] = params["smoothed"]
        else:
            bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)  # .values)
            reg = KernelReg(endog=endog, exog=exog_fit, var_type="c", reg_type="ll", bw=bw)
            fit = reg.fit(x_points)
            model_parameters_fit[column] = npy.squeeze(fit[0])

    if theta_regularization == "theta":
        theta = npy.power(10, (model_parameters["od_factor"]))
    else:
        theta = npy.power(10, genes_log10_gmean) / (
            npy.power(10, model_parameters_fit["od_factor"]) - 1
        )
    model_parameters_fit["theta"] = theta
    if exclude_poisson:
        # replace theta with np.inf
        if poisson_genes is not None:
            model_parameters_fit.loc[poisson_genes, "theta"] = npy.inf

    return model_parameters_fit
Example #32
xin = pd.read_csv('xin.csv', names=['x'])
xin_list = []
for row in range(xin.shape[0]):
    xin_list.append(float(xin.iloc[row]))

yin=pd.read_csv('yin.csv', names=['y'])
yin_list=[]
for row in range(yin.shape[0]):
    yin_list.append(float(yin.iloc[row]))



df=pd.concat([yin,xin], axis=1)

# Using statsmodels (KernelReg takes endog=y, exog=x)
x = np.array(xin_list)
y = np.array(yin_list)
kde = KernelReg(y, x, var_type='c', reg_type='ll', bw=[3.2])

estimator = kde.fit(x)
estimator = np.reshape(estimator[0], df.shape[0])

plt.scatter(x, y)
plt.scatter(x, estimator, c='r')
plt.show()

# Using SKFDA

df_grid=skfda.FDataGrid(df)

bandwidth = np.arange(0.1, 5, 0.2)

llr = val.SmoothingParameterSearch(
    ks.LocalLinearRegressionSmoother(),
    bandwidth)
Example #33
def selector(case):
    if case == 1:
        results_dir = create_results_directory('./results/paper/dtr_vs_xgb')
        x, y = load_boston(return_X_y=True)
        x = pd.DataFrame(x,
                         columns=[
                             'crime', 'zn', 'indus', 'chas', 'nox', 'rm',
                             'age', 'dis', 'rad', 'tax', 'ptratio', 'blacks',
                             'lstat'
                         ])
        x = x[['rm', 'lstat']]
        df_all = x.copy()
        df_all['price'] = y

        # Plot 3D scatter
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_all['rm'], df_all['lstat'], df_all['price'])
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/scatter.png')
        plt.close()

        dtr = DecisionTreeRegressor(max_depth=2)
        dtr.fit(x, y)
        plot_tree(dtr, impurity=False)
        plt.savefig(f'{results_dir}/dtr_visual.png')
        plt.close()

        x_min = x.min(axis=0)
        x_max = x.max(axis=0)

        rm_linspace = np.linspace(x_min['rm'], x_max['rm'], 100)
        lstat_linspace = np.linspace(x_min['lstat'], x_max['lstat'], 100)

        rm, lstat = np.meshgrid(rm_linspace, lstat_linspace)
        points = np.stack(map(np.ravel, (rm, lstat)), axis=1)
        z = dtr.predict(points).reshape(rm.shape)

        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/dtr_prediction.png')
        plt.close()

        # Linear regression
        lr = LinearRegression().fit(x, y)
        z = lr.predict(points).reshape(rm.shape)
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/lr_prediction.png')
        plt.close()

        # Kernel regression
        kr = KernelReg(exog=x, endog=y, var_type='cc')
        z = kr.fit(points)[0].reshape(rm.shape)
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/kr_prediction.png')
        plt.close()

        # XGB
        hparams = {
            'seed': 42,
            'booster': 'gbtree',
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'subsample': 1,
            'max_depth': 2,
            'colsample_bytree': 0.5,
        }
        dtrain = xgb.DMatrix(x.values, label=y)
        model = xgb.train(hparams,
                          dtrain=dtrain,
                          num_boost_round=100,
                          verbose_eval=False)
        z_xgb = model.predict(xgb.DMatrix(points)).reshape(rm.shape)

        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z_xgb,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/xgb_prediction.png')
Example #34
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 00:18:16 2019

KernelReg practice

@author: mbattley
"""

from statsmodels.nonparametric.kernel_regression import KernelReg
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x) + np.random.random(100) * 0.2
# The third parameter specifies the type of the variable x;
# 'c' stands for continuous
kr = KernelReg(y, x, 'c')
plt.plot(x, y, '+')
y_pred, y_mfx = kr.fit(x)  # conditional mean and marginal effects
plt.plot(x, y_pred)
plt.show()
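
A short follow-on sketch, reusing x, y and the imports from the snippet above, comparing local-constant ('lc') with the default local-linear ('ll') fit:

kr_lc = KernelReg(y, x, 'c', reg_type='lc')
kr_ll = KernelReg(y, x, 'c', reg_type='ll')
plt.plot(x, kr_lc.fit(x)[0], label='lc')
plt.plot(x, kr_ll.fit(x)[0], label='ll')
plt.legend()
plt.show()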