Пример #1
0
 def box_cox_trans_attribute(self, attribute, lamda):
     """Box-Cox transform one attribute of the train and test data sets.

     The column ``attribute`` of ``self._train_data_set`` and then of
     ``self._test_data_set`` is replaced by ``boxcox(column, lamda)``.

     :param attribute: key of the column to transform
     :param lamda: lambda argument handed to ``boxcox``
     """
     train_column = self._train_data_set[attribute]
     self._train_data_set[attribute] = boxcox(train_column, lamda)

     test_column = self._test_data_set[attribute]
     self._test_data_set[attribute] = boxcox(test_column, lamda)
Пример #2
0
def test_nonfinite():
    """Nose-style generator test for non-finite results of ``boxcox``.

    Every yielded tuple has the form ``(assert_equal, actual, expected)``.
    """
    # Negative inputs are expected to give nan.
    negatives = np.array([-1, -0.5])
    yield assert_equal, boxcox(negatives, [0.5, -1.5]), np.array(
        [np.nan, np.nan])

    # A zero input with these lambdas is expected to give -inf.
    yield assert_equal, boxcox(0, [-2.5, 0]), np.array([-np.inf, -np.inf])
Пример #3
0
def test_boxcox_nonfinite():
    """Nose-style generator test for non-finite results of ``boxcox``.

    Every yielded tuple has the form ``(assert_equal, actual, expected)``.
    """
    # x < 0  =>  y = nan
    negatives = np.array([-1, -1, -0.5])
    yield assert_equal, boxcox(negatives, [0.5, 2.0, -1.5]), np.array(
        [np.nan, np.nan, np.nan])

    # x = 0 and lambda <= 0  =>  y = -inf
    yield assert_equal, boxcox(0, [-2.5, 0]), np.array([-np.inf, -np.inf])
Пример #4
0
def test_boxcox_nonfinite():
    """Generator test: yields ``(assert_equal, actual, expected)`` tuples
    covering the nan and -inf results of ``boxcox``.
    """
    nan_expected = np.array([np.nan, np.nan, np.nan])
    neg_inf_expected = np.array([-np.inf, -np.inf])

    # x < 0  =>  y = nan
    actual = boxcox(np.array([-1, -1, -0.5]), [0.5, 2.0, -1.5])
    yield assert_equal, actual, nan_expected

    # x = 0 and lambda <= 0  =>  y = -inf
    actual = boxcox(0, [-2.5, 0])
    yield assert_equal, actual, neg_inf_expected
    def box_cox_trans_attribute(self, attribute, lamda):
        """Box-Cox transform one attribute of the train (and test) data set.

        The train column is always transformed. The test column is only
        transformed when ``self._test_data_set`` is not empty; otherwise a
        notice is printed and the test set is left untouched.

        :param attribute: key of the column to transform
        :param lamda: lambda argument handed to ``boxcox``
        """
        train_column = self._train_data_set[attribute]
        self._train_data_set[attribute] = boxcox(train_column, lamda)

        if self._test_data_set.empty:
            print("no test data set")
            return

        test_column = self._test_data_set[attribute]
        self._test_data_set[attribute] = boxcox(test_column, lamda)
def predict_price(location, area, bedrooms, bathrooms):
    """Return a sentence with the predicted monthly rent for a house.

    The feature vector has one slot per column of the module-level ``X``.
    Slots 0-2 hold ``boxcox(value, 0)`` of the area, the number of bedrooms
    and the number of bathrooms; the slot of the column named ``location``
    is set to 1. The prediction of the module-level ``model`` is mapped back
    with ``inv_boxcox(..., 0)`` and rounded.

    :param location: column name of the location in ``X.columns``; an
        unknown location leaves every location slot at 0 instead of raising
    :param area: area of the house
    :param bedrooms: number of bedrooms
    :param bathrooms: number of bathrooms
    :return: human readable sentence containing the rounded prediction
    """
    # Comparing X.columns with the name gives a boolean mask; np.where
    # returns the matching positions, which is empty for an unknown location.
    matches = np.where(X.columns == location)[0]
    loc_index = matches[0] if len(matches) else -1

    x = np.zeros(len(X.columns))
    x[0] = boxcox(area, 0)
    x[1] = boxcox(bedrooms, 0)
    x[2] = boxcox(bathrooms, 0)
    if loc_index >= 0:
        x[loc_index] = 1

    # The model expects a 2D input, hence the single-row list.
    predicted = inv_boxcox(model.predict([x])[0], 0)
    return "The rental predicted price for this house is " + " ".join(
        (str(round(predicted)), "euros per month."))
Пример #7
0
def test_basic():
    """Nose-style generator test of ``boxcox`` against closed-form values.

    Every yielded tuple has the form
    ``(assert_almost_equal, actual, expected)``.
    """
    x = np.array([1,2,3])
    closed_forms = (
        (0, np.log(x)),            # lambda = 0: logarithm
        (1, x - 1),                # lambda = 1: shift by one
        (2, 0.5*(x**2 - 1)),       # lambda = 2
    )
    for lmbda, expected in closed_forms:
        yield assert_almost_equal, boxcox(x, lmbda), expected

    # Zero input with positive lambdas.
    lam = np.array([0.5, 1, 2])
    yield assert_almost_equal, boxcox(0, lam), -1.0 / lam

    # Negative inputs broadcast against a column of lambdas.
    # NOTE(review): this expects finite values for x < 0; a boxcox that
    # returns nan for negative input will not satisfy it - confirm the
    # targeted library version.
    negatives = np.array([-1.0, -0.5])
    lam_column = np.array([[1],[2]])
    yield assert_almost_equal, boxcox(negatives, lam_column), np.array(
        [[-2, -1.5], [0, -0.375]])
Пример #8
0
def generate(x, y, filename):
    """Generate fixture data and write to file.

    The fixture holds the inputs and `boxcox(x, y)` as JSON lists under the
    keys `x`, `y` and `expected`.

    # Arguments

    * `x`: domain (first argument of `boxcox`)
    * `y`: domain (second argument of `boxcox`)
    * `filename::str`: filename of the output file, relative to `DIR`

    # Examples

    ```python
    python> x = np.linspace(-10.0, 10.0, 2001)
    python> y = np.linspace(-5.0, 5.0, 2001)
    python> generate(x, y, './data.json')
    ```
    """
    expected = boxcox(x, y)
    fixture = {
        'x': x.tolist(),
        'y': y.tolist(),
        'expected': expected.tolist(),
    }

    with open(path.join(DIR, filename), 'w') as out:
        json.dump(fixture, out)
Пример #9
0
    def fit(self, input_data):
        """ Fit an ARIMA model on the Box-Cox transformed time series

        The series is read from ``input_data.features``. When its minimum is
        not positive the series is shifted by ``abs(min) + 1`` (stored in
        ``self.scope``) before lambda is estimated and the transform applied.

        :param input_data: data with features, target and ids to process
        :return: the fitted ARIMA results, also stored in ``self.arima``
        """

        series = np.array(input_data.features)
        # Keep the untouched series and its length
        self.actual_ts_len = len(series)
        self.sts = series

        # Box-Cox needs positive values: shift the series when required
        lowest = np.min(series)
        if not lowest > 0:
            self.scope = abs(lowest) + 1
            series = series + self.scope

        # stats.boxcox is used here only for its estimated lambda
        _, self.lambda_value = stats.boxcox(series)
        transformed_ts = boxcox(series, self.lambda_value)

        # ARIMA order (p, d, q) comes from the model parameters
        order = tuple(int(self.params.get(key)) for key in ('p', 'd', 'q'))
        self.arima = ARIMA(transformed_ts, order=order).fit()

        return self.arima
Пример #10
0
    def transform(self, X):
        """Box-Cox transform the fitted columns of a copy of ``X``.

        Besides returning the transformed copy, the skewness and kurtosis of
        the transformed columns are merged (by index) into ``self.stat_df``.

        :param X: frame holding at least the columns in ``self.transform_cols``
        :return: copy of ``X`` with the selected columns transformed
        :raises NotFittedError: when ``self.transform_cols`` is None
        """
        columns = self.transform_cols
        if columns is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        result = X.copy()
        for name in columns:
            result[name] = boxcox(result[name], self.lmbda)

        # Skewness & kurtosis after the transformation, one row per column
        transformed = result[columns]
        skewness = transformed.skew().to_frame(name='Skewness (Box Cox)')
        kurtosis = transformed.kurt().to_frame(name='Kurtosis (Box Cox)')

        on_index = dict(left_index=True, right_index=True, how='left')
        new_stats = skewness.merge(kurtosis, **on_index)
        self.stat_df = self.stat_df.merge(new_stats, **on_index)

        return result
Пример #11
0
def get_estimated_rent(area, sq_mt, bedrooms, bathrooms):
    """Return the rounded rent predicted for a property.

    The feature vector has one slot per entry of ``__data_columns``. Slots
    0-2 hold ``boxcox(value, 0)`` of the surface, bedrooms and bathrooms;
    the slot of the lower-cased ``area`` name is set to 1 when it is known.
    The prediction of ``__model`` is mapped back with ``inv_boxcox(..., 0)``.

    :param area: name of the area; unknown or non-string values simply
        leave all area slots at 0
    :param sq_mt: surface of the property
    :param bedrooms: number of bedrooms
    :param bathrooms: number of bathrooms
    :return: rounded predicted rent
    """
    try:
        # list.index raises ValueError for an unknown area; AttributeError
        # covers an ``area`` without ``.lower()``.
        loc_index = __data_columns.index(area.lower())
    except (ValueError, AttributeError):
        loc_index = -1

    x = np.zeros(len(__data_columns))
    x[0] = boxcox(sq_mt, 0)
    x[1] = boxcox(bedrooms, 0)
    x[2] = boxcox(bathrooms, 0)
    if loc_index >= 0:
        x[loc_index] = 1

    # The model is called with a 2D input: a list holding the single row x.
    return round(inv_boxcox(__model.predict([x])[0], 0))
Пример #12
0
def test_boxcox_basic():
    """Nose-style generator test of ``boxcox`` against closed-form values.

    Every yielded tuple has the form
    ``(assert_almost_equal, actual, expected)``.
    """
    x = np.array([0.5, 1, 2, 4])

    closed_forms = (
        (0, np.log(x)),          # lambda = 0  =>  y = log(x)
        (1, x - 1),              # lambda = 1  =>  y = x - 1
        (2, 0.5*(x**2 - 1)),     # lambda = 2  =>  y = 0.5*(x**2 - 1)
    )
    for lmbda, expected in closed_forms:
        yield assert_almost_equal, boxcox(x, lmbda), expected

    # x = 0 and lambda > 0  =>  y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    yield assert_almost_equal, boxcox(0, lam), -1.0 / lam
Пример #13
0
def test_boxcox_basic():
    """Generator test: yields ``(assert_almost_equal, actual, expected)``
    tuples comparing ``boxcox`` with its closed-form values.
    """
    samples = np.array([0.5, 1, 2, 4])

    # lambda = 0  =>  y = log(x)
    yield assert_almost_equal, boxcox(samples, 0), np.log(samples)

    # lambda = 1  =>  y = x - 1
    yield assert_almost_equal, boxcox(samples, 1), samples - 1

    # lambda = 2  =>  y = 0.5*(x**2 - 1)
    yield assert_almost_equal, boxcox(samples, 2), 0.5*(samples**2 - 1)

    # x = 0 and lambda > 0  =>  y = -1 / lambda
    positive_lambdas = np.array([0.5, 1, 2])
    yield (assert_almost_equal, boxcox(0, positive_lambdas),
           -1.0 / positive_lambdas)
Пример #14
0
    def target_transform(self, Y):
        """Return the (optionally Box-Cox transformed) target ``Y``.

        When ``self.TRANSFORM`` is falsy, ``Y`` is handed back untouched;
        otherwise ``boxcox(Y, self.LAMBDA)`` is returned. ``LAMBDA`` is an
        empirical parameter.
        """
        if not self.TRANSFORM:
            return Y
        return boxcox(Y, self.LAMBDA)
Пример #15
0
def test_inv_boxcox():
    """Check that each inverse transform undoes its forward transform."""
    round_trips = ((boxcox, inv_boxcox), (boxcox1p, inv_boxcox1p))
    for forward, backward in round_trips:
        x = np.array([0., 1., 2.])
        lam = np.array([0., 1., 2.])
        recovered = backward(forward(x, lam), lam)
        assert_almost_equal(x, recovered)
Пример #16
0
def test_inv_boxcox():
    """Round-trip ``boxcox``/``inv_boxcox`` and ``boxcox1p``/``inv_boxcox1p``."""
    values = np.array([0., 1., 2.])
    lambdas = np.array([0., 1., 2.])

    # Plain Box-Cox pair.
    assert_almost_equal(values, inv_boxcox(boxcox(values, lambdas), lambdas))

    # The "1p" pair.
    assert_almost_equal(values,
                        inv_boxcox1p(boxcox1p(values, lambdas), lambdas))
Пример #17
0
    def _transform(self, Z, X=None):
        """Box-Cox transform a series with the stored ``lambda_``.

        Parameters
        ----------
        Z : pd.Series
            Series to transform; validated with ``check_series``.
        X : pd.DataFrame, optional (default=None)
            Exogenous data; not used by this method.

        Returns
        -------
        Zt : pd.Series
            Transformed series carrying the index of the validated input.
        """
        series = check_series(Z, enforce_univariate=True)
        values = series.to_numpy()
        return pd.Series(boxcox(values, self.lambda_), index=series.index)
Пример #18
0
    def _transform(self, X, y=None):
        """Box-Cox transform ``X`` with the stored ``lambda_``.

        private _transform containing the core logic, called from transform

        Parameters
        ----------
        X : 2D np.ndarray (n x 1)
            Data to be transformed
        y : ignored argument for interface compatibility
            Not used by this method.

        Returns
        -------
        Xt : 2D np.ndarray
            transformed version of X, with the same shape as X
        """
        # The transform is applied to the flattened values and the original
        # shape is restored afterwards.
        flat = X.flatten()
        return boxcox(flat, self.lambda_).reshape(X.shape)
Пример #19
0
    def transform(self, x):
        """
        Apply a per-column Box-Cox transform to ``x``.

        Column ``i`` is shifted by ``self._shift[i]`` and transformed with its
        own lambda ``self._lmd[i]``.

        Parameters
        ----------
        x
            Input data. It is passed through ``self._check_type`` and then
            iterated column by column via ``x.T``.

        Returns
        -------
        numpy.ndarray
            Box-Cox transformed data: flattened when ``self._shape`` has a
            single entry, otherwise reshaped to ``(-1, self._shape[1])``.
        """
        x = self._check_type(x)
        xs = []
        for i, col in enumerate(x.T):
            if np.all(col > 0):
                # Strictly positive column: no shift is needed.
                self._shift[i] = 0.
            else:
                # Subtract the smallest non-NaN value from the stored shift.
                # NOTE(review): this updates self._shift in place on every
                # call - confirm that repeated calls are meant to accumulate.
                self._shift[i] -= col[~np.isnan(col)].min()

            _lmd = self._lmd[i]
            _shift = self._shift[i]
            # NOTE(review): Switch is defined elsewhere; presumably case(v)
            # matches when _lmd equals v and case() is the default - confirm.
            # From here on ``x`` is rebound to the result for this column.
            for case in Switch(_lmd):
                if case(np.inf):
                    # Column is passed through untransformed.
                    x = col
                    break
                if case(np.nan):
                    # Column is replaced by NaN values of the same shape.
                    x = np.full(col.shape, np.nan)
                    break
                if case():
                    x = boxcox(col + _shift, _lmd)
            xs.append(x.reshape(-1, 1))
        # Put the per-column results side by side again.
        xs = np.concatenate(xs, axis=1)

        if len(self._shape) == 1:
            return xs.ravel()
        return xs.reshape(-1, self._shape[1])
Пример #20
0
    def apply(self, ds):
        """Box-Cox transform the configured variables of ``ds``.

        For every ``(name, lambda, boundary location)`` triple the variable
        is negated when the boundary location is ``'right'``, stacked with
        ``util.to_stacked_array``, shifted and transformed, then unstacked
        and written back. The shift used for each variable is appended to
        ``self.shifting_factors``, so the method may only run once.

        :param ds: data set to transform; it is cast to ``float64`` first
        :return: the data set with the transformed variables assigned
        """
        assert not self.shifting_factors, 'This function cannot be called twice.'

        ds = ds.astype('float64')

        settings = zip(self.var_names, self.lmbdas, self.boundary_locations)
        for var, lam, side in settings:
            if side == 'right':
                ds = ds.assign({var: -ds[var]})

            first_dim = ds[var].dims[0]
            flat, layout = util.to_stacked_array(ds[[var]])
            lowest = flat.min(first_dim)  # per feature

            # Shift by |min| + offset, except where the minimum already
            # reaches NUMERICAL_OFFSET: there the shift is zeroed.
            shift = abs(lowest) + NUMERICAL_OFFSET  # per feature
            shift.load()[lowest >= NUMERICAL_OFFSET] = 0.
            self.shifting_factors.append(shift)

            boxed = boxcox(flat + shift, lam)
            restored = util.to_unstacked_dataset(boxed.values, layout)
            ds = ds.assign({var: restored[var]})
        return ds
Пример #21
0
def _boxcox(x, lmbda=None, bounds=None, alpha=None):
    r"""Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive 1-dimensional.  Must not be constant.
        An empty input is returned as is.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.
        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    bounds : optional
        Only used when `lmbda` is None: passed on unchanged to
        ``_boxcox_normmax`` for the search of the optimal lambda.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional, is constant, or contains values that
        are not strictly positive.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant to
    `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the interval
    where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
    function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).
    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    if x.size == 0:
        return x

    if np.all(x == x[0]):
        raise ValueError("Data must not be constant.")

    # np.any reduces in native code instead of iterating element by element.
    if np.any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = _boxcox_normmax(x, bounds=bounds, method="mle")
    # x has already been validated above, so transform it directly.
    y = special.boxcox(x, lmax)

    if alpha is None:
        return y, lmax

    # Find confidence interval
    interval = _boxcox_conf_interval(x, lmax, alpha)
    return y, lmax, interval
Пример #22
0
def getPredict(region, sproduct, scale=1.96):
    """Fit an XGBoost model to one product's price history and plot the fit.

    Prices for ``region``/``sproduct`` are read from ``price.tab``, Box-Cox
    transformed with the module-level ``lmbda`` and split by ``prepareData``.
    The number of boosting rounds is selected by 10-fold cross-validation on
    RMSE. Predictions and targets are mapped back with ``inv_boxcox`` and
    plotted side by side for the train and the test part, each titled with
    its rounded MAPE.

    :param region: region whose rows are selected
    :param sproduct: product whose rows are selected
    :param scale: not used by this function; kept so existing calls keep
        working
    :return: None, the result is shown as a matplotlib figure
    """
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******',
                                 db='price',
                                 charset='utf8mb4',
                                 cursorclass=DictCursor)
    try:
        print('region {} product {}'.format(region, sproduct))
        # The values are bound by the driver instead of being formatted into
        # the statement, so quotes in the arguments cannot alter the query.
        df = pd.read_sql(
            "SELECT ymd, price FROM price.tab "
            "WHERE region = %s and products = %s",
            con=connection,
            params=(region, sproduct))
    finally:
        # The connection is only needed for the query above; closing it here
        # also covers failures and does not keep it open while plotting.
        connection.close()

    df.price = boxcox(df.price, lmbda)

    X_train, X_test, y_train, y_test = prepareData(
        df,
        test_size=12,
        lag_start=12,
        lag_end=24,
    )

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    # model parameters
    params = {'objective': 'reg:squarederror', 'booster': 'gbtree'}
    trees = 1000

    # cross-validate with the rmse metric
    cv = xgb.cv(params,
                dtrain,
                metrics='rmse',
                verbose_eval=False,
                nfold=10,
                show_stdv=False,
                num_boost_round=trees,
                seed=0)

    # Row i of the cv result belongs to i + 1 boosting rounds, so the best
    # round count is the position of the minimum plus one (never zero).
    best_rounds = int(cv['test-rmse-mean'].values.argmin()) + 1

    # train xgboost with the number of trees selected by cross-validation
    bst = xgb.train(params, dtrain, num_boost_round=best_rounds)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))

    # behaviour of the model on the training part of the series
    prediction_train = inv_boxcox(bst.predict(dtrain), lmbda)
    y_train = inv_boxcox(y_train, lmbda)
    ax1.plot(y_train, label="y_train")
    ax1.plot(prediction_train, label="prediction")
    ax1.axis('tight')
    ax1.grid(True)
    ax1.legend()
    ax1.set_title("{} \n MAPE {}".format(
        sproduct,
        round(mean_absolute_percentage_error(y_train, prediction_train))))

    # ... and on the test part
    prediction_test = inv_boxcox(bst.predict(dtest), lmbda)
    y_test = inv_boxcox(y_test, lmbda)
    ax2.plot(list(y_test), label="y_test")
    ax2.plot(prediction_test, label="prediction")
    ax2.axis('tight')
    ax2.grid(True)
    ax2.legend()
    ax2.set_title("{} \n MAPE {}".format(
        sproduct, round(mean_absolute_percentage_error(y_test,
                                                       prediction_test))))

    plt.show()
Пример #23
0
    d = namespace.d
    q = namespace.q
    sp = namespace.sp
    sd = namespace.sd
    sq = namespace.sq
    ss = namespace.ss
    datein = namespace.datein
    dateout = namespace.dateout
    timeforcast = namespace.timeforcast


    df = pd.read_sql(
        'SELECT ymd, price FROM price.tab_price WHERE region = "{}" and product="{}" and ymd > "{}" and ymd < "{}" '
        'ORDER BY ymd  '.format(region, product, datein, dateout),
        con=connection)
    df['price'] = boxcox(df['price'], lmbda)
    df['ymd'] = pd.to_datetime(df['ymd'])
    df = df.set_index('ymd')





    mod = sm.tsa.statespace.SARIMAX(df['price'], order=(p, d, q),  seasonal_order=(sp, sd, sq, ss))
    res = mod.fit(disp=False)

    param = {'p':p, 'd':d, 'q':q, 'sp':sp, 'sd':sd, 'sq':sq, 'ss':ss}
    datend = df['price'].index[-1] + (relativedelta(months=+(nforecast))) if timeforcast == "m" else (dateout + relativedelta(weeks=+(nforecast)))
    predict = res.get_prediction(start =  df['price'].index[0], end= datend)

    p_main = inv_boxcox(predict.predicted_mean, lmbda)
 def box_cox_target(self, lamda):
     """Replace ``self._y_train`` by its Box-Cox transform.

     :param lamda: lambda argument handed to ``boxcox``
     """
     transformed_target = boxcox(self._y_train, lamda)
     self._y_train = transformed_target
Пример #25
0
def test_boxcox_underflow():
    """For a tiny lambda and x close to 1, ``boxcox`` must match ``log``."""
    tiny_lmbda = 1e-306
    near_one = 1 + 1e-15
    assert_allclose(boxcox(near_one, tiny_lmbda), np.log(near_one),
                    rtol=1e-14)
Пример #26
0
def boxcox(x, lmbda=None, bounds=None, alpha=None):
    r"""
    Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive 1-dimensional.  Must not be constant.
        An empty input is returned as is.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.
        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    bounds : optional
        Only used when `lmbda` is None: passed on unchanged to
        ``boxcox_normmax`` for the search of the optimal lambda.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional, is constant, or contains values that
        are not strictly positive.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant to
    `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the interval
    where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
    function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `boxcox` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, _ = stats.boxcox(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Box-Cox transformation')
    >>> plt.show()
    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    if x.size == 0:
        return x

    if np.all(x == x[0]):
        raise ValueError("Data must not be constant.")

    # np.any reduces in native code instead of iterating element by element.
    if np.any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = boxcox_normmax(x, bounds=bounds, method='mle')
    # x has already been validated above, so transform it directly.
    y = special.boxcox(x, lmax)

    if alpha is None:
        return y, lmax

    # Find confidence interval
    interval = _boxcox_conf_interval(x, lmax, alpha)
    return y, lmax, interval
Пример #27
0
def getPredict(namespace):
    """Grid-search SARIMAX orders and return the best one per criterion.

    Prices for the region/product and date window in ``namespace`` are read
    through the module-level ``connection`` and Box-Cox transformed with
    ``namespace.lmbda``. Every combination of orders below the limits
    ``p, d, q, sp, sd, sq, ss`` is fitted; the seasonal period of a
    combination is ``int(param[6] * season)``. Combinations that fail to fit
    are skipped.

    :param namespace: object with the attributes ``region``, ``product``,
        ``lmbda``, ``season``, ``p``, ``d``, ``q``, ``sp``, ``sd``, ``sq``,
        ``ss``, ``datein`` and ``dateout``
    :return: list ``[best by AIC, best by BIC, best by HQIC]`` of parameter
        tuples; an entry is None when no combination could be fitted
    """
    region = namespace.region
    sproduct = namespace.product
    lmbda = namespace.lmbda
    season = namespace.season
    datein = namespace.datein
    dateout = namespace.dateout

    # NOTE(review): the statement is assembled by string formatting; if these
    # values can come from users, bind them as query parameters instead (the
    # placeholder style depends on the driver behind ``connection``).
    df = pd.read_sql(
        'SELECT ymd, price FROM price.tab_price WHERE region = "{}" and product="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, sproduct, datein, dateout),
        con=connection)

    train = boxcox(df.price.values, lmbda)

    n_p = range(0, namespace.p)
    n_d = range(0, namespace.d)
    n_q = range(0, namespace.q)
    n_sp = range(0, namespace.sp)
    n_sd = range(0, namespace.sd)
    n_sq = range(0, namespace.sq)
    n_ss = range(0, namespace.ss)
    print(n_p)

    # Best score and parameters per information criterion. The parameters
    # start as None so that the return below is defined even when the grid
    # is empty or every fit fails.
    criteria = ('aic', 'bic', 'hqic')
    best_score = {name: float("inf") for name in criteria}
    best_param = {name: None for name in criteria}

    for param in product(n_p, n_d, n_q, n_sp, n_sd, n_sq, n_ss):
        try:
            model = sm.tsa.statespace.SARIMAX(
                train,
                order=(param[0], param[1], param[2]),
                seasonal_order=(param[3], param[4], param[5],
                                int(param[6] * season)),
                enforce_invertibility=False)
            res = model.fit(disp=-1)
        except Exception:
            # Invalid parameter combinations are skipped on purpose, but
            # KeyboardInterrupt/SystemExit still stop the search.
            continue

        for name in criteria:
            score = getattr(res, name)
            if score < best_score[name]:
                best_score[name] = score
                best_param[name] = param

    return [best_param[name] for name in criteria]
Пример #28
0
products = [
    'Молоко сырое крупного рогатого скота', 'Пшеница мягкая 3 класса',
    'Пшеница мягкая 5 класса', 'Ячмень', 'Гречиха', 'Семена подсолнечника',
    'Свекла столовая', 'Птица сельскохозяйственная живая', 'Олени северные',
    'Картофель'
]

for sproduct in products:

    df = pd.read_sql(
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}"'
        .format(region, sproduct),
        con=connection)
    dta = df.price.values[start:]
    #dta = dta.reindex()
    xt = boxcox(dta, lmbda)
    train = xt[:len(xt) - nforecast]

    df = pd.read_sql(
        'SELECT * FROM price.model WHERE region = "{}" and product="{}"'.
        format(region, sproduct),
        con=connection)
    # Graph
    fig, ax = plt.subplots(figsize=(12, 10))
    ax.xaxis.grid()
    ax.yaxis.grid()
    ax.plot(inv_boxcox(xt, lmbda), 'k.')

    for param in df.iterrows():

        mod = sm.tsa.statespace.SARIMAX(
Пример #29
0
def test_boxcox_underflow():
    """``boxcox`` with a lambda near the underflow limit must equal ``log``."""
    value, tiny = 1 + 1e-15, 1e-306
    transformed = boxcox(value, tiny)
    assert_allclose(transformed, np.log(value), rtol=1e-14)
Пример #30
0
    else:
        return (np.exp(np.log(ld * y + 1) / ld))


y = train.血糖.values
print(y)
# We use the numpy function log1p which  applies log(1+x) to all elements of the column
# train[target_item] = np.add(10**15 * train[target_item], 0)
# train[target_item] = np.log1p(train[target_item])
# train[target_item] = np.sin(0.177*train[target_item]-0.08)  # sin is not one-to-one?
# train[target_item] = np.arctan(train[target_item] - 3.17)
# train[target_item] = np.log1p(train[target_item])
# train[target_item] = np.arctan(0.6 * train[target_item] - 1.6)   # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# train[target_item] = np.log1p(train[target_item]**-2)
# train[target_item] = np.arctan(0.4 * train[target_item] - .8)
train["血糖"] = boxcox(train["血糖"], 0.15)
"""
for i in range(len(train[target_item])):
    pass
"""
'''
i = 0
for item in test_item:
    fig, ax = plt.subplots()
    ax.scatter(x = train[item], y = train[target_item])
    plt.ylabel(target_item, fontsize=13)
    plt.xlabel(item + f' {i}', fontsize=13)
    # plt.show()
    name = item
    if item[0] == '*':
        name = name[1:]
Пример #31
0
def getTrainData(region, product, datein, dateout, lag, lagVal, lagS, AvPr,
                 AvPrVal, test_size):
    """Build the de-trended training feature table for one region/product.

    Prices in the date window are read through the module-level
    ``connection``, the last ``test_size`` rows are cut off, and the rest is
    joined with the USD exchange rates on the date. The Box-Cox transformed
    price (module-level ``lmbda``) is de-trended with a degree-2 polynomial
    regression on the ordinal date. Lagged prices, lagged exchange rates,
    per-month mean/max/min and rolling mean/median features are added, the
    helper columns are dropped and rows with missing values are removed.

    :param lag: iterable of shifts for the de-trended price
    :param lagVal: iterable of shifts for the exchange rate
    :param lagS: iterable of further shifts for the de-trended price;
        zeros are ignored
    :param AvPr: rolling window for the de-trended price
    :param AvPrVal: rolling window for the exchange rate; 0 disables it
    :param test_size: number of trailing rows left out of the training data
    :return: ``(data, trend)`` with the feature table and the fitted trend
        pipeline, or ``(empty frame, 0)`` when the query returns no rows
    """
    price_query = (
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, product, datein, dateout))
    df_train = pd.read_sql(price_query, con=connection)

    if len(df_train) == 0:
        return (df_train, 0)

    # Keep everything but the last `test_size` rows.
    df_train = df_train[:int(len(df_train) - test_size)]

    rate_query = (
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"))
    df_valute = pd.read_sql(rate_query, con=connection)

    data = pd.merge(df_train,
                    df_valute,
                    left_on='ymd',
                    right_on='dateCalendar')

    data["price_boxcox"] = boxcox(data["price"], lmbda)

    # Quadratic trend of the transformed price over the ordinal date.
    trend = Pipeline([('poly', PolynomialFeatures(degree=2)),
                      ('linear', LinearRegression(fit_intercept=False))])
    ordinals = data['ymd'].map(datetime.datetime.toordinal).values.reshape(
        -1, 1)
    trend.fit(ordinals, data.price_boxcox.values.reshape(-1, 1))

    data["Trend"] = trend.predict(ordinals)
    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]
    # Rows with a zero price get the fixed placeholder value 0.01.
    data.loc[data['price'] == 0, "PriceWithOutTrend"] = 0.01

    for shift in lag:
        data[f"PriceWithOutTrend{shift}"] = data.PriceWithOutTrend.shift(shift)
    for shift in lagS:
        if shift == 0:
            continue
        data[f"PriceWithOutTrend{shift}"] = data.PriceWithOutTrend.shift(shift)
    for shift in lagVal:
        data[f"lagValute_{shift}"] = data.ValueVal.shift(shift)

    # mean, maximum and minimum per calendar month
    data.ymd = pd.to_datetime(data["ymd"])
    data["month"] = data.ymd.dt.month

    by_month = data.groupby('month')['PriceWithOutTrend']
    monthly = {stat: by_month.aggregate(stat)
               for stat in ('mean', 'max', 'min')}
    for stat, per_month in monthly.items():
        data.loc[:, stat + 'Price'] = [
            per_month[month] for month in data['month']
        ]

    # Rolling mean/median over the first value of each month.
    month_starts = data.set_index('ymd').resample('MS', label='right').first()
    price_roll = month_starts['PriceWithOutTrend'].shift().rolling(
        min_periods=1, window=AvPr).agg(['mean', 'median']).reset_index()
    data = pd.merge(data, price_roll, on=['ymd'], how='left')
    if AvPrVal != 0:
        rate_roll = month_starts['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean',
                                                'median']).reset_index()
        data = pd.merge(data, rate_roll, on=['ymd'], how='left')

    # Helper columns are not features; "Trend" is kept.
    data = data.drop(
        ["price", "price_boxcox", "ymd", "month", "dateCalendar"], axis=1)

    return data.dropna().reset_index(drop=True), trend
Пример #32
0
 def transform(self, Z, X=None):
     """Return the Box-Cox transform of the series ``Z``.

     The instance must be fitted (``check_is_fitted`` is called first).
     ``X`` is accepted but not used. The result keeps the index of the
     validated series.
     """
     self.check_is_fitted()
     series = check_series(Z, enforce_univariate=True)
     values = series.to_numpy()
     return pd.Series(boxcox(values, self.lambda_), index=series.index)
Пример #33
0
 def fit_transform(self, y):
     """Return the Box-Cox transform of ``y`` for the fixed lambda 0.5."""
     fixed_lambda = 0.5
     return boxcox(y, fixed_lambda)
Пример #34
0
def getTestData(datal,
                train=False,
                lag=0,
                lagVal=0,
                lagS=0,
                AvPr=0,
                AvPrVal=0,
                winWeather=0,
                AvMonth=0,
                tr=1,
                twinter=100,
                tsummer=100,
                tsping=100,
                tautomn=100,
                rwinter=100,
                rsummer=100,
                rsping=100,
                rautomn=100,
                dateout=None,
                nforecast=1):
    """Build a feature frame from a price/currency frame.

    Works on a copy of ``datal``. The code reads the columns ``ymd``,
    ``price``, ``ValueVal`` and ``dateCalendar`` from it.

    Parameters:
        datal: input frame; ``ymd`` must support the ``.dt`` accessor.
        train: when true, a polynomial trend of degree ``tr`` is fitted and
            stored in the module-level ``trend``; otherwise the existing
            module-level ``trend`` is reused.
        lag: number of lagged copies of the detrended price (shifts 1..lag);
            also drives the lagged rolling columns and weather shifts.
        lagVal: number of lagged copies of ``ValueVal`` (shifts 1..lagVal).
            When 0 the ``ValueVal`` column is dropped from the result.
        lagS: a single extra shift of the detrended price (skipped when 0).
        AvPr: rolling window for monthly price statistics (skipped when 0).
        AvPrVal: rolling window for monthly ``ValueVal`` statistics
            (skipped when 0).
        winWeather: any non-zero value enables the weather features.
        AvMonth: when equal to 1, per-calendar-month mean/max/min columns
            are added.
        tr: degree of the fitted trend polynomial.
        twinter, tsummer, tsping, tautomn: forwarded to ``newT`` for ``T``.
        rwinter, rsummer, rsping, rautomn: forwarded to ``newT`` for ``R``.
        dateout: upper bound for the weather query and base date for the
            generated forecast days; it is used in date arithmetic, so it
            must be set when ``winWeather`` is non-zero.
        nforecast: number of months (after the first) covered by the
            generated weather rows.

    Returns:
        The feature frame with ``price``, ``price_boxcox``, ``ymd`` and
        ``dateCalendar`` removed and the index reset. Rows containing NaN
        are not dropped.
    """

    # The fitted trend is shared through a module-level variable so that a
    # later call with train=False can reuse it.
    global trend
    data = datal.copy()

    data["month"] = data.ymd.dt.month
    data["week"] = data.ymd.dt.weekofyear
    #data["year"] = data.ymd.dt.year
    #data['yearofchange'] = (data["ymd"] > datetime.datetime(2015,1,1))

    # Zero prices are treated as missing: replaced by NaN and interpolated.
    data = data.replace({"price": {0: np.nan}})
    data["price"].interpolate(inplace=True)

    # Back-fill whatever is still missing (in any column).
    data = data.fillna(method='bfill')

    # lmbda is a module-level Box-Cox parameter defined outside this block.
    data["price_boxcox"] = boxcox(data["price"], lmbda)

    if train:
        # Dates are mapped to proleptic Gregorian ordinals for the fit.
        x = data['ymd'].map(datetime.datetime.toordinal)

        trend = np.poly1d(np.polyfit(x.values, data.price_boxcox.values, tr))

    data["Trend"] = trend(data['ymd'].map(datetime.datetime.toordinal).values)

    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]

    #data["diff"] = data["PriceWithOutTrend"] - data["PriceWithOutTrend"].shift(1)
    #data["diff2"] = data["PriceWithOutTrend"] - data["PriceWithOutTrend"].shift(2)
    #data.loc[0, "diff"] = 0
    #data.loc[[0,1], "diff2"] = 0
    #data.loc[data['price'] == 0, "PriceWithOutTrend"]= 0.1 # if the price is zero, put the mean

    # Lagged detrended price: one column per shift 1..lag.
    for i in range(1, lag + 1):
        data["PriceWithOutTrend{}".format(i)] = data.PriceWithOutTrend.shift(i)
        #data["PriceWithOutTrend{}".format(i)].fillna(0, inplace=True)
    # Single extra shift given by lagS (the one-element list keeps the loop form).
    for i in [lagS]:
        if i != 0:
            data["PriceWithOutTrendS{}".format(
                i)] = data.PriceWithOutTrend.shift(i)
            #data["PriceWithOutTrendS{}".format(i)].fillna(0, inplace=True)
    # Lagged ValueVal: one column per shift 1..lagVal.
    for i in range(1, lagVal + 1):
        data["lagValute{}".format(i)] = data.ValueVal.shift(i)
        #data["lagValute{}".format(i)].fillna(0, inplace=True)

    # means, maximum, minimum over a quarter, half a year, a year

    if AvMonth == 1:
        # Per-calendar-month statistics, computed without the last row.
        meanPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'mean')
        maxPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'max')
        minPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'min')

        data.loc[:,
                 'meanPrice'] = [meanPrice[month] for month in data['month']]
        data.loc[:, 'maxPrice'] = [maxPrice[month] for month in data['month']]
        data.loc[:, 'minPrice'] = [minPrice[month] for month in data['month']]

    if AvPr != 0:
        # Month-start resample (first value per bin), shifted by one step so
        # the rolling window does not include the current row's own value.
        df = data.set_index('ymd').resample('MS', label='right').first()
        df1 = df['PriceWithOutTrend'].shift().rolling(
            min_periods=1, window=AvPr).agg(['mean', 'max',
                                             'min']).reset_index()
        df1 = df1.add_suffix('_AvPr')
        data = pd.merge(data,
                        df1,
                        left_on=['ymd'],
                        right_on=['ymd_AvPr'],
                        how='left')
        #data["mean_AvPr"].fillna(0, inplace=True)
        data.drop(["ymd_AvPr"], axis=1, inplace=True)

        # Lagged copies of the rolling statistics; missing values become 0.
        for i in range(1, lag + 1):
            data["mean_AvPr{}".format(i)] = data.mean_AvPr.shift(i)
            data["mean_AvPr{}".format(i)].fillna(0, inplace=True)
            data["max_AvPr{}".format(i)] = data.max_AvPr.shift(i)
            data["max_AvPr{}".format(i)].fillna(0, inplace=True)
            data["min_AvPr{}".format(i)] = data.min_AvPr.shift(i)
            data["min_AvPr{}".format(i)].fillna(0, inplace=True)

    if AvPrVal != 0:
        # Same construction as the AvPr branch, applied to ValueVal.
        df = data.set_index('ymd').resample('MS', label='right').first()
        df2 = df['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean', 'max',
                                                'min']).reset_index()
        df2 = df2.add_suffix('_AvPrVal')
        data = pd.merge(data,
                        df2,
                        left_on=['ymd'],
                        right_on=['ymd_AvPrVal'],
                        how='left')
        #data["mean_AvPrVal"].fillna(0, inplace=True)
        data.drop(["ymd_AvPrVal"], axis=1, inplace=True)
        for i in range(1, lagVal + 1):
            data["mean_AvPrVal{}".format(i)] = data.mean_AvPrVal.shift(i)
            data["mean_AvPrVal{}".format(i)].fillna(0, inplace=True)
            data["max_AvPrVal{}".format(i)] = data.max_AvPrVal.shift(i)
            data["max_AvPrVal{}".format(i)].fillna(0, inplace=True)
            data["min_AvPrVal{}".format(i)] = data.min_AvPrVal.shift(i)
            data["min_AvPrVal{}".format(i)].fillna(0, inplace=True)

    if winWeather != 0:
        # Weather observations up to dateout, read through the module-level
        # `connection`.
        dt_weather = pd.read_sql(
            'SELECT UTC as ymd, T, R FROM price.weather WHERE id = "{}" and UTC <= "{}"'
            .format("/weather.php?id=30710", dateout),
            con=connection)
        # Daily means of T and R.
        df = dt_weather.set_index('ymd').resample('D', label='right').agg({
            'T':
            'mean',
            'R':
            'mean'
        }).reset_index()
        date_start_pred = dateout + relativedelta(months=+1)
        date_stop_pred = dateout + relativedelta(months=+(nforecast + 1))

        #df['ymd'] = pd.to_datetime(df["ymd"])
        # Day-of-year means are handed to newT below.
        df['dayofyear'] = df.ymd.dt.dayofyear
        meanT = df.groupby('dayofyear')['T'].mean()
        meanR = df.groupby('dayofyear')['R'].mean()

        # Append one generated row per day of the forecast span.
        # NOTE(review): date_range and newT are defined outside this block;
        # whether date_stop_pred is inclusive depends on date_range - confirm.
        # 'dayofyear' is a placeholder here; the column is dropped below.
        for d in date_range(date_start_pred, date_stop_pred,
                            datetime.timedelta(days=1)):
            df = pd.concat([
                df,
                pd.DataFrame.from_dict({
                    'ymd': [datetime.datetime(d.year, d.month, d.day)],
                    'T': [newT(d, meanT, twinter, tsummer, tsping, tautomn)],
                    'R': [newT(d, meanR, rwinter, rsummer, rsping, rautomn)],
                    'dayofyear': [1]
                })
            ],
                           ignore_index=True)

        #df['ymd'] = df.index
        # f_index_month (defined elsewhere) yields a per-row value whose
        # running sum is used as the group id for the cumulative sums.
        df['indexmonth'] = df.apply(f_index_month, axis=1)
        df['indexmonth'] = df['indexmonth'].cumsum()
        df['cumT'] = df.groupby('indexmonth')['T'].cumsum()
        df['cumR'] = df.groupby('indexmonth')['R'].cumsum()

        # Cumulative values are zeroed for groups with an even id.
        df.loc[df['indexmonth'] % 2 == 0, 'cumT'] = 0
        df.loc[df['indexmonth'] % 2 == 0, 'cumR'] = 0

        # Remove helper columns so only ymd and the cumulative features are merged.
        #df = df.set_index('ymd')
        df.drop(
            'dayofyear',
            inplace=True,
            axis=1,
        )
        df.drop(
            'T',
            inplace=True,
            axis=1,
        )
        df.drop(
            'R',
            inplace=True,
            axis=1,
        )
        #df.drop('ymd2', inplace = True, axis=1,)
        df.drop(
            'indexmonth',
            inplace=True,
            axis=1,
        )
        # Shifted cumulative features at row offsets 1, 8, 15, ... (< 7 * lag).
        for i in range(1, int(7 * lag), 7):
            df["cumT{}".format(i)] = df.cumT.shift(i)
            df["cumR{}".format(i)] = df.cumR.shift(i)
        data = pd.merge(data,
                        df,
                        left_on='ymd',
                        right_on='ymd',
                        how='left',
                        suffixes=('data', 'dt_weather'))
        #data = pd.merge(data, df, on=['ymd'], how='left', suffixes=('data', 'df2'))

    # Drop the raw inputs that must not reach the model as features.
    if lagVal == 0:
        data.drop(["ValueVal"], axis=1, inplace=True)
    data.drop(["price"], axis=1, inplace=True)
    data.drop(["price_boxcox"], axis=1, inplace=True)
    data.drop(["ymd"], axis=1, inplace=True)
    #data.drop(["month"], axis=1, inplace=True)
    data.drop(["dateCalendar"], axis=1, inplace=True)
    #data.drop(["Trend"], axis=1, inplace=True)

    #data = data.dropna()
    data = data.reset_index(drop=True)

    return data
Пример #35
0
def getTestData(region, product, datein, dateout, lag, lagVal, lagS, test_size,
                trend, AvPr, AvPrVal, y_past):
    """Build the feature frame for one forecast step plus the true price.

    Prices for ``region``/``product`` strictly between ``datein`` and
    ``dateout`` are read through the module-level ``connection``. The last
    ``test_size`` rows are cut off, the already predicted values ``y_past``
    are appended on consecutive months after the last kept date, and a
    placeholder row with price 0 is appended for the month to be predicted.

    Parameters:
        region, product, datein, dateout: values substituted into the price
            query.
        lag: iterable of shifts for the detrended price.
        lagVal: iterable of shifts for ``ValueVal``.
        lagS: iterable of additional shifts for the detrended price
            (zeros are skipped).
        test_size: number of trailing rows held out from the query result.
        trend: fitted model exposing ``predict`` on a column of date ordinals.
        AvPr: rolling window for the monthly detrended-price statistics.
        AvPrVal: rolling window for the monthly ``ValueVal`` statistics
            (skipped when 0).
        y_past: sequence of previously predicted prices.

    Returns:
        ``(data, ytrue)``: the feature frame (rows with NaN dropped, index
        reset) and the actual price at position
        ``test_index + len(y_past)`` of the query result.
    """
    # NOTE(review): the query text is assembled with str.format; if these
    # values can come from untrusted input, switch to the driver's parameter
    # binding (the ``params`` argument of ``pd.read_sql``).
    df_train = pd.read_sql(
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, product, datein, dateout),
        con=connection)

    test_index = int(len(df_train) - test_size)
    ytrue = df_train.iloc[test_index + len(y_past), :].price
    df_train = df_train[:test_index]

    past_ymd = df_train.iloc[-1, :].ymd

    # Each past prediction goes on its own month (+1, +2, ...). The original
    # loop ignored the enumerate index and stamped every row with +1 month,
    # producing duplicate dates.
    new_rows = [{
        'price': past_value,
        'ymd': past_ymd + relativedelta(months=+(step + 1))
    } for step, past_value in enumerate(y_past)]
    # Placeholder row (price 0) for the month that is being predicted.
    new_rows.append({
        'price': 0,
        'ymd': past_ymd + relativedelta(months=+(1 + len(y_past)))
    })
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    df_train = pd.concat([df_train, pd.DataFrame(new_rows)],
                         ignore_index=True)

    df_valute = pd.read_sql(
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"),
        con=connection)

    data = pd.merge(df_train,
                    df_valute,
                    left_on='ymd',
                    right_on='dateCalendar')

    # lmbda is a module-level Box-Cox parameter defined outside this block.
    data["price_boxcox"] = boxcox(data["price"], lmbda)

    data["Trend"] = trend.predict(data['ymd'].map(
        datetime.datetime.toordinal).values.reshape(-1, 1))
    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]
    # Rows with price 0 (the placeholder) get a fixed detrended value of 0.1.
    # The original note said "put the mean", but the code writes the constant.
    data.loc[
        data['price'] == 0,
        "PriceWithOutTrend"] = 0.1

    for i in lag:
        data["PriceWithOutTrend{}".format(i)] = data.PriceWithOutTrend.shift(i)
    for i in lagS:
        if i != 0:
            data["PriceWithOutTrend{}".format(
                i)] = data.PriceWithOutTrend.shift(i)
    for i in lagVal:
        data["lagValute_{}".format(i)] = data.ValueVal.shift(i)

    # means, maximum, minimum over a quarter, half a year, a year
    data.ymd = pd.to_datetime(data["ymd"])

    data["month"] = data.ymd.dt.month

    # Per-calendar-month statistics, computed without the placeholder row.
    history = data[:-1].groupby('month')['PriceWithOutTrend']
    meanPrice = history.aggregate('mean')
    maxPrice = history.aggregate('max')
    minPrice = history.aggregate('min')

    data.loc[:, 'meanPrice'] = [meanPrice[month] for month in data['month']]
    data.loc[:, 'maxPrice'] = [maxPrice[month] for month in data['month']]
    data.loc[:, 'minPrice'] = [minPrice[month] for month in data['month']]

    # Month-start resample, shifted by one step so the rolling window does
    # not include the current row's own value.
    df = data.set_index('ymd').resample('MS', label='right').first()
    df1 = df['PriceWithOutTrend'].shift().rolling(
        min_periods=1, window=AvPr).agg(['mean', 'median']).reset_index()
    data = pd.merge(data, df1, on=['ymd'], how='left')
    if AvPrVal != 0:
        df2 = df['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean',
                                                'median']).reset_index()
        data = pd.merge(data, df2, on=['ymd'], how='left')

    # Drop the raw inputs that must not reach the model as features.
    data.drop(["price"], axis=1, inplace=True)
    data.drop(["price_boxcox"], axis=1, inplace=True)
    data.drop(["ymd"], axis=1, inplace=True)
    data.drop(["month"], axis=1, inplace=True)
    data.drop(["dateCalendar"], axis=1, inplace=True)
    #data.drop(["Trend"], axis=1, inplace=True)

    data = data.dropna()
    data = data.reset_index(drop=True)

    return data, ytrue
Пример #36
0
def getPredictArima(region, sproduct, start=5):
    """Plot SARIMAX forecasts (with and without exogenous input) against facts.

    Prices for ``region``/``sproduct`` and the USD series are read through
    the module-level ``connection``; the first ``start`` observations are
    skipped. The model orders come from the first ``price.model`` row stored
    for the pair under the "aic" criterion. The last ``nforecast``
    (module-level) points are held out, predicted on the Box-Cox scale
    (module-level ``lmbda``), transformed back and plotted together with the
    actual values. A second family of models uses the currency series,
    offset by 0..17 positions, as exogenous input. Nothing is returned; the
    figure is shown.
    """
    price_sql = (
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}"'
        .format(region, sproduct))
    df_train = pd.read_sql(price_sql, con=connection)

    valute_sql = (
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"))
    df_valute = pd.read_sql(valute_sql, con=connection)

    data = pd.merge(df_train,
                    df_valute,
                    left_on='ymd',
                    right_on='dateCalendar')
    ex = data.ValueVal.values[start:]

    dta = df_train.price.values[start:]
    # Evaluated as in the original; the result is not used further down.
    dttime = df_train.ymd.values[start:]
    xt = boxcox(dta, lmbda)
    fit_stop = len(xt) - nforecast
    train = xt[:fit_stop]

    model_sql = (
        'SELECT * FROM price.model WHERE region = "{}" and product="{}" and criterion = "aic"'
        .format(region, sproduct))
    df = pd.read_sql(model_sql, con=connection)
    # Graph
    param = df.iloc[0]
    order = (param.p, param.d, param.q)
    seasonal_order = (param.sp, param.sd, param.sq, param.ss)

    # Plain SARIMAX on the training part.
    mod = sm.tsa.statespace.SARIMAX(
        train,
        order=order,
        seasonal_order=seasonal_order,
    )
    res = mod.fit(disp=False)

    predict = res.get_prediction(end=mod.nobs + nforecast - 1)

    p_main = inv_boxcox(predict.predicted_mean, lmbda)
    fact_tail = dta[-nforecast:]
    plt.figure(figsize=(10, 8))
    plt.plot(
        fact_tail,
        'bs',
        label="fact",
    )
    main_tail = p_main[-nforecast:]
    main_mape = round(mean_absolute_percentage_error(fact_tail, main_tail))
    plt.plot(main_tail, 'r--', label="arima. mape {}".format(main_mape))

    # SARIMAX with the currency series as exogenous input, one model per offset.
    LAG = 18
    train = xt[LAG:fit_stop]

    for shift in range(LAG):

        exog = ex[shift:fit_stop + shift - LAG]

        mode = sm.tsa.statespace.SARIMAX(train,
                                         order=order,
                                         seasonal_order=seasonal_order,
                                         exog=exog)
        rese = mode.fit(disp=False)

        future_rows = data.iloc[-nforecast - shift - 1:-shift - 1]
        exog_forecast = future_rows['ValueVal'].values[..., np.newaxis]
        predicte = rese.get_prediction(end=mode.nobs + nforecast - 1,
                                       exog=exog_forecast)

        p_maine = inv_boxcox(predicte.predicted_mean, lmbda)

        exog_tail = p_maine[-nforecast:]
        exog_mape = round(mean_absolute_percentage_error(fact_tail, exog_tail))
        plt.plot(exog_tail,
                 label="arimaX. lag {} mape {}".format(LAG - shift, exog_mape))

    plt.axis('tight')
    plt.grid(True)
    plt.legend()
    plt.title("{} ".format(sproduct))
    plt.show()
def regress(x,y,y_label):
    """Fit the module-level ``regr`` on (x, y), print its score and plot the fit.

    The scatter plot shows the actual ``y`` on the horizontal axis (labelled
    ``y_label``) against the values predicted for ``x``.
    """
    regr.fit(x,y)
    # Single-argument call form: valid on Python 3 and prints the same text
    # on Python 2, where the original print statement came from.
    print("R squared: " + str(regr.score(x,y)))
    # Plot outputs
    plt.figure()
    plt.scatter(y, regr.predict(x), color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()

# Run the plain regression once per target, using the same features x.
regress(x,latitude,'latitude')

regress(x,longitude,'longitude')

def boxcox(x,y,y_label):
    """Regress a Box-Cox transformed target on ``x`` and plot the back-transformed fit.

    ``y`` is shifted by ``abs(min(y)) + 1`` before the transform so that all
    values passed to ``stats.boxcox`` are positive; the same shift is removed
    again after inverting the transform. The module-level ``regr`` is fitted
    on the transformed target. The printed figure, labelled "R squared", is
    the ratio ``var(y_predict) / var(y)``.
    """
    # Computed once so the forward and inverse shifts cannot drift apart.
    shift = abs(min(y)) + 1
    box_cox, maxlog = stats.boxcox(y + shift)
    regr.fit(x,box_cox)
    box_cox_predict = regr.predict(x)
    y_predict = inv_boxcox(box_cox_predict,maxlog) - shift
    # Single-argument call form: valid on Python 3 and prints the same text
    # on Python 2, where the original print statement came from.
    print("R squared: " + str(np.var(y_predict)/np.var(y)))
    # Plot outputs
    plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()

# Run the Box-Cox variant of the regression once per target.
boxcox(x,latitude,'latitude')


boxcox(x,longitude,'longitude')