Example #1
def lasso_regression_ic_jit(y: np.ndarray, x: np.ndarray, sample_length: int, frequency: int, criterion: str,
                            vol_target=False, vol_period=20):
    n, p = x.shape
    n_windows = (n - sample_length) // frequency + 1
    weights = np.zeros((n_windows, p))
    lam = np.zeros(n_windows)
    for i in np.arange(n_windows):
        start = i*frequency
        end = i*frequency + sample_length - 1
        x_k = x[start:end+1]
        y_k = y[start:end+1]
        std_x = np.std(x_k, axis=0)  # per-feature scale
        std_y = np.std(y_k)
        x_k = np.nan_to_num(x_k / std_x)
        y_k = np.nan_to_num(y_k / std_y)
        las = LassoLarsIC(criterion=criterion, fit_intercept=False, normalize=False)
        las.fit(x_k, np.ravel(y_k))
        weight = las.coef_ * std_y / std_x
        leverage = 1
        if vol_target:
            port_vol = np.std(y_k[-vol_period:])
            repli_vol = np.std(np.dot(x_k[-vol_period:, :], weight))
            if repli_vol > 0: leverage = port_vol / repli_vol
        weights[i] = leverage * weight
        lam[i] = 2. * las.alpha_ * (std_y ** 2)
    return weights, lam
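A minimal usage sketch on synthetic data (the shapes, frequency, and planted weights below are illustrative assumptions, not part of the original):

# Hypothetical usage of lasso_regression_ic_jit on synthetic data.
import numpy as np
rng = np.random.RandomState(0)
x = rng.randn(500, 8)  # 500 observations of 8 candidate assets
y = (x[:, :3] @ np.array([0.5, -0.2, 0.1])).reshape(-1, 1) + 0.01 * rng.randn(500, 1)
weights, lam = lasso_regression_ic_jit(y, x, sample_length=250, frequency=20, criterion='bic')
print(weights.shape, lam.shape)  # one row of weights per rebalancing date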
Example #2
def model_selection(data, features_list, target_col):
    '''
    Instantiates lasso regression models with different feature sets and
    returns the BIC and AIC scores for each model.
    INPUT: data - pandas dataframe
           features_list - a list of lists of strings that match column names
           target_col - string that matches the target column name
    OUTPUT: pandas dataframe containing the features used with their BIC/AIC scores
    '''

    # instantiate the models for bic and aic scoring
    model_bic = LassoLarsIC(criterion='bic')
    model_aic = LassoLarsIC(criterion='aic')
    model_lst = [model_bic, model_aic]

    # iterate the features list to populate the scores list
    score_lst = []
    for features in features_list:
        score = score_model(data, model_lst, features, target_col)
        score_lst.append(score)

    # turn scores list into a pandas df
    score_df = pd.DataFrame(score_lst, columns=['Features', 'BIC', 'AIC'])
    # count features before joining the lists into display strings
    score_df['num_features'] = [len(x) for x in score_df['Features']]
    score_df['Features'] = score_df['Features'].apply(lambda x: ' + '.join(x))

    return score_df
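`score_model` is not defined in this example. A minimal sketch of what it might look like, inferred from how it is called above (the signature and return layout are assumptions):

def score_model(data, model_lst, features, target_col):
    # Hypothetical helper: fit the BIC and AIC models on one feature subset
    # and return [features, bic, aic] as consumed by model_selection above.
    X = data[features]
    y = data[target_col]
    row = [features]
    for model in model_lst:
        model.fit(X, y)
        # criterion_ holds the information-criterion values along the LARS
        # path; its minimum corresponds to the selected alpha.
        row.append(model.criterion_.min())
    return row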
Example #3
def lassoLARSIC(data, label, cri, state):
    kf = KFold(n_splits=10, shuffle=True, random_state=state)
    X = data
    y = label
    r2 = []
    mse = []
    true = []
    pred = []
    feature = []
    fold_sizes = []
    for train_index, test_index in kf.split(X):
        y_train, y_test = y[train_index], y[test_index]
        X_train_tmp, X_test_tmp = X[train_index], X[test_index]
        reg = LassoLarsIC(criterion=cri, normalize=False)
        reg.fit(X_train_tmp, y_train)
        # keep all nonzero coefficients, not only the positive ones
        feature_index = np.where(reg.coef_ != 0)[0]
        X_train = X_train_tmp[:, feature_index]
        X_test = X_test_tmp[:, feature_index]
        svr = svm.SVR(kernel='linear')
        svr.fit(X_train, np.ravel(y_train))
        y_test_pred = svr.predict(X_test)
        r2.append(r2_score(y_test, y_test_pred))
        mse.append(mean_squared_error(y_test, y_test_pred))
        fold_sizes.append(len(test_index))  # record test-fold size for weighting
        feature.append(feature_index)
        pred.append(y_test_pred)
        true.append(np.ravel(y_test))
    feature = np.array(feature, dtype=object)  # folds may select different numbers of features
    pred = np.array(pred, dtype=object)
    true = np.array(true, dtype=object)
    r2 = np.array(r2)
    mse = np.array(mse)
    # weight each fold's r2 by its test-set size instead of hard-coding the sizes
    print('mean r2_score=', np.average(r2, weights=fold_sizes))
    feature = feature[np.where(r2 == max(r2))][0]
    print('number of selected features:', len(feature))
    return pred, true, r2, mse, feature
Example #4
def parameter_select(X, y):
    print(X.shape, y.shape)
    ##############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    # model_bic = LassoLarsIC(criterion='bic')
    # model_bic.fit(X, y)
    # alpha_bic_ = model_bic.alpha_
    model_aic = LassoLarsIC(criterion='aic', max_iter=100000000)
    model_aic.fit(X, y)
    alpha_aic_ = model_aic.alpha_
    print(alpha_aic_)

    def plot_ic_criterion(model, name, color):
        alpha_ = model.alpha_
        alphas_ = model.alphas_
        criterion_ = model.criterion_
        plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
                 linewidth=3, label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    # plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.show()

    fields = iot.read_fields()
    for i in range(len(fields)):
        print(str(fields[i]) + '\t' + str(model_aic.coef_[i]))
Example #5
def _linear_model(X, y, rest='', centered=''):
    if centered:
        if rest == 'LassoLars':
            lm = LassoLarsIC(fit_intercept=False)
        elif rest == 'Ridge':
            lm = Ridge(fit_intercept=False)
        else:
            lm = LinearRegression(fit_intercept=False)
    else:
        if rest == 'LassoLars':
            lm = LassoLarsIC()
        elif rest == 'Ridge':
            lm = Ridge()
        else:
            lm = LinearRegression()

    lm.fit(X, y)

    # plt.figure()
    # plt.scatter(y_real, y_hat)
    # plt.plot(y_real,y_real)
    # plt.title('Target: {} Data Type: {}  Restriction: {}'.format(y_name, identifier, 'none'))
    # print('Data Type: {} Model Type: {} \n'.format(identifier, rest))
    # return [y_real, y_hat, lin_model, mean_squared_error(y_real, y_hat)]
    return lm
Example #6
    def predict_adaptive_lasso(X, predictors, target, gamma=1.0):
        """
        Predict with Adaptive Lasso.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        predictors : array-like, shape (n_predictors)
            Indices of predictor variables.
        target : int
            Index of target variable.
        gamma : float, optional (default=1.0)
            Exponent of the adaptive weights.

        Returns
        -------
        coef : array-like, shape (n_features)
            Coefficients of predictor variable.
        """
        lr = LinearRegression()
        lr.fit(X[:, predictors], X[:, target])
        weight = np.power(np.abs(lr.coef_), gamma)
        reg = LassoLarsIC(criterion='bic')
        reg.fit(X[:, predictors] * weight, X[:, target])
        return reg.coef_ * weight
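A quick usage sketch on synthetic data (the planted coefficients are illustrative; the function is assumed callable standalone, without an enclosing instance):

# Hypothetical usage: recover the coefficients of column 0 given columns 1-3.
import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(200, 4)
X[:, 0] = 2.0 * X[:, 1] - 1.0 * X[:, 2] + 0.1 * rng.randn(200)
coef = predict_adaptive_lasso(X, predictors=[1, 2, 3], target=0)
print(coef)  # approximately [2.0, -1.0, 0.0]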
Example #7
    def _pruning(self, X, B_taus, causal_order):
        """Prune edges"""
        n_features = X.shape[1]

        stacked = [np.flip(X, axis=0)]
        for i in range(self._lags):
            stacked.append(np.roll(stacked[-1], -1, axis=0))
        blocks = np.array(list(zip(*stacked)))[:-self._lags]

        for i in range(n_features):
            causal_order_no = causal_order.index(i)
            ancestor_indexes = causal_order[:causal_order_no]

            obj = np.zeros((len(blocks)))
            exp = np.zeros((len(blocks), causal_order_no + n_features * self._lags))
            for j, block in enumerate(blocks):
                obj[j] = block[0][i]
                # fill row j only; the original's exp[j:] slice assignment
                # produced the same final matrix but rewrote rows redundantly
                exp[j] = np.concatenate([block[0][ancestor_indexes].flatten(), block[1:].flatten()])

            # adaptive lasso
            gamma = 1.0
            lr = LinearRegression()
            lr.fit(exp, obj)
            weight = np.power(np.abs(lr.coef_), gamma)
            reg = LassoLarsIC(criterion='bic')
            reg.fit(exp * weight, obj)
            coef = reg.coef_ * weight

            B_taus[0][i, ancestor_indexes] = coef[:causal_order_no]
            for j in range(len(B_taus[1:])):
                B_taus[j + 1][i, :] = coef[causal_order_no + n_features * j: causal_order_no + n_features * j + n_features]

        return B_taus
Example #8
def LASSO_Calculation(BreakPoint_0, BreakPoint_1, Julian_Days, Values):
    Julian_Days_Interval = Julian_Days[BreakPoint_0:BreakPoint_1]
    Values_Interval = Values[BreakPoint_0:BreakPoint_1]
    X = Xarray_Calcultaion(Julian_Days_Interval)
    Y = Values_Interval.reshape(len(Values_Interval), 1)
    # ====================== Fit the coefficients on the sample data ======================
    model = LassoLarsIC()  # automatically selects the best alpha via an information criterion
    model.fit(X, Y.ravel())  # linear regression fit; ravel avoids a column-vector warning
    return model
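`Xarray_Calcultaion` is not shown here. One plausible reading, given the Julian-day input and breakpoint context, is a seasonal design matrix; the sketch below is purely an assumption (and assumes numpy is imported as np):

def Xarray_Calcultaion(julian_days):
    # Hypothetical design matrix: linear trend plus an annual harmonic
    # over Julian days; the intercept is handled by the model itself.
    t = np.asarray(julian_days, dtype=float)
    return np.column_stack([
        t,                               # linear trend
        np.sin(2 * np.pi * t / 365.25),  # annual cycle
        np.cos(2 * np.pi * t / 365.25),
    ])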
Example #9
File: Model.py Project: dlont/kbc
    def build_best_prediction(self):
        print("Building LassoLarsIC linear regression vanilla model!")

        import pickle

        from matplotlib import pyplot
        from sklearn.linear_model import LassoLarsIC
        from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

        target_variable_names = self._configuration['model']['target'][0]
        data_provider = self.get_data_provider(
            self._configuration[target_variable_names]['data_provider'])

        input_features_names = self._configuration['model']['input_features']
        X_train = data_provider.train[input_features_names]
        y_train = data_provider.train[target_variable_names]

        X_test = data_provider.test[input_features_names]
        y_test = data_provider.test[target_variable_names]

        # print X_train.dtypes
        # print X_train.head()
        # print X_test.dtypes
        # print X_test.head()

        # print y_train.dtypes
        # print y_train.head()
        # print y_test.dtypes
        # print y_test.head()

        my_model_aic = LassoLarsIC(criterion='aic')
        my_model_aic.fit(X_train, y_train)
        y_pred_aic = my_model_aic.predict(X_test)
        # print "Max error: ", max_error(y_test,y_pred)
        print("AIC Explained variance score: ",
              explained_variance_score(y_test, y_pred_aic))
        print("AIC Mean absolute error: ",
              mean_absolute_error(y_test, y_pred_aic))
        print("AIC Mean squared error: ",
              mean_squared_error(y_test, y_pred_aic))

        my_model_bic = LassoLarsIC(criterion='bic')
        my_model_bic.fit(X_train, y_train)
        y_pred_bic = my_model_bic.predict(X_test)
        # print "Max error: ", max_error(y_test,y_pred)
        print("BIC Explained variance score: ",
              explained_variance_score(y_test, y_pred_bic))
        print("BIC Mean absolute error: ",
              mean_absolute_error(y_test, y_pred_bic))
        print("BIC Mean squared error: ",
              mean_squared_error(y_test, y_pred_bic))

        self.fit_results = {'aic': my_model_aic, 'bic': my_model_bic}
        # dump the dict that was just assembled (the original referenced an
        # undefined self.my_model here)
        pickle.dump(
            self.fit_results,
            open(self._configuration['model']['output_filename'], 'wb'))
Example #10
    def trainAlgo(self):

        self.model = LassoLarsIC(criterion=self.param['criterion'],
                                 fit_intercept=self.param['fit_intercept'],
                                 normalize=self.param['normalize'],
                                 max_iter=self.param['max_iter'],
                                 eps=self.param['eps'],
                                 positive=self.param['positive'])

        self.model.fit(self.inputData['X'], self.outputData['Y'])
Example #11
class HistogramClassifier:
    def __init__(self):
        X, y = make_dataframe(letter_list)
        self.columns = list(X.columns)
        self.classifier = LassoLarsIC()
        self.classifier.fit(X, y)

    def predict(self, X):
        counter = snippet_to_histogram(X, letter_list)
        # DataFrame.append was removed in pandas 2.0; build the row directly
        df = pd.DataFrame([counter], columns=self.columns).fillna(0)
        return self.classifier.predict(df)
Example #12
def test_lasso_lars_fit_copyX_behaviour(copy_X):
    """
    Test that user input to .fit for copy_X overrides default __init__ value

    """
    lasso_lars = LassoLarsIC(precompute=False)
    rng = np.random.RandomState(0)
    X = rng.normal(0, 1, (100, 5))
    X_copy = X.copy()
    y = X[:, 2]
    lasso_lars.fit(X, y, copy_X=copy_X)
    assert copy_X == np.array_equal(X, X_copy)
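`copy_X` arrives here as a test parameter; in scikit-learn's test suite this kind of test is driven by a parametrize decorator along these lines (the decorator below is an assumption about the omitted context, and applies equally to Example #15):

import pytest

@pytest.mark.parametrize('copy_X', [True, False])
def test_lasso_lars_fit_copyX_behaviour(copy_X):
    ...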
Example #13
def lasso_regression_ic(df_y: pd.DataFrame,
                        df_x: pd.DataFrame,
                        sample_length: int,
                        frequency: int,
                        criterion: str,
                        plot_lambda=True,
                        vol_target=False,
                        vol_period=20):

    if vol_target and vol_period > sample_length:
        raise ValueError(
            "The period for vol_target cannot be longer than sample_length")

    index = df_y.index.copy()
    n, m = df_x.shape
    df_weight = pd.DataFrame(columns=df_x.columns)
    df_lambda = pd.DataFrame(columns=[r'$\lambda$'])

    for i in range((n - sample_length) // frequency + 1):
        start = index[i * frequency]
        vol_start = index[i * frequency + sample_length - vol_period]
        end = index[i * frequency + sample_length - 1]
        stdx = df_x.loc[start:end].std(axis=0,
                                       skipna=False).replace({0: np.nan})
        stdy = df_y.loc[start:end].std(axis=0)
        x = (df_x.loc[start:end] / stdx).fillna(0).values
        y = (df_y.loc[start:end] / stdy).values

        las = LassoLarsIC(criterion=criterion,
                          fit_intercept=False,
                          normalize=False)
        las.fit(x, np.ravel(y))

        df_lambda.loc[end] = 2. * las.alpha_ * (stdy.iloc[0]**2)
        df_weight.loc[end] = las.coef_
        df_weight.loc[end] = df_weight.loc[end] * stdy.iloc[0] / stdx
        weight = df_weight.loc[end].values

        leverage = 1
        if vol_target:
            port_vol = np.std(df_y.loc[vol_start:end].values)
            repli_vol = np.std(
                np.dot(df_x.loc[vol_start:end].fillna(0).values, weight))
            if repli_vol > 0:  # guard against division by zero, as in Example #1
                leverage = port_vol / repli_vol
        df_weight.loc[end] = leverage * weight

    if plot_lambda:
        sns.set()
        df_lambda[r'$\lambda$'].plot(
            title=r"$\lambda$ parameter selected by the " + criterion.upper())
        plt.show()

    return df_weight.fillna(0), df_lambda
Example #14
def boot_coef(X, Y):
    """
    Takes original data and new sampling index.
    
    Returns coefficient vector. 
    """
    np.random.seed()
    N = X.shape[0]
    ind = np.random.choice(N, N)  #resample step
    clf = LassoLarsIC(criterion='aic')
    clf.fit(X[ind, :], Y[ind])
    point_estimate = clf.coef_
    return point_estimate
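A sketch of how `boot_coef` might be used downstream to form percentile bootstrap confidence intervals (the replicate count B is arbitrary):

# Hypothetical bootstrap loop: B resampled coefficient vectors,
# then 95% percentile intervals per feature.
import numpy as np
B = 500
boot = np.array([boot_coef(X, Y) for _ in range(B)])
ci_lower = np.percentile(boot, 2.5, axis=0)
ci_upper = np.percentile(boot, 97.5, axis=0)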
Example #15
def test_lasso_lars_copyX_behaviour(copy_X):
    """
    Test that user input regarding copy_X is not being overridden (it was until
    at least version 0.21)

    """
    lasso_lars = LassoLarsIC(copy_X=copy_X, precompute=False)
    rng = np.random.RandomState(0)
    X = rng.normal(0, 1, (100, 5))
    X_copy = X.copy()
    y = X[:, 2]
    lasso_lars.fit(X, y)
    assert copy_X == np.array_equal(X, X_copy)
Example #16
class LassoLarsICImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
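Assuming `Op` is bound to sklearn's `LassoLarsIC` (which the hyperparameter passthrough suggests, lale-style), the wrapper would be used like the estimator itself:

# Hypothetical usage; Op = LassoLarsIC is an assumption.
import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X[:, 0] + 0.1 * rng.randn(100)
impl = LassoLarsICImpl(criterion='bic').fit(X, y)
print(impl.predict(X[:5]))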
Example #17
def lasso_lars_ic(data, target):
    model_bic = LassoLarsIC(criterion='bic')
    model_bic.fit(data, target)
    alpha_bic_ = model_bic.alpha_

    model_aic = LassoLarsIC(criterion='aic')
    model_aic.fit(data, target)
    alpha_aic_ = model_aic.alpha_

    def plot_ic_criterion(model, name, color):
        alpha_ = model.alpha_
        alphas_ = model.alphas_
        criterion_ = model.criterion_
        plt.plot(-np.log10(alphas_),
                 criterion_,
                 '--',
                 color=color,
                 linewidth=3,
                 label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_),
                    color=color,
                    linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.savefig(plot_path + 'lasso_lars.png', dpi=300)
Example #18
    def _pruning(self, X, ee, order, causal_order):
        """"""
        n_features = X.shape[1]

        # join X(t), X(t-1) and e(t-1)
        X_joined = np.zeros(
            (X.shape[0], X.shape[1] * (1 + order[0] + order[1])))
        for p in range(1 + order[0]):
            pos = n_features * p
            X_joined[:, pos:pos + n_features] = np.roll(X[:, 0:n_features],
                                                        p,
                                                        axis=0)

        for q in range(order[1]):
            pos = n_features * (1 + order[0]) + n_features * q
            X_joined[:, pos:pos + n_features] = np.roll(ee[:, 0:n_features],
                                                        q + 1,
                                                        axis=0)

        # pruned by adaptive lasso
        psi_omega = np.zeros(
            (n_features, n_features * (1 + order[0] + order[1])))
        for i, target in enumerate(causal_order):
            predictors = [
                j for j in range(X_joined.shape[1])
                if j not in causal_order[i:]
            ]

            # adaptive lasso
            gamma = 1.0
            lr = LinearRegression()
            lr.fit(X_joined[:, predictors], X_joined[:, target])
            weight = np.power(np.abs(lr.coef_), gamma)
            reg = LassoLarsIC(criterion='bic')
            reg.fit(X_joined[:, predictors] * weight, X_joined[:, target])

            psi_omega[target, predictors] = reg.coef_ * weight

        # split psi and omega
        psis = np.zeros(((1 + order[0]), n_features, n_features))
        for p in range(1 + order[0]):
            pos = n_features * p
            psis[p] = psi_omega[:, pos:pos + n_features]

        omegas = np.zeros((order[1], n_features, n_features))
        for q in range(order[1]):
            pos = n_features * (1 + order[0]) + n_features * q
            omegas[q] = psi_omega[:, pos:pos + n_features]

        return psis, omegas
Example #19
def __lasso_selected(data, data_test, response):
    X = data.drop([response], axis=1).to_numpy()  # as_matrix() was removed in pandas 1.0
    y = np.array(data[response].tolist()).reshape((len(data), 1))
    #X = sm.add_constant(X)
    #model = sm.OLS(y,X)
    #m = model.fit_regularized(refit=True)
    #yp = m.predict(data_test)
    reg = LassoLarsIC(criterion='bic')
    print(y.shape, X.shape)
    reg.fit(X, np.ravel(y))  # ravel avoids a column-vector warning
    x = data_test.drop([response], axis=1).to_numpy().reshape((len(data_test), len(data_test.keys()) - 1))
    yp = reg.predict(x)
    te = np.mean((yp - np.array(data_test[response].tolist())) ** 2)
    print(reg.coef_, te)
    return
Example #20
class r07546035_ICRegression(regression):
    def trainAlgo(self):

        self.model = LassoLarsIC(criterion=self.param['criterion'],
                                 fit_intercept=self.param['fit_intercept'],
                                 normalize=self.param['normalize'],
                                 max_iter=self.param['max_iter'],
                                 eps=self.param['eps'],
                                 positive=self.param['positive'])

        self.model.fit(self.inputData['X'], self.outputData['Y'])

    def predictAlgo(self):

        self.result['Y'] = self.model.predict(self.inputData['X'])
Example #21
def mdl_1d(x, y):
    """builds univariate model to calculate AUC"""
    lr = LogisticRegressionCV(scoring='roc_auc')
    lars = LassoLarsIC(criterion='aic')

    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x2 = sb_cutz(x)
        series = pd.get_dummies(x2, dummy_na=True)
    else:
        series = pd.get_dummies(x, dummy_na=True)

    lr.fit(series, y)
    lars.fit(series, y)

    try:
        preds = (lr.predict_proba(series)[:, -1])
        #preds = (preds > preds.mean()).astype(int)
    except ValueError:
        Tracer()()

    # try:
    #    cm = confusion_matrix(y, (preds > y.mean()).astype(int))
    # except ValueError:
    #    Tracer()()

    aucz = roc_auc_score(y, preds)

    ns = num_bin_stats(x, y)

    nplot = plot_num(ns)
    #plot = plot_confusion_matrix(cm, y)

    imgdata = BytesIO()
    nplot.savefig(imgdata)
    imgdata.seek(0)
    nplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()

    bplot = plot_bubble(ns)
    imgdatab = BytesIO()
    bplot.savefig(imgdatab)
    imgdatab.seek(0)
    bplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdatab.getvalue()))
    plt.close()

    return aucz, nplot, bplot
Example #22
def get_param_mask(X, y, criterion='bic', **kwargs):
    """
    Determine the most relevant parameters in the input space using a regularised linear model and either the
    Akaike or Bayesian Information Criterion.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Parameter values
    y : array-like of shape (n_samples,)
        target values.
    criterion: {'bic', 'aic'}, default='bic'
        The information criterion to apply for parameter selection. Either the Akaike or Bayesian Information Criterion.
    kwargs: dict
        Further arguments for sklearn.feature_selection.SelectFromModel

    Returns
    -------
    mask : ndarray
        A boolean array of shape [# input features], in which an element is
        True iff its corresponding feature is selected for retention.
    """
    from sklearn.linear_model import LassoLarsIC
    from sklearn.feature_selection import SelectFromModel

    lsvc = LassoLarsIC(criterion=criterion).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True, **kwargs)
    return model.get_support()
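A short usage sketch on synthetic data where only the first two features carry signal:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + 0.1 * rng.randn(200)
mask = get_param_mask(X, y, criterion='bic')
print(mask)  # expect True for the first two features, False elsewhere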
Example #23
class HistogramClassifier:
    def __init__(self):
        X, y = make_dataframe(letter_list)
        self.columns = list(X.columns)
        self.classifier = LassoLarsIC()
        self.classifier.fit(X, y)

    def predict(self, X):
        counter = snippet_to_histogram(X, letter_list)
        # DataFrame.append was removed in pandas 2.0; build the row directly
        df = pd.DataFrame([counter], columns=self.columns).fillna(0)
        # the model sees a single histogram row, so every loop iteration in the
        # original produced the same value; predict once and round it
        return round(float(self.classifier.predict(df)[0]))
Example #24
def update_results(n_clicks, value_tipo, value_estados):
    #return n_clicks
    if n_clicks is None:
        raise PreventUpdate
    else: 
        if value_estados == []:
            raise PreventUpdate

        if type(value_estados) == int:
            value_estados = [value_estados]
        lista_final = list(map(int, value_estados))
    
        tipo = value_tipo
        reg = LassoLarsIC(criterion='aic')
        info_rs = RiskScore()
        dicc_results = info_rs.get_all_info_filtered(lista_final,tipo, reg)
        criteria = dicc_results["criteria"].to_dict('records')
     
        id_map = int(''.join(map(str,lista_final)))
        nombre_map = "mapas/mapa_"+ tipo + "_" + str(id_map) + ".html"
        info_rs.get_map(dicc_results["risk"], nombre_map)
        with open(nombre_map, 'r') as f:
            abre_mapa = f.read()
        
        # Save results
        nombre_rs = "files/risk_"+ tipo + "_" + str(id_map) + ".csv"
        dicc_results["risk"].to_csv(nombre_rs, index=False)
        update_link = "/download/risk_"+ tipo + "_" + str(id_map) + ".csv"

        r2a_res = dicc_results["r2a"].round(2)
        rmse_res = dicc_results["rmse"].round(2)
        num_escu = format(len(dicc_results["risk"]), ',')
        return criteria, abre_mapa, update_link, r2a_res, num_escu, rmse_res
Example #25
    def trained_model(self, data, dependent_var):
        """
        Attempts to find the best lasso alpha value fitting a dataset using the BIC
        metric: https://stats.stackexchange.com/questions/126898/tuning-alpha-parameter-in-lasso-linear-model-in-scikitlearn
        """
        predictors = data.drop([dependent_var], axis=1)
        # def bic_score(predictors, y, model):
        #    sse = sum((model.predict(predictors) - y.values[0])**2)
        #    s = np.count_nonzero(model.coef_)
        #    n = len(predictors.columns)
        #    cn = math.sqrt(n)/(s*s)
        #    print(math.log(sse/n) + s*math.log(n)/n*cn)
        #    return math.log(sse/n) + abs(s)*math.log(n)/n*cn

        model = LassoLarsIC(criterion='bic')
        # pass a Series (1-D) rather than a one-column DataFrame to avoid a shape warning
        model.fit(predictors, data[dependent_var])
        return model
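A minimal usage sketch (the DataFrame and column names are illustrative; `obj` stands in for an instance of the enclosing class):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 2), columns=['a', 'b'])
df['target'] = 2.0 * df['a'] + 0.1 * rng.randn(100)
model = obj.trained_model(df, 'target')  # hypothetical call on the enclosing instance
print(model.alpha_, model.coef_)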
Example #26
def train_test_lasso(input_data, output_data, train_key, test_key, n_cv=3):
    """
    Training and prediction with lasso regression
    """
    # Edge case: fewer training samples than CV splits
    if len(train_key) < n_cv:
        n_cv = len(train_key)

    #-------------
    # Training
    #-------------
    x = input_data[train_key, :]
    y = output_data[train_key]

    # create scaler instances
    x_scaler = StandardScaler()
    y_scaler = StandardScaler()

    clf = LassoLarsCV(cv=n_cv, positive=True)
    #clf = LassoLarsIC(criterion='bic', positive=True)

    # build the model
    x_scaler.fit(x)  # standardize
    y_scaler.fit(y.reshape(-1, 1))  # standardize

    y_ = y_scaler.transform(y.reshape(-1, 1))
    y_ = y_.reshape(-1)

    #import pdb; pdb.set_trace()

    #error_flag = 0  # initialize
    try:
        clf.fit(x_scaler.transform(x), y_)
    except ValueError:
        # fall back to the information-criterion variant if CV fitting fails
        clf = LassoLarsIC(criterion='bic', positive=True)
        clf.fit(x_scaler.transform(x), y_)

    # extract model parameters
    #alpha = clf.alpha_  # hyperparameter
    a = clf.coef_  # coefficients
    b = clf.intercept_  # intercept
    p = np.append(a, b)

    #-------------
    # Prediction
    #-------------
    x = input_data[test_key, :]

    # Edge case: a single test sample -> reshape to 2-D
    if x.ndim == 1:
        x = x.reshape(1, -1)

    # predict
    tmp = clf.predict(x_scaler.transform(x))
    # de-standardize; the reshape keeps newer scikit-learn versions happy
    y_pred = y_scaler.inverse_transform(tmp.reshape(-1, 1)).reshape(-1)
    return y_pred, p
Example #27
 def __init__(self, X, y, lasso_lar_ic_params, nfolds=3, n_jobs=1, scoring=None, random_grid=False, n_iter=10, verbose=True):

     self._code = "lasso_lar_ic"

     if verbose:
         print("Constructed LassoLarICRegressorPredictiveModel: " + self._code)

     AbstractRegressorPredictiveModel.__init__(self, "regressor", X, y, lasso_lar_ic_params, nfolds, n_jobs, scoring, random_grid, n_iter, verbose)
     self._model = self.constructRegressor(LassoLarsIC(), self._random_grid)
Example #28
    def solve(self, fraction_evaluated, dim):
        eyAdj = self.linkfv(self.ey[:, dim]) - self.link.f(self.fnull[dim])

        s = np.sum(self.maskMatrix, 1)

        # do feature selection if we have not well enumerated the space
        nonzero_inds = np.arange(self.M)
        log.debug("fraction_evaluated = {0}".format(fraction_evaluated))
        if (self.l1_reg not in ["auto", False, 0]) or (fraction_evaluated < 0.2 and self.l1_reg == "auto"):
            w_aug = np.hstack((self.kernelWeights * (self.M - s), self.kernelWeights * s))
            log.info("np.sum(w_aug) = {0}".format(np.sum(w_aug)))
            log.info("np.sum(self.kernelWeights) = {0}".format(np.sum(self.kernelWeights)))
            w_sqrt_aug = np.sqrt(w_aug)
            eyAdj_aug = np.hstack((eyAdj, eyAdj - (self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))))
            eyAdj_aug *= w_sqrt_aug
            mask_aug = np.transpose(w_sqrt_aug * np.transpose(np.vstack((self.maskMatrix, self.maskMatrix - 1))))
            var_norms = np.array([np.linalg.norm(mask_aug[:, i]) for i in range(mask_aug.shape[1])])

            if self.l1_reg == "auto":
                model = LassoLarsIC(criterion="aic")
            elif self.l1_reg == "bic" or self.l1_reg == "aic":
                model = LassoLarsIC(criterion=self.l1_reg)
            else:
                model = Lasso(alpha=self.l1_reg)

            model.fit(mask_aug, eyAdj_aug)
            nonzero_inds = np.nonzero(model.coef_)[0]

        if len(nonzero_inds) == 0:
            return np.zeros(self.M), np.ones(self.M)

        # eliminate one variable with the constraint that all features sum to the output
        eyAdj2 = eyAdj - self.maskMatrix[:, nonzero_inds[-1]] * (
                    self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))
        etmp = np.transpose(np.transpose(self.maskMatrix[:, nonzero_inds[:-1]]) - self.maskMatrix[:, nonzero_inds[-1]])
        log.debug("etmp[:4,:] {0}".format(etmp[:4, :]))

        # solve a weighted least squares equation to estimate phi
        tmp = np.transpose(np.transpose(etmp) * np.transpose(self.kernelWeights))
        tmp2 = np.linalg.inv(np.dot(np.transpose(tmp), etmp))
        w = np.dot(tmp2, np.dot(np.transpose(tmp), eyAdj2))
        log.debug("np.sum(w) = {0}".format(np.sum(w)))
        log.debug("self.link(self.fx) - self.link(self.fnull) = {0}".format(
            self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim])))
        log.debug("self.fx = {0}".format(self.fx[dim]))
        log.debug("self.link(self.fx) = {0}".format(self.link.f(self.fx[dim])))
        log.debug("self.fnull = {0}".format(self.fnull[dim]))
        log.debug("self.link(self.fnull) = {0}".format(self.link.f(self.fnull[dim])))
        phi = np.zeros(self.M)
        phi[nonzero_inds[:-1]] = w
        phi[nonzero_inds[-1]] = (self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim])) - sum(w)
        log.info("phi = {0}".format(phi))

        # clean up any rounding errors
        for i in range(self.M):
            if np.abs(phi[i]) < 1e-10:
                phi[i] = 0

        return phi, np.ones(len(phi))
Example #29
def aic(df_data):
    X_data_pd = df_data.toPandas()  # Spark DataFrame to pandas DataFrame
    X1 = X_data_pd.values
    y1 = X_data_pd['label'].values * 10000

    rng = np.random.RandomState(42)
    X1 = np.c_[X1, rng.randn(X1.shape[0], 14)]  # add some bad features
    # normalize data as done by Lars to allow for comparison
    # (the original also manipulated an undefined array X here; removed)
    X1 /= np.sqrt(np.sum(X1**2, axis=0))
    # #############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    model_aic = LassoLarsIC(criterion='aic')

    t3 = time.time()
    model_aic.fit(X1, y1)
    t_aic = time.time() - t3
    alpha_aic_ = model_aic.alpha_

    def plot_ic_criterion(plot_model, name, color):
        alpha_ = plot_model.alpha_
        alphas_ = plot_model.alphas_
        criterion_ = plot_model.criterion_
        plt.plot(-np.log10(alphas_),
                 criterion_,
                 '--',
                 color=color,
                 linewidth=3,
                 label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_),
                    color=color,
                    linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    plt.legend()
    plt.title(
        'Information-criterion for model selection (training time %.3fs)' %
        t_aic)
    plt.show()
Example #30
 def test_model_lasso_lars_ic(self):
     model, X = fit_regression_model(LassoLarsIC())
     model_onnx = convert_sklearn(
         model, "lasso lars cv",
         [("input", FloatTensorType([None, X.shape[1]]))])
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X,
                         model,
                         model_onnx,
                         basename="SklearnLassoLarsIC-Dec4")
Example #31
def lassoLarsIC_coef(X, Xk, Y, precompute='auto', copy_X=True, criterion='aic'):
    p = X.shape[1]
    XXk = np.concatenate((X, Xk), axis=1)
    lfit = LassoLarsIC(criterion=criterion,
                       copy_X=copy_X,
                       normalize=False,
                       eps=1e-11,
                       fit_intercept=False).fit(XXk, Y)
    b = lfit.coef_
    return b
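This helper fits the lasso jointly on the original features X and their knockoff copies Xk, the setup used by the knockoff filter's coefficient statistic. A sketch of the downstream statistic (an assumption about how `b` is consumed, with numpy imported as np):

# Hypothetical knockoff statistic: W[j] > 0 suggests feature j carries
# more signal than its knockoff copy.
b = lassoLarsIC_coef(X, Xk, Y)
p = X.shape[1]
W = np.abs(b[:p]) - np.abs(b[p:])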
Example #32
def run_model(dataset, window_size, first_eday, last_eday, param, file_lambd,
              file_betas):
    #param: lambd for HP, level for wavelet
    raw_data = np.genfromtxt(f'DATA/{dataset}.txt')
    qs_real = np.reshape(raw_data[:, 2], (-1, 24))
    num_days = qs_real.shape[0]

    qs_predictions = np.zeros(qs_real.shape)
    dummies_all = get_dummies(raw_data[:, 0])
    lprices = np.log(raw_data[:, 2])
    lloads = np.log(raw_data[:, 4])

    for day in range(window_size + 1, num_days + 1):
        first_day_index = day - (window_size + 1)
        dummies = get_dummies_inwindow(dummies_all, window_size,
                                       first_day_index)
        #qs, qsmin, qsmax, zt, Ts_hat=decomposition_wavelet(lprices, lloads, first_day_index, window_size,param)
        #qs, qsmin, qsmax, zt, Ts_hat=decomposition_HP(lprices, lloads, first_day_index, window_size,param)
        qs, qsmin, qsmax, zt, Ts_hat = remove_mean(window_size,
                                                   first_day_index, lprices,
                                                   lloads)
        for hour in range(24):
            X, Y, Xr = get_calibartion_dataset(qs, qsmin, qsmax, zt,
                                               window_size, hour, dummies)

            model_aic = LassoLarsIC(criterion='aic', fit_intercept=False)
            fitted_model = model_aic.fit(X, Y)
            params = fitted_model.coef_
            est_lambd = fitted_model.alpha_
            file_lambd.write(str(est_lambd) + "\n")
            np.savetxt(file_betas, params.reshape(1, params.shape[0]))
            #print(params)
            c_prediction = make_prediction(
                Xr, params, Ts_hat
            )  #if wavelet or HP filters are used, change Ts_hat -> Ts_hat[hour]
            qs_predictions[first_day_index + window_size, hour] = c_prediction
            print(f'(day,hour):\t({day},{hour}):\t{c_prediction}')

    date = raw_data[0, 0].astype(int)
    first_day = get_datetime(date)
    WMAE, ave_num = get_WMAE(qs_real, qs_predictions, first_day, first_eday,
                             last_eday)
    return qs_real, qs_predictions, WMAE
Example #33
    def recalibrate(self, Xtrain, Ytrain):
        """Function to recalibrate the LEAR model. 
        
        It uses a training (Xtrain, Ytrain) pair for recalibration
        
        Parameters
        ----------
        Xtrain : numpy.array
            Input in training dataset. It should be of size *[n,m]* where *n* is the number of days
            in the training dataset and *m* the number of input features
        
        Ytrain : numpy.array
            Output in training dataset. It should be of size *[n,24]* where *n* is the number of days 
            in the training dataset and 24 are the 24 prices of each day
                
        Returns
        -------
        numpy.array
            The prediction of day-ahead prices after recalibrating the model        
        
        """

        # # Applying Invariant, aka asinh-median transformation to the prices
        [Ytrain], self.scalerY = scaling([Ytrain], 'Invariant')

        # # Rescaling all inputs except dummies (7 last features)
        [Xtrain_no_dummies], self.scalerX = scaling([Xtrain[:, :-7]], 'Invariant')
        Xtrain[:, :-7] = Xtrain_no_dummies

        self.models = {}
        for h in range(24):

            # Estimating lambda hyperparameter using LARS
            param_model = LassoLarsIC(criterion='aic', max_iter=2500)
            param = param_model.fit(Xtrain, Ytrain[:, h]).alpha_

            # Re-calibrating LEAR using standard LASSO estimation technique
            model = Lasso(max_iter=2500, alpha=param)
            model.fit(Xtrain, Ytrain[:, h])

            self.models[h] = model
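The two-step design here (estimate lambda once with AIC via LARS, then refit with plain coordinate-descent Lasso) can be reproduced in isolation; a standalone sketch on synthetic data:

import numpy as np
from sklearn.linear_model import Lasso, LassoLarsIC

rng = np.random.RandomState(0)
X = rng.randn(300, 20)
y = X[:, 0] - 0.5 * X[:, 1] + 0.1 * rng.randn(300)

alpha = LassoLarsIC(criterion='aic').fit(X, y).alpha_  # cheap hyperparameter search
model = Lasso(alpha=alpha, max_iter=2500).fit(X, y)    # robust final fit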
Example #34
    def fit_models_LassoCV(self, X, Y, bands=None):
        """ Try to fit models to training period time series """
        if bands is None:
            bands = self.fit_indices

        models = []

        for b in bands:
            # lasso = LassoCV(n_alphas=100)
            # lasso = LassoLarsCV(max_n_alphas=100)
            lasso = LassoLarsIC(criterion='bic')
            lasso = lasso.fit(X, Y[b, :])
            lasso.nobs = Y[b, :].size
            lasso.coef = np.copy(lasso.coef_)
            lasso.coef[0] += lasso.intercept_
            lasso.fittedvalues = lasso.predict(X)
            lasso.rss = np.sum((Y[b, :] - lasso.fittedvalues) ** 2)
            lasso.rmse = math.sqrt(lasso.rss / lasso.nobs)

            models.append(lasso)

        return np.array(models)
Example #35
import time

import matplotlib.pyplot as plt

from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC



##############################################################################
# LassoLarsIC: least angle regression with BIC/AIC criterion

model_bic = LassoLarsIC(criterion='bic')
t1 = time.time()
model_bic.fit(X_train, y_train)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X_train, y_train)
alpha_aic_ = model_aic.alpha_


def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')
Example #36
from sklearn import datasets

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

rng = np.random.RandomState(42)
X = np.c_[X, rng.randn(X.shape[0], 14)]  # add some bad features

# normalize data as done by Lars to allow for comparison
X /= np.sqrt(np.sum(X ** 2, axis=0))

##############################################################################
# LassoLarsIC: least angle regression with BIC/AIC criterion

model_bic = LassoLarsIC(criterion="bic")
t1 = time.time()
model_bic.fit(X, y)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_

model_aic = LassoLarsIC(criterion="aic")
model_aic.fit(X, y)
alpha_aic_ = model_aic.alpha_


def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, "--", color=color, linewidth=3, label="%s criterion" % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3, label="alpha: %s estimate" % name)
    plt.xlabel("-log(alpha)")
    plt.ylabel("criterion")
Example #37
# R-square from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)


##

from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC

## next we try to fit using the AIC criterion

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(pred_train,tar_train)
alpha_aic_ = model_aic.alpha_

def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
Example #38
def detect_stops_on_link_traj(traj, debug=False, **param):
  """
  Detects if a trajectory (characterized by a list of tspots on a link) corresponds to a stopping vehicle
  
  Input:
  traj: list of tspots on a link
  debug (optional): if True, returns extra variables to help with debugging. Default=False
  
  Output:
  stopping (bool): if True, indicates that there is a stop on the trajectory
  If debug is set to True (default=False)
  obs: list of Point_pts representing the 
    offset and time (in seconds after beginning of traj) 
    of the measurements
  est: list of Point_pts representing the 
    offset and time (in seconds after beginning of traj) 
    of the estimated location of the measurements
  """
  if len(traj) <= 3:
    return False
  loc = np.array([x.spot.offset - traj[0].spot.offset for x in traj[1 :]])
  time = np.array([(x.time - traj[0].time).total_seconds() for x in traj])
    
  n = len(time) - 1
  A = np.zeros((n, n), dtype=np.float64)
  for i in range(n):
    A[i :, i] = time[i + 1] - time[i]
    
  model_bic = LassoLarsIC(criterion='bic', fit_intercept=False)
  # There is a bug here for the following data:
#[[  0.669923   0.         0.         0.         0.         0.         0.
#    0.      ]
# [  0.669923   2.         0.         0.         0.         0.         0.
#    0.      ]
# [  0.669923   2.        34.         0.         0.         0.         0.
#    0.      ]
# [  0.669923   2.        34.         2.         0.         0.         0.
#    0.      ]
# [  0.669923   2.        34.         2.         2.         0.         0.
#    0.      ]
# [  0.669923   2.        34.         2.         2.         4.         0.
#    0.      ]
# [  0.669923   2.        34.         2.         2.         4.         2.
#    0.      ]
# [  0.669923   2.        34.         2.         2.         4.         2.
#    0.94968 ]]
#[  6.24743444   6.24743444   6.24743444  10.41858373  14.46159935
#  26.17648665  39.90241795  52.77      ]
#
#  if A.shape == (8,8):
#    print A
#    print loc
  try:
    model_bic.fit(A, loc)
  except TypeError:
    print("Failure in detect_stops_on_link_traj")
    return False

  if debug:
    print('Lasso BIC')
    print(np.dot(A, model_bic.coef_))
    print(model_bic.coef_)
    
  stop_time = 0
  stopping = False
  for i, speed in enumerate(model_bic.coef_):
    if speed < param['speed_threshold']:
      stop_time += time[i + 1] - time[i]
      if stop_time >= param['min_stop_duration']:
        stopping = True
        break
  if not debug:
    return stopping
  est_loc = [0.0] + list(np.dot(A, model_bic.coef_))
  obs = [struct.Point_pts(s.spot.offset, t, 0) for (s, t) in zip(traj, time)]
  est = [struct.Point_pts(e + traj[0].spot.offset, t, 0) for (e, t) in zip(est_loc, time)]
  plt.plot(time[1 :], loc, '-+b', linewidth=5, label='Observations')
  plt.plot(time, est_loc, '-or', label='Estimate')
  print(stopping)
  plt.legend()
  plt.show()
  return (stopping, obs, est)
Example #39
    def Plot_Lasso_LARS_Path(self):

        X = self.X
        y = self.Y

        # normalize data as done by Lars to allow for comparison
        X /= np.sqrt(np.sum(X ** 2, axis=0))

        #######################################################################
        # LassoLarsIC: least angle regression with BIC/AIC criterion

        model_bic = LassoLarsIC(criterion='bic')
        t1 = time.time()
        model_bic.fit(X, y)
        t_bic = time.time() - t1

        model_aic = LassoLarsIC(criterion='aic')
        model_aic.fit(X, y)

        plt.figure()
        self.plot_ic_criterion(model_aic, 'AIC', 'b')
        self.plot_ic_criterion(model_bic, 'BIC', 'r')
        print(model_bic.alpha_)
        plt.legend()
        plt.title('Information-criterion for model selection (training time %.3fs)'
                  % t_bic)

        #######################################################################
        # LassoCV: coordinate descent

        # Compute paths
        print(
            "Computing regularization path using the coordinate descent lasso...")
        t1 = time.time()
        model = LassoCV(cv=20).fit(X, y)
        t_lasso_cv = time.time() - t1

        # Display results
        m_log_alphas = -np.log10(model.alphas_)

        plt.figure()
        ymin, ymax = 2300, 3800
        plt.plot(m_log_alphas, model.mse_path_, ':')
        plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha: CV estimate')

        plt.legend()

        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.title('Mean square error on each fold: coordinate descent '
                  '(train time: %.2fs)' % t_lasso_cv)
        plt.axis('tight')
        plt.ylim(ymin, ymax)

        #######################################################################
        # LassoLarsCV: least angle regression

        # Compute paths
        print("Computing regularization path using the Lars lasso...")
        t1 = time.time()
        model = LassoLarsCV(cv=20).fit(X, y)
        t_lasso_lars_cv = time.time() - t1

        # Display results
        m_log_alphas = -np.log10(model.cv_alphas_)

        plt.figure()
        plt.plot(m_log_alphas, model.cv_mse_path_, ':')
        plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha CV')
        plt.legend()

        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
                  % t_lasso_lars_cv)
        plt.axis('tight')
        plt.ylim(ymin, ymax)

        plt.show()