Example #1
    def _compareFit(self, q):
        # Quantile autoregression (QAR) fit at quantile q
        qar = self.fit(q)
        rhoName = qar.params.index[1]

        # alpha_tau: intercept estimate with its confidence bounds
        alpha_tau = qar.params.iloc[0]
        alpha_tauLowerCI = qar.conf_int().loc['const'].iloc[0]
        alpha_tauUpperCI = qar.conf_int().loc['const'].iloc[1]

        # rho_tau: autoregressive coefficient with its confidence bounds
        rho_tau = qar.params.iloc[1]
        rho_tauLowerCI = qar.conf_int().loc[rhoName].iloc[0]
        rho_tauUpperCI = qar.conf_int().loc[rhoName].iloc[1]

        # OLS fit on the same data for comparison
        ols = OLS(self.y, self.X).fit()
        rhoName = ols.params.index[1]

        # alphaOLS: intercept estimate with its confidence bounds
        alphaOLS = ols.params.iloc[0]
        alphaOLSLowerCI = ols.conf_int().loc['const'].iloc[0]
        alphaOLSUpperCI = ols.conf_int().loc['const'].iloc[1]

        # rhoOLS: slope estimate with its confidence bounds
        rhoOLS = ols.params.iloc[1]
        rhoOLSLowerCI = ols.conf_int().loc[rhoName].iloc[0]
        rhoOLSUpperCI = ols.conf_int().loc[rhoName].iloc[1]

        params = {
            'quantile': q,
            'α₀(τ)': alpha_tau,
            'α₀(τ):LB': alpha_tauLowerCI,
            'α₀(τ):UB': alpha_tauUpperCI,
            'α₀(OLS)': alphaOLS,
            'α₀(OLS):LB': alphaOLSLowerCI,
            'α₀(OLS):UB': alphaOLSUpperCI,
            'ρ₁(τ)': rho_tau,
            'ρ₁(τ):LB': rho_tauLowerCI,
            'ρ₁(τ):UB': rho_tauUpperCI,
            'ρ₁(OLS)': rhoOLS,
            'ρ₁(OLS):LB': rhoOLSLowerCI,
            'ρ₁(OLS):UB': rhoOLSUpperCI,
        }
        return params
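
# Usage sketch (illustrative, not part of the source): assuming `model` is an
# instance of the surrounding QAR class, the per-quantile dictionaries returned
# by _compareFit can be stacked into a single comparison table.
import pandas as pd

quantiles = [0.05, 0.25, 0.50, 0.75, 0.95]
comparison = pd.DataFrame([model._compareFit(q) for q in quantiles]).set_index('quantile')
print(comparison[['ρ₁(τ)', 'ρ₁(τ):LB', 'ρ₁(τ):UB', 'ρ₁(OLS)']])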
Example #2
class LinearRegression:
    """
    Class for OLS regression models based on the excellent statsmodels package.
    
    Parameters
    ----------
    method : 'enter' or 'backward'
        Method for predictor selection
    include_constant : bool
        (CURRENTLY UNAVAILABLE) Whether to include a constant in the model
    sig_level_entry : float
        (CURRENTLY UNAVAILABLE) Max significance level for a predictor to enter the model
    sig_level_removal : float
        Significance level above which a predictor is removed from the model

    Attributes
    ----------
    variables_excluded : list
        Variables excluded because of zero variance
    variables_included : list
        Variables included in the model
    predictions : pd.Series
        Predicted values
    N : int
        Number of observations included in the model
    r2 : float
        R-squared (coefficient of determination)
    r2_adjusted : float
        Adjusted R-squared
    F : float
        F-statistic
    F_pvalue : float
        P-value for F-statistic
    ess : float
        Explained sum of squares
    rss : float
        Residual sum of squares
    tss : float
        Total sum of squares
    coefficients : pd.Series
        Regression coefficients
    coefficients_sterrors : pd.Series
        Standard errors of regression coefficients 
    coefficients_tvalues : pd.Series
        T-statistics of regression coefficients 
    coefficients_pvalues : pd.Series
        P-values of regression coefficients
    """
    def __init__(self,
                 method='enter',
                 include_constant=True,
                 sig_level_entry=0.05,
                 sig_level_removal=0.05):

        self.method = method.lower().strip()
        self.include_constant = include_constant
        self.sig_level_entry = sig_level_entry
        self.sig_level_removal = sig_level_removal

    def fit(self,
            data,
            formula,
            categorical_variables=None,
            show_results=True,
            confidence_intervals=True,
            collinearity_statistics=False,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit model to the given data using formula.

        Parameters
        ----------
        data : pd.DataFrame
            Data to fit the model
        formula : str
            Formula of the model specification, e.g. 'y ~ x1 + x2';
            may be passed either in Patsy (statsmodels) notation
            or using the following rules:
            '*' for interaction of the variables,
            ':' for interaction & main effects,
            i.e. 'y ~ x:z' is equivalent to 'y ~ x + z + x*z' (the reverse of the Patsy notation).
            If you use Patsy notation, set use_patsy_notation=True.
        categorical_variables : list
            Names of the variables that should be treated as categorical.
            These variables are automatically converted into sets of dummy variables.
            If you use this option, make sure no variable name is nested in another
            (e.g. 'imdb' and 'imdb_rate' at the same time); otherwise the conversion is incorrect.
        show_results : bool
            Whether to show the results of the analysis
        confidence_intervals : bool
            Whether to include the coefficients' confidence intervals in the summary table
        collinearity_statistics : bool
            Whether to include the coefficients' tolerance and VIF in the summary table
        use_patsy_notation : bool
            Turn this on if you use strictly Patsy's rules to define a formula.
            See more: https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int
            Number of digits to round results to when showing them

        Returns
        -------
        self
            The current instance of the LinearRegression class
        """

        self._data = data.copy()

        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self._show_col = collinearity_statistics

        if '=' in formula:
            formula = formula.replace('=', '~')

        if not use_patsy_notation:
            # The custom notation reverses Patsy's '*' and ':' (here '*' means
            # interaction only and ':' means main effects + interaction), so the
            # two symbols are swapped via a temporary '^' placeholder.
            formula = formula.replace('*', '^').replace(':', '*').replace('^', ':')

        self.formula = formula

        # NOTE: this won't work correctly if some variable names are nested in
        # others (e.g. kinopoisk_rate and kinopoisk_rate_count)
        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    'Categorical variables should be passed as a list. '
                    f'Type {type(categorical_variables)} was passed instead.')
            else:
                for variable in categorical_variables:
                    formula = formula.replace(variable, f'C({variable})')

        self._model = ols(formula=formula, data=data).fit()
        self._observations_idx = list(self._model.fittedvalues.index)
        self.dependent_variable = self._model.model.endog_names
        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = OLS(y, X, missing='drop').fit()
            self.variables_excluded = [
                LinearRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()

        self.predictions = self.predict()

        if show_results:
            self.show_results(n_decimals)

        if len(self.variables_excluded) > 0:
            print('------------------\n')
            print(
                f"Following variables were excluded due to zero variance: {'; '.join(self.variables_excluded)}"
            )

        return self

    def predict(
        self,
        data=None,
        add_to_data=False,
    ):
        """
        Predict values of a dependent variable for the given data using the fitted model.
        
        Parameters
        ----------
        data : pd.DataFrame
            Data for predictions;
            may be omitted to predict values for the same data used to fit the model
        add_to_data : bool
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name = f'{self.dependent_variable} (predicted)'

        if data is None:
            data_init = self._data.copy()
            result = self._model.fittedvalues
            data_init[name] = result
            if add_to_data:
                return data_init
            else:
                return data_init[name].copy()

        else:
            aux_model = ols(self.formula, data).fit()
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [LinearRegression._translate_from_patsy_notation(x)\
                              for x in aux_data_cols]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            aux_X = add_constant(aux_data[self.variables_included].copy())
            aux_y = aux_model.model.endog.copy()

            aux_model = OLS(aux_y, aux_X, missing='drop').fit()
            result = aux_model.fittedvalues
            result.name = name
            if add_to_data:
                result = pd.concat([data, result], axis=1, sort=False)

            return result

    def _get_statistics_from_model(self):

        self.N = self._model.nobs
        self.r2 = self._model.rsquared
        self.r2_adjusted = self._model.rsquared_adj
        self.F = self._model.fvalue
        self.F_pvalue = self._model.f_pvalue
        self.ess = self._model.ess
        self.rss = self._model.ssr
        if self.include_constant:
            self.tss = self._model.centered_tss
        else:
            self.tss = self._model.uncentered_tss
        self.ms_model = self._model.mse_model
        self.ms_resid = self._model.mse_resid
        self.ms_total = self._model.mse_total
        self.dof_model = self._model.df_model
        self.dof_resid = self._model.df_resid
        self.dof_total = self.dof_model + self.dof_resid

        self.coefficients = self._model.params.copy()
        self.coefficients_sterrors = self._model.bse.copy()
        self.coefficients_tvalues = self._model.tvalues.copy()
        self.coefficients_pvalues = self._model.pvalues.copy()

        variables_included = [
            x for x in list(self.coefficients.index) if x != 'Intercept'
        ]
        self._variables_included_patsy = variables_included.copy()

        variables_included = [
            LinearRegression._translate_from_patsy_notation(x)
            for x in variables_included
        ]

        self.variables_included = variables_included

        if self.include_constant:
            self._params_idx = ['Constant'] + variables_included
        else:
            self._params_idx = variables_included.copy()

        for stats in [
                self.coefficients, self.coefficients_pvalues,
                self.coefficients_sterrors, self.coefficients_tvalues
        ]:
            stats.index = self._params_idx

        return

    @property
    def coefficients_beta(self):
        # Standardized (beta) coefficients: b_j * sd(x_j) / sd(y)
        b = np.array(self._model.params)[1:]
        std_y = self._model.model.endog.std(axis=0)
        std_x = self._model.model.exog.std(axis=0)[1:]
        beta = list(b * (std_x / std_y))

        if self.include_constant:
            beta = [np.nan] + beta

        result = pd.Series(beta, index=self._params_idx)
        return result

    @property
    def coefficients_confidence_interval(self):

        ci = self._model.conf_int()
        ci.index = self._params_idx

        ci.columns = ['LB CI (95%)', 'UB CI (95%)']
        return ci

    @property
    def coefficients_VIF(self):
        # VIF_j is the j-th diagonal element of the inverse of the predictors'
        # correlation matrix
        x = self._model.model.exog[:, 1:].copy()
        inv_corr = np.linalg.inv(np.corrcoef(x, rowvar=False))
        diag = list(inv_corr.diagonal())
        if self.include_constant:
            diag = [np.nan] + diag

        return pd.Series(diag, index=self._params_idx)

    @property
    def coefficients_tolerance(self):
        return 1 / self.coefficients_VIF

    @staticmethod
    def _translate_from_patsy_notation(effect):
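        # e.g. 'C(genre)[T.drama]:imdb_rate' -> 'genre = "drama" * imdb_rate'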
        effect = (effect
                  .replace(':', ' * ')
                  .replace('C(', '')
                  .replace('T.', '')
                  .replace('[', ' = "')
                  .replace(']', '"')
                  .replace(')', ''))

        return effect

    def show_results(self, n_decimals):
        """
        Show results of the analysis in a readable form.
        
        Parameters
        ----------
        n_decimals : int 
            Number of digits to round results when showing them
        """
        phrase = 'method {}'

        print('\nLINEAR REGRESSION SUMMARY')
        print('------------------\n')
        print('Model summary')
        display(self.summary_r2().style\
                    .format(precision=n_decimals)\
                    .set_caption(phrase.format('.summary_r2()')))
        print('------------------\n')
        print('ANOVA')
        display(self.summary_F().style\
                    .format(na_rep="", precision=n_decimals)\
                    .set_caption(phrase.format('.summary_F()')))
        print('------------------\n')
        print('Coefficients')
        display(self.summary().style\
                    .format(na_rep="", precision=n_decimals)\
                    .set_caption(phrase.format('.summary()')))

    def summary(self):
        """
        Summary table with requested information related to regression coefficients.

        Returns
        -------
        pd.DataFrame
            A summary table
        """

        statistics = [
            self.coefficients, self.coefficients_sterrors,
            self.coefficients_beta, self.coefficients_tvalues,
            self.coefficients_pvalues
        ]

        columns = ['B', 'Std. Error', 'Beta', 't', 'p-value']

        if self._show_ci:
            statistics.append(self.coefficients_confidence_interval)
            columns.extend(list(self.coefficients_confidence_interval.columns))

        if self._show_col:
            statistics.append(self.coefficients_tolerance)
            statistics.append(self.coefficients_VIF)
            columns.extend(['Tolerance', 'VIF'])

        statistics = pd.concat(statistics, axis=1)

        statistics.columns = columns

        statistics.index = self._params_idx

        return statistics

    def summary_r2(self):
        """
        Summary table with information related to coefficient of determination.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        r = self.r2**0.5
        r2 = self.r2
        r2_adj = self.r2_adjusted

        statistics = [[r, r2, r2_adj]]
        columns = ['R', 'R Squared', 'Adj. R Squared']

        statistics = pd.DataFrame(statistics, columns=columns, index=[''])

        return statistics

    def summary_F(self):
        """
        Summary table with information related to F-statistic.

        Returns
        -------
        pd.DataFrame
            A summary table
        """

        results = [
            [self.ess, self.dof_model, self.ms_model, self.F, self.F_pvalue],
            [self.rss, self.dof_resid, self.ms_resid, np.nan, np.nan],
            [self.tss, self.dof_total, np.nan, np.nan, np.nan],
        ]

        results = pd.DataFrame(
            results,
            columns=['Sum of Squares', 'df', 'Mean Square', 'F', 'p-value'],
            index=['Regression', 'Residual', 'Total'])

        return results

    def _fit_backward(self):

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = OLS(y_train, X_train, missing='drop')

        results = model.fit()

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = OLS(y_train, X_train, missing='drop')
            results = model.fit()
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return

    def _identify_variables_without_variation(self):
        if self.include_constant:
            mask = self._model.model.exog.var(axis=0)[1:] == 0
        else:
            mask = self._model.model.exog.var(axis=0) == 0

        variables_included = [
            x for x in list(self._model.params.index) if x != 'Intercept'
        ]

        return list(np.array(variables_included)[mask])

    def _remove_variables_without_variation(self):
        X = pd.DataFrame(self._model.model.exog,
                         columns=self._model.model.exog_names,
                         index=self._observations_idx)
        X = X.drop(self.variables_excluded, axis=1)
        return X

    def save_independent_variables(self, data=None, add_to_data=False):
        """
        Produce values of the independent variables retained in a fitted model.
        This option is useful if you don't create dummy variables or interaction effects manually
        but want to use them in further analysis. Only variables retained in the model are returned
        (those shown in the summary table).

        Parameters
        ----------
        data : pd.DataFrame
            Data for which independent variables are requested;
            may be omitted to save values for the same data used to fit the model
        add_to_data : bool
            Whether to merge the new values with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Values of independent variables
        """

        if data is None:
            data = self._data.copy()
            if self.include_constant:
                result = self._model.model.exog[:, 1:].copy()
            else:
                result = self._model.model.exog.copy()
            columns = [x for x in self.variables_included if x != 'Constant']
            result = pd.DataFrame(result,
                                  columns=columns,
                                  index=self._observations_idx)

        else:
            aux_model = ols(self.formula, data).fit()
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [LinearRegression._translate_from_patsy_notation(x)\
                              for x in aux_data_cols]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            result = aux_data[self.variables_included]

        if add_to_data:
            result = pd.concat([data, result], axis=1, sort=False)

        return result

    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       studentized=False,
                       deleted=False,
                       studentized_deleted=False,
                       add_to_data=False):
        """
        Produce values of various residuals. 
        Residuals are returned only for data used to fit a model.
        
        Parameters
        ----------
        unstandardized : bool 
            Whether to save unstandardized (raw) residuals
        standardized : bool 
            Whether to save standardized (z-scores) residuals
        studentized : bool 
            Whether to save studentized residuals
        deleted : bool 
            Whether to save deleted residuals
        studentized_deleted : bool
            Whether to save studentized deleted residuals
        add_to_data : bool
            Whether to merge new values with data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """

        # Build display names from the boolean flags above via vars(), e.g.
        # 'studentized_deleted' -> 'Student. del. res.'
        columns_to_show = [f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.' \
                           for k, v in vars().items() if v == True and k != 'add_to_data']

        infl = OLSInfluence(self._model)

        result = []

        res_unstand = infl.resid
        res_unstand.name = 'Unstandard. res.'

        res_stand = (res_unstand - res_unstand.mean()) / res_unstand.std()
        res_stand.name = 'Standard. res.'

        res_stud = infl.resid_studentized_internal
        res_stud.name = 'Student. res.'

        result.extend([res_unstand, res_stand, res_stud])

        if deleted:
            res_del = infl.resid_press
            res_del.name = 'Del. res.'
            result.append(res_del)

        if studentized_deleted:
            res_stud_del = infl.resid_studentized_external
            res_stud_del.name = 'Student. del. res.'
            result.append(res_stud_del)

        result = pd.concat(result, axis=1)
        result = result[columns_to_show].copy()

        if add_to_data:
            result = pd.concat([self._data, result], axis=1)

        return result

    # The following two methods are still in progress
    @staticmethod
    def _turn_all_rows_to_fancy(summary):
        return summary.apply(
            lambda x: LinearRegression._turn_one_row_to_fancy(x), axis=1)

    @staticmethod
    def _turn_one_row_to_fancy(row):
        coef = round(row['B'].item(), 3)
        sterr = round(row['Std. Error'].item(), 3)
        pval = row['p-value'].item()

        if pval <= 0.01:
            mark = '***'
        elif pval <= 0.05:
            mark = '**'
        elif pval <= 0.1:
            mark = '*'
        else:
            mark = ''

        result = f'{coef}{mark} \n ({sterr})'
        return result
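
# Usage sketch (illustrative, not part of the source): `movies` and its columns
# are hypothetical data. The formula uses the class's custom notation ('=' is
# accepted in place of '~'), and 'genre' is expanded into dummy variables.
import pandas as pd

movies = pd.read_csv('movies.csv')  # hypothetical dataset
model = LinearRegression(method='backward').fit(
    movies,
    'imdb_rate = budget + genre',
    categorical_variables=['genre'],
)
predictions = model.predict(add_to_data=True)
residuals = model.save_residuals(standardized=True)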
Example #3
    def singlesplit(self, X, y, nleft, model_selector):
        ######## Split a sample into two subsamples ########
        n, p = X.shape
        nright = n - nleft

        pvals_v = np.ones(p)
        lci_v = np.array([-np.inf] * p)
        uci_v = np.array([np.inf] * p)
        coefs = np.zeros(p)
        ses_v = np.array([np.inf] * p)
        df_res = 0

        tryagain = True
        count = 0

        while tryagain:

            ######## Randomly split the sample #######
            # A boolean mask is needed so that ~mask gives the complement;
            # applying ~ to an integer index array would be bitwise NOT.
            left_idx = np.random.choice(np.arange(n), nleft,
                                        replace=False)  # without replacement
            left_mask = np.zeros(n, dtype=bool)
            left_mask[left_idx] = True
            xleft = X.copy()[left_mask, :]
            yleft = y.copy()[left_mask]

            xright = X.copy()[~left_mask, :]
            yright = y.copy()[~left_mask]

            ######## Model selection on Sample I #######
            if self.manual_lam:
                # calculate the regularization path on a log scale; the largest
                # lambda that keeps every coefficient at zero is max_j |x_j' y| / n
                eps = 0.001
                K = 100
                max_lambda = np.max(np.abs(xleft.T @ yleft)) / n
                lambda_path = np.exp(
                    np.linspace(math.log(max_lambda),
                                math.log(max_lambda * eps), K))
                model_selector.set_params(alphas=lambda_path,
                                          normalize=True,
                                          tol=1e-3)

            model_selector.fit(X=xleft, y=yleft)

            sel_nonzero = (model_selector.coef_ != 0
                           )  # location of selected variables

            p_sel = sum(sel_nonzero)  # size of selected variables

            ######## Check up the selected results, make sure applicable for OLS ########

            if (p_sel + 1) >= nright:
                # the selected model (plus intercept) would have rank >= the
                # number of rows in Sample II, so p-values cannot be
                # calculated; retry with a new split
                tryagain = True
                count = count + 1
                print("Too large model selected in a sample-split")

            if p_sel == 0:
                print("Empty model selected, which is OK")
                tryagain = False

            if p_sel > 0 and (p_sel + 1) < nright:

                tryagain = False

                ######## Fitting Sample II with reduced features using OLS ########

                lm = OLS(yright, xright[:, sel_nonzero]).fit(method="qr")

                df_res = lm.df_resid
                sel_pval = lm.pvalues

                coefs[sel_nonzero] = lm.params

                ses_v[sel_nonzero] = lm.bse

                # Sanity checks for p-values

                if len(sel_pval) != p_sel:
                    sys.exit(
                        "The statsmodels.OLS didn't return the correct number of p-values for the provided submodel."
                    )
                if not (np.all(sel_pval >= 0) and np.all(sel_pval <= 1)):
                    sys.exit(
                        "The statsmodels.OLS returned p-values below 0 or above 1."
                    )

                ######## Multiple testing adjustment on small sample: Bonferroni ########

                pvals_v[sel_nonzero] = np.minimum(sel_pval * p_sel,
                                                  1)  # renew p-values

                ######## Confidence intervals and other relative informations ########
                if not np.all(np.abs(self.gamma * self.B % 1) <= 1e-5):
                    print(
                        "Duality might be violated because of the choice of gamma. Use steps of length 1 / B"
                    )

                sel_ci = lm.conf_int(alpha=self.ci_level)

                lci_v[sel_nonzero] = sel_ci[:, 0]
                uci_v[sel_nonzero] = sel_ci[:, 1]

                ######## End of C.I. ########

            if count > self.repeat_max:
                print(
                    "Exceeded the max number of repeats; sample splits kept selecting too-large models."
                )
                sys.exit()

        # The return sits outside the while loop so that an oversized selection
        # triggers another split instead of returning early.
        return pvals_v, p_sel, coefs, lci_v, uci_v, ses_v, df_res
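
# Aggregation sketch (illustrative, not part of the source): in multi
# sample-splitting inference (Meinshausen, Meier & Buehlmann, 2009), singlesplit
# is repeated B times and the per-split Bonferroni-adjusted p-values are
# combined; the gamma = 0.5 quantile rule below (twice the median, capped at 1)
# is one simple instance of quantile aggregation.
import numpy as np

def multisplit_pvalues(model, X, y, nleft, model_selector, B=50):
    per_split = np.vstack([
        model.singlesplit(X, y, nleft, model_selector)[0] for _ in range(B)
    ])
    return np.minimum(2 * np.median(per_split, axis=0), 1)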
Example #4
    def singlesplit(self, X, y):  # single sample split

        n, p = X.shape
        self.nleft = int(np.floor(n * self.fraction))  # int cast: used as a sample size below
        self.nright = n - self.nleft
        if not (self.nleft >= 1 and self.nright >= 1):
            print("Not enough data for splitting")
            sys.exit()

        pvals_v = np.ones(p)

        lci_v = np.array([-np.inf] * p)
        uci_v = np.array([np.inf] * p)
        coefs = np.zeros(p)

        tryagain = True
        count = 0

        while tryagain:

            # Draw Sample I without replacement; a boolean mask is used so that
            # ~mask gives the complement (Sample II)
            left_idx = np.random.choice(n, self.nleft, replace=False)
            left_mask = np.zeros(n, dtype=bool)
            left_mask[left_idx] = True
            xleft = X.copy()[left_mask, :]
            yleft = y.copy()[left_mask]

            xright = X.copy()[~left_mask, :]
            yright = y.copy()[~left_mask]

            # calculate the lambda sequence for the lasso on a log scale; the largest
            # lambda that keeps every coefficient at zero is max_j |x_j' y| / n
            eps = 0.001
            K = 100
            max_lambda = np.max(np.abs(xleft.T @ yleft)) / n
            lambda_path = np.exp(np.linspace(math.log(max_lambda), math.log(max_lambda * eps), K))

            # `sel_model` is assumed to be a lasso path selector configured
            # elsewhere (e.g. sklearn's LassoLarsCV); this snippet only defines
            # it in the adaptive-lasso fallback below.
            sel_model.set_params(alphas=lambda_path)
            sel_model.fit(X=xleft, y=yleft)
            sel_nonzero = np.where(sel_model.coef_ != 0)[0]
            p_sel = len(sel_nonzero)


            ## Classical situation:
            ## a model with an intercept is used, hence p_sel + 1 < nrow(x_right)
            ## is required; otherwise p-values cannot be calculated
            if p_sel == 0:
                print("Empty model selected")
                tryagain = False

            if p_sel > 0 and p_sel < (self.nright - 1):

                # Fit Sample II with the selected features using OLS
                lm = OLS(yright, xright[:, sel_nonzero]).fit()
                sel_pval = lm.pvalues
                coefs[sel_nonzero] = lm.params

                if len(sel_pval) != p_sel:
                    print("The classical OLS didn't return the correct number of p-values for the provided submodel.")
                    sys.exit()
                if not ((sel_pval >= 0).all() and (sel_pval <= 1).all()):
                    print("The classical OLS returned p-values below 0 or above 1.")
                    sys.exit()

                # Multiple-testing adjustment within the split (Bonferroni):
                pvals_v[sel_nonzero] = np.minimum(sel_pval * p_sel, 1)  # renew p-values
                tryagain = False
                # Calculate confidence intervals
                if self.ci:
                    if not all(np.abs(self.gamma * self.B % 1) <= 1e-5):
                        print("Duality might be violated because of the choice of gamma. Use steps of length 1 / B")
                    sel_ci = np.array(lm.conf_int(alpha=self.ci_level))
                    lci_v[sel_nonzero] = sel_ci[:, 0]
                    uci_v[sel_nonzero] = sel_ci[:, 1]
                    # pvals_v is already Bonferroni-adjusted above; multiplying
                    # by p_sel again would double-adjust
                    return pvals_v, coefs, lci_v, uci_v

            if p_sel >= (self.nright - 1):
                # the selected model would be too large for OLS on Sample II;
                # retry with a new split
                tryagain = True
                count = count + 1
                print("Too large model selected in a sample-split")

            # checked inside the loop so that repeated failures actually terminate
            if count > self.repeat_max:
                print("Exceeded the max number of repeats; sample splits kept selecting too-large models.")
                sys.exit()
        if count > 5:  # fall back to the adaptive lasso after repeated too-large selections
            # Ridge coefficients supply the adaptive weights; scaling the columns
            # of xleft by w makes the L1 penalty shrink variables with small ridge
            # coefficients more strongly, yielding a sparser selection.
            init = RidgeCV(fit_intercept=False, cv=10).fit(xleft, yleft)
            w = abs(init.coef_)
            sel_model = LassoLars(fit_intercept=False, normalize=False, fit_path=False)
            sel_model.fit(xleft * w, y=yleft)
            sel_nonzero = np.where(sel_model.coef_ != 0)[0]
            p_sel = len(sel_nonzero)

        # pvals_v is already Bonferroni-adjusted within the split; multiplying
        # by p_sel again would double-adjust
        return pvals_v, coefs