Пример #1
0
def run_statsmodels_models(train, test, model_description):
    """
    Run logistic regression model to predict whether a signed up driver ever actually drove.

    :param train: training data frame prepared for statsmodels regression
    :type train: pd.DataFrame
    :param test: holdout data frame prepared for statsmodels regression
    :type test: pd.DataFrame
    :param model_description: patsy formula describing the model
    :type model_description: str
    :return: AUC for model generated
    :rtype: float
    """
    # Use dmatrices to build design matrices for train and holdout sets,
    # dropping rows that contain missing values.
    logging.info('Running model w/ description: %s' % model_description)
    logging.debug('Train df: \n%s' % train.describe())
    logging.debug('Test df: \n%s' % test.describe())
    y_train, X_train = dmatrices(model_description, data=train, return_type='dataframe', NA_action='drop')
    y_test, X_test = dmatrices(model_description, data=test, return_type='dataframe', NA_action='drop')

    # Create, fit model (BFGS tends to converge more reliably than the default here)
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary. Python 2 print statements converted to Python 3
    # calls for consistency with the rest of the file (which uses f-strings).
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC on the holdout set
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
Пример #2
0
 def _set_XY(self):
     """Build the (Y, X) design matrices for the configured regression type.

     Returns the raw patsy matrices for pooled regression, or the
     within-transformed matrices for fixed effects; raises ValueError
     for any other regression type.
     """
     kind = self.regression_type
     if kind == "pooled":
         return dmatrices(self.formula, self.data, return_type="dataframe")
     if kind == "fe":
         # Demean within groups (the "within" transform) for fixed effects.
         idx = self.data.i
         Y, X = dmatrices(self.formula, self.data, return_type="dataframe")
         return fixed_effects_transform(Y, idx), fixed_effects_transform(X, idx)
     raise ValueError("Regression type %s not implemented." % kind)
Пример #3
0
 def _set_XY(self):
     """Return (Y, X) design matrices according to ``self.regression_type``."""
     if self.regression_type == 'pooled':
         return dmatrices(self.formula, self.data, return_type='dataframe')
     if self.regression_type == 'fe':
         # Apply the fixed-effects (within) transform to both sides.
         idx = self.data.i
         Y, X = dmatrices(self.formula, self.data, return_type='dataframe')
         return fixed_effects_transform(Y, idx), fixed_effects_transform(X, idx)
     raise ValueError('Regression type %s not implemented.' %
                      self.regression_type)
Пример #4
0
    def fit_model(self, in_csv_1, in_csv_2, column_flag):
        """Fit a logistic regression separating the two CSV samples.

        Values from ``in_csv_1`` are labelled 0 and values from
        ``in_csv_2`` are labelled 1; a logit of label on value is then
        fit with statsmodels and its summary printed.
        """
        logger.info(f'Run logistic regression')

        logger.info(f'Reading {in_csv_1}')
        sample_a = pd.read_csv(in_csv_1)[column_flag].to_numpy()
        logger.info(f'Data length {len(sample_a)}')

        logger.info(f'Reading {in_csv_2}')
        sample_b = pd.read_csv(in_csv_2)[column_flag].to_numpy()
        logger.info(f'Data length {len(sample_b)}')

        # Stack both samples into one vector and build matching 0/1 labels.
        x = np.concatenate([sample_a, sample_b]).astype(float)
        y = np.concatenate([np.zeros(len(sample_a), dtype=int),
                            np.ones(len(sample_b), dtype=int)])

        self.data = {'x': x, 'y': y}

        Y, X = dmatrices('y ~ x', self.data)

        self.logit_model = sm.Logit(Y, X)
        self.logit_result = self.logit_model.fit()

        # Keep a 5% margin around the observed data range for plotting.
        span = np.max(x) - np.min(x)
        self.view_range_min = np.min(x) - 0.05 * span
        self.view_range_max = np.max(x) + 0.05 * span

        print(self.logit_result.summary())
        print(f'Estimated params: {self.logit_result.params}')
def freq_pat_selection():
    """Screen frequent-pattern features for association with traits.

    For every feature CSV in the input directory, regress each trait in
    TRAITS on the feature; when the feature coefficient is significant at
    the 10% level, append the OLS summary to a per-trait output file.
    """
    input_dir = os.path.join(CUR_DIR, 'result', 'feature', 'freq_pat', 'normalized', 'support40')
    output_dir = os.path.join(CUR_DIR, 'result', 'feature', 'freq_pat_select', 'normalized', 'support40')
    for file in os.listdir(input_dir):
        input_fp = os.path.join(input_dir, file)
        feature = file[:-4]  # strip the 4-char extension ('.csv')
        df = pandas.read_csv(input_fp)
        for response in TRAITS:
            # Q("...") quotes feature names that are not valid identifiers.
            y, X = dmatrices('%s ~ Q("%s")' % (response, feature), data=df)
            mod = sm.OLS(y, X)
            res = mod.fit()
            if res.pvalues[1] <= 0.1:
                output_folder = os.path.join(output_dir, response)
                # exist_ok avoids the check-then-create race of the old code.
                os.makedirs(output_folder, exist_ok=True)
                output_fp = os.path.join(output_folder, feature + '.txt')
                # 'with' guarantees the handle is closed even if a write fails.
                with open(output_fp, 'a') as fw:
                    fw.write("#####################################################################################################\n")
                    fw.write(response + '\n')
                    fw.write(str(res.summary()) + '\n')
    def do_(self, *args):
        """Fit an OLS of the first column on the second and return the
        model summary rendered as an HTML string."""
        column_list, result = self.get_result(*args)
        formula = '%s ~ %s' % tuple(column_list)
        response, design = dmatrices(formula, data=result, return_type='dataframe')
        fitted = sm.OLS(response, design).fit()
        return fitted.summary().as_html()
Пример #7
0
def logit():
    """Fit a logit of `extra` on wifi features and print the model summary."""
    fp = r"data\matrix_data\logit\wifi_features_extra.csv"
    df = pandas.read_csv(fp)
    y, X = dmatrices('extra ~ wifi_features + end_time_var + fq_home', data=df)
    mod = sm.Logit(y, X)
    res = mod.fit()
    # Python 2 print statement converted to a Python 3 call (the file
    # elsewhere already uses Python 3 f-strings).
    print(res.summary())
Пример #8
0
def stay():
    """Build design matrices predicting `Stay` from the survey spreadsheet.

    Reads Sheet1 of ``new.xlsx`` and returns ``(y, X)`` where ``y`` is the
    flattened (1-D) response array and ``X`` the patsy design DataFrame.
    """
    input_file = 'new.xlsx'
    sheet = 'Sheet1'
    df = pd.read_excel(input_file, sheet, header=0)
    # NOTE: the backslash continues the *string literal*, so the next line's
    # indentation becomes part of the formula; patsy ignores the whitespace.
    y, X = dmatrices('Stay ~ Age + Gender +Income+ TravelBudget+RecentVisit1+RecentVisit2+ Foreign+ EatingHabit +\
                        Purpose + PreferredPartner ', df, return_type='dataframe')
    y = np.ravel(y)
    return y, X
Пример #9
0
def lr(model_formula, data_df, print_mse=True):
    """Fit an OLS model described by `model_formula` on `data_df`.

    Optionally prints the in-sample MSE before returning the fitted result.
    """
    response, design = dmatrices(model_formula, data=data_df, return_type='dataframe')
    fitted = sm.OLS(response, design).fit()
    if print_mse:
        in_sample_pred = fitted.predict(design)
        print(f'MSE = {metrics.mean_squared_error(response, in_sample_pred)}')
    return fitted
Пример #10
0
    def getXsYData(self, df, yIndex, xlist):
        """Build (y, X) design matrices for regressing `yIndex` on `xlist`.

        :param df: source DataFrame
        :param yIndex: name of the response column
        :param xlist: list of predictor column names
        :return: (y, X) DataFrames; patsy's Intercept column is removed from X
        """
        formula = yIndex + ' ~ ' + ' + '.join(xlist)

        y, X = dmatrices(formula, data=df, return_type='dataframe')
        # The positional `axis` argument of DataFrame.drop was deprecated in
        # pandas 1.x and removed in 2.0 — use the `columns` keyword instead.
        X = X.drop(columns='Intercept')

        return y, X
Пример #11
0
    def getSelectedData(self, df, yIndex, xlist):
        """Return a single DataFrame of predictors plus the response column.

        Builds the patsy design matrices for `yIndex ~ xlist`, removes the
        Intercept column, and concatenates predictors with the response.
        """
        formula = yIndex + ' ~ ' + ' + '.join(xlist)

        y, x = dmatrices(formula, data=df, return_type='dataframe')
        # Positional `axis` was removed from DataFrame.drop in pandas 2.0;
        # the `columns` keyword is the supported spelling.
        x = x.drop(columns='Intercept')

        return pd.concat([x, y], axis=1)
Пример #12
0
 def fit(formula: str, data: xr.Dataset) -> RegModel:
     """Fit an ordinary least squares model described by a patsy formula.

     The design matrices are converted to TensorFlow constants and solved
     directly via ``OLS.fit_exec``; the result is wrapped in a RegModel.
     """
     y, x = dmatrices(formula, data)
     response = tf.constant(y)
     design = tf.constant(x)
     coefficients = OLS.fit_exec(response, design)
     return RegModel(data, y, x, "Direct Ordinary Least Squares", coefficients)
Пример #13
0
def single_vrb(feature):
    """Regress every trait in TRAITS on a single feature and print summaries."""
    input_fp = os.path.join(CUR_DIR, "result", "feature", feature + ".csv")
    df = pandas.read_csv(input_fp)
    # Python 2 print statements converted to Python 3 calls; iterate the
    # traits directly instead of indexing with range(len(...)).
    print(df)
    for trait in TRAITS:
        print("#####################################################################################")
        print(trait)
        y, X = dmatrices("%s ~ %s" % (trait, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
Пример #14
0
def regress(new_data=HISTORICAL_DATA):
    """Fit an OLS of `mu` on the listed channel columns and return the
    confidence intervals of the fitted coefficients."""
    frame = DataFrame.from_dict(new_data)

    response, design = dmatrices(
        'mu ~ wordpress + roadshow + email + facebook + google + new_coffees',
        data=frame,
        return_type='dataframe')
    fitted = sm.OLS(response, design).fit()

    return fitted.conf_int()
Пример #15
0
def split_data(df, model_formula, test_size, random_state=None):
    """Build design matrices from `model_formula` and return a stratified
    train/test split: (x_train, x_test, y_train, y_test) with 1-D y arrays."""
    response, design = dmatrices(model_formula, data=df, return_type='dataframe')
    # Drop patsy's Intercept column from both sides if present.
    design = design[design.columns.difference(['Intercept'])]
    response = response[response.columns.difference(['Intercept'])]
    # The stratification target is the left-hand side of the formula.
    target = model_formula.split('~')[0].strip()
    x_train, x_test, y_train, y_test = train_test_split(
        design, response,
        test_size=test_size,
        random_state=random_state,
        stratify=response[target])
    return x_train, x_test, y_train.values.ravel(), y_test.values.ravel()
Пример #16
0
def multi_feature_single_trait(feature_names, trait):
    """Regress `trait` separately on each feature in `feature_names`.

    Reads the combined feature CSV (named by joining the feature names)
    and prints one OLS summary per feature.
    """
    file_name = "-".join(feature_names) + ".csv"
    input_fp = os.path.join(CUR_DIR, "result", "feature", file_name)
    df = pandas.read_csv(input_fp)
    # Python 2 print statements converted to Python 3 calls.
    print(df)

    for feature in feature_names:
        print("#####################################################################################")
        print(feature)
        y, X = dmatrices("%s ~ %s" % (trait, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
def multi_vrb():
    """Regress every label in LABELS on the three wifi features and print
    each OLS summary."""
    input_fp = os.path.join(CUR_DIR, 'data', 'matrix_data', 'all_wifi_features.csv')
    df = pandas.read_csv(input_fp)
    # Python 2 print statements converted to Python 3 calls; iterate the
    # labels directly instead of indexing with range(len(...)).
    print(df)
    for label in LABELS:
        print("#####################################################################################################")
        print(label)
        y, X = dmatrices('%s ~ edit_dist + start_time_var + end_time_var' % label, data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
Пример #18
0
def multicollinearity_test(endog, exdog, data):
    """Run a multicollinearity check by computing variance inflation factors.

    :param endog: name of the dependent variable (string)
    :param exdog: '+'-joined independent variable names (string)
    :param data: DataFrame holding the variables
    :return: DataFrame with one VIF per design-matrix column
    """
    _, design = dmatrices(endog + '~' + exdog, data=data, return_type='dataframe')
    factors = [variance_inflation_factor(design.values, col)
               for col in range(design.shape[1])]
    vif = pd.DataFrame({'VIF Factor': factors, 'features': design.columns})
    return vif
Пример #19
0
def run_statsmodels_models(train, test, model_description):
    """
    Run logistic regression model to predict whether a signed up driver ever actually drove.

    :param train: training data frame prepared for statsmodels regression
    :type train: pd.DataFrame
    :param test: holdout data frame prepared for statsmodels regression
    :type test: pd.DataFrame
    :param model_description: patsy formula describing the model
    :type model_description: str
    :return: AUC for model generated
    :rtype: float
    """
    # Use dmatrices to build design matrices, dropping rows with NAs.
    logging.info('Running model w/ description: %s' % model_description)
    logging.debug('Train df: \n%s' % train.describe())
    logging.debug('Test df: \n%s' % test.describe())
    y_train, X_train = dmatrices(model_description,
                                 data=train,
                                 return_type='dataframe',
                                 NA_action='drop')
    y_test, X_test = dmatrices(model_description,
                               data=test,
                               return_type='dataframe',
                               NA_action='drop')

    # Create, fit model (BFGS converges more reliably here than the default).
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary. Python 2 print statements converted to Python 3
    # calls for consistency with the rest of the file.
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC on the holdout set
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
Пример #20
0
    def fit_model_intercept(self, x, y):
        """Fit an intercept-only logit of y as a baseline model."""
        logger.info('Run logistic regression with intercept only')

        self.data = {'x': x, 'y': y}

        # 'y ~ 1' produces a design matrix containing only the constant term.
        response, design = dmatrices('y ~ 1', self.data)

        self.logit_model_intercept = sm.Logit(response, design)
        self.logit_result_intercept = self.logit_model_intercept.fit()

        # Keep a 5% margin around the observed data range for plotting.
        span = np.max(x) - np.min(x)
        self.view_range_min = np.min(x) - 0.05 * span
        self.view_range_max = np.max(x) + 0.05 * span

        self.show_regression_result(self.logit_result_intercept)
def single_vrb(feature):
    """Regress every label in LABELS on `feature` and print OLS summaries."""
    input_fp = os.path.join(CUR_DIR, 'data', 'matrix_data', 'feature_' + feature + '.csv')
    df = pandas.read_csv(input_fp)
    # Python 2 print statements converted to Python 3 calls; iterate the
    # labels directly instead of indexing with range(len(...)).
    print(df)
    for label in LABELS:
        print("#####################################################################################################")
        print(label)
        y, X = dmatrices('%s ~ %s' % (label, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
Пример #22
0
    def fit_model_third(self, x, y):
        """Fit a third-order (cubic) logistic regression of y on x.

        Stores the fitted model/result on the instance, updates the plotting
        view range, shows the result and runs the Hosmer-Lemeshow test.
        """
        # Bug fix: the old message said "quadratic", but the formula below is
        # cubic (x, x^2, x^3) and the HL test is run with order 3.
        logger.info('Run cubic logistic regression')

        self.data = {'x': x, 'y': y}

        Y, X = dmatrices('y ~ x + np.power(x, 2) + np.power(x, 3)', self.data)

        self.logit_model_third = sm.Logit(Y, X)
        self.logit_result_third = self.logit_model_third.fit()

        # Keep a 5% margin around the observed data range for plotting.
        data_range = np.max(x) - np.min(x)
        self.view_range_min = np.min(x) - 0.05 * data_range
        self.view_range_max = np.max(x) + 0.05 * data_range

        self.show_regression_result(self.logit_result_third)
        self.run_HL_test(3)
Пример #23
0
    def fit_model_linear(self, x, y):
        """Fit a first-order (linear) logistic regression of y on x."""
        logger.info('Run first order logistic regression')

        self.data = {'x': x, 'y': y}

        response, design = dmatrices('y ~ x', self.data)

        self.logit_model_linear = sm.Logit(response, design)
        self.logit_result_linear = self.logit_model_linear.fit()

        # Keep a 5% margin around the observed data range for plotting.
        span = np.max(x) - np.min(x)
        self.view_range_min = np.min(x) - 0.05 * span
        self.view_range_max = np.max(x) + 0.05 * span

        self.show_regression_result(self.logit_result_linear)
        self.run_HL_test(1)
def write_single_vrb_to_txt(input_fp, output_dir, feature):
    """For each label in LABELS, regress it on `feature`; when the feature
    coefficient is significant at the 5% level, append the OLS summary to a
    per-label text file under `output_dir`."""
    df = pandas.read_csv(input_fp)

    for response in LABELS:
        # Q("...") quotes feature names that are not valid identifiers.
        y, X = dmatrices('%s ~ Q("%s")' % (response, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()

        if res.pvalues[1] <= 0.05:
            output_folder = os.path.join(output_dir, response)
            # exist_ok avoids the check-then-create race of the old code.
            os.makedirs(output_folder, exist_ok=True)
            out_path = os.path.join(output_folder, 'summary_' + feature + '.txt')
            # 'with' guarantees closure on error; the old code also shadowed
            # the builtin `sum` with a local variable — removed here.
            with open(out_path, 'a') as fw:
                fw.write("#####################################################################################################\n")
                fw.write(response + '\n')
                fw.write(str(res.summary()) + '\n')
Пример #25
0
def LogReg():
    """Read the form fields, fit a logistic regression on the spreadsheet
    data, and display the hire/no-hire prediction via the `lr` widget."""
    applicant_name = name.get()  # read for parity with the form; not used in the model
    features = [int(percent.get()), int(backlog.get()), int(intern.get()),
                int(first.get()), int(comm.get())]
    df = pd.read_excel('dataSet.xlsx', 'Sheet1', header=0)
    y, X = dmatrices('Hire ~ Percentage + Backlog + Internship + First_Round + Communication_Skills', df,
                     return_type='dataframe')
    y = np.ravel(y)
    model = LogisticRegression().fit(X, y)
    # Prepend 1 for the intercept column that patsy adds to the design matrix.
    prediction = int(model.predict(np.array([1] + features).reshape(1, -1)))
    lr.set("Hire" if prediction == 1 else "Not Hire")
Пример #26
0
def linearRegression():
    """Run the full regression experiment: fit on all data, on a random
    75/25 split, run the ten-fold experiment, and search for the optimal
    regularisation parameter. Error counts are printed at each stage."""
    printLMAndROHeader()
    df = pd.DataFrame(x, columns=labels)
    y, X = dmatrices(generateLabels(), df, return_type='matrix')
    y = np.ravel(y)
    regressionmodel = createRegressionModel(X, y)
    errorCount = inferErrors(regressionmodel, X, y)
    # Python 2 print statements converted to Python 3 calls.
    print('Errors in whole data: ' + str(errorCount))

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.25, random_state=np.random)

    regressionModel = createRegressionModel(xTrain, yTrain)
    trainErrorCount = inferErrors(regressionModel, xTrain, yTrain)
    print('Errors in randomized training data (3/4): ' + str(trainErrorCount))

    testErrorCount = inferErrors(regressionModel, xTest, yTest)
    print('Errors in randomized test data (1/4): ' + str(testErrorCount))

    numOfTrainErrors, numOfTestErrors = tenFoldExperiment(X, y)
    createBoxPlot(numOfTrainErrors, numOfTestErrors)
    calculateOptimalRegParam(xTrain, yTrain, xTest, yTest, True)
Пример #27
0
 def train(self, observed_data: pd.DataFrame,
           issue_times: pd.DatetimeIndex) -> None:
     """Fit one quantile regressor per configured quantile level and store
     the fitted models on ``self.models``."""
     resampled_data, unique_inverse = self.unique_data(
         observed_data, issue_times)
     y, X = dmatrices(self.formula, resampled_data)
     self.models = []
     for level in self.quantile_levels.fractions:
         model = QuantileRegressor(quantile=level, max_iter=self.max_iter)
         model.fit(X, y.flatten())
         # Warn when the solver hit the iteration cap instead of converging.
         if model.n_iter_ >= self.max_iter:
             print(
                 f"Training for model {level} stopped due to iteration limit."
             )
         else:
             print(f"Training for model {level} finished.")
         print("Iterations:  ", model.n_iter_)
         print("Coefficients:")
         print(model.coef_)
         print("Intercept:   ", model.intercept_)
         print("Gamma:       ", model.gamma_)
         self.models.append(model)
Пример #28
0
def linreg():  #actual linear regression
    """Build the model string from the global variable list, derive factor
    levels for the contrast column, construct design matrices (expanding
    '*' interactions to patsy ':' terms), fit a linear regression, and
    print/append the OLS summary.

    Relies on module-level globals: ``dep_var``, ``full_model_variable_list``,
    ``condensed_data``, ``c`` (contrast column), ``m`` (model string),
    ``df_final``, ``independentvariables`` and ``o`` (optional output path).
    Sets globals ``full_model``, ``levels``, ``X`` and ``y``. Returns the
    fitted statsmodels result when no contrast column ``c`` is set;
    otherwise only X and y are built and None is returned.
    """
    print("Model Results: ")
    #printing the corrected model_string
    model_string = []
    model_string.append(dep_var)
    model_string.append(" ~ ")
    for i in range(0, len(full_model_variable_list)):
        model_string.append(full_model_variable_list[i])
        model_string.append(" + ")
    model_string.pop(-1)  # drop the trailing " + "
    global full_model
    full_model = ''.join(model_string)
    print(full_model)  #prints model
    print()
    print(
        "***********************************************************************************************************"
    )
    print()
    index = 0
    global levels  #also used in contrasting()
    levels = []
    # Locate the contrast column `c` in the header row, then collect the
    # distinct values appearing in that column.
    for i in range(len(condensed_data[0])):
        if c == condensed_data[0][i]:
            index = i
    for i in range(1, len(condensed_data)):
        if condensed_data[i][index] not in levels:
            levels.append(condensed_data[i][index])
    # NOTE(review): this replaces each collected level value with its
    # position, leaving `levels` as [0, 1, ..., n-1] — confirm intended.
    for i in range(len(levels)):
        levels[i] = i

    #Beginning of the linear regression
    global X
    global y
    if "*" in m:
        #correcting the format of the model string
        model_string = []
        model_string.append(dep_var)
        model_string.append(" ~ ")
        for i in range(0, len(full_model_variable_list)):
            model_string.append(full_model_variable_list[i])
            model_string.append(" + ")
        model_string.pop(-1)
        # patsy uses ':' (not '*') for a pure interaction term
        for i in range(0, len(model_string)):
            if "*" in model_string[i]:
                replacement = model_string[i].split("*")
                model_string[i] = replacement[0] + ":" + replacement[1]
            #makes sure the model is in the right format.
        string = ''.join(model_string)
        y, X = dmatrices(string, df_final)
    else:
        X = df_final[
            independentvariables]  # gets the modified values of the independent variables
        y = df_final[
            dep_var]  # gets the modified values of the dependent variable
    if not c:
        #The linear regression
        regressor = LinearRegression()
        regressor.fit(X, y)
        regression = regressor.fit(X, y)
        #Data about the linear regression, starting without contrast
        X2 = sm.add_constant(X)
        statistics = sm.OLS(y, X2)
        finalstats = statistics.fit()
        print(finalstats.summary())
        if (o is not None):
            # concatenate data frames
            """f = open(o,"a")
            f.write(full_model)
            f.write("\n*************************************************************************************\n")
            f.write(finalstats.summary())
            f.close()"""
            # Append the model string and summary to the output file by
            # temporarily redirecting stdout.
            sys.stdout = open(o, "a")
            print(full_model)
            print(
                "\n*************************************************************************************\n"
            )
            print(finalstats.summary())
            sys.stdout.close()
        return finalstats
Пример #29
0
from math import sqrt
from patsy.highlevel import dmatrices

import pandas as pd

df = pd.read_csv('winequality-white.csv')
# Q('...') quotes column names that contain spaces.
formula = ("quality ~ density + pH + alcohol + sulphates + chlorides"
           " + Q('residual sugar') + Q('fixed acidity')"
           " + Q('citric acid')"
           " + Q('volatile acidity')"
           " + Q('free sulfur dioxide')")
y, x = dmatrices(formula, data=df, return_type='dataframe')
# Merge the response into the feature frame, drop patsy's intercept column,
# then min-max scale every column into [0, 1].
z = x.join(y)
z = z.drop('Intercept', axis=1)
z = (z - z.min()) / (z.max() - z.min())

def predict(row, columns, coefficients):
    """Linear prediction: intercept plus the dot product of row values and
    their coefficients.

    :param row: mapping (or Series) of column name -> value
    :param columns: ordered sequence of feature names
    :param coefficients: [intercept, coef_for_columns[0], coef_for_columns[1], ...]
    :return: predicted value
    """
    yhat = coefficients[0]
    # zip pairs each column with its coefficient (offset by the intercept),
    # replacing the index-based loop of the original.
    for col, coef in zip(columns, coefficients[1:]):
        yhat += coef * row[col]
    return yhat


def predict_test(row, target, columns, coefficients):
    """Predict one row and compare against its observed target value.

    Returns (prediction, expected, prediction - expected).
    """
    prediction = predict(row, columns, coefficients)
    expected = row[target]
    return prediction, expected, prediction - expected


def print_prediction(r):
    p, e, d = r
                                                   lambda s: s.str.replace(
                                                       ',', '')).astype(float)

# Join the 2014 referendum Yes/No vote counts onto the local authority frame.

df_las = df_las.join(results_2014.loc[:, ('Yes', 'No')], how='left')

# Build the regression formula for the Yes vote and the No vote - i.e.
# Yes ~ Q("All people") + Q("16 to 19") + etc.  Q("...") quotes column
# names that contain spaces; every column except Yes/No is a predictor.

expr = 'Q("' + ('") + Q("').join(
    list(df_las.columns[~df_las.columns.isin(['Yes', 'No'])])) + '")'
yes_expr = 'Yes ~ ' + expr
no_expr = 'No ~ ' + expr

# Fit one Poisson GLM per response (vote counts are non-negative integers).
y_train_yes, X_train_yes = dmatrices(yes_expr, df_las, return_type='dataframe')
poisson_training_results_yes = sm.GLM(y_train_yes,
                                      X_train_yes,
                                      family=sm.families.Poisson()).fit()

y_train_no, X_train_no = dmatrices(no_expr, df_las, return_type='dataframe')
poisson_training_results_no = sm.GLM(y_train_no,
                                     X_train_no,
                                     family=sm.families.Poisson()).fit()

# Print the fitted-model summaries for inspection.

print(poisson_training_results_yes.summary())
print(poisson_training_results_no.summary())

# Then use the model to predict results for Intersections
Пример #31
0
    def from_formula(cls, formula, data, *, sigma=None, weights=None):
        """
        Parameters
        ----------
        formula : {str, dict-like}
            Either a string or a dictionary of strings where each value in
            the dictionary represents a single equation. See Notes for a
            description of the accepted syntax
        data : DataFrame
            Frame containing named variables
        sigma : array-like
            Pre-specified residual covariance to use in GLS estimation. If
            not provided, FGLS is implemented based on an estimate of sigma.
        weights : dict-like
            Dictionary like object (e.g. a DataFrame) containing variable
            weights.  Each entry must have the same number of observations as
            data.  If an equation label is not a key weights, the weights will
            be set to unity

        Returns
        -------
        model : SUR
            Model instance

        Notes
        -----
        Models can be specified in one of two ways. The first uses curly
        braces to encapsulate equations.  The second uses a dictionary
        where each key is an equation name.

        Examples
        --------
        The simplest format uses standard Patsy formulas for each equation
        in a dictionary.  Best practice is to use an Ordered Dictionary

        >>> import pandas as pd
        >>> import numpy as np
        >>> data = pd.DataFrame(np.random.randn(500, 4), columns=['y1', 'x1_1', 'y2', 'x2_1'])
        >>> from linearmodels.system import SUR
        >>> formula = {'eq1': 'y1 ~ 1 + x1_1', 'eq2': 'y2 ~ 1 + x2_1'}
        >>> mod = SUR.from_formula(formula, data)

        The second format uses curly braces {} to surround distinct equations

        >>> formula = '{y1 ~ 1 + x1_1} {y2 ~ 1 + x2_1}'
        >>> mod = SUR.from_formula(formula, data)

        It is also possible to include equation labels when using curly braces

        >>> formula = '{eq1: y1 ~ 1 + x1_1} {eq2: y2 ~ 1 + x2_1}'
        >>> mod = SUR.from_formula(formula, data)
        """
        # Raise on missing values instead of silently dropping rows.
        na_action = NAAction(on_NA='raise', NA_types=[])
        if not isinstance(formula, (Mapping, str)):
            raise TypeError('formula must be a string or dictionary-like')

        missing_weight_keys = []
        eqns = OrderedDict()
        # Dictionary form: one labelled formula per equation.
        if isinstance(formula, Mapping):
            for key in formula:
                f = formula[key]
                # Rewrite 'y ~ x' as 'y ~ 0 + x' so patsy does not add an
                # implicit intercept; constants must be specified explicitly.
                f = '~ 0 +'.join(f.split('~'))
                dep, exog = dmatrices(f, data, return_type='dataframe',
                                      NA_action=na_action)
                eqns[key] = {'dependent': dep, 'exog': exog}
                if weights is not None:
                    if key in weights:
                        eqns[key]['weights'] = weights[key]
                    else:
                        missing_weight_keys.append(key)
            _missing_weights(missing_weight_keys)
            return SUR(eqns, sigma=sigma)

        # String form: '{label: y ~ x} {y2 ~ x2}' — split on '}' into
        # individual equations, each optionally carrying a 'label:' prefix.
        formula = formula.replace('\n', ' ').strip()
        parts = formula.split('}')
        for i, part in enumerate(parts):
            base_key = None
            part = part.strip()
            if part == '':
                continue
            part = part.replace('{', '')
            # A ':' before the '~' marks an explicit equation label.
            if ':' in part.split('~')[0]:
                base_key, part = part.split(':')
                key = base_key = base_key.strip()
                part = part.strip()
            # Same implicit-intercept suppression as in the dict branch.
            f = '~ 0 +'.join(part.split('~'))
            dep, exog = dmatrices(f, data, return_type='dataframe',
                                  NA_action=na_action)
            # Unlabelled equations are keyed by their dependent variable.
            if base_key is None:
                base_key = key = f.split('~')[0].strip()
            # De-duplicate labels by appending '.0', '.1', ...
            count = 0
            while key in eqns:
                key = base_key + '.{0}'.format(count)
                count += 1
            eqns[key] = {'dependent': dep, 'exog': exog}
            if weights is not None:
                if key in weights:
                    eqns[key]['weights'] = weights[key]
                else:
                    missing_weight_keys.append(key)

        _missing_weights(missing_weight_keys)

        return SUR(eqns, sigma=sigma)
Пример #32
0
def parse_formula(formula, data):
    """Parse an IV regression formula into (dep, exog, endog, instr) frames.

    A formula with a single '~' is an OLS-style specification: endog and
    instr are returned as None. A formula with two '~' must segregate the
    endogenous variables and instruments in a bracketed block, e.g.
    ``dep ~ exog + [endog ~ instr] + more_exog``.
    """
    # Raise on missing values instead of silently dropping rows.
    na_action = NAAction(on_NA='raise', NA_types=[])
    if formula.count('~') == 1:
        dep, exog = dmatrices(formula,
                              data,
                              return_type='dataframe',
                              NA_action=na_action)
        endog = instr = None
        return dep, exog, endog, instr

    elif formula.count('~') > 2:
        raise ValueError('formula not understood.  Must have 1 or 2 '
                         'occurrences of ~')

    blocks = [bl.strip() for bl in formula.strip().split('~')]
    if '[' not in blocks[1] or ']' not in blocks[2]:
        raise ValueError('formula not understood. Endogenous variables and '
                         'instruments must be segregated in a block that '
                         'starts with [ and ends with ].')

    dep = blocks[0].strip()
    exog, endog = [bl.strip() for bl in blocks[1].split('[')]
    instr, exog2 = [bl.strip() for bl in blocks[2].split(']')]
    # Bug fix: the original tested endog[1]/instr[1] (the *second* character),
    # but the error message and intent require checking the *last* character.
    if endog[0] == '+' or endog[-1] == '+':
        raise ValueError(
            'endogenous block must not start or end with +. This block was: {0}'
            .format(endog))
    if instr[0] == '+' or instr[-1] == '+':
        raise ValueError(
            'instrument block must not start or end with +. This block was: {0}'
            .format(instr))
    # Exogenous terms after the ']' belong with those before the '['.
    if exog2:
        exog += exog2
    exog = exog[:-1].strip() if exog[-1] == '+' else exog

    try:
        # '0 + ' suppresses patsy's implicit intercept; eval_env=2 resolves
        # names in the caller-of-caller's namespace.
        dep = dmatrix('0 + ' + dep,
                      data,
                      eval_env=2,
                      return_type='dataframe',
                      NA_action=na_action)
        exog = dmatrix('0 + ' + exog,
                       data,
                       eval_env=2,
                       return_type='dataframe',
                       NA_action=na_action)
        endog = dmatrix('0 + ' + endog,
                        data,
                        eval_env=2,
                        return_type='dataframe',
                        NA_action=na_action)
        instr = dmatrix('0 + ' + instr,
                        data,
                        eval_env=2,
                        return_type='dataframe',
                        NA_action=na_action)
    except Exception as e:
        raise type(e)(PARSING_ERROR.format(dep, exog, endog, instr) + e.msg,
                      e.args[1])

    return dep, exog, endog, instr