Example #1
import statsmodels.formula.api as sm

def VIF(data, target, columns):
    # For each column, regress it on the target and report 1/(1 - R^2).
    # patsy falls back to the local variables y and x when they are not
    # columns of data.
    for i in range(0, columns.shape[0]):
        y = data[columns[i]]
        x = target
        rsq = sm.ols(formula="y~x", data=data).fit().rsquared
        vif = round(1 / (1 - rsq), 2)
        print(columns[i], " VIF = ", vif)
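
# A hypothetical call on a small made-up DataFrame (all names and values
# below are illustrative). Note the columns must not be named literally
# "y" or "x", since patsy checks the data frame before the calling scope.
import pandas as pd

df_demo = pd.DataFrame({"x1": [1, 2, 3, 4, 5],
                        "x2": [2, 1, 4, 3, 6],
                        "price": [1.1, 1.9, 3.2, 3.8, 5.1]})
VIF(df_demo, df_demo["price"], df_demo.columns.drop("price"))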
Example #2
File: util.py  Project: lishaoyi/quandl
def regr(ticker1, ticker2, attr1, attr2):
  # Build formula-safe column names (spaces are not valid in patsy formulas);
  # the Python 2 string.replace() call is replaced by the str method.
  attr_1 = (attr1 + "_1").replace(' ', '_')
  attr_2 = (attr2 + "_2").replace(' ', '_')  # bug fix: was built from attr1
  ticker1 = ticker1[[attr1]].rename(columns={attr1: attr_1})
  ticker2 = ticker2[[attr2]].rename(columns={attr2: attr_2})

  df = ticker1.join(other=ticker2, how='inner')
  fml = attr_1 + ' ~ ' + attr_2
  return sm.ols(formula=fml, data=df).fit()
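
# A hypothetical call with two made-up price frames sharing a date index;
# the tickers, attribute names, and values are illustrative.
import pandas as pd
import statsmodels.formula.api as sm

idx = pd.date_range("2020-01-01", periods=5)
spy = pd.DataFrame({"Adj Close": [100, 101, 103, 102, 105]}, index=idx)
qqq = pd.DataFrame({"Adj Close": [200, 203, 207, 205, 212]}, index=idx)
print(regr(spy, qqq, "Adj Close", "Adj Close").params)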
Example #3
def vif_cal(input_data):
    import statsmodels.formula.api as sm
    import pandas as pd
    x_vars = input_data
    xvar_names = input_data.columns
    vif_lst = []
    for i in range(0, xvar_names.shape[0]):
        # Regress each variable on all the remaining variables; patsy
        # resolves the local y and x from the calling scope.
        y = x_vars[xvar_names[i]]
        x = x_vars[xvar_names.drop(xvar_names[i])]
        rsq = sm.ols(formula="y~x", data=x_vars).fit().rsquared
        vif = round(1 / (1 - rsq), 2)
        vif_lst.append(vif)
    vif_df = pd.concat([pd.DataFrame(xvar_names, columns=['Variable']),
                        pd.DataFrame(vif_lst, columns=['VIF'])], axis=1)
    return vif_df
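
# A hypothetical call on a made-up numeric DataFrame; avoid columns named
# literally "y" or "x", which would shadow the formula variables.
import pandas as pd

demo = pd.DataFrame({"height": [1.2, 1.5, 1.7, 1.6, 1.8],
                     "weight": [40, 55, 65, 60, 72],
                     "age": [10, 14, 17, 15, 19]})
print(vif_cal(demo))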
Example #4
File: pyfunk.py  Project: bneb/dotfiles
def regression(csv_file_path, delim=","):
    """ Runs an OLS regression.

    This regresses the last column on all other non-index columns,
    and prints a summary.

    Args:
        csv_file_path: the filepath of the csv data file, organized
            so that the endogenous variable is the last column.
        delim: the field delimiter of the csv file (defaults to ",").
    """

    import pandas as pd
    from statsmodels.api import OLS as ols
    df = pd.read_csv(csv_file_path, sep=delim).dropna()
    X = df[df.columns[1:-1]].astype(int)
    X['const'] = 1  # add the intercept column by hand
    y = df[df.columns[-1]]
    # bug fix: OLS.fit() takes no const argument; the intercept is the
    # constant column added to X above.
    print(ols(y, X).fit().summary())
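
# Hypothetical usage; "data.csv" is a made-up file whose first column is an
# index and whose last column holds the endogenous variable.
regression("data.csv")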
Example #5
import numpy as np
import statsmodels.api as sm


def fill_regressed_data(S):
    """ Fill missing returns by linear combinations of assets without missing returns. """
    S = S.copy()
    R = np.log(S).diff()
    R.iloc[0] = 0

    # assets with complete return histories serve as regressors
    X = R.dropna(axis=1)

    for col in set(S.columns) - set(X.columns):
        R.loc[R.index[0], col] = np.nan
        y = R[col]

        # fit regression on the rows where this asset's return is known
        # (the removed pandas.ols API is replaced with statsmodels.OLS)
        known = y.notnull()
        res = sm.OLS(y[known], sm.add_constant(X[known])).fit()
        pred = res.predict(sm.add_constant(X[~known]))

        # get absolute prices
        pred = pred.cumsum()
        pred += np.log(S[col].dropna().iloc[0]) - pred.iloc[-1]

        # fill missing data
        S[col] = S[col].fillna(np.exp(pred))

    return S
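
# A hypothetical call on made-up prices; asset "B" is missing its first
# three observations and gets backfilled from asset "A".
import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=8)
prices = pd.DataFrame(
    {"A": [100.0, 101.0, 103.0, 102.0, 105.0, 107.0, 106.0, 108.0],
     "B": [np.nan, np.nan, np.nan, 50.0, 51.5, 52.4, 52.0, 53.1]},
    index=idx)
print(fill_regressed_data(prices))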
Example #6
# ### Multiplicative interactions
#
# ":" adds a new column to the design matrix with the interaction of the other two columns. "*" will also include the individual columns that were multiplied together:

res1 = ols(formula='Lottery ~ Literacy : Wealth - 1', data=df).fit()
res2 = ols(formula='Lottery ~ Literacy * Wealth - 1', data=df).fit()
print(res1.params, '\n')
print(res2.params)


# Many other things are possible with operators. Please consult the [patsy docs](https://patsy.readthedocs.org/en/latest/formulas.html) to learn more.

# ## Functions
# 
# You can apply vectorized functions to the variables in your model: 

res = sm.ols(formula='Lottery ~ np.log(Literacy)', data=df).fit()
print(res.params)


# Define a custom function:

def log_plus_1(x):
    return np.log(x) + 1.

res = sm.ols(formula='Lottery ~ log_plus_1(Literacy)', data=df).fit()
print(res.params)


# Any function that is in the calling namespace is available to the formula.

# ## Using formulas with models that do not (yet) support them
# 
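# The section is truncated here; a minimal sketch of the usual approach,
# building the design matrices with patsy and passing them to the model
# directly (the formula and df are the same as in the cells above):

from patsy import dmatrices
import statsmodels.api as sma  # plain API, distinct from the formula API used above

y, X = dmatrices('Lottery ~ Literacy + Wealth', data=df, return_type='dataframe')
res = sma.OLS(y, X).fit()
print(res.params)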
Example #7
    return csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched


#### [7] Model controlling for blocks
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
df2 = pd.DataFrame(
    csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched[
        1:],
    columns=[
        element.replace('.', '_') for element in
        csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched[
            0]
    ])
FE_ols = sm.ols(formula='Alumni_Donations_2018 ~ Ranked_2017 + C(Match) - 1',
                data=df2).fit()
#FE_ols = sm.ols(formula="Lung_Hospitalizations ~ C(Vaping_Ban, Treatment(reference='0')) + C(State_Id) + C(Year) - 1",data=df).fit()
print(FE_ols.summary())
# Highly significant (p < 0.001): being ranked increases alumni donations by ~$509,000 in 2018. (To be fair, this still seems low, but perhaps it is the time value of money, or this is just illustrative.)


def build_basic_regression_outputs(OLSResults):
    # How to get model attributes - https://stackoverflow.com/questions/48522609/retrieve-model-estimates-from-statsmodels
    number_of_observations = str(int(OLSResults.nobs))
    r_squared = str(round(OLSResults.rsquared, 3))
    variables = list(OLSResults.params.index)  # .iteritems() was removed from pandas
    coefficients = OLSResults.params
    standard_error = OLSResults.bse
    p_values = OLSResults.pvalues
    rows = [[variables[i], coefficients.iloc[i], standard_error.iloc[i], p_values.iloc[i]]
            for i in range(len(coefficients))]
    # the snippet is truncated in the source; returning the assembled
    # pieces is an assumed completion
    return number_of_observations, r_squared, rows
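
# Hypothetical usage of the helper above, reusing the fitted FE_ols model;
# the tuple return is the completion assumed in the fix.
nobs, r2, rows = build_basic_regression_outputs(FE_ols)
print(nobs, r2)
for row in rows:
    print(row)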
Example #8
Vapor_df = df[df.Temperature > 151.12 +
              27.9058 * df.Pressure**0.237508]  # cut-off value for densities
Vapor_df.tail(3)  # display the last rows to see how they are arranged
threedee = plt.figure().add_subplot(projection='3d')  # gca(projection=...) was removed from matplotlib
threedee.scatter(Vapor_df['Temperature'], Vapor_df['Pressure'],
                 Vapor_df['Density'])
threedee.set_xlabel('Temperature (K)')
threedee.set_ylabel('Pressure (kPa)')
threedee.set_zlabel('Density (kg/m^3)')
plt.show()

# In[26]:

Vapor_df = Vapor_df.assign(Temp_sq=Vapor_df.Temperature**2)
# bug fix: the quadratic term created above was missing from the formula,
# although the stated model (see the Result heading below) includes T^2
resultVapor = sm.ols(formula="Density ~ Temp_sq + Temperature + Pressure",
                     data=Vapor_df).fit()
forecast = resultVapor.predict(Vapor_df[['Temp_sq', 'Temperature', 'Pressure']])
ErrorV = pd.DataFrame({'Error': forecast - Vapor_df.Density})
print(resultVapor.params)
print(resultVapor.summary())
ErrorV.describe()

# In[22]:

plt.hist(ErrorV.Error, bins=150)  # histogram of error distribution
plt.title('Error distribution')
plt.xlabel('Spread from function (kg/m^3)')
plt.ylabel('Frequency')

# ## Result for the vapor portion: Density_vapor = a*T^2 + b*T + c*P + f
#
Example #9
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model

df = pd.read_csv(r'joined_filtered_dataset.csv')
df = df.drop('id', axis=1)
df = df.drop('key_emotion', axis=1)
df = df.dropna()
X = df.drop("perceived_trust", axis=1)
Y = df['perceived_trust']

regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
print(regr)
"""model = sm.OLS(Y, X)
results = model.fit()"""

print(results)

result = sm.ols()
Example #10
sns.set(style="darkgrid")

# Generate a mask for the upper triangle
mask = np.zeros_like(alCorr, dtype=bool)  # np.bool was removed from NumPy
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize = (9, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(alCorr, mask=mask, annot=False, cmap=cmap, ax=ax)
f.tight_layout()

#plt.matshow(alCorr)
# https://github.com/statsmodels/statsmodels/issues/5343
# !pip install --upgrade patsy
import statsmodels.formula.api as sm
result = sm.ols(formula="PO4 ~ oPO4", data=algae).fit() # ols: ordinary least square

#dir(result)
#[(name, type(getattr(result, name))) for name in dir(result)]

[name for name in dir(result) if not callable(getattr(result, name))]


print(result.params)

print(result.summary())

type(result.params)

type(algae)
Example #11
# Now I will apply logistic regression, with ``outcome`` as a dependent variable.

# In[10]:


plt.scatter(df["BloodPressure"],df["BMI"])
plt.show()


# In[11]:


import statsmodels.formula.api as sm

model = sm.ols(formula='BloodPressure ~ BMI', data=df).fit()
model.summary()


# # Logistic Regression

# Now I will take ``outcome`` as a dependent variable and run the test to study whether other variables affect the ``outcome``

# In[12]:


from sklearn import preprocessing
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
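
# The snippet breaks off before the model is fitted; a minimal sketch of
# the promised logistic regression, assuming df holds the diabetes data
# with an "Outcome" column (the column name is an assumption):
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("Outcome", axis=1), df["Outcome"], test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
print("Test accuracy:", logreg.score(X_test, y_test))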
# In[6]:


"""top 10 in field goal percentage among power forwards with more than 20 games played"""
data_FGpct = data.loc[data['GamePlayed'] > 20]
data_FGpct = data_FGpct.loc[data_FGpct['Position'] == 'PF']
data_FGpct = data_FGpct.nlargest(10, 'FGPct')
data_FGpct = data_FGpct.sort_values(by=['FGPct'], ascending=False)
data_FGpct.loc[:,['Player','Position','GamePlayed','Team','FGPct']]


# In[8]:


"""Regression for Points as dependent variable and the following as independent variables: eFGPct, ThreePA, Assist, Turnover, TwoPct, Position as a categorical"""
reg = sm.ols("Points ~ eFGPct + ThreePA + Assist + Turnover + TwoPct + C(Position)", data=data).fit()
print(reg.summary())


# In[9]:


"""Regression for Points as dependent variable and the following as independent variables: eFGPct, ThreePA, Assist, ORB, 
Position as a categorical"""
reg = sm.ols("Points ~ eFGPct + ThreePA + Assist + ORB + C(Position)", data=data).fit()
print(reg.summary())


# In[10]: