Пример #1
0
def poisson_reg(formula, data, cov='normal', group=None):
    """
    Fit a Poisson model (count y variable), print its summary and average partial effects, and return it.

    Documentation: https://www.statsmodels.org/stable/examples/notebooks/generated/discrete_choice_example.html
    Remember to use mod = poisson_reg(...)!

    :param formula: patsy formula
    :param data: dataframe
    :param cov: str
        normal: common standard errors
        robust: HC1 standard errors
        cluster or clustered: clustered standard errors (must specify group)
    :param group: str, optional
        name of the column of `data` holding the cluster labels; only read when cov is
        "cluster"/"clustered". When omitted, the name is asked for with input().
    :return : fitted statsmodels results instance, or an error message (str) when the
        group column cannot be found in `data`
    """

    # Creating and fitting the model
    fit_kwargs = {'use_t': True}
    if cov == "robust":
        fit_kwargs['cov_type'] = 'HC1'
    elif cov in ("cluster", "clustered"):
        if group is None:
            group = str(input("What is the group column?"))
        # Only the column lookup is guarded, so a KeyError raised while
        # building or fitting the model is not misreported as a missing group.
        try:
            groups = data[group]
        except KeyError:
            erro = "It was not possible to find the desired group. Check the spelling and the data and try again!"
            return erro
        fit_kwargs['cov_type'] = 'cluster'
        fit_kwargs['cov_kwds'] = {'groups': groups}

    mod = poisson(formula, data).fit(**fit_kwargs)

    ## Calculating under/overdispersion
    # Square root of (sum of squared residuals scaled by the fitted values) / residual degrees of freedom.
    fitted = mod.predict()
    sigma = np.around(np.sqrt(np.sum(mod.resid ** 2 / fitted) / mod.df_resid), 2)

    ## Capturing marginal effects
    mfx = mod.get_margeff(at='overall')
    clear_output()

    print(mod.summary())
    print(
        f"The coefficient to determine over/underdispersion is σ = {sigma}, " +
        f"which must be close to one for standard errors to be valid. " +
        f"If not, they must be multiplied by {sigma}.")

    print("##############################################################################")

    print(mfx.summary())
    # The hint used to end in `.update({0:1})`, which returns None; it now shows an expression that yields the dict.
    print(
        "\nMarginal effects on certain values can be found using 'mod.get_margeff(atexog = values).summary()', " +
        "where values must be generated using:\nvalues = {0: 1, **dict(zip(range(1, n), values.tolist()))}")
    print(
        "\nUsually, the wanted effect of the poisson coefficients is it's semi-elasticity, which is 100*[exp(ß) - 1].")

    print("To predict values using the CDF, use mod.predict(X). X can be blank (use values from the dataset)")
    print("or a K x N Dimensional array, where K = number of variables and N = number of observations.")

    return mod
Пример #2
0
    def test_compare_poisson(self):
        """Compare GEE (Poisson family, Independence structure) coefficients with sm.poisson to 10 decimals."""
        working_structure = Independence()
        poisson_family = Poisson()

        # The draws stay in the order response, X1, X2, X3, groups so the
        # global NumPy random stream is consumed exactly as before.
        n_obs = 100
        response = np.ceil(-np.log(np.random.uniform(size=n_obs)))
        x1 = np.random.normal(size=n_obs)
        x2 = np.random.normal(size=n_obs)
        x3 = np.random.normal(size=n_obs)
        cluster_ids = np.random.randint(0, 4, size=n_obs)

        frame = pd.DataFrame({"Y": response, "X1": x1, "X2": x2, "X3": x3})
        spec = "Y ~ X1 + X2 + X3"

        gee_fit = GEE.from_formula(spec,
                                   cluster_ids,
                                   frame,
                                   family=poisson_family,
                                   cov_struct=working_structure).fit()
        mle_fit = sm.poisson(spec, data=frame).fit(disp=False)

        assert_almost_equal(gee_fit.params.values,
                            mle_fit.params.values,
                            decimal=10)
Пример #3
0
def GoMiningNumBabes(df):
    """Fit one single-predictor Poisson model of `numbabes` per column of df.

    A column is skipped when its variance is below 1e-7, when the model keeps
    fewer than half of the rows of df, or when building/fitting the model
    raises ValueError or TypeError.

    df: DataFrame of pregnancy records

    return: list of (pseudo R-squared, variable name) pairs
    """
    scored = []
    for column in df.columns:
        try:
            nearly_constant = df[column].var() < 1e-7
            if nearly_constant:
                continue

            # Some environments reportedly need the formula ASCII-encoded
            # (formula.encode('ascii')) before it is handed to statsmodels.
            candidate = smf.poisson('numbabes ~ ' + column, data=df)

            too_few_rows = len(candidate.endog) < len(df) / 2
            if too_few_rows:
                continue

            fitted = candidate.fit()
        except (ValueError, TypeError):
            continue
        else:
            scored.append((fitted.prsquared, column))

    return scored
Пример #4
0
def VariableMiningPoisson(df, y):
    """Fit one single-predictor Poisson regression of 'y' per column of df.

    A column is skipped when its variance is below 1e-7, when the model keeps
    fewer than half of the rows of df, or when building/fitting the model
    raises an ordinary exception.

    Args:
        df (DataFrame): DataFrame that holds all the variables.
        y (string): Column name of dependent variable y.

    Returns:
        variables (list): A list of tuples each containing the pseudo
            r-squared value and the variable name
    """
    variables = []
    for name in df.columns:
        try:
            if df[name].var() < 1e-7:
                continue

            formula = '{} ~ {}'.format(y, name)
            model = smf.poisson(formula, data=df)
            nobs = len(model.endog)
            if nobs < len(df) / 2:
                continue

            results = model.fit()
        except Exception:
            # Best-effort mining: a column that cannot be modelled is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate, so a long run can still be aborted.
            continue

        variables.append((results.prsquared, name))

    return variables
Пример #5
0
def analyze_stats(df_og, list_of_artificials, formula, covariates):
    """Fit the same Poisson formula on the real and on each artificial dataframe.

    Returns a DataFrame with one row of fitted parameters per dataframe (the
    real one first), plus a 'real' entry set to "Yes" or "No".
    `covariates` is accepted but not used.
    """

    def _labelled_params(frame, label):
        # Fit silently (disp=0) and tag the parameter Series with its origin.
        estimates = sm.poisson(formula=formula, data=frame).fit(disp=0).params
        estimates['real'] = label
        return estimates

    rows = [_labelled_params(df_og, "Yes")]
    for artificial in list_of_artificials:
        rows.append(_labelled_params(artificial, "No"))

    return pd.DataFrame(rows)
Пример #6
0
def analyze_stats_for_single(df, formula, covariates):
    """Fit a Poisson formula on one dataframe and return its parameters as a one-row DataFrame.

    `covariates` is accepted but not used.
    """
    fitted = sm.poisson(formula=formula, data=df).fit(disp=0)
    return pd.DataFrame([fitted.params])
Пример #7
0
def fitmodel(ldf, ycol, xcols, modeltype, interactionpairs):
    """Build a formula from column names and fit the requested regression.

    ldf: data passed to the statsmodels formula constructor
    ycol: name of the dependent variable
    xcols: iterable of main-effect column names
    modeltype: one of 'logistic', 'linear', 'poisson', 'probit'
    interactionpairs: iterable of pairs; each adds an 'a:b' interaction term

    Returns the fitted result (fit is called with maxiter=10000).
    Raises ValueError for an unknown modeltype; previously this surfaced as
    an UnboundLocalError on `model` after the header had been printed.
    """
    # Maps the public model name to the constructor name on `smfa`.
    constructors = {
        'logistic': 'logit',
        'linear': 'ols',
        'poisson': 'poisson',
        'probit': 'probit',
    }
    if modeltype not in constructors:
        raise ValueError(
            "Unknown modeltype {!r}; expected one of {}".format(
                modeltype, sorted(constructors)))

    # Joining all terms at once avoids a dangling ' + ' when xcols is empty.
    terms = list(xcols)
    terms.extend(pair[0] + ':' + pair[1] for pair in interactionpairs)
    string = "{} ~ {}".format(ycol, ' + '.join(terms))

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("Running {} regression model:".format(modeltype))
    print(string)
    print("***************************************************************************************")
    print("***************************************************************************************")

    model = getattr(smfa, constructors[modeltype])(string, ldf)
    result = model.fit(maxiter=10000)
    return result
Пример #8
0
    def test_compare_poisson(self):
        """Compare sm.poisson coefficients with GEE (Poisson family, Independence structure) to 10 decimals."""
        working_corr = Independence()
        poisson_family = Poisson()

        # Random draws are made in the original order: Y, X1, X2, X3, groups.
        size = 100
        counts = np.ceil(-np.log(np.random.uniform(size=size)))
        x1 = np.random.normal(size=size)
        x2 = np.random.normal(size=size)
        x3 = np.random.normal(size=size)
        labels = np.random.randint(0, 4, size=size)

        table = pd.DataFrame({"Y": counts, "X1": x1, "X2": x2, "X3": x3})
        spec = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(spec, table, None, groups=labels,
                                     family=poisson_family, covstruct=working_corr)
        gee_result = gee_model.fit()

        poisson_result = sm.poisson(spec, data=table).fit()

        assert_almost_equal(poisson_result.params.values, gee_result.params, decimal=10)
Пример #9
0
    def test_compare_poisson(self):
        """Compare GEE (Poisson family, Independence structure) parameters with sm.poisson to 10 decimals."""
        structure = Independence()
        count_family = Poisson()

        # Same sequence of random draws as before: Y, X1, X2, X3, groups.
        sample_size = 100
        y_counts = np.ceil(-np.log(np.random.uniform(size=sample_size)))
        covariate_1 = np.random.normal(size=sample_size)
        covariate_2 = np.random.normal(size=sample_size)
        covariate_3 = np.random.normal(size=sample_size)
        group_ids = np.random.randint(0, 4, size=sample_size)

        columns = {
            "Y": y_counts,
            "X1": covariate_1,
            "X2": covariate_2,
            "X3": covariate_3,
        }
        data = pd.DataFrame(columns)
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, group_ids, data,
                                     family=count_family, cov_struct=structure)
        gee_result = gee_model.fit()

        poisson_model = sm.poisson(formula, data=data)
        poisson_result = poisson_model.fit(disp=False)

        assert_almost_equal(gee_result.params, poisson_result.params, decimal=10)
Пример #10
0
    def test_compare_poisson(self):
        """Compare sm.poisson coefficients with GEE (Poisson family, Independence structure) to 10 decimals."""
        independence = Independence()
        family_spec = Poisson()

        # Draw order (Y, X1, X2, X3, groups) is unchanged.
        n = 100
        outcome = np.ceil(-np.log(np.random.uniform(size=n)))
        first = np.random.normal(size=n)
        second = np.random.normal(size=n)
        third = np.random.normal(size=n)
        clusters = np.random.randint(0, 4, size=n)

        frame = pd.DataFrame({"Y": outcome, "X1": first, "X2": second, "X3": third})
        formula = "Y ~ X1 + X2 + X3"

        gee_options = dict(groups=clusters, family=family_spec, covstruct=independence)
        gee_fit = GEE.from_formula(formula, frame, None, **gee_options).fit()

        mle_fit = sm.poisson(formula, data=frame).fit()

        assert_almost_equal(mle_fit.params.values, gee_fit.params, decimal=10)
Пример #11
0
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

# Fetch the R dataset "Insurance" (package MASS) and keep its data table.
insurance_set = sm.datasets.get_rdataset('Insurance', package='MASS').data

# Wrap the table in a pandas DataFrame.
df = pd.DataFrame(data=insurance_set)

# Poisson regression of Claims on log(Holders).
poisson_model = smf.poisson(formula='Claims ~ np.log(Holders)', data=df)

# Fit, then print the sum of the residuals.
poisson_model_result = poisson_model.fit()
residual_total = np.sum(poisson_model_result.resid)
print(residual_total)
Пример #12
0
from sklearn import linear_model
# Logistic regression with scikit-learn; `predictors` and `acs` are defined
# outside this excerpt.
lr = linear_model.LogisticRegression(max_iter=1000)
# Call the fit method to fit the model; this works the same way as fitting a linear model
results = lr.fit(X=predictors, y=acs['ge150k_i'])
# Print the coefficients
print(results.coef_)
print(results.intercept_)
values = np.append(results.intercept_, results.coef_)
# Get the names of the values
names = np.append('intercept', predictors.columns)
# Put everything into one labelled DataFrame
# (from here on `results` is that DataFrame, no longer the fitted estimator)
results = pd.DataFrame(values, index=names, columns=['coef'])
print(results)
# 'or' column: exponentiated coefficients
results['or'] = np.exp(results['coef'])
print(results)
# `results` is rebound once more, now to a statsmodels Poisson fit.
results = smf.poisson('NumChildren ~ FamilyIncome + FamilyType + OwnRent',
                      data=acs).fit()
print(results.summary())

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Same formula as a GLM with a Poisson family and log link.
# NOTE(review): the link is passed as `links.log` without being called;
# newer statsmodels releases may expect a link instance — confirm against
# the installed version.
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent',
                data=acs,
                family=sm.families.Poisson(sm.genmod.families.links.log))
results = model.fit()
print(results.summary())

# Negative-binomial GLM with the same formula. The model is only constructed
# here; no fit() call is visible in this excerpt.
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent',
                data=acs,
                family=sm.families.NegativeBinomial(
                    sm.genmod.families.links.log))
# Result accumulators. Only entrance, vclass and lambda_parameter are appended
# to in the visible part; the others are presumably filled further down.
entrance, vclass, lambda_parameter, intercept, alpha, chisqstat, pvalue \
    = [], [], [], [], [], [], []

# One intercept-only Poisson fit per (entrance site, vehicle class) pair.
# `entrance_site`, `vehicle_class` and `file_copy` are defined outside this excerpt.
for i in range(len(entrance_site)):
    for j in range(len(vehicle_class)):
        # Rows for this entrance and vehicle class, year 2019, paid by E-ZPass.
        file = file_copy[(file_copy["entrance_site"] \
            == entrance_site[i]) & (file_copy["vehicle_class"] \
                == vehicle_class[j]) & (file_copy["year"] == 2019) \
                    & (file_copy["payment_type"] == "E-ZPass")]
        # Sorted distinct weeks (not used again in the visible part).
        x = file["week_of_year"].unique()
        x.sort()
        # Drop the listed columns, then take the per-week mean of what remains.
        dataframe = file.drop(["exit_site","payment_type","profit","year"],\
             axis = 1).groupby("week_of_year").mean()
        y = dataframe["vehicle_count"].values

        # Intercept-only model; exp(intercept) is stored as lambda.
        model_p = smf.poisson("vehicle_count ~ 1", data=dataframe)
        result_p = model_p.fit()
        lambda_ = float(np.exp(result_p.params))

        entrance.append(entrance_site[i])
        vclass.append(vehicle_class[j])
        lambda_parameter.append(lambda_)

        # Widen [min, max] outward to the nearest multiples of 10.
        minimum = int(y.min())
        maximum = int(y.max())
        while minimum % 10 != 0:
            minimum -= 1
        while maximum % 10 != 0:
            maximum += 1
        # Frozen Poisson distribution with the fitted rate, and the normalised
        # histogram of the weekly means over the widened range.
        X = stats.poisson(lambda_)
        v, k = np.histogram(y, range=(minimum, maximum), density=True)
Пример #14
0
# b)
# Fit several Poisson models of claim_rate and compare them pairwise.
# `stt`, `helper`, `data`, `it` and `dist` are defined outside this excerpt.
all_results = defaultdict(dict)

all_regressions = [
    "claim_rate ~ C(age) + C(car) + C(dist) + C(age):C(car) + C(age):C(dist) + C(car):C(dist) ",
    "claim_rate ~ C(age) + C(car) + C(age):C(car)",
    # NOTE(review): identical to the previous entry — confirm whether another formula was intended.
    "claim_rate ~ C(age) + C(car) + C(age):C(car)",
    "claim_rate ~ C(age) + C(dist) + C(age):C(dist)",
    "claim_rate ~ C(car) + C(dist) + C(car):C(dist)",
    "claim_rate ~ C(car)",
    "claim_rate ~ C(dist)",
    "claim_rate ~ C(age)",
]

# Fit each formula and let the helper record the result in all_results.
for reg in all_regressions:
    regression = stt.poisson(reg,data=data)
    results = regression.fit()
    helper.wrap_regression_results(all_results,results)

# One row per recorded model.
results = pd.DataFrame(all_results).transpose()
# Ordered pairs of row positions 0..6.
# NOTE(review): the list above has 8 entries but only 7 positions are paired;
# presumably the duplicate formula collapses into one row — verify against
# helper.wrap_regression_results.
combinaisons = it.permutations(range(7),2)

for pair in combinaisons:
    a,b = pair
    # Only pairs where row a has the larger degree_of_freedom are tested.
    if results.iloc[a].degree_of_freedom > results.iloc[b].degree_of_freedom:
        Deviance = -2*(results.iloc[a].log - results.iloc[b].log)
        Degree_of_freedom = results.iloc[a].degree_of_freedom - results.iloc[b].degree_of_freedom
        # Compare against the 95% chi-square quantile for the difference in degrees of freedom.
        if Deviance < dist.chi2.ppf(0.95,Degree_of_freedom):
            print(results.iloc[a].name, 'est rejete pour', results.iloc[b].name)

# Best regression : claim_rate ~ C(dist)
Пример #15
0
from statsmodels.formula.api import ols, glm, poisson
from statsmodels.discrete.discrete_model import Poisson

import statsmodels.stats.tests.test_anova as ttmod

# Borrow the dataset prepared by statsmodels' own ANOVA test class.
test = ttmod.TestAnova3()
test.setupClass()

# Drop the rows labelled 0, 1 and 2.
data = test.data.drop([0,1,2])
# OLS and GLM fits of log(Days + 1) on the Duration x Weight interaction (sum coding).
res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit(use_t=False)

res_glm = glm("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
                        data).fit()

# Poisson fits with HC0 covariance: one with the interaction, one additive.
res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)", data).fit(cov_type='HC0')
res_poi_2 = poisson("Days ~ C(Weight) + C(Duration)", data).fit(cov_type='HC0')

# Print the term-wise Wald tests of every fit.
print('\nOLS')
print(res_ols.wald_test_terms())
print('\nGLM')
print(res_glm.wald_test_terms(skip_single=False, combine_terms=['Duration', 'Weight']))
print('\nPoisson 1')
print(res_poi.wald_test_terms(skip_single=False, combine_terms=['Duration', 'Weight']))
print('\nPoisson 2')
print(res_poi_2.wald_test_terms(skip_single=False))

# Same interaction formula with a negative binomial model.
from statsmodels.discrete.discrete_model import NegativeBinomial
res_nb2 = NegativeBinomial.from_formula("Days ~ C(Weight) * C(Duration)", data).fit()
print('\nNegative Binomial nb2')
print(res_nb2.wald_test_terms(skip_single=False))
Пример #16
0
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.formula.api as smf
import numpy as np
import scipy.optimize

# Data used in this example
data = pd.read_csv('./psn_data.csv')

## Reference answer ########################################
# Poisson regression of y on x with statsmodels.
results = smf.poisson('y ~ x', data=data).fit()
print(results.summary())

# Get the estimated parameters
a, b = results.params

## Hand-written version ####################################
# Random starting values for the two parameters.
params = [np.random.rand(), np.random.rand()]


def likelihood(params, y_vector, x_vector):
    """Return the negated sum over i of y_i * log(theta_i) - theta_i.

    theta_i = exp(params[0] + params[1] * x_i) is the Poisson rate for
    observation i. Elements are read with y_vector[i] / x_vector[i] for
    i in range(y_vector.shape[0]).
    """
    intercept, slope = params[0], params[1]
    total = 0
    for i in range(y_vector.shape[0]):
        # Rate of the Poisson distribution for this observation
        rate = np.exp(intercept + slope * x_vector[i])
        total += y_vector[i] * np.log(rate) - rate
    return -total


new_params = scipy.optimize.minimize(likelihood,
Пример #17
0
    true_neg = (1 - predict) * (1 - actual)
    sum(true_pos), sum(true_neg)

    ## calculate accuracy vs baseline
    print('baseline:\n', baseline)
    acc = (sum(true_pos) + sum(true_neg)) / len(actual)
    print('acc:\n', acc)

    ## mine the join DataFrame
    variables = GoMiningNumBabes(join)

    ## read the variables
    relevant = MiningReport(variables, n=60)

    ## make model with relevant predictive variables
    model = smf.poisson('numbabes ~ ager + educat + C(race) + totincr',
                        data=join)
    results = model.fit()
    results.summary()

    ## predict numbabes
    columns = ['ager', 'race', 'educat', 'totincr']
    new = pd.DataFrame([[35, 1, 16, 14]], columns=columns)
    predict_babes = results.predict(new)
    print('predict_babes:\n', predict_babes)

    ## predict married/divorced
    model = smf.mnlogit('rmarital ~ ager + C(race) + totincr + educat',
                        data=join)
    results = model.fit()
    results.summary()
Пример #18
0
import pandas as pd
import statsmodels.formula.api as smf

# Poisson Model describes a process where dependent variable refers to success count of many attempts
# and each attempt has a very low probability of success.

# Let's understand how to fit a Poisson regression model for a data set available at UCLA repository.

# The dataset contains details of a number of awards earned, type of program enrolled,
# and score obtained in final math exam by students at a high school.

# Download the dataset into a pandas data frame and show its first three rows.
AWARDS_URL = "https://stats.idre.ucla.edu/stat/data/poisson_sim.csv"
awards_df = pd.read_csv(AWARDS_URL)
print(awards_df.head(3))

# Poisson model for the patsy formula num_awards ~ math + C(prog).
poisson_model = smf.poisson(formula='num_awards ~ math + C(prog)',
                            data=awards_df)

# Fit the model.
poisson_model_result = poisson_model.fit()

# Print the model summary. Reading it, as noted by the original author:
# - math: coefficient 0.07, i.e. each one-unit increase in math raises the log count by 0.07.
# - prog=2 ("Academic") instead of "Generic": changes the log count by 1.08.
# - prog=3 ("Vocational") instead of "Generic": changes the log count by 0.37.
print(poisson_model_result.summary())
Пример #19
0
  
    Created on:
        2013/10/19 03:31:02
     
    Purpose:
        To show 
  
    Copyright:
        BSD / Apache        
  
---------------------------------------------------------------------------- """

from pandas import read_csv

from statsmodels.formula.api import poisson 

# Read the file without a header row, so the columns get integer labels.
data_0 = read_csv('nosipsson.csv', header=None)

# Drop the columns labelled 8..11. `axis` is passed by keyword because
# pandas 2.0 made every DataFrame.drop argument except `labels` keyword-only.
data_0 = data_0.drop(range(8, 12), axis=1)

# Name the eight remaining columns.
data_0.columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

# Poisson regression of H on E, F and G.
fit_0 = poisson('H ~ E + F + G', data_0).fit()

""" ----------------------------------------------------------------------------
  
    End note:
        (end note starts here)     
   
============================================================================ """
Пример #20
0
# Minimise the negative log-likelihood. In this excerpt `neg_lklh` and
# `theta_init` are only defined further down, and `fbock` and `minimize`
# come from outside it.
opt_results = minimize(neg_lklh, theta_init, args=fbock)
# Bare expression: has no effect when run as a plain script (presumably a notebook cell output).
opt_results

# Exponentiate the optimised parameters.
np.exp(opt_results.x)


# Poisson regression
# rate_i = exp(a * 1 + b * y_i)
# ghost_i | rate_i ~ Poisson(rate_i)
# theta = (a, b)
def neg_lklh(theta, fbock):
    """Return the negated sum of ghost * log(rate) - rate, with rate = exp(a + b * y).

    theta unpacks into (a, b); fbock is indexed with the keys 'y' and 'ghost'.
    """
    intercept, slope = theta
    expected = np.exp(intercept + slope * fbock['y'])
    # Per-observation terms are summed (the days are treated as independent).
    log_terms = fbock['ghost'] * np.log(expected) - expected
    return -np.sum(log_terms)

# Starting point for the optimiser: a = 0, b = 0.
theta_init = [0, 0]

# Minimise the negative log-likelihood; `fbock` is passed as the extra argument.
opt_results = minimize(neg_lklh, theta_init, args=fbock)
# Bare expression: has no effect when run as a plain script (presumably a notebook cell output).
opt_results


# Many regression models live in statsmodels
import statsmodels.formula.api as smf

# The same model fitted with statsmodels' formula interface.
model = smf.poisson('ghost ~ 1 + y', data=fbock).fit()
model.summary()

Пример #21
0
print "line fit for age and log(teamID)"
print "slope: ", slope_log, "intercept: ", intercept_log
pointsLineLog = [intercept_log + slope_log * x for x in team_ID]
plt.figure()
plt.plot(team_ID, log_teamID, "ro")
plt.plot(team_ID, pointsLineLog, '--')
plt.show()

formula = 'yearID ~ teamID + salary + IgID'
data_dict = {
    "yearId": year_ID,
    "teamID": team_ID,
    "salary": Salary,
    "IgID": Ig_ID
}
model = smf.poisson(formula, data=data_dict)
model = model.fit()

print "predicted salary for 870000 NL with year of born 1985: ", model.predict(
    {
        "salary": 8700,
        "IgID": 'NL',
        "yearID": 1985
    })

print "slope for yearID and salary is: ", slope

print "\n\nMultiple regression model:"
modelMR = smf.ols('salary ~ salary + YearId + IgID', data=data_dict)
modelMR = modelMR.fit()
print modelMR.summary()
Пример #22
0
# b)
# Fit several Poisson models of claim_rate and compare them pairwise.
# `stt`, `helper`, `data`, `it` and `dist` are defined outside this excerpt.
all_results = defaultdict(dict)

all_regressions = [
    "claim_rate ~ C(age) + C(car) + C(dist) + C(age):C(car) + C(age):C(dist) + C(car):C(dist) ",
    "claim_rate ~ C(age) + C(car) + C(age):C(car)",
    # NOTE(review): identical to the previous entry — confirm whether another formula was intended.
    "claim_rate ~ C(age) + C(car) + C(age):C(car)",
    "claim_rate ~ C(age) + C(dist) + C(age):C(dist)",
    "claim_rate ~ C(car) + C(dist) + C(car):C(dist)",
    "claim_rate ~ C(car)",
    "claim_rate ~ C(dist)",
    "claim_rate ~ C(age)",
]

# Fit each formula and let the helper record the result in all_results.
for reg in all_regressions:
    regression = stt.poisson(reg, data=data)
    results = regression.fit()
    helper.wrap_regression_results(all_results, results)

# One row per recorded model.
results = pd.DataFrame(all_results).transpose()
# Ordered pairs of row positions 0..6.
# NOTE(review): the list above has 8 entries but only 7 positions are paired;
# presumably the duplicate formula collapses into one row — verify against
# helper.wrap_regression_results.
combinaisons = it.permutations(range(7), 2)

for pair in combinaisons:
    a, b = pair
    # Only pairs where row a has the larger degree_of_freedom are tested.
    if results.iloc[a].degree_of_freedom > results.iloc[b].degree_of_freedom:
        Deviance = -2 * (results.iloc[a].log - results.iloc[b].log)
        Degree_of_freedom = results.iloc[a].degree_of_freedom - results.iloc[
            b].degree_of_freedom
        # Compare against the 95% chi-square quantile for the difference in degrees of freedom.
        if Deviance < dist.chi2.ppf(0.95, Degree_of_freedom):
            print(results.iloc[a].name, 'est rejete pour',
                  results.iloc[b].name)
Пример #23
0
# Save the current figure. `plt`, `data`, `smf` and `stats` are defined
# outside this excerpt.
plt.savefig("histogram poisson dist.pdf")

from patsy import dmatrices
import statsmodels.api as sm

# Poisson GLM built from explicit design matrices.
formula = """Parasitized ~ C(Treatment)"""
response, predictors = dmatrices(formula, data, return_type='dataframe')
po_results = sm.GLM(response, predictors, family=sm.families.Poisson()).fit()
print(po_results.summary())

# Same design with a negative binomial family (po_results is rebound).
formula = """Parasitized ~ C(Treatment)"""
response, predictors = dmatrices(formula, data, return_type='dataframe')
po_results = sm.GLM(response,
                    predictors,
                    family=sm.families.NegativeBinomial()).fit()
print(po_results.summary())

# The same two models through the formula interface of the discrete models.
modpoiss = smf.poisson(formula, data).fit()
print(modpoiss.summary())

modNB = smf.negativebinomial(formula, data).fit()
print(modNB.summary())

# Probability plots of the residuals against a Poisson / negative binomial
# distribution with hard-coded shape parameters.
stats.probplot(modpoiss.resid, dist='poisson', sparams=(2.4, ), plot=plt)
plt.show()

stats.probplot(modNB.resid, dist='nbinom', sparams=(2.15, 0.4), plot=plt)
plt.show()

# Marginal effects of the negative binomial fit, requested with 'mean'.
print(modNB.get_margeff('mean').summary())
Пример #24
0
                  'qemp86 + inc86 + black + hispan + born60',
                  data=crime1)
def _coef_table(fit):
    """Collect params, bse, tvalues and pvalues of a fit, each rounded to 4 decimals."""
    columns = {}
    for label, attribute in (('b', 'params'), ('se', 'bse'),
                             ('t', 'tvalues'), ('pval', 'pvalues')):
        columns[label] = round(getattr(fit, attribute), 4)
    return pd.DataFrame(columns)


results_lin = reg_lin.fit()

# regression table of the linear model:
table_lin = _coef_table(results_lin)
print(f'table_lin: \n{table_lin}\n')

# Poisson model on the same regressors, fitted without convergence output:
reg_poisson = smf.poisson(formula='narr86 ~ pcnv + avgsen + tottime +'
                          'ptime86 + qemp86 + inc86 + black +'
                          'hispan + born60',
                          data=crime1)
results_poisson = reg_poisson.fit(disp=0)

# regression table of the Poisson model:
table_poisson = _coef_table(results_poisson)
print(f'table_poisson: \n{table_poisson}\n')

# estimate Quasi-Poisson model:
reg_qpoisson = smf.glm(formula='narr86 ~ pcnv + avgsen + tottime + ptime86 +'
                       'qemp86 + inc86 + black + hispan + born60',
Пример #25
0
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# awards_df = pd.read_csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
# poission_model = smf.poisson("num_awards ~ math + C(prog)",data=awards_df)
# model_result = poission_model.fit()
# print(model_result.summary())

# Fetch the R dataset "Insurance" (package MASS) and keep its data table.
insurance_dataset = sm.datasets.get_rdataset("Insurance", package="MASS")
insurance_data = insurance_dataset.data
# Poisson regression of Claims on log(Holders); print the residuals of the fit.
poission_model = smf.poisson(formula="Claims ~ np.log(Holders)", data=insurance_data)
model_result = poission_model.fit()
residuals = model_result.resid
print(residuals)
# reading data
# reading data (hard-coded absolute Windows path; `poisson` and `plt` are
# imported outside this excerpt)
data_crime = pd.read_excel(r'C:\Users\digiovanniyani\Desktop\NY_CRIMES.xlsx') #, names = ['MH_MURD', 'MH_RAPE', 'MH_ROBB', 'MH_ASSA', 'MH_BURG', 'MH_LARC'])#, delim_whitespace=True, header=0)

# Sum of the five *_TOT columns (not used again in the visible part).
total_nyc_crime=data_crime["MH_TOT"]+ data_crime["BK_TOT"]+ data_crime["QN_TOT"] + data_crime["BX_TOT"] + data_crime["SI_TOT"]

# Per-borough sums of the LARC, ROBB, MOTO, BURG and ASSA columns.
# NOTE(review): the Manhattan sum leaves MH_ASSA out (commented out) — confirm this is intended.
sumcols_brooklyn= data_crime["BK_LARC"]+ data_crime["BK_ROBB"]+ data_crime["BK_MOTO"] + data_crime["BK_BURG"] + data_crime["BK_ASSA"]
sumcols_queens= data_crime["QN_LARC"]+ data_crime["QN_ROBB"]+ data_crime["QN_MOTO"] + data_crime["QN_BURG"] + data_crime["QN_ASSA"]
sumcols_manhattan= data_crime["MH_LARC"]+ data_crime["MH_ROBB"]+ data_crime["MH_MOTO"] + data_crime["MH_BURG"] #+ data_crime["MH_ASSA"]
sumcols_staten= data_crime["SI_LARC"]+ data_crime["SI_ROBB"]+ data_crime["SI_MOTO"] + data_crime["SI_BURG"] + data_crime["SI_ASSA"]
import random


# Year labels 2005..2019 as strings, used for the x-axis ticks below.
dates=(list(range(2005,2020)))
str_dates = [str(i) for i in dates]
#+ BK_HSGR + BK_UNEM
# Poisson regression of QN_TOT on QN_HSGR, QN_INC and QN_UNEM.
m1 = poisson('QN_TOT ~ QN_HSGR + QN_INC + QN_UNEM  ', data = data_crime).fit() #maxiter=1000, method='nm')
print (m1.summary())
Y=data_crime['QN_TOT']
print(Y)
# Fitted values for the estimation sample.
preds = m1.predict()

# Observed values (red stars) against fitted values (blue circles).
plt.plot(range(len(Y)), Y, 'r*-', range(len(Y)), preds, 'bo-')

# NOTE(review): the title says "NYC total crimes" but the plotted series is QN_TOT.
plt.title('NYC total crimes VS  queens grad rate, unemployment,income')
# The second xticks call sets the same locations again and adds the year labels.
plt.xticks(np.arange(0,15 , step=1))  # Set label locations.
plt.xticks(np.arange(0,15), [i for i in str_dates], rotation=30)
plt.show()


"""
data_row=data.iloc[2, 1:5]
Пример #27
0
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series
from pandas import read_csv
from pandas import DataFrame

# Read the file with the first column parsed as dates and used as the index.
# `read_csv(squeeze=True)` was removed in pandas 2.0; calling
# .squeeze("columns") on the result is the documented replacement.
series = read_csv('wc98_workload_hour.csv',
                  header=0,
                  parse_dates=[0],
                  index_col=0).squeeze("columns")
df = DataFrame(series)

# Bar chart of the data.
fig, ax = plt.subplots(1, 1)
df.plot(kind='bar', ax=ax)

# Intercept-only Poisson model of the `count` column.
model = smf.poisson("count ~ 1", data=df)
result = model.fit()
print(result.summary())
Пример #28
0
from statsmodels.discrete.discrete_model import Poisson

import statsmodels.stats.tests.test_anova as ttmod

# Borrow the dataset prepared by statsmodels' own ANOVA test class.
# `ols`, `glm` and `poisson` are imported outside this excerpt.
test = ttmod.TestAnova3()
test.setup_class()

# Drop the rows labelled 0, 1 and 2.
data = test.data.drop([0, 1, 2])
# OLS and GLM fits of log(Days + 1) on the Duration x Weight interaction (sum coding).
res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
              data).fit(use_t=False)

res_glm = glm("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

# Poisson fits with HC0 covariance: one with the interaction, one additive.
res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)",
                               data).fit(cov_type='HC0')
res_poi_2 = poisson("Days ~ C(Weight) + C(Duration)", data).fit(cov_type='HC0')

# Print the term-wise Wald tests of every fit.
print('\nOLS')
print(res_ols.wald_test_terms())
print('\nGLM')
print(
    res_glm.wald_test_terms(skip_single=False,
                            combine_terms=['Duration', 'Weight']))
print('\nPoisson 1')
print(
    res_poi.wald_test_terms(skip_single=False,
                            combine_terms=['Duration', 'Weight']))
print('\nPoisson 2')
print(res_poi_2.wald_test_terms(skip_single=False))

from statsmodels.discrete.discrete_model import NegativeBinomial
Пример #29
0
# Notebook export; `dataset`, `plt`, `smf` and `np` come from earlier cells.
# Use the "time" column as the index.
df = dataset.data.set_index("time")

# In[126]:

# First ten rows, transposed (a bare expression: no effect outside a notebook).
df.head(10).T

# In[127]:

# Bar chart of the data, saved as a PDF.
fig, ax = plt.subplots(1, 1, figsize=(16, 4))
df.plot(kind='bar', ax=ax)
fig.tight_layout()
fig.savefig("ch14-discoveries.pdf")

# In[128]:

# Intercept-only Poisson model of the `discoveries` column.
model = smf.poisson("discoveries ~ 1", data=df)

# In[129]:

result = model.fit()

# In[130]:

print(result.summary())

# In[131]:

# Exponentiate the fitted parameters (for this intercept-only model, the rate).
lmbda = np.exp(result.params)

# In[132]:
Пример #30
0
import patsy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import ssl
# NOTE(security): this replaces the default HTTPS context with an unverified
# one, so TLS certificates are not checked for downloads made by this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the R dataset "discoveries"; index by time and rename the column.
# `sm` and `smf` are imported outside this excerpt.
dataset = sm.datasets.get_rdataset("discoveries")
df = dataset.data.set_index('time').rename(columns={'values': 'discoveries'})
print(df.head(10).T)

fig, ax = plt.subplots(1, 1, figsize=(16, 4))
df.plot(kind='bar', ax=ax)
plt.show()
# Intercept-only Poisson model. The response is taken from `df.values`
# rather than a column name, so it works whatever the data column is called.
# NOTE(review): the rename above only applies if the column is named 'values' — confirm.
model = smf.poisson("df.values ~ 1", data=df)

result = model.fit()
print(result.summary())

# Frozen Poisson distribution at the exponentiated fitted parameter.
lmbda = np.exp(result.params)
X = stats.poisson(lmbda)
# conf_int is a method: call it so the interval is printed, not the bound method.
print(result.conf_int())

# Poisson distributions at the lower and upper confidence limits of the rate.
X_ci_l = stats.poisson(np.exp(result.conf_int().values)[0, 0])
X_ci_u = stats.poisson(np.exp(result.conf_int().values)[0, 1])

# `normed` was removed from np.histogram; `density=True` is its replacement.
v, k = np.histogram(df.values, bins=12, range=(0, 12), density=True)
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.bar(k[:-1],
       v,