Example #1
def get_city_data():
    """Helper to get city data"""
    data = pd.read_csv(pm.get_data("srrs2.dat"))
    cty_data = pd.read_csv(pm.get_data("cty.dat"))

    data = data[data.state == "MN"]

    data["fips"] = data.stfips * 1000 + data.cntyfips
    cty_data["fips"] = cty_data.stfips * 1000 + cty_data.ctfips
    data["lradon"] = np.log(np.where(data.activity == 0, 0.1, data.activity))
    data = data.merge(cty_data, "inner", on="fips")

    # assign an integer group index per county (fips)
    unique = data[["fips"]].drop_duplicates()
    unique["group"] = np.arange(len(unique))
    return data.merge(unique, "inner", on="fips")
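The helper assumes pandas as pd, numpy as np, and pymc3 as pm are already imported; a minimal usage sketch under that assumption:

import numpy as np
import pandas as pd
import pymc3 as pm

data = get_city_data()
# one row per household measurement, with an integer group index per county
print(data[["fips", "group", "lradon"]].head())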
Example #2
def get_city_data():
    """Helper to get city data"""
    data = pd.read_csv(pm.get_data('srrs2.dat'))
    cty_data = pd.read_csv(pm.get_data('cty.dat'))

    data = data[data.state == 'MN']

    data['fips'] = data.stfips * 1000 + data.cntyfips
    cty_data['fips'] = cty_data.stfips * 1000 + cty_data.ctfips
    data['lradon'] = np.log(np.where(data.activity == 0, .1, data.activity))
    data = data.merge(cty_data, 'inner', on='fips')

    # assign an integer group index per county (fips)
    unique = data[['fips']].drop_duplicates()
    unique['group'] = np.arange(len(unique))
    return data.merge(unique, 'inner', on='fips')
Example #3
    def time_glm_hierarchical(self):
        data = pd.read_csv(pm.get_data('radon.csv'))
        data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
        county_idx = data.county_code.values

        n_counties = len(data.county.unique())
        with pm.Model():
            # Hyperpriors for group nodes
            mu_a = pm.Normal('mu_a', mu=0., sd=100**2)
            sigma_a = pm.HalfCauchy('sigma_a', 5)
            mu_b = pm.Normal('mu_b', mu=0., sd=100**2)
            sigma_b = pm.HalfCauchy('sigma_b', 5)

            # Intercept for each county, distributed around group mean mu_a
            # Above we just set mu and sd to a fixed value while here we
            # plug in a common group distribution for all a and b (which are
            # vectors of length n_counties).
            a = pm.Normal('a', mu=mu_a, sd=sigma_a, shape=n_counties)
            # Slope for each county, distributed around group mean mu_b
            b = pm.Normal('b', mu=mu_b, sd=sigma_b, shape=n_counties)

            # Model error
            eps = pm.HalfCauchy('eps', 5)

            radon_est = a[county_idx] + b[county_idx] * data.floor.values

            # Data likelihood
            pm.Normal('radon_like',
                      mu=radon_est,
                      sd=eps,
                      observed=data.log_radon)
            pm.sample(draws=2000, njobs=4)
Example #4
def get_city_data():
    """Helper to get city data"""
    data = pd.read_csv(pm.get_data('srrs2.dat'))
    cty_data = pd.read_csv(pm.get_data('cty.dat'))

    data = data[data.state == 'MN']

    data['fips'] = data.stfips * 1000 + data.cntyfips
    cty_data['fips'] = cty_data.stfips * 1000 + cty_data.ctfips
    data['lradon'] = np.log(np.where(data.activity == 0, .1, data.activity))
    data = data.merge(cty_data, 'inner', on='fips')

    # assign an integer group index per county (fips)
    unique = data[['fips']].drop_duplicates()
    unique['group'] = np.arange(len(unique))
    return data.merge(unique, 'inner', on='fips')
Example #5
    def setup(self, step, init):
        """Initialize model and get start position"""
        np.random.seed(123)
        self.chains = 4
        data = pd.read_csv(pm.get_data('radon.csv'))
        data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
        county_idx = data.county_code.values
        n_counties = len(data.county.unique())
        with pm.Model() as self.model:
            mu_a = pm.Normal('mu_a', mu=0., sd=100**2)
            sigma_a = pm.HalfCauchy('sigma_a', 5)

            mu_b = pm.Normal('mu_b', mu=0., sd=100**2)
            sigma_b = pm.HalfCauchy('sigma_b', 5)

            a = pm.Normal('a', mu=mu_a, sd=sigma_a, shape=n_counties)
            b = pm.Normal('b', mu=mu_b, sd=sigma_b, shape=n_counties)
            eps = pm.HalfCauchy('eps', 5)

            radon_est = a[county_idx] + b[county_idx] * data.floor.values

            pm.Normal('radon_like',
                      mu=radon_est,
                      sd=eps,
                      observed=data.log_radon)
            self.start, _ = pm.init_nuts(chains=self.chains, init=init)
Example #6
    def build_model(self):
        data = pd.read_csv(pm.get_data('wells.dat'),
                           delimiter=u' ', index_col=u'id', dtype={u'switch': np.int8})
        data.dist /= 100
        data.educ /= 4
        col = data.columns
        P = data[col[1:]]
        P -= P.mean()
        P['1'] = 1

        with pm.Model() as model:
            effects = pm.Normal('effects', mu=0, tau=100. ** -2, shape=len(P.columns))
            p = tt.nnet.sigmoid(tt.dot(floatX(np.array(P)), effects))
            pm.Bernoulli('s', p, observed=floatX(np.array(data.switch)))
        return model
Example #7
    def build_model(self):
        data = pd.read_csv(pm.get_data('wells.dat'),
                           delimiter=u' ', index_col=u'id', dtype={u'switch': np.int8})
        data.dist /= 100
        data.educ /= 4
        col = data.columns
        P = data[col[1:]]
        P -= P.mean()
        P['1'] = 1

        with pm.Model() as model:
            effects = pm.Normal('effects', mu=0, tau=100. ** -2, shape=len(P.columns))
            p = tt.nnet.sigmoid(tt.dot(floatX(np.array(P)), effects))
            pm.Bernoulli('s', p, observed=floatX(np.array(data.switch)))
        return model
Example #8
    def build_model(self):
        data = pd.read_csv(pm.get_data('wells.dat'),
                           delimiter=' ', index_col='id',
                           dtype={'switch': np.int8})
        data.dist /= 100
        data.educ /= 4
        col = data.columns
        P = data[col[1:]]
        P -= P.mean()
        P['1'] = 1

        with pm.Model() as model:
            effects = pm.Normal('effects', mu=0, sigma=100, shape=len(P.columns))
            logit_p = tt.dot(floatX(np.array(P)), effects)
            pm.Bernoulli('s', logit_p=logit_p, observed=floatX(data.switch.values))
        return model
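Each of these wells variants returns an unsampled model; a minimal usage sketch (bench is a hypothetical instance of the enclosing benchmark class, and draw counts are illustrative):

model = bench.build_model()
with model:
    trace = pm.sample(1000, tune=1000)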
Example #9
def build_model():
    data = np.loadtxt(pm.get_data('efron-morris-75-data.tsv'), delimiter="\t", 
                      skiprows=1, usecols=(2,3))
    
    atbats = pm.floatX(data[:,0])
    hits = pm.floatX(data[:,1])
    
    N = len(hits)
    
    # we want to bound the kappa below
    BoundedKappa = pm.Bound(pm.Pareto, lower=1.0)
    
    with pm.Model() as model:
        phi = pm.Uniform('phi', lower=0.0, upper=1.0)
        kappa = BoundedKappa('kappa', alpha=1.0001, m=1.5)
        thetas = pm.Beta('thetas', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=N)
        ys = pm.Binomial('ys', n=atbats, p=thetas, observed=hits)
    return model
Example #10
    def build_model(self):
        data = pd.read_csv(
            pm.get_data("wells.dat"),
            delimiter=" ",
            index_col="id",
            dtype={"switch": np.int8},
        )
        data.dist /= 100
        data.educ /= 4
        col = data.columns
        P = data[col[1:]]
        P -= P.mean()
        P["1"] = 1

        with pm.Model() as model:
            effects = pm.Normal("effects", mu=0, sigma=100, shape=len(P.columns))
            logit_p = at.dot(floatX(np.array(P)), effects)
            pm.Bernoulli("s", logit_p=logit_p, observed=floatX(data.switch.values))
        return model
Example #11
def build_model():
    data = np.loadtxt(
        pm.get_data("efron-morris-75-data.tsv"), delimiter="\t", skiprows=1, usecols=(2, 3)
    )

    atbats = pm.floatX(data[:, 0])
    hits = pm.floatX(data[:, 1])

    N = len(hits)

    # we want to bound the kappa below
    BoundedKappa = pm.Bound(pm.Pareto, lower=1.0)

    with pm.Model() as model:
        phi = pm.Uniform("phi", lower=0.0, upper=1.0)
        kappa = BoundedKappa("kappa", alpha=1.0001, m=1.5)
        thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, shape=N)
        ys = pm.Binomial("ys", n=atbats, p=thetas, observed=hits)
    return model
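The returned model can then be sampled and the per-player batting abilities inspected; a short sketch (draw counts are illustrative):

model = build_model()
with model:
    trace = pm.sample(1000, tune=1000)

# posterior mean ability per player
print(trace["thetas"].mean(axis=0))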
Example #12
    def time_glm_hierarchical(self):
        data = pd.read_csv(pm.get_data('radon.csv'))
        data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
        county_idx = data.county_code.values

        n_counties = len(data.county.unique())
        with pm.Model():
            mu_a = pm.Normal('mu_a', mu=0., sd=100**2)
            sigma_a = pm.HalfCauchy('sigma_a', 5)
            mu_b = pm.Normal('mu_b', mu=0., sd=100**2)
            sigma_b = pm.HalfCauchy('sigma_b', 5)
            a = pm.Normal('a', mu=mu_a, sd=sigma_a, shape=n_counties)
            b = pm.Normal('b', mu=mu_b, sd=sigma_b, shape=n_counties)
            eps = pm.HalfCauchy('eps', 5)
            radon_est = a[county_idx] + b[county_idx] * data.floor.values
            pm.Normal('radon_like',
                      mu=radon_est,
                      sd=eps,
                      observed=data.log_radon)
            pm.sample(draws=2000, njobs=4)
Example #13
def glm_hierarchical_model(random_seed=123):
    """Sample glm hierarchical model to use in benchmarks"""
    np.random.seed(random_seed)
    data = pd.read_csv(pm.get_data("radon.csv"))
    data["log_radon"] = data["log_radon"].astype(aesara.config.floatX)
    county_idx = data.county_code.values

    n_counties = len(data.county.unique())
    with pm.Model() as model:
        mu_a = pm.Normal("mu_a", mu=0.0, sd=100**2)
        sigma_a = pm.HalfCauchy("sigma_a", 5)
        mu_b = pm.Normal("mu_b", mu=0.0, sd=100**2)
        sigma_b = pm.HalfCauchy("sigma_b", 5)
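        # Non-centered parameterization: sample standard-normal offsets here,
        # then shift and scale them by the group-level mean and spread below.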
        a = pm.Normal("a", mu=0, sd=1, shape=n_counties)
        b = pm.Normal("b", mu=0, sd=1, shape=n_counties)
        a = mu_a + sigma_a * a
        b = mu_b + sigma_b * b
        eps = pm.HalfCauchy("eps", 5)
        radon_est = a[county_idx] + b[county_idx] * data.floor.values
        pm.Normal("radon_like", mu=radon_est, sd=eps, observed=data.log_radon)
    return model
Example #14
def glm_hierarchical_model(random_seed=123):
    """Sample glm hierarchical model to use in benchmarks"""
    np.random.seed(random_seed)
    data = pd.read_csv(pm.get_data('radon.csv'))
    data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
    county_idx = data.county_code.values

    n_counties = len(data.county.unique())
    with pm.Model() as model:
        mu_a = pm.Normal('mu_a', mu=0., sd=100**2)
        sigma_a = pm.HalfCauchy('sigma_a', 5)
        mu_b = pm.Normal('mu_b', mu=0., sd=100**2)
        sigma_b = pm.HalfCauchy('sigma_b', 5)
        a = pm.Normal('a', mu=0, sd=1, shape=n_counties)
        b = pm.Normal('b', mu=0, sd=1, shape=n_counties)
        a = mu_a + sigma_a * a
        b = mu_b + sigma_b * b
        eps = pm.HalfCauchy('eps', 5)
        radon_est = a[county_idx] + b[county_idx] * data.floor.values
        pm.Normal('radon_like', mu=radon_est, sd=eps, observed=data.log_radon)
    return model
Example #15
def glm_hierarchical_model(random_seed=123):
    """Sample glm hierarchical model to use in benchmarks"""
    np.random.seed(random_seed)
    data = pd.read_csv(pm.get_data('radon.csv'))
    data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
    county_idx = data.county_code.values

    n_counties = len(data.county.unique())
    with pm.Model() as model:
        mu_a = pm.Normal('mu_a', mu=0., sd=100**2)
        sigma_a = pm.HalfCauchy('sigma_a', 5)
        mu_b = pm.Normal('mu_b', mu=0., sd=100**2)
        sigma_b = pm.HalfCauchy('sigma_b', 5)
        a = pm.Normal('a', mu=0, sd=1, shape=n_counties)
        b = pm.Normal('b', mu=0, sd=1, shape=n_counties)
        a = mu_a + sigma_a * a
        b = mu_b + sigma_b * b
        eps = pm.HalfCauchy('eps', 5)
        radon_est = a[county_idx] + b[county_idx] * data.floor.values
        pm.Normal('radon_like', mu=radon_est, sd=eps, observed=data.log_radon)
    return model
Example #16
    def build_model(self):
        data = pd.read_csv(pm.get_data('wells.dat'),
                           delimiter=' ',
                           index_col='id',
                           dtype={'switch': np.int8})
        data.dist /= 100
        data.educ /= 4
        col = data.columns
        P = data[col[1:]]
        P -= P.mean()
        P['1'] = 1

        with pm.Model() as model:
            effects = pm.Normal('effects',
                                mu=0,
                                sigma=100,
                                shape=len(P.columns))
            logit_p = tt.dot(floatX(np.array(P)), effects)
            pm.Bernoulli('s',
                         logit_p=logit_p,
                         observed=floatX(data.switch.values))
        return model
Example #17
"""
Getting started with pymc3.
Based on https://docs.pymc.io/notebooks/getting_started.html
"""

# %%
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np

returns = pd.read_csv(
    pm.get_data("SP500.csv"), parse_dates=True, index_col=0, usecols=["Date", "change"]
).query("Date < '2009-12-31'")
returns

# %%
returns.plot(figsize=(10, 6))
plt.ylabel("daily returns in %")

# %%
with pm.Model() as sp500_model:
    nu = pm.Exponential("nu", 1 / 10.0, testval=5.0)
    sigma = pm.Exponential("sigma", 1 / 0.02, testval=0.1)

    s = pm.GaussianRandomWalk("s", sigma=sigma, shape=len(returns))
    volatility_process = pm.Deterministic(
        "volatility_process", pm.math.exp(-2 * s) ** 0.5
    )

    r = pm.StudentT("r", nu=nu, sigma=volatility_process, observed=returns["change"])
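The notebook then fits the model with NUTS; a sketch of that step (draw counts are illustrative):

# %%
with sp500_model:
    trace = pm.sample(2000, tune=2000, target_accept=0.9)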
Example #18
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import pandas as pd
import theano

"""
Hierarchical GLM.
"""

data = pd.read_csv(pm.get_data("radon.csv"))
data["log_radon"] = data["log_radon"].astype(theano.config.floatX)
county_names = data.county.unique()
county_idx = data.county_code.values

n_counties = len(data.county.unique())

# Unpooled (non-hierarchical model)

with pm.Model() as unpooled_model:

    # Independent parameters for each county
    a = pm.Normal("a", 0, sigma=100, shape=n_counties)
    b = pm.Normal("a", 0, sigma=100, shape=n_counties)

    # Model error
    eps = pm.HalfCauchy("eps", 5)

    # Model prediction of radon level
    # a[county_idx] translates to a[0, 0, 0, 1, 1, ...],
    # we thus link multiple household measures of a county to its coefficients
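    # (the snippet is cut off here; a completion sketched from the matching
    # hierarchical variants in Examples #3 and #5)
    radon_est = a[county_idx] + b[county_idx] * data.floor.values

    # Data likelihood
    radon_like = pm.Normal("radon_like", mu=radon_est, sigma=eps,
                           observed=data.log_radon)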
Example #19
#https://docs.pymc.io/notebooks/stochastic_volatility.html
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
import pymc3 as pm
from pymc3.distributions.timeseries import GaussianRandomWalk
from scipy import optimize

import pandas as pd
n = 400
returns = pd.read_csv(pm.get_data("SP500.csv"), index_col='date')['change']
returns[:5]

fig, ax = plt.subplots(figsize=(14, 8))
returns.plot(label='S&P500')
ax.set(xlabel='time', ylabel='returns')
ax.legend()

with pm.Model() as model:
    step_size = pm.Exponential('sigma', 50.)
    s = GaussianRandomWalk('s', sigma=step_size, shape=len(returns))

    nu = pm.Exponential('nu', .1)

    r = pm.StudentT('r', nu=nu, lam=pm.math.exp(-2 * s), observed=returns)

with model:
    trace = pm.sample(tune=2000, target_accept=0.9)

pm.traceplot(trace, var_names=['sigma', 'nu'])
Example #20
#!/usr/bin/env python
# coding: utf-8

# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import pandas as pd
import theano

data = pd.read_csv(pm.get_data('radon.csv'))
data['log_radon'] = data['log_radon'].astype(theano.config.floatX)
county_names = data.county.unique()
county_idx = data.county_code.values

n_counties = len(data.county.unique())


# In[4]:


data[['county', 'log_radon', 'floor']].head()

Example #21
"""
Dirichlet process mixture model for a density estimation problem.

source: https://docs.pymc.io/notebooks/dp_mix.html
"""
from matplotlib import pyplot as plt
import numpy as np
import pymc3 as pm
import scipy as sp
import seaborn as sns
from theano import tensor as tt
import pandas as pd

SEED = 5132290  # from random.org
np.random.seed(SEED)

old_faithful_df = pd.read_csv(pm.get_data('old_faithful.csv'))

old_faithful_df['std_waiting'] = (
    old_faithful_df.waiting -
    old_faithful_df.waiting.mean()) / old_faithful_df.waiting.std()

# fig, ax = plt.subplots(figsize=(8, 6))

# n_bins = 20
# ax.hist(old_faithful_df.std_waiting, bins=n_bins, lw=0, alpha=0.5)

# ax.set_xlabel('Standardized waiting time between eruptions')
# ax.set_ylabel('Number of eruptions')
# plt.show()

N = old_faithful_df.shape[0]  # num of components = num of data points
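The snippet stops right after fixing the truncation level; the linked dp_mix notebook continues with a stick-breaking construction of the mixture weights, roughly as follows (a sketch, not guaranteed to match the notebook verbatim):

def stick_breaking(beta):
    portion_remaining = tt.concatenate([[1], tt.extra_ops.cumprod(1 - beta)[:-1]])
    return beta * portion_remaining

with pm.Model() as model:
    alpha = pm.Gamma('alpha', 1., 1.)
    beta = pm.Beta('beta', 1., alpha, shape=N)
    w = pm.Deterministic('w', stick_breaking(beta))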
Example #22
#
# Demonstrates the usage of hierarchical partial pooling
# See http://mc-stan.org/documentation/case-studies/pool-binary-trials.html for more details
#

import pymc3 as pm
import numpy as np
import theano

data = np.loadtxt(pm.get_data('efron-morris-75-data.tsv'),
                  delimiter="\t",
                  skiprows=1,
                  usecols=(2, 3))

atBats = data[:, 0].astype(theano.config.floatX)
hits = data[:, 1].astype(theano.config.floatX)

N = len(hits)

model = pm.Model()

# we want to bound the kappa below
BoundedKappa = pm.Bound(pm.Pareto, lower=1.0)

with model:
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)
    kappa = BoundedKappa('kappa', alpha=1.0001, m=1.5)
    thetas = pm.Beta('thetas',
                     alpha=phi * kappa,
                     beta=(1.0 - phi) * kappa,
                     shape=N)
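    # (the example is cut off here; Examples #9 and #11 above show how this
    # same model finishes)
    ys = pm.Binomial('ys', n=atBats, p=thetas, observed=hits)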
Example #23
import pymc3 as pm
import pandas as pd
from numpy.ma import masked_values

# Import data, filling missing values with sentinels (-999)
test_scores = pd.read_csv(pm.get_data("test_scores.csv")).fillna(-999)

# Extract variables: test score, gender, number of siblings, previous disability, age,
# mother with HS education or better, hearing loss identified by 3 months
# of age
(score, male, siblings, disability, age, mother_hs,
 early_ident) = (test_scores[[
     "score", "male", "siblings", "prev_disab", "age_test", "mother_hs",
     "early_ident"
 ]].astype(float).values.T)

with pm.Model() as model:
    # Impute missing values
    sib_mean = pm.Exponential("sib_mean", 1.0)
    siblings_imp = pm.Poisson("siblings_imp", sib_mean, observed=siblings)

    p_disab = pm.Beta("p_disab", 1.0, 1.0)
    disability_imp = pm.Bernoulli("disability_imp",
                                  p_disab,
                                  observed=masked_values(disability,
                                                         value=-999))

    p_mother = pm.Beta("p_mother", 1.0, 1.0)
    mother_imp = pm.Bernoulli("mother_imp",
                              p_mother,
                              observed=masked_values(mother_hs, value=-999))
Example #24
import pymc3 as pm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

plt.style.use('seaborn-darkgrid')

data = pd.read_csv(pm.get_data("ml_100k_u.data"),
                   sep='\t',
                   names=["userid", "itemid", "rating", "timestamp"])
movie_columns = [
    'movie id', 'movie title', 'release date', 'video release date',
    'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
    'Western'
]
movies = pd.read_csv(pm.get_data("ml_100k_u.item"),
                     sep="|",
                     names=movie_columns,
                     index_col="movie id",
                     parse_dates=['release date'])

# Extract the ratings from the DataFrame
ratings = data.rating

movie_means = data.join(movies['movie title'],
                        on='itemid').groupby('movie title').rating.mean()

user_means = data.groupby('userid').rating.mean().sort_values()
Example #25
import pymc3 as pm
import pandas as pd
from numpy.ma import masked_values

# Import data, filling missing values with sentinels (-999)
test_scores = pd.read_csv(pm.get_data('test_scores.csv')).fillna(-999)

# Extract variables: test score, gender, number of siblings, previous disability, age,
# mother with HS education or better, hearing loss identified by 3 months
# of age
(score, male, siblings, disability,
    age, mother_hs, early_ident) = test_scores[['score', 'male', 'siblings',
                                                'prev_disab', 'age_test',
                                                'mother_hs', 'early_ident']].astype(float).values.T

with pm.Model() as model:
    # Impute missing values
    sib_mean = pm.Exponential('sib_mean', 1.)
    siblings_imp = pm.Poisson('siblings_imp', sib_mean,
                              observed=siblings)

    p_disab = pm.Beta('p_disab', 1., 1.)
    disability_imp = pm.Bernoulli(
        'disability_imp', p_disab, observed=masked_values(disability, value=-999))

    p_mother = pm.Beta('p_mother', 1., 1.)
    mother_imp = pm.Bernoulli('mother_imp', p_mother,
                              observed=masked_values(mother_hs, value=-999))

    s = pm.HalfCauchy('s', 5., testval=5)
    beta = pm.Laplace('beta', 0., 100., shape=7, testval=.1)
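    # (the snippet is cut off here; a plausible completion that links the
    # seven Laplace coefficients to the intercept plus six predictors and
    # adds the score likelihood; a sketch, not necessarily the source's
    # exact code)
    expected_score = (beta[0] + beta[1] * male + beta[2] * siblings_imp +
                      beta[3] * disability_imp + beta[4] * age +
                      beta[5] * mother_imp + beta[6] * early_ident)
    observed_score = pm.Normal('observed_score', expected_score, s,
                               observed=score)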
Example #26
# of the unpooled county estimates and the pooled estimates.
# y = a[cluster] + b[cluster] * x
# adaptive prior for a[cluster] = Normal(a_bar, a_sigma_bar)
# adaptive prior for b[cluster] = Normal(b_bar, b_sigma_bar)
# prior for a_bar
# prior for a_sigma_bar
# prior for b_bar
# prior for b_sigma_bar

# partially pooled taking into account the relationship between a and b in the data
# by drawing on the variance-covariance matrix for the parameters a and b
# so here the adaptive prior uses this, draws from this?

# -----------------------------------------------------------------------------
# Import radon data
srrs2 = pd.read_csv(get_data('srrs2.dat'))
srrs2.columns = srrs2.columns.map(str.strip)
srrs_mn = srrs2[srrs2.state == 'MN'].copy()
srrs_mn.shape
srrs_mn.head()

srrs_mn['fips'] = srrs_mn.stfips * 1000 + srrs_mn.cntyfips
cty = pd.read_csv(get_data('cty.dat'))
cty_mn = cty[cty.st == 'MN'].copy()
cty_mn['fips'] = 1000 * cty_mn.stfips + cty_mn.ctfips

srrs_mn = srrs_mn.merge(cty_mn[['fips', 'Uppm']], on='fips')
srrs_mn = srrs_mn.drop_duplicates(subset='idnum')
u = np.log(srrs_mn.Uppm)
n = len(srrs_mn)
Example #27
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pymc3 import HalfCauchy, Model, Normal, get_data, sample
from pymc3.distributions.timeseries import GaussianRandomWalk

data = pd.read_csv(get_data("pancreatitis.csv"))
countries = ["CYP", "DNK", "ESP", "FIN", "GBR", "ISL"]
data = data[data.area.isin(countries)]

age = data["age"] = np.array(data.age_start + data.age_end) / 2
rate = data.value = data.value * 1000
group, countries = pd.factorize(data.area, order=countries)

ncountries = len(countries)

for i, country in enumerate(countries):
    plt.subplot(2, 3, i + 1)
    plt.title(country)
    d = data[data.area == country]
    plt.plot(d.age, d.value, ".")

    plt.ylim(0, rate.max())

nknots = 10
knots = np.linspace(data.age_start.min(), data.age_end.max(), nknots)


def interpolate(x0, y0, x, group):
    x = np.array(x)
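    # (the example is cut off inside interpolate; below is a sketch of one
    # way to finish it: piecewise-linear interpolation between the knots x0,
    # written with searchsorted weights so it also works when y0 is a
    # symbolic tensor indexed as y0[knot, group]; this is an assumption,
    # not necessarily the source's exact code)
    group = np.array(group)

    # locate each x between two knots (clipped so the boundaries stay valid)
    idx = np.clip(np.searchsorted(x0, x), 1, len(x0) - 1)
    dl = x - x0[idx - 1]
    dr = x0[idx] - x
    wl = dr / (dl + dr)

    # linear blend of the two surrounding knot values
    return wl * y0[idx - 1, group] + (1 - wl) * y0[idx, group]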
Example #28
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymc3 import HalfCauchy, Model, Normal, get_data, sample
from pymc3.distributions.timeseries import GaussianRandomWalk

data = pd.read_csv(get_data('pancreatitis.csv'))
countries = ['CYP', 'DNK', 'ESP', 'FIN', 'GBR', 'ISL']
data = data[data.area.isin(countries)]

age = data['age'] = np.array(data.age_start + data.age_end) / 2
rate = data.value = data.value * 1000
group, countries = pd.factorize(data.area, order=countries)


ncountries = len(countries)

for i, country in enumerate(countries):
    plt.subplot(2, 3, i + 1)
    plt.title(country)
    d = data[data.area == country]
    plt.plot(d.age, d.value, '.')

    plt.ylim(0, rate.max())


nknots = 10
knots = np.linspace(data.age_start.min(), data.age_end.max(), nknots)

Example #29
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import datetime as dt
import numpy as np
import pymc3 as pm

py.init_notebook_mode(connected=True)


def dates_to_idx(timelist):
    reference_time = pd.to_datetime('1958-03-15')
    t = (timelist - reference_time) / pd.Timedelta(1, "Y")
    return np.asarray(t)


data_monthly = pd.read_csv(pm.get_data("monthly_in_situ_co2_mlo.csv"),
                           header=56)
data_monthly.replace(to_replace=-99.99, value=np.nan, inplace=True)

cols = [
    "year", "month", "--", "--", "CO2", "seasonaly_adjusted", "fit",
    "seasonally_adjusted_fit", "CO2_filled", "seasonally_adjusted_filled"
]
data_monthly.columns = cols
cols.remove("--")
cols.remove("--")
data_monthly = data_monthly[cols]

data_monthly["day"] = 15
data_monthly.index = pd.to_datetime(data_monthly[["year", "month", "day"]])
cols.remove("year")
Example #30
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 21 17:10:07 2019

@author: Alex
https://docs.pymc.io/notebooks/hierarchical_partial_pooling.html
"""

import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import theano.tensor as tt

dataBB = pd.read_csv(pm.get_data('efron-morris-75-data.tsv'), sep="\t")
at_bats, hits = dataBB[['At-Bats', 'Hits']].values.T

#%%
N = len(hits)

with pm.Model() as baseball_model:

    phi = pm.Uniform('phi', lower=0.0, upper=1.0)

    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    thetas = pm.Beta('thetas',
                     alpha=phi * kappa,
                     beta=(1.0 - phi) * kappa,
                     shape=N)
Example #31
#%matplotlib inline
import numpy as np
import pandas as pd
from pymc3 import __version__
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')
print('Running on PyMC3 v{}'.format(__version__))

from pymc3 import get_data

# Import radon data
srrs2 = pd.read_csv(get_data('srrs2.dat'))
srrs2.columns = srrs2.columns.map(str.strip)
srrs_mn = srrs2[srrs2.state == 'MN'].copy()
"""
import pandas as pd
import numpy as np
import seaborn as sns
#import pandas.util.testing as tm


df = pd.read_csv('inverts_database_CSV.csv', encoding = "ISO-8859-1", engine='python')


df = df.drop(['Date','management', 'Ecoregion', 'den_per_HA', 'density_for_summed_transect area_#permeter'], axis=1)
df = df.drop(['den_for_transect_#permeter', 'total transect area', 'Transect_area', 't_Width', 't_Length', 'Abundance', 'habitat_type', 'total number of transect'], axis= 1)
df['den'] = df['den_per_100m2']
df['den'] = pd.to_numeric(df['den'], errors='coerce') # make den a float

print(df.head())
Example #32
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymc3 import HalfCauchy, Model, Normal, get_data, sample
from pymc3.distributions.timeseries import GaussianRandomWalk

data = pd.read_csv(get_data('pancreatitis.csv'))
countries = ['CYP', 'DNK', 'ESP', 'FIN', 'GBR', 'ISL']
data = data[data.area.isin(countries)]

age = data['age'] = np.array(data.age_start + data.age_end) / 2
rate = data.value = data.value * 1000
group, countries = pd.factorize(data.area, order=countries)

ncountries = len(countries)

for i, country in enumerate(countries):
    plt.subplot(2, 3, i + 1)
    plt.title(country)
    d = data[data.area == country]
    plt.plot(d.age, d.value, '.')

    plt.ylim(0, rate.max())

nknots = 10
knots = np.linspace(data.age_start.min(), data.age_end.max(), nknots)


def interpolate(x0, y0, x, group):
    x = np.array(x)
Example #33
"""
Created on Tue Jul  9 13:38:43 2019

@author: Alex
"""

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
import pymc3 as pm
print('Running on PyMC3 v{}'.format(pm.__version__))
import pandas as pd

# The Data
# Our data consist of 401 daily returns of the S&P 500 stock market index during the 2008 financial crisis.

returns = pd.read_csv(pm.get_data('SP500.csv'), parse_dates=True, index_col=0)

len(returns)

returns.plot(figsize=(10, 6))
plt.ylabel('daily returns in %')

print('\n--- The Data ---')
#%% Model Specification

with pm.Model() as sp500_model:
    nu = pm.Exponential('nu', 1 / 10., testval=5.)
    sigma = pm.Exponential('sigma', 1 / 0.02, testval=.1)

    s = pm.GaussianRandomWalk('s', sigma=sigma, shape=len(returns))
    volatility_process = pm.Deterministic('volatility_process',
                                          pm.math.exp(-2 * s)**0.5)
Example #34
    "figure.facecolor":
    "#fffff8",
    "axes.facecolor":
    "#fffff8",
    "figure.constrained_layout.use":
    True,
    "font.size":
    14.0,
    "hist.bins":
    "auto",
    "lines.linewidth":
    1.0,
})
# %% import data
returns = pd.read_csv(
    pm.get_data("E:\\Users/Corly/Documents/GitHub/Python/self_course/dissertation/Data/SZ.csv"),
    index_col="trade_date",
)
returns["change"] = np.log(returns["close"]).diff()
returns = returns.dropna()
returns.head()

# %%
fig, ax = plt.subplots(figsize=(14, 4))
returns.plot(y="change", label="SZ", ax=ax)
ax.set(xlabel="time", ylabel="returns")
ax.legend()

# %%
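The log returns can then be plugged into the same stochastic-volatility model used for the S&P 500 data in Example #17; a sketch under that assumption (sz_model is a hypothetical name, and testvals are illustrative):

with pm.Model() as sz_model:
    nu = pm.Exponential("nu", 1 / 10.0, testval=5.0)
    sigma = pm.Exponential("sigma", 1 / 0.02, testval=0.1)

    s = pm.GaussianRandomWalk("s", sigma=sigma, shape=len(returns))
    volatility_process = pm.Deterministic(
        "volatility_process", pm.math.exp(-2 * s) ** 0.5
    )

    r = pm.StudentT("r", nu=nu, sigma=volatility_process,
                    observed=returns["change"])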