def _build_model(self, sharpes, corr):
    """
    Build the entire author model (in one function). The model
    is sufficiently simple to specify entirely in one function.

    Parameters
    ----------
    sharpes : pd.DataFrame
        Long-format DataFrame of in-sample Sharpe ratios (from user-run
        backtests), indexed by user, algorithm and code ID.
        Note that currently, backtests are deduplicated based on code id.
        See fit_authors for more information.
    corr : np.ndarray
        Correlation matrix of returns streams (from backtests), estimated
        using Ledoit-Wolf shrinkage.
        See fit_authors for more information.
    """
    with pm.Model() as model:
        mu_global = pm.Normal('mu_global', mu=0, sd=3)

        mu_author_sd = pm.HalfNormal('mu_author_sd', sd=1)
        mu_author_raw = pm.Normal('mu_author_raw', mu=0, sd=1,
                                  shape=self.num_authors)
        mu_author = pm.Deterministic('mu_author',
                                     mu_author_sd * mu_author_raw)

        mu_algo_sd = pm.HalfNormal('mu_algo_sd', sd=1)
        mu_algo_raw = pm.Normal('mu_algo_raw', mu=0, sd=1,
                                shape=self.num_algos)
        mu_algo = pm.Deterministic('mu_algo', mu_algo_sd * mu_algo_raw)

        mu_backtest = \
            pm.Deterministic('mu_backtest',
                             mu_global
                             + mu_author[self.author_to_backtest_encoding]
                             + mu_algo[self.algo_to_backtest_encoding])

        sigma_backtest = pm.Deterministic(
            'sigma_backtest',
            tt.sqrt(APPROX_BDAYS_PER_YEAR / sharpes.meta_trading_days))

        cov = corr * sigma_backtest[:, None] * sigma_backtest[None, :]

        alpha_author = pm.Deterministic('alpha_author',
                                        mu_global + mu_author)

        alpha_algo = \
            pm.Deterministic('alpha_algo',
                             mu_global
                             + mu_author[self.author_to_algo_encoding]
                             + mu_algo)

        sharpe = pm.MvNormal('sharpe',
                             mu=mu_backtest,
                             cov=cov,
                             shape=self.num_backtests,
                             observed=sharpes.sharpe_ratio)

    return model
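# A minimal illustrative sketch (not part of the original module) of the scaling used for
# `sigma_backtest` above: the model treats the observation noise of a backtest's annualized
# Sharpe ratio as sqrt(APPROX_BDAYS_PER_YEAR / num_trading_days), so a roughly one-year
# backtest gets a standard deviation of about 1. The constant value below is an assumption
# made for this example only.
import numpy as np

APPROX_BDAYS_PER_YEAR = 252  # assumed value of the module-level constant

def sharpe_standard_error(num_trading_days):
    """Standard deviation assigned to an observed annualized Sharpe ratio."""
    return np.sqrt(APPROX_BDAYS_PER_YEAR / num_trading_days)

# e.g. sharpe_standard_error(252) -> 1.0, sharpe_standard_error(4 * 252) -> 0.5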
def model_best(y1, y2, samples=1000):
    """
    Bayesian Estimation Supersedes the T-Test

    This model runs a Bayesian hypothesis test comparing whether y1 and y2
    come from the same distribution. Returns are assumed to be T-distributed.

    In addition, computes annual volatility and Sharpe of in and
    out-of-sample periods.

    This model replicates the example used in:
    Kruschke, John. (2012) Bayesian estimation supersedes the t test.
    Journal of Experimental Psychology: General.

    Parameters
    ----------
    y1 : array-like
        Array of returns (e.g. in-sample)
    y2 : array-like
        Array of returns (e.g. out-of-sample)
    samples : int, optional
        Number of posterior samples to draw.

    Returns
    -------
    model : pymc3.Model object
        PyMC3 model containing all random variables.
    trace : pymc3.sampling.BaseTrace object
        A PyMC3 trace object that contains samples for each parameter
        of the posterior.

    See Also
    --------
    plot_stoch_vol : plotting of stochastic volatility model
    """
    y = np.concatenate((y1, y2))

    mu_m = np.mean(y)
    mu_p = 0.000001 * 1 / np.std(y)**2

    sigma_low = np.std(y) / 1000
    sigma_high = np.std(y) * 1000
    with pm.Model() as model:
        group1_mean = pm.Normal('group1_mean', mu=mu_m, tau=mu_p,
                                testval=y1.mean())
        group2_mean = pm.Normal('group2_mean', mu=mu_m, tau=mu_p,
                                testval=y2.mean())
        group1_std = pm.Uniform('group1_std', lower=sigma_low,
                                upper=sigma_high, testval=y1.std())
        group2_std = pm.Uniform('group2_std', lower=sigma_low,
                                upper=sigma_high, testval=y2.std())
        nu = pm.Exponential('nu_minus_two', 1 / 29., testval=4.) + 2.

        returns_group1 = StudentT('group1', nu=nu, mu=group1_mean,
                                  lam=group1_std**-2, observed=y1)
        returns_group2 = StudentT('group2', nu=nu, mu=group2_mean,
                                  lam=group2_std**-2, observed=y2)

        diff_of_means = pm.Deterministic('difference of means',
                                         group2_mean - group1_mean)
        pm.Deterministic('difference of stds',
                         group2_std - group1_std)
        pm.Deterministic('effect size', diff_of_means /
                         pm.math.sqrt((group1_std**2 +
                                       group2_std**2) / 2))

        pm.Deterministic('group1_annual_volatility',
                         returns_group1.distribution.variance**.5 *
                         np.sqrt(252))
        pm.Deterministic('group2_annual_volatility',
                         returns_group2.distribution.variance**.5 *
                         np.sqrt(252))

        pm.Deterministic('group1_sharpe', returns_group1.distribution.mean /
                         returns_group1.distribution.variance**.5 *
                         np.sqrt(252))
        pm.Deterministic('group2_sharpe', returns_group2.distribution.mean /
                         returns_group2.distribution.variance**.5 *
                         np.sqrt(252))

        step = pm.NUTS()

        trace = pm.sample(samples, step)
    return model, trace
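# A hypothetical usage sketch for `model_best` (illustrative only): the return arrays below
# are random placeholders, not data from the original source.
import numpy as np
import pymc3 as pm

returns_in = np.random.normal(0.001, 0.01, size=250)   # placeholder in-sample returns
returns_oos = np.random.normal(0.000, 0.01, size=120)  # placeholder out-of-sample returns

best_model, best_trace = model_best(returns_in, returns_oos, samples=1000)
# inspect e.g. the 'difference of means' and 'effect size' posteriors
print(pm.summary(best_trace))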
def build_model(self):
    base_numbers = self.data.n_safe.unique()
    choices = self.data.chose_risky.values

    safe_prior_mu = np.mean(np.log(base_numbers))
    safe_prior_sd = np.std(np.log(base_numbers))

    self.coords = {
        "subject": self.unique_subjects,
        "presentation": ['first', 'second'],
        "risky_prior_mu_regressors":
            self.design_matrices['risky_prior_mu'].design_info.term_names,
        "risky_prior_sd_regressors":
            self.design_matrices['risky_prior_sd'].design_info.term_names,
        "evidence_sd1_regressors":
            self.design_matrices['evidence_sd1'].design_info.term_names,
        "evidence_sd2_regressors":
            self.design_matrices['evidence_sd2'].design_info.term_names,
    }

    with pm.Model(coords=self.coords) as self.model:

        inputs = self._get_model_input()
        for key, value in inputs.items():
            inputs[key] = pm.Data(key, value)

        def build_hierarchical_nodes(name, mu_intercept=0.0, sigma=.5):
            nodes = {}

            mu = np.zeros(self.design_matrices[name].shape[1])
            mu[0] = mu_intercept

            nodes[f'{name}_mu'] = pm.Normal(f"{name}_mu",
                                            mu=mu,
                                            sigma=sigma,
                                            dims=f'{name}_regressors')

            nodes[f'{name}_sd'] = pm.HalfCauchy(f'{name}_sd', .5,
                                                dims=f'{name}_regressors')
            nodes[f'{name}_offset'] = pm.Normal(
                f'{name}_offset', mu=0, sd=1,
                dims=('subject', f'{name}_regressors'))
            nodes[name] = pm.Deterministic(
                name,
                nodes[f'{name}_mu'] + nodes[f'{name}_sd'] * nodes[f'{name}_offset'],
                dims=('subject', f'{name}_regressors'))

            nodes[f'{name}_trialwise'] = softplus(
                tt.sum(nodes[name][inputs['subject_ix']] *
                       np.asarray(self.design_matrices[name]), 1))

            return nodes

        # Hyperpriors for group nodes
        nodes = {}
        nodes.update(build_hierarchical_nodes('risky_prior_mu',
                                              mu_intercept=np.log(20.)))
        nodes.update(build_hierarchical_nodes('risky_prior_sd',
                                              mu_intercept=1.))
        nodes.update(build_hierarchical_nodes('evidence_sd1',
                                              mu_intercept=1.))
        nodes.update(build_hierarchical_nodes('evidence_sd2',
                                              mu_intercept=1.))

        evidence_sd = tt.stack((nodes['evidence_sd1_trialwise'],
                                nodes['evidence_sd2_trialwise']), 0)

        post_risky_mu, post_risky_sd = get_posterior(
            nodes['risky_prior_mu_trialwise'],
            nodes['risky_prior_sd_trialwise'],
            inputs['risky_mu'],
            evidence_sd[inputs['risky_ix'], np.arange(self.data.shape[0])])

        post_safe_mu, post_safe_sd = get_posterior(
            safe_prior_mu,
            safe_prior_sd,
            inputs['safe_mu'],
            evidence_sd[inputs['safe_ix'], np.arange(self.data.shape[0])])

        diff_mu, diff_sd = get_diff_dist(post_risky_mu, post_risky_sd,
                                         post_safe_mu, post_safe_sd)

        p = pm.Deterministic('p', cumulative_normal(tt.log(.55),
                                                    diff_mu, diff_sd))

        ll = pm.Bernoulli('ll_bernoulli', p=p, observed=choices)
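# A minimal standalone sketch of the non-centered parameterization used in
# `build_hierarchical_nodes` above: subject-level effects are written as mu + sd * offset
# with offset ~ N(0, 1), which typically samples better than drawing the effects directly.
# The data and shapes below are placeholders, not taken from the original model.
import numpy as np
import pymc3 as pm

n_subjects = 8
y_demo = np.random.normal(1.0, 0.5, size=n_subjects)  # placeholder per-subject data

with pm.Model() as noncentered_demo:
    group_mu = pm.Normal('group_mu', mu=0.0, sigma=1.0)
    group_sd = pm.HalfCauchy('group_sd', beta=0.5)
    offset = pm.Normal('offset', mu=0.0, sigma=1.0, shape=n_subjects)
    subject_effect = pm.Deterministic('subject_effect', group_mu + group_sd * offset)
    pm.Normal('obs', mu=subject_effect, sigma=0.5, observed=y_demo)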
def SIR_with_change_points( new_cases_obs, change_points_list, date_begin_simulation, num_days_sim, diff_data_sim, N, priors_dict=None, add_week_end_factor=False ): """ Parameters ---------- new_cases_obs : list or array Timeseries (day over day) of newly reported cases (not the total number) change_points_list : list of dicts List of dictionaries, each corresponding to one change point. Each dict can have the following key-value pairs. If a pair is not provided, the respective default is used. * pr_mean_date_begin_transient : datetime.datetime, NO default * pr_median_lambda : number, same as default priors, below * pr_sigma_lambda : number, same as default priors, below * pr_sigma_date_begin_transient : number, 3 * pr_median_transient_len : number, 3 * pr_sigma_transient_len : number, 0.3 date_begin_simulation: datetime.datetime The begin of the simulation data num_days_sim : integer Number of days to forecast into the future diff_data_sim : integer Number of days that the simulation-begin predates the first data point in `new_cases_obs`. This is necessary so the model can fit the reporting delay. Set this parameter to a value larger than what you expect to find for the reporting delay. N : number The population size. For Germany, we used 83e6 priors_dict : dict Dictionary of the prior assumptions Possible key-value pairs (and default values) are: * pr_beta_I_begin : number, default = 100 * pr_median_lambda_0 : number, default = 0.4 * pr_sigma_lambda_0 : number, default = 0.5 * pr_median_mu : number, default = 1/8 * pr_sigma_mu : number, default = 0.2 * pr_median_delay : number, default = 8 * pr_sigma_delay : number, default = 0.2 * pr_beta_sigma_obs : number, default = 10 * week_end_days : tuple, default = (6,7) * pr_mean_weekend_factor : number, default = 0.7 * pr_sigma_weekend_factor :number, default = 0.3 add_week_end_factor : bool Whether to add the prior that cases are less reported on week ends. Multiplies the new cases numbers on weekends by a number between 0 and 1, given by a prior beta distribution. The beta distribution is parametrised by pr_mean_weekend_factor and pr_sigma_weekend_factor, and which days to consider as weekends by week_end_days. 6 and 7 corresponds to Saturday and Sunday respectively (the default). 
    Returns
    -------
    : pymc3.Model
        Returns an instance of pymc3 model with the change points
    """
    if priors_dict is None:
        priors_dict = dict()

    default_priors = dict(
        pr_beta_I_begin=100,
        pr_median_lambda_0=0.4,
        pr_sigma_lambda_0=0.5,
        pr_median_mu=1 / 8,
        pr_sigma_mu=0.2,
        pr_median_delay=8,
        pr_sigma_delay=0.2,
        pr_beta_sigma_obs=10,
        week_end_days=(6, 7),
        pr_mean_weekend_factor=0.7,
        pr_sigma_weekend_factor=0.3,
    )
    default_priors_change_points = dict(
        pr_median_lambda=default_priors["pr_median_lambda_0"],
        pr_sigma_lambda=default_priors["pr_sigma_lambda_0"],
        pr_sigma_date_begin_transient=3,
        pr_median_transient_len=3,
        pr_sigma_transient_len=0.3,
        pr_mean_date_begin_transient=None,
    )

    if not add_week_end_factor:
        del default_priors['week_end_days']
        del default_priors['pr_mean_weekend_factor']
        del default_priors['pr_sigma_weekend_factor']

    for prior_name in priors_dict.keys():
        if prior_name not in default_priors:
            raise RuntimeError(f"Prior with name {prior_name} not known")
    for change_point in change_points_list:
        for prior_name in change_point.keys():
            if prior_name not in default_priors_change_points:
                raise RuntimeError(f"Prior with name {prior_name} not known")

    for prior_name, value in default_priors.items():
        if prior_name not in priors_dict:
            priors_dict[prior_name] = value
            print(f"{prior_name} was set to default value {value}")
    for prior_name, value in default_priors_change_points.items():
        for i_cp, change_point in enumerate(change_points_list):
            if prior_name not in change_point:
                change_point[prior_name] = value
                print(
                    f"{prior_name} of change point {i_cp} was set to default value {value}"
                )

    if (
        diff_data_sim
        < priors_dict["pr_median_delay"]
        + 3 * priors_dict["pr_median_delay"] * priors_dict["pr_sigma_delay"]
    ):
        raise RuntimeError("diff_data_sim is too small compared to the prior delay")
    if num_days_sim < len(new_cases_obs) + diff_data_sim:
        raise RuntimeError(
            "Simulation ends before the end of the data. Increase num_days_sim."
) # ------------------------------------------------------------------------------ # # Model and prior implementation # ------------------------------------------------------------------------------ # with pm.Model() as model: # all pm functions now apply on the model instance # true cases at begin of loaded data but we do not know the real number I_begin = pm.HalfCauchy(name="I_begin", beta=priors_dict["pr_beta_I_begin"]) # fraction of people that are newly infected each day lambda_list = [] lambda_list.append( pm.Lognormal( name="lambda_0", mu=np.log(priors_dict["pr_median_lambda_0"]), sigma=priors_dict["pr_sigma_lambda_0"], ) ) for i, cp in enumerate(change_points_list): lambda_list.append( pm.Lognormal( name=f"lambda_{i + 1}", mu=np.log(cp["pr_median_lambda"]), sigma=cp["pr_sigma_lambda"], ) ) # list of start dates of the transient periods of the change points tr_begin_list = [] dt_before = date_begin_simulation for i, cp in enumerate(change_points_list): dt_begin_transient = cp["pr_mean_date_begin_transient"] if dt_before is not None and dt_before > dt_begin_transient: raise RuntimeError("Dates of change points are not temporally ordered") prior_mean = ( dt_begin_transient - date_begin_simulation ).days # convert the provided date format (argument) into days (a number) tr_begin = pm.Normal( name=f"transient_begin_{i}", mu=prior_mean, sigma=cp["pr_sigma_date_begin_transient"], ) tr_begin_list.append(tr_begin) dt_before = dt_begin_transient # same for transient times tr_len_list = [] for i, cp in enumerate(change_points_list): tr_len = pm.Lognormal( name=f"transient_len_{i}", mu=np.log(cp["pr_median_transient_len"]), sigma=cp["pr_sigma_transient_len"], ) tr_len_list.append(tr_len) # build the time-dependent spreading rate lambda_t_list = [lambda_list[0] * tt.ones(num_days_sim)] lambda_before = lambda_list[0] for tr_begin, tr_len, lambda_after in zip( tr_begin_list, tr_len_list, lambda_list[1:] ): lambda_t = mh.smooth_step_function( start_val=0, end_val=1, t_begin=tr_begin, t_end=tr_begin + tr_len, t_total=num_days_sim, ) * (lambda_after - lambda_before) lambda_before = lambda_after lambda_t_list.append(lambda_t) lambda_t = sum(lambda_t_list) # fraction of people that recover each day, recovery rate mu mu = pm.Lognormal( name="mu", mu=np.log(priors_dict["pr_median_mu"]), sigma=priors_dict["pr_sigma_mu"], ) # delay in days between contracting the disease and being recorded delay = pm.Lognormal( name="delay", mu=np.log(priors_dict["pr_median_delay"]), sigma=priors_dict["pr_sigma_delay"], ) # prior of the error of observed cases sigma_obs = pm.HalfCauchy("sigma_obs", beta=priors_dict["pr_beta_sigma_obs"]) # -------------------------------------------------------------------------- # # training the model with loaded data provided as argument # -------------------------------------------------------------------------- # S_begin = N - I_begin S, I, new_I = _SIR_model( lambda_t=lambda_t, mu=mu, S_begin=S_begin, I_begin=I_begin, N=N ) new_cases_inferred = mh.delay_cases( new_I_t=new_I, len_new_I_t=num_days_sim, len_out=num_days_sim - diff_data_sim, delay=delay, delay_diff=diff_data_sim, ) if add_week_end_factor: week_end_factor = pm.Beta('weekend_factor', mu=priors_dict['pr_mean_weekend_factor'], sigma=priors_dict['pr_sigma_weekend_factor']) mask = np.zeros(num_days_sim - diff_data_sim) for i in range(num_days_sim - diff_data_sim): date_curr = date_begin_simulation + datetime.timedelta(days=i + diff_data_sim + 1) if date_curr.isoweekday() in priors_dict['week_end_days']: mask[i] = 1 
            multiplication_vec = np.ones(num_days_sim - diff_data_sim) - (1 - week_end_factor) * mask
            new_cases_inferred_eff = new_cases_inferred * multiplication_vec
        else:
            new_cases_inferred_eff = new_cases_inferred

        # likelihood of the model:
        # observed cases are distributed following studentT around the model.
        # we want to approximate a Poisson distribution of new cases.
        # we choose nu=4 to get heavy tails and robustness to outliers.
        # https://www.jstor.org/stable/2290063
        num_days_data = new_cases_obs.shape[-1]
        pm.StudentT(
            name="_new_cases_studentT",
            nu=4,
            mu=new_cases_inferred_eff[:num_days_data],
            sigma=tt.abs_(new_cases_inferred[:num_days_data] + 1) ** 0.5 * sigma_obs,  # +1 and tt.abs to avoid nans
            observed=new_cases_obs,
        )

        # add these observables to the model so we can extract a time series of them
        # later via e.g. `model.trace['lambda_t']`
        pm.Deterministic("lambda_t", lambda_t)
        pm.Deterministic("new_cases", new_cases_inferred)
        pm.Deterministic("new_cases_eff", new_cases_inferred_eff)

    return model
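# A hypothetical call of `SIR_with_change_points` (illustrative only): the case counts,
# dates and change-point prior below are made-up placeholders, not values from the
# original analysis.
import datetime
import numpy as np

new_cases_obs_demo = np.array([100, 140, 190, 260, 340, 430, 520, 610])  # placeholder
change_points_demo = [dict(pr_mean_date_begin_transient=datetime.datetime(2020, 3, 9))]

model_demo = SIR_with_change_points(
    new_cases_obs=new_cases_obs_demo,
    change_points_list=change_points_demo,
    date_begin_simulation=datetime.datetime(2020, 2, 25),
    num_days_sim=len(new_cases_obs_demo) + 30,
    diff_data_sim=16,
    N=83e6,
)
# sampling would then proceed in the usual way, e.g. pm.sample(..., model=model_demo)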
ax[0].plot(x, y, 'C0.')
ax[0].set_xlabel('x')
ax[0].set_ylabel('y', rotation=0)
ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('y')
plt.tight_layout()

# Fit posterior with MCMC instead of analytically (for simplicity and flexibility)
# This is the same as BAP code, except we fix the noise variance to a constant.
with pm.Model() as model_g:
    w0 = pm.Normal('w0', mu=0, sd=10)
    w1 = pm.Normal('w1', mu=0, sd=1)
    #ϵ = pm.HalfCauchy('ϵ', 5)
    mu = pm.Deterministic('mu', w0 + w1 * x)
    #y_pred = pm.Normal('y_pred', mu=μ, sd=ϵ, observed=y)
    y_pred = pm.Normal('y_pred', mu=mu, sd=noiseSD, observed=y)
    trace_g = pm.sample(1000, cores=1, chains=2)

az.plot_trace(trace_g, var_names=['w0', 'w1'])
az.plot_pair(trace_g, var_names=['w0', 'w1'], plot_kwargs={'alpha': 0.1})
pml.savefig('linreg_2d_bayes_post_noncentered_data.pdf')
plt.show()

# To reduce the correlation between alpha and beta, we can center the data
x = x_orig - x_orig.mean()

# or standardize the data
#x = (x - x.mean())/x.std()
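# A brief sketch (not from the original notebook) of re-fitting the same regression on the
# centered x defined above, assuming y, noiseSD, pm and az from the preceding cells; with
# centered data the posterior correlation between w0 and w1 largely disappears.
with pm.Model() as model_g_centered:
    w0 = pm.Normal('w0', mu=0, sd=10)
    w1 = pm.Normal('w1', mu=0, sd=1)
    mu = pm.Deterministic('mu', w0 + w1 * x)  # x is now centered
    y_pred = pm.Normal('y_pred', mu=mu, sd=noiseSD, observed=y)
    trace_g_centered = pm.sample(1000, cores=1, chains=2)

az.plot_pair(trace_g_centered, var_names=['w0', 'w1'], plot_kwargs={'alpha': 0.1})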
def main():
    st.title('Flu Inference')
    st.write("The full notebook can be found [here](https://github.com/benlevyx/modelling-infectious-disease/tree/master/notebooks/bayesian_model.ipynb).")

    st.write('## Gathering state-level features')
    st.write("The purpose of this section is to gather state-level features that may affect the degree to which a given state is susceptible or resistant to a virus such as the flu or Covid-19. Collecting these state-level characteristics can help us identify which features are responsible for the correlation in viral infection rates between states, and thus can also be used to quantify the correlation between states based on fundamental attributes of the states rather than just the raw wILI time series.")
    st.write("The density of a state is a natural feature to include because the denser a location, the more easily a virus can spread (look no further than NYC right now). However, it wouldn't make sense to use the raw density of a state because, for example, the high population density in Manhattan shouldn't be diluted by the fact that upstate New York State has a massive amount of sparsely populated land. Instead, a more sensible measure is a weighted average of the densities of each county in a given state, where the weights are the fraction of the state population that lives in the given county.")

    pred_dir = config.data / 'state_predictors'

    # dataset that reports the land area in square miles of each county in the U.S.
    land_df = pd.read_csv(pred_dir / 'land_area.csv')
    # dataset that reports the population of each county in the U.S.
    popn_df = pd.read_csv(pred_dir / 'population.csv')

    # st.write(land_df.head())
    # st.write(popn_df.head())

    land_df = land_df[['Areaname', 'LND010190D']]
    popn_df = popn_df[['Areaname', 'PST045200D']]

    # limit analysis to Lower 48 states
    lower_48 = ["AL", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "ID", "IL", "IN", "IA", "KS", "KY",
                "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
                "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
                "WV", "WI", "WY"]

    state_end = tuple(', ' + abbrev for abbrev in lower_48)

    # ignore AK and HI
    filtered_land_df = land_df[land_df.Areaname.str.endswith(state_end)]
    filtered_popn_df = popn_df[land_df.Areaname.str.endswith(state_end)]

    # There are 5 counties in Virginia that are included twice in both the land area and population datasets,
    # so we need to ignore the duplicated rows
    virginia_counties_df = filtered_land_df[filtered_land_df.Areaname.str.endswith(', VA')]
    indices_to_delete = []
    counties_set = set()
    for index, row in virginia_counties_df.iterrows():
        county = row['Areaname']
        if county not in counties_set:
            counties_set.add(county)
        else:
            indices_to_delete.append(index)

    filtered_land_df = filtered_land_df[~filtered_land_df.index.isin(indices_to_delete)]
    filtered_popn_df = filtered_popn_df[~filtered_popn_df.index.isin(indices_to_delete)]

    # merge land area and population datasets
    combined_df = pd.merge(filtered_land_df, filtered_popn_df, on='Areaname', how='inner')

    # extract state from Areaname column
    combined_df['state'] = combined_df.Areaname.str[-2:]
    combined_df.head()

    # rename column names
    combined_df.rename(columns={'Areaname': 'county', 'LND010190D': 'area', 'PST045200D': 'popn'}, inplace=True)

    # fill in missing value of land area of Broomfield, CO from Wikipedia page
    combined_df.loc[combined_df.county == 'Broomfield, CO', 'area'] = 33.00

    # calculate density of each county by dividing population by land area
    combined_df['density'] = combined_df['popn'] / combined_df['area']
    st.write(combined_df.head(10))

    # calculate total population of each state across all counties
    state2pop = combined_df.groupby('state').agg({'popn': sum}).to_dict()['popn']
    combined_df['state_popn'] = [state2pop[state] for state in combined_df.state]
    combined_df.head()

    # calculate density metric for each state by weighting the density of each county by the fraction of
    # the state population that lives in the given county
    state2density_metric = (combined_df.groupby('state').
                            apply(lambda x: round(x['popn'] * (x['density'] ** 1) / x['state_popn'], 1))
                            .groupby('state').sum()).to_dict()

    # sort states in order of decreasing density
    sorted_density_metrics = sorted(list(state2density_metric.values()), reverse=True)
    density_metric2state = {v: k for k, v in state2density_metric.items()}
    ordered_density_metric2state = {x: density_metric2state[x] for x in sorted_density_metrics}

    # create dataframe with this first state-level feature
    state_stats_df = pd.DataFrame(ordered_density_metric2state.keys(), columns=['density_metric'],
                                  index=ordered_density_metric2state.values())
    st.write(state_stats_df)

    st.write("The next feature is the average latitude of each state.")
    latlong_df = pd.read_csv(pred_dir / 'statelatlong.csv')
    latlong_df.head()

    # include this latitude value in the feature dataframe
    state_stats_df1 = (pd.merge(state_stats_df, latlong_df[['Latitude', 'State']], left_index=True,
                                right_on='State').drop(columns=['State']))
    state_stats_df1.index = ordered_density_metric2state.values()
    st.write(state_stats_df1)

    st.write("The next feature is whether each Lower 48 state borders either the Atlantic or Pacific Ocean. This can potentially be an important feature because tourists and immigrants usually fly into the country in a coastal location.")
    coastal_states = set('ME NH MA RI CT NY NJ PA MD DE VA NC SC GA FL WA OR CA'.split())
    state_stats_df1['is_coastal'] = [int(state in coastal_states) for state in state_stats_df.index]
    st.write(state_stats_df1)

    st.write("A potentially important state-level feature is the number of airline passengers arriving in the state. As we've seen with Covid-19, clusters have started in particular locations because visitors have come into these places with the virus from foreign countries. The most readily available source for this data is the 'List of airports in [state]' Wikipedia article for each state. Each of these pages contains the number of commercial passenger boardings in 2016 for each airport in the state. Although commercial passenger arrivals are not included, it's reasonable to assume that the number of boardings and arrivals are closely related to each other. The values in the dictionary below represent the sum of the number of commercial passenger arrivals for the major airports in each state. Note: the number of major airports varies by state (e.g. the only major airport in Massachusetts is Logan, there are no major airports in Delaware, and there are three major airports in Kentucky (Cincinnati, Louisville and Lexington)).
Finally, the number of annual boardings in each state in normalized by the population of the given state, as this metric represents the relative influence of air traffic on the given state.") state2passengers = {'NY': 50868391, 'PA': 15285948 + 4670954 + 636916, 'NJ': 19923009 + 589091, 'MD': 13371816, 'IL': round((83245472 / 2) + (22027737 / 2)), 'MA': 17759044, 'VA': 11470854 + 10596942 + 1777648 + 1602631, 'MO': 6793076 + 5391557 + 462126, 'CA': (39636042 + 25707101 + 10340164 + 5934639 + 5321603 + 5217242 + 4969366 + 2104625 + 2077892 + 1386357 + 995801 + 761298), 'MI': 16847135 + 1334979 + 398508, 'CO': 28267394 + 657694, 'MN': 18123844, 'TX': 31283579 + 20062072 + 7554596 + 6285181 + 6095545 + 4179994 + 1414376, 'RI': 1803000, 'GA': 50501858 + 1056265, 'OH': 4083476 + 3567864 + 1019922 + 685553, 'CT': 2982194, 'IN': 4216766 + 360369 + 329957 + 204352, 'DE': 0, 'KY': 3269979 + 1631494 + 638316, 'FL': (20875813 + 20283541 + 14263270 + 9194994 + 4239261 + 3100624 + 2729129 + 1321675 + 986766 + 915672 + 589860), 'NE': 2127387 + 162876, 'UT': 11143738, 'OR': 9071154, 'TN': 6338517 + 2016089 + 887103, 'LA': 5569705 + 364200, 'OK': 1796473 + 1342315, 'NC': 21511880 + 5401714 + 848261, 'KS': 781944, 'WA': 21887110 + 1570652, 'WI': 3496724 + 1043185 + 348026 + 314909, 'NH': 995403, 'AL': 1304467 + 527801 + 288209 + 173210, 'NM': 2341719, 'IA': 1216357 + 547786, 'AZ': 20896265 + 1594594 + 705731, 'SC': 1811695 + 991276 + 944849 + 553658, 'AR': 958824 + 673810, 'WV': 213412, 'ID': 1633507, 'NV': 22833267 + 1771864, 'ME': 886343 + 269013, 'MS': 491464 + 305157, 'VT': 593311, 'SD': 510105 + 272537, 'ND': 402976 + 273980 + 150634 + 132557 + 68829, 'MT': 553245 + 423213 + 381582 + 247816 + 176730 + 103239, 'WY': 342044 + 92805} # population of each state according to the 2010 census state2popn_2010 = { 'AL': 4779736, 'AR': 2915918, 'AZ': 6392017, 'CA': 37253956, 'CO': 5029196, 'CT': 3574097, 'DE': 897934, 'FL': 18801310, 'GA': 9687653, 'IA': 3046355, 'ID': 1567582, 'IL': 12830632, 'IN': 6483802, 'KS': 2853118, 'KY': 4339367, 'LA': 4533372, 'MA': 6547629, 'MD': 5773552, 'ME': 1328361, 'MI': 9883640, 'MN': 5303925, 'MO': 5988927, 'MS': 2967297, 'MT': 989415, 'NC': 9535483, 'ND': 672591, 'NE': 1826341, 'NH': 1316470, 'NJ': 8791894, 'NM': 2059179, 'NV': 2700551, 'NY': 19378102, 'OH': 11536504, 'OK': 3751351, 'OR': 3831074, 'PA': 12702379, 'RI': 1052567, 'SC': 4625364, 'SD': 814180, 'TN': 6346105, 'TX': 25145561, 'UT': 2763885, 'VA': 8001024, 'VT': 625741, 'WA': 6724540, 'WI': 5686986, 'WV': 1852994, 'WY': 563626 } state_stats_df1['airport_boardings'] = [state2passengers[state] / state2popn_2010[state] for state in state_stats_df.index] st.write(state_stats_df1) abbrev2state = { 'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa', 'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri', 'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana', 'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico', 'RI': 
'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming' } state2abbrev = {v: k for k, v in abbrev2state.items()} st.write("The next feature is the fraction of each state's population that falls into a set of age categories") age_df = pd.read_csv(pred_dir / 'age.csv') # merge age dataframe with dataframe that contains the rest of the features age_df['Location'] = [state2abbrev[state] for state in age_df.Location] state_stats_df2 = (pd.merge(state_stats_df1, age_df, left_index=True, right_on='Location') .drop(columns=['Location'])) state_stats_df2.index = ordered_density_metric2state.values() st.write(state_stats_df2) st.write("The next feature is average temperature of each state during each of the four seasons of the year.") temps_df = pd.read_csv(pred_dir / 'temps.csv') temps_df['State'] = [state2abbrev[state] for state in temps_df.State] # merge temperature dataframe with dataframe that contains the rest of the features state_stats_df3 = (pd.merge(state_stats_df2, temps_df, left_index=True, right_on='State') .drop(columns=['State'])) state_stats_df3.index = ordered_density_metric2state.values() st.write(state_stats_df3) st.write("It's possible that state-level political policies have an impact on the proliferation of virus infections. The Cook Partisan Voting Index taken from Wikipedia assigns a number to each state that indicates how strongly the state leads toward the Republican or Democratic Party based on recent state and federal elections. In our convention, a positive value signifies leaning Republican, while a negative value signifies leading Democratic.") state2partisan_score = { 'AL': 14, 'AR': 15, 'AZ': 5, 'CA': -12, 'CO': 1, 'CT': -6, 'DE': -6, 'FL': 2, 'GA': 5, 'IA': 3, 'ID': 19, 'IL': -7, 'IN': 9, 'KS': 13, 'KY': 15, 'LA': 11, 'MA': -12, 'MD': -12, 'ME': -3, 'MI': -1, 'MN': -1, 'MO': 9, 'MS': 9, 'MT': 11, 'NC': 3, 'ND': 17, 'NE': 14, 'NH': 0, 'NJ': -7, 'NM': -3, 'NV': -1, 'NY': -12, 'OH': 3, 'OK': 20, 'OR': -5, 'PA': 0, 'RI': -10, 'SC': 8, 'SD': 15, 'TN': 14, 'TX': 8, 'UT': 20, 'VA': -1, 'VT': -15, 'WA': -7, 'WI': 0, 'WV': 19, 'WY': 25 } state_stats_df3['partisan_score'] = [state2partisan_score[state] for state in state_stats_df3.index] st.write(state_stats_df3) st.write("The following dataset was taken from a Stat139 problem set last semester and contains a range of socioeconomic, demographic and health indicators. 
These include:\n\n Hispanic: percent of adults that are hispanic \n\n Minority: percent of adults that are nonwhite\n\n Female: percent of adults that are female\n\n Income: median income\n\n Nodegree: percent of adults who have not completed high school\n\n Bachelor: percent of adults with a bachelor’s degree\n\nInactive: percent of adults who do not exercise in their leisure time\n\nObesity: percent of individuals with BMI > 30\n\n Cancer: prevalence of cancer per 100,000 individuals\n\n We're not considering unemployment rate, as these rates are likely no longer accurate for many states.\n\nJust as with the density metric, the state-level value for each of these features is determined by calculating a weighted average of the measurements for each county, where the weights are the fraction of the state population that lives in the given county.")

    county_metrics_df = pd.read_csv(pred_dir / 'county_metrics.csv')
    county_metrics_df['state'] = [state2abbrev[state] for state in county_metrics_df.state]
    county_metrics_df = county_metrics_df[county_metrics_df.state.isin(lower_48)]
    st.write(county_metrics_df.head())

    state2pop_ = county_metrics_df.groupby('state').agg({'population': sum}).to_dict()['population']
    county_metrics_df['state_popn'] = [state2pop_[state] for state in county_metrics_df.state]

    metrics = ['hispanic', 'minority', 'female', 'unemployed', 'income', 'nodegree', 'bachelor',
               'inactivity', 'obesity', 'cancer']

    for metric in metrics:
        state2metric = (county_metrics_df.groupby('state').
                        apply(lambda x: round((x['population'] * x[metric]) / x['state_popn'], 3))
                        .groupby('state').sum()).to_dict()
        denom = 1000 if metric == 'income' else 1
        state_stats_df3[metric] = [state2metric[state] / denom for state in state_stats_df3.index]

    st.write(state_stats_df3)

    st.write("The more people travel between states, the more closely related the states should be in terms of rate of virus infections. The Census Bureau Journey to Work dataset reports the number of people that commute from any given county in the country to any other county in the country. This means we can aggregate these county-to-county commuting flows to determine the number of people that commute between any two states. From this data, we can create a symmetric matrix where the $i,j$ and $j,i$ elements represent the number of people that commute from state $i$ to state $j$ plus the number of people that commute from state $j$ to state $i$. However, just as with the number of annual boardings in each state, the final value of the number of people who commute between two states is normalized by the population of the given state.
This means that this commuting matrix is no longer symmetric because the populations of state $i$ and state $j$ are different.") commuting_df_complete = pd.read_csv(pred_dir / 'commuting.csv') commuting_df = commuting_df_complete[['State Name', 'State Name.1', 'Workers in Commuting Flow']] commuting_df.rename(columns={'State Name': 'home_state', 'State Name.1': 'work_state', 'Workers in Commuting Flow': 'commuters'}, inplace=True) lower_48_full_name = [abbrev2state[abbrev] for abbrev in lower_48] commuting_df = commuting_df[commuting_df.work_state.isin(lower_48_full_name)] commuting_df['home_state'] = [state2abbrev[state] for state in commuting_df.home_state] commuting_df['work_state'] = [state2abbrev[state] for state in commuting_df.work_state] st.write(commuting_df.head(10)) commuting_df['commuters'] = commuting_df['commuters'].apply(lambda x: int(''.join([y for y in x if y.isdigit()]))) commuting_groupby_df = (commuting_df.groupby(['work_state', 'home_state'], as_index=False) .agg({'commuters': 'sum'})) # calculate the number of commuters between two states for all pairs of states for work_state in state_stats_df3.index: vals = [] for home_state in state_stats_df3.index: try: num1 = int((commuting_groupby_df[(commuting_groupby_df.work_state == work_state) & (commuting_groupby_df.home_state == home_state)].commuters)) num2 = int((commuting_groupby_df[(commuting_groupby_df.work_state == home_state) & (commuting_groupby_df.home_state == work_state)].commuters)) num = num1 + num2 num /= state2popn_2010[work_state] except TypeError: num = 0 vals.append(num) state_stats_df3[work_state + '_dest'] = vals st.write(state_stats_df3) st.write("States that are in close proximity may be similarly affected by viruses. Therefore, we include a column for each state in the design matrix that denotes whether that given states borders each of the other states.") # dictionary that maps each state in the Lower 48 to the states that directly border it or are not contiguous # but are very close (e.g. 
NJ and CT) state2neighbors = {'AL': {'AL', 'MS', 'TN', 'FL', 'GA', 'NC', 'SC'}, 'GA': {'GA', 'TN', 'FL', 'AL', 'SC', 'NC', 'MS'}, 'FL': {'FL', 'GA', 'AL', 'MS', 'SC'}, 'MS': {'MS', 'AL', 'TN', 'FL', 'LA', 'AR', 'GA'}, 'LA': {'LA', 'TX', 'AR', 'MS', 'OK', 'AL'}, 'SC': {'SC', 'FL', 'GA', 'NC', 'TN'}, 'NC': {'NC', 'SC', 'GA', 'TN', 'VA', 'KY'}, 'AR': {'AR', 'LA', 'TX', 'MS', 'TN', 'OK', 'MO', 'KY'}, 'VA': {'VA', 'NC', 'KY', 'WV', 'TN', 'DC', 'MD', 'DE'}, 'MD': {'MD', 'DC', 'VA', 'WV', 'DE', 'NJ', 'PA'}, 'DE': {'DE', 'MD', 'DC', 'NJ', 'PA'}, 'NJ': {'NJ', 'DE', 'MD', 'PA', 'NY', 'NJ', 'CT'}, 'NY': {'NY', 'NJ', 'PA', 'CT', 'MA', 'VT'}, 'CT': {'CT', 'NY', 'RI', 'MA', 'NJ'}, 'RI': {'RI', 'CT', 'MA'}, 'MA': {'MA', 'CT', 'RI', 'NH', 'VT', 'NY'}, 'NH': {'NH', 'VT', 'ME', 'MA'}, 'ME': {'ME', 'NH', 'MA', 'VT'}, 'VT': {'VT', 'NH', 'NY', 'MA'}, 'PA': {'PA', 'NY', 'NJ', 'MD', 'WV', 'OH', 'DE'}, 'WV': {'WV', 'DC', 'MD', 'PA', 'OH', 'KY', 'VA'}, 'OH': {'OH', 'PA', 'WV', 'MI', 'IN', 'KY'}, 'MI': {'MI', 'OH', 'WI', 'IN', 'IL'}, 'KY': {'KY', 'WV', 'OH', 'IN', 'IL', 'MO', 'TN', 'VA', 'AR', 'NC'}, 'TN': {'TN', 'KY', 'VA', 'NC', 'SC', 'GA', 'AL', 'MS', 'AR', 'MO', 'IL'}, 'IN': {'IN', 'KY', 'OH', 'MI', 'IL', 'WI'}, 'IL': {'IL', 'IN', 'MI', 'WI', 'IA', 'MO', 'KY', 'TN'}, 'WI': {'WI', 'IL', 'MN', 'MI', 'IA'}, 'MN': {'MN', 'MI', 'WI', 'IA', 'ND', 'SD', 'NE', 'IL'}, 'IA': {'IA', 'WI', 'MN', 'IL', 'MO', 'KS', 'NE', 'SD'}, 'MO': {'MO', 'IA', 'IL', 'KY', 'TN', 'AR', 'OK', 'KS', 'NE'}, 'ND': {'ND', 'SD', 'MN', 'MT', 'WY'}, 'SD': {'SD', 'ND', 'MN', 'IA', 'NE', 'MT', 'WY'}, 'NE': {'NE', 'SD', 'IA', 'MO', 'KS', 'WY', 'CO'}, 'KS': {'KS', 'NE', 'IA', 'MO', 'AR', 'OK', 'CO', 'TX', 'NM'}, 'OK': {'OK', 'KS', 'MO', 'AR', 'TX', 'NM', 'CO', 'LA'}, 'TX': {'TX', 'LA', 'AR', 'OK', 'NM', 'CO'}, 'MT': {'MT', 'ND', 'SD', 'WY', 'ID'}, 'WY': {'WY', 'MT', 'ND', 'SD', 'NE', 'CO', 'UT', 'ID'}, 'CO': {'CO', 'WY', 'NE', 'KS', 'OK', 'TX', 'NM', 'UT', 'AZ'}, 'NM': {'NM', 'CO', 'KS', 'OK', 'TX', 'AZ', 'UT'}, 'ID': {'ID', 'MT', 'WY', 'UT', 'NV', 'WA', 'OR'}, 'UT': {'UT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'NV'}, 'AZ': {'AZ', 'NM', 'CO', 'UT', 'NV', 'CA'}, 'WA': {'WA', 'ID', 'OR'}, 'OR': {'OR', 'WA', 'ID', 'NV', 'CA'}, 'NV': {'NV', 'ID', 'OR', 'UT', 'AZ', 'CA'}, 'CA': {'CA', 'OR', 'NV', 'AZ'} } for neighboring_state in state_stats_df3.index: states = [int(neighboring_state in state2neighbors[state]) for state in state_stats_df3.index] state_stats_df3[neighboring_state + '_is_neighbor'] = states st.write(state_stats_df3) st.write("The proportion of each state that is vaccinated may affect the number of people who are infected with the flu. 
Therefore, we include information on the adult and child vaccination rate for each state.") flu_df = pd.read_csv(pred_dir / 'flu.csv') flu_df['State'] = [state2abbrev[state] for state in flu_df.State] state_stats_df4 = (pd.merge(state_stats_df3, flu_df, left_index=True, right_on='State').drop(columns=['State'])) state_stats_df4.index = state_stats_df3.index st.write(state_stats_df4) st.write("Smoking may also affect suspectibility to viruses such as the flu and Covid-19, so we include a feature that reports the fraction of adults who smoke in each state.") state2smoking_rate = { 'AL': 20.9, 'AR': 22.3, 'AZ': 15.6, 'CA': 11.3, 'CO': 14.6, 'CT': 12.7, 'DE': 17.0, 'FL': 16.1, 'GA': 17.5, 'IA': 17.1, 'ID': 14.3, 'IL': 15.5, 'IN': 21.8, 'KS': 17.4, 'KY': 24.6, 'LA': 23.1, 'MA': 13.7, 'MD': 13.8, 'ME': 17.3, 'MI': 19.3, 'MN': 14.5, 'MO': 20.8, 'MS': 22.2, 'MT': 17.2, 'NC': 17.2, 'ND': 18.3, 'NE': 15.4, 'NH': 15.7, 'NJ': 13.7, 'NM': 17.5, 'NV': 17.6, 'NY': 14.1, 'OH': 21.1, 'OK': 20.1, 'OR': 16.1, 'PA': 18.7, 'RI': 14.9, 'SC': 18.8, 'SD': 19.3, 'TN': 22.6, 'TX': 15.7, 'UT': 8.9, 'VA': 16.4, 'VT': 15.8, 'WA': 13.5, 'WI': 16, 'WV': 26, 'WY': 18.7 } state_stats_df4['smoking_rate'] = [state2smoking_rate[state] / 100 for state in state_stats_df4.index] st.write(state_stats_df4) st.write("## Bayesian Model") st.write("### Motivation") st.write("Before describing the model, it's important to first discuss the motivation behind it in the first place. The wILI time series clearly show that the states are affected differently by the flu. Therefore, we wanted to determine whether there are any state-level features that account for the disrepencies between the states. If we could identify these particular features, then we'd also be able to figure out which states are intrinsically linked based on their attributes.") st.write("This information would then allow us to transfer this knowledge about the flu to Covid-19. Because both the flu and Covid are viruses, we'd expect some of the underlying risk factors of flu to generalize to Covid as well. We could then take one of two routes: first, we could assess if the interstate correlations discovered from the flu data apply in the case of Covid by comparing the number of Covid cases among different states. And second, we could assume that the flu relationships apply in the case of Covid and use these insights to look deeper than just the raw Covid numbers. For example, if the flu analysis reveals that two states share many similar characteristics, and one of these states has more Covid cases per 1000 people but also has more testing, then we may believe that the second state has more case of Covid than are reported. Alternatively, we can identify states that, based on their characteristics (e.g. high density, high obesity rate), are more susceptible to a major spike in Covid cases and thus should take additional precautions when opening up their states.") st.write("### Model Formulation") st.write("If the state wILI rates are correlated with each other, then we should, in theory, be able to predict the wILI rate in a given state and for a given week from the wILI rates of all the other states for the same week. Because correlated states may have similar flu trajectories but have different raw wILI rates, it's more robust to predict the weekly percent change in wILI rather than the absolute change in wILI. 
This means that we want to predict the trend in the number of flu cases for each state based on the trends of all the other states at the same time.") st.write("The big question is obviously how to use the percent change in the wILI rate of every other state to predict the percent change in the wILI rate for a single state. Because some states are more closely correlated with a given state than others, it makes sense to predict the percent change for a given state to be a weighted average of the percent changes of the other weeks, where the weights should ideally be proportional to the underlying correlation between the two states. For example, if we were trying to predict the trend in New York, we'd take into account the trend of every other state (except for Alaska and Hawaii), but the influence of each of these states on our overall prediction for New York would vary (e.g. the influence of New Jersey and Connecticut may be high, while the influenced of Idaho and Nebraska may be low).") st.write("Converting this into formal notation, let's define $\\delta_i$ to be the percent change in the wILI rate between two consecutive weeks for state $i$, and define $\\alpha_{ij}$ to be the weight coefficient of state $j$ on state $i$. We predict each $\\delta_i$ as:") st.latex("\\delta_i \\sim N\\left(\\frac{\\sum_{j=1}^{48}\\alpha_{ij}\\delta_jI(j \\neq i)}{\\sum_{j=1}^{48}\\alpha_{ij}I(j \\neq i)}, {\\sigma_{i}}^2\\right)") st.write("where ${\\sigma_{i}}^2$ is a state-specific variance. Intuitively, the lower the value of ${\\sigma}^2$ for a given state, the more the variation in the state's wILI trend can be explained by the wILI trends of the other states, and vice versa.") st.write("Next, we want to link the $\\alpha_{ij}$ weights to the features associated with each state such that states with more similar characteristics and high rates of interstate travel have higher $\\alpha_{ij}$ and $\\alpha_{ji}$ values and vice versa. Additionally, we only want a few of the $\\alpha_{ij}$s corresponding to state $i$ to be large, and the rest to be small (in a similar spirit to regularization). We can accomplish both of these features as follows: first, each $\\alpha_{ij}$ is modelled as being distributed according to an exponential distribution with a scale (i.e. inverse rate) parameter of $\\lambda_{ij}$. Because an exponential distribution is right skewed and has most of its mass near zero, this ensures that most of the $\\alpha_{ij}$ that are drawn from exponential distributions will take on relatively small values, while only a few will take on relatively large values. Next, we link the scale parameter ($\\lambda_{ij}$) of this exponential distribution to the state-level features by setting the log of $\\lambda_{ij}$ equal to the linear predictor function (taking the log is necessary to map the domain of the scale parameter (all positive real numbers) to the domain of the linear prediction function (the entire real line)).") st.write("Translating this into formal notation:") st.latex("\\alpha_{ij} \\sim Expo(\\lambda_{ij})") st.latex("log(\\lambda_{ij}) = \\beta_0 + \\beta_1X_1 + ... + \\beta_kX_k") st.write("In this case the linear predictor function is a little different that usual. Two of the predictors (normalized number of commuters between states $i$ and $j$ and the indicator of whether state $j$ borders state $i$) are included in the usual form of $\\beta_iX_i$, where a unit increase in $X_i$ corresponds to a $\\beta_i$ increase in the linear predictor. 
However, the rest of the predictors are state-level features such as obesity rate and density. This means that we don't care about the raw values of these features; instead, we only care about the difference between the values for state $i$ and state $j$. Therefore, each of the predictors is defined to be $|X_i - X_j|$, such that the predictor value is 0 when the two states have the same feature value, and increases as the difference between the two states grows.") st.write("Finally, because this is a Bayesian model, we need to define a prior distribution for the model parameters, which in this case are the $\\beta$ coefficient associated with each predictor variable and the ${\\sigma}^2$ parameter associated with each state. Because we have no substantial prior domain knowledge, we placed relatively uninformative priors on these parameters. Putting all of these components together produces the following generative model:") st.latex("\\delta_i \\sim N\\left(\\frac{\\sum_{j=1}^{48}\\alpha_{ij}\\delta_jI(j \\neq i)}{\\sum_{j=1}^{48}\\alpha_{ij}I(j \\neq i)}, {\\sigma_{i}}^2\\right)") st.latex("\\sigma_{i}^{2} \\sim Inv-Gamma(2, 2)") st.latex("\\alpha_{ij} \\sim Expo(\\lambda_{ij})") st.latex("log(\\lambda_{ij}) = \\beta_0 + \\beta_1X_1 + ... + \\beta_kX_k") st.latex("\\beta_i \\sim N(0, 5^2) ") st.write("Performing inference for this model yields the posterior distribution of the $\\beta$s and the ${\\sigma}^2$, but we only really care about the $\\beta$s. Because the exponential distribution is parameterized by a scale parameter rather than the usual rate parameter, the expected value of the distribution is equal to the scale parameter. This means that a larger $\\lambda_{ij}$ value corresponds, on average, to a higher $\\alpha_{ij}$ coefficient, and because the linear predictor function is defined to be the log of $\\lambda_{ij}$, this in turn means that a larger linear predictor corresponds, on average, to a higher $\\alpha_{ij}$ coefficient. For the two predictors that are not differences between the two given states, this means that a positive $\\beta$ coefficent indicates that a unit increase in the predictor value produces a stronger correlation between the two given states and vice versa. On the other hand, for the rest of the predictors that are included as differences between certain features of the two states, a strong correlation between two given states is signified by a negative $\\beta$ coefficient. This is the case because the predictor value represents the absolute differences between the features of the states, so a larger predictor value corresponds to a larger discrepancy between the states. Thus, the corresponding $\\beta$ coefficient can be interpreted as a penalty parameter, such that states that are less similar in terms of the given feature are less correlated with each other (assuming the $\\beta$ coefficient value is negative).") st.write("Overall, the model provides us with two interpretative results. First, the $\\beta$ coefficients indicate which features contribute to the correlation between the wILI time series of different states. And second, the $\\beta$ coefficients tell us about the $\\alpha_{ij}$ weights, which, in turn, inform us about which states are highly correlated with each other based on the fundamental characteristics of the states.") st.write("Finally, one major advantage of this model is that the observations (i.e. 
the percent change in the wILI rate for a given week) are independent of each other conditioned on the percent changes of the other states for the same week. This means that unlike in a classic time seris model, the past wILI rates of a state are irrelevant to predicting the percent change in the wILI rate at any given time. This greatly simplifies things, as it's much easier to deal with independent observations than it is to handle observations that are correlated with previous observations.") predictor_df = pd.read_csv(pred_dir / 'state_stats.csv') predictor_df.drop(index='FL', inplace=True, errors='ignore') flu_percent_change_df = pd.read_csv(pred_dir / 'flu_percent_change_imputed_48.csv') week_nums = flu_percent_change_df.week_num flu_percent_change_df.drop(columns='week_num', inplace=True) flu_percent_change_df = flu_percent_change_df[predictor_df.index] st.write("Weekly percent change in wILI rate by state:") st.write(flu_percent_change_df.head()) # predictors that are compared between states comparison_predictors = ['density_metric', 'Latitude', 'is_coastal', 'airport_boardings', 'Children 0-18', 'Adults 19-25', 'Adults 26-34', 'Adults 35-54', 'Adults 55-64', '65+', 'partisan_score', 'hispanic', 'minority', 'female', 'income', 'nodegree', 'bachelor', 'inactivity', 'obesity', 'cancer', 'overall_vacc_rate', 'child_vacc_rate', 'smoking_rate'] season_predictors = ['spring', 'fall', 'winter'] # predictors that are not compared between states no_comparison_predictors = ['commuters', 'is_neighbor'] st.write("An important preprocessing step is to standardize each of the predictors (except for `is_coastal` and `is_neighbor` as these variables only take on the values 0 and 1. This ensures that the $\\beta$ coefficients associated with each predictor are all on the same scale and thus are easily comparable to each other. 
Additionally, ensuring that the $\\beta$ parameters lie in a similar range may help with the MCMC sampling.")

    predictors_to_standardize = [x for x in comparison_predictors if x != 'is_coastal'] + season_predictors

    # there are no observations during the summer so we don't need the summer weather predictor
    predictor_df_standardized = predictor_df.drop(columns='summer')

    for predictor in predictors_to_standardize:
        data = predictor_df_standardized[predictor]
        mean = np.mean(data)
        std = np.std(data)
        predictor_df_standardized[predictor] = [(x - mean) / std for x in data]

    commute_columns = [column for column in predictor_df_standardized if column.endswith('_dest')]
    commute_vals = predictor_df_standardized[commute_columns].to_numpy().flatten()
    commute_mean = np.mean(commute_vals)
    commute_std = np.std(commute_vals)
    for commute_column in commute_columns:
        predictor_df_standardized[commute_column] = [(x - commute_mean) / commute_std
                                                     for x in predictor_df_standardized[commute_column]]

    comparison_preds_df = predictor_df_standardized[comparison_predictors + season_predictors]

    st.write("Resulting state feature dataframe:")
    st.write(predictor_df_standardized)

    # determine season from week of the year
    def get_season(week):
        if week >= 52 or week < 13:
            return np.array([0, 0, 1])
        if 13 <= week < 26:
            return np.array([1, 0, 0])
        if 39 <= week < 52:
            return np.array([0, 1, 0])
        raise ValueError(f'week {week} does not fall in a modeled season')

    predictor_num = len(comparison_predictors) + len(season_predictors) + len(no_comparison_predictors)
    state_num = flu_percent_change_df.shape[1]
    comparison_preds_num = len(comparison_predictors)
    obs_num = len(flu_percent_change_df)

    # indicate which season each observation falls into
    season_indictor_array = np.zeros((obs_num, state_num - 1, len(season_predictors)))
    for i, week_num in enumerate(week_nums[1:]):
        season_indictor_array[i, :, :] = np.repeat(get_season(week_num)[np.newaxis, :], state_num - 1, axis=0)

    st.write("`Y_target` is a 1D array that contains the percent change of each state for each week of the time series that is included in the analysis. This is the variable we want to predict for each observation. Because there are 47 states (Lower 48 except for Florida) and 217 observations for each state, this array has a length of $47*217=10199$. \n\n`Y_state_idx` is a 1D array of the same length as `Y_target` that represents the specific state associated with each `Y_target` value. Therefore, it takes on values between 0 and 46. This is necessary to pick out the variance parameter corresponding to the given state. \n\n`X` is a 3D design matrix. The first axis has a length equal to the total number of observations (10199). The second axis has a length of 46, which represents the $47-1=46$ other states from which we're trying to predict the final state. And the third axis has a length of 29, which contains the 28 predictors in addition to an intercept term, which is simply the value of 1. Therefore, this `X` matrix contains all the predictors for each state for each observation.\n\n`X_flu` is a 2D array. The first axis has a length equal to the total number of observations (10199), while the second axis has a length of 46 and represents the percent change in wILI rate for all the $47-1=46$ other states from which we're trying to predict the final state.
Therefore, this array is contains all the $\\delta_jI(j \\neq i)$ values for each observation.") Y_target = np.zeros(state_num * obs_num) X = np.zeros((Y_target.shape[0], state_num - 1, predictor_num + 1)) Y_state_idx = np.zeros(Y_target.shape[0], dtype=int) X_flu = np.zeros((Y_target.shape[0], state_num - 1)) X.shape for idx, state in enumerate(predictor_df_standardized.index): # response variable Y_target[obs_num * idx: obs_num * idx + obs_num] = flu_percent_change_df[state] # percent change of other states X_flu[obs_num * idx: obs_num * idx + obs_num, :] = flu_percent_change_df.drop(columns=state).to_numpy() # index of response state Y_state_idx[obs_num * idx: obs_num * idx + obs_num] = [idx] * obs_num state_comparison_preds = np.array(comparison_preds_df.loc[state]) constant_design_matrix = np.zeros((X.shape[1], X.shape[2])) constant_design_matrix[:, 0] = np.ones(state_num - 1) # two predictors that aren't differences between states: neighboring state and number of commuters other_states_preds_df = predictor_df_standardized.drop(index=state) not_difference_matrix = other_states_preds_df[[state + '_is_neighbor', state + '_dest']].to_numpy() constant_design_matrix[:, 1: 1 + len(no_comparison_predictors)] = not_difference_matrix # the rest of the predictors are differences between two states other_states_comparison_preds_array = comparison_preds_df.drop(index=state).to_numpy() difference_matrix = abs((other_states_comparison_preds_array - state_comparison_preds) ** 1) constant_design_matrix[:, 1 + len(no_comparison_predictors):] = difference_matrix constant_design_matrix_3D = np.repeat(constant_design_matrix[np.newaxis, :, :], repeats=obs_num, axis=0) # pick out appropriate season and set the rest of the temperature predictors to zero constant_design_matrix_3D[:, :, -len(season_predictors):] *= season_indictor_array X[obs_num * idx: obs_num * idx + obs_num, :, :] = constant_design_matrix_3D st.write("The observations are shuffled before they are inputted to the pymc3 model.") # randomly shuffle the observations np.random.seed(109) indices = np.arange(len(Y_target)) np.random.shuffle(indices) Y_target_random = Y_target[indices] X_flu_random = X_flu[indices] X_random = X[indices] Y_state_idx_random = Y_state_idx[indices] st.write("See bottom of document for model specification.") st.write("Just as we did in HW3, it's important to first check whether the generative model is correctly specified. This can be done by hardcoding the values for the parameters, generating response variables from these parameters and then trying to infer the parameters using MCMC.") st.write("The sampling took a whopping 13 hours to sample just 500 times for each chain (with a 500 burn-in sample). However, as shown below the results confirm that the model was correctly specified, as the majority of the true $\\beta$ values lie within the corresponding 94 percent credible interval. Therefore, performance inference for the actual data should yield reliable results.\n\nHowever, carrying out inference on this synthetic data reveals several issues. First, many of the r_hat values are significantly larger than 1.0, which means that more than 500 samples are needed for the chains to converge to the posterior distribution. And second, the fact that the sampling took so long may indicate that the uninformative priors are too flat and make it difficult for the NUTS sampler to sample points from the true posterior distribution. 
To address these issues, the number of samples is increased from 500 to 1000 and a semi-informative prior is placed on the $\\beta$ and $\\sigma^2$ parameters ($N(0, 25)$ for each of the $\\beta$s and $Inv-Gamma(2, 2)$ for each $\\sigma^2$.") sim_trace_df = pd.read_csv(pred_dir / 'sim_trace.csv') st.write(sim_trace_df) st.write("Unfortunately we ran into major issues running MCMC for the actual data. A burn-in of 500 and a sample of 1000 should have taken around 18 hours to finish. However, the first time we ran it, it was 80 percent complete after 14 hours and then the screen saver didn't turn off and the notebook shut down. We then tried running in a second time, and this time it again was 80 percent done after another 14 hours and then encountered a memory failure issue that terminated the notebook. Therefore, the third time we only asked for 500 samples, even though we knew this likely wouldn't be large enough for the sampler to converge. It took 14 hours to run but finished successfully. Even so, the model was so unwieldy that it took an additional three hours just to save the model and create a summary dataframe.") st.write("Results of MCMC sampling:") trace_df = pd.read_csv(pred_dir / 'trace.csv') st.write(trace_df) st.write("Unfortunately, most of the r_hat values of the $\\beta$ coefficients are extremely inflated (the average r_hat value is just under 2.0). This means that the sampler hasn't come close to converging and means that it's pointless to try to interpret the sign or the magnitude of the coefficients. At this point, we ran out of time. However, if we had more time, we'd randomly select a subset of the observations and get more samples for these observations, as it's better to have trustworthy results on less data than it is to have unreliable results on the entire datset.") st.write("While the results of the inference were unreliable, it's still worthwhile to discuss what the next steps would have been in the analysis. First, we would check the sign and 94 percent credible interval of each of the $\\beta$ coefficients to see if the majority of them make intuitive sense (i.e. negative coefficients for the difference predictors and positive coefficients for the non-difference predictors.) Next, we would evaluate the predictive power of the model and test the model assumptions at the same time. This could be done by first calculating the predictive power of a baseline naive model where the average of all the other states is used to predict for the percent change in the final state (in other words, where the weights associated with each state are the same). Because the likelihood function is modelled as a normal distribution, the optimal loss function is the mean squared error. The predictions would be performed for each state separately. \n\nAfter calculating the MSE for the naive model, we'd evaluate the Bayesian model as follows: first, we'd sample hundreds of times from the posterior distribution of each of the $\\beta$ coefficients. Then, for each sample, we'd work our way up the model (i.e. sample an $\\alpha$ for each state) and calculate the mean of the prediction. We'd then plot the residuals by subtracting the predicted percent change from the true percent change. Calculating the average of the square of the residuals would give us the MSE, which we'd compare to the baseline model to see if this model has any increased predictive power. 
Meanwhile, we'd plot these residuals to assess the assumption that the observations are normally distributed about the weighted average of the percent change of each of the other states. If this is the case, then we'd expect the distribution to be normally distributed around 0.0. Finally, we can calculate the variance of the residuals for each state and compare this sample variance to the posterior distribution of $\\sigma^2$ for each state to check if they are consistent with each other.")

st.write("Model specification in pymc3:")
with st.echo():
    model = pm.Model()
    with model:
        # define prior distribution for beta parameters
        beta = pm.Normal('beta', mu=0, sigma=5, shape=predictor_num + 1)
        # define prior distribution for state-specific variance parameter
        sigma_sq = pm.InverseGamma('sigma_sq', alpha=2, beta=2, shape=state_num)
        # calculate the linear predictor for each state by multiplying the 3D X design matrix with the vector
        # of beta parameters
        nu = pm.Deterministic('nu', pm.math.dot(X_random, beta))
        # calculate the lambda parameter for each state by exponentiating the linear predictor
        lambda_ = pm.Deterministic('lambda', pm.math.exp(nu))
        # sample an alpha random variable for each state from an exponential distribution with the
        # corresponding rate parameter
        alpha = pm.Exponential('alpha', lam=1/lambda_, shape=(X_random.shape[0], state_num - 1))
        # calculate the mean of each response variable by taking the dot product between the alpha vector
        # and the vector of the percent change in the wILI rates of the other 46 states and dividing by the
        # sum of the alpha weights
        mu = pm.Deterministic('mu', pm.math.sum(alpha * X_flu_random, axis=1) / pm.math.sum(alpha, axis=1))
        # define the response variable to be normally distributed about the mean and with a standard deviation that
        # is the square root of the variance parameter associated with the given state
        Y_obs = pm.Normal('Y_obs', mu=mu, sigma=pm.math.sqrt(sigma_sq[Y_state_idx_random]), observed=Y_target_random)
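st.write("The following is a hypothetical sketch (not run for this report) of the posterior-predictive MSE evaluation described above. It assumes the full MCMC trace object is available as `trace` (rather than just the summary CSV), along with the `X_random`, `X_flu_random` and `Y_target_random` arrays defined earlier.")
with st.echo():
    # draw a few hundred posterior samples of beta and push each one through the
    # generative model: nu -> lambda -> alpha -> weighted-average prediction
    n_draws = 200
    rng = np.random.default_rng(109)
    beta_samples = trace['beta'][-n_draws:]
    preds = np.zeros((n_draws, len(Y_target_random)))
    for s, beta_s in enumerate(beta_samples):
        nu_s = X_random.dot(beta_s)                # linear predictor for every (observation, other-state) pair
        lambda_s = np.exp(nu_s)                    # scale of the exponential weights
        alpha_s = rng.exponential(scale=lambda_s)  # alpha ~ Exponential(1 / lambda)
        preds[s] = (alpha_s * X_flu_random).sum(axis=1) / alpha_s.sum(axis=1)
    # posterior-mean predictions, residuals and MSE against the observed percent changes
    residuals = Y_target_random - preds.mean(axis=0)
    mse = np.mean(residuals ** 2)
    st.write("Posterior-predictive MSE: {:.4f}".format(mse))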
def find_ism_params(grid, dustlaw, obs, pca_result, line_ls, drpall_row, Zsol=.0142, nrad=30, m_at_rad=5, rlim=None): ''' run a pymc3 grid on a whole galaxy - grid_covs, grid_alphas: yields from pre-GP-trained photoionization grid - dustlaw: - line_obs: tuple of flux, uncertainty, and mask - line_ls: - drpall_row: ''' # access results from pca to get priors on tauV*mu and tauV*(1-mu) pca_results_good = ~np.logical_or(pca_result.mask, pca_result.badPDF()) tauV_mu_loc, tauV_mu_sd = pca_result.to_normaldist('tau_V mu') tauV_1mmu_loc, tauV_1mmu_sd = pca_result.to_normaldist('tau_V (1 - mu)') logQH_loc, logQH_sd = pca_result.to_normaldist('logQH') # good spaxels must be good in both PCA results and emlines measurements goodspax = np.logical_and(obs.spaxels_good_to_run(), pca_results_good) print(goodspax.sum(), 'spaxels') # access emission-line measurements, and pick the good ones f = np.column_stack([obs.line_flux[k][goodspax] for k in obs.lines_used]) unc = np.column_stack( [obs.line_unc[k].array[goodspax] for k in obs.lines_used]) # filter PCA measurements of tauV mu and tauV (1 - mu) tauV_mu_loc, tauV_mu_sd = \ tauV_mu_loc[goodspax].astype(np.float32), tauV_mu_sd[goodspax].astype(np.float32) tauV_1mmu_loc, tauV_1mmu_sd = \ tauV_1mmu_loc[goodspax].astype(np.float32), tauV_1mmu_sd[goodspax].astype(np.float32) logQH_loc, logQH_sd = \ logQH_loc[goodspax].astype(np.float32), logQH_sd[goodspax].astype(np.float32) # radius in Re units Rreff = obs.hdulist['SPX_ELLCOO'].data[1, ...][goodspax].astype(np.float32) #''' if type(rlim) is list: Rtargets = np.linspace(rlim[0], rlim[1], nrad) else: Rtargets = np.linspace(Rreff.min(), Rreff.max(), nrad) meas_ixs = np.unique( np.argsort(np.abs(Rreff[None, :] - Rtargets[:, None]), axis=1)[:, :m_at_rad]) print(meas_ixs) Rreff, f, unc = Rreff[meas_ixs], f[meas_ixs], unc[meas_ixs] tauV_mu_loc, tauV_mu_sd = tauV_mu_loc[meas_ixs], tauV_mu_sd[meas_ixs] tauV_1mmu_loc, tauV_1mmu_sd = tauV_1mmu_loc[meas_ixs], tauV_1mmu_sd[ meas_ixs] logQH_loc, logQH_sd = logQH_loc[meas_ixs], logQH_sd[meas_ixs] #''' # distance, for absolute-scaling purposes zdist = drpall_row['nsa_zdist'] four_pi_r2 = (4. * np.pi * cosmo.luminosity_distance(zdist)**2.).to( units.cm**2).value *obs_shape_, nlines = f.shape obs_shape = tuple(obs_shape_) print('in galaxy: {} measurements of {} lines'.format(obs_shape, nlines)) with pymc3.Model() as model: #''' # gaussian process on radius determines logZ ls_logZ = pymc3.Gamma('ls-logZ', alpha=3., beta=3., testval=1.) # effectively [0.5, 3] Re gp_eta = pymc3.HalfCauchy('eta', beta=.5, testval=.25) cov_r = gp_eta**2. * pymc3.gp.cov.ExpQuad(input_dim=1, ls=ls_logZ) logZ_gp = pymc3.gp.Latent(cov_func=cov_r) # draw from GP logZ_rad = logZ_gp.prior('logZ-r', X=Rreff[:, None]) logZ_gp_rad_sigma = pymc3.HalfCauchy('logZ-rad-sigma', beta=.2) logZ = pymc3.Bound(pymc3.Normal, *grid.range('logZ'))('logZ', mu=logZ_rad, sd=logZ_gp_rad_sigma, shape=obs_shape, testval=-.1) #''' # priors ## first on photoionization model #logZ = pymc3.Uniform('logZ', *grid.range('logZ'), shape=obs_shape, testval=0.) Z = Zsol * 10.**logZ logU = pymc3.Bound(pymc3.Normal, *grid.range('logU'))('logU', mu=-2., sd=5., shape=obs_shape, testval=-2.) age = pymc3.Bound(pymc3.Normal, *grid.range('Age'))('age', mu=5., sd=10., shape=obs_shape, testval=2.5) #xid = theano.shared(0.46) # dust laws come from PCA fits tauV_mu_norm = pymc3.Bound(pymc3.Normal, lower=-tauV_mu_loc / tauV_mu_sd)( 'tauV mu norm', mu=0, sd=1., shape=obs_shape, testval=0.) 
tauV_mu = pymc3.Deterministic('tauV mu', tauV_mu_loc + tauV_mu_sd * tauV_mu_norm) tauV_1mmu_norm = pymc3.Bound(pymc3.Normal, lower=-tauV_1mmu_loc / tauV_1mmu_sd)( 'tauV 1mmu norm', mu=0, sd=1., shape=obs_shape, testval=0.) tauV_1mmu = pymc3.Deterministic( 'tauV 1mmu', tauV_1mmu_loc + tauV_1mmu_sd * tauV_1mmu_norm) #tauV = tauV_mu + tauV_1mmu #logGMSD = pymc3.Deterministic( # 'logGMSD', theano.tensor.log10(0.2 * tauV / (xid * Z))) grid_params = theano.tensor.stack([logZ, logU, age], axis=0) # the attenuation power-laws dense_powerlaw = theano.shared( (line_ls.quantity.value.astype('float32') / 5500)**-1.3) diffuse_powerlaw = theano.shared( (line_ls.quantity.value.astype('float32') / 5500)**-0.7) transmission = pymc3.math.exp( -(theano.tensor.outer(tauV_1mmu, dense_powerlaw) + \ theano.tensor.outer(tauV_mu, diffuse_powerlaw))) # dim lines based on distance distmod = theano.shared(four_pi_r2) one_e17 = theano.shared(1.0e17) obsnorm = one_e17 / distmod # next on normalization of emission line strengths logQHnorm = pymc3.Normal('logQHnorm', mu=0., sd=1., testval=0., shape=obs_shape) logQH = pymc3.Deterministic('logQH', logQH_loc + logQH_sd * logQHnorm) eff_QH = pymc3.Kumaraswamy('effQH', a=3., b=3., shape=obs_shape, testval=0.66) linelumnorm = theano.tensor.outer( eff_QH * 10**logQH, grid.observable_norms_t.astype('float32')) norm = obsnorm * linelumnorm * transmission for i, (name, alpha, cov) in enumerate( zip(grid.observable_names, grid.alphas, grid.covs)): pymc3.StudentT( '-'.join(('obsflux', name)), nu=1., mu=((gp_grid.gp_predictt(cov, alpha, grid.X0, grid_params) + 1.) * norm[:, i]), sd=unc[:, i], observed=f[:, i]) model_graph = pymc3.model_to_graphviz() model_graph.format = 'svg' model_graph.render() step, start = densemass_sample(model, cores=1, chains=1, nstart=200, nburn=200, ntune=5000) try: nchains = 10 trace = pymc3.sample(step=step, start=start * nchains, draws=500, tune=500, burn=500, cores=1, chains=nchains, nuts_kwargs=dict(target_accept=.95), init='adapt_diag') except Exception as e: print(e) trace = None return model, trace, f, unc, Rreff
X_masked = np.ma.masked_invalid(X) # model with pm.Model() as model: # priors intercept = pm.Normal('intercept', mu=0, sigma=100) beta = pm.Normal('beta', mu=0, sigma=100, shape=X_masked.shape[1]) alpha = pm.HalfCauchy('alpha', beta=5) # impute missing X chol, stds, corr = pm.LKJCholeskyCov('chol', n=X_masked.shape[1], eta=2, sd_dist=pm.Exponential.dist(1), compute_corr=True) cov = pm.Deterministic('cov', chol.dot(chol.T)) X_mu = pm.Normal('X_mu', mu=0, sigma=100, shape=X_masked.shape[1], testval=X_masked.mean(axis=0)) X_modeled = pm.MvNormal('X', mu=X_mu, chol=chol, observed=X_masked) # observation mu_ = intercept + tt.dot(X_modeled, beta) # likelihood mu = tt.exp(mu_) likelihood = pm.Gamma('y', alpha=alpha, beta=alpha / mu, observed=y) # sample
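    # Hypothetical continuation of the snippet above (a sketch only; the original
    # cuts off after the "# sample" comment). The draw/tune counts are illustrative.
    trace = pm.sample(draws=1000, tune=1000)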
def gp_modeling(self, time=None, flux=None, flux_err=None, mask=None, sigma=3, niters=8, iterative=False): """Applies GP model to trend normalized light curve. """ if flux is None: flux = self.norm_flux if time is None: time = self.time if flux_err is None: flux_err = self.flux_err if mask is None: mask = np.zeros(len(time), dtype=bool) if (len(time) != len(flux)) or (len(time) != len(flux_err)): raise ValueError( "Please ensure you're passing in arrays of the same length.") self.mask = mask x = np.array(time) y = np.array(flux) yerr = np.array(flux_err) x = np.array(x[~mask]) y = np.array(y[~mask]) yerr = np.array(yerr[~mask]) x = np.ascontiguousarray(x, dtype=np.float64) y = np.ascontiguousarray(y, dtype=np.float64) yerr = np.ascontiguousarray(yerr, dtype=np.float64) time = np.ascontiguousarray(self.time, dtype=np.float64) mu = np.nanmean(y) y = (y / mu - 1) * 1e3 yerr = yerr * 1e3 / mu results = xo.estimators.lomb_scargle_estimator( x, y, min_period=self.p_rot * 0.5, max_period=self.p_rot * 2) peak_per = results['peaks'][0]['period'] per_uncert = results['peaks'][0]['period_uncert'] self.xo_LS_results = results peak = results["peaks"][0] freq, power = results["periodogram"] with pm.Model() as model: mean = pm.Normal("mean", mu=0.0, sd=5.0) # white noise logs2 = pm.Normal("logs2", mu=np.log(np.nanmin(yerr) / 2.0), sd=10.0) # The parameters of the RotationTerm kernel logamp = pm.Normal("logamp", mu=np.log(np.var(y) / 2.0), sd=20.0) # Bounds on period # BoundedNormal = pm.Bound(pm.Normal, lower=np.log(peak_per*0.5), # upper=np.log(peak_per*3)) # logperiod = BoundedNormal("logperiod", mu=np.log(2*peak["period"]), sd=per_uncert) # Q from simple harmonic oscillator logQ0 = pm.Normal("logQ0", mu=1.0, sd=10.0) logdeltaQ = pm.Normal("logdeltaQ", mu=2.0, sd=10.0) # TRY WITH NORMAL MU 0.5 SD LOW mix = pm.Uniform("mix", lower=0, upper=1.0) # Track the period as a deterministic # period = pm.Deterministic("period", tt.exp(logperiod)) # Set up the Gaussian Process model # TRY WITH SHOTERM INSTEAD OF ROTATIONTERM kernel = xo.gp.terms.RotationTerm(log_amp=logamp, period=peak_per, log_Q0=logQ0, log_deltaQ=logdeltaQ, mix=mix) gp = xo.gp.GP(kernel, x, yerr**2 + tt.exp(logs2), J=4) # Compute the Gaussian Process likelihood and add it into the # the PyMC3 model as a "potential" pm.Potential("loglike", gp.log_likelihood(y - mean)) # Compute the mean model prediction for plotting purposes pm.Deterministic("pred", gp.predict()) # Fit mean model first # Fit period and amplitude together # Fit over Q # Fit over mean # Fit period and amplitude together again map_soln = xo.optimize(start=model.test_point) map_soln = xo.optimize(start=model.test_point, vars=[mean]) map_soln = xo.optimize(start=map_soln, vars=[logamp]) # map_soln = xo.optimize(start=map_soln, vars=[logperiod]) map_soln = xo.optimize(start=map_soln, vars=[logQ0]) map_soln = xo.optimize(start=map_soln, vars=[logdeltaQ]) map_soln = xo.optimize(start=map_soln, vars=[logs2]) map_soln = xo.optimize(start=map_soln, vars=[mix]) map_soln = xo.optimize(start=map_soln, vars=[mean]) map_soln = xo.optimize(start=map_soln, vars=[logamp]) #, logperiod]) map_soln = xo.optimize(start=map_soln, vars=[mix]) map_soln = xo.optimize(start=map_soln) with model: mu, var = xo.eval_in_model(gp.predict(time, return_var=True), map_soln) if iterative is False: self.gp_soln = map_soln self.gp_model = mu self.gp_flux = self.norm_flux - (mu + 1) else: self.gp_it_soln = map_soln self.gp_it_model = mu self.gp_it_glux = self.norm_flux - (mu + 1)
def calibration_main(locator, config): # INITIALIZE TIMER t0 = time.clock() # Local variables building_name = config.single_calibration.building building_load = config.single_calibration.load iteration_pymc3 = config.single_calibration.iterations with open(locator.get_calibration_problem(building_name, building_load), 'r') as input_file: problem = pickle.load(input_file) emulator = joblib.load(locator.get_calibration_gaussian_emulator(building_name, building_load)) distributions = problem['probabiltiy_vars'] variables = problem['variables'] # Create function to call predictions (mu) @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector]) def predict_y(var1, var2, var3, var4, var5, var6): input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1) prediction = emulator.predict(input_sample) return prediction # Create function to call predictions (sigma) @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector]) def predict_sigma(var1, var2, var3, var4, var5, var6): input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1) _, sigma = emulator.predict(input_sample, return_std=True) return sigma with pymc3.Model() as basic_model: # DECLARE PRIORS for i, variable in enumerate(variables): arguments = np.array([distributions.loc[variable, 'min'], distributions.loc[variable, 'max'], distributions.loc[variable, 'mu']]).reshape(-1, 1) min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1)) arguments_norm = min_max_scaler.fit_transform(arguments) globals()['var' + str(i + 1)] = pymc3.Triangular('var' + str(i + 1), lower=arguments_norm[0][0], upper=arguments_norm[1][0], c=arguments_norm[2][0]) # DECLARE OBJECTIVE FUNCTION mu = pymc3.Deterministic('mu', predict_y(var1, var2, var3, var4, var5, var6)) sigma = pymc3.HalfNormal('sigma', 0.15) # sigma = pm.Deterministic('sigma', predict_sigma(var1, var2, var3, var4, var5, var6)) y_obs = pymc3.Normal('y_obs', mu=mu, sd=sigma, observed=0) # RUN MODEL, SAVE TO DISC AND PLOT RESULTS with basic_model: # Running step = pymc3.Metropolis() trace = pymc3.sample(iteration_pymc3, tune=1000, njobs=1, step=step) # Saving df_trace = pymc3.trace_to_dataframe(trace) #CREATE GRAPHS AND SAVE TO DISC df_trace.to_csv(locator.get_calibration_posteriors(building_name, building_load)) pymc3.traceplot(trace) columns = ["var1", "var2", "var3", "var4", "var5", "var6"] seaborn.pairplot(df_trace[columns]) if config.single_calibration.show_plots: plt.show() #SAVING POSTERIORS IN PROBLEM problem['posterior_norm'] = df_trace.as_matrix(columns=columns) pickle.dump(problem, open(locator.get_calibration_problem(building_name, building_load), 'w')) return
def __init__( self, cell_state_mat: np.ndarray, X_data: np.ndarray, n_comb: int = 50, data_type: str = 'float32', n_iter=20000, learning_rate=0.005, total_grad_norm_constraint=200, verbose=True, var_names=None, var_names_read=None, obs_names=None, fact_names=None, sample_id=None, gene_level_prior={'mean': 1 / 2, 'sd': 1 / 4}, gene_level_var_prior={'mean_var_ratio': 1}, cell_number_prior={'cells_per_spot': 8, 'factors_per_spot': 7, 'combs_per_spot': 2.5}, cell_number_var_prior={'cells_mean_var_ratio': 1, 'factors_mean_var_ratio': 1, 'combs_mean_var_ratio': 1}, phi_hyp_prior={'mean': 3, 'sd': 1}, spot_fact_mean_var_ratio=0.5 ): ############# Initialise parameters ################ super().__init__(cell_state_mat, X_data, data_type, n_iter, learning_rate, total_grad_norm_constraint, verbose, var_names, var_names_read, obs_names, fact_names, sample_id) for k in gene_level_var_prior.keys(): gene_level_prior[k] = gene_level_var_prior[k] self.gene_level_prior = gene_level_prior self.phi_hyp_prior = phi_hyp_prior self.n_comb = n_comb self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio cell_number_prior['factors_per_combs'] = (cell_number_prior['factors_per_spot'] / cell_number_prior['combs_per_spot']) for k in cell_number_var_prior.keys(): cell_number_prior[k] = cell_number_var_prior[k] self.cell_number_prior = cell_number_prior ############# Define the model ################ self.model = pm.Model() with self.model: # =====================Gene expression level scaling======================= # # Explains difference in expression between genes and # how it differs in single cell and spatial technology # compute hyperparameters from mean and sd shape = gene_level_prior['mean'] ** 2 / gene_level_prior['sd'] ** 2 rate = gene_level_prior['mean'] / gene_level_prior['sd'] ** 2 shape_var = shape / gene_level_prior['mean_var_ratio'] rate_var = rate / gene_level_prior['mean_var_ratio'] self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp', mu=shape, sigma=np.sqrt(shape_var), shape=(1, 1)) self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp', mu=rate, sigma=np.sqrt(rate_var), shape=(1, 1)) self.gene_level = pm.Gamma('gene_level', self.gene_level_alpha_hyp, self.gene_level_beta_hyp, shape=(self.n_genes, 1)) # scale cell state factors by gene_level self.gene_factors = pm.Deterministic('gene_factors', self.cell_state) # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape) # tt.printing.Print('gene_factors sum')(gene_factors.sum(0)) # =====================Spot factors======================= # # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured, # times heterogeniety in the total number of mRNA between individual cells with each cell type self.cells_per_spot = pm.Gamma('cells_per_spot', mu=cell_number_prior['cells_per_spot'], sigma=np.sqrt(cell_number_prior['cells_per_spot'] \ / cell_number_prior['cells_mean_var_ratio']), shape=(self.n_cells, 1)) self.comb_per_spot = pm.Gamma('combs_per_spot', mu=cell_number_prior['combs_per_spot'], sigma=np.sqrt(cell_number_prior['combs_per_spot'] \ / cell_number_prior['combs_mean_var_ratio']), shape=(self.n_cells, 1)) shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1)) rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate, shape=(self.n_cells, self.n_comb)) self.factors_per_combs = pm.Gamma('factors_per_combs', mu=cell_number_prior['factors_per_combs'], sigma=np.sqrt(cell_number_prior['factors_per_combs'] \ / 
cell_number_prior['factors_mean_var_ratio']), shape=(self.n_comb, 1)) c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1)) self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)) self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact), sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \ / self.spot_fact_mean_var_ratio), shape=(self.n_cells, self.n_fact)) # =====================Spot-specific additive component======================= # # molecule contribution that cannot be explained by cell state signatures # these counts are distributed between all genes not just expressed genes self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2) self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_cells, 1)) # =====================Gene-specific additive component ======================= # # per gene molecule contribution that cannot be explained by cell state signatures # these counts are distributed equally between all spots (e.g. background, free-floating RNA) self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2) self.gene_add = pm.Gamma('gene_add', self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_genes, 1)) # =====================Gene-specific overdispersion ======================= # self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'], sigma=phi_hyp_prior['sd'], shape=(1, 1)) self.gene_E = pm.Exponential('gene_E', self.phi_hyp, shape=(self.n_genes, 1)) # =====================Expected expression ======================= # # expected expression self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T \ + self.gene_add.T + self.spot_add # tt.printing.Print('mu_biol')(self.mu_biol.shape) # =====================DATA likelihood ======================= # # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson self.data_target = pm.NegativeBinomial('data_target', mu=self.mu_biol, alpha=1 / (self.gene_E.T * self.gene_E.T), observed=self.x_data, total_size=self.X_data.shape) # =====================Compute nUMI from each factor in spots ======================= # self.nUMI_factors = pm.Deterministic('nUMI_factors', (self.spot_factors * (self.gene_factors * self.gene_level).sum(0)))
# plt.show()

# ======================================================================
# unpooled_model
# ======================================================================
with pm.Model() as unpooled_model:
    # define priors
    sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
    # mu = pm.Uniform('mu', 0, 10)
    beta = pm.Normal('beta', 0, 20, shape=companiesABC)
    beta1 = pm.Normal('beta1', 0, 20, shape=companiesABC)
    beta2 = pm.Normal('beta2', 0, 10)
    # theta = pm.Uniform('theta', lower=0, upper=10)

    mu = pm.Deterministic(
        'mu', tt.exp(beta[companyABC] + beta1[companyABC] * elec_year + beta2 * elec_tem))
    # mu = tt.exp(beta + beta1 * elec_year + beta2 * elec_tem)
    # mu = pm.math.exp(theta)

    Observed_pred = pm.NegativeBinomial("Observed_pred", mu=mu, alpha=sigma,
                                        shape=elec_faults.shape)  # observed values
    Observed = pm.NegativeBinomial("Observed", mu=mu, alpha=sigma,
                                   observed=elec_faults)  # observed values

    start = pm.find_MAP()
    # step1 = pm.Slice([beta, beta1, beta2])
    # step = pm.Metropolis()
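    # Hypothetical continuation (a sketch, not from the original snippet): sample
    # from the posterior starting at the MAP estimate. The draw/tune counts are
    # illustrative; the name trace1 matches the trace referenced in a later comment.
    trace1 = pm.sample(2000, start=start, tune=1000)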
# we use a shared variable from theano to feed the x values into the model
# this is needed for PPC
# when using the model for predictions we can set this shared variable to x_test
shared_x = shared(x_train)

# training the model
# model specifications in PyMC3 are wrapped in a with-statement
with pm.Model() as model:
    # Define priors
    x_coeff = pm.Normal('x', 0, sd=20)  # prior for the coefficient of x
    intercept = pm.Normal('Intercept', 0, sd=20)  # prior for the intercept
    sigma = pm.HalfCauchy('sigma', beta=10)  # prior for the error term due to noise

    reg = intercept + tt.dot(shared_x, x_coeff)
    p = pm.Deterministic("p", invlogit(reg))  # represents the logistic regression relationship

    # Define likelihood
    likelihood = pm.Bernoulli('y', p=p, observed=y_train)

    # Inference!
    trace = pm.sample(1000)  # draw 1000 posterior samples using NUTS sampling

# predicting the unseen y values
# uses posterior predictive checks (PPC)
shared_x.set_value(x_test)  # set the shared x to the test dataset
ppc = pm.sample_ppc(trace, model=model, samples=1000)  # performs PPC
predictions = ppc['y'].mean(axis=0)  # compute the mean of the sampled draws for each new y
predictions = predictions >= 0.5  # now you can check the error
from pandas_datareader import data import pymc3 as pm import matplotlib.pyplot as plt import numpy as np returns = data.get_data_google('SPY', start='2008-5-1', end='2009-12-1')['Close'].pct_change() print(returns) with pm.Model() as sp500_model: nu = pm.Exponential('nu', 1. / 10, testval=5.) sigma = pm.Exponential('sigma', 1. / .02, testval=.1) s = pm.GaussianRandomWalk('s', sigma**-2, shape=len(returns)) volatility_process = pm.Deterministic('volatility_process', pm.math.exp(-2 * s)) r = pm.StudentT('r', nu, lam=volatility_process, observed=returns) with sp500_model: trace = pm.sample(2000) pm.traceplot(trace, [nu, sigma]) fig, ax = plt.subplots(figsize=(15, 8)) returns.plot(ax=ax) ax.plot(returns.index, 1 / np.exp(trace['s', ::5].T), 'r', alpha=.03) ax.set(title='volatility_process', xlabel='time', ylabel='volatility') ax.legend(['S&P500', 'stochastic volatility process']) plt.show()
def main(args): print("Begin", file=sys.stderr) check_args(args) ####################################### ## ## Import Data, remove missing guides ## ####################################### print("Import Data, remove missing guides", file=sys.stderr) data = pd.read_table(args.input_data, sep="\t", header=0) hs_zero = data['HS_reads'] > 0 ls_zero = data['LS_reads'] > 0 rm_zero = hs_zero & ls_zero data = data[rm_zero] ####################################### ## ## Downsample larger lib to comparible ## ####################################### print("Downsample", file=sys.stderr) ## Rescale to floats rescale = min(data['LS_reads'].sum(), data['HS_reads'].sum()) / data.loc[:, ('HS_reads', 'LS_reads')].sum(axis=0) data.loc[:, ('HS_reads', 'LS_reads')] *= rescale ## Sample downsized library runif = np.random.uniform(size=data.loc[:, ('HS_reads', 'LS_reads')].shape) int_part, sample_p = np.divmod(data.loc[:, ('HS_reads', 'LS_reads')], 1) data.loc[:, ('HS_reads', 'LS_reads')] = int_part + (runif < sample_p) ## Return as int data.loc[:, ('HS_reads', 'LS_reads')] = data.loc[:, ('HS_reads', 'LS_reads')].astype(int) + 1 ####################################### ## ## Calc. simple data representations ## ####################################### data['beta_mean'] = data['LS_reads'] / (data['LS_reads'] + data['HS_reads']) data['log(LS/HS)'] = np.log(data['LS_reads'] / data['HS_reads']) ####################################### ## ## Organize positional information ## ####################################### print("Parse positional information", file=sys.stderr) ## Line guide effects up to genome targ_data = data[ (~data['Coordinates'].str.contains("NT")) &\ (~data['Coordinates'].str.contains('CTRL')) &\ (~data['Coordinates'].str.contains('FILLER-LV2')) &\ (~data['Coordinates'].str.contains('FILLER-SgO')) ] if args.no_offsets: plus_offsets = [0, 0] minus_offsets = [0, 0] else: plus_offsets = [152, 147] minus_offsets = [146, 153] uniq_chrom = np.unique( [coord.split(':')[0] for coord in targ_data['Coordinates']]) chrom2idx = OrderedDict([(x, i) for i, x in enumerate(uniq_chrom)]) idx2chrom = OrderedDict([(i, x) for i, x in enumerate(uniq_chrom)]) pos_array = np.array([ (chrom2idx[coord.split(':')[0]], int(coord.split(':')[1].split('-')[1]) - plus_offsets[0], int(coord.split(':')[1].split('-')[1]) + plus_offsets[1]) if coord.split(':')[2] == '+' else (chrom2idx[coord.split(':')[0]], int(coord.split(':')[1].split('-')[1]) - minus_offsets[0], int(coord.split(':')[1].split('-')[1]) + minus_offsets[1]) for coord in targ_data['Coordinates'] ]) ## Get genomic windows genome_lims = OrderedDict([ (idx, (pos_array[pos_array[:, 0] == idx, 1].min(), pos_array[pos_array[:, 0] == idx, 2].max())) for idx, chrom in idx2chrom.items() ]) sliding_window = [ (idx, np.vstack((np.arange(*lims, args.step_size), np.minimum( np.arange(*lims, args.step_size) + args.window_size, lims[1]))).T) for idx, lims in genome_lims.items() ] sliding_window = np.concatenate([ np.concatenate((np.tile([[idx]], (a_window.shape[0], 1)), a_window), axis=1) for idx, a_window in sliding_window ]) sliding_window = sliding_window[[ np.any(check_overlap_bed(interval, pos_array)) for interval in sliding_window ]] ## Get chromosome chrom = targ_data['Coordinates'].iloc[0].split(':')[0] ####################################### ## ## Process guide data ## ####################################### print("Process guide data", file=sys.stderr) ovl_array = np.stack([ check_overlap_bed(guide_interval, sliding_window) for guide_interval in pos_array ], axis=0).astype(int) 
ovl_array = np.concatenate((np.zeros_like(ovl_array[:, 0:1]), ovl_array), axis=1) ovl_dex = pd.DataFrame( ovl_array, columns=["wnd_{}".format(i) for i in np.arange(ovl_array.shape[1])]) NT_count = data.loc[(data['Coordinates'].str.contains("NT") | data['Coordinates'].str.contains("CTRL")), ('Coordinates', 'HS_reads', 'LS_reads')].shape[0] NT_hold = np.zeros((NT_count, ovl_array.shape[1])).astype(int) NT_hold[:, 0] = 1 NT_dex = pd.DataFrame( NT_hold, columns=["wnd_{}".format(i) for i in np.arange(ovl_array.shape[1])]) wind_data = pd.concat( (pd.concat( (data.loc[(data['Coordinates'].str.contains("NT") | data['Coordinates'].str.contains("CTRL")), ('Coordinates', 'HS_reads', 'LS_reads')].reset_index( drop=True), NT_dex.reset_index(drop=True)), axis=1).reset_index(drop=True), pd.concat((targ_data.loc[:, ('Coordinates', 'HS_reads', 'LS_reads')].reset_index(drop=True), ovl_dex.reset_index(drop=True)), axis=1).reset_index(drop=True)), axis=0, ignore_index=True) max_idx = max([ int(item.replace('wnd_', '')) for item in wind_data.columns if 'wnd' in item ]) ####################################### ## ## Call peaks on chunk ## ####################################### print("Call peaks", file=sys.stderr) chunk_size = math.ceil(float(max_idx) / args.job_range) start_idx = 1 + (chunk_size * args.job_index) end_idx = start_idx + chunk_size peak_calls = [] diff_hdr = [] for i in range(start_idx, min(max_idx, end_idx)): print("Starting wnd_{}".format(i)) group0 = (wind_data['wnd_0'] == 1).astype(int) group1 = (wind_data['wnd_{}'.format(i)] == 1).astype(int) slicer = np.vstack([group0, group1]).T use_data = wind_data[np.sum(slicer, axis=1) == 1] slicer = slicer[np.sum(slicer, axis=1) == 1] slicer = np.argmax(slicer, axis=1) e_mean = np.mean(np.log(wind_data['LS_reads'] / wind_data['HS_reads'])) e_sd = np.std(np.log(wind_data['LS_reads'] / wind_data['HS_reads'])) ct_mean = np.mean(wind_data['LS_reads'].values + wind_data['HS_reads'].values) ct_sd = np.std(wind_data['LS_reads'].values + wind_data['HS_reads'].values) g_var = (ct_sd**2) - ct_mean if g_var <= 0: g_sigma = ct_sd print( "Warning! Count data is underdispersed, results may be inaccurate." ) else: g_sigma = np.sqrt(g_var) with pm.Model() as model: g = pm.Gamma('guide_intensity', mu=ct_mean, sigma=g_sigma, shape=slicer.shape[0]) e = pm.Normal('enhancer_activity', mu=e_mean, sigma=e_sd, shape=2) p = pm.Deterministic('bin_bias', tt.nnet.sigmoid(e)) l = pm.Deterministic('low_bin_theta', g * p[slicer]) h = pm.Deterministic('high_bin_theta', g * (1 - p[slicer])) diff = pm.Deterministic('enhancer_boost', e[1] - e[0]) l_ct = pm.Poisson('low_reads', mu=l, observed=use_data['LS_reads']) h_ct = pm.Poisson('high_reads', mu=h, observed=use_data['HS_reads']) with model: trace = pm.sample(1000, tune=4000, cores=8) hdr = pm.stats.hpd(trace['enhancer_boost'], alpha=0.001) thresh = [-args.rope_threshold, args.rope_threshold] the_call = check_overlap(np.array(thresh), np.expand_dims(hdr, axis=0))[0] peak_calls.append(the_call) diff_hdr.append(hdr) with open(args.output_data, 'w') as f: for i, j in enumerate(range(start_idx, min(max_idx, end_idx))): peak_position = sliding_window[j - 1] region_hdr = diff_hdr[i] region_call = peak_calls[i] == False interval_info = [ idx2chrom[peak_position[0]], peak_position[1], peak_position[2], "{},{}".format(*region_hdr), region_call, '.' ] print("{}\t{}\t{}\t{}\t{}\t{}".format(*interval_info), file=f) print("Done.", file=sys.stderr)
#Setup inversion pi = 3.14 Niter = 300000 conds_mod = 3.5 path_results = '../../../results/' with pm.Model() as model: gpsconst = pm.Uniform('gpsconst', lower=-15, upper=15) A_mod = pm.Uniform('A_mod', lower=0, upper=1000) B_mod = pm.Uniform('B_mod', lower=0, upper=1000) E_mod = pm.Uniform('E_mod', lower=0, upper=1000) Vd_exp = pm.Uniform('Vd_exp', lower=8, upper=11) Vs_exp = pm.Uniform('Vs_exp', lower=8, upper=12) kd_exp = pm.Uniform('kd_exp', lower=7, upper=10) Vd_mod = pm.Deterministic('Vd_mod', 10**Vd_exp) Vs_mod = pm.Deterministic('Vs_mod', 10**Vs_exp) #ratio = pm.Uniform('ratio',lower = 0.1,upper = 5e+3) kd_mod = pm.Deterministic('kd_mod', 10**kd_exp) pspd_mod = pm.Uniform('pspd_mod', lower=1e+5, upper=1e+7) #conds_mod = pm.Uniform('conds_mod',lower=1,upper=10) condd_mod = pm.Uniform('condd_mod', lower=1, upper=30) dsh_mod = pm.Normal('dsh_mod', mu=dsh, sigma=dshErr) xsh_mod = pm.Normal('xsh_mod', mu=xsh, sigma=xshErr) ysh_mod = pm.Normal('ysh_mod', mu=ysh, sigma=yshErr) coeffx = cs * dsh_mod * (x - xsh_mod) / (dsh_mod**2 + (x - xsh_mod)**2 + (y - ysh_mod)**2)**(5. / 2) * Vd_mod coeffy = cs * dsh_mod * (y - ysh_mod) / (dsh_mod**2 + (x - xsh_mod)**2 + (y - ysh_mod)**2)**(5. / 2) * Vd_mod
# pm.traceplot(trace1)
# plt.show()
with pm.Model() as unpooled_model:
    # define priors
    alpha = pm.HalfCauchy('alpha', 10, testval=.9)
    beta = pm.Normal('beta', 0, 100, shape=companiesABC, testval=-3.)
    # beta1 = pm.Normal('beta1', 0, 10, shape=companiesABC, testval=.3)
    # beta2 = pm.Normal('beta2', 0, 100, testval=0.01)
    # beta3 = pm.Normal('beta3', 0, 100)
    theta = pm.Normal('theta', 0, 100, shape=companiesABC)
    theta1 = pm.Normal('theta1', 0, 20, shape=companiesABC)
    beta1 = theta[companyABC] + theta1[companyABC] * x_shared1

    # mu = tt.exp(beta[companyABC] + beta1[companyABC]*elec_year + beta2*elec_tem)
    beta_mu = pm.Deterministic(
        'beta_mu', tt.exp(beta[companyABC] + beta1[companyABC] * x_shared))

    # Observed_pred = pm.Weibull("Observed_pred", alpha=mu, beta=sigma, shape=elec_faults.shape)  # observed values
    Observed = pm.Weibull("Observed", alpha=alpha, beta=beta_mu, observed=y_shared)  # observed values

    start = pm.find_MAP()
    # step = pm.Slice([beta1, u])
    trace2 = pm.sample(2000, start=start)

chain2 = trace2[1000:]
varnames1 = ['alpha', 'beta_mu']
# varnames2 = ['beta', 'beta1', 'beta2', 'alpha', 'beta3']
# pm.plot_posterior(chain2, varnames2, ref_val=0)
pm.traceplot(chain2)
def density_bhm_harmonic_dht(data, omega, use_mcmc=True, nchains=2, ncores=2, tune=1500): # Full model tdays = shared(data['tdays']) nparams = 6 nt = data['n_times'] nomega = len(omega) with pm.Model() as rho_model: ### # Create priors for each of our means BoundedNormal = pm.Bound(pm.Normal, lower=0.0) aa = pm.Normal('aa', mu=0, sd=2, shape=4) # Order the mid-points aa_mid = pm.Normal('aa_mid', mu=np.array([1, 2]), sd=np.array([0.25, 0.25]), shape=2, transform=pm.distributions.transforms.ordered, testval=np.array([0.5, 1.2])) Aa = pm.Normal('Aa', mu=0, sd=1, shape=(nomega, nparams)) Ba = pm.Normal('Ba', mu=0, sd=1, shape=(nomega, nparams)) mu_beta_0 = pm.Deterministic( 'mu_beta_0', harmonic_beta(aa[0], Aa[:, 0], Ba[:, 0], omega, tdays)) mu_beta_1 = pm.Deterministic( 'mu_beta_1', harmonic_beta(aa[1], Aa[:, 1], Ba[:, 1], omega, tdays)) mu_beta_2 = pm.Deterministic( 'mu_beta_2', harmonic_beta(aa_mid[0], Aa[:, 2], Ba[:, 2], omega, tdays)) mu_beta_3 = pm.Deterministic( 'mu_beta_3', harmonic_beta(aa[2], Aa[:, 3], Ba[:, 3], omega, tdays)) mu_beta_4 = pm.Deterministic( 'mu_beta_4', harmonic_beta(aa_mid[1], Aa[:, 4], Ba[:, 4], omega, tdays)) mu_beta_5 = pm.Deterministic( 'mu_beta_5', harmonic_beta(aa[3], Aa[:, 5], Ba[:, 5], omega, tdays)) # Half-normal priors #sigma_beta = pm.HalfNormal('sigma_beta', sd=1.0, shape=(nparams,)) #sigma_curve = pm.HalfNormal('sigma_curve', sd=2.0 ) # Inverse Gamma priors sigma_beta = pm.InverseGamma('sigma_beta', 1, 1, shape=(nparams, )) sigma_curve = pm.InverseGamma('sigma_curve', 1, 1) beta_0 = pm.Normal('beta_0', mu=mu_beta_0, sd=sigma_beta[0], shape=nt) beta_1 = BoundedNormal('beta_1', mu=mu_beta_1, sd=sigma_beta[1], shape=nt) beta_3 = BoundedNormal('beta_3', mu=mu_beta_3, sd=sigma_beta[3], shape=nt) beta_5 = BoundedNormal('beta_5', mu=mu_beta_5, sd=sigma_beta[5], shape=nt) # This is a trick for ordering along the last axis of a multivariate distribution # (it seems to work...) beta_mid = BoundedNormal('beta_mid', mu=tt.stack([mu_beta_2, mu_beta_4]).T, sd=tt.stack([sigma_beta[2], sigma_beta[4]]).T, shape=(nt, 2), transform=pm.distributions.transforms.ordered) beta_s = [ beta_0, beta_1, beta_mid[..., 0], beta_3, beta_mid[..., 1], beta_5, ] ### # Generate the likelihood function using the deterministic variable as the mean mu_x = double_tanh_pm(beta_s, data['timeidx'], data['z']) # shape parameter not requires as shape is specified in the priors... rho_out = pm.Normal('rho', mu=mu_x, sd=sigma_curve, observed=data['rho']) ### # Inference step #trace = pm.sample(500) if use_mcmc: trace = pm.sample(500, tune=tune, step=pm.NUTS(), cores=ncores, chains=nchains) else: # Use variational inference inference = pm.ADVI() approx = pm.fit(n=20000, method=inference) trace = approx.sample(draws=500) return trace, rho_model, tdays
outcome=df['deposit'] data = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'euribor3m']] data['outcome'] = outcome data.corr()['outcome'].sort_values(ascending=False) #%% this is really slow convergence y_simple = data['outcome'] x_n = 'duration' x_0 = data[x_n].values x_c = x_0 - x_0.mean() with pm.Model() as model_simple: α = pm.Normal('α', mu=0, sd=10) β = pm.Normal('β', mu=0, sd=10) μ = α + pm.math.dot(x_c, β) θ = pm.Deterministic('θ', pm.math.sigmoid(μ)) bd = pm.Deterministic('bd', -α/β) y_1 = pm.Bernoulli('y_1', p=θ, observed=y_simple) trace_simple = pm.sample(1000, tune=1000) #%% this is slow convergence theta = trace_simple['θ'].mean(axis=0) idx = np.argsort(x_c) plt.plot(x_c[idx], theta[idx], color='C2', lw=3) plt.vlines(trace_simple['bd'].mean(), 0, 1, color='k') bd_hpd = az.hpd(trace_simple['bd']) plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5) plt.scatter(x_c, np.random.normal(y_simple, 0.02), marker='.', color=[f'C{x}' for x in y_simple]) az.plot_hpd(x_c, trace_simple['θ'], color='C2') plt.xlabel(x_n) plt.ylabel('θ', rotation=0)
p2 = 1 - p1 p = tt.stack([p1, p2]) assignment = pm.Categorical("assignment", p, shape = data.shape[0], testval = np.random.randint(0, 2, data.shape[0])) print("prior assignment, with p = %.2f:" % p1.tag.test_value) print(assignment.tag.test_value[:10]) with model: sds = pm.Uniform("sds", 0, 100, shape =2) centers = pm.Normal("centers", mu = np.array([120, 190]), sd = np.array([10, 10]), shape = 2) center_i = pm.Deterministic('center_i', centers[assignment]) sd_i = pm.Deterministic('sd_i', sds[assignment]) # and to combine it with observations: observations = pm.Normal("obs", mu=center_i, sd=sd_i, observed=data) print("Random assignments: ", assignment.tag.test_value[:4], "...") print("Assigned center: ", center_i.tag.test_value[:4], "...") print("Assigned standard deviation: ", sd_i.tag.test_value[:4]) with model: step1 = pm.Metropolis(vars=[p, sds, centers]) step2 = pm.ElemwiseCategorical(vars=[assignment]) trace = pm.sample(25000, step=[step1, step2]) figsize(12.5, 9)
import pymc3 as pm from numpy import ones, array # Samples for each dose level n = 5 * ones(4, dtype=int) # Log-dose dose = array([-.86, -.3, -.05, .73]) with pm.Model() as model: # Logit-linear model parameters alpha = pm.Normal('alpha', 0, sd=100.) beta = pm.Normal('beta', 0, sd=1.) # Calculate probabilities of death theta = pm.Deterministic('theta', pm.math.invlogit(alpha + beta * dose)) # Data likelihood deaths = pm.Binomial('deaths', n=n, p=theta, observed=[0, 1, 3, 5]) def run(n=1000): if n == "short": n = 50 with model: pm.sample(n, tune=1000) if __name__ == '__main__': run()
def learn_bayesian_linear_model(self, encoded_plans, prior_weights, number_of_dimensions,
                                sd=1, sampling_count=2000, num_chains=2, bias_preference=0.0):
    # encoded_plans contains a list of [<encoding>, <rating>] pairs
    input_dataset = np.array([x[0] for x in encoded_plans], dtype=float)
    output_dataset = np.array([x[1] for x in encoded_plans], dtype=float)
    # TODO USE SAME MODEL AND TEST ON DUMMY DATA WITH CLEARLY KNOWN FUNCTION
    # maybe it is ok that it does not converge, but works with metropolis sampling. Expected? in early stages
    bias_preference = tt.constant(bias_preference)  # TODO: make the bias a learnable parameter
    with pm.Model() as linear_model:
        # Intercept
        # alpha = pm.Normal('alpha', mu=0.5, sd=sd)
        alpha = pm.Deterministic('alpha', bias_preference)
        cov = np.diag(np.full((number_of_dimensions, ), sd))  # for both mu and beta (slope)
        # TODO note: may consider making mu and cov parameters sampled from distributions too
        # mu = pm.MvNormal('mu', mu=prior_weights, cov=cov, shape=(number_of_dimensions,))
        # Slope
        prior_weights = np.random.rand(number_of_dimensions)
        betas = pm.MvNormal('betas', mu=prior_weights, cov=cov, shape=(number_of_dimensions, ))
        # Standard deviation
        sigma = pm.HalfNormal('sigma', sd=sd)
        # sigma = sd  # seems to work better
        # Estimate of mean
        mean = alpha + tt.dot(input_dataset, betas)
        # Observed values
        Y_obs = pm.Normal('Y_obs', mu=mean, sd=sigma, observed=output_dataset)
        # Sampler
        step = pm.NUTS()
        # step = pm.Metropolis()
        # step = pm.HamiltonianMC()
        # Posterior distribution
        linear_params_trace = pm.sample(
            sampling_count, step, chains=num_chains, cores=num_chains
        )  # TODO NOTE: do not add tuning if deterministic. Fails spectacularly, not its intended use.
    # end with
    # TODO look into the alpha values that were sampled, because they didn't appear in the plot
    self.full_param_trace = linear_params_trace
    # we only take the last 2000, and assume it is after sufficient mixing and good values.
    self.linear_params_values = linear_params_trace[-2000:]
    self.set_normal_distr_params()
def SEIR_with_extensions(
    new_cases_obs,
    change_points_list,
    date_begin_simulation,
    num_days_sim,
    diff_data_sim,
    N,
    priors_dict=None,
    with_random_walk=True,
):
    """
    This model includes 3 extensions to the `SIR_model_with_change_points`:

    1.  The SIR model now includes an incubation period during which infected
        people are not infectious, in the spirit of an SEIR model. In contrast
        to the SEIR model, the length of the incubation period is not
        exponentially distributed but has a lognormal distribution.
    2.  People that are infectious are observed with a delay that is now
        lognormally distributed. In the `SIR_model_with_change_points` we assume
        a fixed delay between infection and observation.
    3.  `lambda_t` has an additive term given by a Gaussian random walk.
        Thereby, we want to fit any deviation in `lambda_t` that is not captured
        by the change points. If the change points are wisely chosen, and the
        rest of the model captures the dynamics well, one would expect that the
        amplitude of the random walk is small. In this case, the posterior
        distribution of `sigma_random_walk` will be small.

    Parameters
    ----------
    new_cases_obs : list or array
        Timeseries (day over day) of newly reported cases (not the total number)
    change_points_list : list of dicts
        List of dictionaries, each corresponding to one change point.
        Each dict can have the following key-value pairs. If a pair is not
        provided, the respective default is used.
            * pr_mean_date_begin_transient: datetime.datetime, NO default
            * pr_median_lambda: float, default: 0.4
            * pr_sigma_lambda: float, default: 0.5
            * pr_sigma_begin_transient: float, default: 3
            * pr_median_transient_len: float, default: 3
            * pr_sigma_transient_len: float, default: 0.3
    date_begin_simulation : datetime.datetime
        The first day of the simulation
    num_days_sim : integer
        Number of days to forecast into the future
    diff_data_sim : integer
        Number of days that the simulation-begin predates the first data point in
        `new_cases_obs`. This is necessary so the model can fit the reporting
        delay. Set this parameter to a value larger than what you expect to find
        for the reporting delay.
    N : number
        The population size. For Germany, we used 83e6
    priors_dict : dict
        Dictionary of the prior assumptions.
        Possible key-value pairs (and default values) are:
            * pr_beta_I_begin : number, default: 100
            * pr_beta_E_begin_scale : number, default: 10
            * pr_median_lambda_0 : number, default: 2
            * pr_sigma_lambda_0 : number, default: 0.7
            * pr_median_mu : number, default: 1/3
            * pr_sigma_mu : number, default: 0.3
            * pr_median_delay : number, default: 5
            * pr_sigma_delay : number, default: 0.2
            * scale_delay : number, default: 0.3
            * pr_beta_sigma_obs : number, default: 10
            * pr_sigma_random_walk : number, default: 0.05
            * pr_mean_median_incubation : number, default: 5
                https://www.ncbi.nlm.nih.gov/pubmed/32150748
                https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7014672/
                about -1 day compared to the sources above, because persons
                likely become infectious before.
            * pr_sigma_median_incubation : number, default: 1
                The error from the sources above is smaller, but as the -1 day
                is a very rough estimate, we take here a larger error.
            * sigma_incubation : number, default: 0.418
                https://www.ncbi.nlm.nih.gov/pubmed/32150748
    with_random_walk : boolean
        whether to add a Gaussian walk to `lambda_t`;
        computationally expensive.

    Returns
    -------
    : pymc3.Model
        Returns an instance of pymc3 model with the change points
    """
    if priors_dict is None:
        priors_dict = dict()

    default_priors = dict(
        pr_beta_I_begin=100,
        pr_beta_E_begin_scale=10,
        pr_median_lambda_0=2,
        pr_sigma_lambda_0=0.7,
        pr_median_mu=1 / 3,
        pr_sigma_mu=0.3,
        pr_median_delay=5,
        pr_sigma_delay=0.2,
        scale_delay=0.3,
        pr_beta_sigma_obs=10,
        pr_sigma_random_walk=0.05,
        pr_mean_median_incubation=5,
        # https://www.ncbi.nlm.nih.gov/pubmed/32150748
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7014672/
        # about -1 day because persons likely become infectious before
        pr_sigma_median_incubation=1,
        sigma_incubation=0.418,
        # https://www.ncbi.nlm.nih.gov/pubmed/32150748
    )
    if not with_random_walk:
        del default_priors["pr_sigma_random_walk"]

    default_priors_change_points = dict(
        pr_median_lambda=default_priors["pr_median_lambda_0"],
        pr_sigma_lambda=default_priors["pr_sigma_lambda_0"],
        pr_sigma_date_begin_transient=3,
        pr_median_transient_len=3,
        pr_sigma_transient_len=0.3,
        pr_mean_date_begin_transient=None,
    )

    for prior_name in priors_dict.keys():
        if prior_name not in default_priors:
            raise RuntimeError(f"Prior with name {prior_name} not known")
    for change_point in change_points_list:
        for prior_name in change_point.keys():
            if prior_name not in default_priors_change_points:
                raise RuntimeError(f"Prior with name {prior_name} not known")

    for prior_name, value in default_priors.items():
        if prior_name not in priors_dict:
            priors_dict[prior_name] = value
            print(f"{prior_name} was set to default value {value}")
    for prior_name, value in default_priors_change_points.items():
        for i_cp, change_point in enumerate(change_points_list):
            if prior_name not in change_point:
                change_point[prior_name] = value
                print(
                    f"{prior_name} of change point {i_cp} was set to default value {value}"
                )

    if (
        diff_data_sim
        < priors_dict["pr_median_delay"]
        + 3 * priors_dict["pr_median_delay"] * priors_dict["pr_sigma_delay"]
    ):
        raise RuntimeError("diff_data_sim is too small compared to the prior delay")
    if num_days_sim < len(new_cases_obs) + diff_data_sim:
        raise RuntimeError(
            "Simulation ends before the end of the data. Increase num_days_sim."
) with pm.Model() as model: # all pm functions now apply on the model instance # true cases at begin of loaded data but we do not know the real number I_begin = pm.HalfCauchy(name="I_begin", beta=priors_dict["pr_beta_I_begin"]) E_begin_scale = pm.HalfCauchy( name="E_begin_scale", beta=priors_dict["pr_beta_E_begin_scale"] ) new_E_begin = pm.HalfCauchy("E_begin", beta=E_begin_scale, shape=9) # fraction of people that are newly infected each day lambda_list = [] lambda_list.append( pm.Lognormal( name="lambda_0", mu=np.log(priors_dict["pr_median_lambda_0"]), sigma=priors_dict["pr_sigma_lambda_0"], ) ) for i, cp in enumerate(change_points_list): lambda_list.append( pm.Lognormal( name="lambda_{}".format(i + 1), mu=np.log(cp["pr_median_lambda"]), sigma=cp["pr_sigma_lambda"], ) ) # set the start dates of the two periods tr_begin_list = [] dt_before = None for i, cp in enumerate(change_points_list): date_begin_transient = cp["pr_mean_date_begin_transient"] if dt_before is not None and dt_before > date_begin_transient: raise RuntimeError("Dates of change points are not temporally ordered") prior = (date_begin_transient - date_begin_simulation).days tr_begin = pm.Normal( name="transient_begin_{}".format(i), mu=prior, sigma=cp["pr_sigma_date_begin_transient"], ) tr_begin_list.append(tr_begin) dt_before = date_begin_transient # transient time tr_len_list = [] for i, cp in enumerate(change_points_list): transient_len = pm.Lognormal( name="transient_len_{}".format(i), mu=np.log(cp["pr_median_transient_len"]), sigma=cp["pr_sigma_transient_len"], ) tr_len_list.append(transient_len) # build the time-dependent spreading rate if with_random_walk: sigma_random_walk = pm.HalfNormal( name="sigma_random_walk", sigma=priors_dict["pr_sigma_random_walk"] ) lambda_t_random_walk = pm.distributions.timeseries.GaussianRandomWalk( name="lambda_t_random_walk", mu=0, sigma=sigma_random_walk, shape=num_days_sim, init=pm.Normal.dist(sigma=priors_dict["pr_sigma_random_walk"]), ) lambda_base = lambda_t_random_walk + lambda_list[0] else: lambda_base = lambda_list[0] * tt.ones(num_days_sim) lambda_t_list = [lambda_base] lambda_step_before = lambda_list[0] for tr_begin, transient_len, lambda_step in zip( tr_begin_list, tr_len_list, lambda_list[1:] ): lambda_t = mh.smooth_step_function( start_val=0, end_val=1, t_begin=tr_begin, t_end=tr_begin + transient_len, t_total=num_days_sim, ) * (lambda_step - lambda_step_before) lambda_step_before = lambda_step lambda_t_list.append(lambda_t) lambda_t = sum(lambda_t_list) # fraction of people that recover each day, recovery rate mu mu = pm.Lognormal( name="mu", mu=np.log(priors_dict["pr_median_mu"]), sigma=priors_dict["pr_sigma_mu"], ) # delay in days between contracting the disease and being recorded delay = pm.Lognormal( name="delay", mu=np.log(priors_dict["pr_median_delay"]), sigma=priors_dict["pr_sigma_delay"], ) # prior of the error of observed cases sigma_obs = pm.HalfCauchy( name="sigma_obs", beta=priors_dict["pr_beta_sigma_obs"] ) # -------------------------------------------------------------------------- # # training the model with loaded data provided as argument # -------------------------------------------------------------------------- # median_incubation = pm.Normal( name="median_incubation", mu=priors_dict["pr_mean_median_incubation"], sigma=priors_dict["pr_sigma_median_incubation"], ) # sources: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7014672/ # S_begin = N - I_begin S_t, new_E_t, I_t, new_I_t = _SEIR_model_with_delay( lambda_t=lambda_t, mu=mu, S_begin=S_begin, 
new_E_begin=new_E_begin, I_begin=I_begin, N=N, median_incubation=median_incubation, sigma_incubation=0.418, # https://www.ncbi.nlm.nih.gov/pubmed/32150748 ) new_cases_inferred = mh.delay_cases_lognormal( input_arr=new_I_t, len_input_arr=num_days_sim, len_output_arr=num_days_sim - diff_data_sim, median_delay=delay, scale_delay=priors_dict["scale_delay"], delay_betw_input_output=diff_data_sim, ) num_days_data = new_cases_obs.shape[-1] # likelihood of the model: # observed cases are distributed following studentT around the model. # we want to approximate a Poisson distribution of new cases. # we choose nu=4 to get heavy tails and robustness to outliers. # https://www.jstor.org/stable/2290063 pm.StudentT( name="_new_cases_studentT", nu=4, mu=new_cases_inferred[:num_days_data], sigma=tt.abs_(new_cases_inferred[:num_days_data] + 1) ** 0.5 * sigma_obs, # +1 and tt.abs to avoid nans observed=new_cases_obs, ) # add these observables to the model so we can extract a time series of them # later via e.g. `model.trace['lambda_t']` pm.Deterministic("lambda_t", lambda_t) pm.Deterministic("new_cases", new_cases_inferred) return model
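# A hypothetical usage sketch (not part of the original module): build the model
# for a single change point and sample from it. `new_cases_obs` and its source file
# are placeholders; all other values are illustrative and consistent with the
# docstring defaults above.
if __name__ == "__main__":
    import datetime
    import numpy as np
    import pymc3 as pm

    new_cases_obs = np.loadtxt("new_cases.txt")  # placeholder data source
    change_points = [dict(pr_mean_date_begin_transient=datetime.datetime(2020, 3, 9))]
    model = SEIR_with_extensions(
        new_cases_obs=new_cases_obs,
        change_points_list=change_points,
        date_begin_simulation=datetime.datetime(2020, 3, 1),
        num_days_sim=len(new_cases_obs) + 16,
        diff_data_sim=16,
        N=83e6,
    )
    with model:
        trace = pm.sample(draws=500, tune=500, init="advi+adapt_diag")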
    log_like1 = - 0.5 * n * tt.log(2 * np.pi) \
                - 0.5 * tt.log(dsigma) \
                - 0.5 * (x - mu1).T.dot(isigma).dot(x - mu1)
    log_like2 = - 0.5 * n * tt.log(2 * np.pi) \
                - 0.5 * tt.log(dsigma) \
                - 0.5 * (x - mu2).T.dot(isigma).dot(x - mu2)
    return tt.log(w1 * tt.exp(log_like1) + w2 * tt.exp(log_like2))

with pm.Model() as ATMIP_test:
    X = pm.Uniform('X',
                   shape=n,
                   lower=-2. * np.ones_like(mu1),
                   upper=2. * np.ones_like(mu1),
                   testval=-1. * np.ones_like(mu1),
                   transform=None)
    like = pm.Deterministic('like', two_gaussians(X))
    llk = pm.Potential('like', like)

with ATMIP_test:
    step = atmcmc.ATMCMC(n_chains=n_chains, tune_interval=tune_interval,
                         likelihood_name=ATMIP_test.deterministics[0].name)

trcs = atmcmc.ATMIP_sample(
    n_steps=n_steps,
    step=step,
    njobs=njobs,
    progressbar=True,
    trace=test_folder,
    model=ATMIP_test)

pm.summary(trcs)
with pm.Model() as unpooled_model:
    # define priors
    alpha = pm.HalfCauchy('alpha', 10, testval=.9)
    switch = pm.DiscreteUniform('switch', lower=x_shared.min() + 3, upper=x_shared.max() - 0.5)
    early_rate = pm.Normal('early_rate', 0, 100)
    late_rate = pm.Normal('late_rate', 0, 100)
    beta1 = pm.math.switch(x_shared <= switch, early_rate, late_rate)
    beta = pm.Normal('beta', 0, 100, shape=companiesABC)
    u = pm.Normal('u', 0, 0.0001)

    # mu = tt.exp(beta[companyABC] + beta1[companyABC]*elec_year + beta2*elec_tem)
    beta_mu = pm.Deterministic('beta_mu', tt.exp(beta[Num_shared] + beta1 * x_shared + u))

    # Observed_pred = pm.Weibull("Observed_pred", alpha=mu, beta=sigma, shape=elec_faults.shape)  # observed values
    Observed = pm.Weibull("Observed", alpha=alpha, beta=beta_mu, observed=train_faults)  # observed values

    start = pm.find_MAP()
    # step = pm.Slice([beta1, u])
    trace2 = pm.sample(3000, start=start, tune=1000)

chain2 = trace2[1000:]
varnames1 = ['alpha', 'beta_mu', 'switch']
print(pm.df_summary(trace2, varnames1))
varnames2 = ['beta', 'early_rate', 'late_rate', 'alpha', 'u']
# pm.plot_posterior(chain2, varnames2, ref_val=0)
print("Obs from Site B: ", observations_B[:30], "...") # In[25]: print(np.mean(observations_A)) print(np.mean(observations_B)) # In[26]: # Set up the pymc3 model. Again assume Uniform priors for p_A and p_B. with pm.Model() as model: p_A = pm.Uniform("p_A", 0, 1) p_B = pm.Uniform("p_B", 0, 1) # Define the deterministic delta function. This is our unknown of interest. delta = pm.Deterministic("delta", p_A - p_B) # Set of observations, in this case we have two observation datasets. obs_A = pm.Bernoulli("obs_A", p_A, observed=observations_A) obs_B = pm.Bernoulli("obs_B", p_B, observed=observations_B) # To be explained in chapter 3. step = pm.Metropolis() trace = pm.sample(20000, step=step) burned_trace = trace[1000:] # Below we plot the posterior distributions for the three unknowns: # In[27]: p_A_samples = burned_trace["p_A"]
def build_model(self): base_numbers = self.data.n_safe.unique() choices = self.data.chose_risky.values mean_safe = np.mean(np.log(base_numbers)) std_safe = np.std(np.log(base_numbers)) self.coords = { "subject": self.unique_subjects, "presentation": ['first', 'second'], } with pm.Model(coords=self.coords) as self.model: inputs = self._get_model_input() for key, value in inputs.items(): inputs[key] = pm.Data(key, value) # Hyperpriors for group nodes risky_prior_mu_mu = pm.HalfNormal("risky_prior_mu_mu", sigma=np.log(20.)) risky_prior_mu_sd = pm.HalfCauchy('risky_prior_mu_sd', .5) risky_prior_mu_offset = pm.Normal( 'risky_prior_mu_offset', mu=0, sd=1, dims='subject') #shape=n_subjects) risky_prior_mu = pm.Deterministic( 'risky_prior_mu', risky_prior_mu_mu + risky_prior_mu_sd * risky_prior_mu_offset, dims='subject') risky_prior_sd_mu = pm.HalfNormal("risky_prior_sd_mu", sigma=1.25) risky_prior_sd_sd = pm.HalfCauchy('risky_prior_sd_sd', .5) risky_prior_sd = pm.TruncatedNormal('risky_prior_sd', mu=risky_prior_sd_mu, sigma=risky_prior_sd_sd, lower=0, dims='subject') safe_prior_mu = mean_safe safe_prior_sd = std_safe # ix0 = first presented, ix1=later presented evidence_sd_mu = pm.HalfNormal("evidence_sd_mu", sigma=1., dims=('presentation')) evidence_sd_sd = pm.HalfCauchy("evidence_sd_sd", 1., dims=('presentation')) evidence_sd = pm.TruncatedNormal('evidence_sd', mu=evidence_sd_mu, sigma=evidence_sd_sd, lower=0, dims=('subject', 'presentation')) post_risky_mu, post_risky_sd = get_posterior( risky_prior_mu[inputs['subject_ix']], risky_prior_sd[inputs['subject_ix']], inputs['risky_mu'], evidence_sd[inputs['subject_ix'], inputs['risky_ix']]) post_safe_mu, post_safe_sd = get_posterior( safe_prior_mu, safe_prior_sd, inputs['safe_mu'], evidence_sd[inputs['subject_ix'], inputs['safe_ix']]) diff_mu, diff_sd = get_diff_dist(post_risky_mu, post_risky_sd, post_safe_mu, post_safe_sd) p = pm.Deterministic( 'p', cumulative_normal(tt.log(.55), diff_mu, diff_sd)) ll = pm.Bernoulli('ll_bernoulli', p=p, observed=choices)
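    # Hypothetical helper (an assumption, not part of the original class): once
    # build_model() has been called, the posterior could be sampled like this.
    def sample_model(self, draws=1000, tune=1000):
        with self.model:
            return pm.sample(draws=draws, tune=tune)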
def recreate_mod(self):
    '''
    Rebuild the PyMC3/exoplanet transit + rotation-GP model, initializing
    all priors from the stored optimization solution in ``self.soln``.
    When ``self.do_even_odd`` is True, separate transit parameters are fit
    for the even and odd transits.
    '''
    with pm.Model() as self.model:

        # Parameters for the stellar properties
        mean = pm.Normal("mean", mu=self.soln['mean'], sd=10.0)
        u_star = xo.distributions.QuadLimbDark("u_star")

        # Stellar parameters from Huang et al (2018)
        M_star_huang = 1.094, 0.039
        R_star_huang = 1.10, 0.023
        BoundedNormal = pm.Bound(pm.Normal, lower=0, upper=3)

        if not self.do_even_odd:
            logP = pm.Normal("logP", mu=self.soln['logP'], sd=1)
            t0 = pm.Normal("t0", mu=self.soln['t0'], sd=1)
            period = pm.Deterministic("period", tt.exp(logP))

            m_star = BoundedNormal("m_star", mu=self.soln['m_star'], sd=M_star_huang[1])
            r_star = BoundedNormal("r_star", mu=self.soln['r_star'], sd=R_star_huang[1])

            b = pm.Uniform("b", lower=0, upper=0.9, testval=self.soln['b'])

            BoundedNormal_logr = pm.Bound(pm.Normal, lower=-5, upper=0)
            logr = BoundedNormal_logr('logr', mu=self.soln['logr'], sd=1.0)
            r_pl = pm.Deterministic("r_pl", tt.exp(logr))
            ror = pm.Deterministic("ror", r_pl / r_star)

            BoundedBeta = pm.Bound(pm.Beta, lower=0, upper=1 - 1e-5)
            ecc = BoundedBeta("ecc", alpha=0.867, beta=3.03, testval=self.soln['ecc'])
            omega = xo.distributions.Angle("omega")

        # Even-Odd Test
        else:
            logP_even = pm.Normal("logP_even", mu=self.soln['logP_even'], sd=1)
            t0_even = pm.Normal("t0_even", mu=self.soln['t0_even'], sd=1)
            period_even = pm.Deterministic("period_even", tt.exp(logP_even))

            m_star_even = BoundedNormal("m_star_even", mu=self.soln['m_star_even'], sd=M_star_huang[1])
            r_star_even = BoundedNormal("r_star_even", mu=self.soln['r_star_even'], sd=R_star_huang[1])

            b_even = pm.Uniform("b_even", lower=0, upper=0.9, testval=self.soln['b_even'])

            BoundedNormal_logr = pm.Bound(pm.Normal, lower=-5, upper=0)
            logr_even = BoundedNormal_logr('logr_even', mu=self.soln['logr_even'], sd=1.0)
            r_pl_even = pm.Deterministic("r_pl_even", tt.exp(logr_even))
            ror_even = pm.Deterministic("ror_even", r_pl_even / r_star_even)

            BoundedBeta = pm.Bound(pm.Beta, lower=0, upper=1 - 1e-5)
            ecc_even = BoundedBeta("ecc_even", alpha=0.867, beta=3.03, testval=self.soln['ecc_even'])
            omega_even = xo.distributions.Angle("omega_even")

            logP_odd = pm.Normal("logP_odd", mu=self.soln['logP_odd'], sd=1)
            t0_odd = pm.Normal("t0_odd", mu=self.soln['t0_odd'], sd=1)
            period_odd = pm.Deterministic("period_odd", tt.exp(logP_odd))

            m_star_odd = BoundedNormal("m_star_odd", mu=self.soln['m_star_odd'], sd=M_star_huang[1])
            r_star_odd = BoundedNormal("r_star_odd", mu=self.soln['r_star_odd'], sd=R_star_huang[1])

            b_odd = pm.Uniform("b_odd", lower=0, upper=0.9, testval=self.soln['b_odd'])

            logr_odd = BoundedNormal_logr('logr_odd', mu=self.soln['logr_odd'], sd=1.0)
            r_pl_odd = pm.Deterministic("r_pl_odd", tt.exp(logr_odd))
            ror_odd = pm.Deterministic("ror_odd", r_pl_odd / r_star_odd)

            ecc_odd = BoundedBeta("ecc_odd", alpha=0.867, beta=3.03, testval=self.soln['ecc_odd'])
            omega_odd = xo.distributions.Angle("omega_odd")

        # The parameters of the RotationTerm kernel
        logamp = pm.Normal("logamp", mu=self.soln['logamp'], sd=5.0)
        logrotperiod = pm.Normal("logrotperiod", mu=self.soln['logrotperiod'], sd=5.0)
        logQ0 = pm.Normal("logQ0", mu=self.soln['logQ0'], sd=10.0)
        logdeltaQ = pm.Normal("logdeltaQ", mu=self.soln['logdeltaQ'], sd=10.0)
        mix = pm.Uniform("mix", lower=0, upper=1.0, testval=self.soln['mix'])

        # Transit jitter & GP parameters
        logs2 = pm.Normal("logs2", mu=self.soln['logs2'], sd=5.0)

        # Track the rotation period as a deterministic
        rotperiod = pm.Deterministic("rotation_period", tt.exp(logrotperiod))

        # GP model for the light curve
        kernel = xo.gp.terms.RotationTerm(log_amp=logamp, period=rotperiod,
                                          log_Q0=logQ0, log_deltaQ=logdeltaQ, mix=mix)
        gp = xo.gp.GP(kernel, self.time[self.mask],
                      (self.flux_err[self.mask])**2 + tt.exp(logs2), J=4)

        if not self.do_even_odd:
            # Orbit model
            orbit = xo.orbits.KeplerianOrbit(r_star=r_star, m_star=m_star,
                                             period=period, t0=t0, b=b,
                                             ecc=ecc, omega=omega)

            light_curves = xo.StarryLightCurve(u_star).get_light_curve(
                orbit=orbit, r=r_pl, t=self.time[self.mask], texp=0.021)
            light_curve = pm.math.sum(light_curves, axis=-1)
            pm.Deterministic("light_curves", light_curves)

            # Compute the Gaussian Process likelihood and add it into the
            # PyMC3 model as a "potential"
            pm.Potential("loglike",
                         gp.log_likelihood(self.flux[self.mask] - mean - light_curve))

            # Compute the mean model prediction for plotting purposes
            pm.Deterministic("pred", gp.predict())
            pm.Deterministic("loglikelihood",
                             gp.log_likelihood(self.flux[self.mask] - mean - light_curve))

        else:
            orbit_even = xo.orbits.KeplerianOrbit(r_star=r_star_even, m_star=m_star_even,
                                                  period=period_even, t0=t0_even, b=b_even,
                                                  ecc=ecc_even, omega=omega_even)
            orbit_odd = xo.orbits.KeplerianOrbit(r_star=r_star_odd, m_star=m_star_odd,
                                                 period=period_odd, t0=t0_odd, b=b_odd,
                                                 ecc=ecc_odd, omega=omega_odd)

            light_curves_even = xo.StarryLightCurve(u_star).get_light_curve(
                orbit=orbit_even, r=r_pl_even, t=self.time[self.mask], texp=0.021)
            light_curves_odd = xo.StarryLightCurve(u_star).get_light_curve(
                orbit=orbit_odd, r=r_pl_odd, t=self.time[self.mask], texp=0.021)

            light_curve_even = pm.math.sum(light_curves_even, axis=-1)
            light_curve_odd = pm.math.sum(light_curves_odd, axis=-1)
            pm.Deterministic("light_curves_even", light_curves_even)
            pm.Deterministic("light_curves_odd", light_curves_odd)

            # Compute the Gaussian Process likelihood and add it into the
            # PyMC3 model as a "potential"
            pm.Potential("loglike", gp.log_likelihood(
                self.flux[self.mask] - mean - (light_curve_even + light_curve_odd)))

            # Compute the mean model prediction for plotting purposes
            pm.Deterministic("pred", gp.predict())
            pm.Deterministic("loglikelihood", gp.log_likelihood(
                self.flux[self.mask] - mean - (light_curve_even + light_curve_odd)))
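# --- Hypothetical usage sketch (not part of the original class) ---
# After recreate_mod() rebuilds the model around self.soln, one would
# typically re-optimize and then sample. The instance name `fitter` and the
# sampler settings are assumptions; xo.optimize and xo.get_dense_nuts_step
# are exoplanet helper functions.
import exoplanet as xo
import pymc3 as pm

fitter.recreate_mod()                # fitter: an instance of the class above
with fitter.model:
    # refine the starting point around the stored solution ...
    map_soln = xo.optimize(start=fitter.soln)
    # ... then sample; a dense mass matrix helps the correlated transit + GP parameters
    trace = pm.sample(tune=1000, draws=1000, start=map_soln,
                      step=xo.get_dense_nuts_step(target_accept=0.9))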
# This reparameterization is implemented in *exoplanet* as the custom *PyMC3* distribution :class:`exoplanet.distributions.QuadLimbDark`.

# +
import pymc3 as pm

with pm.Model() as model:

    # The baseline flux
    mean = pm.Normal("mean", mu=0.0, sd=1.0)

    # The time of a reference transit for each planet
    t0 = pm.Normal("t0", mu=t0s, sd=1.0, shape=2)

    # The log period; also tracking the period itself
    logP = pm.Normal("logP", mu=np.log(periods), sd=0.1, shape=2)
    period = pm.Deterministic("period", pm.math.exp(logP))

    # The Kipping (2013) parameterization for quadratic limb darkening parameters
    u = xo.distributions.QuadLimbDark("u", testval=np.array([0.3, 0.2]))

    r = pm.Uniform("r", lower=0.01, upper=0.1, shape=2,
                   testval=np.array([0.04, 0.06]))
    b = xo.distributions.ImpactParameter("b", ror=r, shape=2,
                                         testval=np.random.rand(2))

    # Set up a Keplerian orbit for the planets
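    # --- Hypothetical continuation (not from the original notebook) ---
    # A minimal sketch of how this example usually proceeds: build the orbit
    # and a limb-darkened light-curve model, then condition on the observed
    # flux. The time grid `t`, exposure time `texp`, observations `y`, and
    # uncertainties `yerr` are assumed to be defined earlier in the notebook;
    # the light-curve class is xo.LimbDarkLightCurve in recent exoplanet
    # releases (xo.StarryLightCurve in older ones).
    orbit = xo.orbits.KeplerianOrbit(period=period, t0=t0, b=b)

    light_curves = xo.LimbDarkLightCurve(u).get_light_curve(
        orbit=orbit, r=r, t=t, texp=texp)
    light_curve = pm.math.sum(light_curves, axis=-1)

    # Gaussian likelihood against the observed flux
    pm.Normal("obs", mu=mean + light_curve, sd=yerr, observed=y)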
from theano import shared

Bx = basis_funcs(elec_year)  # spline-basis (interpolation) values evaluated at each x (elec_year)
# `shared` wraps the array as a Theano symbolic (shared) variable: the same
# value is visible to every compiled function, i.e. it is shared between them.
Bx_ = shared(Bx)

# Spline model
with pm.Model() as partial_model:
    # define priors
    sigma = pm.HalfCauchy('sigma', 5)

    σ_a = pm.HalfCauchy('σ_a', 5.)
    a0 = pm.Normal('a0', 0., 10.)
    Δ_a = pm.Normal('Δ_a', 0., 10., shape=Num_5)
    δ_1 = pm.Gamma('δ_1', alpha=5, beta=1)
    δ = pm.Normal('δ', 0, sd=(δ_1 * δ_1))
    # δ = pm.Normal('δ', 0, sd=100)  # use this line for δ instead if the model converges poorly
    theta1 = pm.Deterministic('theta1', a0 + (σ_a * Δ_a).cumsum())
    # theta1 = a0 + (σ_a * Δ_a).cumsum()

    theta = Bx_.dot(theta1) + δ
    Observed = pm.Normal('Observed', mu=theta, sd=sigma,
                         observed=elec_faults_miss)  # observed values

    start = pm.find_MAP()
    # step = pm.Metropolis()
    # trace2 = pm.sample(nuts_kwargs={'target_accept': 0.95})
    trace2 = pm.sample(3000, tune=1000, start=start)

chain2 = trace2
varnames1 = ['σ_a', 'a0', 'Δ_a', 'δ', 'theta1']
pm.traceplot(chain2, varnames1)
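# --- Hypothetical follow-up checks (not in the original script) ---
# Summarize convergence of the spline parameters and draw posterior-predictive
# samples to compare against the observed data. pm.sample_posterior_predictive
# follows the PyMC3 3.x API; older releases expose the same functionality as
# pm.sample_ppc.
print(pm.summary(trace2, varnames1))

with partial_model:
    # 500 posterior-predictive draws of the observed node
    ppc = pm.sample_posterior_predictive(trace2, samples=500)

# mean predicted value per observation, for comparison with elec_faults_miss
ppc_mean = ppc['Observed'].mean(axis=0)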