import typing

import pandas as pd
from IPython.utils import io
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling
from tqdm import notebook


def generate_time_series(
    sampler: BayesianModelSampling,
    length: int,
    labels: typing.List[str],
    seed: int = 42,
):
    # Initialize progress bar
    pbar = notebook.tqdm(total=length)

    # Generate the first sample given no evidence
    with io.capture_output():
        # When no evidence is provided, rejection sampling reduces to
        # forward sampling under the hood
        sample = sampler.rejection_sample(seed=seed)
    sample = sample.reindex(sorted(sample.columns), axis=1)

    # Split the sample into 'current' and 'next' slices:
    # - the 'current' slice becomes the first row of the generated time series
    # - the 'next' slice is added as the second row and is used as
    #   evidence for subsequent predictions
    df_synth = sample.filter(regex="_T$")
    next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    df_synth = pd.concat(
        [df_synth, pd.DataFrame([next_slice], columns=df_synth.columns)],
        ignore_index=True,
    )
    evidence = [
        State(n, v) for n, v in zip(df_synth.columns.values, next_slice)
    ]

    # Update progress bar
    pbar.update(2)

    for _ in range(2, length):
        # Generate new data
        with io.capture_output():
            sample = sampler.rejection_sample(evidence=evidence)
        sample = sample.reindex(sorted(sample.columns), axis=1)

        # Append the 'next' slice to the generated time series and use it
        # as the new evidence
        next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
        df_synth = pd.concat(
            [df_synth, pd.DataFrame([next_slice], columns=df_synth.columns)],
            ignore_index=True,
        )
        evidence = [
            State(n, v) for n, v in zip(df_synth.columns.values, next_slice)
        ]

        # Update progress bar
        pbar.update(1)

    # Close progress bar
    pbar.close()

    # Rename columns to the caller-supplied labels
    df_synth.columns = labels
    return df_synth
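# Hedged usage sketch (not from the original source): generate_time_series
# expects a two-slice model whose node names end in "_T" and "_T+1". The
# tiny weather model below, and all of its names and CPD values, are
# illustrative assumptions, not the original model.
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel

weather_2tbn = BayesianModel([("Rain_T", "Rain_T+1")])
weather_2tbn.add_cpds(
    TabularCPD("Rain_T", 2, [[0.7], [0.3]]),
    TabularCPD("Rain_T+1", 2, [[0.9, 0.4], [0.1, 0.6]],
               evidence=["Rain_T"], evidence_card=[2]),
)
sampler = BayesianModelSampling(weather_2tbn)
# One label per "_T" column; requires a Jupyter environment for notebook.tqdm
series = generate_time_series(sampler, length=50, labels=["Rain"])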
import matplotlib.pyplot as plt
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling


def rejection_estimate(n):
    inference = BayesianModelSampling(disease_model)
    evidence = [
        State(var='Fatigue', state=0),
        State(var='Fever', state=0),
        State(var='FluShot', state=0),
    ]
    samples = inference.rejection_sample(evidence, n)
    hits = 0
    for t in range(n):
        if samples['Flu'][t] == 0:
            hits += 1
        # Plot the running estimate of P(Flu=0 | evidence) after t+1 samples;
        # dividing by t+1 (not n) makes each point a valid estimate
        plt.plot(t, hits / (t + 1), 'bo')
    plt.ylabel('Evolving estimate')
    plt.xlabel('Number of samples')
    plt.show()
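# Hedged sketch of the `disease_model` global the function above expects;
# the structure and CPD values here are illustrative assumptions, not the
# original model.
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel

disease_model = BayesianModel([("FluShot", "Flu"), ("Flu", "Fever"),
                               ("Flu", "Fatigue")])
disease_model.add_cpds(
    TabularCPD("FluShot", 2, [[0.6], [0.4]]),
    TabularCPD("Flu", 2, [[0.9, 0.6], [0.1, 0.4]],
               evidence=["FluShot"], evidence_card=[2]),
    TabularCPD("Fever", 2, [[0.8, 0.3], [0.2, 0.7]],
               evidence=["Flu"], evidence_card=[2]),
    TabularCPD("Fatigue", 2, [[0.7, 0.2], [0.3, 0.8]],
               evidence=["Flu"], evidence_card=[2]),
)
rejection_estimate(500)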
import helpers  # project-local module providing load_from_pickle

from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling


def sample_slots(model_info_file, mr_slot_names):
    model_info = helpers.load_from_pickle(model_info_file)
    model = model_info['model']
    inference = BayesianModelSampling(model)

    # Use the missing mr slots as evidence
    all_slots = model_info['all_slots']
    missing_slots = [mr for mr in all_slots if mr not in mr_slot_names]
    evidence = [State(mr, 0) for mr in missing_slots]

    # Don't allow empty samples
    sampled_slots = []
    while not sampled_slots:
        sample = inference.rejection_sample(evidence=evidence, size=1,
                                            return_type='recarray')
        # Return a list of the column names which had presence
        sampled_slots = [
            name for var, name in zip(sample.view('<i8'), sample.dtype.names)
            if var == 1
        ]
    return sampled_slots
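# Hypothetical usage (the pickle path and slot names below are assumptions):
# every slot in the model that is NOT in `mr_slot_names` is clamped to 0,
# and sampling repeats until at least one remaining slot comes up 1.
present_slots = sample_slots('models/mr_model_info.pkl', ['name', 'food'])
print(present_slots)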
from pgmpy.factors.discrete import State
from pgmpy.inference import CausalInference

corr_mat = simulated_sample.corr()
# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# Example: if we condition on "child_screen_time"...
# ...then "child_physical_activity" becomes independent of "parent_education":
corr_mat = simulated_sample.query("child_screen_time==1").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

corr_mat = simulated_sample.query("child_screen_time==0").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# Suppose that we are interested in measuring the average causal effect of
# "child_screen_time" on "child_obesity". We can estimate this by simulating
# from the system:
simulated_sample_lowScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="low")], size=10_000)
simulated_sample_highScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="high")], size=10_000)

# The observed effect of high screen time on the probability of high child
# obesity is the ratio of the two conditional frequencies:
((simulated_sample_highScreentime["child_obesity"] == "high").sum() /
 len(simulated_sample_highScreentime)) / (
     (simulated_sample_lowScreentime["child_obesity"] == "high").sum() /
     len(simulated_sample_lowScreentime))
# i.e. around 2x

infer_adjusted = CausalInference(pg_model)
print(
    infer_adjusted.query(variables=["child_obesity"],
                         do={"child_screen_time": "high"}))

# We can also estimate this effect from the observed data using a logistic
# regression model:
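# A minimal sketch of that regression, assuming statsmodels is available and
# using `simulated_sample` as a stand-in for the observed data; the binary
# encoding of the string-valued columns below is an assumption.
import statsmodels.api as sm

df = simulated_sample.copy()
df["high_screen"] = (df["child_screen_time"] == "high").astype(int)
df["high_obesity"] = (df["child_obesity"] == "high").astype(int)
X = sm.add_constant(df[["high_screen"]])
logit_fit = sm.Logit(df["high_obesity"], X).fit()
# The coefficient on high_screen is a log-odds effect, so its exponential is
# an odds ratio rather than the ~2x risk ratio computed above
print(logit_fit.summary())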
import unittest
from unittest.mock import patch

from pgmpy.factors.discrete import State, TabularCPD
from pgmpy.models import BayesianModel, MarkovModel
from pgmpy.sampling import BayesianModelSampling


class TestBayesianModelSampling(unittest.TestCase):
    def setUp(self):
        self.bayesian_model = BayesianModel([('A', 'J'), ('R', 'J'),
                                             ('J', 'Q'), ('J', 'L'),
                                             ('G', 'L')])
        cpd_a = TabularCPD('A', 2, [[0.2], [0.8]])
        cpd_r = TabularCPD('R', 2, [[0.4], [0.6]])
        cpd_j = TabularCPD('J', 2,
                           [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]],
                           ['R', 'A'], [2, 2])
        cpd_q = TabularCPD('Q', 2, [[0.9, 0.2], [0.1, 0.8]], ['J'], [2])
        cpd_l = TabularCPD('L', 2,
                           [[0.9, 0.45, 0.8, 0.1], [0.1, 0.55, 0.2, 0.9]],
                           ['G', 'J'], [2, 2])
        cpd_g = TabularCPD('G', 2, [[0.6], [0.4]])
        self.bayesian_model.add_cpds(cpd_a, cpd_g, cpd_j, cpd_l, cpd_q, cpd_r)
        self.sampling_inference = BayesianModelSampling(self.bayesian_model)
        self.markov_model = MarkovModel()

    def test_init(self):
        with self.assertRaises(TypeError):
            BayesianModelSampling(self.markov_model)

    def test_forward_sample(self):
        sample = self.sampling_inference.forward_sample(25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        for column in ['A', 'J', 'R', 'Q', 'G', 'L']:
            self.assertIn(column, sample.columns)
            self.assertTrue(set(sample[column]).issubset({0, 1}))

    def test_rejection_sample_basic(self):
        sample = self.sampling_inference.rejection_sample(
            [State('A', 1), State('J', 1), State('R', 1)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        for column in ['A', 'J', 'R', 'Q', 'G', 'L']:
            self.assertIn(column, sample.columns)
        # Evidence variables must be fixed to their observed state
        for column in ['A', 'J', 'R']:
            self.assertTrue(set(sample[column]).issubset({1}))
        for column in ['Q', 'G', 'L']:
            self.assertTrue(set(sample[column]).issubset({0, 1}))

    @patch("pgmpy.sampling.BayesianModelSampling.forward_sample",
           autospec=True)
    def test_rejection_sample_less_arg(self, forward_sample):
        # With no evidence, rejection sampling degenerates to forward sampling
        sample = self.sampling_inference.rejection_sample(size=5)
        forward_sample.assert_called_once_with(self.sampling_inference, 5)
        self.assertEqual(sample, forward_sample.return_value)

    def test_likelihood_weighted_sample(self):
        sample = self.sampling_inference.likelihood_weighted_sample(
            [State('A', 0), State('J', 1), State('R', 0)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 7)
        self.assertIn('_weight', sample.columns)
        for column in ['A', 'J', 'R', 'Q', 'G', 'L']:
            self.assertIn(column, sample.columns)
            self.assertTrue(set(sample[column]).issubset({0, 1}))

    def tearDown(self):
        del self.sampling_inference
        del self.bayesian_model
        del self.markov_model
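# Standard unittest entry point, so the suite can be run as a script:
if __name__ == '__main__':
    unittest.main()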
from pgmpy.estimators import BayesianEstimator
from pgmpy.factors.discrete import State, TabularCPD
from pgmpy.inference import VariableElimination
from pgmpy.models import BayesianModel
from pgmpy.sampling import BayesianModelSampling


def bayesian_net():
    alarm_model = BayesianModel([
        ('Burglary', 'Alarm'),  # Alarm has two parents, so it appears twice as a child.
        ('Earthquake', 'Alarm'),
        ('Alarm', 'JohnCalls'),
        ('Alarm', 'MaryCalls')
    ])
    for i in alarm_model.get_parents('Alarm'):
        print(i)

    # variable_card indicates the number of possible values this variable can take.
    cpd_burglary = TabularCPD(
        variable='Burglary',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.001],  # 'true' row of the table
            [0.999],  # 'false' row of the table
        ])
    cpd_earthquake = TabularCPD(
        variable='Earthquake',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.002],  # 'true' row of the table
            [0.998],  # 'false' row of the table
        ])
    # evidence_card indicates the number of possible values the parents of
    # the variable can take.
    cpd_alarm = TabularCPD(
        variable='Alarm',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.95, 0.94, 0.29, 0.001],  # 'true' row of the table
            [0.05, 0.06, 0.71, 0.999],  # 'false' row of the table
        ],
        evidence=['Burglary', 'Earthquake'],
        evidence_card=[2, 2])
    cpd_john_calls = TabularCPD(
        variable='JohnCalls',
        variable_card=2,  # 0->True, 1->False
        values=[[0.95, 0.05], [0.05, 0.95]],
        evidence=['Alarm'],
        evidence_card=[2])
    cpd_mary_calls = TabularCPD(
        variable='MaryCalls',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.7, 0.1],  # 'true' row of the table
            [0.3, 0.9],  # 'false' row of the table
        ],
        evidence=['Alarm'],
        evidence_card=[2])

    for i in [cpd_burglary, cpd_earthquake, cpd_alarm, cpd_john_calls,
              cpd_mary_calls]:
        print(i)

    alarm_model.add_cpds(cpd_burglary, cpd_earthquake, cpd_alarm,
                         cpd_john_calls, cpd_mary_calls)
    alarm_model.check_model()

    infer = VariableElimination(alarm_model)
    # Uncomment to obtain the result before normalization
    # infer = SimpleInference(alarm_model)
    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])
    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])
    # The elimination order can be specified if necessary
    print(
        infer.query(['Burglary'],
                    evidence={
                        'JohnCalls': 0,
                        'MaryCalls': 0
                    },
                    elimination_order=['Alarm', 'Earthquake'])['Burglary'])

    sampling = BayesianModelSampling(alarm_model)
    # With no evidence, rejection sampling reduces to forward sampling
    data = sampling.rejection_sample(evidence=[], size=20,
                                     return_type="dataframe")
    print(data)
    data = sampling.rejection_sample(
        evidence=[State('JohnCalls', 0), State('MaryCalls', 0)],
        size=20,
        return_type='dataframe')
    print(data)

    sampling = BayesianModelSampling(alarm_model)
    data = sampling.rejection_sample(evidence=[], size=5000,
                                     return_type="dataframe")
    approx_alarm_model = BayesianModel([('Burglary', 'Alarm'),
                                        ('Earthquake', 'Alarm'),
                                        ('Alarm', 'JohnCalls'),
                                        ('Alarm', 'MaryCalls')])
    approx_alarm_model.fit(data, estimator=BayesianEstimator)
    approx_alarm_model.check_model()
    for cpd in approx_alarm_model.get_cpds():
        print("CPD of {variable}:".format(variable=cpd.variable))
        print(cpd)

    infer = VariableElimination(approx_alarm_model)
    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])
    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])
    print(
        alarm_model.predict_probability(
            data[['Burglary', 'Earthquake', 'Alarm', 'JohnCalls']]))
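# Simple entry point so the demo above can be run as a script (assumption:
# this module is executed directly rather than imported).
if __name__ == '__main__':
    bayesian_net()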