def test_set_start_state_list(self, check_state):
    """A list start state is stored in the order of the model's variables."""
    model = MC(['b', 'a'], [1, 2])
    check_state.return_value = True

    # The states are passed in the opposite order to the model's variables.
    model.set_start_state([State('a', 0), State('b', 1)])

    expected_state = [State('b', 1), State('a', 0)]
    check_state.assert_called_once_with(model, expected_state)
    self.assertEqual(model.state, expected_state)
def test_generate_sample_less_arg(self, random_state, sample_discrete):
    """generate_sample(size=2) with mocked randomness yields two fixed samples."""
    model = MC(['a', 'b'], [2, 2])
    model.transition_models['a'] = {
        0: {0: 0.1, 1: 0.9},
        1: {0: 0.2, 1: 0.8},
    }
    model.transition_models['b'] = {
        0: {0: 0.3, 1: 0.7},
        1: {0: 0.4, 1: 0.6},
    }
    random_state.return_value = [State('a', 0), State('b', 1)]
    # One mocked draw per variable per step: 'a' -> 1, 'b' -> 0, twice over.
    sample_discrete.side_effect = [[1], [0]] * 2

    samples = list(model.generate_sample(size=2))

    expected_samples = [[State('a', 1), State('b', 0)]] * 2
    self.assertEqual(samples, expected_samples)
def test_sample(self):
    """sample() returns a two-row frame whose rows are valid binary states."""
    model = MC(['a', 'b'], [2, 2])
    model.transition_models['a'] = {
        0: {0: 0.1, 1: 0.9},
        1: {0: 0.2, 1: 0.8},
    }
    model.transition_models['b'] = {
        0: {0: 0.3, 1: 0.7},
        1: {0: 0.4, 1: 0.6},
    }

    sample = model.sample(start_state=[State('a', 0), State('b', 1)], size=2)

    self.assertEqual(len(sample), 2)
    self.assertEqual(list(sample.columns), ['a', 'b'])
    # Every sampled row must be one of the four possible joint assignments.
    valid_rows = [[0, 0], [0, 1], [1, 0], [1, 1]]
    for row_label in (0, 1):
        self.assertIn(list(sample.loc[row_label]), valid_rows)
def test_sample_less_arg(self, random_state):
    """sample() without a start state obtains one from random_state."""
    model = MC(['a', 'b'], [2, 2])
    random_state.return_value = [State('a', 0), State('b', 1)]

    sample = model.sample(size=1)

    random_state.assert_called_once_with(model)
    self.assertEqual(model.state, random_state.return_value)
    # With size=1 the only row is the (mocked) initial state itself.
    self.assertEqual(len(sample), 1)
    self.assertEqual(list(sample.columns), ['a', 'b'])
    first_row = list(sample.loc[0])
    self.assertEqual(first_row, [0, 1])
def test_sample_less_arg(self, random_state):
    """Gibbs sample() with no state set falls back to random_state once."""
    self.gibbs.state = None
    initial_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
    random_state.return_value = initial_state

    sample = self.gibbs.sample(size=2)

    random_state.assert_called_once_with(self.gibbs)
    self.assertEqual(len(sample), 2)
def test_sample(self):
    """Gibbs sample() returns two rows over diff/intel/grade with valid states."""
    start_state = [State("diff", 0), State("intel", 0), State("grade", 0)]
    sample = self.gibbs.sample(start_state, 2)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 2)
    self.assertEqual(len(sample.columns), 3)
    for column in ("diff", "intel", "grade"):
        self.assertIn(column, sample.columns)
    self.assertTrue(set(sample["diff"]).issubset({0, 1}))
    self.assertTrue(set(sample["intel"]).issubset({0, 1}))
    self.assertTrue(set(sample["grade"]).issubset({0, 1, 2}))
def test_generate_sample(self):
    """generate_sample yields two samples, each covering all three variables."""
    start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]

    samples = list(self.gibbs.generate_sample(start_state, 2))

    self.assertEqual(len(samples), 2)
    expected_vars = {'diff', 'intel', 'grade'}
    for sample in samples:
        # Only the first three entries of each sample are inspected.
        seen_vars = {sample[pos].var for pos in range(3)}
        self.assertEqual(seen_vars, expected_vars)
def test_sample(self):
    """Gibbs sample() returns two rows over diff/intel/grade with valid states."""
    start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
    sample = self.gibbs.sample(start_state, 2)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 2)
    self.assertEqual(len(sample.columns), 3)
    for column in ('diff', 'intel', 'grade'):
        self.assertIn(column, sample.columns)
    self.assertTrue(set(sample['diff']).issubset({0, 1}))
    self.assertTrue(set(sample['intel']).issubset({0, 1}))
    self.assertTrue(set(sample['grade']).issubset({0, 1, 2}))
def setUp(self):
    """Create the variables, transition models, start state and sample fixtures."""

    def as_transition_model(rows):
        # Row i, column j of `rows` becomes the entry {i: {j: value}}.
        return {src: dict(enumerate(row)) for src, row in enumerate(rows)}

    intel_rows = [[0.1, 0.25, 0.65], [0.5, 0.3, 0.2], [0.3, 0.3, 0.4]]
    diff_rows = [[0.3, 0.7], [0.75, 0.25]]
    grade_rows = [[0.4, 0.2, 0.4], [0.9, 0.05, 0.05], [0.1, 0.4, 0.5]]

    self.variables = ['intel', 'diff', 'grade']
    self.card = [3, 2, 3]
    self.cardinalities = dict(zip(self.variables, self.card))

    self.intel_tm = as_transition_model(intel_rows)
    self.intel_tm_matrix = np.array(intel_rows)
    self.diff_tm = as_transition_model(diff_rows)
    self.diff_tm_matrix = np.array(diff_rows)
    self.grade_tm = as_transition_model(grade_rows)
    # Unlike the other two matrices, this one is kept as a plain nested list.
    self.grade_tm_matrix = grade_rows

    self.start_state = [
        State('intel', 0),
        State('diff', 1),
        State('grade', 2),
    ]
    self.model = MC()

    # 200-row frame: 'a' is 1 for the first half, 'b' is 1 for the second half.
    self.sample = DataFrame(index=range(200), columns=['a', 'b'])
    self.sample['a'] = [1] * 100 + [0] * 100
    self.sample['b'] = [0] * 100 + [1] * 100
def generate_time_series(
    sampler: BayesianModelSampling,
    length: int,
    labels: typing.List[str],
    seed: int = 42,
):
    """Generate a synthetic time series by repeated rejection sampling.

    The first sample is drawn without evidence and split into its "_T"
    (current) and "_T+1" (next) columns, which become the first two rows.
    Every further row is the "_T+1" slice of a new sample drawn with the
    previous "next" slice as evidence.

    Parameters
    ----------
    sampler : BayesianModelSampling
        Sampler whose ``rejection_sample`` result has columns ending in "_T"
        and columns containing "_T+1".
    length : int
        Progress-bar total and target number of rows. The result always has
        at least two rows, because the first sample contributes two.
    labels : list of str
        Column names assigned to the returned frame.
    seed : int, default 42
        Seed passed to the first (evidence-free) ``rejection_sample`` call
        only; the later calls are not seeded.

    Returns
    -------
    pandas.DataFrame
        The generated series with ``labels`` as its columns.
    """
    # The context manager closes the progress bar even if sampling raises.
    with notebook.tqdm(total=length) as pbar:
        # Generate the first sample given no evidence, silencing its output.
        with io.capture_output():
            sample = sampler.rejection_sample(seed=seed)
        sample = sample.reindex(sorted(sample.columns), axis=1)

        # 'current' slice: first row(s) of the series. 'next' slice: second
        # row, and the evidence for the following prediction.
        first_slice = sample.filter(regex=r"_T$")
        columns = first_slice.columns
        next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
        rows = [next_slice]
        evidence = [State(n, v) for n, v in zip(columns.values, next_slice)]
        pbar.update(2)

        for _ in range(2, length):
            with io.capture_output():
                sample = sampler.rejection_sample(evidence=evidence)
            sample = sample.reindex(sorted(sample.columns), axis=1)

            # Record the 'next' slice and use it as the new evidence.
            next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
            rows.append(next_slice)
            evidence = [
                State(n, v) for n, v in zip(columns.values, next_slice)
            ]
            pbar.update(1)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every iteration.
    df_synth = pd.concat(
        [first_slice, pd.DataFrame(rows, columns=columns)],
        ignore_index=True,
    )
    df_synth.columns = labels
    return df_synth
def test_generate_sample(self):
    """generate_sample yields two samples, each covering all three variables."""
    start_state = [State("diff", 0), State("intel", 0), State("grade", 0)]

    samples = list(self.gibbs.generate_sample(start_state, 2))

    self.assertEqual(len(samples), 2)
    expected_vars = {"diff", "intel", "grade"}
    for sample in samples:
        # Only the first three entries of each sample are inspected.
        seen_vars = {sample[pos].var for pos in range(3)}
        self.assertEqual(seen_vars, expected_vars)
def set_start_state(self, start_state):
    """
    Set the start state of the Markov Chain.

    A start_state other than None is rebuilt as a list of State tuples
    ordered like self.variables, and is stored only if self._check_state
    accepts it. Passing None resets the stored state to None.

    Parameters:
    -----------
    start_state: array-like iterable object or None
        Iterable of (variable, state) pairs, e.g. a list of State tuples,
        giving the starting state of every variable.

    Raises:
    -------
    ValueError
        If start_state is a string or is not iterable.
    KeyError
        If start_state has no entry for one of self.variables.

    Examples:
    ---------
    >>> from pgmpy.models import MarkovChain as MC
    >>> from pgmpy.factors.discrete import State
    >>> model = MC(['a', 'b'], [2, 2])
    >>> model.set_start_state([State('a', 0), State('b', 1)])
    """
    if start_state is None:
        self.state = None
        return

    is_iterable = hasattr(start_state, '__iter__')
    if isinstance(start_state, six.string_types) or not is_iterable:
        raise ValueError('start_state must be a non-string iterable.')

    # Index the given pairs by variable, then rebuild them in model order.
    given = {var: st for var, st in start_state}
    ordered_state = [State(var, given[var]) for var in self.variables]
    if self._check_state(ordered_state):
        self.state = ordered_state
def sample(self, start_state=None, size=1):
    """
    Sample from the Markov Chain.

    Parameters:
    -----------
    start_state: array-like iterable or None
        Starting states of the variables, passed to set_start_state. If
        None, the previously-set state is used, or a random one when no
        state has been set.
    size: int
        Number of samples (rows) to be generated.

    Return Type:
    ------------
    pandas.DataFrame with one column per variable and one row per sample;
    row 0 holds the starting state.

    Examples:
    ---------
    >>> from pgmpy.models import MarkovChain as MC
    >>> from pgmpy.factors.discrete import State
    >>> model = MC(['intel', 'diff'], [2, 3])
    >>> model.set_start_state([State('intel', 0), State('diff', 2)])
    >>> intel_tm = {0: {0: 0.25, 1: 0.75}, 1: {0: 0.5, 1: 0.5}}
    >>> model.add_transition_model('intel', intel_tm)
    >>> diff_tm = {0: {0: 0.1, 1: 0.5, 2: 0.4}, 1: {0: 0.2, 1: 0.2, 2: 0.6 }, 2: {0: 0.7, 1: 0.15, 2: 0.15}}
    >>> model.add_transition_model('diff', diff_tm)
    >>> model.sample(size=5)
       intel  diff
    0      0     2
    1      1     0
    2      0     1
    3      1     0
    4      0     2
    """
    if start_state is not None:
        self.set_start_state(start_state)
    elif self.state is None:
        self.state = self.random_state()
    # Otherwise the previously-set state is kept.

    sampled = DataFrame(index=range(size), columns=self.variables)
    sampled.loc[0] = [st for var, st in self.state]

    # For every variable and each of its states, draw `size` successor
    # states up front; step i of the chain then reads draw number i.
    draws = defaultdict(dict)
    for var, model in self.transition_models.items():
        for st, transitions in model.items():
            successors = list(transitions.keys())
            weights = list(transitions.values())
            draws[var][st] = sample_discrete(successors, weights, size=size)

    for step in range(1, size):
        for pos, (var, st) in enumerate(self.state):
            self.state[pos] = State(var, draws[var][st][step - 1])
        sampled.loc[step] = [st for var, st in self.state]

    return sampled
def rejection_estimate(n):
    """Plot, per sample index, the running count of Flu == 0 draws divided by n.

    Draws ``n`` rejection samples from ``disease_model`` given Fatigue,
    Fever and FluShot all in state 0, then shows the plot.
    """
    sampler = BayesianModelSampling(disease_model)
    observed = [
        State(var='Fatigue', state=0),
        State(var='Fever', state=0),
        State(var='FluShot', state=0),
    ]
    draws = sampler.rejection_sample(observed, n)

    flu = draws['Flu']
    hits = 0
    for t in range(n):
        if flu[t] == 0.0:
            hits += 1
        # NOTE(review): the running count is divided by the total n rather
        # than by the number of samples seen so far (t + 1) — confirm intent.
        plt.plot(t, hits / n, 'bo')

    plt.ylabel('Evolving esimate')
    plt.xlabel('Number of samples')
    plt.show()
def test_rejection_sample_basic(self):
    """Rejection sampling with A, J, R fixed to 1 gives 25 consistent rows."""
    sample = self.sampling_inference.rejection_sample(
        [State('A', 1), State('J', 1), State('R', 1)], 25)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 25)
    self.assertEqual(len(sample.columns), 6)
    for column in ('A', 'J', 'R', 'Q', 'G', 'L'):
        self.assertIn(column, sample.columns)
    # Evidence variables may only take their observed state.
    for column in ('A', 'J', 'R'):
        self.assertTrue(set(sample[column]).issubset({1}))
    for column in ('Q', 'G', 'L'):
        self.assertTrue(set(sample[column]).issubset({0, 1}))
def test_likelihood_weighted_sample(self):
    """Likelihood-weighted sampling gives 25 rows plus a '_weight' column."""
    sample = self.sampling_inference.likelihood_weighted_sample(
        [State('A', 0), State('J', 1), State('R', 0)], 25)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 25)
    self.assertEqual(len(sample.columns), 7)
    for column in ('A', 'J', 'R', 'Q', 'G', 'L', '_weight'):
        self.assertIn(column, sample.columns)
    for column in ('A', 'J', 'R', 'Q', 'G', 'L'):
        self.assertTrue(set(sample[column]).issubset({0, 1}))
def test_rejection_sample_basic(self):
    """Rejection sampling runs with defaults and honours A, J, R evidence."""
    # Smoke check: the call with no arguments must not raise.
    sample = self.sampling_inference.rejection_sample()
    sample = self.sampling_inference.rejection_sample(
        [State("A", 1), State("J", 1), State("R", 1)], 25)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 25)
    self.assertEqual(len(sample.columns), 6)
    for column in ("A", "J", "R", "Q", "G", "L"):
        self.assertIn(column, sample.columns)
    # Evidence variables may only take their observed state.
    for column in ("A", "J", "R"):
        self.assertTrue(set(sample[column]).issubset({1}))
    for column in ("Q", "G", "L"):
        self.assertTrue(set(sample[column]).issubset({0, 1}))
def test_likelihood_weighted_sample(self):
    """Likelihood-weighted sampling runs with defaults and with evidence."""
    # Smoke check: the call with no arguments must not raise.
    sample = self.sampling_inference.likelihood_weighted_sample()
    sample = self.sampling_inference.likelihood_weighted_sample(
        [State("A", 0), State("J", 1), State("R", 0)], 25)

    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(len(sample), 25)
    self.assertEqual(len(sample.columns), 7)
    for column in ("A", "J", "R", "Q", "G", "L", "_weight"):
        self.assertIn(column, sample.columns)
    for column in ("A", "J", "R", "Q", "G", "L"):
        self.assertTrue(set(sample[column]).issubset({0, 1}))
def is_stationarity(self, tolerance=0.2, sample=None):
    """
    Checks if the given markov chain is stationary and checks the steady
    state probability values for the state are consistent.

    For every variable, the stationary distribution is taken from the
    eigenvector of the transposed transition matrix whose eigenvalue is 1,
    and compared with the probabilities returned by self.prob_from_sample.

    Parameters:
    -----------
    tolerance: float
        represents the diff between actual steady state value and the
        computed value
    sample: [State(i,j)]
        represents the list of state which the markov chain has sampled.
        Only its length is used here, as the window size passed to
        prob_from_sample (10000 when sample is None).

    Return Type:
    ------------
    Boolean
    True, if the markov chain converges to steady state distribution
    within the tolerance
    False, if the markov chain does not converge to steady state
    distribution within tolerance

    Examples:
    ---------
    >>> from pgmpy.models.MarkovChain import MarkovChain
    >>> from pgmpy.factors.discrete import State
    >>> model = MarkovChain()
    >>> model.add_variables_from(['intel', 'diff'], [3, 2])
    >>> intel_tm = {0: {0: 0.2, 1: 0.4, 2:0.4}, 1: {0: 0, 1: 0.5, 2: 0.5}, 2: {0: 0.3, 1: 0.3, 2: 0.4}}
    >>> model.add_transition_model('intel', intel_tm)
    >>> diff_tm = {0: {0: 0.5, 1: 0.5}, 1: {0: 0.25, 1:0.75}}
    >>> model.add_transition_model('diff', diff_tm)
    >>> model.is_stationarity()
    True
    """
    window_size = 10000 if sample is None else len(sample)
    is_stationary = True

    for var, model in self.transition_models.items():
        # Convert the dict to a numpy matrix. The builtin float is used
        # because the np.float alias was removed in NumPy 1.24.
        transition_mat = np.array(
            [np.array(list(model[st].values())) for st in model],
            dtype=float,
        )

        eigenvalues, eigenvectors = eig(transition_mat.T)
        unit_idx = np.where(np.abs(eigenvalues - 1.0) < 1e-8)[0][0]
        stationary = np.array(eigenvectors[:, unit_idx].flat)
        stationary = (stationary / np.sum(stationary)).real

        probabilities = []
        for st in range(transition_mat.shape[0]):
            probabilities.extend(
                self.prob_from_sample([State(var, st)],
                                      window_size=window_size))

        deviations = np.subtract(probabilities, stationary)
        # Keep checking the remaining variables even after a failure, so
        # prob_from_sample is called for each of them as before.
        if any(np.abs(dev) > tolerance for dev in deviations):
            is_stationary = False

    return is_stationary
def test_is_stationarity_failure(self):
    """is_stationarity is expected to be False at a tolerance of 0.002."""
    intel_tm = {0: {0: 0.25, 1: 0.75}, 1: {0: 0.5, 1: 0.5}}
    diff_tm = {
        0: {0: 0.1, 1: 0.5, 2: 0.4},
        1: {0: 0.2, 1: 0.2, 2: 0.6},
        2: {0: 0.7, 1: 0.15, 2: 0.15},
    }

    model = MC(['intel', 'diff'], [2, 3])
    model.set_start_state([State('intel', 0), State('diff', 2)])
    model.add_transition_model('intel', intel_tm)
    model.add_transition_model('diff', diff_tm)

    is_stationary = model.is_stationarity(0.002, None)
    self.assertFalse(is_stationary)
def random_state(self):
    """
    Generates a random state of the Markov Chain.

    Each variable in self.variables gets a state drawn with
    np.random.randint from 0 up to (but excluding) its cardinality.

    Return Type:
    ------------
    List of State namedtuples, one per variable of the model, in the order
    of self.variables.

    Examples:
    ---------
    >>> from pgmpy.models import MarkovChain as MC
    >>> model = MC(['intel', 'diff'], [2, 3])
    >>> model.random_state()
    [State('intel', 1), State('diff', 2)]
    """
    assignment = []
    for var in self.variables:
        drawn = np.random.randint(self.cardinalities[var])
        assignment.append(State(var, drawn))
    return assignment
def sample_slots(model_info_file, mr_slot_names):
    """Sample a non-empty list of slot names from a stored Bayesian model.

    Slots listed in the model info but absent from ``mr_slot_names`` are
    fixed to state 0 as evidence. Rejection sampling is repeated until at
    least one slot is sampled with value 1.

    Parameters
    ----------
    model_info_file
        File handed to ``helpers.load_from_pickle``; the loaded object must
        provide the keys 'model' and 'all_slots'.
    mr_slot_names
        Container of slot names that are present.

    Returns
    -------
    list
        Names of the sampled columns whose value is 1.
    """
    # NOTE(review): load_from_pickle presumably unpickles the file — only
    # call this with trusted model files.
    model_info = helpers.load_from_pickle(model_info_file)
    # The sampler was previously constructed twice; once is enough.
    inference = BayesianModelSampling(model_info['model'])

    # Use the missing mr slots as evidence.
    evidence = [
        State(slot, 0)
        for slot in model_info['all_slots']
        if slot not in mr_slot_names
    ]

    # Don't allow empty samples: redraw until some slot is present.
    sampled_slots = []
    while not sampled_slots:
        sample = inference.rejection_sample(evidence=evidence,
                                            size=1,
                                            return_type='recarray')
        # Names of the columns which had presence (value 1).
        sampled_slots = [
            name
            for value, name in zip(sample.view('<i8'), sample.dtype.names)
            if value == 1
        ]
    return sampled_slots
def generate_sample(self, start_state=None, size=1):
    """
    Generator version of self.sample

    Advances every variable by one transition per step and yields a copy
    of the resulting state, `size` times. A start_state of None keeps the
    previously-set state, or picks a random one if none has been set.

    Returns
    -------
    List of State namedtuples, representing the assignment to all variables of the model.

    Examples
    --------
    >>> from pgmpy.models.MarkovChain import MarkovChain
    >>> from pgmpy.factors.discrete import State
    >>> model = MarkovChain()
    >>> model.add_variables_from(['intel', 'diff'], [3, 2])
    >>> intel_tm = {0: {0: 0.2, 1: 0.4, 2:0.4}, 1: {0: 0, 1: 0.5, 2: 0.5}, 2: {0: 0.3, 1: 0.3, 2: 0.4}}
    >>> model.add_transition_model('intel', intel_tm)
    >>> diff_tm = {0: {0: 0.5, 1: 0.5}, 1: {0: 0.25, 1:0.75}}
    >>> model.add_transition_model('diff', diff_tm)
    >>> gen = model.generate_sample([State('intel', 0), State('diff', 0)], 2)
    >>> [sample for sample in gen]
    [[State(var='intel', state=2), State(var='diff', state=1)],
     [State(var='intel', state=2), State(var='diff', state=0)]]
    """
    if start_state is not None:
        self.set_start_state(start_state)
    elif self.state is None:
        self.state = self.random_state()
    # Otherwise the previously-set state is kept.

    for _ in range(size):
        for pos, (var, st) in enumerate(self.state):
            transitions = self.transition_models[var][st]
            drawn = sample_discrete(
                list(transitions.keys()),
                list(transitions.values()),
            )
            self.state[pos] = State(var, drawn[0])
        # Yield a copy so later steps do not alter already-yielded samples.
        yield self.state[:]
def test_check_state_bad_var_value(self):
    """_check_state raises ValueError for a state value beyond the cardinality."""
    model = MC(['a'], [2])
    # Value of the variable (3) >= its cardinality (2).
    out_of_range_state = [State('a', 3)]
    with self.assertRaises(ValueError):
        model._check_state(out_of_range_state)
trainData = modelData.iloc[0:int(0.85 * modelData.shape[0])].copy() model.fit(trainData, estimator=MaximumLikelihoodEstimator) #for cpd in model.get_cpds(): # print("CPD of {variable}:".format(variable=cpd.variable)) # print(cpd) model_sample = BayesianModelSampling(model) pickle.dump(model_sample, open('results/sampler.p', 'wb')) # open the nhts sample and add the inferred resType requirements nhtsSample = pd.read_csv('results/nhtsSample.csv') resType = [] for ind, row in nhtsSample.iterrows(): evidence = [ State('IncomeQ', min(row['hh_income'] - 1, 10)), State('HhSize', min(row['hh_size'] - 1, 5)) ] sample = model_sample.likelihood_weighted_sample(evidence=evidence, size=1) resType.extend([int(sample['Bedrooms']) * 3 + int(sample['RentQ'])]) nhtsSample['resType'] = resType os.chdir('..') nhtsSample[nhtsSample['occupation_type'] == 1].sample( n=50, replace=True).to_csv('ABM/includes/pop_occat_1.csv', index=False) nhtsSample[nhtsSample['occupation_type'] == 2].sample( n=50, replace=True).to_csv('ABM/includes/pop_occat_2.csv', index=False) nhtsSample[nhtsSample['occupation_type'] == 3].sample( n=50, replace=True).to_csv('ABM/includes/pop_occat_3.csv', index=False) nhtsSample[nhtsSample['occupation_type'] == 4].sample( n=50, replace=True).to_csv('ABM/includes/pop_occat_4.csv', index=False) nhtsSample[nhtsSample['occupation_type'] == 5].sample(
def test_check_state_success(self):
    """_check_state returns a truthy value for a valid state."""
    model = MC(['a'], [2])
    valid_state = [State('a', 1)]
    result = model._check_state(valid_state)
    self.assertTrue(result)
def test_copy(self):
    """copy() returns an equal model that later changes to the original don't affect."""

    def a_tm():
        # Fresh dict on every call so the model and expectations never share one.
        return {0: {0: 0.1, 1: 0.9}, 1: {0: 0.2, 1: 0.8}}

    def b_tm():
        return {0: {0: 0.3, 1: 0.7}, 1: {0: 0.4, 1: 0.6}}

    model = MC(['a', 'b'], [2, 2], [State('a', 0), State('b', 1)])
    model.add_transition_model('a', a_tm())
    model.add_transition_model('b', b_tm())

    clone = model.copy()

    # Immediately after copying, the two models agree on everything.
    self.assertIsInstance(clone, MC)
    self.assertEqual(sorted(model.variables), sorted(clone.variables))
    self.assertEqual(model.cardinalities, clone.cardinalities)
    self.assertEqual(model.transition_models, clone.transition_models)
    self.assertEqual(model.state, clone.state)

    # Mutate the original only.
    model.add_variable('p', 1)
    model.set_start_state([State('a', 0), State('b', 1), State('p', 0)])
    model.add_transition_model('p', {0: {0: 1}})

    # The copy must still hold the pre-mutation values.
    self.assertNotEqual(sorted(model.variables), sorted(clone.variables))
    self.assertEqual(sorted(['a', 'b']), sorted(clone.variables))
    self.assertNotEqual(model.cardinalities, clone.cardinalities)
    self.assertEqual({'a': 2, 'b': 2}, clone.cardinalities)
    self.assertNotEqual(model.state, clone.state)
    self.assertEqual([State('a', 0), State('b', 1)], clone.state)
    self.assertNotEqual(model.transition_models, clone.transition_models)
    self.assertEqual(len(clone.transition_models), 2)
    self.assertEqual(clone.transition_models['a'], a_tm())
    self.assertEqual(clone.transition_models['b'], b_tm())
def test_prob_from_sample(self, sample):
    """prob_from_sample on the fixture sample gives 50 ones followed by 50 zeros."""
    model = MC(['a', 'b'], [2, 2])
    # The mocked sample() hands back the frame built in setUp.
    sample.return_value = self.sample

    query_state = [State('a', 1), State('b', 0)]
    probabilities = model.prob_from_sample(query_state)

    expected = [1] * 50 + [0] * 50
    self.assertEqual(list(probabilities), expected)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its documented replacement.
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# Example: if we condition on "child_screen_time",
# then "child_physical_activity" becomes independent of "parent_education":
corr_mat = (
    simulated_sample.query("child_screen_time==1")
    .drop("child_screen_time", axis=1)
    .corr()
)
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

corr_mat = (
    simulated_sample.query("child_screen_time==0")
    .drop("child_screen_time", axis=1)
    .corr()
)
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# Suppose that we are interested in measuring the average causal effect of
# "child_screen_time" on "obesity". We can estimate this by simulating from
# the system:
simulated_sample_lowScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="low")], size=10_000)
simulated_sample_highScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="high")], size=10_000)

# The observed effect of high screen time on the probability of high child
# obesity is the ratio of the two "high" obesity rates:
((simulated_sample_highScreentime["child_obesity"] == "high").sum() /
 len(simulated_sample_highScreentime)) / (
     (simulated_sample_lowScreentime["child_obesity"] == "high").sum() /
     len(simulated_sample_lowScreentime))
# i.e. around 2x

infer_adjusted = CausalInference(pg_model)
print(
    infer_adjusted.query(variables=["child_obesity"],
                         do={"child_screen_time": "high"}))

# we can estimate this effect from the observed data using a logistic regression model:
def test_check_state_bad_vars(self):
    """_check_state raises ValueError when state variables differ from the model's."""
    model = MC()
    # The model has no variables, so this state's variable cannot match.
    mismatched_state = [State(1, 2)]
    with self.assertRaises(ValueError):
        model._check_state(mismatched_state)