Exemplo n.º 1
0
def generate_datasets(networks, folder, nb_samples=2000):
    """Forward-sample every network and write each sample set to a CSV file.

    Parameters
    ----------
    networks : mapping
        Maps a network name to the model handed to ``BayesianModelSampling``.
    folder : str
        Base directory; each file is written to
        ``<folder>/datasets/<name>.csv``.
    nb_samples : int, optional
        Number of samples drawn per network. The default is 2000.
    """
    for name in networks:
        csv_path = os.path.join(folder, 'datasets', name + '.csv')
        sampler = BayesianModelSampling(networks[name])
        drawn = sampler.forward_sample(size=nb_samples)
        drawn.to_csv(csv_path)
Exemplo n.º 2
0
def sampling(model, n=1000, verbose=3):
    """Draw forward samples from the Bayesian model stored in ``model``.

    Parameters
    ----------
    model : dict
        Contains model and adjmat; the model is read from ``model['model']``.
    n : int, optional
        Number of samples to generate. The default is 1000.
    verbose : int, optional
        Print messages to screen. The default is 3.
        0: NONE, 1: ERROR, 2: WARNING, 3: INFO (default), 4: DEBUG, 5: TRACE
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Pandas DataFrame
        Result of ``forward_sample(..., return_type='dataframe')``.
    """
    assert n > 0, 'n must be 1 or larger'
    # The check is name-based: the type's string form must mention
    # 'BayesianModel'.
    model_type = str(type(model['model']))
    assert 'BayesianModel' in model_type, 'Model must contain DAG from BayesianModel. Note that <misarables> example does not include DAG.'

    # http://pgmpy.org/sampling.html
    sampler = BayesianModelSampling(model['model'])
    # Forward sampling, returned as a dataframe
    return sampler.forward_sample(size=n, return_type='dataframe')
Exemplo n.º 3
0
    def sample(self, nb_sample=1):
        """Forward-sample ``self.bn`` and return the samples as state values.

        pgmpy's sampling yields the index of each sampled value, so every
        column is converted back to the actual value using the states
        returned by ``self.get_state_space``.

        :param nb_sample: number of samples to draw (default 1)
        :return: the sampled frame with indices replaced by state values
        """
        sampler = BayesianModelSampling(self.bn)
        drawn = sampler.forward_sample(size=nb_sample)

        for column in drawn.columns:
            _, states = self.get_state_space(column)
            # bind the current states explicitly so the lookup is unambiguous
            drawn[column] = drawn[column].apply(
                lambda idx, lookup=states: lookup[idx])

        return drawn
Exemplo n.º 4
0
    def sample(self, n_samples=1):
        """
        Sample n data points from the Bayesian Network

        NumPy's global random generator is re-seeded with
        ``self.random_state`` on every call before sampling.

        :param n_samples: int, amount of datapoints to generate.
        :return: Dataframe of new datapoints shape (n_samples,n_features),
                 with columns ordered by sorted column name.
        """
        np.random.seed(self.random_state)

        inference = BayesianModelSampling(self.model)

        Y = inference.forward_sample(size=n_samples, return_type='dataframe')

        # Sorted column order gives a layout that does not depend on the order
        # in which the sampler emitted the variables.
        return Y[sorted(Y.columns)]
Exemplo n.º 5
0
    def getDataset(self, size=1000, return_type='DataFrame'):
        """
        Method: return a set of samples generated from the Bayesian Network
        (simply using forward sampling).

        Parameters
        ----------
        size: size of the dataset to be generated (default: 1000)

        return_type: return type of the dataset; passed unchanged to
                     ``forward_sample`` (default: 'DataFrame')

        """
        # For more info, see: likelihood-weighted, rejection or Gibbs sampling
        from pgmpy.sampling import BayesianModelSampling

        sampler = BayesianModelSampling(self.__covid_model)
        return sampler.forward_sample(size=size, return_type=return_type)
Exemplo n.º 6
0
def sampling(DAG, n=1000, verbose=3):
    """Generate sample(s) by forward sampling from the joint distribution of the Bayesian network.

    Parameters
    ----------
    DAG : dict
        Contains model and adjmat of the DAG; the model is read from
        ``DAG['model']``.
    n : int, optional
        Number of samples to generate. The default is 1000.
    verbose : int, optional
        Print progress to screen. The default is 3.
        0: None, 1: ERROR, 2: WARN, 3: INFO (default), 4: DEBUG, 5: TRACE

    Returns
    -------
    df : pd.DataFrame().
        Dataframe containing sampled data from the input DAG model.
        ``None`` is returned when the model has no CPDs.

    Raises
    ------
    ValueError
        If ``n`` is not positive, or the type of ``DAG['model']`` does not
        mention 'BayesianModel'.

    Example
    -------
    >>> import bnlearn
    >>> DAG = bnlearn.import_DAG('sprinkler')
    >>> df = bnlearn.sampling(DAG, n=1000)

    """
    if n <= 0:
        raise ValueError('n must be 1 or larger')

    bayes_model = DAG['model']
    if 'BayesianModel' not in str(type(bayes_model)):
        raise ValueError('DAG must contain BayesianModel.')

    if verbose >= 3:
        print('[bnlearn] >Forward sampling for %.0d samples..' % (n))

    # Without CPDs there is nothing to sample from: tell the user and bail out.
    if len(bayes_model.get_cpds()) == 0:
        print(
            '[bnlearn] >This seems like a DAG containing only edges, and no CPDs. Tip: use bn.parameter_learning.fit(DAG, df) to learn the CPDs first.'
        )
        return None

    # http://pgmpy.org/sampling.html
    sampler = BayesianModelSampling(bayes_model)
    # Forward sampling, returned as a dataframe
    return sampler.forward_sample(size=n, return_type='dataframe')
Exemplo n.º 7
0
def sampling(model, n=1000, verbose=3):
    """Forward-sample from the Bayesian model stored in ``model``.

    Parameters
    ----------
    model : dict
        Contains model and adjmat; the model is read from ``model['model']``.
    n : int, optional
        Number of samples to generate. The default is 1000.
    verbose : int, optional
        Print progress to screen. The default is 3.
        0: NONE
        1: ERROR
        2: WARNING
        3: INFO (default)
        4: DEBUG
        5: TRACE

    Returns
    -------
    df : pd.DataFrame().
        Result of ``forward_sample(..., return_type='dataframe')``.


    Example
    -------
    >>> import bnlearn
    >>> model = bnlearn.import_DAG('sprinkler')
    >>> df = bnlearn.sampling(model, n=1000)

    """
    assert n > 0, 'n must be 1 or larger'
    # Name-based check: the type's string form must mention 'BayesianModel'.
    model_type = str(type(model['model']))
    assert 'BayesianModel' in model_type, 'Model must contain DAG from BayesianModel. Note that <misarables> example does not include DAG.'

    show_progress = verbose >= 3
    if show_progress:
        print('[BNLEARN][sampling] Forward sampling for %.0d samples..' % (n))

    # http://pgmpy.org/sampling.html
    sampler = BayesianModelSampling(model['model'])
    # Forward sampling, returned as a dataframe
    return sampler.forward_sample(size=n, return_type='dataframe')
Exemplo n.º 8
0
def sample(N):
    """Build a fixed five-variable network and forward-sample it.

    The network has the edges D->G, I->G, E->L and G->L, with hard-coded
    tabular CPDs for D, I, G, E and L.

    Parameters
    ----------
    N : number of samples, passed to ``forward_sample``.

    Returns
    -------
    tuple
        ``(data, model)``: the sampled data and the model it was drawn from.
    """
    edges = [('D', 'G'), ('I', 'G'), ('E', 'L'), ('G', 'L')]
    network = BayesianModel(edges)

    network.add_cpds(
        TabularCPD('D', 2, [[0.6], [0.4]]),
        TabularCPD('I', 2, [[0.7], [0.3]]),
        TabularCPD('G', 3,
                   [[0.3, 0.9, 0.05, 0.5],
                    [0.4, 0.08, 0.25, 0.3],
                    [0.3, 0.02, 0.7, 0.2]],
                   ['D', 'I'], [2, 2]),
        TabularCPD('E', 2, [[0.5], [0.5]]),
        TabularCPD('L', 2,
                   [[0.1, 0.3, 0.4, 0.25, 0.8, 0.99],
                    [0.9, 0.7, 0.6, 0.75, 0.2, 0.01]],
                   ['G', 'E'], [3, 2]),
    )

    sampler = BayesianModelSampling(network)
    return sampler.forward_sample(N), network
class DynamicBayesianNetwork(Process):
    """Process that builds a ``DBN`` model from its parameters.

    Parameters (see ``defaults``):
        nodes: nodes added to the model.
        edges: edges added to the model.
        conditional_probabilities: mapping of node id to the table of values
            used to build that node's ``TabularCPD`` (an iterable of
            ``(node_id, values)`` pairs is accepted as well).
    """

    defaults = {
        'nodes': [],
        'edges': [],
        'conditional_probabilities': {
            'node_id': []
        }
    }

    def __init__(self, parameters=None):
        """Build the model, attach its CPDs and create a sampler for it."""
        super().__init__(parameters)

        # set up the network based on the parameters
        self.model = DBN()
        self.model.add_nodes_from(self.parameters['nodes'])
        self.model.add_edges_from(self.parameters['edges'])

        print(f'EDGES: {sorted(self.model.edges())}')

        # Iterating a mapping directly yields only its keys, so take the
        # (node_id, values) pairs from .items() when it is available.
        probabilities = self.parameters['conditional_probabilities']
        if hasattr(probabilities, 'items'):
            probabilities = probabilities.items()

        # TODO -- add 'evidence' -- get from network?
        cpds = [
            TabularCPD(variable=node_id,
                       variable_card=len(values),
                       values=values,
                       evidence=[])
            for node_id, values in probabilities
        ]
        # add_cpds takes the CPDs as separate positional arguments
        self.model.add_cpds(*cpds)

        # make an inference instance for sampling the model
        self.inference = BayesianModelSampling(self.model)

        # draw a small sample; the result is not stored
        self.inference.forward_sample(size=2)

    def ports_schema(self):
        """Return the ports schema; this process declares none."""
        return {}

    def next_update(self, timestep, states):
        """Return the update for this timestep; currently always empty."""
        return {}
Exemplo n.º 10
0
def sample_dag(dag, num):
    """Forward-sample ``num`` records from a graph whose nodes carry a 'cpd' attribute.

    A ``BayesianModel`` is built with the nodes and edges of ``dag``, one
    ``TabularCPD`` is created per node from its 'cpd' attribute, and the
    model is sampled.

    Parameters
    ----------
    dag : graph object
        Must provide ``nodes()``, ``edges()``, ``predecessors(node)`` and
        per-node attributes holding a 'cpd' entry; the cardinality of a node
        is taken as ``len()`` of that entry.
    num : int
        Number of samples to draw.

    Returns
    -------
    The samples as returned by ``forward_sample(..., return_type='recarray')``.
    """
    # Nodes are added explicitly before the edges: building the model from
    # the edge list alone loses disconnected nodes.
    bayesmod = BayesianModel()
    bayesmod.add_nodes_from(dag.nodes())
    bayesmod.add_edges_from(dag.edges())

    # Older networkx releases expose node attributes as `dag.node`; newer
    # ones only provide `dag.nodes`.
    node_attrs = dag.node if hasattr(dag, 'node') else dag.nodes

    tab_cpds = []
    cards = {node: len(node_attrs[node]['cpd']) for node in dag.nodes()}
    for node in dag.nodes():
        # Materialise the parents: predecessors() may be a one-shot iterator,
        # which is always truthy and can only be consumed once.
        parents = list(dag.predecessors(node))
        cpd = node_attrs[node]['cpd']
        if parents:
            parent_cards = [cards[par] for par in parents]
            logging.debug("TablularCPD({}, {}, {}, {}, {})".format(
                node, cards[node], cpd, parents, parent_cards))
            tab_cpds.append(
                TabularCPD(node, cards[node], cpd, parents, parent_cards))
        else:
            logging.debug("TablularCPD({}, {}, {})".format(
                node, cards[node], cpd))
            tab_cpds.append(TabularCPD(node, cards[node], cpd))

    logging.debug("cpds add: {}".format(tab_cpds))

    print("model variables:", bayesmod.nodes())
    for tab_cpd in tab_cpds:
        print("cpd variables:", tab_cpd.variables)

    bayesmod.add_cpds(*tab_cpds)

    logging.debug("cpds get: {}".format(bayesmod.get_cpds()))
    inference = BayesianModelSampling(bayesmod)

    logging.debug("generating data")
    recs = inference.forward_sample(size=num, return_type='recarray')
    return recs
Exemplo n.º 11
0
# Collect the remaining CPD objects for model1 in the cpd1 list ...
cpd1.append(p_21)
cpd1.append(p_52)
cpd1.append(p_14)
cpd1.append(p_64)
cpd1.append(p_36)
cpd1.append(p4)

# ... and attach all of them to the model in a single call.
model1.add_cpds(*cpd1)

# Report the structure and the result of the model's consistency check.
print("------------------------------------------")
print("Edges of model1:", model1.edges())
print("Checking Model1:", model1.check_model())
print("------------------------------------------")
'''generate data for model1'''
# Draw 3000 forward samples as a dataframe and score model1 against them
# with the K2 score.
inference = BayesianModelSampling(model1)
data=inference.forward_sample(size=3000, return_type='dataframe')
print("Data for model1:")
print(data)   
k2=K2Score(data)
print('Model1 K2 Score: ' + str(k2.score(model1)))

'''Inference'''
# Exact inference with variable elimination: first a query on x3 without
# evidence, then a query on x5 given x2 = 1.
# NOTE(review): the query result is indexed by variable name, so this
# presumably expects query() to return a mapping -- confirm against the
# installed pgmpy version.
from pgmpy.inference import VariableElimination
infer = VariableElimination(model1)
print("Inference of x3:")
print(infer.query(['x3']) ['x3'])
print("Inference of x5|x2:")
print(infer.query(['x5'], evidence={ 'x2': 1}) ['x5'])


''''Model2'''
class TestBayesianModelSampling(unittest.TestCase):
    """Tests for forward, rejection and likelihood-weighted sampling.

    The fixture is a six-variable binary network with the edges
    A->J, R->J, J->Q, J->L and G->L.
    """

    def setUp(self):
        self.bayesian_model = BayesianModel([('A', 'J'), ('R', 'J'), ('J', 'Q'),
                                             ('J', 'L'), ('G', 'L')])
        cpd_a = TabularCPD('A', 2, [[0.2], [0.8]])
        cpd_r = TabularCPD('R', 2, [[0.4], [0.6]])
        cpd_j = TabularCPD('J', 2,
                           [[0.9, 0.6, 0.7, 0.1],
                            [0.1, 0.4, 0.3, 0.9]],
                           ['R', 'A'], [2, 2])
        cpd_q = TabularCPD('Q', 2,
                           [[0.9, 0.2],
                            [0.1, 0.8]],
                           ['J'], [2])
        cpd_l = TabularCPD('L', 2,
                           [[0.9, 0.45, 0.8, 0.1],
                            [0.1, 0.55, 0.2, 0.9]],
                           ['G', 'J'], [2, 2])
        cpd_g = TabularCPD('G', 2, [[0.6], [0.4]])
        self.bayesian_model.add_cpds(cpd_a, cpd_g, cpd_j, cpd_l, cpd_q, cpd_r)
        self.sampling_inference = BayesianModelSampling(self.bayesian_model)
        self.markov_model = MarkovModel()

    def _assert_sample(self, sample, size, allowed_states, extra_columns=()):
        """Check row count, column set and the allowed states of each variable.

        ``allowed_states`` maps each variable to the set of states its sampled
        values must fall in; ``extra_columns`` names additional columns that
        must be present.
        """
        # assertEqual is used instead of the deprecated assertEquals alias.
        self.assertEqual(len(sample), size)
        self.assertEqual(len(sample.columns),
                         len(allowed_states) + len(extra_columns))
        for column in list(allowed_states) + list(extra_columns):
            self.assertIn(column, sample.columns)
        for variable, states in allowed_states.items():
            self.assertTrue(set(sample[variable]).issubset(states))

    def test_init(self):
        # only Bayesian models are accepted by the sampler
        with self.assertRaises(TypeError):
            BayesianModelSampling(self.markov_model)

    def test_forward_sample(self):
        sample = self.sampling_inference.forward_sample(25)
        self._assert_sample(sample, 25,
                            {variable: {0, 1} for variable in 'AJRQGL'})

    def test_rejection_sample_basic(self):
        sample = self.sampling_inference.rejection_sample([State('A', 1), State('J', 1), State('R', 1)], 25)
        # evidence variables must be pinned to their observed state
        self._assert_sample(sample, 25,
                            {'A': {1}, 'J': {1}, 'R': {1},
                             'Q': {0, 1}, 'G': {0, 1}, 'L': {0, 1}})

    @patch("pgmpy.sampling.BayesianModelSampling.forward_sample", autospec=True)
    def test_rejection_sample_less_arg(self, forward_sample):
        # without evidence, rejection sampling delegates to forward sampling
        sample = self.sampling_inference.rejection_sample(size=5)
        forward_sample.assert_called_once_with(self.sampling_inference, 5)
        self.assertEqual(sample, forward_sample.return_value)

    def test_likelihood_weighted_sample(self):
        sample = self.sampling_inference.likelihood_weighted_sample([State('A', 0), State('J', 1), State('R', 0)], 25)
        self._assert_sample(sample, 25,
                            {variable: {0, 1} for variable in 'AJRQGL'},
                            extra_columns=('_weight',))

    def tearDown(self):
        del self.sampling_inference
        del self.bayesian_model
        del self.markov_model
Exemplo n.º 13
0
class TestBayesianModelSampling(unittest.TestCase):
    """Tests for forward, rejection and likelihood-weighted sampling.

    The fixture is a six-variable binary network with the edges
    A->J, R->J, J->Q, J->L and G->L.
    """

    # variables of the fixture network, in the order they are checked
    VARIABLES = ('A', 'J', 'R', 'Q', 'G', 'L')

    def setUp(self):
        self.bayesian_model = BayesianModel([('A', 'J'), ('R', 'J'),
                                             ('J', 'Q'), ('J', 'L'),
                                             ('G', 'L')])
        cpd_a = TabularCPD('A', 2, [[0.2], [0.8]])
        cpd_r = TabularCPD('R', 2, [[0.4], [0.6]])
        cpd_j = TabularCPD('J', 2,
                           [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]],
                           ['R', 'A'], [2, 2])
        cpd_q = TabularCPD('Q', 2, [[0.9, 0.2], [0.1, 0.8]], ['J'], [2])
        cpd_l = TabularCPD('L', 2,
                           [[0.9, 0.45, 0.8, 0.1], [0.1, 0.55, 0.2, 0.9]],
                           ['G', 'J'], [2, 2])
        cpd_g = TabularCPD('G', 2, [[0.6], [0.4]])
        self.bayesian_model.add_cpds(cpd_a, cpd_g, cpd_j, cpd_l, cpd_q, cpd_r)
        self.sampling_inference = BayesianModelSampling(self.bayesian_model)
        self.markov_model = MarkovModel()

    def test_init(self):
        # only Bayesian models are accepted by the sampler
        with self.assertRaises(TypeError):
            BayesianModelSampling(self.markov_model)

    def test_forward_sample(self):
        sample = self.sampling_inference.forward_sample(25)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        for name in self.VARIABLES:
            self.assertIn(name, sample.columns)
        for name in self.VARIABLES:
            self.assertTrue(set(sample[name]).issubset({0, 1}))

    def test_rejection_sample_basic(self):
        sample = self.sampling_inference.rejection_sample(
            [State('A', 1), State('J', 1),
             State('R', 1)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        for name in self.VARIABLES:
            self.assertIn(name, sample.columns)
        # evidence variables must be pinned to their observed state
        for name in ('A', 'J', 'R'):
            self.assertTrue(set(sample[name]).issubset({1}))
        for name in ('Q', 'G', 'L'):
            self.assertTrue(set(sample[name]).issubset({0, 1}))

    @patch("pgmpy.sampling.BayesianModelSampling.forward_sample",
           autospec=True)
    def test_rejection_sample_less_arg(self, forward_sample):
        # without evidence, rejection sampling delegates to forward sampling
        sample = self.sampling_inference.rejection_sample(size=5)
        forward_sample.assert_called_once_with(self.sampling_inference, 5)
        self.assertEqual(sample, forward_sample.return_value)

    def test_likelihood_weighted_sample(self):
        sample = self.sampling_inference.likelihood_weighted_sample(
            [State('A', 0), State('J', 1),
             State('R', 0)], 25)
        self.assertEqual(len(sample), 25)
        # the six variables plus the '_weight' column
        self.assertEqual(len(sample.columns), 7)
        for name in self.VARIABLES:
            self.assertIn(name, sample.columns)
        self.assertIn('_weight', sample.columns)
        for name in self.VARIABLES:
            self.assertTrue(set(sample[name]).issubset({0, 1}))

    def tearDown(self):
        del self.sampling_inference
        del self.bayesian_model
        del self.markov_model
Exemplo n.º 14
0
def generateWysiwygDataDI(samplesize=4000, filename="data/wysiwygdata5.csv"):
    ''' Same principle as generateWysiwygData(), but with 3 continuous and
    3 discrete C and X variables.

    This distribution was used for the DI removal experiment, because IBM
    AIF360's DI removal only impacts continuous variables.

    The discrete variables (A, Y, C1-C3, X1-X3) are forward-sampled from a
    Bayesian network; the continuous ones (C4-C6, X4-X6) are produced by
    samplecontinuous() from the sampled A and Y columns. All columns are
    concatenated and written to a CSV file.

    samplesize: number of rows to generate (default 4000)
    filename:   path of the CSV file to write
                (default "data/wysiwygdata5.csv")
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'),
                                  ('C1', 'Y'), ('Y', 'C2'), ('Y', 'C3'),
                                  ('A', 'X1'), ('A', 'X2'), ('A', 'X3'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3')])

    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])

    # One row per state of Y, one column per state of C1 (a 2x2 table).
    cpd_y = TabularCPD(variable='Y',
                       variable_card=2,
                       values=[[0.7, 0.35], [0.3, 0.65]],
                       evidence=['C1'],
                       evidence_card=[2])

    cpd_c1 = TabularCPD(variable='C1',
                        variable_card=2,
                        values=[[0.65, 0.3], [0.35, 0.7]],
                        evidence=['A'],
                        evidence_card=[2])

    cpd_c2 = TabularCPD(variable='C2',
                        variable_card=4,
                        values=[[0.24, 0.27, 0.25, 0.24],
                                [0.28, 0.23, 0.24, 0.22],
                                [0.24, 0.27, 0.25, 0.26],
                                [0.24, 0.23, 0.26, 0.28]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_c3 = TabularCPD(variable='C3',
                        variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x1 = TabularCPD(variable='X1',
                        variable_card=2,
                        values=[[0.54, 0.48, 0.52, 0.45],
                                [0.46, 0.52, 0.48, 0.55]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x2 = TabularCPD(variable='X2',
                        variable_card=4,
                        values=[[0.25, 0.27, 0.26, 0.23],
                                [0.30, 0.23, 0.24, 0.23],
                                [0.23, 0.27, 0.26, 0.23],
                                [0.22, 0.23, 0.24, 0.31]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x3 = TabularCPD(variable='X3',
                        variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.30],
                                [0.23, 0.25, 0.26, 0.24],
                                [0.23, 0.25, 0.24, 0.24],
                                [0.32, 0.25, 0.25, 0.22]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_x1, cpd_x2,
                          cpd_x3, cpd_y)
    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]

    # (column name, meana0, meana1, covy0, covy1) for each continuous
    # attribute; the attributes are sampled in exactly this order.
    continuous_specs = [
        ("C4", 5, 6, [1], [1.8]),
        ("C5", 1, 2, [1], [0.9]),
        ("C6", 4, 5.3, [1], [0.95]),
        ("X4", 5.5, 6, [1.2], [1.4]),
        ("X5", 1.1, 1.7, [1.1], [1.0]),
        ("X6", 4.5, 5.1, [1], [1.1]),
    ]
    continuous_frames = [
        samplecontinuous(AY,
                         samplesize=samplesize,
                         contatt=contatt,
                         meana0=meana0,
                         meana1=meana1,
                         covy0=covy0,
                         covy1=covy1)
        for contatt, meana0, meana1, covy0, covy1 in continuous_specs
    ]

    discframe = pd.concat([discframe] + continuous_frames, axis=1)
    discframe.to_csv(path_or_buf=filename)
Exemplo n.º 15
0
# Part 1: fit a fixed structure on `values`, then predict for a copy of
# `predict_data` from which column 'E' has been dropped.
model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
model.fit(values)
# work on a copy so the in-place drop below does not touch the caller's frame
predict_data = predict_data.copy()
predict_data.drop('E', axis=1, inplace=True)
#print predict_data
y_pred = model.predict(predict_data)
y_prob = model.predict_probability(predict_data)


# Part 2: build a small network (D -> G <- I) with hand-written CPDs,
# forward-sample 500 rows from it and re-learn the CPDs from those samples.
# Note that `model` is rebound here; the fitted model above is discarded.
from pgmpy.sampling import BayesianModelSampling
model = BayesianModel([('D', 'G'), ('I', 'G')])
cpd_d = TabularCPD('D', 2, [[0.6], [0.4]])
cpd_i = TabularCPD('I', 2, [[0.7], [0.3]])
cpd_g = TabularCPD('G', 3, 
                   [[0.3, 0.05, 0.9, 0.5], 
                    [0.4, 0.25, 0.08, 0.3], 
                    [0.3, 0.7, 0.02, 0.2]],
                   ['D', 'I'], [2, 2])
model.add_cpds(cpd_d, cpd_i, cpd_g)
 
infer = BayesianModelSampling(model)
data = infer.forward_sample(500)
#print data
 
# Re-estimate the CPDs from the sampled data and print each of them.
model.fit(data, estimator=MaximumLikelihoodEstimator)
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable))
    print(cpd)


Exemplo n.º 16
0
                   variable_card=2,
                   values=[[0.95, 0.2], [0.05, 0.8]],
                   evidence=['M'],
                   evidence_card=[2])

# Associating the CPDs with the network
model.add_cpds(cpd_d, cpd_m, cpd_r, cpd_l, cpd_e)

# check_model checks for the network structure and CPDs and verifies that the CPDs are correctly
# defined and sum to 1.
model.check_model()

# Forward_sample then iterate and count strong musician/ good letter/both
inference = BayesianModelSampling(model)
numSamples = 10000
# 'recarray' return type: each sample is read below by field position
samples = inference.forward_sample(size=numSamples, return_type='recarray')

# Counters filled while scanning the samples.
# NOTE(review): weakMusician and strongLetterWeakMuscician are initialised
# here but not updated in the visible part of the loop.
part1 = 0
strongLetter = 0
weakMusician = 0
strongLetterWeakMuscician = 0

# Samples have structure (M E D R L)
# NOTE(review): the positional indexing below relies on that field order --
# confirm it matches the order the sampler actually emits.
for sample in samples:
    # P(m = strong)P(d = low)P(r = ∗ ∗ |m = strong, d = low)P(e = high|m = strong)P(letter = weak| ∗ ∗)
    # i.e. field 0 truthy, field 2 falsy, field 3 equal to 2, field 1 truthy
    # and field 4 falsy
    if sample[0] and not sample[2] and sample[3] == 2 and sample[
            1] and not sample[4]:
        part1 += 1
    # P(letter = strong)
    if sample[4]:
        strongLetter += 1
Exemplo n.º 17
0
# Notebook export: the "# In[N]:" markers are cell boundaries.

# In[15]:

# In[19]:

#infer1 = BayesianModelSampling(Mental_health_model)
#evidence2 = [State('treatment',1)]
#np.mean(infer1.likelihood_weighted_sample(evidence2,5))

# In[20]:

# In[30]:

# Draw 5 forward samples from Mental_health_model.
# NOTE(review): evidence1 is built but not used by anything visible here.
infer1 = BayesianModelSampling(Mental_health_model)
evidence1 = [State('treatment', 1)]
sample1 = infer1.forward_sample(5)
# bare expression: displays the samples when run as a notebook cell
sample1

# In[31]:

# Mean of the sampled values, as computed by np.mean.
m = np.mean(sample1)
print("Mean: ", m)

# In[32]:

# In[33]:

# Entropy of the samples; the result is only displayed, not stored.
scipy.stats.entropy(sample1)

# In[71]:
Exemplo n.º 18
0
def generateWysiwygFIDataOld(samplesize=4000, filename="data/preFIData.csv"):
    ''' Old version of the bayesian model for the Fair Inference experiment.

    Here Y still influences X to make modelling Y simpler.
    This is not suitable for FI.
    This model is unused in the experiments in the final thesis.

    The discrete variables (A, Y, C1-C4, X1-X4, D1-D3) are forward-sampled
    from a Bayesian network; the continuous ones (C5, C6, X5, X6) are
    produced by samplecontinuous() from the sampled A and Y columns. The
    columns are put in a fixed order and written to a CSV file.

    samplesize: number of rows to generate (default 4000)
    filename:   path of the CSV file to write (default "data/preFIData.csv")
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'),
                                  ('A', 'C4'), ('C1', 'Y'), ('Y', 'C2'),
                                  ('Y', 'C3'), ('Y', 'C4'), ('A', 'X1'),
                                  ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3'),
                                  ('Y', 'X4'), ('D1', 'X1'), ('D1', 'X2'),
                                  ('D2', 'X3'), ('D3', 'X4')])

    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])

    cpd_d1 = TabularCPD(variable='D1',
                        variable_card=2,
                        values=[[0.45], [0.55]])

    cpd_d2 = TabularCPD(variable='D2',
                        variable_card=4,
                        values=[[0.22], [0.24], [0.28], [0.26]])
    cpd_d3 = TabularCPD(variable='D3',
                        variable_card=2,
                        values=[[0.54], [0.46]])

    # One row per state of Y, one column per state of C1 (a 2x2 table).
    cpd_y = TabularCPD(variable='Y',
                       variable_card=2,
                       values=[[0.7, 0.3], [0.3, 0.7]],
                       evidence=['C1'],
                       evidence_card=[2])

    cpd_c1 = TabularCPD(variable='C1',
                        variable_card=2,
                        values=[[0.85, 0.2], [0.15, 0.8]],
                        evidence=['A'],
                        evidence_card=[2])

    cpd_c2 = TabularCPD(variable='C2',
                        variable_card=4,
                        values=[[0.23, 0.27, 0.25, 0.20],
                                [0.35, 0.23, 0.24, 0.15],
                                [0.22, 0.27, 0.25, 0.25],
                                [0.20, 0.23, 0.26, 0.40]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_c3 = TabularCPD(variable='C3',
                        variable_card=2,
                        values=[[0.52, 0.49, 0.5, 0.45],
                                [0.48, 0.51, 0.5, 0.55]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_c4 = TabularCPD(variable='C4',
                        variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x1 = TabularCPD(
        variable='X1',
        variable_card=2,
        values=[
            [0.38, 0.40, 0.42, 0.44, 0.57, 0.59, 0.60, 0.62],  #GOOD
            [0.62, 0.60, 0.58, 0.56, 0.43, 0.41, 0.40, 0.38]
        ],
        evidence=['A', 'Y', 'D1'],
        evidence_card=[2, 2, 2])

    cpd_x2 = TabularCPD(
        variable='X2',
        variable_card=4,
        values=[
            [0.30, 0.28, 0.27, 0.25, 0.17, 0.16, 0.15, 0.14],
            [0.24, 0.26, 0.26, 0.27, 0.29, 0.31, 0.30, 0.32],  #GOOD 2
            [0.16, 0.18, 0.20, 0.22, 0.35, 0.37, 0.38, 0.40],  #GOOD 1
            [0.30, 0.28, 0.27, 0.26, 0.19, 0.16, 0.17, 0.14]
        ],
        evidence=['A', 'Y', 'D1'],
        evidence_card=[2, 2, 2])

    cpd_x3 = TabularCPD(
        variable='X3',
        variable_card=2,
        values=[[
            0.64, 0.62, 0.62, 0.63, 0.60, 0.58, 0.58, 0.59, 0.40, 0.39, 0.39,
            0.38, 0.38, 0.35, 0.35, 0.37
        ],
                [
                    0.36, 0.38, 0.38, 0.37, 0.40, 0.42, 0.42, 0.41, 0.60, 0.61,
                    0.61, 0.62, 0.62, 0.65, 0.65, 0.63
                ]],  #GOOD
        evidence=['A', 'Y', 'D2'],
        evidence_card=[2, 2, 4])

    cpd_x4 = TabularCPD(
        variable='X4',
        variable_card=4,
        values=[
            [0.25, 0.27, 0.21, 0.23, 0.10, 0.12, 0.07, 0.09],
            [0.36, 0.34, 0.42, 0.40, 0.60, 0.58, 0.64, 0.62],  #GOOD1
            [0.25, 0.27, 0.21, 0.23, 0.10, 0.12, 0.07, 0.09],
            [0.14, 0.12, 0.16, 0.14, 0.20, 0.18, 0.22, 0.20]
        ],  #GOOD2
        evidence=['A', 'Y', 'D3'],
        evidence_card=[2, 2, 2])

    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_c4, cpd_x1,
                          cpd_x2, cpd_x3, cpd_x4, cpd_y, cpd_d1, cpd_d2,
                          cpd_d3)
    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]

    # (column name, meana0, meana1, covy0, covy1) for each continuous
    # attribute; the attributes are sampled in exactly this order.
    continuous_specs = [
        ("C5", 1, 1.2, [1], [0.9]),
        ("C6", 2, 1.8, [1], [0.95]),
        ("X5", 1.1, 1.4, [1.1], [0.95]),
        ("X6", 1.9, 1.5, [1], [1.1]),
    ]
    continuous_frames = [
        samplecontinuous(AY,
                         samplesize=samplesize,
                         contatt=contatt,
                         meana0=meana0,
                         meana1=meana1,
                         covy0=covy0,
                         covy1=covy1)
        for contatt, meana0, meana1, covy0, covy1 in continuous_specs
    ]

    discframe = pd.concat([discframe] + continuous_frames, axis=1)
    # fixed column order for the output file
    ndf = discframe.reindex(axis=1,
                            labels=[
                                'A', 'Y', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
                                'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'D1', 'D2',
                                'D3'
                            ])
    ndf.to_csv(path_or_buf=filename)
Exemplo n.º 19
0
def generateWysiwygFIData(samplesize=4000, filename="data/preFIData.csv"):
    """Sample the Bayesian network used in the FI experiment and save it as CSV.

    Relative to the previous models the edges between X and Y are flipped,
    so X1..X4 causally influence Y.  The D variables are added as parents of
    the X variables to more closely approximate the experiments from the
    'Fair Inference on Outcomes' paper.

    Parameters
    ----------
    samplesize : number of rows to forward-sample from the network.
    filename   : path of the CSV file the resulting frame is written to.
    """

    def root(name, probabilities):
        # Parentless variable: one single-column row per state.
        return TabularCPD(variable=name,
                          variable_card=len(probabilities),
                          values=[[p] for p in probabilities])

    def conditional(name, table, parents, parent_cards):
        # Variable with parents: one row of `table` per state of `name`.
        return TabularCPD(variable=name,
                          variable_card=len(table),
                          values=table,
                          evidence=parents,
                          evidence_card=parent_cards)

    # Edge order is kept exactly as before; it fixes the node insertion order.
    edges = [
        ('A', 'C1'), ('A', 'C2'), ('A', 'C3'), ('A', 'C4'),
        ('Y', 'C2'), ('Y', 'C3'), ('Y', 'C4'),
        ('A', 'X1'), ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
        ('X1', 'Y'), ('X2', 'Y'), ('X3', 'Y'), ('X4', 'Y'),
        ('D1', 'X1'), ('D1', 'X2'), ('D2', 'X3'), ('D3', 'X4'),
    ]
    network = BayesianModel(edges)

    # Distribution of Y given its four X parents comes from the helper.
    ydists = computeYDist()

    cpds = [
        root('A', [0.5, 0.5]),
        conditional('C1',
                    [[0.85, 0.2],
                     [0.15, 0.8]],
                    ['A'], [2]),
        conditional('C2',
                    [[0.23, 0.27, 0.25, 0.20],
                     [0.35, 0.23, 0.24, 0.15],
                     [0.22, 0.27, 0.25, 0.25],
                     [0.20, 0.23, 0.26, 0.40]],
                    ['A', 'Y'], [2, 2]),
        conditional('C3',
                    [[0.52, 0.49, 0.5, 0.45],
                     [0.48, 0.51, 0.5, 0.55]],
                    ['A', 'Y'], [2, 2]),
        conditional('C4',
                    [[0.22, 0.25, 0.25, 0.37],
                     [0.23, 0.25, 0.26, 0.21],
                     [0.23, 0.25, 0.25, 0.22],
                     [0.32, 0.25, 0.24, 0.20]],
                    ['A', 'Y'], [2, 2]),
        conditional('X1',
                    [[0.38, 0.40, 0.60, 0.62],  #GOOD
                     [0.62, 0.60, 0.40, 0.38]],
                    ['A', 'D1'], [2, 2]),
        conditional('X2',
                    [[0.30, 0.28, 0.15, 0.14],
                     [0.24, 0.26, 0.30, 0.32],  #GOOD 2
                     [0.16, 0.18, 0.38, 0.40],  #GOOD 1
                     [0.30, 0.28, 0.17, 0.14]],
                    ['A', 'D1'], [2, 2]),
        conditional('X3',
                    [[0.64, 0.62, 0.62, 0.63, 0.38, 0.35, 0.35, 0.37],
                     [0.36, 0.38, 0.38, 0.37, 0.62, 0.65, 0.65, 0.63]],  #GOOD
                    ['A', 'D2'], [2, 4]),
        conditional('X4',
                    [[0.25, 0.27, 0.07, 0.09],
                     [0.36, 0.34, 0.64, 0.62],  #GOOD1
                     [0.25, 0.27, 0.07, 0.09],
                     [0.14, 0.12, 0.22, 0.20]],  #GOOD2
                    ['A', 'D3'], [2, 2]),
        conditional('Y',
                    [ydists[0], ydists[1]],
                    ['X1', 'X3', 'X2', 'X4'], [2, 2, 4, 4]),
        root('D1', [0.45, 0.55]),
        root('D2', [0.22, 0.24, 0.28, 0.26]),
        root('D3', [0.54, 0.46]),
    ]
    network.add_cpds(*cpds)

    # Discrete part: forward-sample the network.
    sampler = BayesianModelSampling(network)
    discrete_frame = sampler.forward_sample(samplesize)
    ay_frame = discrete_frame[["A", "Y"]]

    # Continuous part: (attribute, meana0, meana1, covy0, covy1), sampled in
    # this order from the A and Y columns of the discrete samples.
    continuous_specs = [
        ("C5", 1, 1.2, [1], [0.9]),
        ("C6", 2, 1.8, [1], [0.95]),
        ("X5", 1.1, 1.4, [1.1], [0.95]),
        ("X6", 1.9, 1.5, [1], [1.1]),
    ]
    continuous_frames = [
        samplecontinuous(ay_frame,
                         samplesize=samplesize,
                         contatt=attribute,
                         meana0=mean_a0,
                         meana1=mean_a1,
                         covy0=cov_y0,
                         covy1=cov_y1)
        for attribute, mean_a0, mean_a1, cov_y0, cov_y1 in continuous_specs
    ]

    combined = pd.concat([discrete_frame] + continuous_frames, axis=1)

    # Put the columns in a fixed, readable order before writing them out.
    column_order = [
        'A', 'Y',
        'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
        'X1', 'X2', 'X3', 'X4', 'X5', 'X6',
        'D1', 'D2', 'D3',
    ]
    ordered = combined.reindex(labels=column_order, axis=1)
    ordered.to_csv(path_or_buf=filename)
Exemplo n.º 20
0
            if parents.has_key(node):
                model.add_cpds(
                    TabularCPD(node, variableCard[node], cpds[node],
                               parents[node], parentsCardList[node]))
            else:
                model.add_cpds(TabularCPD(node, variableCard[node],
                                          cpds[node]))
        except Exception as e:
            tempException2 = 0

    #print("Tabular cpds added to model")

    print "Creating samples using Bayesian model forward Sampling"

    inference = BayesianModelSampling(model)
    normalSamples = inference.forward_sample(size=5000,
                                             return_type='dataframe')
    #print "length ", normalSamples.shape

    print "Some of the samples are as follows"
    print normalSamples[1:2]
    print " "
    print "Calculating relative entropies between different Sampling models"
    smean = {}
    sentropy = {}
    for i in range(normalSamples.shape[1]):
        sentropy[list(normalSamples[[i]])[0]] = -1 * np.sum(
            norm.logpdf(normalSamples[[i]])) / normalSamples.shape[0]
        smean[list(normalSamples[[i]])[0]] = np.mean(normalSamples[[i]])

    relEntropy = {}
Exemplo n.º 21
0
cpd_x6x2.normalize(True)
cpd_x3x5.normalize(True)

# ##### Creating Models and generating data

# In[31]:

# First Model
# Six nodes x1..x6; x1 is a parent of x2, x4 and x6, and x2 is a parent of
# x3 and x5.  The CPDs used here are defined earlier in the script.
model1 = BayesianModel()
model1.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model1.add_edges_from([('x1', 'x2'), ('x1', 'x4'), ('x1', 'x6'), ('x2', 'x3'),
                       ('x2', 'x5')])
model1.add_cpds(cpd_x1, cpd_x1x2, cpd_x1x4, cpd_x1x6, cpd_x2x3, cpd_x2x5)
# Draw 1000 forward samples from the first model as a DataFrame.
inference = BayesianModelSampling(model1)
# print(inference.forward_sample(size=1000, return_type='dataframe'))
data1 = inference.forward_sample(size=1000, return_type='dataframe')

# Second Model
# Same nodes as the first model, but the x1 -> x6 edge is reversed to
# x6 -> x1, so cpd_x6 and cpd_x6x1 replace cpd_x1 and cpd_x1x6.
model2 = BayesianModel()
model2.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model2.add_edges_from([('x1', 'x2'), ('x1', 'x4'), ('x6', 'x1'), ('x2', 'x3'),
                       ('x2', 'x5')])
model2.add_cpds(cpd_x6, cpd_x1x2, cpd_x1x4, cpd_x6x1, cpd_x2x3, cpd_x2x5)
# `inference` is rebound to a sampler for the second model.
inference = BayesianModelSampling(model2)
# print(inference.forward_sample(size=1000, return_type='dataframe'))
data2 = inference.forward_sample(size=1000, return_type='dataframe')

# Third Model
model3 = BayesianModel()
model3.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model3.add_edges_from([('x2', 'x3'), ('x2', 'x5'), ('x3', 'x6'), ('x6', 'x4'),
Exemplo n.º 22
0
    def start(self):
        """Add the CPDs to ``self.musicModel``, forward-sample it and print estimates.

        The method draws 10000 forward samples and prints relative-frequency
        estimates of, in order: P(Musicianship=1), P(Difficulty=0),
        P(Exam=1 | Musicianship=1), P(Rating=1 | Musicianship=1, Difficulty=0),
        P(Letter=0 | Rating=1), P(Letter=1) and P(Letter=1 | Musicianship=0).
        It also prints the result of ``check_model()``.  Returns nothing.
        """

        cpd_difficulty = TabularCPD(variable='Difficulty', variable_card=2, values=[[0.6], [0.4]])
        cpd_musicianship = TabularCPD(variable='Musicianship', variable_card=2, values=[[0.7], [0.3]])

        cpd_Rating = TabularCPD(variable='Rating', variable_card=3,
                                    values=[[0.3, 0.05, 0.9, 0.5],
                                            [0.4, 0.25, 0.08, 0.3],
                                            [0.3, 0.7, 0.02, 0.2]],
                                            evidence=['Difficulty', 'Musicianship'],
                                            evidence_card=[2, 2])

        cpd_Exam = TabularCPD(variable='Exam', variable_card=2,
                                values=[[0.95, 0.2],
                                        [0.05, 0.8]],
                                evidence=['Musicianship'],
                                evidence_card=[2])

        cpd_Letter = TabularCPD(variable='Letter', variable_card=2,
                                values=[[0.1, 0.4, 0.99],
                                        [0.9, 0.6, 0.01]],
                                evidence=['Rating'],
                                evidence_card=[3])
        self.musicModel.add_cpds(cpd_difficulty, cpd_musicianship, cpd_Rating, cpd_Exam, cpd_Letter)
        print(self.musicModel.check_model())
        inference = BayesianModelSampling(self.musicModel)

        #Part one P(m = strong)P(d = low)P(r = ∗∗|m = strong,d = low) P(e = high|m = strong)P(letter = weak|∗∗)
        sample_size = 10000
        resultOne = inference.forward_sample(size=sample_size, return_type='recarray')

        # Read every variable by field name rather than by position: the
        # position of a column depends on the sampler's topological order,
        # whereas the field names are the variable names themselves.
        musicianship = resultOne['Musicianship']
        exam = resultOne['Exam']
        difficulty = resultOne['Difficulty']
        rating = resultOne['Rating']
        letter = resultOne['Letter']

        def fraction(mask):
            # Share of all samples for which the boolean mask holds.
            return int(mask.sum()) / sample_size

        musProb = fraction(musicianship == 1)
        print('music prob: ', musProb)

        diffProb = fraction(difficulty == 0)
        print('difficulty prob: ', diffProb)

        # P(e | m) = P(e, m) / P(m)
        examProb = fraction((exam == 1) & (musicianship == 1)) / musProb
        print('exam prob: ', examProb)

        # P(r | m, d) = P(r, m, d) / (P(d) * P(m)); the denominator relies on
        # Difficulty and Musicianship having no parents (both CPDs above are
        # unconditional).
        ratingProb = fraction((rating == 1) & (musicianship == 1) & (difficulty == 0)) / (diffProb * musProb)
        print('rating prob: ', ratingProb)

        # P(letter | r) = P(letter, r) / P(r).  The denominator must be the
        # marginal P(Rating=1), not the conditional ratingProb computed above.
        ratingMarginalProb = fraction(rating == 1)
        letterProb = fraction((letter == 0) & (rating == 1)) / ratingMarginalProb
        print('letter prob: ', letterProb)

        letterStrongSumProb = fraction(letter == 1)
        print('letter strong no evidence prob: ', letterStrongSumProb)

        # P(letter = 1 | m = 0) = P(letter = 1, m = 0) / (1 - P(m = 1))
        letterStrongGivenProb = fraction((letter == 1) & (musicianship == 0)) / (1 - musProb)
        print('letter strong given weak music prob: ', letterStrongGivenProb)
# Next, let us define another objective function, _i.e._ the KL divergence.

#------------------------------------
# Method 2: Training Markov Network
#------------------------------------
# Generate samples from Bayesian network

bn_sampler = BayesianModelSampling(grass_model)
bn_sampler.topological_order = NODES  # make sure the topological order is consistent with NODES order
kld_temp = 10.
i = 1

# Keep drawing fresh sample sets until the empirical distribution of one of
# them is within 0.001 KL divergence of p_true.  kld_temp starts at 10, so the
# loop body always runs at least once and p_data_bn is defined afterwards.
while kld_temp > 0.001:
    print('Iteration %d, kld %f' % (i, kld_temp))
    i += 1
    bn_samps = bn_sampler.forward_sample(size=NUM_READS,
                                         return_type='dataframe')
    # calculate true data stats: per-node means followed by the pairwise
    # second moments for each edge listed in MORAL_EDGES
    data_stats = np.zeros(shape=(len(NODES) + len(MORAL_EDGES), ))
    np.copyto(data_stats[:len(NODES)], np.mean(bn_samps, axis=0))
    np.copyto(
        data_stats[len(NODES):],
        np.dot(bn_samps.T, bn_samps)[np.array(MORAL_EDGES)[:, 0],
                                     np.array(MORAL_EDGES)[:, 1]] /
        (NUM_READS * 1.))
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray and is available in every pandas release.
    p_data_bn = calculate_histogram(bn_samps.values)
    kld_temp = kld(p_true, p_data_bn)

# Side-by-side bars: true distribution vs. the accepted sample histogram.
plt.bar(range(16), p_true, width=0.4)
plt.bar(np.array(range(16)) + 0.4, p_data_bn, width=0.4)
print('KLD from true distribution to generated data distribution:', kld_temp)
Exemplo n.º 24
0
                  cpd_self_harm)

# check_model checks for the network structure and CPDs and verifies that the CPDs are correctly
# defined and sum to 1.
# NOTE(review): the return value is not stored or printed here; presumably this
# was run in a notebook/REPL where the last expression is displayed - confirm.
pg_model.check_model()

# examine conditional independence relationships:
# (results are likewise bare expressions and are not captured)
pg_model.local_independencies("parent_education")
pg_model.local_independencies("child_obesity")
pg_model.local_independencies("child_screen_time")
pg_model.local_independencies("child_physical_activity")

# sample data from the network:
inference = BayesianModelSampling(pg_model)
sim_n = 50_000
simulated_sample = inference.forward_sample(size=sim_n)
# recode every column to a 0/1 indicator: 1 where the sampled state is "high",
# 0 for any other state
for colname_j in simulated_sample.columns:
    simulated_sample[colname_j] = (
        simulated_sample[colname_j] == "high").astype(int)

# draw correlation plot of the variables:
corr_mat = simulated_sample.corr()
# NOTE(review): Styler.set_precision appears to be deprecated/removed in recent
# pandas releases - confirm against the pandas version in use.
corr_mat.style.background_gradient(cmap="coolwarm").set_precision(2)

# example: if we condition on "child_screen_time"..
# ..then "child_physical_activity" becomes independent of "parent_education":
# (correlations among the remaining columns, restricted to child_screen_time == 1)
corr_mat = simulated_sample.query("child_screen_time==1").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").set_precision(2)

corr_mat = simulated_sample.query("child_screen_time==0").drop(
                         'difficulty': 2,
                         'Q9': 3
                     }))
# print(belpro.map_query(variables=['Q25', 'Q18','Q16'],evidence={'instr':1}))
print(
    belpro.map_query(variables=['attendance', 'Q9', 'difficulty'],
                     evidence={'class': 7}))

#Commented some queries because taking a lot of time to run

# print(belpro.map_query(variables=['Q28','Q11'],evidence={'instr':2, 'class':10}))
# print(belpro.map_query(variables=['Q18', 'Q26','Q13'],evidence={'instr':2}))
# print(belpro.map_query(variables=['Q23', 'Q21','Q17'],evidence={'instr':2}))
inference = BayesianModelSampling(bayesmodel)

# Draw five forward samples from the network.
df = inference.forward_sample(5)
# print df.shape
# Single-argument print(...) calls behave the same on Python 2 and Python 3
# and match the print(...) call used for the map_query output above.
print(df)
print(np.mean(df))
# print scipy.stats.entropy(df)

# df.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# both return the frame's data as a NumPy array.
dataarray = df.values
print(dataarray)
arr = dataarray.astype(float)
print(arr)
# Accumulators for the loop that follows.
sum1 = []
total = 0
count = 0

for j in range(0, 18):
    for i in arr:
Exemplo n.º 26
0
from pgmpy.models.BayesianModel import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
# Three-node network: 'grade' has the two parents 'diff' and 'intel'.
student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
# One row per grade state, one column per (intel, diff) combination.
cpd_g = TabularCPD('grade', 3,
                   [[0.3, 0.05, 0.9, 0.5],
                    [0.4, 0.25, 0.08, 0.3],
                    [0.3, 0.7, 0.02, 0.2]],
                   ['intel', 'diff'], [2, 2])
student.add_cpds(cpd_d, cpd_i, cpd_g)
inference = BayesianModelSampling(student)
# print(...) with a single argument is valid on both Python 2 and Python 3,
# unlike the bare print statement.
print(inference.forward_sample(size=3, return_type='recarray'))
Exemplo n.º 27
0
def generateWysiwygData(samplesize=4000, filename="data/wysiwygdata4.csv"):
    """Sample the WYSIWYG Bayesian model and write the samples to a CSV file.

    We define a bayesian model based on the WYSIWYG model from the thesis.
    There are 6 C variables and 6 X variables. For both C and X the first
    four are discrete variables, the other two continuous. The variable C1 is
    causally influencing Y to assure a certain level of group unfairness in
    the data.

    Parameters
    ----------
    samplesize : number of rows to forward-sample from the network.
    filename   : path of the CSV file the resulting frame is written to.
    """

    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'),
                                  ('A', 'C4'), ('C1', 'Y'), ('Y', 'C2'),
                                  ('Y', 'C3'), ('Y', 'C4'), ('A', 'X1'),
                                  ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3'),
                                  ('Y', 'X4')])

    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])

    # A CPD table has one row per state of the variable and one column per
    # parent configuration, i.e. 2 x 2 here.  The previous 4 x 1 layout only
    # worked where the values were silently reshaped; this is the same table
    # written in its proper shape (each column sums to 1).
    cpd_y = TabularCPD(variable='Y',
                       variable_card=2,
                       values=[[0.65, 0.4], [0.35, 0.6]],
                       evidence=['C1'],
                       evidence_card=[2])

    cpd_c1 = TabularCPD(variable='C1',
                        variable_card=2,
                        values=[[0.85, 0.2], [0.15, 0.8]],
                        evidence=['A'],
                        evidence_card=[2])

    cpd_c2 = TabularCPD(variable='C2',
                        variable_card=4,
                        values=[[0.23, 0.27, 0.25, 0.20],
                                [0.35, 0.23, 0.24, 0.15],
                                [0.22, 0.27, 0.25, 0.25],
                                [0.20, 0.23, 0.26, 0.40]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_c3 = TabularCPD(variable='C3',
                        variable_card=2,
                        values=[[0.52, 0.49, 0.5, 0.45],
                                [0.48, 0.51, 0.5, 0.55]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_c4 = TabularCPD(variable='C4',
                        variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x1 = TabularCPD(variable='X1',
                        variable_card=2,
                        values=[[0.57, 0.48, 0.52, 0.38],
                                [0.43, 0.52, 0.48, 0.62]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x2 = TabularCPD(variable='X2',
                        variable_card=4,
                        values=[[0.24, 0.28, 0.26, 0.19],
                                [0.38, 0.22, 0.24, 0.15],
                                [0.20, 0.28, 0.26, 0.23],
                                [0.18, 0.22, 0.24, 0.43]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x3 = TabularCPD(variable='X3',
                        variable_card=2,
                        values=[[0.54, 0.48, 0.52, 0.4],
                                [0.46, 0.52, 0.48, 0.6]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    cpd_x4 = TabularCPD(variable='X4',
                        variable_card=4,
                        values=[[0.20, 0.25, 0.24, 0.40],
                                [0.21, 0.25, 0.28, 0.21],
                                [0.21, 0.25, 0.24, 0.21],
                                [0.38, 0.25, 0.24, 0.18]],
                        evidence=['A', 'Y'],
                        evidence_card=[2, 2])

    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_c4, cpd_x1,
                          cpd_x2, cpd_x3, cpd_x4, cpd_y)

    # Discrete variables: forward-sample the network.
    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]

    # Continuous variables C5, C6, X5, X6 are sampled from the A and Y
    # columns of the discrete samples, in this order.
    C5 = samplecontinuous(AY,
                          samplesize=samplesize,
                          contatt="C5",
                          meana0=1,
                          meana1=1.2,
                          covy0=[1],
                          covy1=[0.9])
    C6 = samplecontinuous(AY,
                          samplesize=samplesize,
                          contatt="C6",
                          meana0=2,
                          meana1=1.8,
                          covy0=[1],
                          covy1=[0.95])

    X5 = samplecontinuous(AY,
                          samplesize=samplesize,
                          contatt="X5",
                          meana0=1.1,
                          meana1=1.4,
                          covy0=[1.1],
                          covy1=[0.95])
    X6 = samplecontinuous(AY,
                          samplesize=samplesize,
                          contatt="X6",
                          meana0=1.9,
                          meana1=1.5,
                          covy0=[1],
                          covy1=[1.1])

    # Append the continuous columns to the discrete ones and write the result.
    discframe = pd.concat([discframe, C5, C6, X5, X6], axis=1)
    discframe.to_csv(path_or_buf=filename)