Example #1
    def test_score_titanic(self):
        scorer = K2Score(self.titanic_data2)
        titanic = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
        self.assertAlmostEqual(scorer.score(titanic), -1891.0630673606006)
        titanic2 = BayesianModel([("Pclass", "Sex")])
        titanic2.add_nodes_from(["Sex", "Survived", "Pclass"])
        self.assertLess(scorer.score(titanic2), scorer.score(titanic))
Example #2
    def setup(self):
        model = get_example_model('alarm')
        samples = model.simulate(n_samples=int(1e4),
                                 seed=42,
                                 show_progress=False)
        self.scoring_method = K2Score(samples)
        self.est = HillClimbSearch(data=samples)
Example #3
    def __init__(self, data, scoring_method=None, **kwargs):
        """
        Class for heuristic hill climb searches for DAGs, to learn
        network structure from data. `estimate` attempts to find a model with optimal score.

        Parameters
        ----------
        data: pandas DataFrame object
            DataFrame object where each column represents one variable.
            (If some values in the data are missing, the data cells should be set to `numpy.NaN`.
            Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)

        scoring_method: Instance of a `StructureScore`-subclass (`K2Score` is used as default)
            An instance of `K2Score`, `BdeuScore`, or `BicScore`.
            This score is optimized during structure estimation by the `estimate`-method.

        state_names: dict (optional)
            A dict indicating, for each variable, the discrete set of states (or values)
            that the variable can take. If unspecified, the observed values in the data set
            are taken to be the only possible states.

        complete_samples_only: bool (optional, default `True`)
            Specifies how to deal with missing data, if present. If set to `True` all rows
            that contain `np.NaN` somewhere are ignored. If `False` then, for each variable,
            every row where neither the variable nor its parents are `np.NaN` is used.
            This sets the behavior of the `state_count`-method.
        """
        if scoring_method is not None:
            self.scoring_method = scoring_method
        else:
            self.scoring_method = K2Score(data, **kwargs)

        super(HillClimbSearch, self).__init__(data, **kwargs)
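For reference, a minimal usage sketch of the constructor documented above, on a synthetic DataFrame; depending on the pgmpy version, `scoring_method` may instead be passed to `estimate()` rather than the constructor.

import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, K2Score

# Synthetic discrete data, purely for illustration.
df = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)), columns=list("ABC"))
est = HillClimbSearch(df, scoring_method=K2Score(df))  # K2Score is also the default
best_dag = est.estimate()
print(best_dag.edges())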
Example #4
    def estimate(self,
                 tabu_length=100,
                 max_indegree=2,
                 black_list=None,
                 epsilon=1e-4,
                 max_iter=1e6,
                 show_progress=True):

        # We will be using K2Score for this model
        score = K2Score(data=self.data)
        # local_score returns the score for a node given its parents.
        # It is evaluated on every iteration for all candidate changes;
        # the search is greedy and picks the best available option.
        score_fn = score.local_score
        # Initialize a starting DAG.
        # pgmpy's DAG class adds some functionality on top of nx.DiGraph.
        start_dag = DAG()
        start_dag.add_nodes_from(self.variables)
        # Set the edges we do not want to have in the graph
        if black_list is None:
            black_list = set()
        else:
            black_list = set(black_list)

        # max_indegree caps the number of parents any node may acquire.

        # The tabu list tracks recent operations so the search does not
        # immediately undo them.
        tabu_list = deque(maxlen=tabu_length)
        # Initialize a current model
        current_model = start_dag
        if show_progress:
            iteration = trange(int(max_iter))
        else:
            iteration = range(int(max_iter))
        for _ in iteration:
            # Get the best operations based on K2 score with self._legal_operations
            best_operation, best_score_change = max(
                self._legal_operations(
                    model=current_model,
                    score=score_fn,
                    tabu_list=tabu_list,
                    max_indegree=max_indegree,
                    black_list=black_list,
                ),
                key=lambda t: t[1],
            )

            if best_score_change < epsilon:
                break
            elif best_operation[0] == '+':
                current_model.add_edge(*best_operation[1])
                tabu_list.append(("-", best_operation[1]))
            elif best_operation[0] == '-':
                current_model.remove_edge(*best_operation[1])
                tabu_list.append(("+", best_operation[1]))
            elif best_operation[0] == 'flip':
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list.append(best_operation)

        return current_model
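A sketch of driving this estimate() variant with its tuning parameters; the names come straight from the signature above, `est` stands for a HillClimbSearch instance, and the black-listed edge is hypothetical.

dag = est.estimate(
    tabu_length=100,          # recent operations that may not be reversed
    max_indegree=2,           # cap on the number of parents per node
    black_list=[("A", "B")],  # hypothetical edge the search must never add
    epsilon=1e-4,             # stop once the best score gain drops below this
    max_iter=int(1e6),
    show_progress=False,
)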
Example #5
def _SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3:
        print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)
    else:
        raise ValueError('[bnlearn] Unknown scoretype: %s' % scoretype)

    return scoring_method
Example #6
def SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3:
        print('[BNLEARN][STRUCTURE LEARNING] Set scoring type at [%s]' %
              (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BdeuScore(df, equivalent_sample_size=5)
    else:
        raise ValueError('[BNLEARN] Unknown scoretype: %s' % scoretype)

    return scoring_method
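A short sketch wiring the selector above into a hill-climb search, assuming `df` is a discrete pandas DataFrame as in the surrounding examples:

scoring_method = SetScoringType(df, 'k2')
hc = HillClimbSearch(df, scoring_method=scoring_method)
best_model = hc.estimate()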
Example #7
def scoreStructureLearn(data,
                        search='HillClimbSearch',
                        scoring_method='BicScore'):
    # Score-and-search based structure learning
    # search: HillClimbSearch, ExhaustiveSearch
    # scoring_method: 'BicScore', 'K2Score', 'BdeuScore'
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)
    else:
        raise ValueError('Unknown scoring_method: %s' % scoring_method)
    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)
    best_model = es.estimate()
    return best_model
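A hedged usage example for the helper above, assuming `data` is a discrete pandas DataFrame:

best_model = scoreStructureLearn(data, search='HillClimbSearch', scoring_method='K2Score')
print(best_model.edges())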
Example #8
    def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list("AB"))
        self.rand_data["C"] = self.rand_data["B"]
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(["A", "B", "C"])
        self.model2 = self.model1.copy()
        self.model2.add_edge("A", "B")

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            "pgmpy/tests/test_estimators/testdata/titanic_train.csv")
        self.titanic_data1 = self.titanic_data[[
            "Survived", "Sex", "Pclass", "Age", "Embarked"
        ]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)
Example #9
    def learn_structure(self, method, scoring_method, log=True):
        ''' (4)
        Builds the structure of the network from the data.
        -----------------
        Parameters:
        method         : The technique used to search for the structure
            -> scoring_approx     - approximate (hill-climb) search with a scoring method
            -> scoring_exhaustive - exhaustive search with a scoring method
            -> constraint         - constraint-based technique
        scoring_method : "K2", "bic", or "bdeu"
        log            : True to print debug information to the console
        '''

        #Select the scoring method for the local search of the structure
        if scoring_method == "K2":
            scores = K2Score(self.data)
        elif scoring_method == "bic":
            scores = BicScore(self.data)
        elif scoring_method == "bdeu":
            scores = BdeuScore(self.data)

        #Select the actual method
        if method == "scoring_approx":
            est = HillClimbSearch(self.data, scores)
        elif method == "scoring_exhaustive":
            est = ExhaustiveSearch(self.data, scores)
        elif method == "constraint":
            est = ConstraintBasedEstimator(self.data)

        self.best_model = est.estimate()
        # Remove all nodes not connected to anything else.
        self.eliminate_isolated_nodes()

        for edge in self.best_model.edges_iter():
            self.file_writer.write_txt(str(edge))

        self.log("Method used for structural learning: " + method, log)
        #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
        self.log("Search terminated", log)
Example #10
def scoreModels(h0Diff, h0Rarity):
	diffModel0 = [('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'), ('d3', 'd8'), 
				  ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'), ('d9', 'd8')]

	diffModel1 = [('d2', 'd5'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
				  ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
				  ('d9', 'd8')]

	diffModel2 = [('d1', 'd2'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
				  ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
				  ('d9', 'd8')]

	print(' \nestimating K2/BIC score of difference structures\n')
	print('k2score model0: {0}		BicScore model0: {1}'.format(
		K2Score(h0Diff).score(BayesianModel(diffModel0)),
		BicScore(h0Diff).score(BayesianModel(diffModel0))))
	print('k2score model1: {0}		BicScore model1: {1}'.format(
		K2Score(h0Diff).score(BayesianModel(diffModel1)),
		BicScore(h0Diff).score(BayesianModel(diffModel1))))
	print('k2score model2: {0}		BicScore model2: {1}'.format(
		K2Score(h0Diff).score(BayesianModel(diffModel2)),
		BicScore(h0Diff).score(BayesianModel(diffModel2))))

	rarityModel0 = [('r5', 'r9'), ('r5', 'r3'), ('r9', 'r1'), ('r8', 'r3'),
					('r6', 'r9'), ('r6', 'r3')]


	rarityModel1 = [('r6', 'r9'), ('r7', 'r9'), ('r3', 'r4'), ('r3', 'r5'),
					('r3', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
					('r9', 'r1')]

	rarityModel2 = [('r7', 'r9'), ('r4', 'r3'), ('r4', 'r9'), ('r1', 'r2'),
					('r1', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
					('r9', 'r6')]

	print(' \nestimating K2/BIC score of rarity structures\n')
	print('k2score model0: {0}		BicScore model0: {1}'.format(
		K2Score(h0Rarity).score(BayesianModel(rarityModel0)),
		BicScore(h0Rarity).score(BayesianModel(rarityModel0))))
	print('k2score model1: {0}		BicScore model1: {1}'.format(
		K2Score(h0Rarity).score(BayesianModel(rarityModel1)),
		BicScore(h0Rarity).score(BayesianModel(rarityModel1))))
	print('k2score model2: {0}		BicScore model2: {1}'.format(
		K2Score(h0Rarity).score(BayesianModel(rarityModel2)),
		BicScore(h0Rarity).score(BayesianModel(rarityModel2))))
Example #11
cpd1.append(p_64)
cpd1.append(p_36)
cpd1.append(p4)

model1.add_cpds(*cpd1)

print("------------------------------------------")
print("Edges of model1:", model1.edges())
print("Checking Model1:", model1.check_model())
print("------------------------------------------")
'''generate data for model1'''
inference = BayesianModelSampling(model1)
data = inference.forward_sample(size=3000, return_type='dataframe')
print("Data for model1:")
print(data)
k2 = K2Score(data)
print('Model1 K2 Score: ' + str(k2.score(model1)))

'''Inference'''
from pgmpy.inference import VariableElimination
infer = VariableElimination(model1)
print("Inference of x3:")
print(infer.query(['x3']) ['x3'])
print("Inference of x5|x2:")
print(infer.query(['x5'], evidence={ 'x2': 1}) ['x5'])


'''Model2'''

model2 = BayesianModel([('x1', 'x2'), ('x1', 'x6'), ('x2', 'x5'),
                        ('x2', 'x3'), ('x6', 'x4')])
model2.add_cpds(p1, p_21, p_52, p_32, p_46, p_61)
Example #12
def create_BN_model_using_BayesianEstimator(data):
    # Wrap the raw sensor data in a DataFrame.
    data = pd.DataFrame(data)

    #start_time = time.time()
    # 2 hours running, without output
    #hc = HillClimbSearch(data, scoring_method=BicScore(data))
    #best_model = hc.estimate()
    #print(hc.scoring_method)
    #print(best_model.edges())
    #end_time = time.time()
    #print("execution time in seconds:")
    #print(end_time-start_time)

    #start_time = time.time()
    #hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
    #best_model = hc.estimate()
    #print(hc.scoring_method)
    #print(best_model.edges())
    #end_time = time.time()
    #print("execution time in seconds:")
    #print(end_time-start_time)

    #structure learning
    print("structure learning")
    start_time = time.time()
    # Alternatives: BicScore(data), BdeuScore(data)
    hc = HillClimbSearch(data, scoring_method=K2Score(data))
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:{}".format(end_time - start_time))

    # Parameter learning: build a model over the learned structure and
    # return a BayesianEstimator for it.
    casas7_model = BayesianModel(best_model.edges())
    print("*******************")

    # Alternative: fit the CPDs on the model directly instead of returning
    # the estimator, e.g.
    # casas7_model.fit(data, estimator=BayesianEstimator, prior_type="K2")
    estimator = BayesianEstimator(casas7_model, data)

    return estimator
Example #13
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch
from pgmpy.models import BayesianModel
from pgmpy.estimators import K2Score
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import VariableElimination

feature_val1 = pd.read_csv('15features_f.csv')
'''Columns: 1 pen_pressure  2 letter_spacing  3 size  4 dimension  5 is_lowercase
6 is_continuous  7 slantness  8 tilt  9 entry_stroke_a
10 staff_of_a  11 formation_n  12 staff_of_d  13 exit_stroke_d
14 word_formation  15 constancy'''

hill = HillClimbSearch(feature_val1, scoring_method=K2Score(feature_val1))
f_model = hill.estimate()
print(f_model.edges())

feature_val2 = pd.read_csv('15features_g.csv')

hill1 = HillClimbSearch(feature_val2, scoring_method=K2Score(feature_val2))
g_model = hill1.estimate()
print(g_model.edges())

corr_mat = feature_val1.corr()
print(corr_mat)
corr_feature = set()
for i in range(len(corr_mat.columns)):
    for j in range(i):
        if abs(corr_mat.iloc[i, j]) > 0.2:
            # The snippet is cut off here; collecting the correlated column
            # is the natural completion given `corr_feature` defined above.
            corr_feature.add(corr_mat.columns[i])
Example #14
model5.add_edges_from([('x1', 'x2'), ('x1', 'x6'), ('x6', 'x4'), ('x2', 'x3'),
                       ('x3', 'x5')])
model5.add_cpds(cpd_x1, cpd_x1x2, cpd_x1x6, cpd_x6x4, cpd_x2x3, cpd_x3x5)
inference = BayesianModelSampling(model5)
# print(inference.forward_sample(size=1000, return_type='dataframe'))
data5 = inference.forward_sample(size=1000, return_type='dataframe')

# ##### Evaluating the models using K2 score on the generated data

# In[70]:

# Evaluating the models on the data sets generated by them
data = pd.concat([data1, data2, data3, data4, data5])
data.shape

k2 = K2Score(data)

print('Model 1 K2 Score: ' + str(k2.score(model1)))  # model 1 is the best model
print('Model 2 K2 Score: ' + str(k2.score(model2)))
print('Model 3 K2 Score: ' + str(k2.score(model3)))
print('Model 4 K2 Score: ' + str(k2.score(model4)))
print('Model 5 K2 Score: ' + str(k2.score(model5)))

# ##### Find the high and low probability patterns of 'th'

# In[153]:

# Finding 'th' highest frequency pattern
frequency = data.groupby(['x1', 'x2', 'x3', 'x4', 'x5',
                          'x6']).size().to_frame('count').reset_index()
Example #15
models = [model1, model2]

[m.fit(data) for m in models]  # ML-fit

STATE_NAMES = model1.cpds[0].state_names
print('\nState names:')
for s in STATE_NAMES:
    print(s, STATE_NAMES[s])

# Information for the curious:
# Structure-scores: http://pgmpy.org/estimators.html#structure-score
# K2-score: for instance http://www.lx.it.pt/~asmc/pub/talks/09-TA/ta_pres.pdf
# Additive smoothing and pseudocount: https://en.wikipedia.org/wiki/Additive_smoothing
# Scoring functions: https://www.cs.helsinki.fi/u/bmmalone/probabilistic-models-spring-2014/ScoringFunctions.pdf
k2 = K2Score(data)
print('Structure scores:', [k2.score(m) for m in models])

separator()

print('\n\nExhaustive structure search based on structure scores:')

from pgmpy.estimators import ExhaustiveSearch
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

# Warning: doing exhaustive search on a PGM with all 5 variables
# takes more time than you should have to wait. Hence we re-fit
# the models to data where some variable(s) have been removed
# for this assignment.
raw_data2 = {
Example #16
    def opt(self, file1, file2):
        # Read the node and edge lists from the first two lines of file1.
        with open(file1, encoding="utf8") as f1:
            lines = f1.readlines()
        nodes = self.getegdes(lines[0])
        edges = self.getegdes(lines[1])
        data = pd.read_csv(file2)

        G = BayesianModel()
        G.add_nodes_from(nodes)
        for i in range(int(len(edges) / 2)):
            G.add_edge(edges[2 * i], edges[2 * i + 1])
        # nx.draw(G)
        # plt.show()
        k2 = K2Score(data).score(G)
        bic = BicScore(data).score(G)
        bdeu = BDeuScore(data).score(G)
        print(k2, ",", bic, ",", bdeu)

        est = HillClimbSearch(data, scoring_method=K2Score(data))
        model = est.estimate()
        model_edges = model.edges()
        G_ = nx.DiGraph()
        G_.add_edges_from(model_edges)
        G_copy = nx.DiGraph()
        G_copy.add_edges_from(G.edges)
        add = []
        add_mut = []
        delete = []
        delete_mut = []
        # Collect candidate edges to add, ranked by mutual information.
        for edge in model_edges:
            node1, node2 = edge
            # Only consider the edge if it is new and does not create a cycle.
            if not nx.has_path(G, node2, node1):
                if not G.has_edge(node1, node2):
                    add.append((node1, node2))
                    mut = mr.mutual_info_score(data[node1], data[node2])
                    add_mut.append(mut)
        seq = list(zip(add_mut, add))
        seq = sorted(seq, key=lambda s: s[0], reverse=True)
        alpha = 0.015
        # if seq[0][0] > alpha:
        #     add = seq[0:1]

        add = seq[0:1]

        data_edges = []
        for edge in G.edges:
            node1 = edge[0]
            node2 = edge[1]
            mut = mr.mutual_info_score(data[node1], data[node2])
            delete_mut.append(mut)
            data_edges.append(edge)
            # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
            #     this = '('+node1+','+node2+')'
            #     delete.append(this)
        seq = list(zip(delete_mut, data_edges))
        seq = sorted(seq, key=lambda s: s[0])

        # if seq[0][0] < alpha:
        #     delete = seq[0:1]
        if len(edges) > 2:
            delete = seq[0:1]
            if len(add) > 0:
                if delete[0][0] > add[0][0]:
                    delete = []

        print('add')
        for i in add:
            print(str(i[1]) + "," + str(i[0]))

        print('delete')
        for j in delete:
            print(str(j[1]) + "," + str(j[0]))
            # print(j[0])

        print('cpt')
        estimator = BayesianEstimator(G, data)
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            CPT = np.transpose(cpd.values)
            # CPT = cpd.values
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)

        print('mutual')
        output1 = []
        for i in range(int(len(edges) / 2)):
            mut = mr.mutual_info_score(data[edges[2 * i]],
                                       data[edges[2 * i + 1]])
            output1.append(mut)
        output2 = {}
        for node1 in G.nodes():
            d = {}
            for node2 in G.nodes():
                if node1 == node2:
                    continue
                mut = mr.mutual_info_score(data[node1], data[node2])

                d[node2] = mut
            output2[node1] = d
        print(output1)
        print(output2)
Example #17
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pgmpy.estimators import HillClimbSearch, K2Score
from sklearn.preprocessing import KBinsDiscretizer

data = pd.read_csv("data/data_auto_mpg.csv")
# data = pd.DataFrame(np.random.randn(500, 5), columns=list('ABCDE'))
# data['F'] = data['A'] * data['B']

for col in data.columns:
    if (data[col].dtype == np.float64 or data[col].dtype == np.float32):
        # bin_size = np.unique(data[col].values).shape[0]
        # kbins = KBinsDiscretizer(n_bins=bin_size, encode='ordinal', strategy='uniform').fit(data[col].values.reshape(-1,1))
        # data[col] = kbins.transform(data[col].values.reshape(-1,1)).astype(np.int64)
        data[col] = data[col].astype(np.int64)

data = data.iloc[:, :10]

print(data.dtypes)
print(data)

print("aq")
est = HillClimbSearch(data, scoring_method=K2Score(data))
print("aq")
model = est.estimate(max_indegree=5)
print("aq")

print(model.edges)

plt.figure()
nx.draw_networkx(model)
plt.show()
Example #18
#feature_names.append("Person")
#print(feature_names)
#mydata = np.random.randint(low=0, high=2,size=(100, 6))
mydata = np.genfromtxt(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\Aras\House A\CSV_Summery\Sequential\Day\occur\Whole_data.csv', delimiter=",")
#pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv')
#print(mydata)
feature_names = [str(i) for i in range (1,41)]
feature_names.append("Person")
feature_names.append("activity")
print(feature_names)
data = pd.DataFrame(mydata, columns= feature_names)#['X', 'Y'])
print(data)

list_of_scoring_methods = [#BicScore(data),
                           #BdeuScore(data),
                           K2Score(data)]

for scoreMethod in list_of_scoring_methods:
    start_time = time.time()
    hc = HillClimbSearch(data, scoreMethod)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:")
    print(end_time-start_time)



#casas7_model = BayesianModel()
#casas7_model.fit(data, estimator=BayesianEstimator)#MaximumLikelihoodEstimator)
Example #19
    def test_score(self):
        self.assertAlmostEqual(K2Score(self.d1).score(self.m1), -10.73813429536977)
        self.assertEqual(K2Score(self.d1).score(BayesianModel()), 0)
Example #20
import numpy as np
import pandas as pd
from pgmpy.estimators import ExhaustiveSearch, K2Score

if __name__ == '__main__':
#     fp = os.path.join('data', 'MTurk_Harvey.csv')
#     df = pd.read_csv(fp)
#     data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)
#     x = data[:,:-1]
#     y = data[:,-1]
    
#     data = pd.DataFrame(np.random.randint(0, 5, size=(2500, 3)), columns=list('XYZ'))
#     data['sum'] = data.sum(axis=1)
#     #print(data)
    
#     est = ConstraintBasedEstimator(data)
#     skel, sep_sets = est.estimate_skeleton()
#     print(skel.edges())

#     s = ExhaustiveSearch(pd.DataFrame(data={'Temperature': [23, 19],'Weather': ['sunny', 'cloudy'],'Humidity': [65, 75]}))
#     print(len(list(s.all_dags())))
#     for dag in s.all_dags():
#         print(dag.edges())
        
    data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    data['C'] = data['B']
    searcher = ExhaustiveSearch(data, scoring_method=K2Score(data))
    for score, model in searcher.all_scores():
        print(score)
        print(model.edges())
Example #21
models = [model1, model2]

[m.fit(data) for m in models]  # ML-fit

STATE_NAMES = model1.cpds[0].state_names
#print(model2.cpds[3])
print('\nState names:')
for s in STATE_NAMES:
    print(s, STATE_NAMES[s])

# Information for the curious:
# Structure-scores: http://pgmpy.org/estimators.html#structure-score
# K2-score: for instance http://www.lx.it.pt/~asmc/pub/talks/09-TA/ta_pres.pdf
# Additive smoothing and pseudocount: https://en.wikipedia.org/wiki/Additive_smoothing
# Scoring functions: https://www.cs.helsinki.fi/u/bmmalone/probabilistic-models-spring-2014/ScoringFunctions.pdf
k2 = K2Score(data)
print('Structure scores:', [k2.score(m) for m in models])

separator()

print('\n\nExhaustive structure search based on structure scores:')

from pgmpy.estimators import ExhaustiveSearch, HillClimbSearch, BicScore

# Warning: doing exhaustive search on a PGM with all 5 variables
# takes more time than you should have to wait. Hence we re-fit
# the models to data where some variable(s) have been removed
# for this assignment.
raw_data2 = {
    'age': data['age'],
    'avg_cs': data['avg_cs'],
Example #22
# %% codecell
from pgmpy.estimators import BDeuScore, K2Score, BicScore

# Create random data sample with 3 variables, where Z is dependent on X, Y:
data: DataFrame = DataFrame(data=np.random.randint(low=0,
                                                   high=4,
                                                   size=(5000, 2)),
                            columns=list('XY'))

# Making Z dependent (in some arbitrary relation like addition) on X and Y
data['Z'] = data['X'] + data['Y']

# %% codecell
# Creating the scoring objects from this data:
bdeu: BDeuScore = BDeuScore(data, equivalent_sample_size=5)
k2: K2Score = K2Score(data=data)
bic: BicScore = BicScore(data=data)

# %% codecell
commonEvidenceModel: BayesianModel = BayesianModel([('X', 'Z'), ('Y', 'Z')])
drawGraph(commonEvidenceModel)
# %% codecell
commonCauseModel: BayesianModel = BayesianModel([('X', 'Z'), ('X', 'Y')])
drawGraph(commonCauseModel)

# %% codecell
bdeu.score(commonEvidenceModel)
# %% codecell
k2.score(commonEvidenceModel)
# %% codecell
bic.score(commonEvidenceModel)
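The three scorers above can be compared side by side; a small sketch using only the names defined in this example:

# %% codecell
# Compare all three structure scores for both candidate structures.
for name, model in [("common evidence", commonEvidenceModel),
                    ("common cause", commonCauseModel)]:
    print(name,
          "BDeu:", bdeu.score(model),
          "K2:", k2.score(model),
          "BIC:", bic.score(model))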
Example #23
                    PRED[X_j] = NEW_PRED[i, Xj]

                X_mat = X_mat.difference(S)
                X_pred = X_pred.intersection(S)

        break


def pi(G, Xi):
    return set([p for p, f in G.edges if f == Xi])


def beta(G, xi):
    pass


data = pd.read_csv("../data/asia.csv")
newData = data.copy()

for col in newData.columns:
    if (newData[col].dtype == np.float64 or newData[col].dtype == np.float32):
        newData[col] = newData[col].astype(np.int64)

newData = newData.iloc[:, :7]
e_t = [1, 1, 1, 1, 1, 1]

G = HillClimbSearch(newData,
                    scoring_method=K2Score(newData)).estimate(max_indegree=5)

MaxIndependentSet(data, e_t, G, pi)
Example #24
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score
## Structure learning
df = pd.read_csv('data.csv', encoding='gb18030')  # read_csv already returns a DataFrame
bic = BicScore(df)
k2 = K2Score(df)
hc = HillClimbSearch(df, scoring_method=bic)
#hc = ExhaustiveSearch(df, k2)
model = hc.estimate()
for ee in model.edges():
    print(ee)



## Parameter learning
from pgmpy.models import BayesianModel
mod = BayesianModel(model.edges())
mod.fit(df)
for cpd in mod.get_cpds():
    print(cpd)

#print(mod.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation
cancer_infer = VariableElimination(mod)
q = cancer_infer.query(variables=['HA'])
print(q)