Example #1
def test_parallel_structure_learning():
    logps = -19.8282, -345.9527, -4847.59688, -604.0190
    for X, logp in zip(datasets, logps):
        model = BayesianNetwork.from_samples(X, algorithm='exact')
        model2 = BayesianNetwork.from_samples(X, algorithm='exact', n_jobs=2)
        assert_equal(model.log_probability(X).sum(), model2.log_probability(X).sum())
        assert_almost_equal(model.log_probability(X).sum(), logp, 4)
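The `datasets` fixture and the nose-style asserts come from pomegranate's test suite. A self-contained sketch of the same serial-vs-parallel check on a toy binary dataset (legacy v0.x API assumed):

import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(500, 4))        # toy binary dataset
serial = BayesianNetwork.from_samples(X, algorithm='exact')
parallel = BayesianNetwork.from_samples(X, algorithm='exact', n_jobs=2)

# Exact search is deterministic, so both runs should score identically.
assert serial.log_probability(X).sum() == parallel.log_probability(X).sum()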
Example #3
def test_exact_nan_structure_learning():
    logps = -6.13764, -159.6505, -2055.76364, -201.73615
    for X, logp in zip(datasets_nan, logps):
        model = BayesianNetwork.from_samples(X, algorithm='exact')
        model2 = BayesianNetwork.from_samples(X, algorithm='exact-dp')

        assert_equal(model.log_probability(X).sum(), model2.log_probability(X).sum())
        assert_almost_equal(model.log_probability(X).sum(), logp, 4)
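`datasets_nan` is again a test-suite fixture. A minimal sketch of structure learning with missing values, assuming a float array in which numpy.nan marks missing entries (the convention pomegranate's v0.x API expects):

import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(200, 3)).astype('float64')
X[numpy.random.rand(*X.shape) < 0.1] = numpy.nan   # knock out ~10% of entries
model = BayesianNetwork.from_samples(X, algorithm='exact')
print(model.log_probability(X).sum())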
Example #4
def test_io_from_samples():
    model1 = BayesianNetwork.from_samples(X, weights=weights)
    model2 = BayesianNetwork.from_samples(data_generator)

    logp1 = model1.log_probability(X)
    logp2 = model2.log_probability(X)

    assert_array_almost_equal(logp1, logp2)
Example #6
def createModel(train, test):
    print("I am in create model")

    header = [
        'acceleration_mean', 'acceleration_stdev', 'pitch1', 'pitch2',
        'pitch3', 'roll1', 'roll2', 'roll3', 'classes', 'total_accel_sensor_1',
        'total_accel_sensor_2', 'total_accel_sensor_4'
    ]

    start_time = datetime.now()
    print("Start time: ", start_time)

    model = BayesianNetwork.from_samples(train,
                                         algorithm='greedy',
                                         state_names=header)

    print("doing model.bake")
    model.bake()

    time = datetime.now() - start_time
    print("Time: ", time)

    predict = test['classes'].tolist()
    test['classes'] = None

    print("Evaluating predict...")
    test = test.to_numpy()
    pred_values = model.predict(test)

    pred_values = [x.item(2) for x in pred_values]
    main.calculate_accuracy(predict, pred_values)
Example #7
    def _likelihoods(cls,
                     real_data,
                     synthetic_data,
                     metadata=None,
                     structure=None):
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        structure = metadata.get('structure', structure)
        fields = cls._select_fields(metadata, ('categorical', 'boolean'))

        if not fields:
            return np.full(len(real_data), np.nan)

        LOGGER.debug('Fitting the BayesianNetwork to the real data')
        if structure:
            if isinstance(structure, dict):
                structure = BayesianNetwork.from_json(
                    json.dumps(structure)).structure

            bn = BayesianNetwork.from_structure(real_data[fields].to_numpy(),
                                                structure)
        else:
            bn = BayesianNetwork.from_samples(real_data[fields].to_numpy(),
                                              algorithm='chow-liu')

        LOGGER.debug('Evaluating likelihood of the synthetic data')
        probabilities = []
        for _, row in synthetic_data[fields].iterrows():
            try:
                probabilities.append(bn.probability([row.to_numpy()]))
            except ValueError:
                probabilities.append(0)

        return np.asarray(probabilities)
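The `from_structure` branch above fits conditional probability tables to a fixed topology. A minimal sketch with a hypothetical two-column dataset; the structure is a tuple holding each column's parent indices:

import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(100, 2))
structure = ((), (0,))   # column 1 depends on column 0
bn = BayesianNetwork.from_structure(X, structure)
print(bn.probability([[0, 1]]))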
Example #8
 def __naive_algorithm(self, X):
     graph = networkx.DiGraph()
     for i in range(1, len(self.state_names)):
         graph.add_edge((0, ), (i, ))
     return BayesianNetwork.from_samples(X,
                                         algorithm=self.algorithm_name,
                                         state_names=self.state_names,
                                         root=0,
                                         constraint_graph=graph)
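For context: each node of the constraint graph is a tuple of column indices, and an edge `(0,) -> (i,)` only permits network edges from the class column into feature columns, i.e. a naive Bayes topology. A self-contained sketch with the fixture names replaced by toy data:

import networkx
import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(200, 4))   # column 0 plays the class role
graph = networkx.DiGraph()
for i in range(1, X.shape[1]):
    graph.add_edge((0,), (i,))

model = BayesianNetwork.from_samples(X, algorithm='exact', root=0,
                                     constraint_graph=graph)
print(model.structure)   # only (0,) can appear as a parent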
Example #9
    def fit(self, features, prediction, **kwargs):
        """Create a Bayesian network from the given samples"""
        data = pd.concat([features, prediction], axis='columns')

        self.model = BayesianNetwork.from_samples(X=data,
                                                  state_names=data.columns,
                                                  name="Insurance Advisor",
                                                  **kwargs)
        self.model.freeze()
        print("Training finished")
Example #10
 def fit_chow_liu(self, X_train, y_train, sequence_length_train):
     # TODO: use sequence_length_train
     self.formatted_labels = self.le.fit_transform(y_train)
     self.formatted_labels = self.formatted_labels.reshape(
         self.formatted_labels.shape[0], 1)
     X = np.concatenate((self.formatted_labels, X_train), axis=1)
     self.model = BayesianNetwork.from_samples(X,
                                               algorithm='chow-liu',
                                               state_names=self.state_names,
                                               root=0)
Example #11
def pomegranate_test():
    mydb = np.array([[1, 1, 1], [1, 1, 1], [0, 1, 1]])

    mymodel = BayesianNetwork.from_samples(mydb)

    # print(mymodel.node_count())

    # mymodel.plot()

    print(mymodel.probability([[1, 1, 1]]))
    print(mymodel.probability([[None, 1, 1]]))
    print(mymodel.predict_proba({}))
Example #12
def pomegranadeMethod():
	# filename
	features_file = './../data/features.csv'

	# reading data
	data = postmaster.readCSVIntoList(features_file)
	data = np.array(data, dtype='int32')

	# learn model
	model = BayesianNetwork.from_samples(data, algorithm='exact')
	print(model.structure)
	model.plot()
Example #13
def generateSkeleton(data):
    config = Config()
    dfrm = getDataFrames(data)
    print('LOG: Generate Skeleton')
    model = BayesianNetwork.from_samples(dfrm,
                                         algorithm='greedy',
                                         state_names=config.variables())
    model.bake()
    with open(
            'generatedSkeleton/skeletonGraph' + str(config.nOfBuckets()) +
            'buckets.txt', "w+") as f:
        f.write(model.to_json())
Example #14
def produceModelsForValidationToJSON(data,
                                     train_indices,
                                     dirname='./',
                                     filename_base='model_bn_'):
    y = data.iloc[:, 0].values
    X = data.iloc[:, 1:].values
    state_names = data.columns.values
    model_estimating_times = []
    model_fitting_time = []
    index = 0
    for train_index in train_indices:
        X_train = X[train_index, :].copy()
        y_train = y[train_index].copy()
        y_train = np.reshape(y_train, [-1, 1])
        X_train = np.hstack([y_train, X_train])
        dummy = np.ones([2, X_train.shape[1]])
        dummy[:, 0] = -1  ### all
        dummy[1, 1:] = 0
        X_train = np.vstack([X_train, dummy])
        X_train = X_train.astype(int)
        #Learning structure
        print "Learning..."
        tic = time.time()
        tic2 = time.clock()
        model = BayesianNetwork.from_samples(X_train,
                                             root=0,
                                             state_names=state_names,
                                             algorithm='chow-liu',
                                             n_jobs=8)
        toc2 = time.process_time()
        toc = time.time()
        model_estimating_times.append([toc2 - tic2, toc - tic])
        print('Model estimated in %.5f clock, %.5f time' % (toc2 - tic2,
                                                            toc - tic))
        tic = time.time()
        tic2 = time.process_time()
        model.fit(X_train, pseudocount=1, verbose=True)
        toc2 = time.process_time()
        toc = time.time()
        model_fitting_time.append([toc2 - tic2, toc - tic])
        print('Model fitted in %.5f clock, %.5f time' % (toc2 - tic2,
                                                         toc - tic))
        #model.bake()
        #print 'Model was baked'
        string = model.to_json()
        model_filename = dirname + filename_base + str(index) + '.json'
        with open(model_filename, 'w+') as f:
            f.writelines(string)
        index += 1
        np.savez_compressed(dirname + filename_base + 'times',
                            model_estimating_times=model_estimating_times,
                            model_fitting_time=model_fitting_time)
Example #15
 def fit_naive(self, X_train, y_train, sequence_length_train):
     self.formatted_labels = self.le.fit_transform(y_train)
     self.formatted_labels = self.formatted_labels.reshape(
         self.formatted_labels.shape[0], 1)
     graph = nx.DiGraph()
     for i in range(1, len(self.state_names)):
         graph.add_edge((0, ), (i, ))
     X = np.concatenate((self.formatted_labels, X_train), axis=1)
     self.model = BayesianNetwork.from_samples(X,
                                               algorithm='exact',
                                               state_names=self.state_names,
                                               root=0,
                                               constraint_graph=graph)
Example #16
 def _get_structure(self, X_plus, root=0):
     """
     Get the features dependency structure of the minority class
     """
     bayes = BayesianNetwork.from_samples(X_plus,
                                          algorithm='chow-liu',
                                          root=root)
     depend = []
     for i in bayes.structure:
         if i:
             depend.append(i[0])
         else:
             depend.append(-1)
     return depend
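`structure` here is pomegranate's parent-tuple encoding: one tuple per column listing that column's parent indices. A quick sketch of what the loop above consumes, on toy data:

import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(300, 3))
bayes = BayesianNetwork.from_samples(X, algorithm='chow-liu', root=0)
print(bayes.structure)   # e.g. ((), (0,), (0,)): a tree rooted at column 0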
Example #17
def run():

    seaborn.set_style('whitegrid')

    X = numpy.random.randint(2, size=(2000, 7))
    X[:, 3] = X[:, 1]
    X[:, 6] = X[:, 1]

    X[:, 0] = X[:, 2]

    X[:, 4] = X[:, 5]

    model = BayesianNetwork.from_samples(X, algorithm='exact')

    model.structure
    model.plot()
Example #18
def setup_random_mixed():
    numpy.random.seed(0)
    global X
    X = numpy.array([
        numpy.random.choice([True, False], size=50),
        numpy.random.choice(['A', 'B'], size=50),
        numpy.random.choice(2, size=50)
    ], dtype=object).T.copy()

    global weights
    weights = numpy.abs(numpy.random.randn(50))

    global data_generator
    data_generator = DataGenerator(X, weights)

    global model
    model = BayesianNetwork.from_samples(X)
Example #19
 def predict_from_data(self, samples_file_name: str):
     """
     This function predicts a disease from symptoms using a given dataset with
     the expected structure: symptoms in all columns except the last, and the
     disease in the last column.
     :param samples_file_name: the name of the csv_file in the csv folder
     :return: None
     """
     program_start_before_input = default_timer()
     samples = pandas.read_csv(f"../csv/{samples_file_name}.csv",
                               delimiter=",",
                               header=None)
     program_end_before_input = default_timer()
     user_symptoms = self.__get_symptoms_from_user(samples)
     program_start_after_input = default_timer()
     number_symptoms = samples.shape[1] - 1  # number columns in samples
     model_start_time = default_timer()
     model = BayesianNetwork.from_samples(
         X=samples.values,
         include_edges=[(symptom, number_symptoms)
                        for symptom in range(number_symptoms)],
         exclude_edges=(list(
             itertools.combinations(
                 [symptom for symptom in range(number_symptoms)],
                 2,
             ))),
     )
     model.bake()
     model_end_time = default_timer()
     print(
         f"Model finished construction in {model_end_time - model_start_time} seconds"
     )
     predicted_disease = model.predict([user_symptoms])[0]
     prediction_probability = model.probability([predicted_disease])
     print(
         f"The predicted disease is {predicted_disease[-1]} with probability {prediction_probability}"
     )
     program_end_after_input = default_timer()
     full_program_runtime = (
         program_end_before_input - program_start_before_input) + (
             program_end_after_input - program_start_after_input)
     print(
         f"The Bayes's Net implementation completed in {full_program_runtime} seconds"
     )
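A reduced sketch of the edge constraints used above, assuming a pomegranate version whose `from_samples` accepts `include_edges`/`exclude_edges` (as this example does): symptom-to-disease edges are forced in, symptom-to-symptom edges are ruled out.

import itertools
import numpy
from pomegranate import BayesianNetwork

X = numpy.random.randint(2, size=(100, 4))   # 3 symptom columns + 1 disease
n = X.shape[1] - 1
model = BayesianNetwork.from_samples(
    X,
    include_edges=[(s, n) for s in range(n)],
    exclude_edges=list(itertools.combinations(range(n), 2)))
model.bake()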
Example #20
def test_exact_structure_learning_slap_constraints():
    for ds in datasets:
        dims = numpy.shape(ds)[1]
        half = int(numpy.ceil(dims / 2))
        # Node groups
        g1 = tuple(range(0, half))
        g2 = tuple(range(half, dims))
        # Constraint graph:
        cg = DiGraph()
        cg.add_edge(g1, g2)
        cg.add_edge(g2, g2)
        # Learn constrained network
        model = BayesianNetwork.from_samples(ds,
                                             algorithm='exact',
                                             constraint_graph=cg)
        # Check structure constraints satisfied
        s = model.structure
        for node in g1:
            assert_equal(0, len(s[node]))
Example #21
def test_pom():
    data = get_test_data()
    data = 4 * data

    fields = list(data[0].keys())
    data_matrix = data_to_matrix(data, fields)

    network = BayesianNetwork.from_samples(data_matrix,
                                           algorithm='exact',
                                           pseudocount=0)

    example = [['1', 'Sara Smith', 'Boston', '100000', 'bat'],
               ['1', 'Sara Smith', 'Boston', '100000', 'ball'],
               ['1', 'Sara Smith', 'Boston', '100000', 'hat'],
               ['1', 'Sara Smith', 'Boston', '100000', 'glove']]

    prob = network.probability(example)
    prob /= prob.sum()
    print(prob)

    return network
Example #22
def inference(data, infs):
    config = Config()
    dfrm = getDataFrames(data)
    model = BayesianNetwork.from_samples(dfrm,
                                         algorithm='greedy',
                                         state_names=config.variables())
    model.bake()
    testsArray = np.array(
        Enumerable(infs).select(lambda x: [
            x.x1, x.y1, x.z1, x.x2, x.y2, x.z2,
            x.x3, x.y3, x.z3, x.x4, x.y4, x.z4, None
        ]).to_list())
    print('LOG: Predicting')
    prediction = model.predict(testsArray)
    if len(infs) > 1:
        print('LOG: Printing predictions in "' + config.outInference() + '"')
        with open(config.outInference(), "w+") as f:
            f.write('\n'.join(
                Enumerable(prediction).select(
                    lambda x: str(x) + ' ' + parseVal(x[12])).to_list()))
    else:
        print('Predicted value is "' + parseVal(prediction[0][12]) + '"')
Example #23
def testModel(data, tests):
    config = Config()
    dfrm = getDataFrames(data)
    model = BayesianNetwork.from_samples(dfrm,
                                         algorithm='greedy',
                                         state_names=config.variables())
    model.bake()
    testsArray = np.array(
        Enumerable(tests).select(lambda x: [
            x.x1, x.y1, x.z1, x.x2, x.y2, x.z2,
            x.x3, x.y3, x.z3, x.x4, x.y4, x.z4, None
        ]).to_list())
    tags = np.array(Enumerable(tests).select(lambda x: x.harClass).to_list())
    print('LOG: Testing')
    prediction = model.predict(testsArray)
    i = 0
    corrects = 0
    for p in prediction:
        if (p[12] == tags[i]):
            corrects += 1
        i += 1
    print('Score: ' + str(corrects * 100 / len(tests)) + '%')
Example #24
def build_bn(df_app, output_dir, options):

    df_app_tmp = df_app.copy()
    df_app_tmp.drop('ANNOTATE', axis=1, inplace=True)
    if options['CLASS'] != '':
        df_app_tmp.drop('CLASS', axis=1, inplace=True)

    X = df_app_tmp
    model = BayesianNetwork.from_samples(X, algorithm='chow-liu')

    print("\nModel Structure:\n")
    print(model.structure)
    for idx, parent in enumerate(model.structure):
        if len(parent) == 0:
            print('Singleton: {}'.format(df_app.columns[idx]))
        elif len(parent) == 1:
            print('Parent: {} - Child: {}'.format(df_app.columns[parent[0]], df_app.columns[idx]))

    file_out = ea_decode.options_filename(options) + '_' + 'bn_graph'

    plt.figure(figsize=(9, 7))
    model.plot()

    if output_dir == '':
        plt.show()
    else:
        plt.savefig(os.path.join(output_dir, file_out))
        plt.close()

    file_out = ea_decode.options_filename(options) + '_bn.mdl'
    model_file = os.path.join(output_dir, file_out)

    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

    logging.info('\n%s: Loglikelihood: %.2f\n', 'BN', model.log_probability(X).sum())
Example #25
def test_greedy_structure_learning():
    logps = -19.8282, -345.9527, -4847.59688, -611.0356
    for X, logp in zip(datasets, logps):
        model = BayesianNetwork.from_samples(X, algorithm='greedy')
        assert_almost_equal(model.log_probability(X).sum(), logp, 4)
Example #26
def test_chow_liu_structure_learning():
    logps = -19.8282, -344.248785, -4842.40158, -603.2370
    for X, logp in zip(datasets, logps):
        model = BayesianNetwork.from_samples(X, algorithm='chow-liu')
        assert_almost_equal(model.log_probability(X).sum(), logp, 4)
Example #27
            'Personal Email', 'Professional Email', 'Religion',
            'Sexual Orientation', 'Illnesses', 'Hobby/Pastime',
            'Hurt Sentiments - Movie', 'Holiday Destination', 'Music Genre',
            'Age for Adult movie', 'Favourite Movie', 'Money on cinema weekly',
            'Illegal streaming/downloading', 'Favourite Pornstar'
        ]
    if block == '3':
        column_names = [
            'Name', 'Country of Residence', 'Home Postcode', 'Employer Name',
            'Work Address', 'Phone Number', 'Relationship Status',
            'Lied to Partner', 'Languages', 'Annual Income',
            'Shared X-rated movies', 'Lied about Age', 'Musician',
            'Favourite Movie Genre', 'Favourite Soundtrack',
            'Online rental subscriptions'
        ]
    print('Generating Bayesian Network for Question Block ' + block + '.')
    model = BayesianNetwork.from_samples(subset,
                                         state_names=column_names,
                                         algorithm=bayes_algorithm)
    if block not in bayesian_net_models:
        bayesian_net_models[block] = model

    plt.title('Truthfulness \n Bayesian Network \n' + 'Block-' + block,
              fontsize=30,
              fontweight='bold')
    model.plot(with_labels=True)
    save_fig(BAYESIAN_DIR,
             bayes_algorithm + '_bayesian_net_likert_' + 'block_' + block)
    print('Saving Bayesian Network for Question Block ' + block + ' in ' +
          BAYESIAN_DIR + ' directory.')
Example #28
    def train(self, samples, weights, state_names=None):
        '''
        @samples: 2d array. Each row represents a unique point in the joint
        distribution, with each column representing a random variable.
        '''
        start = time.time()
        assert state_names is not None
        self.state_names = state_names

        weights = np.array(weights, dtype=np.int32)
        for col in range(samples.shape[1]):
            self.word2index.append({})
            col_alphabets = np.unique(samples[:, col])
            for i, alph in enumerate(col_alphabets):
                self.word2index[col][alph] = i

        mapped_samples = np.zeros(samples.shape, dtype=np.int32)

        for i in range(mapped_samples.shape[0]):
            for j in range(mapped_samples.shape[1]):
                mapped_samples[i][j] = self.word2index[j][samples[i][j]]

        if self.save_csv:
            np.set_printoptions(
                formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.savetxt("data.csv", mapped_samples, delimiter=",")
            np.savetxt("counts.csv", weights, delimiter=",")

        if self.backend == "ourpgm":
            pgm.py_init.restype = c_void_p
            print("before py_init")
            self.ourpgm_model = pgm.py_init(
                mapped_samples.ctypes.data_as(c_void_p),
                c_long(mapped_samples.shape[0]),
                c_long(mapped_samples.shape[1]),
                weights.ctypes.data_as(c_void_p), c_long(weights.shape[0]),
                self.use_svd, c_long(self.num_singular_vals), self.recompute)
            pgm.py_train(c_void_p(self.ourpgm_model))
        elif self.backend == "pomegranate":
            # TODO: cache the trained model, based on hash of mapped samples?
            # TODO: mapped samples should be extended to include all 0's.
            self.pom_model = BayesianNetwork.from_samples(
                mapped_samples,
                weights=weights,
                state_names=self.state_names,
                algorithm="chow-liu",
                n_jobs=-1)
            print("pomegranate training done!")

            if self.alg_name == "greg":
                # compute all the appropriate SVD's
                self.edge_svds = {}

                # TODO: might want to store this globally
                state_to_idx = {}
                for i, s in enumerate(self.pom_model.states):
                    state_to_idx[s.name] = i

                # Expensive computation, so save it if possible
                misc_cache = klepto.archives.dir_archive(
                    "./misc_cache/edge_svds/")
                misc_cache.load()
                for edge in self.pom_model.edges:
                    node1 = state_to_idx[edge[0].name]
                    node2 = state_to_idx[edge[1].name]
                    edge_nodes = [node1, node2]
                    edge_nodes.sort()
                    edge_key = (edge_nodes[0], edge_nodes[1])

                    # FIXME: check
                    cond_dist = edge[1].distribution
                    assert "ConditionalProbabilityTable" in str(
                        type(cond_dist))
                    marg1 = self.pom_model.marginal()[node1].values()
                    node1_vals = [
                        k for k in self.pom_model.marginal()
                        [node1].parameters[0].keys()
                    ]
                    dim1 = len(marg1)
                    marg2 = self.pom_model.marginal()[node2].values()
                    node2_vals = [
                        k for k in self.pom_model.marginal()
                        [node2].parameters[0].keys()
                    ]
                    dim2 = len(marg2)

                    svd_key = str(marg1) + str(marg2) + str(node1_vals) + str(
                        node2_vals)
                    svd_key = deterministic_hash(svd_key)
                    if svd_key in misc_cache:
                        self.edge_svds[edge_key] = misc_cache[svd_key]
                        print("found edge key {} in cache".format(edge_key))
                        print(np.max(self.edge_svds[edge_key][1]))
                        continue
                    else:
                        print("did not find edge key {} in cache".format(
                            edge_keedge_key))

                    joint_mat = np.zeros((dim1, dim2))
                    for i in range(dim1):
                        for j in range(dim2):
                            ind_term = marg1[i] * marg2[j]
                            # FIXME: assuming that these are state values
                            assert node1_vals[i] == i
                            assert node2_vals[j] == j
                            # FIXME: assuming marg1 is always the parent in the
                            # conditional dist
                            sample = [node1_vals[i], node2_vals[j]]
                            joint_term = cond_dist.probability(
                                sample) * marg1[i]

                            joint_mat[i, j] = (joint_term -
                                               ind_term) / math.sqrt(ind_term)

                    # TODO: replace this by scipy.sparse svd's so can only
                    # compute for top-k values

                    uh, sv, vh = np.linalg.svd(joint_mat, full_matrices=False)
                    # print(np.max(sv))
                    assert np.max(sv) < 1.1
                    # pdb.set_trace()

                    # TODO: check if this computation is what we need
                    # compute the f and g vectors
                    for xi in range(dim1):
                        uh[xi, :] /= math.sqrt(marg1[xi])

                    for xj in range(dim2):
                        vh[:, xj] /= math.sqrt(marg2[xj])

                    assert edge_key not in self.edge_svds
                    self.edge_svds[edge_key] = (uh, sv, vh)

                    misc_cache[svd_key] = self.edge_svds[edge_key]
                misc_cache.dump()
                misc_cache.clear()
            elif self.alg_name == "chow-liu":
                # should not need to do anything here.
                print("trained chow-liu using pomegranate")
            else:
                assert False

        print("pgm model took {} seconds to train".format(time.time() - start))
def run_research(df):
    # Prepare data
    #df = get_factorized_dataset(path="../data").drop(['veil-type', 'stalk-root'],axis=1)
    target_column = "class"

    #Prepare models
    nb = CategoricalNB(num_epochs=20)

    df1 = df
    pgm1 = PGM(df1, num_epochs=20)
    m1 = BayesianNetwork.from_samples(df1, algorithm='chow-liu')
    pgm1.import_pomegranate_model(m1, df1.columns)

    df2 = df.drop(['odor'], axis=1)
    pgm2 = PGM(df2, num_epochs=20)
    m2 = BayesianNetwork.from_samples(df2, algorithm='chow-liu')
    pgm2.import_pomegranate_model(m2, df2.columns)

    df3 = df.get([
        'odor', 'class', 'spore-print-color', 'gill-color', 'cap-color',
        'cap-shape', 'cap-surface', 'gill-size', 'gill-spacing',
        'gill-attachment', 'stalk-color-above-ring',
        'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-shape'
    ])

    DAG = nx.DiGraph()

    edges = [('odor', 'class'), ('spore-print-color', 'class'),
             ('gill-color', 'class'), ('cap-color', 'class'),
             ('cap-shape', 'cap-color'), ('cap-surface', 'cap-color'),
             ('gill-size', 'gill-color'), ('gill-spacing', 'gill-color'),
             ('gill-attachment', 'gill-color'),
             ('stalk-color-above-ring', 'class'),
             ('stalk-surface-below-ring', 'stalk-color-above-ring'),
             ('stalk-surface-above-ring', 'stalk-color-above-ring'),
             ('stalk-shape', 'stalk-color-above-ring')]
    DAG.add_edges_from(edges)

    pgm3 = PGM(df3, graph=DAG, num_epochs=20)

    models = [
        (df, nb, "Naive Bayes"),
        (df1, pgm1, "Bayesian Net #1"),
        (df2, pgm2, "Bayesian Net #2"),
        (df3, pgm3, "Bayesian Net #3"),
    ]

    #Results structure
    all_results = {}

    #Prepare file
    file = open("research_results.txt", "a")

    # Run experiments
    for (data, model, model_name) in models[0:1]:

        print(f"##### {model_name} #####\n")
        file.write(f"##### {model_name} #####\n")

        result = test_split(model=model,
                            n_splits=5,
                            df=data,
                            class_column='class')
        print(result)

        file.write(str(result) + "\n")

        all_results[model_name] = result

    file.write("##### Partial results after test_split #####\n")
    file.write(str(all_results) + "\n")
    print("##### Partial results after test_split #####\n")
    print(str(all_results) + "\n")

    #Run experiments
    for (data, model, model_name) in models[1:]:

        print(f"##### {model_name} #####\n")
        file.write(f"##### {model_name} #####\n")

        result = test_cross(model=model,
                            n_splits=5,
                            df=data,
                            class_column='class')
        print(result)

        file.write(str(result) + "\n")

        all_results[model_name] = result

    file.write("##### Combined results #####\n")
    file.write(str(all_results) + "\n")
    print("##### Combined results #####\n")
    print(str(all_results) + "\n")

    generate_plots(all_results)
    generate_plots_std(all_results)

    #Close file
    file.close()

    print("Done.")
Example #30
def test_greedy_nan_structure_learning():
    logps = -7.5239, -159.6505, -2058.5706, -203.7662
    for X, logp in zip(datasets_nan, logps):
        model = BayesianNetwork.from_samples(X, algorithm='greedy')
        assert_almost_equal(model.log_probability(X).sum(), logp, 4)
Example #31
from pomegranate import BayesianNetwork

import seaborn, time
import numpy

seaborn.set_style('whitegrid')

X = numpy.random.randint(2, size=(2000, 7))

X[:, 3] = X[:, 1]
X[:, 6] = X[:, 1]
X[:, 0] = X[:, 2]
X[:, 4] = X[:, 5]

model = BayesianNetwork.from_samples(X, algorithm='exact')
print(model.structure)
model.plot()
Example #32
    def __init__(
            self,
            #  dataset,
            table,
            num_samples,
            algorithm="greedy",
            max_parents=-1,
            topological_sampling_order=True,
            use_pgm=True,
            discretize=None,
            discretize_method="equal_size",
            root=None):

        from pomegranate import BayesianNetwork
        self.discretize = discretize
        self.discretize_method = discretize_method
        self.table = copy.deepcopy(table)
        self.dataset = np.stack([
            col.discretize(self.table.data[cname])
            for cname, col in self.table.columns.items()
        ],
                                axis=1)
        self.algorithm = algorithm
        self.topological_sampling_order = topological_sampling_order
        self.num_samples = num_samples
        self.discrete_mapping = self.build_discrete_mapping(
            self.dataset, discretize, discretize_method)
        self.discrete_table = self.apply_discrete_mapping(
            self.dataset, self.discrete_mapping)
        L.info('calling BayesianNetwork.from_samples...')
        t = time.time()
        self.model = BayesianNetwork.from_samples(self.discrete_table,
                                                  algorithm=self.algorithm,
                                                  max_parents=max_parents,
                                                  n_jobs=NUM_THREADS,
                                                  root=root)
        L.info(f'done! took {(time.time() - t)/60:.2f} mins')

        def size(states):
            n = 0
            for state in states:
                if "distribution" in state:
                    dist = state["distribution"]
                else:
                    dist = state
                if dist["name"] == "DiscreteDistribution":
                    for p in dist["parameters"]:
                        n += len(p)
                elif dist["name"] == "ConditionalProbabilityTable":
                    for t in dist["table"]:
                        n += len(t)
                    if "parents" in dist:
                        for parent in dist["parents"]:
                            n += size(dist["parents"])
                else:
                    assert False, dist["name"]
            return n

        self.size = 4 * size(json.loads(self.model.to_json())["states"])
        L.info(f'model size is {self.size/1024/1024:.2f}MB')

        # print('json:\n', self.model.to_json())
        self.json_size = len(self.model.to_json())
        self.use_pgm = use_pgm
        #        print(self.model.to_json())

        if topological_sampling_order:
            self.sampling_order = []
            while len(self.sampling_order) < len(self.model.structure):
                for i, deps in enumerate(self.model.structure):
                    if i in self.sampling_order:
                        continue  # already ordered
                    if all(d in self.sampling_order for d in deps):
                        self.sampling_order.append(i)
                L.debug(f"Building sampling order {self.sampling_order}")
        else:
            self.sampling_order = list(range(len(self.model.structure)))
        L.info(f"Using sampling order {self.sampling_order} {str(self)}")

        if use_pgm:
            from pgmpy.models import BayesianModel
            data = pd.DataFrame(self.discrete_table.astype(np.int64))
            spec = []
            orphans = []
            for i, parents in enumerate(self.model.structure):
                for p in parents:
                    spec.append((p, i))
                if not parents:
                    orphans.append(i)
            L.info(f"Model spec {spec}")
            model = BayesianModel(spec)
            for o in orphans:
                model.add_node(o)
            L.info('calling pgm.BayesianModel.fit...')
            t = time.time()
            model.fit(data)
            L.info(f'done! took {(time.time() - t)/60:.2f} mins')
            self.pgm_model = model
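The sampling-order loop above is plain topological sorting over the parent tuples. An equivalent standalone sketch:

def topological_order(structure):
    order = []
    while len(order) < len(structure):
        for i, parents in enumerate(structure):
            # append any node whose parents are all already ordered
            if i not in order and all(p in order for p in parents):
                order.append(i)
    return order

print(topological_order(((), (0,), (0, 1))))   # [0, 1, 2]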
Example #34
 def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
     self.discretizer = DiscretizeTransformer(n_bins=15)
     self.discretizer.fit(data, categorical_columns, ordinal_columns)
     discretized_data = self.discretizer.transform(data)
     self.model = BayesianNetwork.from_samples(discretized_data,
                                               algorithm='chow-liu')
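`DiscretizeTransformer` is project-specific; a hedged sketch of the same discretize-then-learn pattern using plain numpy quantile binning (15 bins, matching `n_bins=15`):

import numpy
from pomegranate import BayesianNetwork

data = numpy.random.randn(500, 3)
edges = numpy.quantile(data, numpy.linspace(0, 1, 16)[1:-1], axis=0)
discretized = numpy.stack(
    [numpy.digitize(data[:, j], edges[:, j]) for j in range(data.shape[1])],
    axis=1)
model = BayesianNetwork.from_samples(discretized, algorithm='chow-liu')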