def learn(data):
    """Learn an SPN structure from ``data`` and return it.

    Rows are split with ``Splitting.Gower()`` and columns with
    ``Splitting.RDCTest(threshold=0.3)``.  ``featureTypes`` and ``domains``
    are not parameters; they are resolved outside this function.
    """
    row_splitter = Splitting.Gower()
    col_splitter = Splitting.RDCTest(threshold=0.3)
    return SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        domains=domains,
        alpha=1,
        min_instances_slice=50,
    )
def learn(data):
    """Learn an SPN structure from ``data`` with histogram leaves.

    One ``'histogram'`` family is requested per column of ``data``
    (``data.shape[1]`` entries).  ``featureTypes`` and ``domains`` are not
    parameters; they are resolved outside this function.
    """
    settings = dict(
        featureTypes=featureTypes,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        alpha=0.1,
        families=['histogram'] * data.shape[1],
        # The original also carried a commented-out alternative,
        # int(data.shape[0] * 0.01), for this value.
        min_instances_slice=200,
    )
    return SPN.LearnStructure(data, **settings)
def learn():
    """Learn an SPN structure over ``data``, treating every column as discrete.

    ``data`` is not a parameter; it is resolved outside this function.  The
    feature-type list holds one ``"discrete"`` entry per column
    (``data.shape[1]``).
    """
    column_count = data.shape[1]
    learned = SPN.LearnStructure(
        data,
        featureTypes=["discrete"] * column_count,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3, linear=True),
        min_instances_slice=200,
    )
    return learned
def learn():
    """Learn an SPN structure over ``traintopics`` and return it.

    ``traintopics``, ``featureTypes``, ``featureNames`` and ``domains`` are
    not parameters; they are resolved outside this function.
    """
    row_splitter = Splitting.Gower()
    col_splitter = Splitting.RDCTest(threshold=0.1, linear=True)
    return SPN.LearnStructure(
        traintopics,
        featureTypes=featureTypes,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        featureNames=featureNames,
        domains=domains,
        min_instances_slice=100,
    )
def estimate_density(self, training_data, validation_data=None):
    """Fit an MSPN on ``training_data`` and store it in ``self.spn``.

    ``validation_data`` is accepted but never used.

    Keys of ``self.learner_args`` read here:

    * ``'seed'`` -- read unconditionally and passed as ``rand_seed``.
    * ``'leaf'`` -- optional; used while collecting per-feature families.
    * ``'row_split'`` -- optional; ``'gower'`` or ``'rdc-kmeans'``.

    Every other entry is forwarded to ``SPN.LearnStructure`` as a keyword
    argument.

    Raises:
        NotImplementedError: if ``'row_split'`` is present with any value
            other than ``'gower'`` or ``'rdc-kmeans'``.
    """
    type_labels = []
    symbol_names = []
    leaf_families = []
    for feature, type_label in training_data.features:
        type_labels.append(type_label)
        symbol_names.append(feature.symbol_name())
        if 'leaf' in self.learner_args:
            leaf_families.append(self.learner_args['leaf'])
        else:
            leaf_families.append(
                MSPNLearner.SPN_feat_fams[feature.symbol_type()])

    # Default to RDC k-means row clustering unless a method was requested.
    if 'row_split' not in self.learner_args:
        row_splitter = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
    elif self.learner_args['row_split'] == 'gower':
        row_splitter = Splitting.Gower(n_clusters=2)
    elif self.learner_args['row_split'] == 'rdc-kmeans':
        row_splitter = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
    else:
        raise NotImplementedError()

    col_splitter = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

    seed = self.learner_args['seed']

    # Options consumed above must not be forwarded a second time.
    consumed = ('seed', 'leaf', 'row_split')
    passthrough = {}
    for key, value in self.learner_args.items():
        if key not in consumed:
            passthrough[key] = value

    # The families collected in the loop are deliberately discarded:
    # let MSPNs sort this out
    leaf_families = None

    self.spn = SPN.LearnStructure(
        asarray(training_data.data),
        type_labels,
        families=leaf_families,
        featureNames=symbol_names,
        rand_seed=seed,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        **passthrough
    )
# Debug output describing the data set that is about to be fitted.
# NOTE(review): the collapsed source is ambiguous about whether
# print(domains) was live or commented out; it is kept live here -- confirm.
print(domains)
print(feature_names)
print(feature_types)
print(train.shape)

# Learn the structure: Gower row clustering, RDC column independence test.
spn = SPN.LearnStructure(
    train,
    featureNames=feature_names,
    domains=domains,
    featureTypes=feature_types,
    row_split_method=Splitting.Gower(),
    col_split_method=Splitting.RDCTest(threshold=0.05),
    min_instances_slice=20,
    cluster_first=True,
)
print(spn)

# The three partitions are always reported in this order.
eval_splits = (("train", train), ("valid", valid), ("test", test))

# One result row per data set: name followed by the mean of the root
# evaluation on each partition.
result.append(
    [dsname]
    + [numpy.mean(spn.root.eval(rows)) for _, rows in eval_splits]
)

# Mean of the root evaluation per partition.
for split_name, split_rows in eval_splits:
    print(split_name, numpy.mean(spn.root.eval(split_rows)))

# Minimum of the root evaluation per partition.
for split_name, split_rows in eval_splits:
    print(split_name, numpy.min(spn.root.eval(split_rows)))

print(result)