def output_json(file="forest", size=100, prune=True, seed=50, burn=1):
    """Fit a TrestleTree on one of the bundled datasets and dump it as JS.

    The fitted tree's JSON is pretty-printed to stdout and written to
    ``output.js`` (relative to the current working directory) as a
    ``var trestle_output = ...;`` assignment.

    :param file: dataset key; any value that matches none of the known keys
        falls back to the forest fires dataset.
    :param size: number of shuffled instances kept for fitting.
    :param prune: accepted but not used anywhere in this function.
    :param seed: value passed to ``random.seed`` before shuffling.
    :param burn: forwarded to ``TrestleTree.fit`` as ``iterations``.
    """
    random.seed(seed)

    # (dataset key, loader attribute on ``ds``, run ObjectVariablizer first?)
    known_datasets = (
        ("forest", "load_forest_fires", False),
        ("voting", "load_congressional_voting", False),
        ("iris", "load_iris", False),
        ("mushroom", "load_mushroom", False),
        ("rb_com_11", "load_rb_com_11", True),
        ("rb_s_07", "load_rb_s_07", True),
        ("rb_s_13", "load_rb_s_13", True),
        ("rb_wb_03", "load_rb_wb_03", True),
    )
    for key, loader_name, variables in known_datasets:
        if file == key:
            break
    else:
        # Unknown key: same fallback as the explicit "forest" entry.
        loader_name, variables = "load_forest_fires", False

    # The loader attribute is resolved only for the selected dataset.
    instances = getattr(ds, loader_name)()

    random.shuffle(instances)
    pprint.pprint(instances[0])
    instances = instances[:size]
    print(len(instances))

    if variables:
        variablizer = ObjectVariablizer()
        instances = [variablizer.transform(item) for item in instances]

    tree = TrestleTree()
    tree.fit(instances, iterations=burn)

    pprint.pprint(tree.root.output_json())

    with open('output.js', 'w') as out:
        out.write("var trestle_output = ")
        out.write(json.dumps(tree.root.output_json()))
        out.write(";")
def output_json(file="forest", size=100, prune=True, seed=50, burn=1):
    """Fit a TrestleTree on a bundled dataset and write it to ``output.js``.

    The first shuffled instance and the number of instances kept are printed
    to stdout; the fitted tree's JSON is written (not printed) as a
    ``var trestle_output = ...;`` assignment.

    :param file: dataset key; unrecognised values fall back to forest fires.
    :param size: number of shuffled instances kept for fitting.
    :param prune: accepted but not used anywhere in this function.
    :param seed: value passed to ``random.seed`` before shuffling.
    :param burn: forwarded to ``TrestleTree.fit`` as ``iterations``.
    """
    random.seed(seed)

    # (dataset key, loader attribute on ``ds``, run ObjectVariablizer first?)
    catalogue = (
        ("forest", "load_forest_fires", False),
        ("voting", "load_congressional_voting", False),
        ("iris", "load_iris", False),
        ("mushroom", "load_mushroom", False),
        ("rb_com_11", "load_rb_com_11", True),
        ("rb_s_07", "load_rb_s_07", True),
        ("rb_s_13", "load_rb_s_13", True),
        ("rb_wb_03", "load_rb_wb_03", True),
    )
    loader_name, variables = next(
        ((name, flag) for key, name, flag in catalogue if file == key),
        ("load_forest_fires", False),  # fallback for unknown keys
    )
    # The loader attribute is resolved only for the selected dataset.
    instances = getattr(ds, loader_name)()

    random.shuffle(instances)
    pprint.pprint(instances[0])
    instances = instances[:size]
    print(len(instances))

    if variables:
        variablizer = ObjectVariablizer()
        instances = [variablizer.transform(item) for item in instances]

    tree = TrestleTree()
    tree.fit(instances, iterations=burn)

    # The fitted tree is intentionally not pretty-printed in this variant.
    with open('output.js', 'w') as out:
        out.write("var trestle_output = ")
        out.write(json.dumps(tree.root.output_json()))
        out.write(";")
def calculate_aris(dataset):
    """Score each heuristic's clustering against the human cluster labels.

    ``shuffle`` is called on ``dataset`` itself before the first 60 items are
    taken, so the caller's sequence is presumably reordered in place. The
    sample is variablized, fitted into a fresh TrestleTree and clustered once
    per entry of the file-level ``hueristics`` list.

    :param dataset: instances carrying a ``'_human_cluster_label'`` key.
    :return: one adjusted Rand score per heuristic, floored at 0.01.
    """
    shuffle(dataset)
    sample = dataset[:60]

    variablizer = ObjectVariablizer()
    instances = []
    for raw in sample:
        instances.append(variablizer.transform(raw))

    tree = TrestleTree()
    tree.fit(instances)

    labelings = []
    for heuristic in hueristics:
        labelings.append(
            cluster_split_search(tree, instances, heuristic,
                                 minsplit=1, maxsplit=40, mod=False))

    human_labels = []
    for instance in instances:
        human_labels.append(instance['_human_cluster_label'])

    scores = []
    for labeling in labelings:
        scores.append(max(adjusted_rand_score(human_labels, labeling), 0.01))
    return scores
class ScikitTrestle(object):
    """Wrap a TrestleTree behind ``fit`` / ``ifit`` / ``predict`` methods.

    Labels are stored on each instance under ``'_y_label'`` as the ``"%i"``
    formatting of the label and converted back with ``int`` when predicting.
    """

    def __init__(self, params=None):
        """Build the wrapped tree, forwarding ``params`` as keyword arguments."""
        tree_kwargs = {} if params is None else params
        self.tree = TrestleTree(**tree_kwargs)

    def ifit(self, x, y):
        """Incrementally fit one instance ``x`` with label ``y``.

        ``x`` is deep-copied first, so the caller's instance is not modified.
        """
        labelled = deepcopy(x)
        labelled['_y_label'] = "%i" % y
        self.tree.ifit(labelled)

    def fit(self, X, y):
        """Fit all instances in ``X``; ``y[i]`` is the label of ``X[i]``.

        ``X`` is deep-copied first, so the caller's instances are not modified.
        """
        labelled = deepcopy(X)
        for position, instance in enumerate(labelled):
            instance['_y_label'] = "%i" % y[position]
        self.tree.fit(labelled, randomize_first=False)

    def predict(self, X):
        """Return the predicted integer label for every instance in ``X``."""
        predictions = []
        for instance in X:
            concept = self.tree.categorize(instance)
            predictions.append(int(concept.predict('_y_label')))
        return predictions
class ScikitTrestle(object):
    """Wrap a TrestleTree behind ``fit`` / ``ifit`` / ``predict`` methods.

    Labels are stored on each instance under ``'_y_label'`` as floats, and
    ``predict`` returns whatever the tree predicts for that attribute.
    """

    def __init__(self, **kwargs):
        """Build the wrapped tree, forwarding ``kwargs`` to ``TrestleTree``."""
        self.tree = TrestleTree(**kwargs)
        self.state_format = "variablized_state"

    def ifit(self, x, y):
        """Incrementally fit one instance ``x`` with scalar label ``y``.

        ``x`` is deep-copied first, so the caller's instance is not modified.
        """
        x = deepcopy(x)
        x['_y_label'] = float(y)
        self.tree.ifit(x)

    def fit(self, X, y):
        """Fit all instances in ``X``; ``y[i]`` is the label of ``X[i]``.

        ``X`` is deep-copied first, so the caller's instances are not modified.
        """
        X = deepcopy(X)
        for i, x in enumerate(X):
            # Label each instance with its own entry of y; converting the
            # whole label sequence with float() fails for multi-element y.
            x['_y_label'] = float(y[i])
        self.tree.fit(X, randomize_first=False)

    def skill_info(self, X):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Not implemented Erik H. says there is a way "
            "to serialize this -> TODO")

    def predict(self, X):
        """Return the tree's ``'_y_label'`` prediction for each instance."""
        return [self.tree.categorize(x).predict('_y_label') for x in X]
def calculate_aris(dataset):
    """Return the adjusted Rand score of each heuristic's clustering.

    ``shuffle`` is applied to ``dataset`` itself (so the caller's sequence is
    presumably reordered in place), then the first 60 items are variablized,
    fitted into a fresh TrestleTree and clustered once for every entry of the
    file-level ``hueristics`` list.

    :param dataset: instances carrying a ``'_human_cluster_label'`` key.
    :return: list of scores, one per heuristic, each floored at 0.01.
    """
    shuffle(dataset)

    variablizer = ObjectVariablizer()
    instances = list(map(variablizer.transform, dataset[:60]))

    tree = TrestleTree()
    tree.fit(instances)

    clusterings = [
        cluster_split_search(tree, instances, heuristic,
                             minsplit=1, maxsplit=40, mod=False)
        for heuristic in hueristics
    ]
    human_labels = [
        instance['_human_cluster_label'] for instance in instances
    ]

    def floored_score(clustering):
        # Scores below 0.01 (including negatives) are reported as 0.01.
        return max(adjusted_rand_score(human_labels, clustering), 0.01)

    return [floored_score(clustering) for clustering in clusterings]
from concept_formation.visualize import visualize # These lines load up and use one of the example datasets included in the # library if you don't have a readily available dataset to test. The rb_s_07 # dataset is similar to but not exactly the same as the one used to generate # the figures in the paper. from concept_formation.datasets import load_rb_s_07 from concept_formation.preprocessor import ObjectVariablizer data = load_rb_s_07() # As long as your data conforms to the instance representation: # https://concept-formation.readthedocs.io/en/latest/instance_representation.html # it can be basically anything. # data = [] # This step is to make sure the component attributes of the instances are # properly tagged as variable. See the instance representation link above for # this. # ov = ObjectVariablizer() # data = ov.batch_transform(data) # These three lines are the core of the process. They will fit the data and # generate a visualization that will automatically open a browser to the view. # If you want to embed the output in some other process, like a LearnSphere # workflow, it would take a little more work but is easy in principle. tree = TrestleTree() tree.fit(data) visualize(tree, "vizfiles")
from concept_formation.cluster import cluster_split_search from concept_formation.cluster import AIC, BIC, AICc, CU from concept_formation.datasets import load_rb_wb_03 from concept_formation.preprocessor import ObjectVariablizer seed(5) towers = load_rb_wb_03() shuffle(towers) towers = towers[:60] variablizer = ObjectVariablizer() towers = [variablizer.transform(t) for t in towers] tree = TrestleTree() tree.fit(towers) hueristics = [AIC, BIC, CU, AICc] clusters = [ cluster_split_search(tree, towers, h, minsplit=1, maxsplit=40, mod=False) for h in hueristics ] human_labels = [tower['_human_cluster_label'] for tower in towers] x = np.arange(len(hueristics)) y = [max(adjusted_rand_score(human_labels, huer), 0.01) for huer in clusters] width = 0.45 hueristic_names = ['AIC', 'BIC', 'CU', 'AICc'] for i in range(len(clusters)):
# Example script: compare a depth-3 decision tree regressor with TRESTLE on a
# noisy sine curve and plot both predictions.
seed(0)

# 80 sorted sample points drawn from [0, 5), with noise on every 5th target.
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))

# Fit regression models (Decision Tree and TRESTLE).
# For TRESTLE the y attribute is hidden, so only the X is used to make
# predictions.
dtree = DecisionTreeRegressor(max_depth=3)
dtree.fit(X, y)

ttree = TrestleTree()
# Each row of X is a length-1 array, so take element 0 to get a plain float.
training_data = [
    {'x': float(x_row[0]), '_y': float(y_val)}
    for x_row, y_val in zip(X, y)
]
ttree.fit(training_data, iterations=1)

# Predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_dtree = dtree.predict(X_test)
# Index the single column explicitly: calling float() on a 1-element array
# is deprecated in recent NumPy releases.
y_trestle = [
    ttree.categorize({'x': float(v[0])}).predict('_y') for v in X_test
]

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="Data")
plt.plot(X_test, y_trestle, c="g", label="TRESTLE", linewidth=2)
plt.plot(X_test, y_dtree, c="r", label="Decision Tree (Depth=3)", linewidth=2)
plt.xlabel("Data")
plt.ylabel("Target")
plt.title("TRESTLE/Decision Tree Regression")
plt.legend(loc=3)