def train(self):
    """
    Train the decision-tree model and persist it.

    Builds a decision tree from the training entries (depth-limited by
    MAX_DEPTH) and pickles the whole model object to ``self.out_file``.

    :return: None
    """
    entries = self.train_file
    # assumes at least one training entry; feature names come from the first
    features = set(entries[0].features.keys())
    self.tree = tree.make_tree(entries, features, [], MAX_DEPTH)
    # with-statement guarantees the handle is closed even if pickling fails
    # (the original opened/closed manually and leaked on error)
    with open(self.out_file, "wb") as out:
        pickle.dump(self, out)
def search(query): print query[:-1] Tree = tree.make_tree(query[:-1]) t_goto = 0 t_ev = 0 res = [] last_id = -1 Tree.goto(last_id) last_id = Tree.evaluate() while last_id != -1: res.append(last_id) last_id += 1 Tree.goto(last_id) last_id = Tree.evaluate() print len(res) with open('urls.txt', 'r') as f: lines = f.readlines() for r in res: print lines[r][:-1]
def train(self, ensemble_size=SIZE):
    """
    Train the model using the AdaBoost algorithm.

    Builds ``ensemble_size`` decision stumps (depth-1 trees). After each
    stump, correctly classified entries are down-weighted and the weights
    renormalized, so the next stump concentrates on the hard entries.
    The trained model is pickled to ``self.out_file``.

    :param ensemble_size: the number of stumps in the ensemble
    :return: None
    """
    entries = self.train_file
    features = set(entries[0].features.keys())
    weights = Weights(entries)
    self.ensemble = []
    # create and store each stump
    for i in range(ensemble_size):
        stump = tree.make_tree(entries, features, [], 1)
        # weighted training error of this stump
        error = 0
        for entry in entries:
            decision = stump.decide_classification(entry)
            if decision != entry.target:
                error += entry.weight
        # down-weight the entries this stump already classifies correctly
        # by the factor error / (total - error), then renormalize
        for j in range(len(entries)):
            entry = entries[j]
            decision = stump.decide_classification(entry)
            if decision == entry.target:
                new_weight = entry.weight * error / (weights.total - error)
                weights.update_weight(j, new_weight)
        weights.normalization()
        # AdaBoost stump weight: log((1 - err) / err).  The original code
        # read math.log(weights.total - error) / error — the missing
        # parentheses applied log() before the division.
        stump.weight = math.log((weights.total - error) / error)
        self.ensemble.append(stump)
    # store the model to a binary file; with-statement closes it on error too
    with open(self.out_file, "wb") as out:
        pickle.dump(self, out)
def index():
    """Render the landing page, passing the generated tree data to it."""
    tree_data = make_tree(level=1000)
    return render_template('index.html', treeData=tree_data)
from monitor import FolderMonitor
from tree import make_tree

# Folder to watch; hard-coded for this local development script.
path = 'D:/temp/dev'

# Run the monitor over the target folder.
# (A commented-out interactive start/stop loop was removed as dead code.)
folder_monitor = FolderMonitor(path)
folder_monitor.run()

make_tree(path)
# Load the three datasets used for building and evaluating the tree.
training_data = pandas.read_csv(training_data_file_path)
validation_data = pandas.read_csv(vaildation_data_file_path)
testing_data = pandas.read_csv(testing_data_file_path)

# Convention: the last column is the class label, the rest are attributes.
column_list = training_data.columns.values
attr = column_list[:-1]
classname = column_list[-1]

instance_classes = utilities.getInstanceClasses(training_data, classname)
instances = utilities.getInstances(training_data)

validation_instances = utilities.getInstances(validation_data)
validation_column_list = validation_data.columns.values
# BUG FIX: derive the validation attributes from the validation columns
# (the original reused the training column_list and never used this list).
validation_attr = validation_column_list[:-1]

testing_instances = utilities.getInstances(testing_data)
testing_column_list = testing_data.columns.values
# BUG FIX: likewise, use the testing columns here.
testing_attr = testing_column_list[:-1]

node_label = 1
parent = tree.make_tree(instances, instance_classes, attr, training_data,
                        node_label)

print('Decision Tree : ')
tree.printTree(parent, 0)

# Pre Prune Accuracy
print('-------------------')
print('Pre-Pruned Accuracy')
print('-------------------')
print('Number of Training instances = ', len(instances))
print('Number of Training attributes = ', len(attr))
print('Total number of nodes in the tree = ', tree.countNodes(parent))
print('Number of leaf nodes in the tree = ', tree.countPureNodes(parent))
print('Accuracy of the model on the training dataset : ',
      round(utilities.getAccuracy(parent, training_data) * 100, 2), '%')
print('')
print('Number of Validation instances = ', len(validation_instances))
print('Number of Validation attributes = ', len(validation_attr))
def __init__(self, nleaves, nbreakpoints, G, mappings=None):
    """Precompute flattened component and path tables for the SCC graph *G*.

    :param nleaves: number of leaves (stored on the instance)
    :param nbreakpoints: per-epoch breakpoint counts
    :param G: SCC graph providing getEpochSizes / all_paths / all_states
    :param mappings: optional list of mappings to carry along (default: [])
    """
    # BUG FIX: the original default was ``mappings=[]`` — a shared mutable
    # default stored on every instance created without the argument.
    if mappings is None:
        mappings = []
    self.nleaves = nleaves
    self.nbreakpoints = nbreakpoints
    self.G = G
    epoch_sizes = self.G.getEpochSizes()
    # all_sizes repeats each epoch's size once per breakpoint in that epoch
    self.all_sizes = []
    for e in xrange(len(nbreakpoints)):
        self.all_sizes.extend([epoch_sizes[e]] * nbreakpoints[e])
    # Build a list of all paths through the SCC graph
    paths = []
    for S in G.all_paths():
        paths.append(S[:])
    epoch_sizes = G.getEpochSizes()
    # Map (epoch, component) -> index into `components`; slot 0 holds the
    # single-state component (0,) that every encoded path starts from.
    component_index = dict()
    components = [(0,)]
    for e, esize in enumerate(epoch_sizes):
        for c in xrange(len(G.G[e].V)):
            component = array(G.all_states(e, c))
            component_index[(e, c)] = len(components)
            components.append(component)
    # Flatten all component states into one int32 array, recording each
    # component's [start, end) span inside it.
    self.components_flat = zeros(sum(self.all_sizes) + 1, dtype=int32)
    component_starts = []
    component_ends = []
    offset = 0
    for c in components:
        a = offset
        b = offset + len(c)
        offset = b
        # span must be untouched so far and stay within the flat array
        assert self.components_flat[a:b].sum() == 0
        assert 0 < b <= len(self.components_flat)
        self.components_flat[a:b] = array(c, dtype=int32)
        component_starts.append(a)
        component_ends.append(b)
    # spans must be contiguous
    assert all(component_ends[i] == component_starts[i + 1]
               for i in range(len(component_ends) - 1))
    # Build all distributions of the paths over our intervals
    paths_final = []
    tree_map = {}
    paths_indices = []
    npaths = 0
    for s in enumerate_all_transitions(paths, nbreakpoints):
        # FIXME: instead of removing the first component in the path,
        # we shouldn't have it there to begin with...
        s = s[1:]
        cpath = (0,) + tuple(component_index[(e, p)] for e, p in s)
        # Encode the path as interleaved (start, end) offsets into
        # components_flat.
        path_as_offsets = []
        for ci in cpath:
            path_as_offsets.append(component_starts[ci])
            path_as_offsets.append(component_ends[ci])
        path_as_offsets = array(path_as_offsets, dtype=int32)
        paths_final.extend(path_as_offsets)
        npaths += 1
        # Deduplicate the two trees per transition: tree_map assigns each
        # distinct tree a dense integer id via setdefault.
        ta = make_tree(G, s, 0)
        tb = make_tree(G, s, 1)
        a = tree_map.setdefault(ta, len(tree_map))
        b = tree_map.setdefault(tb, len(tree_map))
        paths_indices.append(a)
        paths_indices.append(b)
    self.tree_map = tree_map
    self.ntrees = len(tree_map)
    self.paths_final_indices = array(paths_indices, dtype=int32)
    self.paths_final = array(paths_final, dtype=int32)
    self.npaths = npaths
    self.mappings = mappings
def __init__(self, nleaves, nbreakpoints, G, mappings=None):
    """Precompute flattened component and path tables for the SCC graph *G*.

    :param nleaves: number of leaves (stored on the instance)
    :param nbreakpoints: per-epoch breakpoint counts
    :param G: SCC graph providing getEpochSizes / all_paths / all_states
    :param mappings: optional list of mappings to carry along (default: [])
    """
    # BUG FIX: the original default was ``mappings=[]`` — a shared mutable
    # default stored on every instance created without the argument.
    if mappings is None:
        mappings = []
    self.nleaves = nleaves
    self.nbreakpoints = nbreakpoints
    self.G = G
    epoch_sizes = self.G.getEpochSizes()
    # all_sizes repeats each epoch's size once per breakpoint in that epoch
    self.all_sizes = []
    for e in xrange(len(nbreakpoints)):
        self.all_sizes.extend([epoch_sizes[e]] * nbreakpoints[e])
    # Build a list of all paths through the SCC graph
    paths = []
    for S in G.all_paths():
        paths.append(S[:])
    epoch_sizes = G.getEpochSizes()
    # Map (epoch, component) -> index into `components`; slot 0 holds the
    # single-state component (0,) that every encoded path starts from.
    component_index = dict()
    components = [(0, )]
    for e, esize in enumerate(epoch_sizes):
        for c in xrange(len(G.G[e].V)):
            component = array(G.all_states(e, c))
            component_index[(e, c)] = len(components)
            components.append(component)
    # Flatten all component states into one int32 array, recording each
    # component's [start, end) span inside it.
    self.components_flat = zeros(sum(self.all_sizes) + 1, dtype=int32)
    component_starts = []
    component_ends = []
    offset = 0
    for c in components:
        a = offset
        b = offset + len(c)
        offset = b
        # span must be untouched so far and stay within the flat array
        assert self.components_flat[a:b].sum() == 0
        assert 0 < b <= len(self.components_flat)
        self.components_flat[a:b] = array(c, dtype=int32)
        component_starts.append(a)
        component_ends.append(b)
    # spans must be contiguous
    assert all(component_ends[i] == component_starts[i + 1]
               for i in range(len(component_ends) - 1))
    # Build all distributions of the paths over our intervals
    paths_final = []
    tree_map = {}
    paths_indices = []
    npaths = 0
    for s in enumerate_all_transitions(paths, nbreakpoints):
        # FIXME: instead of removing the first component in the path,
        # we shouldn't have it there to begin with...
        s = s[1:]
        cpath = (0, ) + tuple(component_index[(e, p)] for e, p in s)
        # Encode the path as interleaved (start, end) offsets into
        # components_flat.
        path_as_offsets = []
        for ci in cpath:
            path_as_offsets.append(component_starts[ci])
            path_as_offsets.append(component_ends[ci])
        path_as_offsets = array(path_as_offsets, dtype=int32)
        paths_final.extend(path_as_offsets)
        npaths += 1
        # Deduplicate the two trees per transition: tree_map assigns each
        # distinct tree a dense integer id via setdefault.
        ta = make_tree(G, s, 0)
        tb = make_tree(G, s, 1)
        a = tree_map.setdefault(ta, len(tree_map))
        b = tree_map.setdefault(tb, len(tree_map))
        paths_indices.append(a)
        paths_indices.append(b)
    self.tree_map = tree_map
    self.ntrees = len(tree_map)
    self.paths_final_indices = array(paths_indices, dtype=int32)
    self.paths_final = array(paths_final, dtype=int32)
    self.npaths = npaths
    self.mappings = mappings