def print_tree(event, neighborhood, dataset, wave, c_0, balance, alpha):
    """Load one stored tree from ``results/`` and print its string form.

    The file name is the underscore-joined run parameters followed by
    ``.tree``.  ``c_0`` is converted with ``str``; ``balance`` and ``alpha``
    are concatenated as-is, so they must already be strings.
    """
    name_parts = [
        event,
        neighborhood,
        dataset,
        wave,
        'c_0=' + str(c_0),
        'balance=' + balance,
        'alpha=' + alpha,
    ]
    path = 'results/' + '_'.join(name_parts) + '.tree'

    root = TreeNode('dummy')
    with open(path) as handle:
        root.load(handle)

    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(str(root))
def stuff():
    """Print statistics for every stored tree in the configured parameter sweep.

    The sweep is the cartesian product of the event, neighborhood, dataset,
    wave, balance and alpha settings defined below.  For each combination the
    matching ``results/*.tree`` file is loaded into a ``TreeNode`` and the
    values returned by ``size()``, ``balance()``, ``total_splits_evaluated()``
    and ``total_best_split_runtime()`` are printed, followed by the runtime
    averaged over ``(size - 1) // 2``.

    Raises:
        KeyError: if ``(event, dataset)`` has no entry in the ``c_0`` table.
        IOError/OSError: if a tree file for a combination does not exist.
    """
    # Alternatives used previously: ['FL','SG','AR','CH'], ['SG'],
    # ['FL','SG','FI','CH','AR','SS'].
    events = ["AR"]
    # Alternatives used previously: ['rook','queen','rooktemp'],
    # ['rook','rooktemp','rooktemplong'].
    neighborhoods = ["rook"]
    datasets = ["1DAY"]
    thetas = [0.7]  # theta is the classification parameter (not used below)
    grid = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    # Keep only the alpha pairs whose components sum to at most 1.
    # Alternatives used previously: [(0.3,0.3),(0.6,0.3),(0.3,0.6)], [(0.3,0.3)].
    alphas = [pair for pair in itertools.product(grid, grid) if sum(pair) <= 1]
    # Alternative used previously: [['0193'],['0171'],['0094']].
    waves = [["0193"]]
    # Alternative used previously: ['Mirror','Duplication','Random'].
    balances = ["Mirror"]
    c_0Dict = {
        ("AR", "1DAY"): 40,
        ("SG", "3DAYDEMO"): 9,
        ("AR", "3DAYDEMO"): 111,
    }

    combos = itertools.product(
        events, neighborhoods, datasets, waves, balances, alphas
    )
    for event, neighborhood, dataset, wave, balance, alpha in combos:
        # "(0.3, 0.3)" -> "[0.3-0.3]", the form used in the stored file names.
        alphastr = (
            str(alpha)
            .replace(",", "-")
            .replace("(", "[")
            .replace(")", "]")
            .replace(" ", "")
        )
        wavestr = wave[0]
        c_0 = c_0Dict[(event, dataset)]
        treefile = (
            "results/"
            + "_".join(
                [
                    event,
                    neighborhood,
                    dataset,
                    wavestr,
                    "c_0=" + str(c_0),
                    "balance=" + balance,
                    "alpha=" + alphastr,
                ]
            )
            + ".tree"
        )

        Tree = TreeNode("dummy")
        with open(treefile) as f:
            Tree.load(f)

        # Each print builds one string so the output is the same whether
        # print is the Python 2 statement or the Python 3 function.
        print(" ".join(
            str(v)
            for v in (event, neighborhood, dataset, wave, balance, alpha)
        ))
        S = Tree.size()
        print("size: " + str(S))
        print("balance: " + str(Tree.balance()))
        print("splits: " + str(Tree.total_splits_evaluated()))
        TBSR = Tree.total_best_split_runtime()
        print("total time: " + str(TBSR))

        # Explicit floor division matches Python 2 integer division.  The
        # divisor is zero for a single-node tree, which used to raise
        # ZeroDivisionError.
        divisor = (S - 1) // 2
        if divisor:
            print("avg time: " + str(TBSR / divisor))
        else:
            print("avg time: n/a")
        print("")
def stuff():
    """Print the same statistics for two stored copies of one tree.

    Loads the ``AR / rook / 1DAY / 0193`` tree from ``results/`` and from
    ``hopefullyslowerresults/`` and, for each, prints the file name and the
    values returned by ``size()``, ``balance()``, ``total_splits_evaluated()``
    and ``total_best_split_runtime()``, followed by the runtime averaged over
    ``(size - 1) // 2``.

    Raises:
        IOError/OSError: if either tree file does not exist.
    """
    # NOTE(review): another function named `stuff` appears in this source; if
    # both live in the same module the later definition replaces the earlier
    # one -- confirm which is intended.
    treefileA = 'results/AR_rook_1DAY_0193_c_0=40_balance=Mirror_alpha=[0.0-0.0].tree'
    treefileB = 'hopefullyslowerresults/AR_rook_1DAY_0193_c_0=40_balance=Mirror_alpha=[0.0-0.0].tree'

    for treefile in (treefileA, treefileB):
        Tree = TreeNode('dummy')
        with open(treefile) as f:
            Tree.load(f)

        # Each print builds one string so the output is the same whether
        # print is the Python 2 statement or the Python 3 function.
        print(treefile)
        S = Tree.size()
        print('size: ' + str(S))
        print('balance: ' + str(Tree.balance()))
        print('splits: ' + str(Tree.total_splits_evaluated()))
        TBSR = Tree.total_best_split_runtime()
        print('total time: ' + str(TBSR))

        # Explicit floor division matches Python 2 integer division.  The
        # divisor is zero for a single-node tree, which used to raise
        # ZeroDivisionError.
        divisor = (S - 1) // 2
        if divisor:
            print('avg time: ' + str(TBSR / divisor))
        else:
            print('avg time: n/a')
        print('')
def best_svm_split(self, node, X, y, alfa, complexity):
    """Fit a linear SVM on the node's points and score the resulting split.

    Works on a copy of ``node`` obtained from ``TreeNode.copy_node``.

    Args:
        node: tree node whose split is being optimised.
        X: data matrix, indexed with the node's ``data_idxs``.
        y: labels, indexed the same way.
        alfa: weight of the complexity term added to the loss.
        complexity: current complexity; 1 is added when the node is a leaf.

    Returns:
        ``(copy, loss)`` where ``loss`` is the misclassification loss of the
        copy plus ``alfa * (complexity + rho)``.  ``loss`` is ``np.inf`` and
        no SVM is fitted when the node has no points or all of its labels
        are equal.
    """
    tree = self.classification_tree
    # Build a throwaway subtree rooted at this node.
    candidate = TreeNode.copy_node(node)
    samples = X[candidate.data_idxs]
    targets = y[candidate.data_idxs]

    # Nothing to separate: a single class, or no points at all.
    if all(t == targets[0] for t in targets) or not len(samples) > 0:
        return candidate, np.inf

    # This performs one-vs-rest multiclass SVM.
    svm = LinearSVC(
        tol=1e-6,
        C=10,
        max_iter=10000,
        loss='squared_hinge',
        penalty='l1',
        dual=False,
    )
    svm.fit(samples, targets)

    # TODO (from the original author): work out how to pick the best
    # coefficients among the trained hyperplanes.  The reshape below expects
    # exactly len(X[0]) coefficients in total.
    weights = svm.coef_.reshape(len(X[0]))
    intercept = svm.intercept_

    if candidate.is_leaf:
        # NOTE(review): children are created on the original `node`, not on
        # the copy that is scored and returned, whereas best_parallel_split
        # passes its copy here -- confirm this is intended.
        ClassificationTree.create_new_children(
            node, X, y, self.max_id, None, None,
            oblique=tree.oblique, weights=weights, intercept=intercept)
        rho = 1
    else:
        candidate.weights = weights
        candidate.intercept = intercept
        rho = 0

    loss = ClassificationTree.misclassification_loss(
        candidate, X, y, candidate.data_idxs, tree.oblique)
    return candidate, loss + alfa * (complexity + rho)
thresh = 1.5 * X[sorted_indexes[i], j] node.threshold = -thresh actual_loss = zero_one_loss(node.threshold, node, X, labels) if actual_loss < error_best: error_best = actual_loss best_tresh = -thresh best_feature = j i += 1 return best_tresh, best_feature best = np.inf best_f = 0 best_t = 0 node = TreeNode(0, 0, None, None, None, None, 0, 0, 0, None) for j in range(len(X[0])): node.feature = j node.threshold = 1 #opt_t = gradient_descend(1, node, X, labels) opt_t = minimize(quadratically_loss, 0, args=(node, X, labels), method='BFGS', tol=1e-04).x print("ottimizzo feature: ", j) err = quadratically_loss(opt_t, node, X, labels) if err < best: best = err best_f = j
def best_parallel_split(self, node, X, y, alfa, complexity):
    """Search every feature/threshold pair for the best axis-parallel split.

    Works on a copy of ``node`` obtained from ``TreeNode.copy_node``.  For
    each feature the node's points are sorted by that feature and every
    candidate threshold is tried: half the smallest value, the midpoint of
    each pair of consecutive values, and 1.5 times the largest value.

    Args:
        node: tree node whose split is being optimised.
        X: data matrix, indexed as ``X[point_idx, feature]``.
        y: labels passed through to the tree helpers.
        alfa: weight of the complexity term added to the loss.
        complexity: current complexity; 1 is added when the node is a leaf.

    Returns:
        ``(copy, error_best)``.  ``error_best`` is the lowest value of
        misclassification loss plus ``alfa * (complexity + rho)`` found, or
        ``np.inf`` when the node has no points or no candidate scored below
        infinity.  The copy's ``feature``/``threshold`` are set to the best
        pair only when one was found.

    Side effects:
        ``self.max_id`` is increased by 2 when the node was a leaf and a
        split was found.
    """
    T = self.classification_tree
    # Build a throwaway subtree rooted at this node.
    new_node = TreeNode.copy_node(node)
    error_best = np.inf
    was_leaf = bool(new_node.is_leaf)
    rho = 1 if was_leaf else 0
    improve = False
    # Pre-bound so they exist even when no candidate improves on np.inf.
    best_t = None
    best_feature = None

    if new_node.data_idxs:
        for j in range(len(X[0])):
            # Take the j-th component of every point in the node and order
            # the point indices by increasing value.
            column = {point_idx: X[point_idx, j]
                      for point_idx in new_node.data_idxs}
            sorted_indexes = sorted(column, key=column.get)
            new_node.feature = j
            last = len(sorted_indexes) - 1

            # Try every possible split on this component.
            for i in range(-1, last + 1):
                if i < 0:
                    thresh = 0.5 * X[sorted_indexes[0], j]
                elif i < last:
                    # This must be `elif`: as a plain `if` it also ran for
                    # i == -1 and replaced the threshold above with the
                    # midpoint of the last and first points (index -1 wraps).
                    thresh = 0.5 * (X[sorted_indexes[i], j]
                                    + X[sorted_indexes[i + 1], j])
                else:
                    thresh = 1.5 * X[sorted_indexes[i], j]
                new_node.threshold = thresh

                # If the node being optimised was a leaf, child leaves must
                # be created; they are optimised immediately by majority.
                if was_leaf:
                    ClassificationTree.create_new_children(
                        new_node, X, y, self.max_id, j, thresh,
                        oblique=T.oblique)

                actual_loss = ClassificationTree.misclassification_loss(
                    new_node, X, y, sorted_indexes,
                    T.oblique) + alfa * (complexity + rho)
                if actual_loss < error_best:
                    improve = True
                    error_best = actual_loss
                    best_t = thresh
                    best_feature = j

    if improve:
        new_node.threshold = best_t
        new_node.feature = best_feature
        if was_leaf:
            self.max_id += 2
    return new_node, error_best
n = 'rooktemp' w = ['0193'] b = 'Mirror' theta = 0.7 alphaSpatial = '[0.3-0.0]' alphaSpatiotemp = '0.3-0.4]' alphaNonSpatial = '[0.0-0.0]' c_0 = cref[e] eventClass = e dataset = d print eventClass,dataset headers,matches = SOLARGenImageList.image_event_matches(dataset=d,waves = w) print 'matches calculated' treefileSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaSpatial)])+".tree" treefileNonSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaNonSpatial)])+".tree" treeSpatial = TreeNode('dummy') treeNonSpatial = TreeNode('dummy') with open(treefileSpatial) as f: treeSpatial.load(f) with open(treefileNonSpatial) as f: treeNonSpatial.load(f) S_train,S_test, = read_data(e,n,d,w,b) # read the data set cells_train, adj = S_train cells_test, adj_test = S_test counter = 0 for x in sorted(matches.keys()): # for each image of the data set paramsFilename = x[0] imageFilename = paramsFilename[:-4]+'_th.png' ISpatial = m.imread(imageFilename) # read the image INonSpatial = ISpatial.copy() outputname = os.path.join(outputFolder,e,os.path.basename(imageFilename)[:-4]+'_'+e+'_'+d+'.png')
for i in range(1): idx = np.random.permutation(len(data)) data = data[idx] y = y[idx] val_split = 0.2 #data = dataset.data[idx] #label = dataset.target[idx] valid_id = int(len(data) * (1 - val_split)) X = data[0:valid_id] labels = y[0:valid_id] X_valid = data[valid_id:] y_valid = y[valid_id:] depth = 3 to_optimize = [] node = TreeNode(0, 0, None, None, None, None, 0, 0, 0, None) node.data_idxs = range(len(X)) to_optimize.append(node) while to_optimize: actual_node = to_optimize.pop() if actual_node.depth <= depth - 1 and len(actual_node.data_idxs) > 0: actual_node.is_leaf = False try: best_feature = np.nanargmax([ np.abs( spearmanr(X[actual_node.data_idxs, j], labels[actual_node.data_idxs])[0]) for j in range(len(X[0])) ]) except: best_feature = np.random.randint(0, len(X[0]))