Пример #1
0
def print_tree(event, neighborhood, dataset, wave, c_0, balance, alpha):
    """Load a stored tree from the ``results/`` folder and print its string form.

    The file name is the underscore-joined sequence
    ``<event>_<neighborhood>_<dataset>_<wave>_c_0=<c_0>_balance=<balance>_alpha=<alpha>``
    with a ``.tree`` suffix.

    Args:
        event, neighborhood, dataset, wave: name components; they are joined
            as-is and must therefore be strings.
        c_0: converted with ``str()`` before being embedded in the name.
        balance, alpha: concatenated to a string prefix, so they must already
            be strings (``alpha`` in its file-name form, not a tuple).

    Raises:
        IOError/OSError: if the tree file cannot be opened.
    """
    name_parts = [
        event,
        neighborhood,
        dataset,
        wave,
        'c_0=' + str(c_0),
        'balance=' + balance,
        'alpha=' + alpha,
    ]
    treefile = 'results/' + '_'.join(name_parts) + '.tree'

    # The constructor argument is a placeholder; the content comes from load().
    tree = TreeNode('dummy')
    with open(treefile) as f:
        tree.load(f)

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print(str(tree))
Пример #2
0
def stuff():
    """Print summary statistics for every stored tree in the configured sweep.

    For each combination of event, neighborhood, dataset, wave, balance and
    alpha pair, the matching ``results/*.tree`` file is loaded into a
    ``TreeNode`` and the values returned by its ``size()``, ``balance()``,
    ``total_splits_evaluated()`` and ``total_best_split_runtime()`` methods
    are printed, followed by the runtime divided by ``(size - 1) // 2``.

    Raises:
        KeyError: if an (event, dataset) pair has no entry in the c_0 table.
        IOError/OSError: if a tree file cannot be opened.
    """
    # Active configuration. Other values that appear in the original
    # commented-out configurations: events 'FL', 'SG', 'FI', 'CH', 'SS';
    # neighborhoods 'queen', 'rooktemp', 'rooktemplong'; waves '0171', '0094';
    # balances 'Duplication', 'Random'; alphas (0.3,0.3), (0.6,0.3), (0.3,0.6).
    events = ["AR"]
    neighborhoods = ["rook"]
    datasets = ["1DAY"]
    waves = [["0193"]]
    balances = ["Mirror"]

    # All alpha pairs on a 0.1 grid whose components sum to at most 1.
    grid = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    alphas = [pair for pair in itertools.product(grid, grid) if sum(pair) <= 1]

    # c_0 value used in the file name, keyed by (event, dataset).
    c_0_by_run = {
        ("AR", "1DAY"): 40,
        ("SG", "3DAYDEMO"): 9,
        ("AR", "3DAYDEMO"): 111,
    }

    sweep = itertools.product(events, neighborhoods, datasets, waves, balances, alphas)
    for event, neighborhood, dataset, wave, balance, alpha in sweep:
        # "(0.3, 0.6)" -> "[0.3-0.6]", the form used in the stored file names.
        alphastr = str(alpha).replace(",", "-").replace("(", "[").replace(")", "]").replace(" ", "")
        wavestr = wave[0]
        c_0 = c_0_by_run[(event, dataset)]
        name_parts = [
            event,
            neighborhood,
            dataset,
            wavestr,
            "c_0=" + str(c_0),
            "balance=" + balance,
            "alpha=" + alphastr,
        ]
        treefile = "results/" + "_".join(name_parts) + ".tree"

        tree = TreeNode("dummy")
        with open(treefile) as f:
            tree.load(f)

        # Every print below passes one pre-formatted string so that the output
        # is identical on Python 2 and Python 3.
        print(" ".join(str(v) for v in (event, neighborhood, dataset, wave, balance, alpha)))
        size = tree.size()
        print("size: %s" % (size,))
        print("balance: %s" % (tree.balance(),))
        print("splits: %s" % (tree.total_splits_evaluated(),))
        runtime = tree.total_best_split_runtime()
        print("total time: %s" % (runtime,))

        # Presumably the number of internal nodes of a full binary tree -
        # TODO confirm. It is zero for a one-node tree, which used to raise
        # ZeroDivisionError; floor division keeps the Python 2 result.
        internal_nodes = (size - 1) // 2
        if internal_nodes > 0:
            print("avg time: %s" % (runtime / internal_nodes,))
        else:
            print("avg time: n/a")
        print("")
def stuff():
    """Print summary statistics for two stored versions of the same tree.

    The same tree file name is loaded from ``results/`` and from
    ``hopefullyslowerresults/``. For each, the file path and the values
    returned by the tree's ``size()``, ``balance()``,
    ``total_splits_evaluated()`` and ``total_best_split_runtime()`` methods
    are printed, followed by the runtime divided by ``(size - 1) // 2``.

    Raises:
        IOError/OSError: if either tree file cannot be opened.
    """
    filename = 'AR_rook_1DAY_0193_c_0=40_balance=Mirror_alpha=[0.0-0.0].tree'
    treefiles = ['results/' + filename, 'hopefullyslowerresults/' + filename]

    for treefile in treefiles:
        tree = TreeNode('dummy')
        with open(treefile) as f:
            tree.load(f)

        # Every print below passes one pre-formatted string so that the output
        # is identical on Python 2 and Python 3.
        print(treefile)
        size = tree.size()
        print('size: %s' % (size,))
        print('balance: %s' % (tree.balance(),))
        print('splits: %s' % (tree.total_splits_evaluated(),))
        runtime = tree.total_best_split_runtime()
        print('total time: %s' % (runtime,))

        # Presumably the number of internal nodes of a full binary tree -
        # TODO confirm. It is zero for a one-node tree, which used to raise
        # ZeroDivisionError; floor division keeps the Python 2 result.
        internal_nodes = (size - 1) // 2
        if internal_nodes > 0:
            print('avg time: %s' % (runtime / internal_nodes,))
        else:
            print('avg time: n/a')
        print('')
Пример #4
0
 def best_svm_split(self, node, X, y, alfa, complexity):
     """Fit a linear SVM on the node's samples and score it as a split.

     Args:
         node: node to optimise; a copy made with ``TreeNode.copy_node`` is
             what gets returned.
         X: samples, indexed with the copy's ``data_idxs``; ``len(X[0])`` is
             used as the length of the weight vector.
         y: labels, indexed with the same ``data_idxs``.
         alfa: multiplier of the complexity term added to the loss.
         complexity: complexity count; one is added to it when the copied
             node is a leaf.

     Returns:
         ``(copy, loss)``, where ``loss`` is the misclassification loss plus
         ``alfa * (complexity + rho)``. When every selected label is equal or
         no sample is selected, ``loss`` is ``np.inf`` and no SVM is fitted.
     """
     tree = self.classification_tree
     # Build a new dummy subtree rooted at the node.
     candidate = TreeNode.copy_node(node)
     samples = X[candidate.data_idxs]
     targets = y[candidate.data_idxs]

     # Nothing to separate: a single class, or no samples at all.
     if all(t == targets[0] for t in targets) or len(samples) <= 0:
         return candidate, np.inf

     # Multiclass inputs are handled by the SVM as one-vs-rest.
     svm = LinearSVC(
         tol=1e-6,
         C=10,
         max_iter=10000,
         loss='squared_hinge',
         penalty='l1',
         dual=False,
     )
     svm.fit(samples, targets)

     # TODO: work out how to pick the best coefficients among the trained
     # hyperplanes. This reshape only succeeds when coef_ holds exactly
     # len(X[0]) values.
     weights = svm.coef_.reshape(len(X[0]))
     intercept = svm.intercept_

     if candidate.is_leaf:
         # NOTE(review): the children are created on `node`, not on the copy
         # that is returned - confirm this is intended.
         ClassificationTree.create_new_children(
             node,
             X,
             y,
             self.max_id,
             None,
             None,
             oblique=tree.oblique,
             weights=weights,
             intercept=intercept,
         )
         rho = 1
     else:
         candidate.weights = weights
         candidate.intercept = intercept
         rho = 0

     loss = ClassificationTree.misclassification_loss(
         candidate, X, y, candidate.data_idxs, tree.oblique)
     return candidate, loss + alfa * (complexity + rho)
Пример #5
0
                thresh = 1.5 * X[sorted_indexes[i], j]
            node.threshold = -thresh
            actual_loss = zero_one_loss(node.threshold, node, X, labels)
            if actual_loss < error_best:
                error_best = actual_loss
                best_tresh = -thresh
                best_feature = j
            i += 1

    return best_tresh, best_feature


# Search, one feature at a time, for the threshold that minimises
# quadratically_loss, keeping track of the lowest loss found so far.
best = np.inf
best_f = 0
best_t = 0
node = TreeNode(0, 0, None, None, None, None, 0, 0, 0, None)
for j in range(len(X[0])):
    node.feature = j
    node.threshold = 1

    # Earlier alternative, kept for reference:
    #opt_t = gradient_descend(1, node, X, labels)
    # Minimise the loss over the threshold with BFGS, starting from 0.
    # Presumably this is scipy.optimize.minimize - confirm against the imports.
    opt_t = minimize(quadratically_loss,
                     0,
                     args=(node, X, labels),
                     method='BFGS',
                     tol=1e-04).x
    print("ottimizzo feature: ", j)
    # Re-evaluate the loss at the optimum returned for this feature.
    err = quadratically_loss(opt_t, node, X, labels)
    if err < best:
        best = err
        best_f = j
        # NOTE(review): best_t is not updated in the visible part of this
        # branch; the snippet appears to be cut off here.
Пример #6
0
    def best_parallel_split(self, node, X, y, alfa, complexity):
        """Search all features for the axis-parallel threshold with the lowest loss.

        For each feature ``j`` the node's points are sorted by ``X[:, j]`` and
        these candidate thresholds are tried: half of the smallest value,
        every midpoint between consecutive sorted values, and one and a half
        times the largest value. Each candidate is scored with
        ``ClassificationTree.misclassification_loss`` plus
        ``alfa * (complexity + rho)``, where ``rho`` is 1 if the node was a
        leaf and 0 otherwise.

        Args:
            node: node to optimise; the work is done on, and the result is,
                the object returned by ``TreeNode.copy_node(node)``.
            X: samples, indexed as ``X[point_idx, j]``.
            y: labels, passed through to the loss and to child creation.
            alfa: multiplier of the complexity term.
            complexity: complexity count used in the penalty.

        Returns:
            ``(new_node, error_best)``. ``error_best`` is ``np.inf`` when the
            node holds no points or no candidate produced a finite loss; in
            the latter case the copy keeps the threshold and feature it had
            on entry.

        Side effects:
            ``self.max_id`` is increased by 2 when the node was a leaf and at
            least one candidate improved on the initial ``np.inf``.
        """
        T = self.classification_tree

        # Build a new dummy subtree rooted at the node.
        new_node = TreeNode.copy_node(node)
        error_best = np.inf
        improve = False
        was_leaf = bool(new_node.is_leaf)
        rho = 1 if was_leaf else 0

        # An explicit length test also works when data_idxs is a NumPy array,
        # whose truth value is ambiguous.
        data_idxs = new_node.data_idxs
        if data_idxs is None or len(data_idxs) == 0:
            return new_node, error_best

        penalty = alfa * (complexity + rho)

        # Fall back to the split the node already had if nothing improves;
        # previously these names were unbound in that case.
        best_t = new_node.threshold
        best_feature = getattr(new_node, 'feature', None)

        for j in range(len(X[0])):
            # Take the j-th component of every point of the node and sort the
            # point indexes by it in ascending order.
            vals = {point_idx: X[point_idx, j] for point_idx in new_node.data_idxs}
            sorted_indexes = sorted(vals, key=vals.get)
            last = len(sorted_indexes) - 1

            new_node.feature = j

            # Try every possible split on this component. i == -1 is the
            # candidate placed before the first sorted value.
            for i in range(-1, last + 1):
                if i < 0:
                    thresh = 0.5 * X[sorted_indexes[0], j]
                elif i < last:
                    # This must be `elif`: a plain `if` overwrote the i == -1
                    # candidate using sorted_indexes[-1], i.e. the last point.
                    thresh = 0.5 * (X[sorted_indexes[i], j] +
                                    X[sorted_indexes[i + 1], j])
                else:
                    thresh = 1.5 * X[sorted_indexes[i], j]

                new_node.threshold = thresh

                # If the node being optimised was a leaf, child leaves must be
                # created; they are optimised by majority straight away.
                if was_leaf:
                    ClassificationTree.create_new_children(
                        new_node,
                        X,
                        y,
                        self.max_id,
                        j,
                        thresh,
                        oblique=T.oblique)

                actual_loss = ClassificationTree.misclassification_loss(
                    new_node, X, y, sorted_indexes, T.oblique) + penalty

                if actual_loss < error_best:
                    improve = True
                    error_best = actual_loss
                    best_t = thresh
                    best_feature = j

        # NOTE(review): for a former leaf the children were last created for
        # the final candidate tried, not for (best_feature, best_t) - verify
        # whether they need to be rebuilt for the selected split.
        new_node.threshold = best_t
        new_node.feature = best_feature
        if was_leaf and improve:
            self.max_id += 2
        return new_node, error_best
 n = 'rooktemp'
 w = ['0193']
 b = 'Mirror'
 theta = 0.7
 alphaSpatial = '[0.3-0.0]'
 alphaSpatiotemp = '0.3-0.4]'
 alphaNonSpatial = '[0.0-0.0]'
 c_0 = cref[e]
 eventClass = e
 dataset = d
 print eventClass,dataset
 headers,matches = SOLARGenImageList.image_event_matches(dataset=d,waves = w)
 print 'matches calculated'
 treefileSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaSpatial)])+".tree"
 treefileNonSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaNonSpatial)])+".tree"
 treeSpatial = TreeNode('dummy')
 treeNonSpatial = TreeNode('dummy')
 with open(treefileSpatial) as f:
   treeSpatial.load(f)
 with open(treefileNonSpatial) as f:
   treeNonSpatial.load(f)
 S_train,S_test, = read_data(e,n,d,w,b) # read the data set
 cells_train, adj = S_train
 cells_test, adj_test = S_test
 counter = 0
 for x in sorted(matches.keys()): # for each image of the data set
   paramsFilename = x[0]
   imageFilename = paramsFilename[:-4]+'_th.png'
   ISpatial = m.imread(imageFilename) # read the image
   INonSpatial = ISpatial.copy()
   outputname = os.path.join(outputFolder,e,os.path.basename(imageFilename)[:-4]+'_'+e+'_'+d+'.png')
Пример #8
0
# Single pass: shuffle the data, split it into training and validation parts,
# then start growing a tree from a root node.
for i in range(1):
    # Shuffle samples and labels with the same random permutation.
    idx = np.random.permutation(len(data))
    data = data[idx]
    y = y[idx]
    val_split = 0.2
    #data = dataset.data[idx]
    #label = dataset.target[idx]
    # The first (1 - val_split) share of the shuffled data is used for
    # training; the remainder is held out for validation.
    valid_id = int(len(data) * (1 - val_split))
    X = data[0:valid_id]
    labels = y[0:valid_id]

    X_valid = data[valid_id:]
    y_valid = y[valid_id:]
    depth = 3
    # Stack of nodes still to be processed; the root holds every training index.
    to_optimize = []
    node = TreeNode(0, 0, None, None, None, None, 0, 0, 0, None)
    node.data_idxs = range(len(X))
    to_optimize.append(node)
    while to_optimize:
        actual_node = to_optimize.pop()
        # Only nodes with depth <= depth - 1 that still hold points are processed.
        if actual_node.depth <= depth - 1 and len(actual_node.data_idxs) > 0:
            actual_node.is_leaf = False
            try:
                # Pick the feature whose correlation with the labels has the
                # largest absolute value, ignoring NaN entries.
                # Presumably spearmanr is scipy.stats.spearmanr - confirm
                # against the imports.
                best_feature = np.nanargmax([
                    np.abs(
                        spearmanr(X[actual_node.data_idxs, j],
                                  labels[actual_node.data_idxs])[0])
                    for j in range(len(X[0]))
                ])
            # np.nanargmax raises ValueError when every entry is NaN; a random
            # feature is used as the fallback.
            # NOTE(review): the bare except also swallows KeyboardInterrupt
            # and SystemExit - consider narrowing it to ValueError.
            except:
                best_feature = np.random.randint(0, len(X[0]))