Example No. 1
def pre_processing():
    """
    Processing settings and data
    :return: required data for execution
    """

    # Process and print settings
    settings = process_command_line()
    print 'Current settings:'
    pp.pprint(vars(settings))

    # Resetting random seed
    reset_random_seed(settings)

    # Loading data
    data = load_data(settings)
    param, cache = precompute_minimal(data, settings)

    return settings, data, param, cache
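
# Hypothetical usage (a sketch, not part of the original snippet): the tuple
# returned by pre_processing feeds straight into forest construction and a
# first batch fit, mirroring the later examples.
if __name__ == '__main__':
    settings, data, param, cache = pre_processing()
    mf = MondrianForest(settings, data)
    first_minibatch_ids = data['train_ids_partition']['current'][0]
    mf.fit(data, first_minibatch_ids, settings, param, cache)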
Example No. 2
def main():
    # Import settings from command line
    settings = process_command_line()
    print 'Current settings:'
    pp.pprint(vars(settings))

    # Resetting random seed
    reset_random_seed(settings)

    # Loading batch data
    batch_type_list = ['office', 'kitchen', 'bookstore']
    #batch_type_list = ['kitchen','office','bathroom','bedroom','bookstore','living_room']

    incremental_type_list = [
        'computer_room', 'home_office', 'office_kitchen', 'classroom'
    ]
    #incremental_type_list = ['study_space','classroom','computer_room','lobby','home_office','office_kitchen','playroom','reception_room','study','dining_room','cafeteria','furniture_store','conference_room','dinette','gym','storage_room','indoor_balcony','laundromat','printer_room','basement','recreation_room']
    training_perc = 0  # share of incremental data used for training; 0 skips the incremental phase

    data, training_list, test_list = load_type_dataset(settings,
                                                       batch_type_list,
                                                       incremental_type_list,
                                                       training_perc)

    param, cache = precompute_minimal(data, settings)

    mf = MondrianForest(settings, data)

    batch_train_ids = data['train_ids_partition']['batch']
    print '\nBatch training on %d elements...' % (len(batch_train_ids))

    mf.fit(data, batch_train_ids, settings, param, cache)

    print '...batch training done. \n'

    if training_perc > 0:
        incremental_train_ids = data['train_ids_partition']['incremental']
        print 'Incremental training on %d elements...' % (
            len(incremental_train_ids))

        mf.partial_fit(data, incremental_train_ids, settings, param, cache)

        print '...incremental training done. \n'

    # Evaluation
    print 'Evaluation... \n'
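    # Uniform ensemble weights: each of the n_mondrians trees votes equally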
    weights_prediction = np.ones(
        settings.n_mondrians) * 1.0 / settings.n_mondrians
    pred_forest_test, metrics_test = \
        mf.evaluate_predictions(data, data['x_test'], data['y_test'], \
        settings, param, weights_prediction, False)
    name_metric = settings.name_metric  # acc or mse
    metric_test = metrics_test[name_metric]
    tree_numleaves = np.zeros(settings.n_mondrians)
    for i_t, tree in enumerate(mf.forest):
        tree_numleaves[i_t] = len(tree.leaf_nodes)
    forest_numleaves = np.mean(tree_numleaves)
    f_stats = open(
        '/home/alberto/tesi/mondrianforest/src/results/statistics.txt', 'w')
    print '%s\t\tnum_leaves' % (name_metric)
    f_stats.write(str(name_metric) + ' : ' + str(metric_test))
    f_stats.write('\n')
    f_stats.write('num_leaves : ' + str(forest_numleaves))
    f_stats.write('\n')
    f_stats.write('\n')
    print '%.3f\t\t%.3f' % (metric_test, forest_numleaves)

    print '\nFinal forest stats:'
    f_stats.write('Final forest stats: \n')
    tree_stats = np.zeros((settings.n_mondrians, 2))
    tree_average_depth = np.zeros(settings.n_mondrians)
    for i_t, tree in enumerate(mf.forest):
        tree_stats[i_t, -2:] = np.array(
            [len(tree.leaf_nodes),
             len(tree.non_leaf_nodes)])
        tree_average_depth[i_t] = tree.get_average_depth(settings, data)[0]
    print 'mean(num_leaves) = %.1f, mean(num_non_leaves) = %.1f, mean(tree_average_depth) = %.1f' \
            % (np.mean(tree_stats[:, -2]), np.mean(tree_stats[:, -1]), np.mean(tree_average_depth))
    print 'n_train = %d, log_2(n_train) = %.1f, mean(tree_average_depth) = %.1f +- %.1f' \
            % (data['n_train'], np.log2(data['n_train']), np.mean(tree_average_depth), np.std(tree_average_depth))

    f_stats.write('mean(num_leaves) = ' + str(np.mean(tree_stats[:, -2])))
    f_stats.write('  mean(num_non_leaves) = ' +
                  str(np.mean(tree_stats[:, -1])))
    f_stats.write('  mean(tree_average_depth) = ' +
                  str(np.mean(tree_average_depth)) + '\n')

    f_stats.write('n_train = ' + str(data['n_train']))
    f_stats.write('  log_2(n_train) = ' + str(np.log2(data['n_train'])))
    f_stats.write('  mean(tree_average_depth) = ' +
                  str(np.mean(tree_average_depth)) + ' +- ' +
                  str(np.std(tree_average_depth)) + '\n')

    f_stats.write(
        '\n------------------------------------------------------------------\n'
    )

    print '\n...evaluation done.'
    print 'Computing confusion matrices...'
    uf_dir = settings.data_path + '/unary_csv'
    labels_dir = settings.data_path + '/labels_csv'
    cm_res_dir = '../results/cm'
    for file_name in test_list:
        curr_uf_csv = uf_dir + '/' + file_name
        curr_labels_csv = labels_dir + '/' + file_name
        x_df = pd.read_csv(curr_uf_csv, usecols=unary_features)
        y_df = pd.read_csv(curr_labels_csv, dtype=int)
        x_test = x_df.to_numpy()
        y_test = y_df.to_numpy()
        y_test.shape = (y_test.shape[0], )  # flatten labels to a 1-D vector

        if settings.normalize_features == 1:
            # Reuse the bounds of the original train/test split so the
            # per-file features are scaled consistently with training
            min_d = np.minimum(np.min(data['x_train'], 0),
                               np.min(data['x_test'], 0))
            max_d = np.maximum(np.max(data['x_train'], 0),
                               np.max(data['x_test'], 0))
            range_d = max_d - min_d
            idx_range_d_small = range_d <= 0.  # find columns where all features are identical
            if data['n_dim'] > 1:
                range_d[
                    idx_range_d_small] = 1e-3  # non-zero value just to prevent division by 0
            elif idx_range_d_small:
                range_d = 1e-3
            x_test -= min_d + 0.
            x_test /= range_d

        cm_weights_prediction = np.ones(
            settings.n_mondrians) * 1.0 / settings.n_mondrians
        cm_pred_forest_test, cm_metrics_test = \
            mf.evaluate_predictions(data, x_test, y_test, \
            settings, param, cm_weights_prediction, False)

        #y_test_pred = get_y_pred(cm_pred_forest_test['pred_prob'])
        y_test_pred = get_label_predictions(cm_pred_forest_test['pred_prob'])

        f_stats.write(str(file_name) + '\n')

        print 'y_test'
        print y_test[:25]

        print 'y_test_pred'
        print y_test_pred[:25]

        f_stats.write('\n y_test     y_mf_pred: \n')
        for x in range(25):
            # extra space keeps single-digit labels aligned in the log
            if (y_test[x] < 10):
                f_stats.write('#  ' + str(y_test[x]) + '       #  ' +
                              str(y_test_pred[x]) + '\n')
            else:
                f_stats.write('#  ' + str(y_test[x]) + '      #  ' +
                              str(y_test_pred[x]) + '\n')

        f_stats.write('\n---------------------------------------\n')

        cm = compute_confusion_matrix(y_test, y_test_pred, print_cm=False)
        cm_path = cm_res_dir + '/' + file_name
        #np.savetxt(cm_path, cm, delimiter=",")

        # SAVE PREDICTIONS ON CORRESPONDING PCD
        '''end_idx = file_name.rfind('.')
        input_path = '/home/alberto/tesi/dataset/NYUDV2/trained_semseg_data/clustering/'
        pcd_name = file_name[0:end_idx] + '.pcd'
        print pcd_name
        pcd_path = input_path + pcd_name
        input_cloud = pypcd.PointCloud.from_path(pcd_path)

        point_x_list = input_cloud.pc_data['x']
        point_y_list = input_cloud.pc_data['y']
        point_z_list = input_cloud.pc_data['z']
        cluster_idx_list = input_cloud.pc_data['label']

        new_cloud = input_cloud.pc_data.copy()
        new_cloud = input_cloud.pc_data.view(np.float32).reshape(input_cloud.pc_data.shape + (-1,))

        print 'Cluster cloud shape:' 
        print new_cloud.shape

        print 'Cluster label length: %d' % (len(cluster_idx_list))


        for n in range(new_cloud.shape[0]):
            new_cloud[n][0] = point_x_list[n]
            new_cloud[n][1] = point_y_list[n]
            new_cloud[n][2] = point_z_list[n]
            if(cluster_idx_list[n] > 4000):
                new_cloud[n][3] = 0
            else: 
                new_cloud[n][3] = y_test_pred[cluster_idx_list[n]]
        
        #res_pcd = pypcd.make_xyz_rgb_point_cloud(new_cloud)
        res_pcd = pypcd.make_xyz_label_point_cloud(new_cloud)
        output_path = '/home/alberto/tesi/mondrianforest/src/results/pcd/'+pcd_name
        res_pcd.save(output_path)'''

    print '...computation done.\nEND.'
    f_stats.close()
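
# A minimal sketch (assumed helper, not part of the original code) of the
# min-max normalization performed inside the test loop above: scale columns
# into [0, 1] using reference bounds, guarding constant columns against a
# division by zero just like the range_d <= 0 branch does.
def minmax_scale(x, ref_min, ref_max, eps=1e-3):
    range_d = ref_max - ref_min
    range_d[range_d <= 0.] = eps  # constant columns would otherwise divide by 0
    return (x - ref_min) / range_d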
Example No. 3
#!/usr/bin/env python
import numpy as np
import pprint as pp  # pretty printing module
from matplotlib import pyplot as plt  # required only for plotting results
from mondrianforest_utils import load_data, reset_random_seed, precompute_minimal
from mondrianforest import process_command_line, MondrianForest

settings = process_command_line()
print 'Current settings:'
pp.pprint(vars(settings))

# Resetting random seed
reset_random_seed(settings)

# Loading data
data = load_data(settings)
print "Data: ", data
print type(settings)

param, cache = precompute_minimal(data, settings)

mf = MondrianForest(settings, data)


train_ids_current_minibatch = data['train_ids_partition']['current'][0]
print train_ids_current_minibatch.shape

print "First batch train on 5 data points"
mf.fit(data, train_ids_current_minibatch[0:5], settings, param, cache)
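
# Sketch of the natural next step (mirrors Examples 4 and 5; not part of the
# original snippet): later minibatches are absorbed online with partial_fit
# instead of refitting from scratch.
if settings.n_minibatches > 1:
    train_ids_next_minibatch = data['train_ids_partition']['current'][1]
    mf.partial_fit(data, train_ids_next_minibatch, settings, param, cache)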
Example No. 4
#!/usr/bin/env python

import numpy as np
import pprint as pp     # pretty printing module
from matplotlib import pyplot as plt        # required only for plotting results
from mondrianforest_utils import load_data, reset_random_seed, precompute_minimal 
from mondrianforest import process_command_line, MondrianForest

PLOT = False

settings = process_command_line()
print 'Current settings:'
pp.pprint(vars(settings))

# Resetting random seed
reset_random_seed(settings)

# Loading data
data = load_data(settings)

param, cache = precompute_minimal(data, settings)

mf = MondrianForest(settings, data)

print '\nminibatch\tmetric_train\tmetric_test\tnum_leaves'

for idx_minibatch in range(settings.n_minibatches):
    train_ids_current_minibatch = data['train_ids_partition']['current'][idx_minibatch]
    if idx_minibatch == 0:
        # Batch training for first minibatch
        mf.fit(data, train_ids_current_minibatch, settings, param, cache)
    else:
        # Online update for subsequent minibatches (as in Example No. 5)
        mf.partial_fit(data, train_ids_current_minibatch, settings, param, cache)
Example No. 5
def main():
    # Import settings from command line
    settings = process_command_line()
    print 'Current settings:'
    pp.pprint(vars(settings))

    # Resetting random seed
    reset_random_seed(settings)

    # Loading data
    data = load_dataset(settings)

    param, cache = precompute_minimal(data, settings)

    mf = MondrianForest(settings, data)

    print '\nminibatch\tmetric_train\tmetric_test\tnum_leaves'

    for idx_minibatch in range(settings.n_minibatches):
        train_ids_current_minibatch = data['train_ids_partition']['current'][idx_minibatch]
        if idx_minibatch == 0:
            # Batch training for first minibatch
            mf.fit(data, train_ids_current_minibatch, settings, param, cache)
        else:
            # Online update
            mf.partial_fit(data, train_ids_current_minibatch, settings, param, cache)

        # Evaluate
        weights_prediction = np.ones(settings.n_mondrians) * 1.0 / settings.n_mondrians
        train_ids_cumulative = data['train_ids_partition']['cumulative'][idx_minibatch]
        pred_forest_train, metrics_train = \
            mf.evaluate_predictions(data, data['x_train'][train_ids_cumulative, :], \
            data['y_train'][train_ids_cumulative], \
            settings, param, weights_prediction, False)
        pred_forest_test, metrics_test = \
            mf.evaluate_predictions(data, data['x_test'], data['y_test'], \
            settings, param, weights_prediction, False)
        name_metric = settings.name_metric     # acc or mse
        metric_train = metrics_train[name_metric]
        metric_test = metrics_test[name_metric]
        tree_numleaves = np.zeros(settings.n_mondrians)
        for i_t, tree in enumerate(mf.forest):
            tree_numleaves[i_t] = len(tree.leaf_nodes)
        forest_numleaves = np.mean(tree_numleaves)
        print '%9d\t%.3f\t\t%.3f\t\t%.3f' % (idx_minibatch, metric_train, metric_test, forest_numleaves)
        print 'shape of y_test:'
        print data['y_test'].shape

        y_test_pred = get_y_pred(pred_forest_test['pred_prob'])
        print 'shape of y_test_pred:'
        print y_test_pred.shape

        for x in range(len(y_test_pred)):
            print 'label: %d mf prediction: %d' % (data['y_test'][x], y_test_pred[x])

        cm = confusion_matrix(data['y_test'], y_test_pred)
        
        # Show confusion matrix in a separate window
        plt.matshow(cm)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    print '\nFinal forest stats:'
    tree_stats = np.zeros((settings.n_mondrians, 2))
    tree_average_depth = np.zeros(settings.n_mondrians)
    for i_t, tree in enumerate(mf.forest):
        tree_stats[i_t, -2:] = np.array([len(tree.leaf_nodes), len(tree.non_leaf_nodes)])
        tree_average_depth[i_t] = tree.get_average_depth(settings, data)[0]
    print 'mean(num_leaves) = %.1f, mean(num_non_leaves) = %.1f, mean(tree_average_depth) = %.1f' \
            % (np.mean(tree_stats[:, -2]), np.mean(tree_stats[:, -1]), np.mean(tree_average_depth))
    print 'n_train = %d, log_2(n_train) = %.1f, mean(tree_average_depth) = %.1f +- %.1f' \
            % (data['n_train'], np.log2(data['n_train']), np.mean(tree_average_depth), np.std(tree_average_depth))
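
    # Optional follow-up (a sketch, not in the original): per-class recall can
    # be read off the confusion matrix from the final minibatch above.
    row_totals = cm.sum(axis=1).astype(float)
    row_totals[row_totals == 0] = 1.0  # guard classes absent from y_test
    per_class_recall = cm.diagonal() / row_totals
    print 'per-class recall:', per_class_recall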