示例#1
0
def parameter_test_worker(i, p, num_params, alpha, group_size, data_x, data_y):
    """
    Evaluate a single random forest parameter set with 10-fold cross
    validation and append the aggregated results to LOGFILE.txt.

    For each fold, a plain random forest is trained and scored, then the
    forest garrote refinement is trained and scored once per alpha value.

    :param i: zero-based index of this parameter set (progress output only)
    :param p: dict of keyword arguments for RandomForestClassifier
    :param num_params: total number of parameter sets (progress output only)
    :param alpha: iterable of alpha values for the forest garrote
    :param group_size: tree group size for the forest garrote; when the forest
        has at most this many trees the garrote is run without grouping
    :param data_x: feature matrix, shape (n_samples, n_features)
    :param data_y: label vector, shape (n_samples,)
    """
    out_str = "# Parameter set %d of %d" % (
        i + 1, num_params) + "\n" + json.dumps(p) + "\n\n"

    rf_split_counts = []
    rf_performance = []
    rf_num_nodes = []
    rf_train_time = []
    fg = {
        a: {
            "split_counts": [],
            "performance": [],
            "num_nodes": [],
            "train_time": []
        }
        for a in alpha
    }

    def accuracy(truth, prediction):
        # Fraction of correctly predicted labels. Generator variables are
        # named t/q so they cannot shadow the alpha loop variable "a" below.
        correct = sum(1 for t, q in zip(truth, prediction) if t == q)
        return correct / float(len(prediction))

    def mean_std_line(label, values):
        # One "# <label>\n<mean> <std>\n" line for the result string.
        return ("# " + label + "\n" + str(numpy.mean(values)) + " " +
                str(numpy.std(values)) + "\n")

    kf = sklearn.cross_validation.KFold(data_x.shape[0], n_folds=10)
    for kf_i, (train, test) in enumerate(kf):
        print("## kf %d of %d" % (kf_i + 1, len(kf)))
        train_x = data_x[train]
        train_y = data_y[train]
        test_x = data_x[test]
        test_y = data_y[test]

        # Train the rf, measuring training time and test-set performance.
        rf = randomforest.RandomForestClassifier(n_rand_dims="auto",
                                                 n_jobs=1,
                                                 **p)
        start = time.time()
        rf.fit(train_x, train_y)
        end = time.time()
        pred, split_counts = rf.predict(test_x, return_split_counts=True)
        split_counts /= float(len(pred))  # average splits per instance

        rf_split_counts.append(split_counts)
        rf_performance.append(accuracy(test_y, pred))
        rf_num_nodes.append(rf.num_nodes())
        rf_train_time.append(end - start)

        # Train the forest garrote and get the performance.
        for a_i, a in enumerate(alpha):
            print("## forest garrote %d of %d" % (a_i + 1, len(alpha)))
            # Grouping only makes sense when there are more trees than the
            # group size; otherwise run the garrote on the whole forest.
            fg_group_size = group_size if rf.num_trees() > group_size else None
            start = time.time()
            refined_rf = forest_garrote(rf,
                                        train_x,
                                        train_y,
                                        group_size=fg_group_size,
                                        alpha=a)
            end = time.time()
            pred, split_counts = refined_rf.predict(test_x,
                                                    return_split_counts=True)
            split_counts /= float(len(pred))

            fg[a]["split_counts"].append(split_counts)
            fg[a]["performance"].append(accuracy(test_y, pred))
            fg[a]["num_nodes"].append(refined_rf.num_nodes())
            fg[a]["train_time"].append(end - start)

    # Create the output string.
    out_str += mean_std_line("performance", rf_performance)
    out_str += mean_std_line("train_time", rf_train_time)
    out_str += mean_std_line("split_counts", rf_split_counts)
    out_str += mean_std_line("num_nodes", rf_num_nodes) + "\n"
    for a in alpha:
        out_str += "fg " + str(a) + "\n\n"
        out_str += mean_std_line("performance", fg[a]["performance"])
        out_str += mean_std_line("train_time", fg[a]["train_time"])
        out_str += mean_std_line("split_counts", fg[a]["split_counts"])
        out_str += mean_std_line("num_nodes", fg[a]["num_nodes"]) + "\n"

    print(out_str[:-1])
    with open("LOGFILE.txt", "a") as f:
        f.write(out_str)
示例#2
0
def parameter_test_worker(i, p, num_params, alpha, group_size, data_x, data_y):
    """
    Evaluate one random forest parameter set using 10-fold cross validation and
    append the aggregated results to LOGFILE.txt.

    Per fold, a random forest is trained and scored on the test split, then the
    forest garrote refinement is trained and scored once for each alpha value.

    :param i: zero-based index of this parameter set (used in progress output)
    :param p: dict of keyword arguments for RandomForestClassifier
    :param num_params: total number of parameter sets (used in progress output)
    :param alpha: iterable of alpha values for the forest garrote
    :param group_size: tree group size for the forest garrote; ignored (no
        grouping) when the forest has at most this many trees
    :param data_x: feature matrix of shape (n_samples, n_features)
    :param data_y: label vector of shape (n_samples,)
    """
    out_str = "# Parameter set %d of %d" % (i+1, num_params) + "\n" + json.dumps(p) + "\n\n"

    rf_split_counts = []
    rf_performance = []
    rf_num_nodes = []
    rf_train_time = []
    fg = {a: {"split_counts": [],
              "performance": [],
              "num_nodes": [],
              "train_time": []}
          for a in alpha}

    kf = sklearn.cross_validation.KFold(data_x.shape[0], n_folds=10)
    for kf_i, (train, test) in enumerate(kf):
        print("## kf %d of %d" % (kf_i+1, len(kf)))
        train_x = data_x[train]
        train_y = data_y[train]
        test_x = data_x[test]
        test_y = data_y[test]

        # Train the rf and get the performance.
        rf = randomforest.RandomForestClassifier(n_rand_dims="auto", n_jobs=1, **p)
        start = time.time()
        rf.fit(train_x, train_y)
        end = time.time()
        pred, split_counts = rf.predict(test_x, return_split_counts=True)
        split_counts /= float(len(pred))  # average splits per instance
        # Generator variables renamed t/q so they cannot be confused with the
        # alpha loop variable "a" used below.
        count = sum(1 for t, q in zip(test_y, pred) if t == q)
        performance = count/float(len(pred))

        rf_split_counts.append(split_counts)
        rf_performance.append(performance)
        rf_num_nodes.append(rf.num_nodes())
        rf_train_time.append(end-start)

        # Train the forest garrote and get the performance.
        for a_i, a in enumerate(alpha):
            print("## forest garrote %d of %d" % (a_i+1, len(alpha)))
            # Only use grouping when there are more trees than the group size.
            fg_gs = group_size if rf.num_trees() > group_size else None
            start = time.time()
            refined_rf = forest_garrote(rf, train_x, train_y, group_size=fg_gs, alpha=a)
            end = time.time()
            pred, split_counts = refined_rf.predict(test_x, return_split_counts=True)
            split_counts /= float(len(pred))
            count = sum(1 for t, q in zip(test_y, pred) if t == q)
            performance = count/float(len(pred))

            fg[a]["split_counts"].append(split_counts)
            fg[a]["performance"].append(performance)
            fg[a]["num_nodes"].append(refined_rf.num_nodes())
            fg[a]["train_time"].append(end-start)

    # Create the output string: "mean std" per statistic, rf first, then one
    # section per alpha value.
    out_str += "# performance\n" + str(numpy.mean(rf_performance)) + " " + str(numpy.std(rf_performance)) + "\n"
    out_str += "# train_time\n" + str(numpy.mean(rf_train_time)) + " " + str(numpy.std(rf_train_time)) + "\n"
    out_str += "# split_counts\n" + str(numpy.mean(rf_split_counts)) + " " + str(numpy.std(rf_split_counts)) + "\n"
    out_str += "# num_nodes\n" + str(numpy.mean(rf_num_nodes)) + " " + str(numpy.std(rf_num_nodes)) + "\n\n"
    for a in alpha:
        out_str += "fg " + str(a) + "\n\n"
        out_str += "# performance\n" + str(numpy.mean(fg[a]["performance"])) + " " + str(numpy.std(fg[a]["performance"])) + "\n"
        out_str += "# train_time\n" + str(numpy.mean(fg[a]["train_time"])) + " " + str(numpy.std(fg[a]["train_time"])) + "\n"
        out_str += "# split_counts\n" + str(numpy.mean(fg[a]["split_counts"])) + " " + str(numpy.std(fg[a]["split_counts"])) + "\n"
        out_str += "# num_nodes\n" + str(numpy.mean(fg[a]["num_nodes"])) + " " + str(numpy.std(fg[a]["num_nodes"])) + "\n\n"

    print(out_str[:-1])
    with open("LOGFILE.txt", "a") as f:
        f.write(out_str)
示例#3
0
def train_rf(n_trees,
             n_jobs,
             predict=True,
             save=False,
             load=False,
             filename=None,
             refine=False,
             group_size=None):
    """
    Train a random forest and compute the accuracy on a test set.

    :param n_trees: number of trees
    :param n_jobs: number of jobs
    :param predict: use the random forest to predict on a test set
    :param save: save the random forest to a file
    :param load: load the random forest from a file
    :param filename: file name (required when save or load is set)
    :param refine: additionally refine the forest with the forest garrote
    :param group_size: tree group size passed to the forest garrote
    """
    # train_x, train_y, test_x, test_y = load_data([3, 8])
    train_x, train_y, test_x, test_y = load_neuro_data()

    def report_accuracy(pred, split_counts):
        # Print accuracy and the average number of splits per instance.
        splits_per_instance = split_counts / float(len(pred))
        count = sum(1 for t, q in zip(test_y, pred) if t == q)
        print("%d of %d correct (%.03f%%), used %.02f splits per instance" % (
            count, len(pred), (100.0 * count) / len(pred), splits_per_instance))

    if load:
        assert os.path.isfile(filename), "File not found: %s" % filename
        print("Loading random forest from file %s." % filename)
        with open(filename, "r") as f:
            rf_str = f.read()
        rf = randomforest.RandomForestClassifier.from_string(rf_str)
        if n_jobs is not None:
            rf._n_jobs = n_jobs
    else:
        print("Training random forest with %d trees." % n_trees)
        rf = randomforest.RandomForestClassifier(
            n_estimators=n_trees,
            n_rand_dims="auto",
            n_jobs=n_jobs,
            # Alternative sampling configurations that were tried:
            # bootstrap_sampling=True, use_sample_label_count=True, resample_count=None,
            # bootstrap_sampling=False, use_sample_label_count=False, resample_count=None,
            bootstrap_sampling=True,
            use_sample_label_count=False,
            resample_count=None,
            # bootstrap_sampling=False, use_sample_label_count=True, resample_count=None,  # does not make sense
            # resample_count=20,
            # loggamma_tau=1e-6,
            split_selection="gini")
        with Timer("Training took %.03f seconds"):
            rf.fit(train_x, train_y)
        print("The random forest has %d nodes." % rf.num_nodes())

    # A freshly loaded forest does not need to be written back to disk.
    if save and not load:
        print("Saving random forest to file %s." % filename)
        with open(filename, "w") as f:
            f.write(rf.to_string())

    if predict:
        print("Predicting on a test set with the random forest.")
        with Timer("Random forest prediction took %.03f seconds."):
            pred, split_counts = rf.predict(test_x, return_split_counts=True)
        report_accuracy(pred, split_counts)

    if refine:
        print("Refining the random forest using forest garrote.")
        with Timer("Refining took %.03f seconds."):
            refined_rf = forest_garrote(rf,
                                        train_x,
                                        train_y,
                                        group_size=group_size)
            # refined_rf = global_refinement(rf, train_x, train_y)
        print("The refined forest has %d nodes." % refined_rf.num_nodes())

        if save:
            f0, f1 = os.path.split(filename)
            refined_filename = os.path.join(f0, "refined_" + f1)
            print("Saving refined random forest to file %s." % refined_filename)
            with open(refined_filename, "w") as f:
                f.write(refined_rf.to_string())

        if predict:
            print("Predicting on a test set with the forest garrote.")
            with Timer("Forest garrote prediction took %.03f seconds."):
                pred, split_counts = refined_rf.predict(
                    test_x, return_split_counts=True)
            report_accuracy(pred, split_counts)
示例#4
0
def train_rf(n_trees, n_jobs, predict=True, save=False, load=False, filename=None, refine=False, group_size=None):
    """
    Train a random forest and compute the accuracy on a test set.

    :param n_trees: number of trees
    :param n_jobs: number of jobs
    :param predict: use the random forest to predict on a test set
    :param save: save the random forest to a file
    :param load: load the random forest from a file
    :param filename: file name (required when save or load is set)
    :param refine: additionally refine the forest with the forest garrote
    :param group_size: tree group size passed to the forest garrote
    """
    # train_x, train_y, test_x, test_y = load_data([3, 8])
    train_x, train_y, test_x, test_y = load_neuro_data()

    if load:
        assert os.path.isfile(filename), "File not found: %s" % filename
        print("Loading random forest from file %s." % filename)
        with open(filename, "r") as f:
            rf_str = f.read()
        rf = randomforest.RandomForestClassifier.from_string(rf_str)
        if n_jobs is not None:
            rf._n_jobs = n_jobs
    else:
        print("Training random forest with %d trees." % n_trees)
        rf = randomforest.RandomForestClassifier(n_estimators=n_trees, n_rand_dims="auto", n_jobs=n_jobs,
                                                 # Alternative sampling configurations that were tried:
                                                 # bootstrap_sampling=True, use_sample_label_count=True, resample_count=None,
                                                 # bootstrap_sampling=False, use_sample_label_count=False, resample_count=None,
                                                 bootstrap_sampling=True, use_sample_label_count=False, resample_count=None,
                                                 # bootstrap_sampling=False, use_sample_label_count=True, resample_count=None,  # does not make sense
                                                 # resample_count=20,
                                                 # loggamma_tau=1e-6,
                                                 split_selection="gini"
                                                 )
        with Timer("Training took %.03f seconds"):
            rf.fit(train_x, train_y)
        print("The random forest has %d nodes." % rf.num_nodes())

    # A freshly loaded forest does not need to be written back to disk.
    if save and not load:
        print("Saving random forest to file %s." % filename)
        with open(filename, "w") as f:
            f.write(rf.to_string())

    if predict:
        print("Predicting on a test set with the random forest.")
        with Timer("Random forest prediction took %.03f seconds."):
            pred, split_counts = rf.predict(test_x, return_split_counts=True)
        split_counts /= float(len(pred))  # average splits per instance
        count = sum(1 for t, q in zip(test_y, pred) if t == q)
        print("%d of %d correct (%.03f%%), used %.02f splits per instance" % (count, len(pred), (100.0*count)/len(pred), split_counts))

    if refine:
        print("Refining the random forest using forest garrote.")
        with Timer("Refining took %.03f seconds."):
            refined_rf = forest_garrote(rf, train_x, train_y, group_size=group_size)
            # refined_rf = global_refinement(rf, train_x, train_y)
        print("The refined forest has %d nodes." % refined_rf.num_nodes())

        if save:
            f0, f1 = os.path.split(filename)
            refined_filename = os.path.join(f0, "refined_" + f1)
            print("Saving refined random forest to file %s." % refined_filename)
            with open(refined_filename, "w") as f:
                f.write(refined_rf.to_string())

        if predict:
            print("Predicting on a test set with the forest garrote.")
            with Timer("Forest garrote prediction took %.03f seconds."):
                pred, split_counts = refined_rf.predict(test_x, return_split_counts=True)
            split_counts /= float(len(pred))  # average splits per instance
            count = sum(1 for t, q in zip(test_y, pred) if t == q)
            print("%d of %d correct (%.03f%%), used %.02f splits per instance" % (count, len(pred), (100.0*count)/len(pred), split_counts))