def test_DT(self):
        records, attributes = load_data("data/mushrooms_train.data")
        test_records = load_data("data/mushrooms_train.data")[0]  # note: reuses the training file; a held-out mushrooms_test.data split would be the usual choice
        #print(records, attributes)
        RF = RandomForest(tree_num=10)

        RF.train(records, attributes)
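
This test only checks that training completes. A minimal accuracy check could follow it; the sketch below assumes RandomForest exposes a per-record predict method and that the class label is the last field of each record (neither is confirmed by the snippet):

        # Sketch only: predict() and the label position are assumptions.
        correct = sum(1 for rec in test_records if RF.predict(rec) == rec[-1])
        print("Accuracy:", correct / len(test_records))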
Example No. 2
def predict_test_data():
    forest = RandomForest(num_trees = 250, max_depth = 7, categorical_vars = cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))

    guesses = []
    for i in range(TEST_SIZE):
        point = testing_data[i]
        guess = forest.predict(point)  # was tree.predict, but no tree is defined in this scope
        guesses.append(int(guess))

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        for i, g in enumerate(guesses, start=1):
            writer.writerow([i, g])
Example No. 3
def get_frequent_splits():
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)
    lst = forest.most_frequent_first_splits()
    for item in lst:
        word = ' < '
        split, frequency = item
        feature, value = split
        name = feature_names[feature]
        print(name + word + str(value) + ' (' + str(frequency) + ' trees)')
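
most_frequent_first_splits is not shown in these examples. A plausible sketch, assuming each tree stores its root split as a (feature, value) tuple (self.trees and tree.root_split are hypothetical names):

from collections import Counter

def most_frequent_first_splits(self):
    # Tally each tree's root split and return (split, frequency) pairs,
    # most common first.
    return Counter(tree.root_split for tree in self.trees).most_common()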
def random_forests_classification(X, y, test_dat):
    classifier = RandomForest(20, round(math.sqrt(np.size(X, 1))), np.size(X, 0))
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    f = open("census_predictions_random_forest.csv", 'w')
    f.write("Id,Category\n")
    for i in range(np.size(test_dat, 0)):
        f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    f.close()
    print("DONE")
def get_frequent_splits():
    forest = RandomForest(num_trees = 100, max_depth = 2, categorical_vars = cat_set)
    forest.train(training_data, training_labels)
    lst = forest.most_frequent_first_splits()
    for item in lst:
        word = ' < '
        split, frequency = item
        feature, value = split
        if feature in cat_set:
            value = inverse_list[feature - CONTINUOUS_FEATURES][value]
            word = ' is '
        name = feature_names[feature]
        print(name + word + str(value) + ' (' + str(frequency) + ' trees)')
Example No. 6
def train(rf):
    '''
    Trains a random forest on all of the data
    '''
    theData = generateTrainData()
    testForest = RandomForest(theData)
    print("Training")
    testForest.train()
    print("Done!")

    with open(rf, 'wb') as f:
        cPickle.dump(testForest, f)
        print('randomForest model saved to: ' + rf)
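
Reloading the model saved above is symmetric; a minimal sketch using the same cPickle module:

with open(rf, 'rb') as f:
    loadedForest = cPickle.load(f)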
Example No. 7
def classify_with_random_forest():
    forest = RandomForest(num_trees = 250, max_depth = 7, categorical_vars = cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))
def graph_accuracy():
    accuracy = []
    num_trees = []
    for j in range(5, 41, 5):
        forest = RandomForest(num_trees = j, max_depth = 10, categorical_vars = cat_set)
        forest.train(training_data, training_labels)
        num_right = 0
        for i in range(num_validation_points):
            prediction = forest.predict(validation_data[i])
            if prediction == validation_labels[i]:
                num_right += 1
        accuracy.append(num_right / num_validation_points)
        num_trees.append(j)
        print(j)
        sys.stdout.flush()
    plt.figure()
    plt.plot(num_trees, accuracy)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
Example No. 9
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total
    joint position dataset
    '''
    theData = generateAllPositionTrainingData()
    means, stdDevs = theData.normalizeData()
    k = 10

    #Partition the data into 10 subsets
    dataSets = theData.getKSegments(k)

    #For each of the 10 subsets leave one out, train on the
    # other 9, test on the one left out, print the accuracy.
    results = confusionMatrix(labels)
    for i in xrange(k):
        print i
        #testing set
        testSet = dataSets[i]
        #Build the training set
        trainingSet = TrainingData("CrossVal")
        trainingList = copy.deepcopy(dataSets)
        trainingList.pop(i)
        for elem in trainingList:
            trainingSet.combineWithNewData(elem)

        #train the classifier on the trainingSet
        testForest = RandomForest(trainingSet)
        testForest.train()

        #Evaluate the classifier on the test set

        for samp in testSet.getData():
            resultLabel = testForest.classify(samp)
            trueLabel = samp.getLabel()

            results.update(trueLabel, resultLabel)

    results.printMatrix()
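
getKSegments belongs to the surrounding codebase and is not shown. A sketch of such a k-way partition, assuming the data is a flat list of samples (names hypothetical):

def getKSegments(samples, k):
    # Deal samples into k roughly equal folds, round-robin.
    folds = [[] for _ in range(k)]
    for idx, samp in enumerate(samples):
        folds[idx % k].append(samp)
    return folds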
Example No. 11
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2
    and tests it on participant 3. The data used here
    uses the angle features
    '''
    theData = generateTwoAngleTrainingData()
    testForest = RandomForest(theData)
    print "Training"
    testForest.train()
    print "Done!"

    testList = generateOneTestAngleData()

    results = confusionMatrix(labels)

    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()

        results.update(trueLabel, resultLabel)

    results.printMatrix()
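
The confusionMatrix object only needs update() and printMatrix() for these examples. A minimal sketch of such a class, assuming labels is the list of class names (an illustration, not the project's actual implementation):

class confusionMatrix(object):
    def __init__(self, labels):
        # Rows are true labels, columns are predicted labels.
        self.labels = labels
        self.counts = {t: {p: 0 for p in labels} for t in labels}

    def update(self, trueLabel, resultLabel):
        self.counts[trueLabel][resultLabel] += 1

    def printMatrix(self):
        for t in self.labels:
            row = [self.counts[t][p] for p in self.labels]
            print("%s %s" % (t, row))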
Example No. 13
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1
    and tests it on participants 2 and 3. The data used here
    uses the position features
    '''
    theData = generateOneTrainPositionData()
    means, stdDevs = theData.normalizeData()
    testForest = RandomForest(theData)
    print "Training"
    testForest.train()
    print "Done!"

    testList = generateTwoTestPositionData(means, stdDevs)

    results = confusionMatrix(labels)

    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()

        results.update(trueLabel, resultLabel)

    results.printMatrix()
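
One detail worth noting: the test data is normalized with the means and standard deviations computed on the training data, not its own, so no test statistics leak into the model. A sketch of that pattern with NumPy (the real helpers live in the surrounding codebase):

import numpy as np

def normalize_with_train_stats(train, test):
    # Standardize both sets using statistics from the training set only.
    means = train.mean(axis=0)
    stdDevs = train.std(axis=0)
    return (train - means) / stdDevs, (test - means) / stdDevs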
def main():
    argument_parser = ArgumentParser(
        description="Script to run the RandomForest program.", add_help=False)
    mutually_exclusive_group = argument_parser.add_mutually_exclusive_group()
    mutually_exclusive_group.add_argument(
        '--use_gini',
        action='store_true',
        help="Use the Gini index for attribute splitting in the decision trees."
    )
    mutually_exclusive_group.add_argument(
        '--use_entropy',
        action='store_true',
        help="Use entropy for attribute splitting in the decision trees.")
    mutually_exclusive_group.add_argument(
        '--use_variance',
        action='store_true',
        help="Use entropy for attribute splitting in the decision trees.")
    mutually_exclusive_group2 = argument_parser.add_mutually_exclusive_group()
    mutually_exclusive_group2.add_argument(
        '--use_hockey_preprocessor',
        action='store_true',
        help=
        "Use hockey dataset preprocessing logic on the given dataset. (default)"
    )
    mutually_exclusive_group2.add_argument(
        '--use_custom_preprocessor',
        help=
        "Use custom dataset preprocessing logic on the given dataset.  Where USE_CUSTOM_PREPROCESSOR is the"
        "filename of the preprocessor file in the preprocessors directory to use, e.g. TemplateDataSetPreprocessor."
    )
    argument_parser.add_argument('-d',
                                 '--data_file',
                                 required=True,
                                 help="File containing the dataset.")
    argument_parser.add_argument(
        '-t',
        '--number_of_trees',
        type=int,
        default=4,
        help="The number of trees to create for the random forest.")
    argument_parser.add_argument(
        '-m',
        '--max_depth',
        type=int,
        help=
        "The maximum depth of all trees in the random forest.  (default: None)."
    )
    argument_parser.add_argument(
        '-s',
        '--min_split_size',
        type=int,
        default=1,
        help=
        "The threshold number of samples required at a node to stop further splitting.  (default: 1)."
    )
    argument_parser.add_argument(
        '-f',
        '--n_features',
        type=int,
        help=
        "The number of features to use when building each tree in the random forest.  Specifying None will use all"
        " the features (default: None).")
    argument_parser.add_argument('-c',
                                 '--target_label',
                                 type=str,
                                 required=True,
                                 help="Target label that we want to predict.")
    argument_parser.add_argument(
        '-k',
        '--sklearn_rf',
        action='store_true',
        help='Train and test dataset on SKlearn Random Forest')
    argument_parser.add_argument(
        '-w',
        '--number_of_workers',
        type=int,
        help=
        "The number of workers to spawn during training of the random forest.  Specifying None will disable this"
        "feature. (default: None).")
    argument_parser.add_argument(
        '-o',
        '--output_file',
        help=
        "Output file of the results.  If the file exists already, new entries will be appended to the end. (default: None)."
    )
    argument_parser.add_argument('-h',
                                 '--help',
                                 action='help',
                                 help="Show this message and exit.")
    arguments = argument_parser.parse_args()

    dataset_file = arguments.data_file
    output_file = arguments.output_file

    preprocessor = None

    if arguments.use_custom_preprocessor:
        preprocessor = import_module("preprocessors." +
                                     arguments.use_custom_preprocessor)
    else:
        preprocessor = HockeyPP

    #select class name
    class_name = arguments.target_label

    #select splitting cost function
    split_function = 'gini'

    if arguments.use_entropy:
        split_function = 'entropy'

    elif arguments.use_variance:
        split_function = 'variance'

    #Test regression with 'sum_7yr_GP'
    train_data, test_data = preprocessor.process(dataset_file, class_name)

    random_forest = RandomForest(arguments.number_of_trees,
                                 arguments.max_depth, arguments.min_split_size,
                                 arguments.n_features,
                                 arguments.number_of_workers, split_function)

    t0 = datetime.now()
    random_forest.train(train_data, class_name)
    diff = datetime.now() - t0
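    # divmod converts the elapsed seconds into (minutes, seconds) for the report below.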
    t = divmod(diff.days * 86400 + diff.seconds, 60)
    train_results = random_forest.bagging_predict(train_data)
    t0 = datetime.now()
    test_results = random_forest.bagging_predict(test_data)
    diff = datetime.now() - t0
    tp = divmod(diff.days * 86400 + diff.seconds, 60)

    if arguments.use_variance:
        train_accuracy = random_forest.mse(train_results, train_data[:, -1])
        print("\nTrain Mean squared error: {}".format(train_accuracy))

        test_accuracy = random_forest.mse(test_results, test_data[:, -1])
        print("Test Mean squared error: {}\n".format(test_accuracy))
    else:
        train_accuracy = random_forest.evaluate(train_results, train_data[:, -1])
        print("\nTrain Percent Correct: {}".format(train_accuracy))

        test_accuracy = random_forest.evaluate(test_results, test_data[:, -1])
        print("Test Percent Correct: {}\n".format(test_accuracy))

    print("\nTime for train: {}min {}sec".format(t[0], t[1]))
    print("Time for prediction: {}min {}sec\n".format(tp[0], tp[1]))

    if arguments.sklearn_rf:
        sk_rf = Sklearn_RF(arguments.number_of_trees, arguments.max_depth,
                           arguments.min_split_size, arguments.n_features)

        # TODO: Need a sklearn_regression tree as well
        sk_rf.train(train_data, class_name)

        accuracy_sk = sk_rf.evaluate(
            test_data,
            tree_type='regressor' if arguments.use_variance else 'classifier')

        if arguments.use_variance:
            print('{}{}'.format('sklearn rf MSE: ', accuracy_sk))
        else:
            print('{}{}'.format('sklearn rf Percent correct: ',
                                accuracy_sk * 100))

    # Write out the results to a file, if one is specified, for downstream processing.
    if output_file:
        creating_new_file = True
        if os.path.isfile(output_file):
            creating_new_file = False

        headers = [
            "Features", "MaxDepth", "MinSplitThreshold", "Trees",
            "SplitCriteria", "Target", "TrainAccuracy", "TestAccuracy"
        ]

        with open(output_file, "a") as csv_file:
            writer = csv.DictWriter(csv_file,
                                    fieldnames=headers,
                                    lineterminator="\n")
            if creating_new_file:
                print("Creating a new file {}!\n".format(output_file))
                writer.writeheader()
            else:
                print("Appending to the file {}!\n".format(output_file))
            writer.writerow({
                "Features": arguments.n_features if arguments.n_features else "ALL",
                "MaxDepth": arguments.max_depth if arguments.max_depth else "NOLIMIT",
                "MinSplitThreshold": arguments.min_split_size,
                "Trees": arguments.number_of_trees,
                "SplitCriteria": split_function,
                "Target": class_name,
                "TrainAccuracy": train_accuracy,
                "TestAccuracy": test_accuracy
            })
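
A typical invocation of this script, using the flags defined by the parser above (the script and data file names are hypothetical):

python run_random_forest.py -d data/hockey.csv -c sum_7yr_GP -t 20 -m 10 --use_variance -o results.csv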
Example No. 16
parser = argparse.ArgumentParser()
parser.add_argument('-dataset', nargs=1, type=str, required=True)
parser.add_argument('-target', nargs=1, type=str, required=True)
parser.add_argument('-alias', nargs=1, type=str, required=True)
arguments = parser.parse_args()

newDF = readCSV(arguments.dataset[0], arguments.target[0])

nTrees = [50]
k = 8
m = floor(sqrt(newDF.shape[1]))

for n in nTrees:
    print("-----------------")
    print("Number of trees: " + str(n))
    newDF = readCSV(arguments.dataset[0], arguments.target[0])

    folds = generate_kfolds(newDF, arguments.target[0], k)

    for i in range(k):
        trainList = [x for j, x in enumerate(folds) if j != i]
        trainDF = pd.concat(trainList)
        testDF = folds[i]

        randForest = RandomForest()
        randForest.train(trainDF, arguments.target[0], n, m, False)
        newStr = randForest.eval(testDF)
        with open("outputResults/" + str(n) + arguments.alias[0] + ".txt", "a") as outputFile:
            outputFile.write(newStr)
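
generate_kfolds is not shown; a simple non-stratified sketch with pandas and NumPy, assuming it returns a list of k DataFrames (the target argument would matter for a stratified version):

import numpy as np

def generate_kfolds(df, target, k):
    # Shuffle the row order, then slice the frame into k roughly equal folds.
    shuffled = df.sample(frac=1).reset_index(drop=True)
    return [shuffled.iloc[idx]
            for idx in np.array_split(np.arange(len(shuffled)), k)]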
Example No. 17
import numpy as np
from sklearn.datasets import load_iris

# 'iris' and 'np' were undefined in this excerpt; sklearn's iris dataset is
# presumably the source.
iris = load_iris()
X = iris.data
y = iris.target

ratio_train_test = 0.85

num_samples, num_features = X.shape
idx = np.random.permutation(range(num_samples))
num_samples_train = int(num_samples * ratio_train_test)
idx_train = idx[:num_samples_train]
idx_test = idx[num_samples_train:]
X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

# HYPER PARAMETERS
max_depth = 7
min_split_size = 5
ratio_samples = 0.2
num_trees = 30
num_features_node = int(np.sqrt(num_features))
coefficient = 'gini'
percentile = 90
values = None
min_std_deviation = 0

rf = RandomForest(max_depth, min_split_size, ratio_samples, num_trees,
                  num_features_node, coefficient, percentile, values,
                  min_std_deviation)
rf.train(X_train, y_train)
rf.predict(X_test, y_test)
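
For comparison, the manual permutation split above is what scikit-learn's train_test_split helper does:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=ratio_train_test)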
def test10Fold():
    global allWords
    splits = tenFoldCrossValidation()

    # All five classifiers share the same train/test interface, so evaluate
    # them with one loop instead of five copies of the same block.
    classifiers = [
        ("Naive Bayes", lambda: naiveBayes()),
        ("Random Forest", lambda: RandomForest(100)),
        ("Neural 5", lambda: neuralNetwork((5, ), 1000)),
        ("Neural 3", lambda: neuralNetwork((3, ), 1000)),
        ("SVM", lambda: svm()),
    ]

    for name, makeClassifier in classifiers:
        count = 0
        total = 0
        print(name)
        for split in splits:
            clf = makeClassifier()
            trainFeatures = [example.features for example in split.train]
            trainClasses = [example.klass for example in split.train]
            testFeatures = [example.features for example in split.test]
            testClasses = [example.klass for example in split.test]

            clf.train(trainFeatures, trainClasses)
            clf.test(testFeatures, testClasses)
            accuracy = clf.getCorrectCount() / len(testClasses)
            total = total + accuracy
            print("[INFO]\tFold ", str(count), " Accuracy:", str(accuracy))
            count = count + 1

        print("[INFO]\tAccuracy:", str(total / 10))
Example No. 19
from Dataset import Dataset
from RandomForest import RandomForest
import ClassifierStats as stats

dataset_path = 'synthetic.social'
train = Dataset.from_file('../data/{}.train'.format(dataset_path))
test = Dataset.from_file('../data/{}.test'.format(dataset_path))
model = RandomForest(num_trees=100, max_depth=100, bagging_data_fraction=0.4)
model.train(train)
predictions = model.classify(test)
accuracy = stats.accuracy(test.labels, predictions)
print(accuracy)
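
ClassifierStats.accuracy is presumably just the fraction of matching labels; an equivalent one-liner with NumPy (a sketch, not the module's actual code):

import numpy as np

def accuracy(labels, predictions):
    # Fraction of positions where the prediction equals the true label.
    return float(np.mean(np.asarray(labels) == np.asarray(predictions)))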
Example No. 20
from DecisionTree import DecisionTree
from RandomForest import RandomForest
import pandas as pd


def getData():
    data_train = pd.read_csv("./hw6_train.dat", sep=" ",
                             header=None).rename(columns={10: "y"})
    data_test = pd.read_csv("./hw6_test.dat", sep=" ",
                            header=None).rename(columns={10: "y"})
    return data_train, data_test


if __name__ == "__main__":
    train_set, test_set = getData()
    RF = RandomForest(train=train_set, test=test_set)
    RF.train(n_tree=2000, get_oob=True)
    # RF.predict(mode="train")
    # RF.predict(mode="test")
    # DT = DecisionTree(train=train_set, test=test_set)
    # DT.train()
    # DT.predict(mode="test")
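
The get_oob=True flag suggests the forest reports out-of-bag error: each sample is scored only by trees whose bootstrap sample excluded it. A sketch of the idea, assuming each tree remembers its bootstrap indices and offers a single-sample predict (both hypothetical names):

import numpy as np

def oob_error(trees, X, y):
    # Collect votes for each sample from trees that never saw it in training,
    # then compare the majority vote with the true label.
    votes = [[] for _ in range(len(y))]
    for tree in trees:
        oob = np.setdiff1d(np.arange(len(y)), tree.bootstrap_indices)  # assumed attribute
        for i in oob:
            votes[i].append(tree.predict_one(X[i]))  # assumed method
    wrong = sum(1 for i, v in enumerate(votes)
                if v and max(set(v), key=v.count) != y[i])
    return wrong / float(len(y))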