import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold


def test_model(n_trees, n_folds=5):
    # n_trees must precede n_folds: parameters without defaults come first.
    # features, labels and n_features are assumed to be module-level globals.
    kf = KFold(n_splits=n_folds)

    # Reference forest (scikit-learn is an assumption; the original leaves rf undefined)
    # compared against the from-scratch model.
    rf = RandomForestRegressor(n_estimators=n_trees)
    model = RandomForest()
    model.n_features = n_features

    accuracies = []

    for train_index, test_index in kf.split(features):
        train_features, test_features = features[train_index], features[test_index]
        train_labels, test_labels = labels[train_index], labels[test_index]

        rf.fit(train_features, train_labels)
        model.train_set = train_features
        model.test_set = test_features  # was test_labels; predictions are made from features

        rf_predictions = rf.predict(test_features)
        model_prediction = model.run()

        # Accuracy as 100 minus the mean absolute percentage error.
        errors = np.abs(rf_predictions - test_labels)  # was the undefined name `predictions`
        mape = 100 * (errors / test_labels)
        accuracy = 100 - np.mean(mape)
        accuracies.append(accuracy)
        print('Accuracy:', round(accuracy, 2), '%.')

    return accuracies
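The accuracy above is 100 minus the mean absolute percentage error (MAPE), which breaks down when a test label is zero. A self-contained sketch of the same metric with a guard for zero labels (the helper name is illustrative):

import numpy as np

def mape_accuracy(predictions, test_labels):
    # Return 100 - MAPE, skipping zero-valued labels to avoid division by zero.
    predictions = np.asarray(predictions, dtype=float)
    test_labels = np.asarray(test_labels, dtype=float)
    nonzero = test_labels != 0
    errors = np.abs(predictions[nonzero] - test_labels[nonzero])
    return 100 - 100 * np.mean(errors / test_labels[nonzero])

print(mape_accuracy([95, 105, 110], [100, 100, 100]))  # ~93.33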
Example #2
File: main.py  Project: tincho4t/aaTP
def run_kfold(method, kf, X, y, text, transformer=None):
    accuracy = 0
    fold = 0
    print("Running " + str(text))
    for train_index, test_index in kf:
        print("Starting fold " + str(fold))
        fold += 1
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        if transformer is not None:
            t = transformer.fit(X_train)
            X_train = t.transform(X_train)
            X_test = t.transform(X_test)
        if method == "rf":
            clf = RandomForest(X_train, y_train, n_estimators=1000)
            clf.fit()
        elif method == "lr":
            clf = linear_model.RidgeClassifier(alpha=2)
            clf.fit(X_train, y_train)
        elif method == "ex":
            clf = ExtraTreesClassifier(n_estimators=2000)
            clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)
        accuracy += score(y_hat, y_test)
    return (accuracy * 1.0 / len(kf))
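Iterating `kf` directly and calling `len(kf)` relies on the pre-0.18 scikit-learn `cross_validation.KFold` API. Under the current `model_selection` API the splits come from `kf.split(X)` and the fold count from `kf.get_n_splits()`; a minimal runnable sketch:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)  # placeholder feature matrix
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    print("train:", train_index, "test:", test_index)
print("folds:", kf.get_n_splits())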
Example #3
def predict_test_data():
    forest = RandomForest(num_trees = 250, max_depth = 7, categorical_vars = cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))

    guesses = []
    for i in range(TEST_SIZE):
        point = testing_data[i]
        guess = forest.predict(point)  # was tree.predict, but no tree is defined here
        guesses.append(int(guess))

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        i = 1
        for g in guesses:
            writer.writerow([i, g])
            i += 1
Example #4
def main():
    use_feature_index = [2, 3]
    iris = datasets.load_iris()
    X = iris.data[:, use_feature_index]
    y = iris.target
    feature_names = np.array(iris.feature_names)[use_feature_index]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    n_estimators = 50

    rf = RandomForest(n_estimators=n_estimators, random_state=300)
    rf.fit(X_train, y_train)
    score_m = rf.score(X_test, y_test)

    # print the score
    print("-" * 50)
    print("score:" + str(score_m))

    # print the feature importances
    print("-" * 50)
    f_importance_m = rf.feature_importances

    print("feature importances:")
    for f_name, f_importance in zip(feature_names, f_importance_m):
        print("    ", f_name, ":", f_importance)

    # plot the decision regions
    plt = PlotResult(rf, X_train, y_train, X_test, y_test, feature_names,
                     "my_random_forest")
    plt.plot_result()
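For comparison, the scikit-learn counterpart of the custom forest above uses the same split and exposes the importances as `feature_importances_` (a sketch; only the estimator class differs):

import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

rf = RandomForestClassifier(n_estimators=50, random_state=300)
rf.fit(X_train, y_train)
print("score:", rf.score(X_test, y_test))
for f_name, f_importance in zip(np.array(iris.feature_names)[[2, 3]], rf.feature_importances_):
    print("    ", f_name, ":", f_importance)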
Example #5
def main():
    """
    Process the input arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t',
        '--training',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
        "mushrooms_train.data")
    parser.add_argument(
        '-e',
        '--testing',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
        "mushrooms_test.data")
    parser.add_argument('-d', '--max_depth', default=10)
    parser.add_argument('-n', '--tree_nums', default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(int(args.tree_nums))  # tree_nums arrives as a string when passed on the command line
    test_model(model, args.training, args.testing)
    def test_DT(self):
        records, attributes = load_data("data/mushrooms_train.data")
        test_records = load_data("data/mushrooms_train.data")[0]
        #print(records, attributes)
        RF = RandomForest(tree_num=10)

        RF.train(records, attributes)
def main():
    '''
    Main classification driver. Read in data files, classify the sensor data
    they contain, and evaluate the performance of the classifier
    '''

    if len(sys.argv) < 2:
        print 'USAGE: ActivityClassifier.py (path to data file)'
        sys.exit(1)

    (sample_set, id_to_cat) = load_sample_data(sys.argv[1])
    # TESTING: Reduce the number of samples
    # sample_set = filter_sample_set(sample_set)

    # Divide the samples into two, training and test. Keep 25% for test
    np.random.shuffle(sample_set)
    test_sample_count = sample_set.shape[0] / 4
    test_samples = sample_set[:test_sample_count]
    training_samples = sample_set[test_sample_count:]

    random_forest = RandomForest(training_samples, 9)
    print random_forest

    confusion_matrix = ConfusionMatrix(id_to_cat.values())

    for sample in test_samples:
        classified_category = random_forest.classify_activity(sample)
        confusion_matrix.add_result(int(sample[-1]), classified_category)

    print confusion_matrix
    confusion_matrix.report_stats()
Example #8
def randomForestTest(feature_len, all_lines, all_features, all_labels):
    best_trees_num = 0
    temp = 0
    counts = {}
    for i in range(10):
        rate = 0
        print("Test %d:" % (i + 1))
        train_features = all_features[0:int(0.8 * len(all_features))]
        train_labels = all_labels[0:int(0.8 * len(all_features))]
        test_features = all_features[int(0.8 * len(all_features)):]
        test_labels = all_labels[int(0.8 * len(all_features)):]
        for trees_num in range(25, 36):
            rate = 0
            if trees_num not in counts:
                counts[trees_num] = 0
            print("trees_num:%d " % (trees_num), end=" ")
            new_forest = RandomForest(trees_num)
            new_forest.buildTrees(train_features, train_labels,
                                  len(train_features[0]), 3, 6)
            length = len(test_labels)
            for j in range(0, length):
                res = new_forest.predictForest(test_features[j])
                if res == test_labels[j]:
                    rate += 1
            print(rate / length)
            counts[trees_num] += rate / length
            if temp < counts[trees_num]:
                temp = counts[trees_num]
                best_trees_num = trees_num
        all_features, all_labels = now_provider.getFeatureAndLabel(
            all_lines, feature_len)
    print("Best trees_num:%d %f" %
          (best_trees_num, counts[best_trees_num] / 10))
    for x in counts:
        print(x, counts[x])
    def processData(self, modelName, gender, pClass, siblings, embarked):
        # loading the dataset
        df = pd.read_csv('train.csv', sep=',')

        # dropping the PassengerId column
        df = df.drop('PassengerId', axis=1)

        # changing strings to numeric values
        df["Sex"].replace({"male": 0, "female": 1}, inplace=True)
        df["Embarked"].replace({"S": 0, "C": 1, "Q": 2}, inplace=True)

        # filling in empty values
        df["Embarked"].fillna(df["Embarked"].mean(), inplace=True)

        # separating inputs and outputs
        x = df.drop('Survived', axis=1)
        y = df['Survived']

        model = None

        if modelName == 'Decision Tree':
            model = DecisionTree(df)
        elif modelName == 'Naive Bayes':
            model = NaiveBayes(df)
        elif modelName == 'Neural Network':
            model = NeuralNetwork(df)
        elif modelName == 'Random Forest':
            model = RandomForest(df)
        else:
            model = SupportVector(df)
        return model
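One caveat in the snippet above: `Embarked` is an integer-coded categorical, so filling missing values with the column mean can produce a fractional code. A mode-based fill keeps the codes discrete (a runnable sketch on a toy column):

import numpy as np
import pandas as pd

df = pd.DataFrame({"Embarked": [0, 1, 0, 2, np.nan, 0]})
# The most frequent code (the mode) stays inside {0, 1, 2}; the mean would not.
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)
print(df["Embarked"].tolist())  # [0.0, 1.0, 0.0, 2.0, 0.0, 0.0]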
Example #10
    def get_classifier_object(self):
        # if self.classifier_name == 'LogReg':
        #     self.clf = LogReg(self.x_train, self.y_train, self.x_test, self.y_test)
        #     self.clf.train()
        #     self.y_pred = self.clf.predict()
        # elif self.classifier_name == 'DeciTree':
        #     self.clf = DecisionTree(self.x_train, self.y_train, self.x_test, self.y_test)
        #     self.clf.train()
        #     self.y_pred = self.clf.predict()
        # elif self.classifier_name == 'svm':
        #     self.clf = SVM(self.x_train, self.y_train, self.x_test, self.y_test)
        #     self.clf.train()
        #     self.y_pred = self.clf.predict()
        if self.classifier_name == 'RForest':
            self.clf = RandomForest(self.x_train, self.y_train, self.x_test,
                                    self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()
        elif self.classifier_name == 'XGB':
            self.clf = XGBoost(self.x_train, self.y_train, self.x_test,
                               self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()
        elif self.classifier_name == 'NaiveBayes':
            self.clf = NaiveBayes(self.x_train, self.y_train, self.x_test,
                                  self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()
        elif self.classifier_name == 'AdaBoost':
            self.clf = AdaBoost(self.x_train, self.y_train, self.x_test,
                                self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()

        return self.clf.get_classifier()
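Each branch above differs only in the wrapper class it instantiates, so a dictionary dispatch expresses the same logic in one path. A hypothetical refactor sketch, assuming the wrapper classes share the constructor signature shown above:

# Hypothetical; RandomForest, XGBoost, NaiveBayes and AdaBoost come from the surrounding project.
CLASSIFIERS = {
    'RForest': RandomForest,
    'XGB': XGBoost,
    'NaiveBayes': NaiveBayes,
    'AdaBoost': AdaBoost,
}

def get_classifier_object(self):
    cls = CLASSIFIERS[self.classifier_name]
    self.clf = cls(self.x_train, self.y_train, self.x_test, self.y_test)
    self.clf.train()
    self.y_pred = self.clf.predict()
    return self.clf.get_classifier()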
Example #11
def run_model():
	
	# load data
	train_file = 'data/hw7_train.dat.txt'; test_file = 'data/hw7_test.dat.txt'
	data_train = pd.read_csv(train_file, sep = ' ', header = None, names=[0, 1, 'y'])
	data_test = pd.read_csv(test_file, sep = ' ', header = None, names=[0, 1, 'y'])
	X_train, Y_train = generate_data(train_file); X_test, Y_test = generate_data(test_file)
	
	# train model
	col_y = 'y'
	T = 30000; max_height = 1

	time_start = time.perf_counter()  # time.clock() was removed in Python 3.8
	RF_Prune = RandomForest()
	RF_Prune.construct_forest(data_train, col_y, size = T, max_height = max_height)

	print("Using %.3f seconds" % (time.clock() - time_start))

	# model accuracy
	print('\n--- Pruned Random forest model accuracy ---')

	Y_train_pred = [RF_Prune.predict(x) for x in np.array(X_train)]
	train_acc = np.sum(Y_train_pred == Y_train) / len(Y_train) * 100
	print('Model accuracy on the training set: %.2f %%' %train_acc)

	Y_test_pred = [RF_Prune.predict(x) for x in np.array(X_test)]
	test_acc = np.sum(Y_test_pred == Y_test) / len(Y_test) * 100
	print('Accuracy on the testing set: %.2f %%\n' %test_acc)
Example #12
    def _fit_small_pc(self, images, y):
        start_time = time.time()
        print("PCA RANDOM FOREST")
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        self.pca_randomForest_pca, self.pca_randomForest_norm, ds = self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)
        self.pca_randomForest = RandomForest(ds, y, n_estimators=2000)
        self.pca_randomForest.fit()
        print("COMPLETE PCA RANDOM FOREST --- %s seconds ---" % (time.time() - start_time))
    def __init__(self, S, p, k, weights, name='BRAF'):
        """
        :param S: Specify the size of the Biased Random Forest ensemble
        :param p: Specify the ratio between R1 and R2
        :param k: Specify the k nearest neighbours for the minority class
        :param weights: Specify the tree weights passed to both forests
        :param name: Specify the model name
        """
        self.S = S
        self.p = p
        self.k = k
        self.name = name
        self.weights = weights
        # Initialize the forests
        self.R1 = RandomForest('R1_Forest', self.weights, int(self.p * self.S),
                               True)
        self.R2 = RandomForest('R2_Forest', self.weights,
                               int((1 - self.p) * self.S), True)
Example #14
def run_randomforest(train_examples, train_labels, attributes, test_examples,
                     test_labels, n_trees):
    rforest = RandomForest(entropy, 2, n_trees, len(attributes))
    rforest.train_dataset(train_examples, attributes, train_labels)

    preds, error = rforest.test_dataset(test_examples, test_labels)

    return error
Example #15
def get_frequent_splits():
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)
    lst = forest.most_frequent_first_splits()
    for item in lst:
        word = ' < '
        split, frequency = item
        feature, value = split
        name = feature_names[feature]
        print(name + word + str(value) + ' (' + str(frequency) + ' trees)')
def random_forests_classification(X, y, test_dat):
    classifier = RandomForest(20, round(math.sqrt(np.size(X, 1))), np.size(X, 0))
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    f = open("census_predictions_random_forest.csv", 'w')
    f.write("Id,Category\n")
    for i in range(np.size(test_dat, 0)):
        f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    f.close()
    print("DONE")
Example #17
def main():
    plot_zipf()
    word2vec_explore()
    pickle_in = open("processed_text_list.pickle", "rb")
    processed_text_list = pickle.load(pickle_in)
    #preprocess_text(train_text)
    train_w2v_model(processed_text_list)

    #shuffle and partition dataset
    from sklearn.utils import shuffle
    data = pd.DataFrame({'text': processed_text_list, 'labels': polarity})
    data = shuffle(data)
    get_w2v_array(data[:400000])
    w2v_array = pickle.load(open('w2v_features.pickle', 'rb'))
    num_tweets = 400000  # number of tweets to consider
    w2v_array = w2v_array[:num_tweets]
    split_ratio = int(num_tweets * .8)

    w2v_train = w2v_array[:split_ratio]  # w2v averages for each tweet
    w2v_test = w2v_array[split_ratio:]

    data = shuffle(data)
    simple_train = data['text'][:split_ratio]  # preprocessed text
    simple_test = data['text'][split_ratio:]

    labels_list = data['labels'].tolist()[:num_tweets]
    train_labels = labels_list[:split_ratio]  # list of labels
    test_labels = labels_list[split_ratio:]

    # get_w2v_array(data=data)
    # pickle_in = open("w2v_features.pickle", "rb")
    # w2v_features = pickle.load(pickle_in)

    # naive_bayes = NaiveBayes(simple_train.tolist(), simple_test.tolist(), labels_list)
    # accuracy = naive_bayes.evaluate()
    # print("Naive Bayes accuracy: " + str(accuracy)) #.499

    # svm = SVM(simple_train, train_labels, simple_test, test_labels, 3000, .0000001)
    # accuracy = svm.predict()
    # print("SVM accuracy: " + str(accuracy)) #.744 with a=.0000001 and 3000 epochs

    random_forest = RandomForest(w2v_train,
                                 w2v_test,
                                 train_labels,
                                 test_labels,
                                 'sqrt',
                                 max_depth=25,
                                 min_leaf=2,
                                 n_trees=500,
                                 model_type='scikit')
    accuracy = random_forest.evaluate()
    print("Random Forest accuracy: " + str(accuracy))
Example #18
    def makeForests(
            self, size: int, class_label: str, split_ratio: float,
            attribute_choice_fn) -> Tuple[List[RandomForest], List[float]]:

        forests = []
        tests = []
        for (train_pack, test_pack) in self.packs:
            forest = RandomForest(size, train_pack, class_label, split_ratio,
                                  attribute_choice_fn)
            forests.append(forest)
            tests.append(forest.test(test_pack))

        return (forests, tests)
def best_params():
    acc_max = 0
    n_trees_max = 0
    n_trees_list = [i for i in range(2, 11)]
    for n_tree in n_trees_list:
        clf = RandomForest(n_trees=n_tree)
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_test)
        acc = accuracy(Y_test, predictions)
        if acc > acc_max:
            acc_max = acc
            n_trees_max = n_tree
    return (n_trees_max, acc_max)
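The sweep above keeps only the best setting; recording every (n_trees, accuracy) pair makes the whole curve inspectable afterwards. A self-contained sketch, using scikit-learn's RandomForestClassifier as a stand-in for the custom class:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=0)

results = []
for n_tree in range(2, 11):
    clf = RandomForestClassifier(n_estimators=n_tree, random_state=0)
    clf.fit(X_train, Y_train)
    results.append((n_tree, accuracy_score(Y_test, clf.predict(X_test))))

n_trees_max, acc_max = max(results, key=lambda pair: pair[1])
print(n_trees_max, acc_max)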
Example #20
def train(rf):
    '''
    Trains a random forest on the data from all data
    '''
    theData = generateTrainData()
    testForest = RandomForest(theData)
    print("Training")
    testForest.train()
    print("Done!")

    with open(rf, 'wb') as f:
        cPickle.dump(testForest, f)
        print('randomForest model saved to: ' + rf)
def get_frequent_splits():
    forest = RandomForest(num_trees = 100, max_depth = 2, categorical_vars = cat_set)
    forest.train(training_data, training_labels)
    lst = forest.most_frequent_first_splits()
    for item in lst:
        word = ' < '
        split, frequency = item
        feature, value = split
        if feature in cat_set:
            value = inverse_list[feature - CONTINUOUS_FEATURES][value]
            word = ' is '
        name = feature_names[feature]
        print(name + word + str(value) + ' (' + str(frequency) + ' trees)')
class CrossValidation():

    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost()
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        self.ds = dataset
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.accuracyDict = {}
        self.models = {}

    def run(self):
        for alg in self.classificationAlgorithms:
            results = alg.run(self.ds, self.X_train, self.X_test, self.y_train,
                              self.y_test)
            # results include the name of the algorithm and the fitted model
            self.appendToAccuracyDict(
                results[0], self.kFoldCrossValidation(results[0], results[1]))
            self.appendModel(results[0], results[1])

    def kFoldCrossValidation(self, algName, classifier):
        accuracies = cross_val_score(estimator=classifier,
                                     X=self.X_train,
                                     y=self.y_train,
                                     cv=300)
        accuracy = accuracies.mean()
        print(algName + ' accuracy:', accuracy * 100, '%')
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        # algName -> accuracy (as a percentage)
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        # algName -> fitted model
        self.models[algName] = model

    def getAccuracyDict(self):
        return self.accuracyDict

    def getModel(self, name):
        return self.models[name]
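`kFoldCrossValidation` delegates to scikit-learn's `cross_val_score`; a self-contained sketch of that call (`cv=5` here, since the `cv=300` above requires at least 300 samples):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
accuracies = cross_val_score(estimator=RandomForestClassifier(n_estimators=50, random_state=0),
                             X=X, y=y, cv=5)
print('accuracy:', accuracies.mean() * 100, '%')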
Example #23
def analyze_RF(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    arr = list(range(50, 150, 10))
    scores = []
    for e in arr:
        model = RandomForest(num_trees=e, num_features=3)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    fig, ax = plt.subplots()
    ax.plot(arr, scores)

    return scores, arr
Example #24
    def rfdef(self):
        print("RF Start")
        self.trainname.setText("RandomForest")
        file = self.trainfile.text()
        print(file)
        start = time.time()
        s = RandomForest()
        a = s.accuracy(file)

        end = time.time()
        t = (end - start)

        self.traintime.setText(str(round(t, 2)) + " (sec)")
        self.label_4.setText("Accuracy")
        self.trainstatus.setText(str(round(a, 3)))
        AccuracyStore.store('rf', a)
Example #25
def classify_with_random_forest():
    forest = RandomForest(num_trees = 250, max_depth = 7, categorical_vars = cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))
Example #26
def main():
    if (len(sys.argv) < 3):
        print("Usage: python3 %s <dataset-csv> <target-attr>" % sys.argv[0])
        exit(-1)

    datasetFile = sys.argv[1]
    targetAttr = sys.argv[2]
    separator = ','

    random.seed(0)
    np.random.seed(0)

    # Read dataset
    D = pd.read_csv(datasetFile, sep=separator)

    t0 = time.time()

    # tree = DecisionTree(D, targetAttr, D.nunique(), sqrt)
    # tree.render()
    forest = RandomForest(D, targetAttr, D.nunique(), 10, sqrt, False)
def main():
    """
    Process the input arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t', '--training', default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_train.data")
    parser.add_argument('-e', '--testing', default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_test.data")
    parser.add_argument('-d', '--max_depth', default=10)
    parser.add_argument('-n', '--tree_nums', default=20)
    args = parser.parse_args()

    if args.model == "0":
        print ("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print ("Testing Random Forest model")
        model = RandomForest(int(args.tree_nums))  # tree_nums arrives as a string when passed on the command line

    test_model(model, args.training, args.testing)
Example #28
def main():
    """
    Process the input arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t', '--training', default="data/mushrooms_train.data")
    parser.add_argument('-e', '--testing', default="data/mushrooms_test.data")
    parser.add_argument('-d', '--max_depth', default=10)
    parser.add_argument('-n', '--tree_nums', default=20)
    args = parser.parse_args()
    treeNum=int(args.tree_nums)
    if args.model == "0":
        print "Testing Decision Tree model"
        model = DecisionTree()
    else:   
        print "Testing Random Forest model"
        model = RandomForest(treeNum)
    
    test_model(model, args.training, args.testing)
def graph_accuracy():
    accuracy = []
    num_trees = []
    for j in range(5, 41, 5):
        forest = RandomForest(num_trees = j, max_depth = 10, categorical_vars = cat_set)
        forest.train(training_data, training_labels)
        num_right = 0
        for i in range(num_validation_points):
            prediction = forest.predict(validation_data[i])
            if prediction == validation_labels[i]:
                num_right += 1
        accuracy.append(num_right / num_validation_points)
        num_trees.append(j)
        print(j)
        sys.stdout.flush()
    plt.figure()
    plt.plot(num_trees, accuracy)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
Example #30
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total
    joint position dataset
    '''
    theData = generateAllPositionTrainingData()
    means, stdDevs = theData.normalizeData()
    k = 10

    #Partition the data into 10 subsets
    dataSets = theData.getKSegments(k)

    #For each of the 10 subsets leave one out, train on the
    # other 9, test on the one left out, print the accuracy.
    results = confusionMatrix(labels)
    for i in xrange(k):
        print i
        #testing set
        testSet = dataSets[i]
        #Build the training set
        trainingSet = TrainingData("CrossVal")
        trainingList = copy.deepcopy(dataSets)
        trainingList.pop(i)
        for elem in trainingList:
            trainingSet.combineWithNewData(elem)

        #train the classifier on the trainingSet
        testForest = RandomForest(trainingSet)
        testForest.train()

        #Evaluate the classifer on the test set

        for samp in testSet.getData():
            resultLabel = testForest.classify(samp)
            trueLabel = samp.getLabel()

            results.update(trueLabel, resultLabel)

    results.printMatrix()
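The partition-and-recombine pattern above, reduced to plain lists (a runnable sketch with toy data standing in for `TrainingData`):

data = list(range(10))
k = 5
folds = [data[i::k] for i in range(k)]  # partition into k round-robin subsets

for i in range(k):
    test_set = folds[i]  # leave one subset out
    train_set = [x for j, fold in enumerate(folds) if j != i for x in fold]
    print("fold", i, "test", test_set, "train size", len(train_set))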