def __init__(self, num_trees, num_features, impurity_criterion, prune=False):
     """
     Initialise the forest through RandomForest.__init__ and record the
     pruning flag.

               num_trees:  number of trees to create in the forest
            num_features:  the number of features to consider when choosing
                           the best split for each node of the decision trees
      impurity_criterion:  forwarded unchanged to RandomForest.__init__
                   prune:  stored on the instance as self.prune
                           (defaults to False)
     """
     RandomForest.__init__(self, num_trees, num_features, impurity_criterion)
     self.prune = prune
示例#2
0
def run_randomforest(train_examples, train_labels, attributes, test_examples,
                     test_labels, n_trees):
    """
    Train a RandomForest on the training examples and return its test error.

    The forest is built as RandomForest(entropy, 2, n_trees, len(attributes)),
    trained with train_dataset() and evaluated with test_dataset(). Only the
    second element of the (predictions, error) pair from test_dataset() is
    returned.
    """
    model = RandomForest(entropy, 2, n_trees, len(attributes))
    model.train_dataset(train_examples, attributes, train_labels)

    _predictions, test_error = model.test_dataset(test_examples, test_labels)
    return test_error
示例#3
0
def get_frequent_splits():
    """
    Train a 100-tree, depth-2 forest and print its most frequent first splits.

    Each entry returned by most_frequent_first_splits() is unpacked as
    ((feature, value), frequency) and printed as
    "<feature name> < <value> (<frequency> trees)".
    """
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)
    for (feature, value), frequency in forest.most_frequent_first_splits():
        label = feature_names[feature]
        print(label + ' < ' + str(value) + ' (' + str(frequency) + ' trees)')
def random_forests_classification(X, y, test_dat):
    """
    Train a RandomForest on (X, y) and write predictions for test_dat to
    "census_predictions_random_forest.csv".

    The forest is constructed with 20, round(sqrt(number of columns of X)) and
    the number of rows of X as positional arguments.
    NOTE(review): presumably (tree count, features per split, sample size) --
    confirm against the RandomForest signature.

    The CSV has an "Id,Category" header followed by one row per row of
    test_dat: a 1-based id and int(y_hat[i, 0]).
    """
    classifier = RandomForest(20, round(math.sqrt(np.size(X, 1))), np.size(X, 0))
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)

    # The context manager closes the file even if a write or the int()
    # conversion raises part-way through.
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
示例#5
0
def main():
    """
    Run the exploration steps, build word2vec features and evaluate a
    random forest on an 80/20 split of the first 400000 shuffled tweets.

    Reads "processed_text_list.pickle" and "w2v_features.pickle" from the
    working directory and prints the accuracy returned by
    RandomForest.evaluate().
    """
    plot_zipf()
    word2vec_explore()
    # Context managers make sure the pickle files are closed again.
    with open("processed_text_list.pickle", "rb") as pickle_in:
        processed_text_list = pickle.load(pickle_in)
    #preprocess_text(train_text)
    train_w2v_model(processed_text_list)

    #shuffle and partition dataset
    from sklearn.utils import shuffle
    data = pd.DataFrame({'text': processed_text_list, 'labels': polarity})
    data = shuffle(data)
    num_tweets = 400000  # number of tweets to consider
    get_w2v_array(data[:num_tweets])
    with open('w2v_features.pickle', 'rb') as features_in:
        w2v_array = pickle.load(features_in)
    w2v_array = w2v_array[:num_tweets]
    split_ratio = int(num_tweets * .8)

    w2v_train = w2v_array[:split_ratio]  # w2v averages for each tweet
    w2v_test = w2v_array[split_ratio:]

    # `data` must NOT be shuffled a second time here: the w2v features above
    # were computed from the current row order, so reshuffling would pair
    # every feature row with the label of a different tweet.
    # NOTE(review): assumes get_w2v_array writes w2v_features.pickle in the
    # row order of the frame it receives -- confirm.
    simple_train = data['text'][:split_ratio]  # preprocessed text
    simple_test = data['text'][split_ratio:]

    labels_list = data['labels'].tolist()[:num_tweets]
    train_labels = labels_list[:split_ratio]  # list of labels
    test_labels = labels_list[split_ratio:]

    # get_w2v_array(data=data)
    # pickle_in = open("w2v_features.pickle", "rb")
    # w2v_features = pickle.load(pickle_in)

    # naive_bayes = NaiveBayes(simple_train.tolist(), simple_test.tolist(), labels_list)
    # accuracy = naive_bayes.evaluate()
    # print("Naive Bayes accuracy: " + str(accuracy)) #.499

    # svm = SVM(simple_train, train_labels, simple_test, test_labels, 3000, .0000001)
    # accuracy = svm.predict()
    # print("SVM accuracy: " + str(accuracy)) #.744 with a=.0000001 and 3000 epochs

    random_forest = RandomForest(w2v_train,
                                 w2v_test,
                                 train_labels,
                                 test_labels,
                                 'sqrt',
                                 max_depth=25,
                                 min_leaf=2,
                                 n_trees=500,
                                 model_type='scikit')
    accuracy = random_forest.evaluate()
    print("Random Forest accuracy: " + str(accuracy))
示例#6
0
    def makeForests(
            self, size: int, class_label: str, split_ratio: float,
            attribute_choice_fn) -> Tuple[List[RandomForest], List[float]]:
        """
        Build one RandomForest per (train, test) pair in self.packs.

        Each forest is constructed from the train pack with the given size,
        class label, split ratio and attribute-choice function, then scored
        with forest.test() on the matching test pack.

        Returns a tuple (forests, test results) with entries in the order of
        self.packs.
        """
        built: list = []
        scores: list = []
        for pack in self.packs:
            train_pack, test_pack = pack
            model = RandomForest(size, train_pack, class_label, split_ratio,
                                 attribute_choice_fn)
            built.append(model)
            scores.append(model.test(test_pack))
        return built, scores
def best_params():
    """
    Try n_trees = 2..10 and return (best n_trees, best accuracy).

    Each candidate forest is fitted on (X_train, Y_train) and scored with
    accuracy(Y_test, predictions). Only a strictly better score replaces the
    current best, so ties keep the smaller tree count. Returns (0, 0) if no
    candidate scores above 0.
    """
    best_n = 0
    best_score = 0
    for candidate in range(2, 11):
        clf = RandomForest(n_trees=candidate)
        clf.fit(X_train, Y_train)
        predicted = clf.predict(X_test)
        score = accuracy(Y_test, predicted)
        if score > best_score:
            best_score, best_n = score, candidate
    return (best_n, best_score)
def get_frequent_splits():
    """
    Train a 100-tree, depth-2 forest with categorical variables and print its
    most frequent first splits.

    Features in cat_set are printed as "<name> is <decoded value>", where the
    value is decoded through inverse_list[feature - CONTINUOUS_FEATURES].
    All other features are printed as "<name> < <value>". Each line ends with
    " (<frequency> trees)".
    """
    forest = RandomForest(num_trees=100, max_depth=2, categorical_vars=cat_set)
    forest.train(training_data, training_labels)
    for (feature, value), frequency in forest.most_frequent_first_splits():
        if feature in cat_set:
            shown = inverse_list[feature - CONTINUOUS_FEATURES][value]
            relation = ' is '
        else:
            shown = value
            relation = ' < '
        label = feature_names[feature]
        print(label + relation + str(shown) + ' (' + str(frequency) + ' trees)')
示例#9
0
def train(rf):
    '''
    Train a random forest on the data from generateTrainData() and pickle
    the trained forest to the path given in rf.
    '''
    training_data = generateTrainData()
    forest = RandomForest(training_data)
    print("Training")
    forest.train()
    print("Done!")

    with open(rf, 'wb') as model_file:
        cPickle.dump(forest, model_file)
        print('randomForest model saved to: ' + rf)
示例#10
0
 def __init__(self,
              num_trees,
              num_features,
              impurity_criterion,
              prune=False):
     '''
     Initialise the forest through RandomForest.__init__ and record the
     pruning flag.

               num_trees:  number of trees to create in the forest
            num_features:  the number of features to consider when choosing
                           the best split for each node of the decision trees
      impurity_criterion:  forwarded unchanged to RandomForest.__init__
                   prune:  stored on the instance as self.prune
                           (defaults to False)
     '''
     RandomForest.__init__(self, num_trees, num_features,
                           impurity_criterion)
     self.prune = prune
示例#11
0
def analyze_RF(X, y):
    """
    Score RandomForest models with 50, 60, ..., 140 trees on one train/test
    split of (X, y) and plot score against tree count.

    Every model uses num_features=3. Returns (scores, tree counts) as two
    lists in matching order.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    tree_counts = list(range(50, 150, 10))
    scores = []
    for n_trees in tree_counts:
        forest = RandomForest(num_trees=n_trees, num_features=3)
        forest.fit(X_train, y_train)
        scores.append(forest.score(X_test, y_test))

    _fig, axes = plt.subplots()
    axes.plot(tree_counts, scores)

    return scores, tree_counts
    def processData(self, modelName, gender, pClass, siblings, embarked):
        """
        Load 'train.csv', encode it numerically and build the requested model.

        modelName selects the model class: 'Decision Tree', 'Naive Bayes',
        'Neural Network' or 'Random Forest'. Any other value falls back to
        SupportVector. The model is constructed from the whole prepared
        DataFrame and returned.

        gender, pClass, siblings and embarked are accepted but not read in
        this method.
        """
        # loading the dataset
        df = pd.read_csv('train.csv', sep=',')

        # dropping passenger ids
        df = df.drop('PassengerId', axis=1)

        # Changing strings to numeric values. The result is assigned back to
        # the column: calling replace()/fillna() with inplace=True on
        # df[col] works on a selected Series and does not reliably update df
        # (it never does under pandas copy-on-write).
        df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
        df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})

        # Filling empty values.
        # NOTE(review): this fills a categorical code with the column mean,
        # which may be fractional -- confirm that is intended.
        df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mean())

        if modelName == 'Decision Tree':
            model = DecisionTree(df)
        elif modelName == 'Naive Bayes':
            model = NaiveBayes(df)
        elif modelName == 'Neural Network':
            model = NeuralNetwork(df)
        elif modelName == 'Random Forest':
            model = RandomForest(df)
        else:
            model = SupportVector(df)
        return model
示例#13
0
def main():
    """
    Parse the command-line arguments and test the selected model.

    Options:
        -m/--model      "0" selects DecisionTree; any other value selects
                        RandomForest.
        -t/--training   path of the training data file.
        -e/--testing    path of the testing data file.
        -d/--max_depth  parsed as an int; not read again in this function.
        -n/--tree_nums  parsed as an int and passed to RandomForest.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t',
        '--training',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
        "mushrooms_train.data")
    parser.add_argument(
        '-e',
        '--testing',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
        "mushrooms_test.data")
    # type=int: without it a value given on the command line arrives as a
    # str while the default stays an int.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
示例#14
0
    def rfdef(self):
        """
        Run RandomForest().accuracy() on the file named in the trainfile
        widget and show the elapsed time and accuracy in the UI.

        The accuracy is also recorded with AccuracyStore.store('rf', ...).
        """
        print("RF Start")
        self.trainname.setText("RandomForest")
        train_path = self.trainfile.text()
        print(train_path)

        started = time.time()
        forest = RandomForest()
        score = forest.accuracy(train_path)
        elapsed = time.time() - started

        self.traintime.setText(str(round(elapsed, 2)) + " (sec)")
        self.label_4.setText("Accuracy")
        self.trainstatus.setText(str(round(score, 3)))
        AccuracyStore.store('rf', score)
示例#15
0
    def get_classifier_object(self):
        """
        Build, train and run the classifier named by self.classifier_name,
        then return self.clf.get_classifier().

        Handled names: 'RForest', 'XGB', 'NaiveBayes' and 'AdaBoost'
        ('LogReg', 'DeciTree' and 'svm' are not handled here). For a handled
        name the wrapper is stored in self.clf and its predictions in
        self.y_pred. For any other name self.clf is left as it was.
        """
        selected = self.classifier_name
        if selected == 'RForest':
            wrapper = RandomForest
        elif selected == 'XGB':
            wrapper = XGBoost
        elif selected == 'NaiveBayes':
            wrapper = NaiveBayes
        elif selected == 'AdaBoost':
            wrapper = AdaBoost
        else:
            wrapper = None

        if wrapper is not None:
            # Every wrapper takes the same four data splits.
            self.clf = wrapper(self.x_train, self.y_train, self.x_test,
                               self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()

        return self.clf.get_classifier()
示例#16
0
文件: Ensemble.py 项目: tincho4t/aaTP
 def _fit_small_pc(self, images, y):
     """
     Fit the PCA-based random forest on the given images and labels.

     The images are passed through getImagesWithGrayHistogramEqualized and
     getPcaFeatures(…, 150, Constants.IMAGES_SIZES). The first two results
     of getPcaFeatures are kept on the instance and the third is used to
     build and fit a RandomForest with n_estimators=2000. The elapsed time
     is printed.
     """
     started = time.time()
     print("PCA RANDOM FOREST")
     equalized = self.ip.getImagesWithGrayHistogramEqualized(images=images)
     pca, norm, features = self.ip.getPcaFeatures(equalized, 150, Constants.IMAGES_SIZES)
     self.pca_randomForest_pca = pca
     self.pca_randomForest_norm = norm
     self.pca_randomForest = RandomForest(features, y, n_estimators=2000)
     self.pca_randomForest.fit()
     elapsed = time.time() - started
     print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" %(elapsed))
 def __init__(self, S, p, k, weights, name='BRAF'):
     """
     Store the hyper-parameters and create the two sub-forests R1 and R2.

     :param S: Specify the size of the Biased Random Forest method
     :param p: Specify the ratio between R1 and R2; R1 is built with
               int(p * S) and R2 with int((1 - p) * S)
     :param k: Specify the KN Nearest Neighbours for minority class
     :param weights: stored on the instance and passed to both forests
     :param name: stored as self.name (defaults to 'BRAF')
     """
     self.S, self.p, self.k = S, p, k
     self.name = name
     self.weights = weights

     # Initialize the forests
     r1_size = int(self.p * self.S)
     self.R1 = RandomForest('R1_Forest', self.weights, r1_size, True)
     r2_size = int((1 - self.p) * self.S)
     self.R2 = RandomForest('R2_Forest', self.weights, r2_size, True)
示例#18
0
def test_model(n_folds=5, n_trees=None):
  """
  Run K-fold evaluation and print a MAPE-based accuracy for every fold.

  n_folds: number of KFold splits.
  n_trees: accepted but not read in the body. It has a default because a
           parameter without a default may not follow one that has a
           default (the previous signature was a SyntaxError).

  NOTE(review): `features`, `labels`, `n_features`, `rf` and `predictions`
  are not defined in this function and are presumably module-level names --
  confirm.
  """
  kf = KFold(n_splits=n_folds)
  kf.get_n_splits(features)
  model = RandomForest()
  model.n_features = n_features

  accuracies = []
  durations = []

  for train_index, test_index in kf.split(features):
    train_features, test_features = features[train_index], features[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    rf.fit(train_features, train_labels)
    model.train_set = train_features
    # NOTE(review): this assigns the labels, not test_features -- confirm.
    model.test_set = test_labels

    rf_predictions = rf.predict(test_features)
    model_prediction = model.run()

    # NOTE(review): `predictions` is neither rf_predictions nor
    # model_prediction from above -- confirm which one is meant.
    errors = abs(predictions - test_labels)
    mape = 100 * (errors / test_labels)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    rf_scratch = RandomForest()
示例#19
0
def predict_test_data():
    """
    Train a 25-tree, depth-25 forest, print its training and validation
    accuracy, and write its test-set predictions to 'spam_1.csv'.

    The CSV has an ['Id', 'Category'] header followed by one row per test
    point: a 0-based id and the prediction converted to int.
    """
    forest = RandomForest(num_trees=25, max_depth=25)
    forest.train(training_data, training_labels)

    train_hits = 0
    for idx in range(num_training_points):
        if forest.predict(training_data[idx]) == training_labels[idx]:
            train_hits += 1
    print("Training Accuracy: " + str(train_hits / num_training_points))

    validation_hits = 0
    for idx in range(num_validation_points):
        if forest.predict(validation_data[idx]) == validation_labels[idx]:
            validation_hits += 1
    print("Validation Accuracy: " + str(validation_hits / num_validation_points))

    guesses = [int(forest.predict(testdata[idx]))
               for idx in range(testdata.shape[0])]

    with open('spam_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        for row_id, guess in enumerate(guesses):
            writer.writerow([row_id, guess])
def graph_accuracy():
    """
    Plot validation accuracy against the number of trees (5, 10, ..., 40).

    For every tree count a depth-10 forest with categorical variables is
    trained and scored on the validation points. The tree count is printed
    and stdout flushed after each run.
    """
    tree_counts = []
    rates = []
    for n_trees in range(5, 41, 5):
        forest = RandomForest(num_trees=n_trees, max_depth=10, categorical_vars=cat_set)
        forest.train(training_data, training_labels)

        hits = 0
        for idx in range(num_validation_points):
            if forest.predict(validation_data[idx]) == validation_labels[idx]:
                hits += 1

        rates.append(hits / num_validation_points)
        tree_counts.append(n_trees)
        print(n_trees)
        sys.stdout.flush()

    plt.figure()
    plt.plot(tree_counts, rates)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
示例#21
0
def run_model():
	"""
	Build a forest of 30000 height-1 trees on the hw7 training data and print
	the construction time and the train/test accuracy.
	"""
	# load data
	train_file = 'data/hw7_train.dat.txt'
	test_file = 'data/hw7_test.dat.txt'
	data_train = pd.read_csv(train_file, sep=' ', header=None, names=[0, 1, 'y'])
	X_train, Y_train = generate_data(train_file)
	X_test, Y_test = generate_data(test_file)

	# train model
	col_y = 'y'
	T = 30000
	max_height = 1

	# time.clock() was removed in Python 3.8; perf_counter() is the
	# replacement for measuring elapsed time.
	time_start = time.perf_counter()
	RF_Prune = RandomForest()
	RF_Prune.construct_forest(data_train, col_y, size=T, max_height=max_height)

	print("Using %.3f seconds" % (time.perf_counter() - time_start))

	# model accuracy
	print('\n--- Pruned Random forest model accuarcy ---')

	# Both sides are converted to arrays so the comparison is element-wise
	# even when the labels are a plain list (list == list is a single bool).
	Y_train_pred = np.array([RF_Prune.predict(x) for x in np.array(X_train)])
	train_acc = np.sum(Y_train_pred == np.asarray(Y_train)) / len(Y_train) * 100
	print('Model accuracy on the training set: %.2f %%' % train_acc)

	Y_test_pred = np.array([RF_Prune.predict(x) for x in np.array(X_test)])
	test_acc = np.sum(Y_test_pred == np.asarray(Y_test)) / len(Y_test) * 100
	print('Accuracy on the testing set: %.2f %%\n' % test_acc)
	def __init__(self, n_trees=10, max_depth=2, min_size=2, cost='gini'):
		"""
		Constructor for the random forest classifier.

		Checks that the cost function name is 'gini' and then initialises
		the attributes by calling RandomForest.__init__ with the given
		settings.

		Args:
			cost (str) : The name of the cost function to use for evaluating
						 the split. Only 'gini' is accepted.

			n_trees (int): The number of trees to use.

			max_depth (int): The maximum depth of tree.

			min_size (int): The minimum number of datapoints in terminal nodes.

		Raises:
			NameError: if cost is not 'gini'.
		"""
		if cost != 'gini':
			raise NameError('Not valid cost function')
		# Forward the caller's values; passing the literals 10/2/2 here would
		# silently ignore every non-default argument.
		RandomForest.__init__(self, cost, n_trees=n_trees, max_depth=max_depth, min_size=min_size)
示例#23
0
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total
    joint position dataset and prints the resulting confusion matrix.

    Each fold is held out once: a forest is trained on the other nine
    segments and every sample of the held-out segment is classified.
    '''
    theData = generateAllPositionTrainingData()
    means, stdDevs = theData.normalizeData()
    k = 10

    # Partition the data into k segments
    folds = theData.getKSegments(k)

    matrix = confusionMatrix(labels)
    for fold in xrange(k):
        print(fold)
        heldOut = folds[fold]

        # Training set = every segment except the held-out one
        trainingSet = TrainingData("CrossVal")
        remaining = copy.deepcopy(folds)
        remaining.pop(fold)
        for segment in remaining:
            trainingSet.combineWithNewData(segment)

        forest = RandomForest(trainingSet)
        forest.train()

        # Score the forest on the held-out segment
        for samp in heldOut.getData():
            predicted = forest.classify(samp)
            actual = samp.getLabel()
            matrix.update(actual, predicted)

    matrix.printMatrix()
示例#24
0
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total
    joint position dataset and prints the resulting confusion matrix.

    Each fold is held out once: a forest is trained on the other nine
    segments and every sample of the held-out segment is classified.
    '''
    theData = generateAllPositionTrainingData()
    means, stdDevs = theData.normalizeData()
    k = 10

    # Partition the data into k segments
    folds = theData.getKSegments(k)

    matrix = confusionMatrix(labels)
    for fold in xrange(k):
        print(fold)
        heldOut = folds[fold]

        # Training set = every segment except the held-out one
        trainingSet = TrainingData("CrossVal")
        remaining = copy.deepcopy(folds)
        remaining.pop(fold)
        for segment in remaining:
            trainingSet.combineWithNewData(segment)

        forest = RandomForest(trainingSet)
        forest.train()

        # Score the forest on the held-out segment
        for samp in heldOut.getData():
            predicted = forest.classify(samp)
            actual = samp.getLabel()
            matrix.update(actual, predicted)

    matrix.printMatrix()
示例#25
0
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2
    and tests it on participant 3. The data used here
    uses the angle features. Prints the resulting confusion matrix.
    '''
    trainingData = generateTwoAngleTrainingData()
    forest = RandomForest(trainingData)
    print("Training")
    forest.train()
    print("Done!")

    heldOut = generateOneTestAngleData()
    matrix = confusionMatrix(labels)

    for samp in heldOut:
        predicted = forest.classify(samp)
        actual = samp.getLabel()
        matrix.update(actual, predicted)

    matrix.printMatrix()
示例#26
0
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2
    and tests it on participant 3. The data used here
    uses the angle features. Prints the resulting confusion matrix.
    '''
    trainingData = generateTwoAngleTrainingData()
    forest = RandomForest(trainingData)
    print("Training")
    forest.train()
    print("Done!")

    heldOut = generateOneTestAngleData()
    matrix = confusionMatrix(labels)

    for samp in heldOut:
        predicted = forest.classify(samp)
        actual = samp.getLabel()
        matrix.update(actual, predicted)

    matrix.printMatrix()
class CrossValidation():
    """
    Runs every algorithm in classificationAlgorithms on one dataset split and
    records each one's mean cross-validation accuracy and fitted model.
    """

    # Built once when the class body is executed, so the same algorithm
    # objects are shared by every CrossValidation instance.
    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost()
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        """Store the dataset and its splits; start with empty result dicts."""
        self.ds = dataset
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.accuracyDict = {}  # algorithm name -> accuracy in percent
        self.models = {}        # algorithm name -> model

    def run(self):
        """Run each algorithm, cross-validate its model and store both results."""
        for alg in self.classificationAlgorithms:
            results = alg.run(self.ds, self.X_train, self.X_test, self.y_train,
                              self.y_test)
            # results[0] is the name of the algorithm, results[1] the model
            algName, model = results[0], results[1]
            self.appendToAccuracyDict(
                algName, self.kFoldCrossValidation(algName, model))
            self.appendModel(algName, model)

    def kFoldCrossValidation(self, algName, classifier, cv=300):
        """
        Return the mean cross_val_score of classifier on the training split.

        cv is passed straight to cross_val_score and defaults to the
        previously hard-coded 300. The accuracy is also printed in percent.
        """
        accuracies = cross_val_score(estimator=classifier,
                                     X=self.X_train,
                                     y=self.y_train,
                                     cv=cv)
        accuracy = accuracies.mean()
        # One pre-formatted string prints identically under Python 2 and 3.
        print('%s accuracy: %s %%' % (algName, accuracy * 100))
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        """Store accuracy, given as a fraction, under algName as a percentage."""
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        """Store model under algName."""
        self.models[algName] = model

    def getAccuracyDict(self):
        """Return the dict mapping algorithm name to accuracy in percent."""
        return self.accuracyDict

    def getModel(self, name):
        """Return the model stored under name; raises KeyError if absent."""
        return self.models[name]
示例#28
0
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1
    and tests it on participant 2 and 3. The data used here
    uses the position features. The means and standard deviations
    returned by normalizing the training data are passed on to the
    test-data generator. Prints the resulting confusion matrix.
    '''
    trainingData = generateOneTrainPositionData()
    means, stdDevs = trainingData.normalizeData()
    forest = RandomForest(trainingData)
    print("Training")
    forest.train()
    print("Done!")

    heldOut = generateTwoTestPositionData(means, stdDevs)
    matrix = confusionMatrix(labels)

    for samp in heldOut:
        predicted = forest.classify(samp)
        actual = samp.getLabel()
        matrix.update(actual, predicted)

    matrix.printMatrix()
示例#29
0
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1
    and tests it on participant 2 and 3. The data used here
    uses the position features. The means and standard deviations
    returned by normalizing the training data are passed on to the
    test-data generator. Prints the resulting confusion matrix.
    '''
    trainingData = generateOneTrainPositionData()
    means, stdDevs = trainingData.normalizeData()
    forest = RandomForest(trainingData)
    print("Training")
    forest.train()
    print("Done!")

    heldOut = generateTwoTestPositionData(means, stdDevs)
    matrix = confusionMatrix(labels)

    for samp in heldOut:
        predicted = forest.classify(samp)
        actual = samp.getLabel()
        matrix.update(actual, predicted)

    matrix.printMatrix()
示例#30
0
def main():
    """
    Parse the command-line arguments and test the selected model.

    --model "0" selects DecisionTree; any other value selects RandomForest,
    built with --tree_nums converted to int. --max_depth is parsed but not
    read again in this function.
    """
    option_specs = [
        ('-m', '--model', "0"),
        ('-t', '--training', "data/mushrooms_train.data"),
        ('-e', '--testing', "data/mushrooms_test.data"),
        ('-d', '--max_depth', 10),
        ('-n', '--tree_nums', 20),
    ]
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, default in option_specs:
        parser.add_argument(short_flag, long_flag, default=default)
    args = parser.parse_args()

    # Converted up front, whichever model is selected.
    treeNum = int(args.tree_nums)
    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(treeNum)

    test_model(model, args.training, args.testing)
def main():
    """
    Parse the command-line arguments and test the selected model.

    Options:
        -m/--model      "0" selects DecisionTree; any other value selects
                        RandomForest.
        -t/--training   path of the training data file.
        -e/--testing    path of the testing data file.
        -d/--max_depth  parsed as an int; not read again in this function.
        -n/--tree_nums  parsed as an int and passed to RandomForest.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t', '--training', default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_train.data")
    parser.add_argument('-e', '--testing', default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_test.data")
    # type=int: without it a value given on the command line arrives as a
    # str while the default stays an int.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)

    test_model(model, args.training, args.testing)
示例#32
0
def main():
    """
    Read the CSV dataset named on the command line and build a RandomForest
    from it.

    Expects two command-line arguments: the dataset CSV path and the name of
    the target attribute. Prints a usage message and exits with -1 when
    fewer are given.
    """
    if (len(sys.argv) < 3):
        print("Usage: python3 %s <dataset-csv> <target-attr>" % sys.argv[0])
        exit(-1)

    datasetFile = sys.argv[1]
    targetAttr = sys.argv[2]
    separator = ','

    # Seed both the stdlib and the NumPy random generators with 0.
    random.seed(0)
    np.random.seed(0)

    # Read dataset
    D = pd.read_csv(datasetFile, sep=separator)

    # Start timestamp; it is not read again within the visible code.
    t0 = time.time()

    # tree = DecisionTree(D, targetAttr, D.nunique(), sqrt)
    # tree.render()
    # D.nunique() gives the number of distinct values per column.
    # NOTE(review): the meaning of the trailing 10, sqrt and False arguments
    # depends on the RandomForest signature, which is not visible here.
    forest = RandomForest(D, targetAttr, D.nunique(), 10, sqrt, False)
示例#33
0
def classify_with_random_forest():
    """
    Train a 250-tree, depth-7 forest with categorical variables and print its
    accuracy on the training points and on the validation points.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    train_hits = 0
    for idx in range(num_training_points):
        if forest.predict(training_data[idx]) == training_labels[idx]:
            train_hits += 1
    print("Training Accuracy: " + str(train_hits / num_training_points))

    validation_hits = 0
    for idx in range(num_validation_points):
        if forest.predict(validation_data[idx]) == validation_labels[idx]:
            validation_hits += 1
    print("Validation Accuracy: " + str(validation_hits / num_validation_points))
示例#34
0
def main():
    """
    Parse the command-line arguments and test the selected model.

    Options:
        -m/--model      "0" selects DecisionTree; any other value selects
                        RandomForest.
        -t/--training   path of the training data file.
        -e/--testing    path of the testing data file.
        -n/--tree_nums  parsed as an int and passed to RandomForest.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t',
                        '--training',
                        default="data/tic_tac_toe_train.data")
    parser.add_argument('-e',
                        '--testing',
                        default="data/tic_tac_toe_test.data")
    # type=int: without it a value given on the command line arrives as a
    # str while the default stays an int.
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)

    test_model(model, args.training, args.testing)
示例#35
0
    def get_classifier_object(self):
        """
        Build, train and run the classifier named by self.classifier_name,
        then return self.clf.get_classifier().

        Handled names: 'svm', 'logreg', 'RForest', 'knn' and 'lda'. For a
        handled name the wrapper is stored in self.clf and its predictions in
        self.y_pred. For any other name self.clf is left as it was.
        """
        selected = self.classifier_name
        if selected == 'svm':
            wrapper = SVM
        elif selected == 'logreg':
            wrapper = LogReg
        elif selected == 'RForest':
            wrapper = RandomForest
        elif selected == 'knn':
            wrapper = KNN
        elif selected == 'lda':
            wrapper = LDA
        else:
            wrapper = None

        if wrapper is not None:
            # Every wrapper takes the same four data splits.
            self.clf = wrapper(self.x_train, self.y_train, self.x_test, self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()

        return self.clf.get_classifier()
示例#36
0
文件: Ensemble.py 项目: tincho4t/aaTP
class Ensemble(object):
    """Collection of image classifiers plus a logistic regression on top.

    The "small" methods train and evaluate the individual models: six
    texture-based ``RidgeClassifier`` models (one per (radius, points) pair),
    a PCA + ``RandomForest`` model and an RBM + ridge pipeline.  The last two
    are trained by :meth:`fit_small` but are currently not part of
    :meth:`predict_small`.  The "big" methods fit and query a
    ``LogisticRegression`` (presumably on the matrix returned by
    :meth:`predict_small` -- confirm against the caller).

    Worker threads store their results on the instance (``texture_*`` and
    ``*_y_hat`` attributes) because a thread target cannot return a value.
    """

    # (radius, points) of every texture model, in the column order used by
    # predict_small().
    _TEXTURE_PARAMS = ((10, 8), (5, 10), (7, 10), (9, 8), (4, 10), (20, 8))
    # alpha passed to each texture RidgeClassifier by fit_small().
    _TEXTURE_ALPHA = 2

    def __init__(self):
        self.pca_randomForest = None
        self.pca_randomForest_norm = None
        self.pca_randomForest_pca = None
        self.rbm_lr_rbm = None
        self.rbm_lr = None
        self.texture_10_8 = None
        self.texture_5_10 = None
        self.texture_7_10 = None
        self.texture_9_8 = None
        self.texture_4_10 = None
        self.texture_20_8 = None
        self.ensemble_logistic_regression = None
        self.edge_pca_lr = None
        self.pca_edge_norm = None
        self.pca_edge_pca = None
        self.ip = ImagesProcessor()
        # Predictions live on the instance because the worker threads have no
        # way to hand them back by reference.
        self.pca_randomForest_y_hat = None
        self.rbm_lr_y_hat = None
        self.texture_10_8_y_hat = None
        self.texture_5_10_y_hat = None
        self.texture_7_10_y_hat = None
        self.texture_9_8_y_hat = None
        self.texture_4_10_y_hat = None
        self.texture_20_8_y_hat = None

    def load(self):
        """Load the pickled texture classifiers and the top-level regression.

        Paths are relative to the current working directory.  The PCA and RBM
        models are not loaded.
        """
        self.texture_10_8 = self._load_classifier('./ridgeClassifier_10_8')
        self.texture_5_10 = self._load_classifier('./ridgeClassifier_5_10')
        self.texture_7_10 = self._load_classifier('./ridgeClassifier_7_10')
        self.texture_9_8 = self._load_classifier('./ridgeClassifier_9_8')
        self.texture_4_10 = self._load_classifier('./ridgeClassifier_4_10')
        self.texture_20_8 = self._load_classifier('./ridgeClassifier_20_8')
        self.ensemble_logistic_regression = self._load_classifier('ensemble_logistic_regression')

    def _load_classifier(self, path):
        """Return the object unpickled from *path*.

        The file is opened in binary mode and closed even if unpickling fails.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files produced by this project.
        with open(path, 'rb') as f:
            return cPickle.load(f)

    @staticmethod
    def _texture_attr(radius, points):
        """Return the attribute name of the texture model for (radius, points)."""
        return 'texture_%d_%d' % (radius, points)

    def _start_worker(self, target, args):
        """Start *target(*args)* in a daemon thread and return the thread."""
        worker = threading.Thread(target=target, args=args)
        worker.daemon = True
        worker.start()
        return worker

    def fit_small(self, images, y):
        """Train every individual model, one thread per model, and wait for all.

        The texture models are trained on the raw *images*; the PCA and RBM
        models are trained on the output of ``self.ip.transformImages``.
        """
        images_transformed, y_transformed = self.ip.transformImages(images, y, rotate=True, crop=True)

        workers = []
        for radius, points in self._TEXTURE_PARAMS:
            # Each worker gets its own shallow copy of the image sequence.
            workers.append(self._start_worker(
                self._fit_texture,
                (self._texture_attr(radius, points), images[:], y,
                 radius, points, self._TEXTURE_ALPHA)))
        workers.append(self._start_worker(self._fit_small_pc, (images_transformed[:], y_transformed)))
        workers.append(self._start_worker(self._fit_small_rbm, (images_transformed[:], y_transformed)))

        for worker in workers:
            worker.join()

    def _fit_texture(self, attr_name, images, y, radius, points, alpha):
        """Fit a texture RidgeClassifier and store it in ``self.<attr_name>``."""
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        # RidgeClassifier here is the wrapper that receives its data in the
        # constructor (its fit() takes no arguments).
        classifier = RidgeClassifier(ds, y, alpha=alpha)
        setattr(self, attr_name, classifier)
        classifier.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    # The per-model methods below are kept so existing callers keep working.
    # `estimator` is accepted for signature compatibility but is not used:
    # the fitted model is always stored on the fixed attribute.
    def _fit_small_texture10_8(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_10_8``."""
        self._fit_texture('texture_10_8', images, y, radius, points, alpha)

    def _fit_small_texture5_10(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_5_10``."""
        self._fit_texture('texture_5_10', images, y, radius, points, alpha)

    def _fit_small_texture7_10(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_7_10``."""
        self._fit_texture('texture_7_10', images, y, radius, points, alpha)

    def _fit_small_texture9_8(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_9_8``."""
        self._fit_texture('texture_9_8', images, y, radius, points, alpha)

    def _fit_small_texture4_10(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_4_10``."""
        self._fit_texture('texture_4_10', images, y, radius, points, alpha)

    def _fit_small_texture20_8(self, images, y, estimator, radius, points, alpha):
        """Fit a texture model and store it in ``self.texture_20_8``."""
        self._fit_texture('texture_20_8', images, y, radius, points, alpha)

    def _fit_small_pc(self, images, y):
        """Fit the PCA + random-forest model on histogram-equalized images."""
        start_time = time.time()
        print("PCA RANDOM FOREST")
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        self.pca_randomForest_pca, self.pca_randomForest_norm, ds = self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)
        self.pca_randomForest = RandomForest(ds, y, n_estimators=2000)
        self.pca_randomForest.fit()
        print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" %(time.time() - start_time))

    def _fit_small_rbm(self, ds, y):
        """Fit the BernoulliRBM -> ridge classifier pipeline."""
        start_time = time.time()
        print("RBM LR")
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        # Column-wise scaling; the small constant avoids division by zero.
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_rbm = BernoulliRBM(random_state=0, verbose=True)
        self.rbm_lr_rbm.learning_rate = 0.01
        self.rbm_lr_rbm.n_iter = 5
        self.rbm_lr_rbm.n_components = 150
        logistic = linear_model.RidgeClassifier(alpha=2)
        self.rbm_lr = Pipeline(steps=[('rbm', self.rbm_lr_rbm), ('lr', logistic)])
        self.rbm_lr.fit(ds, y)
        print("COMPLETE RBM LR --- %s seconds ---" % (time.time() - start_time))

    def fit_big(self, ds, y):
        """Fit the top-level logistic regression on *ds* / *y*."""
        self.ensemble_logistic_regression = linear_model.LogisticRegression()
        self.ensemble_logistic_regression.fit(ds, y)

    def predict_small(self, images):
        """Run every texture model on *images* in parallel.

        Returns:
            Array with one row per image and one column per texture model,
            in ``_TEXTURE_PARAMS`` order.  The PCA random forest and RBM
            models are not included.
        """
        workers = [
            self._start_worker(self._predict_texture, (images, radius, points))
            for radius, points in self._TEXTURE_PARAMS
        ]
        for worker in workers:
            worker.join()

        predictions = [
            getattr(self, self._texture_attr(radius, points) + '_y_hat')
            for radius, points in self._TEXTURE_PARAMS
        ]
        return np.vstack(predictions).T

    def _predict_small_rbm_lr(self, images):
        """Store the RBM pipeline predictions in ``self.rbm_lr_y_hat``."""
        start_time = time.time()
        ds = images[:]
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_y_hat = self.rbm_lr.predict(ds)
        print("Complete prediction RBM --- %s ---" % (time.time() - start_time))

    def _predict_small_pac_ranfomForest(self, images):
        """Store the PCA random-forest predictions in ``self.pca_randomForest_y_hat``."""
        start_time = time.time()
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        # Apply the normalizer and PCA that were fitted in _fit_small_pc.
        ds = self.pca_randomForest_norm.transform(ds)
        ds = self.pca_randomForest_pca.transform(ds)
        self.pca_randomForest_y_hat = self.pca_randomForest.predict(ds)
        print("Complete prediction PCA --- %s ---" % (time.time() - start_time))

    def _predict_texture(self, images, radius, points):
        """Predict with the (radius, points) texture model.

        The result is stored in ``self.texture_<radius>_<points>_y_hat``.
        """
        start_time = time.time()
        attr_name = self._texture_attr(radius, points)
        ds = self.ip.getTextureFeature(images, radius, points)
        setattr(self, attr_name + '_y_hat', getattr(self, attr_name).predict(ds))
        print("Complete prediction Texture %d %d --- %s ---" % (radius, points, time.time() - start_time))

    def _predict_small_texture_10_8(self, images):
        """Store the texture (10, 8) predictions in ``self.texture_10_8_y_hat``."""
        self._predict_texture(images, 10, 8)

    def _predict_small_texture_5_10(self, images):
        """Store the texture (5, 10) predictions in ``self.texture_5_10_y_hat``."""
        self._predict_texture(images, 5, 10)

    def _predict_small_texture_7_10(self, images):
        """Store the texture (7, 10) predictions in ``self.texture_7_10_y_hat``."""
        self._predict_texture(images, 7, 10)

    def _predict_small_texture_9_8(self, images):
        """Store the texture (9, 8) predictions in ``self.texture_9_8_y_hat``."""
        self._predict_texture(images, 9, 8)

    def _predict_small_texture_4_10(self, images):
        """Store the texture (4, 10) predictions in ``self.texture_4_10_y_hat``."""
        self._predict_texture(images, 4, 10)

    def _predict_small_texture_20_8(self, images):
        """Store the texture (20, 8) predictions in ``self.texture_20_8_y_hat``."""
        self._predict_texture(images, 20, 8)

    def predict_big(self, ds):
        """Return the top-level logistic regression's predictions for *ds*."""
        return(self.ensemble_logistic_regression.predict(ds))
示例#37
0
def _kfold_split(X, y, K, fold):
    """Split (X, y) so that rows whose index % K == fold form the test set.

    Returns:
        (X_train, y_train, X_test, y_test)
    """
    in_fold = np.arange(len(X)) % K == fold
    return X[~in_fold], y[~in_fold], X[in_fold], y[in_fold]


def _fit_random_forest(X_train, y_train, num_trees, verbose):
    """Build and fit a RandomForest, printing the elapsed fitting time."""
    forest = RandomForest(num_trees=num_trees, verbose=verbose)
    t0 = time()
    forest.fit(X_train, y_train)
    t1 = time()
    print("time elapses = %.3f s" % (t1 - t0))
    return forest


def _report_test_accuracy(forest, X_test, y_test):
    """Print and return the fraction of test rows *forest* predicts correctly."""
    y_predicted = forest.predict(X_test)
    results = [prediction == truth
               for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    print("test accuracy: %.4f" % accuracy)
    return accuracy


def main(cv=False,kaggle=True, num_Trees=10, verbose=False):
    """Train a RandomForest on "hw4-data.csv" in one of three modes.

    cv:        when true, run 10-fold cross validation and print the
               per-fold and average accuracy.
    kaggle:    (only when cv is false) when true, fit on the full data set
               and call generateSubmissionFile; otherwise hold out fold 3 of
               10 and print the test accuracy.
    num_Trees: number of trees passed to RandomForest.
    verbose:   verbosity flag passed to RandomForest.
    """
    X = []
    y = []
    # Load data set: the last column is the label, the rest are features.
    with open("hw4-data.csv") as f:
        next(f, None)  # skip the first line (presumably a header row)
        for line in csv.reader(f, delimiter = ","):
            X.append(line[:-1])
            y.append(line[-1])

    X = np.array(X, dtype = float)
    y = np.array(y, dtype = int)

    K = 10
    if cv:
        cv_accuracy = []
        for ii in range(K):
            X_train, y_train, X_test, y_test = _kfold_split(X, y, K, ii)
            randomForest = _fit_random_forest(X_train, y_train,
                                              num_Trees, verbose)
            cv_accuracy.append(
                _report_test_accuracy(randomForest, X_test, y_test))
        print("average cv accuracy: %.4f" % np.mean(cv_accuracy))
    else:
        if kaggle:
            # Use the full data set; no hold-out split is needed here.
            randomForest = _fit_random_forest(X, y, num_Trees, verbose)
            generateSubmissionFile(myname, randomForest)
        else:
            # Single hold-out evaluation on fold 3 of 10.
            X_train, y_train, X_test, y_test = _kfold_split(X, y, K, 3)
            randomForest = _fit_random_forest(X_train, y_train,
                                              num_Trees, verbose)
            _report_test_accuracy(randomForest, X_test, y_test)