Example #1
def predict_test_data():
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))

    guesses = []
    for i in range(TEST_SIZE):
        point = testing_data[i]
        guess = forest.predict(point)
        guesses.append(int(guess))

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        i = 1
        for g in guesses:
            writer.writerow([i, g])
            i += 1
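
This function leans on module-level state that the snippet does not include. A minimal sketch of the assumed setup follows; the file paths, array names, and cat_set values are placeholders, not part of the original project:

import csv
import numpy as np

# Hypothetical setup: RandomForest is the project's custom class, and these
# arrays stand in for whatever preprocessing the project actually performs.
training_data = np.load('training_data.npy')          # placeholder path
training_labels = np.load('training_labels.npy')      # placeholder path
validation_data = np.load('validation_data.npy')      # placeholder path
validation_labels = np.load('validation_labels.npy')  # placeholder path
testing_data = np.load('testing_data.npy')            # placeholder path
num_training_points = len(training_data)
num_validation_points = len(validation_data)
TEST_SIZE = len(testing_data)
cat_set = {1, 3, 5}  # placeholder: indices of categorical feature columns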
Example #2
def run_model():
	
	# load data
	train_file = 'data/hw7_train.dat.txt'
	test_file = 'data/hw7_test.dat.txt'
	data_train = pd.read_csv(train_file, sep=' ', header=None, names=[0, 1, 'y'])
	data_test = pd.read_csv(test_file, sep=' ', header=None, names=[0, 1, 'y'])
	X_train, Y_train = generate_data(train_file)
	X_test, Y_test = generate_data(test_file)

	# train model
	col_y = 'y'
	T = 30000
	max_height = 1

	time_start = time.perf_counter()
	RF_Prune = RandomForest()
	RF_Prune.construct_forest(data_train, col_y, size=T, max_height=max_height)

	print("Using %.3f seconds" % (time.perf_counter() - time_start))

	# model accuracy
	print('\n--- Pruned random forest model accuracy ---')

	Y_train_pred = [RF_Prune.predict(x) for x in np.array(X_train)]
	train_acc = np.sum(Y_train_pred == Y_train) / len(Y_train) * 100
	print('Model accuracy on the training set: %.2f %%' % train_acc)

	Y_test_pred = [RF_Prune.predict(x) for x in np.array(X_test)]
	test_acc = np.sum(Y_test_pred == Y_test) / len(Y_test) * 100
	print('Accuracy on the testing set: %.2f %%\n' % test_acc)
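
The generate_data helper is not shown in this example. A plausible sketch, assuming it reads the same space-separated .dat files (feature columns followed by a label column, mirroring the pd.read_csv calls above); the real helper may differ:

import pandas as pd

def generate_data(path):
    # Space-separated file: feature columns first, label in the last column.
    df = pd.read_csv(path, sep=' ', header=None)
    X = df.iloc[:, :-1].to_numpy()
    Y = df.iloc[:, -1].to_numpy()
    return X, Y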
Example #3
File: main.py  Project: tincho4t/aaTP
def run_kfold(method, kf, X, y, text, transformer=None):
    accuracy = 0
    fold = 0
    print("Running " + str(text))
    for train_index, test_index in kf:
        print("Starting fold " + str(fold))
        fold += 1
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        if transformer is not None:
            t = transformer.fit(X_train)
            X_train = t.transform(X_train)
            X_test = t.transform(X_test)
        if method == "rf":
            clf = RandomForest(X_train, y_train, n_estimators=1000)
            clf.fit()
        elif method == "lr":
            clf = linear_model.RidgeClassifier(alpha=2)
            clf.fit(X_train, y_train)
        elif method == "ex":
            clf = ExtraTreesClassifier(n_estimators=2000)
            clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)
        accuracy += score(y_hat, y_test)
    return (accuracy * 1.0 / len(kf))
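
run_kfold iterates over kf directly and calls len(kf), which matches the pre-0.18 scikit-learn KFold object. Below is a sketch of driving it with the current API by materializing the splits first; X, y, and score are assumed to be defined as in the surrounding project:

from sklearn.model_selection import KFold

# Materialize the (train_index, test_index) pairs so the function can both
# iterate over kf and take len(kf).
kf = list(KFold(n_splits=10, shuffle=True, random_state=0).split(X))
mean_acc = run_kfold("rf", kf, X, y, "random forest, 10-fold CV")
print("Mean accuracy: " + str(mean_acc))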
Example #4
def classify_with_random_forest():
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        prediction = forest.predict(training_data[i])
        if prediction == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        prediction = forest.predict(validation_data[i])
        if prediction == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))
Example #5
class BRAF(object):
    def __init__(self, S, p, k, weights, name='BRAF'):
        """
        :param raw_data: specify the name of the csv file
        :param S: Spesify the size of the Biased Random Forest method
        :param p: Specify the ratio between R1 and R2
        :param k: Specify the KN Nearest Neighbours for minority class
        """
        self.S = S
        self.p = p
        self.k = k
        self.name = name
        self.weights = weights
        "Initialize the Forests"
        self.R1 = RandomForest('R1_Forest', self.weights, int(self.p * self.S),
                               True)
        self.R2 = RandomForest('R2_Forest', self.weights,
                               int((1 - self.p) * self.S), True)

    def fit(self, data):
        """
        :param data: Read Data and preprocess for further analysis
        T is for Vanilla Random Forest and Tc is for biased forest
        :return: fitted R1 and R2
        """
        if data is not None:
            T, Tc = data
            print('fitting Biased Random Forest starts...')
            self.R1.fit(T[:, 0:-1], T[:, -1].astype(int))
            self.R2.fit(Tc[:, 0:-1], Tc[:, -1].astype(int))

        else:
            print("Data Not Found. Please check the file name and directory")

    def predict(self, x_test):
        """
        :param x_test: receives the given x to predict
        :return: logits
        """
        pred1 = self.R1.predict(x_test)
        pred2 = self.R2.predict(x_test)
        return (pred1 + pred2) / 2
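
A hypothetical usage sketch for BRAF: fit expects a (T, Tc) pair, where T is the full training set and Tc the critical subset built around the minority class, each a NumPy array with an integer label in the last column. The array names and parameter values below are illustrative only:

import numpy as np

# T: full training set; Tc: critical set near the minority class (both assumed).
braf = BRAF(S=100, p=0.5, k=10, weights=None)
braf.fit((T, Tc))
scores = braf.predict(X_test)  # average of the two forests' votes
y_pred = (np.asarray(scores) >= 0.5).astype(int)  # threshold chosen for illustration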
Example #6
def random_forests_classification(X, y, test_dat):
    classifier = RandomForest(20, round(math.sqrt(np.size(X, 1))), np.size(X, 0))
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
Example #7
def best_params():
    acc_max = 0
    n_trees_max = 0
    n_trees_list = list(range(2, 11))
    for n_tree in n_trees_list:
        clf = RandomForest(n_trees=n_tree)
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_test)
        acc = accuracy(Y_test, predictions)
        if acc > acc_max:
            acc_max = acc
            n_trees_max = n_tree
    return (n_trees_max, acc_max)
Example #8
def graph_accuracy():
    accuracy = []
    num_trees = []
    for j in range(5, 41, 5):
        forest = RandomForest(num_trees=j, max_depth=10, categorical_vars=cat_set)
        forest.train(training_data, training_labels)
        num_right = 0
        for i in range(num_validation_points):
            prediction = forest.predict(validation_data[i])
            if prediction == validation_labels[i]:
                num_right += 1
        accuracy.append(num_right / num_validation_points)
        num_trees.append(j)
        print(j)
        sys.stdout.flush()
    plt.figure()
    plt.plot(num_trees, accuracy)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
Example #9
from sklearn.model_selection import train_test_split
from sklearn import datasets
from RandomForest import RandomForest

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

model = RandomForest()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print('prediction score: {}'.format(sum(predictions == y_test) / len(y_test)))
Example #10
class Ensemble(object):

    def __init__(self):
        self.pca_randomForest = None
        self.pca_randomForest_norm = None
        self.pca_randomForest_pca = None
        self.rbm_lr_rbm = None
        self.rbm_lr = None
        self.texture_10_8 = None
        self.texture_5_10 = None
        self.texture_7_10 = None
        self.texture_9_8 = None
        self.texture_4_10 = None
        self.texture_20_8 = None
        self.ensemble_logistic_regression = None
        self.edge_pca_lr = None
        self.pca_edge_norm = None
        self.pca_edge_pca = None
        self.ip = ImagesProcessor()
        # We add the predictions here because we could not pass them by reference
        self.pca_randomForest_y_hat = None
        self.rbm_lr_y_hat = None
        self.texture_10_8_y_hat = None
        self.texture_5_10_y_hat = None

    def load(self):
        self.texture_10_8 = self._load_classifier('./ridgeClassifier_10_8')
        self.texture_5_10 = self._load_classifier('./ridgeClassifier_5_10')
        self.texture_7_10 = self._load_classifier('./ridgeClassifier_7_10')
        self.texture_9_8 = self._load_classifier('./ridgeClassifier_9_8')
        self.texture_4_10 = self._load_classifier('./ridgeClassifier_4_10')
        self.texture_20_8 = self._load_classifier('./ridgeClassifier_20_8')
        self.ensemble_logistic_regression = self._load_classifier('ensemble_logistic_regression')
        #pca_randomForest_pca = _load_classifier('./pca')
        #rbm_lr = _load_classifier('./rbm')


    def _load_classifier(self, path):
        # Read a pickled classifier from disk; pickle files must be opened in binary mode.
        with open(path, 'rb') as f:
            return cPickle.load(f)

    def fit_small(self, images, y):
        images_transformed, y_transformed = self.ip.transformImages(images, y, rotate=True, crop=True)
        
        t_t10_8 = threading.Thread(target=self._fit_small_texture10_8, args=(images[:], y, self.texture_10_8, 10, 8, 2))
        t_t10_8.daemon = True
        t_t10_8.start()

        t_t5_10 = threading.Thread(target=self._fit_small_texture5_10, args=(images[:], y, self.texture_5_10, 5, 10, 2))
        t_t5_10.daemon = True
        t_t5_10.start()

        t_t7_10 = threading.Thread(target=self._fit_small_texture7_10, args=(images[:], y, self.texture_7_10, 7, 10, 2))
        t_t7_10.daemon = True
        t_t7_10.start()

        t_t9_8 = threading.Thread(target=self._fit_small_texture9_8, args=(images[:], y, self.texture_9_8, 9, 8, 2))
        t_t9_8.daemon = True
        t_t9_8.start()

        t_t4_10 = threading.Thread(target=self._fit_small_texture4_10, args=(images[:], y, self.texture_4_10, 4, 10, 2))
        t_t4_10.daemon = True
        t_t4_10.start()

        t_t20_8 = threading.Thread(target=self._fit_small_texture20_8, args=(images[:], y, self.texture_20_8, 20, 8, 2))
        t_t20_8.daemon = True
        t_t20_8.start()

        t_pc = threading.Thread(target=self._fit_small_pc, args=(images_transformed[:], y_transformed))
        t_pc.daemon = True
        t_pc.start()

        t_rbm = threading.Thread(target=self._fit_small_rbm, args=(images_transformed[:], y_transformed))
        t_rbm.daemon = True
        t_rbm.start()

        t_t10_8.join()
        t_t5_10.join()
        t_t7_10.join()
        t_t9_8.join()
        t_t4_10.join()
        t_t20_8.join()
        t_pc.join()
        t_rbm.join()
        

    def _fit_small_texture10_8(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_10_8 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_10_8.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    # FIXME: unify these two functions. It does not like passing the estimator as an attribute
    def _fit_small_texture5_10(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_5_10 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_5_10.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    def _fit_small_texture7_10(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_7_10 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_7_10.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    def _fit_small_texture9_8(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_9_8 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_9_8.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    def _fit_small_texture4_10(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_4_10 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_4_10.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    def _fit_small_texture20_8(self, images, y, estimator, radius, points, alpha):
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        self.texture_20_8 = RidgeClassifier(ds, y, alpha=alpha)
        self.texture_20_8.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    def _fit_small_pc(self, images, y):
        start_time = time.time()
        print("PCA RANDOM FOREST")
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        self.pca_randomForest_pca, self.pca_randomForest_norm, ds = self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)
        self.pca_randomForest = RandomForest(ds, y, n_estimators=2000)
        self.pca_randomForest.fit()
        print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" %(time.time() - start_time))

    def _fit_small_rbm(self, ds, y):
        start_time = time.time()
        print("RBM LR")
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_rbm = BernoulliRBM(random_state=0, verbose=True)
        self.rbm_lr_rbm.learning_rate = 0.01
        self.rbm_lr_rbm.n_iter = 5
        self.rbm_lr_rbm.n_components = 150
        logistic = linear_model.RidgeClassifier(alpha=2)
        self.rbm_lr = Pipeline(steps=[('rbm', self.rbm_lr_rbm), ('lr', logistic)])
        self.rbm_lr.fit(ds, y)
        print("COMPLETE RBM LR --- %s seconds ---" % (time.time() - start_time))


    def fit_big(self, ds, y):
        self.ensemble_logistic_regression = linear_model.LogisticRegression()
        self.ensemble_logistic_regression.fit(ds, y)

    def predict_small(self, images):

        # t_predict_small_pac_ranfomForest = threading.Thread(target=self._predict_small_pac_ranfomForest, args=(images, ))
        # t_predict_small_pac_ranfomForest.daemon = True
        # t_predict_small_pac_ranfomForest.start()

        # t_predict_small_rbm_lr = threading.Thread(target=self._predict_small_rbm_lr, args=(images, ))
        # t_predict_small_rbm_lr.daemon = True
        # t_predict_small_rbm_lr.start()

        t_predict_small_texture_10_8 = threading.Thread(target=self._predict_small_texture_10_8, args=(images, ))
        t_predict_small_texture_10_8.daemon = True
        t_predict_small_texture_10_8.start()

        t_predict_small_texture_5_10 = threading.Thread(target=self._predict_small_texture_5_10, args=(images, ))
        t_predict_small_texture_5_10.daemon = True
        t_predict_small_texture_5_10.start()

        t_predict_small_texture_7_10 = threading.Thread(target=self._predict_small_texture_7_10, args=(images, ))
        t_predict_small_texture_7_10.daemon = True
        t_predict_small_texture_7_10.start()

        t_predict_small_texture_9_8 = threading.Thread(target=self._predict_small_texture_9_8, args=(images, ))
        t_predict_small_texture_9_8.daemon = True
        t_predict_small_texture_9_8.start()

        t_predict_small_texture_4_10 = threading.Thread(target=self._predict_small_texture_4_10, args=(images, ))
        t_predict_small_texture_4_10.daemon = True
        t_predict_small_texture_4_10.start()

        t_predict_small_texture_20_8 = threading.Thread(target=self._predict_small_texture_20_8, args=(images, ))
        t_predict_small_texture_20_8.daemon = True
        t_predict_small_texture_20_8.start()

        # t_predict_small_pac_ranfomForest.join()
        # t_predict_small_rbm_lr.join()
        t_predict_small_texture_10_8.join()
        t_predict_small_texture_5_10.join()
        t_predict_small_texture_9_8.join()
        t_predict_small_texture_4_10.join()
        t_predict_small_texture_20_8.join()
        t_predict_small_texture_7_10.join()

        return(np.vstack((self.texture_10_8_y_hat, self.texture_5_10_y_hat, self.texture_7_10_y_hat,self.texture_9_8_y_hat,self.texture_4_10_y_hat,self.texture_20_8_y_hat)).T)
        #return(np.vstack((self.pca_randomForest_y_hat, self.rbm_lr_y_hat, self.texture_10_8_y_hat, self.texture_5_10_y_hat, self.texture_7_10_y_hat,self.texture_9_8_y_hat,self.texture_4_10_y_hat,self.texture_20_8_y_hat)).T)
        #return(np.vstack((self.pca_randomForest_y_hat, self.texture_10_8_y_hat, self.texture_5_10_y_hat, self.texture_7_10_y_hat,self.texture_9_8_y_hat,self.texture_4_10_y_hat,self.texture_20_8_y_hat)).T)

    def _predict_small_rbm_lr(self, images):
        start_time = time.time()
        ds = images[:]
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_y_hat = self.rbm_lr.predict(ds)
        print "Complete prediction RBM --- %s ---" % (time.time() - start_time)

    def _predict_small_pac_ranfomForest(self, images):
        start_time = time.time()
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = self.pca_randomForest_norm.transform(ds)
        ds = self.pca_randomForest_pca.transform(ds)
        self.pca_randomForest_y_hat = self.pca_randomForest.predict(ds)
        print "Complete prediction PCA --- %s ---" % (time.time() - start_time)

    def _predict_small_texture_10_8(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 10, 8)
        self.texture_10_8_y_hat = self.texture_10_8.predict(ds)
        print "Complete prediction Texture 10 8 --- %s ---" % (time.time() - start_time)

    def _predict_small_texture_5_10(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 5, 10)
        self.texture_5_10_y_hat = self.texture_5_10.predict(ds)
        print "Complete prediction Texture 5 10 --- %s ---" % (time.time() - start_time)
    
    def _predict_small_texture_7_10(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 7, 10)
        self.texture_7_10_y_hat = self.texture_7_10.predict(ds)
        print "Complete prediction Texture 7 10 --- %s ---" % (time.time() - start_time)
    
    def _predict_small_texture_9_8(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 9, 8)
        self.texture_9_8_y_hat = self.texture_9_8.predict(ds)
        print "Complete prediction Texture 9 8 --- %s ---" % (time.time() - start_time)

    def _predict_small_texture_4_10(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 4, 10)
        self.texture_4_10_y_hat = self.texture_4_10.predict(ds)
        print "Complete prediction Texture 4 10 --- %s ---" % (time.time() - start_time)
    
    def _predict_small_texture_20_8(self, images):
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, 20, 8)
        self.texture_20_8_y_hat = self.texture_20_8.predict(ds)
        print "Complete prediction Texture 20 8 --- %s ---" % (time.time() - start_time)
    
    def predict_big(self, ds):
        return(self.ensemble_logistic_regression.predict(ds))
Example #11
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

ratio_train_test = 0.85

num_samples, num_features = X.shape
idx = np.random.permutation(range(num_samples))
num_samples_train = int(num_samples * ratio_train_test)
idx_train = idx[:num_samples_train]
idx_test = idx[num_samples_train:]
X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

# HYPER PARAMETERS
max_depth = 7
min_split_size = 5
ratio_samples = 0.2
num_trees = 30
num_features_node = int(np.sqrt(num_features))
coefficient = 'gini'
percentile = 90
values = None
min_std_deviation = 0

rf = RandomForest(max_depth, min_split_size, ratio_samples, num_trees,
                  num_features_node, coefficient, percentile, values,
                  min_std_deviation)
rf.train(X_train, y_train)
rf.predict(X_test, y_test)
Example #12
def main():
    """ 
    # ----------------------Iris------------------------
    iris = sklearn.datasets.load_iris()
    print(iris.DESCR)
    X, y = iris.data, iris.target
    # --------------------------------------------------
    """
    """ # ----------------------Sonar------------------------
    X, y = load_sonar()
    print(X.shape, y.shape)
    # --------------------------------------------------- """
    """ # -------------------Iris i Sonar--------------------
    ratio_train_test = 0.8
    num_samples, num_features = X.shape
    idx = np.random.permutation(range(num_samples))
    num_samples_train = int(num_samples*ratio_train_test)
    idx_train = idx[:num_samples_train]
    idx_test = idx[num_samples_train:]
    X_train,Y_train = X[idx_train], y[idx_train]
    X_test,Y_test = X[idx_test], y[idx_test]
    # --------------------------------------------------- """

    # ----------------------MNIST------------------------
    X_train, Y_train, X_test, Y_test = load()
    # ---------------------------------------------------

    num_trees = 10
    max_depth = 10  # maximum number of tree levels
    min_size_split = 5  # stop splitting when a node has fewer than 5 elements
    ratio_samples = 0.8  # bagging
    criterion = "Gini"
    num_features_node = int(np.sqrt(
        X_train.shape[1]))  # number of distinct features to consider at each node

    num_samples_train = X_train.shape[0]
    num_samples_test = X_test.shape[0]
    logger.info("{} train and {} test samples".format(num_samples_train,
                                                      num_samples_test))

    try:
        start = timeit.default_timer()
        rf = RandomForest(max_depth, min_size_split, ratio_samples, num_trees,
                          num_features_node, criterion)

        # ----------------------MNIST------------------------
        rf.values = range(0, 156, 64)
        # ---------------------------------------------------
        rf.fit(X_train, Y_train)
        #print("Fit is done")
        Ypred = rf.predict(X_test)
        stop = timeit.default_timer()
        execution_time = (stop - start) / 60.
        logger.info("Program Executed in " + str(execution_time) + " minutes.")
        num_correct_predictions = np.sum(Ypred == Y_test)
        accuracy = num_correct_predictions / float(len(Y_test))
        logger.info('accuracy {} %'.format(100 *
                                           np.round(accuracy, decimals=2)))
        logger.info("Ypred = {}".format(Ypred))
        logger.info("Y_test = {}".format(Y_test))
        logger.info("Y_test - Y_train = {}".format(
            np.array([Y_test[i] - Ypred[i] for i in range(len(Y_test))])))

    except Exception as e:
        logger.critical("Failed on executing due to:\n{}".format(str(e)))
Example #13
# change class labels to integers by assigning them automatically
print('Class labels', np.unique(df_wine['Class label']))
#print first five lines of dataframe
print(df_wine.head())

for columns in df_wine.columns:
    print(columns)



# use the second through last columns as X and the first column as y
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# randomly split the data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
#Decision tree below
#tree=DecisionTree(criterion='entropy', max_depth=6, random_state=None)
#tree.fit(X_train, y_train, df_wine.columns[1:])
#y_pred=tree.predict(X_test)

# Random forest below
forest = RandomForest(criterion='gini', n_estimators=20, max_features='auto', max_depth=3, min_samples_split=2, random_state=None)
forest.fit(X_train, y_train, df_wine.columns[1:])
y_pred = forest.predict(X_test)

# number of misclassified samples
print("Misclassified samples/total test samples: %d/%d" % ((y_test != y_pred).sum(), len(y_test)))
# print(y_test)
# print(y_pred)
# print("The first sample probability is", tree.predict_proba(X_test[0, :]))
Example #14
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from DecisionTree import DecisionTree
from RandomForest import RandomForest

data = datasets.load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

print('Shape:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

dt = DecisionTree(min_samples_split=3, max_depth=10, n_features=20)
dt.fit(X_train, y_train)

rf = RandomForest(n_trees=3, min_samples_split=3, max_depth=10, n_features=15)
rf.fit(X_train, y_train)

print('Parameters:', dt.min_samples_split, dt.max_depth, dt.n_features)
    
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print ("Accuracy score of Decision Tree:", acc_dt)
print ("Accuracy score of Random Forest:", acc_rf)
print ("F1 score of Decision Tree:", f1_dt)
print ("F1 score of Random Forest:", f1_rf) # random forest overfiting for this small data
Example #15
    # pres = []
    # falls = []
    # thresholds = np.linspace(0, 1, 101)

    # for threshold in thresholds:
    #     acc, pre, fall = cv_threshold(df_train_X, df_train_y, threshold=threshold)
    #     accs.append(acc)
    #     pres.append(pre)
    #     falls.append(fall)

    # fig, ax = plt.subplots(figsize=(12, 6))
    # #ax.plot(thresholds, accs)
    # ax.plot(pres, falls)
    # ax.set_xlabel("threshold")
    # ax.set_ylabel("pres")
    # ax.set_title("precision")

    #predict test_X with threshold 0.53
    X_train = df_train_X.to_numpy()
    y_train = df_train_y.to_numpy()
    X_test = df_test_X.to_numpy()
    #model = LogisticRegression()

    #scores, arr = analyze_RF(X_train, y_train)

    model = RandomForest(num_trees=120, num_features=3)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    submit_prediction(y_hat)
Example #16
class Classifier(object):

    # classifiers with parameters
    LogReg = 1
    Norm = 2
    GMM = 3
    kNN = 4
    LinReg = 5
    Perceptron = 6
    MLP = 7
    SVM = 8
    DecisionTree = 9
    RandomForest = 10

    # classifiers without parameters
    NaiveBayes = 100
    Gauss = 101

    def __init__(self, classifier, parameters, featurespace):
        super(Classifier, self).__init__()

        self.__classifier = classifier
        self.__parameters = parameters
        self.__featurespace = featurespace
        self.__clf = None

    def copy(self):
        return Classifier(self.__classifier, self.__parameters,
                          self.__featurespace)

    def initialize(self):
        self.__samples, self.__labels = self.__featurespace.getSamples()
        if len(self.__labels) == 0:
            self.__clf = None
            return

        if self.__classifier == self.LogReg:
            maxIter = self.__parameters.getLogRegMaxNumIterations()
            learningRate = self.__parameters.getLogRegLearningRate()
            self.__clf = LinearLogisticRegression(learningRate=learningRate,
                                                  maxIterations=maxIter)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.Norm:
            norm = self.__parameters.getNormNorm()
            self.__clf = NormClassifier(norm)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.NaiveBayes:
            #			self.__clf = naive_bayes.GaussianNB()
            #			self.__clf.fit(self.__samples, self.__labels)
            self.__clf = GaussianClassifier(samplesIndependent=True)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.Gauss:
            self.__clf = GaussianClassifier(samplesIndependent=False)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.GMM:
            numComponents = self.__parameters.getGmmNumComponentsPerClass()
            maxIterations = self.__parameters.getGmmMaxNumIterations()
            self.__clf = GMMClassifier(numComponents, maxIterations)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.kNN:
            algo = self.__parameters.getKNNAlgorithm()
            k = self.__parameters.getKNNNumberOfNeighbors()
            w = self.__parameters.getKNNWeightFunction()
            if algo == 'scikit-learn':
                self.__clf = neighbors.KNeighborsClassifier(k, weights=w)
            else:  # 'own'
                if k == 1:
                    self.__clf = NearestNeighbor()
                else:
                    self.__clf = kNearestNeighbor(k)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.LinReg:
            lossFunc = self.__parameters.getLinRegLossFunction()
            a = self.__parameters.getLinRegLossFunctionParam()
            self.__clf = LinearRegression(lossFunc, a, True)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.Perceptron:
            maxIter = self.__parameters.getPerceptronMaxNumIterations()
            learningRate = self.__parameters.getPerceptronLearningRate()
            batchMode = self.__parameters.getPerceptronBatchMode()
            self.__clf = Perceptron(batchMode=batchMode,
                                    learningRate=learningRate,
                                    maxIterations=maxIter)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.MLP:
            layers = self.__parameters.getMLPHiddenLayers()
            act = self.__parameters.getMLPActivationFunction()
            algo = self.__parameters.getMLPOptimizationAlgorithm()
            alpha = self.__parameters.getMLPAlpha()
            rate = self.__parameters.getMLPLearningRate()
            self.__clf = sklearn.neural_network.MLPClassifier(
                hidden_layer_sizes=layers,
                activation=act,
                solver=algo,
                alpha=alpha,
                learning_rate=rate)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.SVM:
            algorithm = self.__parameters.getSVMAlgorithm()
            kernel = self.__parameters.getSVMKernel()
            C = self.__parameters.getSVMC()
            gamma = self.__parameters.getSVMGamma()
            coef0 = self.__parameters.getSVMCoef0()
            degree = self.__parameters.getSVMDegree()
            if algorithm == 'LinearSVC':
                self.__clf = svm.LinearSVC(C=C)
            elif algorithm == 'SVC':
                self.__clf = svm.SVC(kernel=kernel,
                                     C=C,
                                     gamma=gamma,
                                     coef0=coef0,
                                     degree=degree)
            elif algorithm == 'HardMarginSVM':
                self.__clf = HardMarginSVM()
            elif algorithm == 'SoftMarginSVM':
                self.__clf = SoftMarginSVM(C=C)
            else:
                self.__clf = KernelSVM(C=C, gamma=gamma)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.DecisionTree:
            algorithm = self.__parameters.getDecisionTreeAlgorithm()
            criterion = self.__parameters.getDecisionTreeCriterion()
            splitter = self.__parameters.getDecisionTreeSplitter()
            maxDepth = self.__parameters.getDecisionTreeMaxDepth()
            minSamplesSplit = self.__parameters.getDecisionTreeMinSamplesSplit()
            minSamplesLeaf = self.__parameters.getDecisionTreeMinSamplesLeaf()
            minWeightedFractionLeaf = self.__parameters.getDecisionTreeMinWeightedFractionLeaf()
            maxLeafNodes = self.__parameters.getDecisionTreeMaxLeafNodes()
            trials = self.__parameters.getDecisionTreeNumTrialsPerSplit()
            if algorithm == 'sklearn':
                self.__clf = tree.DecisionTreeClassifier(
                    criterion=criterion,
                    splitter=splitter,
                    max_features=2,
                    max_depth=maxDepth,
                    min_samples_split=minSamplesSplit,
                    min_samples_leaf=minSamplesLeaf,
                    min_weight_fraction_leaf=minWeightedFractionLeaf,
                    max_leaf_nodes=maxLeafNodes)
            else:
                self.__clf = DecisionTree(maxDepth, minSamplesLeaf, trials)
            self.__clf.fit(self.__samples, self.__labels)

        elif self.__classifier == self.RandomForest:
            algorithm = self.__parameters.getRandomForestAlgorithm()
            numTrees = self.__parameters.getRandomForestNumTrees()
            criterion = self.__parameters.getRandomForestCriterion()
            maxDepth = self.__parameters.getRandomForestMaxDepth()
            minSamplesSplit = self.__parameters.getRandomForestMinSamplesSplit()
            minSamplesLeaf = self.__parameters.getRandomForestMinSamplesLeaf()
            minWeightedFractionLeaf = self.__parameters.getRandomForestMinWeightedFractionLeaf()
            maxLeafNodes = self.__parameters.getRandomForestMaxLeafNodes()
            trials = self.__parameters.getRandomForestNumTrialsPerSplit()
            # print('Num trees: {0}'.format(numTrees))
            # print('Max depth: {0}'.format(maxDepth))
            # print('Min samples split: {0}'.format(minSamplesSplit))
            # print('Min samples leaf: {0}'.format(minSamplesLeaf))
            # print('Min weighted fraction leaf: {0}'.format(minWeightedFractionLeaf))
            # print('Max leaf nodes: {0}'.format(maxLeafNodes))
            # print('Num trials per node: {0}'.format(trials))
            if algorithm == 'sklearn':
                self.__clf = ensemble.RandomForestClassifier(
                    n_estimators=numTrees,
                    criterion=criterion,
                    max_features=2,
                    max_depth=maxDepth,
                    min_samples_split=minSamplesSplit,
                    min_samples_leaf=minSamplesLeaf,
                    min_weight_fraction_leaf=minWeightedFractionLeaf,
                    max_leaf_nodes=maxLeafNodes)
            else:
                self.__clf = RandomForest(numTrees, maxDepth, minSamplesLeaf,
                                          trials)
            self.__clf.fit(self.__samples, self.__labels)

        else:
            print("unsupported classifier")

    def runFeatureSpaceComputations(self):
        if self.__clf:
            x_min, y_min, x_max, y_max = self.__featurespace.coordinateSystem.getLimits()
            ppuX, ppuY = self.__featurespace.coordinateSystem.getPixelsPerUnit()

            stepsize = 1.0 / ppuX
            xrange = numpy.arange(x_min, x_max, stepsize)
            w = len(xrange)

            stepsize = 1.0 / ppuY
            yrange = numpy.arange(y_max, y_min, -stepsize)
            h = len(yrange)

            xx, yy = numpy.meshgrid(xrange, yrange)

            data = numpy.c_[xx.ravel(), yy.ravel()]

            Z = self.__clf.predict(data)

            if Z is None:
                return None

            Z = Z.astype(numpy.int64)
            for k in range(Parameters.NUMBER_SUPPORTED_CLASSES):
                col, _, _ = MyColors.rgbForClass(k)
                Z = numpy.where(Z == k, col, Z)
            Z = Z.astype(numpy.int32)

            img = QtGui.QImage(Z, w, h, QtGui.QImage.Format_RGB32)
            # img.save('test.png')

            return img

        else:
            return None
Example #17
def main(cv=False, kaggle=True, num_Trees=10, verbose=False):
    X = []
    y = []
    # Load data set
    with open("hw4-data.csv") as f:
        next(f, None)
        for line in csv.reader(f, delimiter=","):
            X.append(line[:-1])
            y.append(line[-1])

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=int)

    # Split training/test sets
    # You need to modify the following code for cross validation
    if cv:
        K = 10
        cv_accuracy = []
        for ii in range(K):
            X_train = np.array([x for i, x in enumerate(X) if i % K != ii],
                               dtype=float)
            y_train = np.array([z for i, z in enumerate(y) if i % K != ii],
                               dtype=int)
            X_test = np.array([x for i, x in enumerate(X) if i % K == ii],
                              dtype=float)
            y_test = np.array([z for i, z in enumerate(y) if i % K == ii],
                              dtype=int)

            randomForest = RandomForest(num_trees=num_Trees, verbose=verbose)
            t0 = time()
            randomForest.fit(X_train, y_train)
            t1 = time()
            print("time elapsed = %.3f s" % (t1 - t0))

            y_predicted = randomForest.predict(X_test)

            results = [prediction == truth
                       for prediction, truth in zip(y_predicted, y_test)]

            # Accuracy
            accuracy = float(results.count(True)) / float(len(results))
            print("test accuracy: %.4f" % accuracy)
            cv_accuracy.append(accuracy)
        print("average cv accuracy: %.4f" % np.mean(cv_accuracy))
    else:
        ii = 3
        K = 10
        X_train = np.array([x for i, x in enumerate(X) if i % K != ii],
                           dtype=float)
        y_train = np.array([z for i, z in enumerate(y) if i % K != ii],
                           dtype=int)
        X_test = np.array([x for i, x in enumerate(X) if i % K == ii],
                          dtype=float)
        y_test = np.array([z for i, z in enumerate(y) if i % K == ii],
                          dtype=int)
        if kaggle:
            randomForest = RandomForest(num_trees=num_Trees, verbose=verbose)
            t0 = time()
            # randomForest.fit(X_train, y_train)
            randomForest.fit(X, y)  # use the full data
            t1 = time()
            print("time elapsed = %.3f s" % (t1 - t0))
            # y_predicted = randomForest.predict(X_test)
            # results = [prediction == truth
            #            for prediction, truth in zip(y_predicted, y_test)]
            # # Accuracy
            # accuracy = float(results.count(True)) / float(len(results))
            # print("test accuracy: %.4f" % accuracy)
            generateSubmissionFile(myname, randomForest)
        else:
            randomForest = RandomForest(num_trees=num_Trees, verbose=verbose)
            t0 = time()
            randomForest.fit(X_train, y_train)
            t1 = time()
            print("time elapsed = %.3f s" % (t1 - t0))
            y_predicted = randomForest.predict(X_test)
            results = [prediction == truth
                       for prediction, truth in zip(y_predicted, y_test)]
            accuracy = float(results.count(True)) / float(len(results))
            print("test accuracy: %.4f" % accuracy)