Example No. 1
print('f1 score for DecisionTreeClassifier', f1_tree)
print('precision for DecisionTreeClassifier', precision_tree)
print('recall for DecisionTreeClassifier', recall_tree)

if precision_tree >= min_precision and recall_tree >= min_recall:
    print('DecisionTree is a good classifier with set parameters')
else:
    print(
        'Low precision or recall, DecisionTree is not a good classifier with set parameters'
    )

print(
    '################### Try DecisionTreeClassifier ###################################'
)
# DecisionTree - 1
tree_clf_1 = tree.DecisionTreeClassifier(random_state=0)

# create feature union
features_pipeline = []
features_pipeline.append(('pca', PCA(n_components=3)))
features_pipeline.append(('select_best', SelectKBest(k=k)))
feature_union = FeatureUnion(features_pipeline)

# Create a pipeline with feature selection and classification
pipe = Pipeline([('feature_union', feature_union),
                 ('feature_selection', SelectKBest(k=k)),
                 ('classification', tree_clf_1)])

# Check the parameters that can be set for DecisionTree Classifier, and create a param_grid
estimated = tree_clf_1.get_params().keys()
print('param_keys########################', estimated)
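The comment above promises a param_grid that the snippet never builds; a minimal sketch under the assumption that the pipeline step name 'classification' from above is kept (the value ranges are illustrative):

from sklearn.model_selection import GridSearchCV

# Illustrative grid over the 'classification' step of the pipeline above;
# parameters of a pipeline step are addressed as <step name>__<parameter>.
param_grid = {
    'classification__max_depth': [3, 5, 10, None],
    'classification__min_samples_split': [2, 10, 50],
    'classification__criterion': ['gini', 'entropy'],
}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='f1')
# grid.fit(features_train, labels_train)  # training arrays are not shown in this snippet
# print(grid.best_params_)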
Example No. 2
def main():
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index in range(len(my_reviews)):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(my_reviews[index])

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:

        if record['specific'] == 'yes':
            count_specific += 1

        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    print('specific percentage: %f%%' % (100.0 * count_specific / len(my_records)))
    print('generic percentage: %f%%' % (100.0 * count_generic / len(my_records)))

    my_labels = numpy.array([record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n_splits=5)  # modern sklearn.model_selection API

    for i in range(len(classifiers)):
        for train, test in cv.split(Xtrans):
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf = classifiers[i]
            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for classifier, score in zip(classifiers, scores):
        print("%s: Mean(scores)=%.5f\tStddev(scores)=%.5f"
              % (type(classifier).__name__, numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)
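The manual KFold loop in main() can also be written more compactly with cross_val_score; a sketch assuming the same my_metrics / my_labels arrays:

from sklearn.model_selection import cross_val_score

# Equivalent 5-fold evaluation in one call per classifier
for clf in classifiers:
    cv_scores = cross_val_score(clf, my_metrics, my_labels, cv=5)
    print('%s: mean=%.5f std=%.5f'
          % (type(clf).__name__, cv_scores.mean(), cv_scores.std()))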
Example No. 3
import pandas as pd
import matplotlib.pyplot as plt
from skompiler import skompile
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dia=pd.read_csv("10.1 diabetes.csv.csv")
df=dia.copy()
df=df.dropna()
y=df["Outcome"]
X=df.drop(["Outcome"],axis=1)
#X=df["Pregnancies"]
X=pd.DataFrame(X)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

cart=DecisionTreeClassifier()
cart_model=cart.fit(X_train,y_train)
print(skompile(cart_model.predict))
y_pred=cart_model.predict(X_test)
print(accuracy_score(y_test,y_pred))
cart_grid={"max_depth":range(1,10),
           "min_samples_split":range(2,50)}
cart=tree.DecisionTreeClassifier()
cart_cv=GridSearchCV(cart,cart_grid,cv=10,n_jobs=-1,verbose=2)
cart_cv_model=cart_cv.fit(X_train,y_train)
print("en iyi parametreler:"+str(cart_cv_model.best_params_))
cart=tree.DecisionTreeClassifier(max_depth=5,min_samples__split=19)
cart_tuned=cart.fit(X_train,y_train)
y_pred=cart_tuned.predict(X_test)
print(accuracy_score(y_test,y_pred))
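An alternative to re-typing the tuned values by hand (the step that caused the keyword typo fixed above) is to reuse the refit estimator from the grid search; a small sketch:

# The grid search refits the best model on X_train by default, so the tuned
# estimator can be reused directly instead of re-typing its parameters.
cart_tuned = cart_cv_model.best_estimator_
y_pred = cart_tuned.predict(X_test)
print(accuracy_score(y_test, y_pred))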
principalComponents2 = pca1.transform(test_set)
print("This is important principal components")
print(pca1.explained_variance_ratio_)
principalDf = pd.DataFrame(data=principalComponents1,
                           columns=[
                               'principal component 1',
                               'principal component 2',
                               'principal component 3',
                               'principal component 4',
                               'principal component 5',
                               'principal component 6', 'principal component 7'
                           ])

#decision tree method
print("****THIS PART IS DECISION TREE CLASSIFICATION****")
data = tree.DecisionTreeClassifier()
data = data.fit(
    train[[
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'ca', 'thal'
    ]], train['heartdisease'])
predictions_data = data.predict(test[[
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal'
]])
predictright = 0
for i in range(predictions_data.shape[0]):
    if predictions_data[i] == test.iloc[i][13]:
        predictright += 1
accuracy = predictright / predictions_data.shape[0]
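The counting loop above is equivalent to sklearn's accuracy_score; a small sketch, assuming column 13 of test is the 'heartdisease' label column being compared against:

from sklearn.metrics import accuracy_score

# Equivalent accuracy computation against the true labels
accuracy = accuracy_score(test['heartdisease'], predictions_data)
print("decision tree accuracy:", accuracy)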
import pandas as pd
from sklearn import tree

data = pd.read_csv("data/_ea07570741a3ec966e284208f588e50e_titanic.csv",
                   index_col='PassengerId')
data = data[['Pclass', 'Fare', 'Age', 'Sex', 'Survived']]
data = data.dropna()
data.loc[data['Sex'] != 'female', 'Sex'] = 0
data.loc[data['Sex'] == 'female', 'Sex'] = 1
print(data)

X = data[['Pclass', 'Fare', 'Age', 'Sex']]
Y = data['Survived']

clf = tree.DecisionTreeClassifier(random_state=241)
clf.fit(X, Y)
importances = clf.feature_importances_

print(importances.round(4))
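A small follow-up that pairs each importance with its column name from the X defined above:

# Pair each importance with its feature name for readability
for name, importance in zip(X.columns, importances):
    print('%s: %.4f' % (name, importance))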
Example No. 6
def MaildecisionTree(vec, lab):
    mode = tree.DecisionTreeClassifier(criterion='gini')
    mode.fit(vec, lab)
    res = mode.predict(vec)
    print("Accuracy: " + str(getAc(res, lab)))
Example No. 7
 def setUp(self):
     self.tmp_fn = 'Tmp'
     self.iris = load_iris()
     self.n_features = len(self.iris.data[0])
     self.clf = tree.DecisionTreeClassifier(random_state=0)
     self.clf.fit(self.iris.data, self.iris.target)
Image(graph[0].create_png())
#############################################################################################
#############################################################################################

#############################################################################################
#############################################################################################
############# decision tree ##################################################################
#############################################################################################
#############################################################################################
from sklearn import tree

X_train, X_test, y_train, y_test = train_test_split_for_oneclass(
    data_X, data_Y)

#making the instance
model_DT = tree.DecisionTreeClassifier(criterion='gini')
#Hyper Parameters Set
params = {
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 3, 4, 5, 10, 20],
    'min_samples_split': [2, 3, 4, 5, 10, 50, 100, 200],
    'min_samples_leaf': [2, 3, 4, 5, 10, 100],
    'random_state': [random_state]
}

#Making models with hyper parameters sets
model_DT = GridSearchCV(model_DT,
                        param_grid=params,
                        n_jobs=-1,
                        cv=10,
                        scoring='roc_auc')
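The snippet stops before the search is run; a short continuation sketch, assuming the X_train / y_train arrays produced by the split above:

# Fit the grid search and inspect the winning configuration
model_DT.fit(X_train, y_train)
print(model_DT.best_params_)
print(model_DT.best_score_)  # mean cross-validated ROC AUC of the best parameter set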
Example No. 9
 def test_with_invalid_k(self):
     with self.assertRaises(AssertionError):
         MRBBagging(0, tree.DecisionTreeClassifier())
Example No. 10
def get_classifiers():
    dict_clfs = {}

    dict_clfs[CLF_TYPES.RandomForestClassifier50] = RandomForestClassifier(n_estimators=50, n_jobs=12)

    dict_clfs[CLF_TYPES.RandomForestClassifier5] = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=12)

    dict_clfs[CLF_TYPES.BernoulliRBM] = Pipeline(steps=[('rbm', BernoulliRBM(n_components=200,
                                                                             n_iter=1,
                                                                             learning_rate=0.01,
                                                                             verbose=False)),
                                                        ('logistic', LogisticRegression(C=10000))])

    dict_clfs[CLF_TYPES.MLPClassifier] = MLPClassifier(hidden_layer_sizes=(75,), max_iter=250, alpha=1e-4, solver='sgd', verbose=0, tol=1e-4,
                                                       random_state=RANDOM_SEED, learning_rate_init=.1, early_stopping=True)

    kwargs = dict(n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=RANDOM_SEED)
    dict_clfs[CLF_TYPES.AdaBoostClassifier] = AdaBoostClassifier(**kwargs)

    kwargs = {'algorithm': 'auto', 'leaf_size': 5, 'metric': 'minkowski', 'n_jobs': 12, 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.KNN] = KNeighborsClassifier(**kwargs)

    kwargs = dict(learning_rate=0.1)
    # kwargs = {}
    dict_clfs[CLF_TYPES.GBC] = GradientBoostingClassifier(**kwargs)

    dict_clfs[CLF_TYPES.GNB] = GaussianNB()

    kwargs = {'alpha': 0.10526315789473684}
    # kwargs = {}
    dict_clfs[CLF_TYPES.MultinomialNB] = MultinomialNB(**kwargs)

    # kwargs = None
    kwargs = {'alpha': 0.10526315789473684, 'norm': False}
    # kwargs = {}
    dict_clfs[CLF_TYPES.ComplementNB] = ComplementNB(**kwargs)

    # kwargs = None
    kwargs = {'alpha': 0.05263157894736842, 'binarize': 0.9473684210526315}
    # kwargs = {}
    dict_clfs[CLF_TYPES.BernoulliNB] = BernoulliNB(**kwargs)

    kwargs = {'criterion': 'gini', 'max_depth': 1.0, 'max_features': 3, 'min_samples_leaf': 0.4, 'min_samples_split': 0.01, 'min_weight_fraction_leaf': 0.4, 'random_state': 42, 'splitter': 'best'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.DecisionTreeClassifier] = tree.DecisionTreeClassifier(**kwargs)

    dict_clfs[CLF_TYPES.ExtraTreeClassifier] = tree.ExtraTreeClassifier()

    kwargs = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 42, 'probability': True}
    # kwargs = {}
    dict_clfs[CLF_TYPES.SVC] = svm.SVC(**kwargs)

    kwargs = {'C': 1.0, 'dual': False, 'fit_intercept': True, 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.LogReg100] = LogisticRegression(**kwargs)
    dict_clfs[CLF_TYPES.LogReg10k] = LogisticRegression(C=10000)

    return dict_clfs
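A hedged usage sketch for the dictionary returned above; the evaluate_classifiers helper and its X / y arguments are placeholders, and the Naive Bayes variants assume non-negative features:

from sklearn.model_selection import cross_val_score


def evaluate_classifiers(X, y, cv=5):
    # Hypothetical helper: X and y are feature/label arrays supplied by the caller
    for clf_type, clf in get_classifiers().items():
        cv_scores = cross_val_score(clf, X, y, cv=cv)
        print('%s: %.4f +/- %.4f' % (clf_type, cv_scores.mean(), cv_scores.std()))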
Example No. 11
# -------------------------------------------------------------------------------------------------------------------------#
# Define the classifiers in a dictionary
clf_init = None
clfs =	{
    #Naive Bayes Classifier
    'NBS' : GaussianNB(),

    #Random Forest
    'RF':   RandomForestClassifier(n_estimators=100),

    # K nearest neighbors
    'KNN':  KNeighborsClassifier(n_neighbors=10,  weights='uniform', algorithm='auto', p=2, metric='minkowski'),

    # CART decision trees
    'CART': tree.DecisionTreeClassifier(min_samples_split=50, random_state=99,criterion='gini'),

    # AdaBoost with a decision tree base estimator
    'ADAB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,random_state=99,criterion='gini'),algorithm="SAMME",n_estimators=100),

    # MLP multi-layer perceptron
    'MLP' : MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200,10), random_state=3, learning_rate = 'adaptive'),

    #Gradient boosting classifier
    'GBC' : GradientBoostingClassifier( loss='deviance', learning_rate=0.1, n_estimators=100, subsample=0.3,min_samples_split=2,
                                        min_samples_leaf=1, max_depth=1, init=clf_init,random_state=1, max_features=None, verbose=0),

    #Bagging with KNearestNeighbours
    'BC With KNN'  : BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5),

    # Extra Trees Classifier
Example No. 12
def GraphdecisionTree(trainvec, trainlab, testvec, testlab):
    mode = tree.DecisionTreeClassifier(criterion='gini')
    mode.fit(trainvec, trainlab)
    res = mode.predict(testvec)
    print("Accuracy: " + str(getAc(res, testlab)))
                    continue

                if 2 not in cluster_datasets[index]:
                    continue
                X_cluster = cluster_datasets[index][0]
                Y_cluster = cluster_datasets[index][1]
                X_cluster_unlabelled = cluster_datasets[index][2]
                index_list = cluster_datasets[index][3]

                trash = False
                if len(np.unique(Y_cluster)) == 1  and len(X_cluster_unlabelled) < 10: #and len(Y_cluster) > 10
                    continue


                if not trash:
                    model_cluster = tree.DecisionTreeClassifier()
                    model_cluster.fit(X_cluster, Y_cluster)
                    Y_cluster_guess = model_cluster.predict(X_cluster_unlabelled)
                    Y_cluster_guess_proba = model_cluster.predict_proba(X_cluster_unlabelled)
                    model_cluster = None
                    for index2 in range(0, len(Y_cluster_guess)):
                        label_index = index_list[index2]
                        if np.max(Y_cluster_guess_proba[index2]) > 0.99:
                            labels_[label_index] = Y_cluster_guess[index2]
                        else:
                            labels_[label_index] = -1
                else:
                    for index2 in range(0, len(X_cluster_unlabelled)):
                        label_index = index_list[index2]
                        labels_[label_index] = -1
Example No. 14
    'SVC': {
        'resampler': resamplers,
        'classifier': [SVC(random_state=RANDOM_STATE)],
        'classifier__kernel': ['rbf', 'linear'],
        'classifier__C': [0.1, 1.0, 10, 100, 1000]
        # 'classifier__C': [0.1, 1.0, 10]
    },
    'KNeighborsClassifier': {
        'resampler': resamplers,
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [1, 2, 5, 10, 20],
        'classifier__weights': ['uniform', 'distance']
    },
    'DecisionTreeClassifier': {
        'resampler': resamplers,
        'classifier': [tree.DecisionTreeClassifier(random_state=RANDOM_STATE)],
        'classifier__max_depth': [None, 2, 3, 5, 10],
        'classifier__min_samples_leaf': [2, 5, 10]
    },
    'RandomForestClassifier': {
        'resampler': resamplers,
        'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__n_estimators': [10, 50, 100, 200]
    }
}
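One hedged way such a grid map could be consumed, assuming the dict above is the PARAM_GRID_MAP named in the commented-out block below and that an imbalanced-learn Pipeline provides the 'resampler' and 'classifier' steps:

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# Placeholder pipeline whose step names match the grid keys above
pipe = Pipeline([('resampler', None), ('classifier', tree.DecisionTreeClassifier())])
search = GridSearchCV(pipe, param_grid=PARAM_GRID_MAP['DecisionTreeClassifier'],
                      cv=5, scoring='f1', n_jobs=-1)
# search.fit(X_train, y_train)  # training data is not part of this snippet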

# PARAM_GRID_MAP = {
#     'SVC None': {
#         'resampler': [None],
#         'classifier': [SVC(random_state=RANDOM_STATE)],
#         'classifier__C': [0.1, 1.0, 10, 100, 1000]
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

neigh = KNeighborsClassifier(n_neighbors=10, weights='distance')
neigh.fit(features_train, labels_train)
outcome_knn = neigh.predict(features_test)
print " KNN accuracy >> ", accuracy_score(outcome_knn, labels_test)  #94%

ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=5),
                         algorithm="SAMME",
                         n_estimators=200)
ada.fit(features_train, labels_train)
outcome_ada = ada.predict(features_test)
print "ADA accuracy >> ", accuracy_score(outcome_ada, labels_test)  # 92.4%

rf = RandomForestClassifier(bootstrap=True,
                            max_depth=2,
                            max_features='auto',
                            n_estimators=200)
rf.fit(features_train, labels_train)
outcome_rf = rf.predict(features_test)
outcome_rf_prb = rf.predict_proba(features_test)
print "RF accuracy >> ", accuracy_score(outcome_rf, labels_test)  #
#print "RF probabilities >> ",outcome_rf_prb
Example No. 16
 def test__group_data_with_none(self):
     mrbbagging = MRBBagging(1, tree.DecisionTreeClassifier())
     x = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
     y = ["A", None, "C"]
     with self.assertRaises(AssertionError):
         mrbbagging._group_data(x, y)
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
# features_train, features_test, labels_train, labels_test = preprocess()
features_train, labels_train, features_test, labels_test = makeTerrainData()

#########################################################
### your code goes here ###
from sklearn import tree
from sklearn.metrics import accuracy_score

clear()

print("Start execution")

# min_samples_split = 50
classifier = tree.DecisionTreeClassifier(min_samples_split=50)
classifier.fit(features_train, labels_train)

prediction = classifier.predict(features_test)

pictureName = "decision_tree_classifier_bigger.png"

accuracy = accuracy_score(labels_test, prediction)

print(accuracy)

prettyPicture(classifier, features_test, labels_test, pictureName)

show_img(pictureName)

#########################################################
Example No. 18
from pandas import read_csv
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz
from obprueba import Diabetes as odi

reglog = linear_model.LogisticRegression()
navidad = tree.DecisionTreeClassifier()
archivo = 'dataset_final.csv'

df = read_csv(archivo)

# print(df)

arreglox = df[df.columns[1:-1]].to_numpy()
arregloy = df[df.columns[-1]].to_numpy()

# print(arregloy)

# experimenting with the model

X_train, X_test, y_train, y_test = train_test_split(arreglox, arregloy)

entrena = navidad.fit(X_train, y_train)  # reglog.fit(X_train, y_train)
entrena2 = reglog.fit(X_train, y_train)  # reglog.fit(X_train, y_train)

print(entrena)
print(str(entrena.score(X_test, y_test)) + ' decision tree score')

print(entrena2)
print(str(entrena2.score(X_test, y_test)) + ' logistic regression score')
Example No. 19
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.3,
                                                    random_state=0)
predictions = {}

# classifier - logistic regression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["Logistic Regression"] = predict

# classifier -  decision trees

classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["Decision Tree"] = predict

# classifier -  k neighbours

classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["K Neighbours"] = predict

# classifier -  naive bayes

classifier = GaussianNB()
classifier.fit(X_train, y_train)
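The snippet is cut off after the GaussianNB fit; a small sketch of how the collected predictions dict could then be scored against y_test:

from sklearn.metrics import accuracy_score

# Score every classifier's predictions that were collected above
for name, predicted in predictions.items():
    print('%s accuracy: %.3f' % (name, accuracy_score(y_test, predicted)))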
Example No. 20
    def getTrainAndTest(self):
        #df = pd.read_csv('H:\pc programming\Django(Prac)\ML\Classification\Classification\Review_Testing_Format.txt')
        df = pd.read_csv('Review_Testing_Format.txt')
        df.replace('?', -99999, inplace=True)
        df.drop(['id'], axis=1, inplace=True)

        X = np.array(df.drop(['class'], axis=1))
        y = np.array(df['class'])

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.10)

        #  Built-In Decision Tree
        self.clf_DTree = tree.DecisionTreeClassifier()
        self.clf_DTree.fit(X_train, y_train)
        accuracy = self.clf_DTree.score(X_test, y_test)
        print("Accuracy in Decision Tree: %s" % accuracy)

        #  Built-In K-Nearest Neighbour
        self.clf_KNN = neighbors.KNeighborsClassifier()
        self.clf_KNN.fit(X_train, y_train)
        accuracy = self.clf_KNN.score(X_test, y_test)
        print("Accuracy in KNN: %s" % accuracy)

        #  Built-In Support Vector Machine
        self.clf_SVM = svm.SVC()
        self.clf_SVM.fit(X_train, y_train)
        accuracy = self.clf_SVM.score(X_test, y_test)
        print("Accuracy in SVM: %s" % accuracy)

        Y = label_binarize(y, classes=['A', 'B', 'C'])
        n_classes = Y.shape[1]

        X_train, X_test, Y_train, Y_test = train_test_split(
            X,
            Y,
            test_size=.5,
        )
        classifier = OneVsRestClassifier(svm.LinearSVC(random_state=None))
        classifier.fit(X_train, Y_train)
        y_score = classifier.decision_function(X_test)
        # For each class
        precision = dict()
        recall = dict()
        average_precision = dict()
        '''
        for i in range(n_classes):
            average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

        average_precision["micro"] = average_precision_score(Y_test, y_score,
                                                             average="micro")
        average_precision["macro"] = average_precision_score(Y_test, y_score,
                                                             average="macro")
        average_precision["weighted"] = average_precision_score(Y_test, y_score,
                                                             average="weighted")
        print('Average precision score, micro-averaged over all classes: {0:0.2f}'
              .format(average_precision["micro"]))

        recall["micro"] = recall_score(Y_test, y_score,average="micro")
        print('Recall score, micro over all classes: {0:0.2f}'
              .format(recall["micro"]))
              '''
y = np.array(recipes['label'])

print("feature load done")

#x_train, x_test, y_train, y_test = tts(X, y, test_size=0.6)

#print("train-test done")

###classifiers

#clf_nb = MultinomialNB()
print("model start")

#clf_svm = svm.LinearSVC(verbose=True)
clf_lr = LogisticRegression(verbose=True)
clf_tree = tree.DecisionTreeClassifier()
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_nb = MultinomialNB()
#########

##model save
print("training start.........")
print(".")
print("tree start")
clf_tree.fit(X, y)
filename = 'tree.sav'
pickle.dump(clf_tree, open(filename, 'wb'))
print("tree done")

print(".")
print("lr start")
Example No. 22
    trees_str = gen_quant_trees_str(tree, precisions)

    with open(filename, 'w') as f:
        f.write(trees_str)


def get_tree_results(tree, Xtest):
    """
    Runs data through a quantized DecisionTreeClassifier
    :param tree: DTC function handle
    :param Xtest: data to test
    :returns: predicted results
    """
    results = [tree(X) for X in Xtest]
    return np.array([results], ndmin=1).T


if __name__ == '__main__':
    DIR = r'C:\Users\brady\GitHub\MinVAD\feature_extract'

    tr_data = np.load(os.path.join(DIR, 'train_130k.npy'))
    tr_class = np.load(os.path.join(DIR, 'train_130k_class.npy'))

    myData = np.hstack((tr_data, tr_class))
    np.random.shuffle(myData)

    cutoff = int(np.floor(0.8 * len(tr_class)))
    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(myData[:cutoff, :19], myData[:cutoff, 20])
    test_str = gen_quant_trees_str(clf, np.arange(16, 15, -1))
    print(test_str)