def TreeTest():
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)
    num_in_fold = []
    err_in_fold = []
    for i in range(len(all_folds) - 1):
        spam = all_folds[i]
        num_in_fold.append(len(spam))
        truth, f_data = decTree.split_truth_from_data(spam)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        print('Prediction...\n')
        predict = tree.predict(f_data)
        print(predict)
        print(truth)
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print('Tree error is: {}'.format(error))
    spam = all_folds[k - 1]
    truth, f_data = decTree.split_truth_from_data(spam)
    tree = decTree.TreeOptimal(max_depth=2)
    #tree = decTree.TreeRandom()
    tree.fit(f_data, truth)
    predict = tree.predict(f_data)
    error = 1. - hw3.get_accuracy(predict, truth)
    sum_training_err = 0.0
    for i in range(len(num_in_fold)):
        sum_training_err += err_in_fold[i]
    average_training_error = sum_training_err / len(num_in_fold)
    print('Average training error: {}\nAverage testing error: {}'.format(average_training_error, error))
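Note that both loops above fit and predict on the same fold, so the reported "testing" error is also an in-sample number. A held-out sketch with scikit-learn's KFold (assuming f_data and truth are numpy arrays and that decTree.TreeOptimal keeps the fit/predict interface used above):

import numpy as np
from sklearn.model_selection import KFold

def tree_cv_error(f_data, truth, k=10):
    errors = []
    for train_idx, test_idx in KFold(n_splits=k, shuffle=True).split(f_data):
        tree = decTree.TreeOptimal(max_depth=2)
        tree.fit(f_data[train_idx], truth[train_idx])
        predict = tree.predict(f_data[test_idx])
        # error on the held-out fold, not the training fold
        errors.append(np.mean(np.asarray(predict) != truth[test_idx]))
    return np.mean(errors)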
def d_tree():
    data = get_crime_data().to_numpy()  # .as_matrix() was removed in pandas 1.0
    X = data[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
    y = data[:, 8]

    xTr, xTe, yTr, yTe = train_test_split(X, y, test_size=0.4, random_state=0)

    dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    tree = dt.fit(xTr, yTr)
    preds = tree.predict(xTe)

    print "Train Accuracy :: ", accuracy_score(yTr, tree.predict(xTr))
    print "Test Accuracy  :: ", accuracy_score(yTe, preds)

    print("-- 10-fold cross-validation --")
    cv_dt = cross_val_score(dt, xTe, yTe, cv=10)

    print("mean: {:.3f} (std: {:.3f})".format(cv_dt.mean(), cv_dt.std()))
    '''
    Train Accuracy ::  0.9437
    Test Accuracy  ::  0.87695
    -- 10-fold cross-validation --
    mean: 0.876 (std: 0.007)
    '''
    #  ---- STAT TEST ----
    from scipy.stats import ttest_ind

    value, pvalue = ttest_ind(preds, yTe, equal_var=True)
    print(value, pvalue)
    # a p-value >= 0.05 only means the mean prediction is statistically
    # indistinguishable from the mean label; it is not evidence that the
    # classifier itself is good
    if pvalue >= 0.05:
        print('Prediction and label means are statistically indistinguishable')
    else:
        print('Prediction and label means differ significantly')
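A two-sample t-test on raw predictions and labels is a weak check for a classifier. A more direct sketch is a binomial test on the accuracy itself (assuming SciPy >= 1.7 for binomtest, and taking 0.5 as the chance rate of a balanced binary problem, which is an assumption about this dataset):

from scipy.stats import binomtest
import numpy as np

n_correct = int(np.sum(preds == yTe))
res = binomtest(n_correct, len(yTe), p=0.5, alternative='greater')
print(res.pvalue)  # small p-value: accuracy is significantly above chance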
Example #3
    def testRunWithIrisData(self):
        # Load data and store it into pandas DataFrame objects
        iris = load_iris()
        X = pd.DataFrame(iris.data[:, :], columns=iris.feature_names[:])
        y = pd.DataFrame(iris.target, columns=["Species"])

        # Defining and fitting a DecisionTreeClassifier instance
        tree = DecisionTreeClassifier(max_depth=2)
        tree.fit(X, y.values.ravel())  # ravel to avoid the column-vector warning

        # Creates dot file named tree.dot
        export_graphviz(tree,
                        out_file="../output/IrisOutput_DT.dot",
                        feature_names=list(X.columns),
                        class_names=iris.target_names,
                        filled=True,
                        rounded=True)

        sample_one_pred = int(tree.predict([[5, 5, 1, 3]]))
        sample_two_pred = int(tree.predict([[5, 5, 2.6, 1.5]]))
        print(
            f"The first sample most likely belongs to a {iris.target_names[sample_one_pred]} flower."
        )
        print(
            f"The second sample most likely belongs to a {iris.target_names[sample_two_pred]} flower."
        )
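The exported .dot file can then be rendered to an image, e.g. with the graphviz Python package (assumed installed) or the dot CLI:

import graphviz

with open("../output/IrisOutput_DT.dot") as f:
    graph = graphviz.Source(f.read())
# equivalent CLI: dot -Tpng IrisOutput_DT.dot -o IrisOutput_DT.png
graph.render("../output/IrisOutput_DT", format="png")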
Example #4
    def landmark_decision_tree(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.NaN

            import sklearn.tree

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            for train, test in kf.split(X, y):
                random_state = sklearn.utils.check_random_state(42)
                tree = sklearn.tree.DecisionTreeClassifier(
                    random_state=random_state)

                if len(y.shape) == 1 or y.shape[1] == 1:
                    tree.fit(X[train], y[train])
                else:
                    tree = OneVsRestClassifier(tree)
                    tree.fit(X[train], y[train])

                predictions = tree.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Decision Tree could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
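A minimal usage sketch for the helper above (assumes its surrounding module imports numpy as np, scipy, sklearn.model_selection, sklearn.utils, sklearn.metrics, and defines automl_log):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
print(landmark_decision_tree(X, y))  # mean accuracy of the 10-fold decision-tree landmark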
Example #5
def process_query(tree, query):
    # without this declaration the += below raises UnboundLocalError
    global classification_error_samples
    target_value = tree.predict(query)
    print_query(query)
    print("Predicted Target Value:", target_value, '\n')
    if target_value != query[TARGET]:
        classification_error_samples += 1
    plot_categorical_decision_tree(query)
Example #6
def get_best_tree(model, X, keep_scores=False):
    """
    Given a model of ensembled trees with an `estimators_` attribute,
    finds the tree that most closely resembles

    Parameters
    ----------
    model
    X

    Returns
    -------

    """
    overall_prediction = model.predict(X)

    predictions = dict()
    scores = dict()

    best_score, best_tree_number = -999, -999

    for tree_num, tree in enumerate(model.estimators_):
        predictions[tree_num] = tree.predict(X)
        new_score = tree.score(X, overall_prediction)
        scores[tree_num] = new_score

        if new_score > best_score:
            best_score = new_score
            best_tree_number = tree_num

    nearest_tree = model.estimators_[best_tree_number]

    if keep_scores:
        return best_tree_number, nearest_tree, scores
    return best_tree_number, nearest_tree
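A usage sketch under stated assumptions (a fitted ensemble with estimators_, here a RandomForestClassifier whose classes are the integer indices 0..k-1, so sub-tree predictions line up with ensemble predictions):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
rf = RandomForestClassifier(n_estimators=25, random_state=0).fit(X, y)
best_num, nearest = get_best_tree(rf, X)
print(best_num, nearest.score(X, rf.predict(X)))  # agreement with the ensemble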
Example #7
def check_accuracy(dt, dataset, test_percentage=20, num_repeats=3):
    training_set = copy.deepcopy(dataset)
    accuracies = []

    num_test_samples = int(len(training_set) / (100 / test_percentage))

    for i in range(num_repeats):
        random.shuffle(training_set)
        test_set = [
            training_set.pop(random.randrange(len(training_set)))
            for i in range(num_test_samples)
        ]

        # Fit the dt
        tree = dt.fit(training_set)
        values = [tree.predict(a) for a in test_set]
        expected = [a.classification for a in test_set]

        same = 0
        for i in range(len(values)):
            if values[i] == expected[i]:
                same += 1

        accuracies.append((same / len(values)) * 100)
        # return the test rows to the pool; the next iteration
        # reshuffles and draws a fresh test set
        training_set.extend(test_set)
    average_accuracy = sum(accuracies) / len(accuracies)
    std_dev = 0
    for a in accuracies:
        std_dev += ((a - average_accuracy)**2)
    std_dev = math.sqrt(1 / len(accuracies) * std_dev)

    print("Accuracy: {}, Std dev: {}".format(average_accuracy, std_dev))
    return (average_accuracy, std_dev)
Example #8
    def predict(self, X):
        """
        Function to run the BaggingClassifier on data points
        Input:
        X: pd.DataFrame with rows as samples and columns as features
        Output:
        y: pd.Series with rows corresponding to the output variable. The output
        variable in a row is the prediction for the sample in the corresponding
        row of X.
        """
        if isinstance(X, pd.DataFrame) and "y" in X.columns:
            X = X.drop(['y'], axis=1)

        all_predictions = [tree.predict(X) for tree in self.trees]

        # rows: samples, columns: per-tree predictions; majority vote via
        # bincount/argmax (labels must be non-negative integers)
        pred_arr = np.array(all_predictions).T
        y_hat = [np.argmax(np.bincount(i)) for i in pred_arr]

        return pd.Series(y_hat)
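The bincount/argmax line above is a per-sample majority vote over non-negative integer class labels; for a single sample it reduces to:

import numpy as np

row = np.array([1, 0, 1])           # three trees' votes for one sample
print(np.argmax(np.bincount(row)))  # -> 1, the majority class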
Example #9
File: core.py  Project: itaicaspi/AI
def k_fold_cross_validation(folds, noisy_folds, criteria, m):
    mean_accuracy = 0.0
    features_set_size = len(folds[0][0]) - 1
    num_folds = len(folds)  # this is the number of folds, not the fold size
    trees = []
    for test_fold_idx in range(num_folds):
        # train for all folds except for test_fold_idx
        X = []
        Y = []
        for train_fold_idx in range(num_folds):
            if train_fold_idx == test_fold_idx:
                continue
            X += [row[:-1] for row in noisy_folds[train_fold_idx]]
            Y += [row[-1] for row in noisy_folds[train_fold_idx]]
        classifier = FeaturesClassifier(criteria, m)
        tree = classifier.fit(X, Y, list(range(features_set_size)))

        # test for test_fold_idx
        X = [row[:-1] for row in folds[test_fold_idx]]
        Y = [row[-1] for row in folds[test_fold_idx]]
        results = tree.predict(X)
        correct = sum(1 for i in range(len(results)) if results[i] == Y[i])
        mean_accuracy += correct / float(len(results))
        trees += [tree]
    mean_accuracy /= float(len(folds))
    return trees, mean_accuracy
Example #10
def test_errors(treeName, tree, merged):
    errors = []
    ar = np.array(merged)
    for i in range(0, len(ar)):
        r = ar[i]
        # predict expects a 2-D array of samples; wrap the feature slice
        prediction = tree.predict([r[1:14]])[0]
        actual = r[14]
        error = actual - prediction
        if actual == 0.0:
            actual += 0.00001
        error /= actual
        errors.append(error)
    errorsArray = np.array(errors)
    print("Min error: " + str(np.min(abs(errorsArray))))
    print("Max error: " + str(np.max(abs(errorsArray))))
    print("Median error: " + str(np.median(errorsArray)))
    plt.figure()
    plt.suptitle("Histogram of Error: " + treeName + "\nMinError: " +
                 str(round(np.min(abs(errorsArray)), 4)) + ", Max error: " +
                 str(round(np.max(abs(errorsArray)), 4)) + ", Median error: " +
                 str(round(np.median(errorsArray), 4)))
    plt.hist(errorsArray, bins=100)
    fileName = treeName + "\\" + treeName + "ErrorHistogram.png"
    plt.savefig(fileName)
    return errors
Example #11
    def predict(self, df):
        """
        function that predicts the expected effect
        """
        if not hasattr(self, 'model'):
            raise Exception('Model not fitted.')

        # removing 'index' from index_cols if necessary
        index_cols = self.index_cols.copy()
        if 'new_index' in self.index_cols:
            index_cols.remove('new_index')

        # removing w from df, if necessary
        if not self.use_w_in_tree:
            for col in self.w_var:
                if col in df.columns:
                    index_cols += [col]

        # removing y from df, if necessary
        for col in self.y_var:
            if col in df.columns:
                index_cols += [col]

        preds = df.set_index(index_cols)[[]].copy()
        for index, tree in enumerate(self.model):
            if self.algorithm == 'propensity':
                preds[f'pred_tree_{index}'] = (pd.Series(
                    tree.apply(df.set_index(index_cols))).map(
                        self.propensity_score[index]).tolist())
            else:
                preds[f'pred_tree_{index}'] = tree.predict(
                    df.set_index(index_cols))
        preds['prediction'] = preds.mean(axis=1)

        return preds
Example #12
def suggest(session_id, results):
    global all_trees
    global all_data
    global all_y

    if session_id in all_trees:
        tree = all_trees[session_id]
        keys = range(1, len(results.keys()) + 1)
        data = []
        for key in keys:
            num_reviews = results[key]["reviews"]
            rating = results[key]["rating"]
            row = [num_reviews, rating]
            data.append(row)
        predictions = tree.predict(data)
        for i in range(1, len(predictions) + 1):
            prediction = predictions[i - 1]
            print(prediction)
            if prediction == 1:
                print("suggesting", i)
                return i
        print("-------- suggesting default")
        return 1
    else:
        print("---------- 1 suggesting default")
        return 1
Example #13
    def _calculate(self, X, y, logger, categorical):
        import sklearn.tree
        from sklearn.utils.multiclass import type_of_target

        # type(y) can never equal the strings 'binary'/'multiclass';
        # type_of_target(y) yields exactly those task-type strings
        if type_of_target(y) in ('binary', 'multiclass'):
            kf = sklearn.model_selection.StratifiedKFold(n_splits=5)
        else:
            kf = sklearn.model_selection.KFold(n_splits=5)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(
                    X.iloc[train] if hasattr(X, 'iloc') else X[train],
                    y.iloc[train] if hasattr(y, 'iloc') else y[train],
                )
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(
                    X.iloc[train] if hasattr(X, 'iloc') else X[train],
                    y.iloc[train] if hasattr(y, 'iloc') else y[train],
                )

            predictions = tree.predict(
                X.iloc[test] if hasattr(X, 'iloc') else X[test], )
            accuracy += sklearn.metrics.accuracy_score(
                predictions,
                y.iloc[test] if hasattr(y, 'iloc') else y[test],
            )
        return accuracy / 5
Example #14
def get_track(genre, tree):
    genre = str(genre)
    pl_list = [
        'new music friday', 'singled out', 'new music friday uk',
        'new music friday canada', 'new music friday au'
    ]
    rand_int = random.randint(0, 4)
    #goes to 'new music friday' playlist which is updated by spotify
    print(rand_int)
    print(f"the playlist: {pl_list[rand_int]}")

    playlist_id = search(
        pl_list[rand_int],
        'playlist')  #returns id of first search result in a string
    playlist_data = get(
        f'v1/playlists/{playlist_id}/tracks')  #array of song data

    num_array = np.linspace(0, 98, 99)
    int_array = [int(x) for x in num_array]

    # loop until a matching track is found or the candidate pool is exhausted
    while len(int_array) > 0:

        index = random.randint(0, len(int_array) - 1)
        n = int_array[index]
        print(index, n)

        track_id = playlist_data['items'][n]['track']['id']
        try:
            track_data = get(f'v1/audio-features/{track_id}')

            key = track_data['key']
            time_sig = track_data['time_signature']
            acst = track_data['acousticness']
            dance = track_data['danceability']
            enrg = track_data['energy']
            loud = track_data['loudness']
            tempo = track_data['tempo']
            speech = track_data['speechiness']
            instrum = track_data['instrumentalness']

            track_features = [
                speech, instrum, key, time_sig, acst, dance, enrg, loud, tempo
            ]
            #print(track_features)

            result = tree.predict([track_features])[0]
            #print(result)

        except Exception:
            result = 'invalid'
            #print(result)

        int_array = np.delete(int_array, index)
        print(result, track_id)

        if result == genre:
            print("hello")
            return f'https://open.spotify.com/embed/track/{track_id}'
Example #15
def calc_dna_dist_mat(
    model: t.Union[sklearn.ensemble.RandomForestClassifier,
                   sklearn.ensemble.RandomForestRegressor],
    X: np.ndarray,
) -> t.Tuple[np.ndarray, str, float]:
    """Calculate DNA distance matrix between trees."""
    inst_num = X.shape[0]
    dna = np.zeros((model.n_estimators, inst_num), dtype=X.dtype)

    for tree_ind, tree in enumerate(model.estimators_):
        dna[tree_ind, :] = tree.predict(X)

    if isinstance(model, sklearn.ensemble.RandomForestClassifier):
        # Shift Cohen's Kappa to prevent negative values, and also transform
        # it to a distance measure (i.e., the higher is the correlation, the
        # smaller will be the dna_dists value.)
        # Note: this distance measure is in [0, 2], with 0 being 'totally
        # equal' and 2 being 'totally distinct.'
        dna_dists = 1.0 - scipy.spatial.distance.pdist(
            X=dna, metric=sklearn.metrics.cohen_kappa_score)

        dist_formula = "1 - Cohen_kappa(x)"
        max_limit = 2.0

    else:
        dna_min, dna_max = np.quantile(dna, (0, 1), axis=0)
        dna = (dna - dna_min) / (1e-8 + dna_max - dna_min)

        dna_dists = scipy.spatial.distance.pdist(X=dna, metric="euclidean")

        dist_formula = "Euclidean_dist(x)"
        max_limit = inst_num**0.5

    return dna_dists, dist_formula, max_limit
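A tiny numeric check of the kappa-to-distance mapping above (assuming numpy, scipy and sklearn as imported by the surrounding module): identical prediction rows map to distance 0, and perfectly opposite binary rows to the stated maximum of 2.

import numpy as np
import scipy.spatial.distance
import sklearn.metrics

dna = np.array([[0, 1, 1, 0],
                [0, 1, 1, 0],
                [1, 0, 0, 1]])
dists = 1.0 - scipy.spatial.distance.pdist(X=dna, metric=sklearn.metrics.cohen_kappa_score)
print(dists)  # [0., 2., 2.] for the pairs (0,1), (0,2), (1,2)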
Example #16
    def predict(self, input_data):
        # first create a dictionary that will store the results
        results = {}
        for tree in self.random_forest:
            # get the result from one of the trees and add it to the appropriate element in dict
            tree_result = tree.predict(input_data)
            if tree_result in results:
                results[tree_result] += 1
            else:
                results[tree_result] = 1

        # IMPORTANT - following operations are required to make sure that the result is the same as obtained from scikit
        # the problem (class name, number of votes):
        # 0: 5, 1: 0, 2: 1, 3: 5
        # scikit result - 0 (even though 0 and 3 have the same number of votes)
        # my result - it depends on which value was presented first, so it can be 0 or 3

        # find maximal value
        max_value = max(results.values())
        # and use it to get all pairs that are equal
        max_result = [(key, value) for key, value in results.items()
                      if value == max_value]
        # at the end get element with the lowest key value
        chosen_class = min(max_result, key=lambda t: t[0])[0]

        return chosen_class
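The tie-break described in the comments, as a standalone check: with votes {0: 5, 1: 0, 2: 1, 3: 5}, classes 0 and 3 tie, and taking the lowest key reproduces scikit's choice.

results = {0: 5, 1: 0, 2: 1, 3: 5}
max_value = max(results.values())
tied = [key for key, value in results.items() if value == max_value]
print(min(tied))  # -> 0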
Example #17
def process_query(tree, query, index):
    global classification_error_samples
    target_value = tree.predict(query)
    print_query(query, index)
    print("Predicted Target Value : " + target_value, '\n')
    plot_categorical_decision_tree(index, query)
    return target_value
Example #18
def LandmarkDecisionTree(X, y, categorical):
    if not sps.issparse(X):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10

    else:
        return np.NaN
Example #19
def kfold_cross_validation(df, tree, folds_num, features, target):
    kf = KFold(n_splits=folds_num, shuffle=True)
    attributes = df[features]
    labels = df[target]
    accuracy = []
    precision = []
    recall = []
    f1score = []
    #RMSE = []
    scores = []
    for train_index, test_index in kf.split(attributes):
        # iterating kf.split() directly visits every fold; the original
        # next(kf.split(...), None) re-created the generator each pass
        # and always returned the first fold
        x_train = attributes.iloc[train_index]
        x_test = attributes.iloc[test_index]
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        model = tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred, average="weighted")) #labels=np.unique(y_pred) can be added to calculate the measure only for the labels that have predicted samples
        recall.append(recall_score(y_test, y_pred, average="weighted"))
        f1score.append(f1_score(y_test, y_pred, average="weighted"))
        #RMSE.append(root_mean_squared_error(y_test, y_pred))
        #accuracy.append(model.score(x_test,y_test))
    #print("Accuracy:",accuracy)
    #print("Avg accuracy:",np.mean(accuracy))
    scores = [np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1score)]
    #scores = [np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1score), np.mean(RMSE)]
    return scores
Example #20
def predictAndCreateDataFrame(tree, testFeatures, testData):
    import numpy as np
    prediction = tree.predict(testFeatures)
    passengerId = np.array(testData['PassengerId']).astype(int)
    solution = pd.DataFrame(prediction, passengerId, columns=['Survived'])
    solution.to_csv('solution.csv', index_label=['PassengerId'])
    return solution
Example #21
def oob_regression_mse_score(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) MSE for a scikit-learn random forest
    regressor. We learned the guts of scikit's RF from the BSD licensed
    code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    y = y_train.values if isinstance(y_train, pd.Series) else y_train

    n_samples = len(X)
    predictions = np.zeros(n_samples)
    n_predictions = np.zeros(n_samples)
    for tree in rf.estimators_:
        # NB: newer scikit-learn versions add an n_samples_bootstrap
        # argument to _generate_unsampled_indices
        unsampled_indices = _generate_unsampled_indices(
            tree.random_state, n_samples)
        tree_preds = tree.predict(X[unsampled_indices, :])
        predictions[unsampled_indices] += tree_preds
        n_predictions[unsampled_indices] += 1

    if (n_predictions == 0).any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        n_predictions[n_predictions == 0] = 1

    predictions /= n_predictions

    oob_score = mean_squared_error(y, predictions)
    return oob_score
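As a cross-check, scikit-learn's own OOB machinery should give a similar number when bootstrapping is on (a sketch on synthetic data; oob_prediction_ is the library's per-sample OOB prediction):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=300, noise=1.0, random_state=0)
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0).fit(X, y)
print(mean_squared_error(y, rf.oob_prediction_))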
Example #22
def plot_rf_mae(rf, X_test, y_test):
    """Plot MAE for all trees in a
    RandomForestRegressor sklearn object.

    :param rf: trained RandomForestRegressor
    :param X_test: X test DataFrame
    :param y_test: y test DataFrame
    """

    mae_trees = [
        mean_absolute_error(tree.predict(X_test), y_test)
        for tree in rf.estimators_
    ]

    index_trees = np.arange(len(rf.estimators_))
    mae_ens = mean_absolute_error(rf.predict(X_test), y_test)

    # Plotting time
    plt.figure(figsize=FIGSIZE)
    plt.bar(x=index_trees, height=mae_trees, color="cornflowerblue")
    plt.ylim(0, 10)
    plt.axhline(mae_ens,
                color="tomato",
                linewidth=3,
                linestyle="dashed",
                label="Random Forest MAE")
    plt.xticks(index_trees)
    plt.xlabel("Single Decision Tree")
    plt.ylabel("MAE")
    plt.legend()
Example #23
def Tree_graph(X_train, y_train, X_test, y_test, file_out, name,
               X_dev=None, y_dev=None,
               max_depth=5):
    x_axis = np.linspace(1, max_depth, num=max_depth)

    plt.figure(figsize=(10, 7))

    y_acc = np.zeros(x_axis.shape)
    y_rec_pos = np.zeros(x_axis.shape)
    y_rec_neg = np.zeros(x_axis.shape)

    for n in range(max_depth):
        print(n + 1)
        tree = model_tree_fit(X_train,y_train,X_dev,y_dev,X_test,y_test,file_out,\
                             max_depth=n+1,\
                             out = False)

        # unpack directly instead of shadowing the builtin `all`
        prec_all, rec_all, f1_all, __ = precision_recall_fscore_support(
            y_test, tree.predict(X_test))

        y_rec_pos[n] = rec_all[0]
        y_rec_neg[n] = rec_all[1]
        #print (accuracy_score(y_test,gb.predict(X_test)))
        y_acc[n] = accuracy_score(y_test, tree.predict(X_test))
        print('finished')

    plt.plot(x_axis,
             y_rec_pos,
             color='b',
             lw=3,
             alpha=0.7,
             label='Recall_positive')
    plt.plot(x_axis,
             y_rec_neg,
             color='g',
             lw=3,
             alpha=0.7,
             label='Recall_negative')
    plt.plot(x_axis, y_acc, color='r', lw=3, alpha=0.7, label='Accuracy')
    plt.title('Tree')
    plt.xlabel('depth')
    plt.ylabel('Metric, %')
    plt.legend(loc='upper right')
    plt.grid(True)

    path = 'Graphs/Tree_for_'
    plt.savefig(path + 'depth_' + str(max_depth) + name + '.png')
Example #24
    def fit(self, X, y):
        '''
        Trains the model
        Arguments:
            X is a n-by-d numpy array
            y is an n-dimensional numpy array
        '''
        n = len(X)
        self.k = len(np.unique(y))

        #build map of indices to classes
        a = 0
        convert_map = {}
        for i in np.unique(y):
            self.class_map[a] = i
            convert_map[i] = a
            a += 1

        #convert y to be labelled 0-(k-1) instead of 1-k
        for i in range(len(y)):
            y[i] = convert_map[y[i]]

        w = np.ones(n) / n

        for t in range(self.numBoostingIters):
            #fit based on weights
            tree = DecisionTreeClassifier(max_depth=self.maxTreeDepth)
            tree.fit(X, y, sample_weight=w)
            self.tree_array.append(tree)

            #calculate weighted training error
            y_preds = tree.predict(X)
            wrong_preds = np.nonzero(y_preds - y)
            epsilon = sum(w[index] for index in wrong_preds[0])

            #early stopping: give a perfect tree a large finite weight so
            #tree_array and beta_array stay the same length
            if epsilon == 0:
                epsilon = 1e-10
                self.beta_array.append(
                    0.5 * np.log((1 - epsilon) / epsilon) + np.log(self.k - 1))
                break

            #calculate beta
            beta = 0.5 * np.log((1 - epsilon)/epsilon) + np.log(self.k-1)
            self.beta_array.append(beta)

            #update all instance weights
            for i in range(len(w)):
                if i not in wrong_preds[0]:
                    w[i] = w[i] * np.exp(-1 * beta)
                else:
                    w[i] = w[i] * np.exp(beta)

            #normalize weight vector
            w = w / np.sum(w)

        #fix y
        for i in range(len(y)):
            y[i] = self.class_map[y[i]]
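A hypothetical companion predict for the boosted ensemble above (a sketch, not the class's actual method): each tree casts a vote weighted by its beta, and the winning internal class index is mapped back through class_map.

    def predict(self, X):
        # weighted vote over the boosted trees
        votes = np.zeros((len(X), self.k))
        for tree, beta in zip(self.tree_array, self.beta_array):
            for i, p in enumerate(tree.predict(X)):
                votes[i, int(p)] += beta
        # map internal indices 0..k-1 back to the original labels
        return np.array([self.class_map[c] for c in votes.argmax(axis=1)])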
Example #25
def evaluate_regressor(tree, X, Y):
    """
    Evaluates a tree with the data values passed, returning the R2 and MSE
    """
    r2 = tree.score(X, Y)
    preds = tree.predict(X)
    mse = np.average(np.power(preds - Y.values, 2))
    return r2, mse
Example #26
def calc_accuracy(tree, test_dataset):
    true_predicted = 0
    for i in range(0, len(test_dataset.index)):
        if test_dataset.loc[
                i, test_dataset.columns == 'y'].values == tree.predict(
                    [test_dataset.loc[i, test_dataset.columns != 'y']]):
            true_predicted += 1
    return true_predicted / len(test_dataset.index)
Example #27
def make_accuracy(tree, x_test, y_test):
    predictions = tree.predict(x_test)
    errors = 0.0
    for x in range(len(predictions)):
        if predictions[x] != y_test[x]:
            errors += 1.
    accuracy = 1 - (errors / len(predictions))
    return accuracy
Example #28
def plot_n_predictions_rf(rf, X_test, N=10):
    sample = X_test.sample(N, random_state=42)
    predictions = pd.DataFrame(
        [tree.predict(sample).tolist() for tree in rf.estimators_],
        columns=["#{}".format(i) for i in sample.index])
    plt.figure(figsize=(13, 7))
    sns.boxplot(data=predictions)
    plt.xlabel("Index of sample")
    plt.ylabel("Prediction")
Example #29
    def predict(self, X):
        predictions = []
        for tree in self.decision_trees:
            predictions.append(tree.predict(X))

        total_pred = np.vstack(predictions)
        # NB: SciPy >= 1.11 defaults stats.mode to keepdims=False;
        # pass keepdims=True there to keep this indexing valid
        mode_prediction = stats.mode(total_pred).mode[0]

        return mode_prediction
Example #30
def calc_prediction_result_success(tree, dataset):
    result = list()
    for i in range(0, len(dataset.index)):
        if dataset.loc[i, dataset.columns == 'class'].values \
                != tree.predict([dataset.loc[i, dataset.columns != 'class']]):
            result.append(-1)
        else:
            result.append(1)
    return result
Example #31
def predictions(tree, path):
    os.chdir(path)
    allsentences = []
    for file in glob.glob("*.html"):
        sopa = BeautifulSoup(codecs.open(file), 'html.parser')
        texto = get_text(sopa)
        allsentences.append(texto)
    vocab = tokenize(allsentences)
    X_baseline = bag_of_words_from_sentences(allsentences, vocab)
    return tree.predict(X_baseline)
Example #32
def err(x, y):
    # import the class directly so the module name is not shadowed
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(x, y)
    error = 0
    for i, v in enumerate(tree.predict(D_data)):
        if v != D_target[i]:
            error += 1
    erate = error / float(len(D_target))
    return erate
Example #33
def decision_tree(train_set, test_set, features):
    start_time = time.time()
    # Instantiate the classifier
    tree = skl.tree.DecisionTreeClassifier(criterion="entropy")
    # Train classifier
    tree.fit(train_set[features].values, train_set['target'])
    # Predict
    y_pred = tree.predict(test_set[features])
    # Report results
    report_results("Decision Tree",
                   time.time() - start_time, test_set["target"], y_pred)
Example #34
    def _calculate(self, X, y, categorical):
        import sklearn.tree
        import sklearn.model_selection

        # sklearn.cross_validation was removed in scikit-learn 0.20;
        # use model_selection and kf.split() instead
        kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
            tree.fit(X[train], y[train])
            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
Example #36
    def predict(self, x):
        if self.forest is None:
            raise Exception("Model hasn't been trained yet!")
        # per-tree predictions, then a per-sample majority vote
        yy = [tree.predict(x) for tree in self.forest]
        y = []
        for i in range(len(x)):
            r = [yy[j][i] for j in range(len(yy))]
            y.append(r.count(1) >= r.count(0))
        return y
Example #37
    def predict(self, x):
        if self.forest is None:
            raise Exception("Model hasn't been trained yet!")
        # per-tree predictions, then a per-sample average
        yy = [tree.predict(x) for tree in self.forest]
        y = []
        for i in range(len(x)):
            m = sum(yy[j][i] for j in range(len(yy))) / len(yy)
            y.append(m)
        return y
Example #38
def cluster_then_forest(xs, ys, in_sample_size):
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)
    clf = cluster.KMeans(n_clusters=4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)

    for cluster_id in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(cluster_id))
        oos_ind = oos_clusterid == cluster_id
        ins_ind = ins_clusterid == cluster_id

        tree = ensemble.RandomForestRegressor(50)

        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print("Score for in-sample")
        print(str(tree.score(in_sample[ins_ind], ys[isi][ins_ind])))

        print("Score for out-of sample")
        print(str(tree.score(out_sample[oos_ind], ys[osi][oos_ind])))

    return None
Example #39
def using_DecisionTree():
    trainfile = os.path.join('data', 'train.csv')
    testFile = os.path.join('data', 'test.csv')
    train, test = regular(trainfile, testFile)
    data = train.values
    label = train.index.values
    tree = DecisionTree(maxDeep=4)
    tree.buildTree(data, label)

    id = test.index.values
    testData = test.values

    res = 'id,country\n'
    for i in range(len(testData)):
        vector = testData[i]
        id_i = id[i]
        label = tree.predict(vector)
        line = '{0},{1}\n'.format(id_i, label)
        res += line
    with open('submit_myDecisionTree.txt', 'w') as f:
        f.write(res)
Example #40
    def _calculate(self, X, y, categorical):
        import sklearn.tree
        import sklearn.model_selection

        # sklearn.cross_validation was removed in scikit-learn 0.20
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
Example #41
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree, datasets
import joblib  # sklearn.externals.joblib was removed; use the joblib package

print('3.- Test vector results')

# Load the data
p = pd.read_csv('test.csv')
X_test = p.values[:, 0:4]
y_test = p.values[:, 4]

# Load the trained model
tree = joblib.load('maquinaDTC.pkl')

# Check accuracy on the test set
P = tree.predict(X_test)
a = sum(P == y_test)
b = y_test.shape[0]

print('Accuracy: ' + str(a) + ' / ' + str(b) + ' => ' + str(float(a)/float(b)))

print('')

Example #42
# ## Pickle the clustering model

clustering_model = {"model": clf}
pickle.dump(clustering_model, open('../tree_model.pickle', 'wb'))


for t in ['all', 'test']:
    # pandas >= 1.3 replaces error_bad_lines=False with on_bad_lines='skip'
    cur_data = pd.read_csv('../' + t + '_data_vectorized.csv', sep='|', error_bad_lines=False, index_col="SubjectID")
    cur_data = cur_data[clustering_columns]
    res = pd.DataFrame(index=cur_data.index.astype(str))  # SubjectID is always str for later joins
    res['cluster'] = tree.predict(cur_data)
    print(np.bincount(res.cluster))
    print(t, res.shape)
    res.to_csv('../' + t + '_tree_clusters.csv', sep='|')



Example #43
def predict(tree, locLs, X, goodLocsFile):
    result = tree.predict(X)
    with open(goodLocsFile, 'w') as fout:
        for k, r in zip(locLs, result):
            # Python 3: write instead of the Python 2 `print >> fout`
            fout.write('{}\t{}\n'.format(k, r))
Example #44
'''
Decision Tree
'''
binary_data = pd.get_dummies(all_census_prep)

# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    binary_data[binary_data.columns.difference(["earning_class"])],
    binary_data["earning_class"], train_size=0.80)
scaler = preprocessing.StandardScaler()
# "f64" is not a valid NumPy dtype string; use "float64" (or "f8")
X_train = pd.DataFrame(scaler.fit_transform(X_train.astype("float64")), columns=X_train.columns)
X_test = scaler.transform(X_test.astype("float64"))


from sklearn.tree import DecisionTreeClassifier, export_graphviz
tree = DecisionTreeClassifier(criterion='entropy',max_depth=20)

tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
(cm[0][0] + cm[1][1]) / cm.sum()  # overall accuracy from the confusion matrix

feature_names = list(X_train.columns)
export_graphviz(tree, out_file="tree.dot",feature_names=feature_names)

import pydotplus
import pyparsing
from io import StringIO  # Python 3: StringIO moved into io
dotfile = StringIO()
export_graphviz(tree, out_file=dotfile,feature_names=feature_names)
graph = pydotplus.graph_from_dot_data(dotfile.getvalue())
graph.write_png("dtree2.png")

Example #45
import sys
from sklearn import tree

def classify(Xtrain, Ytrain):
    """ Use entirety of provided X, Y to predict

    Arguments
    Xtrain -- Training data
    Ytrain -- Training prediction

    Returns
    ready_tree -- a tree fitted to Xtrain and Ytrain
    """
    ready_tree = tree.DecisionTreeClassifier()
    ready_tree.fit(Xtrain, Ytrain)
    return ready_tree

if __name__ == "__main__":
    # Let's take our training data and train a decision tree
    # on a subset. Scikit-learn provides a good module for cross-
    # validation.

    if len(sys.argv) < 2:
        print("Usage: $ python decision-tree.py /path/to/data/file/")
    else:
        training = sys.argv[1]
        X, Y, n, f = load_data(training)
        Xt, Xv, Yt, Yv = shuffle_split(X, Y)
        # avoid rebinding the imported `tree` module
        fitted_tree = classify(Xt, Yt)
        print("Decision Tree Accuracy:", acc(Yv, fitted_tree.predict(Xv)), "%")