def fit(self, X, y):
        self.w = np.ones(X.shape[0]) / X.shape[0]  # Weights on data
        self.a = np.zeros(self.n)  # Weights on decision trees
        k = 0
        self.features = []
        for tree in self.decision_trees:
            mask = np.random.randint(0, high=len(X), size=len(X))
            features = np.random.choice(X.shape[1], size=self.m)
            Xsampling = X[mask, :]
            Xsampling = Xsampling[:, features]
            ysampling = y[mask]
            tree.fit(Xsampling, ysampling)
            self.features.append(features)

            ej = 0
            for j in range(len(Xsampling)):
                ej = checkXY(Xsampling[j, :], ysampling[j],
                             tree) * self.w[j] + ej

            ej = ej / float(sum(self.w))

            self.a[k] = 0.5 * np.log((1 - ej) / float(ej))

            for i in range(len(Xsampling)):
                if checkXY(Xsampling[i, :], ysampling[i], tree) > 0.5:
                    self.w[i] = self.w[i] * np.exp(self.a[k])
                else:
                    self.w[i] = self.w[i] * np.exp(-self.a[k])
            k = k + 1
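checkXY is not defined in this snippet. From its use above (it accumulates the weighted error and drives the weight update), it presumably returns 1 when the tree misclassifies a sample and 0 otherwise; a minimal sketch under that assumption:

def checkXY(x, y, tree):
    # Assumed helper: 1 if the fitted tree misclassifies the sample (x, y), else 0,
    # so the weighted sum above is the tree's weighted training error.
    return int(tree.predict(x.reshape(1, -1))[0] != y)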
Example #2
def drawDecisionTree(data, name):
    data['change next day class'] = data['change next day'].apply(classify)
    X_train, X_test, y_train, y_test = train_test_split(
        data[[
            'rate of increase', 'increase length', 'rate of decrease',
            'decrease length'
        ]],
        data['change next day class'],
        test_size=0.2,
        random_state=42)
    tree = DecisionTreeClassifier(max_depth=6, random_state=0)
    tree.fit(X_train, y_train)
    print('Train score:{:.3f}'.format(tree.score(X_train, y_train)))
    print('Test score:{:.3f}'.format(tree.score(X_test, y_test)))
    # Generate the tree visualization as a .dot file
    export_graphviz(tree,
                    out_file="tree.dot",
                    feature_names=[
                        'rate of increase', 'increase length',
                        'rate of decrease', 'decrease length'
                    ],
                    impurity=False,
                    filled=True)
    # Render the .dot file to a PDF
    graph = pydotplus.graph_from_dot_file('tree.dot')
    graph.write_pdf(name + '.pdf')
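If Graphviz/pydotplus is not available, scikit-learn's built-in plot_tree renders the same fitted tree with matplotlib alone; a minimal alternative sketch for the function above:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, ax = plt.subplots(figsize=(16, 8))
plot_tree(tree,
          feature_names=['rate of increase', 'increase length',
                         'rate of decrease', 'decrease length'],
          impurity=False, filled=True, ax=ax)
fig.savefig(name + '.pdf')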
Example #3
def LandmarkDecisionTree(X, y, categorical):
    if not sps.issparse(X):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10

    else:
        return np.nan
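A quick smoke test of this landmarker, assuming the module-level imports the snippet relies on (numpy as np, scipy.sparse as sps, sklearn.model_selection, sklearn.metrics, sklearn.utils, OneVsRestClassifier) and synthetic data for illustration:

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=0)
# Mean 10-fold decision-tree accuracy, used as a meta-feature of the data set.
print(LandmarkDecisionTree(X_demo, y_demo, categorical=None))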
Example #4
    def testRunWithIrisData(self):
        # Load data and store it into pandas DataFrame objects
        iris = load_iris()
        X = pd.DataFrame(iris.data[:, :], columns=iris.feature_names[:])
        y = pd.DataFrame(iris.target, columns=["Species"])

        # Defining and fitting a DecisionTreeClassifier instance
        tree = DecisionTreeClassifier(max_depth=2)
        tree.fit(X, y)

        # Creates dot file named tree.dot
        export_graphviz(tree,
                        out_file="../output/IrisOutput_DT.dot",
                        feature_names=list(X.columns),
                        class_names=iris.target_names,
                        filled=True,
                        rounded=True)

        sample_one_pred = int(tree.predict([[5, 5, 1, 3]]))
        sample_two_pred = int(tree.predict([[5, 5, 2.6, 1.5]]))
        print(
            f"The first sample most likely belongs to a {iris.target_names[sample_one_pred]} flower."
        )
        print(
            f"The second sample most likely belongs to a {iris.target_names[sample_two_pred]} flower."
        )
Example #5
    def _calculate(self, X, y, logger, categorical):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=5)
        else:
            kf = sklearn.model_selection.KFold(n_splits=5)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(
                    X.iloc[train] if hasattr(X, 'iloc') else X[train],
                    y.iloc[train] if hasattr(y, 'iloc') else y[train],
                )
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(
                    X.iloc[train] if hasattr(X, 'iloc') else X[train],
                    y.iloc[train] if hasattr(y, 'iloc') else y[train],
                )

            predictions = tree.predict(
                X.iloc[test] if hasattr(X, 'iloc') else X[test], )
            accuracy += sklearn.metrics.accuracy_score(
                predictions,
                y.iloc[test] if hasattr(y, 'iloc') else y[test],
            )
        return accuracy / 5
Example #6
    def fit(self, X_train, Y_train):
        # Each tree gets its own random data set via get_bootstrap_data

        sub_sets = self.get_bootstrap_data(X_train, Y_train)
        n_features = X_train.shape[1]

        if self.max_features is None:
            self.max_features = int(np.sqrt(n_features))

        for i in range(self.n_estimators):
            # Pick a random feature subset for this tree
            tree = DecisionTreeClassifier(
                min_samples_split=self.min_samples_split,
                min_impurity_decrease=self.min_gain,
                max_depth=self.max_depth)

            sub_X, sub_Y = sub_sets[i]
            features = np.random.choice(n_features,
                                        self.max_features,
                                        replace=False)  # sample features without replacement to avoid duplicate columns
            sub_X = sub_X[:, features]
            tree.fit(sub_X, sub_Y)
            self.trees.append(tree)
            self.trees_feature.append(features)
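get_bootstrap_data is not shown in this snippet; judging from how sub_sets is indexed above, it should return one bootstrap resample (rows drawn with replacement) per estimator. A minimal sketch under that assumption:

    def get_bootstrap_data(self, X_train, Y_train):
        # Assumed helper: a list of n_estimators (X_sub, Y_sub) bootstrap pairs.
        n = X_train.shape[0]
        sub_sets = []
        for _ in range(self.n_estimators):
            idx = np.random.choice(n, size=n, replace=True)
            sub_sets.append((X_train[idx], Y_train[idx]))
        return sub_sets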
Example #7
def TreeTest():
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)
    num_in_fold = []
    err_in_fold = []
    for i in range(len(all_folds) - 1):
        spam = all_folds[i]
        num_in_fold.append(len(spam))
        truth, f_data = decTree.split_truth_from_data(spam)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        print('Prediction...\n')
        predict = tree.predict(f_data)
        print(predict)
        print(truth)
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print('Tree error is: {}'.format(error))
    spam = all_folds[k - 1]
    truth, f_data = decTree.split_truth_from_data(spam)
    tree = decTree.TreeOptimal(max_depth=2)
    #tree = decTree.TreeRandom()
    tree.fit(f_data, truth)
    predict = tree.predict(f_data)
    error = 1. - hw3.get_accuracy(predict, truth)
    sum_training_err = 0
    for i in range(len(num_in_fold)):
        sum_training_err += err_in_fold[i]
        #sum_training_err += float(err_in_fold)/num_in_fold
    average_training_error = float(sum_training_err)/len(num_in_fold)
    print('Average training error: {}\nAverage testing error: {}'.format(average_training_error, error))
Example #8
    def learn(self, learn):
        self._model.clear()
        self._model.config['version'] = sklearn.__version__
        self._tree = None

        inputs = []
        classes = []
        for data in learn:
            try:
                inputs += [self._get_input(data[0])]
                classes += [data[1]]
            except github.GithubError:
                continue

        # Check for empty data set
        if len(inputs) == 0 or len(classes) == 0:
            logging.error(
                'Trying to learn MetadataClassifier with an empty data set. This is not possible.\n'
                'Possible errors:\n'
                ' * Your learning folder is not set up correctly\n'
                ' * Your rate limit is exhausted\n'
                ' * There is an error with your internet connection\n'
                ' * There is an error while connecting to GitHub\n')
            self._model.clear()
            self._model.save()
            return

        tree = sklearn.tree.DecisionTreeClassifier(min_samples_leaf=3)
        tree.fit(inputs, classes)

        self._tree = tree
        self._model.config['tree'] = base64.b64encode(
            pickle.dumps(tree)).decode()
        self._model.save()
Example #9
    def landmark_decision_tree(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.nan

            import sklearn.tree

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            for train, test in kf.split(X, y):
                random_state = sklearn.utils.check_random_state(42)
                tree = sklearn.tree.DecisionTreeClassifier(
                    random_state=random_state)

                if len(y.shape) == 1 or y.shape[1] == 1:
                    tree.fit(X[train], y[train])
                else:
                    tree = OneVsRestClassifier(tree)
                    tree.fit(X[train], y[train])

                predictions = tree.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Decision Tree could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example #10
 def fit(self, X, y):
     # Fit each tree on a bootstrap resample (rows drawn with replacement).
     n = X.shape[0]
     for tree in self.decision_trees:
         indices = np.random.randint(0, n, n)
         tree.fit(X[indices], y[indices])
     return self
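The matching predict for this bagging ensemble is not shown; a minimal sketch that takes a majority vote over the fitted trees, assuming integer class labels:

    def predict(self, X):
        # Stack per-tree predictions, then take the most common label per sample.
        votes = np.stack([tree.predict(X) for tree in self.decision_trees])
        return np.array([np.bincount(col.astype(int)).argmax() for col in votes.T])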
Example #11
    def fit(self, X, y):
        '''
        Trains the model
        Arguments:
            X is a n-by-d numpy array
            y is an n-dimensional numpy array
        '''
        n = len(X)
        self.k = len(np.unique(y))

        #build map of indices to classes
        a = 0
        convert_map = {}
        for i in np.unique(y):
            self.class_map[a] = i
            convert_map[i] = a
            a += 1

        #convert y to be labelled 0-(k-1) instead of 1-k
        for i in range(len(y)):
            y[i] = convert_map[y[i]]

        w = np.ones(n) / n

        for t in range(self.numBoostingIters):
            #fit based on weights
            tree = DecisionTreeClassifier(max_depth=self.maxTreeDepth)
            tree.fit(X, y, sample_weight=w)
            self.tree_array.append(tree)

            #calculate weighted training error
            epsilon = 0
            y_preds = tree.predict(X)
            wrong_preds = np.nonzero(y_preds - y)
            for index in wrong_preds[0]:
                epsilon += w[index]

            #early stopping
            if epsilon == 0:
                break

            #calculate beta
            beta = 0.5 * np.log((1 - epsilon)/epsilon) + np.log(self.k-1)
            self.beta_array.append(beta)

            #update all instance weights
            for i in range(len(w)):
                if i not in wrong_preds[0]:
                    w[i] = w[i] * np.exp(-1 * beta)
                else:
                    w[i] = w[i] * np.exp(beta)

            #normalize weight vector
            w = w / np.sum(w)

        #fix y
        for i in range(len(y)):
            y[i] = self.class_map[y[i]]
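The corresponding prediction step for this SAMME-style booster is not shown; a minimal sketch, assuming the attributes populated by fit above (tree_array, beta_array, class_map, k):

    def predict(self, X):
        # Accumulate each tree's vote on its predicted class, weighted by its beta.
        scores = np.zeros((len(X), self.k))
        for tree, beta in zip(self.tree_array, self.beta_array):
            preds = tree.predict(X).astype(int)
            scores[np.arange(len(X)), preds] += beta
        # Map the winning internal label (0..k-1) back to the original class.
        return np.array([self.class_map[i] for i in np.argmax(scores, axis=1)])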
Example #12
 def fit(self, X, y):
     self.mask = []
     for tree in self.decision_trees:
         mask = np.random.randint(0, high=len(X), size=len(X))
         Xsampling = X[mask, :]
         ysampling = y[mask]
         tree.fit(Xsampling, ysampling)
         self.mask.append(mask)
Example #13
File: lambdamart.py Project: Haiga/pyltr
    def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
                   query_groups, random_state):
        """Fit another tree to the boosting model."""
        assert sample_mask.dtype == np.bool_

        n_samples = X.shape[0]

        all_lambdas = np.zeros(n_samples)
        all_deltas = np.zeros(n_samples)

        mat = []
        for qidx, (qid, a, b, _) in enumerate(query_groups):
            score = self.metric.evaluate_preds(qid, y[a:b], y_pred[a:b])

            score_b1 = self.metric.evaluate_preds(
                qid, y[a:b], X[:, int(self.features_risk[0])][a:b])

            score_b2 = self.metric.evaluate_preds(
                qid, y[a:b], X[:, int(self.features_risk[1])][a:b])

            mat.append([score, score_b1, score_b2])

        mat = np.array(mat)
        grisk_normal = getGeoRiskDefault(mat, 5)[0]
        posix = 0
        for qid, a, b, _ in query_groups:
            lambdas, deltas = self._calc_lambdas_deltas(
                qid, y[a:b], y_pred[a:b], mat, posix, grisk_normal)
            all_lambdas[a:b] = lambdas
            all_deltas[a:b] = deltas
            posix += 1

        tree = sklearn.tree.DecisionTreeRegressor(
            criterion='friedman_mse',
            splitter='best',
            presort=True,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=0.0,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=random_state)

        if self.subsample < 1.0 or self.query_subsample < 1.0:
            sample_weight = sample_weight * sample_mask.astype(np.float64)

        tree.fit(X,
                 all_lambdas,
                 sample_weight=sample_weight,
                 check_input=False)

        self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                      all_deltas, y_pred, sample_mask)
        self.estimators_[i, 0] = tree
        self.estimators_fitted_ = i + 1

        return y_pred
Example #14
 def mapper(self, _, line):
     ratings = pd.read_csv(self.train_set)
     ratings = ratings.sample(round(len(ratings.index)/10))
     tree = sklearn.tree.DecisionTreeRegressor()
     labels = ratings.iloc[:, 3]
     ratings = ratings.drop(ratings.columns[[3]], axis=1)
     samples = ratings
     tree.fit(samples, labels)
     yield jsonpickle.encode(tree), 1
Example #15
def err(x, y):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(random_state=0)  # avoid shadowing the tree module
    clf.fit(x, y)
    error = 0
    for i, v in enumerate(clf.predict(D_data)):
        if v != D_target[i]:
            error += 1
    erate = error / float(len(D_target))
    return erate
Example #16
    def train(self, dataset, targets, hyper_params):
        numQueries=len(dataset.docsPerQuery)
        validDocs=numpy.minimum(dataset.docsPerQuery, self.rankingSize)
        queryDocPosTriplets=numpy.dot(dataset.docsPerQuery, validDocs)
        designMatrix=numpy.zeros((queryDocPosTriplets, self.numFeatures), dtype=numpy.float32, order='F')
        regressionTargets=numpy.zeros(queryDocPosTriplets, dtype=numpy.float64, order='C')
        sampleWeights=numpy.zeros(queryDocPosTriplets, dtype=numpy.float32)
        currID=-1
        for i in range(numQueries):
            numAllowedDocs=dataset.docsPerQuery[i]
            currValidDocs=validDocs[i]
            allFeatures=dataset.features[i].toarray()
            
            for doc in range(numAllowedDocs):
                docID=doc
                if dataset.mask is not None:
                    docID=dataset.mask[i][doc]
                    
                for j in range(currValidDocs):
                    currID+=1

                    designMatrix[currID,:]=self.createFeature(allFeatures[docID,:], j)
                    regressionTargets[currID]=targets[i][j,doc] 
                    sampleWeights[currID]=1.0/(numAllowedDocs * currValidDocs)
        
        for i in targets:
            del i
        del targets
        
        print("L2RPolicy:train [LOG] Finished creating features and targets ", 
                numpy.amin(regressionTargets), numpy.amax(regressionTargets), numpy.median(regressionTargets), flush=True)
        print("L2RPolicy:train [LOG] Histogram of targets ", numpy.histogram(regressionTargets), flush=True)
        
        if self.modelType == 'gbrt':
            tree=sklearn.ensemble.GradientBoostingRegressor(learning_rate=hyper_params['lr'],
                            n_estimators=hyper_params['ensemble'], subsample=hyper_params['subsample'], max_leaf_nodes=hyper_params['leaves'], 
                            max_features=1.0, presort=False)
            tree.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.tree=tree
            print("L2RPolicy:train [INFO] %s" % self.modelType, flush=True)
                
        elif self.modelType == 'ridge':
            ridgeCV=sklearn.linear_model.RidgeCV(alphas=self.hyperParams, fit_intercept=False,
                                                            normalize=False, cv=3)
            ridgeCV.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.policyParams=ridgeCV.coef_
            print("L2RPolicy:train [INFO] Done. ", flush=True)
            
        else:
            print("L2RPolicy:train [ERR] %s not supported." % self.modelType, flush = True)
            sys.exit(0)
            
        print("L2R:train [INFO] Created %s predictor using dataset %s." %
                (self.modelType, dataset.name), flush = True)
Example #17
 def _calculate(self, X, y, categorical):
     import sklearn.tree
     kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
     accuracy = 0.
     for train, test in kf.split(X, y):
         random_state = sklearn.utils.check_random_state(42)
         tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
         tree.fit(X[train], y[train])
         predictions = tree.predict(X[test])
         accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
     return accuracy / 10
Example #18
def decision_tree(train_set, test_set, features):
    start_time = time.time()
    # Instantiate the classifier
    tree = skl.tree.DecisionTreeClassifier(criterion="entropy")
    # Train classifier
    tree.fit(train_set[features].values, train_set['target'])
    # Predict
    y_pred = tree.predict(test_set[features])
    # Report results
    report_results("Decision Tree",
                   time.time() - start_time, test_set["target"], y_pred)
Example #19
def predict_with_dtree(col_names, data):
    N = len(data)
    training_data = data[:int(0.80 * N)]

    tree = DecisionTree()
    tree.fit(col_names, training_data)

    testing_data = data[int(0.80 * N):]

    for row in testing_data:
        print('Actual label is %s and predicted %s' % (row[-1],
                                                       tree.predict(row)))
Example #20
 def fit(self, X, y):
     self.mask = []
     self.features = []
     for tree in self.decision_trees:
         mask = np.random.randint(0, high=len(X), size=len(X))
         features = np.random.choice(X.shape[1], size=self.m)
         Xsampling = X[mask, :]
         Xsampling = Xsampling[:, features]
         ysampling = y[mask]
         tree.fit(Xsampling, ysampling)
         self.mask.append(mask)
         self.features.append(features)
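Because each tree here is fit on its own random feature subset, prediction must re-apply the stored subsets; a minimal sketch of the assumed counterpart, again with integer class labels:

    def predict(self, X):
        # Slice X down to the feature subset each tree was trained on.
        votes = np.stack([
            tree.predict(X[:, features])
            for tree, features in zip(self.decision_trees, self.features)
        ])
        return np.array([np.bincount(col.astype(int)).argmax() for col in votes.T])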
Example #21
File: ABPDN.py Project: willis-hu/spyn
 def addBoostIteration(self):
     rv = self.regressionValues()
     trees = []
     mask = numpy.array([True] * self.nF)
     for i in range(0, self.nF):
         mask[:] = True
         mask[i] = False
         tree = DecisionTreeRegressor(max_depth=self.max_depth)
         tree.fit(self.data[:, mask], rv[:, i])
         # newpsis[:, i] = tree.predict(self.data[:, mask])
         trees.append(tree)
     self.trees.append(trees)
Example #22
    def _calculate(self, X, y, categorical):
        import sklearn.tree

        kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        accuracy = 0.0
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
            tree.fit(X[train], y[train])
            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
Example #23
def build_tree_classifier(X_train, X_test, y_train, y_test, criterion='gini'):
	'''
	Build classifier, return results, predictions, and tree

	Inputs: X_train, X_test, y_train, y_test, criterion

	Returns: y_test, y_pred, tree
	'''
	tree = DecisionTreeClassifier(criterion=criterion)
	tree.fit(X_train, y_train)
	y_pred = tree.predict(X_test)
	return y_test, y_pred, tree
Example #24
    def learn(self, learn):
        self._model.clear()
        self._model.config['version'] = sklearn.__version__
        self._tree = None

        known_languages = set()
        for data in learn:
            try:
                languages = data[0].get_languages()
            except github.GithubError:
                continue

            for language in languages:
                known_languages.add(language)

        known_languages = list(known_languages)

        dataset = []
        labels = []

        for data in learn:
            try:
                languages = data[0].get_languages()
            except github.GithubError:
                continue

            entry = self._get_entry(languages, known_languages)

            dataset += [entry]
            labels += [data[1]]

        # Check for empty data set
        if len(dataset) == 0 or len(labels) == 0:
            logging.error(
                'Trying to learn LanguageDetailsClassifier with an empty data set. This is not possible.\n'
                'Possible errors:\n'
                ' * Your learning folder is not set up correctly\n'
                ' * Your rate limit is exhausted\n'
                ' * There is an error with your internet connection\n'
                ' * There is an error while connecting to GitHub\n')
            self._model.clear()
            self._model.save()
            return

        tree = sklearn.tree.DecisionTreeClassifier(min_samples_leaf=3)
        tree.fit(dataset, labels)

        self._tree = tree
        self._model.config['tree'] = base64.b64encode(
            pickle.dumps(tree)).decode()
        self._model.config['known_languages'] = known_languages
        self._model.save()
Example #25
    def save_features(self, X, y):
        feats = dict()

        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_clf.fit(X, y)
        pvalues_clf = selector_clf.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1

        #put feature vectors into dictionary
        feats['univ_sub01'] = (pvalues_clf < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

        print "randomized logistic regression feature selector"
        sel_log = linear_model.RandomizedLogisticRegression(random_state=42,
                                                            n_jobs=4).fit(
                                                                X, y)
        #put rand_lasso feats into feature dict
        feats['rand_logreg'] = sel_log.get_support()

        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(X)
        sel_svc = svm.LinearSVC(C=0.1,
                                penalty="l1",
                                dual=False,
                                random_state=42).fit(X, y)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = linear_model.LogisticRegression(C=0.01, random_state=42).fit(
            X_sp, y)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        tree_max_features = 20
        print "ExtraTrees feature selectors (%s)" % tree_max_features
        feats['tree'] = np.zeros(len(feats['LogReg']))
        tree = ExtraTreesClassifier(n_estimators=250,
                                    max_features=tree_max_features)
        tree.fit(X, y)
        feature_importance = tree.feature_importances_
        feature_importance = 100.0 * (feature_importance /
                                      feature_importance.max())
        sorted_idx = np.argsort(feature_importance)[::-1]
        for i in range(tree_max_features):
            feats['tree'][sorted_idx[i]] = 1

        feat_sums = np.zeros(len(feats['LogReg']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        feats['ensemble'] = feat_sums >= 4  # keep features that receive at least 4 votes
        joblib.dump(feats, 'features/feats.pkl', compress=3)
        return feats
Example #26
 def build_policies(self, n):
     c2 = DatasetBandit.DATASETS[self.dataset](L=1, loop=True)
     Policies = []
     for p in range(n):
         X = np.zeros((100, c2.d))
         r = np.zeros((100, ))
         for t in range(100):
             (curr_x, curr_r) = c2.next()
             a = np.random.choice(curr_x.get_K())
             X[t, :] = curr_x.get_ld_features()[a, :]
             r[t] = curr_r[a]
         tree = sklearn.tree.DecisionTreeRegressor(max_depth=3)
         tree.fit(X, r)
         Policies.append(RegressionPolicy(tree))
     return Policies
Example #27
def select_model(trainx,trainy,validx,validy):
    trees = {}
    criterion = ["gini","entropy"]
    depth = 7
    for i in range(1, depth + 1):
        for j in criterion:
            tree = sklearn.tree.DecisionTreeClassifier(criterion=j, max_depth=i)
            tree.fit(trainx, trainy)
            accuracy = check_accuracy(tree, validx, validy)
            print("Model(depth=" + str(i) + ", criteria=" + str(j) + ") with accuracy = " + str(accuracy))
            trees[accuracy] = tree
    keys = trees.keys()
    max_key = max(keys)
    solution = trees[max_key]
    return solution
Example #28
    def fit(self, X, y):
        assert self.sample_size <= len(
            y), "Sample size cannot be greater than the input size"

        full = np.concatenate((X, y.reshape(-1, 1)), axis=1)

        for tree in self.decision_trees:
            bagged_samples = np.random.choice(list(range(len(full))),
                                              size=self.sample_size,
                                              replace=True)

            train_data = full[bagged_samples, :]
            train_data_x = train_data[:, :-1]
            train_data_y = train_data[:, -1:]

            tree.fit(train_data_x, train_data_y)
Example #29
def kfold_cross_validation(df, tree, folds_num, features, target):
    kf = KFold(n_splits=folds_num, shuffle=True)
    attributes = df[features]
    labels = df[target]
    accuracy = []
    precision = []
    recall = []
    f1score = []
    #RMSE = []
    scores = []
    for train_index, test_index in kf.split(attributes):
        x_train = attributes.iloc[train_index]
        x_test = attributes.iloc[test_index]
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        model = tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred, average="weighted")) #labels=np.unique(y_pred) can be added to calculate the measure only for the labels that have predicted samples
        recall.append(recall_score(y_test, y_pred, average="weighted"))
        f1score.append(f1_score(y_test, y_pred, average="weighted"))
        #RMSE.append(root_mean_squared_error(y_test, y_pred))
        #accuracy.append(model.score(x_test,y_test))
    #print("Accuracy:",accuracy)
    #print("Avg accuracy:",np.mean(accuracy))
    scores = [np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1score)]
    #scores = [np.mean(accuracy),np.mean(precision), np.mean(recall),np.mean(f1score),np.mean(RMSE)]
    return scores
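Example call, with a hypothetical DataFrame and placeholder column names for illustration:

from sklearn.tree import DecisionTreeClassifier

# df, the feature column names, and 'label' are placeholders here.
scores = kfold_cross_validation(df, DecisionTreeClassifier(max_depth=5), 5,
                                ['f1', 'f2', 'f3'], 'label')
print('accuracy, precision, recall, f1:', scores)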
Example #30
def train_tree(X,
               Y,
               k=2,
               max_depths=default_max_depths,
               criterions=default_criterions):
    depths = []
    scores = []
    for depth in max_depths:
        for criteria in criterions:
            tree = sklearn.tree.DecisionTreeClassifier(criterion=criteria,
                                                       max_depth=depth)
            tree.fit(X, Y)
            train_score = validate(tree, X, Y)
            depths.append(depth)
            scores.append(train_score)
    return depths, scores
Example #31
 def fit(
     self, feature_list, label_list
 ):  # Build the random forest from num decision trees, each fit on its own random split
     train_list = []
     train_label_list = []
     vec = DictVectorizer()
     for i in range(self.num):
         x_train, x_test, y_train, y_test = train_test_split(feature_list,
                                                             label_list,
                                                             test_size=0.33,
                                                             random_state=i)
         x_train_vec = vec.fit_transform(x_train).toarray()
         train_list.append(x_train_vec)
         train_label_list.append(y_train)
         tree = DecisionTree()
         tree.fit(x_train_vec, y_train)
         self.tree_list.append(tree)
Example #32
def dt(X_train, y_train):
    from sklearn import tree
    param_grid = {'max_depth': np.arange(3, 25)}
    clf = tree.DecisionTreeClassifier(random_state=0)
    grid = GridSearchCV(clf, param_grid)  # renamed from `tree` to avoid shadowing the module

    clf = grid.fit(X_train, y_train)
    return clf
Example #33
 def calculate(self):
     import sklearn.tree
     if len(self.y.shape) == 1 or self.y.shape[1] == 1:
         kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
     else:
         kf = sklearn.model_selection.KFold(n_splits=10)
     accuracy = 0.
     for train, test in kf.split(self.X, self.y):
         random_state = sklearn.utils.check_random_state(42)
         tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
         if len(self.y.shape) == 1 or self.y.shape[1] == 1:
             tree.fit(self.X.iloc[train], np.ravel(self.y.iloc[train],order='C'))
         else:
             tree = OneVsRestClassifier(tree)
             tree.fit(self.X.iloc[train], np.ravel(self.y.iloc[train],order='C'))
         predictions = tree.predict(self.X.iloc[test])
         accuracy += sklearn.metrics.accuracy_score(predictions, self.y.iloc[test])
     return accuracy / 10
Example #34
 def fit(self, X, y):
     for tree in self.decision_trees:
         X_tree = np.zeros_like(X)
         y_tree = np.zeros_like(y)
         for i in range(X.shape[0]):
             rand_i = np.random.choice(X.shape[0])
             X_tree[i, :] = X[rand_i, :]
             y_tree[i] = y[rand_i]
         tree.fit(X_tree, y_tree)  # fits in place; no need to rebind the loop variable
     return self
Example #35
def findMisClf(df, X, y, y_pred, name):
    '''
    Takes a dataframe (df), column names of predictors (X) and a dependent
    variable (y). Loops over generic classifiers to find predictions. Creates
    a decision tree using prediction misclassification as the dependent variable.
    '''

    var_name = name + '_predict'
    try:
        df[var_name] = y_pred
    except:
        import pdb
        pdb.set_trace()
    correct = name + '_correct'
    
    # Determine "correctness" based on 0.5 threshold
    df[correct] = (df[var_name] > 0.5).astype(int)

    # Determine which observations are being misclassified
    tree = DecisionTreeClassifier(max_depth=3)
    tree.fit(X, df[correct])
    feature_names = df.columns
    left, right = tree.tree_.children_left, tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value
 
    def recurse(left, right, threshold, features, node):
        if threshold[node] != -2:
            print("if ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
            if left[node] != -1:
                recurse(left, right, threshold, features, left[node])
            print("      } else {")
            if right[node] != -1:
                recurse(left, right, threshold, features, right[node])
            print("}")
        else:
            print("return " + str(value[node]))

    recurse(left, right, threshold, features, 0)
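sklearn.tree.export_text produces a similar textual if/else rendering of a fitted tree without the hand-rolled recursion; a short equivalent for the tree above, assuming feature_names matches the columns the tree was trained on:

from sklearn.tree import export_text

# Prints one indented line per node, mirroring the recursion above.
print(export_text(tree, feature_names=list(feature_names)))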
Example #36
    def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
                   query_groups, random_state):
        """Fit another tree to the boosting model."""
        assert sample_mask.dtype == np.bool_

        n_samples = X.shape[0]

        all_lambdas = np.zeros(n_samples)
        all_deltas = np.zeros(n_samples)
        for qid, a, b, _ in query_groups:
            for coef, metric in zip(self.metric_coefs, self.metrics):
                lambdas, deltas = metric.calc_lambdas_deltas(
                    qid, y[a:b], y_pred[a:b])
                all_lambdas[a:b] += coef * lambdas
                all_deltas[a:b] += coef * deltas

        tree = sklearn.tree.DecisionTreeRegressor(
            criterion='friedman_mse',
            splitter='best',
            presort=True,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=0.0,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=random_state)

        if self.subsample < 1.0 or self.query_subsample < 1.0:
            sample_weight = sample_weight * sample_mask.astype(np.float64)

        tree.fit(X, all_lambdas, sample_weight=sample_weight,
                 check_input=False)

        self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                      all_deltas, y_pred, sample_mask)
        self.estimators_[i, 0] = tree
        self.estimators_fitted_ = i + 1

        return y_pred
Example #37
    def _calculate(self, X, y, categorical):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
Example #38
def cluster_then_forest(xs, ys, in_sample_size):
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)
    clf = cluster.KMeans(n_clusters = 4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)

    for cid in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(cid))
        oos_ind = oos_clusterid == cid
        ins_ind = ins_clusterid == cid

        tree = ensemble.RandomForestRegressor(50)

        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print "Score for in-sample"
        print str(tree.score(in_sample[ins_ind], ys[isi][ins_ind]))

        print "Score for out-of sample"
        tree.predict(out_sample[oos_ind])
        print str(tree.score(out_sample[oos_ind], ys[osi][oos_ind]))

    return None
Example #39
File: lambdamart.py Project: Ulden/news
    def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
                   query_groups, random_state):
        """Fit another tree to the boosting model."""
        assert sample_mask.dtype == np.bool_

        n_samples = X.shape[0]

        all_lambdas = np.zeros(n_samples)
        all_deltas = np.zeros(n_samples)
        for qid, a, b, _ in query_groups:
            lambdas, deltas = self._calc_lambdas_deltas(qid, y[a:b], y_pred[a:b])
            all_lambdas[a:b] = lambdas
            all_deltas[a:b] = deltas

        tree = sklearn.ensemble.RandomForestRegressor(
            criterion='friedman_mse',
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=0.0,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=random_state,
            n_jobs = -1
        )

        if self.subsample < 1.0 or self.query_subsample < 1.0:
            sample_weight = sample_weight * sample_mask.astype(np.float64)

        tree.fit(X, all_lambdas, sample_weight=sample_weight)

        self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                      all_deltas, y_pred, sample_mask)
        self.estimators_[i, 0] = tree
        self.estimators_fitted_ = i + 1

        return y_pred
Example #40
target = 'safe_loans'

loans = loans[features + [target]]


#train_idx=ps.read_json('module-5-assignment-1-train-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json', 'r') as f:
    train_idx = json.load(f)

#test_idx=ps.read_json('module-5-assignment-1-test-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json', 'r') as f:
    validation_idx = json.load(f)
    
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]
train_matrix,train_output=get_numpy_data(train_data,features,target)
validation_matrix,validation_output=get_numpy_data(validation_data,features,target)

safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

tree=sklearn.tree.DecisionTreeClassifier(max_depth=6)

decision_tree_model=tree.fit(train_matrix,train_output)
Example #41
'''
Decision Tree
'''
binary_data = pd.get_dummies(all_census_prep)

X_train, X_test, y_train, y_test = model_selection.train_test_split(binary_data[binary_data.columns.difference(["earning_class"])], binary_data["earning_class"], train_size=0.80)
scaler = preprocessing.StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train.astype("float64")), columns=X_train.columns)
X_test = scaler.transform(X_test.astype("float64"))


from sklearn.tree import DecisionTreeClassifier, export_graphviz
tree = DecisionTreeClassifier(criterion='entropy',max_depth=20)

tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
(cm[0][0]+cm[1][1]).astype('float64')/sum(sum(cm))

feature_names = list(X_train.columns)
export_graphviz(tree, out_file="tree.dot",feature_names=feature_names)

import pydotplus
import pyparsing
from io import StringIO
dotfile = StringIO()
export_graphviz(tree, out_file=dotfile,feature_names=feature_names)
graph = pydotplus.graph_from_dot_data(dotfile.getvalue())
graph.write_png("dtree2.png")
Example #42
	def fit(self, X_train, Y_train):
		for i in range(self.n_tree):
			Xb_train, Yb_train = rswr(X_train, Y_train, 500)
			tree = DecisionTreeClassifier(max_depth=self.md, max_features='auto', min_samples_split=2)
			tree.fit(Xb_train, Yb_train)  # fit on the bootstrap sample, not the full training set
			self.tree_bags.append(tree)
Example #43
fpr_master.append(fpr)
tpr_master.append(tpr)
model.append('NB')
aucs.append(roc_auc)

# how about multi-fold cross-validation with 5 folds
cv_results_gnb = cross_val_score(gnb, x_train, y_train, cv=5)
print('CV Results:', round(cv_results_gnb.mean(), 3))  # cross-validation average accuracy

# -------------------------------------------------------------------
# Create the Decision Tree Model
# ------------------------------------------------------------------- 

#fit the decision tree model
tree = tree.DecisionTreeClassifier(random_state = 9999, criterion='entropy', max_depth=6, min_samples_leaf=5)
tree_model_fit = tree.fit(x_train, y_train)

# predicted class in test data
y_pred = tree_model_fit.predict(x_test)
print('\nDecision Tree Results:')
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Accuracy Score:', round(accuracy_score(y_test, y_pred), 3))

#calculate the TPR/FPR for the decision tree
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC : %f" % roc_auc)
fpr_master.append(fpr)
tpr_master.append(tpr)
model.append('DT')
aucs.append(roc_auc)
Example #44
 def train(self, feature_list, name):
     self.featureList=feature_list
     self.name=name+'-'+self.modelType
     modelFile=Settings.DATA_DIR+self.dataset.name+'_'+self.name
     if 'alpha' not in self.hyperParams:
         #Expecting hyper-params for GBRT; Add those hyper-params to the model file name
         modelFile=modelFile+'ensemble-'+str(self.hyperParams['ensemble'])+'_lr-'+str(self.hyperParams['lr'])+'_subsample-'+str(self.hyperParams['subsample'])+'_leaves-'+str(self.hyperParams['leaves'])
         
     if self.modelType=='tree' or self.modelType=='gbrt':
         modelFile+='.z'
     else:
         modelFile+='.npz'
         
     self.savedRankingsSize=None
     self.savedRankings=None
     
     if os.path.exists(modelFile):
         if self.modelType=='tree' or self.modelType=='gbrt':
             self.tree=joblib.load(modelFile)
             print("DeterministicPolicy:train [INFO] Using precomputed policy", modelFile, flush=True)
         else:
             with numpy.load(modelFile) as npFile:
                 self.policyParams=npFile['policyParams']
             print("DeterministicPolicy:train [INFO] Using precomputed policy", modelFile, flush=True)
             print("DeterministicPolicy:train [INFO] PolicyParams", self.policyParams,flush=True)
     else:
         numQueries=len(self.dataset.features)
     
         allFeatures=None
         allTargets=None
         print("DeterministicPolicy:train [INFO] Constructing features and targets", flush=True)
             
         if self.dataset.mask is None:
             allFeatures=scipy.sparse.vstack(self.dataset.features, format='csc')
             allTargets=numpy.hstack(self.dataset.relevances)
         else:
             temporaryFeatures=[]
             temporaryTargets=[]
             for currentQuery in range(numQueries):
                 temporaryFeatures.append(self.dataset.features[currentQuery][self.dataset.mask[currentQuery], :])
                 temporaryTargets.append(self.dataset.relevances[currentQuery][self.dataset.mask[currentQuery]])
             
             allFeatures=scipy.sparse.vstack(temporaryFeatures, format='csc')
             allTargets=numpy.hstack(temporaryTargets)
     
         if self.regressGains:
             allTargets=numpy.exp2(allTargets)-1.0
         
         allSampleWeights=None
         fitParams=None
         if self.weighted:
             allSampleWeights=numpy.array(self.dataset.docsPerQuery, dtype=numpy.float64)
             allSampleWeights=numpy.reciprocal(allSampleWeights)
             allSampleWeights=numpy.repeat(allSampleWeights, self.dataset.docsPerQuery)    
             fitParams={'sample_weight': allSampleWeights}
         
         #Restrict features to only the unmasked features
         if self.featureList is not None:
             print("DeterministicPolicy:train [INFO] Masking unused features. Remaining feature size", 
                 len(feature_list), flush=True)
             allFeatures = allFeatures[:, self.featureList]
     
         print("DeterministicPolicy:train [INFO] Beginning training", self.modelType, flush=True)
         if self.modelType=='tree':
             treeCV=sklearn.model_selection.GridSearchCV(sklearn.tree.DecisionTreeRegressor(criterion="mse",
                                                     splitter="random", min_samples_split=4, 
                                                     min_samples_leaf=4, presort=False),
                             param_grid=self.treeDepths,
                             scoring=None, fit_params=fitParams, n_jobs=-2,
                             iid=True, cv=5, refit=True, verbose=0, pre_dispatch="1*n_jobs",
                             error_score='raise', return_train_score=False)
                         
             treeCV.fit(allFeatures, allTargets)
             self.tree=treeCV.best_estimator_
             print("DeterministicPolicy:train [INFO] Done. Best depth", 
                         treeCV.best_params_['max_depth'], flush=True)
             joblib.dump(self.tree, modelFile, compress=9, protocol=-1)
         
         elif self.modelType=='lasso':
             lassoCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Lasso(fit_intercept=False,
                                                     normalize=False, precompute=False, copy_X=False, 
                                                     max_iter=3000, tol=1e-4, warm_start=False, positive=False,
                                                     random_state=None, selection='random'),
                             param_grid=self.hyperParams,
                             scoring=None, fit_params=fitParams, n_jobs=-2,
                             iid=True, cv=5, refit=True, verbose=0, pre_dispatch="1*n_jobs",
                             error_score='raise', return_train_score=False)
                             
             lassoCV.fit(allFeatures, allTargets)
             self.policyParams=lassoCV.best_estimator_.coef_
             print("DeterministicPolicy:train [INFO] Done. CVAlpha", lassoCV.best_params_['alpha'], flush=True)
             print("DeterministicPolicy:train [INFO] PolicyParams", self.policyParams,flush=True)
             numpy.savez_compressed(modelFile, policyParams=self.policyParams)
     
         elif self.modelType == 'ridge':
             ridgeCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Ridge(fit_intercept=False,
                                                                                 normalize=False, copy_X=False,
                                                                                 max_iter=3000, tol=1e-4, random_state=None),
                                                      param_grid=self.hyperParams,
                                                      n_jobs=-2, fit_params=fitParams,
                                                      iid=True, cv=3, refit=True, verbose=0, pre_dispatch='1*n_jobs')
             ridgeCV.fit(allFeatures, allTargets)
             self.policyParams=ridgeCV.best_estimator_.coef_
             print("DeterministicPolicy:train [INFO] Done. CVAlpha", ridgeCV.best_params_['alpha'], flush=True)
         elif self.modelType=='gbrt':
             tree=sklearn.ensemble.GradientBoostingRegressor(learning_rate=self.hyperParams['lr'],
                         n_estimators=self.hyperParams['ensemble'], subsample=self.hyperParams['subsample'], max_leaf_nodes=self.hyperParams['leaves'], 
                         max_features=1.0, presort=False)
             tree.fit(allFeatures, allTargets, sample_weight=allSampleWeights)
             self.tree=tree
             print("DeterministicPolicy:train [INFO] Done.", flush=True)
             joblib.dump(self.tree, modelFile, compress=9, protocol=-1)
         
         else:
             print("DeterministicPolicy:train [ERR] %s not supported." % self.modelType, flush=True)
             sys.exit(0)