Example #1
    def load_estimator(self):
        """Loads a decision forest estimator from a pre-built model.

        The ``model_dir`` in the :class:`DecisionForestParameters` needs to be set appropriately.
        """
        self.estimator = estimator.SKCompat(random_forest.TensorForestEstimator(self.parameters,
                                                                                model_dir=self.model_dir))
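For context, a standalone sketch of what ``load_estimator`` constructs, assuming TensorFlow 1.x with ``tf.contrib`` available; the import paths below are the usual contrib locations, and the hyperparameter values are illustrative stand-ins for ``DecisionForestParameters``, not taken from the snippet:

from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.tensor_forest.client import random_forest
from tensorflow.contrib.tensor_forest.python import tensor_forest

# Illustrative hyperparameters standing in for DecisionForestParameters.
params = tensor_forest.ForestHParams(
    num_classes=2, num_features=10, num_trees=10, max_nodes=1000).fill()

# model_dir must point at the directory holding the pre-built model checkpoint.
est = estimator.SKCompat(random_forest.TensorForestEstimator(
    params, model_dir='/tmp/forest_model'))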
Example #2
    def testAdditionalOutputs(self):
        """Tests multi-class classification using matrix data as input."""
        hparams = tensor_forest.ForestHParams(num_trees=1,
                                              max_nodes=100,
                                              num_classes=3,
                                              num_features=4,
                                              split_after_samples=20,
                                              inference_tree_paths=True)
        classifier = random_forest.TensorForestEstimator(
            hparams.fill(), keys_column='keys', include_all_in_serving=True)

        iris = base.load_iris()
        data = iris.data.astype(np.float32)
        labels = iris.target.astype(np.int32)

        input_fn = numpy_io.numpy_input_fn(
            x={
                'x': data,
                'keys': np.arange(len(iris.data)).reshape(150, 1)
            },
            y=labels,
            batch_size=10,
            num_epochs=1,
            shuffle=False)

        classifier.fit(input_fn=input_fn, steps=100)
        predictions = list(classifier.predict(input_fn=input_fn))
        # Check that each prediction carries the keys column, the tree
        # paths, and the prediction variance.
        for pred in predictions:
            self.assertIn('keys', pred)
            self.assertIn('tree_paths', pred)
            self.assertIn('prediction_variance', pred)
Example #3
def rf_train(x_train, y_train, x_test, y_test):
    params = tensor_forest.ForestHParams(num_classes=10,
                                         num_features=784,
                                         num_trees=100,
                                         max_nodes=10000)

    graph_builder_class = tensor_forest.TrainingLossForest

    est = estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir="./models"))

    est.fit(x=x_train, y=y_train, batch_size=128)

    metric_name = "accuracy"

    metric = {
        metric_name:
        metric_spec.MetricSpec(
            eval_metrics.get_metric(metric_name),
            prediction_key=eval_metrics.get_prediction_key(metric_name))
    }

    results = est.score(x=x_test, y=y_test, batch_size=128, metrics=metric)

    for key in sorted(results):
        print("%s: %s" % (key, results[key]))
Example #4
def build_estimator(model_dir):
  params = tensor_forest.ForestHParams(
      num_classes=10, num_features=784,
      num_trees=FLAGS.num_trees, max_nodes=FLAGS.max_nodes)
  graph_builder_class = tensor_forest.RandomForestGraphs
 
  return random_forest.TensorForestEstimator(
      params, graph_builder_class=graph_builder_class,
      model_dir=model_dir)
Example #5
def predict(train_x, train_y, test_x, model):
    if model == 'rf':
        # Train random forest model on all train samples
        hparams = randomForestParams()
        clf = random_forest.TensorForestEstimator(hparams)
        clf.fit(x=train_x, y=train_y)
    else:
        # Guard against an unbound `clf` below.
        raise ValueError('unsupported model type: %s' % model)

    # Return list of predictions
    return list(clf.predict(x=test_x))
Example #6
def build_estimator(model_dir):
    """Build an estimator."""
    params = tensor_forest.ForestHParams(num_classes=10,
                                         num_features=784,
                                         num_trees=FLAGS.num_trees,
                                         max_nodes=FLAGS.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if FLAGS.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    return random_forest.TensorForestEstimator(
        params, graph_builder_class=graph_builder_class, model_dir=model_dir)
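The snippet reads ``FLAGS`` without defining it; a minimal sketch using the TF 1.x flags module, with flag names matching the attributes used above and defaults chosen for illustration:

import tensorflow as tf

tf.app.flags.DEFINE_integer('num_trees', 100, 'Number of trees in the forest.')
tf.app.flags.DEFINE_integer('max_nodes', 1000, 'Maximum nodes per tree.')
tf.app.flags.DEFINE_boolean('use_training_loss', False,
                            'Use the training-loss graph builder.')
FLAGS = tf.app.flags.FLAGS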
Example #7
def train_rf(num_features, config):
    """Build the tf model."""
    params = tensor_forest.ForestHParams(
        num_classes=config.num_classes, num_features=num_features,
        num_trees=config.num_trees, max_nodes=config.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if config.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    return random_forest.TensorForestEstimator(
        params, graph_builder_class=graph_builder_class,
        model_dir=config.model_output)
Example #8
def build_rf_estimator(model_dir, feature_count):
    params = tensor_forest.ForestHParams(
        num_classes=2,
        num_features=feature_count,
        num_trees=100,
        max_nodes=1000,
        min_split_samples=10)
    
    graph_builder_class = tensor_forest.RandomForestGraphs
    return estimator.SKCompat(random_forest.TensorForestEstimator(
        params, graph_builder_class=graph_builder_class,
        model_dir=model_dir))
Example #9
def build_estimator(model_dir):
  """Build an estimator."""
  params = tensor_forest.ForestHParams(
      num_classes=10, num_features=784,
      num_trees=FLAGS.num_trees, max_nodes=FLAGS.max_nodes)
  graph_builder_class = tensor_forest.RandomForestGraphs
  if FLAGS.use_training_loss:
    graph_builder_class = tensor_forest.TrainingLossForest
  # Use the SKCompat wrapper, which gives us a convenient way to split
  # in-memory data like MNIST into batches.
  return estimator.SKCompat(random_forest.TensorForestEstimator(
      params, graph_builder_class=graph_builder_class,
      model_dir=model_dir))
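With the ``SKCompat`` wrapper, in-memory arrays can be passed directly and batched automatically; a minimal usage sketch with synthetic MNIST-shaped data (the ``model_dir`` path is illustrative):

import numpy as np

est = build_estimator('/tmp/mnist_forest')
images = np.random.rand(1000, 784).astype(np.float32)
labels = np.random.randint(0, 10, size=1000).astype(np.int32)
est.fit(x=images, y=labels, batch_size=128)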
Example #10
  def testClassificationTrainingLoss(self):
    """Tests multi-class classification using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=3, max_nodes=1000, num_classes=3, num_features=4)
    classifier = random_forest.TensorForestEstimator(
        hparams, graph_builder_class=(tensor_forest.TrainingLossForest))

    iris = base.load_iris()
    data = iris.data.astype(np.float32)
    labels = iris.target.astype(np.float32)

    monitors = [random_forest.TensorForestLossHook(10)]
    classifier.fit(x=data, y=labels, steps=100, monitors=monitors)
    classifier.evaluate(x=data, y=labels, steps=10)
Example #11
    def testClassification(self):
        """Tests multi-class classification using matrix data as input."""
        hparams = tensor_forest.ForestHParams(num_trees=3,
                                              max_nodes=1000,
                                              num_classes=3,
                                              num_features=4,
                                              split_after_samples=20)
        classifier = random_forest.TensorForestEstimator(hparams.fill())

        iris = base.load_iris()
        data = iris.data.astype(np.float32)
        labels = iris.target.astype(np.float32)

        classifier.fit(x=data, y=labels, steps=100, batch_size=50)
        classifier.evaluate(x=data, y=labels, steps=10)
Example #12
def figShowAndWrite(dataSet, label):
    """
    Classify the data set with a decision tree and plot the result.

    :param dataSet: the data set
    :param label: the labels

    :return:
    """

    # Get the number of features and the number of classes
    featureNum = dataSet.shape[1]
    classNum = len(set(label))

    # Build the decision tree with the high-level API

    # Create the ForestHParams hyperparameters from the given values
    params = tensor_forest.ForestHParams(num_classes=classNum,
                                         num_features=featureNum,
                                         num_trees=1,
                                         max_nodes=20)

    # Create the decision tree estimator from the ForestHParams
    classifier = random_forest.TensorForestEstimator(params)

    # Fit the decision tree to the training set
    classifier.fit(dataSet, label)

    # Visualize the classification result of the decision tree

    # Build the plotting grid
    x_min, x_max = dataSet[:, 0].min() - 1, dataSet[:, 0].max() + 1
    y_min, y_max = dataSet[:, 1].min() - 1, dataSet[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    # Classify every grid point with the trained decision tree
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
    # predict() yields dicts; extract the class id per point as a float array
    # (the original loop mutated an object array, which contourf can choke on).
    Z = np.array([pred['classes'] for pred in Z], dtype=np.float32)
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.2)
    plt.scatter(dataSet[:, 0], dataSet[:, 1], c=label + 3, alpha=1)
    plt.show()
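A minimal driver for ``figShowAndWrite`` with two synthetic 2-D clusters; two features keep the mesh grid over the first two columns consistent with the forest's input:

import numpy as np

# Two Gaussian clusters, 50 points each, offset so they are separable.
cluster_a = np.random.randn(50, 2).astype(np.float32)
cluster_b = np.random.randn(50, 2).astype(np.float32) + 3.0
data = np.vstack([cluster_a, cluster_b])
labels = np.array([0] * 50 + [1] * 50, dtype=np.int32)

figShowAndWrite(data, labels)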
Example #13
    def estimator_fit(self,
                      regression=True,
                      num_classes=2,
                      num_trees=None,
                      max_nodes=1000,
                      max_fertile_nodes=0,
                      rootLogger=None):
        #### Random Forest
        X_train = np.float32(self.dict_test_and_train['X_train'])
        Y_train = np.float32(self.dict_test_and_train['Y_train'])
        num_features = X_train.shape[-1]

        if num_trees is None:
            num_trees = num_features

        tf.reset_default_graph()

        params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
            regression=regression,
            num_classes=num_classes,
            num_features=num_features,
            num_trees=num_trees,  ## can double it
            max_nodes=max_nodes,
            num_outputs=np.shape(Y_train)[1],
            max_fertile_nodes=max_fertile_nodes,  ## 100?
            #prune_every_samples = 300,
            #split_finish_name='basic',
            #    pruning_name='half'

            #model_name = 'all_sparse' ## default is all_dense
            #feature_bagging_fraction = 0.7
            #      use_running_stats_method=True,
            #      checkpoint_stats= True,
            ##      bagging_fraction=1
            #      feature_bagg
        )
        estimator = random_forest.TensorForestEstimator(
            params, report_feature_importances=True)
        # Attribute reads only (no side effects); they expose the
        # checkpointing configuration of the estimator.
        estimator.config.save_checkpoints_steps
        estimator.config.save_checkpoints_secs

        #with tf.Session() as session:
        #input_fn_train = tf.estimator.inputs.numpy_input_fn(X_train, X_train, batch_size=1, shuffle=False, num_epochs=1)
        #input_fn_test = tf.estimator.inputs.numpy_input_fn(X_test, batch_size=1, shuffle=False, num_epochs=1)
        rootLogger.info(estimator.fit(X_train, Y_train))
        #rootLogger.info(estimator.fit(input_fn_train))
        self.estimator = estimator
Example #14
def build_estimator(model_dir):
    params = tensor_forest.ForestHParams(
        num_classes=config.num_classes,
        num_features=config.num_features,
        num_trees=config.num_trees,
        max_nodes=config.max_nodes,
        bagging_fraction=config.bagging_fraction,
        feature_bagging_fraction=config.feature_bagging_fraction)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if config.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    # Use the SKCompat wrapper, which gives us a convenient way to split
    # in-memory data like MNIST into batches.
    return estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir=model_dir))
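The snippet reads a module-level ``config`` object; a minimal stand-in with exactly the attributes used above (all values illustrative):

from types import SimpleNamespace

config = SimpleNamespace(
    num_classes=10, num_features=784, num_trees=100, max_nodes=10000,
    bagging_fraction=1.0, feature_bagging_fraction=1.0,
    use_training_loss=False)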
Example #15
    def testRegression(self):
        """Tests regression using matrix data as input."""

        hparams = tensor_forest.ForestHParams(num_trees=3,
                                              max_nodes=1000,
                                              num_classes=1,
                                              num_features=13,
                                              regression=True,
                                              split_after_samples=20)

        regressor = random_forest.TensorForestEstimator(hparams.fill())

        boston = base.load_boston()
        data = boston.data.astype(np.float32)
        labels = boston.target.astype(np.int32)

        regressor.fit(x=data, y=labels, steps=100, batch_size=50)
        regressor.evaluate(x=data, y=labels, steps=10)
Example #16
def crossValidate(train_x, train_y, model):

    ### Training & Cross-validating
    # 1) Make dataset for cross validation (K-Fold)
    # 2) Split train dataset to train & test

    # Generate dataset to perform k-fold cross validation (currently 10-fold)
    dataset = generateDataset(train_x, train_y, 10)
    models = dict()
    i = 1
    print('-----------------------------------------------')

    # Perform k-fold cross-validation
    for x_train, y_train, x_test, y_test in tfe.Iterator(dataset):

        # Random Forest
        if model == 'rf':
            hparams = randomForestParams()
            clf = random_forest.TensorForestEstimator(hparams)
        else:
            # Guard against an unbound `clf` below.
            raise ValueError('unsupported model type: %s' % model)

        # Temporary location where models are stored
        # print(clf.model_dir)
        # Create and fit model
        clf.fit(x=x_train.numpy(), y=y_train.numpy())

        # Make predictions
        pred = list(clf.predict(x=x_test.numpy()))
        # pred_prob = list(y['probabilities'] for y in pred)
        # Make list of predictions
        pred_class = list(y['classes'] for y in pred)

        # Calculate accuracy
        n = len(y_test.numpy())
        class_zip = list(zip(y_test.numpy(), pred_class))
        n_correct = sum(1 for p in class_zip if p[0] == p[1])
        acc = n_correct / n
        models[i] = acc
        print('The accuracy of model #%d is: %f' % (i, acc))
        i += 1

    print('-----------------------------------------------')
    print('The average accuracy of the models is : %f' %
          (sum(models.values()) / len(models.values())))
    print('-----------------------------------------------')
Example #17
    def testEarlyStopping(self):
        """Tests multi-class classification using matrix data as input."""
        hparams = tensor_forest.ForestHParams(num_trees=100,
                                              max_nodes=10000,
                                              num_classes=3,
                                              num_features=4,
                                              split_after_samples=20,
                                              inference_tree_paths=True)
        classifier = random_forest.TensorForestEstimator(
            hparams.fill(),
            # Set a crazy threshold - 30% loss change.
            early_stopping_loss_threshold=0.3,
            early_stopping_rounds=2)

        input_fn, _ = _get_classification_input_fns()
        classifier.fit(input_fn=input_fn, steps=100)

        # We stopped early.
        self._assert_checkpoint(classifier.model_dir, global_step=5)
Example #18
    def testClassification(self):
        """Tests multi-class classification using matrix data as input."""
        hparams = tensor_forest.ForestHParams(num_trees=3,
                                              max_nodes=1000,
                                              num_classes=3,
                                              num_features=4,
                                              split_after_samples=20,
                                              inference_tree_paths=True)
        classifier = random_forest.TensorForestEstimator(hparams.fill())

        input_fn, predict_input_fn = _get_classification_input_fns()
        classifier.fit(input_fn=input_fn, steps=100)
        res = classifier.evaluate(input_fn=input_fn, steps=10)

        self.assertEqual(1.0, res['accuracy'])
        self.assertAllClose(0.55144483, res['loss'])

        predictions = list(classifier.predict(input_fn=predict_input_fn))
        self.assertAllClose([[0.576117, 0.211942, 0.211942]],
                            [pred['probabilities'] for pred in predictions])
Example #19
    def testRegression(self):
        """Tests regression using matrix data as input."""

        hparams = tensor_forest.ForestHParams(num_trees=5,
                                              max_nodes=1000,
                                              num_classes=1,
                                              num_features=13,
                                              regression=True,
                                              split_after_samples=20)

        regressor = random_forest.TensorForestEstimator(hparams.fill())

        input_fn, predict_input_fn = _get_regression_input_fns()

        regressor.fit(input_fn=input_fn, steps=100)
        res = regressor.evaluate(input_fn=input_fn, steps=10)
        self.assertGreaterEqual(0.1, res['loss'])

        predictions = list(regressor.predict(input_fn=predict_input_fn))
        self.assertAllClose([24.], [pred['scores'] for pred in predictions],
                            atol=1)
Example #20
    def train(self, data: np.ndarray, labels: np.ndarray):
        """Trains the decision forest classifier.

        Args:
            data (np.ndarray): The training data.
            labels (np.ndarray): The labels of the training data.
        """

        # build the estimator
        if self.use_training_loss:
            graph_builder_class = tensor_forest.TrainingLossForest
        else:
            graph_builder_class = tensor_forest.RandomForestGraphs

        self.estimator = estimator.SKCompat(
            random_forest.TensorForestEstimator(
                self.parameters,
                graph_builder_class=graph_builder_class,
                model_dir=self.model_dir,
                report_feature_importances=self.report_feature_importances))

        self.estimator.fit(x=data, y=labels, batch_size=self.batch_size)
Example #21
    def get_experiment(output_dir):
        """Function that creates an experiment http://goo.gl/HcKHlT.

        Args:
          output_dir: The directory where the training output should be written.

        Returns:
          A `tf.contrib.learn.Experiment`.
        """

        config = PIPELINE_CONFIG.get(args.dataset)
        columns = feature_columns(config, args.model_type, vocab_sizes,
                                  not args.ignore_crosses)

        runconfig = tf.contrib.learn.RunConfig()
        cluster = runconfig.cluster_spec
        num_table_shards = max(1, runconfig.num_ps_replicas * 3)
        num_partitions = max(
            1, 1 + cluster.num_tasks('worker')
            if cluster and 'worker' in cluster.jobs else 0)

        monitors = []
        if args.model_type == LINEAR:
            l2_regularization = args.l2_regularization or config[
                L2_REGULARIZATION]
            estimator = tf.contrib.learn.LinearClassifier(
                model_dir=output_dir,
                feature_columns=columns,
                optimizer=tf.contrib.linear_optimizer.SDCAOptimizer(
                    example_id_column=KEY_FEATURE_COLUMN,
                    symmetric_l2_regularization=l2_regularization,
                    num_loss_partitions=num_partitions,  # workers
                    num_table_shards=num_table_shards))  # ps
        elif args.model_type == DEEP:
            estimator = tf.contrib.learn.DNNClassifier(
                hidden_units=args.hidden_units,
                feature_columns=columns,
                model_dir=output_dir)
        elif args.model_type == RANDOM_FOREST:
            params = tensor_forest.ForestHParams(
                num_classes=2,
                num_features=CRITEO_NUM_FEATURES,
                num_trees=args.num_trees,
                max_nodes=args.max_nodes)
            # TODO(pew): Returns TensorForestEstimator object directly after a new
            # release of TensorFlow becomes available.
            estimator = random_forest.TensorForestEstimator(
                params.fill(),
                feature_engineering_fn=_int_to_float_map,
                early_stopping_rounds=2000,
                model_dir=output_dir)._estimator
            monitors.append(random_forest.TensorForestLossHook(100))

        l2_regularization = args.l2_regularization or config[L2_REGULARIZATION]

        input_placeholder_for_prediction = get_placeholder_input_fn(
            config, args.model_type, vocab_sizes, not args.ignore_crosses)

        # Export the last model to a predetermined location on GCS.
        export_monitor = util.ExportLastModelMonitor(
            output_dir=output_dir,
            final_model_location='model',  # Relative to the output_dir.
            additional_assets=[args.metadata_path],
            input_fn=input_placeholder_for_prediction,
            input_feature_key=EXAMPLES_PLACEHOLDER_KEY,
            signature_fn=get_export_signature)
        monitors.append(export_monitor)

        train_input_fn = get_reader_input_fn(args.train_data_paths, config,
                                             args.model_type, vocab_sizes,
                                             args.batch_size,
                                             not args.ignore_crosses,
                                             tf.contrib.learn.ModeKeys.TRAIN)

        eval_input_fn = get_reader_input_fn(args.eval_data_paths, config,
                                            args.model_type, vocab_sizes,
                                            args.eval_batch_size,
                                            not args.ignore_crosses,
                                            tf.contrib.learn.ModeKeys.EVAL)

        train_set_size = args.train_set_size or config[NUM_EXAMPLES]

        def _get_eval_metrics(model_type):
            """Returns a dict of 'string' to 'MetricSpec' objects."""
            classes_prediction_key = "classes"
            if model_type == RANDOM_FOREST:
                classes_prediction_key = "predictions"
            eval_metrics = {}
            eval_metrics["accuracy"] = metric_spec.MetricSpec(
                prediction_key=classes_prediction_key,
                metric_fn=tf.contrib.metrics.streaming_accuracy)
            eval_metrics["precision"] = metric_spec.MetricSpec(
                prediction_key=classes_prediction_key,
                metric_fn=tf.contrib.metrics.streaming_precision)
            eval_metrics["recall"] = metric_spec.MetricSpec(
                prediction_key=classes_prediction_key,
                metric_fn=tf.contrib.metrics.streaming_recall)
            return eval_metrics

        # TODO(zoy): Switch to using ExportStrategy when available.
        experiment = tf.contrib.learn.Experiment(
            estimator=estimator,
            train_steps=(args.train_steps or
                         args.num_epochs * train_set_size // args.batch_size),
            eval_steps=args.eval_steps,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            eval_metrics=_get_eval_metrics(args.model_type),
            train_monitors=monitors,
            min_eval_frequency=100)

        return experiment
Example #22
                                                    num_epochs=None,
                                                    shuffle=True)

# Parameters
num_classes = 4
num_features = 59
num_trees = 4
max_nodes = 1000

# Random forest parameters
hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                      num_features=num_features,
                                      num_trees=num_trees,
                                      max_nodes=max_nodes).fill()

classifier = random_forest.TensorForestEstimator(hparams)

classifier.fit(input_fn=train_input_fn, steps=None)

# Verify results
metric_name = 'accuracy'
metric = {
    metric_name:
    metric_spec.MetricSpec(
        eval_metrics.get_metric(metric_name),
        prediction_key=eval_metrics.get_prediction_key(metric_name))
}

test_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': features[0:10]},
                                                   y=labels[0:10],
                                                   num_epochs=None,
Example #23
    "accuracy":
    tf.contrib.learn.MetricSpec(
        metric_fn=tf.contrib.metrics.streaming_accuracy,
        prediction_key='probabilities'
        ),
    "precision":
    tf.contrib.learn.MetricSpec(
         metric_fn=tf.contrib.metrics.streaming_precision,
         prediction_key='probabilities'
         ),
    "recall":
    tf.contrib.learn.MetricSpec(
        metric_fn=tf.contrib.metrics.streaming_recall,
        prediction_key='probabilities'
        )
    }

hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    num_trees=10,
    max_nodes=1000,
    num_classes=2,
    num_features=len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS))
classifier = random_forest.TensorForestEstimator(
    hparams, model_dir=model_dir,
    config=tf.contrib.learn.RunConfig(save_checkpoints_secs=60))

classifier.fit(input_fn=train_input_fn, steps=200)
results = classifier.evaluate(
    input_fn=eval_input_fn, steps=1, metrics=validation_metrics)
print(results)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
Example #24
    num_outputs=np.shape(Y_train)[1],
    max_fertile_nodes=0,  ## 100?
    #prune_every_samples = 300,
    #split_finish_name='basic',
    #    pruning_name='half'

    #model_name = 'all_sparse' ## default is all_dense
    #feature_bagging_fraction = 0.7
    #      use_running_stats_method=True,
    #      checkpoint_stats= True,
    ##      bagging_fraction=1
    #      feature_bagg
)

#random_forest.estimator.list_variables('/tmp/tmpzvdna1ol')
estimator = random_forest.TensorForestEstimator(
    params, report_feature_importances=True)

# Attribute reads only (no side effects); they expose the checkpointing
# configuration of the estimator.
estimator.config.save_checkpoints_steps
estimator.config.save_checkpoints_secs
## Fit
X_train = np.float32(X_train)
Y_train = np.float32(Y_train)
#fit = estimator.fit(input_fn = input_fn)

rootLogger.info(estimator.fit(X_train, Y_train))
dict_feature_importances = md.get_feature_importances(feature_names, log_path)
if (compare_results_with_previous_run):
    previous_log_path = log_path[:-4] + '_previous.log'
    dict_feature_importances_previous = md.get_feature_importances(
        feature_names, previous_log_path)
    dict_compare_feature_importances = {}
Example #25
    }
    # Merges the two dictionaries into one (list() makes this work on
    # Python 3, where dict views cannot be concatenated with +).
    feature_cols = dict(list(continuous_cols.items()) +
                        list(categorical_cols.items()))
    # Add example id list
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label


def train_input_fn():
    return input_fn(df_train)


def eval_input_fn():
    return input_fn(df_test)


model_dir = '../rf_model_dir'

hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    num_trees=10,
    max_nodes=1000,
    num_classes=2,
    num_features=len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS))
classifier = random_forest.TensorForestEstimator(hparams, model_dir=model_dir)
classifier.fit(input_fn=train_input_fn, steps=100)
results = classifier.evaluate(input_fn=eval_input_fn, steps=1)
print(results)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))