def load_estimator(self):
    """Loads a decision forest estimator from a pre-built model.

    The ``model_dir`` in the :class:`DecisionForestParameters` needs to be set
    appropriately.
    """
    self.estimator = estimator.SKCompat(
        random_forest.TensorForestEstimator(self.parameters,
                                            model_dir=self.model_dir))
def testAdditionalOutputs(self):
    """Tests additional prediction outputs (keys, tree paths, variance)."""
    hparams = tensor_forest.ForestHParams(
        num_trees=1,
        max_nodes=100,
        num_classes=3,
        num_features=4,
        split_after_samples=20,
        inference_tree_paths=True)
    classifier = random_forest.TensorForestEstimator(
        hparams.fill(), keys_column='keys', include_all_in_serving=True)

    iris = base.load_iris()
    data = iris.data.astype(np.float32)
    labels = iris.target.astype(np.int32)

    input_fn = numpy_io.numpy_input_fn(
        x={
            'x': data,
            'keys': np.arange(len(iris.data)).reshape(150, 1)
        },
        y=labels,
        batch_size=10,
        num_epochs=1,
        shuffle=False)

    classifier.fit(input_fn=input_fn, steps=100)
    predictions = list(classifier.predict(input_fn=input_fn))
    # Check that there is a key column, tree paths and variance.
    for pred in predictions:
        self.assertTrue('keys' in pred)
        self.assertTrue('tree_paths' in pred)
        self.assertTrue('prediction_variance' in pred)
def rf_train(x_train, y_train, x_test, y_test):
    params = tensor_forest.ForestHParams(
        num_classes=10, num_features=784, num_trees=100, max_nodes=10000)
    graph_builder_class = tensor_forest.TrainingLossForest
    est = estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir="./models"))
    est.fit(x=x_train, y=y_train, batch_size=128)

    metric_name = "accuracy"
    metric = {
        metric_name:
            metric_spec.MetricSpec(
                eval_metrics.get_metric(metric_name),
                prediction_key=eval_metrics.get_prediction_key(metric_name))
    }

    results = est.score(x=x_test, y=y_test, batch_size=128, metrics=metric)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))
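# A minimal usage sketch for rf_train above, assuming TensorFlow 1.x with
# tf.contrib available. Loading MNIST via tf.keras and the flatten/scale
# preprocessing shown here are illustrative assumptions, not part of the
# original snippet.
import numpy as np
import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784).astype(np.float32) / 255.0
x_test = x_test.reshape(-1, 784).astype(np.float32) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

rf_train(x_train, y_train, x_test, y_test)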
def build_estimator(model_dir):
    params = tensor_forest.ForestHParams(
        num_classes=10,
        num_features=784,
        num_trees=FLAGS.num_trees,
        max_nodes=FLAGS.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    return random_forest.TensorForestEstimator(
        params, graph_builder_class=graph_builder_class, model_dir=model_dir)
def predict(train_x, train_y, test_x, model):
    if model == 'rf':
        # Train random forest model on all train samples
        hparams = randomForestParams()
        clf = random_forest.TensorForestEstimator(hparams)
        clf.fit(x=train_x, y=train_y)
        # Return list of predictions
        return list(clf.predict(x=test_x))
def build_estimator(model_dir):
    """Build an estimator."""
    params = tensor_forest.ForestHParams(
        num_classes=10,
        num_features=784,
        num_trees=FLAGS.num_trees,
        max_nodes=FLAGS.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if FLAGS.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    return random_forest.TensorForestEstimator(
        params, graph_builder_class=graph_builder_class, model_dir=model_dir)
def train_rf(num_features, config):
    """Build the tf model."""
    params = tensor_forest.ForestHParams(
        num_classes=config.num_classes,
        num_features=num_features,
        num_trees=config.num_trees,
        max_nodes=config.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if config.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    return random_forest.TensorForestEstimator(
        params,
        graph_builder_class=graph_builder_class,
        model_dir=config.model_output)
def build_rf_estimator(model_dir, feature_count):
    params = tensor_forest.ForestHParams(
        num_classes=2,
        num_features=feature_count,
        num_trees=100,
        max_nodes=1000,
        min_split_samples=10)
    graph_builder_class = tensor_forest.RandomForestGraphs
    return estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir=model_dir))
def build_estimator(model_dir):
    """Build an estimator."""
    params = tensor_forest.ForestHParams(
        num_classes=10,
        num_features=784,
        num_trees=FLAGS.num_trees,
        max_nodes=FLAGS.max_nodes)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if FLAGS.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    # Use the SKCompat wrapper, which gives us a convenient way to split
    # in-memory data like MNIST into batches.
    return estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir=model_dir))
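# A hedged usage sketch for build_estimator above: fit and score the
# SKCompat-wrapped estimator on in-memory arrays. The random placeholder
# data, batch size and model directory are illustrative assumptions, and
# FLAGS.num_trees / FLAGS.max_nodes are expected to be defined elsewhere.
import numpy as np

x_train = np.random.rand(1000, 784).astype(np.float32)          # placeholder features
y_train = np.random.randint(0, 10, size=1000).astype(np.int32)  # placeholder labels

est = build_estimator("/tmp/tf_forest_model")
est.fit(x=x_train, y=y_train, batch_size=100)
results = est.score(x=x_train, y=y_train, batch_size=100)
print(results)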
def testClassificationTrainingLoss(self):
    """Tests multi-class classification using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=3, max_nodes=1000, num_classes=3, num_features=4)
    classifier = random_forest.TensorForestEstimator(
        hparams, graph_builder_class=(tensor_forest.TrainingLossForest))

    iris = base.load_iris()
    data = iris.data.astype(np.float32)
    labels = iris.target.astype(np.float32)

    monitors = [random_forest.TensorForestLossHook(10)]
    classifier.fit(x=data, y=labels, steps=100, monitors=monitors)
    classifier.evaluate(x=data, y=labels, steps=10)
def testClassification(self):
    """Tests multi-class classification using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=3,
        max_nodes=1000,
        num_classes=3,
        num_features=4,
        split_after_samples=20)
    classifier = random_forest.TensorForestEstimator(hparams.fill())

    iris = base.load_iris()
    data = iris.data.astype(np.float32)
    labels = iris.target.astype(np.float32)

    classifier.fit(x=data, y=labels, steps=100, batch_size=50)
    classifier.evaluate(x=data, y=labels, steps=10)
def figShowAndWrite(dataSet, label):
    """Classifies the data set with a decision forest and plots the result.

    :param dataSet: the data set
    :param label: the labels
    :return:
    """
    # Get the number of features and the number of classes.
    featureNum = dataSet.shape[1]
    classNum = len(set(label))

    # Use the high-level API to build the decision forest.
    # Build the ForestHParams parameters from the given arguments.
    params = tensor_forest.ForestHParams(
        num_classes=classNum, num_features=featureNum, num_trees=1, max_nodes=20)
    # Create the decision forest estimator from the ForestHParams parameters.
    classifier = random_forest.TensorForestEstimator(params)
    # Fit the decision forest on the training set.
    classifier.fit(dataSet, label)

    # Visualize the classification result of the decision forest.
    x_min, x_max = dataSet[:, 0].min() - 1, dataSet[:, 0].max() + 1
    y_min, y_max = dataSet[:, 1].min() - 1, dataSet[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    # Classify every point on the grid with the trained forest.
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
    Z = np.array(list(Z))
    for i in range(len(Z)):
        Z[i] = Z[i]['classes']
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.2)
    plt.scatter(dataSet[:, 0], dataSet[:, 1], c=label + 3, alpha=1)
    plt.show()
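# A hedged usage sketch for figShowAndWrite above. The prediction grid uses
# exactly two feature columns, so a toy two-feature data set is generated
# here with sklearn's make_blobs; that helper and these parameter values are
# illustrative assumptions, not part of the original snippet.
import numpy as np
from sklearn.datasets import make_blobs

dataSet, label = make_blobs(n_samples=300, centers=3, n_features=2,
                            random_state=0)
figShowAndWrite(dataSet.astype(np.float32), label.astype(np.int32))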
def estimator_fit(self,
                  regression=True,
                  num_classes=2,
                  num_trees=None,
                  max_nodes=1000,
                  max_fertile_nodes=0,
                  rootLogger=None):
    #### Random Forest
    X_train = np.float32(self.dict_test_and_train['X_train'])
    Y_train = np.float32(self.dict_test_and_train['Y_train'])
    num_features = X_train.shape[-1]
    if num_trees is None:
        num_trees = num_features

    tf.reset_default_graph()
    params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
        regression=regression,
        num_classes=num_classes,
        num_features=num_features,
        num_trees=num_trees,  ## can double it
        max_nodes=max_nodes,
        num_outputs=np.shape(Y_train)[1],
        max_fertile_nodes=max_fertile_nodes,  ## 100?
        #prune_every_samples = 300,
        #split_finish_name='basic',
        # pruning_name='half'
        #model_name = 'all_sparse'  ## default is all_dense
        #feature_bagging_fraction = 0.7
        # use_running_stats_method=True,
        # checkpoint_stats= True,  ## bagging_fraction=1
        # feature_bagg
    )

    estimator = random_forest.TensorForestEstimator(
        params, report_feature_importances=True)
    estimator.config.save_checkpoints_steps
    estimator.config.save_checkpoints_secs

    #with tf.Session() as session:
    #input_fn_train = tf.estimator.inputs.numpy_input_fn(
    #    X_train, X_train, batch_size=1, shuffle=False, num_epochs=1)
    #input_fn_test = tf.estimator.inputs.numpy_input_fn(
    #    X_test, batch_size=1, shuffle=False, num_epochs=1)
    rootLogger.info(estimator.fit(X_train, Y_train))
    #rootLogger.info(estimator.fit(input_fn_train))
    self.estimator = estimator
def build_estimator(model_dir):
    params = tensor_forest.ForestHParams(
        num_classes=config.num_classes,
        num_features=config.num_features,
        num_trees=config.num_trees,
        max_nodes=config.max_nodes,
        bagging_fraction=config.bagging_fraction,
        feature_bagging_fraction=config.feature_bagging_fraction)
    graph_builder_class = tensor_forest.RandomForestGraphs
    if config.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    # Use the SKCompat wrapper, which gives us a convenient way to split
    # in-memory data like MNIST into batches.
    return estimator.SKCompat(
        random_forest.TensorForestEstimator(
            params,
            graph_builder_class=graph_builder_class,
            model_dir=model_dir))
def testRegression(self):
    """Tests regression using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=3,
        max_nodes=1000,
        num_classes=1,
        num_features=13,
        regression=True,
        split_after_samples=20)
    regressor = random_forest.TensorForestEstimator(hparams.fill())

    boston = base.load_boston()
    data = boston.data.astype(np.float32)
    labels = boston.target.astype(np.int32)

    regressor.fit(x=data, y=labels, steps=100, batch_size=50)
    regressor.evaluate(x=data, y=labels, steps=10)
def crossValidate(train_x, train_y, model):
    ### Training & Cross-validating
    # 1) Make dataset for cross validation (K-Fold)
    # 2) Split train dataset to train & test

    # Generate dataset to perform k-fold cross validation (currently 10-fold)
    dataset = generateDataset(train_x, train_y, 10)
    models = dict()
    i = 1
    print('-----------------------------------------------')
    # Perform k-fold cross-validation
    for x_train, y_train, x_test, y_test in tfe.Iterator(dataset):
        # Random Forest
        if model == 'rf':
            hparams = randomForestParams()
            clf = random_forest.TensorForestEstimator(hparams)
            # Temporary location where models are stored
            # print(clf.model_dir)

            # Create and fit model
            clf.fit(x=x_train.numpy(), y=y_train.numpy())
            # Make predictions
            pred = list(clf.predict(x=x_test.numpy()))
            # pred_prob = list(y['probabilities'] for y in pred)
            # Make list of predictions
            pred_class = list(y['classes'] for y in pred)

            # Calculate accuracy
            n = len(y_test.numpy())
            class_zip = list(zip(y_test.numpy(), pred_class))
            n_correct = sum(1 for p in class_zip if p[0] == p[1])
            acc = n_correct / n
            models[i] = acc

            print('The accuracy of model #%d is: %f' % (i, acc))
            i += 1
    print('-----------------------------------------------')
    print('The average accuracy of the models is : %f' %
          (sum(models.values()) / len(models.values())))
    print('-----------------------------------------------')
def testEarlyStopping(self):
    """Tests that training stops early once the loss stops improving."""
    hparams = tensor_forest.ForestHParams(
        num_trees=100,
        max_nodes=10000,
        num_classes=3,
        num_features=4,
        split_after_samples=20,
        inference_tree_paths=True)
    classifier = random_forest.TensorForestEstimator(
        hparams.fill(),
        # Set a crazy threshold - 30% loss change.
        early_stopping_loss_threshold=0.3,
        early_stopping_rounds=2)

    input_fn, _ = _get_classification_input_fns()
    classifier.fit(input_fn=input_fn, steps=100)

    # We stopped early.
    self._assert_checkpoint(classifier.model_dir, global_step=5)
def testClassification(self):
    """Tests multi-class classification using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=3,
        max_nodes=1000,
        num_classes=3,
        num_features=4,
        split_after_samples=20,
        inference_tree_paths=True)
    classifier = random_forest.TensorForestEstimator(hparams.fill())

    input_fn, predict_input_fn = _get_classification_input_fns()
    classifier.fit(input_fn=input_fn, steps=100)
    res = classifier.evaluate(input_fn=input_fn, steps=10)

    self.assertEqual(1.0, res['accuracy'])
    self.assertAllClose(0.55144483, res['loss'])

    predictions = list(classifier.predict(input_fn=predict_input_fn))
    self.assertAllClose([[0.576117, 0.211942, 0.211942]],
                        [pred['probabilities'] for pred in predictions])
def testRegression(self):
    """Tests regression using matrix data as input."""
    hparams = tensor_forest.ForestHParams(
        num_trees=5,
        max_nodes=1000,
        num_classes=1,
        num_features=13,
        regression=True,
        split_after_samples=20)
    regressor = random_forest.TensorForestEstimator(hparams.fill())

    input_fn, predict_input_fn = _get_regression_input_fns()

    regressor.fit(input_fn=input_fn, steps=100)
    res = regressor.evaluate(input_fn=input_fn, steps=10)
    self.assertGreaterEqual(0.1, res['loss'])

    predictions = list(regressor.predict(input_fn=predict_input_fn))
    self.assertAllClose([24.], [pred['scores'] for pred in predictions], atol=1)
def train(self, data: np.ndarray, labels: np.ndarray):
    """Trains the decision forest classifier.

    Args:
        data (np.ndarray): The training data.
        labels (np.ndarray): The labels of the training data.
    """
    # build the estimator
    if self.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    else:
        graph_builder_class = tensor_forest.RandomForestGraphs
    self.estimator = estimator.SKCompat(
        random_forest.TensorForestEstimator(
            self.parameters,
            graph_builder_class=graph_builder_class,
            model_dir=self.model_dir,
            report_feature_importances=self.report_feature_importances))
    self.estimator.fit(x=data, y=labels, batch_size=self.batch_size)
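# A hedged companion sketch to the train() method above: a predict() helper in
# the same style. It assumes the SKCompat-wrapped estimator created by train()
# and that its prediction dict exposes a 'classes' entry; both are assumptions
# for illustration, not taken from the original code.
def predict(self, data: np.ndarray) -> np.ndarray:
    """Predicts class labels for the given data (illustrative sketch)."""
    predictions = self.estimator.predict(x=data)
    return predictions['classes']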
def get_experiment(output_dir):
    """Function that creates an experiment http://goo.gl/HcKHlT.

    Args:
        output_dir: The directory where the training output should be written.

    Returns:
        A `tf.contrib.learn.Experiment`.
    """
    config = PIPELINE_CONFIG.get(args.dataset)
    columns = feature_columns(config, args.model_type, vocab_sizes,
                              not args.ignore_crosses)

    runconfig = tf.contrib.learn.RunConfig()
    cluster = runconfig.cluster_spec
    num_table_shards = max(1, runconfig.num_ps_replicas * 3)
    num_partitions = max(
        1, 1 + cluster.num_tasks('worker')
        if cluster and 'worker' in cluster.jobs else 0)

    monitors = []
    if args.model_type == LINEAR:
        l2_regularization = args.l2_regularization or config[L2_REGULARIZATION]
        estimator = tf.contrib.learn.LinearClassifier(
            model_dir=output_dir,
            feature_columns=columns,
            optimizer=tf.contrib.linear_optimizer.SDCAOptimizer(
                example_id_column=KEY_FEATURE_COLUMN,
                symmetric_l2_regularization=l2_regularization,
                num_loss_partitions=num_partitions,   # workers
                num_table_shards=num_table_shards))   # ps
    elif args.model_type == DEEP:
        estimator = tf.contrib.learn.DNNClassifier(
            hidden_units=args.hidden_units,
            feature_columns=columns,
            model_dir=output_dir)
    elif args.model_type == RANDOM_FOREST:
        params = tensor_forest.ForestHParams(
            num_classes=2,
            num_features=CRITEO_NUM_FEATURES,
            num_trees=args.num_trees,
            max_nodes=args.max_nodes)
        # TODO(pew): Returns TensorForestEstimator object directly after a new
        # release of TensorFlow becomes available.
        estimator = random_forest.TensorForestEstimator(
            params.fill(),
            feature_engineering_fn=_int_to_float_map,
            early_stopping_rounds=2000,
            model_dir=output_dir)._estimator
        monitors.append(random_forest.TensorForestLossHook(100))

    l2_regularization = args.l2_regularization or config[L2_REGULARIZATION]

    input_placeholder_for_prediction = get_placeholder_input_fn(
        config, args.model_type, vocab_sizes, not args.ignore_crosses)

    # Export the last model to a predetermined location on GCS.
    export_monitor = util.ExportLastModelMonitor(
        output_dir=output_dir,
        final_model_location='model',  # Relative to the output_dir.
        additional_assets=[args.metadata_path],
        input_fn=input_placeholder_for_prediction,
        input_feature_key=EXAMPLES_PLACEHOLDER_KEY,
        signature_fn=get_export_signature)
    monitors.append(export_monitor)

    train_input_fn = get_reader_input_fn(args.train_data_paths, config,
                                         args.model_type, vocab_sizes,
                                         args.batch_size,
                                         not args.ignore_crosses,
                                         tf.contrib.learn.ModeKeys.TRAIN)
    eval_input_fn = get_reader_input_fn(args.eval_data_paths, config,
                                        args.model_type, vocab_sizes,
                                        args.eval_batch_size,
                                        not args.ignore_crosses,
                                        tf.contrib.learn.ModeKeys.EVAL)
    train_set_size = args.train_set_size or config[NUM_EXAMPLES]

    def _get_eval_metrics(model_type):
        """Returns a dict of 'string' to 'MetricSpec' objects."""
        classes_prediction_key = "classes"
        if model_type == RANDOM_FOREST:
            classes_prediction_key = "predictions"
        eval_metrics = {}
        eval_metrics["accuracy"] = metric_spec.MetricSpec(
            prediction_key=classes_prediction_key,
            metric_fn=tf.contrib.metrics.streaming_accuracy)
        eval_metrics["precision"] = metric_spec.MetricSpec(
            prediction_key=classes_prediction_key,
            metric_fn=tf.contrib.metrics.streaming_precision)
        eval_metrics["recall"] = metric_spec.MetricSpec(
            prediction_key=classes_prediction_key,
            metric_fn=tf.contrib.metrics.streaming_recall)
        return eval_metrics

    # TODO(zoy): Switch to using ExportStrategy when available.
    experiment = tf.contrib.learn.Experiment(
        estimator=estimator,
        train_steps=(args.train_steps or
                     args.num_epochs * train_set_size // args.batch_size),
        eval_steps=args.eval_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        eval_metrics=_get_eval_metrics(args.model_type),
        train_monitors=monitors,
        min_eval_frequency=100)
    return experiment
    num_epochs=None,
    shuffle=True)

# Parameters
num_classes = 4
num_features = 59
num_trees = 4
max_nodes = 1000

# Random forest parameters
hparams = tensor_forest.ForestHParams(
    num_classes=num_classes,
    num_features=num_features,
    num_trees=num_trees,
    max_nodes=max_nodes).fill()

classifier = random_forest.TensorForestEstimator(hparams)
classifier.fit(input_fn=train_input_fn, steps=None)

# Verify results
metric_name = 'accuracy'
metric = {
    metric_name:
        metric_spec.MetricSpec(
            eval_metrics.get_metric(metric_name),
            prediction_key=eval_metrics.get_prediction_key(metric_name))
}

test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': features[0:10]},
    y=labels[0:10],
    num_epochs=None,
"accuracy": tf.contrib.learn.MetricSpec( metric_fn=tf.contrib.metrics.streaming_accuracy, prediction_key='probabilities' ), "precision": tf.contrib.learn.MetricSpec( metric_fn=tf.contrib.metrics.streaming_precision, prediction_key='probabilities' ), "recall": tf.contrib.learn.MetricSpec( metric_fn=tf.contrib.metrics.streaming_recall, prediction_key='probabilities' ) } hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( num_trees=10, max_nodes=1000, num_classes=2, num_features=len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS)) classifier = random_forest.TensorForestEstimator(hparams, model_dir=model_dir, config=tf.contrib.learn.RunConfig(save_checkpoints_secs=60)) classifier.fit(input_fn=train_input_fn, steps=200) results = classifier.evaluate( input_fn=eval_input_fn, steps=1, metrics=validation_metrics) print results for key in sorted(results): print("%s: %s" % (key, results[key]))
    num_outputs=np.shape(Y_train)[1],
    max_fertile_nodes=0,  ## 100?
    #prune_every_samples = 300,
    #split_finish_name='basic',
    # pruning_name='half'
    #model_name = 'all_sparse'  ## default is all_dense
    #feature_bagging_fraction = 0.7
    # use_running_stats_method=True,
    # checkpoint_stats= True,  ## bagging_fraction=1
    # feature_bagg
)

#random_forest.estimator.list_variables('/tmp/tmpzvdna1ol')
estimator = random_forest.TensorForestEstimator(
    params, report_feature_importances=True)
estimator.config.save_checkpoints_steps
estimator.config.save_checkpoints_secs

## Fit
X_train = np.float32(X_train)
Y_train = np.float32(Y_train)
#fit = estimator.fit(input_fn = input_fn)
rootLogger.info(estimator.fit(X_train, Y_train))

dict_feature_importances = md.get_feature_importances(feature_names, log_path)
if compare_results_with_previous_run:
    previous_log_path = log_path[:-4] + '_previous.log'
    dict_feature_importances_previous = md.get_feature_importances(
        feature_names, previous_log_path)
    dict_compare_feature_importances = {}
    }
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols.items() + categorical_cols.items())
    # Add example id list

    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label


def train_input_fn():
    return input_fn(df_train)


def eval_input_fn():
    return input_fn(df_test)


model_dir = '../rf_model_dir'

hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    num_trees=10,
    max_nodes=1000,
    num_classes=2,
    num_features=len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS))

classifier = random_forest.TensorForestEstimator(hparams, model_dir=model_dir)
classifier.fit(input_fn=train_input_fn, steps=100)
results = classifier.evaluate(input_fn=eval_input_fn, steps=1)

print(results)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))