Example #1
  def inference_graph(self, input_data, **inference_args):
    """Constructs a TF graph for evaluating a random forest.

    Args:
      input_data: A tensor or dict of string->Tensor for input data.
      **inference_args: Keyword arguments to pass through to each tree.

    Returns:
      The last op in the random forest inference graph.

    Raises:
      NotImplementedError: If trying to use feature bagging with sparse
        features.
    """
    processed_dense_features, processed_sparse_features, data_spec = (
        data_ops.ParseDataTensorOrDict(input_data))

    probabilities = []
    for i in range(self.params.num_trees):
      with ops.device(self.variables.device_dummies[i].device):
        tree_data = processed_dense_features
        if self.params.bagged_features:
          if processed_sparse_features is not None:
            raise NotImplementedError(
                'Feature bagging not supported with sparse features.')
          tree_data = self._bag_features(i, input_data)
        probabilities.append(self.trees[i].inference_graph(
            tree_data,
            data_spec,
            sparse_features=processed_sparse_features,
            **inference_args))
    with ops.device(self.variables.device_dummies[0].device):
      all_predict = array_ops.stack(probabilities)
      return math_ops.div(
          math_ops.reduce_sum(all_predict, 0), self.params.num_trees,
          name='probabilities')
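
A minimal usage sketch for the method above, assuming the TF 1.x tf.contrib.tensor_forest API this snippet comes from and a version whose inference_graph returns the averaged probabilities tensor as shown; hyperparameter values are illustrative only.

# Sketch only (assumed TF 1.x contrib API; hyperparameters are illustrative).
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest

params = tensor_forest.ForestHParams(
    num_classes=2, num_features=10, num_trees=10, max_nodes=1000).fill()
forest = tensor_forest.RandomForestGraphs(params)

x = tf.placeholder(tf.float32, shape=[None, 10])
# Builds one inference sub-graph per tree and averages the per-tree probabilities.
probabilities = forest.inference_graph(x)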
Example #2
    def training_graph(self,
                       input_data,
                       input_labels,
                       num_trainers=1,
                       trainer_id=0,
                       **tree_kwargs):
        """Constructs a TF graph for training a random forest.

    Args:
      input_data: A tensor or dict of string->Tensor for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      num_trainers: Number of parallel trainers to split trees among.
      trainer_id: Which trainer this instance is.
      **tree_kwargs: Keyword arguments passed to each tree's training_graph.

    Returns:
      The last op in the random forest training graph.

    Raises:
      NotImplementedError: If trying to use bagging with sparse features.
    """
        processed_dense_features, processed_sparse_features, data_spec = (
            data_ops.ParseDataTensorOrDict(input_data))

        if input_labels is not None:
            labels = data_ops.ParseLabelTensorOrDict(input_labels)

        data_spec = data_spec or self.get_default_data_spec(input_data)

        tree_graphs = []
        trees_per_trainer = self.params.num_trees / num_trainers
        tree_start = int(trainer_id * trees_per_trainer)
        tree_end = int((trainer_id + 1) * trees_per_trainer)
        for i in range(tree_start, tree_end):
            with ops.device(self.variables.device_dummies[i].device):
                seed = self.params.base_random_seed
                if seed != 0:
                    seed += i
                # If using bagging, randomly select some of the input.
                tree_data = processed_dense_features
                tree_labels = labels
                if self.params.bagging_fraction < 1.0:
                    # TODO(gilberth): Support bagging for sparse features.
                    if processed_sparse_features is not None:
                        raise NotImplementedError(
                            'Bagging not supported with sparse features.')
                    # TODO(thomaswc): This does sampling without replacement.  Consider
                    # also allowing sampling with replacement as an option.
                    batch_size = array_ops.strided_slice(
                        array_ops.shape(processed_dense_features), [0], [1])
                    r = random_ops.random_uniform(batch_size, seed=seed)
                    mask = math_ops.less(
                        r,
                        array_ops.ones_like(r) * self.params.bagging_fraction)
                    gather_indices = array_ops.squeeze(array_ops.where(mask),
                                                       squeeze_dims=[1])
                    # TODO(thomaswc): Calculate out-of-bag data and labels, and store
                    # them for use in calculating statistics later.
                    tree_data = array_ops.gather(processed_dense_features,
                                                 gather_indices)
                    tree_labels = array_ops.gather(labels, gather_indices)
                if self.params.bagged_features:
                    if processed_sparse_features is not None:
                        raise NotImplementedError(
                            'Feature bagging not supported with sparse features.'
                        )
                    tree_data = self._bag_features(i, tree_data)

                tree_graphs.append(self.trees[i].training_graph(
                    tree_data,
                    tree_labels,
                    seed,
                    data_spec=data_spec,
                    sparse_features=processed_sparse_features,
                    **tree_kwargs))

        return control_flow_ops.group(*tree_graphs, name='train')
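
Continuing the same assumed setup as the previous sketch, this shows how training_graph could be used to split trees across trainers; the labels placeholder and trainer counts are illustrative.

# Sketch only: `forest` and `x` as in the previous sketch; labels are class ids.
y = tf.placeholder(tf.int64, shape=[None])

# This instance builds update ops only for its share of the trees
# (trees [tree_start, tree_end) as computed in training_graph above).
train_op = forest.training_graph(x, y, num_trainers=2, trainer_id=0)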
Example #3
    def evaluate(self, X, Y, metric, batch_size=None):
        """ evaluate.

        Evaluate model performance given data and metric.

        Arguments:
            X: `Tensor` or `Tensor list`. The input data. It must be a list of
                `Tensor` in case of multiple inputs.
            Y: `Tensor`. The labels/targets tensor.
            metric: `func` returning a `Tensor`. The metric function.
            batch_size: `int`. The batch size.

        Returns:
            The metric value.

        """

        with self.graph.as_default():
            # Verify data dimension
            validate.validate_dim(X, max_dim=2, min_dim=2, var_name='X')
            if not self.regression:
                validate.validate_dim(Y, max_dim=1, min_dim=1, var_name='Y')
            else:
                validate.validate_dim(Y, min_dim=1, var_name='Y')

            # Get data size
            num_samples = data_util.get_num_sample(X)
            capacity = None
            if batch_size is None:
                batch_size = num_samples
                capacity = 1

            # Build Tree Graph
            self._build_estimator(X, Y)

            # Generate Data Tensors. Be aware that every eval with different
            # data will re-create a data tensor.
            if self._eval.get_params('X') != hex(id(X)) or \
                self._eval.get_params('Y') != hex(id(Y)) or \
                self._eval.get_params('batch_size') != batch_size or \
                self._eval.get_params('metric') != metric or \
                not self._eval.is_ready:

                X, Y, cr = io.generate_data_tensor(X,
                                                   Y,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_threads=8,
                                                   capacity=capacity)
                X, _, spec = data_ops.ParseDataTensorOrDict(X)
                Y = data_ops.ParseLabelTensorOrDict(Y)

                if not self.params.regression:
                    Y = math_ops.to_float(
                        array_ops.one_hot(
                            math_ops.to_int64(array_ops.squeeze(Y)),
                            self.params.num_classes, 1, 0))
                    Y = tf.reshape(Y, [-1, self.num_classes])

                pred = self.forest_graph.inference_graph(X)
                self._eval_op = metric(pred, Y)
                self._build_eval(X, Y, metric, batch_size)

                # Start QueueRunners
                tf.train.start_queue_runners(sess=self.session)
                if cr: cr.launch_threads(self.session)

            n_batches = int(math.ceil(float(num_samples) / batch_size))

            m = 0.
            for i in range(n_batches):
                m += self.session.run(self._eval_op) / n_batches
            return m
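
A hypothetical usage sketch for evaluate; the enclosing estimator class is not shown in the snippet, so the RandomForestClassifier constructor used here is an assumption, and the metric is a simple accuracy op matching the metric(pred, Y) contract above (both arguments arrive as [batch, num_classes] tensors for classification).

# Hypothetical sketch: the estimator class name and its constructor arguments
# are assumptions; only the evaluate() call mirrors the snippet above.
import numpy as np
import tensorflow as tf

def accuracy_op(pred, y):
    # pred: per-class probabilities, y: one-hot labels (see the one-hot
    # conversion inside evaluate() above).
    correct = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    return tf.reduce_mean(tf.cast(correct, tf.float32))

X = np.random.rand(100, 10).astype(np.float32)
Y = np.random.randint(0, 2, size=100)

model = RandomForestClassifier(num_classes=2, num_features=10)  # assumed ctor
acc = model.evaluate(X, Y, metric=accuracy_op, batch_size=32)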
Example #4
    def fit(self,
            X,
            Y,
            batch_size=1024,
            shuffle=True,
            display_step=500,
            n_jobs=1,
            max_steps=None,
            verbose=0):
        """ fit.

        Train model.

        Args:
            Args:
            X: `Tensor` or `Tensor list`. The input data. It must be a list of
                `Tensor` in case of multiple inputs.
            Y: `Tensor`. The labels/targets tensor.
            n_steps: `int`. Total number of steps to run the training.
            batch_size: `int`. The batch size.
            display_step: `int`. The step to display information on screen.
            snapshot_step: `int`. The step to snapshot the model (save and
                evaluate if valX/valY provided).
            n_epoch: Maximum number of epich (Unlimited by default).

        """

        with self.graph.as_default():

            # Verify data dimension
            validate.validate_dim(X, max_dim=2, min_dim=2, var_name='X')
            if not self.regression:
                validate.validate_dim(Y, max_dim=1, min_dim=1, var_name='Y')
            else:
                validate.validate_dim(Y, min_dim=1, var_name='Y')

            # Get data size
            num_samples = data_util.get_num_sample(X)

            # Build Tree Graph
            self._build_estimator(X, Y)

            # Generate Data Tensors. Be aware that every fit with different
            # data will re-create a data tensor.
            if self._train.get_params('X') != hex(id(X)) or \
                self._train.get_params('Y') != hex(id(Y)) or \
                self._train.get_params('batch_size') != batch_size or \
                not self._train.is_ready:

                X, Y, cr = io.generate_data_tensor(X,
                                                   Y,
                                                   batch_size=batch_size,
                                                   shuffle=shuffle,
                                                   num_threads=8)
                X, _, spec = data_ops.ParseDataTensorOrDict(X)
                Y = data_ops.ParseLabelTensorOrDict(Y)

                self._train_op = tf.group(
                    self.forest_graph.training_graph(X, Y,
                                                     num_trainers=n_jobs),
                    state_ops.assign_add(self.global_step, 1))
                self._loss_op = self.forest_graph.training_loss(X, Y)
                self._build_fit(X, Y, batch_size)

                # Start QueueRunners
                tf.train.start_queue_runners(sess=self.session)
                if cr: cr.launch_threads(self.session)

            gstep = self.global_step.eval(session=self.session)

            last_loss = []
            loss_val = None
            step = 0

            # Training loop: stops on flat loss or when max_steps is reached.
            while True:
                # Monitor loss
                last_loss.append(loss_val)
                if len(last_loss) > 10: last_loss.pop(0)

                start_time = time.time()
                # TODO: Add acc
                _, loss_val = self.session.run(
                    [self._train_op, self._loss_op])
                duration = time.time() - start_time

                if step % display_step == 0:
                    examples_per_sec = batch_size / duration
                    sec_per_batch = duration
                    # No accuracy value is computed yet (see TODO above), so
                    # only the loss is reported.
                    format_str = '%s: step %d, loss = %.2f, ' \
                                 '(%.1f examples/sec; %.3f sec/batch)'
                    print(format_str %
                          (datetime.now(), step + gstep, loss_val,
                           examples_per_sec, sec_per_batch))

                step += 1

                # Automatic stop after ten flat loss
                if (len(last_loss) == 10 and len(set(last_loss)) <= 1
                        and not max_steps):
                    break

                # Max Steps stop
                if max_steps:
                    if step == max_steps:
                        break

            save_path = os.path.join(self.log_dir, 'randomforest.ckpt')
            self.saver.save(sess=self.session,
                            save_path=save_path,
                            global_step=self.global_step)
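
A matching hypothetical fit call, continuing the assumed model, X, and Y from the previous sketch; n_jobs is forwarded to training_graph as num_trainers.

# Hypothetical sketch continuing the assumed `model`, `X`, `Y` from above.
model.fit(X, Y,
          batch_size=64,
          shuffle=True,
          display_step=100,
          n_jobs=1,        # forwarded as num_trainers to training_graph
          max_steps=500)   # hard stop; otherwise training stops on flat loss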
Example #5
File: kmeans.py  Project: yutoc/tflearn
    def fit(self,
            X,
            shuffle=True,
            display_step=500,
            n_jobs=1,
            max_steps=None,
            verbose=0,
            **kwargs):

        with self.graph.as_default():

            # Verify data dimension
            validate_dim(X, max_dim=2, min_dim=2, var_name='X')

            # Get data size
            num_samples = get_num_sample(X)

            # Set batch size
            if 'batch_size' in kwargs.keys():
                batch_size = kwargs['batch_size']
            else:
                batch_size = num_samples

            # Build the estimator graph
            self._build_estimator(X)

            # Generate Data Tensors. Be aware that every fit with different
            # data will re-create a data tensor.
            if self._train.get_params('X') != hex(id(X)) or \
                self._train.get_params('batch_size') != batch_size or \
                not self._train.is_ready:

                #TODO: raise Exception("Fitting different data not supported")

                X, _, cr = generate_data_tensor(X,
                                                X,
                                                batch_size=batch_size,
                                                shuffle=shuffle,
                                                num_threads=8)
                X, _, spec = data_ops.ParseDataTensorOrDict(X)

                self._train_op = tf.group(
                    self._train_op, state_ops.assign_add(self.global_step, 1))
                self._loss_op = self.avg_distance
                self._build_fit(X, X, batch_size)

                # Start QueueRunners
                tf.train.start_queue_runners(sess=self.session)
                if cr: cr.launch_threads(self.session)

            gstep = self.global_step.eval(session=self.session)

            last_loss = []
            loss_val = None
            step = 0

            # Training loop: stops on flat loss variance or when max_steps is reached.
            while True:
                # Monitor loss
                if loss_val: last_loss.append(loss_val)
                if len(last_loss) > 10: last_loss.pop(0)

                start_time = time.time()
                _, loss_val, idx = self.session.run(
                    [self._train_op, self._loss_op, self._cluster_idx])
                duration = time.time() - start_time

                if step % display_step == 0:
                    examples_per_sec = batch_size / duration
                    sec_per_batch = duration
                    # No accuracy value is computed here, so only the average
                    # distance (loss) is reported.
                    format_str = '%s: step %d, loss = %.2f, ' \
                                 '(%.1f examples/sec; %.3f sec/batch)'
                    print(format_str %
                          (datetime.now(), step + gstep, loss_val,
                           examples_per_sec, sec_per_batch))

                step += 1

                # Automatic stop after ten flat loss
                # TODO(aymeric): better stopping.
                if (len(last_loss) == 10 and np.var(last_loss) <= 0.01
                        and not max_steps):
                    break

                # Max Steps stop
                if max_steps:
                    if step == max_steps:
                        break
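
A hypothetical call for the KMeans-style fit above; the KMeans constructor is an assumption (only fit's signature comes from the snippet), and batch_size is passed through **kwargs, which the code above checks for.

# Hypothetical sketch: constructor name/arguments are assumptions.
import numpy as np

X = np.random.rand(500, 2).astype(np.float32)
km = KMeans(n_clusters=4)               # assumed constructor
km.fit(X, display_step=100, max_steps=300,
       batch_size=128)                  # picked up from **kwargs above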