Пример #1
0
    def predict(self, x, batch_per_thread=None, distributed=False):
        """
        Run prediction with the wrapped model.

        :param x: Either a TFDataset, or raw input data. Raw data is handed to
            ``_create_rdd_x`` in distributed mode and to ``self.model.predict``
            in local mode.
        :param batch_per_thread: Forwarded as ``batch_size`` to
            ``self.model.predict`` in local mode. In distributed mode it is
            forwarded to ``TFDataset.from_rdd``, with ``None`` replaced by -1.
            Not used when ``x`` is already a TFDataset.
        :param distributed: Only consulted when ``x`` is not a TFDataset;
            selects the distributed path when true.
        :return: For a TFDataset, whatever ``self._predict_distributed``
            returns. For distributed raw data, the collected results stacked
            with ``np.stack``: a single array when the model has one output,
            otherwise a list with one stacked array per output. In local mode,
            the return value of ``self.model.predict``.
        """
        # A TFDataset always goes through the distributed path.
        if isinstance(x, TFDataset):
            # todo check arguments
            return self._predict_distributed(
                _standarize_feature_dataset(x, self.model))

        if not distributed:
            return self.model.predict(x=x, batch_size=batch_per_thread)

        spark_context = getOrCreateSparkContext()
        input_names = self.model._feed_input_names
        rdd, types, shapes = _create_rdd_x(x, input_names, spark_context)

        if batch_per_thread is None:
            per_thread = -1
        else:
            per_thread = batch_per_thread
        dataset = TFDataset.from_rdd(rdd,
                                     names=input_names,
                                     types=types,
                                     shapes=shapes,
                                     batch_per_thread=per_thread)

        collected = self._predict_distributed(dataset).collect()
        n_outputs = len(self.model.outputs)
        if n_outputs == 1:
            return np.stack(collected)
        # Multi-output model: regroup the per-record results by output index.
        return [np.stack([record[idx] for record in collected])
                for idx in range(n_outputs)]
Пример #2
0
 def predict(self, x, batch_per_thread=1, distributed=True):
     """
     Use a model to do prediction.

     :param x: An ImageSet, or -- in distributed mode -- a numpy ndarray or an
         RDD, or -- in local mode -- a numpy ndarray or a list.
     :param batch_per_thread: Passed through unchanged to the "zooPredict" call.
     :param distributed: Selects the distributed or the local code path for
         inputs that are not an ImageSet.
     :return: An ImageSet wrapping the prediction result for ImageSet input;
         the mapped result of "zooPredict" in distributed mode; a list of
         converted outputs in local mode.
     :raises TypeError: if ``x`` has a type the selected mode does not accept.
     """
     if isinstance(x, ImageSet):
         image_results = callBigDlFunc(self.bigdl_type, "zooPredict",
                                       self.value, x, batch_per_thread)
         return ImageSet(image_results)

     if not distributed:
         # Local mode accepts only in-memory arrays or lists of them.
         if not isinstance(x, (np.ndarray, list)):
             raise TypeError("Unsupported prediction data type: %s" % type(x))
         local_outputs = callBigDlFunc(self.bigdl_type, "zooPredict",
                                       self.value,
                                       self._to_jtensors(x),
                                       batch_per_thread)
         return [Layer.convert_output(output) for output in local_outputs]

     if isinstance(x, np.ndarray):
         # to_sample_rdd takes a second array; zeros of matching length are
         # supplied (presumably as placeholder labels -- confirm).
         data_rdd = to_sample_rdd(x, np.zeros([x.shape[0]]), getOrCreateSparkContext())
     elif isinstance(x, RDD):
         data_rdd = x
     else:
         raise TypeError("Unsupported prediction data type: %s" % type(x))

     predictions = callBigDlFunc(self.bigdl_type, "zooPredict",
                                 self.value, data_rdd, batch_per_thread)
     return predictions.map(lambda result: Layer.convert_output(result))
Пример #3
0
    def from_ndarrays(tensors,
                      batch_size=-1,
                      batch_per_thread=-1,
                      hard_code_batch_size=False,
                      val_tensors=None,
                      sequential_order=False,
                      shuffle=True):
        """
        Build a TFNdarrayDataset from in-memory tensors.

        The tensors are converted to an RDD by ``_tensors_to_rdd``, using the
        product of the node count and per-node core count reported by
        ``get_node_and_core_number`` as its third argument.

        :param tensors: training data handed to ``_tensors_to_rdd``.
        :param batch_size: forwarded unchanged to TFNdarrayDataset.
        :param batch_per_thread: forwarded unchanged to TFNdarrayDataset.
        :param hard_code_batch_size: forwarded unchanged to TFNdarrayDataset.
        :param val_tensors: optional validation data; converted the same way as
            ``tensors`` when given.
        :param sequential_order: forwarded unchanged to TFNdarrayDataset.
        :param shuffle: forwarded unchanged to TFNdarrayDataset.
        :return: the constructed TFNdarrayDataset.
        """
        spark_context = getOrCreateSparkContext()
        nodes, cores_per_node = get_node_and_core_number()
        parallelism = nodes * cores_per_node

        train_rdd, structure = _tensors_to_rdd(tensors, spark_context, parallelism)

        if val_tensors is None:
            validation_rdd = None
        else:
            # Only the RDD is needed; the structure comes from the training data.
            validation_rdd, _ = _tensors_to_rdd(val_tensors, spark_context, parallelism)

        return TFNdarrayDataset(train_rdd,
                                structure,
                                batch_size,
                                batch_per_thread,
                                hard_code_batch_size,
                                validation_rdd,
                                sequential_order=sequential_order,
                                shuffle=shuffle)
Пример #4
0
 def evaluate(self,
              x=None,
              y=None,
              batch_per_thread=None,
              distributed=False):
     """
     Evaluate the wrapped model.

     :param x: Either a TFDataset, or raw input data.
     :param y: Raw target data; not used when ``x`` is a TFDataset.
     :param batch_per_thread: Forwarded as ``batch_size`` to
         ``self.model.evaluate`` in local mode. In distributed mode it is
         forwarded to ``TFDataset.from_rdd``, with ``None`` replaced by -1.
     :param distributed: Only consulted when ``x`` is not a TFDataset;
         selects the distributed path when true.
     :return: The result of ``self._evaluate_distributed`` on the distributed
         paths, or of ``self.model.evaluate`` in local mode.
     """
     if isinstance(x, TFDataset):
         # todo check arguments
         return self._evaluate_distributed(
             _standarize_feature_label_dataset(x, self.model))

     if not distributed:
         return self.model.evaluate(x=x, y=y, batch_size=batch_per_thread)

     spark_context = getOrCreateSparkContext()
     input_names = self.model._feed_input_names
     output_names = self.model._feed_output_names
     rdd, types, shapes = _create_rdd_x_y(x, y, input_names, output_names,
                                          spark_context)

     if batch_per_thread is None:
         per_thread = -1
     else:
         per_thread = batch_per_thread
     dataset = TFDataset.from_rdd(rdd,
                                  names=input_names + output_names,
                                  types=types,
                                  shapes=shapes,
                                  batch_per_thread=per_thread)
     return self._evaluate_distributed(dataset)
Пример #5
0
    def __init__(self, file_path, parse_fn, batch_size,
                 batch_per_thread, hard_code_batch_size=False, validation_file_path=None):
        """
        Create a dataset backed by TFRecord files.

        ``parse_fn`` is traced once inside a fresh TensorFlow graph on a scalar
        string placeholder. The serialized graph, the placeholder name and the
        names of the float32-cast outputs are then passed to the
        "createRDDFromTFRecords" backend call.

        :param file_path: location of the training records.
        :param parse_fn: callable applied to the serialized-example placeholder;
            its (possibly nested) result defines the tensor structure.
        :param batch_size: forwarded to the parent constructor.
        :param batch_per_thread: forwarded to the parent constructor.
        :param hard_code_batch_size: forwarded to the parent constructor.
        :param validation_file_path: optional location of validation records;
            when given, a second RDD is created the same way.
        """
        import tensorflow as tf

        graph = tf.Graph()
        with graph.as_default():
            serialized_example = tf.placeholder(dtype=tf.string, shape=[])
            results = parse_fn(serialized_example)

            flattened = nest.flatten(results)
            # The cast ops must be created inside this graph so that their
            # names refer to nodes of the serialized graph.
            output_names = []
            for tensor in flattened:
                output_names.append(tf.cast(tensor, dtype=tf.float32).name)

        serialized_graph = bytearray(graph.as_graph_def().SerializeToString())

        sc = getOrCreateSparkContext()

        def _records_to_rdd(path):
            # Same backend call for training and validation; only the path differs.
            return callBigDlFunc("float", "createRDDFromTFRecords",
                                 path, sc, serialized_graph,
                                 serialized_example.name, output_names)

        train_rdd = _records_to_rdd(file_path)
        if validation_file_path is not None:
            validation_rdd = _records_to_rdd(validation_file_path)
        else:
            validation_rdd = None

        # One TensorMeta per flattened output, re-nested like the parse_fn result.
        metas = [TensorMeta(tf.as_dtype(tensor.dtype),
                            shape=tensor.shape,
                            name="data_%s" % index)
                 for index, tensor in enumerate(flattened)]
        tensor_structure = nest.pack_sequence_as(results, metas)

        super(TFRecordDataset, self).__init__(tensor_structure, batch_size,
                                              batch_per_thread, hard_code_batch_size)

        self.train_rdd = train_rdd
        self.validation_rdd = validation_rdd
Пример #6
0
 def evaluate(self,
              x=None,
              y=None,
              batch_per_thread=None,
              distributed=False):
     """
     Evaluate the wrapped model.

     :param x: Either a TFDataset (which must have ``has_batch`` set), or raw
         input data.
     :param y: Raw target data; not used when ``x`` is a TFDataset.
     :param batch_per_thread: Forwarded as ``batch_size`` to
         ``self.model.evaluate`` in local mode. In distributed mode it is
         forwarded to ``TFDataset.from_rdd``, with ``None`` replaced by -1.
     :param distributed: Only consulted when ``x`` is not a TFDataset;
         selects the distributed path when true.
     :return: The result of ``self._evaluate_distributed`` on the distributed
         paths, or of ``self.model.evaluate`` in local mode.
     :raises ValueError: if ``x`` is a TFDataset whose ``has_batch`` is false.
     """
     if isinstance(x, TFDataset):
         if not x.has_batch:
             raise ValueError("The batch_per_thread of TFDataset must be "
                              "specified when used in KerasModel evaluate.")
         # todo check arguments
         return self._evaluate_distributed(
             _standarize_feature_label_dataset(x, self.model))

     if not distributed:
         return self.model.evaluate(x=x, y=y, batch_size=batch_per_thread)

     spark_context = getOrCreateSparkContext()
     input_names = self.model._feed_input_names
     output_names = self.model._feed_output_names
     rdd, types, shapes = _create_rdd_x_y(x, y, input_names, output_names,
                                          spark_context)

     if batch_per_thread is None:
         per_thread = -1
     else:
         per_thread = batch_per_thread
     dataset = TFDataset.from_rdd(rdd,
                                  names=input_names + output_names,
                                  types=types,
                                  shapes=shapes,
                                  batch_per_thread=per_thread)
     return self._evaluate_distributed(dataset)
Пример #7
0
    def predict(self, x, batch_per_thread=1, distributed=True):
        """
        Use a model to do prediction.

        :param x: An ImageSet; or, in distributed mode, a numpy ndarray or an
            RDD; or, in local mode, a sliceable sequence with a length whose
            slices are accepted by ``self.forward``.
        :param batch_per_thread: Passed through to the "zooPredict" call for
            ImageSet and distributed input. In local mode it is the number of
            leading-axis elements given to each ``self.forward`` call and must
            be at least 1.
        :param distributed: Selects the distributed or the local code path for
            inputs that are not an ImageSet.
        :return: An ImageSet wrapping the prediction result for ImageSet input;
            the mapped result of "zooPredict" in distributed mode; in local
            mode, the per-batch forward results concatenated along axis 0.
        :raises TypeError: in distributed mode, if ``x`` is neither an ndarray
            nor an RDD.
        :raises ValueError: in local mode, if ``batch_per_thread`` is smaller
            than 1, or if ``x`` is empty (no batches are produced and
            ``np.concatenate`` rejects an empty sequence).
        """
        if isinstance(x, ImageSet):
            results = callZooFunc(self.bigdl_type, "zooPredict",
                                  self.value,
                                  x,
                                  batch_per_thread)
            return ImageSet(results)

        if distributed:
            if isinstance(x, np.ndarray):
                data_rdd = to_sample_rdd(x, np.zeros([x.shape[0]]), getOrCreateSparkContext())
            elif isinstance(x, RDD):
                data_rdd = x
            else:
                raise TypeError("Unsupported prediction data type: %s" % type(x))
            results = callZooFunc(self.bigdl_type, "zooPredict",
                                  self.value,
                                  data_rdd,
                                  batch_per_thread)
            return results.map(lambda result: Layer.convert_output(result))

        # Local mode. A non-positive step would never advance past len(x);
        # the previous while-loop spun forever in that case, so fail fast.
        if batch_per_thread < 1:
            raise ValueError("batch_per_thread must be a positive integer in local mode, "
                             "but got %s" % batch_per_thread)

        total = len(x)
        results = [self.forward(x[start:min(start + batch_per_thread, total)])
                   for start in range(0, total, batch_per_thread)]
        return np.concatenate(results, axis=0)
Пример #8
0
    def predict(self,
                x,
                batch_per_thread=None,
                distributed=False):

        """
        Use a model to do prediction.

        :param x: Input data. It could be:
            - a TFDataset object
            - A Numpy array (or array-like), or a list of arrays
               (in case the model has multiple inputs).
            - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
        :param batch_per_thread:
          The default value is None.
          When distributed is True it is forwarded to TFDataset.from_rdd
          (None is replaced by -1).
          When distributed is False it is forwarded as batch_size to the
          underlying model's predict.
          It is not used when x is already a TFDataset.
        :param distributed: Boolean. Whether to do prediction in distributed mode or local mode.
                     Default is False. A TFDataset input is always predicted in
                     distributed mode.
        :return: the result of the distributed prediction for a TFDataset; for
                 other distributed input, the collected results stacked with
                 np.stack (a single array for a single-output model, otherwise
                 a list with one array per output); in local mode, the result
                 of the underlying model's predict.
        :raises ValueError: if x is a TFDataset whose has_batch is false.
        """

        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_per_thread of TFDataset"
                                 " must be specified when used in KerasModel predict.")
            # Only ndarray-backed datasets are standardized against the model.
            if isinstance(x, TFNdarrayDataset):
                dataset = _standarize_feature_dataset(x, self.model)
            else:
                dataset = x
            return self._predict_distributed(dataset)

        if not distributed:
            return self.model.predict(x=x,
                                      batch_size=batch_per_thread)

        spark_context = getOrCreateSparkContext()
        input_names = self.model._feed_input_names
        rdd, types, shapes = _create_rdd_x(x, input_names, spark_context)

        if batch_per_thread is None:
            per_thread = -1
        else:
            per_thread = batch_per_thread
        dataset = TFDataset.from_rdd(rdd,
                                     names=input_names,
                                     types=types,
                                     shapes=shapes,
                                     batch_per_thread=per_thread)

        collected = self._predict_distributed(dataset).collect()
        n_outputs = len(self.model.outputs)
        if n_outputs == 1:
            return np.stack(collected)
        # Multi-output model: regroup the per-record results by output index.
        return [np.stack([record[idx] for record in collected])
                for idx in range(n_outputs)]
Пример #9
0
    def fit(self,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            validation_split=0.,
            validation_data=None,
            distributed=False,
            **kwargs):
        """
        Train the wrapped model.

        :param x: Either a TFDataset (which must have ``has_batch`` set), or
            raw input data.
        :param y: Raw target data; not used when ``x`` is a TFDataset.
        :param batch_size: Forwarded to ``self.model.fit`` in local mode. In
            distributed mode it is forwarded to ``TFDataset.from_rdd``, with
            ``None`` replaced by 32. Not used when ``x`` is a TFDataset.
        :param epochs: Number of epochs, forwarded to the selected fit routine.
        :param validation_split: Forwarded to the selected fit routine.
        :param validation_data: Optional; in distributed mode its elements 0
            and 1 are converted to a validation RDD, in local mode it is
            forwarded to ``self.model.fit``. Not used when ``x`` is a TFDataset.
        :param distributed: Only consulted when ``x`` is not a TFDataset;
            selects the distributed path when true.
        :param kwargs: Extra keyword arguments forwarded to the selected fit
            routine.
        :return: None on every path; the result of the underlying fit call is
            not returned.
        :raises ValueError: if ``x`` is a TFDataset whose ``has_batch`` is false.
        """
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_size of TFDataset must be "
                                 "specified when used in KerasModel fit.")
            standardized = _standarize_feature_label_dataset(x, self.model)
            self._fit_distributed(standardized, validation_split, epochs, **kwargs)
            return

        if not distributed:
            self.model.fit(x=x,
                           y=y,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=validation_split,
                           validation_data=validation_data,
                           **kwargs)
            return

        spark_context = getOrCreateSparkContext()
        input_names = self.model._feed_input_names
        output_names = self.model._feed_output_names
        train_rdd, types, shapes = _create_rdd_x_y(x, y, input_names, output_names,
                                                   spark_context)

        if validation_data is None:
            val_rdd = None
        else:
            # Only the RDD is kept; types and shapes come from the training data.
            val_rdd, _, _ = _create_rdd_x_y(validation_data[0],
                                            validation_data[1],
                                            input_names,
                                            output_names,
                                            spark_context)

        dataset = TFDataset.from_rdd(train_rdd,
                                     names=input_names + output_names,
                                     shapes=shapes,
                                     types=types,
                                     batch_size=32 if batch_size is None else batch_size,
                                     val_rdd=val_rdd)
        self._fit_distributed(dataset, validation_split, epochs, **kwargs)
Пример #10
0
    def from_ndarrays(tensors, batch_size=-1, batch_per_thread=-1,
                      hard_code_batch_size=False, val_tensors=None):
        """
        Create a TFDataset from a nested structure of numpy ndarrays. Each element
        in the resulting TFDataset has the same structure as the argument tensors and
        is created by indexing on the first dimension of each ndarray in the tensors
        argument.

        This method is equivalent to calling sc.parallelize on the tensors and then
        calling TFDataset.from_rdd.

        :param tensors: training data handed to ``_tensors_to_rdd``.
        :param batch_size: forwarded unchanged to TFNdarrayDataset.
        :param batch_per_thread: forwarded unchanged to TFNdarrayDataset.
        :param hard_code_batch_size: forwarded unchanged to TFNdarrayDataset.
        :param val_tensors: optional validation data; converted the same way as
            ``tensors`` when given.
        :return: the constructed TFNdarrayDataset.
        """
        spark_context = getOrCreateSparkContext()
        nodes, cores_per_node = get_node_and_core_number()
        parallelism = nodes * cores_per_node

        train_rdd, structure = _tensors_to_rdd(tensors, spark_context, parallelism)

        if val_tensors is None:
            validation_rdd = None
        else:
            # Only the RDD is needed; the structure comes from the training data.
            validation_rdd, _ = _tensors_to_rdd(val_tensors, spark_context, parallelism)

        return TFNdarrayDataset(train_rdd, structure, batch_size,
                                batch_per_thread, hard_code_batch_size, validation_rdd)