Example #1
 def get_training_data(self):
     sample_rdd = self.rdd.map(
         lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
     fs = FeatureSet.sample_rdd(sample_rdd,
                                sequential_order=self.sequential_order,
                                shuffle=self.shuffle)
     return fs
Example #2
    def predict(self, input_fn, checkpoint_path=None):
        with tf.Graph().as_default() as g:
            result = self.estimator._call_input_fn(
                input_fn, tf.estimator.ModeKeys.PREDICT)
            if isinstance(result, TFDataset):
                spec = self._call_model_fn(result.feature_tensors, None,
                                           tf.estimator.ModeKeys.PREDICT,
                                           self.config)
                latest_checkpoint = self.estimator.latest_checkpoint()

                if latest_checkpoint:
                    checkpoint_path = latest_checkpoint

                with tf.Session() as sess:
                    saver = tf.train.Saver()
                    if checkpoint_path:
                        saver.restore(sess, checkpoint_path)
                    else:
                        sess.run(tf.global_variables_initializer())
                    inputs = nest.flatten(result.feature_tensors)
                    outputs = nest.flatten(spec.predictions)
                    tfnet = TFNet.from_session(sess,
                                               inputs=inputs,
                                               outputs=outputs)

                    rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                        nest.flatten(t), np.array([0.0])))

                    results = tfnet.predict(rdd, result.batch_per_thread)
                    return results

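        # input_fn did not return a TFDataset, so fall back to the wrapped Estimator's prediction.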
        return self.estimator.predict(input_fn,
                                      checkpoint_path=checkpoint_path)
Example #3
 def get_validation_data(self):
     if self.validation_text_set is not None:
         return self.validation_text_set.get_samples().map(
             lambda sample: Sample.from_jtensor(
                 features=sample.features + sample.labels,
                 labels=JTensor.from_ndarray(np.array([0.0]))))
     return None
Example #4
 def get_prediction_data(self):
     rdd = self.rdd.map(lambda t: Sample.from_ndarray(
         nest.flatten(t[0]
                      if isinstance(t, tuple) else t), np.array([0.0])))
     rdd_wrapper = callZooFunc("float", "zooRDDSampleToMiniBatch", rdd,
                               self.batch_per_thread)
     return rdd_wrapper.value().toJavaRDD()
Example #5
 def get_prediction_data(self):
     rdd = self.text_set.get_samples().map(
         lambda sample: Sample.from_jtensor(features=sample.features,
                                            labels=JTensor.from_ndarray(
                                                np.array([0.0]))))
     rdd_wrapper = callZooFunc("float", "zooRDDSampleToMiniBatch", rdd,
                               self.batch_per_thread)
     return rdd_wrapper.value().toJavaRDD()
Example #6
 def get_training_data(self):
     sample_rdd = self.text_set.get_samples().map(
         lambda sample: Sample.from_jtensor(
             features=sample.features + sample.labels,
             labels=JTensor.from_ndarray(np.array([0.0]))))
     return FeatureSet.sample_rdd(sample_rdd,
                                  sequential_order=self.sequential_order,
                                  shuffle=self.shuffle)
Example #7
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         return FeatureSet.sample_rdd(
             sample_rdd,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle)
     return None
Example #8
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         fs = FeatureSet.sample_rdd(sample_rdd,
                                    sequential_order=self.sequential_order,
                                    shuffle=self.shuffle)
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #9
 def get_validation_data(self):
     if self.validation_text_set is not None:
         sample_rdd = self.validation_text_set.get_samples().map(
             lambda sample: Sample.from_jtensor(
                 features=sample.features + sample.labels,
                 labels=JTensor.from_ndarray(np.array([0.0]))))
         return FeatureSet.sample_rdd(
             sample_rdd,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle)
     return None
Example #10
    def evaluate(self,
                 input_fn,
                 eval_methods,
                 steps=None,
                 checkpoint_path=None):
        if not all(
                isinstance(metric, six.string_types)
                for metric in eval_methods):
            raise ValueError("All metrics should be string types")
        with tf.Graph().as_default() as g:
            result = self.estimator._call_input_fn(input_fn,
                                                   tf.estimator.ModeKeys.EVAL)
            if isinstance(result, TFDataset):
                spec = self._call_model_fn(result.feature_tensors,
                                           result.label_tensors,
                                           tf.estimator.ModeKeys.PREDICT,
                                           self.config)
                latest_checkpoint = self.estimator.latest_checkpoint()

                if latest_checkpoint:
                    checkpoint_path = latest_checkpoint

                with tf.Session() as sess:
                    if checkpoint_path:
                        saver = tf.train.Saver()
                        saver.restore(sess, checkpoint_path)
                    else:
                        sess.run(tf.global_variables_initializer())
                    inputs = nest.flatten(result._original_tensors[0])
                    outputs = nest.flatten(spec.predictions)
                    tfnet = TFNet.from_session(sess,
                                               inputs=inputs,
                                               outputs=outputs)

                    rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                        nest.flatten(t[0]), nest.flatten(t[1])))
                    if result.batch_per_thread < 0:
                        batch_size = result.batch_size
                    else:
                        batch_size = (result.batch_per_thread *
                                      result.rdd.getNumPartitions())

                    eval_methods = [
                        self._to_bigdl_metric(m) for m in eval_methods
                    ]
                    results = tfnet.evaluate(rdd, batch_size, eval_methods)
                    final_result = dict([(r.method, r.result)
                                         for r in results])
                    return final_result

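        # input_fn did not return a TFDataset, so fall back to the wrapped Estimator's evaluation.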
        return self.estimator.evaluate(input_fn,
                                       steps,
                                       checkpoint_path=checkpoint_path)
Example #11
 def get_validation_data(self):
     if self.validation_text_set is not None:
         sample_rdd = self.validation_text_set.get_samples().map(
             lambda sample: Sample.from_jtensor(
                 features=sample.features + sample.labels,
                 labels=JTensor.from_ndarray(np.array([0.0]))))
         fs = FeatureSet.sample_rdd(sample_rdd,
                                    sequential_order=self.sequential_order,
                                    shuffle=self.shuffle)
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #12
def to_sample_rdd(x, y, sc, num_slices=None):
    """
    Convert x and y into RDD[Sample]
    :param sc: SparkContext
    :param x: ndarray and the first dimension should be batch
    :param y: ndarray and the first dimension should be batch
    :param num_slices: The number of partitions for x and y.
    :return:
    """
    x_rdd = sc.parallelize(x, num_slices)
    y_rdd = sc.parallelize(y, num_slices)
    return x_rdd.zip(y_rdd).map(lambda item: Sample.from_ndarray(item[0], item[1]))
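
A minimal usage sketch of to_sample_rdd (not part of the original source): it assumes an active SparkContext named sc, and the array shapes below are illustrative only.

import numpy as np

x = np.random.rand(8, 4)   # 8 samples, 4 features each (batch dimension first)
y = np.random.rand(8, 1)   # 8 matching labels
sample_rdd = to_sample_rdd(x, y, sc, num_slices=2)
print(sample_rdd.count())  # expected: 8, one Sample per row of x/y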
Example #13
 def get_training_data(self):
     return self.rdd.map(lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
Example #14
 def get_evaluation_data(self):
     if isinstance(self.tensor_structure, tuple):
         return self.rdd.map(
             lambda t: Sample.from_ndarray(nest.flatten(t[0]), nest.flatten(t[1])))
     return self.rdd.map(lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
Example #15
 def get_prediction_data(self):
     data = self.rdd.map(lambda t: Sample.from_ndarray(
         nest.flatten(t[0] if isinstance(t, tuple) else t), np.array([0.0])))
     return data
Example #16
 def get_training_data(self):
     return self.text_set.get_samples().map(
         lambda sample: Sample.from_jtensor(features=sample.features + sample.labels,
                                            labels=JTensor.from_ndarray(np.array([0.0]))))
Example #17
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 dataset=None,
                 inputs=None,
                 grads=None,
                 variables=None,
                 graph=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None,
                 val_split=0.0):
        import tensorflow as tf
        from zoo.util.tf import export_tf
        '''
        TFOptimizer is used for distributed training of tensorflow
        on Spark/BigDL.

        :param loss: The loss tensor of the tensorflow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current tensorflow Session; if you want to use a pre-trained model,
        use this Session to load the pre-trained variables and pass it to TFOptimizer.
        '''

        if dataset is None:
            args = TFOptimizer._get_arguments_from_loss(
                loss, optim_method, sess, val_outputs, val_labels, val_method)
            loss, optim_method, sess, dataset, inputs = args[:5]
            grads, variables, graph, val_outputs, val_labels, val_method = \
                args[5:]

        self.optim_method = optim_method
        self.sess = sess
        self.dataset = dataset
        self.inputs = inputs
        self.graph = graph

        from zoo.util.tf import process_grad
        grads = [process_grad(grad) for grad in grads]

        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )

        if val_outputs is not None and val_labels is not None:
            with self.graph.as_default():
                val_labels = [tf.identity(v) for v in val_labels]
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        self.export_dir = tempfile.mkdtemp()
        export_tf(self.sess,
                  self.export_dir,
                  inputs=self.inputs,
                  outputs=grads + outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names
        }

        with open(os.path.join(self.export_dir, "training_meta.json"),
                  "w") as f:
            f.write(json.dumps(meta))

        self.variable_placeholders = []
        with self.graph.as_default():
            assigns = []
            for v in variables:
                p = tf.placeholder(dtype=tf.float32, shape=v.shape)
                a = tf.assign(v, p)
                self.variable_placeholders.append(p)
                assigns.append(a)
            assign = tf.group(*assigns)
        self.assign = assign
        try:
            self.training_helper_layer = TFTrainingHelper(self.export_dir)
        except Py4JJavaError as e:
            if "expects to be colocated with unknown node" in str(e):
                raise Exception("""
If you are using the embedding layer in tf.keras, then this is a
known issue of tensorflow, see https://github.com/tensorflow/tensorflow/issues/21889.
Please add zoo.util.tf.variable_creator_scope before model construction.
For example:
from zoo.util.tf import variable_creator_scope
with variable_creator_scope():
    model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(1, 1, input_length=1)])
                """)
            else:
                raise e

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size
        sample_rdd = data.map(
            lambda t: Sample.from_ndarray(t, [np.array([0.0])]))

        if val_outputs is not None and val_labels is not None:
            if self.dataset.val_rdd is not None:
                val_rdd = self.dataset.val_rdd \
                    .map(lambda t: Sample.from_ndarray(t,
                                                       [np.array([0.0])]))
                val_method = [
                    TFValidationMethod(m, len(val_outputs), len(val_labels))
                    for m in to_list(val_method)
                ]
                training_rdd = sample_rdd

            elif val_split != 0.0:
                training_rdd, val_rdd = sample_rdd.randomSplit(
                    [1 - val_split, val_split])
                val_method = [
                    TFValidationMethod(m, len(val_outputs), len(val_labels))
                    for m in to_list(val_method)
                ]
            else:
                raise ValueError(
                    "Validation data is not specified. Please set " +
                    "val rdd in TFDataset, or set val_split larger than zero")

            self.optimizer = Optimizer.create(self.training_helper_layer,
                                              training_rdd,
                                              IdentityCriterion(),
                                              batch_size=batch_size,
                                              optim_method=self.optim_method)
            self.optimizer.set_validation(self.dataset.batch_size, val_rdd,
                                          EveryEpoch(), val_method)
        else:
            training_rdd = sample_rdd
            self.optimizer = Optimizer.create(self.training_helper_layer,
                                              training_rdd,
                                              IdentityCriterion(),
                                              batch_size=batch_size,
                                              optim_method=self.optim_method)
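
A hedged sketch of how this constructor is typically used, based only on the docstring above: loss is assumed to be a scalar TensorFlow tensor built on top of a TFDataset, and the optimize call with a MaxEpoch trigger is an assumption about the surrounding BigDL/analytics-zoo API rather than something shown in this example.

from bigdl.optim.optimizer import Adam, MaxEpoch

# `loss` is a scalar TF loss tensor whose input placeholders come from a TFDataset (assumption).
optimizer = TFOptimizer(loss, Adam())
optimizer.optimize(end_trigger=MaxEpoch(5))  # assumed training entry point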
Example #18
    def predict(self):
        rdd = self.dataset.rdd
        sample_rdd = rdd.map(lambda x: Sample.from_ndarray(x, np.array([0.0])))

        return self.tfnet.predict(sample_rdd, self.dataset.batch_per_thread)
Example #19
 def to_sample(t):
     if isinstance(t, list):
         t = tuple(t)
     return Sample.from_ndarray(nest.flatten(t), [np.array([0.0])])
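
A small illustration of what to_sample produces (assumes numpy, nest and Sample are imported as in the surrounding examples): any nested tuple or list of ndarrays is flattened into a flat feature list and paired with a dummy zero label.

t = (np.array([1.0, 2.0]), np.array([3.0]))
sample = to_sample(t)
# sample features: [array([1., 2.]), array([3.])]; label: [array([0.])]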
Example #20
 def get_validation_data(self):
     if self.val_rdd is not None:
         return self.val_rdd.map(lambda t: Sample.from_ndarray(nest.flatten(t),
                                                               np.array([0.0])))
     return None
Example #21
File: net.py  Project: zfxu/analytics-zoo
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None):
        import tensorflow as tf
        from zoo.util.tf import export_tf
        '''
        TFOptimizer is used for distributed training of tensorflow
        on Spark/BigDL.

        :param loss: The loss tensor of the tensorflow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current tensorflow Session; if you want to use a pre-trained model,
        use this Session to load the pre-trained variables and pass it to TFOptimizer.
        '''
        self.optim_method = optim_method
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(
            loss)
        variables = []
        grads = []
        for (grad, var) in grads_vars:
            variables.append(var)
            grads.append(grad)
        self.export_dir = tempfile.mkdtemp()
        all_required_inputs = _find_placeholders([loss])
        self.dataset = tf.get_collection(all_required_inputs[0].name)[0]
        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )
        self.inputs = self.dataset.tensors

        _check_the_same(all_required_inputs, self.inputs)

        if val_outputs is not None and val_labels is not None:
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        export_tf(self.sess,
                  self.export_dir,
                  inputs=self.inputs,
                  outputs=grads + outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names
        }

        with open(os.path.join(self.export_dir, "training_meta.json"),
                  "w") as f:
            f.write(json.dumps(meta))

        self.training_helper_layer = TFTrainingHelper(self.export_dir)

        self.variable_placeholders = []
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            self.variable_placeholders.append(p)
            assigns.append(a)
        self.assign = tf.group(*assigns)

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size
        sample_rdd = data.map(
            lambda t: Sample.from_ndarray(t, [np.array([0.0])]))

        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          sample_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)

        if val_outputs is not None and val_labels is not None:
            val_sample_rdd = self.dataset.val_rdd\
                .map(lambda t: Sample.from_ndarray(t, [np.array([0.0])]))
            val_method = TFValidationMethod(val_method, len(val_outputs),
                                            len(val_labels))
            self.optimizer.set_validation(self.dataset.batch_size,
                                          val_sample_rdd, EveryEpoch(),
                                          val_method)
Example #22
    def __init__(self,
                 loss,
                 optim_method,
                 sess=None,
                 dataset=None,
                 inputs=None,
                 grads=None,
                 variables=None,
                 graph=None,
                 val_outputs=None,
                 val_labels=None,
                 val_method=None,
                 add_sample_weights_num=0):
        import tensorflow as tf
        from zoo.util.tf import export_tf
        '''
        TFOptimizer is used for distributed training of tensorflow
        on Spark/BigDL.

        :param loss: The loss tensor of the tensorflow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param sess: the current tensorflow Session; if you want to use a pre-trained model,
        use this Session to load the pre-trained variables and pass it to TFOptimizer.
        '''

        if dataset is None:
            args = TFOptimizer._get_arguments_from_loss(
                loss, optim_method, sess, val_outputs, val_labels, val_method)
            loss, optim_method, sess, dataset, inputs = args[:5]
            grads, variables, graph, val_outputs, val_labels, val_method = \
                args[5:]

        self.optim_method = optim_method
        self.sess = sess
        self.dataset = dataset
        self.inputs = inputs
        self.graph = graph

        if self.dataset.batch_size <= 0:
            raise ValueError(
                "You should set batch_size instead of batch_per_thread for training"
            )

        if val_outputs is not None and val_labels is not None:
            with self.graph.as_default():
                val_labels = [tf.identity(v) for v in val_labels]
            outputs = val_outputs + val_labels + [loss]
        else:
            outputs = [loss]

        self.export_dir = tempfile.mkdtemp()
        export_tf(self.sess,
                  self.export_dir,
                  inputs=self.inputs,
                  outputs=grads + outputs)

        variable_names = [v.name for v in variables]
        grad_names = [g.name for g in grads]
        output_names = [o.name for o in outputs]

        meta = {
            "input_names": [i.name for i in self.inputs],
            "output_names": output_names,
            "variables": variable_names,
            "grad_variables": grad_names
        }

        with open(os.path.join(self.export_dir, "training_meta.json"),
                  "w") as f:
            f.write(json.dumps(meta))

        self.variable_placeholders = []
        with self.graph.as_default():
            assigns = []
            for v in variables:
                p = tf.placeholder(dtype=tf.float32, shape=v.shape)
                a = tf.assign(v, p)
                self.variable_placeholders.append(p)
                assigns.append(a)
            assign = tf.group(*assigns)
        self.assign = assign

        self.training_helper_layer = TFTrainingHelper(self.export_dir)

        data = self.dataset.rdd
        batch_size = self.dataset.batch_size
        sample_rdd = data.map(lambda t: Sample.from_ndarray(
            t + [np.array(1.0)] * add_sample_weights_num, [np.array([0.0])]))

        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          sample_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)

        if val_outputs is not None and val_labels is not None:
            val_sample_rdd = self.dataset.val_rdd\
                .map(lambda t: Sample.from_ndarray(t + [np.array(1.0)] * add_sample_weights_num,
                                                   [np.array([0.0])]))
            val_method = [
                TFValidationMethod(m, len(val_outputs), len(val_labels))
                for m in to_list(val_method)
            ]
            self.optimizer.set_validation(self.dataset.batch_size,
                                          val_sample_rdd, EveryEpoch(),
                                          val_method)
Example #23
 def to_sample(t):
     return Sample.from_ndarray(nest.flatten(t), [np.array([0.0])])