예제 #1
0
    def _transform(self, dataset):
        """
        Run the configured TensorFlow graph over ``dataset`` and return the resulting DataFrame.

        The inference-optimized graph is imported into a fresh ``tf.Graph``, evaluated with
        ``tfs.map_blocks``, and the produced columns are renamed to the column names given
        in the output mapping.

        :param dataset: DataFrame supplying the columns named in the input mapping.
        :return: DataFrame returned by ``tfs.map_blocks`` with output columns renamed.
        """
        # A generator lets `any` stop at the first DoubleType column instead of building
        # a throwaway list just to measure its length.
        if any(field.dataType == DoubleType() for field in dataset.schema):
            # `warning` is the supported spelling; `warn` is a deprecated alias.
            logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                           "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                           "fed to input tensors of type tf.float64. To feed dataframe data to "
                           "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                           "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            analyzed_df = tfs.analyze(dataset)
            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Load graph
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)
            # Feed dict maps from placeholder name to DF column name
            feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
            fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
            # The result columns are looked up by op name here, so rename them to the
            # column names requested in the output mapping.
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df
예제 #2
0
    def _transform(self, dataset):
        """
        Evaluate the configured TensorFlow graph on ``dataset``.

        :param dataset: DataFrame whose columns are fed to the graph through the input mapping.
        :return: DataFrame from ``tfs.map_blocks`` with columns renamed per the output mapping.
        """
        has_double_column = any(col.dataType == DoubleType() for col in dataset.schema)
        if has_double_column:
            logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                           "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                           "fed to input tensors of type tf.float64. To feed dataframe data to "
                           "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                           "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        inference_gdef = self._optimize_for_inference()
        feeds_spec = self.getInputMapping()
        fetches_spec = self.getOutputMapping()

        tf_graph = tf.Graph()
        with tf.Session(graph=tf_graph):
            df_analyzed = tfs.analyze(dataset)

            fetch_op_names = []
            for tensor_name, _ in fetches_spec:
                fetch_op_names.append(tfx.op_name(tensor_name))

            # Bring the optimized graph into the session's graph
            tf.import_graph_def(graph_def=inference_gdef,
                                name='',
                                return_elements=fetch_op_names)

            # Placeholder op name -> DataFrame column supplying its values
            feed_dict = {}
            for column, tensor_name in feeds_spec:
                feed_dict[tfx.op_name(tensor_name)] = column

            fetches = []
            for op_name in fetch_op_names:
                fetches.append(tfx.get_tensor(op_name, tf_graph))

            result_df = tfs.map_blocks(fetches, df_analyzed, feed_dict=feed_dict)

            # Rename each produced column to the name requested in the output mapping
            for tensor_name, wanted_col in fetches_spec:
                produced_col = tfx.op_name(tensor_name, tf_graph)
                if produced_col == wanted_col:
                    continue
                result_df = result_df.withColumnRenamed(produced_col, wanted_col)

        return result_df
예제 #3
0
    def _transform(self, dataset):
        """
        Evaluate the configured TensorFlow graph on ``dataset``.

        Input placeholders are addressed through their Sparkdl-scoped op names
        (``_getSparkDlOpName``); outputs are renamed according to the output mapping.

        :param dataset: DataFrame providing the columns named in the input mapping.
        :return: DataFrame from ``tfs.map_blocks`` with its output columns renamed.
        """
        optimized_gdef = self._optimize_for_inference()
        in_map = self.getInputMapping()
        out_map = self.getOutputMapping()

        tf_graph = tf.Graph()
        with tf.Session(graph=tf_graph):
            df_analyzed = tfs.analyze(dataset)
            fetch_ops = [tfx.op_name(name) for name, _ in out_map]

            # Load graph
            tf.import_graph_def(graph_def=optimized_gdef, name='', return_elements=fetch_ops)

            # Feed dict maps from placeholder name to DF column name
            feed_dict = {}
            for column, tensor_name in in_map:
                feed_dict[self._getSparkDlOpName(tensor_name)] = column
            fetches = [tfx.get_tensor(op, tf_graph) for op in fetch_ops]

            result = tfs.map_blocks(fetches, df_analyzed, feed_dict=feed_dict)

            # Output columns still need to be renamed to the requested names
            for tensor_name, target_col in out_map:
                current_col = tfx.op_name(tensor_name, tf_graph)
                if current_col == target_col:
                    continue
                result = result.withColumnRenamed(current_col, target_col)

        return result
예제 #4
0
 def _optimize_for_inference(self):
     """
     Return the input graph's GraphDef after ``infr_opt.optimize_for_inference``.

     :return: the value returned by ``infr_opt.optimize_for_inference``.
     """
     source_gdef = self.getTFInputGraph().graph_def
     # Data types of the input placeholders
     dtypes_of_placeholders = self._get_placeholder_types(source_gdef)

     # Op names of the mapped inputs and outputs; nodes not needed to compute the
     # outputs are stripped away by the optimizer.
     feed_ops = []
     for _, tensor_name in self.getInputMapping():
         feed_ops.append(tfx.op_name(tensor_name))
     fetch_ops = []
     for tensor_name, _ in self.getOutputMapping():
         fetch_ops.append(tfx.op_name(tensor_name))

     optimized_gdef = infr_opt.optimize_for_inference(
         source_gdef, feed_ops, fetch_ops, dtypes_of_placeholders)
     return optimized_gdef
예제 #5
0
 def _optimize_for_inference(self):
     """
     Optimize the input graph for inference.

     :return: the value returned by ``infr_opt.optimize_for_inference``.
     """
     user_gdef = self.getTFInputGraph().graph_def
     # Look up the data type of every input placeholder
     ph_types = self._get_placeholder_types(user_gdef)
     # Only nodes needed to compute the named outputs are kept by the optimizer
     in_ops = [tfx.op_name(name) for _, name in self.getInputMapping()]
     out_ops = [tfx.op_name(name) for name, _ in self.getOutputMapping()]
     return infr_opt.optimize_for_inference(user_gdef, in_ops, out_ops, ph_types)
    def test_simple_keras_udf(self):
        """ Simple Keras sequential model """
        # The input layer of an image UDF model must have shape
        # (width, height, numChannels); Keras handles the leading batch size.
        fh_name = "test_keras_simple_sequential_model"
        with IsolatedSession(using_keras=True) as issn:
            net = Sequential()
            net.add(Flatten(input_shape=(640, 480, 3)))
            net.add(Dense(units=64))
            net.add(Activation('relu'))
            net.add(Dense(units=10))
            net.add(Activation('softmax'))

            # Initialize the variables
            issn.run(tf.global_variables_initializer())

            # Feed the model's first input from the 'image_col' DataFrame column
            input_op_name = tfx.op_name(issn.graph, net.inputs[0])
            feeds_to_fields = {input_op_name: 'image_col'}
            makeGraphUDF(issn.graph, 'my_keras_model_udf', net.outputs, feeds_to_fields)

            registerKerasImageUDF(fh_name, net)

        self._assert_function_exists(fh_name)
예제 #7
0
 def _getSparkDlOpName(self, tensor_name):
     """
     Return the Sparkdl-scoped name of the op that produces ``tensor_name``.

     The op name is derived with ``tfx.op_name`` and then prefixed with the
     ``SPARKDL_OP_SCOPE`` scope, which marks ops added by Sparkdl.
     """
     return tfx.add_scope_to_name(name=tfx.op_name(tensor_name),
                                  scope=self.SPARKDL_OP_SCOPE)
예제 #8
0
 def exec_gfn_spimg_decode(spimg_dict, img_dtype):
     """
     Run the Spark-image converter graph function on one image given as a dict.

     :param spimg_dict: dict keyed by the op names of the converter's feed tensors.
     :param img_dtype: forwarded to ``gfac.buildSpImageConverter``.
     :return: result of running the converter's first fetch.
     """
     converter = gfac.buildSpImageConverter(img_dtype)
     with IsolatedSession() as issn:
         feeds, fetches = issn.importGraphFunction(converter, prefix="")
         # Each feed tensor takes the dict entry stored under its op name
         feed_dict = {}
         for placeholder in feeds:
             field_name = tfx.op_name(issn.graph, placeholder)
             feed_dict[placeholder] = spimg_dict[field_name]
         img_out = issn.run(fetches[0], feed_dict=feed_dict)
     return img_out
예제 #9
0
    def _optimize_for_inference(self):
        """ Optimize the graph for inference """
        tf_input_graph = self.getTFInputGraph()
        feeds_spec = self.getInputMapping()
        fetches_spec = self.getOutputMapping()

        feed_op_names = []
        for _, tensor_name in feeds_spec:
            feed_op_names.append(tfx.op_name(tensor_name))
        fetch_op_names = []
        for tensor_name, _ in fetches_spec:
            fetch_op_names.append(tfx.op_name(tensor_name))

        # NOTE(phi-dbq): Spark DataFrame assumes float64 as default floating point type
        # TODO: the placeholder type argument below is the place to change for
        #       the `float64` data type issue.
        return infr_opt.optimize_for_inference(tf_input_graph.graph_def,
                                               feed_op_names,
                                               fetch_op_names,
                                               tf.float64.as_datatype_enum)
예제 #10
0
    def test_pipeline(self):
        """ Pipeline should provide correct function composition """
        sample_paths = glob(os.path.join(_getSampleJPEGDir(), '*.jpg'))

        keras_model = Xception(weights="imagenet")
        to_float_image = gfac.buildSpImageConverter(SparkMode.RGB_FLOAT32)
        stages = [('spimage', to_float_image),
                  ('xception', GraphFunction.fromKeras(keras_model))]
        composed = GraphFunction.fromList(stages)

        for image_path in sample_paths:
            # Reference prediction computed directly with Keras
            spatial_size = tuple(keras_model.input.shape.as_list()[1:-1])
            pil_image = load_img(image_path, target_size=spatial_size)
            batch = np.expand_dims(img_to_array(pil_image), axis=0)
            model_input = xcpt.preprocess_input(batch)
            expected = keras_model.predict(model_input)

            # Same input, passed through the composed graph function as a Spark image
            image_fields = imageArrayToStruct(model_input).asDict()
            image_fields['data'] = bytes(image_fields['data'])
            with IsolatedSession() as issn:
                # Need blank import scope name so that spimg fields match the input names
                feeds, fetches = issn.importGraphFunction(composed, prefix="")
                feed_dict = {
                    placeholder: image_fields[tfx.op_name(placeholder, issn.graph)]
                    for placeholder in feeds
                }
                actual = issn.run(fetches[0], feed_dict=feed_dict)
                # To inspect the graph, call tfx.write_visualization_html(issn.graph, <path>)

            self.assertTrue(np.all(actual == expected))
예제 #11
0
    def test_pipeline(self):
        """ Pipeline should provide correct function composition """
        jpeg_pattern = os.path.join(_getSampleJPEGDir(), '*.jpg')
        jpeg_paths = glob(jpeg_pattern)

        xception = Xception(weights="imagenet")
        pipeline = GraphFunction.fromList([
            ('spimage', gfac.buildSpImageConverter(SparkMode.RGB_FLOAT32)),
            ('xception', GraphFunction.fromKeras(xception)),
        ])

        for jpeg_path in jpeg_paths:
            size = tuple(xception.input.shape.as_list()[1:-1])
            loaded = load_img(jpeg_path, target_size=size)
            as_batch = np.expand_dims(img_to_array(loaded), axis=0)
            preprocessed = xcpt.preprocess_input(as_batch)
            reference = xception.predict(preprocessed)

            spimg_fields = imageArrayToStruct(preprocessed).asDict()
            spimg_fields['data'] = bytes(spimg_fields['data'])
            with IsolatedSession() as issn:
                # Need blank import scope name so that spimg fields match the input names
                feeds, fetches = issn.importGraphFunction(pipeline, prefix="")
                feed_dict = {}
                for feed_tensor in feeds:
                    field = tfx.op_name(issn.graph, feed_tensor)
                    feed_dict[feed_tensor] = spimg_fields[field]
                produced = issn.run(fetches[0], feed_dict=feed_dict)
                # To inspect the graph, call tfx.write_visualization_html(issn.graph, <path>)

            self.assertTrue(np.all(produced == reference))
예제 #12
0
 def exec_gfn_spimg_decode(spimg_dict, img_dtype):
     """
     Run the 'BGR' Spark-image converter graph function on one image given as a dict.

     :param spimg_dict: dict keyed by the op names of the converter's feed tensors.
     :param img_dtype: forwarded to ``gfac.buildSpImageConverter``.
     :return: result of running the converter's first fetch.
     """
     converter = gfac.buildSpImageConverter('BGR', img_dtype)
     with IsolatedSession() as issn:
         feeds, fetches = issn.importGraphFunction(converter, prefix="")
         feed_dict = {
             placeholder: spimg_dict[tfx.op_name(placeholder, issn.graph)]
             for placeholder in feeds
         }
         img_out = issn.run(fetches[0], feed_dict=feed_dict)
     return img_out
예제 #13
0
    def test_get_graph_elements(self):
        """ Fetching graph elements by names and other graph elements """

        with IsolatedSession() as issn:
            x = tf.placeholder(tf.double, shape=[], name="x")
            z = tf.add(x, 3, name='z')
            graph = issn.graph

            # A tensor resolves to itself
            for tnsr in (z, x):
                self.assertEqual(tfx.get_tensor(graph, tnsr), tnsr)

            # Lookups agree with the graph's own by-name accessors
            self.assertEqual(graph.get_tensor_by_name("x:0"), tfx.get_tensor(graph, x))
            self.assertEqual("x:0", tfx.tensor_name(graph, x))
            self.assertEqual(graph.get_operation_by_name("x"), tfx.get_op(graph, x))

            # Op names derived from tensors
            for expected_op, tnsr in (("x", x), ("z", z)):
                self.assertEqual(expected_op, tfx.op_name(graph, tnsr))

            # Tensor names derived from tensors
            for tnsr, expected_name in ((z, "z:0"), (x, "x:0")):
                self.assertEqual(tfx.tensor_name(graph, tnsr), expected_name)
예제 #14
0
def _gen_tensor_op_string_input_tests():
    """
    Yield test cases that convert tensor-name strings with ``tfx.op_name`` and
    ``tfx.tensor_name``, for several output indices of the op ``someOp``.
    """
    expected_op = 'someOp'
    for output_idx in (0, 1, 2, 3, 5, 8, 15, 17):
        full_name = '%s:%s' % (expected_op, output_idx)
        to_op_name = TestCase(data=(expected_op, tfx.op_name(full_name)),
                              description='test tensor name to op name')
        yield to_op_name
        to_tensor_name = TestCase(data=(full_name, tfx.tensor_name(full_name)),
                                  description='test tensor name to tensor name')
        yield to_tensor_name
예제 #15
0
def _gen_tensor_op_string_input_tests():
    """
    Generate test cases for string inputs: each tensor name ``someOp:<idx>`` is passed
    to ``tfx.op_name`` and to ``tfx.tensor_name``.
    """
    base_op = 'someOp'
    indices = [0, 1, 2, 3, 5, 8, 15, 17]
    for name in (base_op + ':' + str(idx) for idx in indices):
        yield TestCase(description='test tensor name to op name',
                       data=(base_op, tfx.op_name(name)))
        yield TestCase(description='test tensor name to tensor name',
                       data=(name, tfx.tensor_name(name)))
예제 #16
0
 def _loadTFGraph(self):
     """
     Load the Keras model file and return the result of ``tfx.strip_and_freeze_until``
     for the model's output op.

     Also stores the names of the model's input and output tensors in
     ``self._inputTensor`` and ``self._outputTensor``.
     """
     with KSessionWrap() as (sess, keras_graph):
         assert K.backend() == "tensorflow", (
             "Keras backend is not tensorflow but KerasImageTransformer only supports "
             "tensorflow-backed Keras models.")
         with keras_graph.as_default():
             K.set_learning_phase(0)  # Testing phase
             keras_model = load_model(self.getModelFile())
             fetch_op = tfx.op_name(keras_graph, keras_model.output)
             self._inputTensor = keras_model.input.name
             self._outputTensor = keras_model.output.name
             frozen = tfx.strip_and_freeze_until(
                 [fetch_op], keras_graph, sess, return_graph=True)
             return frozen
예제 #17
0
 def _loadTFGraph(self):
     """
     Load the Keras model from the model file, record its input/output tensor names on
     ``self``, and return what ``tfx.strip_and_freeze_until`` produces for its output op.
     """
     with KSessionWrap() as (session, tf_graph):
         backend_error = ("Keras backend is not tensorflow but KerasImageTransformer "
                          "only supports tensorflow-backed Keras models.")
         assert K.backend() == "tensorflow", backend_error
         with tf_graph.as_default():
             K.set_learning_phase(0)  # Testing phase
             loaded = load_model(self.getModelFile())
             output_op = tfx.op_name(loaded.output, tf_graph)
             self._inputTensor = loaded.input.name
             self._outputTensor = loaded.output.name
             return tfx.strip_and_freeze_until([output_op],
                                               tf_graph,
                                               session,
                                               return_graph=True)
예제 #18
0
    def _optimize_for_inference(self):
        """
        Build the inference-optimized GraphDef used by the transformer.

        Cast ops are first injected in front of the user graph's input nodes (see
        ``_addCastOps``) so the graph accepts float64 input fed from Spark; the result is
        then passed to ``infr_opt.optimize_for_inference``.

        :return: the GraphDef returned by ``infr_opt.optimize_for_inference``.
        """
        gin = self.getTFInputGraph()
        # Inject cast ops to convert float64 input fed from Spark into the datatypes of the
        # Graph's input nodes. Reuse `gin` rather than fetching the input graph a second time.
        graphdef_with_casts = self._addCastOps(gin.graph_def)

        # Strip away graph nodes not used in computing the tensors with the specified output names
        input_names = [
            self._getSparkDlOpName(tnsr_name)
            for _, tnsr_name in self.getInputMapping()
        ]
        output_names = [
            tfx.op_name(tnsr_name) for tnsr_name, _ in self.getOutputMapping()
        ]
        opt_gdef = infr_opt.optimize_for_inference(graphdef_with_casts,
                                                   input_names, output_names,
                                                   tf.float64.as_datatype_enum)
        return opt_gdef
    def _loadTFGraph(self, sess, graph):
        """
        Load the Keras model into memory and, using the given session, load the model's
        inference-related ops into the given Tensorflow graph.

        :param sess: session passed on to ``tfx.strip_and_freeze_until``.
        :param graph: Tensorflow graph made default while the model is loaded.
        :return: A tuple (graph, input_name, output_name): the TF graph for the Keras
        model's inference subgraph, the name of the model's input tensor, and the name
        of the model's output tensor.
        """
        backend_name = K.backend()
        assert backend_name == "tensorflow", (
            "Only tensorflow-backed Keras models are supported, tried to load Keras model "
            "with backend %s." % (backend_name))
        with graph.as_default():
            K.set_learning_phase(0)  # Inference phase
            keras_model = load_model(self.getModelFile())
            output_op = tfx.op_name(keras_model.output, graph)
            inference_graph = tfx.strip_and_freeze_until(
                [output_op], graph, sess, return_graph=True)
            return inference_graph, keras_model.input.name, keras_model.output.name
예제 #20
0
    def _loadTFGraph(self, sess, graph):
        """
        Read the Keras model from the model file and load its inference-related ops into
        ``graph`` by means of ``sess``.

        :param sess: session handed to ``tfx.strip_and_freeze_until``.
        :param graph: Tensorflow graph that receives the model's ops.
        :return: A tuple (graph, input_name, output_name) holding the TF graph of the
        model's inference subgraph and the names of the model's input and output tensors.
        """
        active_backend = K.backend()
        unsupported_msg = (
            "Only tensorflow-backed Keras models are supported, tried to load Keras model "
            "with backend %s." % (active_backend))
        assert active_backend == "tensorflow", unsupported_msg
        with graph.as_default():
            K.set_learning_phase(0)  # Inference phase
            loaded = load_model(self.getModelFile())
            fetch_op_names = [tfx.op_name(loaded.output, graph)]
            stripped = tfx.strip_and_freeze_until(fetch_op_names, graph, sess,
                                                  return_graph=True)
            return stripped, loaded.input.name, loaded.output.name
예제 #21
0
    def _transform(self, dataset):
        """
        Apply the transformer's graph to the image column of ``dataset``.

        The image struct is exploded into temporary ``__sdl_image_*`` columns that feed the
        graph through ``tfs.map_rows``; the temporary columns are dropped afterwards and the
        output is converted according to the ``outputMode`` param.

        :param dataset: DataFrame containing the input image column.
        :return: DataFrame produced by ``_convertOutputToImage`` or ``_convertOutputToVector``.
        """
        user_graph = self.getGraph()
        reshaped_graph = self._addReshapeLayers(user_graph, self._getImageDtype(dataset))
        tf_graph = self._stripGraph(reshaped_graph)

        temp_columns = ("__sdl_image_height", "__sdl_image_width",
                        "__sdl_image_nchannels", "__sdl_image_data")

        with tf_graph.as_default():
            image_col = dataset[self.getInputCol()]
            image_fields = (image_col.height, image_col.width,
                            image_col.nChannels, image_col.data)
            exploded_df = dataset
            for temp_name, field in zip(temp_columns, image_fields):
                exploded_df = exploded_df.withColumn(temp_name, field)

            fetch = tf_graph.get_tensor_by_name(self._getFinalOutputTensorName())
            # Placeholder name -> temporary column that supplies its value
            feeds = {
                "height": "__sdl_image_height",
                "width": "__sdl_image_width",
                "num_channels": "__sdl_image_nchannels",
                "image_buffer": "__sdl_image_data"
            }
            mapped_df = tfs.map_rows([fetch], exploded_df, feed_dict=feeds)
            result_df = mapped_df.drop(*temp_columns)

            result_col = tfx.op_name(fetch, tf_graph)
            original_tensor = tf_graph.get_tensor_by_name(self._getOriginalOutputTensorName())
            output_shape = original_tensor.shape
            output_mode = self.getOrDefault(self.outputMode)
            # TODO: support non-1d tensors (return np.array).
            if output_mode == "image":
                return self._convertOutputToImage(result_df, result_col, output_shape)
            assert output_mode == "vector", "Unknown output mode: %s" % output_mode
            return self._convertOutputToVector(result_df, result_col)
예제 #22
0
    def _addCastOps(self, user_graph_def):
        """
        Copy the user-specified graph G (given as a GraphDef) into a new graph G' that has
        extra ops in front of every input node. The extra ops let G' accept tf.float64
        input fed from Spark and cast it to the datatype each input node requests.

        :param user_graph_def: GraphDef of the user-specified graph.
        :return: GraphDef representing the copied, modified graph.
        """
        # Materialize the user's graph so its input tensors can be inspected
        original_graph = tf.Graph()
        with original_graph.as_default():
            tf.import_graph_def(user_graph_def, name="")

        # Graph that receives the injected ops followed by a copy of the user's graph
        # TODO: Cheap optimization: if all input tensors are of type float64, just do nothing here
        graph_with_casts = tf.Graph()
        # Original input tensor name -> injected tensor that replaces it
        replacements = {}
        with graph_with_casts.as_default():
            with tf.name_scope(self.SPARKDL_OP_SCOPE):
                for _, input_name in self.getInputMapping():
                    user_input = tfx.get_tensor(input_name, original_graph)
                    # Same shape as the original input, but fed with float64 from Spark
                    float64_input = tf.placeholder(tf.float64,
                                                   shape=user_input.shape,
                                                   name=tfx.op_name(input_name))
                    # float64 inputs pass straight through; anything else gets a cast
                    # to the original tensor's datatype
                    replacements[input_name] = (
                        float64_input if user_input.dtype == tf.float64
                        else tf.cast(float64_input, dtype=user_input.dtype))
            tf.import_graph_def(graph_def=user_graph_def, input_map=replacements, name="")
        return graph_with_casts.as_graph_def(add_shapes=True)
예제 #23
0
    def _transform(self, dataset):
        """
        Run the transformer's graph over the image column of ``dataset``.

        Image fields are copied into temporary ``__sdl_image_*`` columns, fed to the graph
        with ``tfs.map_rows``, and the temporary columns are dropped again before the
        output is converted as selected by the ``outputMode`` param.

        :param dataset: DataFrame holding the input image column.
        :return: DataFrame from ``_convertOutputToImage`` or ``_convertOutputToVector``.
        """
        base_graph = self.getGraph()
        with_reshape = self._addReshapeLayers(base_graph, self._getImageDtype(dataset))
        final_graph = self._stripGraph(with_reshape)
        with final_graph.as_default():  # pylint: disable=not-context-manager
            img = dataset[self.getInputCol()]
            df_height = dataset.withColumn("__sdl_image_height", img.height)
            df_width = df_height.withColumn("__sdl_image_width", img.width)
            df_channels = df_width.withColumn("__sdl_image_nchannels", img.nChannels)
            df_exploded = df_channels.withColumn("__sdl_image_data", img.data)

            fetch_name = self._getFinalOutputTensorName()
            fetch_tensor = final_graph.get_tensor_by_name(fetch_name)
            df_mapped = tfs.map_rows(
                [fetch_tensor],
                df_exploded,
                feed_dict={
                    "height": "__sdl_image_height",
                    "width": "__sdl_image_width",
                    "num_channels": "__sdl_image_nchannels",
                    "image_buffer": "__sdl_image_data",
                })
            df_final = df_mapped.drop("__sdl_image_height", "__sdl_image_width",
                                      "__sdl_image_nchannels", "__sdl_image_data")

            out_col = tfx.op_name(fetch_tensor, final_graph)
            user_output_name = self._getOriginalOutputTensorName()
            out_shape = final_graph.get_tensor_by_name(user_output_name).shape
            output_mode = self.getOrDefault(self.outputMode)
            # TODO: support non-1d tensors (return np.array).
            if output_mode != "image":
                assert output_mode == "vector", "Unknown output mode: %s" % output_mode
                return self._convertOutputToVector(df_final, out_col)
            return self._convertOutputToImage(df_final, out_col, out_shape)
예제 #24
0
def _gen_valid_tensor_op_input_combos():
    """
    Yield test cases covering the valid input kinds accepted by ``tfx.op_name``,
    ``tfx.tensor_name``, ``tfx.get_tensor`` and ``tfx.get_op``: a tensor, a tensor name,
    an op, and an op name, each with and without an explicit graph where applicable.

    Each case carries ``data=(expected, actual)`` -- presumably compared for equality by
    the test runner; confirm against the consumer of ``TestCase``.
    """
    op_name = 'someConstOp'
    tnsr_name = '{}:0'.format(op_name)
    tnsr = tf.constant(1427.08, name=op_name)
    graph = tnsr.graph

    # Test for op_name
    yield TestCase(data=(op_name, tfx.op_name(tnsr)),
                   description='get op name from tensor (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr, graph)),
                   description='get op name from tensor (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name)),
                   description='get op name from tensor name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name, graph)),
                   description='get op name from tensor name (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op)),
                   description='get op name from op (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op, graph)),
                   description='get op name from op (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name)),
                   description='get op name from op name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name, graph)),
                   description='get op name from op name (with graph)')

    # Test for tensor_name
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr)),
                   description='get tensor name from tensor (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr, graph)),
                   description='get tensor name from tensor (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name)),
                   description='get tensor name from tensor name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name, graph)),
                   description='get tensor name from tensor name (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op)),
                   description='get tensor name from op (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op, graph)),
                   description='get tensor name from op (with graph)')
    # These two cases must feed the *op name*; passing the tensor name here would merely
    # repeat the "from tensor name" cases above and leave the op-name path untested.
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name)),
                   description='get tensor name from op name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name, graph)),
                   description='get tensor name from op name (with graph)')

    # Test for get_tensor
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr, graph)),
                   description='get tensor from tensor')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr_name, graph)),
                   description='get tensor from tensor name')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr.op, graph)),
                   description='get tensor from op')
    yield TestCase(data=(tnsr, tfx.get_tensor(op_name, graph)),
                   description='get tensor from op name')

    # Test for get_op
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr, graph)),
                   description='get op from tensor')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr_name, graph)),
                   description='get op from tensor name')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr.op, graph)),
                   description='get op from op')
    yield TestCase(data=(tnsr.op, tfx.get_op(op_name, graph)),
                   description='test op from op name')
예제 #25
0
def _gen_valid_tensor_op_input_combos():
    """
    Generate test cases for every valid input kind of ``tfx.op_name``,
    ``tfx.tensor_name``, ``tfx.get_tensor`` and ``tfx.get_op``: tensor, tensor name,
    op, and op name, with and without an explicit graph where applicable.

    Each case is built as ``TestCase(data=(expected, actual), description=...)``; how the
    pair is checked is up to the consumer of these cases (not visible here).
    """
    op_name = 'someConstOp'
    tnsr_name = '{}:0'.format(op_name)
    tnsr = tf.constant(1427.08, name=op_name)
    graph = tnsr.graph

    # Test for op_name
    yield TestCase(data=(op_name, tfx.op_name(tnsr)),
                   description='get op name from tensor (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr, graph)),
                   description='get op name from tensor (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name)),
                   description='get op name from tensor name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name, graph)),
                   description='get op name from tensor name (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op)),
                   description='get op name from op (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op, graph)),
                   description='get op name from op (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name)),
                   description='get op name from op name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name, graph)),
                   description='get op name from op name (with graph)')

    # Test for tensor_name
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr)),
                   description='get tensor name from tensor (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr, graph)),
                   description='get tensor name from tensor (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name)),
                   description='get tensor name from tensor name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name, graph)),
                   description='get tensor name from tensor name (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op)),
                   description='get tensor name from op (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op, graph)),
                   description='get tensor name from op (with graph)')
    # Feed the op name itself: with the tensor name these cases would duplicate the
    # "from tensor name" cases and never exercise op-name input.
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name)),
                   description='get tensor name from op name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name, graph)),
                   description='get tensor name from op name (with graph)')

    # Test for get_tensor
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr, graph)),
                   description='get tensor from tensor')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr_name, graph)),
                   description='get tensor from tensor name')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr.op, graph)),
                   description='get tensor from op')
    yield TestCase(data=(tnsr, tfx.get_tensor(op_name, graph)),
                   description='get tensor from op name')

    # Test for get_op
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr, graph)),
                   description='get op from tensor')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr_name, graph)),
                   description='get op from tensor name')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr.op, graph)),
                   description='get op from op')
    yield TestCase(data=(tnsr.op, tfx.get_op(op_name, graph)),
                   description='test op from op name')
def makeGraphUDF(graph, udf_name, fetches, feeds_to_fields_map=None, blocked=False, register=True):
    """
    Turn a TensorFlow Graph into a Spark SQL UserDefinedFunction.

    The example below builds a UDF that adds 3 to the value it is given.

    .. code-block:: python

        from sparkdl.graph.tensorframes_udf import makeGraphUDF

        with IsolatedSession() as issn:
            x = tf.placeholder(tf.double, shape=[], name="input_x")
            z = tf.add(x, 3, name='z')
            makeGraphUDF(issn.graph, "my_tensorflow_udf", [z])

    Once registered, the UDF can be called from a SQL query.

    .. code-block:: python

        df = spark.createDataFrame([Row(xCol=float(x)) for x in range(100)])
        df.createOrReplaceTempView("my_float_table")
        spark.sql("select my_tensorflow_udf(xCol) as zCol from my_float_table").show()

    :param graph: :py:class:`tf.Graph`, a TensorFlow Graph
    :param udf_name: str, name of the SQL UDF
    :param fetches: list, output tensors of the graph
    :param feeds_to_fields_map: a dict of str -> str,
                                mapping the name of a placeholder in the TensorFlow graph (key)
                                to the name of a column in the dataframe (value).
                                For now, only the top-level fields in a dataframe are supported.

                                .. note:: A placeholder that is missing from this mapping
                                          is assumed to read from the input column
                                          that has the same name as the placeholder.

    :param blocked: bool, if True, TensorFrames executes the function over
                    blocks/batches of rows, which should provide better performance.
                    Otherwise the function is applied to individual rows.
    :param register: bool, if True, the SQL UDF is registered and therefore
                     accessible in SQL queries; if False it is only created.
    :return: JVM function handle object
    """
    graph = tfx.validated_graph(graph)
    # pylint: disable=W0212
    # TODO: Work with TensorFlow's registered expansions
    # https://github.com/tensorflow/tensorflow/blob/v1.1.0/tensorflow/python/client/session.py#L74
    # TODO: Most part of this implementation might be better off moved to TensorFrames
    builder = JVMAPI.createTensorFramesModelBuilder()
    tfs.core._add_graph(graph, builder)

    # Names and shapes of the requested outputs
    output_names = [tfx.tensor_name(graph, out_tnsr) for out_tnsr in fetches]
    output_shapes = [tfx.get_shape(graph, out_tnsr) for out_tnsr in fetches]

    # Walk over the graph nodes and collect every input-less placeholder with its shape
    input_names = []
    input_shapes = []
    for node_def in graph.as_graph_def(add_shapes=True).node:
        is_placeholder = len(node_def.input) == 0 and str(node_def.op) == 'Placeholder'
        if not is_placeholder:
            continue
        input_name = tfx.tensor_name(graph, node_def.name)
        input_tnsr = graph.get_tensor_by_name(input_name)
        try:
            input_shape = tfx.get_shape(graph, input_tnsr)
        except ValueError:
            # Placeholders for which get_shape raises ValueError are left out
            continue
        input_names.append(input_name)
        input_shapes.append(input_shape)

    # Hand the shapes of fetches and placeholders, then the fetches, to TensorFrames
    all_names = output_names + input_names
    all_shapes = output_shapes + input_shapes
    builder.shape(all_names, all_shapes)
    builder.fetches(output_names)

    # Hand over the placeholder -> DataFrame column mapping together with
    # the op names of all collected placeholders
    input_op_names = [tfx.op_name(graph, input_name) for input_name in input_names]
    tfs.core._add_inputs(builder, feeds_to_fields_map, input_op_names)

    if not register:
        return builder.makeUDF(udf_name, blocked)
    return builder.registerUDF(udf_name, blocked)
예제 #27
0
 def _getFinalOutputOpName(self):
     """Return the op name derived from the final output tensor name."""
     final_tensor_name = self._getFinalOutputTensorName()
     return tfx.op_name(final_tensor_name)
예제 #28
0
 def _getFinalOutputOpName(self):
     """Return the op name derived from the final output tensor name."""
     final_tensor_name = self._getFinalOutputTensorName()
     return tfx.op_name(final_tensor_name)
예제 #29
0
    def fromList(cls, functions):
        """
        Construct a single GraphFunction from a list of graph functions.
        Each function in the list corresponds to a stage.

        Each function is also scoped by a scope name, in order to avoid
        variable name conflict and also to make the graph cleaner for visualization.
        If a scope name is not provided, we generate one as `GFN-BLK-<stage_index>`.

        The inputs and outputs are picked out of the scopes, so that users
        will still be able to call the function with the expected inputs/outputs names.

        It is assumed that there is only one input and one output in the intermediary layers

        :param functions: a list of tuples (scope name, GraphFunction object).
        :return: the merged GraphFunction. When the list holds a single stage,
                 the GraphFunction of that stage is returned unchanged.
        :raises AssertionError: if the list is empty, or if the number of outputs of a
                                stage differs from the number of inputs of the next stage.
        :raises NotImplementedError: if a non-initial stage does not take exactly one input.
        """
        assert len(functions) >= 1, ("must provide at least one function",
                                     functions)
        if 1 == len(functions):
            # Entries are (scope name, GraphFunction) pairs: return the function
            # itself rather than the whole pair, so every path yields a GraphFunction.
            _, only_gfn = functions[0]
            return only_gfn
        # Check against each intermediary layer input output function pairs
        for (scope_in, gfn_in), (scope_out, gfn_out) in zip(functions[:-1], functions[1:]):
            # For stage F => G, the composition G(F(.)) must work, which means
            # the number of outputs for F is equal to the number of inputs for G
            assert len(gfn_in.output_names) == len(gfn_out.input_names), \
                "graph function link {} -> {} require compatible layers".format(scope_in, scope_out)
            # We currently only support single input/output for intermediary stages
            # The functions could still take multi-dimensional tensor, but only one
            if len(gfn_out.input_names) != 1:
                raise NotImplementedError(
                    "Only support single input/output for intermediary layers")

        # Acquire initial placeholders' properties
        # We want the input names of the merged function to stay outside of any scope.
        # In this way users of the merged function could still use the input names
        # of the first function to get the correct input tensors.
        with IsolatedSession() as issn:
            _, first_gfn = functions[0]
            feeds, _ = issn.importGraphFunction(first_gfn, prefix='')
            first_input_info = [
                (tnsr.dtype, tnsr.shape, tfx.op_name(tnsr, issn.graph))
                for tnsr in feeds
            ]
            # TODO: make sure that this graph is not reused to prevent name conflict
            # Report error if the graph is not manipulated by anyone else
            # https://www.tensorflow.org/api_docs/python/tf/Graph#finalize
            issn.graph.finalize()

        # Build a linear chain of all the provided functions
        with IsolatedSession() as issn:
            first_inputs = [
                tf.placeholder(dtype, shape, name)
                for (dtype, shape, name) in first_input_info
            ]
            prev_outputs = first_inputs

            for idx, (scope, gfn) in enumerate(functions):
                # Give a scope to each function to avoid name conflict
                if scope is None or not scope.strip():
                    scope = 'GFN-BLK-{}'.format(idx)
                logger.info('merge: stage %s, scope %s', idx, scope)
                # Feed the outputs of the previous stage into the inputs of this one
                input_map = dict(zip(gfn.input_names, prev_outputs))
                _, fetches = issn.importGraphFunction(gfn,
                                                      prefix=scope,
                                                      input_map=input_map)
                prev_outputs = fetches

            # Add a non-scoped output name as the output node
            # So that users can still use the output name of the last function's output
            # to fetch the correct output tensors
            last_output_names = functions[-1][1].output_names
            last_outputs = [
                tf.identity(tnsr, name=name)
                for tnsr, name in zip(prev_outputs, last_output_names)
            ]

            gfn = issn.asGraphFunction(first_inputs, last_outputs)

        return gfn
예제 #30
0
 def test_invalid_op_name_inputs_with_wrong_types(self, data, description):
     """ op_name must raise TypeError when given an input of a wrong type """
     expect_type_error = self.assertRaises(TypeError, msg=description)
     with expect_type_error:
         tfx.op_name(data)
예제 #31
0
    def fromList(cls, functions):
        """
        Construct a single GraphFunction from a list of graph functions.
        Each function in the list corresponds to a stage.

        Each function is also scoped by a scope name, in order to avoid
        variable name conflict and also to make the graph cleaner for visualization.
        If a scope name is not provided, we generate one as `GFN-BLK-<stage_index>`.

        The inputs and outputs are picked out of the scopes, so that users
        will still be able to call the function with the expected inputs/outputs names.

        It is assumed that there is only one input and one output in the intermediary layers

        :param functions: a list of tuples (scope name, GraphFunction object).
        :return: the merged GraphFunction. When the list holds a single stage,
                 the GraphFunction of that stage is returned unchanged.
        :raises AssertionError: if the list is empty, or if the number of outputs of a
                                stage differs from the number of inputs of the next stage.
        :raises NotImplementedError: if a non-initial stage does not take exactly one input.
        """
        assert len(functions) >= 1, ("must provide at least one function", functions)
        if 1 == len(functions):
            # Entries are (scope name, GraphFunction) pairs: return the function
            # itself rather than the whole pair, so every path yields a GraphFunction.
            _, only_gfn = functions[0]
            return only_gfn
        # Check against each intermediary layer input output function pairs
        for (scope_in, gfn_in), (scope_out, gfn_out) in zip(functions[:-1], functions[1:]):
            # For stage F => G, the composition G(F(.)) must work, which means
            # the number of outputs for F is equal to the number of inputs for G
            assert len(gfn_in.output_names) == len(gfn_out.input_names), \
                "graph function link {} -> {} require compatible layers".format(scope_in, scope_out)
            # We currently only support single input/output for intermediary stages
            # The functions could still take multi-dimensional tensor, but only one
            if len(gfn_out.input_names) != 1:
                raise NotImplementedError("Only support single input/output for intermediary layers")

        # Acquire initial placeholders' properties
        # We want the input names of the merged function to stay outside of any scope.
        # In this way users of the merged function could still use the input names
        # of the first function to get the correct input tensors.
        with IsolatedSession() as issn:
            _, first_gfn = functions[0]
            feeds, _ = issn.importGraphFunction(first_gfn, prefix='')
            first_input_info = [(tnsr.dtype, tnsr.shape, tfx.op_name(issn.graph, tnsr))
                                for tnsr in feeds]
            # TODO: make sure that this graph is not reused to prevent name conflict
            # Report error if the graph is not manipulated by anyone else
            # https://www.tensorflow.org/api_docs/python/tf/Graph#finalize
            issn.graph.finalize()

        # Build a linear chain of all the provided functions
        with IsolatedSession() as issn:
            first_inputs = [tf.placeholder(dtype, shape, name)
                            for (dtype, shape, name) in first_input_info]
            prev_outputs = first_inputs

            for idx, (scope, gfn) in enumerate(functions):
                # Give a scope to each function to avoid name conflict
                if scope is None or not scope.strip():
                    scope = 'GFN-BLK-{}'.format(idx)
                logger.info('merge: stage %s, scope %s', idx, scope)
                # Feed the outputs of the previous stage into the inputs of this one
                input_map = dict(zip(gfn.input_names, prev_outputs))
                _, fetches = issn.importGraphFunction(
                    gfn, prefix=scope, input_map=input_map)
                prev_outputs = fetches

            # Add a non-scoped output name as the output node
            # So that users can still use the output name of the last function's output
            # to fetch the correct output tensors
            last_output_names = functions[-1][1].output_names
            last_outputs = [tf.identity(tnsr, name=name)
                            for tnsr, name in zip(prev_outputs, last_output_names)]

            gfn = issn.asGraphFunction(first_inputs, last_outputs)

        return gfn
예제 #32
0
def makeGraphUDF(graph,
                 udf_name,
                 fetches,
                 feeds_to_fields_map=None,
                 blocked=False,
                 register=True):
    """
    Turn a TensorFlow Graph into a Spark SQL UserDefinedFunction.

    The example below builds a UDF that adds 3 to the value it is given.

    .. code-block:: python

        from sparkdl.graph.tensorframes_udf import makeGraphUDF

        with IsolatedSession() as issn:
            x = tf.placeholder(tf.double, shape=[], name="input_x")
            z = tf.add(x, 3, name='z')
            makeGraphUDF(issn.graph, "my_tensorflow_udf", [z])

    Once registered, the UDF can be called from a SQL query.

    .. code-block:: python

        df = spark.createDataFrame([Row(xCol=float(x)) for x in range(100)])
        df.createOrReplaceTempView("my_float_table")
        spark.sql("select my_tensorflow_udf(xCol) as zCol from my_float_table").show()

    :param graph: :py:class:`tf.Graph`, a TensorFlow Graph
    :param udf_name: str, name of the SQL UDF
    :param fetches: list, output tensors of the graph
    :param feeds_to_fields_map: a dict of str -> str,
                                mapping the name of a placeholder in the TensorFlow graph (key)
                                to the name of a column in the dataframe (value).
                                For now, only the top-level fields in a dataframe are supported.

                                .. note:: A placeholder that is missing from this mapping
                                          is assumed to read from the input column
                                          that has the same name as the placeholder.

    :param blocked: bool, if True, TensorFrames executes the function over
                    blocks/batches of rows, which should provide better performance.
                    Otherwise the function is applied to individual rows.
    :param register: bool, if True, the SQL UDF is registered and therefore
                     accessible in SQL queries; if False it is only created.
    :return: JVM function handle object
    """
    graph = tfx.validated_graph(graph)
    # pylint: disable=W0212
    # TODO: Work with TensorFlow's registered expansions
    # https://github.com/tensorflow/tensorflow/blob/v1.1.0/tensorflow/python/client/session.py#L74
    # TODO: Most part of this implementation might be better off moved to TensorFrames
    builder = JVMAPI.createTensorFramesModelBuilder()
    tfs.core._add_graph(graph, builder)

    # Names and shapes of the requested outputs
    output_names = [tfx.tensor_name(out_tnsr, graph) for out_tnsr in fetches]
    output_shapes = [tfx.get_shape(out_tnsr, graph) for out_tnsr in fetches]

    # Walk over the graph nodes and collect every input-less placeholder with its shape
    input_names = []
    input_shapes = []
    for node_def in graph.as_graph_def(add_shapes=True).node:
        # pylint: disable=len-as-condition
        # todo: refactor if not(node.input) and ...
        is_placeholder = len(node_def.input) == 0 and str(node_def.op) == 'Placeholder'
        if not is_placeholder:
            continue
        input_name = tfx.tensor_name(node_def.name, graph)
        input_tnsr = graph.get_tensor_by_name(input_name)
        try:
            input_shape = tfx.get_shape(input_tnsr, graph)
        except ValueError:
            # Placeholders for which get_shape raises ValueError are left out
            continue
        input_names.append(input_name)
        input_shapes.append(input_shape)

    # Hand the shapes of fetches and placeholders, then the fetches, to TensorFrames
    all_names = output_names + input_names
    all_shapes = output_shapes + input_shapes
    builder.shape(all_names, all_shapes)
    builder.fetches(output_names)

    # Hand over the placeholder -> DataFrame column mapping together with
    # the op names of all collected placeholders
    input_op_names = [tfx.op_name(input_name, graph) for input_name in input_names]
    tfs.core._add_inputs(builder, feeds_to_fields_map, input_op_names)

    if not register:
        return builder.makeUDF(udf_name, blocked)
    return builder.registerUDF(udf_name, blocked)
예제 #33
0
 def test_invalid_op_name_inputs_with_wrong_types(self, data, description):
     """ op_name must raise TypeError when given an input of a wrong type """
     expect_type_error = self.assertRaises(TypeError, msg=description)
     with expect_type_error:
         tfx.op_name(data)