def _transform(self, dataset):
    if len([field for field in dataset.schema if field.dataType == DoubleType()]) > 0:
        logger.warn("Detected DoubleType columns in dataframe passed to transform(). In "
                    "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                    "fed to input tensors of type tf.float64. To feed dataframe data to "
                    "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                    "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)
        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
def _transform(self, dataset):
    if any([field.dataType == DoubleType() for field in dataset.schema]):
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)
        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
def _transform(self, dataset):
    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)
        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {
            self._getSparkDlOpName(tnsr_name): col_name
            for col_name, tnsr_name in input_mapping
        }
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
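# --- Added example (hedged) ---
# A minimal, standalone sketch of the tfs.analyze / tfs.map_blocks pattern used by the
# _transform variants above. It assumes an active SparkSession named `spark` with
# TensorFrames available; the column and tensor names are hypothetical.
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

def example_map_blocks(spark):
    df = spark.createDataFrame([Row(x=float(i)) for i in range(4)])
    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(df)
        # The feed_dict maps the placeholder op name to the DataFrame column name,
        # mirroring the feed_dict construction in _transform above.
        x = tf.placeholder(tf.double, shape=[None], name='x')
        z = tf.add(x, 3.0, name='z')
        out_df = tfs.map_blocks([z], analyzed_df, feed_dict={'x': 'x'})
    return out_df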
def _optimize_for_inference(self):
    graph_def = self.getTFInputGraph().graph_def
    # Get data types of input placeholders
    placeholder_types = self._get_placeholder_types(graph_def)
    # Strip away graph nodes not used in computing the tensors with the specified output names
    input_names = [tfx.op_name(tnsr_name) for _, tnsr_name in self.getInputMapping()]
    output_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in self.getOutputMapping()]
    return infr_opt.optimize_for_inference(graph_def, input_names, output_names,
                                           placeholder_types)
def test_simple_keras_udf(self):
    """ Simple Keras sequential model """
    # Notice that the input layer for an image UDF model
    # must be of shape (width, height, numChannels).
    # The leading batch size is taken care of by Keras.
    with IsolatedSession(using_keras=True) as issn:
        model = Sequential()
        model.add(Flatten(input_shape=(640, 480, 3)))
        model.add(Dense(units=64))
        model.add(Activation('relu'))
        model.add(Dense(units=10))
        model.add(Activation('softmax'))
        # Initialize the variables
        init_op = tf.global_variables_initializer()
        issn.run(init_op)
        makeGraphUDF(issn.graph,
                     'my_keras_model_udf',
                     model.outputs,
                     {tfx.op_name(issn.graph, model.inputs[0]): 'image_col'})
        # Run the training procedure
        # Export the graph in this IsolatedSession as a GraphFunction
        # gfn = issn.asGraphFunction(model.inputs, model.outputs)
        fh_name = "test_keras_simple_sequential_model"
        registerKerasImageUDF(fh_name, model)

    self._assert_function_exists(fh_name)
def _getSparkDlOpName(self, tensor_name):
    """
    Given a tensor name, returns the name of the op generating the tensor, prefixed with
    a special scope indicating that the op has been added by Sparkdl.
    """
    op_name = tfx.op_name(tensor_name)
    return tfx.add_scope_to_name(scope=self.SPARKDL_OP_SCOPE, name=op_name)
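# --- Added example (hedged) ---
# A small sketch of the naming behavior _getSparkDlOpName relies on, assuming the tfx
# helpers behave as used throughout this file: op_name strips the ':<index>' output
# suffix from a tensor name, and add_scope_to_name prefixes a scope. The scope string
# and expected output below are illustrative assumptions.
from sparkdl.graph import utils as tfx

assert tfx.op_name('input_x:0') == 'input_x'  # tensor name -> op name
scoped_name = tfx.add_scope_to_name(scope='sparkdl_op_scope', name='input_x')
# Expected to look like 'sparkdl_op_scope/input_x'.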
def exec_gfn_spimg_decode(spimg_dict, img_dtype):
    gfn = gfac.buildSpImageConverter(img_dtype)
    with IsolatedSession() as issn:
        feeds, fetches = issn.importGraphFunction(gfn, prefix="")
        feed_dict = dict(
            (tnsr, spimg_dict[tfx.op_name(issn.graph, tnsr)]) for tnsr in feeds)
        img_out = issn.run(fetches[0], feed_dict=feed_dict)
    return img_out
def _optimize_for_inference(self): """ Optimize the graph for inference """ gin = self.getTFInputGraph() input_mapping = self.getInputMapping() output_mapping = self.getOutputMapping() input_node_names = [tfx.op_name(tnsr_name) for _, tnsr_name in input_mapping] output_node_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping] # NOTE(phi-dbq): Spark DataFrame assumes float64 as default floating point type opt_gdef = infr_opt.optimize_for_inference(gin.graph_def, input_node_names, output_node_names, # TODO: below is the place to change for # the `float64` data type issue. tf.float64.as_datatype_enum) return opt_gdef
def test_pipeline(self):
    """ Pipeline should provide correct function composition """
    img_fpaths = glob(os.path.join(_getSampleJPEGDir(), '*.jpg'))

    xcpt_model = Xception(weights="imagenet")
    stages = [('spimage', gfac.buildSpImageConverter(SparkMode.RGB_FLOAT32)),
              ('xception', GraphFunction.fromKeras(xcpt_model))]
    piped_model = GraphFunction.fromList(stages)

    for fpath in img_fpaths:
        target_size = tuple(xcpt_model.input.shape.as_list()[1:-1])
        img = load_img(fpath, target_size=target_size)
        img_arr = np.expand_dims(img_to_array(img), axis=0)
        img_input = xcpt.preprocess_input(img_arr)
        preds_ref = xcpt_model.predict(img_input)

        spimg_input_dict = imageArrayToStruct(img_input).asDict()
        spimg_input_dict['data'] = bytes(spimg_input_dict['data'])
        with IsolatedSession() as issn:
            # Need blank import scope name so that spimg fields match the input names
            feeds, fetches = issn.importGraphFunction(piped_model, prefix="")
            feed_dict = dict((tnsr, spimg_input_dict[tfx.op_name(tnsr, issn.graph)])
                             for tnsr in feeds)
            preds_tgt = issn.run(fetches[0], feed_dict=feed_dict)
            # Uncomment the line below to see the graph
            # tfx.write_visualization_html(issn.graph,
            #                              NamedTemporaryFile(prefix="gdef", suffix=".html").name)

        self.assertTrue(np.all(preds_tgt == preds_ref))
def test_pipeline(self):
    """ Pipeline should provide correct function composition """
    img_fpaths = glob(os.path.join(_getSampleJPEGDir(), '*.jpg'))

    xcpt_model = Xception(weights="imagenet")
    stages = [('spimage', gfac.buildSpImageConverter(SparkMode.RGB_FLOAT32)),
              ('xception', GraphFunction.fromKeras(xcpt_model))]
    piped_model = GraphFunction.fromList(stages)

    for fpath in img_fpaths:
        target_size = tuple(xcpt_model.input.shape.as_list()[1:-1])
        img = load_img(fpath, target_size=target_size)
        img_arr = np.expand_dims(img_to_array(img), axis=0)
        img_input = xcpt.preprocess_input(img_arr)
        preds_ref = xcpt_model.predict(img_input)

        spimg_input_dict = imageArrayToStruct(img_input).asDict()
        spimg_input_dict['data'] = bytes(spimg_input_dict['data'])
        with IsolatedSession() as issn:
            # Need blank import scope name so that spimg fields match the input names
            feeds, fetches = issn.importGraphFunction(piped_model, prefix="")
            feed_dict = dict((tnsr, spimg_input_dict[tfx.op_name(issn.graph, tnsr)])
                             for tnsr in feeds)
            preds_tgt = issn.run(fetches[0], feed_dict=feed_dict)
            # Uncomment the line below to see the graph
            # tfx.write_visualization_html(issn.graph,
            #                              NamedTemporaryFile(prefix="gdef", suffix=".html").name)

        self.assertTrue(np.all(preds_tgt == preds_ref))
def exec_gfn_spimg_decode(spimg_dict, img_dtype):
    gfn = gfac.buildSpImageConverter('BGR', img_dtype)
    with IsolatedSession() as issn:
        feeds, fetches = issn.importGraphFunction(gfn, prefix="")
        feed_dict = dict(
            (tnsr, spimg_dict[tfx.op_name(tnsr, issn.graph)]) for tnsr in feeds)
        img_out = issn.run(fetches[0], feed_dict=feed_dict)
    return img_out
def test_get_graph_elements(self):
    """ Fetching graph elements by names and other graph elements """
    with IsolatedSession() as issn:
        x = tf.placeholder(tf.double, shape=[], name="x")
        z = tf.add(x, 3, name='z')
        g = issn.graph

        self.assertEqual(tfx.get_tensor(g, z), z)
        self.assertEqual(tfx.get_tensor(g, x), x)
        self.assertEqual(g.get_tensor_by_name("x:0"), tfx.get_tensor(g, x))
        self.assertEqual("x:0", tfx.tensor_name(g, x))
        self.assertEqual(g.get_operation_by_name("x"), tfx.get_op(g, x))
        self.assertEqual("x", tfx.op_name(g, x))
        self.assertEqual("z", tfx.op_name(g, z))
        self.assertEqual(tfx.tensor_name(g, z), "z:0")
        self.assertEqual(tfx.tensor_name(g, x), "x:0")
def _gen_tensor_op_string_input_tests():
    op_name = 'someOp'
    for tnsr_idx in [0, 1, 2, 3, 5, 8, 15, 17]:
        tnsr_name = '{}:{}'.format(op_name, tnsr_idx)
        yield TestCase(data=(op_name, tfx.op_name(tnsr_name)),
                       description='test tensor name to op name')
        yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name)),
                       description='test tensor name to tensor name')
def _loadTFGraph(self):
    with KSessionWrap() as (sess, g):
        assert K.backend() == "tensorflow", \
            "Keras backend is not tensorflow but KerasImageTransformer only supports " + \
            "tensorflow-backed Keras models."
        with g.as_default():
            K.set_learning_phase(0)  # Testing phase
            model = load_model(self.getModelFile())
            out_op_name = tfx.op_name(g, model.output)
            self._inputTensor = model.input.name
            self._outputTensor = model.output.name
            return tfx.strip_and_freeze_until([out_op_name], g, sess, return_graph=True)
def _loadTFGraph(self):
    with KSessionWrap() as (sess, graph):
        assert K.backend() == "tensorflow", \
            "Keras backend is not tensorflow but KerasImageTransformer only supports " + \
            "tensorflow-backed Keras models."
        with graph.as_default():
            K.set_learning_phase(0)  # Testing phase
            model = load_model(self.getModelFile())
            out_op_name = tfx.op_name(model.output, graph)
            self._inputTensor = model.input.name
            self._outputTensor = model.output.name
            return tfx.strip_and_freeze_until([out_op_name], graph, sess, return_graph=True)
def _optimize_for_inference(self):
    gin = self.getTFInputGraph()
    # Inject cast ops to convert float64 input fed from Spark into the datatypes of the
    # Graph's input nodes.
    graphdef_with_casts = self._addCastOps(gin.graph_def)

    # Strip away graph nodes not used in computing the tensors with the specified output names
    input_names = [self._getSparkDlOpName(tnsr_name)
                   for _, tnsr_name in self.getInputMapping()]
    output_names = [tfx.op_name(tnsr_name)
                    for tnsr_name, _ in self.getOutputMapping()]
    opt_gdef = infr_opt.optimize_for_inference(graphdef_with_casts,
                                               input_names,
                                               output_names,
                                               tf.float64.as_datatype_enum)
    return opt_gdef
def _loadTFGraph(self, sess, graph):
    """
    Loads the Keras model into memory, then uses the passed-in session to load the
    model's inference-related ops into the passed-in Tensorflow graph.

    :return: A tuple (graph, input_name, output_name) where graph is the TF graph
        corresponding to the Keras model's inference subgraph, input_name is the name of
        the Keras model's input tensor, and output_name is the name of the Keras model's
        output tensor.
    """
    keras_backend = K.backend()
    assert keras_backend == "tensorflow", \
        "Only tensorflow-backed Keras models are supported, tried to load Keras model " \
        "with backend %s." % (keras_backend)
    with graph.as_default():
        K.set_learning_phase(0)  # Inference phase
        model = load_model(self.getModelFile())
        out_op_name = tfx.op_name(model.output, graph)
        stripped_graph = tfx.strip_and_freeze_until([out_op_name], graph, sess,
                                                    return_graph=True)
        return stripped_graph, model.input.name, model.output.name
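# --- Added example (hedged) ---
# A Keras-free sketch of tfx.strip_and_freeze_until as called in _loadTFGraph above:
# freeze the variables held in `sess` into constants and strip nodes not needed to
# compute the listed outputs. The toy graph is an illustrative assumption.
import tensorflow as tf
from sparkdl.graph import utils as tfx

g = tf.Graph()
with g.as_default(), tf.Session(graph=g) as sess:
    x = tf.placeholder(tf.float32, shape=[None, 4], name='x')
    w = tf.Variable(tf.ones([4, 2]), name='w')
    _ = tf.matmul(x, w, name='y')
    sess.run(tf.global_variables_initializer())
    frozen_graph = tfx.strip_and_freeze_until(['y'], g, sess, return_graph=True)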
def _transform(self, dataset):
    graph = self.getGraph()
    composed_graph = self._addReshapeLayers(graph, self._getImageDtype(dataset))
    final_graph = self._stripGraph(composed_graph)

    with final_graph.as_default():
        image = dataset[self.getInputCol()]
        image_df_exploded = (dataset
                             .withColumn("__sdl_image_height", image.height)
                             .withColumn("__sdl_image_width", image.width)
                             .withColumn("__sdl_image_nchannels", image.nChannels)
                             .withColumn("__sdl_image_data", image.data))

        final_output_name = self._getFinalOutputTensorName()
        output_tensor = final_graph.get_tensor_by_name(final_output_name)
        final_df = (tfs.map_rows([output_tensor], image_df_exploded,
                                 feed_dict={"height": "__sdl_image_height",
                                            "width": "__sdl_image_width",
                                            "num_channels": "__sdl_image_nchannels",
                                            "image_buffer": "__sdl_image_data"})
                    .drop("__sdl_image_height", "__sdl_image_width",
                          "__sdl_image_nchannels", "__sdl_image_data"))

        tfs_output_name = tfx.op_name(output_tensor, final_graph)
        original_output_name = self._getOriginalOutputTensorName()
        output_shape = final_graph.get_tensor_by_name(original_output_name).shape
        output_mode = self.getOrDefault(self.outputMode)
        # TODO: support non-1d tensors (return np.array).
        if output_mode == "image":
            return self._convertOutputToImage(final_df, tfs_output_name, output_shape)
        else:
            assert output_mode == "vector", "Unknown output mode: %s" % output_mode
            return self._convertOutputToVector(final_df, tfs_output_name)
def _addCastOps(self, user_graph_def):
    """
    Given a GraphDef object corresponding to a user-specified graph G, creates a copy G'
    of G with ops injected before each input node. The injected ops allow the input nodes
    of G' to accept tf.float64 input fed from Spark, casting float64 input into the
    datatype requested by each input node.

    :return: GraphDef representing the copied, modified graph.
    """
    # Load user-specified graph into memory
    user_graph = tf.Graph()
    with user_graph.as_default():
        tf.import_graph_def(user_graph_def, name="")

    # Build a subgraph containing our injected ops
    # TODO: Cheap optimization: if all input tensors are of type float64, just do nothing here
    injected_op_subgraph = tf.Graph()
    # Maps names of input tensors in our original graph to outputs of the injected-op subgraph
    input_map = {}
    with injected_op_subgraph.as_default():
        with tf.name_scope(self.SPARKDL_OP_SCOPE):
            for _, orig_tensor_name in self.getInputMapping():
                orig_tensor = tfx.get_tensor(orig_tensor_name, user_graph)
                # Create placeholder with same shape as original input tensor, but that
                # accepts float64 input from Spark.
                spark_placeholder = tf.placeholder(tf.float64,
                                                   shape=orig_tensor.shape,
                                                   name=tfx.op_name(orig_tensor_name))
                # If the original tensor was of type float64, just pass through the Spark input
                if orig_tensor.dtype == tf.float64:
                    input_map[orig_tensor_name] = spark_placeholder
                # Otherwise, cast the Spark input to the datatype of the original tensor
                else:
                    input_map[orig_tensor_name] = tf.cast(spark_placeholder,
                                                          dtype=orig_tensor.dtype)
        tf.import_graph_def(graph_def=user_graph_def, input_map=input_map, name="")
    return injected_op_subgraph.as_graph_def(add_shapes=True)
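# --- Added example (hedged) ---
# A standalone TF 1.x sketch of the cast-injection idea in _addCastOps: wrap a float32
# graph so it can be fed float64 data, using only tf.import_graph_def's input_map.
# The names ('x', 'y', 'injected_cast') are hypothetical.
import tensorflow as tf

# "User" graph with a float32 input.
user_graph = tf.Graph()
with user_graph.as_default():
    x = tf.placeholder(tf.float32, shape=[None, 3], name='x')
    _ = tf.identity(2.0 * x, name='y')
user_gdef = user_graph.as_graph_def()

# Wrapper graph: a float64 placeholder is cast to float32 and spliced in for 'x:0'.
wrapped = tf.Graph()
with wrapped.as_default():
    with tf.name_scope('injected_cast'):
        x64 = tf.placeholder(tf.float64, shape=[None, 3], name='x')
        input_map = {'x:0': tf.cast(x64, tf.float32)}
    tf.import_graph_def(user_gdef, input_map=input_map, name='')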
def _transform(self, dataset):
    graph = self.getGraph()
    composed_graph = self._addReshapeLayers(graph, self._getImageDtype(dataset))
    final_graph = self._stripGraph(composed_graph)

    with final_graph.as_default():  # pylint: disable=not-context-manager
        image = dataset[self.getInputCol()]
        image_df_exploded = (dataset
                             .withColumn("__sdl_image_height", image.height)
                             .withColumn("__sdl_image_width", image.width)
                             .withColumn("__sdl_image_nchannels", image.nChannels)
                             .withColumn("__sdl_image_data", image.data)
                             )  # yapf: disable

        final_output_name = self._getFinalOutputTensorName()
        output_tensor = final_graph.get_tensor_by_name(final_output_name)
        final_df = (
            tfs.map_rows([output_tensor], image_df_exploded,
                         feed_dict={"height": "__sdl_image_height",
                                    "width": "__sdl_image_width",
                                    "num_channels": "__sdl_image_nchannels",
                                    "image_buffer": "__sdl_image_data"})
            .drop("__sdl_image_height", "__sdl_image_width",
                  "__sdl_image_nchannels", "__sdl_image_data")
        )  # yapf: disable

        tfs_output_name = tfx.op_name(output_tensor, final_graph)
        original_output_name = self._getOriginalOutputTensorName()
        output_shape = final_graph.get_tensor_by_name(original_output_name).shape
        output_mode = self.getOrDefault(self.outputMode)
        # TODO: support non-1d tensors (return np.array).
        if output_mode == "image":
            return self._convertOutputToImage(final_df, tfs_output_name, output_shape)
        else:
            assert output_mode == "vector", "Unknown output mode: %s" % output_mode
            return self._convertOutputToVector(final_df, tfs_output_name)
def _gen_valid_tensor_op_input_combos():
    op_name = 'someConstOp'
    tnsr_name = '{}:0'.format(op_name)
    tnsr = tf.constant(1427.08, name=op_name)
    graph = tnsr.graph

    # Test for op_name
    yield TestCase(data=(op_name, tfx.op_name(tnsr)),
                   description='get op name from tensor (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr, graph)),
                   description='get op name from tensor (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name)),
                   description='get op name from tensor name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr_name, graph)),
                   description='get op name from tensor name (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op)),
                   description='get op name from op (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(tnsr.op, graph)),
                   description='get op name from op (with graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name)),
                   description='get op name from op name (no graph)')
    yield TestCase(data=(op_name, tfx.op_name(op_name, graph)),
                   description='get op name from op name (with graph)')

    # Test for tensor_name
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr)),
                   description='get tensor name from tensor (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr, graph)),
                   description='get tensor name from tensor (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name)),
                   description='get tensor name from tensor name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr_name, graph)),
                   description='get tensor name from tensor name (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op)),
                   description='get tensor name from op (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(tnsr.op, graph)),
                   description='get tensor name from op (with graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name)),
                   description='get tensor name from op name (no graph)')
    yield TestCase(data=(tnsr_name, tfx.tensor_name(op_name, graph)),
                   description='get tensor name from op name (with graph)')

    # Test for get_tensor
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr, graph)),
                   description='get tensor from tensor')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr_name, graph)),
                   description='get tensor from tensor name')
    yield TestCase(data=(tnsr, tfx.get_tensor(tnsr.op, graph)),
                   description='get tensor from op')
    yield TestCase(data=(tnsr, tfx.get_tensor(op_name, graph)),
                   description='get tensor from op name')

    # Test for get_op
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr, graph)),
                   description='get op from tensor')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr_name, graph)),
                   description='get op from tensor name')
    yield TestCase(data=(tnsr.op, tfx.get_op(tnsr.op, graph)),
                   description='get op from op')
    yield TestCase(data=(tnsr.op, tfx.get_op(op_name, graph)),
                   description='get op from op name')
def makeGraphUDF(graph, udf_name, fetches, feeds_to_fields_map=None, blocked=False, register=True):
    """
    Create a Spark SQL UserDefinedFunction from a given TensorFlow Graph

    The following example creates a UDF that takes input from a DataFrame column
    and produces a prediction.

    .. code-block:: python

        from sparkdl.graph.tensorframes_udf import makeUDF

        with IsolatedSession() as issn:
            x = tf.placeholder(tf.double, shape=[], name="input_x")
            z = tf.add(x, 3, name='z')
            makeGraphUDF(issn.graph, "my_tensorflow_udf", [z])

    Then this function can be used in a SQL query.

    .. code-block:: python

        df = spark.createDataFrame([Row(xCol=float(x)) for x in range(100)])
        df.createOrReplaceTempView("my_float_table")
        spark.sql("select my_tensorflow_udf(xCol) as zCol from my_float_table").show()

    :param graph: :py:class:`tf.Graph`, a TensorFlow Graph
    :param udf_name: str, name of the SQL UDF
    :param fetches: list, output tensors of the graph
    :param feeds_to_fields_map: a dict of str -> str,
        The key is the name of a placeholder in the current TensorFlow graph of computation.
        The value is the name of a column in the dataframe.
        For now, only the top-level fields in a dataframe are supported.

        .. note:: For any placeholder that is not specified in the feed dictionary,
                  the name of the input column is assumed to be the same as that of the
                  placeholder.

    :param blocked: bool, if set to True, TensorFrames will execute the function over
                    blocks/batches of rows, which should provide better performance;
                    otherwise, the function is applied to individual rows
    :param register: bool, if set to True, the SQL UDF will be registered,
                     making it accessible in SQL queries
    :return: JVM function handle object
    """
    graph = tfx.validated_graph(graph)
    # pylint: disable=W0212
    # TODO: Work with TensorFlow's registered expansions
    # https://github.com/tensorflow/tensorflow/blob/v1.1.0/tensorflow/python/client/session.py#L74
    # TODO: Most part of this implementation might be better off moved to TensorFrames
    jvm_builder = JVMAPI.createTensorFramesModelBuilder()
    tfs.core._add_graph(graph, jvm_builder)

    # Obtain the fetches and their shapes
    fetch_names = [tfx.tensor_name(graph, fetch) for fetch in fetches]
    fetch_shapes = [tfx.get_shape(graph, fetch) for fetch in fetches]

    # Traverse the graph nodes and obtain all the placeholders and their shapes
    placeholder_names = []
    placeholder_shapes = []
    for node in graph.as_graph_def(add_shapes=True).node:
        if len(node.input) == 0 and str(node.op) == 'Placeholder':
            tnsr_name = tfx.tensor_name(graph, node.name)
            tnsr = graph.get_tensor_by_name(tnsr_name)
            try:
                tnsr_shape = tfx.get_shape(graph, tnsr)
                placeholder_names.append(tnsr_name)
                placeholder_shapes.append(tnsr_shape)
            except ValueError:
                pass

    # Pass fetches and placeholders to TensorFrames
    jvm_builder.shape(fetch_names + placeholder_names, fetch_shapes + placeholder_shapes)
    jvm_builder.fetches(fetch_names)
    # Pass feeds to TensorFrames
    placeholder_op_names = [tfx.op_name(graph, name) for name in placeholder_names]
    # Pass the graph-input-to-DataFrame-column mapping and additional placeholder names
    tfs.core._add_inputs(jvm_builder, feeds_to_fields_map, placeholder_op_names)

    if register:
        return jvm_builder.registerUDF(udf_name, blocked)
    else:
        return jvm_builder.makeUDF(udf_name, blocked)
def _getFinalOutputOpName(self):
    return tfx.op_name(self._getFinalOutputTensorName())
def fromList(cls, functions):
    """
    Construct a single GraphFunction from a list of graph functions.
    Each function in the list corresponds to a stage.
    Each function is also scoped by a scope name, in order to avoid variable name
    conflicts and also to make the graph cleaner for visualization.
    If a scope name is not provided, we generate one as `GFN-BLK-<stage_index>`.

    The inputs and outputs are picked out of the scopes, so that users will still be
    able to call the function with the expected input/output names.

    It is assumed that there is only one input and one output in the intermediary layers.

    :param functions: a list of tuples (scope name, GraphFunction object).
    """
    assert len(functions) >= 1, ("must provide at least one function", functions)
    if 1 == len(functions):
        return functions[0]

    # Check each intermediary layer's input/output function pairs
    for (scope_in, gfn_in), (scope_out, gfn_out) in zip(functions[:-1], functions[1:]):
        # For stage F => G, the composition G(F(.)) must work, which means
        # the number of outputs for F is equal to the number of inputs for G
        assert len(gfn_in.output_names) == len(gfn_out.input_names), \
            "graph function link {} -> {} requires compatible layers".format(scope_in, scope_out)
        # We currently only support single input/output for intermediary stages.
        # The functions could still take multi-dimensional tensors, but only one.
        if len(gfn_out.input_names) != 1:
            raise NotImplementedError(
                "Only support single input/output for intermediary layers")

    # Acquire initial placeholders' properties.
    # We want the input names of the merged function to stay unscoped, so that users
    # of the merged function can still use the input names of the first function
    # to get the correct input tensors.
    first_input_info = []
    with IsolatedSession() as issn:
        _, first_gfn = functions[0]
        feeds, _ = issn.importGraphFunction(first_gfn, prefix='')
        for tnsr in feeds:
            name = tfx.op_name(tnsr, issn.graph)
            first_input_info.append((tnsr.dtype, tnsr.shape, name))
        # TODO: make sure that this graph is not reused to prevent name conflict
        # Report an error if the graph is manipulated by anyone else
        # https://www.tensorflow.org/api_docs/python/tf/Graph#finalize
        issn.graph.finalize()

    # Build a linear chain of all the provided functions
    with IsolatedSession() as issn:
        first_inputs = [tf.placeholder(dtype, shape, name)
                        for (dtype, shape, name) in first_input_info]
        prev_outputs = first_inputs

        for idx, (scope, gfn) in enumerate(functions):
            # Give a scope to each function to avoid name conflicts
            if scope is None or len(scope.strip()) == 0:
                scope = 'GFN-BLK-{}'.format(idx)
            _msg = 'merge: stage {}, scope {}'.format(idx, scope)
            logger.info(_msg)
            input_map = dict(zip(gfn.input_names, prev_outputs))
            _, fetches = issn.importGraphFunction(gfn, prefix=scope, input_map=input_map)
            prev_outputs = fetches

        # Add non-scoped output names as the output nodes, so that users can still
        # use the last function's output names to fetch the correct output tensors.
        last_output_names = functions[-1][1].output_names
        last_outputs = []
        for tnsr, name in zip(prev_outputs, last_output_names):
            last_outputs.append(tf.identity(tnsr, name=name))

        gfn = issn.asGraphFunction(first_inputs, last_outputs)

    return gfn
def test_invalid_op_name_inputs_with_wrong_types(self, data, description):
    """ Must fail when provided wrong types """
    with self.assertRaises(TypeError, msg=description):
        tfx.op_name(data)
def fromList(cls, functions):
    """
    Construct a single GraphFunction from a list of graph functions.
    Each function in the list corresponds to a stage.
    Each function is also scoped by a scope name, in order to avoid variable name
    conflicts and also to make the graph cleaner for visualization.
    If a scope name is not provided, we generate one as `GFN-BLK-<stage_index>`.

    The inputs and outputs are picked out of the scopes, so that users will still be
    able to call the function with the expected input/output names.

    It is assumed that there is only one input and one output in the intermediary layers.

    :param functions: a list of tuples (scope name, GraphFunction object).
    """
    assert len(functions) >= 1, ("must provide at least one function", functions)
    if 1 == len(functions):
        return functions[0]

    # Check each intermediary layer's input/output function pairs
    for (scope_in, gfn_in), (scope_out, gfn_out) in zip(functions[:-1], functions[1:]):
        # For stage F => G, the composition G(F(.)) must work, which means
        # the number of outputs for F is equal to the number of inputs for G
        assert len(gfn_in.output_names) == len(gfn_out.input_names), \
            "graph function link {} -> {} requires compatible layers".format(scope_in, scope_out)
        # We currently only support single input/output for intermediary stages.
        # The functions could still take multi-dimensional tensors, but only one.
        if len(gfn_out.input_names) != 1:
            raise NotImplementedError("Only support single input/output for intermediary layers")

    # Acquire initial placeholders' properties.
    # We want the input names of the merged function to stay unscoped, so that users
    # of the merged function can still use the input names of the first function
    # to get the correct input tensors.
    first_input_info = []
    with IsolatedSession() as issn:
        _, first_gfn = functions[0]
        feeds, _ = issn.importGraphFunction(first_gfn, prefix='')
        for tnsr in feeds:
            name = tfx.op_name(issn.graph, tnsr)
            first_input_info.append((tnsr.dtype, tnsr.shape, name))
        # TODO: make sure that this graph is not reused to prevent name conflict
        # Report an error if the graph is manipulated by anyone else
        # https://www.tensorflow.org/api_docs/python/tf/Graph#finalize
        issn.graph.finalize()

    # Build a linear chain of all the provided functions
    with IsolatedSession() as issn:
        first_inputs = [tf.placeholder(dtype, shape, name)
                        for (dtype, shape, name) in first_input_info]
        prev_outputs = first_inputs

        for idx, (scope, gfn) in enumerate(functions):
            # Give a scope to each function to avoid name conflicts
            if scope is None or len(scope.strip()) == 0:
                scope = 'GFN-BLK-{}'.format(idx)
            _msg = 'merge: stage {}, scope {}'.format(idx, scope)
            logger.info(_msg)
            input_map = dict(zip(gfn.input_names, prev_outputs))
            _, fetches = issn.importGraphFunction(gfn, prefix=scope, input_map=input_map)
            prev_outputs = fetches

        # Add non-scoped output names as the output nodes, so that users can still
        # use the last function's output names to fetch the correct output tensors.
        last_output_names = functions[-1][1].output_names
        last_outputs = []
        for tnsr, name in zip(prev_outputs, last_output_names):
            last_outputs.append(tf.identity(tnsr, name=name))

        gfn = issn.asGraphFunction(first_inputs, last_outputs)

    return gfn
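# --- Added example (hedged) ---
# A minimal sketch of composing two single-input/single-output stages with
# GraphFunction.fromList, following the (scope name, GraphFunction) convention
# documented above. The import path and stage contents are illustrative assumptions.
import tensorflow as tf
from sparkdl.graph.builder import IsolatedSession, GraphFunction

with IsolatedSession() as issn:
    x = tf.placeholder(tf.float64, shape=[], name='x')
    y = tf.add(x, 1.0, name='y')
    stage_add = issn.asGraphFunction([x], [y])

with IsolatedSession() as issn:
    u = tf.placeholder(tf.float64, shape=[], name='u')
    v = tf.multiply(u, 2.0, name='v')
    stage_mul = issn.asGraphFunction([u], [v])

# The merged function keeps the first stage's input name ('x') and the last stage's
# output name ('v'), as the docstring above describes.
piped = GraphFunction.fromList([('add_one', stage_add), ('double', stage_mul)])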
def makeGraphUDF(graph, udf_name, fetches, feeds_to_fields_map=None, blocked=False, register=True):
    """
    Create a Spark SQL UserDefinedFunction from a given TensorFlow Graph

    The following example creates a UDF that takes input from a DataFrame column
    and produces a prediction.

    .. code-block:: python

        from sparkdl.graph.tensorframes_udf import makeUDF

        with IsolatedSession() as issn:
            x = tf.placeholder(tf.double, shape=[], name="input_x")
            z = tf.add(x, 3, name='z')
            makeGraphUDF(issn.graph, "my_tensorflow_udf", [z])

    Then this function can be used in a SQL query.

    .. code-block:: python

        df = spark.createDataFrame([Row(xCol=float(x)) for x in range(100)])
        df.createOrReplaceTempView("my_float_table")
        spark.sql("select my_tensorflow_udf(xCol) as zCol from my_float_table").show()

    :param graph: :py:class:`tf.Graph`, a TensorFlow Graph
    :param udf_name: str, name of the SQL UDF
    :param fetches: list, output tensors of the graph
    :param feeds_to_fields_map: a dict of str -> str,
        The key is the name of a placeholder in the current TensorFlow graph of computation.
        The value is the name of a column in the dataframe.
        For now, only the top-level fields in a dataframe are supported.

        .. note:: For any placeholder that is not specified in the feed dictionary,
                  the name of the input column is assumed to be the same as that of the
                  placeholder.

    :param blocked: bool, if set to True, TensorFrames will execute the function over
                    blocks/batches of rows, which should provide better performance;
                    otherwise, the function is applied to individual rows
    :param register: bool, if set to True, the SQL UDF will be registered,
                     making it accessible in SQL queries
    :return: JVM function handle object
    """
    graph = tfx.validated_graph(graph)
    # pylint: disable=W0212
    # TODO: Work with TensorFlow's registered expansions
    # https://github.com/tensorflow/tensorflow/blob/v1.1.0/tensorflow/python/client/session.py#L74
    # TODO: Most part of this implementation might be better off moved to TensorFrames
    jvm_builder = JVMAPI.createTensorFramesModelBuilder()
    tfs.core._add_graph(graph, jvm_builder)

    # Obtain the fetches and their shapes
    fetch_names = [tfx.tensor_name(fetch, graph) for fetch in fetches]
    fetch_shapes = [tfx.get_shape(fetch, graph) for fetch in fetches]

    # Traverse the graph nodes and obtain all the placeholders and their shapes
    placeholder_names = []
    placeholder_shapes = []
    for node in graph.as_graph_def(add_shapes=True).node:
        # pylint: disable=len-as-condition
        # todo: refactor if not(node.input) and ...
        if len(node.input) == 0 and str(node.op) == 'Placeholder':
            tnsr_name = tfx.tensor_name(node.name, graph)
            tnsr = graph.get_tensor_by_name(tnsr_name)
            try:
                tnsr_shape = tfx.get_shape(tnsr, graph)
                placeholder_names.append(tnsr_name)
                placeholder_shapes.append(tnsr_shape)
            except ValueError:
                pass

    # Pass fetches and placeholders to TensorFrames
    jvm_builder.shape(fetch_names + placeholder_names, fetch_shapes + placeholder_shapes)
    jvm_builder.fetches(fetch_names)
    # Pass feeds to TensorFrames
    placeholder_op_names = [tfx.op_name(name, graph) for name in placeholder_names]
    # Pass the graph-input-to-DataFrame-column mapping and additional placeholder names
    tfs.core._add_inputs(jvm_builder, feeds_to_fields_map, placeholder_op_names)

    if register:
        return jvm_builder.registerUDF(udf_name, blocked)
    else:
        return jvm_builder.makeUDF(udf_name, blocked)