def _test(array):
    """
    Check that ``array`` survives a round trip through the image struct form.

    The array is converted with ``imageIO.imageArrayToStruct``; the struct's
    ``height``, ``width`` and ``data`` fields are compared against the array,
    and the struct is then converted back with ``imageIO.imageStructToArray``
    and compared element-wise with the input.

    ``self`` is not a parameter here; it is resolved from an enclosing scope.
    """
    # The three-way unpack requires ``array`` to have exactly three dimensions.
    n_rows, n_cols, _n_channels = array.shape

    as_struct = imageIO.imageArrayToStruct(array)
    self.assertEqual(as_struct.height, n_rows)
    self.assertEqual(as_struct.width, n_cols)
    # The struct is expected to carry the array's raw bytes unchanged.
    self.assertEqual(as_struct.data, array.tobytes())

    round_tripped = imageIO.imageStructToArray(as_struct)
    np.testing.assert_array_equal(array, round_tripped)
def _getNumpyFeaturesAndLabels(self, dataset):
    """
    Load the images referenced by ``dataset`` and return them as numpy arrays.

    Every row is collected into local memory, so the whole dataset has to fit
    in memory on the machine running this method.

    :param dataset: DataFrame holding the image URI column named by
        ``self.getInputCol()`` and, when ``labelCol`` is set to a non-empty
        name, that label column as well.
    :return: tuple ``(X, y)``. ``X`` stacks one image array per row along
        axis 0. ``y`` stacks the label arrays the same way, or is ``None``
        when no label column is configured.
    :raises ValueError: if no image could be loaded, or if a label cannot be
        converted to an array.
    :raises AssertionError: if the label column is not a ``spla.VectorUDT``.
    """
    uri_col = self.getInputCol()
    has_label = self.isDefined(self.labelCol) and self.getLabelCol() != ""
    label_col = self.getLabelCol() if has_label else None

    loaded_col = self._loadedImageCol()
    loaded_df = self.loadImagesInternal(dataset, uri_col)
    # Rows whose loaded-image column is null are dropped before collecting.
    image_df = loaded_df.dropna(subset=[loaded_col])

    # Extract features
    rows = image_df.collect()
    feature_arrays = [imageStructToArray(row[loaded_col]) for row in rows]
    if not feature_arrays:
        raise ValueError("Cannot extract any feature from dataset!")
    X = np.stack(feature_arrays, axis=0)

    if label_col is None:
        return X, None

    # Extract labels
    label_dtype = image_df.schema[label_col].dataType
    assert isinstance(label_dtype, spla.VectorUDT), \
        "must encode labels in one-hot vector format, but got {}".format(label_dtype)

    label_arrays = []
    for row in rows:
        try:
            encoded = row[label_col].toArray()
        except ValueError:
            raise ValueError("Cannot extract encoded label array")
        label_arrays.append(encoded)

    if not label_arrays:
        raise ValueError(
            "Failed to load any labels from dataset, but labels are required"
        )

    y = np.stack(label_arrays, axis=0)
    assert y.shape[0] == X.shape[0], \
        "number of features {} != number of labels {}".format(X.shape[0], y.shape[0])
    return X, y
def test_image_round_trip(self):
    """
    Decode PNG bytes through a Spark UDF and compare with the reference array.

    Builds a one-row DataFrame holding ``pngData``, decodes it with
    ``imageIO._decodeImage`` wrapped as a UDF, converts the resulting image
    struct back to an array and checks shape, dtype and contents against the
    reference ``array``.
    """
    # Test round trip: array -> png -> sparkImg -> array
    binarySchema = StructType([StructField("data", BinaryType(), False)])
    df = self.session.createDataFrame([[bytearray(pngData)]], binarySchema)

    # Convert to images
    decImg = udf(imageIO._decodeImage, imageIO.imageSchema)
    imageDF = df.select(decImg("data").alias("image"))
    row = imageDF.first()

    testArray = imageIO.imageStructToArray(row.image)
    self.assertEqual(testArray.shape, array.shape)
    self.assertEqual(testArray.dtype, array.dtype)
    # assert_array_equal reports which elements differ on failure, unlike
    # assertTrue(np.all(...)), which only reports "False is not true".
    np.testing.assert_array_equal(array, testArray)
def _getNumpyFeaturesAndLabels(self, dataset):
    """
    Collect the images (and optional labels) of ``dataset`` as numpy arrays.

    All rows are pulled into local memory with ``collect()``, so the data
    must be small enough to fit in memory on a single machine.

    :param dataset: DataFrame with the image URI column given by
        ``self.getInputCol()``; the label column is read as well when
        ``labelCol`` is defined and non-empty.
    :return: ``(X, y)`` where ``X`` is the per-row image arrays stacked on
        axis 0 and ``y`` is the per-row label arrays stacked on axis 0, or
        ``None`` when no label column is configured.
    :raises ValueError: when no image is available or a label cannot be
        turned into an array.
    :raises AssertionError: when the label column type is not
        ``spla.VectorUDT``.
    """
    image_uri_col = self.getInputCol()

    label_col = None
    label_is_set = self.isDefined(self.labelCol) and self.getLabelCol() != ""
    if label_is_set:
        label_col = self.getLabelCol()

    image_col = self._loadedImageCol()
    # Discard rows for which no image was loaded (null image column).
    image_df = self.loadImagesInternal(dataset, image_uri_col).dropna(subset=[image_col])

    # Extract features
    collected = image_df.collect()
    images = [imageStructToArray(record[image_col]) for record in collected]
    if not images:
        raise ValueError("Cannot extract any feature from dataset!")
    X = np.stack(images, axis=0)

    # Extract labels
    y = None
    if label_col is not None:
        label_dtype = image_df.schema[label_col].dataType
        assert isinstance(label_dtype, spla.VectorUDT), \
            "must encode labels in one-hot vector format, but got {}".format(label_dtype)

        labels = []
        for record in collected:
            try:
                label_array = record[label_col].toArray()
            except ValueError:
                raise ValueError("Cannot extract encoded label array")
            labels.append(label_array)

        if not labels:
            raise ValueError("Failed to load any labels from dataset, but labels are required")

        y = np.stack(labels, axis=0)
        assert y.shape[0] == X.shape[0], \
            "number of features {} != number of labels {}".format(X.shape[0], y.shape[0])

    return X, y
def _executeTensorflow(self, graph, input_tensor_name, output_tensor_name,
                       df, input_col="image"):
    """
    Run ``graph`` on every image of ``df`` inside a local TensorFlow session.

    :param graph: TensorFlow graph to execute.
    :param input_tensor_name: name of the tensor each image is fed into.
    :param output_tensor_name: name of the tensor whose value is fetched.
    :param df: DataFrame of images; it is collected into local memory.
    :param input_col: column holding the image struct that is fed to the graph.
    :return: tuple ``(values, topK)`` of dicts keyed by image origin.
        ``values`` maps to the fetched output as a numpy array and ``topK``
        maps to ``decode_predictions(..., top=5)[0]`` of that output.
    """
    with tf.Session(graph=graph) as sess:
        # Both lookups depend only on the graph, so resolve them once
        # rather than looking up the input tensor again for every row.
        output_tensor = graph.get_tensor_by_name(output_tensor_name)
        input_tensor = graph.get_tensor_by_name(input_tensor_name)

        values = {}
        topK = {}
        for img_row in df.collect():
            # Add a leading axis of length 1 so a single image is fed per run.
            image = np.expand_dims(imageStructToArray(img_row[input_col]), axis=0)
            # NOTE(review): the origin is always read from the 'image' column,
            # even when ``input_col`` names a different column - confirm
            # whether that is intended.
            uri = img_row['image']['origin']
            output = sess.run([output_tensor], feed_dict={input_tensor: image})
            values[uri] = np.array(output[0])
            topK[uri] = decode_predictions(values[uri], top=5)[0]
    return values, topK
def _executeTensorflow(self, graph, input_tensor_name, output_tensor_name,
                       df, id_col="filePath", input_col="image"):
    """
    Run ``graph`` on every image of ``df`` inside a local TensorFlow session.

    :param graph: TensorFlow graph to execute.
    :param input_tensor_name: name of the tensor each image is fed into.
    :param output_tensor_name: name of the tensor whose value is fetched.
    :param df: DataFrame of images; it is collected into local memory.
    :param id_col: column whose value is used as the key of the result dicts.
    :param input_col: column holding the image struct that is fed to the graph.
    :return: tuple ``(values, topK)`` of dicts keyed by the ``id_col`` value.
        ``values`` maps to the fetched output as a numpy array and ``topK``
        maps to ``decode_predictions(..., top=5)[0]`` of that output.
    """
    with tf.Session(graph=graph) as sess:
        # Both lookups depend only on the graph, so resolve them once
        # rather than looking up the input tensor again for every row.
        output_tensor = graph.get_tensor_by_name(output_tensor_name)
        input_tensor = graph.get_tensor_by_name(input_tensor_name)

        values = {}
        topK = {}
        for img_row in df.collect():
            # Add a leading axis of length 1 so a single image is fed per run.
            image = np.expand_dims(imageStructToArray(img_row[input_col]), axis=0)
            uri = img_row[id_col]
            output = sess.run([output_tensor], feed_dict={input_tensor: image})
            values[uri] = np.array(output[0])
            topK[uri] = decode_predictions(values[uri], top=5)[0]
    return values, topK
def test_loadImages(self):
    """
    Check that ``loadImagesInternal`` yields images of the InceptionV3 input shape.

    A ``KerasImageFileTransformer`` is built around an InceptionV3 model file,
    the sample image paths are loaded through it, and every loaded image is
    expected to have shape ``InceptionV3Constants.INPUT_SHAPE + (3,)``. The
    loaded DataFrame is expected to have exactly two columns.
    """
    uri_column = "uri"
    prediction_column = "preds"

    model_file = image_utils.prepInceptionV3KerasModelFile("inceptionV3.h5")
    transformer = KerasImageFileTransformer(
        inputCol=uri_column,
        outputCol=prediction_column,
        modelFile=model_file,
        imageLoader=image_utils.loadAndPreprocessKerasInceptionV3,
        outputMode="vector")

    paths_df = image_utils.getSampleImagePathsDF(self.sql, uri_column)
    loaded_df = transformer.loadImagesInternal(paths_df, uri_column)
    self.assertEqual(len(loaded_df.columns), 2)

    loaded_column = transformer._loadedImageCol()
    # Input shape of the model with a trailing axis of length 3 appended.
    wanted_shape = InceptionV3Constants.INPUT_SHAPE + (3,)
    for record in loaded_df.collect():
        self.assertEqual(imageStructToArray(record[loaded_column]).shape, wanted_shape)
def test_resize(self):
    """
    Exercise the resize UDF produced by ``imageIO.createResizeImageUDF``.

    Covers four things: a three-element size is rejected with ``ValueError``;
    resizing to ``[4, 5]`` gives height 4 and width 5; the pixels match what
    PIL produces for the same resize; and resizing to the image's own size
    returns an equal image. Finally every field of the image schema is read
    from the resized row.
    """
    # A size with three entries must be rejected.
    with self.assertRaises(ValueError):
        imageIO.createResizeImageUDF([1, 2, 3])

    shrink = imageIO.createResizeImageUDF([4, 5]).func
    source_row = imageIO.imageArrayToStruct(array)
    shrunk_row = shrink(source_row)
    self.assertEqual(shrunk_row.height, 4)
    self.assertEqual(shrunk_row.width, 5)

    # Compare to PIL resizing. PIL takes (width, height), hence (5, 4).
    # The channels are reversed before handing the array to PIL and reversed
    # again on the way back.
    pil_source = PIL.Image.fromarray(obj=imageIO._reverseChannels(array))
    pil_shrunk = pil_source.resize((5, 4))
    expected_pixels = imageIO._reverseChannels(np.asarray(pil_shrunk))
    np.testing.assert_array_equal(expected_pixels, imageIO.imageStructToArray(shrunk_row))

    # Test that resize with the same size is a no-op
    unchanged_size = (source_row.height, source_row.width)
    same_row = imageIO.createResizeImageUDF(unchanged_size).func(source_row)
    self.assertEqual(source_row, same_row)

    # Test that we have a valid image schema (all fields are in):
    # looking up each field is the check, the value itself is not used.
    for field_name in ImageSchema.imageSchema['image'].dataType.names:
        shrunk_row[field_name]
def test_loadImages(self):
    """
    Check that ``_loadImages`` yields images of the InceptionV3 input shape.

    A ``KerasImageFileTransformer`` is built around an InceptionV3 model file,
    the sample image paths are loaded through it, and every loaded image is
    expected to have shape ``InceptionV3Constants.INPUT_SHAPE + (3,)``. The
    loaded DataFrame is expected to have exactly two columns.
    """
    uri_column = "uri"
    prediction_column = "preds"

    model_file = image_utils.prepInceptionV3KerasModelFile("inceptionV3.h5")
    transformer = KerasImageFileTransformer(
        inputCol=uri_column,
        outputCol=prediction_column,
        modelFile=model_file,
        imageLoader=image_utils.loadAndPreprocessKerasInceptionV3,
        outputMode="vector")

    paths_df = image_utils.getSampleImagePathsDF(self.sql, uri_column)
    loaded_df = transformer._loadImages(paths_df)
    self.assertEqual(len(loaded_df.columns), 2)

    loaded_column = transformer._loadedImageCol()
    # Input shape of the model with a trailing axis of length 3 appended.
    wanted_shape = InceptionV3Constants.INPUT_SHAPE + (3,)
    for record in loaded_df.collect():
        self.assertEqual(imageStructToArray(record[loaded_column]).shape, wanted_shape)
def test_resize(self):
    """
    Exercise the resize UDF produced by ``imageIO.createResizeImageUDF``.

    Verifies that an invalid three-element size raises ``ValueError``, that a
    ``[4, 5]`` resize yields height 4 and width 5 with the same pixels PIL
    computes, that resizing to the current size leaves the image equal to the
    input, and that every field of the image schema can be read from the
    resized row.
    """
    self.assertRaises(ValueError, imageIO.createResizeImageUDF, [1, 2, 3])

    resize_to_4x5 = imageIO.createResizeImageUDF([4, 5]).func
    original = imageIO.imageArrayToStruct(array)
    resized = resize_to_4x5(original)
    self.assertEqual(resized.height, 4)
    self.assertEqual(resized.width, 5)

    # Compare to PIL resizing. PIL takes (width, height), hence (5, 4).
    channel_swapped = imageIO._reverseChannels(array)
    pil_resized = PIL.Image.fromarray(obj=channel_swapped).resize((5, 4))
    # Reverse the channels again before comparing with the UDF result.
    reference = imageIO._reverseChannels(np.asarray(pil_resized))
    actual = imageIO.imageStructToArray(resized)
    np.testing.assert_array_equal(reference, actual)

    # Test that resize with the same size is a no-op
    keep_size = imageIO.createResizeImageUDF((original.height, original.width)).func
    self.assertEqual(original, keep_size(original))

    # Test that we have a valid image schema (all fields are in):
    # the lookup itself is the check, the value is discarded.
    schema_fields = ImageSchema.imageSchema['image'].dataType.names
    for name in schema_fields:
        resized[name]
def do_nothing(imgRow):
    """
    Convert an image struct to an array and straight back to a struct.

    :param imgRow: image struct accepted by ``imageIO.imageStructToArray``.
    :return: the struct built by ``imageIO.imageArrayToStruct`` from that array.
    """
    return imageIO.imageArrayToStruct(imageIO.imageStructToArray(imgRow))
def do_nothing(imgRow):
    """
    Convert an image struct to an array and back, keeping its Spark mode.

    :param imgRow: image struct accepted by ``imageIO.imageType`` and
        ``imageIO.imageStructToArray``.
    :return: the struct built by ``imageIO.imageArrayToStruct`` from the
        array, using the ``sparkMode`` of the input's image type.
    """
    # Look the type up first so the rebuilt struct uses the input's mode.
    source_type = imageIO.imageType(imgRow)
    pixels = imageIO.imageStructToArray(imgRow)
    return imageIO.imageArrayToStruct(pixels, source_type.sparkMode)