def test_read_images_multiple_times(self):
    # This test case is to check if `ImageSchema.readImages` tries to
    # initiate Hive client multiple times. See SPARK-22651.
    data_path = 'data/mllib/images/origin/kittens'
    ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
    ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
def getDataFrame(img_dir):
    dic = {}
    df_train = []
    df_test = []
    count = 0
    for root, directories, files in os.walk(img_dir):
        for file in directories:
            temp_df = ImageSchema.readImages(img_dir + "/" + file).withColumn("label", lit(count))
            train_df, test_df = temp_df.randomSplit([0.6, 0.4])
            df_train.append(train_df)
            df_test.append(test_df)
            if dic.get(count, None):
                continue
            else:
                dic[count] = file
            count += 1
    trained_df = df_train[0]
    for i in range(1, len(df_train)):
        trained_df = trained_df.unionAll(df_train[i])
    tested_df = df_test[0]
    for i in range(1, len(df_test)):
        tested_df = tested_df.unionAll(df_test[i])
    return trained_df, tested_df, dic
def _create_image_df_with_label(image_folder):
    """
    Creates an image DataFrame for a given image class (same label)
    :param image_folder: Folder which contains a single type of images
    :return: DataFrame with label and image columns (all with the same label)
    """
    label = int(image_folder.stem[1:])
    path = ImageUtils._find_images_path(image_folder)
    return ImageSchema.readImages(path).withColumn('label', functions.lit(label))
def createDataFrame(spark, sc):
    sparkHomeDir = "file:/Users/beginspark/Apps/spark"

    # 1. Create DataFrames from external data sources
    df1 = spark.read.json(sparkHomeDir + "/examples/src/main/resources/people.json")
    df2 = spark.read.parquet(sparkHomeDir + "/examples/src/main/resources/users.parquet")
    df3 = spark.read.text(sparkHomeDir + "/examples/src/main/resources/people.txt")

    # 2. Create a DataFrame from a local collection (ex5-17)
    row1 = Row(name="hayoon", age=7, job="student")
    row2 = Row(name="sunwoo", age=13, job="student")
    row3 = Row(name="hajoo", age=5, job="kindergartener")
    row4 = Row(name="jinwoo", age=13, job="student")
    data = [row1, row2, row3, row4]
    df4 = spark.createDataFrame(data)

    # 3. Create a DataFrame from an existing RDD (ex5-20)
    rdd = spark.sparkContext.parallelize(data)
    df5 = spark.createDataFrame(rdd)

    # 4. Create a DataFrame with an explicit schema (ex5-23)
    sf1 = StructField("name", StringType(), True)
    sf2 = StructField("age", IntegerType(), True)
    sf3 = StructField("job", StringType(), True)
    schema = StructType([sf1, sf2, sf3])
    r1 = ("hayoon", 7, "student")
    r2 = ("sunwoo", 13, "student")
    r3 = ("hajoo", 5, "kindergartener")
    r4 = ("jinwoo", 13, "student")
    rows = [r1, r2, r3, r4]
    df6 = spark.createDataFrame(rows, schema)

    # 5. Create a DataFrame from images
    path = sparkHomeDir + "/data/mllib/images"
    recursive = True
    numPartitions = 2
    dropImageFailures = True
    sampleRatio = 1.0
    seed = 0
    imgdf = ImageSchema.readImages(path, recursive, numPartitions, dropImageFailures, sampleRatio, seed)
    imgdf = imgdf.select(imgdf["image.origin"], imgdf["image.height"], imgdf["image.width"],
                         imgdf["image.nChannels"], imgdf["image.mode"])
def image_predictor(path, input_col="image", output_col="predicted_labels",
                    model_name="InceptionV3", decode_predictions=True, topK=10):
    image_df = ImageSchema.readImages(path)
    predictor = DeepImagePredictor(inputCol=input_col, outputCol=output_col,
                                   modelName=model_name,
                                   decodePredictions=decode_predictions, topK=topK)
    preds = predictor.transform(image_df)
    firstelement = udf(lambda v: (str(v[0][1]), float(v[0][2])), ArrayType(StringType()))
    return preds.select(firstelement('predicted_labels').alias("predicted_labels"))
def test_read_images(self):
    data_path = 'data/mllib/images/origin/kittens'
    df = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
    self.assertEqual(df.count(), 4)
    first_row = df.take(1)[0][0]
    array = ImageSchema.toNDArray(first_row)
    self.assertEqual(len(array), first_row[1])
    self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]), first_row)
    self.assertEqual(df.schema, ImageSchema.imageSchema)
    self.assertEqual(df.schema["image"].dataType, ImageSchema.columnSchema)
    expected = {
        'CV_8UC3': 16,
        'Undefined': -1,
        'CV_8U': 0,
        'CV_8UC1': 0,
        'CV_8UC4': 24
    }
    self.assertEqual(ImageSchema.ocvTypes, expected)
    expected = ['origin', 'height', 'width', 'nChannels', 'mode', 'data']
    self.assertEqual(ImageSchema.imageFields, expected)
    self.assertEqual(ImageSchema.undefinedImageType, "Undefined")
    with QuietTest(self.sc):
        self.assertRaisesRegexp(
            TypeError,
            "image argument should be pyspark.sql.types.Row; however",
            lambda: ImageSchema.toNDArray("a"))
    with QuietTest(self.sc):
        self.assertRaisesRegexp(
            ValueError,
            "image argument should have attributes specified in",
            lambda: ImageSchema.toNDArray(Row(a=1)))
    with QuietTest(self.sc):
        self.assertRaisesRegexp(
            TypeError,
            "array argument should be numpy.ndarray; however, it got",
            lambda: ImageSchema.toImage("a"))
    image_uri_df = sqlContext.createDataFrame(local_rows)
    return image_uri_df


label_cardinality = 2
label_list = ['Tap', 'Teapot']
label_cardinality = len(label_list)
label_nums = list(range(label_cardinality))

banana_image_df = ImageSchema.readImages("hdfs://ec2-18-235-62-224.compute-1.amazonaws.com:9000/OID/Dataset/test/Banana").withColumn("label", lit(1))
# banana_image_df = banana_image_df.withColumn("prefix", lit('Entity/data/food/fruit/'))
accordion_image_df = ImageSchema.readImages("hdfs://ec2-18-235-62-224.compute-1.amazonaws.com:9000/OID/Dataset/test/Accordion").withColumn("label", lit(0))
# accordion_image_df = accordion_image_df.withColumn("prefix", lit('Entity/data/food/fruit/'))

# use larger training sets (e.g. [0.6, 0.4]) on non-community edition clusters
banana_train, banana_test, _ = banana_image_df.randomSplit([0.99, 0.005, 0.005])
accordion_train, accordion_test, _ = accordion_image_df.randomSplit([0.99, 0.005, 0.005])

train_df = accordion_train.unionAll(banana_train)
test_df = accordion_test.unionAll(banana_test)
import logging

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit, col, udf
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    "Algortimo de clasificacion multiclase").getOrCreate()

path = "./resources/"
angry_df = ImageSchema.readImages(path + "0/").withColumn("label", lit(0))
happy_df = ImageSchema.readImages(path + "3/").withColumn("label", lit(1))
sad_df = ImageSchema.readImages(path + "4/").withColumn("label", lit(2))

sc = spark.sparkContext
log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.Logger.getLogger(__name__)
log.info("pyspark script logger initialized")

df1 = angry_df.union(happy_df).union(sad_df)
parse_ = udf(lambda a: Vectors.dense(a), VectorUDT())
df = df1.withColumn("features", parse_(df1["image.data"]))
train, test, _ = df.randomSplit([0.1, 0.05, 0.85])
        img_rescaled = resizeimage.resize_cover(new_im, [width, width])
        img_rescaled.save("{}/rescaled/{}".format(root, img))


if __name__ == "__main__":
    sc = SparkContext()
    img_dic = joblib.load("dictionary.pkl")[0]
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
    lr = LogisticRegressionModel.load('./lrModel')
    p_model = PipelineModel(stages=[featurizer, lr])

    directory = "./media"
    rescaled_dir = "{}/rescaled".format(directory)
    rescale_image(directory, rescaled_dir)

    temp_df = ImageSchema.readImages(rescaled_dir)
    df = p_model.transform(temp_df)

    f = open("predict_output.txt", "r+")
    f.seek(0)
    f.truncate()
    for i in df.select(['image', 'prediction']).collect():
        print("{} = {}".format(i[0][0].split('/')[-1], img_dic[int(i[1])]))
        f.write("{} = {}\n".format(i[0][0].split('/')[-1], img_dic[int(i[1])]))
    f.close()
    shutil.rmtree(rescaled_dir)

# spark-submit --packages databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 predict.py
# COMMAND ----------

# load image

# COMMAND ----------

display(dbutils.fs.ls("dbfs:/FileStore/tables"))

# COMMAND ----------

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

# Read images using Spark ... as a DataFrame.
# Each image is stored as a row in the imageSchema format.
image_cle = ImageSchema.readImages("dbfs:/FileStore/tables/cle/").withColumn("label", lit(0))
image_cre = ImageSchema.readImages("dbfs:/FileStore/tables/cre/").withColumn("label", lit(1))
image_ole = ImageSchema.readImages("dbfs:/FileStore/tables/ole/").withColumn("label", lit(2))
image_ore = ImageSchema.readImages("dbfs:/FileStore/tables/ore/").withColumn("label", lit(3))

# COMMAND ----------

image_cle.show(), image_cre.show(), image_ole.show(), image_ore.show()

# COMMAND ----------

type(image_cle)
sc = SparkContext(conf=conf)

# Add in the sparkdl dependencies
sys.path.insert(
    0,
    "/home/cdsw/spark-deep-learning/target/scala-2.11/spark-deep-learning-assembly-1.5.1-SNAPSHOT-spark2.4.jar"
)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.image import ImageSchema

label1_df = ImageSchema.readImages("data/personalities/jobs").withColumn("label", lit(0))
label2_df = ImageSchema.readImages("data/personalities/zuckerberg").withColumn("label", lit(1))

train1_df, test1_df = label1_df.randomSplit([0.6, 0.4])
train2_df, test2_df = label2_df.randomSplit([0.6, 0.4])

train1_df.show()
test1_df.show()

train_images_df = train1_df.unionAll(train2_df)
test_images_df = test1_df.unionAll(test2_df)

# Training set
train_images_df.show()

# Test set
test_images_df.show()
import glob

fs = glob.glob("flower_photos/sample/*.jpg")

import IPython.display as dp

# create list of image objects
images = []
for ea in fs:
    images.append(dp.Image(filename=ea, format='png'))

# display all images
for ea in images:
    dp.display_png(ea)

from pyspark.ml.image import ImageSchema

image_df = ImageSchema.readImages("flower_photos/sample/")
image_df.show()

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO
from keras.applications import InceptionV3

model = InceptionV3(weights="imagenet")
model.save('model-full.h5')  # saves to the local filesystem

from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from pyspark.sql.types import StringType, StructType, StructField, ArrayType, FloatType
def get_a_df(fpath):
    """Turn image files into a Spark DataFrame, which can then be queried with SQL.
    fpath: sub-path of the files and label of the images; reads the images under
    DATAPATH/fpath and sets Label to the fpath value."""
    dftemp = ImageSchema.readImages(DATAPATH + "/" + str(fpath)).withColumn("Label", lit(fpath))
    df_train, df_test = dftemp.randomSplit([.8, .2])
    return df_train, df_test
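# A minimal usage sketch for get_a_df, assuming label folders named 0..2 under
# DATAPATH; the loop and the union of the per-label splits are illustrative and
# not part of the original snippet.
train_parts, test_parts = [], []
for label in range(3):
    tr, te = get_a_df(label)
    train_parts.append(tr)
    test_parts.append(te)

train_df = train_parts[0]
test_df = test_parts[0]
for tr, te in zip(train_parts[1:], test_parts[1:]):
    train_df = train_df.unionAll(tr)
    test_df = test_df.unionAll(te)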
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
import tensorflow as tf
import keras
from PIL import Image
import sparkdl
from pyspark.ml.image import ImageSchema
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

# Loading train images and creating labels: 0 for normal and 1 for pneumonia images
Train_normal = ImageSchema.readImages("s3://image-class/train/NORMAL/")
Train_pneumonia = ImageSchema.readImages("s3://image-class/train/PNEUMONIA/")
Train_normal = Train_normal.withColumn("label", lit(0))
Train_pneumonia = Train_pneumonia.withColumn("label", lit(1))

# Combining all the train images into a single train dataset
Train_images = Train_pneumonia.union(Train_normal)

# Example of a pneumonia image loaded from the S3 bucket
Train_pneumonia.first()

# Pictures of the normal chest X-rays loaded from the S3 bucket
display(Train_normal)

# Loading test images and creating labels: 0 for normal and 1 for pneumonia images
Test_normal = ImageSchema.readImages("s3://image-class/test/NORMAL/")
###### Loading the images from the input directory to MongoDB ######
from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.window import Window

sparkTrain = SparkSession \
    .builder \
    .appName("ElecNET") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/ElecNet.ImgColl") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/ElecNet.ImgColl") \
    .getOrCreate()

# Reading images from the input directory
df = ImageSchema.readImages('data', recursive=True, dropImageFailures=True)
paths = df.select(df['image']['origin'].alias('filename'),
                  df['image']['data'].alias('image_bytes'),
                  df['image']['width'].alias('width'),
                  df['image']['height'].alias('height'))

split_col_filename = functions.split(paths['filename'], ':')
split_col_label = functions.split(paths['filename'], '-')
paths = paths.withColumn('category', split_col_label.getItem(1))
paths = paths.withColumn('filepath', split_col_filename.getItem(1))
paths = paths.select(paths['filepath'], paths['category'], paths['image_bytes'],
                     paths['width'], paths['height'])

# Creating train and test sets
splits = paths.randomSplit(weights=[0.8, 0.2])
train = splits[0]
test = splits[1]
# coding: utf-8

# ## Reading the images

# In[1]:

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

IMG_DIR = "chest_xray/"

train_df_normal = ImageSchema.readImages(IMG_DIR + "/train/NORMAL").withColumn("label", lit(0))
train_df_pneumonia = ImageSchema.readImages(IMG_DIR + "/train/PNEUMONIA").withColumn("label", lit(1))
train_df = train_df_normal.union(train_df_pneumonia)

test_df_normal = ImageSchema.readImages(IMG_DIR + "/test/NORMAL").withColumn("label", lit(0))
test_df_pneumonia = ImageSchema.readImages(IMG_DIR + "/test/PNEUMONIA").withColumn("label", lit(1))
test_df = test_df_normal.union(test_df_pneumonia)

# ## Model training

# In[ ]:

from sparkdl import DeepImageFeaturizer
from pyspark.ml import Pipeline
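# The training step itself is cut off in this excerpt; a minimal sketch of what
# typically follows with sparkdl, assuming an InceptionV3 featurizer and a
# LogisticRegression estimator as in the other snippets in this collection:
from pyspark.ml.classification import LogisticRegression

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, labelCol="label")
model = Pipeline(stages=[featurizer, lr]).fit(train_df)
predictions = model.transform(test_df)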
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ImageClassification") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "16g") \
    .getOrCreate()

import pyspark.sql.functions as f
import sparkdl as dl
from pyspark.ml.image import ImageSchema

dfbuses = ImageSchema.readImages('data/buses/').withColumn('label', f.lit(0))
dfcars = ImageSchema.readImages('data/cars/').withColumn('label', f.lit(1))

dfbuses.show(5)
dfcars.show(5)

trainDFbuses, testDFbuses = dfbuses.randomSplit([0.60, 0.40], seed=123)
trainDFcars, testDFcars = dfcars.randomSplit([0.60, 0.40], seed=122)

trainDF = trainDFbuses.unionAll(trainDFcars)
testDF = testDFbuses.unionAll(testDFcars)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features",
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

__author__ = "Jillur Quddus"
__credits__ = ["Jillur Quddus"]
__version__ = "1.0.0"
__maintainer__ = "Jillur Quddus"
__email__ = "*****@*****.**"
__status__ = "Development"

# (2) Create a Spark Session using the Spark Context instantiated from spark-submit
spark = SparkSession.builder.appName("Convolutional Neural Networks - Transfer Learning - Image Recognition").getOrCreate()

# (3) Load the Plane and Bird images into Spark DataFrames and define a literal label column
path_to_img_directory = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter07/data/image-recognition-data'
birds_df = ImageSchema.readImages(path_to_img_directory + "/birds").withColumn("label", lit(0))
planes_df = ImageSchema.readImages(path_to_img_directory + "/planes").withColumn("label", lit(1))

# (4) Create Training and Test DataFrames respectively
planes_train_df, planes_test_df = planes_df.randomSplit([0.75, 0.25], seed=12345)
birds_train_df, birds_test_df = birds_df.randomSplit([0.75, 0.25], seed=12345)
train_df = planes_train_df.unionAll(birds_train_df)
test_df = planes_test_df.unionAll(birds_test_df)

# (5) Transform the Images into Numeric Feature Vectors using Transfer Learning and the pre-trained InceptionV3 Convolutional Neural Network
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# (6) Train a Logistic Regression Model to classify our images
logistic_regression = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

# (7) Execute the Featurizer and Logistic Regression estimator within a Pipeline to generate the Trained Model
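# Step (7) itself is cut off in this excerpt; a minimal sketch of the pipeline fit,
# assuming the standard pyspark.ml Pipeline API and the train_df/test_df defined above
# (the variable names below are illustrative):
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[featurizer, logistic_regression])
model = pipeline.fit(train_df)
test_predictions_df = model.transform(test_df)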
def load_images_path_and_shuffle():
    annual_crop_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(0))
    forest_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(1))
    herb_veg_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(2))
    highway_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(3))
    industrial_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(4))
    pasture_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(5))
    perm_crop_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(6))
    residential_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(7))
    river_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(8))
    sea_lake_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(9))

    annual_crop_train, annual_crop_vali = annual_crop_df.randomSplit([0.4, 0.6])
    forest_train, forest_vali = forest_df.randomSplit([0.4, 0.6])
    herb_veg_train, herb_veg_vali = herb_veg_df.randomSplit([0.4, 0.6])
    highway_train, highway_vali = highway_df.randomSplit([0.4, 0.6])
    industrial_train, industrial_vali = industrial_df.randomSplit([0.4, 0.6])
    pasture_train, pasture_vali = pasture_df.randomSplit([0.4, 0.6])
    perm_crop_train, perm_crop_vali = perm_crop_df.randomSplit([0.4, 0.6])
    residential_train, residential_vali = residential_df.randomSplit([0.4, 0.6])
    river_train, river_vali = river_df.randomSplit([0.4, 0.6])
    sea_lake_train, sea_lake_vali = sea_lake_df.randomSplit([0.4, 0.6])

    train_df_phase1 = annual_crop_train.union(forest_train)
    vali_df_phase1 = annual_crop_vali.union(forest_vali)
    train_df_phase2 = train_df_phase1.union(herb_veg_train)
    vali_df_phase2 = vali_df_phase1.union(herb_veg_vali)
    shuffle(train_df_phase2)
    shuffle(vali_df_phase2)

    train_df_phase3 = train_df_phase2.union(highway_train)
    vali_df_phase3 = vali_df_phase2.union(highway_vali)
    train_df_phase4 = train_df_phase3.union(industrial_train)
    vali_df_phase4 = vali_df_phase3.union(industrial_vali)
    shuffle(train_df_phase4)
    shuffle(vali_df_phase4)

    train_df_phase5 = train_df_phase4.union(pasture_train)
    vali_df_phase5 = vali_df_phase4.union(pasture_vali)
    shuffle(train_df_phase5)
    shuffle(vali_df_phase5)

    train_df_phase6 = train_df_phase5.union(perm_crop_train)
    vali_df_phase6 = vali_df_phase5.union(perm_crop_vali)
    train_df_phase7 = train_df_phase6.union(residential_train)
    vali_df_phase7 = vali_df_phase6.union(residential_vali)
    shuffle(train_df_phase7)
    shuffle(vali_df_phase7)

    train_df_phase8 = train_df_phase7.union(river_train)
    vali_df_phase8 = vali_df_phase7.union(river_vali)
    train_df = train_df_phase8.union(sea_lake_train)
    vali_df = vali_df_phase8.union(sea_lake_vali)

    train = shuffle(train_df)
    vali = shuffle(vali_df)

    for img in train[0]:
        element = preprocess_img(img)
        processed_images_train.append(element)

    for img in sorted(range(len(vali[0])), key=vali.__getitem__):
        element = preprocess_img(img)
        processed_images_vali.append(element)
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from functools import reduce
import seaborn as sns
import numpy as np
import itertools

# create spark session
spark = SparkSession.builder.appName('BD Recognizer').getOrCreate()

# load images
zero_df = ImageSchema.readImages("images/0").withColumn("label", lit(0))
one_df = ImageSchema.readImages("images/1").withColumn("label", lit(1))
two_df = ImageSchema.readImages("images/2").withColumn("label", lit(2))
three_df = ImageSchema.readImages("images/3").withColumn("label", lit(3))
four_df = ImageSchema.readImages("images/4").withColumn("label", lit(4))
five_df = ImageSchema.readImages("images/5").withColumn("label", lit(5))
six_df = ImageSchema.readImages("images/6").withColumn("label", lit(6))
seven_df = ImageSchema.readImages("images/7").withColumn("label", lit(7))
eight_df = ImageSchema.readImages("images/8").withColumn("label", lit(8))
nine_df = ImageSchema.readImages("images/9").withColumn("label", lit(9))

# merge data frames
dataframes = [
    zero_df, one_df, two_df, three_df, four_df, five_df, six_df, seven_df,
    eight_df, nine_df
]
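# The merge itself is cut off in this excerpt; a minimal sketch using the
# functools.reduce imported above (the unionAll fold and the variable name are
# assumptions, not from the original):
digits_df = reduce(lambda a, b: a.unionAll(b), dataframes)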
# MAGIC
# MAGIC ### Prepare training and validation dataframes
# MAGIC
# MAGIC Deep Learning Pipelines require training data to be loaded into Spark DataFrames. The code below uses Spark's native support for image data to load 6000 training images into a DataFrame. It then adds a new column called `label` that annotates an image with the type of land it depicts. The label is extracted from the pathname of the image.

# COMMAND ----------

# MAGIC %md
# MAGIC #### Load training images to a dataframe

# COMMAND ----------

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

img_df = ImageSchema.readImages(img_dir + 'train', recursive=True)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Add the label column and split data into training and validation DataFrames.

# COMMAND ----------

from pyspark.sql.functions import regexp_extract, col
from pyspark.ml.feature import StringIndexer

# Add a label column
img_labeled = img_df.withColumn(
    'label', regexp_extract(col('image.origin'), '(.)(train/)(\w+)', 3))

# Split a dataframe into training and validation dataframes
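# The split itself is cut off in this excerpt; a minimal sketch, assuming a
# StringIndexer to turn the extracted folder name into a numeric label and an
# 80/20 random split (column names, ratio, and seed are illustrative):
indexer = StringIndexer(inputCol='label', outputCol='label_idx')
img_indexed = indexer.fit(img_labeled).transform(img_labeled)
train_img_df, val_img_df = img_indexed.randomSplit([0.8, 0.2], seed=42)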
    plt.imshow(two_d, interpolation='nearest')
    return plt


# get an image
img = gen_image(X_train[0])

# save image as png
img.savefig('/dbfs/mnt/' + account_name + '/' + container_name + '/sample_mnist_img.png',
            mode="overwrite")
plt.close()

# open png and display
from pyspark.ml.image import ImageSchema

image_df = ImageSchema.readImages('/mnt/' + account_name + '/' + container_name + '/sample_mnist_img.png')
display(image_df)

# start the run
run = exp.start_logging()

# train a model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# predict on test
y_hat = clf.predict(X_test)

# calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

img_dir = "/home/ourui/deeplearning/images_classification-master/personalities"

# Read images and create training & test DataFrames for transfer learning
jobs_df = ImageSchema.readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = ImageSchema.readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])

# DataFrame for training a classification model
train_df = jobs_train.unionAll(zuckerberg_train)
# DataFrame for testing the classification model
test_df = jobs_test.unionAll(zuckerberg_test)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=50, regParam=0.005, elasticNetParam=0.1, labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)
print("training model")

predictions = p_model.transform(test_df).select("image", "probability",
# Run this with Spark 2.3.
# For Spark deep learning, see the following link:
# https://docs.databricks.com/applications/deep-learning/deep-learning-pipelines.html
# For the spark-deep-learning project, see the following link:
# https://github.com/databricks/spark-deep-learning
from pyspark.ml.image import ImageSchema

# Because there are many image files, copy some of the files from the /tulips and
# /daisy directories into the /sample directory before use.
# Copy roughly 10 files into the /sample directory.
img_dir = '/data/deep-learning-images/'
sample_img_dir = img_dir + "/sample"
image_df = ImageSchema.readImages(sample_img_dir)

# COMMAND ----------

image_df.printSchema()

# COMMAND ----------

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO

tulips_df = ImageSchema.readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(img_dir + "/daisy", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
# DBTITLE 1,Extract Training Images
# Extract Images
extractImagesSave(srcVideoPath, targetImgPath)

# Remove Empty Files
removeEmptyFiles(targetImgPath)

# View file list of images extracted from video
display(dbutils.fs.ls(targetImgPath))

# COMMAND ----------

# DBTITLE 1,Review Training Images
from pyspark.ml.image import ImageSchema

trainImages = ImageSchema.readImages(targetImgPath)
display(trainImages)

# COMMAND ----------

# DBTITLE 1,Feature Extraction using DeepImageFeaturizer
# MAGIC %md
# MAGIC Use [Spark Deep Learning Pipelines](https://github.com/databricks/spark-deep-learning) `DeepImageFeaturizer` to build image features via the InceptionV3 model

# COMMAND ----------

# DBTITLE 0,Save Features Function
# Save Image Features using
def saveImageFeatures(images, filePath):
    from sparkdl import DeepImageFeaturizer
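    # The body of saveImageFeatures is cut off in this excerpt; a minimal sketch of
    # the featurization step described in the markdown cell above, assuming InceptionV3
    # and a Parquet sink (the output format is an assumption, not from the original):
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName="InceptionV3")
    features_df = featurizer.transform(images)
    features_df.select("image.origin", "features").write.mode("overwrite").parquet(filePath)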
# rdd = op.sc.parallelize([Row(predicted_labels=['daisy', '0.8918145298957825']),
#                          Row(predicted_labels=['picket_fence', '0.14247830212116241']),
#                          Row(predicted_labels=['daisy', '0.9532104134559631'])])
# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    assert isinstance(df, pyspark.sql.dataframe.DataFrame), "Not a Spark DF"


def assert_spark_model(model):
    assert isinstance(model, pyspark.ml.PipelineModel), "Not a model"


tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    "tests/testdaisy/", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))
train_df = tulips_df.unionAll(daisy_df)


def test_image_classifier_lr():
    model, df_preds = op.dl.image_classifier_lr(train_df)
    assert_spark_model(model)
    assert_spark_df(df_preds)


def test_evaluate_img_lr():
# spark = SparkSession.builder.appName('SparkDeepLearning').getOrCreate()

# change configuration settings on Spark
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'),
#                                         ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory', '8g')])

imageDir = "T://courses//BigData//data//flower_photos"

# Load images
# image_df = ImageSchema.readImages(imageDir, recursive=True).withColumn("label", lit(1))
# image_df.printSchema()
# image_df.show(5)
# train_df, test_df, _ = image_df.randomSplit([0.1, 0.05, 0.85])

# read images using two methods
tulips_df = ImageSchema.readImages(imageDir + "/tulips").withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    imageDir + "/daisy", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# use larger training sets (e.g. [0.6, 0.4]) to get more images
tulips_train, tulips_test, _ = tulips_df.randomSplit([0.1, 0.05, 0.85])
daisy_train, daisy_test, _ = daisy_df.randomSplit([0.1, 0.05, 0.85])

train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensures that each of the partitions has a small size.
train_df = train_df.repartition(100)
                np.log(box[2] / anchors[best_anchor][0]),
                np.log(box[3] / anchors[best_anchor][1]), box_class
            ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = adjusted_box
    return detectors_mask, matching_true_boxes


# COMMAND ----------

# MAGIC %md
# MAGIC # ImageSchema

# COMMAND ----------

images_df = ImageSchema.readImages('/mnt/roy/object-detection/images/', numPartitions=16)

# COMMAND ----------

# MAGIC %md
# MAGIC # Prediction on `test.jpg`
# MAGIC Use it later to assert prediction using `UDF`

# COMMAND ----------

test_row = images_df.where(
    "image.origin='dbfs:/mnt/roy/object-detection/images/test.jpg'").take(1)[0][0]
array = ImageSchema.toNDArray(test_row)

# COMMAND ----------
# This would print all the files and directories
directory_list = []
for file in dirs:
    directory_list.append(file)
directory_list.pop(0)

base_image_dir = '/images/'
full_image_df = 0
for dir in directory_list:
    full_image_dir = base_image_dir + "/" + dir
    if full_image_df == 0:
        df = ImageSchema.readImages(full_image_dir)
        df = df.withColumn("image_label", lit(dir.lower()))
        full_image_df = df
    else:
        df = ImageSchema.readImages(full_image_dir)
        df = df.withColumn("image_label", lit(dir.lower()))
        full_image_df = full_image_df.union(df)

full_image_df.write.format("parquet").mode("overwrite").save("/images/full_image_df/")

# COMMAND ----------

display(full_image_df)

# COMMAND ----------
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

if __name__ == '__main__':
    sc = SparkContext(appName="Recognition App")
    output_directory = sc.broadcast(sys.argv[1])
    image_directory = sc.broadcast(sys.argv[2])  # hdfs://pierre:41234/cs455/combined_images

    image_rdd = ImageSchema.readImages(image_directory.value).rdd
    image_rdd = image_rdd.repartition(20)

    # this function creates a 128-feature embedding for each face;
    # a neural network is then trained on top of these embeddings
    def extract_embeddings(partition):
        face_cascade = cv2.CascadeClassifier(
            output_directory.value + '/haarcascade_frontalface_alt.xml')
        embedder = cv2.dnn.readNetFromTorch(output_directory.value + "/openface.nn4.small2.v1.t7")
        for old_image in partition:
            filename = old_image.image.origin.split('/')[-1]
            image = np.array(old_image.image.data)
            image = image.reshape((480, 854, 3))
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)