def __init__(self):
    """Set up hyper-parameter grid names per classifier and empty training state."""
    # For each supported classifier, the names of the min/max/count settings
    # that bound its hyper-parameter search grid.
    self.classifier_pset = {
        "logistic_regression": [
            "LOGISTIC_REGPARAM_MIN", "LOGISTIC_REGPARAM_MAX", "LOGISTIC_REGPARAM_NUM",
            "LOGISTIC_MAXITER_MIN", "LOGISTIC_MAXITER_MAX", "LOGISTIC_MAXITER_NUM",
            "LOGISTIC_ELASTICNETPARAM_MIN", "LOGISTIC_ELASTICNETPARAM_MAX",
            "LOGISTIC_ELASTICNETPARAM_NUM",
        ],
        "random_forest": [
            "RANDOMFOREST_NUMTREES_MIN", "RANDOMFOREST_NUMTREES_MAX",
            "RANDOMFOREST_NUMTREES_NUM",
        ],
        "linear_svc": [
            "LSVC_REGPARAM_MIN", "LSVC_REGPARAM_MAX", "LSVC_REGPARAM_NUM",
            "LSVC_MAXITER_MIN", "LSVC_MAXITER_MAX", "LSVC_MAXITER_NUM",
        ],
    }
    # Resolved parameter values / ranges, filled in later.
    self.classifier_params = {}
    self.classifier_params_range = {}
    self.data_params = {}
    # Pipeline building blocks accumulated during configuration.
    self.featurizers = []
    self.classifiers = []
    self.train_pipeline = Pipeline()
    self.best_model = {}
    # Default deep-image featurizer (no modelName set here).
    self.featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features")
def load_model():
    """Reassemble the serving pipeline: InceptionV3 featurizer followed by the
    logistic-regression model persisted at s3a://trainingmodel/lr."""
    persisted_lr = LogisticRegressionModel.load('s3a://trainingmodel/lr')
    image_featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                           modelName="InceptionV3")
    return PipelineModel(stages=[image_featurizer, persisted_lr])
def predicate(self, featurizer_name, classifier, test_df):
    """Score test_df with a two-stage pipeline: the named deep-image featurizer,
    then the supplied (already fitted) classifier. Returns the transformed frame."""
    stages = [
        DeepImageFeaturizer(inputCol="image", outputCol="features",
                            modelName=featurizer_name),
        classifier,
    ]
    return PipelineModel(stages=stages).transform(test_df)
def Pretrain_Model(train_df, max_iter, reg_param, elastic_net_param):
    """Fit an InceptionV3 -> LogisticRegression pipeline on train_df, persist the
    LR stage to s3a://trainingmodel/lr, print its training summary, and return
    the fitted PipelineModel.

    :param train_df: DataFrame with an "image" column and numeric "label" column
    :param max_iter: LogisticRegression maxIter
    :param reg_param: LogisticRegression regParam
    :param elastic_net_param: LogisticRegression elasticNetParam
    :return: the fitted PipelineModel (featurizer + LR)
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elastic_net_param, labelCol="label")
    p = Pipeline(stages=[featurizer, lr])
    model = p.fit(train_df)

    # The fitted pipeline is a PipelineModel; the LogisticRegressionModel is stage 1.
    lr_model = model.stages[1]
    lr_model.write().overwrite().save('s3a://trainingmodel/lr')

    # BUG FIX: coefficientMatrix / interceptVector / summary are attributes of the
    # LogisticRegressionModel stage, not of the PipelineModel — the original
    # accessed them on `model` and raised AttributeError.
    print("Coefficients: \n" + str(lr_model.coefficientMatrix))
    print("Intercept: " + str(lr_model.interceptVector))
    trainingSummary = lr_model.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # for multiclass, we can inspect metrics on a per-label basis
    print("False positive rate by label:")
    for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("True positive rate by label:")
    for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("Precision by label:")
    for i, prec in enumerate(trainingSummary.precisionByLabel):
        print("label %d: %s" % (i, prec))
    print("Recall by label:")
    for i, rec in enumerate(trainingSummary.recallByLabel):
        print("label %d: %s" % (i, rec))
    print("F-measure by label:")
    for i, f in enumerate(trainingSummary.fMeasureByLabel()):
        print("label %d: %s" % (i, f))

    # Weighted (overall) training metrics.
    accuracy = trainingSummary.accuracy
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    fMeasure = trainingSummary.weightedFMeasure()
    precision = trainingSummary.weightedPrecision
    recall = trainingSummary.weightedRecall
    print("Training Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
          % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
    return model
def saveImageFeatures(images, filePath):
    """Featurize `images` with InceptionV3 and persist (origin, features) as
    Parquet under filePath."""
    from sparkdl import DeepImageFeaturizer

    # Build featurizer using DeepImageFeaturizer and the InceptionV3 model.
    extractor = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                    modelName="InceptionV3")
    # Transform images to pull out the image struct and its feature vector (udt).
    featurized = extractor.transform(images)

    # Push feature information into Parquet file format — this might take a few
    # minutes. Make sure the target directory exists first.
    dbutils.fs.mkdirs(filePath)
    # Keep only the image origin (file name/path) alongside the features.
    (featurized
     .select("image.origin", "features")
     .coalesce(2)
     .write.mode("overwrite")
     .parquet(filePath))
def image_classifier_lr(df, input_col="image", output_col="features", model_name="InceptionV3"):
    """Train a deep-image-featurizer + logistic-regression pipeline on df.

    Returns a tuple of (fitted PipelineModel, predictions on the training df).
    """
    stages = [
        DeepImageFeaturizer(inputCol=input_col, outputCol=output_col,
                            modelName=model_name),
        LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.3,
                           labelCol="label"),
    ]
    fitted = Pipeline(stages=stages).fit(df)
    return fitted, fitted.transform(df)
def transfer_learning():
    """Train an InceptionV3 transfer-learning logistic-regression classifier and
    print the validation-set accuracy."""
    # BUG FIX: the original discarded the loader's result and then referenced
    # processed_images_train / processed_images_vali, which the later assignments
    # make *local* — so the first `.repartition(100)` raised UnboundLocalError.
    # NOTE(review): assumes load_images_path_and_shuffle() returns
    # (train_df, validation_df) — confirm against its definition.
    processed_images_train, processed_images_vali = load_images_path_and_shuffle()
    processed_images_train = processed_images_train.repartition(100)
    processed_images_vali = processed_images_vali.repartition(100)

    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.3,
                            labelCol="label")
    p = Pipeline(stages=[featurizer, lr])

    transfer_model = p.fit(processed_images_train)
    validated_df = transfer_model.transform(processed_images_vali)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(validated_df.select("prediction", "label"))))
# Load the precomputed image DataFrame from Parquet.
idf = spark.read.format("parquet").load("/images/full_image_df")
display(idf)

# COMMAND ----------

# 60/40 train/test split.
train_df, test_df = idf.randomSplit([0.6, 0.4])

# Train logistic regression on features generated by InceptionV3:
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

# Index string label column into a numeric "label".
stringIndexer = StringIndexer(inputCol="image_label", outputCol="label")

# Logistic-regression stage over the deep-image features.
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")

# Build our ML pipeline
#categories udf_categorie = udf(parse_category, StringType()) df_img = df_img.withColumn('categorie', udf_categorie('path')) return df_img # # Loading #Loading of Df with path, images, and categories spark_df = load_data(path) # # Preprocessing # In[11]: from sparkdl import DeepImageFeaturizer # We'll use ResNet50 for the transformation featurizer = DeepImageFeaturizer(inputCol="image", outputCol="image_preprocessed", modelName="ResNet50") spark_df_preprocessed = featurizer.transform(spark_df).select( ['path', 'categorie', 'image_preprocessed']) # # Saving #Saving as parquet file spark_df_preprocessed.repartition(16).write.format("parquet").mode( 'overwrite').save(path_to_save + 'preprocessed_parquet')
# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
# Join image rows to their CSV labels on file name.
finalTestDataFrame = tmptestX.join(csvTest,
                                   tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)

# Release the intermediate frames before retraining.
del tmpTestDf
del tmpTestRDD
# BUG FIX: the DataFrame was bound as `tmptestX`; the original `del tmpTestX`
# raised NameError on the wrong capitalization.
del tmptestX
del csvTestTmp
del csvTestRDD
del csvTest
del finalTestDataFrame

model_nb = OneVsRestModel.load(imageDir + 'model-naive-bayes')
# NOTE(review): OneVsRestModel is a *fitted* transformer; pyspark models do not
# expose .fit(). Retraining likely requires rebuilding the OneVsRest estimator —
# confirm before relying on this step.
model_nb.fit(featureVector)
model_nb.write().overwrite().save(imageDir + 'model-naive-bayes-retrained')
# BUG FIX: `print '...'` was Python 2 statement syntax; the function form works
# on both Python 2 and 3.
print('***re-train complete***')
# Weather-image classification via InceptionV3 transfer learning.
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

spark = SparkSession.builder.appName('Weather Image Classifier - Data Analysis').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+
from sparkdl import readImages

img_dir = "katkam-scaled"

# Read images and Create training & test DataFrames for transfer learning
jobs_df = readImages(img_dir)
jobs_df.show()
# Extract InceptionV3 deep features for every image.
df = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3").transform(jobs_df)
df.show()
i = 2  # NOTE(review): unused — apparently leftover; left in place
# Index the string "label" column into numeric "indexedLabel".
df = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df).transform(df)
jobs_train, jobs_test = df.randomSplit([0.6, 0.4])
# NOTE(review): trains on labelCol="label" even though "indexedLabel" was just
# produced above and is never used — confirm which column is intended.
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p_model = lr.fit(jobs_train)
predictions = p_model.transform(jobs_test)
predictions.select("filePath", "prediction").show(truncate=False)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# MAGIC #### Prepare a training pipeline # COMMAND ---------- from pyspark.ml import Pipeline from sparkdl import DeepImageFeaturizer from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import IndexToString, StringIndexer # Create a label indexer that will convert string labels to numeric. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(img_validate) # Create a featurizer based on a pretrained ResNet50 DNN featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="ResNet50") # Create a RandomForest model classifier = LogisticRegression(labelCol="indexedLabel", featuresCol="features", maxIter=500, regParam=0.06, elasticNetParam=0.06) # Create a converter that will convert numeric labels back to original labels labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels) # Chain the components into a pipeline
def main():
    """End-to-end Yelp photo classifier: load images and JSON labels, join them,
    train an InceptionV3 transfer-learning pipeline, and print test metrics.

    argv[1]: directory of images; argv[2]: path to the photo-label JSON file.
    """
    # get or create spark session to read image and json data
    spark = SparkSession.builder.appName("yelp").getOrCreate()

    # read images, drop files that are not images and store it as a dataframe
    # NOTE(review): Spark's built-in image source documents this option as
    # "dropInvalid" — confirm "dropImageFailures" is honored by this version.
    image_df = spark.read.format("image").load(sys.argv[1],dropImageFailures=True)
    #image_df.take(1)

    # preprocessing: getting the photo id of the images to join it to the JSON
    # file dataframe. NOTE(review): getItem(4) assumes a fixed directory depth
    # in the image path — confirm for the deployment layout.
    split_col = pyspark.sql.functions.split(image_df['image.origin'], '/')
    image_df = image_df.withColumn('filename1', split_col.getItem(4))
    # Strip the file extension to recover the bare photo id.
    split_col1 = pyspark.sql.functions.split(image_df['filename1'], '\\.')
    image_df = image_df.withColumn('filename2', split_col1.getItem(0))
    #image_df.take(1)

    # reading the json file containing label info and joining it to image data
    path = sys.argv[2]
    photo_df = spark.read.json(path)
    final_df = image_df.join(photo_df, image_df.filename2 == photo_df.photo_id).select([col('image'),col('label')])
    #final_df.take(1)

    # mapping the string label to numeric values (food=1, menu=2, drink=3,
    # inside=4, anything else=5)
    final_df= final_df.withColumn("label1", when(col("label")=='food', 1).when(col("label")=='menu', 2).when(col("label")=='drink', 3).when(col("label")=='inside', 4).otherwise(5))
    final_df = final_df.selectExpr("image as image", "label1 as label")

    # splitting the image dataset to train and test data
    final_train, final_test = final_df.randomSplit([0.8, 0.2])
    #final_train.show()

    # applying transfer learning using InceptionV3 model
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    p = Pipeline(stages=[featurizer, lr])

    # ParamGridBuilder and cross validation for parameter tuning (disabled)
    #paramGrid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.5, 0.1]).addGrid(lr.regParam, [0.1, 0.01]).build()
    #crossval = CrossValidator(estimator=p,estimatorParamMaps=paramGrid,evaluator=MulticlassClassificationEvaluator(),numFolds=4)

    # fitting the training data and transforming the test data
    #cvModel = crossval.fit(final_train)
    #predictions = cvModel.transform(final_test)
    yelp_model = p.fit(final_train)
    predictions = yelp_model.transform(final_test)
    #predictions.select("label1", "prediction").take(1)

    # selecting the prediction and label columns and calculating classification metrics
    predictionAndLabels = predictions.selectExpr("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    evaluator2 = MulticlassClassificationEvaluator(metricName="weightedPrecision")
    evaluator3 = MulticlassClassificationEvaluator(metricName="weightedRecall")
    evaluator4 = MulticlassClassificationEvaluator(metricName="f1")

    # printing the summary statistics
    print("Summary Stats")
    print("Accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    print("Precision = "+ str(evaluator2.evaluate(predictionAndLabels)))
    print("Recall = "+ str(evaluator3.evaluate(predictionAndLabels)))
    print("F1 Score = " + str(evaluator4.evaluate(predictionAndLabels)))
    spark.stop()