    def __init__(self):

        # Config-key names (MIN/MAX/NUM triples) that define each
        # classifier's hyperparameter search grid.
        self.classifier_pset = {
            "logistic_regression": [
                "LOGISTIC_REGPARAM_MIN", "LOGISTIC_REGPARAM_MAX",
                "LOGISTIC_REGPARAM_NUM", "LOGISTIC_MAXITER_MIN",
                "LOGISTIC_MAXITER_MAX", "LOGISTIC_MAXITER_NUM",
                "LOGISTIC_ELASTICNETPARAM_MIN", "LOGISTIC_ELASTICNETPARAM_MAX",
                "LOGISTIC_ELASTICNETPARAM_NUM"
            ],
            "random_forest": [
                "RANDOMFOREST_NUMTREES_MIN", "RANDOMFOREST_NUMTREES_MAX",
                "RANDOMFOREST_NUMTREES_NUM"
            ],
            "linear_svc": [
                "LSVC_REGPARAM_MIN", "LSVC_REGPARAM_MAX", "LSVC_REGPARAM_NUM",
                "LSVC_MAXITER_MIN", "LSVC_MAXITER_MAX", "LSVC_MAXITER_NUM"
            ]
        }

        self.classifier_params = {}
        self.classifier_params_range = {}
        self.data_params = {}
        self.featurizers = []
        self.classifiers = []
        self.train_pipeline = Pipeline()
        self.best_model = {}
        # modelName is not set here; it must be supplied before this
        # featurizer can transform data (see predicate below).
        self.featurizer = DeepImageFeaturizer(inputCol="image",
                                              outputCol="features")
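
# A hedged sketch (an assumption, not from the source): the MIN/MAX/NUM key
# triples in classifier_pset suggest each hyperparameter is expanded into an
# evenly spaced search grid, for example:
import numpy as np

def build_param_grid(config, min_key, max_key, num_key):
    # Expand a (MIN, MAX, NUM) config triple into evenly spaced values.
    return np.linspace(config[min_key], config[max_key],
                       int(config[num_key])).tolist()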
def load_model():
    # Load the previously trained LogisticRegressionModel and rebuild the
    # scoring pipeline around an InceptionV3 featurizer.
    lr_model = LogisticRegressionModel.load('s3a://trainingmodel/lr')
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    p_test = PipelineModel(stages=[featurizer, lr_model])

    return p_test
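
# Minimal usage sketch for load_model(), assuming Spark's built-in image data
# source; the directory path argument is illustrative:
def score_images(spark, image_dir):
    test_df = spark.read.format("image").load(image_dir)
    # The loaded pipeline featurizes the images and applies the classifier.
    return load_model().transform(test_df).select("prediction")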
Example #3
    def predicate(self, featurizer_name, classifier, test_df):
        # Build a scoring pipeline from the named featurizer and an already
        # fitted classifier, then score the test DataFrame.
        featurizer = DeepImageFeaturizer(inputCol="image",
                                         outputCol="features",
                                         modelName=featurizer_name)
        predictor = PipelineModel(stages=[featurizer, classifier])
        predictions = predictor.transform(test_df)
        return predictions
def Pretrain_Model(train_df, max_iter, reg_param, elastic_net_param):
    # Fit an InceptionV3-featurized logistic regression, save the trained
    # LogisticRegressionModel stage to S3, and print its training summary.
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegression(maxIter=max_iter,
                            regParam=reg_param,
                            elasticNetParam=elastic_net_param,
                            labelCol="label")
    p = Pipeline(stages=[featurizer, lr])
    model = p.fit(train_df)
    # The coefficients and training summary live on the fitted
    # LogisticRegressionModel stage, not on the PipelineModel itself.
    lr_model = model.stages[1]
    lr_model.write().overwrite().save('s3a://trainingmodel/lr')

    print("Coefficients: \n" + str(lr_model.coefficientMatrix))
    print("Intercept: " + str(lr_model.interceptVector))

    trainingSummary = lr_model.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # for multiclass, we can inspect metrics on a per-label basis
    print("False positive rate by label:")
    for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))

    print("True positive rate by label:")
    for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
        print("label %d: %s" % (i, rate))

    print("Precision by label:")
    for i, prec in enumerate(trainingSummary.precisionByLabel):
        print("label %d: %s" % (i, prec))

    print("Recall by label:")
    for i, rec in enumerate(trainingSummary.recallByLabel):
        print("label %d: %s" % (i, rec))

    print("F-measure by label:")
    for i, f in enumerate(trainingSummary.fMeasureByLabel()):
        print("label %d: %s" % (i, f))

    accuracy = trainingSummary.accuracy
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    fMeasure = trainingSummary.weightedFMeasure()
    precision = trainingSummary.weightedPrecision
    recall = trainingSummary.weightedRecall
    print(
        "Training Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
        % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision,
           recall))

    return model
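
# Illustrative call (the hyperparameter values are assumptions, chosen to
# match the other examples on this page):
#
#   model = Pretrain_Model(train_df, max_iter=20, reg_param=0.05,
#                          elastic_net_param=0.3)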
Example #5
def saveImageFeatures(images, filePath):
    from sparkdl import DeepImageFeaturizer

    # Build featurizer using DeepImageFeaturizer and the InceptionV3 model
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")

    # Transform images to pull out image (origin, height, width, nChannels, mode, data) and features (udt)
    features = featurizer.transform(images)

    # Push feature information into Parquet file format
    # This might take a few minutes
    dbutils.fs.mkdirs(filePath)

    # Keep only the image origin (source path) alongside the features
    features.select(
        "image.origin",
        "features").coalesce(2).write.mode("overwrite").parquet(filePath)
Example #6
def image_classifier_lr(df,
                        input_col="image",
                        output_col="features",
                        model_name="InceptionV3"):
    featurizer = DeepImageFeaturizer(inputCol=input_col,
                                     outputCol=output_col,
                                     modelName=model_name)
    lr = LogisticRegression(maxIter=10,
                            regParam=0.05,
                            elasticNetParam=0.3,
                            labelCol="label")
    p = Pipeline(stages=[featurizer, lr])
    p_model = p.fit(df)
    return p_model, p_model.transform(df)
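
# Hedged usage sketch (assumes df carries "image" and numeric "label" columns):
#
#   p_model, predictions = image_classifier_lr(df)
#   predictions.select("label", "prediction").show(5)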
Example #7
def transfer_learning():

    # load_images_path_and_shuffle() is assumed to return the training and
    # validation image DataFrames.
    processed_images_train, processed_images_vali = load_images_path_and_shuffle()

    processed_images_train = processed_images_train.repartition(100)
    processed_images_vali = processed_images_vali.repartition(100)

    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
    lr = LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    p = Pipeline(stages=[featurizer, lr])

    transfer_model = p.fit(processed_images_train)

    validated_df = transfer_model.transform(processed_images_vali)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    
    print("Test set accuracy = " + str(evaluator.evaluate(validated_df.select("prediction", "label"))))
Example #8
idf = (spark
       .read
       .format("parquet")
       .load("/images/full_image_df")
      )

display(idf)

# COMMAND ----------

train_df, test_df = idf.randomSplit([0.6, 0.4])

# train logistic regression on features generated by InceptionV3:
from sparkdl import DeepImageFeaturizer
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

# Index string label column
stringIndexer = StringIndexer(inputCol="image_label", outputCol="label")

# Build our logistic regression transformation
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

# Build our ML pipeline: index labels, featurize images, then classify
p = Pipeline(stages=[stringIndexer, featurizer, lr])
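
# COMMAND ----------

# A plausible continuation sketch: fit the pipeline on the training split and
# measure accuracy on the held-out split.
p_model = p.fit(train_df)
predictions = p_model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              metricName="accuracy")
print("Test accuracy = " + str(evaluator.evaluate(predictions)))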
Example #9
    # Derive each image's category from its file path
    udf_categorie = udf(parse_category, StringType())
    df_img = df_img.withColumn('categorie', udf_categorie('path'))

    return df_img


# # Loading

# Load a DataFrame with path, image, and category columns
spark_df = load_data(path)

# # Preprocessing

# In[11]:

from sparkdl import DeepImageFeaturizer

# We'll use ResNet50 for the transformation
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="image_preprocessed",
                                 modelName="ResNet50")
spark_df_preprocessed = featurizer.transform(spark_df).select(
    ['path', 'categorie', 'image_preprocessed'])

# # Saving

# Save the preprocessed data as a Parquet file
spark_df_preprocessed.repartition(16).write.format("parquet").mode(
    'overwrite').save(path_to_save + 'preprocessed_parquet')
Example #10
# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest, tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
del tmpTestDf
del tmpTestRDD
del tmptestX
del csvTestTmp
del csvTestRDD
del csvTest
del finalTestDataFrame

# A fitted OneVsRestModel cannot be refit, so rebuild the estimator to retrain;
# the NaiveBayes base classifier is an assumption based on the saved model name.
ovr = OneVsRest(classifier=NaiveBayes(labelCol="label", featuresCol="features"))
model_nb = ovr.fit(featureVector)
model_nb.write().overwrite().save(imageDir + 'model-naive-bayes-retrained')
print('***re-train complete***')
Example #11
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from sparkdl import DeepImageFeaturizer

spark = SparkSession.builder.appName('Weather Image Classifier - Data Analysis').getOrCreate()

assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
assert spark.version >= '2.2' # make sure we have Spark 2.2+

from sparkdl import readImages

img_dir = "katkam-scaled"

#Read images and Create training & test DataFrames for transfer learning
jobs_df = readImages(img_dir)
jobs_df.show()
df = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3").transform(jobs_df)
df.show()

df = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df).transform(df)
jobs_train, jobs_test = df.randomSplit([0.6, 0.4])

lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="indexedLabel")
p_model = lr.fit(jobs_train)
predictions = p_model.transform(jobs_test)

predictions.select("filePath", "prediction").show(truncate=False)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
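
# A plausible continuation sketch: evaluate accuracy against the indexed
# labels created by the StringIndexer above.
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              metricName="accuracy")
print('Accuracy = ' + str(evaluator.evaluate(predictions)))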
Example #12
# MAGIC #### Prepare a training pipeline

# COMMAND ----------

from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import IndexToString, StringIndexer

# Create a label indexer that will convert string labels to numeric.
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(img_validate)

# Create a featurizer based on a pretrained ResNet50 DNN
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="ResNet50")

# Create a LogisticRegression classifier
classifier = LogisticRegression(labelCol="indexedLabel",
                                featuresCol="features",
                                maxIter=500,
                                regParam=0.06,
                                elasticNetParam=0.06)

# Create a converter that will convert numeric labels back to original labels
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain the components into a pipeline
pipeline = Pipeline(stages=[labelIndexer, featurizer, classifier, labelConverter])
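
# COMMAND ----------

# A plausible continuation sketch; img_train is assumed to be the training
# split prepared earlier in the notebook.
model = pipeline.fit(img_train)
predictions = model.transform(img_validate)
predictions.select("predictedLabel", "label").show(5)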
Example #13
def main():

    # Get or create a Spark session to read the image and JSON data
    spark = SparkSession.builder.appName("yelp").getOrCreate()
    # Read images into a DataFrame, dropping files that fail to load as images
    image_df = spark.read.format("image").option("dropInvalid", True).load(sys.argv[1])
    #image_df.take(1)

    
    # Preprocessing: extract each image's photo id so it can be joined to the JSON labels
    split_col = pyspark.sql.functions.split(image_df['image.origin'], '/')
    image_df = image_df.withColumn('filename1', split_col.getItem(4))
    split_col1 = pyspark.sql.functions.split(image_df['filename1'], '\\.')
    image_df = image_df.withColumn('filename2', split_col1.getItem(0))
    #image_df.take(1)


    # Read the JSON file containing the label info and join it to the image data
    path = sys.argv[2]
    photo_df = spark.read.json(path)
    final_df = image_df.join(photo_df, image_df.filename2 == photo_df.photo_id) \
                       .select([col('image'), col('label')])
    #final_df.take(1)


    # Map the string labels to numeric values
    final_df = final_df.withColumn(
        "label1",
        when(col("label") == 'food', 1)
        .when(col("label") == 'menu', 2)
        .when(col("label") == 'drink', 3)
        .when(col("label") == 'inside', 4)
        .otherwise(5))
    final_df = final_df.selectExpr("image as image", "label1 as label")


    # Split the image dataset into train and test sets
    final_train, final_test = final_df.randomSplit([0.8, 0.2])
    #final_train.show()


    # Apply transfer learning using the InceptionV3 model
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    p = Pipeline(stages=[featurizer, lr])


    # Optional: ParamGridBuilder and cross-validation for hyperparameter tuning
    #paramGrid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.5, 0.1]).addGrid(lr.regParam, [0.1, 0.01]).build()
    #crossval = CrossValidator(estimator=p,estimatorParamMaps=paramGrid,evaluator=MulticlassClassificationEvaluator(),numFolds=4)


    # Fit on the training data and transform the test data
    #cvModel = crossval.fit(final_train)
    #predictions = cvModel.transform(final_test)
    yelp_model = p.fit(final_train)
    predictions = yelp_model.transform(final_test)
    #predictions.select("label1", "prediction").take(1)


    # Select the prediction and label columns and compute classification metrics
    predictionAndLabels = predictions.selectExpr("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    evaluator2 = MulticlassClassificationEvaluator(metricName="weightedPrecision")
    evaluator3 = MulticlassClassificationEvaluator(metricName="weightedRecall")
    evaluator4 = MulticlassClassificationEvaluator(metricName="f1")


    # Print the summary statistics
    print("Summary Stats")
    print("Accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    print("Precision = "+ str(evaluator2.evaluate(predictionAndLabels)))
    print("Recall = "+ str(evaluator3.evaluate(predictionAndLabels)))
    print("F1 Score = " + str(evaluator4.evaluate(predictionAndLabels)))
    spark.stop()