Exemplo n.º 1
0
def transform_input(input_text):
    """Turn one piece of text into a single-row DataFrame run through the pipeline.

    The text is lower-cased, trimmed, stripped of every character that is not
    an ASCII letter, digit or space, tokenized, and cleared of empty tokens.
    The result is then passed through a ``PipelineModel`` made of a stop-word
    remover, the externally defined ``w2v_model`` and a ``Normalizer`` reading
    the ``w2v`` column (presumably produced by ``w2v_model`` -- verify).

    Args:
        input_text: The raw text to transform.

    Returns:
        The DataFrame produced by the final pipeline transform.
    """
    single_row = spark.createDataFrame([(input_text, )], ['text'])

    def _strip_punctuation(raw):
        # Normalise case/whitespace first, then drop anything outside
        # [0-9a-zA-Z ].
        return re.sub('[^0-9a-zA-Z ]', '', raw.lower().strip())

    def _drop_empty_tokens(tokens):
        # Empty strings in the token array are discarded here.
        return [tok for tok in tokens if tok != '']

    strip_punct_udf = udf(_strip_punctuation, StringType())
    drop_empty_udf = udf(_drop_empty_tokens, ArrayType(StringType()))

    cleaned = single_row.withColumn('text_noPunct', strip_punct_udf('text'))
    tokenized = Tokenizer(inputCol='text_noPunct',
                          outputCol='token_text').transform(cleaned)
    tokenized = tokenized.withColumn('token_text',
                                     drop_empty_udf('token_text'))

    stages = (
        StopWordsRemover(inputCol='token_text', outputCol='stop_token'),
        w2v_model,
        Normalizer(inputCol='w2v', outputCol='w2v_norm'),
    )
    return PipelineModel(stages=stages).transform(tokenized)
Exemplo n.º 2
0
def process(rdd):
    """Score one batch of records with the saved pipeline and print metrics.

    Each record is expected to be a pair whose second element is a string;
    the third ``:``-separated field of that string is split on ``,`` into
    four numeric features and a label (the first feature drops its leading
    character and the label drops its last four characters before use).

    Args:
        rdd: RDD of records for the current batch. Empty batches are skipped.

    Returns:
        None. The parsed features, the scored rows and four evaluation
        metrics are printed.
    """
    # Schema inference in createDataFrame fails on an empty RDD, so an empty
    # batch is skipped instead of raising.
    if rdd.isEmpty():
        return

    spark = getSparkSessionInstance(rdd.context.getConf())

    payloads = rdd.map(lambda x: x[1])
    featuresdata = payloads.map(lambda x: x.split(':')[2])
    actualdata = featuresdata.map(lambda x: x.split(','))
    rowRdd = actualdata.map(lambda x: Row(sl=float(x[0][1:]), sw=float(x[1]), pl=float(x[2]), pw=float(x[3]), stringlabel=x[4][:-4]))
    features = spark.createDataFrame(rowRdd)
    features.show()

    # `load` is a classmethod: no placeholder stages are needed to restore
    # the fitted pipeline from storage.
    pipe = PipelineModel.load('gs://suryasuresh/lab8output')

    result = pipe.transform(features)

    f1score = MulticlassClassificationEvaluator(metricName='f1')
    precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
    recall = MulticlassClassificationEvaluator(metricName='weightedRecall')
    accuracy = MulticlassClassificationEvaluator(metricName='accuracy')

    # A Spark DataFrame has no `.values` attribute (attribute access resolves
    # column names only), so the scored rows are displayed with show().
    result.show()
    print("Accuracy:\t",accuracy.evaluate(result),"\nF1score:\t",f1score.evaluate(result),"\nWeighted Recall:\t",recall.evaluate(result),"\nWeighted Precision:\t",precision.evaluate(result))
Exemplo n.º 3
0
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    """Prepare *df*, score it with *ml_model* and convert it for Kafka.

    Args:
        df: Input DataFrame.
        ml_model: Fitted pipeline used for scoring; defaults to the
            module-level ``model``.

    Returns:
        The output of ``convert_types_for_kafka`` applied to the scored data.
    """
    prepared = convert_heroes_to_lineup(convert_types_for_ml(df))
    scored = ml_model.transform(prepared)
    return convert_types_for_kafka(scored)
Exemplo n.º 4
0
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    """Prepare *df*, score it with *ml_model* and keep the columns to export.

    Args:
        df: Input DataFrame.
        ml_model: Fitted pipeline used for scoring; defaults to the
            module-level ``model``.

    Returns:
        A DataFrame with only ``probability_arr``,
        ``radiant_win_prediction`` and ``match_seq_num``, taken from the
        output of ``convert_types_for_es``.
    """
    output_columns = ("probability_arr", "radiant_win_prediction",
                      "match_seq_num")
    prepared = convert_heroes_to_lineup(convert_types_for_ml(df))
    converted = convert_types_for_es(ml_model.transform(prepared))
    return converted.select(*output_columns)
Exemplo n.º 5
0
    def predicate(self, featurizer_name, classifier, test_df):
        """Featurize *test_df* images and apply *classifier* to them.

        Args:
            featurizer_name: Value passed as ``modelName`` to
                ``DeepImageFeaturizer``.
            classifier: Stage placed after the featurizer in the pipeline;
                it is used as-is, without fitting.
            test_df: DataFrame with an ``image`` column.

        Returns:
            The DataFrame returned by the two-stage pipeline's ``transform``.
        """
        stages = [
            DeepImageFeaturizer(inputCol="image",
                                outputCol="features",
                                modelName=featurizer_name),
            classifier,
        ]
        return PipelineModel(stages=stages).transform(test_df)
Exemplo n.º 6
0
class Pipe(Transformer):
    """Transformer that applies a fixed list of transformers in sequence.

    The stages are wrapped in a ``PipelineModel`` at construction time and
    ``_transform`` delegates to it; no condition is evaluated in this class.
    """

    def __init__(self, stages: List[Transformer]):
        """Wrap *stages* in a ``PipelineModel`` for later use."""
        super().__init__()
        self._pipeline = PipelineModel(stages=stages)

    def _transform(self, dataset: DataFrame) -> DataFrame:
        """Return *dataset* after it has passed through every wrapped stage."""
        transformed = self._pipeline.transform(dataset)
        return transformed
Exemplo n.º 7
0
def get_predictions():
    """Score the test data with the best stored model and shape the output.

    The model name comes from ``get_best_model`` and the model itself from
    ``get_model``. The pipeline pieces returned by ``rebuild_pipeline`` are
    combined with a PCA stage fitted on the assembled test data, and the
    resulting ``PipelineModel`` is applied to the test DataFrame.

    Returns:
        tuple: ``(df_pred, s3_name)`` where ``df_pred`` holds the columns
        ``flight_number_reporting_airline``, ``prediction``, ``distance``,
        ``s3_name`` and ``fecha`` (those present in the scored data), and
        ``s3_name`` is the model identifier that was used.
    """
    s3_name = get_best_model()
    model = get_model(s3_name)

    df = get_data(test=True)
    df, stage_pca, first_stages = rebuild_pipeline(s3_name, df)
    print("Modelo evaluado: ", model, "con params: ", model.explainParams())

    # The PCA stage is fitted on the output of the first stages.
    df_assem = first_stages.transform(df)
    model_pca = stage_pca.fit(df_assem)

    # Assemble the already-fitted pieces into one pipeline.
    pipeline = PipelineModel(stages=[first_stages, model_pca, model])
    prediction = pipeline.transform(df)

    vars_pred = [
        'dayofmonth', 'prediction', 'distance',
        'flight_number_reporting_airline'
    ]
    df_pred = prediction.select(
        [c for c in prediction.columns if c in vars_pred])
    df_pred = df_pred.withColumn('s3_name', lit(s3_name))

    # Zero-pad single-digit days. The threshold must be 10 so that day 9 is
    # padded too; with `< 9` it produced "2020029" instead of "20200209".
    df_pred = df_pred.withColumn(
        'auxi',
        f.when(f.col('dayofmonth') < 10, "0").otherwise(""))

    # Build the date string as year + month + padded day.
    df_pred = df_pred.withColumn(
        'fecha', concat(lit("2020"), lit("02"), col('auxi'),
                        col('dayofmonth')))
    vars_pred = [
        'flight_number_reporting_airline', 'prediction', 'distance', 's3_name',
        'fecha'
    ]
    # The helper columns `dayofmonth` and `auxi` are dropped here.
    df_pred = df_pred.select([c for c in df_pred.columns if c in vars_pred])

    return df_pred, s3_name
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Keep every existing column and add the `sentiment` field of the first
# element of the `sentiment` column under the alias `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Tying it all together
# MAGIC
# MAGIC Now that we have built the stages of our pipeline its time to chain them together into a single model that can be used to process batches of incoming data
# MAGIC
# MAGIC <img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/full_pipe_2.jpg" width="800" style="float: center;"/>

# COMMAND ----------

from mmlspark.stages import SelectColumns
# Select the final columns
cleanupColumns = SelectColumns().setCols(
    ["url", "firstCeleb", "text", "sentimentLabel"])

# Chain all stages, in order, into one model. The stages are used as-is;
# nothing is fitted here. Most of them are defined outside this excerpt.
celebrityQuoteAnalysis = PipelineModel(stages=[
    bingSearch, getUrls, celebs, firstCeleb, recognizeText, getText,
    sentimentTransformer, getSentiment, cleanupColumns
])

# Run the whole pipeline on `bingParameters` and print the first five rows.
celebrityQuoteAnalysis.transform(bingParameters).show(5)
Exemplo n.º 9
0
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import sparkdl as dl
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
# Spark application named "image_testset", submitted with master "yarn".
conf = SparkConf().setAppName("image_testset").setMaster("yarn")
sc = SparkContext(conf=conf)
# `sql_sc` is not referenced again in this snippet.
# NOTE(review): presumably created so DataFrame APIs are available -- confirm.
sql_sc = SQLContext(sc)

# Load the previously saved logistic-regression model from HDFS.
lr_test = LogisticRegressionModel.load('hdfs:///lr')
# Featurizer that reads the "image" column and writes "features".
featurizer_test = dl.DeepImageFeaturizer(inputCol="image",
                                         outputCol="features",
                                         modelName="InceptionV3")
# Combine featurizer and loaded model; both are used without fitting.
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])
image_path = "hdfs:///project_data/pets/test_images/"
image_DF = dl.readImages(image_path)
image_DF.show(10)
tested_lr_test = p_lr_test.transform(image_DF)
# Show a sample of the scored rows (no replacement, fraction 0.1).
tested_lr_test.sample(False, 0.1).show()
    .setTextCol("checkin_comment")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Keep every existing column and add the `sentiment` field of the first
# element of the `sentiment` column under the alias `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# Two-stage model: the sentiment transformer followed by the SQL projection
# above. Despite the variable name, it is applied to the `facts` table.
celebrityQuoteAnalysis = PipelineModel(
    stages=[sentimentTransformer, getSentiment])

display(celebrityQuoteAnalysis.transform(spark.table('facts')))

# COMMAND ----------

# Reads the "description" column and writes the response to "keyPhrases".
# NOTE(review): this uses the TextSentiment class but points it at the
# keyPhrases endpoint -- confirm that this combination is intended.
keyPhrasesTransformer = TextSentiment()\
    .setTextCol("description")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/keyPhrases".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("keyPhrases")

# Disabled sentiment-label projection, left over from the sentiment cell:
# getSentiment = SQLTransformer(statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__")

# COMMAND ----------

# Single-stage model wrapping the key-phrase transformer.
keyPhrasesAnalysis = PipelineModel(stages=[keyPhrasesTransformer])
Exemplo n.º 11
0
# Reads the "checkin_comment" column and writes the response to "sentiment".
sentimentTransformer = TextSentiment()\
    .setTextCol("checkin_comment")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Sentiment-label projection, disabled because the raw response is captured:
# getSentiment = SQLTransformer(statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__")

# COMMAND ----------

# Single-stage model wrapping the sentiment transformer.
commentSentimentAnalysis = PipelineModel(stages=[sentimentTransformer])
# Only the comment text and its id are sent through the transformer.
df_checkin_comments = df.select(df.checkin_comment, df.checkin_id)
# The comment text is dropped again once it has been scored.
df_sentiment = commentSentimentAnalysis.transform(df_checkin_comments).drop(
    col('checkin_comment'))
# The second remaining column is renamed to 'TextSentiment_error'.
# NOTE(review): this relies on column position; presumably index 1 is the
# transformer's error column -- verify against the actual schema.
error_column_name = df_sentiment.columns[1]
df_sentiment_renamed = df_sentiment.withColumnRenamed(error_column_name,
                                                      'TextSentiment_error')
# `df` is rebound to the sentiment result from here on.
df = df_sentiment_renamed

# COMMAND ----------

# Output locations below `base_path`; the raw JSON path is partitioned by the
# year, month and day of `date`.
sentiment_raw_path = base_path + 'raw/sentiment/{}/{}/{}/untappd.json'.format(
    date.year, date.month, date.day)
sentiment_raw_delta_path = base_path + 'raw/sentiment/delta'
sentiment_query_path = base_path + 'query/sentiment'

# COMMAND ----------

# Disabled append of the renamed result to the raw delta path:
# df_sentiment_renamed.write.format('delta').mode("append").save(sentiment_raw_delta_path)
Exemplo n.º 12
0
    b_df = dl.readImages(img_dir + "/b" + m).withColumn("label", lit(1))
    m_df = dl.readImages(img_dir + "/m" + m).withColumn("label", lit(0))

    #Splitting the data into training and test in the ratio 80% & 20%
    trainb, testb = b_df.randomSplit([80.00, 20.00], seed=42)
    trainm, testm = m_df.randomSplit([80.00, 20.00], seed=42)

    # Combine the benign and malignant datasets for training and testing
    trainDF = trainb.unionAll(trainm)
    testDF = testb.unionAll(testm)

    lr_test = LogisticRegressionModel.load('./test-' + m)

    # Use a featurizer to use trained features from an existing model
    featurizer_test = dl.DeepImageFeaturizer(inputCol="image",
                                             outputCol="features",
                                             modelName="InceptionV3")

    # Setup a pipeline
    p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

    # Test and evaluate
    tested_lr_test = p_lr_test.transform(testDF)
    evaluator_lr_test = MulticlassClassificationEvaluator(
        metricName="accuracy")
    print("Logistic Regression Model: Test set accuracy = " + str(
        evaluator_lr_test.evaluate(tested_lr_test.select(
            "prediction", "label"))))

    tested_lr_test.select("label", "probability", "prediction").show(20, False)
Exemplo n.º 13
0
def score_model(data: pyspark.sql.DataFrame,
                model: PipelineModel) -> pyspark.sql.DataFrame:
    """Return *data* after it has been transformed by the fitted *model*."""
    return model.transform(data)
# The text API key is taken from `cognitive_key`, defined outside this excerpt.
TEXT_API_KEY = cognitive_key
# Keys for other services, currently disabled:
# VISION_API_KEY        = os.environ["VISION_API_KEY"]
# BING_IMAGE_SEARCH_KEY = os.environ["BING_IMAGE_SEARCH_KEY"]

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT * from facts

# COMMAND ----------

# Reads the "checkin_comment" column and writes the response to "sentiment".
sentimentTransformer = TextSentiment()\
    .setTextCol("checkin_comment")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Keep every existing column and add the `sentiment` field of the first
# element of the `sentiment` column under the alias `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# Two-stage model: the sentiment transformer followed by the SQL projection
# above. Despite the variable name, it is applied to the `facts` table.
celebrityQuoteAnalysis = PipelineModel(
    stages=[sentimentTransformer, getSentiment])

display(celebrityQuoteAnalysis.transform(spark.table('facts')))

# COMMAND ----------
            img_rescaled = resizeimage.resize_cover(new_im, [width, width])
            img_rescaled.save("{}/rescaled/{}".format(root, img))


if __name__ == "__main__":
    # Predict a label for every image under ./media and write the results to
    # predict_output.txt (one "<file name> = <label>" line per image).
    sc = SparkContext()
    # The first element of the pickled object is indexed by the integer
    # prediction to obtain the printed label.
    img_dic = joblib.load("dictionary.pkl")[0]
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegressionModel.load('./lrModel')
    p_model = PipelineModel(stages=[featurizer, lr])

    directory = "./media"
    rescaled_dir = "{}/rescaled".format(directory)

    rescale_image(directory, rescaled_dir)

    try:
        temp_df = ImageSchema.readImages(rescaled_dir)
        df = p_model.transform(temp_df)
        # Collect before touching the output file so that a failed prediction
        # run does not wipe the previous results.
        rows = df.select(['image', 'prediction']).collect()

        # "w" creates the file when it is missing and truncates it otherwise;
        # the previous "r+" mode failed if the file did not already exist.
        with open("predict_output.txt", "w") as f:
            for i in rows:
                line = "{} = {}".format(i[0][0].split('/')[-1],
                                        img_dic[int(i[1])])
                print(line)
                f.write(line + "\n")
    finally:
        # Remove the temporary rescaled copies even when prediction fails.
        shutil.rmtree(rescaled_dir)

    # spark-submit --packages databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 predict.py
Exemplo n.º 16
0
    def run_pipeline(df: pyspark.sql.DataFrame,
                     pipeline: PipelineModel) -> int:
        """Count rows whose ``imputed_age`` is null after *pipeline* runs.

        Args:
            df: Input DataFrame.
            pipeline: Fitted pipeline applied to *df*.

        Returns:
            Number of transformed rows with a null ``imputed_age``.
        """
        transformed = pipeline.transform(df)
        age_missing = transformed["imputed_age"].isNull()
        return transformed.where(age_missing).count()
# Preview the training data before fitting.
trainingDF.show(10)
print("show over")
# Stage 1: featurizer reading "image" and writing "features".
vectorizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')
# Stage 2: logistic regression on the "features" column with "label" as target.
logreg = LogisticRegression(maxIter=30,regParam=0.05, elasticNetParam=0.3, labelCol = "label", featuresCol="features")
pipeline = Pipeline(stages=[vectorizer, logreg])

pipeline_model = pipeline.fit(trainingDF)
# `lrModel` is only an alias for the fitted pipeline.
lrModel = pipeline_model
# Persist just the second fitted stage (the logistic-regression model),
# overwriting anything already stored at that path.
lrModel.stages[1].write().overwrite().save('hdfs:///lr')

from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Reload the logistic-regression model saved at 'hdfs:///lr'.
lr_test = LogisticRegressionModel.load('hdfs:///lr')

# Use a featurizer to use trained features from an existing model
featurizer_test = dl.DeepImageFeaturizer(inputCol = "image", outputCol = "features", modelName = "InceptionV3")

# Combine featurizer and loaded model; both are used without fitting.
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

# Score the validation data and report accuracy on prediction vs. label.
tested_lr_test = p_lr_test.transform(validationDF)
evaluator_lr_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Logistic Regression Model: Test set accuracy = " + str(evaluator_lr_test.evaluate(tested_lr_test.select("prediction", "label"))))

# Show the first ten scored rows.
tested_lr_test.select("label", "probability", "prediction").show(10)