Example No. 1
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 2
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())

            # Convert the incoming RDD records to Rows, then to a DataFrame
            rowRdd = rdd.map(lambda w: Row(title=w[1]))
            wordsDataFrame = spark.createDataFrame(rowRdd)

            # load model pipeline
            model = PipelineModel.load('kmeans')
            prediction = model.transform(wordsDataFrame).select("6_kmeans")
            prediction.show(5)
        except Exception:
            # ignore empty batches and transient errors in this streaming callback
            pass
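For context, a callback with this (time, rdd) signature is normally registered on a DStream via foreachRDD. The wiring below is a minimal sketch, not part of the original example; the socket source, port and record shape are assumptions.

from pyspark.streaming import StreamingContext

# Assumes sc is an existing SparkContext and process() is the callback defined above.
ssc = StreamingContext(sc, 10)
lines = ssc.socketTextStream("localhost", 9999)      # assumed source
records = lines.map(lambda line: ("key", line))      # process() reads the title from w[1]
records.foreachRDD(process)
ssc.start()
ssc.awaitTermination()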
Example No. 3
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import PipelineModel
from flask import Flask, request, jsonify, render_template

sc = SparkContext('local')
sqlContext = SQLContext(sc)
app = Flask(__name__)
model = PipelineModel.load('final_model')


@app.route('/')
def home():
    return render_template('index.html')


@app.route('/predict', methods=['POST', 'GET'])
def predict():
    schema = StructType([ StructField("CONTROL", IntegerType(), False)\
                       ,StructField("ADM_RATE", DoubleType(), True)\
                       ,StructField("ADM_RATE_ALL", DoubleType(), True)\
                       ,StructField("SAT_AVG_ALL", DoubleType(), True)\
                       ,StructField("SATMTMID", DoubleType(), True)\
                       ,StructField("UGDS", DoubleType(), True)\
                       ,StructField("HIGHDEG", IntegerType(), False)\
                       ,StructField("TUITFTE", DoubleType(), True)\
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

model = PipelineModel.load('/FileStore/lrmodel')
newDF = [
    StructField("id", IntegerType(), True),
    StructField("text", StringType(), True),
    StructField("label", DoubleType(), True)]
finalSchema = StructType(fields=newDF)
dataset = sqlContext.read.format('csv').options(header='true', delimiter='|').schema(finalSchema).load('/FileStore/tables/dataset.csv')
dataset = dataset.withColumn("label", dataset["label"].cast(DoubleType()))
dataset = dataset.withColumn("id", dataset["id"].cast(IntegerType()))

result = model.transform(dataset)\
    .select("features", "label", "prediction")
correct = result.where(result["label"] == result["prediction"])
accuracy = correct.count()/dataset.count()
print("Accuracy of model = "+str(accuracy))
Example No. 6
from pyspark.sql import *
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import util

spark = SparkSession.builder.appName('Demo').getOrCreate()

model = PipelineModel.load("gbtregressor.model")


analyzer = SentimentIntensityAnalyzer()
def sentiment_analyze(text, flag):
    vs = analyzer.polarity_scores(text)
    return vs[flag]


def predict(doc):
    tweet = Row(source=doc['source'], 
                retweet_count=doc['retweet_count'], 
                favorite_count=doc['favorite_count'], is_retweet=doc['is_retweet'],
                sentiment_compound=sentiment_analyze(doc['text'], "compound"),
                sentiment_neg=sentiment_analyze(doc['text'], "neg"),
                sentiment_neu=sentiment_analyze(doc['text'], "neu"),
                sentiment_pos=sentiment_analyze(doc['text'], "pos"),
                hour=util.convertUTCtoHourOfDay(doc['created_at']),
                day=util.convertUTCtoDay(doc['created_at']),
                week=util.convertUTCtoWeekNumber(doc['created_at']),
                month=util.convertUTCtoMonth(doc['created_at']),
Example No. 7
from json import loads

from pyspark import SparkConf, SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT

sparkConf = SparkConf().set("spark.app.name", "dotingestion2") \
                        .set("es.nodes", "elasticsearch") \
                        .set("es.port", "9200") \
                        .set("es.mapping.id", "match_seq_num") \
                        .set("es.write.operation", "upsert")

# Load the hero_id conversions
with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}

# Create a spark context with the provided configurations
sc = SparkContext.getOrCreate(conf=sparkConf)
spark = SparkSession(sc)

# Load the Machine Learning model
model = PipelineModel.load("model")


# Convert "dire_lineup" and "radiant_lineup" from array to Vector, and apply the "onehot" function
def convert_heroes_to_lineup(df: DataFrame) -> DataFrame:
    def onehot(heroes: ArrayType):
        lineup = tuple(heroes_dict[hero] for hero in heroes)
        return Vectors.dense([
            1 if hero_slot in lineup else 0
            for hero_slot in range(len(heroes_dict))
        ])

    heros_to_lineup_udf = udf(onehot, VectorUDT())
    return df.withColumn("dire_lineup_vec", heros_to_lineup_udf(df.dire_lineup))\
             .withColumn("radiant_lineup_vec", heros_to_lineup_udf(df.radiant_lineup))
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import PipelineModel

## Note: this is a local Spark instance running in the engine
spark = SparkSession.builder \
      .appName("Flight Predictor") \
      .master("local[*]") \
      .config("spark.driver.memory","4g")\
      .config("spark.hadoop.fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")\
      .config("spark.hadoop.fs.s3a.metadatastore.impl","org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore")\
      .config("spark.hadoop.fs.s3a.delegation.token.binding","")\
      .config("spark.hadoop.yarn.resourcemanager.principal","jfletcher")\
      .getOrCreate()

model = PipelineModel.load(
    "s3a://ml-field/demo/flight-analysis/data/models/lr_model")

from pyspark.sql.types import *

feature_schema = StructType([
    StructField("OP_CARRIER", StringType(), True),
    StructField("ORIGIN", StringType(), True),
    StructField("DEST", StringType(), True),
    StructField("CRS_DEP_TIME", StringType(), True),
    StructField("CRS_ELAPSED_TIME", DoubleType(), True),
    StructField("DISTANCE", DoubleType(), True)
])

from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, substring
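A hedged scoring sketch built on the schema above; the sample row values are invented for illustration and the output column assumes a standard Spark ML prediction column:

single_flight = spark.createDataFrame(
    [("DL", "SFO", "JFK", "0730", 330.0, 2586.0)],   # assumed sample values
    schema=feature_schema)
model.transform(single_flight).select("prediction").show()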
Example No. 9
def save_model(
    spark_model,
    path,
    mlflow_model=None,
    conda_env=None,
    dfs_tmpdir=None,
    sample_input=None,
    signature: ModelSignature = None,
    input_example: ModelInputExample = None,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Save a Spark MLlib Model to a local path.

    By default, this function saves models using the Spark MLlib persistence mechanism.
    Additionally, if a sample input is specified using the ``sample_input`` parameter, the model
    is also serialized in MLeap format and the MLeap flavor is added.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param path: Local path where the model is to be saved.
    :param mlflow_model: MLflow model config this flavor is being added to.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If `None`, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written to this
                       destination and then copied to the requested local path. This is necessary
                       as Spark ML models read from and write to DFS if running on a cluster. All
                       temporary files created on the DFS are removed if this operation
                       completes successfully. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input that is used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.

    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature
                        train = df.drop_column("target_label")
                        predictions = ... # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to feed the
                          model. The given example will be converted to a Pandas DataFrame and then
                          serialized to json using the Pandas split-oriented format. Bytes are
                          base64-encoded.
    :param pip_requirements: {{ pip_requirements }}
    :param extra_pip_requirements: {{ extra_pip_requirements }}

    .. code-block:: python
        :caption: Example

        from mlflow import spark
        from pyspark.ml.pipeline import PipelineModel

        # your pyspark.ml.pipeline.PipelineModel type
        model = ...
        mlflow.spark.save_model(model, "spark-model")
    """
    _validate_model(spark_model)
    _validate_env_arguments(conda_env, pip_requirements,
                            extra_pip_requirements)

    from pyspark.ml import PipelineModel

    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    if mlflow_model is None:
        mlflow_model = Model()
    # Spark ML stores the model on DFS if running on a cluster
    # Save it to a DFS temp dir first and copy it to local path
    if dfs_tmpdir is None:
        dfs_tmpdir = DFS_TMP
    tmp_path = _tmp_path(dfs_tmpdir)
    spark_model.save(tmp_path)
    sparkml_data_path = os.path.abspath(
        os.path.join(path, _SPARK_MODEL_PATH_SUB))
    # We're copying the Spark model from DBFS to the local filesystem if (a) the temporary DFS URI
    # we saved the Spark model to is a DBFS URI ("dbfs:/my-directory"), or (b) if we're running
    # on a Databricks cluster and the URI is schemeless (e.g. looks like a filesystem absolute path
    # like "/my-directory")
    copying_from_dbfs = is_valid_dbfs_uri(tmp_path) or (
        databricks_utils.is_in_cluster()
        and posixpath.abspath(tmp_path) == tmp_path)
    if copying_from_dbfs and databricks_utils.is_dbfs_fuse_available():
        tmp_path_fuse = dbfs_hdfs_uri_to_fuse_path(tmp_path)
        shutil.move(src=tmp_path_fuse, dst=sparkml_data_path)
    else:
        _HadoopFileSystem.copy_to_local_file(tmp_path,
                                             sparkml_data_path,
                                             remove_src=True)
    _save_model_metadata(
        dst_dir=path,
        spark_model=spark_model,
        mlflow_model=mlflow_model,
        sample_input=sample_input,
        conda_env=conda_env,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        extra_pip_requirements=extra_pip_requirements,
    )
Example No. 10
def content_userid(self, file1, file2, input_model, u_id, sim_bus_limit=3):

    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    sparkconf_builder = spark_celery_app.sparkconf_builder
    spark_conf = sparkconf_builder()
    sc = SparkContext.getOrCreate(conf=spark_conf)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

    data = spark.read.parquet(file1)
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])

    similar_businesses_df = spark.createDataFrame([], schema)
    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern=r'\w+',
                                    inputCol='text',
                                    outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)

    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token',
                                        outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol='nonstopwrd',
                                      outputCol='raw_features',
                                      minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="idf_vec")
    word2Vec = Word2Vec(vectorSize=500,
                        minCount=5,
                        inputCol='nonstopwrd',
                        outputCol='word_vec',
                        seed=123)
    #vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
    pipeline = Pipeline(stages=[
        regexTokenizer, stopWordsRemover, countVectorizer, idf, word2Vec
    ])
    #pipeline_model = pipeline.fit(review_df)
    #pipeline_model.write().overwrite().save('content_userid')

    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
    usr_rev_bus = spark.sql(
        'SELECT distinct business_id FROM review where stars >= 3.0 and user_id = "{}"'
        .format(u_id))

    bus_list = [i for i in usr_rev_bus.collect()]

    for b_id in bus_list:
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id[0]][0]
        similar_business_rdd = sc.parallelize(
            (i[0], float(CosineSim(input_vec, i[1])))
            for i in all_business_vecs)
        similar_business_df = spark.createDataFrame(
            similar_business_rdd).withColumnRenamed(
                '_1', 'business_id').withColumnRenamed('_2', 'score').orderBy(
                    "score", ascending=False)
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id[0]).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id[0]))
        # get restaurants similar to the user_id
        result = similar_businesses_df.union(similar_business_df)
    #result.cache()
    # filter out those that have been reviewed before by the user
    d = [i[0] for i in usr_rev_bus.collect()]
    df_1 = result.filter(~(col('business_id').isin(d))).select(
        'business_id', 'score')
    #df_1= result.join(usr_rev_bus, 'business_id', 'left_outer').where(col("usr_rev_bus.business_id").isNull()).select([col('result.business_id'),col('result.score')])
    df_2 = df_1.orderBy("score", ascending=False).limit(sim_bus_limit)
    df_result = df_business.join(df_2, 'business_id',
                                 'right').select('business_id', 'score',
                                                 'name', 'categories',
                                                 'latitude', 'longitude')
    df_result.show()
    df_result = df_result.collect()
    return df_result
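The CosineSim helper called above is not included in this excerpt; a minimal sketch of a compatible implementation, assuming the word_vec entries are pyspark.ml.linalg vectors:

import numpy as np

def CosineSim(vec1, vec2):
    # Cosine similarity between two ml Vectors (0.0 if either has zero norm).
    a = np.asarray(vec1.toArray())
    b = np.asarray(vec2.toArray())
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a.dot(b) / denom) if denom else 0.0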
Example No. 11
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel

sc = SparkContext(appName="MyFirstApp3_Task_task2")
spark = SparkSession(sc)

df_node16 = spark.read.format("parquet").load(
    path="hdfs://namenode:9000/example3/test.parquet")
model_node17 = PipelineModel.load("hdfs://namenode:9000/example3/model/")
df_node18 = model_node17.transform(df_node16)

evaluator_node19 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
score_node19 = evaluator_node19.evaluate(df_node18)
df_node19 = spark.createDataFrame([(score_node19, )], ["score"])

df_node19.write.format("csv").save(
    path="hdfs://namenode:9000/example3/EvalResult3.csv",
    quote="\"",
    header=True,
    sep=",")
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StringIndexerModel
from itertools import chain


spark = SparkSession \
    .builder \
    .appName("Kafka Spark Structured Streaming") \
    .config("spark.master", "local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

model = PipelineModel.load("/user/2618B56/big_data_phd")

print(model)

df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "c.insofe.edu.in:9092") \
  .option("subscribe", "big_data_phd_2618B56") \
  .option("startingOffsets", "earliest") \
  .load()

df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

df.printSchema()
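The excerpt stops before the loaded model is applied to the stream; a hedged continuation sketch, assuming the pipeline expects a single string column (the column name "text" is an assumption):

scored = model.transform(df.selectExpr("CAST(value AS STRING) AS text"))
query = scored.select("prediction") \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()
query.awaitTermination()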
Example No. 13
             array().cast("array<string>")).otherwise(split_text))

    return df


if __name__ == "__main__":
    val_file = "hdfs:///user/pknees/RSC20/val.tsv"
    #train_file = "data/training_sample.tsv"
    val_df = load_file(val_file)

    response_cols = [
        'reply_timestamp', 'retweet_timestamp',
        'retweet_with_comment_timestamp', 'like_timestamp'
    ]

    #pipeline = Pipeline.load("pipeline")
    pipeline = PipelineModel.load(
        "hdfs:///user/e1553958/RecSys/pipeline_logReg")

    # Fit Pipeline and transform df
    val_df = pipeline.transform(val_df)

    get_probability = udf(lambda v: float(v[1]), FloatType())
    for column in response_cols:
        # Write results to file
        val_df = val_df.withColumn(column, get_probability(column + "_proba"))
        val_df.select("tweet_id", "engaging_user_id", column).write.option(
            "header",
            "false").csv("hdfs:///user/e1553958/RecSys/val_result_logReg/" +
                         column)
Example No. 14
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml import PipelineModel
from pyspark.mllib.evaluation import MulticlassMetrics


spark = SparkSession \
    .builder.config("spark.master", "local") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("WARN")

schema = StructType().add(StructField("message", StringType())).add(
    StructField("label", IntegerType()))
df = spark.read.option("mode",
                       "DROPMALFORMED").schema(schema).csv("spam_out.csv")

loaded_model = PipelineModel.load("data/sparkmodel")
schemaPred = StructType().add("message", "string")

rowDf = spark.createDataFrame([
    Row("Winner! You have won a car"),
    Row("I feel bad today"),
    Row("Please call our customer service representative"),
    Row("Your free ringtone is waiting to be collected. Simply text the password"
        )
], schemaPred)

predictions_loaded = loaded_model.transform(rowDf)
print(predictions_loaded)
result = predictions_loaded.select(["message", "probability",
                                    "prediction"]).collect()
Example No. 15
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline, PipelineModel
from collections import namedtuple

sc = SparkContext(master="local[2]", appName="Tweet Streaming App")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 10)
sqlContext = SQLContext(sc)
ssc.checkpoint("file:/home/ubuntu/tweets/checkpoint/")
# ssc.checkpoint("checkpoints/")
tweet_count = 0
fields = ("SentimentText")
Tweet = namedtuple('Tweet', fields)

pipelineFit = PipelineModel.load("logreg1.model")


def getSparkSessionInstance(sparkConf):
    if ("sparkSessionSingletonInstance" not in globals()):
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]


def do_something(time, rdd):
    # print("========= %s =========" % str(time))
    # try:
    # Get the singleton instance of SparkSession
Example No. 16
# Predict With Model
#################
logistic_regression_predictions = logistic_regression_pipeline_model.transform(test_data)

#################
# Evaluate Model
#################
logistic_regression_predictions_selected = logistic_regression_predictions.select(CAT_COLS + CONT_COLS + ["income", "income_str_idx", "prediction", "probability"])
logistic_regression_predictions_selected.show(30)
logistic_regression_predictions_selected.groupby('income').agg({'income': 'count'}).show()
lr_pred = logistic_regression_predictions.select("income_str_idx", "prediction")
lr_accuracy_rate = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print('MODEL RESULTS:')
print("Overall Accuracy: {}".format(lr_accuracy_rate))

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='income_str_idx')
print('{}: {}'.format(evaluator.getMetricName(), evaluator.evaluate(logistic_regression_predictions)))


#################
# Save and Load Model
#################
logistic_regression_pipeline_model.write().overwrite().save('my_logistic_regression_model_2.model')
loaded_lr_model = PipelineModel.load("my_logistic_regression_model_2.model")
more_predictions = loaded_lr_model.transform(test_data)
print('\nLOADED MODEL RESULTS:')
print("Coefficients: " + str(loaded_lr_model.stages[-1].coefficients))
print("Intercept: " + str(loaded_lr_model.stages[-1].intercept))
lr_pred = more_predictions.select("income_str_idx", "prediction")
loaded_accuracy = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print("Overall Accuracy Loaded: {}".format(loaded_accuracy))
Example No. 17
from flask import Flask, jsonify, render_template, request
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel


import json

MASTER = 'local'
APPNAME = 'simple-ml-serving'
MODEL_PATH = 'file:///home/cdsw/cdsw-simple-serving-python/model/spark-model'

spark = SparkSession.builder.master(MASTER).appName(APPNAME).getOrCreate()
model = PipelineModel.load(MODEL_PATH)


def classify(input):
  #target_columns = input.columns + ["prediction"]
  target_columns = ["prediction"]
  return model.transform(input).select(target_columns).collect()

# webapp
app = Flask(__name__)


@app.route('/api/predict', methods=['POST'])
def predict():
  input_df = spark.sparkContext.parallelize([request.json]).toDF()
  output = classify(input_df)
  return jsonify(input=request.json, prediction=output)

@app.route('/')
Example No. 18
# start a kafka consumer session
from kafka.consumer import KafkaConsumer
from kafka.producer import KafkaProducer
consumer = KafkaConsumer(
    "titanic",
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])
producer = KafkaProducer(
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])

testSchema = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket",
    "Fare", "Cabin", "Embarked"
]

pipeline = Pipeline.load("/home/ubuntu/titanic/pipeline")
model = PipelineModel.load("/home/ubuntu/titanic/model")


def getTrain(msg):
    # put passenger info into dataframe
    # print msg
    # combine two lists into list of tuple
    # combined = map(lambda x, y: (x, y), trainSchema, msg)
    msg = [ast.literal_eval(msg)]
    msg[0][0] = float(msg[0][0])
    msg[0][1] = float(msg[0][1])
    msg[0][4] = float(msg[0][4])
    msg[0][5] = float(msg[0][5])
    msg[0][6] = float(msg[0][6])
    msg[0][8] = float(msg[0][8])
    df = spark.createDataFrame(msg, testSchema)
Example No. 19
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import PipelineModel
import sys

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("train_model")\
        .config("spark.debug.maxToStringFields", 1000)\
        .getOrCreate()

    # Python 2 only: reload(sys) and sys.setdefaultencoding are not available on Python 3
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Load model
    model = PipelineModel.load("/home/tupolev4/final_project/model")

    # import the csv file as dataFrame
    Info_Content = spark.read.options(header='True', inferSchema = 'True')\
        .csv("gs://r09922114-bucket/Info_Content.csv")
    Info_UserData = spark.read.options(header='True', inferSchema = 'True')\
        .csv("gs://r09922114-bucket/Info_UserData.csv")
    Log_Problem = spark.read.options(header='True', inferSchema = 'True')\
        .csv("gs://r09922114-bucket/Log_Problem.csv")

    def ith_(v, i):
        try:
            return float(v[i])
        except ValueError:
            return None
Example No. 20
def log_model(
    spark_model,
    artifact_path,
    conda_env=None,
    dfs_tmpdir=None,
    sample_input=None,
    registered_model_name=None,
    signature: ModelSignature = None,
    input_example: ModelInputExample = None,
    await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Log a Spark MLlib model as an MLflow artifact for the current run. This uses the
    MLlib persistence format and produces an MLflow Model with the Spark flavor.

    Note: If no run is active, it will instantiate a run to obtain a run_id.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param artifact_path: Run relative artifact path.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If `None`, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written to this
                       destination and then copied into the model's artifact directory. This is
                       necessary as Spark ML models read from and write to DFS if running on a
                       cluster. If this operation completes successfully, all temporary files
                       created on the DFS are removed. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.
    :param registered_model_name: If given, create a model version under
                                  ``registered_model_name``, also creating a registered model if one
                                  with the given name does not exist.

    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature
                        train = df.drop_column("target_label")
                        predictions = ... # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to feed the
                          model. The given example will be converted to a Pandas DataFrame and then
                          serialized to json using the Pandas split-oriented format. Bytes are
                          base64-encoded.
    :param await_registration_for: Number of seconds to wait for the model version to finish
                            being created and to reach ``READY`` status. By default, the function
                            waits for five minutes. Specify 0 or None to skip waiting.
    :param pip_requirements: {{ pip_requirements }}
    :param extra_pip_requirements: {{ extra_pip_requirements }}

    .. code-block:: python
        :caption: Example

        from pyspark.ml import Pipeline
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.feature import HashingTF, Tokenizer
        training = spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training)
        mlflow.spark.log_model(model, "spark-model")
    """
    from py4j.protocol import Py4JError

    _validate_model(spark_model)
    from pyspark.ml import PipelineModel

    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id
    run_root_artifact_uri = mlflow.get_artifact_uri()
    # If the artifact URI is a local filesystem path, defer to Model.log() to persist the model,
    # since Spark may not be able to write directly to the driver's filesystem. For example,
    # writing to `file:/uri` will write to the local filesystem from each executor, which will
    # be incorrect on multi-node clusters - to avoid such issues we just use the Model.log() path
    # here.
    if is_local_uri(run_root_artifact_uri):
        return Model.log(
            artifact_path=artifact_path,
            flavor=mlflow.spark,
            spark_model=spark_model,
            conda_env=conda_env,
            dfs_tmpdir=dfs_tmpdir,
            sample_input=sample_input,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
            await_registration_for=await_registration_for,
            pip_requirements=pip_requirements,
            extra_pip_requirements=extra_pip_requirements,
        )
    model_dir = os.path.join(run_root_artifact_uri, artifact_path)
    # Try to write directly to the artifact repo via Spark. If this fails, defer to Model.log()
    # to persist the model
    try:
        spark_model.save(posixpath.join(model_dir, _SPARK_MODEL_PATH_SUB))
    except Py4JError:
        return Model.log(
            artifact_path=artifact_path,
            flavor=mlflow.spark,
            spark_model=spark_model,
            conda_env=conda_env,
            dfs_tmpdir=dfs_tmpdir,
            sample_input=sample_input,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
            await_registration_for=await_registration_for,
            pip_requirements=pip_requirements,
            extra_pip_requirements=extra_pip_requirements,
        )

    # Otherwise, override the default model log behavior and save model directly to artifact repo
    mlflow_model = Model(artifact_path=artifact_path, run_id=run_id)
    with TempDir() as tmp:
        tmp_model_metadata_dir = tmp.path()
        _save_model_metadata(
            tmp_model_metadata_dir,
            spark_model,
            mlflow_model,
            sample_input,
            conda_env,
            signature=signature,
            input_example=input_example,
        )
        mlflow.tracking.fluent.log_artifacts(tmp_model_metadata_dir,
                                             artifact_path)
        if registered_model_name is not None:
            mlflow.register_model(
                "runs:/%s/%s" % (run_id, artifact_path),
                registered_model_name,
                await_registration_for,
            )
Example No. 21
paramGrid = ParamGridBuilder().build()#ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01, 0.001, 0.0001]).build()
#lr = LinearRegression()
#paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build()
pipeline_new = Pipeline(stages=[rf])
evaluator = MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1")  #/setMetricName/ "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
#evaluator = RegressionEvaluator(metricName="mae")
crossval = CrossValidator(estimator=pipeline_new, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
model_new_rf = crossval.fit(trainingData)
model_new_rf.bestModel
model_new_rf.bestModel.save('rf_pipeline_model_saved')
model_new_rf.avgMetrics

#loading a saved model
from pyspark.ml import PipelineModel
loadedModel = PipelineModel.load("rf_pipeline_model_saved")


#Checkpointing is a process of truncating RDD lineage graph and saving it to a reliable distributed (HDFS) or local file system.
sc.setCheckpointDir("hdfs://hadoop-master:9000/data/checkpoint")
df = df.repartition(100)
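# A hedged illustration, not in the original snippet: materialize a checkpoint now that
# the directory is set; DataFrame.checkpoint() is eager by default and truncates lineage.
df = df.checkpoint()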


#read / write parquet files
df.write.option("compression","none").save("hdfs://address/folder",format="parquet",mode="overwrite")
spark.read.parquet("hdfs://address/folder")
df.write.option("compression","snappy").parquet("hdfs://address/folder")


# Assign unique ids to the rows of a dataframe (zipWithUniqueId gives unique, but not necessarily consecutive, numbers)
Z = spark.createDataFrame(d.select("colid").distinct().rdd.map(lambda x: x[0]).zipWithUniqueId())
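A hedged aside: when strictly consecutive ids are not required, the built-in column function below avoids the RDD round-trip used above (DataFrame name d taken from the line above):

from pyspark.sql.functions import monotonically_increasing_id

d_with_id = d.select("colid").distinct().withColumn("row_id", monotonically_increasing_id())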
Example No. 22
    return d


if __name__ == "__main__":

    sc = SparkContext()
    sqlContext = SQLContext(sc)

    lr_model = LogisticRegressionModel.load("lrm.model")
    model = NaiveBayesModel.load("model.model")

    key = "00254a08-1426-4547-b54f-bc0137d9d547"
    from_date = "2018-02-01"
    to_date = "2018-02-12"

    url = 'http://content.guardianapis.com/search?from-date=' + from_date + '&to-date=' + to_date + \
          '&order-by=newest&show-fields=all&page-size=200&%20num_per_section=10000&api-key=' + key

    data = get_data(url)
    df = sqlContext.createDataFrame(data, schema=["category", "text"])
    pipeline_fit = PipelineModel.load("pipelining")
    dataset = pipeline_fit.transform(df)

    predictions = lr_model.transform(dataset)
    predictions1 = model.transform(dataset)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    percent = evaluator.evaluate(predictions)
    print("accuracy of lr model is" + str(percent * 100))
    percent = evaluator.evaluate(predictions1)
    print("accuracy of NB model is" + str(percent * 100))
all_input_cols = all_columns[:-1]
print(all_input_cols)
assembler = VectorAssembler(inputCols=all_input_cols, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(spark_df_balanced)
pipelineModel.write().overwrite().save('saves/pipelineModelBalanced')
spark_df_balanced_2 = pipelineModel.transform(spark_df_balanced)
selectedCols = ['Class', 'features'] + all_input_cols
spark_df_balanced_2 = spark_df_balanced_2.select(selectedCols)
# spark_df_balanced_2.printSchema()

from pyspark.ml import PipelineModel

pipelineModelLoaded = PipelineModel.load("saves/pipelineModelBalanced")
spark_df_balanced_2 = pipelineModelLoaded.transform(spark_df_balanced)
selectedCols = ['Class', 'features'] + all_input_cols
spark_df_balanced_2 = spark_df_balanced_2.select(selectedCols)

train, test = spark_df_balanced_2.randomSplit([0.9, 0.1], seed=2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))
# print(test.show(5))

from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
save_path = 'saves/LRBalancedModel'

lr = LogisticRegression(featuresCol='features', labelCol='Class', maxIter=10)
lrModel = lr.fit(train)
lrModel.write().overwrite().save(save_path)
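A hedged follow-up sketch: reload the classifier that was just saved and score the held-out split (variable and column names taken from the code above):

loaded_lr_model = LogisticRegressionModel.load(save_path)
lr_predictions = loaded_lr_model.transform(test)
lr_predictions.select('Class', 'prediction').show(5)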
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
import pandas as pd
from kafka import KafkaConsumer
import sys
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, NaiveBayesModel
from sklearn.metrics import accuracy_score, recall_score, precision_score

sc = SparkContext()
sqlContext = SQLContext(sc)

spark = SparkSession.builder.appName('consumer').getOrCreate()
brokers, topic = sys.argv[1:]
consumer = KafkaConsumer(topic, bootstrap_servers=['localhost:9092'])

pip = PipelineModel.load('/Users/aditya/PycharmProjects/BigDataHW3/pipeline')
model_nb = NaiveBayesModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/nbModel')
model_lr = LogisticRegressionModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/lrModel')

columns = ['actual', 'predicted']
result_df_lr = pd.DataFrame(columns=columns)
result_df_nb = pd.DataFrame(columns=columns)
feed = 0

for msg in consumer:
    article = msg.value
    data = article.split("||")
    label = data[0]
    text = data[1]
Example No. 25
# Split the TAC, SNR and CD out of the IMEI column
# IMEI format: TAC -- Serial_Number (14 digits)

dataset = dataset.withColumn('tac_a', dataset.imei.substr(1, 2))
dataset = dataset.withColumn('tac_b', dataset.imei.substr(3, 6))
dataset = dataset.withColumn('snr', dataset.imei.substr(9, 6))

# Normalize the hour column

dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0) * 6)

# StringIndexer

string_indexer_model_path = "{}/data/stringIndexerModel.bin".format(base_path)
string_indexer = PipelineModel.load(string_indexer_model_path)
dataset = string_indexer.transform(dataset)

# MinMaxScaler

minMaxScaler_output_path = "{}/data/minMaxScalerModel.bin".format(base_path)
minMaxScaler = PipelineModel.load(minMaxScaler_output_path)
dataset = minMaxScaler.transform(dataset)

# VectorAssembler

vector_assembler_output_path = "{}/data/vectorAssemblerModel.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)
Example No. 26
    '''
    Args
    -------
    val_data_file:
        validation data (string ID)
    
    model_file:
        path to the pipeline(stringIndexers + als) model
    '''

    # Read data
    val = spark.read.parquet(val_data_file).drop("__index_level_0__")

    print('Loading the trained model...')
    # Load the trained pipeline model
    model = PipelineModel.load(model_file)

    #########################################################
    #                       Evaluate                        #
    #########################################################
    print("Predicting...")
    # Run the model to create predictions against the validation set
    preds = model.transform(val)
    
    print("Evaluating...")
    # Generate top-N movie recommendations for each user (here N=500), sorted by predicted rating
    # Returns a DataFrame of (userCol, recommendations),
    # where recommendations are stored as an array of (itemCol, rating) Rows.
    perUserPredictions = model.stages[-1].recommendForAllUsers(500)\
                            .selectExpr("userId","recommendations.itemId as items_pred")
    # perUserPredictions.show(5)
    def load_model(self, load_file):

        logging.warning("Loading model from {}".format(load_file))
        self.trigram_model = PipelineModel.load(load_file)
pipeline_model = pipeline.fit(train)

# (5) Use the `save` method to save the pipeline model to the
# `models/pipeline_model` directory in HDFS.

pipeline_model.write().overwrite().save("models/pipeline_model")

# (6) Import the `PipelineModel` class from the `pyspark.ml` package.

from pyspark.ml import PipelineModel

# (7) Use the `load` method of `PipelineModel` class to load the saved pipeline
# model.

pipeline_model_loaded = PipelineModel.load("models/pipeline_model")
                                           
# (8) Apply the loaded pipeline model to the test DataFrame and examine the
# resulting DataFrame.

test_transformed = pipeline_model_loaded.transform(test)
test_transformed.printSchema()
test_transformed.select("features", "label").show(truncate=False)


# ## References

# [Spark Documentation - ML
# Pipelines](http://spark.apache.org/docs/latest/ml-pipeline.html)

# [Spark Python API - pyspark.ml
    .setTextCol("text")\
    .setUrl("https://eastus.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment")\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

#Extract the sentiment score from the API response body
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# MAGIC %md ### Tying it all together
# MAGIC
# MAGIC Now that we have built the stages of our pipeline, it's time to chain them together into a single model that can be used to process batches of incoming data.
# MAGIC
# MAGIC <img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/full_pipe_2.jpg" width="800" style="float: center;"/>

# COMMAND ----------

from mmlspark.stages import SelectColumns
# Select the final columns
cleanupColumns = SelectColumns().setCols(
    ["url", "firstCeleb", "text", "sentimentLabel"])

celebrityQuoteAnalysis = PipelineModel(stages=[
    bingSearch, getUrls, celebs, firstCeleb, recognizeText, getText,
    sentimentTransformer, getSentiment, cleanupColumns
])

celebrityQuoteAnalysis.transform(bingParameters).show(5)
Example No. 30
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)


# load the saved pipeline model
pipeline_load = PipelineModel.load(
    "/Users/chaitanyavarmamudundi/Desktop/pipeLineModel")
predictions = pipeline_load.transform(
    test_set)  #put dataframe for testing here
int(predictions.collect()[-1]['prediction'])  #prediction

#finding the accuracy of the model.
accuracy = predictions.filter(
    predictions.label == predictions.prediction).count() / float(
        test_set.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
roc_auc = evaluator.evaluate(predictions)
print("Accuracy Score: ", accuracy)
print("ROC-AUC: {0:.4f}", roc_auc)

ddf = spark.createDataFrame(df)
def main():
    # Logging purposes
    logging.basicConfig(format=logger_format,
                        datefmt=logger_datefmt,
                        level=logger_level)
    # Parse the arguments from the shellscript
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input_filepath",
        help="Please provide the HDFS filepath to the ... dataset to score",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output_filepath",
        help="Please provide the HDFS filepath for the output",
    )
    parser.add_argument(
        "-c",
        "--config_filepath",
        help="Please provide the filepath for the config file",
    )
    parser.add_argument(
        "-m",
        "--model_save_filepath",
        help="Please provide the HDFS filepath to the scoring model",
        required=True,
    )
    parser.add_argument("--kafka_topic", help="output kafka topic")
    parser.add_argument("--kafka_brokers",
                        help="output kafka broker(s) comma separated")
    flags = parser.parse_args()

    logging.info("EVENT=START flags=%s" % (flags))
    start_s = time.time()

    input_filepath = flags.input_filepath
    output_filepath = flags.output_filepath
    config_filepath = flags.config_filepath
    model_save_filepath = flags.model_save_filepath

    kafka_topic = flags.kafka_topic
    kafka_brokers = flags.kafka_brokers

    if any([kafka_topic, kafka_brokers
            ]) and (not all([kafka_topic, kafka_brokers])):
        logging.error("Must provide both topic and kafka_brokers or none.")
        sys.exit(2)

    if not any([kafka_topic, output_filepath]):
        logging.error("No output/topic specified!")
        sys.exit(2)

    spark = SparkSession.builder.getOrCreate()
    # Register any JAVA functions that are required here #
    register_udfs(spark)
    init(spark)

    # Extract the config values from the config_fp
    with open(config_filepath, "r") as fp:
        cfg = json.load(fp)

    EXCLUDED_RTYPE_LIST = cfg["EXCLUDED_RTYPE_LIST"]
    EXCLUDED_RDATA_LIST = cfg["EXCLUDED_RDATA_LIST"]
    MIN_IP_IP_2LD_COUNT = cfg["MIN_IP_IP_2LD_COUNT"]
    MIN_IP_IP_2LD_UNIQUE_QNAME_PERC = cfg["MIN_IP_IP_2LD_UNIQUE_QNAME_PERC"]
    MIN_IP_IP_2LD_TUNNEL_PERC = cfg["MIN_IP_IP_2LD_TUNNEL_PERC"]

    # Seems that I can't put this above, it has to be after the SparkSession is
    # created
    EXCLUDED_RTYPE_ARRAY = f.array([f.lit(x) for x in EXCLUDED_RTYPE_LIST])
    EXCLUDED_RDATA_ARRAY = f.array([f.lit(x) for x in EXCLUDED_RDATA_LIST])

    # Read in the dataset from the filepath
    df = spark.read.parquet(input_filepath)

    filtered_df = filter_dataframe(df, MIN_IP_IP_2LD_COUNT)
    filtered_df = filtered_df.withColumn("lld", f.col("2ld"))
    filtered_df = filtered_df.withColumn("RDATA",
                                         f.col("payload.answers.rdata"))

    #concat_RDATA_df = concat_RDATA(
    #    filtered_df, EXCLUDED_RTYPE_ARRAY, EXCLUDED_RDATA_ARRAY
    #)

    #features_df = generate_features(concat_RDATA_df)
    features_df = generate_features(filtered_df)
    features_df = features_df.fillna(0)

    logging.info("EVENT=Features have been generated")

    # Load the Model #
    model = PipelineModel.load(model_save_filepath)
    logging.info("EVENT=Model loaded %s" % (model))

    # make prediction
    logging.info("EVENT=Scoring on dataset")
    pred = model.transform(features_df)
    pred = pred.cache()
    # pred_cnt = pred.count()
    # logging.info("COUNT=%d rows were scored" % (pred_cnt))

    pred = pred.withColumn(
        "label_str",
        f.udf(lambda x: inv_ans_mapping[x],
              t.StringType())(f.col("prediction")),
    )

    # Get srcIP-destIP-2ld which have any non-normal traffic
    possible_tunnel_ip_ip_2ld_df = (
        pred.filter("label_str != 'normal'").select("srcIP", "destIP",
                                                    "2ld").distinct())
    # Collect all the data from the srcIP-destIP-2ld which have any non-normal
    # traffic
    possible_tunnel_data_df = pred.join(possible_tunnel_ip_ip_2ld_df,
                                        ["srcIP", "destIP", "2ld"],
                                        how="inner")
    possible_tunnel_data_df = possible_tunnel_data_df.withColumn(
        "normal",
        f.when(f.col("label_str") == "normal", 1).otherwise(0))
    possible_tunnel_data_df = possible_tunnel_data_df.withColumn(
        "tunnel",
        f.when(f.col("label_str") != "normal", 1).otherwise(0))

    # Groupby srcIP-destIP-2ld and aggregate the following information:
    # Count of normal traffic
    # Count of tunnel traffic
    # Count of unique QNAME
    # Percentage of tunnel traffic
    # Percentage of unique QNAME
    grpby_pos_tun_ip_ip_2ld_df = possible_tunnel_data_df.groupby([
        "srcIP", "destIP", "2ld"
    ]).agg(
        #f.max("interval_time").alias("max_interval_time"),
        #f.avg("interval_time").alias("average_interval_time"),
        f.countDistinct("QNAME").alias("unique_QNAME_count"),
        f.count("2ld").alias("2ld_count"),
        f.sum("normal").alias("normal_count"),
        f.sum("tunnel").alias("tunnel_count"),
    )
    grpby_pos_tun_ip_ip_2ld_df = grpby_pos_tun_ip_ip_2ld_df.withColumn(
        "unique_QNAME_perc",
        f.col("unique_QNAME_count") / f.col("2ld_count"))
    grpby_pos_tun_ip_ip_2ld_df = grpby_pos_tun_ip_ip_2ld_df.withColumn(
        "tunnel_perc",
        f.col("tunnel_count") / f.col("2ld_count"))
    grpby_pos_tun_ip_ip_2ld_df = grpby_pos_tun_ip_ip_2ld_df.cache()
    gptii_cnt = grpby_pos_tun_ip_ip_2ld_df.count()
    logging.info("EVENT=COUNT %d ip-ip-2ld(s) are possibly \
                 tunnelling" % (gptii_cnt))

    # Filter based on Percentage of tunnel traffic & Percentage of unique QNAME
    tunnel_ip_ip_2ld_df = grpby_pos_tun_ip_ip_2ld_df.filter(
        (f.col("unique_QNAME_perc") >= MIN_IP_IP_2LD_UNIQUE_QNAME_PERC)
        & (f.col("tunnel_perc") >= MIN_IP_IP_2LD_TUNNEL_PERC))
    tunnel_ip_ip_2ld_df = tunnel_ip_ip_2ld_df.cache()
    tii2_cnt = tunnel_ip_ip_2ld_df.count()
    logging.info("EVENT=COUNT %d ip-ip-2ld(s) are detected as tunnelling \
                 based on the additional thresholds" % (tii2_cnt))

    # Add in the alert_id for each ip-ip-2ld tuple
    tunnel_ip_ip_2ld_df = tunnel_ip_ip_2ld_df.selectExpr(
        "*",
        "-1 as alert_id"  #"generateId() as alert_id"
    )

    tunnel_traffic_df = possible_tunnel_data_df.join(
        tunnel_ip_ip_2ld_df, ["srcIP", "destIP", "2ld"], how="inner")

    #TODO
    final_output_traffic = tunnel_traffic_df.select("key", "payload")
    final_output_traffic.write.parquet(output_filepath)
    """
Example No. 32
## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. If HDFS & Hadoop are available, this call stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")




##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold = 0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"]) # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction_output.p1").first()["p1"] > hamThreshold


print(isSpam("Michal, h2oworld party tonight in MV?", loaded_model))
Example No. 33
def main():
    # Build the SparkSession
    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Income Model") \
        .config("spark.executor.memory", "1gb") \
        .getOrCreate()
    # note: you might need to add export SPARK_LOCAL_IP=127.0.0.1

    # Load in the data. For the sake of time, this dataset is extremely small.
    # NOTE: In this case, the schema is being inferred. Most other times, you would specify your schema.
    df = spark.read.csv("dataset.csv", header=True, inferSchema=True)
    logging.info('Observing the raw data schema:')
    df.printSchema()
    logging.info('Observing a snippet of the raw data:')
    df.show()

    census_model = CensusModel(df)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol='income_str_idx')
    training_set, test_set = census_model.create_test_and_train()

    # logistic regression
    logger.info('LOGISTIC REGRESSION')
    lr = LogisticRegression(labelCol="income_str_idx", featuresCol="features")
    lr_pipeline = census_model.build_pipeline_single_estimator(lr)
    lr_model = census_model.train_model(training_set, lr_pipeline)
    lr_predictions = census_model.fit_model(test_set, lr_model)
    census_model.evaluate_model(lr_predictions, evaluator)

    # random forest
    logger.info('RANDOM FOREST')
    rf = RandomForestClassifier(labelCol="income_str_idx",
                                featuresCol="features")
    rf_pipeline = census_model.build_pipeline_single_estimator(rf)
    rf_model = census_model.train_model(training_set, rf_pipeline)
    rf_predictions = census_model.fit_model(test_set, rf_model)
    census_model.evaluate_model(rf_predictions, evaluator)

    # comparing
    print('\nLOGISTIC REGRESSION RESULTS')
    census_model.evaluate_model(lr_predictions, evaluator)
    print('\nRANDOM FOREST RESULTS')
    census_model.evaluate_model(rf_predictions, evaluator)

    # save and load
    lr_model.write().overwrite().save('my_logistic_regression_model.model')
    rf_model.write().overwrite().save('my_random_forest_model.model')
    lr_model_loaded = PipelineModel.load("my_logistic_regression_model.model")
    rf_model_loaded = PipelineModel.load("my_random_forest_model.model")
    # du - hd1
    lr_predictions_loaded = census_model.fit_model(test_set,
                                                   lr_model_loaded,
                                                   show_snippet=False)
    rf_predictions_loaded = census_model.fit_model(test_set,
                                                   rf_model_loaded,
                                                   show_snippet=False)
    print('\nLOADED MODEL LOGISTIC REGRESSION RESULTS')
    census_model.evaluate_model(lr_predictions_loaded, evaluator)
    print('\nLOADED MODEL RANDOM FOREST RESULTS')
    census_model.evaluate_model(rf_predictions_loaded, evaluator)
Example No. 34
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

from rocket_pyspark_ml.simple_custom_transformer import LiteralColumnAdder

spark = SparkSession.builder \
    .master("local") \
    .appName("test") \
    .getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples.
df = spark.createDataFrame([(0, "a b c d e spark", 1.0), (1, "b d", 0.0),
                            (2, "spark f g h", 1.0),
                            (3, "hadoop mapreduce", 3.0)],
                           ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="features",
                      numFeatures=1000)
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Custom transformer
custom = LiteralColumnAdder()

sub_pipeline = Pipeline(stages=[custom, tokenizer, hashingTF, lr])
model = sub_pipeline.fit(df)

model.write().overwrite().save("/tmp/my_custom_model")

loaded_model = PipelineModel.load("/tmp/my_custom_model")

loaded_model.transform(df).show()
Example No. 35
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark import SparkContext,SparkConf
import shutil

#refer https://github.com/tthustla/setiment_analysis_pyspark/blob/master/Sentiment%20Analysis%20with%20PySpark.ipynb

conf = SparkConf().setAll([('spark.executor.memory', '2g'), ('spark.executor.cores', '1'), ('spark.cores.max', '3'), ('spark.driver.memory','2g')])
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../sentiment_training_pipeline/sentiment_test.csv')
# first look at data
#print(type(df))
print(df.count())
print(df.show(5))
df.printSchema()



modelPath = '../sentiment_training_pipeline/output/tfidf_logistic_pipelineModel'



# step_7 Load the PipelineModel
loadedPipelineModel = PipelineModel.load(modelPath)
test_reloadedModel = loadedPipelineModel.transform(df)
test_reloadedModel.show(5)
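BinaryClassificationEvaluator is imported above but never used in this excerpt; a hedged sketch of scoring the reloaded pipeline, assuming the test CSV carries a numeric label column named "label" and the final stage emits "rawPrediction":

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
print("areaUnderROC: " + str(evaluator.evaluate(test_reloadedModel)))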