def func(rd):
    rd = rd.map(lambda l: l.strip('"')).map(lambda l: l.split("||")).map(
        lambda l: (int(l[0]), l[1]))
    sqlContext = SQLContext(sc)
    test_data = sqlContext.createDataFrame(rd, schema=["label", "text"])

    data = sc.textFile("/home/asdf/Documents/news", 1)
    data = data.map(lambda l: l.strip('"')).map(lambda l: l.split("||")).map(
        lambda l: (int(l[0]), l[1]))

    df = sqlContext.createDataFrame(data, schema=["label", "text"])

    lr = Pipeline.load("/home/asdf/Documents/models/lr")
    nb = Pipeline.load("/home/asdf/Documents/models/nb")

    lr_pred = lr.fit(df).transform(test_data)
    nb_pred = nb.fit(df).transform(test_data)
    accuracy = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                 metricName="accuracy")
    recall = MulticlassClassificationEvaluator(predictionCol="prediction",
                                               metricName="weightedRecall")
    precision = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  metricName="weightedPrecision")

    with open("/home/asdf/Documents/op1", 'a') as file:
        file.write("Logistic Regression:\n")
        file.write("Accuracy:" + str(accuracy.evaluate(lr_pred) * 100))
        file.write("\nRecall:" + str(recall.evaluate(lr_pred) * 100))
        file.write("\nPrecision:" + str(precision.evaluate(lr_pred) * 100))
        file.write('\n')

        file.write("\nNaive Bayes:\n")
        file.write("Accuracy:" + str(accuracy.evaluate(nb_pred) * 100))
        file.write("\nRecall:" + str(recall.evaluate(nb_pred) * 100))
        file.write("\nPrecision:" + str(precision.evaluate(nb_pred) * 100))
Example No. 2
    def test_h2o_mojo_model_serialization_in_pipeline(self):
        mojo = H2OMOJOModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)

        pipeline = Pipeline(stages=[mojo])

        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))
        loaded_pipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))

        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
        PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
Example No. 3
def testPipelineWithTargetEncoderIsSerializable():
    targetEncoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123)
    gbm = H2OGBM() \
        .setLabelCol("CAPSULE") \
        .setFeaturesCols(targetEncoder.getOutputCols())
    pipeline = Pipeline(stages=[targetEncoder, gbm])
    path = "file://" + os.path.abspath(
        "build/testPipelineWithTargetEncoderIsSerializable")
    pipeline.write().overwrite().save(path)
    loadedPipeline = Pipeline.load(path)
    [loadedTargetEncoder, loadedGbm] = loadedPipeline.getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder,
                                                  loadedTargetEncoder)
    assert gbm.getLabelCol() == loadedGbm.getLabelCol()
    assert gbm.getFeaturesCols() == loadedGbm.getFeaturesCols()
Example No. 4
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1],
                                                                     42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example No. 5
    def featureExtractLr(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            # pipeline = PipelineModel.load(ROOT_PATH+'/logistic')
            pipeline = Pipeline.load(ROOT_PATH + '/logistic')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # Set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.001).setElasticNetParam(0.8)
            pipeline = Pipeline(stages=[remover, hashingTF, lr])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
        # model.write().overwrite().save(ROOT_PATH+'/logistic')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("id", "features", "probability",
                                          "prediction")

        for row in selected.collect():
            rid, features, prob, prediction = row
            self.logger.info("features: %s", features)
            self.logger.info("prob: %s", str(prob))
            self.logger.info("prediction: %s", str(prediction))
Example No. 6
    def featureExtract(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # Set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
            # lr = LogisticRegression(maxIter=10, regParam=0.001)
            pipeline = Pipeline(stages=[remover, hashingTF, idf])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("filtered", "features", "idff")

        for row in selected.collect():
            filtered, features, idff = row
            self.logger.info("features: %s", features)
            self.logger.info("idff: %s", idff)
            self.logger.info(
                "filtered: %s",
                str(filtered).decode("unicode_escape").encode("utf-8"))
        return selected
def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage,
                       data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])

    ## Create the pipeline by defining all the stages
    pipeline = Pipeline(stages=[
        tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner
    ])

    ## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
    ## in a local file in the current directory. If HDFS & Hadoop are available, this call stores the pipeline in the HDFS home
    ## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    pipeline.write().overwrite().save(pipelinePath)
    loaded_pipeline = Pipeline.load(pipelinePath)

    ## Train the pipeline model
    modelPath = "file://" + os.path.abspath("../build/model")
    model = loaded_pipeline.fit(data)
    model.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
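A hypothetical call site for trainPipelineModel(), assuming the feature stages and an H2O algorithm stage are built along the lines of the spam-detector example later in this listing, and that data is a DataFrame with "text" and "label" columns (all names below are illustrative, not part of the original code).

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pysparkling.ml import H2OGBM

tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=2)
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordsTF", numFeatures=1 << 10)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)
algoStage = H2OGBM(labelCol="label", featuresCols=[idf.getOutputCol()])

model = trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data)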
    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(predictionCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM())

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
    def test_glm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      ratio=0.8)

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline"))
        loaded_pipeline = Pipeline.load("file://" +
                                        os.path.abspath("build/glm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/glm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
Example No. 10
    def getIn(self):
        # Train Spark (and similar) models
        from pyspark.ml import Pipeline

        self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2,
                                                       self.spark)
        self.model = Pipeline.load(self.inputUrl1).getStages()[0]
Example No. 11
    def getIn(self):
        self.logger.debug("using PySpark")
        from pyspark.ml import Pipeline

        self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2,
                                                       self.spark)
        self.model = Pipeline.load(self.inputUrl1).getStages()[0]
    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(labelCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM(),
                             strategy="RandomDiscrete",
                             maxModels=3,
                             maxRuntimeSecs=60,
                             selectBestModelBy="RMSE")

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
Example No. 13
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 14
    def test_mojo_dai_pipeline_serialize(self):
        mojo = H2OMOJOPipelineModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/mojo2data/pipeline.mojo"))
        prostateFrame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        # Create Spark pipeline of single step - mojo pipeline
        pipeline = Pipeline(stages=[mojo])
        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
        loadedPipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

        # Train the pipeline model
        model = loadedPipeline.fit(prostateFrame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
        loadedModel = PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

        preds = loadedModel.transform(prostateFrame).repartition(1).select(
            mojo.selectPredictionUDF("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829
    def __prediction(self, df_pred):
        logger.info(
            "__prediction: LOADING PIPELINE ##################################### "
        )
        gbt_pipeline = Pipeline.load('budget_prediction_pipeline/')
        gbt_pipeline_loaded = gbt_pipeline.fit(df_pred)
        ddf_features_df = gbt_pipeline_loaded.transform(df_pred)

        logger.info(
            "__prediction: FILTERING DATA ##################################### "
        )
        ddf_features_df = ddf_features_df.filter("idmovie in(99999)")

        logger.info(
            "__prediction: LOADING MODEL ##################################### "
        )
        gbt_model_load = GBTRegressionModel.load('gbt_model_old/')
        gbt_model_pred = gbt_model_load.transform(ddf_features_df)
        gbt_model_pred.selectExpr(
            'idmovie', 'director', 'genres', 'runtime',
            'cast(prediction as Decimal(38,2)) as prediction').show(
                truncate=False)

        logger.info(
            "__prediction: DATA PREDICTED ##################################### "
        )
        return gbt_model_pred.selectExpr('director', 'genres', 'runtime',
                                         'prediction')
Example No. 16
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 17
def test2():
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA])
    print(type(pipeA))
    pipeA.save('testA.pipe')
    pipeAA = PysparkPipelineWrapper.unwrap(Pipeline.load('testA.pipe'))
    stagesAA = pipeAA.getStages()
    trAA = stagesAA[0]
    print(trAA.dataset_count)
    def __init__(self):
        print("Initializing SentAnalysisProdiction!!")
        self.appName = "Sentiment Analysis in Spark"
        # create Spark session
        self.spark = SparkSession.builder.appName(self.appName) \
            .config("spark.executor.heartbeatInterval", "200000") \
            .config("spark.network.timeout", "300000") \
            .getOrCreate()
        self.model_name = "prediction_pipeline"
        self.pipeline = Pipeline.load(self.model_name)
        return
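    def predict(self, df):
        # Hypothetical companion method, not in the original snippet: the artifact
        # saved under self.model_name is loaded with Pipeline.load, so it is an
        # unfitted Pipeline and must be fit before it can transform. Fitting on the
        # incoming frame here is only an assumption about how the class is used.
        model = self.pipeline.fit(df)
        return model.transform(df)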
Example No. 19
def create_cross_val(pipelineStr, regressor):
    print("Creating cross validator")
    pipelineModel = Pipeline.load(pipelineStr)
    paramGrid = ParamGridBuilder().addGrid(regressor.numTrees,
                                           [100, 500]).build()

    crossval = CrossValidator(estimator=pipelineModel,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(labelCol="MSRP"),
                              numFolds=3)

    return crossval
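A hypothetical call site for create_cross_val(), assuming a feature pipeline has been saved to disk and a training DataFrame with an "MSRP" label column exists (the path and the trainingData name below are illustrative).

from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="MSRP", featuresCol="features")
crossval = create_cross_val("file:///tmp/msrp_pipeline", rf)  # assumed pipeline path
cvModel = crossval.fit(trainingData)                          # trainingData is assumed to exist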
Example No. 20
def gridSearchTester(algo, prostateDataset):
    grid = H2OGridSearch(labelCol="AGE", hyperParameters={"seed": [1, 2, 3]}, splitRatio=0.8, algo=algo,
                         strategy="RandomDiscrete", maxModels=3, maxRuntimeSecs=60, selectBestModelBy="RMSE")

    pipeline = Pipeline(stages=[grid])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/grid_pipeline"))
    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/grid_pipeline_model"))

    loadedModel.transform(prostateDataset).count()
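A hypothetical invocation of gridSearchTester(), assuming prostateDataset is the prostate CSV frame used throughout these examples and that H2OGBM is importable from pysparkling.ml.

from pysparkling.ml import H2OGBM

gridSearchTester(H2OGBM(), prostateDataset)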
Example No. 21
def testPipelineSerialization(prostateDataset):
    algo = H2OIsolationForest(seed=1)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    expected = model.transform(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    result = loadedModel.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example No. 22
def testPipelineSerialization(prostateDataset):
    algo = H2ODRF(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/drf_pipeline"))
    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/drf_pipeline_model"))

    loadedModel.transform(prostateDataset).count()
Example No. 23
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param),
                             tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param),
                             loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance,
                             loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 24
def kmeans_with_loading():
    df1 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    clusteringPipeline = Pipeline.load('KMeansPipeline')
    model = clusteringPipeline.fit(df)
    transf = model.transform(df)
    transf.show(200, False)
Example No. 25
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 26
def testPipelineSerialization(dataset):
    algo = H2OKMeans(
        splitRatio=0.8,
        seed=1,
        k=3,
        featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/kmeans_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/kmeans_pipeline"))
    model = loadedPipeline.fit(dataset)

    model.write().overwrite().save(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))

    loadedModel.transform(dataset).count()
Example No. 27
def testMojoModelSerializationInPipeline(prostateDataset):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" +
        os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))

    pipeline = Pipeline(stages=[mojo])

    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))

    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save(
        "file://" +
        os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" +
        os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
Example No. 28
def testPipelineSerialization(heartDataset):
    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event",
                    featuresCols=features,
                    startCol='start',
                    stopCol='stop')

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/cox_ph_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/cox_ph_pipeline"))
    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/cox_ph_pipeline"))
    loadedModel = PipelineModel.load("file://" +
                                     os.path.abspath("build/cox_ph_pipeline"))
    result = loadedModel.transform(heartDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## in a local file in the current directory. If HDFS & Hadoop are available, this call stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")


##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
    smsTextDF = spark.createDataFrame([(smsText, )],
Example No. 30
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],
                           predictionCol="label")
## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## in a local file in the current directory. If HDFS & Hadoop are available, this call stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")




##
## Make predictions on unlabeled data
## Spam detector
##
Example No. 31
import ast

from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

# start a kafka consumer session
from kafka.consumer import KafkaConsumer
consumer = KafkaConsumer(
    "titanic",
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])
print('consumer launched')

testSchema = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket",
    "Fare", "Cabin", "Embarked"
]

pipeline = Pipeline.load("/home/ubuntu/titanic/pipeline")
model = PipelineModel.load("/home/ubuntu/titanic/model")


def getTrain(msg):
    # put passenger info into dataframe
    # print msg
    # combine two lists into list of tuple
    # combined = map(lambda x, y: (x, y), trainSchema, msg)
    msg = [ast.literal_eval(msg)]
    msg[0][0] = float(msg[0][0])
    msg[0][1] = float(msg[0][1])
    msg[0][4] = float(msg[0][4])
    msg[0][5] = float(msg[0][5])
    msg[0][6] = float(msg[0][6])
    msg[0][8] = float(msg[0][8])
Example No. 32
    .config("spark.hadoop.yarn.resourcemanager.principal",os.environ["HADOOP_USER_NAME"])\
    .config("spark.executor.instances", 2)\
    .config("spark.executor.cores", 2)\
    .getOrCreate()
    
### Reading latest data and recreating pipeline from model development notebook

#Notice we are only picking the most recent 20,000 rows reflecting latest customer interactions
df = spark.sql("SELECT RECENCY, HISTORY, USED_DISCOUNT, USED_BOGO, ZIP_CODE, IS_REFERRAL, CHANNEL, OFFER, SCORE, CONVERSION FROM DEFAULT.CUSTOMER_INTERACTIONS_CICD ORDER BY BATCH_TMS DESC LIMIT 20000")

#Renaming the target feature to "label":
df = df.withColumnRenamed("CONVERSION","label")

latest_url_formatted = find_latest("spark_pipeline")

pm = Pipeline.load(latest_url_formatted)

pm = pm.fit(df)

## Saving the newly trained model

run_time_suffix = datetime.now()
run_time_suffix_string = run_time_suffix.strftime("%d%m%Y%H%M%S")

pm.write().overwrite().save(os.environ["STORAGE"]+"/testpysparkmodels/"+"{}".format(run_time_suffix_string))

## Saving the newly trained model metadata
conn = sqlite3.connect('models.db')
c = conn.cursor()

import re
Example No. 33
# In[13]:

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')

print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

# In[14]:

pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)

# In[15]:

loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)

# In[16]:

from pyspark.ml import PipelineModel

modelPath = './model/infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)

loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
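A hypothetical follow-up cell, evaluating the reloaded model's output with the same evaluator defined above (this line is not part of the original notebook).

print(evaluator.evaluate(test_loadedModel, {evaluator.metricName: 'areaUnderROC'}))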