Example #1
 def test_list_string(self):
     for labels in [
             np.array(['a', u'b']), ['a', u'b'],
             np.array(['a', 'b'])
     ]:
         idx_to_string = IndexToString(labels=labels)
         self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
     self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
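For reference, the labels argument validated by this test follows the standard PySpark constructor usage; a minimal, runnable sketch with explicit labels (the data is illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0.0,), (1.0,), (0.0,)], ["categoryIndex"])

# User-supplied labels take precedence over any ML attributes on the input column.
converter = IndexToString(inputCol="categoryIndex", outputCol="category", labels=["a", "b"])
converter.transform(df).show()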
Example #2
def index_to_string(df, input_cols, output_cols=None, columns=None, **kwargs):
    """
    Maps a column of indices back to a new column of corresponding string values. The index-string mapping is
    either from the ML attributes of the input column, or from user-supplied labels (which take precedence over
    ML attributes).
    :param df: Dataframe to be transformed.
    :param input_cols: Indexed columns to be mapped back to strings.
    :param output_cols: Columns where the output is going to be saved.
    :param columns: Optional list of (input_col, output_col) tuples; overrides input_cols/output_cols.
    :return: Dataframe with the string columns restored.
    """
    df_actual = df

    if columns is None:
        input_cols = parse_columns(df, input_cols)
        if output_cols is None:
            output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
        output_cols = get_output_cols(input_cols, output_cols)
    else:
        input_cols, output_cols = zip(*columns)

    indexers = [IndexToString(inputCol=input_col, outputCol=output_col, **kwargs)
                for input_col, output_col in zip(input_cols, output_cols)]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    df = df.preserve_meta(df_actual, Actions.INDEX_TO_STRING.value, output_cols)

    return df
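As the docstring notes, when no labels are supplied the mapping comes from the ML attributes of the input column. A minimal, self-contained sketch of that underlying pattern (data and column names are illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a")], ["id", "category"])

# StringIndexer stores the fitted label list in the ML attributes of its output column...
indexed = StringIndexer(inputCol="category", outputCol="category_index").fit(df).transform(df)

# ...so IndexToString can recover the original strings without an explicit labels list.
IndexToString(inputCol="category_index", outputCol="category_restored").transform(indexed).show()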
Example #3
    def _transform(self, df) -> DataFrame:
        """
        :param df: A pyspark.sql.dataframe.DataFrame
        """

        # Apply string indexer
        for in_col, out_col in zip(self.inputCols, self.outputCols):
            self.__logger().info("Applying StringIndexer on col {}".format(in_col))
            df = self.dict_indexers[in_col]['indexer'].transform(df)
            n_to_keep = self.dict_indexers[in_col]['n_to_keep']
            # Collapse categories that occur in fewer than (threshold * number of rows) rows into the single catch-all index n_to_keep.
            this_meta = df.select(out_col).schema.fields[0].metadata
            if n_to_keep != len(this_meta['ml_attr']['vals']):
                this_meta['ml_attr']['vals'] = this_meta['ml_attr']['vals'][0:(n_to_keep + 1)]
                this_meta['ml_attr']['vals'][n_to_keep] = self.groupText
                self.__logger().info("Truncating number of categories of {} at {}".format(in_col, n_to_keep))
                df = df.withColumn(out_col,
                                   F.when(F.col(out_col) >= n_to_keep, F.lit(n_to_keep)).otherwise(
                                           F.col(out_col)))

            # Re-add the indexed column with the corrected metadata, replacing the original indexed column.
            df = df.withColumn(out_col,
                               with_meta(F.col(out_col), "", this_meta))

        if not self.returnIndexed:
            for output_col in self.outputCols:
                df = df.withColumnRenamed(output_col, output_col + '_temp')
                df = IndexToString(inputCol=output_col + '_temp', outputCol=output_col).transform(df)
                df = df.drop(output_col + '_temp')

        if self.dropInputCols:
            df = df.drop(*self.inputCols)

        return df
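The with_meta helper used above is not part of this snippet; a plausible minimal implementation, assuming it simply re-attaches the patched metadata dict through Column.alias, could be:

from pyspark.sql import Column

def with_meta(col: Column, alias: str, meta: dict) -> Column:
    # Re-alias the column and store `meta` as the resulting StructField's metadata.
    return col.alias(alias, metadata=meta)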
Example #4
def DecisionTree():
    IrisData = spark.sparkContext.textFile("file:///home/unbroken/MyFiles/Work/Programming/Spark/DecisionTree/Iris.txt")\
    .map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    IrisData.createOrReplaceTempView("iris")
    df = spark.sql("select * from iris")
    labelIndexer = StringIndexer(inputCol='label',
                                 outputCol='labelIndex').fit(IrisData)
    featureIndexer = VectorIndexer(
        inputCol='feature',
        outputCol='indexFeature').setMaxCategories(4).fit(IrisData)
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictionLabel').setLabels(
                                       labelIndexer.labels)
    trainingData, testingData = IrisData.randomSplit([0.7, 0.3])
    dtClassifier = DecisionTreeClassifier().setLabelCol(
        'labelIndex').setFeaturesCol('indexFeature')
    pipelineClassifier = Pipeline().setStages(
        [labelIndexer, featureIndexer, dtClassifier, labelConverter])
    modelClassifier = pipelineClassifier.fit(trainingData)
    prediction = modelClassifier.transform(testingData)
    prediction.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol(
        'labelIndex').setPredictionCol('prediction').setMetricName("accuracy")
    accuracy = evaluator.evaluate(prediction)
    print(accuracy)

    treeModelClassifier = modelClassifier.stages[2]
    print("Learned classification tree model:\n" +
          str(treeModelClassifier.toDebugString))
Example #5
def recommend_users(spark,
                    input_user,
                    input_video,
                    model,
                    user_indexer,
                    video_indexer,
                    user_language,
                    video_language,
                    num_recommend=20,
                    is_show=True):
    """Use als model to recommend for users
    Args:
        spark: Spark session
        input_user: Name of the user column
        input_video: Name of the video column
        model: ALS model
        user_indexer: Fitted StringIndexer model for the user column
        video_indexer: Fitted StringIndexer model for the video column
        num_recommend: The maximum number of recommendation videos
        is_show: If true, the data would be shown
        user_language: The language of user
        video_language: The language of video
    """
    # Recommend for all users
    userRecs = model.recommendForAllUsers(num_recommend)

    # Turn indices back into their original strings
    indexer_user = IndexToString(inputCol=input_user + "_index",
                                 outputCol=input_user,
                                 labels=user_indexer.labels)
    index_user = indexer_user.transform(userRecs)

    # NOTE: input_rating (the rating field name inside the recommendations struct)
    # is assumed to be defined elsewhere in this module.
    video_labels = array(*[lit(x) for x in video_indexer.labels])
    recommendations = array(*[
        struct(
            video_labels[col("recommendations")[i][
                input_video + "_index"]].alias(input_video),
            col("recommendations")[i][input_rating])
        for i in range(num_recommend)
    ])

    recs = index_user.withColumn("recommendations", recommendations).select(
        input_user, "recommendations")
    explode_recs = recs.select(input_user, explode("recommendations").alias("recommendation")).\
                        select(input_user, "recommendation.*").\
                        select(input_user, input_video, col("col2").alias("score"))

    # Keep only user/video pairs that share the same language
    user_label = read_data_hive(spark, [input_user, user_language], is_show)
    video_label = read_data_hive(spark, [input_video, video_language], is_show)
    explode_recs_filter = explode_recs.join(user_label, input_user,
                                            "inner").join(
                                                video_label, input_video,
                                                "inner")
    explode_recs_filter = explode_recs_filter.filter(
        explode_recs_filter[user_language] ==
        explode_recs_filter[video_language])
    if is_show:
        explode_recs_filter.show(20)
    explode_recs_filter.registerTempTable("temp_table")
Example #6
def createPipeline(irisData, lrElasticNetParam, lrRegParam):
    '''Creates a pipeline for converting the data into features and label with the required format
    Args: irisData - Input data for the feature and label processing
          lrElasticNetParam - ElasticNet parameter of LR, 0-L2 penalty and 1-L1 penalty
          lrRegParam - Regularization parameter
    Returns: the pipeline, the parameter grid, and the evaluator
    '''
    strIndexer = StringIndexer().setInputCol('species').setOutputCol(
        'label').fit(irisData)
    va = VectorAssembler(inputCols=[
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    ],
                         outputCol='vec_features')
    ss = StandardScaler().setInputCol(
        va.getOutputCol()).setOutputCol('features').fit(va.transform(irisData))
    lr = LogisticRegression().setFeaturesCol('features')
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictedLabel',
                                   labels=strIndexer.labels)
    stages = [strIndexer, va, ss, lr, labelConverter]
    pipeline = Pipeline().setStages(stages)

    params = ParamGridBuilder().addGrid(lr.elasticNetParam,
                                        lrElasticNetParam).addGrid(
                                            lr.regParam, lrRegParam).build()
    # lrMetric (e.g. 'accuracy') is assumed to be defined at module level.
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName=lrMetric)

    return pipeline, params, evaluator
Example #7
def _feature_transform(df):
    string_indexers = list()
    label_converter = None
    feature_names = []
    for field in df.schema.fields:
        is_label = field.metadata and field.metadata.get('label')
        if is_label:
            if not isinstance(field.dataType, NumericType):
                string_indexer = StringIndexer(inputCol=field.name,
                                               outputCol="label").fit(df)
                label_converter = IndexToString(inputCol="prediction",
                                                outputCol="predictedLabel",
                                                labels=string_indexer.labels)
                string_indexers.append(string_indexer)
            else:
                df = df.withColumnRenamed(field.name, "label")
        elif not isinstance(field.dataType, NumericType):
            feature_name = field.name + "_idx"
            string_indexer = StringIndexer(inputCol=field.name,
                                           outputCol=feature_name).fit(df)
            string_indexers.append(string_indexer)
            feature_names.append(feature_name)
        else:
            feature_names.append(field.name)
    return feature_names, label_converter, string_indexers
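A sketch of how the returned triple might feed a training pipeline; the assembler, classifier and the df variable below are assumptions, not part of the original helper:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# df is assumed to be a DataFrame whose label field carries {'label': True} metadata.
feature_names, label_converter, string_indexers = _feature_transform(df)

assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
classifier = RandomForestClassifier(labelCol="label", featuresCol="features")
stages = list(string_indexers) + [assembler, classifier]
if label_converter is not None:
    stages.append(label_converter)

model = Pipeline(stages=stages).fit(df)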
Example #8
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
	accuracy = evaluator.evaluate(predictions)
	print('Accuracy of RandomForest = ', accuracy * 100)
	print("Test Error = ", (1.0 - accuracy) * 100)
Example #9
    def trainModel(self, trainingData):
        """ Ham huan luyen du lieu
        Mac dinh training toan bo du lieu trong dataset splitratio 100% training, 0% testing
        """
        # Convert all labels to numeric indices if they are not numeric already
        # trainingData.select("label").groupBy("label").count().show()
        labelIndexer = StringIndexer(
            inputCol="label", outputCol="indexedLabel").fit(trainingData)
        # Convert all feature values to numeric indices if they are not numeric already
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=4).fit(trainingData)
        # Declare the RandomForest algorithm
        rf = RandomForestClassifier(labelCol="indexedLabel",
                                    featuresCol="indexedFeatures",
                                    numTrees=30,
                                    maxDepth=5,
                                    maxBins=32,
                                    seed=None,
                                    impurity="gini")
        # Convert the predicted labels from numeric form back to the original labels
        labelConverter = IndexToString(inputCol="prediction",
                                       outputCol="predictedLabel",
                                       labels=labelIndexer.labels)
        # Combine all of the steps into a single pipeline
        pipeline = Pipeline(
            stages=[labelIndexer, featureIndexer, rf, labelConverter])

        # Train the model through the pipeline
        model = pipeline.fit(trainingData)
        model.write().overwrite().save(os.path.join(self.modelpath,
                                                    "detector"))
        return model
Example #10
 def trainModel(self, trainingData):
     """ Ham huan luyen du lieu
     Mac dinh training toan bo du lieu trong dataset splitratio 100% training, 0% testing
     """
     labelIndexer = StringIndexer(
         inputCol="label", outputCol="indexedLabel").fit(trainingData)
     featureIndexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(trainingData)
     rf = RandomForestClassifier(labelCol="indexedLabel",
                                 featuresCol="indexedFeatures",
                                 numTrees=30,
                                 maxDepth=5,
                                 maxBins=32,
                                 seed=None,
                                 impurity="gini")
     labelConverter = IndexToString(inputCol="prediction",
                                    outputCol="predictedLabel",
                                    labels=labelIndexer.labels)
     pipeline = Pipeline(
         stages=[labelIndexer, featureIndexer, rf, labelConverter])
     model = pipeline.fit(trainingData)
     model.write().overwrite().save(os.path.join(self.modelpath,
                                                 "detector"))
     return model
Example #11
    def test_index_to_string(self):
        original_data = self.spark.createDataFrame([(0, "a"), (1, "b"),
                                                    (2, "c"), (3, "a"),
                                                    (4, "a"), (5, "c")],
                                                   ["id", "category"])
        string_indexer = StringIndexer(inputCol="category",
                                       outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory",
                              labels=['A', 'B', 'C'])
        # The input name should match IndexToString's inputCol.
        model_onnx = convert_sparkml(
            model, 'Sparkml IndexToString',
            [('categoryIndex', Int64TensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.select("originalCategory").toPandas().values
        data_np = data.select('categoryIndex').toPandas().values.astype(
            numpy.int64)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlIndexToString")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['originalCategory'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #12
  def prepare(self):
    data = (self.spark_session.read.format(self.data_format)
            .load(self.data_file))
    labelIndexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(data)
    featureIndexer = (VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=self.max_categories)
                      .fit(data))
    self.train_data, self.valid_data = data.randomSplit([0.8, 0.2])
    if self.model_builder.__name__ == 'DecisionTreeClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures")
    elif self.model_builder.__name__ == 'RandomForestClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures",
                                      numTrees=self.num_trees)
      labelConverter = IndexToString(inputCol="prediction",
                                     outputCol="predictedLabel",
                                     labels=labelIndexer.labels)
    elif self.model_builder.__name__ == 'GBTClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures",
                                      maxIter=self.max_iter)

    if self.model_builder.__name__ == 'RandomForestClassifier':
      self.pipeline = Pipeline(stages=[labelIndexer,
                                       featureIndexer,
                                       classifier,
                                       labelConverter])
    else:
      self.pipeline = Pipeline(stages=[labelIndexer,
                                       featureIndexer,
                                       classifier])
Example #13
def my_transform(rdd):
    with open("./index2whiskey1.json", mode="r", encoding="utf-8") as f:
        whiskey_list = list(json.loads(f.read()).values())
    model = ALSModel.load("hdfs://master/ALSModel1/")
    spark = SparkSession.builder.appName('sql coming~').getOrCreate()
    whiskey = rdd.map(lambda x: Row(whiskeyId=int(x[1]), user_name=x[0]))
    whiskey_df = spark.createDataFrame(whiskey)
    predict = model.recommendForItemSubset(whiskey_df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )

    df_whiskey = model.recommendForUserSubset(df_user, 5)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(whiskey_df, on=['whiskeyId'], how='left')
    result_df = result_df.select("user_name",
                    result_df["recommendations"][0].whiskeyId.alias("whiskeyId1"), \
                    result_df["recommendations"][1].whiskeyId.alias("whiskeyId2"), \
                    result_df["recommendations"][2].whiskeyId.alias("whiskeyId3"), \
                    result_df["recommendations"][3].whiskeyId.alias("whiskeyId4"), \
                    result_df["recommendations"][4].whiskeyId.alias("whiskeyId5") \
                    )
    whiskeyId1converter = IndexToString(inputCol="whiskeyId1",
                                        outputCol="whiskey1",
                                        labels=whiskey_list)
    whiskeyId2converter = IndexToString(inputCol="whiskeyId2",
                                        outputCol="whiskey2",
                                        labels=whiskey_list)
    whiskeyId3converter = IndexToString(inputCol="whiskeyId3",
                                        outputCol="whiskey3",
                                        labels=whiskey_list)
    whiskeyId4converter = IndexToString(inputCol="whiskeyId4",
                                        outputCol="whiskey4",
                                        labels=whiskey_list)
    whiskeyId5converter = IndexToString(inputCol="whiskeyId5",
                                        outputCol="whiskey5",
                                        labels=whiskey_list)

    result_df = whiskeyId1converter.transform(result_df)
    result_df = whiskeyId2converter.transform(result_df)
    result_df = whiskeyId3converter.transform(result_df)
    result_df = whiskeyId4converter.transform(result_df)
    result_df = whiskeyId5converter.transform(result_df)

    return result_df.rdd
Example #14
def Customer_List(model, user):
    # Create a dataset with distinct Customers as one column and the asin as another column
    Customer = data_train.select("userid").distinct().withColumn("item", lit(user))

    # Convert the index back to the original CustomerID; with no explicit labels,
    # IndexToString relies on the ML attributes of the "userid" column.
    userconverter = IndexToString(inputCol="userid", outputCol="List of Customers")
    userString = userconverter.transform(Customer)
    userString.drop("userid").drop("item").show()
Example #15
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf: prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
Example #16
def exec_method(spark, data):
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # dtree
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])
    model = pipeline.fit(trainingData)

    # gbt
    # gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
    # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
    # model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    print("predictedLabel=1: " +
          str(predictions.filter("predictedLabel=1.0").count()))
    print("label=1: " + str(predictions.filter("label=1.0").count()))
    print("total: " + str(predictions.count()))

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[2]
    print(rfModel)  # summary only
    print("model detail:\n" + rfModel.toDebugString)

    spark.stop()
Example #17
def run_randomforest(df):

    (train_data, test_data) = df.randomSplit([0.8, 0.2], 1234)

    labelIndexer = StringIndexer(
        inputCol="quality", outputCol="indexedLabel"
    ).fit(
        df
    )  # Identify and index labels that could be fit through classification pipeline
    assembler = VectorAssembler(
        inputCols=['temp', 'pressure', 'duration'],
        outputCol="features").setHandleInvalid(
            "skip"
        )  # Incorporate all input fields as a vector for the classification pipeline
    # scaler = StandardScaler(inputCol="features_assembler", outputCol="features")  # Scale input fields using standard scale
    labelConverter = IndexToString(
        inputCol="prediction",
        outputCol="predicted_quality",
        labels=labelIndexer.labels
    )  # Convert/Lookup prediction label index to actual label

    numTreesList = [10, 25, 50]
    maxDepthList = [3, 10, 5]

    for numTrees, maxDepth in [(numTrees, maxDepth)
                               for numTrees in numTreesList
                               for maxDepth in maxDepthList]:
        params = {
            "numTrees": numTrees,
            "maxDepth": maxDepth,
            "model": "RandomForest"
        }
        params.update({
            "model_data_date":
            model_data_date['start_date'] + ' - ' + model_data_date['end_date']
        })
        if run_exists(mlflow_exp_id, params):
            print("Trees: %s, Depth: %s, Run already exists" %
                  (numTrees, maxDepth))
        else:
            rf = RandomForestClassifier(labelCol="indexedLabel",
                                        featuresCol="features",
                                        numTrees=numTrees,
                                        maxDepth=maxDepth,
                                        seed=512)
            model, predictions, accuracy, ml_run_info = randomforest_model(
                [labelIndexer, assembler, rf, labelConverter], params,
                train_data, test_data)
            print("Trees: %s, Depth: %s, Accuracy: %s\n" %
                  (numTrees, maxDepth, accuracy))

    mlflow_search_query = "params.model = 'RandomForest' and params.model_data_date = '" + model_data_date[
        'start_date'] + ' - ' + model_data_date['end_date'] + "'"

    return best_run(mlflow_exp_id, mlflow_search_query)
Example #18
def pipeline_for_feature_cols(input_df_with_non_features):
    string_indexer = StringIndexer(
        inputCol="category", outputCol="label").fit(input_df_with_non_features)
    return Pipeline(stages=[
        string_indexer,
        VectorAssembler(inputCols=["id"], outputCol="features"),
        LogisticRegression(),
        IndexToString(inputCol="prediction",
                      outputCol="originalLabel",
                      labels=string_indexer.labels),
    ])
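A hypothetical usage of this fixture-style helper (the toy DataFrame is an assumption):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
input_df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "a"), (3, "b")], ["id", "category"])

model = pipeline_for_feature_cols(input_df).fit(input_df)
model.transform(input_df).select("category", "prediction", "originalLabel").show()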
Example #19
 def __predict_sentiment(self, schema):
     """Gets predictions for a given tweet formatted RDD
     Returns: an RDD with format {"sentiment":"POSITIVE"}
     """
     test = self.pipelineFit.transform(schema)
     predict = self.model.transform(test)
     converter = IndexToString(inputCol="prediction",
                               outputCol="predicted_label",
                               labels=self.pipelineFit.stages[3].labels)
     converted = converter.transform(predict)
     return converted.select("predicted_label").collect()[0].asDict()
Example #20
def perceptron_random_forest_train(sp_session, df_p):

    data = df_p
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
         VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=10)
    #rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    # Train on complete data
    model = pipeline.fit(data)
    ##model = rf.fit(trainingData)

    #Save the model
    model.save("ml_randomforest.model")

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[2]
    print(rfModel)  # summary only
Example #21
def run_xgboost(df):

    (train_data, test_data) = df.randomSplit([0.8, 0.2])

    labelIndexer = StringIndexer(
        inputCol="quality", outputCol="indexedLabel"
    ).fit(
        df
    )  # Identify and index labels that could be fit through classification pipeline
    assembler = VectorAssembler(
        inputCols=['temp', 'pressure', 'duration'],
        outputCol="features_assembler").setHandleInvalid(
            "skip"
        )  # Incorporate all input fields as a vector for the classification pipeline
    scaler = StandardScaler(
        inputCol="features_assembler",
        outputCol="features")  # Scale input fields using standard scale
    labelConverter = IndexToString(
        inputCol="prediction",
        outputCol="predicted_quality",
        labels=labelIndexer.labels
    )  # Convert/Lookup prediction label index to actual label

    numTreesList = [10, 25, 50]
    learningRateList = [.1, .2, .3]

    for numTrees, learningRate in [(numTrees, learningRate)
                                   for numTrees in numTreesList
                                   for learningRate in learningRateList]:
        params = {
            "numTrees": numTrees,
            "learningRate": learningRate,
            "model": "XGBoost"
        }
        params.update(model_data_date)
        if run_exists(mlflow_exp_id, params):
            print("Trees: %s, learning Rate: %s, Run already exists" %
                  (numTrees, learningRate))
        else:
            # maxDepth is assumed to be defined elsewhere (e.g. a module-level constant).
            xgboost = XGBoostEstimator(labelCol="indexedLabel",
                                       featuresCol="features",
                                       eta=learningRate,
                                       maxDepth=maxDepth)
            model, predictions, accuracy, ml_run_info = xgboost_model(
                [labelIndexer, assembler, scaler, xgboost, labelConverter], params,
                train_data, test_data)
            print("Trees: %s, learning Rate: %s, Accuracy: %s\n" %
                  (numTrees, learningRate, accuracy))

    mlflow_search_query = "params.model = 'XGBoost' and params.model_data_date = '" + model_data_date[
        'model_data_date'] + "'"

    return best_run(mlflow_exp_id, mlflow_search_query)
Example #22
def index2string(df, columns: list, param):
    for column in columns:
        column_new = column + "_str"
        print(f"index2string {column} to {column_new}")

        labels = param[column + "_labels"]
        model = IndexToString(inputCol=column, outputCol=column_new, labels=labels)

        df = model.transform(df)\
                  .withColumn(column, col(column_new))\
                  .drop(column_new)
    return df
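A hypothetical usage, assuming the param dict stores each column's label list under a "<column>_labels" key captured when the column was originally indexed (names and data are illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# "country" holds indices previously produced by a StringIndexer.
df = spark.createDataFrame([(0.0,), (2.0,), (1.0,)], ["country"])
param = {"country_labels": ["US", "UK", "FR"]}

df = index2string(df, ["country"], param)
df.show()  # the "country" column now holds "US", "FR", "UK"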
Example #23
    def indexToString(infoData):
        stringIndexerPath = infoData.get(mc.INDEXERPATH)
        inverterColm = infoData.get(mc.COLMTOINVERT)
        dataset = infoData.get(mc.DATASET)
        stringIndexer = StringIndexerModel.load(stringIndexerPath)
        inverter = IndexToString(inputCol=inverterColm, outputCol=mc.DMXINVERTEDCOLM,
                                 labels=stringIndexer.labels)
        dataset = inverter.transform(dataset)

        # Drop the indexed column and rename the inverted column back to the original name
        dataset = dataset.drop(inverterColm)
        dataset = dataset.withColumnRenamed(mc.DMXINVERTEDCOLM, inverterColm)
        return dataset
Example #24
    def test_index_to_string_throws(self):
        original_data = self.spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])
        string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
        # The input name should match IndexToString's inputCol.
        model_onnx = None
        with pytest.raises(SparkMlConversionError):
            model_onnx = convert_sparkml(model, 'Sparkml IndexToString', [('categoryIndex', Int64TensorType([1, 1]))])
Example #25
def multinomial_lr_with_index_to_string_stage_pipeline(
        multinomial_df_with_string_labels):
    string_indexer = StringIndexer(
        inputCol="category",
        outputCol="label").fit(multinomial_df_with_string_labels)
    return Pipeline(stages=[
        string_indexer,
        VectorAssembler(inputCols=["id"], outputCol="features"),
        LogisticRegression(),
        IndexToString(inputCol="prediction",
                      outputCol="originalLabel",
                      labels=string_indexer.labels),
    ])
Example #26
def random_decision_tree_usecase():
    spark = getSparkSession()

    # Load and parse the data file, converting it to a DataFrame.
    data = spark.read.format("libsvm").load("../data/lib_svm.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=10)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[2]
    print(rfModel)  # summary only
Example #27
def keywords_naive_bayes_classifier(sc, training_path, number_of_features,
                                    train_path_parquet, eval_path_parquet,
                                    save, path_to_save):

    sqlContext = SQLContext(sc)

    # if classification_type == 'home_pages':
    #     classifier_utils.prepare_input_for_home_page_classifier(
    #         sc,sqlContext,training_path,evaluation_rdd,prepare_training_input,train_path_parquet,eval_path_parquet)
    # elif classification_type == 'cluster_pages':
    #     classifier_utils.prepare_input_for_cluster_page_classifier(
    #         sc,sqlContext,training_path,evaluation_rdd,prepare_training_input,train_path_parquet,eval_path_parquet)

    schemaTrain = sqlContext.read.load(train_path_parquet)
    schemaEval = sqlContext.read.load(eval_path_parquet)

    categoryIndexer = StringIndexer(inputCol="category", outputCol="label")
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words",
                          outputCol="features",
                          numFeatures=number_of_features)
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    categoryConverter = IndexToString(inputCol="label", outputCol="pred")
    pipeline = Pipeline(
        stages=[categoryIndexer, tokenizer, hashingTF, nb, categoryConverter])

    model = pipeline.fit(schemaTrain)
    pr = model.transform(schemaEval)

    classifier_utils.print_metrics(pr, 'label', 'prediction')

    output = pr.rdd

    # if classification_type == 'home_pages':
    #     output = output.filter(lambda row: row.prediction == 1.0)
    # if classification_type == 'cluster_pages':
    output = output.map(
        lambda row: {
            'url': row.url,
            'cluster_label': row.cluster_label,
            'probability': row.probability,
            'prediction': row.prediction,
            'pred': row.pred
        })

    # |category|cluster_label|domain|referring_url|text|url|label|words|features| rawPrediction|probability|prediction|pred

    rdd_utils.save_rdd(output, save, path_to_save)

    return output
Example #28
    def train_test(self, df):
        
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)

        featureIndexer = VectorIndexer(inputCol="features", \
                                        outputCol="indexedFeatures").fit(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
     #   trainingData.repartition(200)
        testData.cache()
       # testData.repartition(200)
        trainingData.show(5,False)
        testData.show(5,False)

        rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("features", "label", "predictedLabel", "prediction").show(5)

        # Select (prediction, true label) and compute test error
 
        print(self.getTestError(predictions))
        self.printMetrics(predictions)
      #  print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))

        return model
Example #29
def main():
    sensor_data_df = spark.read.parquet('sensor_data_ts')

    # creating ML pipelines for classification and regression problems
    sensor_data_df = sensor_data_df.select(
        sensor_data_df['datetime'], sensor_data_df['message_code_name'],
        sensor_data_df['H2S'], sensor_data_df['CO'], sensor_data_df['LEL'],
        sensor_data_df['O2']).orderBy(sensor_data_df['datetime'].asc())

    condition = (sensor_data_df['datetime'] < datetime(2020, 10, 1))
    train_df = sensor_data_df.where(condition)
    test_df = sensor_data_df.where(~condition).cache()
    """ CLASSIFICATION PROBLEM FOR GAS EVENT IDENTIFICATION"""
    classification_set = train_df.select(
        train_df['message_code_name'].alias('target'), train_df['H2S'],
        train_df['CO'], train_df['LEL'], train_df['O2'])

    train_set, validation_set = classification_set.randomSplit([0.75, 0.25])
    train_set.cache()
    validation_set.cache()

    # message_code_name is the target column representing gas events (Normal, GasHighAlarm, GasLowAlarm, GasAlarm)
    # sql_transformer_statement = "SELECT datetime ,H2S, LEL, O2, CO, message_code_name AS target " \
    # #                               "FROM __THIS__"

    # sql_transformer = SQLTransformer(statement=sql_transformer_statement_2)
    assemble_features = VectorAssembler(inputCols=['H2S', 'LEL', 'O2', 'CO'],
                                        outputCol='features')
    target_indexer = StringIndexer(inputCol='target',
                                   outputCol='label').fit(train_set)
    # Map the classifier's numeric predictions back to the original target strings.
    label_converter = IndexToString(inputCol='prediction',
                                    outputCol='predicted_target',
                                    labels=target_indexer.labels)
    print(target_indexer.labels)
    classifier = MultilayerPerceptronClassifier(layers=[4, 20, 4])
    pipeline = Pipeline(stages=[
        assemble_features, target_indexer, classifier, label_converter
    ])
    model = pipeline.fit(train_set)

    predictions = model.transform(validation_set)
    predictions.select(['target', 'predicted_target']).show(30)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    prediction_score = evaluator.evaluate(predictions)
    print('Gas Events Prediction Score: ', prediction_score)

    model.write().overwrite().save('mlp_model')
Example #30
def predict(rdd, bestModel):
    if (not rdd.isEmpty()):
        df = sqlContext.createDataFrame(rdd).toDF("descr")
        predictions = bestModel.transform(df)
        converter = IndexToString(inputCol="prediction",
                                  outputCol="label",
                                  labels=bestModel.stages[0].labels)
        labelReverse = converter.transform(predictions)
        print("predictions for tweet:")
        labelReverse.select("features", "probability", "prediction",
                            "label").show()
        labelReverse.write.mode('append').parquet(
            'hdfs:///predictions/tweets_predictions.parquet')
    else:
        print("No data received")