Example #1
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString

sc = SparkContext("local", "Features - IndexToString")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

print("Transformed string column '%s' to indexed column '%s'" %
      (indexer.getInputCol(), indexer.getOutputCol()))

indexed.show()

print("StringIndexer will store labels in output column metadata\n")

converter = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")

converted = converter.transform(indexed)

print(
    "Transformed indexed column '%s' back to original string column '%s' using "
    "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))

converted.select("id", "categoryIndex", "originalCategory").show()

spark.stop()
Example #2
File: test_param.py  Project: zwxhnu/spark
 def test_list_string(self):
     for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
         idx_to_string = IndexToString(labels=labels)
         self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
     self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
## creating the pipeline
# (regexTokenizer, stopwordsRemover, countVectors, label_stringIdx and lr are assumed to be defined earlier in the original script)
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx] + [lr])

# Fit the pipeline to training documents.

df = data_filtered.select('trend', 'creation_time', "twid", "text_words")
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
# dataset.show(5)
# dataset.count()

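# `model` below is assumed to be a classification model trained or loaded earlier in the original script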
predictions = model.transform(dataset)

labeler = IndexToString(inputCol="prediction",
                        outputCol="topic",
                        labels=[
                            'event', 'sports', 'politics', 'news',
                            'technology', 'business', 'entertainment', 'health'
                        ])
# print(predictions)
prediction_with_label = labeler.transform(predictions)
# prediction_with_label.show(5)
# print(prediction_with_label.count())

# ta = data.alias('ta')
# tb = prediction_with_label.select('trend','creation_time','twid','topic').alias('tb')
prediction_with_label.write.mode('append').format(
    'org.apache.spark.sql.cassandra').options(table='tweet',
                                              keyspace='graphy').save()

# final_df=ta.join(tb,(ta.twid==tb.twid) & (ta.creation_time==tb.creation_time) & (ta.trend==tb.trend),how="left").select(ta.trend,ta.creation_time,ta.twid,ta.body,ta.location,ta.topic,ta.user,tb.predictedLabel)
# final_df.show()
Example #4
    def fit(self, data):
        '''Dataset must at least contain the following two columns:
        label: the class labels
        features: feature vector

        Parameters
        ----------
            data (Dataset<Row>): input data

        Returns
        -------
            map with metrics
        '''

        classCount = int(data.select(self.label).distinct().count())

        labelIndexer = StringIndexer().setInputCol(self.label) \
                                      .setOutputCol("indexedLabel") \
                                      .fit(data)

        # Split the data into training and test sets (30% held out for testing)
        splits = data.randomSplit([1.0 - self.testFraction, self.testFraction],
                                  self.seed)
        trainingData = splits[0]
        testData = splits[1]

        labels = labelIndexer.labels

        print("\n Class\tTrain\tTest")
        for l in labels:
            print("%s\t%i\t%i" % (l \
                                  ,(trainingData.filter(trainingData[self.label] == l)).count() \
                                  ,(testData.filter(testData[self.label] == l)).count() \
                                  )
                  )

        # Set input columns
        self.predictor.setLabelCol("indexedLabel").setFeaturesCol("features")

        # Convert indexed labels back to original labels
        labelConverter = IndexToString().setInputCol("prediction") \
                                        .setOutputCol("predictedLabel") \
                                        .setLabels(labelIndexer.labels)

        # Chain indexers and forest in a Pipeline
        pipeline = Pipeline().setStages(
            [labelIndexer, self.predictor, labelConverter])

        # Train model. This also runs the indexers
        model = pipeline.fit(trainingData)

        # Make predictions
        predictions = model.transform(testData).cache()

        # Display some sample predictions
        print(f"\nSample predictions: {str(self.predictor).split('_')[0]}"
              )  # TODO predictor.getClass().getSimpleName()
        predictions.sample(False, 0.1, self.seed).show(25)

        predictions = predictions.withColumnRenamed(self.label, "stringLabel")
        predictions = predictions.withColumnRenamed("indexedLabel", self.label)

        # Collect metrics

        pred = predictions.select("prediction", self.label)

        metrics = OrderedDict()
        metrics["Method"] = str(self.predictor).split('_')[0]
        if classCount == 2:
            b = BinaryClassificationMetrics(pred.rdd)
            metrics["AUC"] = str(b.areaUnderROC)

        m = MulticlassMetrics(pred.rdd)
        metrics["F"] = str(m.weightedFMeasure())
        metrics["Accuracy"] = str(m.accuracy)
        metrics["Precision"] = str(m.weightedPrecision)
        metrics["Recall"] = str(m.weightedRecall)
        metrics["False Positive Rase"] = str(m.weightedFalsePositiveRate)
        metrics["True Positive Rate"] = str(m.weightedTruePositiveRate)
        metrics[""] = f"\nConfusion Matrix\n{labels}\n{m.confusionMatrix()}"

        return metrics
Example #5
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()


# # IndexToString
# Corresponding to StringIndexer, IndexToString maps a column of label indices back to the original string labels.
# 
# Its main use case is in combination with StringIndexer: first StringIndexer converts the string labels into label indices for model training, and then, when predicting, the label indices are converted back into the original string labels. Of course, you can also supply your own labels.
# 
# First, just as in the StringIndexer experiment, we use StringIndexer to take the "category" column of the dataset, convert the string labels into label indices, write them to a "categoryIndex" column, and build a new DataFrame.

# In[35]:


converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
converted.select("id", "categoryIndex", "originalCategory").show()
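
# As noted above, you can also supply your own labels instead of relying on the column
# metadata; a minimal illustrative sketch (the label list below is an assumption, not part
# of the original example):

converter_with_labels = IndexToString(inputCol="categoryIndex",
                                      outputCol="originalCategory",
                                      labels=["a", "b", "c"])
converter_with_labels.transform(indexed).select("id", "categoryIndex", "originalCategory").show()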


# # VectorIndexer
# The StringIndexer introduced above transforms a single categorical feature. When all features have already been assembled into one vector and you still want to process some of its individual components, Spark ML provides the VectorIndexer class to handle categorical-feature conversion inside vector-valued datasets.
# 
# Given the maxCategories hyperparameter, it automatically identifies which features are categorical and converts the original values into category indices. It decides which features should be categorized based on the number of distinct values: features with at most maxCategories distinct values are treated as categorical.
# 
# In the example below, we read in a dataset and train a VectorIndexer model to decide which features should be treated as categorical and to convert those features into indices. Here maxCategories is set to 10, i.e. only features with fewer than 10 distinct values are treated as categorical; all others are treated as continuous:
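
# A minimal VectorIndexer sketch of the transformation just described (illustrative only;
# it assumes a DataFrame `data` with a vector column named "features", while the cell
# below builds a different, one-hot-encoding pipeline instead):
from pyspark.ml.feature import VectorIndexer

vec_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                            maxCategories=10)
vec_indexer_model = vec_indexer.fit(data)
vec_indexer_model.transform(data).show()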

# In[42]:


from pyspark.ml.feature import VectorIndexer
encoder = OneHotEncoderEstimator(
    inputCols=["UniqueCarrierInd", "OriginInd", "DestInd"],
    outputCols=["UniqueCarrierOHE", "OriginOHE", "DestOHE"])
assembler = VectorAssembler(inputCols=[
    "Month", "Day", "DayOfWeek", "CRSDepHour", "UniqueCarrierOHE", "OriginOHE",
    "DestOHE"
],
                            outputCol="features")
classifier = RandomForestClassifier(labelCol='delayCatInd',
                                    featuresCol='features',
                                    numTrees=10,
                                    maxDepth=10,
                                    maxBins=500,
                                    predictionCol="prediction")
labelConv = IndexToString(inputCol="prediction",
                          outputCol="predictedLabel",
                          labels=labelInd.labels)

pipeline = Pipeline(stages=[
    ucInd, oInd, dInd, labelInd, encoder, assembler, classifier, labelConv
])
(train, test) = df2.randomSplit([0.7, 0.3])
model = pipeline.fit(train)
predictions = model.transform(test)
predictions.head()

# ## Model Evaluation
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="delayCatInd",
                                              predictionCol="prediction",
                                              metricName="accuracy")
Example #7
def main(args):
    fields = [StructField("hashcodefile", StringType(), True), StructField("label", StringType(), True),
              StructField("n-grams", ArrayType(StringType(), True), True)]

    schema = StructType(fields)
    fieldsTest = [StructField("hashcodefile", StringType(), True),
              StructField("n-grams", ArrayType(StringType(), True), True)]
    schemaTest = StructType(fieldsTest)
    ## args[0]: Preprocessed training Parquet file of byte or opcode
    trainingParque = spark.read.parquet(args[0])
    print("Parquet File read completed")

    # Creating N-grams for training and testing
    # args[1]: No of grams: 1,2,3,4,....N
    ngram = NGram(n=args[1], inputCol="content", outputCol="n-grams")
    ngramDataFrame = ngram.transform(trainingParque).select("hashcodefile","label","n-grams")
    ngramRDD = ngramDataFrame.rdd

    # args[2]: Preprocessed Testing Parquet file of byte or opcode
    testingParqueTemp = spark.read.parquet(args[2])
    ngramTestData = NGram(n=args[1], inputCol="content", outputCol="n-grams")
    ngramTestDataFrame = ngramTestData.transform(testingParqueTemp).select("hashcodefile","n-grams")
    ngramTestDataRDD=ngramTestDataFrame.rdd
    inputNgram=spark.createDataFrame(ngramRDD,schema)
    inputTestNgram = spark.createDataFrame(ngramTestDataRDD, schemaTest)

    print("N-gram completed for testing & training")

    ################################################################################
    # Count Vectorizer for training data set
    ################################################################################

    cv = CountVectorizer(inputCol="n-grams", outputCol="features", vocabSize=1000, minDF=1.0,minTF=2.0)
    model = cv.fit(inputNgram)
    featurizedData = model.transform(inputNgram).select("hashcodefile","label","features")
    print ("Term Frequency completed for training data set")

    # # ######################################
    # Count Vectorizer for testing data set
    # # ######################################
    cvTest = CountVectorizer(inputCol="n-grams", outputCol="features", vocabSize=1000, minDF=1.0,minTF=2.0)

    modelTest = cvTest.fit(inputTestNgram)

    featurizedTestData = modelTest.transform(inputTestNgram).select("hashcodefile","features")
    featurizedTestData.write.parquet("opcodeFeaturesTesting.parquet")

    print("Term Frequency completed for testing data set")

    ###################################################################
    # Code for Random Forest Classifier
    ##################################################################
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(featurizedData)
    # # Train a RandomForest model.
    randomforest = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=600,maxDepth=10)

    #  Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                    labels=labelIndexer.labels)
    # # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer,randomforest,labelConverter])

    # # Train model.  This also runs the indexers.
    model = pipeline.fit(featurizedData)
    predictions = model.transform(featurizedTestData)
    filterPredictions=predictions.select("predictedLabel","hashcodefile")
    predictionsRDD=filterPredictions.rdd

    predictionsRDD.saveAsTextFile("output.text")
Example #8
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer

# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    converted.select("id", "originalCategory").show()
    # $example off$

    spark.stop()
Example #9
                  outputCol=cat_col + 'index',
                  handleInvalid='keep').fit(df_data) for cat_col in cat_cols
]

# ## categorical target column and use of StringIndexer and IndexToString

# In[11]:

eval_indexer = StringIndexer(inputCol='Eval',
                             outputCol='EvalIndex',
                             handleInvalid='keep').fit(df_data)

# In[12]:

indexer_to_eval = IndexToString(inputCol='prediction',
                                outputCol='Evaluted_Class',
                                labels=eval_indexer.labels)

# ## VectorAssembler and Pipeline and CrossValidator

# In[13]:

feature_set = [cat_col + 'index' for cat_col in cat_cols]

# In[14]:

assembler = VectorAssembler(inputCols=feature_set, outputCol='features')

# In[15]:

random_forest_dt = RandomForestClassifier(featuresCol='features',
Example #10
def my_transform(rdd):
    with open("./index2whiskey1.json", mode="r", encoding="utf-8") as f:
        whiskey_list = list(json.loads(f.read()).values())
    model = ALSModel.load("hdfs://master/ALSModel1/")
    spark = SparkSession.builder.appName('sql coming~').getOrCreate()
    whiskey = rdd.map(lambda x: Row(whiskeyId=int(x[1]), user_name=x[0]))
    whiskey_df = spark.createDataFrame(whiskey)
    predict = model.recommendForItemSubset(whiskey_df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )

    df_whiskey = model.recommendForUserSubset(df_user, 5)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(whiskey_df, on=['whiskeyId'], how='left')
    result_df = result_df.select("user_name",
                    result_df["recommendations"][0].whiskeyId.alias("whiskeyId1"), \
                    result_df["recommendations"][1].whiskeyId.alias("whiskeyId2"), \
                    result_df["recommendations"][2].whiskeyId.alias("whiskeyId3"), \
                    result_df["recommendations"][3].whiskeyId.alias("whiskeyId4"), \
                    result_df["recommendations"][4].whiskeyId.alias("whiskeyId5") \
                    )
    whiskeyId1converter = IndexToString(inputCol="whiskeyId1",
                                        outputCol="whiskey1",
                                        labels=whiskey_list)
    whiskeyId2converter = IndexToString(inputCol="whiskeyId2",
                                        outputCol="whiskey2",
                                        labels=whiskey_list)
    whiskeyId3converter = IndexToString(inputCol="whiskeyId3",
                                        outputCol="whiskey3",
                                        labels=whiskey_list)
    whiskeyId4converter = IndexToString(inputCol="whiskeyId4",
                                        outputCol="whiskey4",
                                        labels=whiskey_list)
    whiskeyId5converter = IndexToString(inputCol="whiskeyId5",
                                        outputCol="whiskey5",
                                        labels=whiskey_list)

    result_df = whiskeyId1converter.transform(result_df)
    result_df = whiskeyId2converter.transform(result_df)
    result_df = whiskeyId3converter.transform(result_df)
    result_df = whiskeyId4converter.transform(result_df)
    result_df = whiskeyId5converter.transform(result_df)

    return result_df.rdd
    def Train(self):
        st = time.time()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
        model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
        summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        trainingData, validationData = MLUtils.get_training_and_validation_data(
            indexed, result_column, 0.8)
        OriginalTargetconverter = IndexToString(
            inputCol="label", outputCol="originalTargetColumn")
        levels = trainingData.select("label").distinct().collect()

        if self._classifier == "lr":
            if len(levels) == 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8)
            elif len(levels) > 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8,
                                        family="multinomial")
            fit = lr.fit(trainingData)
        elif self._classifier == "OneVsRest":
            lr = LogisticRegression()
            ovr = OneVsRest(classifier=lr)
            fit = ovr.fit(trainingData)
        transformed = fit.transform(validationData)
        MLUtils.save_pipeline_or_model(fit, model_filepath)

        print(fit.coefficientMatrix)
        print(fit.interceptVector)

        # feature_importance = MLUtils.calculate_sparkml_feature_importance(indexed,fit,categorical_columns,numerical_columns)
        label_classes = transformed.select("label").distinct().collect()
        results = transformed.select(["prediction", "label"])
        if len(label_classes) > 2:
            evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction")
            evaluator.evaluate(results)
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "accuracy"})  # accuracy of the model
        else:
            evaluator = BinaryClassificationEvaluator(
                rawPredictionCol="prediction")
            evaluator.evaluate(results)
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderROC"})
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderPR"})
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "areaUnderPR"})  # accuracy of the model

        # self._model_summary["feature_importance"] = MLUtils.transform_feature_importance(feature_importance)
        self._model_summary["runtime_in_seconds"] = round((time.time() - st),
                                                          2)

        transformed = OriginalTargetconverter.transform(transformed)
        label_indexer_dict = [
            dict(enumerate(field.metadata["ml_attr"]["vals"]))
            for field in transformed.schema.fields if field.name == "label"
        ][0]
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            "predictedClass", prediction_to_levels(transformed.prediction))
        prediction_df = transformed.select(
            ["originalTargetColumn", "predictedClass"]).toPandas()
        objs = {
            "actual": prediction_df["originalTargetColumn"],
            "predicted": prediction_df["predictedClass"]
        }

        self._model_summary[
            "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
                objs["actual"], objs["predicted"])
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"])
        self._model_summary[
            "precision_recall_stats"] = overall_precision_recall[
                "classwise_stats"]
        self._model_summary["model_precision"] = overall_precision_recall[
            "precision"]
        self._model_summary["model_recall"] = overall_precision_recall[
            "recall"]
        self._model_summary["target_variable"] = result_column
        self._model_summary[
            "test_sample_prediction"] = overall_precision_recall[
                "prediction_split"]
        self._model_summary["algorithm_name"] = "Random Forest"
        self._model_summary["validation_method"] = "Train and Test"
        self._model_summary["independent_variables"] = len(
            categorical_columns) + len(numerical_columns)
        self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
            trainingData,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            dataType="spark")
        # print json.dumps(self._model_summary,indent=2)
        self._model_summary["total_trees"] = 100
        self._model_summary["total_rules"] = 300
        CommonUtils.write_to_file(
            summary_filepath, json.dumps({"modelSummary":
                                          self._model_summary}))
Example #12
hashtf = HashingTF(numFeatures=2**16, inputCol="ngrams", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")
lines = Pipeline(stages=[tokenizer, ngrams, hashtf, idf, labels])

linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

truePositive = predictions[(predictions.label == 0)
                           & (predictions.prediction == 0)].count()
trueNegative = predictions[(predictions.label == 1)
                           & (predictions.prediction == 1)].count()
falsePositive = predictions[(predictions.label == 1)
                            & (predictions.prediction == 0)].count()
falseNegative = predictions[(predictions.label == 0)
                            & (predictions.prediction == 1)].count()
recall = float(truePositive) / (truePositive + falseNegative)
precision = float(truePositive) / (truePositive + falsePositive)

print("True Positive", truePositive)
Example #13
File: test_param.py  Project: Brett-A/spark
 def test_list_string(self):
     for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
         idx_to_string = IndexToString(labels=labels)
         self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
     self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
Example #14
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 2 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol='features', outputCol='indexedFeatures',
                                   maxCategories=2).fit(dataset)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol='indexedLabel',
                                featuresCol='indexedFeatures',
                                numTrees=10)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictedLabel',
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select('predictedLabel', 'label', 'features').show(5)
def make_class_model(data,
                     sc,
                     model_path,
                     model_name,
                     target,
                     ml_model='default',
                     save=True):

    t0 = time()
    # Stages for pipline
    stages = []

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    targetIndexer = StringIndexer(inputCol="target",
                                  outputCol="indexedTarget",
                                  handleInvalid="keep").fit(data)
    stages += [targetIndexer]

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [
        x for (x, dataType) in trainingData.dtypes
        if (((dataType == "string") | (dataType == "boolean"))
            & (x != "target"))
    ]

    numCols = [
        x for (x, dataType) in trainingData.dtypes
        if ((dataType == "int") | (dataType == "bigint")
            | (dataType == "float") | (dataType == "double"))
    ]

    # OneHotEncode categorical variables
    indexers = [
        StringIndexer(inputCol=column,
                      outputCol=column + "-index",
                      handleInvalid="keep") for column in catCols
    ]

    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=[
            "{0}-encoded".format(indexer.getOutputCol())
            for indexer in indexers
        ])
    assembler_cat = VectorAssembler(inputCols=encoder.getOutputCols(),
                                    outputCol="categorical-features",
                                    handleInvalid="skip")

    stages += indexers
    stages += [encoder, assembler_cat]

    assembler_num = VectorAssembler(inputCols=numCols,
                                    outputCol="numerical-features",
                                    handleInvalid="skip")

    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features",
                            outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip")

    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model by default or another specified model.
    if ml_model == 'default':
        rf = RandomForestClassifier(labelCol="indexedTarget",
                                    featuresCol="features",
                                    numTrees=10)
    else:
        rf = ml_model

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=targetIndexer.labels)

    stages += [rf, labelConverter]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    #predictions.select("predictedLabel", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedTarget",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = %g" % (0.0 + accuracy))

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)

        cluster = Cluster(['127.0.0.1'], "9042")
        session = cluster.connect("models")
        query = (
            "INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)"
        ) % ("models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(
            query, (model_name, timestamp, target, tt, model_path, accuracy))
        session.shutdown()
        cluster.shutdown()

        # Stop spark session
        sc.stop()

    if not save:
        return model, sc
#test_rdd = test_transformed.map(lambda data: Vectors.dense([float(c) for c in data]))


data_transformed = test_transformed.select(col("Id").alias("label"), col("features")).map(lambda row: LabeledPoint(row.label, row.features))

#Evaluate the model on the training data - output "ID", "prediction"
realTest_labelsAndPreds = data_transformed.map(lambda p: (p.label, (float(nb_model.predict(p.features)))))

output = sqlContext.createDataFrame(realTest_labelsAndPreds,['id','Category_Index'])

#convert back to Categories
#you need SPARK1.6 for this
#in cmd prompt,type in: sudo yum install spark-core spark-master spark-worker spark-python
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="Category_Index", outputCol="originalCategory", labels=classifymodel.labels)
converted = converter.transform(output)

#converted.write.format('com.databricks.spark.csv').save('submission1.csv')

def toCSVLine(data):
  return ','.join(str(d) for d in data)

lines = converted.map(toCSVLine)
lines.saveAsTextFile('submission1.csv')


#view Error rates
#realTest_trainErr = realTest_labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_transformed.count())
#print("Training Error = " + str(realTest_trainErr))
Example #17
    result = []
    for rec in x:
        result.append(url2domain(rec[0]))
    return result


udfFunc = udf(lambda y: array2domain(y), ArrayType(StringType()))

domains_df = st.select('uid', udfFunc('visits').alias("urls"))

model = PipelineModel.load("lab04/model")
indexed = model.transform(domains_df)
labels = model.stages[1].labels

converter = IndexToString(inputCol="prediction",
                          outputCol="gender_age",
                          labels=labels)
converted = converter.transform(indexed)

out_df = converted.select("uid", "gender_age")

out_columns = list(out_df.columns)

query = out_df \
  .select(to_json(struct(*out_columns)).alias("value")) \
  .writeStream \
  .outputMode("update") \
  .format("kafka") \
  .option("checkpointLocation", "chk_12") \
  .option("kafka.bootstrap.servers", kafka_bootstrap ) \
  .option("topic", topic_out) \
Example #18
    # one-hot encode categorical features
    encoder = OneHotEncoder(
        inputCols=["{}_index".format(col) for col in string_cols],
        outputCols=one_hot_encoded_features)

    # assemble all features into feature vector
    features_assembler = VectorAssembler(inputCols=num_bool_features,
                                         outputCol="features")

    # Index labels, adding metadata to the label column.
    label_indexer = StringIndexer(inputCol="has_over_50k",
                                  outputCol="label").fit(processed_train_set)

    # Convert indexed labels back to original labels.
    label_converter = IndexToString(inputCol="prediction",
                                    outputCol="predicted_label",
                                    labels=label_indexer.labels)

    # - ChiSQ feature Selection
    selector = ChiSqSelector(numTopFeatures=20,
                             featuresCol="features",
                             outputCol="featuresSel",
                             labelCol="label")

    # - RandomForest model with parameter tuning using cross validation
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="featuresSel",
                                numTrees=20)

    # - Create ParamGrid for Cross Validation
    rf_param_grid = (ParamGridBuilder().addGrid(
Example #19
 def test_attr_spark(self):
     conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
     num_partitions = 2
     enumerator = "join"
     model_type = "regression"
     label = 'target'
     sparkContext = SparkContext(conf=conf)
     sqlContext = SQLContext(sparkContext)
     train_df = sqlContext.read.csv("toy_train.csv", header='true',
                         inferSchema='true')
     test_df = sqlContext.read.csv("toy.csv", header='true',
                         inferSchema='true')
     # initializing stages of main transformation pipeline
     stages = []
     # list of categorical features for further hot-encoding
     cat_features = ['a', 'b', 'c']
     for feature in cat_features:
         string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
         encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
         encoder.setDropLast(False)
         stages += [string_indexer, encoder]
     assembler_inputs = [feature + "_vec" for feature in cat_features]
     assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
     stages += [assembler]
     assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
     stages += [assembler_final]
     pipeline = Pipeline(stages=stages)
     train_pipeline_model = pipeline.fit(train_df)
     test_pipeline_model = pipeline.fit(test_df)
     train_df_transformed = train_pipeline_model.transform(train_df)
     test_df_transformed = test_pipeline_model.transform(test_df)
     train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
     test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
     decode_dict = {}
     counter = 0
     cat = 0
     for feature in cat_features:
         colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
         colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
         for item in colIdx:
             decode_dict[counter] = (cat, item, colIdx[item], counter)
             counter = counter + 1
         cat = cat + 1
     train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
     test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
     lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
     lr_model = lr.fit(train_df_transform_fin)
     eval = lr_model.evaluate(test_df_transform_fin)
     f_l2 = eval.meanSquaredError
     pred = eval.predictions
     pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
     predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
     converter = IndexToString(inputCol='features', outputCol='cats')
     all_features = list(decode_dict)
     predictions = predictions.collect()
     spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="join")
     spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="union")
     self.assertEqual(3, len(spark_join.slices))
     print("check1")
     self.assertEqual(spark_join.min_score, spark_union.min_score)
     print("check2")
     self.assertEqual(spark_join.keys, spark_union.keys)
     print("check3")
     self.assertEqual(len(spark_join.slices), len(spark_union.slices))
     print("check4")
     idx = -1
     for sliced in spark_join.slices:
         idx += 1
         self.assertEqual(sliced.score, spark_union.slices[idx].score)
     print("check5")
Example #20
# for item in rel:
#     print(item)
"""构建ML的pipeline"""
## 分别获取标签列和特征列,并进行了重命名
label_indexer = StringIndexer().setInputCol("label").setOutputCol(
    "indexedLabel").fit(df)
feature_indexer = VectorIndexer().setInputCol("features").setOutputCol(
    "indexedFeatures").fit(df)
"""把数据集分成训练集和测试集"""
training_data, test_data = df.randomSplit([0.7, 0.3])
mlr = LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8) \
    .setFamily("multinomial")

# print("LogisticRegression parameters:\n" + lr.explainParams)
"""设置lebelConverter,目的是把预测的类别重新转成字符型"""
label_converter = IndexToString().setInputCol("prediction").setOutputCol(
    "predictionLabel").setLabels(label_indexer.labels)
mlr_pipeline = Pipeline().setStages(
    [label_indexer, feature_indexer, mlr, label_converter])
mlr_pipeline_model = mlr_pipeline.fit(training_data)
"""pipeline本质上是一个Estimator,当pipeline调用fit()的时候就产生了一个PipelineModel,本质上是一个Transformer。
然后这个PipelineModel就可以调用transform()来进行预测,生成一个新的DataFrame,即利用训练得到的模型对测试集进行验证。"""

mlr_predictions = mlr_pipeline_model.transform(test_data)
pre_rel = mlr_predictions.select("predictionLabel", "label", "features",
                                 "probability").collect()
for item in pre_rel:
    print(
        str(item['label']) + "," + str(item['features']) + "-->prob" +
        str(item['probability']) + ",predictedLabel " +
        str(item['predictionLabel']))
"""创建一个MulticlassClassificationEvaluator实例,用setter方法把预测分类的列名和真实分类的列名进行设置,然后计算预测准确率和错误率"""
Example #21
assembler = VectorAssembler(
    inputCols=['latitude', 'longitude', 'gps_height', 'construction_year', 'population', 'payment_indexed', 'scheme_management_indexed', 'basin_indexed', 'management_indexed',
               'water_quality_indexed', 'quantity_indexed', 'source_indexed', 'extraction_type_indexed',
               'waterpoint_type_indexed'],
    outputCol="features")

scaler = StandardScaler(inputCol='features', outputCol='features_scaled', withStd=True, withMean=False)

labelIndexer = StringIndexer(inputCol="status_group", outputCol="label").fit(df_train)

rf = RandomForestClassifier(labelCol='label', featuresCol='features_scaled', seed=42, maxMemoryInMB=2048)
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="status_group_prediction", labels=labelIndexer.labels)

param_grid = ParamGridBuilder()\
    .addGrid(assembler.outputCol, ['features_scaled'])\
    .addGrid(rf.maxDepth, [10])\
    .addGrid(rf.maxBins, [20])\
    .addGrid(rf.minInstancesPerNode, [1])\
    .addGrid(rf.minInfoGain, [0.0])\
    .addGrid(rf.impurity, ['gini'])\
    .addGrid(rf.numTrees, [30])\
    .addGrid(rf.featureSubsetStrategy, ['all'])\
    .build()

pipeline = Pipeline(stages=[paymentIndexer, schemeManagementIndexer, basinIndexer, qualityIndexer, managementIndexer, quantityIndexer, sourceIndexer, extractionTypeIndexer, waterpointTypeIndexer, assembler, labelIndexer, rf, labelConverter])

cross_val = CrossValidator(
Example #22
def index_to_string(dataset, inputCol):
    from pyspark.ml.feature import IndexToString
    return IndexToString(inputCol=inputCol, outputCol=inputCol+'_i2s').transform(dataset)
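    # Note: because no labels are passed above, the helper relies on the ML attribute
    # metadata attached to inputCol (e.g. written by a StringIndexer); a hypothetical call
    # would look like: index_to_string(predictions, 'prediction')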
    #(trainingData, testData) = fullData.randomSplit([0.9, 0.1])

    #trainingData = trainingData.dropna()
    #testData = testData.dropna()

    indexer = StringIndexer(inputCol="category",
                            outputCol="label").fit(fullData)
    tokenizer = RegexTokenizer(pattern=u'\W+',
                               inputCol="TEXT",
                               outputCol="words",
                               toLowercase=False)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.001)
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="originalcategory",
                                   labels=indexer.labels)

    pipeline = Pipeline(
        stages=[indexer, tokenizer, hashingTF, idf, lr, labelConverter])
    model = pipeline.fit(fullData)

    print("Done training classifier")

    model.save("/home/jys308/weights")

    #pred = model.transform(testData)
    #pl = pred.select("label", "prediction").rdd.cache()
    #metrics = MulticlassMetrics(pl)
    #metrics.fMeasure()
Example #24
print("The deserialized model stages are", model_deserialized.stages)

##############################################################################
## export the final model with mleap

## remove the stringIndexer for the label column so it won't be required for prediction
model_final = model.copy()

si_label_index = -3
model_final.stages.pop(si_label_index)  #si_label

## append an IndexToString transformer to the model pipeline to get the original labels
#labelReverse = IndexToString(inputCol = "label", outputCol = "predIncome") #no need to provide labels
labelReverse = IndexToString(
    inputCol="prediction",
    outputCol="predictedIncome",
    labels=model.stages[si_label_index].labels
)  #must provide labels (from si_label) otherwise will fail
model_final.stages.append(labelReverse)

pred_final = model_final.transform(test)
pred_final.printSchema()
pred_final.show(5)

# remove an old model file, if needed.
if os.path.isfile(model_file):
    os.remove(model_file)
model_final.serializeToBundle(model_file_path, model_final.transform(train))

print("persist the mleap bundle from local to hdfs")
from subprocess import Popen, PIPE
Example #25

# ### Build the feature Vector Assembler

# In[9]:


assembler =  VectorAssembler(outputCol="features", inputCols=list(featureCols))


# ### Convert indexed labels back to original labels

# In[10]:


predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=labelIndexer.labels)


# ## Do the Data Preparation

# In[11]:


labeledData = labelIndexer.transform(df)
# TODO add the other additional indexer
indexedLabeledData = collegeIndexer.transform(labeledData)
labeledPointData = assembler.transform(indexedLabeledData)


# ### Splitting the dataset into train and test sets
Example #26
            `petal_length` DOUBLE,
            `petal_width` DOUBLE,
            `class` STRING
        """

df = spark.read.csv(dbfs_file, schema=schema)

categoricalCols = ["class"]

# The following lines define a StringIndexer estimator and an IndexToString transformer; the fitted indexer and the converter are later applied to transform the dataset.

# Convert it to a numeric value using StringIndexer.
labelToIndex = StringIndexer(inputCol="class", outputCol="indexed_class")
labelIndexer = labelToIndex.fit(df)
labelReverser = IndexToString(inputCol="prediction",
                              outputCol="class",
                              labels=labelIndexer.labels)

# This includes both the numeric columns and the one-hot encoded binary vector columns in our dataset.
numericCols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

lr = LogisticRegression(featuresCol="features",
                        labelCol="indexed_class",
                        regParam=1e5)

# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[labelToIndex, vecAssembler, lr, labelReverser])

# Define the pipeline model.
indexer_acc_fitted = indexer_acc.fit(df)
df = indexer_acc_fitted.transform(df)

indexer_mer = StringIndexer(inputCol="itemId", outputCol="itemIndex")
indexer_mer_fitted = indexer_mer.fit(df)
df = indexer_mer_fitted.transform(df)

print('############################## - LOADING MODEL - ##############################')
model = ALSModel.load('models/moviesrec/')

print('############################## - CLASSIFYING DATA')

userRecommends = model.recommendForAllUsers(10)
userRecommends.show(truncate=False)

print('############################## - EXPLODING PREDICTIONS')
flatUserRecommends = userRecommends.withColumn('userAndRatings', explode(userRecommends.recommendations)).select('userIndex','userAndRatings.*')
flatUserRecommends.show(truncate=False)
print('############################## - CONVERTING INDEXES TO STRING')
userConverter = IndexToString(inputCol='userIndex', outputCol='userId', labels=indexer_acc_fitted.labels)
itemConverter = IndexToString(inputCol='itemIndex', outputCol='itemId', labels=indexer_mer_fitted.labels)

convertedMoviesRecs = Pipeline(stages=[userConverter,itemConverter]).fit(df).transform(flatUserRecommends)
print('############################## - SAVING DATA')

convertedMoviesRecs.write.json('results/usersrec/')

#userRecomends.write.format('json').save('/ML/movies/usersrec/')
# spark-submit als-model-predictions.py --master yarn --deploy-mode client --num-executors 2 --driver-java-options "-XX:+UseG1GC -XX:ResizePLAB -Xms1g -Xmx1g -XX:InitiatingHeapOccupancyPercent=35" --conf "spark.sql.tungsten.enabled=true" --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" --conf "spark.memory.fraction=0.3" --conf "spark.driver.memoryOverhead=2g" --conf "spark.executor.memoryOverhead=1g" --conf "spark.executor.extraJavaOptions -XX:+UseG1GC -XX:ResizePLAB -Xms3g -Xmx3g -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=20"

Example #28
# -*- encoding:utf-8 -*-
"""
@author: zhouning
@file:IndexToString.py
@time:2018/8/7 21:09
@desc:
Corresponding to StringIndexer, IndexToString maps a column of label indices back to the original string labels.
Its main use case is in combination with StringIndexer: first StringIndexer converts the labels into label indices for model training,
and then, when predicting, the label indices are converted back into the original string labels. Of course, you can also supply your own labels.

"""

from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("logistic_regression").getOrCreate()
df = spark.createDataFrame([(0, "ab"), (1, "bb"), (2, "cb"), (3, "aa"),
                            (4, "aa"), (5, "ca")], ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()

converter = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")
converted = converter.transform(indexed)
converted.select("id", "categoryIndex", "originalCategory").show()

spark.stop()
Example #29
from pyspark.ml.linalg import Vectors

# #### Data Preparation
# Before using any model, the data needs to be organized into a set of 'features' and 'labels'.
# In this case, our features are sensor names and their readings, and the label is whether a
# particular asset needs maintenance or not. We'll use Spark's feature extraction libraries for this.
modelData = rawMeasurements.filter('isMaintenance')
si1 = StringIndexer(inputCol='sensor_name', outputCol='sensor_id').fit(modelData).transform(modelData)
va = VectorAssembler(inputCols=['sensor_id','value'], outputCol="features").transform(si1)
li = StringIndexer(inputCol='asset_name', outputCol='label').fit(va)

# #### Model Training
# We split the data into 2 subsets - one to train the model, and one to test/evaluate it 
(trainingData, testData) = va.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
li2s = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=li.labels)
pipeline = Pipeline(stages=[li, rf, li2s])
model = pipeline.fit(trainingData)

# #### Model Evaluation
# The training data was used to fit the model (ie. train it), now we can test the model
# using the test subset, and calculate the accuracy (ie. the fraction of correct predictions)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Our model is very accurate, let's visualize the results. A heatmap can show how many correct and incorrect predictions we made
predictionResults = predictions.groupBy(predictions.predictedLabel.alias('Prediction'),
                    predictions.asset_name.alias('Actual'))\
    .count().toPandas()
Example #30
from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show(5)

# COMMAND ----------

from pyspark.ml.feature import IndexToString

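# Because no labels are supplied here, IndexToString falls back to the label metadata that
# StringIndexer attached to the "labelInd" column, and writes to a default-named output column.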
labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([(Vectors.dense(1, 2, 3), 1),
                               (Vectors.dense(2, 5, 6), 2),
                               (Vectors.dense(1, 8, 9), 3)
                               ]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
        map(lambda x: x.split(",")).\
        map(lambda x: Row(**f(x))).\
        toDF()
data.show()

labelIndexer = StringIndexer().setInputCol("label").\
                setOutputCol("indexedLabel").\
                fit(data)

featureIndexer = VectorIndexer().setInputCol("features").\
                setOutputCol("indexedFeatures").\
                setMaxCategories(4).\
                fit(data)

labelConverter = IndexToString().\
            setInputCol("prediction").\
            setOutputCol("predictedLabel").\
            setLabels(labelIndexer.labels)

dc = DecisionTreeClassifier().\
        setLabelCol("indexedLabel").\
        setFeaturesCol("indexedFeatures")

dcPipeline = Pipeline().setStages(
    [labelIndexer, featureIndexer, dc, labelConverter])

trainingData, testData = data.randomSplit([0.7, 0.3])
dcPipelineModel = dcPipeline.fit(trainingData)
dcPredictions = dcPipelineModel.transform(testData)

preRel = dcPredictions.select("predictedLabel", "label", "features",
                              "probability").collect()
Example #32
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)

# Select (prediction, true label) and compute test error
Example #33
df = label_indexer.transform(df)

# only select the features and label column
df = df.select(['features', 'label'])
print("Reading for machine learning")

df.show(10)

train, test = df.randomSplit([0.70, 0.30])
test.show()

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train)

predictions = model.transform(test)
converter = IndexToString(inputCol="label", outputCol="originallabel")
converted = converter.transform(predictions)

converter = IndexToString(inputCol="prediction",
                          outputCol="prediction_label",
                          labels=user_labels)
converted = converter.transform(converted)
converted.show(5)

customSchema = StructType([
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True)
])
myrdd = spark.sparkContext.parallelize([[5.1, 3.5, 1.4, 0.2]])
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()
from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()


# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import IndexToString
labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show()


# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors
idxIn = spark.createDataFrame([
  (Vectors.dense(1, 2, 3),1),
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\