indexed.show()

# COMMAND ----------

# IndexToString maps a column of label indices back to the original string
# labels; this cell indexes a string column and then converts it back.
from pyspark.ml.feature import IndexToString, StringIndexer

# Small sample frame: an integer id plus a string category per row.
df = spark.createDataFrame(
    data=[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    schema=["id", "category"],
)

# Forward direction: fit the indexer on the sample data, then apply the fitted
# model to add the "categoryIndex" column.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

print(
    "Transformed string column '%s' to indexed column '%s'"
    % (indexer.getInputCol(), indexer.getOutputCol())
)
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

# Reverse direction: no labels are passed to the converter explicitly, so it
# has to pick them up from the indexed column itself.
converter = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
)
converted = converter.transform(indexed)

print(
    "Transformed indexed column '%s' back to original string column '%s' using "
    "labels in metadata"
    % (converter.getInputCol(), converter.getOutputCol())
)
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------
# Example #2
0
# Derive two calendar features from the "Date" column of `df`.
data = df.withColumn("WeekOfYear", weekofyear(df["Date"]))

data2 = data.withColumn("dayOfMonth", dayofmonth(data["Date"]))

data2.printSchema()

# Keep only the derived date features, the currency code and the rate.
good_data = data2.select(["WeekOfYear", "dayOfMonth", "Currency Code", "Rate"])

# Map each distinct currency code string to a numeric index.
currency_indexer = StringIndexer(inputCol='Currency Code',
                                 outputCol='CurrencyIndex')

model = currency_indexer.fit(good_data)
indexed = model.transform(good_data)

print("Transformed string column '%s' to indexed column '%s'" %
      (currency_indexer.getInputCol(), currency_indexer.getOutputCol()))
indexed.show()

encoder = OneHotEncoder(inputCol="CurrencyIndex", outputCol="CurrencyVec")
# Since Spark 3.0 OneHotEncoder is an Estimator: it must be fitted and the
# resulting model does the transform. On Spark 2.x it is a plain Transformer
# without fit(). Fit only when the method exists so the cell runs on both.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)
encoded.show()

# Combine the numeric date features and the one-hot currency vector into a
# single "features" vector column.
assembler = VectorAssembler(
    inputCols=['WeekOfYear', 'dayOfMonth', 'CurrencyVec'],
    outputCol='features')

output = assembler.transform(encoded)
print(
    "Assembled columns 'WeekOfYear', 'dayOfMonth', 'CurrencyVec' to vector column 'features'"
)
#output.select("features", "Rate").show(truncate=False)
if __name__ == "__main__":
    # Obtain (or reuse) the Spark session that drives this example.
    spark = (
        SparkSession.builder
        .appName("IndexToStringExample")
        .getOrCreate()
    )

    # $example on$
    sample_rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    df = spark.createDataFrame(sample_rows, ["id", "category"])

    # Index the string column, keeping the fitted model around.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    index_message = "Transformed string column '%s' to indexed column '%s'" % (
        indexer.getInputCol(),
        indexer.getOutputCol(),
    )
    print(index_message)
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    # Convert the indices back to strings; no labels are supplied explicitly.
    converter = IndexToString(
        inputCol="categoryIndex",
        outputCol="originalCategory",
    )
    converted = converter.transform(indexed)

    restore_message = (
        "Transformed indexed column '%s' back to original string column '%s' using "
        "labels in metadata"
    ) % (converter.getInputCol(), converter.getOutputCol())
    print(restore_message)

    restored_view = converted.select("id", "categoryIndex", "originalCategory")
    restored_view.show()
    # $example off$

    spark.stop()
# Example #4
0
# Pipeline stages, in order: the string indexer, the vector assembler, and the
# regression estimator (all three are defined outside this cell).
pipeline = Pipeline(stages=[indexer, assembler, multinomialRegression])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Type checks use isinstance() rather than `type(x) == type(Cls())`, so no
# throwaway stage objects are constructed just to read their type.
# NOTE(review): dbTest is defined outside this cell; it appears to take
# (test id, expected value, actual value) - confirm against its definition.

# Indexer: expected to read 'species' and write 'speciesClass'.
dbTest("ML1-P-07-02-01", True, isinstance(indexer, StringIndexer))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

# Assembler: expected to take every irisDF column except the last one and
# write them into 'features'.
dbTest("ML1-P-07-02-04", True, isinstance(assembler, VectorAssembler))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

# Regression: expected to be wired to the indexer and assembler outputs.
dbTest("ML1-P-07-02-07", True, isinstance(multinomialRegression, LogisticRegression))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, isinstance(pipeline, Pipeline))

print("Tests passed!")

# COMMAND ----------