if __name__ == "__main__":
    # Build (or reuse) the Spark session that drives this example.
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # try/finally guarantees the session is stopped even when one of the
    # fit/transform/show steps below raises.
    try:
        # $example on$
        df = spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])

        # Fit a StringIndexer on "category" and append the "categoryIndex" column.
        indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        model = indexer.fit(df)
        indexed = model.transform(df)

        print("Transformed string column '%s' to indexed column '%s'"
              % (indexer.getInputCol(), indexer.getOutputCol()))
        indexed.show()

        print("StringIndexer will store labels in output column metadata\n")

        # No explicit labels are passed here: as the messages printed around this
        # step state, the conversion relies on the labels in the column metadata.
        converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
        converted = converter.transform(indexed)

        print("Transformed indexed column '%s' back to original string column '%s' using "
              "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
        converted.select("id", "categoryIndex", "originalCategory").show()
        # $example off$
    finally:
        spark.stop()
# Fit the indexer on the input frame, then append the index column to it.
model = indexer.fit(df)
indexed = model.transform(df)

# Report which columns were involved and display the indexed frame.
print(
    "Transformed string column '%s' to indexed column '%s'"
    % (indexer.getInputCol(), indexer.getOutputCol())
)
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

# Convert the index column back into a string column named "originalCategory".
converter = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
)
converted = converter.transform(indexed)

# Show the index column next to the recovered string column.
print(
    "Transformed indexed column '%s' back to original string column '%s' "
    "using labels in metadata"
    % (converter.getInputCol(), converter.getOutputCol())
)
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------

### The one-hot encoder estimator maps categorical features to binary vectors. It is common practice to run StringIndexer first to convert the raw features into indexed features.
# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder. Fall back to the
# new name under the old alias so this cell runs on both Spark 2.3/2.4 and 3.x.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Two numeric category-index columns to encode.
df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                            (0.0, 1.0), (2.0, 0.0)],
                           ["categoryIndex1", "categoryIndex2"])

# Each input column is paired with the output column at the same list position.
encoder = OneHotEncoderEstimator(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"])
# Fit the estimator on df; the fitted model is kept in `model`.
model = encoder.fit(df)
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this script.
spark = SparkSession.builder.getOrCreate()

# try/finally guarantees the session is stopped even when a step below raises.
try:
    # Only log at ERROR level so the printed tables are not buried in log lines.
    spark.sparkContext.setLogLevel("ERROR")

    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                                (5, "c")], ["id", "category"])

    # Fit a StringIndexer on "category" and append the "categoryIndex" column.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)
    print("Transformed string column '%s' to indexed column '%s'" %
          (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    # Convert the index column back into a string column; per the message
    # printed below, the labels come from the column metadata.
    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)
    print(
        "Transformed indexed column '%s' back to original string column '%s' using labels in metadata"
        % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
finally:
    spark.stop()