Python StringIndexer.getInputCol 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: pyspark.ml.feature

클래스/타입: StringIndexer

메소드/함수: getInputCol

hotexamples.com에서의 예제들: 4

Python StringIndexer.getInputCol - 4개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 pyspark.ml.feature.StringIndexer.getInputCol(Python)의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 각 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

StringIndexer(30)

fit(30)

transform(30)

getOutputCol(22)

show(19)

select(15)

setHandleInvalid(14)

write(10)

drop(9)

randomSplit(8)

toPandas(4)

withColumnRenamed(4)

getInputCol(3)

withColumn(3)

groupBy(3)

where(3)

printSchema(3)

save(2)

setInputCol(2)

count(2)

take(1)

describe(1)

setOutputCol(1)

filter(1)

dropna(1)

fitAsync(1)

orderBy(1)

_call_java(1)

labels(1)

groupby(1)

getOutputCols(1)

fillna(1)

load(1)

예제 #1

파일 보기

파일: Data engineering pyspark.py 프로젝트: hmk88/Pyspark_ML_databricks_ApacheSpark

indexed.show()

# COMMAND ----------

# IndexToString retrieves the original labels from the indexed label.
# This cell indexes a string column with StringIndexer, then maps the indices
# back to strings with IndexToString and prints both intermediate results.
from pyspark.ml.feature import IndexToString, StringIndexer

# NOTE(review): `spark` is not defined in this cell; presumably the notebook
# runtime provides the session -- confirm.
# Six (id, category) rows; "a" appears three times, "c" twice, "b" once.
df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["id", "category"])

# Fit the indexer on `df`, then use the fitted model to add "categoryIndex".
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

# Report the configured column names by reading them back from the estimator.
print("Transformed string column '%s' to indexed column '%s'" %
      (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

# No labels are passed to the converter here; per the messages printed in this
# cell, it relies on the labels stored in the column metadata.
converter = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")
converted = converter.transform(indexed)

print(
    "Transformed indexed column '%s' back to original string column '%s' using "
    "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
# Show only the id, the index and the recovered string side by side.
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------

예제 #2

파일 보기

# Build a feature vector from the date parts and the currency code of `df`.
# NOTE(review): `df`, `weekofyear`, `dayofmonth`, `StringIndexer`,
# `OneHotEncoder` and `VectorAssembler` are defined or imported outside this
# excerpt -- confirm the imports before running it in isolation.

# Add a "WeekOfYear" column computed from the "Date" column.
data = df.withColumn("WeekOfYear", weekofyear(df["Date"]))

# Add a "dayOfMonth" column computed from the same "Date" column.
data2 = data.withColumn("dayOfMonth", dayofmonth(data["Date"]))

data2.printSchema()

# Keep only the two derived columns, the currency code and the "Rate" column.
good_data = data2.select(["WeekOfYear", "dayOfMonth", "Currency Code", "Rate"])

# Index the string-valued "Currency Code" column into "CurrencyIndex".
currency_indexer = StringIndexer(inputCol='Currency Code',
                                 outputCol='CurrencyIndex')

model = currency_indexer.fit(good_data)
indexed = model.transform(good_data)

print("Transformed string column '%s' to indexed column '%s'" %
      (currency_indexer.getInputCol(), currency_indexer.getOutputCol()))
indexed.show()

# NOTE(review): transform() is called on the encoder without a prior fit().
# That matches the Spark 2.x OneHotEncoder (a Transformer); in Spark 3.x
# OneHotEncoder is an Estimator and must be fitted first -- confirm the
# targeted Spark version.
encoder = OneHotEncoder(inputCol="CurrencyIndex", outputCol="CurrencyVec")
encoded = encoder.transform(indexed)
encoded.show()

# Combine the two date columns and the encoded currency into "features".
assembler = VectorAssembler(
    inputCols=['WeekOfYear', 'dayOfMonth', 'CurrencyVec'],
    outputCol='features')

output = assembler.transform(encoded)
print(
    "Assembled columns 'WeekOfYear', 'dayOfMonth', 'CurrencyVec' to vector column 'features'"
)
#output.select("features", "Rate").show(truncate=False)

예제 #3

파일 보기

파일: index_to_string_example.py 프로젝트: lhfei/spark-in-action

if __name__ == "__main__":
    # Create (or reuse) the session used for every DataFrame below.
    spark = (
        SparkSession.builder
        .appName("IndexToStringExample")
        .getOrCreate()
    )

    # $example on$
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    df = spark.createDataFrame(rows, ["id", "category"])

    # Fit the indexer on the data and immediately apply the fitted model.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    indexed = indexer.fit(df).transform(df)

    indexer_cols = (indexer.getInputCol(), indexer.getOutputCol())
    print("Transformed string column '%s' to indexed column '%s'" % indexer_cols)
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    # Map the indices back to strings; no labels are passed explicitly.
    converter = IndexToString(
        inputCol="categoryIndex",
        outputCol="originalCategory",
    )
    converted = converter.transform(indexed)

    converter_cols = (converter.getInputCol(), converter.getOutputCol())
    print(
        "Transformed indexed column '%s' back to original string column '%s' using "
        "labels in metadata" % converter_cols
    )
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()

예제 #4

파일 보기

# Chain the three stages into a single Pipeline, in the order listed.
# NOTE(review): `indexer`, `assembler` and `multinomialRegression` are defined
# outside this excerpt; the test cell that follows expects them to be a
# StringIndexer, a VectorAssembler and a LogisticRegression respectively.
pipeline = Pipeline(stages=[
  indexer, 
  assembler, 
  multinomialRegression
])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Each dbTest call receives a check id, the expected value and the observed
# value. NOTE(review): dbTest, indexer, assembler, multinomialRegression,
# pipeline and irisDF are defined outside this cell -- presumably by the
# course notebook and the preceding solution cell; confirm.
#
# Stage types are checked with isinstance() rather than by comparing against
# type(SomeStage()): this avoids constructing a throwaway estimator for every
# check and also accepts subclasses of the expected stage.

# Indexer: correct type and column wiring.
dbTest("ML1-P-07-02-01", True, isinstance(indexer, StringIndexer))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

# Assembler: consumes every irisDF column except the last one.
dbTest("ML1-P-07-02-04", True, isinstance(assembler, VectorAssembler))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

# Classifier: reads the indexer's output as label and the assembled vector.
dbTest("ML1-P-07-02-07", True, isinstance(multinomialRegression, LogisticRegression))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, isinstance(pipeline, Pipeline))

print("Tests passed!")

# COMMAND ----------