# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine them into a single `Vector` with `VectorAssembler`. `VectorAssembler` lives in the [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) module for Python and in the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler) package for Scala.
# MAGIC
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

pipeline = Pipeline()
assembler = VectorAssembler()

print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Set assembler params
(assembler.setInputCols(['lengthFeatures',
                         'widthFeatures']).setOutputCol('featuresBucketized'))

pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(
    irisSeparateFeatures)
display(irisAssembled)
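
# COMMAND ----------

# MAGIC %md
# MAGIC As a quick check (a sketch, not part of the original exercise), we can inspect the assembled vector column alongside its two inputs:

# COMMAND ----------

# Show the bucketized inputs next to the assembled vector.
irisAssembled.select('lengthFeatures', 'widthFeatures', 'featuresBucketized').show(5, truncate=False)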

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

vecAssembler = VectorAssembler()
vecAssembler.setInputCols([
    "CharlsonIndexI_ave", "CharlsonIndexI_range", "CharlsonIndexI_stdev",
    "pcg1", "pcg2", "pcg3", "pcg4", "pcg5", "pcg6", "pcg7", "pcg8", "pcg9",
    "pcg10", "pcg11", "pcg12", "pcg13", "pcg14", "pcg15", "pcg16", "pcg17",
    "pcg18", "pcg19", "pcg20", "pcg21", "pcg22", "pcg23", "pcg24", "pcg25",
    "pcg26", "pcg27", "pcg28", "pcg29", "pcg30", "pcg31", "pcg32", "pcg33",
    "pcg34", "pcg35", "pcg36", "pcg37", "pcg38", "pcg39", "pcg40", "pcg41",
    "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4",
    "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1",
    "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11",
    "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2",
    "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max",
    "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max",
    "labCount_min", "labCount_ave", "labcount_months"
])
vecAssembler.setOutputCol("features")
print(vecAssembler.explainParams())
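
# Sketch (assumption): apply the assembler to the claims DataFrame, here
# called claimsDF, which is not defined in this excerpt.
# assembledClaims = vecAssembler.transform(claimsDF)
# assembledClaims.select('features').show(3, truncate=False)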

from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree classifier predicting the readmission label.
dtc = DecisionTreeClassifier()
dtc.setLabelCol("Readmitlabel")
dtc.setMaxDepth(30)  # 30 is the maximum depth Spark ML decision trees allow

print(dtc.explainParams())
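
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of wiring the assembler and the tree into a single pipeline (the training `DataFrame`, here called `trainDF`, is an assumption; it is not defined in this excerpt):

# COMMAND ----------

from pyspark.ml import Pipeline

dtPipeline = Pipeline(stages=[vecAssembler, dtc])
# Hypothetical training step; uncomment once trainDF exists.
# dtModel = dtPipeline.fit(trainDF)
# predictions = dtModel.transform(trainDF)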

# COMMAND ----------

from pyspark.ml import Pipeline

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()
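
# COMMAND ----------

# MAGIC %md
# MAGIC The stages of `lrPipeline` are configured in later cells; a minimal sketch of what that could look like (the `LogisticRegression` stage and its column names are assumptions):

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

# Hypothetical: assemble the features, then fit a logistic regression on them.
lr = LogisticRegression(featuresCol='features', labelCol='label')
lrPipeline.setStages([vecAssembler, lr])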

# COMMAND ----------
print(PatientLabIDCounts)

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

ignore = ['SUBJECT_ID', 'TARGET', 'GENDER']
assembler = VectorAssembler(
    inputCols=[x for x in PatientLabIDCounts.columns if x not in ignore],
    outputCol='features')

assembledDF = assembler.transform(PatientLabIDCounts)
display(assembledDF)

# COMMAND ----------

print(assembler.explainParams())

if 0:  # disabled; requires kmeans_estimator (defined in the sketch below)
  from pyspark.ml import Pipeline
  pipeline = Pipeline(stages=[assembler, kmeans_estimator])
  model = pipeline.fit(PatientLabIDCounts)  # fit the instance, not the Pipeline class
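
# COMMAND ----------

# MAGIC %md
# MAGIC `kmeans_estimator` is referenced above but never defined in this excerpt; one plausible definition (an assumption, not the author's code) is:

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Hypothetical clustering stage consuming the assembled "features" column.
kmeans_estimator = KMeans(featuresCol='features', k=2, seed=27)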

# COMMAND ----------

# Use sepsisCodeSet as the input columns. Columns 50827, 50825, and 50816
# currently fail to resolve (cause not yet determined; see the workaround
# sketch below).
# print(sepsisCodeSet)
# ['50893', '50878', '50956', '51300', '51244', '51248', '51265',
#  '50820', '50821', '50827', '50825', '51222', '51221', '50882', '50889',
#  '50818', '51288', '50863', '50816', '50867', '51237', '51256', '51274',
#  '51275', '51277', '50912', '51006', '50931', '50813']
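
# COMMAND ----------

# A possible workaround (an assumption, untested here): purely numeric column
# names can confuse Spark's column resolution, so renaming the offending
# columns with an alphabetic prefix before assembly may avoid the error.
# for code in ['50827', '50825', '50816']:
#     PatientLabIDCounts = PatientLabIDCounts.withColumnRenamed(code, 'lab_' + code)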