# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.  VectorAssembler can be found in [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) for Python and the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler) package for Scala.
# MAGIC
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Create an empty pipeline and an unconfigured assembler; their params are
# set in the ANSWER cell below.
# NOTE(review): `Pipeline` is not imported in this cell — presumably
# `from pyspark.ml import Pipeline` ran in an earlier cell; confirm.
pipeline = Pipeline()
assembler = VectorAssembler()

# Use the print() function form for consistency with the rest of the file
# (other cells in this file already call print(...)); also Python-3 safe.
print(assembler.explainParams())
print('\n', pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Configure the assembler: merge the two bucketized columns into a single
# vector column named "featuresBucketized".
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Both bucketizers must run before the assembler, since the assembler
# consumes the columns they produce.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])

# Fit the pipeline and transform the input in one pass.
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------
"CharlsonIndexI_ave", "CharlsonIndexI_range", "CharlsonIndexI_stdev", "pcg1", "pcg2", "pcg3", "pcg4", "pcg5", "pcg6", "pcg7", "pcg8", "pcg9", "pcg10", "pcg11", "pcg12", "pcg13", "pcg14", "pcg15", "pcg16", "pcg17", "pcg18", "pcg19", "pcg20", "pcg21", "pcg22", "pcg23", "pcg24", "pcg25", "pcg26", "pcg27", "pcg28", "pcg29", "pcg30", "pcg31", "pcg32", "pcg33", "pcg34", "pcg35", "pcg36", "pcg37", "pcg38", "pcg39", "pcg40", "pcg41", "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4", "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1", "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11", "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2", "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max", "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max", "labCount_min", "labCount_ave", "labcount_months" ]) vecAssembler.setOutputCol("features") print vecAssembler.explainParams() from pyspark.ml.classification import DecisionTreeClassifier aft = DecisionTreeClassifier() aft.setLabelCol("Readmitlabel") aft.setMaxDepth(30) print aft.explainParams() # COMMAND ---------- from pyspark.ml import Pipeline # We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar. lrPipeline = Pipeline()
# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.
# MAGIC
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Create an empty pipeline and an unconfigured assembler; their params are
# set in the ANSWER cell below.
# NOTE(review): `Pipeline` is not imported in this cell — presumably
# imported in an earlier cell; confirm.
pipeline = Pipeline()
assembler = VectorAssembler()

# print() function form for consistency with the rest of the file; Python-3 safe.
print(assembler.explainParams())
print('\n', pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Configure the assembler: merge the two bucketized columns into a single
# vector column named "featuresBucketized".
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Both bucketizers run before the assembler, which consumes their outputs.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])

irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------
print(PatientLabIDCounts)

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Identifier/label columns that must not be folded into the feature vector.
ignore = ['SUBJECT_ID', 'TARGET', 'GENDER']
assembler = VectorAssembler(
    inputCols=[x for x in PatientLabIDCounts.columns if x not in ignore],
    outputCol='features')

# Last expression of the cell — Databricks displays the transformed DataFrame.
assembler.transform(PatientLabIDCounts)

# COMMAND ----------

print(assembler.explainParams())

# Disabled experiment: chain the assembler with a k-means estimator.
if 0:
    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=[assembler, kmeans_estimator])
    # BUG FIX: fit() must be called on the Pipeline *instance*, not the
    # Pipeline class — the original `Pipeline.fit(PatientLabIDCounts)` would
    # raise a TypeError (missing `self`) if this branch were ever enabled.
    model = pipeline.fit(PatientLabIDCounts)

# COMMAND ----------

# use sepsisCodeSet as columns; for some reason can't resolve columns
# 50827, 50825, 50816 - not sure why
# print(sepsisCodeSet)
# ['50893', '50878', '50956', '51300', '51244', '51248', '51265',
#  '50820', '50821', '50827', '50825', '51222', '51221', '50882', '50889',
#  '50818', '51288', '50863', '50816', '50867', '51237', '51256', '51274',
#  '51275', '51277', '50912', '51006', '50931', '50813']