from pyspark.ml import Pipeline
from pyspark.ml.feature import (Imputer, MinMaxScaler, OneHotEncoderEstimator,
                                StringIndexer, VectorAssembler)


def get_ml1_pipeline():
    stages = []

    # Impute missing values in the numerical columns in place.
    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS, outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    # Index each categorical column; the indices feed the one-hot encoder below.
    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column,
                                    outputCol=categorical_column + "_index",
                                    handleInvalid="keep")
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    # OneHotEncoderEstimator is the Spark 2.3/2.4 name; it was renamed OneHotEncoder in Spark 3.0.
    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols, outputCols=ohe_output_cols,
                                     handleInvalid="error", dropLast=False)
    stages.append(encoder)

    # Assemble the imputed numerical columns into one vector and scale it to [0, 1].
    numerical_vector_assembler = VectorAssembler(inputCols=ML1_NUMERICAL_COLUMNS,
                                                 outputCol="numerical_cols_vec",
                                                 handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerical_cols_vec", outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    # Index the target column "result" into the numeric "label" column expected by Spark ML.
    label_str_indexer = StringIndexer(inputCol="result", outputCol="label", handleInvalid="keep")
    stages.append(label_str_indexer)

    # Combine the one-hot encoded categoricals with the scaled numerical vector.
    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input, outputCol="features", handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages=stages)
    return pipeline
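

# Usage sketch (an assumption, not part of the original snippet): `train_df` is a
# hypothetical DataFrame containing the ML1_NUMERICAL_COLUMNS, the
# ML1_CATEGORICAL_COLUMNS, and a string "result" column.
pipeline = get_ml1_pipeline()
model = pipeline.fit(train_df)           # fits imputer, indexers, encoder, and scaler
featurized = model.transform(train_df)   # adds "features" and "label" (plus intermediate columns)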
Example #2
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import MinMaxScaler, OneHotEncoder, StringIndexer, VectorAssembler


def build_pipeline(pipeconfig: dict) -> Pipeline:
    '''
    Build a Pipeline instance based on a config dictionary
    :param pipeconfig: metadata dictionary
    :return: pyspark.ml.Pipeline
    '''

    # Pipeline metadata
    cats = pipeconfig['variables']['categoricals']
    nums = pipeconfig['variables']['numericals']
    index_names = pipeconfig['metadata']['index_names']
    encoded_names = pipeconfig['metadata']['encoded_names']
    vect_name = pipeconfig['metadata']['vect_name']
    feats_name = pipeconfig['metadata']['feats_name']
    labelcol = pipeconfig['model']['labelCol']
    maxdepth = pipeconfig['model']['maxDepth']
    maxbins = pipeconfig['model']['maxBins']
    maxiter = pipeconfig['model']['maxIter']
    seed = pipeconfig['model']['seed']

    # Build stages
    # Index all categorical columns at once (multi-column StringIndexer, Spark 3.0+).
    stageone = StringIndexer(inputCols=cats,
                             outputCols=index_names)

    stagetwo = OneHotEncoder(dropLast=False,
                             inputCols=stageone.getOutputCols(),
                             outputCols=encoded_names)

    # Assemble numericals plus encoded categoricals into one vector, then scale it.
    stagethree = VectorAssembler(inputCols=nums + stagetwo.getOutputCols(),
                                 outputCol=vect_name)

    stagefour = MinMaxScaler(inputCol=stagethree.getOutputCol(),
                             outputCol=feats_name)

    # Gradient-boosted trees classifier, configured from the "model" section.
    stagefive = GBTClassifier(featuresCol=stagefour.getOutputCol(),
                              labelCol=labelcol,
                              maxDepth=maxdepth,
                              maxBins=maxbins,
                              maxIter=maxiter,
                              seed=seed)
    pipeline = Pipeline(stages=[stageone, stagetwo, stagethree, stagefour, stagefive])

    return pipeline
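

# A hypothetical pipeconfig sketch showing the shape build_pipeline expects.
# The key names come from the lookups above; every value here is made up.
example_config = {
    "variables": {
        "categoricals": ["gender", "country"],
        "numericals": ["age", "income"],
    },
    "metadata": {
        "index_names": ["gender_idx", "country_idx"],
        "encoded_names": ["gender_ohe", "country_ohe"],
        "vect_name": "assembled_vec",
        "feats_name": "features",
    },
    "model": {
        "labelCol": "label",
        "maxDepth": 5,
        "maxBins": 32,
        "maxIter": 20,
        "seed": 42,
    },
}
pipeline = build_pipeline(example_config)

Example #3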
# Random forest hyperparameters used in the model cell below
numTrees = 25
maxDepth = 5
maxBins = 5

# COMMAND ----------

# MAGIC %md
# MAGIC #### 4.2 Initialize the Model
# MAGIC
# MAGIC In this step, we only configure the algorithm and chain it to the feature engineering steps. We do not train the model yet.

# COMMAND ----------

rf = RandomForestClassifier(
    labelCol="conversion",              # Label we are trying to predict
    featuresCol=scaler.getOutputCol(),  # Feature vector from the last feature engineering stage
    numTrees=numTrees,                  # Hyperparameters defined above
    maxDepth=maxDepth,
    maxBins=maxBins)

pipeline = Pipeline(stages=[
    discretizer,  # Feature engineering steps
    index_pipeline,
    encoder,
    vec_assembler,
    scaler,
    rf  # Initialized model
])

# COMMAND ----------
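
# A hedged sketch of the eventual training step (not part of this cell; the
# notebook defers training to a later section). `train_df` is a hypothetical
# DataFrame with the raw feature columns and the "conversion" label.
pipeline_model = pipeline.fit(train_df)      # this is where training actually happens
scored = pipeline_model.transform(train_df)  # adds "prediction" and "probability"

# COMMAND ----------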