Example #1
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

def dtRegression(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : decision tree regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    impurity = conf["params"].get("impurity", "variance")

    maxDepth = conf["params"].get("maxDepth", 5)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    seed = conf["params"].get("seed", None)
    varianceCol = conf["params"].get("varianceCol", None)

    dt = DecisionTreeRegressor(featuresCol=featuresCol, impurity=impurity,
                               maxDepth=maxDepth, maxBins=maxBins,
                               minInstancesPerNode=minInstancesPerNode,
                               minInfoGain=minInfoGain,
                               maxMemoryInMB=maxMemoryInMB,
                               cacheNodeIds=cacheNodeIds,
                               checkpointInterval=checkpointInterval)
    if seed is not None:
        dt.setSeed(seed)
    if varianceCol is not None:
        dt.setVarianceCol(varianceCol)

    # featureIndexer is assumed to be defined elsewhere in the module
    # (e.g. a shared VectorIndexer stage)
    pipeline = Pipeline(stages=[featureIndexer, dt])

    print("maxDepth : ", dt.getMaxDepth())

    # if ml-tuning is used
    if conf["tuning"]:

        # ml-tuning with cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # keys are parameter names; resolve them to dt's Param objects
                pg.addGrid(dt.getParam(key), paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)

        # ml-tuning with train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(dt.getParam(key), paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)

    # without ml-tuning
    else:
        model = pipeline.fit(df)

    return model
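# A minimal usage sketch for dtRegression above (not from the original source):
# it assumes a SparkSession and a Spark DataFrame `train_df` with "features"
# and "label" columns, and supplies the module-level featureIndexer the
# function expects. The conf keys mirror the .get() calls in the function.
from pyspark.ml.feature import VectorIndexer

featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                               maxCategories=4)

conf = {
    "params": {"featuresCol": "indexedFeatures", "maxDepth": 5, "maxBins": 32},
    # use None to fit the pipeline directly, or e.g.
    # {"method": "crossval", "methodParam": 3, "paramGrids": {"maxDepth": [3, 5, 7]}}
    "tuning": None,
}
model = dtRegression(train_df, conf)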
Example #2
from pyspark.ml.tuning import ParamGridBuilder

def estimator_gridbuilder(estimator, paramnames_values):
    """Helper to abbreviate ParamGridBuilder construction from a dict of
    parameter names to lists of candidate values."""
    pgb = ParamGridBuilder()
    for pn, vals in paramnames_values.items():
        assert hasattr(vals, '__iter__'), "List of values required for each parameter name"
        pgb.addGrid(estimator.getParam(pn), vals)
    return estimator, pgb
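# A possible way to use estimator_gridbuilder (a sketch; `train_df` is an
# assumed Spark DataFrame with "features" and "label" columns):
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

lr, pgb = estimator_gridbuilder(
    LogisticRegression(),
    {"regParam": [0.0, 0.01, 0.1], "maxIter": [50, 100]})

cv = CrossValidator(estimator=lr, estimatorParamMaps=pgb.build(),
                    evaluator=BinaryClassificationEvaluator(), numFolds=3)
best_model = cv.fit(train_df).bestModel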
Example #3
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

def logisticClassifier(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : logistic regression model [model]
    """
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    prob_col = conf["params"].get("probabilityCol", "probability")

    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticNet_param = conf["params"].get("elasticNetParam", 0.0)
    tolr = conf["params"].get("tol", 1e-6)
    fit_intercept = conf["params"].get("fitIntercept", True)
    thres = conf["params"].get("threshold", 0.5)
    thresh = conf["params"].get("thresholds", None)
    std = conf["params"].get("standardization", True)
    weight = conf["params"].get("weightCol", None)
    aggr = conf["params"].get("aggregationDepth", 2)
    fml = conf["params"].get("family", "auto")


    lr = LogisticRegression(featuresCol=feature_col, labelCol=label_col,
                            predictionCol=pred_col, probabilityCol=prob_col,
                            maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elasticNet_param, tol=tolr,
                            fitIntercept=fit_intercept, threshold=thres,
                            standardization=std, aggregationDepth=aggr,
                            family=fml)
    # thresholds and weightCol are optional; only set them when provided
    if thresh is not None:
        lr.setThresholds(thresh)
    if weight is not None:
        lr.setWeightCol(weight)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            logReg = LogisticRegression()
            paramgGrids = conf["tuning"].get("paramGrids")
            folds = conf["tuning"].get("methodParam", 2)
            pg = ParamGridBuilder()
            for key in paramgGrids:
                pg.addGrid(key, paramgGrids[key])

            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            cv = CrossValidator(estimator=logReg,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramgGrids = conf["tuning"].get("paramGrids")
            tr = conf["tuning"].get("methodParam", 0.8)
            pg = ParamGridBuilder()
            for key in paramgGrids:
                pg.addGrid(key, paramgGrids[key])

            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            tvs = TrainValidationSplit(estimator=lr,
                                       estimatorParamMaps=grid,
                                       evaluator=evaluator,
                                       trainRatio=tr)
            model = tvs.fit(df)

    elif conf["tuning"] == None:
        model = lr.fit(df)
    return model
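# A hypothetical conf for logisticClassifier above, exercising the
# cross-validation branch; paramGrids keys are plain parameter names, which
# the function resolves to Param objects via lr.getParam. `train_df` is an
# assumed Spark DataFrame with "features" and "label" columns.
conf = {
    "params": {"featuresCol": "features", "labelCol": "label", "maxIter": 50},
    "tuning": {
        "method": "crossval",
        "methodParam": 3,  # number of folds
        "paramGrids": {"regParam": [0.0, 0.01, 0.1],
                       "elasticNetParam": [0.0, 0.5]},
    },
}
model = logisticClassifier(train_df, conf)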
Example #4
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

def randomforestRegression(df, conf):
    """input  : - Dataframe train (df)
                - Hyperparameter configuration (conf)
       output : - Random Forest Regression Model
    """     
    # set params with default values (if a value isn't set in conf["params"])
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    max_depth = conf["params"].get("maxDepth", 5)
    num_trees = conf["params"].get("numTrees", 20)
    max_bins = conf["params"].get("maxBins", 32)
    seed = conf["params"].get("seed", None)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    impurity = conf["params"].get("impurity", "variance")
    subSamplingRate = conf["params"].get("subsamplingRate", 1.0)
    featureSubsetStrategy = conf["params"].get("featureSubsetStrategy", "auto")

    rfr = RandomForestRegressor(featuresCol=feature_col, labelCol=label_col,
                                predictionCol=pred_col, maxDepth=max_depth,
                                numTrees=num_trees, maxBins=max_bins,
                                minInstancesPerNode=minInstancesPerNode,
                                minInfoGain=minInfoGain,
                                maxMemoryInMB=maxMemoryInMB,
                                cacheNodeIds=cacheNodeIds,
                                checkpointInterval=checkpointInterval,
                                impurity=impurity,
                                subsamplingRate=subSamplingRate,
                                featureSubsetStrategy=featureSubsetStrategy)
    if seed is not None:
        rfr.setSeed(seed)
    
    # featureIndexer is assumed to be defined elsewhere in the module
    pipeline = Pipeline(stages=[featureIndexer, rfr])
    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 4)
# Set the hiperparameter that we want to grid, ex: maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg=ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=rfr, estimatorParamMaps=grid,
                            evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
# Set the hiperparameter that we want to grid, ex: maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg=ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=rfr, estimatorParamMaps=grid,
                                   evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    elif conf["tuning"] ==  None:
        model = pipeline.fit(df)
    return model
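# Likewise, a hypothetical conf that exercises the train/validation-split
# branch of randomforestRegression above; `train_df` is assumed, and, as in
# dtRegression, a module-level featureIndexer stage must already exist.
conf = {
    "params": {"featuresCol": "features", "labelCol": "label", "numTrees": 20},
    "tuning": {
        "method": "trainvalsplit",
        "methodParam": 0.8,  # train ratio
        "paramGrids": {"maxDepth": [5, 10], "numTrees": [20, 50]},
    },
}
model = randomforestRegression(train_df, conf)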
Example #5
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

def aftsurvivalRegression(df, conf):
  """ AFT Survival Regression training
        Input  : - Dataframe of training (df)
                 - tuning and hyperparameter configuration (conf)
        output : - AFT survival regression model (model)
  """
  feature_col = conf["params"].get("featuresCol", "features")
  label_col = conf["params"].get("labelCol", "label")
  pred_col = conf["params"].get("predictionCol", "prediction")
  cens_col = conf["params"].get("censorCol", "censor")
  fit_intercept = conf["params"].get("fitIntercept",True)
  max_iter = conf["params"].get("maxIter", 100)
  tol = conf["params"].get("tol", )
  quant_p = conf["params"].get("quantileProbabilities", [0.01, 0.05, 0.1, 0.25, 
                                                        0.5, 0.75, 0.9, 0.95, 0.99])
  quant_col = conf["params"].get("quantilesCol", None)
  agg_depth = conf["params"].get("aggregationDepth", 2)
      
  afts = AFTSurvivalRegression(featuresCol=feature_col, labelCol=label_col,
                               predictionCol=pred_col, censorCol=cens_col,
                               maxIter=max_iter, fitIntercept=fit_intercept,
                               tol=tol, aggregationDepth=agg_depth,
                               quantileProbabilities=quant_p)
  if quant_col is not None:
    afts.setQuantilesCol(quant_col)

  if conf["tuning"]:
    if conf["tuning"].get("method").lower() == "crossval":
      folds = conf["tuning"].get("methodParam", 2)
      # Set the hiperparameter that we want to grid, incase: maxIter and aggregationDepth
      paramGrids = conf["tuning"].get("paramGrids")
      pg=ParamGridBuilder()
      for key in paramGrids:
          pg.addGrid(key, paramGrids[key])
      grid = pg.build()
      evaluator = RegressionEvaluator()
      cv = CrossValidator(estimator=afts, estimatorParamMaps=grid,
                          evaluator=evaluator, numFolds=folds)
      model = cv.fit(df)
      
    elif conf["tuning"].get("method").lower() == "trainvalsplit":
      tr = conf["tuning"].get("methodParam", 0.8)
      # Set the hiperparameter that we want to grid, incase: maxIter and aggregationDepth
      paramGrids = conf["tuning"].get("paramGrids")
      pg=ParamGridBuilder()
      for key in paramGrids:
          pg.addGrid(key, paramGrids[key])
      grid = pg.build()
      evaluator = RegressionEvaluator()
      tvs = TrainValidationSplit(estimator=afts, estimatorParamMaps=grid,
                                 evaluator=evaluator, trainRatio=tr)
      model = tvs.fit(df)
  elif conf["tuning"] ==  None:
    model = afts.fit(df)
  return model
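# AFT survival regression also needs a censor column (1.0 = event observed,
# 0.0 = censored). A small sketch using aftsurvivalRegression above, assuming
# an existing SparkSession `spark`; the toy rows follow the Spark docs example.
from pyspark.ml.linalg import Vectors

train_df = spark.createDataFrame([
    (1.218, 1.0, Vectors.dense(1.560, -0.605)),
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)),
    (0.273, 1.0, Vectors.dense(0.520, 1.151)),
], ["label", "censor", "features"])

conf = {"params": {"censorCol": "censor", "maxIter": 100}, "tuning": None}
model = aftsurvivalRegression(train_df, conf)
print(model.coefficients, model.intercept)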
Example #6
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
     param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
     param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
     param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
     return param_grid_builder.build()
Example #7
    def _inner_search(self, estimator_family, train, test, model_key,
                      model_class, fixed_params, hyper_params, scoring,
                      greater_is_better, cv, *args, **kwargs):
        model_object = model_class(labelCol=kwargs['variable_to_predict'],
                                   **fixed_params.get(model_key, {}))
        tuned_parameters = hyper_params.get(model_key, {})

        # build hyper parameter grid:
        pgb = ParamGridBuilder()
        for tp_key in tuned_parameters:
            pgb = pgb.addGrid(getattr(model_object, tp_key),
                              tuned_parameters[tp_key])
        param_map = pgb.build()

        # run cross validator:
        evaluator = self._get_scorer(estimator_family, scoring, *args,
                                     **kwargs)
        cv_f = CrossValidationSpark(estimator=model_object,
                                    estimatorParamMaps=param_map,
                                    evaluator=evaluator,
                                    cv=cv)

        hps, metrics = cv_f.fit(train)

        return hps, self._calc_cv_df(model_key, metrics, param_map)
Example #8
File: vista.py  Project: Advitya17/Vista
    def hyperparameter_tuned_model(clf, train_df):
        pipeline = Pipeline(stages=[clf])

        paramGrid = ParamGridBuilder()
        for i in extra_config:
            if i == 'numFolds':
                continue
            # resolve the parameter name to the classifier's Param object
            paramGrid = paramGrid.addGrid(getattr(clf, i), extra_config[i])

        paramGrid = paramGrid.build()
        evaluator = MulticlassClassificationEvaluator()

        if tuning_method == 'CrossValidator':

            if 'numFolds' in extra_config:
                numFolds = extra_config['numFolds']
            else:
                numFolds = 3  # default

            val_model = CrossValidator(estimator=pipeline,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=evaluator,
                                       numFolds=numFolds,
                                       seed=seed)

        if tuning_method == 'TrainValidationSplit':

            val_model = TrainValidationSplit(
                estimator=pipeline,
                estimatorParamMaps=paramGrid,
                evaluator=evaluator,
                seed=seed,
                # 80% of the data will be used for training, 20% for validation.
                trainRatio=1 - test_size)

        # Run cross-validation, and choose the best set of parameters.
        return val_model.fit(train_df)
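# The closure above relies on several names from its enclosing scope in
# vista.py. A hypothetical setup (values are illustrative, not taken from the
# Vista project) would look roughly like this before calling
# hyperparameter_tuned_model:
from pyspark.ml.classification import LogisticRegression

clf = LogisticRegression(labelCol="label", featuresCol="features")
tuning_method = 'CrossValidator'  # or 'TrainValidationSplit'
extra_config = {'regParam': [0.01, 0.1], 'maxIter': [50, 100], 'numFolds': 5}
seed = 2019
test_size = 0.2
# best_model = hyperparameter_tuned_model(clf, train_df)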
Example #9
    def build_dag(self, paramMaps):

        # Type of tree search to execute dag
        tree = BFSTree if self.tree_type.lower() == "bfs" else DFSTree

        stages = self.getStages()
        stage_prev = None

        # Locate the last estimator, will not transform training data after fit
        for i, stage in enumerate(stages):
            if isinstance(stage, Estimator):
                last_est_index = i

        nodes = []
        roots = []
        stage_nodes = {}
        for i, stage in enumerate(stages):

            # Determine what type of Node for the stage
            if isinstance(stage, Estimator):
                if i < last_est_index:
                    Node = tree.FeatureExtractionEstimatorNode
                else:
                    Node = tree.EstimatorNode
            else:
                Node = tree.TransformerNode

            # Separate ParamMaps for the stage
            temp_map = {}
            for param_map in paramMaps:
                for k, v in param_map.items():
                    if k.parent == stage.uid:
                        temp_map.setdefault(k, set()).add(v)

            # Check if have a param grid for this stage
            if temp_map:
                grid_builder = ParamGridBuilder()
                for k, v in temp_map.items():
                    grid_builder.addGrid(k, v)
                stage_param_grid = grid_builder.build()
                new_nodes = [
                    Node(stage, param_map) for param_map in stage_param_grid
                ]
            else:
                new_nodes = [Node(stage, {})]

            # Make nodes for each node of parent stage
            if stage_prev:
                temp_nodes = []
                parent_nodes = stage_nodes[stage_prev]
                for parent_node in parent_nodes:
                    for node in new_nodes:
                        child_node = copy.copy(node)
                        child_node.children = []
                        child_node.parent = parent_node
                        parent_node.children.append(child_node)
                        temp_nodes.append(child_node)
                new_nodes = temp_nodes
            else:
                roots += new_nodes

            # Store all new nodes created for this stage
            stage_nodes[stage] = new_nodes
            nodes += new_nodes

            stage_prev = stage

        return roots, nodes
Example #10
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(
            self._slug) + "/traindeModel.pmml"

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "training",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)

        trainingData, validationData = MLUtils.get_training_and_validation_data(
            df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {
            v: float(k)
            for k, v in enumerate(labelIdx.labels)
        }
        if self._dataframe_context.get_trainerMode() == "autoML":
            automl_enable = True
        else:
            automl_enable = False
        clf = NaiveBayes()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        print("=" * 100)
        print(algoParams)
        print("=" * 100)
        clfParams = [prm.name for prm in clf.params]
        algoParams = {
            getattr(clf, k): v if isinstance(v, list) else [v]
            for k, v in algoParams.items() if k in clfParams
        }
        #print("="*100)
        #print("ALGOPARAMS - ",algoParams)
        #print("="*100)

        paramGrid = ParamGridBuilder()
        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if v == [None] * len(v):
        #             continue
        #         if k.name == 'thresholds':
        #             paramGrid = paramGrid.addGrid(k,v[0])
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        for k, v in algoParams.items():
            print(k, v)
            if v == [None] * len(v):
                continue
            paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()
        # else:
        #     for k,v in algoParams.items():
        #         print k.name, v
        #         if v[0] == [None] * len(v[0]):
        #             continue
        #         paramGrid = paramGrid.addGrid(k,v[0])
        #     paramGrid = paramGrid.build()

        #print("="*143)
        #print("PARAMGRID - ", paramGrid)
        #print("="*143)

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {
                "name": hyperParamInitParam["evaluationMetric"]
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    numFold, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                if automl_enable:
                    paramGrid = (ParamGridBuilder().addGrid(
                        clf.smoothing, [1.0, 0.2]).build())
                crossval = CrossValidator(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    numFolds=3 if numFold is None else
                    numFold)  # use 3+ folds in practice
                cvnb = crossval.fit(trainingData)
                prediction = cvnb.transform(validationData)
                bestModel = cvnb.bestModel

        else:
            train_test_ratio = float(
                self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    train_test_ratio, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                tvs = TrainValidationSplit(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    trainRatio=train_test_ratio)

                tvspnb = tvs.fit(trainingData)
                prediction = tvspnb.transform(validationData)
                bestModel = tvspnb.bestModel

        modelmanagement_ = {
            param[0].name: param[1]
            for param in bestModel.stages[2].extractParamMap().items()
        }

        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
        predsAndLabels = prediction.select(['prediction',
                                            'label']).rdd.map(tuple)
        # label_classes = prediction.select("label").distinct().collect()
        # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label']
        #results = transformed.select(["prediction","label"])
        # if len(label_classes) > 2:
        #     metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model
        # else:
        #     metrics = BinaryClassificationMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]
        metrics = MulticlassMetrics(predsAndLabels)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel],
                                    1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy

        print(f1_score, precision, recall, accuracy)

        #gain chart implementation
        def cal_prob_eval(x):
            if len(x) == 1:
                if x == posLabel:
                    return (float(x[1]))
                else:
                    return (float(1 - x[1]))
            else:
                return (float(x[int(posLabel)]))

        column_name = 'probability'

        def y_prob_for_eval_udf():
            return udf(lambda x: cal_prob_eval(x))

        prediction = prediction.withColumn(
            "y_prob_for_eval",
            y_prob_for_eval_udf()(col(column_name)))

        try:
            pys_df = prediction.select(
                ['y_prob_for_eval', 'prediction', 'label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                              'prediction', 'label', posLabel,
                                              self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except:
                print("gain chant failed")
                gain_lift_KS_dataframe = None

        #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns)
        act_list = prediction.select('label').collect()
        actual = [int(row.label) for row in act_list]

        pred_list = prediction.select('prediction').collect()
        predicted = [int(row.prediction) for row in pred_list]
        prob_list = prediction.select('probability').collect()
        probability = [list(row.probability) for row in prob_list]
        # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'),
        # "probability":prediction.select('probability'),"feature_importance":None,
        # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping}
        objs = {
            "trained_model": bestModel,
            "actual": actual,
            "predicted": predicted,
            "probability": probability,
            "feature_importance": None,
            "featureList": list(categorical_columns) + list(numerical_columns),
            "labelMapping": labelMapping
        }

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)  # accuracy of the model
        '''ROC CURVE IMPLEMENTATION'''
        y_prob = probability
        y_score = predicted
        y_test = actual
        logLoss = log_loss(y_test, y_prob)
        if levels <= 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])
            roc_auc = roc_auc_score(y_test, y_score)

            roc_data_dict = {
                "y_score": y_score,
                "y_test": y_test,
                "positive_label_probs": positive_label_probs,
                "y_prob": y_prob,
                "positive_label": posLabel
            }
            roc_dataframe = pd.DataFrame({
                "y_score":
                y_score,
                "y_test":
                y_test,
                "positive_label_probs":
                positive_label_probs
            })
            #roc_dataframe.to_csv("binary_roc_data.csv")
            fpr, tpr, thresholds = roc_curve(y_test,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})

            unique_fpr = rounded_roc_df["FPR"].unique()

            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        elif levels > 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])

            y_test_roc_multi = []
            for val in y_test:
                if val != posLabel:
                    val = posLabel + 1
                    y_test_roc_multi.append(val)
                else:
                    y_test_roc_multi.append(val)

            y_score_roc_multi = []
            for val in y_score:
                if val != posLabel:
                    val = posLabel + 1
                    y_score_roc_multi.append(val)
                else:
                    y_score_roc_multi.append(val)

            roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi)

            fpr, tpr, thresholds = roc_curve(y_test_roc_multi,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
            unique_fpr = rounded_roc_df["FPR"].unique()
            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = prediction.select('label').count()
        for item in val_cnts:
            print(labelMapping)
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(
                item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                     1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData,
                                      bestModel).putOption(
                                          clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Naive Bayes")
        self._model_summary.set_algorithm_display_name("Naive Bayes")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        # self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_model_F1_score(f1_score)
        self._model_summary.set_model_log_loss(logLoss)
        self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
        self._model_summary.set_AUC_score(roc_auc)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column])))
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(
                list(set(categorical_columns)) + [result_column]))
        #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        self._model_management = MLModelSummary()
        print(modelmanagement_)
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())  #Project name
        self._model_management.set_training_status(
            data="completed")  # training status
        self._model_management.set_target_level(
            self._targetLevel)  # target column value
        self._model_management.set_training_time(runtime)  # run time
        self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
        # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy
        self._model_management.set_algorithm_name(
            "NaiveBayes")  #algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")  #validation method
        self._model_management.set_target_variable(
            result_column)  #target column name
        self._model_management.set_creation_date(data=str(
            datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
        self._model_management.set_datasetName(self._datasetName)
        self._model_management.set_model_type(data='classification')
        self._model_management.set_var_smoothing(
            data=int(modelmanagement_['smoothing']))

        # self._model_management.set_no_of_independent_variables(df) #no of independent varables

        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["Accuracy",
             self._model_management.get_model_accuracy()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]

        modelManagementModelSettingsJson = [
            ["Training Dataset",
             self._model_management.get_datasetName()],
            ["Target Column",
             self._model_management.get_target_variable()],
            ["Target Column Value",
             self._model_management.get_target_level()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            [
                "Model Validation",
                self._model_management.get_validation_method()
            ],
            ["Model Type",
             self._model_management.get_model_type()],
            ["Smoothing",
             self._model_management.get_var_smoothing()],

            #,["priors",self._model_management.get_priors()]
            #,["var_smoothing",self._model_management.get_var_smoothing()]
        ]

        nbOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nbPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards(
                self._model_summary, endgame_roc_df)
        ]
        nbDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nbCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        NB_Overview_Node = NarrativesTree()
        NB_Overview_Node.set_name("Overview")
        NB_Performance_Node = NarrativesTree()
        NB_Performance_Node.set_name("Performance")
        NB_Deployment_Node = NarrativesTree()
        NB_Deployment_Node.set_name("Deployment")
        for card in nbOverviewCards:
            NB_Overview_Node.add_a_card(card)
        for card in nbPerformanceCards:
            NB_Performance_Node.add_a_card(card)
        for card in nbDeploymentCards:
            NB_Deployment_Node.add_a_card(card)
        for card in nbCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary({
            "naivebayes":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
        self._result_setter.set_nb_cards(nbCards)
        self._result_setter.set_nb_nodes(
            [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
        self._result_setter.set_nb_fail_card({
            "Algorithm_Name": "Naive Bayes",
            "success": "True"
        })

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        print("\n\n")
Example #11
from pyspark.ml.tuning import ParamGridBuilder

def build_ParamGrid(d):
  pgb = ParamGridBuilder()
  for k in d:
    # each key is an expression that names a Param object, e.g. "regression.fitIntercept"
    pgb.addGrid(eval(k), d[k])
  return pgb.build()
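# A possible call to build_ParamGrid above (a sketch). Because the helper
# eval()s each key, the estimator named in the key must be resolvable from the
# helper's module globals, i.e. defined in the same script as build_ParamGrid.
from pyspark.ml.regression import LinearRegression

regression = LinearRegression(labelCol='consumption')
grid = build_ParamGrid({
    "regression.fitIntercept": [True, False],
    "regression.regParam": [0.001, 0.01, 0.1],
})
print(len(grid))  # 2 x 3 = 6 parameter combinations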
Example #12
# Grid Search
# `cars_train`, `cars_test`, and `evaluator` (a RegressionEvaluator) are
# assumed to be defined earlier in the script.
from pyspark.ml.regression import LinearRegression
# Linear regression with an intercept. Fit to training data.
regression = LinearRegression(labelCol='consumption', fitIntercept=True)
regression = regression.fit(cars_train)
evaluator.evaluate(regression.transform(cars_test))
# Linear regression without an intercept. Fit to training data.
regression = LinearRegression(labelCol='consumption', fitIntercept=False)
regression = regression.fit(cars_train)
evaluator.evaluate(regression.transform(cars_test))

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Re-create the (unfitted) estimator to be tuned by grid search
regression = LinearRegression(labelCol='consumption')
# Create a parameter grid builder
params = ParamGridBuilder()
# Add grid points
params = params.addGrid(regression.fitIntercept, [True, False])
# Construct the grid
params = params.build()

# How many models?
print('Number of models to be tested: ', len(params))

# Create a cross-validator and fit to the training data
cv = CrossValidator(estimator=regression,
    estimatorParamMaps=params,
    evaluator=evaluator)
cv = cv.setNumFolds(10).setSeed(13).fit(cars_train)
# What's the cross-validated RMSE for each model
print(cv.avgMetrics)
# Access the best model
print(cv.bestModel)
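# The same pattern scales to several hyperparameters at once (a sketch that
# continues the cars example above); each addGrid call multiplies the number
# of candidate models to be cross-validated.
params = ParamGridBuilder() \
    .addGrid(regression.fitIntercept, [True, False]) \
    .addGrid(regression.regParam, [0.001, 0.01, 0.1, 1.0]) \
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
print('Number of models to be tested: ', len(params))  # 2 * 4 * 3 = 24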
Example #13
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
     param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
     return param_grid_builder.build()
Example #14
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
     return param_grid_builder.build()
Example #15
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "initialization", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x.get_algorithm_slug()==self._slug][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [x for x in categorical_columns if x != result_column]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()

        # pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        # model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        # pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "training", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
        vectorFeats = pipeline.getStages()[-1].transform(df)
        input_feats = len(vectorFeats.select('features').take(1)[0][0])

        trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)}

        clf = MultilayerPerceptronClassifier()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        clfParams = [prm.name for prm in clf.params]

        algoParams = {getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if
                      k in clfParams}

        paramGrid = ParamGridBuilder()
        layer_param_val = algoParams[getattr(clf, 'layers')]

        for layer in layer_param_val:
            layer.insert(0, input_feats)
            layer.append(levels)

        print('layer_param_val =', layer_param_val)

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if k.name == 'layers':
        #             paramGrid = paramGrid.addGrid(k,layer_param_val)
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()
        # else:
        for k, v in algoParams.items():
            if v == [None] * len(v):
                continue
            if k.name == 'layers':
                paramGrid = paramGrid.addGrid(k, layer_param_val)
            else:
                paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath,
                                                                         levels,
                                                                         evaluationMetricDict, trainingData,
                                                                         validationData, numFold, self._targetLevel,
                                                                         labelMapping, inverseLabelMapping,
                                                                         df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug,
                                                                      {
                                                                          "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                                                                          "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                                                                          "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                                                                          "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()})

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()
                bestModelName = resultArray[0]["Model Id"]

            else:
                crossval = CrossValidator(estimator=estimator,
                                          estimatorParamMaps=paramGrid,
                                          evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                          numFolds=3 if numFold is None else numFold)  # use 3+ folds in practice
                cvrf = crossval.fit(trainingData)
                prediction = cvrf.transform(validationData)
                bestModel = cvrf.bestModel
                bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

        else:
            train_test_ratio = float(self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath,
                                                                        levels,
                                                                        evaluationMetricDict, trainingData,
                                                                        validationData, train_test_ratio,
                                                                        self._targetLevel, labelMapping,
                                                                        inverseLabelMapping,
                                                                        df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug,
                                                                      {
                                                                          "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                                                                          "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                                                                          "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                                                                          "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()})

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()
                bestModelName = resultArray[0]["Model Id"]

            else:
                tvs = TrainValidationSplit(estimator=estimator,
                                           estimatorParamMaps=paramGrid,
                                           evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                           trainRatio=train_test_ratio)

                tvrf = tvs.fit(trainingData)
                prediction = tvrf.transform(validationData)
                bestModel = tvrf.bestModel
                bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

        MLUtils.save_pipeline_or_model(bestModel,model_filepath)
        predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple)
        metrics = MulticlassMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy
        roc_auc = 'Undefined'
        if levels == 2:
            bin_metrics = BinaryClassificationMetrics(predsAndLabels)
            roc_auc = bin_metrics.areaUnderROC
            precision = metrics.precision(inverseLabelMapping[self._targetLevel])
            recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        print(f1_score,precision,recall,accuracy)

        # gain chart implementation: extract the positive-class probability for evaluation
        def cal_prob_eval(x):
            if len(x) == 1:
                if x == posLabel:
                    return(float(x[1]))
                else:
                    return(float(1 - x[1]))
            else:
                return(float(x[int(posLabel)]))


        column_name = 'probability'
        def y_prob_for_eval_udf():
            return udf(lambda x: cal_prob_eval(x))
        prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name)))

        try:
            pys_df = prediction.select(['y_prob_for_eval', 'prediction', 'label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except Exception:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except Exception:
                print("gain chart failed")
                gain_lift_KS_dataframe = None


        objs = {"trained_model": bestModel, "actual": prediction.select('label'),
                "predicted": prediction.select('prediction'),
                "probability": prediction.select('probability'), "feature_importance": None,
                "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping}

        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = objs['actual'].count()
        for item in val_cnts:
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            # modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(bestModelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron")
        self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column]))
        self._model_summary.set_num_trees(None)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        modelManagementJson = {
            "Model ID": "SPMLP-" + bestModelName,
            "Project Name": self._dataframe_context.get_job_name(),
            "Algorithm": self._model_summary.get_algorithm_name(),
            "Status": 'Completed',
            "Accuracy": accuracy,
            "Runtime": runtime,
            "Created On": "",
            "Owner": "",
            "Deployment": 0,
            "Action": ''
        }

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     modelDropDownObj = {
        #         "name": self._model_summary.get_algorithm_name(),
        #         "evaluationMetricValue": locals()[evaluationMetricDict["name"]], # accuracy
        #         "evaluationMetricName": evaluationMetricDict["displayName"], # accuracy
        #         "slug": self._model_summary.get_slug(),
        #         "Model Id": bestModelName
        #     }
        #     modelSummaryJson = {
        #         "dropdown": modelDropDownObj,
        #         "levelcount": self._model_summary.get_level_counts(),
        #         "modelFeatureList": self._model_summary.get_feature_list(),
        #         "levelMapping": self._model_summary.get_level_map_dict(),
        #         "slug": self._model_summary.get_slug(),
        #         "name": self._model_summary.get_algorithm_name()
        #     }
        # else:
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": accuracy, #locals()[evaluationMetricDict["name"]],
            "evaluationMetricName": "accuracy", # evaluationMetricDict["name"],
            "slug": self._model_summary.get_slug(),
            "Model Id": bestModelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        mlpcCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in
                     MLUtils.create_model_summary_cards(self._model_summary)]
        for card in mlpcCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary(
            {"sparkperceptron": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson)
        self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson)
        self._result_setter.set_mlpc_cards(mlpcCards)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "completion", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")
예제 #16
0
def linearRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : linear_regression model [model]
    """
    # retrieve parameters (with default values)
    featuresCol= conf["params"].get("featuresCol", "features")
    labelCol= conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
        
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticnet_param = conf["params"].get("elasticNetParam", 0.0)
    tol = conf["params"].get("tol", 1e-6)
    fitIntercept = conf["params"].get("fitIntercept", True)
    standardization = conf["params"].get("standardization", True)
    solver = conf["params"].get("solver", "auto")
    weightCol = conf["params"].get("weightCol", None)
    aggregationDepth = conf["params"].get("aggregationDepth", 2)
    loss = conf["params"].get("loss", "squaredError")
    epsilon =  conf["params"].get("epsilon", 1.35)        
        
    lr = LinearRegression(maxIter=max_iter, regParam=reg_param, 
                              elasticNetParam=elasticnet_param)
        
    print ("maxIter : " , lr.getMaxIter())
    print ("regParam : " , lr.getRegParam())
    print ("aggrDepth : " , lr.getAggregationDepth())
        
    # if ml-tuning is used
    if conf["tuning"]:

        # ml-tuning with cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # addGrid expects a Param object, so look it up by name
                pg.addGrid(lr.getParam(key), paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)

        # ml-tuning with train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # addGrid expects a Param object, so look it up by name
                pg.addGrid(lr.getParam(key), paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)

    # without ml-tuning
    elif conf["tuning"] is None:
        model = lr.fit(df)

    return model
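
# Added usage sketch (not part of the original example): a minimal call to
# linearRegression() above without ml-tuning. The tiny in-memory DataFrame and
# the parameter values are illustrative assumptions only.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]), 3.0),
     (Vectors.dense([2.0, 1.0]), 4.0)],
    ["features", "label"])

conf = {
    "params": {"maxIter": 50, "regParam": 0.1},
    "tuning": None,   # fit a plain LinearRegression, no grid search
}
model = linearRegression(sample_df, conf)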
# K-fold cross validation splits the dataset into k folds that serve as separate training and
# test datasets: e.g., with k=3 folds, K-fold cross validation will generate 3 (training, test)
# dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing.
# Each fold is used as the test set exactly once. (A short illustration follows below.)
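
# Added illustration (assumes an estimator 'lr', a parameter grid 'grid', and an
# evaluator, as defined in the sections that follow): with numFolds=3, CrossValidator
# trains every candidate ParamMap three times and averages the evaluator metric over
# the three held-out thirds before selecting the best candidate, e.g.
# cv3 = CrossValidator(estimator=lr, estimatorParamMaps=grid,
#                      evaluator=evaluator, numFolds=3)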

# 15.1 One Way

grid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])		# Regularization (L2) parameter value
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])	# alpha: alpha = 0 => only L2, alpha = 1 => only L1
             .addGrid(lr.maxIter, [1, 5, 10])			# Max iteration
             .build())

# 15.2 Another way (this way creates problems -- see the note after the block below)

grid= ParamGridBuilder()
grid.addGrid(lr.regParam, [0.01,0.5,2.0])
grid.addGrid(lr.elasticNetParam,[0.0,0.5,1.0]).addGrid(lr.maxIter,[1,5,10])
grid.build()
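
# Added note: the form above "creates problems" because addGrid() mutates and returns
# the builder, but build() returns a *new* list of ParamMaps and its result is discarded,
# so 'grid' is still a ParamGridBuilder. Capturing the return value makes it equivalent
# to the 15.1 version:
grid = grid.build()   # 'grid' is now a list of 3 x 3 x 3 = 27 ParamMaps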

# 15.3 Create 5-fold CrossValidator-object
#      Ref: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator

# 15.3.1 Specify first way to evaluate cv results

lr = LogisticRegression(labelCol="label",      \
                        featuresCol="features" \
                        )

evaluator = BinaryClassificationEvaluator()
# 15.3.2
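
# Added sketch: the original snippet is cut off after "# 15.3.2". The 5-fold
# CrossValidator promised by heading 15.3 would look roughly like the following;
# the grid is rebuilt against the LogisticRegression defined just above so the
# Param objects match the estimator being tuned, and 'train_df' is a placeholder
# name that is not defined in the original snippet.
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.5, 2.0])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build())
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
# cvModel = cv.fit(train_df)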
예제 #18
0
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'],
                            outputCol='features')

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration")

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Create parameter grid
params = ParamGridBuilder()
# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0]) \
               .addGrid(regression.elasticNetParam, [0, 0.5, 1.0])

# Build the parameter grid
params = params.build()

print('Number of models to be tested: ', len(params))

# object to evaluate performance
evaluator = RegressionEvaluator(labelCol='duration')

# create cross-validation object
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=13)
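
# Added sketch: the snippet stops after building the CrossValidator. Fitting it on
# the training split created above and scoring the held-out split would presumably
# look like this:
# cv_models = cv.fit(flights_train)
# print('Holdout RMSE:', evaluator.evaluate(cv_models.transform(flights_test)))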
def generalizedLinearRegressor(dataFrame, conf):
    """
        input: df [spark.dataFrame], conf [configuration params]
        output: generalized linear regression model [model]
    """

    # calling params
    label_col = conf["params"].get("labelCol", "label")
    features_col = conf["params"].get("featuresCol", "features")
    prediction_col = conf["params"].get("predictionCol", "prediction")
    fam = conf["params"].get("family", "gaussian")

    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 25)
    tolp = conf["params"].get("tol", 1e-6)
    reg_param = conf["params"].get("regParam", 0.0)
    weight_col = conf["params"].get("weightCol", None)
    solverp = conf["params"].get("solver", "irls")
    link_prediction_col = conf["params"].get("linkPredictionCol", None)
    variance_power = conf["params"].get("variancePower", 0.0)
    link_power = conf["params"].get("linkPower", None)

    if (fam == "gaussian"):
        li = conf["params"].get("link", "identity")
    elif (fam == "binomial"):
        li = conf["params"].get("link", "logit")
    elif (fam == "poisson"):
        li = conf["params"].get("link", "log")
    elif (fam == "gamma"):
        li = conf["params"].get("link", "inverse")
    elif (fam == "tweedle"):
        li = conf["params"].get("link", 1 - variance_power)
    else:
        li = conf["params"].get("link", None)

    glr = GeneralizedLinearRegression(labelCol=label_col,
                                      featuresCol=features_col,
                                      predictionCol=prediction_col,
                                      family=fam,
                                      link=li,
                                      fitIntercept=fit_intercept,
                                      maxIter=max_iter,
                                      tol=tolp,
                                      regParam=reg_param,
                                      solver=solverp,
                                      linkPredictionCol=link_prediction_col,
                                      variancePower=variance_power,
                                      linkPower=link_power)

    # with tuning
    if conf["tuning"]:
        # method: cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # addGrid expects a Param object, so look it up by name
                pg.addGrid(glr.getParam(key), paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=glr,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                numFolds=folds)
            model = cv.fit(dataFrame)

        # method: train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # addGrid expects a Param object, so look it up by name
                pg.addGrid(glr.getParam(key), paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=glr,
                                       estimatorParamMaps=grid,
                                       evaluator=evaluator,
                                       trainRatio=tr)
            model = tvs.fit(dataFrame)

    # without tuning
    else:
        model = glr.fit(dataFrame)

    return model
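
# Added usage sketch (not from the original): calling generalizedLinearRegressor()
# above without tuning. The configuration values are illustrative assumptions, and
# 'some_df' stands for any DataFrame with 'features' and 'label' columns.
conf = {
    "params": {"family": "poisson", "maxIter": 10},
    "tuning": None,   # fit the GLR directly, no grid search
}
# glr_model = generalizedLinearRegressor(some_df, conf)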
예제 #20
0
print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create model objects and train on training data
#tree = DecisionTreeClassifier().fit(flights_train)
#gbt = GBTClassifier().fit(flights_train)

forest = RandomForestClassifier()

# Create parameter grid
params = ParamGridBuilder()
# Add grids for two parameters
params = params.addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
               .addGrid(forest.maxDepth, [2, 5, 10])

# Build the parameter grid
params = params.build()

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()

# create cross-validation object
cv = CrossValidator(estimator=forest,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=13)

# run fit on training data
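# Added sketch: the snippet appears truncated here; the fit referred to above would
# presumably be
# cv_models = cv.fit(flights_train)
# print('Holdout AUC:', evaluator.evaluate(cv_models.transform(flights_test)))
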
def run_pipeline(name: str, data: str, save: str) -> None:

    spark = SparkSession.builder.appName(name).getOrCreate()

    # Dataset Creation #

    # read bike ride history csv's
    df = spark.read.csv(f'{data}/rides/*', header=True)
    df = df.select(['Duration', 'Start date', 'Start station number', 'Member type'])
    df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType()))
    print(f'The rides dataset has [{df.count()}] rows!')

    # read station information csv
    stations = spark.read.csv(f'{data}/stations/*', header=True)
    print(f'The stations dataset has {stations.count()} rows!')
    stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat')
    stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long')
    stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType()))
    stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number'])

    # remove rides longer than 1.5 hours
    one_and_a_half_hours = 60 * 60 * 1.5
    df = df.filter(df['Duration'] <= one_and_a_half_hours)

    # remove rides shorter than 3 minutes
    three_minutes = 60 * 3
    df = df.filter(df['Duration'] >= three_minutes)

    # remove unknown 'Member type's
    df = df.filter(df['Member type'] != 'Unknown')

    # remove non-existent stations
    df = df.filter(~(df['Start station number'] == 31008) & ~(
            df['Start station number'] == 32051) & ~(df['Start station number'] == 32034))

    # make target feature
    df = df.withColumn('label', F.log1p(df.Duration))

    # join on 'Start station number'
    print('Merging rides and stations dataframes!')
    df = df.join(stations, on='Start station number')
    df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType()))
    df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType()))

    print(f'Complete rides and stations dataset has {df.count()} rows!')

    # Feature Transformations #
    print('Doing Feature Transformations!')

    # convert to datetime type
    df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('day_of_week', F.dayofweek('Start date'))
    df = df.withColumn('week_of_year', F.weekofyear('Start date'))
    df = df.withColumn('month', F.month('Start date'))
    df = df.withColumn('minute', F.minute('Start date'))
    df = df.withColumn('hour', F.hour('Start date'))

    # make time features cyclical
    pi = 3.141592653589793

    df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60))
    df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24))

    df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60))
    df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24))

    # use concat_ws: the '+' operator does not concatenate string columns in Spark
    df = df.withColumn('hour_and_day_of_week',
                       F.concat_ws('_', df['hour'].cast(StringType()), df['day_of_week'].cast(StringType())))
    df = df.withColumn('member_type_and_day_of_week',
                       F.concat_ws('_', df['Member type'], df['day_of_week'].cast(StringType())))

    # drop unused columns
    drop_columns = [
        'Start date',
        'Start station number',
        'Duration',
        'day_of_week',
        'week_of_year',
        'month',
        'minute',
        'hour'
    ]
    df = df.drop(*drop_columns)

    # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show()

    # Model and Pipeline #

    # split training and test
    train, test = df.randomSplit([.7, .3])

    # encode categorical column 'Member type'
    member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx')
    member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc')

    # create vector of features named 'features'
    vector = VectorAssembler(
        inputCols=[
            'start_station_lat',
            'start_station_long',
            'sin_day_of_week',
            'cos_day_of_week',
            'sin_week_of_year',
            'cos_week_of_year',
            'sin_month',
            'cos_month',
            'sin_minute',
            'cos_minute',
            'sin_hour',
            'cos_hour',
            'member_enc'
        ],
        outputCol='features'
    )

    # scale features
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )

    # define model
    model = GeneralizedLinearRegression(
        featuresCol='scaled_features'
    )

    # create pipeline and fill in stages
    pipeline = Pipeline(
        stages=[
            member_indexer,
            member_encoder,
            vector,
            scaler,
            model
        ]
    )

    # evaluation method
    evaluation = RegressionEvaluator()

    # best parameter search
    grid = ParamGridBuilder()
    # grid = grid.addGrid(model.maxDepth, [5, 7])
    # grid = grid.addGrid(model.numTrees, [200, 500])
    grid = grid.addGrid(model.maxIter, [40, 50])
    grid = grid.addGrid(model.family, ['gaussian', 'gamma'])
    grid = grid.addGrid(model.regParam, [0.0, 0.1])
    grid = grid.build()

    # run cross validation
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluation,
        numFolds=7
    )

    print('Doing Cross Validation!')

    cv_models = cv.fit(train)
    print(f'CV results: {cv_models.avgMetrics} (RMSE)')

    best_model = cv_models.bestModel
    best_params = extract_best_params(best_model.stages[-1].extractParamMap())
    print(f'Best params:\n{best_params}')

    results = cv_models.transform(test)
    print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)')

    print('Re-fitting pipeline on entire dataset!')
    cv_models = cv.fit(df)

    print('Saving to pipeline into S3!')
    entire_dataset_best_model = cv_models.bestModel
    entire_dataset_best_model.save(f'{save}/{name}')
    print('Done!')

    return
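
# Added example invocation (not from the original script); the app name and the
# data/save locations below are placeholder values.
# run_pipeline(name='bike-duration-glr',
#              data='s3://my-bucket/bikeshare',
#              save='s3://my-bucket/models')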
예제 #22
0
class BaseCVModel(metaclass=Metrics):

    best_params: Dict[str, Any] = None
    __best_model: PipelineModel = None
    logger = get_logger()
    metrics: Dict[str, Callable] = dict()

    def __init__(self, estimator=None, evaluator=None):
        self.set_params(estimator, evaluator)

    def set_params(self, estimator=None, evaluator=None):
        assert estimator is None or isinstance(estimator, Estimator),\
            'estimator must be a pyspark.ml.base.Estimator.'
        assert evaluator is None or isinstance(evaluator, Evaluator),\
            'evaluator must be a pyspark.ml.base.Evaluator.'
        if estimator is not None:
            self.estimator = estimator
        if evaluator is not None:
            self.evaluator = evaluator

    def _get_features(self, df: Optional[DataFrame] = None):
        """
        Returns three lists of feature names to be used in
        the model training. Specifically binary, numeric
        (continuous), categorical features in that order.

        Returns
        -------
        binary: List[str]
        numeric: List[str]
        categorical: List[str]
        """
        raise NotImplementedError

    def train(
        self,
        df: DataFrame,
        params_map: Optional[Dict[str, List[Any]]] = None,
        num_folds: Optional[int] = 10,
        collect_sub_models: Optional[bool] = False,
        return_cv: Optional[bool] = False
    ) -> Union[PipelineModel, Tuple[PipelineModel, CrossValidatorModel]]:
        """
        Train model.

        Params
        ------
        df: Spark DataFrame
            Input train data

        params_map: Optional[Dict[str, List[Any]]] (default=None)
            Parameters mapping to grid search over

        num_folds: Optional[int] (default=10)
            Number of cross-validation folds

        collect_sub_models: Optional[bool] (default=False)
            Collect models per fold per parameter
            combination

        return_cv: Optional[bool] (default=False)
            Additionally return the CrossValidatorModel
            object or not

        Returns
        -------
            self: PipelineModel
                The (best) model trained on df.
            cv_model: Optional[CrossValidatorModel]
                The CrossValidatorModel object.
        """
        # get input features
        binary, numeric, categorical = self._get_features(df)

        # convert categorical to numeric labels
        indexed_cols = [f'{c}_idx' for c in categorical]
        indexers = [
            StringIndexer(inputCol=c[:-6], outputCol=c) for c in indexed_cols
        ]
        self.features = binary + numeric + indexed_cols
        self.logger.info(f'Final model features list: {self.features}')

        # assemble features into feature vector
        assembler = VectorAssembler(inputCols=self.features,
                                    outputCol=self.estimator.getFeaturesCol())
        p = Pipeline(stages=indexers + [assembler]).fit(df)
        self.logger.info('Index and vector assemble features')
        df = p.transform(df)\
            .select(self.estimator.getFeaturesCol(), self.estimator.getLabelCol())

        # if provided, set estimator params map
        if params_map:
            self.params_map = params_map

        # run cross-validation and choose the best set of parameters
        self.logger.info('Start Cross Validation')
        cv_params = {
            'estimator': self.estimator,
            'estimatorParamMaps': self.__params_grid,
            'evaluator': self.evaluator,
            'numFolds': num_folds,
            'collectSubModels': collect_sub_models
        }
        cv_model = CrossValidator(**cv_params).fit(df)

        # set the best model
        p.stages.append(cv_model.bestModel)
        self.best_model = p
        self.logger.info(
            f'Set the best model with best params: {self.best_params}')

        if return_cv:
            return self.best_model, cv_model
        else:
            return self.best_model

    def test(self, df: DataFrame) -> Tuple[DataFrame, Dict[str, Any]]:
        """
        Test the best model found so far.

        Params
        ------
        df: Spark DataFrame
            Input test data

        Returns
        -------
        predictions: DataFrame
            DataFrame `df` with added `prediction` column
        results: Dict[str, Any]
            Dictionary with any results from testing the model
            e.g., metrics, feature importances, plots etc.
        """
        assert self.best_model is not None, 'Call train() or load() first.'
        df = df.withColumnRenamed(self.estimator.getLabelCol(), 'label')
        self.logger.info('Get model predictions')
        predictions = self.best_model.transform(df)

        # execute all metrics
        results = {'best_params': self.best_params}
        for name, metric in self.metrics.items():
            results.update({name: metric(self, predictions)})
        self.logger.info(f'Results: {results}')

        return predictions, results

    def train_final(self, df: DataFrame):
        """
        Train final model using best parameters found
        on given dataframe.

        Params
        ------
        df: Spark DataFrame
            (Ideally) both train and test combined
        """
        assert self.best_params is not None, 'Call train() or load() first.'
        est = self.estimator\
            .setParams(**self.best_params)
        #         for k, v in self.best_params.items():
        #             getattr(est, 'set' + k[0].upper() + k[1:])(v)
        est = Pipeline(stages=self.best_model.stages[:-1] + [est])
        self.best_model = est.fit(df)

    def score(self, df: DataFrame) -> DataFrame:
        """
        Score on given dataset using best model
        found so far.

        Params
        ------
        df: Spark DataFrame
            Input data to score

        Returns
        -------
        df: Spark DataFrame
            Same as input with additional
            prediction columns
        """
        return self.best_model.transform(df)

    @property
    def params_map(self) -> Dict[str, List[Any]]:
        return self.__params_map

    @params_map.setter
    def params_map(self, params_map: Dict[str, List[Any]]):
        assert isinstance(params_map, dict)
        self.__params_map = params_map
        self.__params_grid = ParamGridBuilder()
        for k, v in params_map.items():
            self.__params_grid.addGrid(getattr(self.estimator, k), v)
        self.__params_grid = self.__params_grid.build()

    @property
    def best_model(self) -> PipelineModel:
        return self.__best_model

    @best_model.setter
    def best_model(self, model):
        assert isinstance(model, PipelineModel),\
            'model must be of type PipelineModel.'
        self.__best_model = model
        est = model.stages[-1]
        # coalesce = lambda *x: next(y for y in x if y is not None)
        if est._java_obj.parent() is not None\
            and self.params_map is not None:
            self.best_params = {
                k: getattr(est._java_obj.parent(),
                           'get' + k[0].upper() + k[1:])()
                for k in self.params_map.keys()
            }
        elif hasattr(est, 'extractParamMap'):
            self.best_params = {
                param[0].name: param[1]
                for param in est.extractParamMap().items()
            }
        else:
            self.best_params = None

    def save(self, path: str):
        self.best_model.save(path)

        # save additional model metadata
        metadata = {}
        if hasattr(self, 'best_params'):
            metadata.update({'best_params': self.best_params})
        if hasattr(self, 'features'):
            metadata.update({'features': self.features})
        with open(path + '/BaseCVModel_metadata', 'w') as fp:
            json.dump(metadata, fp, separators=[',', ':'])

    def load(self, path: str):
        self.best_model = PipelineModel.load(path)

        # load additional model metadata
        import os
        metadata_exists = (os.path.isfile(path + '/BaseCVModel_metadata'),
                           os.path.isfile(path + '/BaseModel_metadata'))
        if metadata_exists[0]:
            path = path + '/BaseCVModel_metadata'
        elif metadata_exists[1]:
            # for backward compatibility
            path = path + '/BaseModel_metadata'
        if any(metadata_exists):
            with open(path, 'r') as fp:
                metadata = json.load(fp)
            if 'best_params' in metadata:
                self.best_params = metadata['best_params']
            if 'features' in metadata:
                self.features = metadata['features']

        return self

    @Metrics.register()
    def self_evaluator(self, predictions: DataFrame):
        if hasattr(self, 'evaluator')\
            and hasattr(self.evaluator, 'getMetricName'):
            return {
                self.evaluator.getMetricName():
                self.evaluator.evaluate(predictions)
            }
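
# Added usage sketch for BaseCVModel (not part of the original): the feature names,
# label column, and parameter grid below are illustrative assumptions only.
class ExampleCVModel(BaseCVModel):
    def _get_features(self, df=None):
        # binary, numeric (continuous), categorical feature names, in that order
        return ['is_active'], ['age', 'balance'], ['segment']

# model = ExampleCVModel(estimator=LogisticRegression(labelCol='churned'),
#                        evaluator=BinaryClassificationEvaluator(labelCol='churned'))
# best_pipeline, cv_model = model.train(train_df,
#                                       params_map={'regParam': [0.0, 0.1]},
#                                       num_folds=5, return_cv=True)
# predictions, results = model.test(test_df)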
예제 #23
0
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'Survived')

model = lr.fit(tr)
pred = model.transform(te)

import pyspark.ml.evaluation as evals
evaluator = evals.BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Survived')
AUC = evaluator.evaluate(pred)
AUC


############# model tuning
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np  # needed for np.arange below
params = ParamGridBuilder()
params = params.addGrid(lr.regParam, np.arange(0, .1, .01))
params = params.addGrid(lr.elasticNetParam, [0, .5, 1])
params = params.build()
print("Number of models to be tested:", len(params))

# create the CrossValidator
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator = lr,
                    estimatorParamMaps = params,
                    evaluator = evaluator)
cv = cv.setNumFolds(10).setSeed(24).fit(tr)

# extract the best model
best_model = cv.bestModel
pred = best_model.transform(te)
print(evaluator.evaluate(pred))
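
# Added sketch: to see which grid point each score corresponds to, the average
# cross-validated metric per candidate ParamMap can be inspected on the fitted
# CrossValidatorModel:
for pm, metric in zip(params, cv.avgMetrics):
    print(metric, {p.name: v for p, v in pm.items()})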