예제 #1
0
파일: tests.py 프로젝트: A7mech/spark
 def test_glr_summary(self):
     """Check that a fitted GLR model exposes a training summary with the expected API.

     Fits a weighted Gaussian/identity model without intercept on a two-row
     DataFrame, then verifies the types and a few exact values of the summary
     attributes, and that ``model.evaluate`` on the same data yields the same
     deviance.
     """
     # The DataFrame-based estimator works on pyspark.ml.linalg vectors, not the
     # RDD-era pyspark.mllib.linalg ones.
     from pyspark.ml.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     # NOTE(review): `basestring` is presumably aliased for Python 3 elsewhere
     # in this module -- confirm.
     self.assertTrue(isinstance(s.solver, basestring))
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
예제 #2
0
def model():
    """Fit a Gaussian GLR on summed cost per (user, interest, phone price category).

    Reads the parquet file at ``DATA_PARQUET``, aggregates ``cost`` into a
    ``label`` column with Spark SQL, index-encodes the two string columns into
    a ``features`` vector and fits the regression.

    :return: the fitted GeneralizedLinearRegression model (previously the
        function dropped into the debugger and returned nothing).
    """
    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')
    sample = sql.sql('''
        select
            hash_number_A
            ,interest_1
            ,phone_price_category
            ,sum(cost) as label
        from data
        group by hash_number_A, interest_1, phone_price_category''')

    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='interest_1', outputCol='interest'),
        StringIndexer(inputCol='phone_price_category',
                      outputCol='phone_price'),
        VectorAssembler(inputCols=['interest', 'phone_price'],
                        outputCol='features'),
    ])
    sample = pipeline.fit(sample).transform(sample)

    # Other families accepted by the estimator:
    # 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'
    regression = GeneralizedLinearRegression(family='gaussian',
                                             labelCol='label',
                                             featuresCol='features',
                                             maxIter=10,
                                             regParam=0.3)
    return regression.fit(sample)
예제 #3
0
def generalizeRegression(df, label, features, adjust):
    """Fit a Poisson GLR on ``df`` and return its RMSE and predictions.

    :param df: input DataFrame.
    :param label: name of the label column.
    :param features: list of feature column names; it is no longer modified.
    :param adjust: names of non-numerical columns; each is index-encoded into
        a ``<name>_num`` column that is added to the feature set.
    :return: dict with keys ``"RMSE"`` (float) and ``"predictions"`` (list of
        the ``prediction`` values computed on the training data itself).
    """
    # Work on a copy: appending to the caller's list would make repeated calls
    # accumulate "<name>_num" entries.
    feature_cols = list(features)

    # Columns with non-numerical values are index-encoded.
    for name in adjust:
        indexed_name = "{}_num".format(name)
        indexer = StringIndexer(inputCol=name, outputCol=indexed_name)
        df = indexer.fit(df).transform(df)
        feature_cols.append(indexed_name)

    # Assemble the feature vector expected by the estimator.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    assembled = assembler.transform(df)

    gr = GeneralizedLinearRegression(featuresCol='features',
                                     labelCol=label,
                                     regParam=0.3,
                                     family="poisson")
    predictions = gr.fit(assembled).transform(assembled)

    # Evaluator required for the RMSE estimate.
    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    return {
        "RMSE": rmse,
        "predictions": [
            r["prediction"] for r in predictions.select("prediction").collect()
        ],
    }
예제 #4
0
def logisT(value):
  """Fit a Gaussian/identity GLR on ``value`` and report its parameters.

  Prints the coefficients and intercept of the fitted model and returns the
  pair ``(coefficients, 1)``.
  """
  estimator = GeneralizedLinearRegression(
      family="gaussian", link="identity", maxIter=10, regParam=0.3)
  fitted = estimator.fit(value)

  # Report the fitted parameters.
  for title, shown in (("Coefficients: ", fitted.coefficients),
                       ("Intercept: ", fitted.intercept)):
    print(title + str(shown))

  return (fitted.coefficients, 1)
예제 #5
0
def model_dev_glm(df_train, df_test, max_iter, fit_intercept, reg_param):
    """Train a Gaussian/identity GLR, score it on ``df_test`` and collect statistics.

    Prints R2, MSE, RMSE and MAE (each rounded to 3 decimals) and returns the
    tuple ``(fitted_model, stats)`` where ``stats`` is a one-row pandas
    DataFrame holding those metrics plus the elapsed time in minutes.
    """
    started_at = time()

    # Configure and train the estimator.
    estimator = GeneralizedLinearRegression(labelCol='label',
                                            featuresCol='features',
                                            family="gaussian",
                                            link="identity",
                                            fitIntercept=fit_intercept,
                                            maxIter=max_iter,
                                            regParam=reg_param)
    fitted = estimator.fit(df_train)

    # Score the test sample, keeping only what the evaluator needs.
    scored = fitted.transform(df_test).select(['prediction', 'label'])

    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="label")

    def rounded_metric(metric_name):
        """Evaluate ``metric_name`` on the scored test sample, rounded to 3 places."""
        raw = evaluator.evaluate(scored, {evaluator.metricName: metric_name})
        return round(raw, 3)

    r_square = rounded_metric("r2")
    mean_sq_err = rounded_metric("mse")
    root_mean_sq_err = rounded_metric("rmse")
    mean_abs_err = rounded_metric("mae")

    # Report the statistics.
    print("\n++++++ Printing Generalized Linear Model Accuracy ++++++\n")
    print("R Square: " + str(r_square * 100) + "%")
    print("Mean Squared Error: " + str(mean_sq_err))
    print("Root Mean Squared Error: " + str(root_mean_sq_err))
    print("Mean Absolute Error: " + str(mean_abs_err))

    elapsed_minutes = (time() - started_at) / 60
    stats = pd.DataFrame({
        "Model Name": ["Generalized Linear Model"],
        "R Square": r_square,
        "Mean Squared Error": mean_sq_err,
        "Root Mean Squared Error": root_mean_sq_err,
        "Mean Absolute Error": mean_abs_err,
        "Time (Min.)": round(elapsed_minutes, 3)
    })

    return (fitted, stats)
예제 #6
0
def generalized_linear_regression(trainingDataFrame, family="gaussian", link="identity",
                                  maxIter=10, regParam=0.3):
    """Fit a GLR on ``trainingDataFrame`` and bundle the model with its key attributes.

    :return: dict with keys ``"model"``, ``"summary"``, ``"intercept"`` and
        ``"coefficients"``, all taken from the fitted model.
    """
    estimator = GeneralizedLinearRegression(
        family=family,
        link=link,
        maxIter=maxIter,
        regParam=regParam,
    )
    fitted = estimator.fit(trainingDataFrame)
    return {
        "model": fitted,
        "summary": fitted.summary,
        "intercept": fitted.intercept,
        "coefficients": fitted.coefficients,
    }
예제 #7
0
    def main(self, sc, *args):
        """Fit a Poisson GLR on the upstream points and save it to the output path.

        The points RDD comes from ``self.requires().get_points_rdd(sc)``; the
        link function and iteration cap come from ``self.link`` and
        ``self.iterations``.
        """
        points_rdd = self.requires().get_points_rdd(sc)

        estimator = GeneralizedLinearRegression(family='poisson',
                                                link=self.link,
                                                maxIter=self.iterations)

        # The estimator works on DataFrames, so convert the RDD first.
        session = SparkSession.builder.getOrCreate()
        points_df = session.createDataFrame(points_rdd)

        fitted = estimator.fit(points_df)
        fitted.save(self.output().path)
예제 #8
0
def linear_regression(ticker, writer):
    """Predict today's close for ``ticker`` and check the direction of the move.

    A Gaussian/identity GLR is fitted on ``../data/newlr/<ticker>_no_today.csv``
    (read in libsvm format). The predicted close is
    ``-coef[0] + volume * coef[1] + intercept`` where ``volume`` is column 5 of
    the second row of ``../data/tickers/<ticker>.csv``. Today's and yesterday's
    actual closes are column 0 of the first and second rows of
    ``../data/newlr/<ticker>.csv``.

    :param ticker: ticker symbol used to build the three file names.
    :param writer: not used by this function.
    :return: True when the predicted close and the actual close are on the
        same side of yesterday's close (both >= or both <=), else False.
    :raises ValueError: if the tickers file has no second row, or a value read
        from a CSV file is not numeric.
    """
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()

    # try/finally so the session is stopped even when a file is missing or
    # malformed.
    try:
        # Load training data
        dataset1 = spark.read.format("libsvm")\
            .load("../data/newlr/" + ticker + "_no_today.csv")

        glr1 = GeneralizedLinearRegression(family="gaussian",
                                           link="identity",
                                           maxIter=10,
                                           regParam=0.3)

        # Fit the model
        model1 = glr1.fit(dataset1)

        # Today's volume lives in the second row (index 1), sixth column.
        today_volume = None
        with open("../data/tickers/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for index, row in enumerate(reader):
                if index == 1:
                    today_volume = float(row[5])
                    break
        if today_volume is None:
            raise ValueError(
                "no second row with today's volume for ticker " + str(ticker))

        predict_close_value = (-1 * float(model1.coefficients[0])
                               + today_volume * float(model1.coefficients[1])
                               + float(model1.intercept))
        print(predict_close_value)

        # CSV cells are strings: convert them so the comparisons below are
        # numeric (str vs float comparison raises TypeError on Python 3).
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/newlr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                elif index == 1:
                    yesterday_close_value = float(row[0])
                    break
    finally:
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
예제 #9
0
    def test_offset(self):
        """Fit a weighted Poisson GLR with an offset column and check its parameters.

        The fitted coefficients and intercept are compared against fixed
        reference values with an absolute tolerance of 1e-4.
        """
        rows = [
            (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
            (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
            (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
            (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
        ]
        df = self.spark.createDataFrame(rows, ["label", "weight", "offset", "features"])

        estimator = GeneralizedLinearRegression(family="poisson",
                                                weightCol="weight",
                                                offsetCol="offset")
        fitted = estimator.fit(df)

        coefficients_match = np.allclose(fitted.coefficients.toArray(),
                                         [0.664647, -0.3192581],
                                         atol=1E-4)
        self.assertTrue(coefficients_match)
        intercept_matches = np.isclose(fitted.intercept, -1.561613, atol=1E-4)
        self.assertTrue(intercept_matches)
예제 #10
0
def regression(train_set, test_set, featuresColumn, labelColumn):
    """Fit a Gaussian GLR with a log link and return its predictions on ``test_set``.

    :param train_set: DataFrame the model is fitted on.
    :param test_set: DataFrame the fitted model is evaluated on.
    :param featuresColumn: name of the features column.
    :param labelColumn: name of the label column.
    :return: the ``predictions`` of the evaluation summary for ``test_set``.
    """
    estimator = GeneralizedLinearRegression(
        featuresCol=featuresColumn,
        labelCol=labelColumn,
        family="gaussian",
        link="log",
        maxIter=10,
        regParam=0.3,
    )
    fitted = estimator.fit(train_set)
    evaluation = fitted.evaluate(test_set)
    return evaluation.predictions
예제 #11
0
def best_subset_selection_GLM(df,
                              labelCol,
                              Cols,
                              label_is_categorical=False,
                              family='gaussian',
                              link='identity'):
    """Fit a GLR on every non-empty subset of ``Cols`` and tabulate the AIC values.

    :param df: input DataFrame.
    :param labelCol: name of the label column, forwarded to ``prepare_data``.
    :param Cols: candidate feature column names.
    :param label_is_categorical: forwarded to ``prepare_data`` (it used to be
        ignored and ``False`` was always passed).
    :param family: GLR error distribution family.
    :param link: GLR link function.
    :return: pandas DataFrame with columns ``num_features``, ``AIC`` and
        ``features``, indexed by ``Model ID`` and sorted by AIC descending.
    """
    # Subset sizes start at 1, so the empty subset is never fitted.
    print('Total number of iterations: {}'.format(2**len(Cols) - 1))

    AIC_values, feature_list, num_features = [], [], []

    for k in range(1, len(Cols) + 1):
        for combo in itertools.combinations(Cols, k):

            continuousCols, categoricalCols = [], []
            for col in combo:
                # typeName() is stable across PySpark versions, unlike
                # str(dataType), whose text changed between releases.
                if df.schema[col].dataType.typeName() == 'string':
                    categoricalCols.append(col)
                else:
                    continuousCols.append(col)

            data = prepare_data(df=df,
                                labelCol=labelCol,
                                label_is_categorical=label_is_categorical,
                                categoricalCols=categoricalCols,
                                continuousCols=continuousCols)

            model = GeneralizedLinearRegression(family=family,
                                                link=link,
                                                featuresCol='features',
                                                labelCol='label')

            AIC = model.fit(data).summary.aic
            AIC_values.append(AIC)
            feature_list.append(combo)
            num_features.append(len(combo))

            print('Feature/s: {}, AIC={:.3f}'.format(combo, AIC))

    return pd.DataFrame({
        'num_features': num_features,
        'AIC': AIC_values,
        'features': feature_list
    }).rename_axis('Model ID').sort_values('AIC', ascending=False)
예제 #12
0
def create_model(training_data, features_col, label_col):
    """
    Cross-validate a Generalized Linear Regression over the error family.
    :param training_data: -- dataframe: training dataset
    :param features_col: -- col: containing all the features needed.
    :param label_col: -- col: label
    :return: the fitted cross-validation model and the evaluator it used
    """
    estimator = GeneralizedLinearRegression()

    # Fixed column settings plus a grid over the two candidate families.
    fixed_params = {
        estimator.labelCol: label_col,
        estimator.featuresCol: features_col,
    }
    grid_builder = ParamGridBuilder().baseOn(fixed_params)
    grid_builder = grid_builder.addGrid(estimator.family, ["gaussian", "poisson"])
    param_maps = grid_builder.build()

    evaluator = RegressionEvaluator(labelCol=label_col)

    # Cross validation picks the best parameter combination.
    cross_validator = CrossValidator(estimator=estimator,
                                     estimatorParamMaps=param_maps,
                                     evaluator=evaluator)
    cv_model = cross_validator.fit(training_data)

    return cv_model, evaluator
예제 #13
0
    def test_tweedie_distribution(self):
        """Fit Tweedie GLRs (default link power, then link power -1.0) and check parameters.

        Coefficients and intercept of both fits are compared against fixed
        reference values with an absolute tolerance of 1e-4.
        """
        rows = [
            (1.0, Vectors.dense(0.0, 0.0)),
            (1.0, Vectors.dense(1.0, 2.0)),
            (2.0, Vectors.dense(0.0, 0.0)),
            (2.0, Vectors.dense(1.0, 1.0)),
        ]
        df = self.spark.createDataFrame(rows, ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)

        first_fit = glr.fit(df)
        first_coefficients = first_fit.coefficients.toArray()
        self.assertTrue(np.allclose(first_coefficients, [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(first_fit.intercept, 0.7841, atol=1E-4))

        # Same estimator, now with an explicit link power.
        second_fit = glr.setLinkPower(-1.0).fit(df)
        second_coefficients = second_fit.coefficients.toArray()
        self.assertTrue(np.allclose(second_coefficients, [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(second_fit.intercept, 0.6667, atol=1E-4))
예제 #14
0
 def test_glr_load(self):
     """Check that a saved and reloaded GLR model behaves like the original.

     Fits a Gaussian/identity model, saves it under a temporary directory,
     loads it back and verifies that the solver and the first four transformed
     rows are the same. The temporary directory is removed afterwards.
     """
     import shutil

     df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                                      (1.0, Vectors.dense(1.0, 2.0)),
                                      (2.0, Vectors.dense(0.0, 0.0)),
                                      (2.0, Vectors.dense(1.0, 1.0))],
                                     ["label",  "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p")
     model = glr.fit(df)
     self.assertEqual(model.getSolver(), "irls")
     transformed1 = model.transform(df)
     path = tempfile.mkdtemp()
     try:
         model_path = path + "/glr"
         model.save(model_path)
         model2 = GeneralizedLinearRegressionModel.load(model_path)
         self.assertEqual(model2.getSolver(), "irls")
         transformed2 = model2.transform(df)
         self.assertEqual(transformed1.take(4), transformed2.take(4))
     finally:
         # mkdtemp() never cleans up after itself; do not leak the directory
         # even when an assertion above fails.
         shutil.rmtree(path, ignore_errors=True)
예제 #15
0
    def test_glr_summary(self):
        """Check the API surface of the training summary of a fitted GLR model.

        Fits a weighted Gaussian/identity model without intercept on two rows,
        verifies the types and a few exact values of the summary attributes,
        and checks that ``model.evaluate`` on the training data reports the
        same deviance.
        """
        from pyspark.ml.linalg import Vectors

        rows = [
            (1.0, 2.0, Vectors.dense(1.0)),
            (0.0, 2.0, Vectors.sparse(1, [], [])),
        ]
        df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
        estimator = GeneralizedLinearRegression(
            family="gaussian",
            link="identity",
            weightCol="weight",
            fitIntercept=False,
        )
        model = estimator.fit(df)
        self.assertTrue(model.hasSummary)
        summary = model.summary

        # test that api is callable and returns expected types
        # numIterations should default to a single iteration of WLS
        self.assertEqual(summary.numIterations, 1)
        self.assertTrue(isinstance(summary.predictions, DataFrame))
        self.assertEqual(summary.predictionCol, "prediction")
        self.assertEqual(summary.numInstances, 2)
        self.assertTrue(isinstance(summary.residuals(), DataFrame))
        self.assertTrue(isinstance(summary.residuals("pearson"), DataFrame))

        # Per-coefficient statistics are lists of floats.
        for attribute in ("coefficientStandardErrors", "tValues", "pValues"):
            values = getattr(summary, attribute)
            self.assertTrue(
                isinstance(values, list) and isinstance(values[0], float))

        # Integer-valued attributes with known values for this tiny dataset.
        for attribute, expected in (("degreesOfFreedom", 1),
                                    ("residualDegreeOfFreedom", 1),
                                    ("residualDegreeOfFreedomNull", 2),
                                    ("rank", 1)):
            self.assertEqual(getattr(summary, attribute), expected)

        self.assertTrue(isinstance(summary.solver, str))

        # Goodness-of-fit figures are floats.
        for attribute in ("aic", "deviance", "nullDeviance", "dispersion"):
            self.assertTrue(isinstance(getattr(summary, attribute), float))

        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned
        # The child class GeneralizedLinearRegressionTrainingSummary runs full test
        same_summary = model.evaluate(df)
        self.assertAlmostEqual(same_summary.deviance, summary.deviance)
예제 #16
0
def train_fit_glmm(window, date_label: str):
    """Fit a Poisson/log GLR on the denoised ``train`` column and score ``retrain``.

    The training frame uses ``denoise("train")`` as features and the
    ``date_label`` expression as label; predictions are then produced with
    ``denoise("retrain")`` as features.

    :return: DataFrame with columns ``page_id``, ``train``, ``validate``,
        ``retrain``, ``test`` and ``prediction``.
    """
    estimator = GeneralizedLinearRegression(family="poisson",
                                            link="log",
                                            maxIter=10,
                                            regParam=0.3)

    training_frame = window.select(
        denoise("train").alias("features"),
        F.expr(f"{date_label} as label"),
    )
    fitted = estimator.fit(training_frame)
    # TODO: may want to persist the fitted model

    scoring_frame = window.withColumn("features", denoise("retrain"))
    scored = fitted.transform(scoring_frame)

    return scored.select(
        "page_id", "train", "validate", "retrain", "test", "prediction")
예제 #17
0
def GL(df_data):
    """Fit a Gaussian/identity GLR on ``df_data``, print the training time, return the model."""
    print("Train a GeneralizedLinearRegression model...")
    started = time.time()

    estimator = GeneralizedLinearRegression(
        family="gaussian", link="identity", maxIter=10, regParam=0.3)
    estimator = estimator.setFeaturesCol("features")
    estimator = estimator.setLabelCol("label")
    fitted = estimator.fit(df_data)

    elapsed = time.time() - started
    print("gl_model using time: %.2fs\n" % elapsed)
    return fitted
예제 #18
0
def algorithm(target):
    """Build five unfitted regressors that read 'Features' and predict ``target``.

    :return: ``(rf, gbt, dt, lr, glr, model)`` where ``model`` is the list
        ``[gbt, dt, lr, glr, rf]`` of the same estimator objects.
    """
    columns = {'featuresCol': 'Features', 'labelCol': target}

    rf = RandomForestRegressor(**columns)
    gbt = GBTRegressor(**columns)
    dt = DecisionTreeRegressor(**columns)
    lr = LinearRegression(**columns)
    glr = GeneralizedLinearRegression(family="gaussian",
                                      link="identity",
                                      **columns)

    return rf, gbt, dt, lr, glr, [gbt, dt, lr, glr, rf]
예제 #19
0
def main(argv):
    """Train and report three regressors on a parquet dataset.

    :param argv: ``argv[0]`` is the parquet path, ``argv[1]`` the name of the
        label column.

    For LinearRegression, GBTRegressor and GeneralizedLinearRegression in turn,
    the estimator is wrapped in ``sparkRegressor`` and every key/value pair of
    the mapping returned by its ``fit`` is printed. The total wall time is
    printed at the end.
    """
    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
                        .master("local[*]") \
                        .appName("datasetRegressor") \
                        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()
    print(vector)
    # NOTE(review): this is the length of the first row, i.e. the number of
    # columns -- confirm it is what "feature count" is meant to report.
    featureCount = len(vector)
    # These two messages were missing the f-prefix and printed the braces
    # literally.
    print(f"Feature count    : {featureCount}")
    print(f"Dataset size (unbalanced)    : {data.count()}")

    testFraction = 0.3
    seed = 123

    estimators = [
        # Linear Regression
        LinearRegression().setLabelCol(label)
                          .setFeaturesCol("features"),
        # GBTRegressor
        GBTRegressor().setLabelCol(label)
                      .setFeaturesCol("features"),
        # GeneralizedLinearRegression
        GeneralizedLinearRegression().setLabelCol(label)
                                     .setFeaturesCol("features")
                                     .setFamily("gaussian")
                                     .setLink("identity")
                                     .setMaxIter(10)
                                     .setRegParam(0.3),
    ]

    for estimator in estimators:
        reg = sparkRegressor(estimator, label, testFraction, seed)
        metrics = reg.fit(data)
        for k, v in metrics.items():
            print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." % (end - start))
예제 #20
0
 def _model(self):
     """Return an unfitted estimator matching ``self.family``.

     A LinearRegression is used for the Gaussian family and a binomial/logit
     GeneralizedLinearRegression for the binomial family; the label column and
     iteration cap are then set from the instance.

     :raises NotImplementedError: for any other family.
     """
     if self.family == GAUSSIAN_:
         estimator = LinearRegression()
     elif self.family == BINOMIAL_:
         estimator = GeneralizedLinearRegression(family="binomial", link="logit")
     else:
         message = "Family '{}' not implemented".format(self.family)
         raise NotImplementedError(message)

     # Settings shared by both estimator types.
     estimator.setLabelCol(self.response)
     estimator.setMaxIter(self.__max_iter)
     return estimator
예제 #21
0
def get_glm_pipeline_stages(categorical_columns,
                            continuous_columns,
                            label_column,
                            family="tweedie",
                            link="identity",
                            variance_power=0.0,
                            link_power=1.0) -> List:
    """Build the pipeline stages for a GLR on mixed categorical/continuous inputs.

    Each categorical column gets a StringIndexer (``<col>_IDX``) followed by a
    OneHotEncoder (``<col>_OHE``, ``dropLast=False``). The encoded columns and
    the continuous columns are assembled into ``features_<label_column>`` and
    the model writes to ``prediction_<label_column>``. For the tweedie family
    the model is configured with ``link_power``/``variance_power``; for any
    other family with ``link``.

    :return: list of stages: the encoders, the assembler, then the model.
    """
    encoders = []
    encoded_names = []
    for column in categorical_columns:
        indexer = StringIndexer(inputCol=column, outputCol=f"{column}_IDX")
        one_hot = OneHotEncoder(inputCol=indexer.getOutputCol(),
                                outputCol=f"{column}_OHE",
                                dropLast=False)
        encoders += [indexer, one_hot]
        encoded_names.append(f"{column}_OHE")

    features_column = f"features_{label_column}"
    prediction_column = f"prediction_{label_column}"

    assembler = VectorAssembler(inputCols=encoded_names + continuous_columns,
                                outputCol=features_column)

    # Arguments common to both model configurations.
    model_kwargs = {
        "labelCol": label_column,
        "featuresCol": features_column,
        "predictionCol": prediction_column,
        "family": family,
    }
    if family == 'tweedie':
        model_kwargs["linkPower"] = link_power
        model_kwargs["variancePower"] = variance_power
    else:
        model_kwargs["link"] = link
    _model = GeneralizedLinearRegression(**model_kwargs)

    stages: List = encoders + [assembler, _model]
    return stages
예제 #22
0
def GL_for(df_data):
    """Fit identity-link GLRs for three families in turn and time the whole run.

    The families are fitted in the order gaussian, binomial, poisson. Each fit
    overwrites the previous result, so only the last fitted model is returned.
    """
    print("Train a GeneralizedLinearRegression model...")
    started = time.time()

    for family_name in ('gaussian', 'binomial', 'poisson'):
        estimator = GeneralizedLinearRegression(
            family=family_name, link="identity", maxIter=10, regParam=0.3)
        estimator = estimator.setFeaturesCol("features")
        estimator = estimator.setLabelCol("label")
        gl_model = estimator.fit(df_data)

    elapsed = time.time() - started
    print("gl_model using time: %.2fs\n" % elapsed)
    return gl_model
예제 #23
0
def linear_regression(ticker,writer):
    """Fit a GLR for ``ticker``, log its parameters and check the predicted direction.

    A Gaussian/identity GLR (``maxIter=1``, ``regParam=0.8``) is fitted on
    ``../data/lr/<ticker>_no_today.csv`` (read in libsvm format). The first
    coefficient and the intercept are written through ``writer`` and printed.
    The predicted close is ``-coef[0] + intercept``; today's and yesterday's
    actual closes are column 0 of the first and second rows of
    ``../data/lr/<ticker>.csv``.

    :param ticker: ticker symbol used to build the file names.
    :param writer: object with a ``writerow`` method that receives the
        parameter row.
    :return: True when the predicted close and the actual close are on the
        same side of yesterday's close (both >= or both <=), else False.
    :raises ValueError: if a close value read from the CSV is not numeric.
    """
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()

    # try/finally so the session is stopped even when a file is missing or
    # malformed.
    try:
        # Load training data
        dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
        glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=1, regParam=0.8)

        # Fit the model
        model = glr.fit(dataset)
        data = [ticker, 'coefficient:', model.coefficients[0], 'intercept:', model.intercept]
        writer.writerow(data)
        print(data)

        # CSV cells are strings: convert them so the comparisons below are
        # numeric (str vs float comparison raises TypeError on Python 3).
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/lr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                elif index == 1:
                    yesterday_close_value = float(row[0])
                    break

        predict_close_value = -1 * float(model.coefficients[0]) + float(model.intercept)
    finally:
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
예제 #24
0
def store_multiple_trained_models():
    """Train one Gaussian/identity GLR per (item_id, store_id) key and print its RMSE.

    Reads the "s5_01" parquet data, keeps rows with a non-null label, adds a
    ``key`` column of the form ``<item_id>_<store_id>`` and, for up to the
    first 200 distinct keys in sorted order, fits a model on a 90/10 split of
    that key's rows and prints the test RMSE.
    """
    print("-- store_multiple_trained_models")

    def train_simple(data: DataFrame, esti: Estimator, key: str):
        """Fit ``esti`` on 90% of ``data`` and print the RMSE on the remaining 10%."""
        print(f"--- train_simple {key}")
        # Prepare training and test data.
        train_part, test_part = data.randomSplit([0.9, 0.1], seed=12345)

        fitted: Transformer = esti.fit(train_part)

        # Score the held-out rows, keeping only what the evaluator needs.
        scored = fitted.transform(test_part)
        scored = scored.select("features", "label", "prediction")

        # Compare predictions against the true labels.
        rmse_evaluator = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="rmse")
        rmse = rmse_evaluator.evaluate(scored)
        print(f"-- (RMSE) for {key} {rmse}")

    spark = SparkSession.builder \
        .appName("tryout") \
        .getOrCreate()
    sc = spark.sparkContext

    # Create small df if not exists
    # hlp.create_small_dataframe(spark)

    # Read data and keep only the training rows (those with a label).
    labelled: DataFrame = hlp.readFromDatadirParquet(spark, "s5_01") \
        .where(F.col("label").isNotNull())

    # Create key column
    make_key = F.udf(lambda a, b: f"{a}_{b}", T.StringType())
    keyed = labelled.withColumn('key', make_key(labelled.item_id, labelled.store_id))

    keyed.describe().show()

    # data ordered by key
    keyed_sorted = keyed.sort('key')

    key_rows = keyed.select("key").distinct().orderBy('key').take(200)
    # Every row holds a single field, so flattening yields the key values.
    for key_value in chain.from_iterable(key_rows):
        subset = keyed_sorted.filter(f"key = '{key_value}'")
        estimator = GeneralizedLinearRegression(family='gaussian', link='identity')
        train_simple(subset, estimator, key_value)
예제 #25
0
def create_models(ml_models, train_df, test_df, train_FM_df, test_FM_df):
    """Cross-validate and evaluate each requested regression model.

    :param ml_models: iterable of model codes: "LR", "GLR", "RF", "GBT", "FM".
    :param train_df: training data for every model except "FM".
    :param test_df: evaluation data for every model except "FM".
    :param train_FM_df: training data for the "FM" model.
    :param test_FM_df: evaluation data for the "FM" model.

    Each model is tuned with a 4-fold CrossValidator (RMSE) over a small
    parameter grid and the result is passed to ``model_evaluation``. Unknown
    codes are reported and skipped.
    """
    # Declare evaluator for crossvalidation
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

    for mo in ml_models:
        if mo == "LR":
            print("---- Linear Regression ----")
            model = LinearRegression(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder().addGrid(model.regParam, [0.1, 0.01]).addGrid(model.elasticNetParam, [0.1, 0.6]).build()

        elif mo == "GLR":
            print("---- Generalized linear Regression ----")
            model = GeneralizedLinearRegression(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder().addGrid(model.regParam, [0.1, 0.01]).build()

        elif mo == "RF":
            print("---- Random Forest ----")
            model = RandomForestRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=5000)
            paramGrid = ParamGridBuilder().addGrid(model.maxDepth, [5, 10]).addGrid(model.numTrees, [10, 20]).build()

        elif mo == "GBT":
            print("---- Gradient Boost Tree ----")
            model = GBTRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=5000)
            paramGrid = ParamGridBuilder().addGrid(model.maxDepth, [5, 10]).build()

        elif mo == "FM":
            print("---- Factorization Machines Regression ----")
            model = FMRegressor(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder().addGrid(model.regParam, [0.5, 0.3, 0.1, 0.01]).build()

        else:
            print(f"{mo} no detected as a ml model")
            # Without this the loop fell through and cross-validated the
            # previous iteration's model again (or raised NameError when the
            # first code was unknown).
            continue

        # The FM model has its own train/test frames.
        if mo == "FM":
            fit_df, eval_df = train_FM_df, test_FM_df
        else:
            fit_df, eval_df = train_df, test_df

        # Cross validation
        cval = CrossValidator(estimator=model, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=4)
        cvModel = cval.fit(fit_df)

        model_evaluation(cvModel, eval_df, "features", "label")
예제 #26
0
def selectRegressionMethod(regressionMethodName, featureName):
    """Build the estimator named by ``regressionMethodName``.

    :param regressionMethodName: "rf" (random forest), "gbt" (gradient-boosted
        trees) or "glr" (Poisson/log generalized linear regression).
    :param featureName: name of the features column the estimator reads.
    :return: ``(estimator, modelParameters)`` where ``modelParameters`` is the
        dict of keyword arguments the estimator was built with, or an empty
        tuple (after printing a message) for an unknown name.

    The random forest uses 1 tree when the module-level ``test`` flag equals
    True and 100 trees otherwise.
    """
    if regressionMethodName == "rf":
        if test == True:
            nt = 1
        else:
            nt = 100
        modelParameters = {
            'featuresCol': featureName,
            'numTrees': nt,
            'subsamplingRate': 1,
            'maxDepth': 10
        }
        regressionMethod = RandomForestRegressor(**modelParameters)

    elif regressionMethodName == "gbt":
        modelParameters = {'featuresCol': featureName, 'maxIter': 10}
        regressionMethod = GBTRegressor(**modelParameters)

    elif regressionMethodName == "glr":
        modelParameters = {
            'featuresCol': featureName,
            'family': "poisson",
            'link': 'log',
            'maxIter': 10,
            'regParam': 0.3
        }
        # Passing the whole dict also forwards featuresCol, which this branch
        # used to drop (the estimator then silently read its default column
        # instead of featureName).
        regressionMethod = GeneralizedLinearRegression(**modelParameters)

    else:
        print('Invalid regression method')
        return ()

    return (regressionMethod, modelParameters)
def bestGeneralizedLR(trainDf, metricDF, metricToCompare):
    """Fit a grid of generalized linear regressions and pick one of them.

    One model is fitted on ``trainDf`` for every combination of
    regularisation strength, family, link allowed for that family, and
    tolerance (``maxIter`` is fixed at 10). The fitted models are handed to
    ``getBestModel`` (defined elsewhere in this file) together with
    ``metricDF`` and ``metricToCompare``; its result is returned unchanged.
    """
    reg_strengths = (1.0, 0.6, 0.2)
    tolerances = (1.0, 0.6, 0.2, 0.0)
    # Families in evaluation order, each paired with the links tried for it.
    links_by_family = (
        ("poisson", ("identity", "sqrt", "log")),
        ("gaussian", ("identity",)),
    )

    fitted_models = [
        GeneralizedLinearRegression(maxIter=10,
                                    regParam=reg_strength,
                                    family=family_name,
                                    link=link_name,
                                    tol=tolerance).fit(trainDf)
        for reg_strength in reg_strengths
        for family_name, family_links in links_by_family
        for link_name in family_links
        for tolerance in tolerances
    ]

    return getBestModel(fitted_models, metricDF, metricToCompare)
예제 #28
0
def generalized_linear_regression():
    """Fit a Gaussian/identity GLR on a four-row toy DataFrame and round-trip it.

    The function builds (or reuses) a SparkSession, fits the model, evaluates
    a few sanity expressions on the result, then saves and reloads both the
    estimator and the fitted model under the current directory. The
    comparison expressions are evaluated but their results are discarded;
    the trailing comments record the expected value. Returns ``None``.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])
    # linkPredictionCol has to be set: the `.p` lookup further down reads that
    # column from the transformed rows and fails when it does not exist.
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        linkPredictionCol="p",
    )
    model = glr.fit(df)
    transformed = model.transform(df)
    abs(transformed.head().prediction - 1.5) < 0.001
    # expected: True
    abs(transformed.head().p - 1.5) < 0.001
    # expected: True
    model.coefficients
    model.numFeatures
    # expected: 2
    abs(model.intercept - 1.5) < 0.001
    # expected: True
    temp_path = "./"
    glr_path = temp_path + "/glr"
    # A plain save() raises when the target path already exists, which makes
    # every run after the first one fail; overwrite keeps the demo re-runnable.
    glr.write().overwrite().save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    glr.getFamily() == glr2.getFamily()
    # expected: True
    model_path = temp_path + "/glr_model"
    model.write().overwrite().save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    model.intercept == model2.intercept
    # expected: True
    model.coefficients[0] == model2.coefficients[0]
예제 #29
0
# COMMAND ----------

# 70/30 split with a fixed seed, then print the row count of each part.
# NOTE(review): `dataset` is used here before this snippet loads it below;
# presumably an earlier notebook cell defines it -- confirm.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)
for part in (trainingData, testData):
    print(part.count())

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Read the sample regression data in libsvm format.
libsvm_reader = spark.read.format("libsvm")
dataset = libsvm_reader.load("data/mllib/sample_linear_regression_data.txt")

# Gaussian family with an identity link.
glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)

# Train on the freshly loaded dataset.
model = glr.fit(dataset)

# Report the fitted coefficients and intercept.
for heading, attr in (
    ("Coefficients: ", "coefficients"),
    ("Intercept: ", "intercept"),
):
    print(heading + str(getattr(model, attr)))

# Report a few statistics from the training summary, one at a time.
summary = model.summary
for heading, attr in (
    ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
    ("T Values: ", "tValues"),
    ("P Values: ", "pValues"),
    ("Dispersion: ", "dispersion"),
    ("Null Deviance: ", "nullDeviance"),
):
    print(heading + str(getattr(summary, attr)))
예제 #30
0
],
                                  outputCol='features')
# Assemble the feature vector column and preview the result.
v_data = vectorAssembler.transform(data)
v_data.show(10)

# Keep the feature vector and the `medv` label, then split into a training
# set and a test set (weights 0.7 / 0.3).
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
splits = vdata.randomSplit([0.7, 0.3])
train_data, test_data = splits

# Train a Gaussian/identity GLR that predicts `medv` from `features`.
glr = GeneralizedLinearRegression(
    featuresCol='features',
    labelCol='medv',
    family="gaussian",
    link="identity",
    regParam=0.3,
    maxIter=1000,
)
GlModel = glr.fit(train_data)

# Report the fitted coefficients and intercept.
for heading, attr in (
    ("Coefficients: ", "coefficients"),
    ("Intercept: ", "intercept"),
):
    print(heading + str(getattr(GlModel, attr)))

# Report a few statistics from the training summary, one at a time.
summary = GlModel.summary
for heading, attr in (
    ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
    ("Null Deviance: ", "nullDeviance"),
    ("Residual Degree Of Freedom Null: ", "residualDegreeOfFreedomNull"),
):
    print(heading + str(getattr(summary, attr)))
# Assemble the feature columns of the test set and keep only id + features.
assembler = VectorAssembler(inputCols=featureNames, outputCol="features")
test_df = assembler.transform(test_df).select("id", "features")

print("test vector assembled")
test_df.show(5)

# Split `train_df` into train and test sets (30% held out for testing).
# `seed` is imported outside this snippet.
seed(0)
trainingData, testData = train_df.randomSplit([0.7, 0.3])

# ## Logistic Regression
# Logistic regression expressed as a binomial GLR with a logit link.
glr = GeneralizedLinearRegression(
    labelCol="is_duplicate",
    featuresCol="features",
    family="binomial",
    link="logit",
)
trainLogitModel = glr.fit(trainingData)

# Score the held-out rows with the fitted model.
LogitPredictions = trainLogitModel.transform(testData)

# Area under the ROC curve, computed from the `prediction` column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="prediction",
    labelCol="is_duplicate",
)
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees
#Fit decision tree model
# $example on$
from pyspark.ml.regression import GeneralizedLinearRegression
# $example off$

if __name__ == "__main__":
    # Script entry point: fit a Gaussian GLR on the bundled sample data and
    # print the coefficients plus a few training-summary statistics.
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    dataset = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")

    glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

    # Fit the model
    model = glr.fit(dataset)

    # Print the coefficients and intercept for generalized linear regression model
    print("Coefficients: " + str(model.coefficients))
    print("Intercept: " + str(model.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model.summary
    print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
    print("T Values: " + str(summary.tValues))
    print("P Values: " + str(summary.pValues))
    print("Dispersion: " + str(summary.dispersion))
    print("Null Deviance: " + str(summary.nullDeviance))
    # $example off$

    # Shut the session down explicitly once the example has finished.
    spark.stop()
예제 #33
0
# Random splitting: weights 0.8 for training, 0.2 for testing
training, testing = modelprep2.randomSplit([0.8, 0.2])

#modelprep2.count()
#training.count()
#testing.count()


#######################################################################################
#
#   Modeling - GLM (Regression)
#
#######################################################################################

def _report(caption, value):
    """Print ``caption`` followed by ``value`` and an extra trailing newline."""
    print(caption + str(value) + "\n")


glm = GeneralizedLinearRegression(
    labelCol="label",
    featuresCol="features",
    regParam=0.3,
    maxIter=10,
)
glmmodel = glm.fit(training)

summary = glmmodel.summary

# Show the feature names, coefficients, intercept and p-values.
# `features` is defined outside this snippet.
_report("\nFeatures: ", features)
_report("\nCoefficients: ", glmmodel.coefficients)
_report("\nIntercept: ", glmmodel.intercept)
_report("\nPValues: ", summary.pValues)

# Further training-summary metrics, currently disabled:
#print("\nCoefficient Standard Errors: " + str(summary.coefficientStandardErrors))
#print("T Values: " + str(summary.tValues))
#print("P Values: " + str(summary.pValues))
#print("Dispersion: " + str(summary.dispersion))
# COMMAND ----------

# Inspect the training summary of `lrModel` (fitted outside this snippet).
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

# Gaussian/identity GLR that also writes the link prediction to "linkOut".
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

# Decision tree regressor with its default parameters.
from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------