def test_with_life_goal(self):
        source_data = [("jose", 1), ("li", 2)]
        source_df = get_spark().createDataFrame(source_data, ["name", "age"])

        actual_df = with_life_goal(source_df)

        expected_data = [("jose", 1, "escape!"), ("li", 2, "escape!")]
        expected_df = get_spark().createDataFrame(expected_data,
                                                  ["name", "age", "life_goal"])

        assert (expected_df.collect() == actual_df.collect())
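# Hedged sketch of the function under test: judging from the expected rows
# above, with_life_goal presumably just appends a constant "life_goal" column.
from pyspark.sql.functions import lit

def with_life_goal(df):
    return df.withColumn("life_goal", lit("escape!"))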
Example #2
def print_summary(jmodel):
    '''Print the train and validation objective history for a trained model.
        Args:
            jmodel (ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel)
    '''
    # get spark and logger
    spark = get_spark(app_name="pyspark-xgb")
    logger = get_logger(spark, "app")

    train_summary = jmodel.summary().trainObjectiveHistory()
    valid_summary = jmodel.summary().validationObjectiveHistory()
    dataset_summary = [train_summary]
    dataset_name = ['train']
    for idx in range(valid_summary.size()):
        eval_name = valid_summary.apply(idx)._1()
        eval_summary = valid_summary.apply(idx)._2()
        dataset_name.append(eval_name)
        dataset_summary.append(eval_summary)

    stop_flg = False
    for round_idx, row in enumerate(zip(*dataset_summary), 1):
        printString = "{:6} ".format('[{}]'.format(round_idx))
        for idx, r in enumerate(row):
            if r == 0:
                stop_flg = True
                break
            printString += "{:5}\t{:10}\t".format(dataset_name[idx], r)

        if stop_flg:
            break
        logger.info(printString)
Example #3
def read_table(table_name):
    from spark import get_spark
    spark_session = get_spark()

    import pandas as pd
    result = spark_session.sql("SELECT * FROM %s" % table_name)
    pandas_df = result.toPandas()
    json_result = pandas_df.to_json(orient='split')
    return json_result
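# Usage sketch (the table name is hypothetical): pandas' orient='split'
# serializes the frame as {"columns": [...], "index": [...], "data": [...]}.
json_result = read_table("employees")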
Example #4
def count_length(string):
    from spark import get_spark
    spark_session = get_spark()

    sc = spark_session.sparkContext
    list_string = list(string)
    rdd = sc.parallelize(list_string)
    count = rdd.count()
    return count
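# Usage sketch: list(string) turns the string into one RDD element per
# character, so the returned count equals len(string).
assert count_length("hello") == 5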
Example #5
def spark_query():
    if request.method == 'POST':
        query = str(request.form["query_text"])

        from spark import get_spark
        spark_session = get_spark()
        sc = spark_session.sparkContext
        df = spark_session.sql(query)
        pandas_df = df.toPandas()

        return render_template('index.html', text=pandas_df.to_html())
Example #6
def spark_process():
    if request.method == 'POST':
        from spark import get_spark
        spark_session = get_spark()
        sc = spark_session.sparkContext
        array = session.get('list_tables', None)

        rdd = sc.parallelize(array)
        count = str(rdd.count())

        return render_template(
            'index.html',
            text="This is calculated using Spark by counting the number of tables "
                 "returned by the 'Get Table' feature.\nResult is: %s" % count)
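# Hedged sketch: the view functions above use Flask's request/session/
# render_template, so they are presumably registered on a Flask app elsewhere
# (the decorators are not shown). The route paths below are assumptions.
from flask import Flask, render_template, request, session

app = Flask(__name__)
app.add_url_rule('/query', 'spark_query', spark_query, methods=['GET', 'POST'])
app.add_url_rule('/process', 'spark_process', spark_process, methods=['GET', 'POST'])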
Example #7
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        df = spark.read.csv(DATASET_PATH + "/iris.data", get_mtrain_schema())

        # preprocess
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in df.columns if c != "class"]
        assembler = VectorAssembler(inputCols=features, outputCol='features')
        strIdxer = StringIndexer(inputCol="class", outputCol=LABEL)
        pipeline = Pipeline(stages=[assembler, strIdxer])
        df = pipeline.fit(df).transform(df).select(FEATURES, LABEL)
        train, test = df.randomSplit([0.8, 0.2])

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "num_class": 3, "eval_metric": "mlogloss",
            "min_child_weight": 1, "train_test_ratio": 0.8,
            "objective": "multi:softprob"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss('label', 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, removing old model')
        jmodel.save(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
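# Hedged sketch of udf_logloss, which is used above but not shown in these
# examples: the assumption is that it builds a PySpark UDF computing the
# per-row log loss from the label and the probability vector produced by the
# model, so that averaging the column yields the overall logloss.
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def udf_logloss(label_col, prob_col, n_class=2, eps=1e-15):
    def _logloss(label, prob):
        # prob is the per-class probability vector; clip to avoid log(0)
        p = min(max(float(prob[int(label)]), eps), 1.0 - eps)
        return -math.log(p)
    # n_class is kept only for signature compatibility with the calls above
    return udf(_logloss, DoubleType())(label_col, prob_col)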
Example #8
from pyspark.sql import SparkSession
from spark import get_spark

if __name__ == "__main__":

    spark = get_spark()

    data = [('Alice', 30)]
    ar = spark.createDataFrame(data, ['name', 'age'])

    ar.show()

    spark.stop()
Example #9
def terminate_spark():
    from spark import get_spark
    spark_session = get_spark()
    spark_session.stop()
Example #10
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.csv(DATASET_PATH + "/iris_train.csv",
                    get_mtrain_schema(),
                    header=True)
        test = spark.read.csv(DATASET_PATH + "/iris_test.csv",
                    get_mtrain_schema(),
                    header=True)

        # preprocess
        # use the label encoding already present in the CSV, since StringIndexer may assign different indices
        STR_LABEL = 'class'
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in train.columns if c not in [STR_LABEL, LABEL]]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        pipeline = Pipeline(stages=[assembler])
        preprocess = pipeline.fit(train)
        train = preprocess.transform(train).select(FEATURES, LABEL)
        test = preprocess.transform(test).select(FEATURES, LABEL)

        # set param map
        xgb_params = {
            "eta": 0.1, "eval_metric": "mlogloss",
            "gamma": 0, "max_depth": 5, "min_child_weight": 1.0,
            "objective": "multi:softprob", "seed": 0,
            "num_class": N_CLASS,
            # xgboost4j only
            "num_round": 100, "num_early_stopping_rounds": 10,
            "maximize_evaluation_metrics": False,
            "num_workers": 1, "use_external_memory": False,
            "missing": np.nan,
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)

        # set evaluation set
        eval_set = {'eval': test._jdf}
        scala_eval_set = spark._jvm.PythonUtils.toScalaMap(eval_set)

        logger.info('training')
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL) \
            .setEvalSets(scala_eval_set)
        jmodel = j.fit(train._jdf)
        print_summary(jmodel)

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('[xgboost4j] valid logloss: {}'.format(slogloss))

        # save model - using native booster for single node library to read
        model_path = MODEL_PATH + '/model.bin'
        logger.info('save model to {}'.format(model_path))
        jbooster = jmodel.nativeBooster()
        jbooster.saveModel(model_path)

        # get feature score
        imp_type = "gain"
        feature_map_path = MODEL_PATH + '/feature.map'
        create_feature_map(feature_map_path, features)
        jfeatureMap = jbooster.getScore(feature_map_path, imp_type)
        f_imp = dict()
        for feature in features:
            if not jfeatureMap.get(feature).isEmpty():
                f_imp[feature] = jfeatureMap.get(feature).get()
        feature_imp_path = MODEL_PATH + '/feature.imp'
        create_feature_imp(feature_imp_path, f_imp)

        # [Optional] load model training by xgboost, predict and get validation metric
        local_model_path = LOCAL_MODEL_PATH + '/model.bin'
        if os.path.exists(local_model_path):
            logger.info('load model from {}'.format(local_model_path))
            scala_xgb = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.XGBoost
            jbooster = scala_xgb.loadModel(local_model_path)

            # uid, num_class, booster
            xgb_cls_model = JavaWrapper._new_java_obj(
                "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel",
                "xgbc", N_CLASS, jbooster)

            jpred = xgb_cls_model.transform(test._jdf)
            pred = DataFrame(jpred, spark)
            slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
                .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
            logger.info('[xgboost] valid logloss: {}'.format(slogloss))
        else:
            logger.info(
                "local model does not exist; run python_xgb/train_multi.py to get the model "
                "and compare logloss between xgboost and xgboost4j"
            )

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
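# Hedged sketch of the helper functions referenced above (create_feature_map
# and create_feature_imp are not shown in these examples). XGBoost's getScore
# expects a feature map file with one "<index> <name> <type>" line per feature,
# where 'q' marks a quantitative feature; the importance file format below is
# an assumption.
def create_feature_map(path, features):
    with open(path, 'w') as f:
        for i, name in enumerate(features):
            f.write('{0} {1} q\n'.format(i, name))

def create_feature_imp(path, feature_imp):
    with open(path, 'w') as f:
        for name, score in sorted(feature_imp.items(), key=lambda kv: kv[1], reverse=True):
            f.write('{0}\t{1}\n'.format(name, score))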
Example #11
def spark():
    sc = get_spark()
    sc.setLogLevel("WARN")
    yield sc
    sc.stop()
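# Hedged usage sketch: assuming the generator above is registered as a pytest
# fixture (the decorator is not shown) and that get_spark() returns a
# SparkSession as in the other examples, a test receives it by argument name.
def test_spark_session(spark):
    df = spark.createDataFrame([("a", 1), ("b", 2)], ["key", "value"])
    assert df.count() == 2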
Example #12
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_train.csv')
        test = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_test.csv')

        # preprocess
        LABEL = 'Attrition'
        FEATURES = 'features'
        features = [c for c in train.columns if c != LABEL]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        train = assembler.transform(train).select(FEATURES, LABEL)
        test = assembler.transform(test).select(FEATURES, LABEL)

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "eval_metric": "logloss", "min_child_weight": 1, "train_test_ratio": 0.8,
            "objective": "binary:logistic"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability')) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, removing old model')
        jw = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel.XGBoostClassificationModelWriter", jmodel)
        jw.saveImpl(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
Example #13
    ## Keep only rows whose coordinates are valid float values ==> this step may be
    ## skipped if coordinates are not needed for the training steps; in that case df_1 is sufficient.
    #df_2 = df_1[df_1.coordinates.apply(lambda x: type(x['latitude']) in [int, np.int64, float, np.float64])]
    #df_2 = df_1[df_1.coordinates.apply(lambda x: type(x['longitude']) in [int, np.int64, float, np.float64])]

    return df_1


if __name__ == "__main__":

    #spark = SparkSession \
    #    .builder \
    #    .appName("Python Spark SQL data source example") \
    #    .getOrCreate()
    spark = spark.get_spark()

    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print(
            "Usage: kmeans <input file> <mode(training or predicting)> <k> <output file> "
            "OR kmeans <input file> <mode(training or predicting)> <k>",
            file=sys.stderr)
        sys.exit(-1)

    mode = sys.argv[2]  # mode is 'training' or 'predicting'
    k = int(sys.argv[3])  # Number of cluster needed for clustering
    currentdate = datetime.now().strftime("%Y-%m-%d")

    ###### LOADING DATA #####
    # INPUT
    input_data_path = os.path.join(PATH, "datasets", "input", sys.argv[1])
    output_data_path = os.path.join(PATH, "datasets", "output")