Example #1
    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
                   pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
                   pyspark.ml.regression]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if not name.endswith('Model') and not name.endswith('Params') \
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                    # NOTE: disable check_params_exist until there is parity with Scala API
                    check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
Example #2
    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature, pyspark.ml.classification,
            pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline,
            pyspark.ml.recommendation, pyspark.ml.regression
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if not name.endswith('Model') and not name.endswith('Params') \
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                    # NOTE: disable check_params_exist until there is parity with Scala API
                    check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self,
                     CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self,
                     StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
Example #3
def process(rdd):

    spark = getSparkSessionInstance(rdd.context.getConf())

    dota = rdd.map(lambda x: x[1])
    featuresdata = dota.map(lambda x: x.split(':')[2])
    actualdata = featuresdata.map(lambda x: x.split(','))
    rowRdd = actualdata.map(lambda x: Row(sl=float(x[0][1:]), sw=float(x[1]), pl=float(x[2]), pw=float(x[3]), stringlabel=x[4][:-4]))
    features = spark.createDataFrame(rowRdd)
    features.show()
    rowRdd = actualdata.map(lambda x: Row(sl=float(x[0]), sw=float(x[1]), pl=float(x[2]), pw=float(x[3]), stringlabel=x[4]))
    
    # Load the fitted pipeline (indexer -> assembler -> logistic regression)
    pipe = PipelineModel.load('gs://suryasuresh/lab8output')

    result = pipe.transform(features)

    f1score = MulticlassClassificationEvaluator(metricName='f1')
    precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
    recall = MulticlassClassificationEvaluator(metricName='weightedRecall')
    accuracy = MulticlassClassificationEvaluator(metricName='accuracy')

    result.show()
    print("Accuracy:\t", accuracy.evaluate(result),
          "\nF1score:\t", f1score.evaluate(result),
          "\nWeighted Recall:\t", recall.evaluate(result),
          "\nWeighted Precision:\t", precision.evaluate(result))
Example #4
    def test_string_indexer_from_labels(self):
        model = StringIndexerModel.from_labels(
            ["a", "b", "c"], inputCol="label", outputCol="indexed", handleInvalid="keep"
        )
        self.assertEqual(model.labels, ["a", "b", "c"])
        self.assertEqual(model.labelsArray, [("a", "b", "c")])

        df1 = self.spark.createDataFrame(
            [(0, "a"), (1, "c"), (2, None), (3, "b"), (4, "b")], ["id", "label"]
        )

        result1 = model.transform(df1)
        actual1 = result1.select("id", "indexed").collect()
        expected1 = [
            Row(id=0, indexed=0.0),
            Row(id=1, indexed=2.0),
            Row(id=2, indexed=3.0),
            Row(id=3, indexed=1.0),
            Row(id=4, indexed=1.0),
        ]
        self.assertEqual(actual1, expected1)

        model_empty_labels = StringIndexerModel.from_labels(
            [], inputCol="label", outputCol="indexed", handleInvalid="keep"
        )
        actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect()
        expected2 = [
            Row(id=0, indexed=0.0),
            Row(id=1, indexed=0.0),
            Row(id=2, indexed=0.0),
            Row(id=3, indexed=0.0),
            Row(id=4, indexed=0.0),
        ]
        self.assertEqual(actual2, expected2)

        # Test model with default settings can transform
        model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label")
        df2 = self.spark.createDataFrame(
            [(0, "a"), (1, "c"), (2, "b"), (3, "b"), (4, "b")], ["id", "label"]
        )
        transformed_list = (
            model_default.transform(df2)
            .select(model_default.getOrDefault(model_default.outputCol))
            .collect()
        )
        self.assertEqual(len(transformed_list), 5)
Example #5
    def get_top_ratings(self, user_id, count):
        """Retrun top <count> bussiness
           Calls 
        """
        start_time = time.time()
        #bid = self.reviewDF.select('business_id_num','business_id').distinct().cache()
        businessDF_vegas_food_save = os.path.join(
            'dataset', 'businessDF_vegas_food.parquet')
        businessDF_vegas_food = self.ss.read.parquet(
            businessDF_vegas_food_save)

        #bid.show(20)
        logger.error('{} seconds has elapsed. {} entries remained'.format(
            time.time() - start_time, businessDF_vegas_food.count()))
        #predDF = bid.filter(bid['user_id'] == user_id)
        #build user request using input id
        logger.error(
            '{} seconds has elapsed before loading building predDF'.format(
                time.time() - start_time))
        bid = businessDF_vegas_food.select('business_id', 'latitude',
                                           'longitude')
        indexer_business_save = os.path.join('model', 'bus_ind_model')
        indexer_business_model = StringIndexerModel.load(indexer_business_save)
        bid = indexer_business_model.transform(bid)
        predDF = bid.withColumn("user_id", lit(user_id)).cache()

        logger.error('{} seconds has elapsed before loading indexer'.format(
            time.time() - start_time))
        indexer_user_model = StringIndexerModel.load(self.indexer_user_save)
        predDF = indexer_user_model.transform(predDF)
        '''user_id_converter =  IndexToString(inputCol= 'user_id',outputCol='user_id')
        convert_df = '''
        #predDF.show(10)
        logger.error('{} seconds has elapsed before model'.format(time.time() -
                                                                  start_time))
        model = ALSModel.load(self.model_save)
        prediction_user = model.transform(predDF)
        #prediction_user.show(20)
        ratings = prediction_user.sort(desc('prediction')).limit(count).select(
            'business_id', 'prediction', 'latitude', 'longitude')
        #ratings.show(20)
        #ratings.printSchema()
        logger.error('{} seconds has elapsed'.format(time.time() - start_time))
        return ratings.toPandas().to_json(orient='records')
Example #6
def main(spark, user_indexer_model, item_indexer_model, test_file, save_test):
    '''
    Parameters
    ----------
    spark : SparkSession object
    user_indexer_model : string, path to the saved user StringIndexerModel
    item_indexer_model : string, path to the saved item StringIndexerModel
    test_file : string, path to the parquet test file to load
    save_test : string, path to write the grouped relevant documents
    '''

    # Load the parquet file
    test = spark.read.parquet(test_file)
    user_index = StringIndexerModel.load(user_indexer_model)
    item_index = StringIndexerModel.load(item_indexer_model)
    test = user_index.transform(test)
    test = item_index.transform(test)
    test = test.sort('count', ascending = False)
    
    relevant_docs = test.groupBy('user').agg(F.collect_list('item').alias('item'))
    relevant_docs.write.parquet(save_test)
Example #7
def modelPredicting(testSetWoeDF, fn):
    # Pre-transform the data to match the input format required by the ML LogisticRegression model
    strInd = StringIndexerModel.load(savePath +
                                     '{}/{}/strInd'.format(curDate, fn))
    lrModel = LogisticRegressionModel.load(savePath +
                                           '{}/{}/lrModel'.format(curDate, fn))
    testSetVecAse = vecAseembler.transform(testSetWoeDF)
    testSetVecAseStrInd = strInd.transform(testSetVecAse)
    testSetWithProba = lrModel.transform(testSetVecAseStrInd)
    return (testSetWithProba)
Example #8
def transform_spark(data, input, transformed_column_name):
    from pyspark.ml.feature import StringIndexerModel
    import pyspark.sql.functions as F

    indexer = StringIndexerModel.from_labels(input["indexes"]["index"],
                                             inputCol=input["col"],
                                             outputCol=transformed_column_name)

    return indexer.transform(data).withColumn(
        transformed_column_name,
        F.col(transformed_column_name).cast("int"))
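# A hypothetical way to exercise the transformer above locally; the column name,
# label list, and output column below are illustrative assumptions, not part of
# the original snippet.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("red",), ("blue",), ("red",)], ["color"])

# "input" mirrors the structure transform_spark expects: the label list plus the source column
out = transform_spark(df, {"indexes": {"index": ["red", "blue"]}, "col": "color"},
                      "color_indexed")
out.show()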
Example #9
def transform_spark(data, features, args, transformed_feature):
    from pyspark.ml.feature import StringIndexerModel
    import pyspark.sql.functions as F

    indexer = StringIndexerModel.from_labels(args["index"],
                                             inputCol=features["text"],
                                             outputCol=transformed_feature)

    return indexer.transform(data).withColumn(
        transformed_feature,
        F.col(transformed_feature).cast("int"))
Example #10
    def indexToString(infoData):
        stringIndexerPath = infoData.get(mc.INDEXERPATH)
        inverterColm = infoData.get(mc.COLMTOINVERT)
        dataset = infoData.get(mc.DATASET)
        stringIndexer = StringIndexerModel.load(stringIndexerPath)
        inverter = IndexToString(inputCol=inverterColm, outputCol=mc.DMXINVERTEDCOLM,
                                 labels=stringIndexer.labels)
        dataset = inverter.transform(dataset)

        #drop the indexed colm and rename the new unindexed colm with the actual one
        dataset = dataset.drop(inverterColm)
        dataset = dataset.withColumnRenamed(mc.DMXINVERTEDCOLM, inverterColm)
        return dataset
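# The method above inverts an indexed column using the labels stored on a saved
# StringIndexerModel. A minimal, self-contained sketch of the same round trip,
# with made-up data and column names, looks like this:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "a")], ["id", "category"])

# Fit an indexer, index the column, then map the indexes back to the original labels
indexer_model = StringIndexer(inputCol="category", outputCol="category_index").fit(df)
indexed = indexer_model.transform(df)
inverted = IndexToString(inputCol="category_index", outputCol="category_str",
                         labels=indexer_model.labels).transform(indexed)
inverted.show()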
Example #11
    def load(self, path, spark_session=None):
        self.iforest_model = IForestModel.load(self._get_iforest_path(path))
        self.scaler_model = StandardScalerModel.load(
            self._get_scaler_path(path))

        file_manager = FileManager(path, spark_session)
        params = file_manager.load_from_file(self._get_params_path(path),
                                             format='json')
        self.set_params(**params)

        self.indexes = []
        for feature in self.categorical_features:
            self.indexes.append(
                StringIndexerModel.load(self._get_index_path(path, feature)))
Example #12
def main(stations_indexer_path, onehot_path, weather_indexer_path,
         model_data_folder, station_id, hour, month, dayofyear, visibility,
         air_temp, wind_speed, weather_class):

    #Starting session
    spark = SparkSession.builder.appName('BigDataML').getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    #Loading data
    columns = [
        "hour", 'dayofyear', 'month', 'air_temp', 'wind_speed', 'visibility',
        'weather_class', 'station'
    ]
    vals = [(hour, dayofyear, month, air_temp, wind_speed, visibility,
             weather_class, station_id)]
    data = spark.createDataFrame(vals, columns)

    model = PipelineModel.load(model_data_folder)

    stringIndexer = StringIndexerModel.load(stations_indexer_path)
    data = stringIndexer.transform(data)
    stringIndexer_weather = StringIndexerModel.load(weather_indexer_path)
    data = stringIndexer_weather.transform(data)

    encoder = OneHotEncoderModel.load(onehot_path)
    data = encoder.transform(data)

    features = VectorAssembler(
        inputCols=["hour", 'dayofyear', 'month', 'air_temp', 'wind_speed',
                   'visibility', 'weather_index', 'station_vector'],
        outputCol="features")
    test_data = features.transform(data)

    model = PipelineModel.load(model_data_folder)
    # Make predictions.
    predictions = model.transform(test_data).collect()

    print("Predicted count is: {}".format(int(predictions[0]['prediction'])))
Example #13
    def test_string_indexer_from_labels(self):
        model = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label",
                                               outputCol="indexed", handleInvalid="keep")
        self.assertEqual(model.labels, ["a", "b", "c"])

        df1 = self.spark.createDataFrame([
            (0, "a"),
            (1, "c"),
            (2, None),
            (3, "b"),
            (4, "b")], ["id", "label"])

        result1 = model.transform(df1)
        actual1 = result1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=2.0), Row(id=2, indexed=3.0),
                     Row(id=3, indexed=1.0), Row(id=4, indexed=1.0)]
        self.assertEqual(actual1, expected1)

        model_empty_labels = StringIndexerModel.from_labels(
            [], inputCol="label", outputCol="indexed", handleInvalid="keep")
        actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=0.0), Row(id=2, indexed=0.0),
                     Row(id=3, indexed=0.0), Row(id=4, indexed=0.0)]
        self.assertEqual(actual2, expected2)

        # Test model with default settings can transform
        model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label")
        df2 = self.spark.createDataFrame([
            (0, "a"),
            (1, "c"),
            (2, "b"),
            (3, "b"),
            (4, "b")], ["id", "label"])
        transformed_list = model_default.transform(df2) \
            .select(model_default.getOrDefault(model_default.outputCol)).collect()
        self.assertEqual(len(transformed_list), 5)
Example #14
def main():
    
    feature_model = VectorIndexerModel.load(featureIndexer_path)
    vectorAssembler = VectorAssembler.load(vectorAssembler_path)
    ohe_model = OneHotEncoderModel.load(ohe_model_path)
    stringIndexer_model = StringIndexerModel.load(stringIndexerPath)
    lr_model = LinearRegressionModel.load(model_path)
    
    spark = SparkSession.builder.master("local").appName("Connection").getOrCreate()
    
    json_data = request.get_json()
    
    # request.get_json() returns a dict, so read the fields with item access
    availability = json_data['availability']
    minimum_nights = json_data['minimum_nights']
    latitude = json_data['latitude']
    longitude = json_data['longitude']
    name = json_data['name']
    neighbourhood_group = json_data['neighbourhood_group']
    neighbourhood = json_data['neighbourhood']
    room_type = json_data['room_type']
    
    dept = [(name,neighbourhood_group,neighbourhood,room_type,latitude,longitude,0.0,minimum_nights,0.0,1.0,availability,0.0)]

    df = spark.createDataFrame(data=dept, schema = deptColumns)
    
    df = stringIndexer_model.transform(df)
    
    df = df.drop(*["neighbourhood_group", 'neighbourhood', 'room_type'])
    df = ohe_model.transform(df)
    df = df.drop(*["neighbourhood_group_int", 'neighbourhood_int', 'room_type_int'])

    df = df.withColumn("minimum_nights", when(df["minimum_nights_int"] > 30, 30).otherwise(df["minimum_nights_int"])).drop('minimum_nights_int')
    df = df.withColumn('name_length', length('name')).drop('name')

    df = vectorAssembler.transform(df)
    df = df.select(['features'])
    df = feature_model.transform(df)
    df = df.select(['features_vec'])

    lr_predictions = lr_model.transform(df)
    
    return jsonify(data=lr_predictions.collect()[-1].prediction)
        
Example #15
def multiHotEncoderExample(movieSamples):
    samplesWithGenres = movieSamples.select(
        "movieId", "title",
        explode(split(F.col("genres"),
                      "\\|").cast(ArrayType(StringType()))).alias("genre"))
    genreIndexer = StringIndexer(inputCol="genre", outputCol="genreIndex")
    genreIndexerModel = genreIndexer.fit(samplesWithGenres)
    genreIndexSamples = genreIndexerModel.transform(
        samplesWithGenres).withColumn("genreIndexInt",
                                      F.col("genreIndex").cast(IntegerType()))

    indexSize = genreIndexSamples.agg(F.max(
        F.col("genreIndexInt"))).head()[0] + 1
    processedSamples = genreIndexSamples.groupBy("movieId").agg(
        F.collect_list("genreIndexInt").alias('genreIndexes')).withColumn(
            "IndexSize", F.lit(indexSize))
    finalSample = processedSamples.withColumn(
        "vector",
        udf(array2vec, VectorUDT())(F.col("genreIndexes"), F.col("indexSize")))

    finalSample.printSchema()
    finalSample.show(10)
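# array2vec is referenced above but not defined in this snippet. A plausible
# implementation, consistent with how it is called (a list of genre indexes plus
# the total index size in, a Spark vector out), is sketched here as an assumption:
from pyspark.ml.linalg import Vectors

def array2vec(genreIndexes, indexSize):
    # Multi-hot encode: a sparse vector with 1.0 at every genre index
    genreIndexes.sort()
    fill_list = [1.0 for _ in range(len(genreIndexes))]
    return Vectors.sparse(indexSize, genreIndexes, fill_list)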
Example #16
    def test_java_params(self):
        import re

        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature,
            pyspark.ml.classification,
            pyspark.ml.clustering,
            pyspark.ml.evaluation,
            pyspark.ml.pipeline,
            pyspark.ml.recommendation,
            pyspark.ml.regression,
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if (not name.endswith("Model") and not name.endswith("Params")
                        and issubclass(cls, JavaParams)
                        and not inspect.isabstract(cls)
                        and not re.match("_?Java", name) and name != "_LSH"
                        and name != "_Selector"):
                    check_params(self, cls(), check_params_exist=True)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel

        check_params(self,
                     CountVectorizerModel.from_vocabulary(["a"], "input"),
                     check_params_exist=True)
        check_params(self,
                     StringIndexerModel.from_labels(["a", "b"], "input"),
                     check_params_exist=True)
Example #17
def preprocess_renewal_model_scoring_data(pargs, params):
    """
    Function to pre-process raw scoring data for renewal model
    """

    # Load parameters
    score_filter_flag = configs['score_filter_flag']
    score_filter_condition = configs['score_filter_condition']
    score_sampling_flag = configs['score_sampling_flag']
    score_sampling_fraction = configs['score_sampling_fraction']
    primary_key_columns = configs['primary_key_columns']
    fillna_non_categorical_value = configs['fillna_non_categorical_value']
    fillna_categorical_value = configs['fillna_categorical_value']
    target_variable = configs['target_variable']
    seed = configs['seed']
    s3_bucket = configs['s3_bucket']

    # Load raw scoring data
    score_raw = spark.read.parquet(
        data_paths['yr1_renewal_scoring_raw'].format(run_mode=run['run_mode'],
                                                     run_id=run['run_id']))

    # Load feature config saved in the pre-processing step
    feature_config = util.load_yml_file_from_s3(
        s3_bucket,
        data_paths['renewal_feature_config'].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id'])[12:])
    feature_list = feature_config['feature_list']
    feature_list_indexed = feature_config['feature_list_indexed']
    categorical_columns = feature_config['categorical_columns']
    non_categorical_columns = feature_config['non_categorical_columns']

    # Load string indexers
    string_indexers = []
    for i in range(len(categorical_columns)):
        string_indexers.append(
            StringIndexerModel.load(
                data_paths['renewal_string_indexer'].format(
                    run_mode=run['run_mode'], run_id=run['run_id'], i=i)))

    # Select only the features used in model training
    score_raw = score_raw.select(primary_key_columns + feature_list)

    # Filter scoring data
    if score_filter_flag:
        score_raw = score_raw.filter(score_filter_condition)

    # Sample scoring data
    if score_sampling_flag:
        score_raw = model.sampling(score_raw, score_sampling_fraction, seed)

    # Ensure that all "n_" cols are indeed numeric
    score_raw = model.ensureColsAreNumeric(score_raw, non_categorical_columns)

    # Ensure that all "i_" cols are indeed string
    score_raw = model.ensureColsAreString(score_raw, categorical_columns)

    # Impute missing values
    score_raw = model.imputeMissing(score_raw, non_categorical_columns,
                                    categorical_columns,
                                    fillna_non_categorical_value,
                                    fillna_categorical_value)

    # Apply string indexer on string columns
    score_raw, string_indexers, categorical_columns_indexed = model.applyStringIndexer(
        score_raw, categorical_columns, string_indexers)

    # Assemble the final feature list
    score_raw = model.assembleFeaturesIntoVector(score_raw,
                                                 feature_list_indexed)

    # Save score data
    score_raw.write.parquet(data_paths['renewal_score'].format(
        run_mode=run['run_mode'], run_id=run['run_id']),
                            mode='overwrite')
Example #18
def read_parquet(parquet_path):
    parquet_df = spark.read.parquet(parquet_path)

    parquet_df = parquet_df.drop('id')
    parquet_df = parquet_df.drop('one_area_price')
    parquet_df = parquet_df.drop('agency_nameVec')
    parquet_df = parquet_df.drop('districtVec')
    parquet_df = parquet_df.drop('room_type')
    parquet_df.show(truncate=False)
    print('parquet_df.count()==========11', parquet_df.count(),
          parquet_df.columns)
    for i in parquet_df.columns:
        if ('Vec' not in i) & ('facilities_vectors' not in i):

            if parquet_df.filter(parquet_df[i].isNull()).count() > 0:

                parquet_df = parquet_df.na.fill(0, i)
            elif parquet_df.filter(parquet_df[i] == 'NULL').count() > 0:

                parquet_df = parquet_df.filter(parquet_df[i] != 'NULL')

            parquet_df = parquet_df.select(
                '*', parquet_df[i].cast('float').alias('tmp_name')).drop(i)
            parquet_df = parquet_df.withColumnRenamed('tmp_name', i)
        parquet_df = parquet_df.filter(parquet_df[i].isNotNull())
        print('parquet_df.count()==========22', i, parquet_df.count())

    columns = parquet_df.columns
    columns.remove('price')
    from pyspark.ml.feature import OneHotEncoder, StringIndexer, StringIndexerModel
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    model_path = "/user/limeng/ganji_daxing_save_models/"
    columns_list = []
    for i in columns:
        if i == 'facilities_vectors':
            loadedCountVectorizerModel = CountVectorizerModel.load(
                model_path + 'count-vectorizer-model')
            temp = loadedCountVectorizerModel.vocabulary
            columns_list.extend(temp)
        elif i == 'rent_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelrent_type')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'agency_nameVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelagency_name')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'directionVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldirection')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'zoneVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelzone')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'pay_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelpay_type')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'districtVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldistrict')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        else:
            columns_list.append(i)

    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    parquet_df = vecAssembler.transform(parquet_df).select('features', 'price')

    parquet_df = parquet_df.withColumnRenamed('price', 'label')

    return parquet_df, columns_list
Example #19
dataset3 = dataset2.select("conn.srcip", "conn.sport", "conn.dstip",
                           "conn.dsport", "conn.proto", "conn.dur",
                           "conn.sbytes", "conn.dbytes", "conn.service",
                           "conn.Spkts", "conn.Dpkts")
dataset3.printSchema()
print(type(dataset3))

# Fill null values with 0 (to prevent errors)
dataset3 = dataset3.fillna(0)

# Feature processing
string_indexer_models = {}
for column in ['proto', 'service', 'attack_cat']:
    string_indexer_model_path = "{}/data/str_indexer_extended/str_indexer_model_extended_{}.bin".format(
        base_path, column)
    string_indexer = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer

for column in ['proto', 'service', 'attack_cat']:
    string_indexer_model = string_indexer_models[column]
    dataset3 = string_indexer_model.transform(dataset3)

vector_assembler_path = "{}/data/numeric_vector_assembler_RFE.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_path)
finalDataset = vector_assembler.transform(dataset3)

# Load the machine learning model and apply it to the received data
model_path = "{}/data/RandomForest_extended.bin".format(base_path)
model = RandomForestClassificationModel.load(model_path)
predictions = model.transform(finalDataset)
Example #20
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in [
                "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
                "Dest", "Route"
        ]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
            "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
Example #21
def prepare_data_ml3(spark, jenkins_builds, sonar_issues, sonar_analyses,
                     spark_artefacts_dir, run_mode):

    # Change build result to only SUCCESS/FAIL for binary classification
    modify_result = udf(lambda x: "SUCCESS" if x == "SUCCESS" else "FAIL",
                        StringType())
    spark.udf.register("modify_result", modify_result)

    if jenkins_builds is not None:
        jenkins_builds = jenkins_builds.withColumn("result",
                                                   modify_result("result"))

    pipeline_path = Path(spark_artefacts_dir).joinpath("pipeline_3")
    label_idx_model_path = Path(spark_artefacts_dir).joinpath(
        "label_indexer_3")

    # Getting pipeline and label indexer models
    if run_mode == "first":

        pipeline_model = get_ml3_pipeline().fit(sonar_issues)
        pipeline_model.write().overwrite().save(str(pipeline_path.absolute()))

        label_idx_model = StringIndexer(
            inputCol="result", outputCol="label",
            handleInvalid="skip").fit(jenkins_builds)
        label_idx_model.write().overwrite().save(
            str(label_idx_model_path.absolute()))

    elif run_mode == "incremental":

        pipeline_model = PipelineModel.load(str(pipeline_path.absolute()))
        label_idx_model = StringIndexerModel.load(
            str(label_idx_model_path.absolute()))

    # Columns to return
    rules = pipeline_model.stages[0].labels
    columns = list(map(lambda x: "removed_" + x, rules)) + list(
        map(lambda x: "introduced_" + x, rules))

    # Preparing
    removed_rules_df = sonar_issues.filter(
        "status IN ('RESOLVED', 'CLOSED', 'REVIEWED')").select(
            "current_analysis_key", "rule")

    df1 = pipeline_model.transform(removed_rules_df)
    rdd1 = df1.rdd.map(lambda x : (x[0],x[3])).reduceByKey(lambda v1,v2: sum_sparse_vectors(v1,v2)) \
                                                .map(lambda x: Row(current_analysis_key = x[0], removed_rule_vec = x[1]))

    if rdd1.count() == 0:
        return None, columns
    removed_issues_rule_vec_df = spark.createDataFrame(rdd1)

    introduced_rules_df = sonar_issues.filter(
        "status IN ('OPEN', 'REOPENED', 'CONFIRMED', 'TO_REVIEW')").select(
            "creation_analysis_key", "rule")
    df2 = pipeline_model.transform(introduced_rules_df)
    rdd2 = df2.rdd.map(lambda x : (x[0],x[3])).reduceByKey(lambda v1,v2: sum_sparse_vectors(v1,v2)) \
                                                .map(lambda x: Row(creation_analysis_key = x[0], introduced_rule_vec = x[1]))

    if rdd2.count() == 0:
        return None, columns
    introduced_issues_rule_vec_df = spark.createDataFrame(rdd2)

    joined_sonar_rules_df = removed_issues_rule_vec_df.join(
        introduced_issues_rule_vec_df,
        removed_issues_rule_vec_df.current_analysis_key ==
        introduced_issues_rule_vec_df.creation_analysis_key,
        how="outer")

    joined_sonar_rules_df.createOrReplaceTempView("sonar_rules")
    joined_sonar_rules_df = spark.sql("""SELECT 
        coalesce(current_analysis_key, creation_analysis_key) AS analysis_key,
        introduced_rule_vec,
        removed_rule_vec
        FROM sonar_rules
    """)

    num_rules = len(pipeline_model.stages[0].labels)

    imputed_sonar_rules_rdd = joined_sonar_rules_df.rdd.map(
        lambda row: Row(analysis_key=row[0],
                        introduced_rule_vec=SparseVector(num_rules, {})
                        if row[1] is None else row[1],
                        removed_rule_vec=SparseVector(num_rules, {})
                        if row[2] is None else row[2]))

    imputed_sonar_rules_df = spark.createDataFrame(imputed_sonar_rules_rdd)

    v_assembler = VectorAssembler(
        inputCols=["removed_rule_vec", "introduced_rule_vec"],
        outputCol="features")
    sonar_issues_df = v_assembler.transform(imputed_sonar_rules_df).select(
        "analysis_key", "features")

    sonar_df = sonar_issues_df.join(
        sonar_analyses,
        sonar_issues_df.analysis_key == sonar_analyses.analysis_key,
        how="inner")
    df = sonar_df.join(jenkins_builds,
                       sonar_df.revision == jenkins_builds.revision_number,
                       how="inner").select("result", "features")
    ml_df = label_idx_model.transform(df).select("label", "features")

    return ml_df, columns
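# sum_sparse_vectors is used above but not shown. A plausible helper, consistent
# with its use here (element-wise addition of two SparseVectors of the same size),
# is sketched below as an assumption:
from pyspark.ml.linalg import SparseVector

def sum_sparse_vectors(v1, v2):
    # Merge the two index -> value maps, adding values on shared indices
    values = dict(zip(v1.indices.tolist(), v1.values.tolist()))
    for i, v in zip(v2.indices.tolist(), v2.values.tolist()):
        values[i] = values.get(i, 0.0) + v
    return SparseVector(v1.size, values)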
Example #22
    './cf_train_subset_idx_full.parquet', './cf_train_subset_idx.parquet',
    './cf_train_idx.parquet', './cf_train_subset.parquet',
    './cf_train_extra.parquet'
]
model_file = './sc2_final1'

u_idx_model_file = './sc2_final_u_indexer'
t_idx_model_file = './sc2_final_t_indexer'

print(datetime.now())

data_file = path + files[1]

data = spark.read.parquet(data_file)
data = data.sample(False, 0.001)
u_model = StringIndexerModel.load(u_idx_model_file)
t_model = StringIndexerModel.load(t_idx_model_file)

transformed_data = u_model.transform(data)
transformed_data = t_model.transform(transformed_data).select(
    'u_id', 't_id', 'count')

ratings = transformed_data.rdd.map(
    lambda l: Rating(l.u_id, l.t_id, l['count']))
rank = 10

model = ALS.trainImplicit(ratings, rank)

val_data_file = path + files[1]
val_data = spark.read.parquet(val_data_file)
val_data = val_data.sample(False, 0.001)
Example #23
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process prediction requests in streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categorical feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance, and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give back the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
Example #24
def main(spark, indexer_user, indexer_item, train_data_file, val_data_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    indexer_user : string, path to the saved user StringIndexerModel
    indexer_item : string, path to the saved item StringIndexerModel
    train_data_file : string, path to the training parquet file to load
    val_data_file : string, path to the validation parquet file to load
    '''

    # Load the parquet file
    train = spark.read.parquet(train_data_file)
    val = spark.read.parquet(val_data_file)

    user_index = StringIndexerModel.load(indexer_user)
    item_index = StringIndexerModel.load(indexer_item)

    train = user_index.transform(train)
    train = item_index.transform(train)
    #     train = train.sample(withReplacement = False, fraction = 0.8)
    rank = [10, 20, 30]  #default is 10
    regularization = [.01, .1, 1]  #default is 1
    alpha = [.5, 1, 10]  #default is 1

    rank_list = []
    reg_list = []
    alpha_list = []
    precisions = []

    for i in rank:
        for j in regularization:
            for k in alpha:
                als = ALS(userCol='user',
                          itemCol='item',
                          implicitPrefs=True,
                          ratingCol='count',
                          rank=i,
                          regParam=j,
                          alpha=k)
                model = als.fit(train)
                subset = val.select('user').distinct()
                predictions = model.recommendForUserSubset(subset, 50)
                predictions = predictions.select(
                    "user",
                    col("recommendations.item").alias("item")).sort('user')
                val = val.sort('user')
                predictionAndLabels = predictions.join(
                    val, ["user"],
                    "inner").rdd.map(lambda tup: (tup[1], tup[2]))
                metrics = RankingMetrics(predictionAndLabels)
                precision = metrics.meanAveragePrecision
                rank_list.append(i)
                reg_list.append(j)
                alpha_list.append(k)
                precisions.append(precision)
                print('rank: %f, reg: %f, alpha: %f' % (i, j, k))
                print(precision)

    print(rank_list)
    print(reg_list)
    print(alpha_list)
    print(precisions)
    print('Max MAP value: %f' % max(precisions))
    ind = np.argmax(precisions)
    print('Rank: %f' % rank_list[ind])
    print('Reg: %f' % reg_list[ind])
    print('Alpha: %f' % alpha_list[ind])
Example #25
	print('====> Start computation')
	dataset = spark.read.csv('/user/ronghui_safe/hgy/nid/datasets/{}_{}'.format(args.query_month, args.mode), header=True, inferSchema=True)
	dataset = dataset.withColumn('source', F.when(F.col('source') == '__HIVE_DEFAULT_PARTITION__', 'null').otherwise(F.col('source')))
	dataset = dataset.withColumn('source', F.when(F.col('source') == 'cm_mail', 'null').otherwise(F.col('source')))
	if args.mode != 'test':
		dataset = dataset.withColumn('duration', F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
		dataset = dataset.withColumn('duration', F.log(F.lit(1e-6))/F.col('duration'))
		dataset = dataset.withColumn('duration', F.exp(F.col('duration')))
	stringIndex_model = None
	if args.mode == 'train':
		stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
		stringIndex_model = stringIndexer.fit(dataset)
		stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	else:
		stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	dataset = stringIndex_model.transform(dataset)
	encoder_model = None
	if args.mode == 'train':
		encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
		encoder_model = encoder.fit(dataset)
		encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	else:
		encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	dataset = encoder_model.transform(dataset)
	feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
	assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
	dataset = assembler.transform(dataset)
	scaler_model = None
	if args.mode == 'train':
		scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
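# The example is cut off before the scaler is fit. For reference, a self-contained
# sketch of the same fit-or-load pattern applied to StandardScaler; the path and
# data below are hypothetical, not taken from the original job:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, StandardScalerModel

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])
df = VectorAssembler(inputCols=["a", "b"], outputCol="feature_vec").transform(df)

scaler_path = "/tmp/standardScaler_model_demo"  # hypothetical location
train_mode = True
if train_mode:
    scaler_model = StandardScaler(inputCol="feature_vec", outputCol="scaled_feature_vec",
                                  withStd=True, withMean=True).fit(df)
    scaler_model.write().overwrite().save(scaler_path)
else:
    scaler_model = StandardScalerModel.load(scaler_path)
dataset_scaled = scaler_model.transform(df)
dataset_scaled.show()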
Example #26
def main(base_path):

    spark = SparkSession.builder.config("spark.default.parallelism",
                                        1).appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #

    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", BROKERS) \
      .option("subscribe", PREDICTION_TOPIC) \
      .load()

    # Create a DataFrame out of the one-hot encoded RDD
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"),
                    schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'),
                 prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:
        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(
                as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            self.mongo_client.close()

            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
Example #27
def main(iso_date, base_path):

  APP_NAME = "make_predictions.py"
  
  # Set up the environment if a SparkSession does not already exist
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models used in the pipeline
  #
  
  # Load the arrival delay bucketizer model
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string indexers into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model
  
  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)
    
  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
      base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Run the requests through the transformations from training
  #
  
  # Get today and tomorrow's dates as ISO strings to scope the query
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()
  iso_today = rounded_today.isoformat()

  # Build the day's input path: a date based primary key directory structure
  today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
    base_path,
    iso_today
  )

  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  schema = StructType([
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Timestamp", TimestampType(), True),
  ])
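
  # A prediction request record matching this schema might look like the following
  # (illustrative values only, not taken from the snippet above):
  # {"Carrier": "AA", "DayOfMonth": 1, "DayOfWeek": 4, "DayOfYear": 1, "DepDelay": 5.0,
  #  "Dest": "SFO", "Distance": 2139.0, "FlightDate": "2016-01-01", "FlightNum": "1519",
  #  "Origin": "JFK", "Timestamp": "2016-01-01T00:00:00.000Z"}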
  
  prediction_requests = spark.read.json(today_input_path, schema=schema)
  prediction_requests.show()

  #
  # Add a Route variable to replace FlightNum
  #
  
  from pyspark.sql.functions import lit, concat
  prediction_requests_with_route = prediction_requests.withColumn(
    'Route',
    concat(
      prediction_requests.Origin,
      lit('-'),
      prediction_requests.Dest
    )
  )
  prediction_requests_with_route.show(6)
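  # Route is simply Origin and Dest joined with a hyphen, e.g. an illustrative
  # "JFK"/"SFO" pair becomes the Route value "JFK-SFO".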
  
  # Index string fields with the corresponding indexer for that column
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model = string_indexer_models[column]
    prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
      
  # Vectorize numeric columns: DepDelay and Distance
  final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
  
  # Drop the indexes for the nominal fields
  index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                   "DayOfYear_index", "Origin_index", "Dest_index",
                   "Route_index"]
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()
  
  # Make the prediction
  predictions = rfc.transform(final_vectorized_features)
  
  # Drop the features vector and prediction metadata to give the original fields
  predictions = predictions.drop("Features_vec")
  final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
  # Inspect the output
  final_predictions.show()
  
  # Build the day's output path: a date based primary key directory structure
  today_output_path = "{}/data/prediction_results_daily.json/{}".format(
    base_path,
    iso_today
  )
  
  # Save the output to its daily bucket
  final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
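
# A minimal, hypothetical command-line entry point for this job (the argument
# positions below are assumptions, not part of the snippet above):
if __name__ == "__main__":
  import sys
  main(sys.argv[1], sys.argv[2])  # iso_date, base_path
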
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )
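
  # Each element of the direct stream is a (key, message) tuple of strings from
  # Kafka, which is why json.loads is applied to x[1] below.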

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Index the string fields with the StringIndexerModel for each column; the
    # intermediate *_index columns are dropped after vector assembly
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Assemble the numeric columns (DepDelay, Distance) and the index columns into a single feature vector
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
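    # Note: saveToMongoDB is assumed to be provided by the pymongo_spark package,
    # which must be imported and activated (pymongo_spark.activate()) earlier in
    # the script for this RDD method to exist.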
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
示例#29
0
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)
示例#30
0
    return ss


if __name__ == '__main__':
    start_time = time.time()
    ss = init_spark_session()  #initial spark session
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    reviewDF = ss.read.parquet(final_indexed_save).cache()
    model_save = os.path.join('model', 'als_model_vegas')
    indexer_user_save = os.path.join('model', 'user_ind_model')

    model = ALSModel.load(model_save)
    uid = reviewDF.select('user_id').rdd.takeSample(False, 1)[0]['user_id']
    logger.error('sampled user_id: {}'.format(str(uid)))
    bid = reviewDF.select('business_id_int', 'business_id').distinct()
    bid.show(20)
    logger.error('{} seconds has elapsed. {} rows remain'.format(
        time.time() - start_time, bid.count()))
    # predDF = bid.filter(bid['user_id'] == user_id)
    # build a user request for every business using the sampled user id
    predDF = bid.withColumn("user_id", lit(uid))
    indexer_model = StringIndexerModel.load(indexer_user_save)
    predDF = indexer_model.transform(predDF)
    # user_id_converter = IndexToString(inputCol='user_id', outputCol='user_id')
    # convert_df =
    predDF.show(20)
    prediction_user = model.transform(predDF)
    ratings = prediction_user.sort(
        desc('prediction')).limit(count).select('business_id')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
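
    # Hypothetical final step (not part of the original fragment): collect the
    # top-N recommended business ids from the sorted predictions.
    top_business_ids = [row['business_id'] for row in ratings.collect()]
    logger.error('top recommendations: {}'.format(top_business_ids))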