def test_java_params(self):
    import pyspark.ml.feature
    import pyspark.ml.classification
    import pyspark.ml.clustering
    import pyspark.ml.evaluation
    import pyspark.ml.pipeline
    import pyspark.ml.recommendation
    import pyspark.ml.regression

    modules = [
        pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
        pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
        pyspark.ml.regression
    ]
    for module in modules:
        for name, cls in inspect.getmembers(module, inspect.isclass):
            if not name.endswith('Model') and not name.endswith('Params') \
                    and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                # NOTE: disable check_params_exist until there is parity with Scala API
                check_params(self, cls(), check_params_exist=False)

    # Additional classes that need explicit construction
    from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
    check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                 check_params_exist=False)
    check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                 check_params_exist=False)
def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    dota = rdd.map(lambda x: x[1])
    featuresdata = dota.map(lambda x: x.split(':')[2])
    actualdata = featuresdata.map(lambda x: x.split(','))
    rowRdd = actualdata.map(lambda x: Row(sl=float(x[0][1:]),
                                          sw=float(x[1]),
                                          pl=float(x[2]),
                                          pw=float(x[3]),
                                          stringlabel=x[4][:-4]))
    features = spark.createDataFrame(rowRdd)
    features.show()

    # Unused re-parse without the bracket/suffix trimming, kept from the original
    rowRdd = actualdata.map(lambda x: Row(sl=float(x[0]), sw=float(x[1]),
                                          pl=float(x[2]), pw=float(x[3]),
                                          stringlabel=x[4]))

    # Load the fitted pipeline (indexer -> assembler -> logistic regression);
    # PipelineModel.load restores the fitted stages, so they are not rebuilt here.
    pipe = PipelineModel.load('gs://suryasuresh/lab8output')
    result = pipe.transform(features)

    f1score = MulticlassClassificationEvaluator(metricName='f1')
    precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
    recall = MulticlassClassificationEvaluator(metricName='weightedRecall')
    accuracy = MulticlassClassificationEvaluator(metricName='accuracy')

    result.show()
    print("Accuracy:\t", accuracy.evaluate(result),
          "\nF1score:\t", f1score.evaluate(result),
          "\nWeighted Recall:\t", recall.evaluate(result),
          "\nWeighted Precision:\t", precision.evaluate(result))
def test_string_indexer_from_labels(self):
    model = StringIndexerModel.from_labels(
        ["a", "b", "c"], inputCol="label", outputCol="indexed", handleInvalid="keep"
    )
    self.assertEqual(model.labels, ["a", "b", "c"])
    self.assertEqual(model.labelsArray, [("a", "b", "c")])

    df1 = self.spark.createDataFrame(
        [(0, "a"), (1, "c"), (2, None), (3, "b"), (4, "b")], ["id", "label"]
    )
    result1 = model.transform(df1)
    actual1 = result1.select("id", "indexed").collect()
    expected1 = [
        Row(id=0, indexed=0.0),
        Row(id=1, indexed=2.0),
        Row(id=2, indexed=3.0),
        Row(id=3, indexed=1.0),
        Row(id=4, indexed=1.0),
    ]
    self.assertEqual(actual1, expected1)

    model_empty_labels = StringIndexerModel.from_labels(
        [], inputCol="label", outputCol="indexed", handleInvalid="keep"
    )
    actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect()
    expected2 = [
        Row(id=0, indexed=0.0),
        Row(id=1, indexed=0.0),
        Row(id=2, indexed=0.0),
        Row(id=3, indexed=0.0),
        Row(id=4, indexed=0.0),
    ]
    self.assertEqual(actual2, expected2)

    # Test model with default settings can transform
    model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label")
    df2 = self.spark.createDataFrame(
        [(0, "a"), (1, "c"), (2, "b"), (3, "b"), (4, "b")], ["id", "label"]
    )
    transformed_list = (
        model_default.transform(df2)
        .select(model_default.getOrDefault(model_default.outputCol))
        .collect()
    )
    self.assertEqual(len(transformed_list), 5)
def get_top_ratings(self, user_id, count):
    """Return the top <count> businesses for the given user."""
    start_time = time.time()
    # bid = self.reviewDF.select('business_id_num', 'business_id').distinct().cache()
    businessDF_vegas_food_save = os.path.join('dataset', 'businessDF_vegas_food.parquet')
    businessDF_vegas_food = self.ss.read.parquet(businessDF_vegas_food_save)
    # bid.show(20)
    logger.error('{} seconds has elapsed. {} entries remained'.format(
        time.time() - start_time, businessDF_vegas_food.count()))

    # predDF = bid.filter(bid['user_id'] == user_id)
    # Build the user request using the input id
    logger.error('{} seconds has elapsed before building predDF'.format(
        time.time() - start_time))
    bid = businessDF_vegas_food.select('business_id', 'latitude', 'longitude')
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business_model = StringIndexerModel.load(indexer_business_save)
    bid = indexer_business_model.transform(bid)
    predDF = bid.withColumn("user_id", lit(user_id)).cache()

    logger.error('{} seconds has elapsed before loading indexer'.format(
        time.time() - start_time))
    indexer_user_model = StringIndexerModel.load(self.indexer_user_save)
    predDF = indexer_user_model.transform(predDF)
    # user_id_converter = IndexToString(inputCol='user_id', outputCol='user_id')
    # convert_df =
    # predDF.show(10)

    logger.error('{} seconds has elapsed before model'.format(time.time() - start_time))
    model = ALSModel.load(self.model_save)
    prediction_user = model.transform(predDF)
    # prediction_user.show(20)

    ratings = prediction_user.sort(desc('prediction')).limit(count).select(
        'business_id', 'prediction', 'latitude', 'longitude')
    # ratings.show(20)
    # ratings.printSchema()
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
    return ratings.toPandas().to_json(orient='records')
def main(spark, user_indexer_model, item_indexer_model, test_file, save_test):
    '''
    Parameters
    ----------
    spark : SparkSession object
    user_indexer_model : string, path to the saved user StringIndexerModel
    item_indexer_model : string, path to the saved item StringIndexerModel
    test_file : string, path to the parquet file to load
    save_test : string, path to write the transformed test set
    '''
    # Load the parquet file
    test = spark.read.parquet(test_file)

    user_index = StringIndexerModel.load(user_indexer_model)
    item_index = StringIndexerModel.load(item_indexer_model)
    test = user_index.transform(test)
    test = item_index.transform(test)

    test = test.sort('count', ascending=False)
    relevant_docs = test.groupBy('user').agg(F.collect_list('item').alias('item'))
    relevant_docs.write.parquet(save_test)
def modelPredicting(testSetWoeDF, fn):
    # Pre-transform the data to match the input format expected by the ML LogisticRegression stage
    strInd = StringIndexerModel.load(savePath + '{}/{}/strInd'.format(curDate, fn))
    lrModel = LogisticRegressionModel.load(savePath + '{}/{}/lrModel'.format(curDate, fn))
    testSetVecAse = vecAseembler.transform(testSetWoeDF)
    testSetVecAseStrInd = strInd.transform(testSetVecAse)
    testSetWithProba = lrModel.transform(testSetVecAseStrInd)
    return testSetWithProba
def transform_spark(data, input, transformed_column_name):
    from pyspark.ml.feature import StringIndexerModel
    import pyspark.sql.functions as F

    indexer = StringIndexerModel.from_labels(
        input["indexes"]["index"], inputCol=input["col"], outputCol=transformed_column_name
    )
    return indexer.transform(data).withColumn(
        transformed_column_name, F.col(transformed_column_name).cast("int")
    )
def transform_spark(data, features, args, transformed_feature):
    from pyspark.ml.feature import StringIndexerModel
    import pyspark.sql.functions as F

    indexer = StringIndexerModel.from_labels(
        args["index"], inputCol=features["text"], outputCol=transformed_feature
    )
    return indexer.transform(data).withColumn(
        transformed_feature, F.col(transformed_feature).cast("int")
    )
def indexToString(infoData):
    stringIndexerPath = infoData.get(mc.INDEXERPATH)
    inverterColm = infoData.get(mc.COLMTOINVERT)
    dataset = infoData.get(mc.DATASET)
    stringIndexer = StringIndexerModel.load(stringIndexerPath)
    inverter = IndexToString(inputCol=inverterColm, outputCol=mc.DMXINVERTEDCOLM,
                             labels=stringIndexer.labels)
    dataset = inverter.transform(dataset)

    # Drop the indexed column and rename the new un-indexed column to the original name
    dataset = dataset.drop(inverterColm)
    dataset = dataset.withColumnRenamed(mc.DMXINVERTEDCOLM, inverterColm)
    return dataset
def load(self, path, spark_session=None):
    self.iforest_model = IForestModel.load(self._get_iforest_path(path))
    self.scaler_model = StandardScalerModel.load(self._get_scaler_path(path))

    file_manager = FileManager(path, spark_session)
    params = file_manager.load_from_file(self._get_params_path(path), format='json')
    self.set_params(**params)

    self.indexes = []
    for feature in self.categorical_features:
        self.indexes.append(
            StringIndexerModel.load(self._get_index_path(path, feature)))
def main(stations_indexer_path, onehot_path, weather_indexer_path, model_data_folder,
         station_id, hour, month, dayofyear, visibility, air_temp, wind_speed,
         weather_class):
    # Starting session
    spark = SparkSession.builder.appName('BigDataML').getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Loading data
    columns = [
        "hour", 'dayofyear', 'month', 'air_temp', 'wind_speed', 'visibility',
        'weather_class', 'station'
    ]
    vals = [(hour, dayofyear, month, air_temp, wind_speed, visibility,
             weather_class, station_id)]
    data = spark.createDataFrame(vals, columns)

    # Load the fitted pipeline once
    model = PipelineModel.load(model_data_folder)

    stringIndexer = StringIndexerModel.load(stations_indexer_path)
    data = stringIndexer.transform(data)
    stringIndexer_weather = StringIndexerModel.load(weather_indexer_path)
    data = stringIndexer_weather.transform(data)

    encoder = OneHotEncoderModel.load(onehot_path)
    data = encoder.transform(data)

    features = VectorAssembler(
        inputCols=["hour", 'dayofyear', 'month', 'air_temp', 'wind_speed',
                   'visibility', 'weather_index', 'station_vector'],
        outputCol="features")
    test_data = features.transform(data)

    # Make predictions.
    predictions = model.transform(test_data).collect()
    print("Predicted count is: {}".format(int(predictions[0]['prediction'])))
def main():
    feature_model = VectorIndexerModel.load(featureIndexer_path)
    vectorAssembler = VectorAssembler.load(vectorAssembler_path)
    ohe_model = OneHotEncoderModel.load(ohe_model_path)
    stringIndexer_model = StringIndexerModel.load(stringIndexerPath)
    lr_model = LinearRegressionModel.load(model_path)

    spark = SparkSession.builder.master("local").appName("Connection").getOrCreate()

    # request.get_json() returns a dict, so read the fields with item access
    json_data = request.get_json()
    availability = json_data['availability']
    minimum_nights = json_data['minimum_nights']
    latitude = json_data['latitude']
    longitude = json_data['longitude']
    name = json_data['name']
    neighbourhood_group = json_data['neighbourhood_group']
    neighbourhood = json_data['neighbourhood']
    room_type = json_data['room_type']

    dept = [(name, neighbourhood_group, neighbourhood, room_type, latitude, longitude,
             0.0, minimum_nights, 0.0, 1.0, availability, 0.0)]
    df = spark.createDataFrame(data=dept, schema=deptColumns)

    df = stringIndexer_model.transform(df)
    df = df.drop(*["neighbourhood_group", 'neighbourhood', 'room_type'])
    df = ohe_model.transform(df)
    df = df.drop(*["neighbourhood_group_int", 'neighbourhood_int', 'room_type_int'])
    df = df.withColumn(
        "minimum_nights",
        when(df["minimum_nights_int"] > 30, 30).otherwise(df["minimum_nights_int"])
    ).drop('minimum_nights_int')
    df = df.withColumn('name_length', length('name')).drop('name')

    df = vectorAssembler.transform(df)
    df = df.select(['features'])
    df = feature_model.transform(df)
    df = df.select(['features_vec'])

    lr_predictions = lr_model.transform(df)
    return jsonify(data=lr_predictions.collect()[-1].prediction)
def multiHotEncoderExample(movieSamples):
    samplesWithGenres = movieSamples.select(
        "movieId", "title",
        explode(split(F.col("genres"), "\\|").cast(ArrayType(StringType()))).alias("genre"))

    genreIndexer = StringIndexer(inputCol="genre", outputCol="genreIndex")
    # Use a distinct name for the fitted model so it does not shadow the StringIndexerModel class
    genreIndexerModel = genreIndexer.fit(samplesWithGenres)
    genreIndexSamples = genreIndexerModel.transform(samplesWithGenres).withColumn(
        "genreIndexInt", F.col("genreIndex").cast(IntegerType()))

    indexSize = genreIndexSamples.agg(max(F.col("genreIndexInt"))).head()[0] + 1
    processedSamples = genreIndexSamples.groupBy("movieId").agg(
        F.collect_list("genreIndexInt").alias('genreIndexes')).withColumn(
            "indexSize", F.lit(indexSize))

    finalSample = processedSamples.withColumn(
        "vector",
        udf(array2vec, VectorUDT())(F.col("genreIndexes"), F.col("indexSize")))
    finalSample.printSchema()
    finalSample.show(10)
def test_java_params(self):
    import re

    import pyspark.ml.feature
    import pyspark.ml.classification
    import pyspark.ml.clustering
    import pyspark.ml.evaluation
    import pyspark.ml.pipeline
    import pyspark.ml.recommendation
    import pyspark.ml.regression

    modules = [
        pyspark.ml.feature,
        pyspark.ml.classification,
        pyspark.ml.clustering,
        pyspark.ml.evaluation,
        pyspark.ml.pipeline,
        pyspark.ml.recommendation,
        pyspark.ml.regression,
    ]
    for module in modules:
        for name, cls in inspect.getmembers(module, inspect.isclass):
            if (not name.endswith("Model")
                    and not name.endswith("Params")
                    and issubclass(cls, JavaParams)
                    and not inspect.isabstract(cls)
                    and not re.match("_?Java", name)
                    and name != "_LSH"
                    and name != "_Selector"):
                check_params(self, cls(), check_params_exist=True)

    # Additional classes that need explicit construction
    from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
    check_params(self, CountVectorizerModel.from_vocabulary(["a"], "input"),
                 check_params_exist=True)
    check_params(self, StringIndexerModel.from_labels(["a", "b"], "input"),
                 check_params_exist=True)
def preprocess_renewal_model_scoring_data(pargs, params):
    """
    Function to pre-process raw scoring data for renewal model
    """
    # Load parameters
    score_filter_flag = configs['score_filter_flag']
    score_filter_condition = configs['score_filter_condition']
    score_sampling_flag = configs['score_sampling_flag']
    score_sampling_fraction = configs['score_sampling_fraction']
    primary_key_columns = configs['primary_key_columns']
    fillna_non_categorical_value = configs['fillna_non_categorical_value']
    fillna_categorical_value = configs['fillna_categorical_value']
    target_variable = configs['target_variable']
    seed = configs['seed']
    s3_bucket = configs['s3_bucket']

    # Load raw scoring data
    score_raw = spark.read.parquet(
        data_paths['yr1_renewal_scoring_raw'].format(run_mode=run['run_mode'],
                                                     run_id=run['run_id']))

    # Load feature config saved in the pre-processing step
    feature_config = util.load_yml_file_from_s3(
        s3_bucket,
        data_paths['renewal_feature_config'].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id'])[12:])
    feature_list = feature_config['feature_list']
    feature_list_indexed = feature_config['feature_list_indexed']
    categorical_columns = feature_config['categorical_columns']
    non_categorical_columns = feature_config['non_categorical_columns']

    # Load string indexers
    string_indexers = []
    for i in range(len(categorical_columns)):
        string_indexers.append(
            StringIndexerModel.load(
                data_paths['renewal_string_indexer'].format(run_mode=run['run_mode'],
                                                            run_id=run['run_id'],
                                                            i=i)))

    # Select only the features used in model training
    score_raw = score_raw.select(primary_key_columns + feature_list)

    # Filter scoring data
    if score_filter_flag:
        score_raw = score_raw.filter(score_filter_condition)

    # Sample scoring data
    if score_sampling_flag:
        score_raw = model.sampling(score_raw, score_sampling_fraction, seed)

    # Ensure that all "n_" cols are indeed numeric
    score_raw = model.ensureColsAreNumeric(score_raw, non_categorical_columns)

    # Ensure that all "i_" cols are indeed string
    score_raw = model.ensureColsAreString(score_raw, categorical_columns)

    # Impute missing values
    score_raw = model.imputeMissing(score_raw, non_categorical_columns,
                                    categorical_columns,
                                    fillna_non_categorical_value,
                                    fillna_categorical_value)

    # Apply string indexer on string columns
    score_raw, string_indexers, categorical_columns_indexed = model.applyStringIndexer(
        score_raw, categorical_columns, string_indexers)

    # Assemble the final feature list
    score_raw = model.assembleFeaturesIntoVector(score_raw, feature_list_indexed)

    # Save score data
    score_raw.write.parquet(
        data_paths['renewal_score'].format(run_mode=run['run_mode'],
                                           run_id=run['run_id']),
        mode='overwrite')
def read_parquet(parquet_path):
    parquet_df = spark.read.parquet(parquet_path)
    parquet_df = parquet_df.drop('id')
    parquet_df = parquet_df.drop('one_area_price')
    parquet_df = parquet_df.drop('agency_nameVec')
    parquet_df = parquet_df.drop('districtVec')
    parquet_df = parquet_df.drop('room_type')
    parquet_df.show(truncate=False)
    print('parquet_df.count()==========11', parquet_df.count(), parquet_df.columns)

    for i in parquet_df.columns:
        if ('Vec' not in i) & ('facilities_vectors' not in i):
            if parquet_df.filter(parquet_df[i].isNull()).count() > 0:
                parquet_df = parquet_df.na.fill(0, i)
            elif parquet_df.filter(parquet_df[i] == 'NULL').count() > 0:
                parquet_df = parquet_df.filter(parquet_df[i] != 'NULL')
            parquet_df = parquet_df.select(
                '*', parquet_df[i].cast('float').alias('tmp_name')).drop(i)
            parquet_df = parquet_df.withColumnRenamed('tmp_name', i)
            parquet_df = parquet_df.filter(parquet_df[i].isNotNull())
            print('parquet_df.count()==========22', i, parquet_df.count())

    columns = parquet_df.columns
    columns.remove('price')

    from pyspark.ml.feature import OneHotEncoder, StringIndexer, StringIndexerModel
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    from pyspark.ml.feature import VectorAssembler

    model_path = "/user/limeng/ganji_daxing_save_models/"
    columns_list = []
    for i in columns:
        if i == 'facilities_vectors':
            loadedCountVectorizerModel = CountVectorizerModel.load(
                model_path + 'count-vectorizer-model')
            columns_list.extend(loadedCountVectorizerModel.vocabulary)
        elif i == 'rent_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelrent_type')
            columns_list.extend(loadedStringIndexerModel.labels)
        elif i == 'agency_nameVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelagency_name')
            columns_list.extend(loadedStringIndexerModel.labels)
        elif i == 'directionVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldirection')
            columns_list.extend(loadedStringIndexerModel.labels)
        elif i == 'zoneVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelzone')
            columns_list.extend(loadedStringIndexerModel.labels)
        elif i == 'pay_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelpay_type')
            columns_list.extend(loadedStringIndexerModel.labels)
        elif i == 'districtVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldistrict')
            columns_list.extend(loadedStringIndexerModel.labels)
        else:
            columns_list.append(i)

    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    parquet_df = vecAssembler.transform(parquet_df).select('features', 'price')
    parquet_df = parquet_df.withColumnRenamed('price', 'label')
    return parquet_df, columns_list
dataset3 = dataset2.select("conn.srcip", "conn.sport", "conn.dstip", "conn.dsport",
                           "conn.proto", "conn.dur", "conn.sbytes", "conn.dbytes",
                           "conn.service", "conn.Spkts", "conn.Dpkts")
dataset3.printSchema()
print(type(dataset3))

# Fill null values with 0 (error prevention)
dataset3 = dataset3.fillna(0)

# Feature processing
string_indexer_models = {}
for column in ['proto', 'service', 'attack_cat']:
    string_indexer_model_path = "{}/data/str_indexer_extended/str_indexer_model_extended_{}.bin".format(
        base_path, column)
    string_indexer = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer

for column in ['proto', 'service', 'attack_cat']:
    string_indexer_model = string_indexer_models[column]
    dataset3 = string_indexer_model.transform(dataset3)

vector_assembler_path = "{}/data/numeric_vector_assembler_RFE.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_path)
finalDataset = vector_assembler.transform(dataset3)

# Load the machine learning model and apply it to the incoming data
model_path = "{}/data/RandomForest_extended.bin".format(base_path)
model = RandomForestClassificationModel.load(model_path)
predictions = model.transform(finalDataset)
def main(base_path): APP_NAME = "make_predictions_streaming.py" # Process data every 10 seconds PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # Add the streaming package and initialize findspark.add_packages( ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext( appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Process Prediction Requests in Streaming # stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", }) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'])) row_stream.pprint() # # Create a dataframe from the RDD-based object stream # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame( rdd, schema=prediction_request_schema) 
prediction_requests_df.show() # # Add a Route variable to replace FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest)) prediction_requests_with_route.show(6) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Inspect the vectors final_vectorized_features.show() # Drop the individual index columns index_columns = [ "Carrier_index", "DayOfMonth_index", "DayOfWeek_index", "DayOfYear_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Store to Mongo if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # Do the classification and store to Mongo row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
def prepare_data_ml3(spark, jenkins_builds, sonar_issues, sonar_analyses,
                     spark_artefacts_dir, run_mode):
    # Change build result to only SUCCESS/FAIL for binary classification
    modify_result = udf(lambda x: "SUCCESS" if x == "SUCCESS" else "FAIL", StringType())
    spark.udf.register("modify_result", modify_result)

    if jenkins_builds is not None:
        jenkins_builds = jenkins_builds.withColumn("result", modify_result("result"))

    pipeline_path = Path(spark_artefacts_dir).joinpath("pipeline_3")
    label_idx_model_path = Path(spark_artefacts_dir).joinpath("label_indexer_3")

    # Getting pipeline and label indexer models
    if run_mode == "first":
        pipeline_model = get_ml3_pipeline().fit(sonar_issues)
        pipeline_model.write().overwrite().save(str(pipeline_path.absolute()))
        label_idx_model = StringIndexer(
            inputCol="result", outputCol="label",
            handleInvalid="skip").fit(jenkins_builds)
        label_idx_model.write().overwrite().save(str(label_idx_model_path.absolute()))
    elif run_mode == "incremental":
        pipeline_model = PipelineModel.load(str(pipeline_path.absolute()))
        label_idx_model = StringIndexerModel.load(str(label_idx_model_path.absolute()))

    # Columns to return
    rules = pipeline_model.stages[0].labels
    columns = list(map(lambda x: "removed_" + x, rules)) + \
        list(map(lambda x: "introduced_" + x, rules))

    # Preparing
    removed_rules_df = sonar_issues.filter(
        "status IN ('RESOLVED', 'CLOSED', 'REVIEWED')").select(
            "current_analysis_key", "rule")

    df1 = pipeline_model.transform(removed_rules_df)
    rdd1 = df1.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(lambda v1, v2: sum_sparse_vectors(v1, v2)) \
        .map(lambda x: Row(current_analysis_key=x[0], removed_rule_vec=x[1]))
    if rdd1.count() == 0:
        return None, columns
    removed_issues_rule_vec_df = spark.createDataFrame(rdd1)

    introduced_rules_df = sonar_issues.filter(
        "status IN ('OPEN', 'REOPENED', 'CONFIRMED', 'TO_REVIEW')").select(
            "creation_analysis_key", "rule")

    df2 = pipeline_model.transform(introduced_rules_df)
    rdd2 = df2.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(lambda v1, v2: sum_sparse_vectors(v1, v2)) \
        .map(lambda x: Row(creation_analysis_key=x[0], introduced_rule_vec=x[1]))
    if rdd2.count() == 0:
        return None, columns
    introduced_issues_rule_vec_df = spark.createDataFrame(rdd2)

    joined_sonar_rules_df = removed_issues_rule_vec_df.join(
        introduced_issues_rule_vec_df,
        removed_issues_rule_vec_df.current_analysis_key ==
        introduced_issues_rule_vec_df.creation_analysis_key,
        how="outer")

    joined_sonar_rules_df.createOrReplaceTempView("sonar_rules")
    joined_sonar_rules_df = spark.sql("""SELECT
        coalesce(current_analysis_key, creation_analysis_key) AS analysis_key,
        introduced_rule_vec,
        removed_rule_vec
        FROM sonar_rules
        """)

    num_rules = len(pipeline_model.stages[0].labels)
    imputed_sonar_rules_rdd = joined_sonar_rules_df.rdd.map(
        lambda row: Row(
            analysis_key=row[0],
            introduced_rule_vec=SparseVector(num_rules, {}) if row[1] is None else row[1],
            removed_rule_vec=SparseVector(num_rules, {}) if row[2] is None else row[2]))
    imputed_sonar_rules_df = spark.createDataFrame(imputed_sonar_rules_rdd)

    v_assembler = VectorAssembler(
        inputCols=["removed_rule_vec", "introduced_rule_vec"], outputCol="features")
    sonar_issues_df = v_assembler.transform(imputed_sonar_rules_df).select(
        "analysis_key", "features")

    sonar_df = sonar_issues_df.join(
        sonar_analyses,
        sonar_issues_df.analysis_key == sonar_analyses.analysis_key,
        how="inner")
    df = sonar_df.join(
        jenkins_builds,
        sonar_df.revision == jenkins_builds.revision_number,
        how="inner").select("result", "features")

    ml_df = label_idx_model.transform(df).select("label", "features")
    return ml_df, columns
    './cf_train_subset_idx_full.parquet',
    './cf_train_subset_idx.parquet',
    './cf_train_idx.parquet',
    './cf_train_subset.parquet',
    './cf_train_extra.parquet'
]

model_file = './sc2_final1'
u_idx_model_file = './sc2_final_u_indexer'
t_idx_model_file = './sc2_final_t_indexer'

print(datetime.now())

data_file = path + files[1]
data = spark.read.parquet(data_file)
data = data.sample(False, 0.001)

u_model = StringIndexerModel.load(u_idx_model_file)
t_model = StringIndexerModel.load(t_idx_model_file)

transformed_data = u_model.transform(data)
transformed_data = t_model.transform(transformed_data).select('u_id', 't_id', 'count')

ratings = transformed_data.rdd.map(lambda l: Rating(l.u_id, l.t_id, l['count']))

rank = 10
model = ALS.trainImplicit(ratings, rank)

val_data_file = path + files[1]
val_data = spark.read.parquet(val_data_file)
val_data = val_data.sample(False, 0.001)
def main(base_path): APP_NAME = "make_predictions_streaming.py" # 10초마다 데이터 처리 PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # 스트리밍 패키지 추가 및 초기화 findspark.add_packages( ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext( appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # 예측 생성에 사용된 모든 모델 적재 # # 도착 지연 구간화 모델 적재 from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # 모든 문자열 필드 벡터화 파이프라인을 dict에 적재 from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # 숫자 벡터 어셈블러 적재 from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # 분류 모델 적재 from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # 스트리밍에서 예측 요청 처리 # stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", }) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'])) row_stream.pprint() # # RDD 기반 객체 스트림에서 dataframe 생성 # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame( rdd, schema=prediction_request_schema) prediction_requests_df.show() # # FlightNum을 대체할 Route 변수 추가 # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest)) 
prediction_requests_with_route.show(6) # 문자열 필드를 해당 열에 대응하는 파이프라인으로 벡터화 # 범주 필드를 범주형 특징 벡터로 변환한 다음 중간 결과 필드 삭제 for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # 숫사 열 벡터화: DepDelay, Distance, 인덱스 열 final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # 벡터 검사 final_vectorized_features.show() # 개별 인덱스 열 제거 index_columns = [ "Carrier_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # 확정된 특징 검사 final_vectorized_features.show() # 예측 생성 predictions = rfc.transform(final_vectorized_features) # 원 필드에 제공하기 위해 특징 벡터와 예측 메타데이터 제거 predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # 결과 검사 final_predictions.show() # 몽고DB에 저장 if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # 분류를 수행하고 몽고 DB에 저장 row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
def main(spark, indexer_user, indexer_item, train_data_file, val_data_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    indexer_user : string, path to the saved user StringIndexerModel
    indexer_item : string, path to the saved item StringIndexerModel
    train_data_file : string, path to the training parquet file to load
    val_data_file : string, path to the validation parquet file to load
    '''
    # Load the parquet files
    train = spark.read.parquet(train_data_file)
    val = spark.read.parquet(val_data_file)

    user_index = StringIndexerModel.load(indexer_user)
    item_index = StringIndexerModel.load(indexer_item)
    train = user_index.transform(train)
    train = item_index.transform(train)
    # train = train.sample(withReplacement=False, fraction=0.8)

    rank = [10, 20, 30]  # default is 10
    regularization = [.01, .1, 1]  # default is 1
    alpha = [.5, 1, 10]  # default is 1

    rank_list = []
    reg_list = []
    alpha_list = []
    precisions = []

    for i in rank:
        for j in regularization:
            for k in alpha:
                als = ALS(userCol='user', itemCol='item', implicitPrefs=True,
                          ratingCol='count', rank=i, regParam=j, alpha=k)
                model = als.fit(train)

                subset = val.select('user').distinct()
                predictions = model.recommendForUserSubset(subset, 50)
                predictions = predictions.select(
                    "user", col("recommendations.item").alias("item")).sort('user')
                val = val.sort('user')
                predictionAndLabels = predictions.join(
                    val, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
                metrics = RankingMetrics(predictionAndLabels)
                precision = metrics.meanAveragePrecision

                rank_list.append(i)
                reg_list.append(j)
                alpha_list.append(k)
                precisions.append(precision)
                print('rank: %f, reg: %f, alpha: %f' % (i, j, k))
                print(precision)

    print(rank_list)
    print(reg_list)
    print(alpha_list)
    print(precisions)
    print('Max MAP value: %f' % max(precisions))
    ind = np.argmax(precisions)
    print('Rank: %f' % rank_list[ind])
    print('Reg: %f' % reg_list[ind])
    print('Alpha: %f' % alpha_list[ind])
print('====> Start computation')
dataset = spark.read.csv(
    '/user/ronghui_safe/hgy/nid/datasets/{}_{}'.format(args.query_month, args.mode),
    header=True, inferSchema=True)
dataset = dataset.withColumn(
    'source',
    F.when(F.col('source') == '__HIVE_DEFAULT_PARTITION__', 'null').otherwise(F.col('source')))
dataset = dataset.withColumn(
    'source',
    F.when(F.col('source') == 'cm_mail', 'null').otherwise(F.col('source')))
if args.mode != 'test':
    dataset = dataset.withColumn(
        'duration',
        F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
    dataset = dataset.withColumn('duration', F.log(F.lit(1e-6)) / F.col('duration'))
    dataset = dataset.withColumn('duration', F.exp(F.col('duration')))

stringIndex_model = None
if args.mode == 'train':
    stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
    stringIndex_model = stringIndexer.fit(dataset)
    stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
else:
    stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
dataset = stringIndex_model.transform(dataset)

encoder_model = None
if args.mode == 'train':
    encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
    encoder_model = encoder.fit(dataset)
    encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
else:
    encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
dataset = encoder_model.transform(dataset)

feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
dataset = assembler.transform(dataset)

scaler_model = None
if args.mode == 'train':
    scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec',
                            withStd=True, withMean=True)
def main(base_path):
    spark = SparkSession.builder.config("spark.default.parallelism", 1) \
        .appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #
    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }
    #

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", BROKERS) \
        .option("subscribe", PREDICTION_TOPIC) \
        .load()

    # Create a DataFrame out of the one-hot encoded RDD
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"), schema).alias("data")) \
        .select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'), prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:

        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")
            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")
            return True

        def process(self, row):
            print(f"Processing row: {row}")
            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")
            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")
            self.mongo_client.close()
            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))
            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()
    query.awaitTermination()
def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column
        )
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path
    )
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today's date as an ISO string to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date-based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today
    )

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(
            prediction_requests.Origin,
            lit('-'),
            prediction_requests.Dest
        )
    )
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the results
    final_predictions.show()

    # Build the day's output path: a date-based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today
    )

    # Save the results to their daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
def main(base_path): APP_NAME = "make_predictions_streaming.py" # Process data every 10 seconds PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # Add the streaming package and initialize findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column ) string_indexer_model = StringIndexerModel.load(string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path ) rfc = RandomForestClassificationModel.load( random_forest_model_path ) # # Process Prediction Requests in Streaming # stream = KafkaUtils.createDirectStream( ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", } ) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row( FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'] ) ) row_stream.pprint() # # Create a dataframe from the RDD-based object stream # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema) prediction_requests_df.show() # # Add a Route variable to replace 
FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat( prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest ) ) prediction_requests_with_route.show(6) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform(prediction_requests_with_route) # Inspect the vectors final_vectorized_features.show() # Drop the individual index columns index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Store to Mongo if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # Do the classification and store to Mongo row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
    return ss


if __name__ == '__main__':
    start_time = time.time()
    ss = init_spark_session()  # initialize spark session

    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    reviewDF = ss.read.parquet(final_indexed_save).cache()
    model_save = os.path.join('model', 'als_model_vegas')
    indexer_user_save = os.path.join('model', 'user_ind_model')

    model = ALSModel.load(model_save)
    uid = reviewDF.select('user_id').rdd.takeSample(False, 1)
    logger.error('sampled user id: {}'.format(str(uid)))

    bid = reviewDF.select('business_id_int', 'business_id').distinct()
    bid.show(20)
    logger.error('{} seconds has elapsed. {} rows remain'.format(
        time.time() - start_time, bid.count()))

    # predDF = bid.filter(bid['user_id'] == user_id)
    # Build the user request using the input id
    predDF = bid.withColumn("user_id", lit(uid))
    indexer_model = StringIndexerModel.load(indexer_user_save)
    predDF = indexer_model.transform(predDF)
    # user_id_converter = IndexToString(inputCol='user_id', outputCol='user_id')
    # convert_df =
    predDF.show(20)

    prediction_user = model.transform(predDF)
    ratings = prediction_user.sort(desc('prediction')).limit(count).select('business_id')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))