def __init__(self):
    self.curSparkSession = SparkSession.builder.getOrCreate()
    self.vecPipe = PipelineModel.read().load(
        "hdfs://172.17.113.68:9000/user/hadoop/dentistModels/vecAssemblerPipeline")
    self.modelsPipe = PipelineModel.read().load(
        "hdfs://172.17.113.68:9000/user/hadoop/dentistModels/4modelsProcessPipeline")
    self.betterPipe = PipelineModel.read().load(
        "hdfs://172.17.113.68:9000/user/hadoop/dentistModels/GBTBetterRegPipeModel")
    self.wrosePipe = PipelineModel.read().load(
        "hdfs://172.17.113.68:9000/user/hadoop/dentistModels/GBTWroseRegPipeModel")
    logging.info(str(datetime.now()) +
                 "(std time): Class 'prediction_analysis' has been initiated successfully.")
import pickle

from pyspark.ml import PipelineModel
from pyspark.ml.common import _py2java


def bundle(spark_session, spark_df_schema, spark_pipeline_model):
    # spark_df_as_java = _py2java(spark_session, spark_df)
    # spark_df_schema_as_java = spark_df_as_java.schema.__call__()
    spark_df_schema_as_json = spark_df_schema.json()
    with open('model.schema', 'wb') as pkl_file:
        pickle.dump(spark_df_schema_as_json, pkl_file)
    spark_pipeline_model.write().overwrite().save('model.parquet')

    ## SERVE FROM HERE
    with open('model.schema', 'rb') as pkl_file:
        from pyspark.sql.types import _parse_datatype_json_string
        restored_spark_df_schema_as_json = pickle.load(pkl_file)
        restored_spark_df_schema = _parse_datatype_json_string(
            restored_spark_df_schema_as_json)
        restored_spark_df_schema_as_java = _py2java(spark_session,
                                                    restored_spark_df_schema)
    restored_spark_pipeline_model = PipelineModel.read().load('model.parquet')
    restored_spark_pipeline_model_as_java = restored_spark_pipeline_model._to_java()
    return spark_session._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(
        restored_spark_df_schema_as_java, restored_spark_pipeline_model_as_java)
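# A minimal usage sketch for bundle(), assuming a SparkSession `spark`, the schema of the
# training DataFrame `train_df`, and a fitted PipelineModel `fitted_model` (all three names
# are hypothetical). The jpmml-sparkml jar must be on the driver classpath for the
# ConverterUtil call inside bundle() to resolve; the PMML bytes are then written to disk.
pmml_bytes = bundle(spark, train_df.schema, fitted_model)
with open("model.pmml", "wb") as pmml_file:
    pmml_file.write(bytes(pmml_bytes))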
def predict(time, rdd):
    print("------------- %s -------------" % str(time))
    schema = StructType([StructField("text", StringType(), True)])
    try:
        df = rdd.toDF(schema=schema)
        model = PipelineModel.read().load('model/' + MODEL)
        pred = model.transform(df)
        pred.show()
        send_to_dashboard(pred)
    except Exception:
        print("Error: ", sys.exc_info())
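# A hedged sketch of how predict() above might be driven by a Spark Streaming job,
# assuming an existing StreamingContext `ssc` and a socket source on localhost:9999
# (both placeholders); MODEL and send_to_dashboard come from the surrounding script.
lines = ssc.socketTextStream("localhost", 9999)
rows = lines.map(lambda text: (text,))  # one-field tuples match the ("text") schema
rows.foreachRDD(predict)                # foreachRDD passes (time, rdd) to predict
ssc.start()
ssc.awaitTermination()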
def predict(self, X, features_names):
    print(X)
    print(type(X))
    spark = SparkSession.builder.master("spark-master").getOrCreate()
    data = spark.createDataFrame(X.tolist(), schema=self.cSchema)
    print("Converted np array to list")
    print(data)
    print(type(data))
    if self.model is None:
        # self.pipeline = Pipeline.read().load("hdfs://{}:9000/tmp/classification-pipeline".format(self.server))
        self.model = PipelineModel.read().load(
            "hdfs://{}:9000/tmp/classification-model".format(self.hdfs))
    predictions = self.model.transform(data)
    labels = predictions.select('predictedLabel').collect()
    return [str(x.predictedLabel) for x in labels]
def __init__(self): """Initialisation of this class""" print("Initialisierung") # Imports from os import path from pyspark.ml import PipelineModel from sparknlp import start from google.cloud import bigquery # Define Service-Account json self.sa = 'sa.json' # Initialisation of Spark print("Starting Spark") start() # Load pretrained Modell print("load Model") if path.exists("Model_Template") is False: print("Model needs to be loaded") self.load_Model() print("Model loaded") else: print("Found existing model") # loads downloaded Modell self.Model = PipelineModel.read().load("Model_Template") # Initialize BigQuery print("Init BigQuery Connection") # Get Client from JSON File (specified above) self.client = bigquery.Client.from_service_account_json(self.sa) dataset_id = 'Dashboard_tables' # Specify your Dataset ID table_id = 'Entity_raw' # Specify your Table ID table_ref = self.client.dataset(dataset_id).table(table_id) self.table = self.client.get_table(table_ref) # request # Initialize Redis Queue self.q = RedisQueue('Explain_Jobs') print("Init done")
def load_pipeline_from_hdfs(cls, path):
    """
    author: [email protected]
    Load a pipeline model from the given HDFS path: get the DAO for HDFS,
    build a loadable pipeline object, and load the saved model from HDFS into it.
    :param path: hdfs path
    :return: loaded model
    """
    try:
        cls.logger.info("Getting object of DAO Factory class")
        dao_obj = AbstractMachineLearningModelDAOFactory()
        cls.logger.info("Object retrieved")
        cls.logger.info("Getting dao instance for database: " + str(CommonConstants.DATABASE_HDFS))
        hdfs_dao = dao_obj.get_machine_learning_model_dao(CommonConstants.DATABASE_HDFS)
        cls.logger.info("Instance retrieved")
        cls.logger.info("Going to load pipeline model")
        model_obj = HDFSUtils._get_loadable_pipeline_object()
        cls.logger.info("Model load object successfully retrieved")
        cls.logger.warning("Going to load Pipeline model from the HDFS path: " + str(path))
        # loaded_model = hdfs_dao.load(path, model_obj)
        loaded_model = PipelineModel.read().load(path)
        cls.logger.warning("Pipeline model successfully loaded from path: " + str(path))
        return loaded_model
    except HDFSException as exp:
        raise CommonBaseException(exp)
    except CommonBaseException as exp:
        raise exp
    except Exception as exp:
        cls.logger.error('Exception occurred while loading model from hdfs ' + str(path))
        raise CommonBaseException(exp)
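# Minimal usage sketch, assuming the method above lives on the HDFSUtils class (it calls
# HDFSUtils._get_loadable_pipeline_object internally) and that the HDFS path below is a
# placeholder for an existing saved PipelineModel directory.
pipeline_model = HDFSUtils.load_pipeline_from_hdfs(
    "hdfs://namenode:9000/models/saved_pipeline")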
    line = begin_re.sub('', line)
    line = punct_re.sub('', line)
    line = space_re.sub('_', line)
    line = ' '.join([line[i:i + 3] for i in range(0, len(line), 3)])
    return line


SparkContext.setSystemProperty('spark.executor.memory', '6g')
conf = SparkConf().setAppName("NBClassifier").setMaster("local[*]")
sc = SparkContext(conf=conf)
sc._jsc.hadoopConfiguration().set("textinputformat.record.delimiter",
                                  "</end_of_article>")
spark = SparkSession(sc)
sc.setLogLevel("ERROR")

model_nb = PipelineModel.read().load("model_nb_5k.save")
language_indexer = model_nb.stages[2].labels  # Language list as in StringIndexer
print(language_indexer)

start = time.time()
schema = StructType([
    StructField("fullText", StringType(), True),
    StructField("lang", StringType(), False)
])
# df = sc.textFile('example.txt')
df = sc.textFile('wet_concatenated.txt')
df = df.map(remove_syntax)
df = df.map(lambda l: (l, 'en')).toDF(schema)
print('Dataframe created')
p = model_nb.transform(df)
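# A hedged follow-up: map the numeric predictions in `p` back to language codes using the
# StringIndexer labels extracted above. IndexToString is the standard PySpark way to do
# this; the "prediction" column name is the usual classifier default and is assumed here.
from pyspark.ml.feature import IndexToString

to_label = IndexToString(inputCol="prediction", outputCol="predicted_lang",
                         labels=language_indexer)
p_labeled = to_label.transform(p)
p_labeled.select("fullText", "predicted_lang").show(5, truncate=80)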
# print("Start fitting") # prediction_model = pipeline_fast_dl.fit(training_data) # print("Fitting is ended") # print (time.time() - start) prediction_data = spark.createDataFrame([["Maria is a nice place."],["any bbq places open before 5 nearby"]]).toDF("text") prediction_data.show() prediction_model.transform(prediction_data).show(truncate=False) prediction_model.write().overwrite().save("ner_dl_model") !cp -r "ner_dl_model" "gdrive/My Drive/Colab Notebooks/SparkNLP/utils/ner_dl_model_base" from pyspark.ml import PipelineModel, Pipeline loaded_prediction_model = PipelineModel.read().load("ner_dl_model") prediction_data = spark.createDataFrame([["Maria is a nice place."],["any bbq places open before 5 nearby"]]).toDF("text") prediction_data.show() #loaded_prediction_model.transform(prediction_data).show(truncate=False) prediction = loaded_prediction_model.transform(prediction_data) prediction.select("finished_ner_metadata").show(truncate=False) prediction.select("finished_ner").show(truncate=False) prediction.select("finished_ner_converter_metadata").show(truncate=False) prediction.select("finished_ner_converter").show(truncate=False) #prediction.select("ner").show(truncate=False) from sparknlp.base import LightPipeline lp = LightPipeline(loaded_prediction_model) result = lp.annotate("Peter is a good person.")
cluster_path = os.path.join("./CURRENT_CLUSTER_MODEL", newest_filename) else: if not os.path.exists("./" + backup_cluster): filepath, cluster_download_success = AS.download_file( save_path, backup_cluster, container_name) if cluster_download_success: cluster_path = AS.unzip(".", filepath) cluster_path = os.path.join("./CURRENT_CLUSTER_MODEL", backup_cluster) df_serve = df_serve.fillna({ 'agg_avg_speed': 0.0, 'agg_avg_acceleration': 0.0, 'agg_avg_consumption': 0.0 }) model_clustering = PipelineModel.read().load(cluster_path) try: df_serve = model_clustering.transform(df_serve) df_serve = df_serve.withColumnRenamed("prediction", "agg_latest_PRED_Clustering") except Exception as ex: print(ex) df_serve = df_serve.withColumn("agg_latest_PRED_Clustering", lit(0).cast(DoubleType())) # Aggregations on routes aggregates_file_name = "aggregates" container_name = "horizon"
try:
    from mmlspark.vw import VowpalWabbitClassifier
    from mmlspark.train import ComputeModelStatistics
except Exception as ex:
    print(ex)

lines = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()
lines = lines.withColumnRenamed("value", "text")
lines.printSchema()

model = PipelineModel.read().load(
    "/home/haitien/Desktop/TwitterSentimentAnalysis_BigData20191/scripts"
    "/saved_model/model4")
prediction = model.transform(lines)
selected = prediction.select("text", "probability", "prediction")

query = selected.writeStream \
    .outputMode('append') \
    .format('console') \
    .start()
query.awaitTermination()

# lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
# counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# counts.pprint()
# MAGIC
# MAGIC If at some later time you want to use the model for scoring (inference) you can load it from the disk.

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ### Load the model
# MAGIC
# MAGIC As noted before, the DeepImageFeaturizer stage has to be added to the restored model pipeline.

# COMMAND ----------

from pyspark.ml import PipelineModel

landclassifier = PipelineModel.read().load(save_model_path)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="ResNet50")
landclassifier.stages.insert(0, featurizer)
landclassifier.stages

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC You can now use the loaded model for batch inference.

# COMMAND ----------

test_img_df = ImageSchema.readImages(img_dir + 'test', recursive=True)
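# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC A hedged continuation: score the test images with the restored pipeline. The
# MAGIC `prediction` column name used below is the conventional classifier default and is
# MAGIC an assumption here.

# COMMAND ----------

test_predictions = landclassifier.transform(test_img_df)
test_predictions.select("prediction").show(5)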
##################################################
##################   read data   #################
##################################################
hdfsPath = "/acm/ml/clsf/data/test001"
modelsPath = hdfsPath + "/models"
trainDataFile = "./data/sanfrancisco-crime/train.csv"

# data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(trainDataFile).limit(1000)
data = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true') \
    .load("hdfs://namenode:9000/" + hdfsPath).limit(1000)
print(data.columns)

(training, test) = data.randomSplit([0.7, 0.3], seed=100)

pipeline = PipelineModel.read().load(
    "hdfs://namenode:9000/" + modelsPath + "/lr.model.pipeline.savepoint")
testData = pipeline.transform(test)
print("Test Dataset Count: " + str(testData.count()))
testData.show(5)

##########################################################
##################  Train/load the model  ################
##########################################################
lrModel = LogisticRegressionModel.load(
    "hdfs://namenode:9000/" + modelsPath + "/lr.model.savepoint")
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 7) \
spark = SparkSession.builder.appName("BANK_MODELO").getOrCreate() # Lendo os dados do HDFS data = spark.read.csv("hdfs://elephant:8020/user/labdata/bank_small.csv", header=True, sep=";", inferSchema=True) data = data.selectExpr( *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns]) # with open('prediction_log.txt', 'w') as logFile: # logFile.write('testeeeee') # process from pyspark.ml import Pipeline, PipelineModel pipelineModel = PipelineModel.read().load('data_precossing_bank_mkt') data_model = pipelineModel.transform(data) data_model = data_model.select(["label", "features"]) # predict from pyspark.ml.classification import GBTClassifier gbtModel = GBTClassifier.load("modelo_bank_mkt") predictions_gbt = gbtModel.transform(data_model) #evaluate from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator evaluator_accuracy = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="accuracy")
def load_model(self, model_path):
    if model_path:
        sameModel = PipelineModel.read().load(model_path)
        return sameModel
    else:
        return None
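# For reference, a hedged sketch of the save side that load_model() mirrors: a fitted
# PipelineModel is persisted with write().overwrite().save(), and the same path can then
# be passed to load_model(). `fitted_model` and the path are hypothetical.
fitted_model.write().overwrite().save("models/pipeline_model")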
def basicPredictionPipeline(data, col_target="", first_pred_day=False,
                            dt_execution=False, jarra='quinto', logger=False,
                            verbose=True, checks=False):
    try:
        start_all = datetime.now()

        # Get parameters from config file
        number_of_models = 6
        parser = SafeConfigParser()
        parser.read(MODEL_CONFIG_FILE)
        local_save_path = parser.get('save_params', 'local_save_path')
        if not os.path.exists(local_save_path):
            os.makedirs(local_save_path)
        local_save_path = parser.get('save_params', 'local_save_path')

        # Define name of the variable for predictions
        cols_cyclical, cols_ohe_in, cols_features, col_target, cols_id = defineFeatures(
            model_complex='first',
            use_clustered_data_sets=False,
            col_target=col_target,
            verbose=False,
            logger=False)
        cols_ohe_out = [s + '_catVec' for s in cols_ohe_in]

        if first_pred_day is not None:
            split_value = first_pred_day
        else:
            split_value = datetime.today()
        first_pred_day = split_value.strftime('%Y-%m-%d')
        split_value = split_value.strftime('%Y-%m-%d')
        if not dt_execution:
            dt_execution = split_value

        s3_save_path = parser.get('save_params', 's3_save_path')
        s3_save_pipelines_path = (s3_save_path + 'pipelines/' + col_target +
                                  '/dt-execution=' + dt_execution + '/')

        # Connect to spark session
        spark = createSparkSession(jarra='mass', verbose=True, logger=logger)

        # Load data prep and model pipelines from S3 for model training run on dt_execution:
        if verbose:
            logger.info('Loading data preparation and model pipelines lists from ' +
                        s3_save_pipelines_path)
        pipelinePrepList = []
        fitList = []
        for i in range(number_of_models):
            pipelinePrepList.append(
                PipelineModel.read().load(s3_save_pipelines_path +
                                          "data_prep_pipeline" + str(i)))
            fitList.append(RandomForestRegressionModel.read().load(
                s3_save_pipelines_path + "model_pipeline" + str(i)))
        if verbose:
            logger.info('Loading data preparation and model pipelines lists end')

        # Add cyclical variables to features lists, OHE_out not as they are already in pipelines
        cols_cyclical_sin = [s + '_sin' for s in cols_cyclical]
        cols_cyclical_cos = [s + '_cos' for s in cols_cyclical]
        cols_cyclical_out = cols_cyclical_sin + cols_cyclical_cos
        for i in range(len(cols_features)):
            cols_features[i] = cols_features[i] + cols_cyclical_out

        # Create list with start and end dates for each of the consecutive models
        start_days_list, end_days_list = createTestDatesListWFV(
            split_value, verbose=verbose, logger=logger)

        # Define date filters for test/pred sets of each of the consecutive models
        filterPredStartList = []
        filterPredEndList = []
        for i in range(len(start_days_list)):
            filterPredStartList.append(
                col('dt_flight_date_local') >= start_days_list[i])
            filterPredEndList.append(
                col('dt_flight_date_local') <= end_days_list[i])

        # Create list with test data sets for each of the consecutive models; each data set has
        # different features and dates. A data list for rows/flights with Nulls
        # (e.g. no historical data) is created separately.
        test_data_list, test_data_basic_list = createTestDataLists(
            data, cols_features, cols_ohe_in, col_target, cols_id,
            filterPredStartList, filterPredEndList, spark, verbose, logger)

        # Transform string indexer, ohe, vector assembler using pipeline from training
        if verbose:
            logger.info('String indexer, one hot encoder and vector assembler test sets, start')
        testDataList = []
        testDataBasicList = []
        for i in range(len(test_data_list)):
            if verbose:
                logger.info('Model ' + str(i))
            testDataList.append(pipelinePrepList[i].transform(test_data_list[i]))

        if verbose:
            logger.info('RF Model start')
        # Apply RF model to data using pipeline from training
        resultsList = []
        resultsBasicList = []
        for i in range(len(testDataList)):
            # Use the test set; this creates an extra column 'col_target' with the test fit results
            resultsList.append(fitList[i].transform(
                testDataList[i]).select(cols_id + [col_target + '_pred']))
        if verbose:
            logger.info('RF Model end')

        # Union dataframes with results for each model as one dataframe (to get the full results)
        resultsFull = resultsList[0]
        resultsFull = resultsFull.union(resultsBasicList[0])
        for i in range(1, len(test_data_list)):
            resultsFull = resultsFull.union(resultsList[i])
            resultsFull = resultsFull.union(resultsBasicList[i])
        resultsFull.cache()
        resultsFull = resultsFull.withColumn('dt_flight_date_local',
                                             to_date('dt_flight_date_local'))

        # Add execution date column
        resultsFull = resultsFull.withColumn('dt_execution', lit(first_pred_day))
        resultsFull = resultsFull.withColumn('dt_execution', to_date('dt_execution'))

        # Save prediction results locally for each model separately
        if verbose:
            logger.info('Changing data frame to Pandas to save in local')
        model_results = resultsFull.toPandas()
        if not os.path.isdir(local_save_path):
            os.mkdir(local_save_path)
        model_results\
            .to_csv(local_save_path + col_target + '_results_' +
                    first_pred_day.replace('-', '_') + '.csv', index=False)
        if verbose:
            logger.info('Results saved in: ' + local_save_path + col_target +
                        '_results_' + first_pred_day.replace('-', '_') + '.csv')

        # Get feature importances
        featureImportancesFirst, featureImportancesLast, feature_importances_all = calcFeatImportance(
            fitList, testDataList, col_target, first_pred_day, verbose, logger)

        # Save feature importance for the given target variable
        feature_importances_all.\
            to_csv(local_save_path + col_target + '_feat_importance_' +
                   first_pred_day.replace('-', '_') + '.csv', index=False)

        end_all = datetime.now()
        if verbose:
            logger.info('Random Forest, all models, time: ' + str(end_all - start_all))
            logger.info('Feature importance saved in: ' + local_save_path + col_target +
                        '_feat_importance_' + first_pred_day.replace('-', '_') + '.csv')
            logger.info('Check sum of predicted variables per month and count of flights each month: ')

        # Calculate metrics for mlflow
        if verbose and checks:
            df_prediction_errors, pred_errors = calcTrainingSetError(
                number_of_last_days_to_eval=90,
                last_dt_exec_to_evaluate=False,
                list_exec_dates_to_evalute=False,
                remove_outliers=True,
                verbose=True,
                logger=logger,
                checks=True)
            checkDuplicates = resultsFull.drop_duplicates(
                subset=['dt_flight_date_local', 'cd_num_flight',
                        'cd_airport_pair', 'cd_carrier']).count() - resultsFull.count()
            resultsFullCount = resultsFull.count()
            # Count sum of rows in all test sets
            testSetCount = np.sum(
                [testDataList[i].count() for i in range(len(testDataList))])
            testBasicSetCount = np.sum([
                testDataBasicList[i].count()
                for i in range(len(testDataBasicList))
            ])
            logger.info('Sum of flights per month (real values): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    count("cd_airport_pair")).sort(
                        "dt_flight_year_month").toPandas())
            logger.info('Sum of predicted ' + col_predict + ' per month (all flights): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    sum(col_predict)).sort("dt_flight_year_month").toPandas())
            logger.info('Number of duplicated flights: ')
            logger.info('Number of rows/flights in test sets: ' + str(testSetCount))
            logger.info('Number of rows/flights in basic model test sets: ' + str(testBasicSetCount))
            logger.info('Number of flights/rows in prediction set:')
            logger.info(resultsFullCount)
            logger.info('Feature importances for the first model (flights this week):')
            logger.info(featureImportancesFirst)
            logger.info('Feature importances for the last model:')
            logger.info(featureImportancesLast)

            mlflow_params = {
                'checkDuplicates': checkDuplicates,
                'resultsFullCount': resultsFullCount,
                'testSetCount': testSetCount,
                'testBasicSetCount': testBasicSetCount,
                'predDateMin': str(resultsFull.toPandas().dt_flight_date_local.min()),
                'predDateMax': str(resultsFull.toPandas().dt_flight_date_local.max()),
                'time_seconds': (end_all - start_all).total_seconds()
            }
        else:
            mlflow_params = {}

        # spark.stop()
        # if verbose:
        #     logger.info('Spark Session stopped')

    except Exception:
        logger.exception("Fatal error in demand_forecast_pred()")
        raise

    return (mlflow_params, pred_errors)
#```python
#pipeline_loaded = Pipeline.load("models/pipeline")
#```

# ## Train the model

pipeline_model = pipeline.fit(rides)

# ## Save the model

# Save the pipeline model to HDFS:
pipeline_model.write().overwrite().save("models/pipeline_model")

# Read the model back from the HDFS directory:
from pyspark.ml import PipelineModel
pipeline_model_loaded = PipelineModel.read().load("models/pipeline_model")

# ## Examine and evaluate the classification algorithm

# Extract the classifier model from stage 5:
classifier_model = pipeline_model.stages[5]
type(classifier_model)

# Use the `toDebugString` attribute to print the classification tree:
print(classifier_model.toDebugString)

# Use the `transform` method to apply the pipeline model to a DataFrame and produce predictions (DataFrame --> DataFrame):
classified = pipeline_model.transform(rides)

# Use the `persist` method to cache the DataFrame in memory:
classified.persist()
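# A small hedged check that the reloaded pipeline behaves like the in-memory one: apply
# pipeline_model_loaded to the same DataFrame. The "prediction" column selected here is
# the classifier's default output name and is an assumption.
classified_loaded = pipeline_model_loaded.transform(rides)
classified_loaded.select("prediction").show(5)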
    limit(1000)
data.cache()
# data.count()
# data.show()

model = pipeline.fit(data)

with open('/pos_tagger/resources/channel.txt', 'r') as content_file:
    content = content_file.read()

test_data = spark.sparkContext.parallelize([[content]]).toDF().toDF("text")
test_data.show()

res_data = model.transform(test_data)
res_data.show()

pos = res_data.collect()
result = pos[0].finished_pos

try:
    with open("/pos_tagger/resources/output.txt", "w") as output_file:
        output_file.write(result)
except Exception as e:
    print("An exception occurred while writing the output file:", e)

model.write().overwrite().save("/pos_tagger/resources/pipeline_trained/")
load_model = PipelineModel.read().load("/pos_tagger/resources/pipeline_trained/")
# load_model.transform(data).show()
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import DoubleType
from pyspark import SparkFiles

url = "https://s3-us-west-2.amazonaws.com/mlapi-samples/demo/data/input/iris.csv"
spark.sparkContext.addFile(url)

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv(SparkFiles.get("iris.csv"), header=True)
data = data.withColumn("sepal_length", data["sepal_length"].cast(DoubleType()))
data = data.withColumn("sepal_width", data["sepal_width"].cast(DoubleType()))
data = data.withColumn("petal_width", data["petal_width"].cast(DoubleType()))
data = data.withColumn("petal_length", data["petal_length"].cast(DoubleType()))

pipeline = Pipeline.read().load("classification-pipeline")
model = PipelineModel.read().load("classification-model")

# Make predictions.
predictions = model.transform(data)

# Select example rows to display.
predictions.select("predictedLabel", "species", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
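# Because both the unfitted Pipeline and the fitted PipelineModel are loaded above, the
# estimator can also be refit on fresh data and saved back out; a hedged sketch using the
# iris DataFrame loaded above, with a hypothetical output path.
refit_model = pipeline.fit(data)
refit_model.write().overwrite().save("classification-model-refit")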
def _read(self):
    return PipelineModel.read().load(self._file_path)