def train_and_save_pipeline(model_path: str, data_path: str, estimator_class: type, params: dict = None):
    """
    Train and save a pipeline containing a single estimator with the specified parameters.

    This is the action called from the command line handler for a train subcommand.

    :param model_path: where to save the trained pipeline model
    :param data_path: data on which to train the model
    :param estimator_class: the estimator to train
    :param params: dictionary of optional estimator parameters
    """

    def snake_keys_to_camel(snake_dict: dict):
        def snake_to_camel(text: str):
            return re.sub(r"_([a-zA-Z0-9])", lambda m: m.group(1).upper(), text)

        return dict((snake_to_camel(k), v) for k, v in snake_dict.items())

    data = spark().read.load(data_path)
    # Command names taken from Click parameters are lowercase underscore-delimited snake_case strings,
    # while Spark estimators take camel-case parameters.
    params = snake_keys_to_camel(dict(params or {}))
    estimator = estimator_class().setParams(**params)
    pipeline = Pipeline(stages=[estimator]).fit(data)
    pipeline.save(model_path)
    logging.info(f"Created pipeline model in {model_path}")
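# A minimal usage sketch of the function above. The paths, estimator choice, and
# Click wiring are hypothetical; it assumes the module-level spark() helper and
# imports from the surrounding module.
from pyspark.ml.classification import LogisticRegression

train_and_save_pipeline(
    model_path="/models/lr-pipeline",
    data_path="/data/training.parquet",
    estimator_class=LogisticRegression,
    params={"max_iter": 10, "reg_param": 0.001},  # converted to maxIter / regParam
)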
def main():
    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    # 2. Load data
    text_file = sc.textFile("s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

    # 3. Transform data
    af = text_file.map(getVals)

    # 4. Create a DataFrame out of this using the toDF method and cache it
    afdf = af.toDF([
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'duration'
    ]).cache()

    # Assemble the raw columns into a single "features" vector and treat 'duration'
    # as the regression target. (The original snippet referenced "features" and
    # "label" columns that were never created; this is one plausible reconstruction.)
    assembler = VectorAssembler(
        inputCols=['acousticness', 'danceability', 'energy',
                   'instrumentalness', 'liveness', 'loudness'],
        outputCol="features")
    afdf = assembler.transform(afdf).withColumnRenamed('duration', 'label')

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(afdf)

    # 5. Create a train/test split with 70% of the data in the training set and 30% in the test set
    afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model. This also runs the indexer.
    model = pipeline.fit(afdf_train)

    # Make predictions.
    predictions = model.transform(afdf_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    rfModel = model.stages[1]
    print(rfModel)  # summary only

    # 6. Export the fitted forest and the unfit pipeline
    rfModel.save('s3a://spotifybuck/model-export' + datetime.now().strftime('%Y%m%d%H%M'))
    pipeline.save('s3a://spotifybuck/pipeline-export' + datetime.now().strftime('%Y%m%d%H%M'))

    sc.stop()
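# To reuse those artifacts later, each is restored with the matching loader.
# A minimal sketch; the paths are placeholders for the timestamped paths
# written above.
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressionModel

loaded_rf = RandomForestRegressionModel.load('s3a://spotifybuck/model-export201701010000')
loaded_pipeline = Pipeline.load('s3a://spotifybuck/pipeline-export201701010000')  # unfit; call .fit() to retrain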
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def test2():
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA])
    print(type(pipeA))
    pipeA.save('testA.pipe')

    pipeAA = PysparkPipelineWrapper.unwrap(Pipeline.load('testA.pipe'))
    stagesAA = pipeAA.getStages()
    trAA = stagesAA[0]
    print(trAA.dataset_count)
def test3():
    sparkSession = (SparkSession.builder
                    .master("local[*]")
                    .appName("test")
                    .config("spark.jars", "file:///E:/tmp/mysql-connector-java-5.1.39.jar")
                    .getOrCreate())
    dfA = make_a_dataframe(sparkSession.sparkContext)
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA]).fit(dfA)
    print(type(pipeA))
    pipeA.save('testB.pipe')

    pipeAA = PysparkPipelineWrapper.unwrap(PipelineModel.load('testB.pipe'))
    stagesAA = pipeAA.stages
    trAA = stagesAA[0]
    print(trAA.dataset_count)

    dfB = pipeAA.transform(dfA)
    dfB.show()
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))
        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def test_save_pipeline(spark_context, classification_model):
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(10)
    estimator.set_batch_size(10)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    pipeline.save('tmp')
def kmeansresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    # Note: the clustering runs on the raw TF features, so the IDF stage's output is unused here.
    kmeans = KMeans(k=8, seed=1, featuresCol='rawFeatures', maxIter=10, initMode='random')

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans])
    pipeline.save("KMeansPipeline")
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # "display" is for Databricks; use show() for OSS Apache Spark
    # results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv('kmeansresultsCanadaAndProductsAndDisastersAndClaritin.csv')
    model.stages[-1].save("KMeansModel")
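# A sketch of loading those artifacts back, assuming the same working directory.
# "KMeansPipeline" holds the unfit pipeline; "KMeansModel" holds only the fitted
# final clustering stage, so it is restored with KMeansModel.load.
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeansModel

unfit_pipeline = Pipeline.load("KMeansPipeline")
kmeans_model = KMeansModel.load("KMeansModel")
print(kmeans_model.clusterCenters()[0])  # inspect one learned centroid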
(3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.001) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) path = 'tmp/spark-logistic-regression-model' model.save(path) pipeline.save("tmp/unfit-lr-model") sameModel = PipelineModel.load(path) # Prepare test documents, which are unlabeled (id, text) tuples. test = spark.createDataFrame([ (4, "spark i j k"), (5, "l m n"), (6, "spark hadoop spark"), (7, "apache hadoop") ], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test)
models = []
for column in response_cols:
    train_df = train_df.withColumn(column, encode_response(column))
    models.append(
        RandomForestClassifier(labelCol=column,
                               featuresCol="scaledFeatures",
                               numTrees=15)
        .setPredictionCol(column + "_pred")
        .setRawPredictionCol(column + "_pred_raw")
        .setProbabilityCol(column + "_proba"))

# Create a list of all transformers
stages = list()
stages.extend(quantile_discretizers_numeric)
stages.extend(string_indexer_categorical)
stages.extend(id_feature_hashers)
stages.extend(tweet_countVectorizers)
stages.extend(doc2vecs)
stages.append(feature_assambler)
stages.append(scaler)
stages.extend(models)

# Create Pipeline
pipeline = Pipeline(stages=stages)

# Fit Pipeline and transform df
pipeline = pipeline.fit(train_df)

# pipeline.save("pipeline")
pipeline.save("hdfs:///user/e1553958/RecSys/pipeline")
models = []
for column in response_cols:
    train_df = train_df.withColumn(column, encode_response(column))
    models.append(LogisticRegression(labelCol=column,
                                     featuresCol="scaledFeatures",
                                     maxIter=1000,
                                     regParam=0.001,
                                     predictionCol=column + '_pred',
                                     probabilityCol=column + '_proba',
                                     rawPredictionCol=column + '_pred_raw'))

# Create a list of all transformers
stages = list()
stages.extend(quantile_discretizers_numeric)
stages.extend(string_indexer_categorical)
stages.extend(id_feature_hashers)
# stages.extend(tweet_countVectorizers)
stages.append(feature_assambler)
stages.append(scaler)
stages.extend(models)

# Create Pipeline
pipeline = Pipeline(stages=stages)

# Fit Pipeline and transform df
pipeline = pipeline.fit(train_df)

# pipeline.save("pipeline")
pipeline.save("hdfs:///user/e1553958/RecSys/datasplit/pipeline_logReg")
def main():
    logger.info(f"Getting dataset from {path_to_train_dataset}...")
    client_storage = storage.Client()
    storage_bucket = client_storage.get_bucket(bucket)
    data = get_dataset().select(['sentiment', 'text'])

    logger.info(f"Current number of partitions: {data.rdd.getNumPartitions()}")
    data = data.repartition(10)
    logger.info(f"After repartition, number of partitions: {data.rdd.getNumPartitions()}")

    logger.info("Creating preprocessing transformers...")
    pt = PreprocessTransformer(inputCol='text', outputCol='text_clean')

    logger.info("Creating feature engineering transformers...")
    # statement = """
    #     SELECT *
    #     FROM __THIS__
    #     WHERE text_clean != ''
    # """
    # flt = SQLTransformer(statement=statement)
    tk = Tokenizer(inputCol='text', outputCol='words')
    ng1 = NGram(n=1, inputCol='words', outputCol='1_gr_words')
    ng2 = NGram(n=2, inputCol='words', outputCol='2_gr_words')
    ng3 = NGram(n=3, inputCol='words', outputCol='3_gr_words')
    # Backticks are required because the column names start with digits.
    statement = """
        SELECT *, concat(`1_gr_words`, `2_gr_words`, `3_gr_words`) c_words
        FROM __THIS__
    """
    cnt = SQLTransformer(statement=statement)
    cv = CountVectorizer(inputCol='c_words', vocabSize=80000, outputCol='features')

    logger.info("Split dataset...")
    df_train, df_test = data.randomSplit([0.8, 0.2], seed=100500)
    logger.info(f"Size of train dataset: {df_train.count()} and test dataset: {df_test.count()}")

    logger.info("Building and fitting model...")
    lr = LogisticRegression(featuresCol='features', labelCol='sentiment', maxIter=5000)
    pipeline_model = Pipeline(stages=[pt, tk, ng1, ng2, ng3, cnt, cv, lr]).fit(df_train)

    logger.info("Evaluating model...")
    ev = MulticlassClassificationEvaluator(labelCol='sentiment',
                                           metricName="accuracy",
                                           predictionCol='prediction')
    df_predict = pipeline_model.transform(df_test).cache()
    accuracy = ev.evaluate(df_predict)
    logger.info(f"Model accuracy: {accuracy}")

    logger.info("Storing model...")
    storage.Blob(f'models/{model_version}/scores',
                 storage_bucket).upload_from_string(f'"accuracy":{accuracy}')
    pipeline_model.save(f"gs://{bucket}/models/{model_version}/pipeline")
sqlContext = SQLContext(sc)

regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

stop_words = []
with open('/home/asdf/Documents/stopwords.txt', 'r') as contents:
    stop_words = contents.read().split()
stop_words_remover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(stop_words)

count_vectors = CountVectorizer(inputCol="filtered",
                                outputCol="features",
                                vocabSize=10000,
                                minDF=5)

lr = LogisticRegression(maxIter=100, regParam=0.01)
nb = NaiveBayes(labelCol="label",
                featuresCol="features",
                smoothing=1.0,
                modelType="multinomial")

pipe1 = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, lr])
pipe2 = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, nb])
pipe1.save("models/lr")
pipe2.save("models/nb")
assembler = VectorAssembler(inputCols=sparseVectorCols, outputCol='features')
pipelineStages += [assembler]

normalizer = Normalizer(inputCol='features', outputCol='normFeatures')
pipelineStages += [normalizer]

pipeline = Pipeline(stages=pipelineStages)
pipelineModel = pipeline.fit(train_df)
train_df = pipelineModel.transform(train_df)

# DataFrame.drop returns a new DataFrame, so the result must be reassigned;
# the original discarded it, leaving train_df unchanged.
for col in COLUMNS_OHE:
    train_df = train_df.drop(col, col + '_VEC')
for col in COLUMNS_HIGH_CARD:
    train_df = train_df.drop(col, col + '_INDEX')
train_df = train_df.drop('features')

train_df = spark.createDataFrame(train_df.rdd, schema=train_df.schema)

pipelineModel.save(PIPELINE_DIR + 'pipeline_model_preprocess')
pipeline.save(PIPELINE_DIR + 'pipeline_preprocess')
train_df.write.parquet(WRITE_DIR + 'train_clean.parquet')
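# At scoring time the two saved artifacts are restored with the matching loaders.
# A minimal sketch, assuming the same PIPELINE_DIR constant; new_df stands in for
# a DataFrame with the raw input columns (hypothetical).
from pyspark.ml import Pipeline, PipelineModel

# Fitted preprocessing: transforms new data as-is.
preprocess_model = PipelineModel.load(PIPELINE_DIR + 'pipeline_model_preprocess')
# Unfit pipeline: for refitting on fresh training data.
preprocess_pipeline = Pipeline.load(PIPELINE_DIR + 'pipeline_preprocess')

scored = preprocess_model.transform(new_df)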
class SPARK_MODEL:
    # init model params
    def __init__(self, dataset, dataName, splitRatio, targetType, targetVariable,
                 split, nbSamples, goodClass, sparkModelsId, sparkLearningMethods,
                 sparkOptions, numClasses, extDataSet):
        self.dataset = dataset
        self.dataName = dataName
        self.splitRatio = splitRatio
        self.targetType = targetType
        self.targetVariable = targetVariable
        self.split = split
        self.nbSamples = nbSamples
        self.goodClass = goodClass
        self.sparkModelsId = sparkModelsId
        self.sparkLearningMethods = sparkLearningMethods
        self.sparkOptions = sparkOptions
        self.numClasses = numClasses
        self.extDataSet = extDataSet

    # rdd methods
    def _set_rdd(self, dataset):
        self._rdd = sc.textFile(dataset, 8)
        header = self._rdd.first()
        self._rdd = self._rdd.filter(lambda line: line != header)
        if self.targetType == 'classification':
            print("class")
            self._rdd = self._rdd.map(classParsePoint)
        else:
            self._rdd = self._rdd.map(regParsePoint)
        print(self._rdd.first())

    def _get_rdd(self):
        return self._rdd

    def _get_rddTest(self):
        return self._rddTest

    def _get_rddTraining(self):
        return self._rddTraining

    def _get_rddModel(self):
        return self._rddModel

    # model building: rdd
    def _set_rddModel(self, _type, _SLA, data):
        if _type == 'regression':
            if _SLA == 'randomForest':
                self._rddModel = RandomForest.trainRegressor(
                    data,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity='variance',
                    maxDepth=int(self.sparkOptions[1]),
                    maxBins=32)
            else:
                self._rddModel = ""
        else:  # classification
            if _SLA == 'randomForest':
                print(self.numClasses)
                self._rddModel = RandomForest.trainClassifier(
                    data,
                    numClasses=self.numClasses,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2])
            else:
                self._rddModel = ""

    def splitData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self._rdd.randomSplit(
                [1 - self.splitRatio, self.splitRatio])
        else:
            print("ExternalValidation")
            self._rddTraining = self._rdd
            self._rddTest = sc.textFile(self.extDataSet, 8)
            header = self._rddTest.first()
            self._rddTest = self._rddTest.filter(lambda line: line != header)
            if self.targetType == 'classification':
                self._rddTest = self._rddTest.map(classParsePoint)
            else:
                self._rddTest = self._rddTest.map(regParsePoint)

    # rdd/dataFrame methods
    def rddToDataFrame(self, rdd):
        return rdd.toDF()

    def dataFrameToRdd(self, dataFrame):
        return dataFrame.rdd

    # dataFrame methods
    def _set_dataFrame(self):
        self._dataFrame = sqlContext.read.format('csv').options(
            delimiter=';', header='true', inferschema='true',
            nullValue='').load(self.dataset)
        # Note: the original referenced self.dataFrame here, before the property
        # was usable; the underlying attribute is what is meant.
        self._dataFrame = self._dataFrame.withColumn(
            self.targetVariable,
            self._dataFrame[self.targetVariable].cast("double"))

    def _get_dataFrame(self):
        return self._dataFrame

    def _get_dataFrameTest(self):
        return self._dataFrameTest

    def _get_dataFrameTraining(self):
        return self._dataFrameTraining

    def splitDataFrameData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self.dataFrameToRdd(
                self._get_dataFrame()).randomSplit(
                    [1 - self.splitRatio, self.splitRatio])
        else:
            self.splitData()
        self._dataFrameTest = self._rddTest.toDF()
        self._dataFrameTraining = self._rddTraining.toDF()

    def _get_dataFrameModel(self):
        return self._dataFrameModel

    def _get_pipeline(self):
        return self._pipeline

    def _get_crossval(self):
        return self._crossval

    def _get_paramGrid(self):
        return self._paramGrid

    def _get_regEval(self):
        return self._regEval

    # model building: dataframe
    def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):
        if _type == 'regression':
            if _SLA == 'randomForest':
                rf = RandomForestRegressor()
                # Note: the original also chained .setProbabilityCol("proba"),
                # but RandomForestRegressor has no probability column; dropped.
                rf.setLabelCol(self.targetVariable)\
                  .setPredictionCol("prediction")\
                  .setFeaturesCol("features")\
                  .setSeed(100088121)\
                  .setMaxDepth(int(self.sparkOptions[1]))\
                  .setMaxMemoryInMB(10000)\
                  .setFeatureSubsetStrategy(self.sparkOptions[5])
                self._regEval = RegressionEvaluator(
                    predictionCol="prediction",
                    labelCol=self.targetVariable,
                    metricName="rmse")
        else:  # classification
            if _SLA == 'randomForest':
                rf = RandomForestClassifier(
                    labelCol=self.targetVariable,
                    featuresCol="features",
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2],
                    probabilityCol="proba")
                if self.goodClass != '':
                    self._regEval = BinaryClassificationEvaluator(
                        labelCol=self.targetVariable, metricName="areaUnderROC")
                else:
                    self._regEval = MulticlassClassificationEvaluator(
                        labelCol=self.targetVariable,
                        predictionCol="prediction",
                        metricName="accuracy")

        # Create a Pipeline and set its stages
        self._pipeline = Pipeline()
        self._pipeline.setStages([vecAssembler, rf])

        # GridSearch over numTrees
        self._paramGrid = (ParamGridBuilder().addGrid(
            rf.numTrees,
            [int(num) for num in self.sparkOptions[4].split(',')]).build())

        # Add the grid to the CrossValidator
        self._crossval = CrossValidator(estimator=self._pipeline,
                                        estimatorParamMaps=self._paramGrid,
                                        evaluator=self._regEval,
                                        numFolds=self.nbSamples)

        # Now let's find and keep the best model
        self._dataFrameModel = self._crossval.fit(data).bestModel

        rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
                str(self.sparkModelsId[0]))

    # model evaluation: classification
    def computeKappa(self, m):
        total = np.sum(m)
        row = m.sum(axis=0)
        col = m.sum(axis=1)
        P0 = m.trace() / total
        PE = sum((row[i] / total) * (col[i] / total) for i in range(m.shape[0]))
        return (P0 - PE) / (1 - PE)

    def computeBA(self, m):
        col = m.sum(axis=1)
        return sum(m[i][i] / col[i] for i in range(m.shape[0])) / m.shape[0]

    # rdd model evaluation
    def getRddPredictionsLabels(self, model, test_data):
        predictions = model.predict(test_data.map(lambda r: r.features))
        return predictions.zip(test_data.map(lambda r: r.label))

    def printRddMulticlassClassificationMetrics(self, predictions_and_labels):
        metrics = MulticlassMetrics(predictions_and_labels)
        print("KAPPA=" + str(self.computeKappa(np.array(metrics.confusionMatrix().toArray()))))
        print("BA=" + str(self.computeBA(np.array(metrics.confusionMatrix().toArray()))))
        CMarray = metrics.confusionMatrix().toArray()
        print("CM=" + str(CMarray))

    def printRddBinaryClassificationMetrics(self, predictions_and_labels):
        # Note: the original built BinaryClassificationMetrics here, which has no
        # confusionMatrix(); MulticlassMetrics provides the confusion-matrix scores.
        metrics = MulticlassMetrics(predictions_and_labels)
        print("KAPPA=" + str(self.computeKappa(np.array(metrics.confusionMatrix().toArray()))))
        print("BA=" + str(self.computeBA(np.array(metrics.confusionMatrix().toArray()))))
        CMarray = metrics.confusionMatrix().toArray()
        print("CM=" + str(CMarray))

    def evaluateRddClassificationModel(self):
        predictions_and_labels = self.getRddPredictionsLabels(
            self._get_rddModel(), self._get_rddTest())
        if self.goodClass != '':  # binary classification
            # self.printRddBinaryClassificationMetrics(predictions_and_labels)
            self.printRddMulticlassClassificationMetrics(predictions_and_labels)
        else:
            self.printRddMulticlassClassificationMetrics(predictions_and_labels)

    def evaluateRddRegressionModel(self):
        # Get predictions
        valuesAndPreds = self.getRddPredictionsLabels(self._get_rddModel(),
                                                      self._get_rddTest())
        # Instantiate metrics object
        metrics = RegressionMetrics(valuesAndPreds)
        # Squared Error
        print("MSE = %s" % metrics.meanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)
        # R-squared
        print("R-squared = %s" % metrics.r2)
        # Mean absolute error
        print("MAE = %s" % metrics.meanAbsoluteError)
        # Explained variance
        print("Explained variance = %s" % metrics.explainedVariance)

    def evaluateDataFrameRegressionModel(self):
        # Use the fitted model to compute evaluation metrics for the test dataset
        predictionsAndLabelsDF = self._dataFrameModel.transform(self._dataFrameTest)
        # Run the previously created RMSE evaluator, regEval, on the predictions
        rmseRF = self._regEval.evaluate(predictionsAndLabelsDF)
        # Now compute the r2 evaluation metric for the test dataset
        r2RF = self._regEval.evaluate(predictionsAndLabelsDF,
                                      {self._regEval.metricName: "r2"})
        print("RMSE = %s" % rmseRF)
        print("R-squared = %s" % r2RF)

    def evaluateDataFrameClassificationModel(self, sc):
        # TODO: not implemented yet
        pass

    # save models
    def saveRddModel(self, sc):
        # save RDD API model
        modelPath = ("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                     str(self.sparkModelsId[0]))
        remove_folder(modelPath)
        self._rddModel.save(sc, modelPath)

    def saveDataFrameModel(self):
        # final model to save, refit on the full dataset
        self._dataFrameModel = self._crossval.fit(self._dataFrame).bestModel
        modelText = str(self._dataFrameModel.stages[-1])
        nbTrees = int(re.sub('.*?([0-9]*) trees$', r'\1', modelText))
        print(nbTrees)
        # save DataFrame API model
        modelPath = ("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                     str(self.sparkModelsId[0]))
        remove_folder(modelPath)
        self._dataFrameModel.save(modelPath)
        self._pipeline.save(modelPath + "_Pipeline")

    def buildRDDModel(self, sparkContext):
        print("RDD_MODEL")
        # init RDD from dataset
        self._set_rdd(self.dataset)
        # split into test / training set
        self.splitData()

        # save rddTest and rddTraining into CSV and copy to the PLP server
        self._rddTraining.toDF().toPandas().to_csv(
            '/home/t752887/data/output/' + self.sparkModelsId[0] + '_' +
            self.dataName + '_training.csv')
        self._rddTest.toDF().toPandas().to_csv(
            '/home/t752887/data/output/' + self.sparkModelsId[0] + '_' +
            self.dataName + '_test.csv')

        # could become a loop over models
        if self.targetType == 'classification':
            self._set_rddModel('classification', 'randomForest',
                               self._get_rddTraining())
            self.evaluateRddClassificationModel()
            # final model to save
            self._set_rddModel('classification', 'randomForest', self._get_rdd())
        else:  # regression
            self._set_rddModel('regression', 'randomForest',
                               self._get_rddTraining())
            self.evaluateRddRegressionModel()
            # final model to save
            self._set_rddModel('regression', 'randomForest', self._get_rdd())

        self.saveRddModel(sparkContext)

    def buildDataFrameModel(self):
        # init dataframe from dataset
        self._set_dataFrame()
        # split into test / training set
        self.splitDataFrameData()

        # vector assembler
        ignore = [self.targetVariable]
        vecAssembler = VectorAssembler(
            inputCols=[x for x in self._dataFrameTraining.columns if x not in ignore],
            outputCol="features")

        # dataFrame cross-validation Pipeline with model selection
        if self.targetType == 'regression':
            # build model on the training data
            self._set_dataFrameModel('regression', 'randomForest',
                                     self._get_dataFrameTraining(), vecAssembler)
            # evaluate best model
            self.evaluateDataFrameRegressionModel()
            # save the model
            self.saveDataFrameModel()
        else:
            # build model on the training data (the original passed 'regression'
            # here too, which contradicts the branch; 'classification' matches it)
            self._set_dataFrameModel('classification', 'randomForest',
                                     self._get_dataFrameTraining(), vecAssembler)
            # TODO evaluate best model
            self.evaluateDataFrameClassificationModel(sparkContext)
            # TODO save the model
            self.saveDataFrameModel()

    def performModelSelection(self):
        # Model selection is needed when several numTrees values are given:
        # a comma-separated list cannot be parsed as a single float.
        try:
            float(self.sparkOptions[4])
            return 0
        except (ValueError, TypeError):
            return 1

    dataFrame = property(_get_dataFrame, _set_dataFrame)
    dataFrameTest = property(_get_dataFrameTest)
    dataFrameTraining = property(_get_dataFrameTraining)
    dataFrameModel = property(_get_dataFrameModel, _set_dataFrameModel)
    pipeline = property(_get_pipeline)
    crossval = property(_get_crossval)
    paramGrid = property(_get_paramGrid)
    regEval = property(_get_regEval)
    rdd = property(_get_rdd, _set_rdd)
    rddTest = property(_get_rddTest)
    rddTraining = property(_get_rddTraining)
    rddModel = property(_get_rddModel, _set_rddModel)
class BinaryRelevance():
    def __init__(self, featuresCol):
        self.models = []
        self.featuresCol = featuresCol
        self.data = None
        self.label_columns = None
        self.pipeline = None

    def fit(self, train, columns):
        # Train one logistic regression per label column (binary relevance).
        self.label_columns = columns
        for i in columns:
            print(i)
            lr = LogisticRegression(featuresCol=self.featuresCol,
                                    labelCol=i,
                                    predictionCol=i + '_pred',
                                    rawPredictionCol=i + '_rawPrediction',
                                    probabilityCol=i + '_prob')
            model = lr.fit(train)
            self.models.append(model)
        # The stages are already fitted models, so fitting this pipeline is a
        # pass-through; it is kept so the whole ensemble can be saved in one piece.
        self.pipeline = Pipeline(stages=self.models)

    def transform(self, data):
        if self.pipeline is None:
            self.pipeline = Pipeline(stages=self.models)
        data_predicted = self.pipeline.fit(data).transform(data)
        self.data = data_predicted
        return data_predicted

    def save(self, path):
        self.pipeline.save(path)

    def load(self, path):
        # The original hard-coded 'pipeline.pkl' here, ignoring the argument.
        self.pipeline = Pipeline.load(path)

    def find_recommendation(self, user, id_column):
        test = self.data
        predicted = [i for i in test.columns if i.endswith('pred')]
        label_columns = self.label_columns
        dict_val = test.where(test[id_column] == user).select(
            *label_columns).toPandas().to_dict()
        dict_pred = test.where(test[id_column] == user).select(
            *predicted).toPandas().to_dict()
        product_dict = {'current': [], 'recommendation': []}
        for i in zip(dict_val.items(), dict_pred.items()):
            # recommend items predicted positive that the user does not already hold
            if int(i[1][1][0]) == 1 and int(i[0][1][0]) == 0:
                product_dict['recommendation'].append(i[0][0])
            if int(i[0][1][0]) == 1:
                product_dict['current'].append(i[0][0])
        return product_dict
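# A usage sketch for the class above. The DataFrame, its 'features' vector
# column, the label columns, and the user id are all hypothetical.
labels = ['product_a', 'product_b']
br = BinaryRelevance(featuresCol='features')
br.fit(train_df, labels)            # one LogisticRegression per label column
scored = br.transform(train_df)     # adds product_*_pred / _prob columns
br.save('binary_relevance_pipeline')
print(br.find_recommendation(user=42, id_column='user_id'))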