def recommend_users(spark,
                    input_user,
                    input_video,
                    model,
                    user_indexer,
                    video_indexer,
                    user_language,
                    video_language,
                    num_recommend=20,
                    is_show=True):
    """Use the ALS model to recommend videos for every user.

    The recommendations are mapped from ALS indices back to the original
    user / video values, restricted to pairs whose user language equals the
    video language, and registered as the temporary view ``temp_table``.

    Args:
        spark: Spark session
        input_user: Name of the original user column; the indexed column is
            ``input_user + "_index"``
        input_video: Name of the original video column; the indexed column
            is ``input_video + "_index"``
        model: ALS model
        user_indexer: Fitted indexer whose ``labels`` decode user indices
        video_indexer: Fitted indexer whose ``labels`` decode video indices
        user_language: Name of the column holding the language of the user
        video_language: Name of the column holding the language of the video
        num_recommend: The maximum number of recommendation videos
        is_show: If true, the data would be shown
    """
    # Recommend for all users
    user_recs = model.recommendForAllUsers(num_recommend)

    # Turn the user index back into its original string value
    indexer_user = IndexToString(inputCol=input_user + "_index",
                                 outputCol=input_user,
                                 labels=user_indexer.labels)
    index_user = indexer_user.transform(user_recs)

    # Decode video indices by looking them up in a literal array of labels.
    video_labels = array(*[lit(x) for x in video_indexer.labels])
    recommendations = array(*[
        struct(
            video_labels[col("recommendations")[i][
                input_video + "_index"]].alias(input_video),
            # ALS always names the predicted-score field "rating"; alias it
            # explicitly instead of relying on the generated name "col2".
            col("recommendations")[i]["rating"].alias("score"))
        for i in range(num_recommend)
    ])
    recs = index_user.withColumn("recommendations", recommendations).select(
        input_user, "recommendations")
    explode_recs = (
        recs.select(input_user,
                    explode("recommendations").alias("recommendation"))
        .select(input_user, "recommendation.*")
        .select(input_user, input_video, "score"))

    # Keep only the pairs where user and video have the same language
    user_label = read_data_hive(spark, [input_user, user_language], is_show)
    video_label = read_data_hive(spark, [input_video, video_language],
                                 is_show)
    explode_recs_filter = explode_recs.join(
        user_label, input_user, "inner").join(video_label, input_video,
                                              "inner")
    explode_recs_filter = explode_recs_filter.filter(
        explode_recs_filter[user_language] ==
        explode_recs_filter[video_language])

    if is_show:
        explode_recs_filter.show(20)
    # registerTempTable is deprecated; this is its direct replacement.
    explode_recs_filter.createOrReplaceTempView("temp_table")
def test_index_to_string(self):
    """Convert an IndexToString transformer to ONNX and compare outputs."""
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    frame = self.spark.createDataFrame(rows, ["id", "category"])
    indexer_model = StringIndexer(inputCol="category",
                                  outputCol="categoryIndex").fit(frame)
    data = indexer_model.transform(frame)

    model = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory",
                          labels=['A', 'B', 'C'])
    # the ONNX input name has to match IndexToString.inputCol
    initial_types = [('categoryIndex', Int64TensorType([1, 1]))]
    model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                 initial_types)
    self.assertIsNotNone(model_onnx)

    # reference output produced by Spark itself
    expected = model.transform(data).select(
        "originalCategory").toPandas().values
    data_np = data.select('categoryIndex').toPandas().values.astype(
        numpy.int64)
    paths = save_data_models(data_np,
                             expected,
                             model,
                             model_onnx,
                             basename="SparkmlIndexToString")
    onnx_model_path = paths[3]
    output, _output_shapes = run_onnx_model(['originalCategory'], data_np,
                                            onnx_model_path)
    compare_results(expected, output, decimal=5)
def my_transform(rdd):
    """Turn an RDD of (user_name, whiskeyId) pairs into recommendations.

    For each incoming whiskey the best-matching user is looked up, that
    user's top five whiskeys are fetched, and the five ids are decoded to
    names using the labels stored in ``./index2whiskey1.json``.

    Returns:
        The RDD behind the resulting DataFrame (user_name, whiskeyId1..5,
        whiskey1..5).
    """
    top_n = 5

    with open("./index2whiskey1.json", mode="r", encoding="utf-8") as handle:
        whiskey_list = list(json.load(handle).values())

    model = ALSModel.load("hdfs://master/ALSModel1/")
    spark = SparkSession.builder.appName('sql coming~').getOrCreate()

    whiskey_rows = rdd.map(
        lambda x: Row(whiskeyId=int(x[1]), user_name=x[0]))
    whiskey_df = spark.createDataFrame(whiskey_rows)

    # One best user per incoming whiskey.
    predict = model.recommendForItemSubset(whiskey_df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )

    # Top whiskeys for each of those users, joined back to the user names.
    df_whiskey = model.recommendForUserSubset(df_user, top_n)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(whiskey_df, on=['whiskeyId'], how='left')

    id_columns = [
        result_df["recommendations"][pos].whiskeyId.alias(
            "whiskeyId%d" % (pos + 1)) for pos in range(top_n)
    ]
    result_df = result_df.select("user_name", *id_columns)

    # Decode each id column into its whiskey name, in rank order.
    for rank in range(1, top_n + 1):
        converter = IndexToString(inputCol="whiskeyId%d" % rank,
                                  outputCol="whiskey%d" % rank,
                                  labels=whiskey_list)
        result_df = converter.transform(result_df)

    return result_df.rdd
def Customer_List(model, user):
    """Show the distinct customers of ``data_train`` as original ids.

    ``model`` is accepted but not used by this function.
    """
    # One row per distinct customer, with ``user`` attached as the item
    customers = data_train.select("userid").distinct()
    customers = customers.withColumn("item", lit(user))

    # Convert the index back to the original CustomerID; no labels are
    # given, so IndexToString reads them from the column metadata.
    to_customer_id = IndexToString(inputCol="userid",
                                   outputCol="List of Customers")
    decoded = to_customer_id.transform(customers)

    only_names = decoded.drop("userid").drop("item")
    only_names.show()
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    """Train a Naive Bayes classifier and score or print test predictions.

    Args:
        train_x: Training documents, passed to ``elizabeth.load``.
        train_y: Training labels, passed to ``elizabeth.load``.
        test_x: Test documents, passed to ``elizabeth.load``.
        test_y: Optional test labels. When given, the mean accuracy is
            shown; otherwise the predicted classes are printed one per line.
        idf: If true, an IDF stage is added after the count vectorizer.
        ngram: N-gram size (converted with ``int``).
        base: Passed through to ``elizabeth.load``.
        asm: If true load the 'asm' kind, otherwise the 'bytes' kind.
    """
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already
    # have all the tokens.
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # Convert the string labels to numeric indices. handleInvalid="skip"
    # lets the indexer cope with labels that weren't seen during fitting.
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction,
    # probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    # DF[id, url, ... prediction, predictedClass]
    test = index_labeller.transform(test)

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        # show() prints the table itself and returns None, so it must not
        # be wrapped in print().
        test.show()

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
def __predict_sentiment(self, schema):
    """Predict the sentiment label for a tweet-formatted input.

    The input is run through ``self.pipelineFit`` and ``self.model``, and
    the numeric prediction is decoded with the labels of pipeline stage 3.

    Returns:
        The first predicted row as a dict, e.g.
        {"predicted_label": "POSITIVE"}
    """
    features = self.pipelineFit.transform(schema)
    scored = self.model.transform(features)

    label_names = self.pipelineFit.stages[3].labels
    to_label = IndexToString(inputCol="prediction",
                             outputCol="predicted_label",
                             labels=label_names)

    rows = to_label.transform(scored).select("predicted_label").collect()
    return rows[0].asDict()
def index2string(df, columns: list, param):
    """Replace indexed columns with their string labels, in place.

    For every name in ``columns`` the labels are read from
    ``param[name + "_labels"]`` and the column is overwritten with the
    decoded values, so the column names of ``df`` stay the same.
    """
    for column in columns:
        column_new = column + "_str"
        print(f"index2string {column} to {column_new}")

        converter = IndexToString(inputCol=column,
                                  outputCol=column_new,
                                  labels=param[column + "_labels"])
        decoded = converter.transform(df)
        # Copy the decoded values over the index column, then drop the
        # temporary helper column.
        df = decoded.withColumn(column, col(column_new)).drop(column_new)
    return df
def indexToString(infoData):
    """Invert a string-indexed column using a saved StringIndexerModel.

    ``infoData`` supplies the indexer path, the column to invert and the
    dataset under the ``mc`` keys. The returned dataset has the indexed
    column replaced by its original values under the same column name.
    """
    indexer_path = infoData.get(mc.INDEXERPATH)
    target_column = infoData.get(mc.COLMTOINVERT)
    dataset = infoData.get(mc.DATASET)

    saved_indexer = StringIndexerModel.load(indexer_path)
    inverter = IndexToString(inputCol=target_column,
                             outputCol=mc.DMXINVERTEDCOLM,
                             labels=saved_indexer.labels)

    # Drop the indexed column and give the inverted one its name.
    return (inverter.transform(dataset)
            .drop(target_column)
            .withColumnRenamed(mc.DMXINVERTEDCOLM, target_column))
def predict(rdd, bestModel):
    """Classify a batch of tweets and append the result to parquet.

    Args:
        rdd: Batch of records; each becomes one row with a ``descr`` column.
        bestModel: Fitted pipeline model whose first stage exposes the
            ``labels`` used to decode the numeric prediction.
    """
    if rdd.isEmpty():
        print("No data received")
        return

    df = sqlContext.createDataFrame(rdd).toDF("descr")
    predictions = bestModel.transform(df)

    converter = IndexToString(inputCol="prediction",
                              outputCol="label",
                              labels=bestModel.stages[0].labels)
    labelReverse = converter.transform(predictions)

    print("predictions for tweet:")
    # show() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    labelReverse.select("features", "probability", "prediction",
                        "label").show()
    labelReverse.write.mode('append').parquet(
        'hdfs:///predictions/tweets_predictions.parquet')
def main():
    """Train the best model, predict the test set and write the result.

    The predictions are decoded to their original category, attached to the
    raw test frame and written as a tab-separated file to
    ``../predictions``.
    """
    spark = SparkSession.builder.appName("BigDataProject").getOrCreate()

    # NOTE(review): parse_data is unpacked into two values for the test
    # file but not for the train file -- confirm this matches its contract.
    df_train = parse_data("../train.csv")
    df_test, df = parse_data("../test.csv")
    df_test.cache()

    best = find_best(df_train)
    predictions = best[-1].transform(df_test)

    converter = IndexToString(inputCol="prediction",
                              outputCol="originalCategory",
                              labels=best[1])
    converted = converter.transform(predictions)

    # NOTE(review): monotonically_increasing_id only yields matching ids on
    # two frames when both have identical partitioning and row order;
    # verify that holds for `df` and `converted`.
    df = df.withColumn('row_index', func.monotonically_increasing_id())
    converted = converted.withColumn('row_index',
                                     func.monotonically_increasing_id())
    df = df.join(converted["row_index", "originalCategory"],
                 on=["row_index"]).drop("row_index")

    df.show()
    df.repartition(1).write.csv('../predictions', sep="\t")
    # The original referenced `spark.stop` without calling it, so the
    # session was never stopped.
    spark.stop()
def top_movies(user_id, n):
    """
    Show the top 'n' movies that the user has not seen yet but might like.

    The table is printed with ``show``; the function returns whatever
    ``show`` returns.
    """
    # every known movie, aliased 'a'
    a = unique_movies.alias('a')

    # movies the active user has already watched, aliased 'b'
    already_seen = indexed.filter(indexed['userId'] == user_id)
    b = already_seen.select('title_new').alias('b')

    # left join, then keep the rows without a match on the watched side:
    # those are the movies the user has yet to rate or watch
    joined = a.join(b, a.title_new == b.title_new, how='left')
    unseen = joined.where(col("b.title_new").isNull())
    remaining_movies = unseen.select(a.title_new).distinct()

    # attach the active user's id so the ALS model can score each pair
    remaining_movies = remaining_movies.withColumn("userId",
                                                   lit(int(user_id)))

    # score with the ALS recommender and keep the 'n' best predictions
    scored = rec_model.transform(remaining_movies)
    recommendations = scored.orderBy('prediction', ascending=False).limit(n)

    # add the movie titles back to the recommendations
    movie_title = IndexToString(inputCol="title_new",
                                outputCol="title",
                                labels=model.labels)
    final_recommendations = movie_title.transform(recommendations)

    return final_recommendations.show(n, False)
# ## Build an evaluator

# In[ ]:

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")

# ## Do the prediction

# In[ ]:

predictions = dtModel.transform(test)
predictionsConverted = predConverter.transform(predictions)

# ## Evaluate / Test the Model

# In[ ]:

predictionsConverted.select("prediction", "label", "predictedLabel", "LEAVE",
                            "features").show()

# The evaluator is configured with metricName="areaUnderROC", so its result
# is an area under the ROC curve, not an accuracy; report it as such rather
# than as "1 - accuracy".
area_under_roc = evaluator.evaluate(predictions)
print("Test Area Under ROC = ", area_under_roc)

spark.stop()
# Fit the label indexer; it stores the label metadata on the new column.
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(df3)
df4 = labelIndexer.transform(df3)

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    impurity="gini",
    maxDepth=10,
    maxBins=14,
)
dt_model = dt.fit(df4)
df5 = dt_model.transform(df4)

# Map the numeric predictions back onto the original label strings.
from pyspark.ml.feature import IndexToString

labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)
df6 = labelConverter.transform(df5)
df6.crosstab("label", "predictedLabel").show()

# The same steps expressed as a single pipeline.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    categoryIndexer,
    encoder,
    assembler,
    labelIndexer,
    dt,
    labelConverter,
])
pipeline.getStages()
pipelineModel = pipeline.fit(ds)
pipelineModel.stages[-2].toDebugString
predicted = pipelineModel.transform(ds)
predicted.crosstab("label", "predictedLabel").show()

# Explode an array column into one row per element.
df = spark.createDataFrame(
    [(1, "A", [1, 2, 3]), (2, "B", [3, 5]), (8, "B", [3, 6])],
    ["col1", "col2", "col3"],
)
df.withColumn("col3", F.explode(df.col3)).show()

# explode array of struct
def Train(self):
    """Fit, evaluate and persist a logistic-regression style classifier.

    ``self._classifier`` selects plain logistic regression ("lr") or a
    one-vs-rest wrapper around it ("OneVsRest"). The feature pipeline and
    the fitted model are saved under the model path, the model is scored on
    a 20% validation split, and the summary is written to ``summary.json``.

    Raises:
        ValueError: if ``self._classifier`` is not supported, or if "lr" is
            requested for a target with fewer than two levels. (These cases
            previously failed later with an unbound-variable error.)
    """
    st = time.time()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    # The target must not be used as a feature.
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]

    model_path = self._dataframe_context.get_model_path()
    pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
    model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
    summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                  categorical_columns,
                                                  result_column)
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    trainingData, validationData = MLUtils.get_training_and_validation_data(
        indexed, result_column, 0.8)
    OriginalTargetconverter = IndexToString(
        inputCol="label", outputCol="originalTargetColumn")
    levels = trainingData.select("label").distinct().collect()

    if self._classifier == "lr":
        if len(levels) < 2:
            raise ValueError(
                "Target column needs at least 2 levels, found %d" %
                len(levels))
        if len(levels) == 2:
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.3,
                                    elasticNetParam=0.8)
        else:
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.3,
                                    elasticNetParam=0.8,
                                    family="multinomial")
        fit = lr.fit(trainingData)
    elif self._classifier == "OneVsRest":
        lr = LogisticRegression()
        ovr = OneVsRest(classifier=lr)
        fit = ovr.fit(trainingData)
    else:
        raise ValueError("Unsupported classifier: %r" % (self._classifier,))

    transformed = fit.transform(validationData)
    MLUtils.save_pipeline_or_model(fit, model_filepath)

    # Only the plain logistic-regression model exposes these attributes; a
    # OneVsRest model does not, so printing them unconditionally raised an
    # AttributeError on that path.
    if self._classifier == "lr":
        print(fit.coefficientMatrix)
        print(fit.interceptVector)

    label_classes = transformed.select("label").distinct().collect()
    results = transformed.select(["prediction", "label"])
    # The evaluators are called exactly once each: the extra bare
    # evaluate(results) calls discarded their result and only cost a job.
    if len(label_classes) > 2:
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction")
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results,
            {evaluator.metricName: "accuracy"})  # accuracy of the model
    else:
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="prediction")
        # NOTE(review): this stores areaUnderPR under "model_accuracy".
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "areaUnderPR"})
    self._model_summary["runtime_in_seconds"] = round((time.time() - st), 2)

    transformed = OriginalTargetconverter.transform(transformed)
    # Index -> label mapping read from the ML metadata of the label column.
    label_indexer_dict = [
        dict(enumerate(field.metadata["ml_attr"]["vals"]))
        for field in transformed.schema.fields if field.name == "label"
    ][0]
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(
        "predictedClass", prediction_to_levels(transformed.prediction))
    prediction_df = transformed.select(
        ["originalTargetColumn", "predictedClass"]).toPandas()
    objs = {
        "actual": prediction_df["originalTargetColumn"],
        "predicted": prediction_df["predictedClass"]
    }

    self._model_summary[
        "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
            objs["actual"], objs["predicted"])
    overall_precision_recall = MLUtils.calculate_overall_precision_recall(
        objs["actual"], objs["predicted"])
    self._model_summary[
        "precision_recall_stats"] = overall_precision_recall[
            "classwise_stats"]
    self._model_summary["model_precision"] = overall_precision_recall[
        "precision"]
    self._model_summary["model_recall"] = overall_precision_recall["recall"]
    self._model_summary["target_variable"] = result_column
    self._model_summary[
        "test_sample_prediction"] = overall_precision_recall[
            "prediction_split"]
    # NOTE(review): "Random Forest", total_trees and total_rules look like
    # leftovers from another trainer; kept unchanged because consumers of
    # summary.json are not visible from here.
    self._model_summary["algorithm_name"] = "Random Forest"
    self._model_summary["validation_method"] = "Train and Test"
    self._model_summary["independent_variables"] = len(
        categorical_columns) + len(numerical_columns)
    self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
        trainingData,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        dataType="spark")
    self._model_summary["total_trees"] = 100
    self._model_summary["total_rules"] = 300
    CommonUtils.write_to_file(
        summary_filepath, json.dumps({"modelSummary": self._model_summary}))
# Index the "lab" column into "labelInd".
lblIndxr = StringIndexer(inputCol="lab", outputCol="labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

# Index the "value1" column into "valueInd".
valIndexer = StringIndexer(inputCol="value1", outputCol="valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import IndexToString

# No labels are passed: they are read from the metadata of "labelInd".
labelReverse = IndexToString(inputCol="labelInd")
labelReverse.transform(idxRes).show()

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3),
]).toDF("features", "label")
indxr = VectorIndexer(inputCol="features",
                      outputCol="idxed",
                      maxCategories=2)
model_for_topic_classification = NaiveBayesModel.load(
    '/Users/saumya/Desktop/Big_data_project/NB_model_without_pipeline')
print(model_for_topic_classification)

## Predict topics for the unlabelled tweets
predictions = model_for_topic_classification.transform(dataset_for_topic)

## Decode the numeric prediction into its topic name
labeler = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=[
        'event',
        'sports',
        'politics',
        'news',
        'technology',
        'business',
        'entertainment',
        'health',
    ],
)
prediciton_with_label = labeler.transform(predictions)
prediciton_with_label.show(5)
print(prediciton_with_label.count())

ta = data_modified_tweet.alias('ta')
tb = prediciton_with_label.select(
    'trend', 'creation_time', 'twid', 'predictedLabel').alias('tb')

## Left join on (twid, creation_time, trend) to build the final table that
## carries the predicted topic label next to the tweet columns
final_df = ta.join(
    tb,
    (ta.twid == tb.twid)
    & (ta.creation_time == tb.creation_time)
    & (ta.trend == tb.trend),
    how="left",
).select(
    ta.trend,
    ta.creation_time,
    ta.twid,
    ta.body,
    ta.text_words,
    ta.location,
    ta.sentiment,
    ta.topic,
    ta.user,
    tb.predictedLabel,
)
final_df.show()
if __name__ == "__main__":
    spark = SparkSession.builder.appName(
        "IndexToStringExample").getOrCreate()

    # $example on$
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    df = spark.createDataFrame(rows, ["id", "category"])

    # Forward direction: string category -> numeric index.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    # Reverse direction: no labels are passed, so IndexToString takes them
    # from the metadata of the input column.
    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()
# Build (label, features) points from the test frame, using "Id" as label.
# `.rdd.map` is used instead of `DataFrame.map`: it is equivalent on
# Spark 1.x and keeps working on Spark 2+, where DataFrame.map was removed.
data_transformed = test_transformed.select(
    col("Id").alias("label"),
    col("features")).rdd.map(lambda row: LabeledPoint(row.label, row.features))

# Predict every point - output "ID", "prediction"
realTest_labelsAndPreds = data_transformed.map(
    lambda p: (p.label, float(nb_model.predict(p.features))))
output = sqlContext.createDataFrame(realTest_labelsAndPreds,
                                    ['id', 'Category_Index'])

# Convert the predicted index back to its category.
# IndexToString needs Spark 1.6 or later.
# in cmd prompt, type in:
# sudo yum install spark-core spark-master spark-worker spark-python
from pyspark.ml.feature import IndexToString

converter = IndexToString(inputCol="Category_Index",
                          outputCol="originalCategory",
                          labels=classifymodel.labels)
converted = converter.transform(output)


def toCSVLine(data):
    """Join the fields of one row into a comma-separated line."""
    return ','.join(str(d) for d in data)


lines = converted.rdd.map(toCSVLine)
lines.saveAsTextFile('submission1.csv')
from pyspark.ml.feature import IndexToString
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("stringindexer_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "red"),
    (1, "blue"),
    (2, "green"),
    (3, "yellow")]).toDF("id", "color")

# color -> colorIndex
strignIndexer = StringIndexer(inputCol="color", outputCol="colorIndex").fit(df1)
df2 = strignIndexer.transform(df1)
df2.show()

# colorIndex -> originalColor; no labels are passed, so they are taken from
# the metadata StringIndexer attached to "colorIndex".
indexToString = IndexToString(inputCol="colorIndex", outputCol="originalColor")
df3 = indexToString.transform(df2)
df3.show()

# The original referenced `spark.stop` without calling it, so the session
# was never stopped.
spark.stop()
s3 = boto3.client('s3')
modelPath = 'full_gbt_test_model/data/_SUCCESS'

# Probe for the success marker of the model saved below. The original
# tested `stdmodelPath`, a name this block never defines, while defining
# and reporting `modelPath`.
if not check(s3, 'gdc-emr0', modelPath):
    print("saving Random Forest model...")
    bestModel.save('s3://gdc-emr0/full_gbt_test_model')
else:
    print(modelPath + " already exists...")

# In[ ]:

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])
predictions = labelConverter.transform(predictions)

# In[ ]:

# cpv_feature_columns+mirna_feature_columns
feature_rd_df = pd.DataFrame(feature_importance,
                             columns=['feature_importance'])

from io import StringIO

# Serialize the importances to CSV in memory and upload them to S3.
csv_buffer = StringIO()
feature_rd_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object('gdc-emr0',
                   'gbt_feature_impt.csv').put(Body=csv_buffer.getvalue())

spark.stop()
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()

# IndexToString: num -> str
#
# The counterpart of StringIndexer: IndexToString maps a column of label
# indices back to the original string labels. It is normally paired with
# StringIndexer: labels are first turned into indices for model training,
# and the predicted indices are then turned back into the original string
# labels.
from pyspark.ml.feature import IndexToString

toString = IndexToString(inputCol="categoryIndex",
                         outputCol="originalCategory")
indexString = toString.transform(indexed)
indexString.select("id", "originalCategory").show()

# VectorIndexer:
#
# Handles categorical features inside a dataset of vectors. Given the
# maxCategories hyper-parameter it detects which features are categorical
# and converts their raw values into category indices. The decision is
# based on the number of distinct values: a feature with at most
# maxCategories distinct values is treated as categorical.
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vector, Vectors

df = spark.createDataFrame(
    [
        (Vectors.dense(-1.0, 1.0, 1.0), ),
        (Vectors.dense(-1.0, 3.0, 1.0), ),
        (Vectors.dense(0.0, 5.0, 1.0), ),
    ],
    ["features"],
)
# Tag every (path, content) pair with its category. Tuple parameters in a
# lambda (`lambda (k, v): ...`) are Python-2-only syntax, so the pair is
# indexed instead; this form runs on both Python 2 and 3.
sports_new_rdd = sports_new_files.map(lambda kv: ("sports", kv[1]))
travel_new_rdd = travel_new_files.map(lambda kv: ("travel", kv[1]))

# Merge all categories into a single RDD.
business_new_rdd = business_new_rdd.union(politics_new_rdd)
business_new_rdd = business_new_rdd.union(sports_new_rdd)
business_new_rdd = business_new_rdd.union(travel_new_rdd)

spark_new_df = spark.createDataFrame(business_new_rdd, Data_schema)
test_dataset = pipelineFit.transform(spark_new_df)

idx_to_string = IndexToString(
    inputCol="prediction",
    outputCol="category_output",
    labels=["business", "politics", "travel", "sports"])

# Logistic regression predictions, decoded to category names.
test_predictions = lrModel.transform(test_dataset)
prediction_with_labels = idx_to_string.transform(test_predictions)

# Evaluate model results
test_result = evaluator.evaluate(test_predictions)
test_predictions_and_labels = test_predictions.select(["prediction", "label"])
test_metrics = MulticlassMetrics(test_predictions_and_labels.rdd)
test_conf_mat1 = test_metrics.confusionMatrix()
test_precision1 = test_metrics.precision()
test_recall1 = test_metrics.recall()
test_f1Score1 = test_metrics.fMeasure()

# Naive Bayes predictions on the same data, decoded the same way.
test_nbPredictions = nbModel.transform(test_dataset)
nb_prediction_with_labels = idx_to_string.transform(test_nbPredictions)
# Keep just the two columns the model needs.
df = df.select(['features', 'label'])
print("Reading for machine learning")
df.show(10)

train, test = df.randomSplit([0.70, 0.30])
test.show()

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train)
predictions = model.transform(test)

# Decode the true label; no labels are passed, so they come from the
# metadata of the "label" column.
converter = IndexToString(inputCol="label", outputCol="originallabel")
converted = converter.transform(predictions)

# Decode the prediction with the explicit label list.
converter = IndexToString(inputCol="prediction",
                          outputCol="prediction_label",
                          labels=user_labels)
converted = converter.transform(converted)
converted.show(5)

# A single hand-made sample with the four iris measurements.
customSchema = StructType([
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True),
])
myrdd = spark.sparkContext.parallelize([[5.1, 3.5, 1.4, 0.2]])
df = sqlContext.createDataFrame(myrdd, customSchema)
# "lab" -> "labelInd"
lblIndxr = StringIndexer(inputCol="lab", outputCol="labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

# "value1" -> "valueInd"
valIndexer = StringIndexer(inputCol="value1", outputCol="valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show(5)

# COMMAND ----------

from pyspark.ml.feature import IndexToString

# Reverse the indexing; the labels are read from the column metadata
# because none are passed explicitly.
labelReverse = IndexToString(inputCol="labelInd")
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3),
]).toDF("features", "label")
indxr = VectorIndexer(inputCol="features",
                      outputCol="idxed",
                      maxCategories=2)
indxr.fit(idxIn).transform(idxIn).show()
#making recommendations using ALS recommender model and selecting only top 'n' movies recommendations = rec_model.transform(remaining_movies).orderBy( 'prediction', ascending=False) # COMMAND ---------- recommendations.show(5, False) # COMMAND ---------- #converting title_new values back to movie titles movie_title = IndexToString(inputCol="title_new", outputCol="title", labels=model.labels) final_recommendations = movie_title.transform(recommendations) # COMMAND ---------- final_recommendations.show(10, False) # COMMAND ---------- #create function to recommend top 'n' movies to any particular user def top_movies(user_id, n): """ This function returns the top 'n' movies that user has not seen yet but might like """ #assigning alias name 'a' to unique movies df
estimatorParamMaps=paramGrid,
evaluator=MulticlassClassificationEvaluator(),
trainRatio=0.8)

# Transform `va` with `li`, then split 70/30 into training and test data.
(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)

predictions = model.transform(testData)

# Accuracy on the held-out test data; reported below as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Count rows per (predictedLabel, maintenanceType) pair; the resulting
# pandas frame is not assigned.
i2s.transform(predictions).groupBy('predictedLabel', 'maintenanceType')\
    .count().toPandas()

# Feature importances of the best model, as a plain array.
fi = model.bestModel.featureImportances.toArray()

# Map each sensor name to its importance as a rounded percentage.
sensorImportances = {}

for sensorIndex in range(len(fi)):
    sensorImportances[sensorNames[sensorIndex]] = round(fi[sensorIndex]*100)

# Tabulate the importances, sorted ascending, for plotting.
sensorImportancesPD = pd.DataFrame.from_records(list(sensorImportances.items()), columns=['Sensor','Importance (%)'])\
    .sort_values('Importance (%)')

# Horizontal bar chart: one bar per sensor.
sb.set_color_codes("pastel")
sb.barplot(x="Importance (%)", y="Sensor", data=sensorImportancesPD, label="Total", color="b")
def numericToNominal(data_frame):
    """Decode the numeric ``LoAL_num`` column back to its string category.

    No labels are passed to IndexToString, so they are read from the
    metadata of the ``LoAL_num`` column.

    Args:
        data_frame: DataFrame containing an indexed ``LoAL_num`` column.

    Returns:
        The DataFrame with an added ``originalCategory`` column. (The
        original computed this frame but discarded it and returned None.)
    """
    converter = IndexToString(inputCol='LoAL_num',
                              outputCol='originalCategory')
    converted = converter.transform(data_frame)
    return converted
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"],
)

# Forward direction: category -> categoryIndex.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

# Reverse direction; the labels come from the input column's metadata.
converter = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")
converted = converter.transform(indexed)

print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------

### The one-hot encode estimator maps categorical features to a binary
### vector. It is common practice to run StringIndexer first, so that the
### raw features are converted into indexed features.
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame(
    [(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)],
    ["categoryIndex1", "categoryIndex2"],
)
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")

# Feature pipeline: tokens -> n-grams -> hashed TF -> IDF, plus label index.
lines = Pipeline(stages=[tokenizer, ngrams, hashtf, idf, labels])
linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

# Show which original value each numeric label stands for.
converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

# Confusion-matrix cells; label 0 is treated as the positive class.
truePositive = predictions.filter(
    (predictions.label == 0) & (predictions.prediction == 0)).count()
trueNegative = predictions.filter(
    (predictions.label == 1) & (predictions.prediction == 1)).count()
falsePositive = predictions.filter(
    (predictions.label == 1) & (predictions.prediction == 0)).count()
falseNegative = predictions.filter(
    (predictions.label == 0) & (predictions.prediction == 1)).count()

recall = float(truePositive) / (truePositive + falseNegative)
precision = float(truePositive) / (truePositive + falsePositive)

print("True Positive", truePositive)
print("True Negative", trueNegative)
def main(train_x, train_y, test_x, test_y=None, base='gs'):
    """Train a random forest on the joint feature set and report results.

    With test labels the accuracy on the test set is printed; without them
    the predicted classes are printed one per line, ordered by id.
    """
    # Load the joint feature set for both splits.
    load = elizabeth.preprocess.load
    train_features = load(train_x, train_y, base=base,
                          kind='joint').drop('url')
    test_features = load(test_x, test_y, base=base, kind='joint').drop('url')
    train_features.show()

    # Count tokens with a vocabulary fitted on the training set only.
    token_counter = CountVectorizer(inputCol='features',
                                    outputCol='tokenCounts',
                                    minDF=10).fit(train_features)
    train = token_counter.transform(train_features).drop('features')
    test = token_counter.transform(test_features).drop('features')

    # String labels -> numeric indices; handleInvalid="skip" lets the
    # indexer deal with labels that weren't seen during fitting.
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip").fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # create and train a Random Forest classifier
    forest = RandomForestClassifier(labelCol='indexedLabel',
                                    featuresCol='tokenCounts',
                                    numTrees=20,
                                    maxDepth=10,
                                    minInfoGain=0.0,
                                    seed=12345)
    model = forest.fit(train)
    # DF[id, url, ... prediction, predictedClass]
    prediction = index_labeller.transform(model.transform(test))

    if test_y:
        # Labels are available: print a score.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol='prediction',
            metricName='accuracy')
        accuracy = evaluator.evaluate(prediction)
        print("\n\tAccuracy on test set: %0.6f\n" % accuracy)
        return

    # No labels: print the predictions themselves.
    ordered = prediction.orderBy(prediction.id).select(
        prediction.predictedClass)
    classes = ordered.rdd.map(lambda row: int(row.predictedClass))
    print(*classes.toLocalIterator(), sep='\n')
.option("startingOffsets", "latest") \
    .load() \
    .selectExpr("CAST(value as string)")\
    .select(F.from_json("value", schema).alias("value"))\
    .select(F.col("value.*"))\
    .select("uid", F.col('visits').url.alias("urls"))\
    .withColumn('domains', foo_udf(F.col('urls')))

# Infer on test data
results = model.transform(st)

# Get string classes from encoded values, using the labels held by
# pipeline stage 1.
converter = IndexToString(inputCol="prediction", outputCol="gender_age", labels=model.stages[1].labels)
converted = converter.transform(results)

# Saving to another topic: each row is serialized as a JSON object with
# the fields uid and gender_age and appended to the Kafka topic `topic_out`.
query = converted\
    .select(F.to_json(F.struct("uid", "gender_age")).alias("value"))\
    .writeStream\
    .outputMode("append")\
    .format("kafka") \
    .option("checkpointLocation", "file:///tmp/checkpoint")\
    .option("kafka.bootstrap.servers", kafka_bootstrap ) \
    .option("topic", topic_out) \
    .start()

# Block until the streaming query terminates.
query.awaitTermination()