Example #1
def recommend_users(spark,
                    input_user,
                    input_video,
                    model,
                    user_indexer,
                    video_indexer,
                    user_language,
                    video_language,
                    num_recommend=20,
                    is_show=True):
    """Use als model to recommend for users
    Args:
        spark: Spark session
        input_user:
        input_video:
        model: ALS model
        user_indexer:
        video_indexer:
        num_recommend: The maximum number of recommendation videos
        is_show: If true, the data would be shown
        user_language: The language of user
        video_language: The language of video
    """
    # Recommend for all users
    userRecs = model.recommendForAllUsers(num_recommend)

    # Turn indices back into strings
    indexer_user = IndexToString(inputCol=input_user + "_index",
                                 outputCol=input_user,
                                 labels=user_indexer.labels)
    index_user = indexer_user.transform(userRecs)

    # Map the recommended video indices back to their string labels; because
    # IndexToString works on a flat column, an array literal of the indexer's
    # labels is used instead to translate indices inside the array of structs.
    video_labels = array(*[lit(x) for x in video_indexer.labels])
    recommendations = array(*[
        struct(
            video_labels[col("recommendations")[i][
                input_video + "_index"]].alias(input_video),
            # input_rating: name of the rating field in the recommendation
            # struct, assumed to be defined at module level
            col("recommendations")[i][input_rating])
        for i in range(num_recommend)
    ])

    recs = index_user.withColumn("recommendations", recommendations).select(
        input_user, "recommendations")
    explode_recs = recs.select(input_user, explode("recommendations").alias("recommendation")).\
                        select(input_user, "recommendation.*").\
                        select(input_user, input_video, col("col2").alias("score"))

    # Keep only user/video pairs that share the same language
    user_label = read_data_hive(spark, [input_user, user_language], is_show)
    video_label = read_data_hive(spark, [input_video, video_language], is_show)
    explode_recs_filter = explode_recs.join(user_label, input_user,
                                            "inner").join(
                                                video_label, input_video,
                                                "inner")
    explode_recs_filter = explode_recs_filter.filter(
        explode_recs_filter[user_language] ==
        explode_recs_filter[video_language])
    if is_show:
        explode_recs_filter.show(20)
    explode_recs_filter.registerTempTable("temp_table")
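The function ends by registering the filtered result as a temporary view rather than returning it, so callers presumably read it back with Spark SQL afterwards. A minimal sketch of such a follow-up, assuming recommend_users has already been called:

# Hypothetical follow-up outside the function: query the temporary view
# registered above ("score" is the alias created for the rating field).
top_recs = spark.sql("SELECT * FROM temp_table ORDER BY score DESC")
top_recs.show(20)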
Example #2
    def test_index_to_string(self):
        original_data = self.spark.createDataFrame([(0, "a"), (1, "b"),
                                                    (2, "c"), (3, "a"),
                                                    (4, "a"), (5, "c")],
                                                   ["id", "category"])
        string_indexer = StringIndexer(inputCol="category",
                                       outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory",
                              labels=['A', 'B', 'C'])
        # the input name should match IndexToString's inputCol
        model_onnx = convert_sparkml(
            model, 'Sparkml IndexToString',
            [('categoryIndex', Int64TensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.select("originalCategory").toPandas().values
        data_np = data.select('categoryIndex').toPandas().values.astype(
            numpy.int64)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlIndexToString")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['originalCategory'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #3
def my_transform(rdd):
    with open("./index2whiskey1.json", mode="r", encoding="utf-8") as f:
        whiskey_list = list(json.loads(f.read()).values())
    model = ALSModel.load("hdfs://master/ALSModel1/")
    spark = SparkSession.builder.appName('sql coming~').getOrCreate()
    whiskey = rdd.map(lambda x: Row(whiskeyId=int(x[1]), user_name=x[0]))
    whiskey_df = spark.createDataFrame(whiskey)
    predict = model.recommendForItemSubset(whiskey_df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )

    df_whiskey = model.recommendForUserSubset(df_user, 5)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(whiskey_df, on=['whiskeyId'], how='left')
    result_df = result_df.select("user_name",
                    result_df["recommendations"][0].whiskeyId.alias("whiskeyId1"), \
                    result_df["recommendations"][1].whiskeyId.alias("whiskeyId2"), \
                    result_df["recommendations"][2].whiskeyId.alias("whiskeyId3"), \
                    result_df["recommendations"][3].whiskeyId.alias("whiskeyId4"), \
                    result_df["recommendations"][4].whiskeyId.alias("whiskeyId5") \
                    )
    whiskeyId1converter = IndexToString(inputCol="whiskeyId1",
                                        outputCol="whiskey1",
                                        labels=whiskey_list)
    whiskeyId2converter = IndexToString(inputCol="whiskeyId2",
                                        outputCol="whiskey2",
                                        labels=whiskey_list)
    whiskeyId3converter = IndexToString(inputCol="whiskeyId3",
                                        outputCol="whiskey3",
                                        labels=whiskey_list)
    whiskeyId4converter = IndexToString(inputCol="whiskeyId4",
                                        outputCol="whiskey4",
                                        labels=whiskey_list)
    whiskeyId5converter = IndexToString(inputCol="whiskeyId5",
                                        outputCol="whiskey5",
                                        labels=whiskey_list)

    result_df = whiskeyId1converter.transform(result_df)
    result_df = whiskeyId2converter.transform(result_df)
    result_df = whiskeyId3converter.transform(result_df)
    result_df = whiskeyId4converter.transform(result_df)
    result_df = whiskeyId5converter.transform(result_df)

    return result_df.rdd
Example #4
def Customer_List(model, user):
    # Create a dataset with distinct Customers as one column and the asin as another column
    Customer = data_train.select("userid").distinct().withColumn("item", lit(user))

    # convert the index back to the original CustomerID
    userconverter = IndexToString(inputCol="userid", outputCol="List of Customers")
    userString = userconverter.transform(Customer)
    userString.drop("userid").drop("item").show()
Example #5
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf: prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
Example #6
 def __predict_sentiment(self, schema):
     """Gets predictions for a given tweet formatted RDD
     Returns: an RDD with format {"sentiment":"POSITIVE"}
     """
     test = self.pipelineFit.transform(schema)
     predict = self.model.transform(test)
     converter = IndexToString(inputCol="prediction",
                               outputCol="predicted_label",
                               labels=self.pipelineFit.stages[3].labels)
     converted = converter.transform(predict)
     return converted.select("predicted_label").collect()[0].asDict()
Example #7
def index2string(df, columns: list, param):
    for column in columns:
        column_new = column + "_str"
        print(f"index2string {column} to {column_new}")

        labels = param[column + "_labels"]
        model = IndexToString(inputCol=column, outputCol=column_new, labels=labels)

        df = model.transform(df)\
                  .withColumn(column, col(column_new))\
                  .drop(column_new)
    return df
Example #8
    def indexToString(infoData):
        stringIndexerPath = infoData.get(mc.INDEXERPATH)
        inverterColm = infoData.get(mc.COLMTOINVERT)
        dataset = infoData.get(mc.DATASET)
        stringIndexer = StringIndexerModel.load(stringIndexerPath)
        inverter = IndexToString(inputCol=inverterColm, outputCol=mc.DMXINVERTEDCOLM,
                                 labels=stringIndexer.labels)
        dataset = inverter.transform(dataset)

        # drop the indexed column and rename the new unindexed column back to the original name
        dataset = dataset.drop(inverterColm)
        dataset = dataset.withColumnRenamed(mc.DMXINVERTEDCOLM, inverterColm)
        return dataset
Example #9
def predict(rdd, bestModel):
    if (not rdd.isEmpty()):
        df = sqlContext.createDataFrame(rdd).toDF("descr")
        predictions = bestModel.transform(df)
        converter = IndexToString(inputCol="prediction",
                                  outputCol="label",
                                  labels=bestModel.stages[0].labels)
        labelReverse = converter.transform(predictions)
        print("predictions for tweet:")
        print(
            labelReverse.select("features", "probability", "prediction",
                                "label").show())
        labelReverse.write.mode('append').parquet(
            'hdfs:///predictions/tweets_predictions.parquet')
    else:
        print("No data received")
Example #10
def main():
    spark = SparkSession.builder.appName("BigDataProject").getOrCreate()
    df_train = parse_data("../train.csv")
    df_test, df = parse_data("../test.csv")
    #df_train.cache()
    df_test.cache()
    best = find_best(df_train)
    predictions = best[-1].transform(df_test)
    #predictions.show()
    converter = IndexToString(inputCol="prediction",
                              outputCol="originalCategory",
                              labels=best[1])
    converted = converter.transform(predictions)
    df = df.withColumn('row_index', func.monotonically_increasing_id())
    converted = converted.withColumn('row_index',
                                     func.monotonically_increasing_id())
    df = df.join(converted["row_index", "originalCategory"],
                 on=["row_index"]).drop("row_index")
    df.show()
    df.repartition(1).write.csv('../predictions', sep="\t")
    spark.stop()
Example #11
def top_movies(user_id, n):
    """
    This function returns the top 'n' movies that the user has not seen yet but might like.
    """
    #assigning alias name 'a' to unique movies df
    a = unique_movies.alias('a')

    #creating another dataframe which contains already watched movie by active user
    watched_movies = indexed.filter(
        indexed['userId'] == user_id).select('title_new')

    #assigning alias name 'b' to watched movies df
    b = watched_movies.alias('b')

    #joining both tables on left join
    total_movies = a.join(b, a.title_new == b.title_new, how='left')

    #selecting movies which active user is yet to rate or watch
    remaining_movies = total_movies.where(col("b.title_new").isNull()).select(
        a.title_new).distinct()

    #adding new column of user_Id of active user to remaining movies df
    remaining_movies = remaining_movies.withColumn("userId", lit(int(user_id)))

    #making recommendations using ALS recommender model and selecting only top 'n' movies
    recommendations = rec_model.transform(remaining_movies).orderBy(
        'prediction', ascending=False).limit(n)

    #adding columns of movie titles in recommendations
    movie_title = IndexToString(inputCol="title_new",
                                outputCol="title",
                                labels=model.labels)
    final_recommendations = movie_title.transform(recommendations)

    #return the recommendations to active user
    return final_recommendations.show(n, False)
Example #12
# ## Build an evaluator

# In[ ]:


evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")


# ## Do the prediction 

# In[ ]:


predictions = dtModel.transform(test)
predictionsConverted = predConverter.transform(predictions)

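predConverter is created in an earlier notebook cell that is not shown here. Judging from the predictedLabel column selected below, it is presumably an IndexToString built from the fitted label indexer. A minimal sketch under that assumption (labelIndexer is a hypothetical StringIndexerModel from the earlier cells):

from pyspark.ml.feature import IndexToString

# Hypothetical reconstruction of the missing cell: map the numeric prediction
# back to the label strings learned by the assumed StringIndexerModel.
predConverter = IndexToString(inputCol="prediction",
                              outputCol="predictedLabel",
                              labels=labelIndexer.labels)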

# ## Evaluate / Test the Model 

# In[ ]:


predictionsConverted.select("prediction", "label", "predictedLabel", "LEAVE", "features").show()
# Select (prediction, true label) and compute test error.
accuracy = evaluator.evaluate(predictions)
print("Test Error = ", (1.0 - accuracy))

spark.stop()
Example #13
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(df3)
df4 = labelIndexer.transform(df3)
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
dt_model = dt.fit(df4)
df5 = dt_model.transform(df4)
# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
df6 = labelConverter.transform(df5)
df6.crosstab("label", "predictedLabel").show()
# pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[categoryIndexer, encoder, assembler, labelIndexer, dt, labelConverter])
pipeline.getStages()
pipelineModel = pipeline.fit(ds)
pipelineModel.stages[-2].toDebugString
predicted = pipelineModel.transform(ds)
predicted.crosstab("label", "predictedLabel").show()

# explode array into row
df = spark.createDataFrame([(1, "A", [1, 2, 3]), (2, "B", [3, 5]), (8, "B", [3, 6])], ["col1", "col2", "col3"])
df.withColumn("col3", F.explode(df.col3)).show()
# explode array of struct
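The comment above has no accompanying code; a minimal sketch with hypothetical data of exploding an array of structs (each struct in the array becomes its own row, whose fields can then be expanded with "col.*"):

from pyspark.sql import functions as F
from pyspark.sql.types import (ArrayType, DoubleType, IntegerType, StringType,
                               StructField, StructType)

# Hypothetical schema: an id plus an array of (name, score) structs.
rec_schema = StructType([
    StructField("id", IntegerType()),
    StructField("recs", ArrayType(StructType([
        StructField("name", StringType()),
        StructField("score", DoubleType())])))])
df_structs = spark.createDataFrame([(1, [("a", 0.9), ("b", 0.1)])], rec_schema)
# explode() turns each array element into its own row; "rec.*" expands the struct fields
df_structs.withColumn("rec", F.explode("recs")).select("id", "rec.*").show()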
Example #14
    def Train(self):
        st = time.time()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
        model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
        summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        trainingData, validationData = MLUtils.get_training_and_validation_data(
            indexed, result_column, 0.8)
        OriginalTargetconverter = IndexToString(
            inputCol="label", outputCol="originalTargetColumn")
        levels = trainingData.select("label").distinct().collect()

        if self._classifier == "lr":
            if len(levels) == 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8)
            elif len(levels) > 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8,
                                        family="multinomial")
            fit = lr.fit(trainingData)
        elif self._classifier == "OneVsRest":
            lr = LogisticRegression()
            ovr = OneVsRest(classifier=lr)
            fit = ovr.fit(trainingData)
        transformed = fit.transform(validationData)
        MLUtils.save_pipeline_or_model(fit, model_filepath)

        print(fit.coefficientMatrix)
        print(fit.interceptVector)

        # feature_importance = MLUtils.calculate_sparkml_feature_importance(indexed,fit,categorical_columns,numerical_columns)
        label_classes = transformed.select("label").distinct().collect()
        results = transformed.select(["prediction", "label"])
        if len(label_classes) > 2:
            evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction")
            evaluator.evaluate(results)
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "accuracy"})  # accuracy of the model
        else:
            evaluator = BinaryClassificationEvaluator(
                rawPredictionCol="prediction")
            evaluator.evaluate(results)
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderROC"})
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderPR"})
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "areaUnderPR"})  # accuracy of the model

        # self._model_summary["feature_importance"] = MLUtils.transform_feature_importance(feature_importance)
        self._model_summary["runtime_in_seconds"] = round((time.time() - st),
                                                          2)

        transformed = OriginalTargetconverter.transform(transformed)
        label_indexer_dict = [
            dict(enumerate(field.metadata["ml_attr"]["vals"]))
            for field in transformed.schema.fields if field.name == "label"
        ][0]
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            "predictedClass", prediction_to_levels(transformed.prediction))
        prediction_df = transformed.select(
            ["originalTargetColumn", "predictedClass"]).toPandas()
        objs = {
            "actual": prediction_df["originalTargetColumn"],
            "predicted": prediction_df["predictedClass"]
        }

        self._model_summary[
            "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
                objs["actual"], objs["predicted"])
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"])
        self._model_summary[
            "precision_recall_stats"] = overall_precision_recall[
                "classwise_stats"]
        self._model_summary["model_precision"] = overall_precision_recall[
            "precision"]
        self._model_summary["model_recall"] = overall_precision_recall[
            "recall"]
        self._model_summary["target_variable"] = result_column
        self._model_summary[
            "test_sample_prediction"] = overall_precision_recall[
                "prediction_split"]
        self._model_summary["algorithm_name"] = "Random Forest"
        self._model_summary["validation_method"] = "Train and Test"
        self._model_summary["independent_variables"] = len(
            categorical_columns) + len(numerical_columns)
        self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
            trainingData,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            dataType="spark")
        # print json.dumps(self._model_summary,indent=2)
        self._model_summary["total_trees"] = 100
        self._model_summary["total_rules"] = 300
        CommonUtils.write_to_file(
            summary_filepath, json.dumps({"modelSummary":
                                          self._model_summary}))
Example #15
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()


# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import IndexToString
labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show()


# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors
idxIn = spark.createDataFrame([
  (Vectors.dense(1, 2, 3),1),
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
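The snippet cuts off before the indexer is actually applied; the fit/transform step that presumably follows (it appears verbatim in Example #24 below) would be:

indxr.fit(idxIn).transform(idxIn).show()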
Example #16
model_for_topic_classification = NaiveBayesModel.load(
    '/Users/saumya/Desktop/Big_data_project/NB_model_without_pipeline')
print(model_for_topic_classification)

## Predict topics for unlabelled tweets
predictions = model_for_topic_classification.transform(dataset_for_topic)

## convert the labels to text labels
labeler = IndexToString(inputCol="prediction",
                        outputCol="predictedLabel",
                        labels=[
                            'event', 'sports', 'politics', 'news',
                            'technology', 'business', 'entertainment', 'health'
                        ])
# print(predictions)
prediciton_with_label = labeler.transform(predictions)
prediciton_with_label.show(5)
print(prediciton_with_label.count())

ta = data_modified_tweet.alias('ta')
tb = prediciton_with_label.select('trend', 'creation_time', 'twid',
                                  'predictedLabel').alias('tb')

## Use join to create final table with predicted labels for topics
final_df = ta.join(
    tb, (ta.twid == tb.twid) & (ta.creation_time == tb.creation_time) &
    (ta.trend == tb.trend),
    how="left").select(ta.trend, ta.creation_time, ta.twid, ta.body,
                       ta.text_words, ta.location, ta.sentiment, ta.topic,
                       ta.user, tb.predictedLabel)
final_df.show()
Example #17
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()
Example #18
#test_rdd = test_transformed.map(lambda data: Vectors.dense([float(c) for c in data]))


data_transformed = test_transformed.select(col("Id").alias("label"), col("features")).map(lambda row: LabeledPoint(row.label, row.features))

#Evaluate the model on the training data - output "ID", "prediction"
realTest_labelsAndPreds = data_transformed.map(lambda p: (p.label, (float(nb_model.predict(p.features)))))

output = sqlContext.createDataFrame(realTest_labelsAndPreds,['id','Category_Index'])

# convert back to Categories
# you need Spark 1.6 for this
# in a cmd prompt, type: sudo yum install spark-core spark-master spark-worker spark-python
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="Category_Index", outputCol="originalCategory", labels=classifymodel.labels)
converted = converter.transform(output)

#converted.write.format('com.databricks.spark.csv').save('submission1.csv')

def toCSVLine(data):
  return ','.join(str(d) for d in data)

lines = converted.map(toCSVLine)
lines.saveAsTextFile('submission1.csv')


#view Error rates
#realTest_trainErr = realTest_labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_transformed.count())
#print("Training Error = " + str(realTest_trainErr))

#model.predict(test_rdd)
Example #19
from pyspark.ml.feature import IndexToString
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("stringindexer_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "red"),
    (1, "blue"),
    (2, "green"),
    (3, "yellow")]).toDF("id", "color")

stringIndexer = StringIndexer(inputCol="color", outputCol="colorIndex").fit(df1)

df2 = stringIndexer.transform(df1)

df2.show()

indexToString = IndexToString(inputCol="colorIndex", outputCol="originalColor")

df3 = indexToString.transform(df2)
df3.show()

spark.stop()
Example #20
s3 = boto3.client('s3')
modelPath = 'full_gbt_test_model/data/_SUCCESS'
if not check(s3, 'gdc-emr0', modelPath):
    print("saving Random Forest model...")
    bestModel.save('s3://gdc-emr0/full_gbt_test_model')
else:
    print(modelPath + " already exists...")

# In[ ]:

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])
predictions = labelConverter.transform(predictions)

# In[ ]:

feature_rd_df = pd.DataFrame(feature_importance,
                             columns=['feature_importance'])
# cpv_feature_columns+mirna_feature_columns
from io import StringIO

csv_buffer = StringIO()
feature_rd_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object('gdc-emr0',
                   'gbt_feature_impt.csv').put(Body=csv_buffer.getvalue())

spark.stop()
Example #21
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()

# IndexToString: num -> str
'''
As the counterpart of StringIndexer, IndexToString maps a column of label indices
back to the original string labels. It is typically used together with StringIndexer:
first convert the string labels to indices with StringIndexer, train the model on the
indices, and then map the predicted indices back to the original string labels.
'''
from pyspark.ml.feature import IndexToString

toString = IndexToString(inputCol="categoryIndex",
                         outputCol="originalCategory")
indexString = toString.transform(indexed)
indexString.select("id", "originalCategory").show()

# VectorIndexer:
'''
Handles categorical features inside vector-valued datasets.
Given the maxCategories hyperparameter, it automatically identifies which features are
categorical and converts their original values into category indices. The decision is
based on the number of distinct values: any feature with at most maxCategories distinct
values is treated as categorical.
'''
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vector, Vectors

df = spark.createDataFrame([(Vectors.dense(-1.0, 1.0, 1.0), ),
                            (Vectors.dense(-1.0, 3.0, 1.0), ),
                            (Vectors.dense(0.0, 5.0, 1.0), )], ["features"])
Example #22
	sports_new_rdd = sports_new_files.map(lambda kv: ("sports", kv[1]))
	travel_new_rdd = travel_new_files.map(lambda kv: ("travel", kv[1]))

	business_new_rdd = business_new_rdd.union(politics_new_rdd)
	business_new_rdd = business_new_rdd.union(sports_new_rdd)
	business_new_rdd = business_new_rdd.union(travel_new_rdd)

	spark_new_df = spark.createDataFrame(business_new_rdd, Data_schema)

	test_dataset = pipelineFit.transform(spark_new_df)

	idx_to_string = IndexToString(inputCol="prediction", outputCol="category_output",labels=["business","politics","travel","sports"])
	#new_dataset=idx_to_string.transform(test_dataset)
	
	test_predictions = lrModel.transform(test_dataset)
	prediction_with_labels=idx_to_string.transform(test_predictions)

	# Evaluate model results

	test_result = evaluator.evaluate(test_predictions)
	test_predictions_and_labels = test_predictions.select(["prediction","label"])

	test_metrics = MulticlassMetrics(test_predictions_and_labels.rdd)
	test_conf_mat1 = test_metrics.confusionMatrix()
	
	test_precision1 = test_metrics.precision()
	test_recall1 = test_metrics.recall()
	test_f1Score1 = test_metrics.fMeasure()

	test_nbPredictions=nbModel.transform(test_dataset)
	nb_prediction_with_labels=idx_to_string.transform(test_nbPredictions)
Example #23
# only select the features and label column
df = df.select(['features', 'label'])
print("Reading for machine learning")

df.show(10)

train, test = df.randomSplit([0.70, 0.30])
test.show()

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train)

predictions = model.transform(test)
converter = IndexToString(inputCol="label", outputCol="originallabel")
converted = converter.transform(predictions)

converter = IndexToString(inputCol="prediction",
                          outputCol="prediction_label",
                          labels=user_labels)
converted = converter.transform(converted)
converted.show(5)

customSchema = StructType([
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True)
])
myrdd = spark.sparkContext.parallelize([[5.1, 3.5, 1.4, 0.2]])
df = sqlContext.createDataFrame(myrdd, customSchema)
Example #24
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show(5)

# COMMAND ----------

from pyspark.ml.feature import IndexToString

labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([(Vectors.dense(1, 2, 3), 1),
                               (Vectors.dense(2, 5, 6), 2),
                               (Vectors.dense(1, 8, 9), 3)
                               ]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()
Example #25
#making recommendations using ALS recommender model and selecting only top 'n' movies
recommendations = rec_model.transform(remaining_movies).orderBy(
    'prediction', ascending=False)

# COMMAND ----------

recommendations.show(5, False)

# COMMAND ----------

#converting title_new values back to movie titles
movie_title = IndexToString(inputCol="title_new",
                            outputCol="title",
                            labels=model.labels)

final_recommendations = movie_title.transform(recommendations)

# COMMAND ----------

final_recommendations.show(10, False)

# COMMAND ----------


#create function to recommend top 'n' movies to any particular user
def top_movies(user_id, n):
    """
    This function returns the top 'n' movies that the user has not seen yet but might like.
    """
    #assigning alias name 'a' to unique movies df
Example #26
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)

predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

i2s.transform(predictions).groupBy('predictedLabel', 'maintenanceType')\
    .count().toPandas()
  
fi = model.bestModel.featureImportances.toArray()

sensorImportances = {}
for sensorIndex in range(len(fi)):
    sensorImportances[sensorNames[sensorIndex]] = round(fi[sensorIndex]*100)
    
sensorImportancesPD = pd.DataFrame.from_records(list(sensorImportances.items()), columns=['Sensor','Importance (%)'])\
  .sort_values('Importance (%)')
    
sb.set_color_codes("pastel")
sb.barplot(x="Importance (%)", y="Sensor", 
           data=sensorImportancesPD,
           label="Total", color="b")
Example #27
def numericToNominal(data_frame):
    converter = IndexToString(inputCol='LoAL_num',
                              outputCol='originalCategory')
    converted = converter.transform(data_frame)
    return converted
df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

print("Transformed string column '%s' to indexed column '%s'" %
      (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

converter = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")
converted = converter.transform(indexed)

print(
    "Transformed indexed column '%s' back to original string column '%s' using "
    "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------

### OneHotEncoderEstimator maps categorical features to a binary vector. It is common practice to run a StringIndexer first to convert the raw string features into indexed features.
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                            (0.0, 1.0), (2.0, 0.0)],
                           ["categoryIndex1", "categoryIndex2"])
Example #29
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")
lines = Pipeline(stages=[tokenizer, ngrams, hashtf, idf, labels])

linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

truePositive = predictions[(predictions.label == 0)
                           & (predictions.prediction == 0)].count()
trueNegative = predictions[(predictions.label == 1)
                           & (predictions.prediction == 1)].count()
falsePositive = predictions[(predictions.label == 1)
                            & (predictions.prediction == 0)].count()
falseNegative = predictions[(predictions.label == 0)
                            & (predictions.prediction == 1)].count()
recall = float(truePositive) / (truePositive + falseNegative)
precision = float(truePositive) / (truePositive + falsePositive)

print("True Positive", truePositive)
print("True Negative", trueNegative)
Example #30
def main(train_x, train_y, test_x, test_y=None, base='gs'):
    # generate joint feature set
    train_features = elizabeth.preprocess.load(train_x,
                                               train_y,
                                               base=base,
                                               kind='joint').drop('url')
    test_features = elizabeth.preprocess.load(test_x,
                                              test_y,
                                              base=base,
                                              kind='joint').drop('url')

    train_features.show()

    token_counter = CountVectorizer(inputCol='features',
                                    outputCol='tokenCounts',
                                    minDF=10).fit(train_features)
    train = token_counter.transform(train_features).drop('features')
    test = token_counter.transform(test_features).drop('features')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # create and train a Random Forest classifier
    rf = RandomForestClassifier(labelCol='indexedLabel',
                                featuresCol='tokenCounts',
                                numTrees=20,
                                maxDepth=10,
                                minInfoGain=0.0,
                                seed=12345)
    model = rf.fit(train)
    prediction = model.transform(test)
    prediction = index_labeller.transform(
        prediction)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol='prediction',
            metricName='accuracy')
        accuracy = evaluator.evaluate(prediction)
        print("\n\tAccuracy on test set: %0.6f\n" % accuracy)

    # If no labels are given for the test set, print predictions.
    else:
        prediction = prediction.orderBy(prediction.id).select(
            prediction.predictedClass)
        prediction = prediction.rdd.map(
            lambda prediction: int(prediction.predictedClass))
        prediction = prediction.toLocalIterator()
        print(*prediction, sep='\n')
  .option("startingOffsets", "latest") \
  .load() \
  .selectExpr("CAST(value as string)")\
  .select(F.from_json("value", schema).alias("value"))\
  .select(F.col("value.*"))\
  .select("uid", F.col('visits').url.alias("urls"))\
  .withColumn('domains', foo_udf(F.col('urls')))

# Infer on test data

results = model.transform(st)

# get string classes from encoded values
converter = IndexToString(inputCol="prediction",
                          outputCol="gender_age",
                          labels=model.stages[1].labels)
converted = converter.transform(results)

#Saving to another topic
query = converted\
 .select(F.to_json(F.struct("uid", "gender_age")).alias("value"))\
 .writeStream\
 .outputMode("append")\
 .format("kafka") \
 .option("checkpointLocation", "file:///tmp/checkpoint")\
 .option("kafka.bootstrap.servers", kafka_bootstrap ) \
 .option("topic", topic_out) \
 .start()

query.awaitTermination()