def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    step1 = product_processed_data[[category_column]]

    # Index the string category, one-hot encode it, then reduce to 15 dims with PCA
    stringIndexer = M.feature.StringIndexer(inputCol=category_column,
                                            outputCol=categoryIndex_column,
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    OHencoder = M.feature.OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoryOneHot_column],
        dropLast=False)
    pca_ = M.feature.PCA(inputCol=categoryOneHot_column,
                         outputCol=categoryPCA_column,
                         k=15)

    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)

    # Column-wise means of the vector columns
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output.categoryOneHot)).head()[0]
    meanVector_categoryPCA = output.select(
        Summarizer.mean(output.categoryPCA)).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [None, ],
        'meanVector_categoryPCA': [None, ]
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)

    # Pick a parameter grid appropriate for the given classifier
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.layers, [[122, 50, 2]]) \
            .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()

    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")

    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    # Cross-validate on the training split, then score the held-out split
    fit_model = crossval.fit(w2v_train_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()

    # Mean and count of the 'pos' vector column for the positive class
    summarizer = Summarizer.metrics("mean", "count")
    predictions.filter(predictions.label == 1).select(
        summarizer.summary(predictions.pos)).show(truncate=False)

    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    accuracy = model_evaluator.evaluate(predictions)
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)
    return accuracy
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    # Index the string category column
    indexer = M.feature.StringIndexer(inputCol=category_column,
                                      outputCol=categoryIndex_column,
                                      handleInvalid="error")
    indexed_model = indexer.fit(product_processed_data).transform(
        product_processed_data)

    # One-hot encode the index, keeping the last category
    encoder = M.feature.OneHotEncoderEstimator(
        dropLast=False,
        inputCols=[categoryIndex_column],
        outputCols=[categoryOneHot_column])
    encoded_model = encoder.fit(indexed_model).transform(indexed_model)

    # Reduce the one-hot vectors to 15 dimensions with PCA
    pca = M.feature.PCA(k=15,
                        inputCol=categoryOneHot_column,
                        outputCol=categoryPCA_column)
    pca_model = pca.fit(encoded_model).transform(encoded_model)

    # Column-wise means of the vector columns
    summarizer = Summarizer.metrics("mean")
    count_total = pca_model.count()
    meanVector_categoryPCA = pca_model.select(
        summarizer.summary(pca_model.categoryPCA)).head()[0][0]
    meanVector_categoryOneHot = pca_model.select(
        summarizer.summary(pca_model.categoryOneHot)).head()[0][0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [None, ],
        'meanVector_categoryPCA': [None, ]
    }
    # Modify res:
    res['count_total'] = count_total
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
def basic_statistics():
    """Basic statistics."""
    df = sql.read.parquet(str(DATA_PARQUET))
    numeric = ['cost', 'call_duration_minutes', 'data_volume_mb']

    # Assemble the numeric columns into a single vector column
    assemble = VectorAssembler(inputCols=numeric, outputCol='features')
    features = assemble.transform(df.dropna(subset=numeric + ['target']))

    # summarize
    summarize = Summarizer.metrics('mean', 'variance', 'count', 'numNonZeros',
                                   'max', 'min', 'normL2', 'normL1')
    features.select(summarize.summary(
        features['features'])).show(truncate=False)

    # correlations
    r1 = Correlation.corr(features, 'features', 'pearson').head()[0]
    small = features.sample(fraction=0.1, seed=100500)
    r2 = Correlation.corr(small, 'features', 'spearman').head()[0]
    print('Pearson correlation matrix:\n' + str(r1))
    print('Spearman correlation matrix (10% sample):\n' + str(r2))
def match_word_with_word_vector(self, clean_word_no_dup_df, word_vector_df):
    words_with_vector_df = self.assign_vector_to_words(
        clean_word_no_dup_df, word_vector_df)
    words_with_vector_df.persist()
    mismatched_words_matched_df = self.embed_vector_to_not_matched_words(
        words_with_vector_df, word_vector_df)
    complete_match_df = words_with_vector_df.where(
        col('word_vector').isNotNull()).union(mismatched_words_matched_df)

    # Sum the word vectors per sentence to build a sentence vector
    return complete_match_df.groupBy('sentence_id').agg(
        Summarizer.sum(col('word_vector')).alias('sentence_vector')).select(
            'sentence_id', 'sentence_vector')
def summarize_artist_styles(self):
    # We need to use a `Summarizer` to be able to take
    # the average of a Vector-type column
    songs = self._generate_dataset() \
        .withColumn("artist", F.explode("artists.name")) \
        .groupBy("artist") \
        .agg(Summarizer.mean(F.col("features")).alias("average_song")) \
        .select("artist", "average_song")

    # Only keep track of some of the most popular artists,
    # there's way too many to realistically compare all of them
    dataset = self.spark \
        .read.json(KPOP_ARTISTS, multiLine=True) \
        .withColumnRenamed("name", "artist") \
        .select("artist", "popularity") \
        .join(songs, "artist") \
        .collect()

    for row in dataset:
        self._save_radar_plot(
            row["artist"],
            # DenseVector -> numpy.ndarray -> List[float]
            row["average_song"].toArray().tolist(),
            row["popularity"])
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

summarizer = Summarizer.metrics("mean", "count")

df.show()
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)

spark.stop()
    (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])

rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))
summary = Statistics.colStats(rdd_data)
print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
print("Variance:" + str(summary.variance()))  # column-wise variance
print("Non zeros:" + str(summary.numNonzeros()))
print("Count:" + str(summary.count()))
print("Min:" + str(summary.min()))
print("Max:" + str(summary.max()))

# Examples with Summarizer
summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")

# compute statistics for multiple metrics
data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

# compute statistics for single metric "mean"
data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

spark_session.stop()
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()
####### Using correlation and Summarizer

# Select features
features = ["age", "charges", "customer_contacts", "attrition"]
va = VectorAssembler(inputCols=features, outputCol="features")  # Create Vector Assembler
featuresData = va.transform(rawData)  # transform original dataset to include new col of vectors
featuresData.show(n=2)

# Calculate correlation and display
r1 = Correlation.corr(featuresData, "features", method='pearson').head()
print("Pearson correlation matrix:\n" + str(r1[0]))

# Calculate mean statistic for the list of features in order
summarizer = Summarizer.metrics("mean")
featuresData.select(summarizer.summary(featuresData.features)).show(truncate=False)

"""**Split the Spark Dataframe into Train and Test**"""

# Splitting dataframe with randomSplit
splits = rawData.randomSplit(weights=[.7, .3], seed=12345)
print("training obs count: ", splits[0].count())
print("test obs count: ", splits[1].count())
train = splits[0]
test = splits[1]

"""**Feature Engineering & Define Model**"""
output.select("features").show(10, truncate=False) from pyspark.ml.feature import VectorAssembler from pyspark.ml.stat import Summarizer assembler = VectorAssembler(inputCols=[ 'year', 'population', 'labor_force', 'population_percent', 'employed_total', 'employed_percent', 'agrictulture_ratio', 'nonagriculture_ratio', 'unemployed', 'unemployed_percent', 'not_in_labor' ], outputCol="features") assembled = assembler.transform(employment_df) summarizer = Summarizer.metrics("max", "mean").summary(assembled["features"]) assembled.select(summarizer).show(truncate=False) assembled.select(Summarizer.variance(assembled.features)).show(truncate=False) from pyspark.ml.feature import VectorAssembler from pyspark.ml.stat import Correlation assembler = VectorAssembler(inputCols=[ "date", "day", "period", "nswprice", "nswdemand", "vicprice", "vicdemand", "transfer" ], outputCol="features") assembled = assembler.transform(electricity_df)