import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def cross_validation_task_C(X, estimator, sqlContext, class_type,
                            features_col, sc, k_folds=10):
    kf = KFold(n_splits=k_folds)
    maem = []
    maeni = []
    for train_index, test_index in kf.split(X):
        sparse_data = []
        test_data = []
        cl_cl = []
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        train_topic = sqlContext.createDataFrame(X_train)
        test_topic = sqlContext.createDataFrame(X_test)
        # True: DecisionTree, False: the supplied pyspark.ml estimator (e.g. NaiveBayes)
        if class_type:
            pred = pd.DataFrame(columns=['class', 'prediction'])
            # The RDD-based mllib API needs mllib vectors, so convert from ml vectors first.
            train_topic = MLUtils.convertVectorColumnsFromML(train_topic, features_col)
            test_topic = MLUtils.convertVectorColumnsFromML(test_topic, features_col)
            for index, row in train_topic.toPandas().iterrows():
                sparse_data.append(LabeledPoint(float(row['class']), row[features_col]))
            for index, row in test_topic.toPandas().iterrows():
                cl_cl.append(row['class'])
                test_data.append(row[features_col])
            model = DecisionTree.trainClassifier(sc.parallelize(sparse_data), 5, {})
            pred['class'] = cl_cl
            pred['prediction'] = model.predict(sc.parallelize(test_data)).collect()
            maem_aux, maeni_aux = mae_ms(pred)
        else:
            pred = estimator.fit(train_topic).transform(test_topic).select(
                'class', 'prediction').toPandas()
            maem_aux, maeni_aux = mae_ms(pred)
        maem.append(maem_aux)
        maeni.append(maeni_aux)
    return (np.mean(maem), np.mean(maeni))
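# Hedged usage sketch (not part of the original code): assumes a SparkContext
# `sc`, an SQLContext `sqlContext`, a pandas DataFrame `X` with a numeric
# 'class' column and a vector-valued 'features' column, and that the helper
# `mae_ms` referenced above is defined elsewhere in the project.
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(labelCol='class', featuresCol='features')

# DecisionTree branch (class_type=True); the estimator argument is ignored here.
maem_dt, maeni_dt = cross_validation_task_C(
    X, estimator=None, sqlContext=sqlContext, class_type=True,
    features_col='features', sc=sc, k_folds=10)

# Estimator branch (class_type=False) with a pyspark.ml NaiveBayes instance.
maem_nb, maeni_nb = cross_validation_task_C(
    X, estimator=nb, sqlContext=sqlContext, class_type=False,
    features_col='features', sc=sc, k_folds=10)

print("DecisionTree:", maem_dt, maeni_dt)
print("NaiveBayes:  ", maem_nb, maeni_nb)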
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark.sql.functions import col


def eval_logreg(new_df, filename):
    (train, test) = new_df.randomSplit([0.8, 0.2], 24)
    train = train.withColumnRenamed('prediction', 'label')
    test = test.withColumnRenamed('prediction', 'label')

    # Train on the 80% split using the RDD-based LBFGS logistic regression.
    df = MLUtils.convertVectorColumnsFromML(train, "features")
    parsedData = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    model = LogisticRegressionWithLBFGS.train(parsedData, numClasses=50)
    model.save(spark.sparkContext, filename)
    # sameModel = LogisticRegressionModel.load(spark.sparkContext, "LogRegLBFGSModel")

    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("LogReg Small Training Error = " + str(trainErr))

    # Evaluate on the 20% split; predictions must come from the test records,
    # not from the training labelsAndPreds RDD.
    df = MLUtils.convertVectorColumnsFromML(test, "features")
    parsed_test = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    testLabelsAndPreds = parsed_test.map(lambda p: (p.label, model.predict(p.features)))
    testErr = testLabelsAndPreds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsed_test.count())
    print("LogReg Small Test Error = " + str(testErr))
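# Hedged follow-up sketch (not part of the original code), mirroring the
# commented-out load call above: reload the model saved by eval_logreg() and
# score the same DataFrame again. `new_df` and `filename` are the arguments
# that were passed to eval_logreg().
from pyspark.mllib.classification import LogisticRegressionModel

sameModel = LogisticRegressionModel.load(spark.sparkContext, filename)
holdout = MLUtils.convertVectorColumnsFromML(
    new_df.withColumnRenamed('prediction', 'label'), "features")
pairs = holdout.select(col("label"), col("features")).rdd.map(
    lambda row: (row.label, sameModel.predict(row.features)))
err = pairs.filter(lambda lp: lp[0] != lp[1]).count() / float(pairs.count())
print("Reloaded LogReg Error = " + str(err))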
def analysis(df):
    """ML in Spark"""
    htf = MLHashingTF(inputCol="message", outputCol="tf")
    tf = htf.transform(df)
    idf = MLIDF(inputCol="tf", outputCol="idf")
    tfidf = idf.fit(tf).transform(tf)
    # tfidf.show(truncate=True)
    # sum_ = udf(lambda v: float(v.values.sum()), DoubleType())
    # res_df = tfidf.withColumn("idf_sum", sum_("idf"))
    res_df = MLUtils.convertVectorColumnsFromML(tfidf, 'idf')
    ml_dataset = res_df.rdd.map(lambda x: x.idf).collect()
    model = KMeans.train(sc.parallelize(ml_dataset), k=5, maxIterations=50)
    return res_df, model
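# Hedged usage sketch (not part of the original code): assumes a DataFrame `df`
# with a tokenized "message" column and the existing `sc` handle. It assigns
# each document to one of the 5 clusters produced by analysis().
res_df, model = analysis(df)

clustered = res_df.rdd.map(lambda row: (model.predict(row.idf), row.message))
for cluster_id, message in clustered.take(5):
    print(cluster_id, message)

# Within-set sum of squared errors, a rough measure of cluster tightness.
print("WSSSE:", model.computeCost(res_df.rdd.map(lambda row: row.idf)))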
def __index_row_matrix_rdd(self, scale_df):
    """
    Build an IndexedRowMatrix from a DataFrame of scaled feature vectors.

    :param scale_df: DataFrame with an 'id' column and a 'scaled_features'
                     vector column (pyspark.ml vectors)
    :return: IndexedRowMatrix backed by an RDD of IndexedRow(id, scaled_features)
    """
    try:
        vector_mllib = MLUtils.convertVectorColumnsFromML(
            scale_df, 'scaled_features').drop('features')
        vector_rdd = vector_mllib.select(
            'scaled_features', 'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
        self.__logger.info("Build Index Row Matrix RDD")
        return IndexedRowMatrix(vector_rdd)
    except TypeError as te:
        raise OpheliaMLException(
            f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
        )
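# Hedged companion sketch (not part of the original class): a method of this
# kind could feed the IndexedRowMatrix into a truncated SVD. The method name
# and the component count `k` are illustrative; it assumes it lives in the
# same class so the name-mangled __index_row_matrix_rdd call resolves.
def __svd_components(self, scale_df, k=3):
    indexed_matrix = self.__index_row_matrix_rdd(scale_df)
    # computeU=True keeps the left singular vectors as an IndexedRowMatrix,
    # so the original row ids survive the decomposition.
    svd = indexed_matrix.computeSVD(k, computeU=True)
    return svd.U, svd.s, svd.V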
input = "data/mllib/sample_libsvm_data.txt" # Load input data print("Loading LIBSVM file with UDT from " + input + ".") df = spark.read.format("libsvm").load(input).cache() print("Schema from LIBSVM:") df.printSchema() print("Loaded training data as a DataFrame with " + str(df.count()) + " records.") # Show statistical summary of labels. labelSummary = df.describe("label") labelSummary.show() # Convert features column to an RDD of vectors. features = MLUtils.convertVectorColumnsFromML(df, "features") \ .select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) # Save the records in a parquet file. tempdir = tempfile.NamedTemporaryFile(delete=False).name os.unlink(tempdir) print("Saving to " + tempdir + " as Parquet file.") df.write.parquet(tempdir) # Load the records back. print("Loading Parquet file with UDT from " + tempdir) newDF = spark.read.parquet(tempdir) print("Schema from Parquet:") newDF.printSchema()
input = "sample_libsvm_data.txt" # Load input data print("Loading LIBSVM file with UDT from " + input + ".") df = spark.read.format("libsvm").load(input).cache() print("Schema from LIBSVM:") df.printSchema() print("Loaded training data as a DataFrame with " + str(df.count()) + " records.") # Show statistical summary of labels. labelSummary = df.describe("label") labelSummary.show() # Convert features column to an RDD of vectors. features = MLUtils.convertVectorColumnsFromML(df, "features") \ .select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) # Save the records in a parquet file. tempdir = tempfile.NamedTemporaryFile(delete=False).name os.unlink(tempdir) print("Saving to " + tempdir + " as Parquet file.") df.write.parquet(tempdir) # Load the records back. print("Loading Parquet file with UDT from " + tempdir) newDF = spark.read.parquet(tempdir) print("Schema from Parquet:") newDF.printSchema()
def transform(self, candidate_set):
    """
    Add a prediction to each paper in the candidate set, based on the trained
    model(s) and the paper's feature vector.

    :param candidate_set: DataFrame with a user id column and a "candidate_set"
                          array of paper ids
    :return: DataFrame with columns user_id | paper_id | ranking_score
    """
    Logger.log("LTR: transform.")
    predictions = None
    # format user_id, paper_id
    candidate_set = candidate_set.select(
        self.userId_col, F.explode("candidate_set").alias(self.paperId_col))
    # schema for the final prediction result data frame
    predictions_scheme = StructType([
        # name, dataType, nullable
        StructField("user_id", StringType(), False),
        StructField("paper_id", IntegerType(), False),
        StructField("ranking_score", FloatType(), True)
    ])
    self.paper_profiles_model.setPaperIdCol(self.paperId_col)
    self.paper_profiles_model.setOutputCol(self.features_col)
    # add a paper representation to each paper in the candidate set
    # candidate set format - user_id, paper_id
    predictions = self.paper_profiles_model.transform(candidate_set)
    # the RDD-based models below expect mllib vectors
    predictions = MLUtils.convertVectorColumnsFromML(
        predictions, self.features_col)

    if self.model_training == "gm":  # self.Model_Training.SINGLE_MODEL_ALL_USERS
        Logger.log("Prediction gm ...")
        model = self.models[0]
        # set threshold to None to receive raw predictions from the model
        model._threshold = None
        predictions = predictions.rdd.map(lambda p: (
            p.user_id, p.paper_id, float(model.predict(p.features))))
        predictions = predictions.toDF(predictions_scheme)
    elif self.model_training == "imp":
        Logger.log("Predicting imp...")
        model = self.models[0]
        # set threshold to None to receive raw predictions from the model
        model.threshold = None
        # broadcast the model to the workers
        model_br = self.spark.sparkContext.broadcast(model)
        predictions_rdd = predictions.rdd.map(lambda p: (
            p.user_id, p.paper_id,
            float(model_br.value.predict(p.user_id, p.features))))
        predictions = predictions_rdd.toDF(predictions_scheme)
    elif self.model_training == "ims":
        Logger.log("Predicting ims ...")
        # broadcast weight vectors for all models
        weights_br = self.spark.sparkContext.broadcast(self.models)

        def predict(id, features):
            weights = weights_br.value
            weight = weights[int(id)]
            prediction = weight.dot(features)
            return float(prediction)

        predict_udf = F.udf(predict, FloatType())
        predictions = predictions.withColumn(
            "ranking_score", predict_udf("user_id", "features")) \
            .select(self.userId_col, self.paperId_col, "ranking_score")
    elif self.model_training == "cms":
        Logger.log("Predicting cms ...")
        # add a cluster id to each user; predictions are made per cluster
        users_in_cluster = self.user_clusters.withColumn(
            self.userId_col, F.explode("user_ids")).drop("user_ids")
        predictions = predictions.join(users_in_cluster, self.userId_col)
        for clusterId, model in self.models.items():
            # set threshold to None to receive raw predictions from the model
            model._threshold = None
        # broadcast all per-cluster models
        models_br = self.spark.sparkContext.broadcast(self.models)

        def predict(id, features):
            models = models_br.value
            model = models[id]
            prediction = model.predict(features)
            return float(prediction)

        predict_udf = F.udf(predict, FloatType())
        predictions = predictions.withColumn(
            "ranking_score", predict_udf("cluster_id", "features")) \
            .select(self.userId_col, self.paperId_col, "ranking_score")
    elif self.model_training == "cmp":
        # format user_id, paper_id, features
        # add a cluster id to each user; predictions are made per cluster
        predictions = predictions.join(self.user_clusters, self.userId_col)
        model = self.models[0]
        # set threshold to None to receive raw predictions from the model
        model.threshold = None
        # broadcast the model to the workers
        model_br = self.spark.sparkContext.broadcast(model)
        predictions_rdd = predictions.rdd.map(lambda p: (
            p.user_id, p.paper_id,
            float(model_br.value.predict(p.cluster_id, p.features))))
        predictions = predictions_rdd.toDF(predictions_scheme)
    else:
        # unsupported option
        raise ValueError('The option ' + self.model_training + ' is not supported.')

    # user_id | paper_id | ranking_score
    return predictions
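# Hedged usage sketch (not part of the original class): `ltr` stands for a
# fitted instance of the class this transform() belongs to, and `candidates_df`
# for a DataFrame with a user id column plus a "candidate_set" array of paper
# ids. The snippet keeps the 10 highest-scored papers per user.
from pyspark.sql import Window
import pyspark.sql.functions as F

ranked = ltr.transform(candidates_df)
w = Window.partitionBy("user_id").orderBy(F.col("ranking_score").desc())
top10 = (ranked.withColumn("rank", F.row_number().over(w))
               .filter(F.col("rank") <= 10))
top10.show()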
# COMMAND ----------

vectorized = (spark.read.format("delta")
              .load(delta_gold_path)
              .select(glow.array_to_sparse_vector(
                  glow.genotype_states(fx.col("genotypes"))).alias("features"))
              .cache())

# COMMAND ----------

# MAGIC %md
# MAGIC #### Convert the `pyspark.ml` sparse vectors and calculate principal components with `pyspark.mllib`'s `RowMatrix` SVD

# COMMAND ----------

matrix = RowMatrix(MLUtils.convertVectorColumnsFromML(vectorized, "features")
                   .rdd.map(lambda x: x.features))
pcs = matrix.computeSVD(num_pcs)

# COMMAND ----------

pd.DataFrame(pcs.V.toArray()).to_csv(principal_components_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Read sample information in and plot out principal components
# MAGIC
# MAGIC Here we annotate sample info with principal components by joining both DataFrames on index
# MAGIC
# MAGIC Note: indexing is performed using the Spark SQL function `monotonically_increasing_id()`
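# COMMAND ----------

# Hedged sketch of the join described above (not part of the original notebook):
# `sample_info_path` and the CSV layouts are illustrative assumptions; only the
# principal-components CSV written above and monotonically_increasing_id() come
# from this notebook.
import pyspark.sql.functions as fx

pcs_df = (spark.read.option("header", True).csv(principal_components_path)
          .withColumn("idx", fx.monotonically_increasing_id()))
sample_info = (spark.read.option("header", True).csv(sample_info_path)
               .withColumn("idx", fx.monotonically_increasing_id()))
sample_pcs = sample_info.join(pcs_df, on="idx").drop("idx")
display(sample_pcs)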