def combine(pair):
    # list of np array
    if isinstance(pair[1], list):
        row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                    [[Vectors.dense(elem) for elem in pair[1]]]))
        return row, ArrayType(VectorUDT())
    # scalar
    elif len(pair[1].shape) == 0:
        row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                    [float(pair[1].item(0))]))
        return row, FloatType()
    # np ndarray
    else:
        dim = len(pair[1].shape)
        if dim == 1:
            # np 1-D array
            row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                        [Vectors.dense(pair[1])]))
            return row, VectorUDT()
        else:
            # multi-dimensional array
            structType = FloatType()
            for _ in range(dim):
                structType = ArrayType(structType)
            row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                        [pair[1].tolist()]))
            return row, structType

def transform(self, X_rdd, y_rdd=None):
    '''
    given X RDD (and optionally y RDD), output dataframe with
    term frequency feature vector and labels
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # compute term-frequency feature vectors
    X = X_rdd.map(self._term_frequency).cache()

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        data = X.join(y).map(lambda r: r[1])
        schema = StructType([
            StructField('features', VectorUDT(), True),
            StructField('label', StringType(), True)
        ])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        X = X.map(lambda row: [row])
        schema = StructType([StructField("features", VectorUDT(), True)])
        data = X.toDF(schema)
    return data

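# --- Illustrative sketch (not part of the snippet above) ---
# A minimal, self-contained example of the zipWithIndex()/join() pattern used in
# transform() to pair a feature RDD with a label RDD row by row before building a
# DataFrame. Assumes a local SparkSession and toy data; names are arbitrary.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

features = sc.parallelize([Vectors.dense([1.0, 0.0]), Vectors.dense([0.0, 1.0])])
labels = sc.parallelize(["1", "0"])

# key both RDDs by row index, join on the index, then drop it
pairs = (features.zipWithIndex().map(lambda r: (r[1], r[0]))
         .join(labels.zipWithIndex().map(lambda r: (r[1], r[0])))
         .map(lambda r: r[1]))

schema = StructType([StructField('features', VectorUDT(), True),
                     StructField('label', StringType(), True)])
df = pairs.toDF(schema)
df = df.withColumn('label', df.label.cast(DoubleType()))
df.show()
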
def predict(index, s):
    items = [i for i in s]
    feature = VectorUDT().deserialize(pickle.loads(items[0]))
    print(pickle.loads(items[1])[0])
    model = pickle.load(open(pickle.loads(items[1])[0] + "/model.pkl", "rb"))
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]

def predict(index, s): items = [i for i in s] modelPath = pickle.loads(items[1])[0] + "/model.h5" if not hasattr(os, "mlsql_models"): setattr(os, "mlsql_models", {}) if modelPath not in os.mlsql_models: # import tensorflow as tf # from keras import backend as K # gpu_options = tf.GPUOptions(allow_growth=True) # config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options) # session = tf.Session(config=config) # K.set_session(session) os.environ["CUDA_VISIBLE_DEVICES"] = "-1" print("Load Keras model %s, CUDA_VISIBLE_DEVICES:%s " % (modelPath, os.environ["CUDA_VISIBLE_DEVICES"])) from keras.models import load_model os.mlsql_models[modelPath] = load_model(modelPath) # here we can get train params trainParams = pickle.loads(items[2])[0] width = int(trainParams["fitParam.0.width"]) height = int(trainParams["fitParam.0.height"]) model = os.mlsql_models[modelPath] rawVector = pickle.loads(items[0]) feature = VectorUDT().deserialize(rawVector).toArray() feature_final = np.reshape(feature, [1, width, height, 3]) # y是一个numpy对象,是一个预测结果的数组。因为predict是支持批量预测的,所以是一个二维数组。 y = model.predict(feature_final) return [VectorUDT().serialize(Vectors.dense(y.tolist()[0]))]
def test_get_metadata(self):
    expected_metadata = {
        'float': {
            'spark_data_type': FloatType,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.NOCHANGE,
            'max_size': 1,
            'shape': 1
        },
        'dense': {
            'spark_data_type': DenseVector,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.ARRAY,
            'max_size': 2,
            'shape': 2
        },
        'sparse': {
            'spark_data_type': SparseVector,
            'is_sparse_vector_only': True,
            'intermediate_format': constants.CUSTOM_SPARSE,
            'max_size': 1,
            'shape': 2
        },
        'mixed': {
            'spark_data_type': DenseVector,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.ARRAY,
            'max_size': 2,
            'shape': 2
        },
    }

    with spark_session('test_get_metadata') as spark:
        data = [
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {0: 1.0}), DenseVector([1.0, 1.0])],
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})]
        ]
        schema = StructType([
            StructField('float', FloatType()),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT())
        ])
        df = create_test_data_from_schema(spark, data, schema)

        metadata = util._get_metadata(df)
        self.assertDictEqual(metadata, expected_metadata)

def create_mnist_data(spark):
    features = DenseVector([1.0] * 64)
    label_vec = DenseVector([0.0, 0.0, 1.0] + [0.0] * 7)
    label = 2.0
    data = [[features, label_vec, label]] * 10
    schema = StructType([StructField('features', VectorUDT()),
                         StructField('label_vec', VectorUDT()),
                         StructField('label', FloatType())])
    df = create_test_data_from_schema(spark, data, schema)
    return df

def test_one_hot_encoder():
    actual_df = fe.one_hot_encoder(source_df, input_cols=['id'])
    expected_df = op.create.df(
        [('id', LongType(), True),
         ('x', LongType(), True),
         ('y', LongType(), True),
         ('features', VectorUDT(), True),
         ('id***ONE_HOT_ENCODER', VectorUDT(), True)],
        [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), SparseVector(2, {0: 1.0})),
         (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), SparseVector(2, {1: 1.0})),
         (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), SparseVector(2, {}))])
    assert (expected_df.collect() == actual_df.collect())

def test_vector_assembler():
    actual_df = fe.vector_assembler(source_df, input_cols=['id', 'x', 'y'])
    expected_df = op.create.df(
        [('id', LongType(), True),
         ('x', LongType(), True),
         ('y', LongType(), True),
         ('features', VectorUDT(), True),
         ('id_x_y******VECTOR_ASSEMBLER', VectorUDT(), True)],
        [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), DenseVector([0.0, 1.0, 2.0])),
         (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), DenseVector([1.0, 2.0, 3.0])),
         (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), DenseVector([2.0, 3.0, 4.0]))])
    assert (expected_df.collect() == actual_df.collect())

def combine(pair):
    # list of np array
    if isinstance(pair[1], list):
        row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                    [[Vectors.dense(elem) for elem in pair[1]]]))
        return row, ArrayType(VectorUDT())
    # scalar
    elif len(pair[1].shape) == 0:
        row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                    [float(pair[1].item(0))]))
        return row, FloatType()
    # np array
    else:
        row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                    [Vectors.dense(pair[1])]))
        return row, VectorUDT()

def test_check_shape_compatibility(self):
    feature_columns = ['x1', 'x2', 'features']
    label_columns = ['y1', 'y_embedding']

    schema = StructType([
        StructField('x1', DoubleType()),
        StructField('x2', IntegerType()),
        StructField('features', VectorUDT()),
        StructField('y1', FloatType()),
        StructField('y_embedding', VectorUDT())
    ])
    data = [[1.0, 1, DenseVector([1.0] * 12), 1.0, DenseVector([1.0] * 12)]] * 10

    with spark_session('test_df_cache') as spark:
        df = create_test_data_from_schema(spark, data, schema)
        metadata = util._get_metadata(df)

        input_shapes = [[1], [1], [-1, 3, 4]]
        output_shapes = [[1], [-1, 3, 4]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        input_shapes = [[1], [1], [3, 2, 2]]
        output_shapes = [[1, 1], [-1, 2, 3, 2]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        bad_input_shapes = [[1], [1], [-1, 3, 5]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_input_shapes = [[2], [1], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_output_shapes = [[7], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           input_shapes, bad_output_shapes)

def predict(index, s): items = [i for i in s] modelPath = pickle.loads(items[1])[0] + "/model.pkl" if not hasattr(os, "mlsql_models"): setattr(os, "mlsql_models", {}) if modelPath not in os.mlsql_models: print("Load sklearn model %s" % modelPath) os.mlsql_models[modelPath] = pickle.load(open(modelPath, "rb")) model = os.mlsql_models[modelPath] rawVector = pickle.loads(items[0]) feature = VectorUDT().deserialize(rawVector) y = model.predict([feature.toArray()]) return [VectorUDT().serialize(Vectors.dense(y))]
def transform(self, metadata, hashes_and_labels=None, train=True):
    '''
    extract features from .asm files
    '''
    # check input type
    if type(metadata) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if hashes_and_labels and type(hashes_and_labels) != RDD and type(hashes_and_labels) != PipelinedRDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # word tokenization
    X = metadata.map(self._tokenize).cache()

    # create dictionary of words
    if train:
        self.dictionary = X.map(lambda row: row[1]) \
            .flatMap(lambda word: word) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda acc, w: acc + w) \
            .filter(lambda x: x[1] >= self.min_df) \
            .collectAsMap()
        self.dictionary = dict(zip(self.dictionary, range(len(self.dictionary))))

    # create word vectors
    X = X.map(self._term_frequency)

    # check if labels exist
    if hashes_and_labels:
        # combine X and y into a single dataframe;
        # joined rows look like (hash, (label, features))
        data = hashes_and_labels.join(X).map(lambda kv: (kv[0], kv[1][1], kv[1][0]))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('features', VectorUDT(), True),
            StructField('label', StringType(), True)
        ])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        # if no labels, just use X
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField("features", VectorUDT(), True)
        ])
        data = X.toDF(schema)
    return data

def test_get_col_info(self):
    with spark_session('test_get_col_info') as spark:
        data = [
            [0, 0.0, None, [1, 1], DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), DenseVector([1.0, 1.0])],
            [1, None, None, [1, 1], DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})]
        ]
        schema = StructType([
            StructField('int', IntegerType()),
            StructField('float', FloatType()),
            StructField('null', NullType()),
            StructField('array', ArrayType(IntegerType())),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT())
        ])
        df = create_test_data_from_schema(spark, data, schema)
        all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

        expected = [
            ('int', {int}, 1, 1),
            ('float', {float, NullType}, 1, 1),
            ('null', {NullType}, 1, 1),
            ('array', {list}, 2, 2),
            ('dense', {DenseVector}, 2, 2),
            ('sparse', {SparseVector}, 2, 1),
            ('mixed', {DenseVector, SparseVector}, 2, 2)
        ]

        for expected_col_info in expected:
            col_name, col_types, col_shape, col_size = expected_col_info
            assert all_col_types[col_name] == col_types, col_name
            assert col_shapes[col_name] == col_shape, col_name
            assert col_max_sizes[col_name] == col_size, col_name

def transform(self, X_rdd, y_rdd=None, train=True):
    '''
    given X RDD (and optionally y RDD), output dataframe with
    term frequency feature vector and labels
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # word tokenization
    X = X_rdd.map(self._tokenize).cache()

    # create dictionary of words
    if train:
        self.dictionary = X.map(lambda row: row[1]) \
            .flatMap(lambda word: word) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda acc, w: acc + w) \
            .filter(lambda x: x[1] >= self.min_df) \
            .collectAsMap()
        self.dictionary = dict(zip(self.dictionary, range(len(self.dictionary))))

    # create word vectors
    X = X.map(self._term_frequency)

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe;
        # joined rows look like (idx, ((hash, features), label))
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        data = X.join(y).map(lambda kv: (kv[1][0][0], kv[1][0][1], kv[1][1]))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('features', VectorUDT(), True),
            StructField('label', StringType(), True)
        ])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField("features", VectorUDT(), True)
        ])
        data = X.toDF(schema)
    return data

def createInputUtc(utc):
    spark = SparkSession.builder.getOrCreate()
    int_utc = utcToInt(utc)
    print("timestamp: ", utc, "num_time: ", int_utc)
    schema = T.StructType([T.StructField('features', VectorUDT())])
    return spark.createDataFrame([Row(features=DenseVector([int_utc]))], schema=schema)

def testSimpleOnDataFrame():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [("features", VectorUDT()),
         ("label", DoubleType())],
        featureNames,
        addFeatureNamesMetadata=True
    )

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), 1.1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 2.1),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0.0),
        Row(Vectors.dense(0.9, 0.67, 0.17), -1.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0.62)
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema))

    regressor = (catboost_spark.CatBoostRegressor()
                 .setIterations(20)
                 .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = regressor.fit(df)
    predictions = model.transform(df)

    print("predictions")
    predictions.show(truncate=False)

def trainALS(self, ranks, iterations):
    for rank in ranks:
        als = ALS(rank=rank, maxIter=iterations, regParam=0.1,
                  userCol="UserID", itemCol="MovieID", ratingCol="label")
        paramGrid = ParamGridBuilder().addGrid(als.rank, [rank]).build()
        crossval = CrossValidator(estimator=als,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=Remove_nan(metricName="rmse", labelCol="label",
                                                       predictionCol="prediction"),
                                  numFolds=5)
        self.trainDf.show()
        cvModel = crossval.fit(self.trainDf)
        predictions = cvModel.transform(self.testDf)
        rmse = Remove_nan(metricName="rmse", labelCol="label",
                          predictionCol="prediction").evaluate(predictions)
        print("****RMSE VALUE IS :*****", rmse)

        movieFactors = cvModel.bestModel.itemFactors.orderBy('id').cache()
        movieFactors.show(truncate=False)

        # convert the ALS item-factor arrays into ml vectors for KMeans
        convertToVectors = udf(lambda features: Vectors.dense(features), VectorUDT())
        movieFactors = movieFactors.withColumn("features", convertToVectors(movieFactors.features))

        kmeans = KMeans(k=50, seed=1)
        kModel = kmeans.fit(movieFactors)
        kmeansDF = kModel.transform(movieFactors)

        clusters = [1, 2]
        kmeansDF = kmeansDF.join(self.movieDf, kmeansDF.id == self.movieDf.MovieID).drop('MovieID')
        for cluster in clusters:
            movieNamesDf = kmeansDF.where(col("prediction") == cluster).select("MovieName")
            movieNamesDf.rdd.map(lambda row: row[0]).saveAsTextFile(
                outputDir + "Rank" + str(rank) + "Cluster" + str(cluster))


if __name__ == "__main__":
    mr = movieRecALS(inputDir + "/MovieLens100K_train.txt",
                     inputDir + "/MovieLens100K_test.txt",
                     inputDir + "/u.item")
    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    iterations = 20
    mr.trainALS(ranks, iterations)

def test_cast_vector():
    source_df = op.create.df(
        rows=[
            ("happy", [1, 2, 3]),
            ("excited", [4, 5, 6])
        ],
        cols=[
            ("emotion", StringType(), True),
            ("num", ArrayType(IntegerType()), True)
        ]
    )

    actual_df = source_df.cols.cast("num", Vectors)

    expected_df = op.create.df(
        rows=[
            ("happy", DenseVector([1, 2, 3])),
            ("excited", DenseVector([4, 5, 6]))
        ],
        cols=[
            ("emotion", StringType(), True),
            ("num", VectorUDT(), True)
        ]
    )

    assert (actual_df.collect() == expected_df.collect())

def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)

    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn(
        'features',
        mjolnir.spark.add_meta(df._sc, zero_features_udf('features'),
                               {'features': features}))

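# --- Illustrative sketch (not part of the snippet above) ---
# The core pattern in zero_features(): a Python UDF that rewrites a vector column and
# declares VectorUDT() as its return type. Self-contained toy example; the column name
# and the zeroed index are arbitrary choices for illustration.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),)], ["features"])

def zero_index_1(v):
    # zero out component 1 of the vector and return a new dense vector
    raw = v.toArray()
    raw[1] = 0.0
    return Vectors.dense(raw)

zero_udf = F.udf(zero_index_1, VectorUDT())
df.withColumn("features", zero_udf("features")).show(truncate=False)
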
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count:
    # group by movieId and compute count(1) as ratingCount,
    # avg(rating) as avgRating and variance(rating) as ratingVar
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    # the average rating is wrapped in a one-element vector because the scaler below expects a vector column
    movieFeatures.show(10)

    # ######## pipeline-based feature processing ########
    # bucketing: split the continuous ratingCount into 100 quantile buckets
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100, inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # normalization: min-max scale the average-rating vector
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating")

    # build and run the pipeline
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(movieFeatures)

    # cast the bucket id to an integer and unpack the scaled one-element vector back into a scalar
    movieProcessedFeatures = movieProcessedFeatures \
        .withColumn('ratingCountBucket', F.col('ratingCountBucket').cast(IntegerType())) \
        .withColumn('scaleAvgRating', udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))) \
        .drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)

def testBinaryClassificationWithClassWeightsMap():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()),
                     ("label", IntegerType())]
    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 1),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        pool_test_helpers.createSchema(srcSchemaData, featureNames, addFeatureNamesMetadata=True),
        srcData,
        {})

    classWeightsMap = collections.OrderedDict([("0", 1.0), ("1", 2.0)])

    classifier = (catboost_spark.CatBoostClassifier()
                  .setIterations(20)
                  .setClassWeightsMap(classWeightsMap)
                  .setLoggingLevel(catboost_spark.ELoggingLevel.Debug)
                  .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)

def _transform(self, dataset):
    inp = self.getOrDefault(self.inputCol)
    out = self.getOrDefault(self.predictionCol)
    mod_str = self.getOrDefault(self.modStr)
    use_vector_out = self.getOrDefault(self.useVectorOut)

    model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
    model_broadcast = dataset._sc.broadcast(model)

    def predict_vec(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        return Vectors.dense(model(x_data).detach().numpy().flatten())

    def predict_float(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        raw_prediction = model(x_data).detach().numpy().flatten()
        if len(raw_prediction) > 1:
            return float(np.argmax(raw_prediction))
        return float(raw_prediction[0])

    if use_vector_out:
        udfGenerateCode = F.udf(predict_vec, VectorUDT())
    else:
        udfGenerateCode = F.udf(predict_float, DoubleType())

    return dataset.withColumn(out, udfGenerateCode(inp))

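# --- Illustrative sketch (not part of the snippet above) ---
# _transform() receives its model as a base64-encoded dill payload held in a string
# Param. A minimal round trip of that encoding, using a plain dict as a stand-in for
# the torch model (assumes the `dill` package is installed):
import codecs
import dill

model = {"weights": [0.5, -0.25]}                                # stand-in for a torch model
mod_str = codecs.encode(dill.dumps(model), "base64").decode()    # roughly the string the modStr Param would hold
restored = dill.loads(codecs.decode(mod_str.encode(), "base64"))
assert restored == model
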
def _average_feature_vectors(self, data, outputCol):
    '''Average the feature vectors

    Attributes
    ----------
    data (DataFrame): input dataframe
    outputCol (str): name of the output column
    '''
    session = SparkSession.builder.getOrCreate()

    def _averager(v1, v2, v3):
        f1 = v1.toArray()
        f2 = v2.toArray()
        f3 = v3.toArray()
        length = min(len(f1), len(f2), len(f3))
        average = []
        for i in range(length):
            average.append((f1[i] + f2[i] + f3[i]) / 3.0)
        return Vectors.dense(average)

    session.udf.register("averager", _averager, VectorUDT())
    data.createOrReplaceTempView("table")
    sql = f"SELECT *, averager(feature0, feature1, feature2) AS {outputCol} from table"
    data = session.sql(sql)
    return data

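# --- Illustrative sketch (not part of the snippet above) ---
# _average_feature_vectors() registers a VectorUDT-returning UDF for use from Spark
# SQL. The same mechanism on toy data (local session; all names are arbitrary):
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()

def scale2(v):
    # return a new dense vector with every component doubled
    return Vectors.dense([2.0 * x for x in v.toArray()])

spark.udf.register("scale2", scale2, VectorUDT())

df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),)], ["feature0"])
df.createOrReplaceTempView("table")
spark.sql("SELECT *, scale2(feature0) AS scaled FROM table").show(truncate=False)
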
def test_model_logistic_regression_binary_class(self):
    import inspect
    import os
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # truncate the features
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(5, range(0, 5), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("label", "truncateFeatures(features) as features")

    lr = LogisticRegression(maxIter=100, tol=0.0001)
    model = lr.fit(data)
    # the name of the input for Logistic Regression is 'features'
    model_onnx = convert_sparkml(model, 'sparkml logistic regression',
                                 [('features', FloatTensorType([1, model.numFeatures]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    import pandas
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                basename="SparkmlLogisticRegression")

def compute_word2vec(self, input_df, output_vec_len, window_size=5, sub_test=False):
    """
    Compute the word2vec for a given dataframe

    @param input_df : the dataframe to perform the action upon
    @param output_vec_len : the length (int) of the output vector
    @param window_size : the word2vec window size (int)
    @param sub_test : if True, return the raw word2vec result without the vector conversion
    @return output dataframe with output column

    The input and output column names are taken from self.input_col and self.output_col.
    """
    # ensure that the input column is of type ArrayType(StringType())
    toArray = udf(lambda vs: vs, ArrayType(StringType()))
    toArray1 = udf(lambda vs: vs.toArray())
    df = input_df.withColumn(self.input_col, toArray(input_df[self.input_col]))

    # initialize word2vec
    word2Vec = Word2Vec(vectorSize=output_vec_len, windowSize=window_size, minCount=5,
                        inputCol=self.input_col, outputCol=self.output_col)

    # train word2vec model
    model = word2Vec.fit(df)

    # compute transformation
    result = model.transform(df)

    # convert result to a vector
    if not sub_test:
        conv = udf(lambda vs: Vectors.dense(vs), VectorUDT())
        out = result.withColumn(self.output_col, conv(result[self.output_col]))
        return out
    else:
        return result

def SentimentFeatureEngineer(selectreviewDF):
    RemovePunct_udf = udf(RemovePunct, StringType())
    countTokens_udf = udf(lambda words: len(words), IntegerType())
    RemoveEmptyEntry_udf = udf(RemoveEmpty, ArrayType(StringType()))
    GetCharacter_List_udf = udf(GetCharacter_List, ArrayType(IntegerType()))
    GetSentimentScore_udf = udf(GetSentimentScore, ArrayType(IntegerType()))
    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

    selectreviewDF = selectreviewDF.withColumn(
        'remove_punc', RemovePunct_udf(selectreviewDF['review_text']))

    tokenizer = Tokenizer(inputCol="remove_punc", outputCol="tokens_word")
    selectreviewDF = tokenizer.transform(selectreviewDF)

    # Reminder: do not combine the steps below, otherwise you will get an
    # error (some columns will not be found).
    selectreviewDF = selectreviewDF.withColumn(
        'num', countTokens_udf(selectreviewDF['tokens_word']))
    selectreviewDF = selectreviewDF.withColumn(
        'filtered_review_text_new', RemoveEmptyEntry_udf(selectreviewDF['tokens_word']))

    selectreviewDF = selectreviewDF.withColumn('Character_adj', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[0]) \
        .withColumn('Character_noun', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[1]) \
        .withColumn('Character_verb', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[2]) \
        .withColumn('Character_adv', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[3]) \
        .withColumn('sentiment_neg', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[0]) \
        .withColumn('sentiment_neu', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[1]) \
        .withColumn('sentiment_pos', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[2]) \
        .withColumn('sentiment_compound', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[3])

    return selectreviewDF

def data_format(data):
    indexers = [
        StringIndexer(inputCol=col, outputCol=col + "_index").fit(data)
        for col in categoricalColumns
    ]
    pipeline = Pipeline(stages=indexers)
    data_features = pipeline.fit(data).transform(data)

    features_withlabel = ['label'] + [c + "_index" for c in categoricalColumns] + numericCols
    data_split = data_features.select(features_withlabel)

    features = [f.col(c + "_index") for c in categoricalColumns] + [f.col(col) for col in numericCols]
    data_label_features = data_split.withColumn("features", f.array(features)).select('label', 'features')

    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
    df_with_vectors = data_label_features.select(
        data_label_features["label"],
        list_to_vector_udf(data_label_features["features"]).alias('features'))
    return df_with_vectors

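# --- Side note (not part of the snippet above) ---
# On Spark 3.1+ the lambda/VectorUDT UDF used above for the array -> vector conversion
# can likely be replaced by the built-in pyspark.ml.functions.array_to_vector helper;
# a minimal sketch under that version assumption:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.functions import array_to_vector

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0.0, [1.0, 2.0, 3.0])], ["label", "features"])
df_with_vectors = df.withColumn("features", array_to_vector(F.col("features")))
df_with_vectors.printSchema()   # 'features' is now a vector column
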
def testBinaryClassificationWithTargetBorder():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()),
                     ("label", DoubleType())]
    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), 0.1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 0.7),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0.33),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0.82),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0.93)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        pool_test_helpers.createSchema(srcSchemaData, featureNames, addFeatureNamesMetadata=True),
        srcData,
        {})

    classifier = (catboost_spark.CatBoostClassifier()
                  .setIterations(20)
                  .setTargetBorder(0.5)
                  .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)

def cast_factory(cls):

    # Parse to Vector
    if is_type(cls, Vectors):
        func_type = "udf"

        def cast_to_vectors(val, attr):
            return Vectors.dense(val)

        func_return_type = VectorUDT()

    # Parse standard data types
    elif get_spark_dtypes_object(cls):
        func_type = "column_exp"

        def cast_to_vectors(col_name, attr):
            return F.col(col_name).cast(get_spark_dtypes_object(cls))

        func_return_type = None

    # Add here any other parse you want
    else:
        RaiseIt.value_error(cls)

    return func_return_type, cast_to_vectors, func_type

class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([
            Row(label=1.0, features=self.dv1),
            Row(label=0.0, features=self.sv1)
        ])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)

        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))

def predict(index, s):
    items = [i for i in s]
    feature = VectorUDT().deserialize(pickle.loads(items[0]))
    model = pickle.loads(pickle.loads(items[1])[0])
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]