def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([1.2, 3.2, 1.3, -5.6]),),
         (Vectors.dense([4.3, -3.2, 5.7, 1.0]),),
         (Vectors.dense([0, 3.2, 4.7, -8.9]),)],
        ["dense"])
    model = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
    # the input tensor name must match PolynomialExpansion's inputCol
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml PolynomialExpansion',
        [('dense', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().expanded.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().dense.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPolynomialExpansion")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['expanded'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def polynomial_expansion(self, df, column):
    """Construct polynomial features column-wise with PolynomialExpansion."""
    print('PolynomialExpansionExample')
    # Cross features column-wise into polynomial terms:
    # degree 1: x1, x2
    # degree 2: x1, x2, x1*x2, x1^2, x2^2
    # degree 3: x1, x2, x1*x2, x1^2, x2^2, x1^2*x2, x1*x2^2, x1^3, x2^3
    polyExpansion = PolynomialExpansion(degree=2, inputCol=column,
                                        outputCol=column + '_poly')
    polyDF = polyExpansion.transform(df)
    return polyDF
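# A minimal usage sketch of the helper above (hypothetical setup: assumes an
# active SparkSession named `spark`; the column name `features` is illustrative).
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

sketch_df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])
sketch_poly = PolynomialExpansion(degree=2, inputCol="features", outputCol="features_poly")
# Degree-2 expansion of (2.0, 1.0) in Spark's output order (x, x^2, y, x*y, y^2)
# yields [2.0, 4.0, 1.0, 2.0, 1.0].
sketch_poly.transform(sketch_df).show(truncate=False)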
def polynomial_expansion_usecase():
    """Expand data features into polynomial terms."""
    spark = getSparkSession()
    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),),
                                (Vectors.dense([0.0, 0.0]),),
                                (Vectors.dense([3.0, -1.0]),)],
                               ["features"])
    polyExpansion = PolynomialExpansion(degree=3, inputCol="features",
                                        outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)
    polyDF.show(truncate=False)
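# For reference, Spark emits the degree-3 terms of (x, y) in the order
# (x, x^2, x^3, y, x*y, x^2*y, y^2, x*y^2, y^3), so show() prints:
# [2.0, 1.0]  -> [2.0, 4.0, 8.0, 1.0, 2.0, 4.0, 1.0, 2.0, 1.0]
# [0.0, 0.0]  -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
# [3.0, -1.0] -> [3.0, 9.0, 27.0, -1.0, -3.0, -9.0, 1.0, 3.0, -1.0]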
result.show(truncate=False)

# COMMAND ----------

### Polynomial expansion expands features into a polynomial feature space.
### This example expands the given features into a degree-3 polynomial space.
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),),
                            (Vectors.dense([0.0, 0.0]),),
                            (Vectors.dense([3.0, -1.0]),)],
                           ["features"])

polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)
polyDF.show(truncate=False)

# COMMAND ----------

### The discrete cosine transform (DCT) transforms a real-valued sequence
### from the time domain into the frequency domain.
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
                            (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
                            (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)],
                           ["features"])

dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
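# The snippet stops after constructing the DCT transformer; applying it uses
# the same Transformer API as PolynomialExpansion (a minimal sketch):
dctDf = dct.transform(df)
dctDf.select("featuresDCT").show(truncate=False)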
# from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([3.0, -1.0]),)
    ], ["features"])

    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)
    polyDF.show(truncate=False)
    # $example off$

    spark.stop()
else:
    encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
dataset = encoder_model.transform(dataset)
feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
dataset = assembler.transform(dataset)
scaler_model = None
if args.mode == 'train':
    scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec',
                            withStd=True, withMean=True)
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)
polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
dataset = polyExpansion.transform(dataset)
dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
glr = None
if args.mode == 'train':
    glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures',
                                      family='Binomial', linkPredictionCol='link_pred')
    paramGrid = ParamGridBuilder() \
        .addGrid(glr.link, ['logit']) \
        .addGrid(glr.regParam, [1e-5]) \
        .build()
    tvs = TrainValidationSplit(estimator=glr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'),
                               trainRatio=0.7)
    tvs_model = tvs.fit(dataset)
    print('----> {}'.format(tvs_model.validationMetrics))
    if args.save_model:
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])

    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)

    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
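# For reference, the degree-2 expansion of (x, y) is emitted in the order
# (x, x^2, y, x*y, y^2), so the loop above should print:
# Row(polyFeatures=DenseVector([-2.0, 4.0, 2.3, -4.6, 5.29]))
# Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]))
# Row(polyFeatures=DenseVector([0.6, 0.36, -1.1, -0.66, 1.21]))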
print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector))) # COMMAND ---------- from pyspark.ml.feature import PCA pca = PCA().setInputCol("features").setK(2) pca.fit(scaleDF).transform(scaleDF).show(20, False) # COMMAND ---------- from pyspark.ml.feature import PolynomialExpansion pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol( "polyFeatures") pe.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ChiSqSelector, Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn\ .transform(sales.select("Description", "CustomerId"))\ .where("CustomerId IS NOT NULL") prechi = fittedCV.transform(tokenized)\ .where("CustomerId IS NOT NULL") chisq = ChiSqSelector()\ .setFeaturesCol("countVec")\ .setLabelCol("CustomerId")\ .setNumTopFeatures(2)
>>> df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
>>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
>>> px.transform(df).head().expanded
DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
>>> px.setParams(outputCol="test").transform(df).head().test
DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
>>> polyExpansionPath = temp_path + "/poly-expansion"
>>> px.save(polyExpansionPath)
>>> loadedPx = PolynomialExpansion.load(polyExpansionPath)
>>> loadedPx.getDegree() == px.getDegree()
True
df_with_vectors = T1_df4.select(
    'B1err', 'ratio', 'T1',
    list_to_vec(T1_df4["B1Knots"]).alias("B1Knots"),
    list_to_vec(T1_df4["RatioKnots"]).alias("RatioKnots"))

vec = VectorAssembler(inputCols=["B1err", "ratio", "B1Knots", "RatioKnots"], outputCol="features")
T1_df5 = vec.transform(df_with_vectors)

# Polynomial expansion with interactions
polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="Interaction")
polyDF = polyExpansion.transform(T1_df5)

# Regression time!
lr = LinearRegression(labelCol="T1", featuresCol="Interaction")
model = lr.fit(polyDF)

# Now we want to interpolate data onto a 100x100 grid:
x1 = np.linspace(0.1, 2, 100)       # B1err
x2 = np.linspace(0.0005, 2.5, 100)  # Ratio
x1_2 = np.zeros([100, 100])
x2_2 = np.zeros([100, 100])
for i in range(0, len(x1)):
    for j in range(0, len(x2)):
        x1_2[i, j] = x1[i]
        x2_2[i, j] = x2[j]
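# The nested loops above just tile the two axes; NumPy can build the same
# grids in one call (equivalent result with 'ij' indexing):
x1_2, x2_2 = np.meshgrid(x1, x2, indexing='ij')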
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)

dataDF = sqlContext.createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                                     (Vectors.dense([0.0, 0.0]),),
                                     (Vectors.dense([0.6, -1.1]),)],
                                    ["features"])
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(dataDF)
for expanded in polyDF.select("polyFeatures").take(3):
    print(expanded)

"""OUTPUT with degree=2 (for comparison):
Row(polyFeatures=DenseVector([-2.0, 4.0, 2.3, -4.6, 5.29]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, -1.1, -0.66, 1.21]))"""

"""OUTPUT with degree=3 (as configured above):
Row(polyFeatures=DenseVector([-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.167]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331]))"""
""" Created on Sun Jun 25 21:00:59 2017 @author: vishal """ from __future__ import print_function from pyspark.sql import SparkSession session = SparkSession.builder.appName('Polynomial Expension').getOrCreate() from pyspark.ml.linalg import Vectors df = session.createDataFrame([(Vectors.dense([2.0, 1.0]), ), (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([3.0, -1.0]), )], ["features"]) #df.show() from pyspark.ml.feature import PolynomialExpansion polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="pe_feature") ps_df = polyExpansion.transform(df) print(df.first()) print(ps_df.first()) #ps_df.select('pe_feature').show() session.stop()
min_value = data.agg(F.min("ablation_rate")).collect()[0][0]
max_value = data.agg(F.max("ablation_rate")).collect()[0][0]
print("Min/max ablation rate: " + str(min_value) + " and " + str(max_value))

# Transform independent variable columns into a vector of features
vectorAssembler = VectorAssembler(inputCols=["elevation", "time"], outputCol="features")
vector_data = vectorAssembler.transform(data)
vector_data = vector_data.select(["features", "ablation_rate"])
vector_data.show(vector_data.count(), truncate=False)

# Convert to polynomial features
polyExpansion = PolynomialExpansion(degree=1, inputCol='features', outputCol='polyFeatures')
poly_data = polyExpansion.transform(vector_data)
poly_data = poly_data.select(["polyFeatures", "ablation_rate"])
poly_data.show(truncate=False)

# Split into training and test data sets
splits = poly_data.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]
print("Train data count")
print(train_df.count())
print("Test data count")
print(test_df.count())

lr = LinearRegression(featuresCol='polyFeatures', labelCol='ablation_rate', regParam=0.01)
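# Note: degree=1 means no expansion in Spark ML, so the polynomial step above
# is a pass-through. Raising the degree is what actually adds the squared and
# interaction terms, e.g. for (elevation, time):
# (elevation, elevation^2, time, elevation*time, time^2)
# polyExpansion = PolynomialExpansion(degree=2, inputCol='features', outputCol='polyFeatures')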