def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
             (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
             (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
        model = PolynomialExpansion(degree=2,
                                    inputCol="dense",
                                    outputCol="expanded")

        # the input name must match the transformer's inputCol ('dense')
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml PolynomialExpansion',
            [('dense', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().expanded.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().dense.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPolynomialExpansion")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['expanded'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
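        # Sanity note (hand-worked, not asserted by the test): with 4 input features
        # and degree=2, the expanded vector has C(4 + 2, 2) - 1 = 14 values.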
Example #2
def polynomial_expansion(self, df, column):
    """
    Build polynomial features per column with PolynomialExpansion.
    """
    print('PolynomialExpansionExample')
    # cross-construct polynomial features per column
    # degree 1: x1, x2
    # degree 2: x1, x2, x1*x2, x1^2, x2^2
    # degree 3: x1, x2, x1*x2, x1^2, x2^2, x1^2*x2, x1*x2^2, x1^3, x2^3
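    # e.g. degree=2 on [2.0, 1.0] yields, in Spark's ordering (x, x*x, y, x*y, y*y),
    # the vector [2.0, 4.0, 1.0, 2.0, 1.0] (hand-worked illustration, not captured output)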
    polyExpansion = PolynomialExpansion(degree=2,
                                        inputCol=column,
                                        outputCol=column + '_poly')
    polyDF = polyExpansion.transform(df)
    return polyDF
def polynomial_expansion_usecase():
    """
        Expand data features via polynomial expansion
    """
    spark = getSparkSession()

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                                (Vectors.dense([0.0, 0.0]), ),
                                (Vectors.dense([3.0, -1.0]), )], ["features"])

    polyExpansion = PolynomialExpansion(degree=3,
                                        inputCol="features",
                                        outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)

    polyDF.show(truncate=False)

# COMMAND ----------

###Polynomial expansion is the process of expanding features into a polynomial feature space. This example expands the given features into a degree-3 polynomial space
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3,
                                    inputCol="features",
                                    outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)
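# For [2.0, 1.0] with degree=3 the expansion is, in Spark's ordering,
# [2.0, 4.0, 8.0, 1.0, 2.0, 4.0, 1.0, 2.0, 1.0] (hand-worked, not captured output).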

# COMMAND ----------

###Discrete cosine transform (DCT) transforms a real-valued sequence from the time domain into the frequency domain
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ),
                            (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ),
                            (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )],
                           ["features"])

dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")  # inverse=True applies the inverse DCT
dctDf = dct.transform(df)
dctDf.select("featuresDCT").show(truncate=False)

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([3.0, -1.0]),)
    ], ["features"])

    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)

    polyDF.show(truncate=False)
    # $example off$

    spark.stop()
Example #6
	else:
		encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	dataset = encoder_model.transform(dataset)
	feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
	assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
	dataset = assembler.transform(dataset)
	scaler_model = None
	if args.mode == 'train':
		scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
		scaler_model = scaler.fit(dataset)
		scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	else:
		scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	dataset = scaler_model.transform(dataset)
	polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
	dataset = polyExpansion.transform(dataset)
	dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
	glr = None
	if args.mode == 'train':
		glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures', family='Binomial', linkPredictionCol='link_pred')
		paramGrid = ParamGridBuilder() \
					.addGrid(glr.link, ['logit']) \
					.addGrid(glr.regParam, [1e-5]) \
					.build()
		tvs = TrainValidationSplit(estimator=glr, \
									estimatorParamMaps=paramGrid, \
									evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'), \
									trainRatio=0.7)
		tvs_model = tvs.fit(dataset)
		print('----> {}'.format(tvs_model.validationMetrics))
		if args.save_model:
Example #7
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)
    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
Example #8
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)

# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
Example #9
  (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])

# Fit the pipeline to training documents.

qq = vat.transform(dat).head()
qq

qq = poly.transform(df).head()

>>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
>>> px.transform(df).head().expanded
DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
>>> px.setParams(outputCol="test").transform(df).head().test
DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
>>> polyExpansionPath = temp_path + "/poly-expansion"
>>> px.save(polyExpansionPath)
>>> loadedPx = PolynomialExpansion.load(polyExpansionPath)
>>> loadedPx.getDegree() == px.getDegree()
True
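
# A quick dimensionality check (a sketch, not part of the snippets above; assumes
# Python 3.8+ for math.comb): PolynomialExpansion on n input features with degree d
# produces C(n + d, d) - 1 output features, since the constant term is dropped.
from math import comb

def poly_expansion_size(n_features, degree):
    # count of monomials of total degree 1..degree in n_features variables
    return comb(n_features + degree, degree) - 1

assert poly_expansion_size(2, 2) == 5  # matches the 5-element DenseVector above
assert poly_expansion_size(2, 3) == 9  # matches the degree-3 examples in this file
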
df_with_vectors = T1_df4.select(
    'B1err', 'ratio', 'T1',
    list_to_vec(T1_df4["B1Knots"]).alias("B1Knots"),
    list_to_vec(T1_df4["RatioKnots"]).alias("RatioKnots"))

vec = VectorAssembler(inputCols=["B1err", "ratio", "B1Knots", "RatioKnots"],
                      outputCol="features")

T1_df5 = vec.transform(df_with_vectors)

#Polynomial expansion with interactions

polyExpansion = PolynomialExpansion(degree=2,
                                    inputCol="features",
                                    outputCol="Interaction")
polyDF = polyExpansion.transform(T1_df5)

#Regression Time!
lr = LinearRegression(labelCol="T1", featuresCol="Interaction")
model = lr.fit(polyDF)

#Now we want to interpolate the data onto a 100*100 grid:
x1 = np.linspace(0.1, 2, 100)  #B1err
x2 = np.linspace(0.0005, 2.5, 100)  #Ratio
x1_2 = np.zeros([100, 100])
x2_2 = np.zeros([100, 100])
for i in range(0, len(x1)):
    for j in range(0, len(x2)):
        x1_2[i, j] = x1[i]
        x2_2[i, j] = x2[j]
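# Note (not in the original script): the two nested loops above are equivalent to
#   x1_2, x2_2 = np.meshgrid(x1, x2, indexing='ij')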
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
dataDF = sqlContext.createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                                     (Vectors.dense([0.0, 0.0]),), (Vectors.dense([0.6, -1.1]),)], ["features"])
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(dataDF)
for expanded in polyDF.select("polyFeatures").take(3):
    print(expanded)

"""OUTPUT 
Row(polyFeatures=DenseVector([-2.0, 4.0, 2.3, -4.6, 5.29]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, -1.1, -0.66, 1.21]))"""

"""
Row(polyFeatures=DenseVector([-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12,167]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.72
, -1.331]))"""
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA
pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
Example #13
"""
Created on Sun Jun 25 21:00:59 2017

@author: vishal
"""

from __future__ import print_function
from pyspark.sql import SparkSession

session = SparkSession.builder.appName('Polynomial Expansion').getOrCreate()

from pyspark.ml.linalg import Vectors

df = session.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                              (Vectors.dense([0.0, 0.0]), ),
                              (Vectors.dense([3.0, -1.0]), )], ["features"])

#df.show()
from pyspark.ml.feature import PolynomialExpansion

polyExpansion = PolynomialExpansion(degree=2,
                                    inputCol="features",
                                    outputCol="pe_feature")

ps_df = polyExpansion.transform(df)
print(df.first())
print(ps_df.first())
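# Expected output (hand-worked, assuming Spark's x, x*x, y, x*y, y*y ordering for degree 2):
# Row(features=DenseVector([2.0, 1.0]))
# Row(features=DenseVector([2.0, 1.0]), pe_feature=DenseVector([2.0, 4.0, 1.0, 2.0, 1.0]))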

#ps_df.select('pe_feature').show()

session.stop()
min_value = data.agg(F.min("ablation_rate")).collect()[0][0]
max_value = data.agg(F.max("ablation_rate")).collect()[0][0]
print("Min/max ablation rate: " + str(min_value) + " and " + str(max_value))

# Transform independent variable columns into vector of features
vectorAssembler = VectorAssembler(inputCols=["elevation", "time"],
                                  outputCol="features")
vector_data = vectorAssembler.transform(data)
vector_data = vector_data.select(["features", "ablation_rate"])
vector_data.show(vector_data.count(), truncate=False)

# Convert to polynomial features
polyExpansion = PolynomialExpansion(degree=1,
                                    inputCol='features',
                                    outputCol='polyFeatures')
poly_data = polyExpansion.transform(vector_data)
poly_data = poly_data.select(["polyFeatures", "ablation_rate"])
poly_data.show(truncate=False)

# Split into training and test data sets
splits = poly_data.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]
print("Train data count")
print(train_df.count())
print("Test data count")
print(test_df.count())

lr = LinearRegression(featuresCol='polyFeatures',
                      labelCol='ablation_rate',
                      regParam=0.01)