Example #1
def sl_by_libsvm(spark, file_name, in_folder, out_folder):
    rdd_name = os.path.join(out_folder, file_name)

    if os.path.exists(rdd_name):
        data = spark.read.format("libsvm").load(rdd_name)
    else:
        data = read_csv(spark, os.path.join(in_folder, file_name))
        data.show(truncate=False)
        print("read one file use time:" + str(time.time() - exec_start_time))
        data = data.withColumn("failure", data["failure"].cast("double"))
        # ut = MLUtils.convertVectorColumnsToML(df, "indexedFeatures")
        # ut = ut.withColumnRenamed("failure", "label").withColumnRenamed("indexedFeatures", "features")
        # ut = ut.withColumn("label", ut["label"].cast("double"))

        scaler = MaxAbsScaler(inputCol="indexedFeatures", outputCol="features")
        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(data)
        # rescale each feature to range [-1, 1].
        data = scalerModel.transform(data)
        data.show(truncate=False)

        data = data.select("failure", "features")
        data.write.format("libsvm").save(rdd_name)

    data.show(truncate=False)

    return data
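A minimal usage sketch for the helper above, assuming an active SparkSession and that the project's read_csv helper, exec_start_time timer, and the usual imports are in scope; the file and folder names below are made up for illustration.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sl_by_libsvm_demo").getOrCreate()

# First call reads the CSV, scales it and caches it in libsvm format under out_folder;
# later calls load the cached libsvm copy directly.
scaled = sl_by_libsvm(spark, "smart_stats.csv", in_folder="raw", out_folder="cache")
scaled.printSchema()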
Example #2
    def test_maxabs_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlMaxAbsScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
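For reference, the same round trip can be exercised without the project's test helpers; a sketch that reuses the fitted model from the test above and assumes onnxmltools and onnxruntime are installed. As the comment in the test notes, the input tensor name must match the inputCol, and the output name follows the outputCol.

import numpy
import onnxruntime
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType

model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler',
                             [('features', FloatTensorType([1, 3]))])
sess = onnxruntime.InferenceSession(model_onnx.SerializeToString())
sample = numpy.array([[1.0, 0.1, -1.0]], dtype=numpy.float32)
# 'scaled_features' is the graph output corresponding to the scaler's outputCol
print(sess.run(['scaled_features'], {'features': sample}))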
Example #3
def max_abs_scale(dataFrame, inputColNames):

    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler = MaxAbsScaler(inputCol="features",
                          outputCol="scaled features")
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    return scaledDF
Example #4
def maxAbsScalerModel(df, conf):
    """
        input: spark-dataFrame, conf [configuration params]
        return value: scaler, model
    """
    inp = conf.get("inputCol", None)
    output = conf.get("outputCol", None)
    scaler = MaxAbsScaler(inputCol = inp, outputCol = output)
    model = scaler.fit(df)
    return scaler, model
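A short usage sketch for the wrapper above; the DataFrame and conf values are made up for illustration and assume an active SparkSession.

from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("maxAbsScalerModel_demo").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, -3.0]),),
                            (Vectors.dense([2.0, 6.0]),)], ["features"])

# conf keys map directly onto the MaxAbsScaler constructor arguments
conf = {"inputCol": "features", "outputCol": "scaledFeatures"}
scaler, model = maxAbsScalerModel(df, conf)
model.transform(df).show(truncate=False)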
Example #5
    def test_clear_param(self):
        df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
        maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
        model = maScaler.fit(df)
        self.assertTrue(model.isSet(model.outputCol))
        self.assertEqual(model.getOutputCol(), "scaled")
        model.clear(model.outputCol)
        self.assertFalse(model.isSet(model.outputCol))
        self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
        output = model.transform(df)
        self.assertEqual(model.getOutputCol(), output.schema.names[1])
Example #6
    def max_abs_scaler(self, df, column):
        """
        Per-column feature normalization by maximum absolute value (MaxAbsScaler)
        """
        print('MaxAbsScalerExample')

        # Scale every column into [-1, 1] using its maximum absolute value
        scaler = MaxAbsScaler(inputCol=column, outputCol=column + '_maxabs')
        scalerModel = scaler.fit(df)
        scaledData = scalerModel.transform(df)
        return scaledData
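To illustrate what the method produces, a self-contained sketch that applies the same scaler settings directly; the data is invented, and the output column naming follows the method above. Note that MaxAbsScaler expects a vector column, so the column passed in must already hold Vectors.

from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("max_abs_scaler_demo").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([2.0, -1.0]),),
                            (Vectors.dense([4.0, 0.5]),)], ["features"])

# same configuration as the method above: output column is '<input>_maxabs'
scaler = MaxAbsScaler(inputCol="features", outputCol="features_maxabs")
scaler.fit(df).transform(df).show(truncate=False)
# each feature is divided by its max absolute value: rows become [0.5, -1.0] and [1.0, 0.5]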
Example #7
def scaling(dataFrame, inputColName):
    outputColName = "scaled " + inputColName
    assembler = VectorAssembler(inputCols=[inputColName],
                                outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = MaxAbsScaler(inputCol="features",
                          outputCol=outputColName)
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled column '{0:s}' to the range [-1, 1] and created new column '{1:s}'.".format(inputColName, outputColName))
    return scaledDF
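A quick check of the helper above, assuming an active SparkSession and the imports used inside the function (VectorAssembler, MaxAbsScaler, udf, FloatType); the column name and values are invented.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("scaling_demo").getOrCreate()
df = spark.createDataFrame([(1.0,), (5.0,), (-10.0,)], ["amount"])

scaled = scaling(df, "amount")
scaled.show()
# 'scaled amount' holds 0.1, 0.5 and -1.0: each value divided by max(|amount|) = 10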
Example #8
    def maxabs_scale(self, columns='*'):
        '''
        rescale the columns by dividing by the max absolute value
        '''
        if columns == "*":
            columns = self._df.schema.names
        else:
            assert isinstance(columns,
                              list), "Error: columns argument must be a list!"

        for column in columns:
            outputcol = column + '_scaled'
            assembler = VectorAssembler(inputCols=[column],
                                        outputCol='features')
            df = assembler.transform(self._df)
            scaler = MaxAbsScaler(inputCol='features', outputCol=outputcol)
            df = scaler.fit(df).transform(df).drop('features')
            # pull the single element out of the vector back into a float column
            # (FloatType from pyspark.sql.types; without it the udf returns strings)
            to_float = udf(lambda x: float(x[0]), FloatType())
            self._df = df.withColumn(outputcol, to_float(outputcol))
        return self._df
Example #9
# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol(
    "features").setOutputCol("features_minmax_scaled")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features").setOutputCol(
    "features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------
Example #10
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import MaxAbsScaler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()

    # $example on$
    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MaxAbsScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [-1, 1].
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    spark.stop()
Example #11
def scaler(input_features):
    scaler = MaxAbsScaler(inputCol="raw_features", outputCol="features")
    scalerModel = scaler.fit(input_features)
    scaledData = scalerModel.transform(input_features).drop("raw_features")
    #scaledData.show(3)
    return scaledData
assembler = VectorAssembler(inputCols=[
    "event_id_index", "age_index", "longitude_index", "device_model_index",
    "latitude_index", "phone_brand_index"
],
                            outputCol="features")
res = assembler.transform(df_r)
res = res.drop("event_id_index").drop("age_index").drop(
    "longitude_index").drop("device_model_index").drop("device_id_index").drop(
        "phone_brand_index")
res.show()

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(res)

# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(res)
scaledData.show()
scaledData.select("scaledFeatures").show()
res = scaledData.limit(100000)

(trainingData, testingData) = res.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="group_index",
                            featuresCol="scaledFeatures")
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testingData)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol="group_index")  # the snippet is cut off here; defaults cover the remaining arguments

# COMMAND ----------

sScaler.fit(scaleDF).transform(scaleDF).show()  # sScaler and scaleDF are defined in cells omitted from this snippet


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------
    def process(self, data_input, data_output):
        """
        A Spark process to do feature engineering
        :param data_input: data input filename
        :param data_output: data output filename
        """

        df = self.spark.read.parquet(data_input).select('SHP_DATE_CREATED_ID', 'SHP_DATETIME_CREATED_ID', 'SHP_DATE_HANDLING_ID', 'SHP_DATETIME_HANDLING_ID', 'SHP_SENDER_ID', 'SHP_ORDER_COST', 'CAT_CATEG_ID_L7', 'SHP_ADD_ZIP_CODE', 'SHP_DATE_SHIPPED_ID', 'SHP_DATETIME_SHIPPED_ID', 'HT_REAL')

        # 1. SHP_ORDER_COST_INT: cast the SHP_ORDER_COST column from float to integer.
        df = df.withColumn("SHP_ORDER_COST_INT", (df["SHP_ORDER_COST"].cast(IntegerType())))

        # 2. SHP_DAY: add a column indicating the day of the week on which the payment was credited.
        shp_day_udf = udf(self.shp_day, IntegerType())

        df = df.withColumn('SHP_DAY', shp_day_udf(df['SHP_DATE_HANDLING_ID']))

        # 3. WKND_DAY: add a column indicating whether the payment was credited over the weekend.
        weekend_day_udf = udf(self.weekend_day, IntegerType())

        df = df.withColumn('WKND_DAY', weekend_day_udf(df['SHP_DATE_HANDLING_ID']))
        df.select('WKND_DAY').show(10)

        # 4. WK_NUM: add a column with the week of the year in which the payment was made.
        week_number_udf = udf(self.week_number, IntegerType())

        df = df.withColumn('WK_NUM', week_number_udf(df['SHP_DATE_HANDLING_ID']))
        df.select('WK_NUM').show(10)

        # 5. MONTH_NUM: add a column indicating the month of the payment.
        month_number_udf = udf(self.month_number, IntegerType())
        df = df.withColumn('MONTH_NUM', month_number_udf(df['SHP_DATE_HANDLING_ID']))

        # 6. TIMESTAMP: add timestamp versions of the date columns.

        get_timestamp_udf = udf(self.get_timestamp, IntegerType())
        df = df.withColumn('SHP_DATE_HANDLING_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_HANDLING_ID']))
        df = df.withColumn('SHP_DATE_CREATED_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_CREATED_ID']))

        my_handling_time_udf = udf(self.my_handling_time, IntegerType())

        df = df.withColumn('HT', my_handling_time_udf(array('SHP_DATETIME_SHIPPED_ID', 'SHP_DATETIME_HANDLING_ID')))
        shp_sender_indexer = StringIndexer(inputCol="SHP_SENDER_ID", outputCol="SHP_SENDER_ID_NUM").fit(df)
        df = shp_sender_indexer.transform(df)
        shp_sender_indexer = StringIndexer(inputCol="CAT_CATEG_ID_L7", outputCol="CAT_CATEG_ID_L7_NUM").fit(df)
        df = shp_sender_indexer.transform(df)

        #create the vector assembler 
        vec_assembler = VectorAssembler(inputCols=['SHP_DATE_HANDLING_TIMESTAMP', 'SHP_DATE_CREATED_TIMESTAMP','SHP_SENDER_ID_NUM', 'CAT_CATEG_ID_L7_NUM', 
                            'SHP_ORDER_COST_INT', 'SHP_DAY', 'WKND_DAY', 
                            'WK_NUM', 'MONTH_NUM', 'SHP_ADD_ZIP_CODE'], outputCol='features')

        #transform the values
        features_df = vec_assembler.transform(df)

        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(features_df)

        # rescale each feature to range [-1, 1].
        scaledData = scalerModel.transform(features_df)

        #Save dataset as parquet
        scaledData.write.format("parquet").mode('overwrite').option("header", "true").save(data_output)
Example #15
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)","avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
dfJoin1 = dfJoin.select("avg-sentiment","avg-followers","avg-volume")
inputFeatures = ["avg-sentiment","avg-followers","avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

#Elbow method
import numpy as np
cost = np.zeros(10)
for k in range(2,10):
    kmeans = KMeans().setK(k).setFeaturesCol("scaledFeatures").setPredictionCol("prediction").setMaxIter(1).setSeed(1)
    model = kmeans.fit(scaledData)
    cost[k] = model.computeCost(scaledData)

# COMMAND ----------
Example #16
@author: luogan
"""

from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("dataFrame") \
    .getOrCreate()
dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -8.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, -4.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 8.0]),
)], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()
dfGoogle = dfJoin.select('*').where(dfJoin.company == 'GOOGLE')
dfNetflix = dfJoin.select('*').where(dfJoin.company == 'NETFLIX')
dfSnapchat = dfJoin.select('*').where(dfJoin.company == 'SNAPCHAT')
dfMicrosoft = dfJoin.select('*').where(dfJoin.company == 'MICROSOFT')
dfFacebook.describe().toPandas().transpose()
display(dfFacebook)

# COMMAND ----------

#Feature scaling using MaxAbsScaler
vectorAssembler = VectorAssembler(
    inputCols=['avg-sentiment', 'avg-followers', 'avg-volume'],
    outputCol='features')
v_dffacebook = vectorAssembler.transform(dfFacebook)
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(v_dffacebook)
scaledData = scalerModel.transform(v_dffacebook)
scaledData.select("features", "scaledFeatures").show()
v_dffacebook1 = scaledData.select(['features', 'scaledFeatures', 'avg-close'])
v_dffacebook1.show()

# COMMAND ----------

#Train test split
train_df, test_df = v_dffacebook1.randomSplit([0.8, 0.2])

# COMMAND ----------

#Linear Regression model
lr = LinearRegression(featuresCol='features', labelCol='avg-close', maxIter=10)
lr_model = lr.fit(train_df)