Example #1
def transform_data_in_pipeline(df):
    """

    :param df:
    :return:
    """

    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble features vector from Spark dataframe fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Apply standard scaling to unit standard deviation (withMean defaults to False, so no mean centring)
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the dataframe with the additional transformed features vector
    return pipeline_model.transform(df)
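For context, a minimal usage sketch of the function above; the SparkSession, the sample rows and the show() call are illustrative assumptions, not part of the original example:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(1.0, 2.0, 4, 120.0), (0.5, 3.5, 3, 95.0)],
    ['x', 'y', 'star_rating_number', 'avg_adr'])
# Adds the 'features' and 'scaledFeatures' vector columns
transform_data_in_pipeline(sample_df).show(truncate=False)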
Example #2
def prepare_data():
    """Commodity function to read the data from the files and prepare the features for the kmeans model fit.
    """
    # Read data from files.
    _data = load_data()

    # As the distributions of the following features are not normal, they are log-scaled to be closer
    # to a normal distribution, which helps the k-means algorithm perform better.
    _data = _data.withColumn('log_age', F.log('age')).withColumn('log_avg_buy', F.log('avg_buy'))\
        .withColumn('log_min_buy', F.log('min_buy')).withColumn('log_max_buy', F.log('max_buy'))

    # Select the features to use in k-means. The features are also standard-scaled, i.e. mean-centred
    # and scaled to unit standard deviation.
    features = _data.columns[4:]

    assembler = VectorAssembler(inputCols=features,
                                outputCol='features_unscaled')
    assembled = assembler.transform(_data)

    scaler = StandardScaler(inputCol='features_unscaled',
                            outputCol='features',
                            withStd=True,
                            withMean=True)
    scaler_model = scaler.fit(assembled)
    scaled_data = scaler_model.transform(assembled)

    return scaled_data, features
Example #3
def normalizationBySpark(RDDSparkDF):
  
  scalerSD = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False)
  #scalerMaxMin = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
  #scalerMaxAbs = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
  
  # Compute summary statistics and generate the StandardScalerModel
  scalerSDModel = scalerSD.fit(RDDSparkDF)
  #scalerMaxMinModel = scalerMaxMin.fit(RDDSparkDF)
  #scalerMaxAbsModel = scalerMaxAbs.fit(RDDSparkDF)
  # Scale each feature to unit standard deviation (withMean=False, so no centring)
  scaledDataSD = scalerSDModel.transform(RDDSparkDF)
  #scaledDataMinMax = scalerMaxMinModel.transform(RDDSparkDF)
  #scaledDataMaxAbs = scalerMaxAbsModel.transform(RDDSparkDF)
  
  #Compute summary statistics by fitting the StandardScaler
  #Compute summary statistics and generate MinMaxScalerModel
  #print("Features scaled by SD to range: [%f, %f]" % (scaledDataSD.getMin(), scaledDataSD.getMax()))
  #print("Features scaled by MinMax to range: [%f, %f]" % (scaledDataMinMax.getMin(), scaledDataMinMax.getMax()))
  
  # extractRow, newColumns and col2 are assumed to be defined elsewhere in this module
  scaledFeatures_outcome = scaledDataSD.rdd.map(extractRow).toDF(newColumns)
  
  leftDF = RDDSparkDF.select(col2)
  df = leftDF.join(scaledFeatures_outcome, ["KEY"])
  
  #return scaledDataSD, scaledDataMinMax,scaledDataMaxAbs,df
  return df
Example #4
def generate_train_test(data,
                        multiplier_minority, 
                        fraction_majority, 
                        label_col='label',
                        minority_tag=1,
                        train_perc=0.7):
    '''
    Train test split on the data (after the step of features assembling)

    multiplier_minority: how many resampled fractions of the minority class to generate (passed to the resample helper)
    fraction_majority: sampling fraction for the majority class (passed to the resample helper)
    label_col: name of the label column
    minority_tag: label value of the under-represented (minority) class
    train_perc: fraction of the data that goes to the training set
    '''
    po = data.filter("{} == {}".format(label_col, minority_tag))
    ne = data.filter("{} != {}".format(label_col, minority_tag))
    training_po, testing_po = po.randomSplit([train_perc, 1-train_perc], seed = 100)
    training_ne, testing_ne = ne.randomSplit([train_perc, 1-train_perc], seed = 100)
    training = training_po.union(training_ne)
    training = resample(training, 
                        multiplier_minority=multiplier_minority, 
                        fraction_majority=fraction_majority)
    testing = testing_po.union(testing_ne)
    scaler = StandardScaler(inputCol="features", 
                            outputCol="scaledFeatures",
                            withStd=True, 
                            withMean=False)
    scale_model = scaler.fit(training)
    training = scale_model.transform(training)
    testing = scale_model.transform(testing)
    return training, testing
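generate_train_test relies on a project-specific resample helper that is not shown above. A hypothetical stand-in consistent with the docstring (oversample the minority class, downsample the majority class) might look like the sketch below; the exact behaviour of the original helper is an assumption:

def resample(df, multiplier_minority, fraction_majority,
             label_col='label', minority_tag=1):
    # Hypothetical stand-in: replicate the minority class and sample the majority class.
    minority = df.filter("{} == {}".format(label_col, minority_tag))
    majority = df.filter("{} != {}".format(label_col, minority_tag))
    oversampled = minority.sample(withReplacement=True,
                                  fraction=float(multiplier_minority),
                                  seed=100)
    downsampled = majority.sample(withReplacement=False,
                                  fraction=fraction_majority,
                                  seed=100)
    return oversampled.union(downsampled)

Note also that the scaler is fit on the training split only and then applied to both splits, which avoids leaking test statistics into the scaling.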
Example #5
def preprocess(df, should_undersample, scaler=None):
    """ Escala los datos y balancea usando Random Undersample (RUS) """
    # Agrupar las caracteristicas para poder usarlas en la MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D", "PSSM_central_0_A",
        "PSSM_r1_1_W", "PSSM_central_-1_V"
    ],
                                outputCol="features")

    out = assembler.transform(df).select("features", "class")

    # Random undersampling (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False,
                                   fraction=fraction,
                                   seed=89)
        out = negative.union(positive)

    # Scale the features:
    if scaler is None:
        scaler = StandardScaler(withMean=True,
                                withStd=True,
                                inputCol="features",
                                outputCol="scaled_features")
        scaler = scaler.fit(out)
        out = scaler.transform(out)
    else:
        out = scaler.transform(out)

    return out, scaler
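Because preprocess returns the fitted scaler, it can be reused so that the test split is scaled with statistics learned from the training split only. A hypothetical usage sketch (train_df and test_df are assumed to exist):

train_out, fitted_scaler = preprocess(train_df, should_undersample=True)
test_out, _ = preprocess(test_df, should_undersample=False, scaler=fitted_scaler)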
Example #6
def spark_data_flow():
    input_df = spark.read.parquet(
        ("{path}/"
         "p2p_feature_merge/"
         "{version}").format(path=IN_PAHT, 
                             version=RELATION_VERSION))

    tid_vector_df = input_df.rdd.map(
        get_vectors
    ).toDF(
    ).withColumnRenamed(
        '_1', 'features'
    ).withColumnRenamed(
        '_2', 'bbd_qyxx_id'
    ).withColumnRenamed(
        '_3', 'company_name'
    ).withColumnRenamed(
        '_4', 'platform_name'
    ).withColumnRenamed(
        '_5', 'platform_state'
    )
    
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)
    
    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(tid_vector_df)
    
    # Standardize each feature to zero mean and unit standard deviation.
    scaled_df = scalerModel.transform(tid_vector_df)
    
    return scaled_df
Example #7
 def build_standard_scaler(self,
                           df,
                           with_mean=False,
                           with_std=True,
                           persist_estimator_path=None,
                           input_col='features',
                           output_col='scaled_features'):
     """
     Standard Scaler estimator builder and transformer for dense feature vectors.
     Warnings: It will build a dense output, so take care when applying to sparse input.
     :param df: Spark DataFrame object auto-reference from DataFrame class
     :param with_mean: False by default. Centers the data with mean before scaling
     :param with_std: True by default. Scales the data to unit standard deviation
     :param persist_estimator_path: Persist model estimator metadata path
     :param input_col: Name for input column to scale
     :param output_col: Name of output column to create with scaled features
     :return: Standard Scaler model
     """
     std_scaler = StandardScaler(withMean=with_mean,
                                 withStd=with_std,
                                 inputCol=input_col,
                                 outputCol=output_col)
     if persist_estimator_path:
         self.__logger.info("Compute Feature Standard ScalerModel Metadata")
         self.__logger.warning(
             f"Persist Metadata Model Path: {persist_estimator_path}")
         std_scaler.fit(df).write().overwrite().save(persist_estimator_path)
         self.__logger.info("Loading Scaler Estimator For Prediction")
         return StandardScalerModel.load(persist_estimator_path).transform(df)
     self.__logger.info("Compute Feature Standard Scaler DataFrame")
     return std_scaler.fit(df).transform(df)
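A hypothetical call of the builder above, assuming fe is an instance of the class that defines build_standard_scaler and df already carries a 'features' vector column:

scaled_df = fe.build_standard_scaler(df, with_mean=True, with_std=True)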
Example #8
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
Example #9
    def test_standard_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = StandardScaler(inputCol='features',
                                outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStandardScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #10
    def create_standard_pipeline(self, cross_validate=False):
        """
        This method creates a standard pipeline, standard meaning: vectorize, standardize and model...
        :return: Pipeline for pyspark, ParameterGrid for Pyspark pipeline
        """

        # Feature columns are created from instance variables
        # feature_columns = [i.name for i in self._feature_cols]

        # Vectorized transformation
        vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                     outputCol='v_features')
        # Cast the vector from mllib to ml
        converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                        outputCol='casted')
        # Standardize estimator
        standardizes = StandardScaler(withMean=self._standardize,
                                      withStd=self._standardize,
                                      inputCol=converter.getOutputCol(),
                                      outputCol="scaled")
        # Labels and strings are already set in the model
        dict_parameters = dict(
            filter(lambda x: not isinstance(x[1], tuple),
                   self._params.items()))
        dict_parameters['featuresCol'] = standardizes.getOutputCol()
        dict_parameters['labelCol'] = self._label_col[0]  # HACK!!!
        #print(label_dict)

        # Model is set
        model = eval("classification." + self._algorithm)(**dict_parameters)

        pipe = Pipeline(stages=[vectorizer, converter, standardizes, model])
        return pipe
Example #11
    def train(cls, spark, sdf, cat_colnames, num_colnames):
        string_indexer_list = list()
        for cat_colname in cat_colnames:
            string_indexer = StringIndexer(inputCol=cat_colname,
                                           outputCol=cat_colname + "_index",
                                           handleInvalid="skip")
            string_indexer_list.append(string_indexer)

        out = []
        pipe = []
        if len(num_colnames) > 0:

            assembler = VectorAssembler(inputCols=num_colnames,
                                        outputCol="features_vec")
            standard_scaler = StandardScaler(inputCol="features_vec",
                                             outputCol="features_zs",
                                             withMean=True,
                                             withStd=True)
            out = [standard_scaler.getOutputCol()]
            pipe = [assembler, standard_scaler]
        assembler_2 = VectorAssembler(
            inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
            outputCol="features")
        estimator = KMeans(featuresCol="features",
                           predictionCol="cluster_id",
                           k=4)

        clustering_pipeline = Pipeline(stages=string_indexer_list + pipe +
                                       [assembler_2] + [estimator])
        clustering_pipeline_model = clustering_pipeline.fit(sdf)

        return KMeansPipeline(pipeline_model=clustering_pipeline_model)
Example #12
def vectorize_data(training_data, test_data):
    # Assemble the vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Standardize the data using StandardScaler
    scalar = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True).fit(train_df)
    train_df = scalar.transform(train_df)

    # Select the rows needed
    train_df = train_df.select(['scaledFeatures', TARGET])

    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scalar.transform(test_df)

        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df

    return train_df, new_test_data
Example #13
def normalize_score(df):
    
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.feature import StandardScaler


    assembler = VectorAssembler(
        inputCols=["score"],
        outputCol="score_v")

    output = assembler.transform(df)
    
    # Centre each score about its mean (withStd=False, withMean=True).
    
    scaler = StandardScaler(inputCol="score_v", outputCol="popularity_score",
                        withStd=False, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(output)

    # Subtract the mean from each feature.
    scaledData = scalerModel.transform(output)
    
    return scaledData
Example #14
def run_standard_scaler(t_data):
    standardscaler = StandardScaler(inputCol="features",
                                    outputCol="scaled_features",
                                    withStd=True,
                                    withMean=False)
    t_data = standardscaler.fit(t_data).transform(t_data)

    return t_data
Example #15
def standarize_features(rfm_table):
    assembler = VectorAssembler(inputCols=['r_quartile','f_quartile','m_quartile'],\
                            outputCol='features',handleInvalid = 'skip')
    rfm_table = assembler.transform(rfm_table)

    standardizer = StandardScaler(withMean=True, withStd=True).setInputCol("features").setOutputCol("scaled_features")
    std_model = standardizer.fit(rfm_table)
    features = std_model.transform(rfm_table)
    return features
Example #16
def StandardScalerModel(input_col, output_col, withStd, withMean, input_data):

    staticmethod = StandardScaler(inputCol=input_col,
                                  outputCol=output_col,
                                  withStd=withStd,
                                  withMean=withMean)
    model = staticmethod.fit(input_data)
    result = model.transform(input_data)
    return result
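A hypothetical call of the wrapper above, assuming assembled_df already contains a 'features' vector column (for example produced by a VectorAssembler):

scaled_df = StandardScalerModel(input_col='features',
                                output_col='scaled_features',
                                withStd=True,
                                withMean=False,
                                input_data=assembled_df)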
Example #17
def StandardScalerModel(input_col, output_col, withStd, withMean, input_data):
    # mindf: the minimum number of documents a term must appear in
    # vocabSize: the vocabulary size
    staticmethod = StandardScaler(inputCol=input_col,
                                  outputCol=output_col,
                                  withStd=withStd,
                                  withMean=withMean)
    model = staticmethod.fit(input_data)
    result = model.transform(input_data)
    return result
Example #18
    def standardScaler(self):
        from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler

        dataFrame = self.session.read.format("libsvm").load(
            self.dataDir + "/data/mllib/sample_libsvm_data.txt")
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        scalerModel = scaler.fit(dataFrame)
        scaledData = scalerModel.transform(dataFrame)
        scaledData.show()

        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [min, max].
        scaledData = scalerModel.transform(dataFrame)
        print("Features scaled to range: [%f, %f]" %
              (scaler.getMin(), scaler.getMax()))
        scaledData.select("features", "scaledFeatures").show()

        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [-1, 1].
        scaledData = scalerModel.transform(dataFrame)

        scaledData.select("features", "scaledFeatures").show()
Example #19
def scale_features(df):
    scalar = StandardScaler(inputCol='features',
                            outputCol='scaled_features',
                            withStd=True,
                            withMean=False)
    model = scalar.fit(df)
    sc_df = model.transform(df)
    sc_df = sc_df.drop('features')
    sc_df = sc_df.select(
        *(col(c) for c in list(set(sc_df.columns) - {'scaled_features'})),
        col('scaled_features').alias('features'))
    return sc_df
Example #20
File: scale.py  Project: shtsai/PyPandas
def standard_scale(dataFrame,
                   inputColNames,
                   usr_withStd=True,
                   usr_withMean=False):

    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler=StandardScaler(inputCol="features", \
                          outputCol="scaled features", \
                          withStd=usr_withStd, \
                          withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    return scaledDF
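standard_scale depends on a getAssembledDataFrame helper defined elsewhere in scale.py. A hypothetical stand-in, assuming it simply assembles the requested columns into a 'features' vector:

def getAssembledDataFrame(dataFrame, inputColNames):
    # Hypothetical stand-in: assemble the given columns into a 'features' vector column.
    assembler = VectorAssembler(inputCols=inputColNames, outputCol="features")
    return assembler.transform(dataFrame)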
Example #21
def nlpTransform(data):
    tokenizer = Tokenizer(inputCol="combi_text", outputCol="words")
    wordsData = tokenizer.transform(data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    scaler = StandardScaler(inputCol="rawFeatures",
                            outputCol="features",
                            withStd=True,
                            withMean=False)
    featureData = scaler.fit(featurizedData)
    featureD = featureData.transform(featurizedData)
    return featureD
Example #22
def Model(Data, Tgt='Target', Indp='Nada'):

    vector_assembler = VectorAssembler(
        inputCols=Indp, outputCol='assembled_important_features')
    standard_scaler = StandardScaler(inputCol=vector_assembler.getOutputCol(),
                                     outputCol='standardized_features')
    rf = RandomForestClassifier(featuresCol=standard_scaler.getOutputCol(),
                                labelCol=Tgt)
    #	letters_train, letters_test = letters.randomSplit([0.8,0.2], seed=4)
    pipeline = Pipeline(stages=[vector_assembler, standard_scaler, rf])
    pipeline_model_rf = pipeline.fit(Data)
    return pipeline_model_rf
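A hypothetical call of Model, assuming training_df has numeric feature columns feature_a and feature_b plus a numeric Target label column (all names here are illustrative):

pipeline_model = Model(Data=training_df, Tgt='Target', Indp=['feature_a', 'feature_b'])
predictions = pipeline_model.transform(training_df)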
Example #23
 def kmeans(self):
     # Keep messages with a non-zero temperature and a delay above 10000
     rdd = self.stream.filter(lambda message: float(message.temperature)) \
         .filter(lambda message: float(message.delay) > 10000) \
         .transform(lambda rdd: rdd.sortByKey())
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame(rdd)
     df.createOrReplaceTempView('kmeans')
     assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
     final_df = assembler.transform(df)
     scaler = StandardScaler(inputCol='features',
                             outputCol='scaled_features')
     scaler_model = scaler.fit(final_df)
     return scaler_model.transform(final_df)
Example #24
 def standard_scaler(self, df, column):
     """
     Column-wise feature standardization with StandardScaler
     """
     print('StandScalerExample')
     # Centre each feature column about its mean (withStd=False here, so no division by the standard deviation)
     scaler = StandardScaler(inputCol=column,
                             outputCol=column + '_standscale',
                             withStd=False,
                             withMean=True)
     scalerModel = scaler.fit(df)
     scaledData = scalerModel.transform(df)
     return scaledData
Example #25
 def standardize_url_vectors(urls_and_vectors: DataFrame) -> DataFrame:
     """
     Standardizes URLs and vectors DataFrame.
     :param urls_and_vectors: A DataFrame of URLs and vectors with columns: id, url, split_url, coefficients, vector.
     :return: A DataFrame of URLS and standardized vectors with columns: id, url, split_url, coefficients, vector.
     """
     standard_scaler = StandardScaler(inputCol="vector",
                                      outputCol="scaled_vector")
     standard_scaler_model = standard_scaler.fit(urls_and_vectors)
     return standard_scaler_model \
         .transform(urls_and_vectors) \
         .select("id", "url", "split_url", "coefficients", "scaled_vector") \
         .withColumnRenamed("scaled_vector", "vector")
Example #26
def standard_scaler(input_df):
    df = input_df

    scaler = StandardScaler(
        inputCol='features', outputCol='features_Scaled',
        withMean=True, withStd=True)

    stds = scaler.fit(df)

    # Normalize each feature
    df = stds.transform(df).drop('features')
    df = df.withColumnRenamed('features_Scaled', 'features')
    return df
Example #27
    def get_scaled_label_features(self):
        self.data = self.get_label_features()
        scaler = StandardScaler(inputCol='features', \
              outputCol="scaled", \
              withStd=True, \
              withMean=False)

        self.data = scaler.fit(self.data)\
              .transform(self.data)\
              .withColumnRenamed('features',"to_drop")\
              .withColumnRenamed('scaled','features')\
              .drop('to_drop')
        return self.data
Example #28
 def scaling(dataFrame, inputColName, usr_withStd, usr_withMean):
     outputColName = "scaled " + inputColName
     assembler = VectorAssembler(inputCols=[inputColName], \
                                 outputCol="features")
     assembledDF = assembler.transform(dataFrame)
     scaler=StandardScaler(inputCol="features", \
                           outputCol=outputColName, \
                           withStd=usr_withStd, \
                           withMean=usr_withMean).fit(assembledDF)
     scaledDF = scaler.transform(assembledDF).drop("features")
     castVectorToFloat = udf(lambda v : float(v[0]), FloatType())
     scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName)) 
     print ("Successfully scale the column '{0:s}' and create a new column '{1:s}'.".format(inputColName, outputColName))
     return scaledDF
Example #29
def hacker_test(spark, resources_folder):
    data = spark.read.csv(resources_folder + 'hack_data.csv',
                          header=True,
                          inferSchema=True)
    data.printSchema()
    data.show()
    print(data.columns)
    assembler = VectorAssembler(inputCols=[
        'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
        'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
    ],
                                outputCol='features')
    data_assembled = assembler.transform(data)
    data_assembled.show()

    scaler = StandardScaler(inputCol='features', outputCol='scaledfeatures')
    scaler_model = scaler.fit(data_assembled)
    data_assembled_scaled = scaler_model.transform(data_assembled)
    data_assembled_scaled.show()

    data_assembled = data_assembled_scaled.select('scaledfeatures').withColumn(
        'features', data_assembled_scaled['scaledfeatures'])
    data_assembled.show()

    print(
        "************************************* with three clusters *************************************"
    )
    kmeans3 = KMeans(featuresCol='features', k=3, seed=10)
    model3 = kmeans3.fit(data_assembled)
    wssse3 = model3.summary.trainingCost
    print(wssse3)
    print(model3.clusterCenters())
    model3.summary.predictions.show()

    predictions3 = model3.summary.predictions
    predictions3.groupBy('prediction').count().show()
    # predictions3.agg({'prediction': 'count'}).show()

    print(
        "************************************* with two clusters *************************************"
    )
    kmeans2 = KMeans(featuresCol='features', k=2, seed=10)
    model2 = kmeans2.fit(data_assembled)
    wssse2 = model2.summary.trainingCost
    print(wssse2)
    print(model2.clusterCenters())
    model2.summary.predictions.show()

    predictions2 = model2.summary.predictions
    predictions2.groupBy('prediction').count().show()
Example #30
def standardScalerModel(df, conf):
    """
        input: spark-dataFrame, conf [configuration params]
        return value: model
    """
    mean = conf.get("withMean", False)
    std = conf.get("withStd", True)
    input = conf.get("inputCol", None)
    output = conf.get("outputCol", None)
    scaler = StandardScaler(inputCol=input,
                            outputCol=output,
                            withMean=mean,
                            withStd=std)
    model = scaler.fit(df)
    return scaler, model
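A hypothetical configuration dictionary matching the keys read by standardScalerModel; assembled_df is assumed to already carry a 'features' vector column:

conf = {"withMean": True, "withStd": True,
        "inputCol": "features", "outputCol": "scaledFeatures"}
scaler, model = standardScalerModel(assembled_df, conf)
scaled_df = model.transform(assembled_df)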
Example #31
# Get the tf-idf features
data = tf_idf_features_quora(data)
# Get the text features
data = text_features(data)

# combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)


# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(data)
# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)


# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2).fit(data)

training_df, test_df = data.randomSplit([0.8, 0.2])
training_df.cache()
test_df.cache()
Example #32
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
Example #33
  def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df: A DataFrame with columns Lat, Lon, Pickup_Count and
        vector columns Features & ScaledFeatures. Contains only data before 2015.
    test_df: As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """

    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
      assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                  outputCol='FeaturesToScale')
      self.features_df = assembler.transform(self.features_df)
      scaler = StandardScaler(inputCol='FeaturesToScale',
                              outputCol=('ScaledFeatures'),
                              withStd=True, withMean=False)
      self.features_df = scaler.fit(self.features_df).transform(self.features_df)

      exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Adapt categorical features that do not have a value range of [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
      vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
      for i in range(len(self.ONE_HOT_COLUMNS)):
        column = self.ONE_HOT_COLUMNS[i]
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column].cast(DoubleType()))
            encoder = OneHotEncoder(inputCol=column,
                                    outputCol=vec_category_columns[i],
                                    dropLast=False)
            self.features_df = encoder.transform(self.features_df)
      exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                              if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {i:self.CATEGORY_VALUES_COUNT[feature_columns[i]]
                                          for i in range(len(feature_columns))
                                          if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()}

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time > split_date).cache()

    # Compute Districts with counts
    self.districts_with_counts = self.features_df \
                                 .groupBy([self.features_df.Lat, self.features_df.Lon]) \
                                 .count()
Example #34
from pyspark.ml.feature import Bucketizer
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
Example #35
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dataFrame)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    sc.stop()