def transform_data_in_pipeline(df):
    """Assemble and scale the feature columns of a Spark dataframe.

    :param df: input Spark dataframe containing the raw feature columns
    :return: dataframe with the additional transformed features vector
    """
    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble features vector from Spark dataframe fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Apply standard scaling to unit standard deviation
    # (mean centring is off by default for StandardScaler)
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the dataframe with the additional transformed features vector
    return pipeline_model.transform(df)
def prepare_data():
    """Commodity function to read the data from the files and prepare the
    features for the k-means model fit.
    """
    # Read data from files.
    _data = load_data()

    # The following features are not normally distributed, so they are
    # log-scaled to obtain a more normal distribution. This helps the
    # k-means algorithm to work better.
    _data = _data.withColumn('log_age', F.log('age')) \
        .withColumn('log_avg_buy', F.log('avg_buy')) \
        .withColumn('log_min_buy', F.log('min_buy')) \
        .withColumn('log_max_buy', F.log('max_buy'))

    # Select the features to use in k-means. The features are also standard
    # scaled, that is mean centred and scaled to unit standard deviation.
    features = _data.columns[4:]
    assembler = VectorAssembler(inputCols=features, outputCol='features_unscaled')
    assembled = assembler.transform(_data)

    scaler = StandardScaler(inputCol='features_unscaled', outputCol='features',
                            withStd=True, withMean=True)
    scaler_model = scaler.fit(assembled)
    scaled_data = scaler_model.transform(assembled)

    return scaled_data, features
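# A minimal usage sketch (not part of the original source): feed the scaled
# output of prepare_data() into a k-means fit. The 'features' column name
# follows the snippet above; k=4 and seed=1 are assumptions for illustration.
from pyspark.ml.clustering import KMeans

scaled_data, feature_names = prepare_data()
kmeans = KMeans(featuresCol='features', k=4, seed=1)
model = kmeans.fit(scaled_data)
model.transform(scaled_data).select('features', 'prediction').show(5)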
def normalizationBySpark(RDDSparkDF):
    # Standard scaling: unit standard deviation, no mean centring.
    # MinMaxScaler / MaxAbsScaler are kept as commented-out alternatives.
    scalerSD = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                              withStd=True, withMean=False)
    # scalerMaxMin = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    # scalerMaxAbs = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics by fitting the StandardScaler
    scalerSDModel = scalerSD.fit(RDDSparkDF)
    # scalerMaxMinModel = scalerMaxMin.fit(RDDSparkDF)
    # scalerMaxAbsModel = scalerMaxAbs.fit(RDDSparkDF)

    # Scale each feature to unit standard deviation
    scaledDataSD = scalerSDModel.transform(RDDSparkDF)
    # scaledDataMinMax = scalerMaxMinModel.transform(RDDSparkDF)
    # scaledDataMaxAbs = scalerMaxAbsModel.transform(RDDSparkDF)

    # Re-attach the scaled features to the non-feature columns via the KEY column
    scaledFeatures_outcome = scaledDataSD.rdd.map(extractRow).toDF(newColumns)
    leftDF = RDDSparkDF.select(col2)
    df = leftDF.join(scaledFeatures_outcome, ["KEY"])

    # return scaledDataSD, scaledDataMinMax, scaledDataMaxAbs, df
    return df
def generate_train_test(data, multiplier_minority, fraction_majority,
                        label_col='label', minority_tag=1, train_perc=0.7):
    '''
    Train/test split on the data (after the feature-assembling step).

    multiplier_minority: how many times to replicate the minority data
    fraction_majority: sample fraction for the majority group
    label_col: name of the label column
    minority_tag: label value that has very few representatives
    train_perc: fraction of the data that goes to the training set
    '''
    po = data.filter("{} == {}".format(label_col, minority_tag))
    ne = data.filter("{} != {}".format(label_col, minority_tag))

    training_po, testing_po = po.randomSplit([train_perc, 1 - train_perc], seed=100)
    training_ne, testing_ne = ne.randomSplit([train_perc, 1 - train_perc], seed=100)

    training = training_po.union(training_ne)
    training = resample(training, multiplier_minority=multiplier_minority,
                        fraction_majority=fraction_majority)
    testing = testing_po.union(testing_ne)

    # Fit the scaler on the training set only and apply it to both splits
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    scale_model = scaler.fit(training)
    training = scale_model.transform(training)
    testing = scale_model.transform(testing)

    return training, testing
def preprocess(df, should_undersample, scaler=None):
    """
    Scales the data and balances it using random undersampling (RUS).
    """
    # Assemble the feature columns so they can be used with MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D",
        "PSSM_central_0_A", "PSSM_r1_1_W", "PSSM_central_-1_V"
    ], outputCol="features")
    out = assembler.transform(df).select("features", "class")

    # Random undersampling (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False, fraction=fraction, seed=89)
        out = negative.union(positive)

    # Scale: fit a new scaler if none was provided, otherwise reuse the given one
    if scaler is None:
        scaler = StandardScaler(withMean=True, withStd=True,
                                inputCol="features", outputCol="scaled_features")
        scaler = scaler.fit(out)
    out = scaler.transform(out)

    return out, scaler
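# A minimal usage sketch (not part of the original source): fit the scaler on
# the (undersampled) training split and reuse the fitted model on the test
# split so the test set is scaled with the training statistics. The DataFrames
# train_df and test_df are assumptions for illustration.
train_scaled, fitted_scaler = preprocess(train_df, should_undersample=True)
test_scaled, _ = preprocess(test_df, should_undersample=False, scaler=fitted_scaler)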
def spark_data_flow():
    input_df = spark.read.parquet(
        ("{path}/"
         "p2p_feature_merge/"
         "{version}").format(path=IN_PAHT,
                             version=RELATION_VERSION))

    tid_vector_df = input_df.rdd.map(
        get_vectors
    ).toDF(
    ).withColumnRenamed(
        '_1', 'features'
    ).withColumnRenamed(
        '_2', 'bbd_qyxx_id'
    ).withColumnRenamed(
        '_3', 'company_name'
    ).withColumnRenamed(
        '_4', 'platform_name'
    ).withColumnRenamed(
        '_5', 'platform_state'
    )

    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(tid_vector_df)

    # Normalize each feature to have unit standard deviation.
    scaled_df = scalerModel.transform(tid_vector_df)

    return scaled_df
def build_standard_scaler(self, df, with_mean=False, with_std=True,
                          persist_estimator_path=None, input_col='features',
                          output_col='scaled_features'):
    """
    Standard Scaler estimator builder and transformer for dense feature vectors.

    Warnings:
        It will build a dense output, so take care when applying to sparse input.

    :param df: Spark DataFrame object auto-reference from DataFrame class
    :param with_mean: False by default. Centers the data with the mean before scaling
    :param with_std: True by default. Scales the data to unit standard deviation
    :param persist_estimator_path: Path to persist the fitted model metadata
    :param input_col: Name of the input column to scale
    :param output_col: Name of the output column to create with scaled features
    :return: DataFrame with the scaled feature column
    """
    std_scaler = StandardScaler(withMean=with_mean, withStd=with_std,
                                inputCol=input_col, outputCol=output_col)
    if persist_estimator_path:
        self.__logger.info("Compute Feature Standard ScalerModel Metadata")
        self.__logger.warning(
            f"Persist Metadata Model Path: {persist_estimator_path}")
        std_scaler.fit(df).write().overwrite().save(persist_estimator_path)
        self.__logger.info("Loading Scaler Estimator For Prediction")
        return StandardScalerModel.load(persist_estimator_path).transform(df)
    self.__logger.info("Compute Feature Standard Scaler DataFrame")
    return std_scaler.fit(df).transform(df)
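# A minimal usage sketch (not part of the original source): persist the fitted
# scaler and get back the scaled DataFrame. The owning object `builder`, the
# DataFrame `features_df`, and the persist path are assumptions for illustration.
scaled_df = builder.build_standard_scaler(
    features_df,
    with_mean=True,
    persist_estimator_path="/tmp/models/std_scaler")  # hypothetical path
scaled_df.select("scaled_features").show(5, truncate=False)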
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
def test_standard_scaler(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),),
    ], ["id", "features"])
    scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)

    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStandardScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def create_standard_pipeline(self, cross_validate=False):
    """
    This method creates a standard pipeline, standard meaning: vectorize, standardize and model.

    :return: Pipeline for pyspark, ParameterGrid for Pyspark pipeline
    """
    # Vectorize the feature columns
    vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                 outputCol='v_features')

    # Cast the vector from mllib to ml
    converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                    outputCol='casted')

    # Standardize estimator
    standardizer = StandardScaler(withMean=self._standardize,
                                  withStd=self._standardize,
                                  inputCol=converter.getOutputCol(),
                                  outputCol="scaled")

    # Labels and strings are already set into the model
    dict_parameters = dict(
        filter(lambda x: not isinstance(x[1], tuple), self._params.items()))
    dict_parameters['featuresCol'] = standardizer.getOutputCol()
    dict_parameters['labelCol'] = self._label_col[0]

    # Instantiate the model class from the algorithm name
    model = eval("classification." + self._algorithm)(**dict_parameters)

    pipe = Pipeline(stages=[vectorizer, converter, standardizer, model])
    return pipe
def train(cls, spark, sdf, cat_colnames, num_colnames):
    # Index each categorical column
    string_indexer_list = list()
    for cat_colname in cat_colnames:
        string_indexer = StringIndexer(inputCol=cat_colname,
                                       outputCol=cat_colname + "_index",
                                       handleInvalid="skip")
        string_indexer_list.append(string_indexer)

    # Assemble and standardize the numeric columns, if any
    out = []
    pipe = []
    if len(num_colnames) > 0:
        assembler = VectorAssembler(inputCols=num_colnames,
                                    outputCol="features_vec")
        standard_scaler = StandardScaler(inputCol="features_vec",
                                         outputCol="features_zs",
                                         withMean=True,
                                         withStd=True)
        out = [standard_scaler.getOutputCol()]
        pipe = [assembler, standard_scaler]

    # Combine the indexed categorical columns with the scaled numeric features
    assembler_2 = VectorAssembler(
        inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
        outputCol="features")
    estimator = KMeans(featuresCol="features", predictionCol="cluster_id", k=4)

    clustering_pipeline = Pipeline(stages=string_indexer_list + pipe +
                                   [assembler_2] + [estimator])
    clustering_pipeline_model = clustering_pipeline.fit(sdf)
    return KMeansPipeline(pipeline_model=clustering_pipeline_model)
def vectorize_data(training_data, test_data):
    # Assemble the feature vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Standardize the data with a scaler fitted on the training set
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures',
                            withStd=True, withMean=True).fit(train_df)
    train_df = scaler.transform(train_df)

    # Select the columns needed
    train_df = train_df.select(['scaledFeatures', TARGET])

    # Apply the same assembler and scaler to each company's test data
    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scaler.transform(test_df)
        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df
    return train_df, new_test_data
def normalize_score(df):
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.feature import StandardScaler

    assembler = VectorAssembler(inputCols=["score"], outputCol="score_v")
    output = assembler.transform(df)

    # Centre each score about the mean (withStd=False, withMean=True), so the
    # output is mean-centred rather than scaled to unit standard deviation.
    scaler = StandardScaler(inputCol="score_v", outputCol="popularity_score",
                            withStd=False, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(output)

    # Transform the assembled score column
    scaledData = scalerModel.transform(output)
    return scaledData
def run_standard_scaler(t_data):
    standardscaler = StandardScaler(inputCol="features",
                                    outputCol="scaled_features",
                                    withStd=True,
                                    withMean=False)
    t_data = standardscaler.fit(t_data).transform(t_data)
    return t_data
def standarize_features(rfm_table):
    assembler = VectorAssembler(inputCols=['r_quartile', 'f_quartile', 'm_quartile'],
                                outputCol='features', handleInvalid='skip')
    rfm_table = assembler.transform(rfm_table)
    standardizer = StandardScaler(withMean=True, withStd=True) \
        .setInputCol("features").setOutputCol("scaled_features")
    std_model = standardizer.fit(rfm_table)
    features = std_model.transform(rfm_table)
    return features
def StandardScalerModel(input_col, output_col, withStd, withMean, input_data):
    # Use a descriptive name rather than shadowing the built-in `staticmethod`
    scaler = StandardScaler(inputCol=input_col,
                            outputCol=output_col,
                            withStd=withStd,
                            withMean=withMean)
    model = scaler.fit(input_data)
    result = model.transform(input_data)
    return result
def StandardScalerModel(input_col, output_col, withStd, withMean, input_data):
    # Use a descriptive name rather than shadowing the built-in `staticmethod`
    scaler = StandardScaler(inputCol=input_col,
                            outputCol=output_col,
                            withStd=withStd,
                            withMean=withMean)
    model = scaler.fit(input_data)
    result = model.transform(input_data)
    return result
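# A minimal usage sketch (not part of the original source) for the wrapper
# above; a DataFrame `assembled_df` with a 'features' vector column is an
# assumption for illustration.
scaled_df = StandardScalerModel(input_col='features',
                                output_col='scaled_features',
                                withStd=True,
                                withMean=False,
                                input_data=assembled_df)
scaled_df.select('scaled_features').show(5, truncate=False)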
def standardScaler(self):
    from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler

    dataFrame = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    # Scale each feature to unit standard deviation
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate the MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)
    # Rescale each feature to the range [min, max]
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate the MaxAbsScalerModel
    scalerModel = scaler.fit(dataFrame)
    # Rescale each feature to the range [-1, 1]
    scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
def scale_features(df):
    scaler = StandardScaler(inputCol='features',
                            outputCol='scaled_features',
                            withStd=True,
                            withMean=False)
    model = scaler.fit(df)
    sc_df = model.transform(df)

    # Drop the unscaled column and expose the scaled one as 'features'
    sc_df = sc_df.drop('features')
    sc_df = sc_df.select(
        *(col(c) for c in list(set(sc_df.columns) - {'scaled_features'})),
        col('scaled_features').alias('features'))
    return sc_df
def standard_scale(dataFrame, inputColNames, usr_withStd=True, usr_withMean=False):
    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled features",
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    return scaledDF
def nlpTransform(data):
    tokenizer = Tokenizer(inputCol="combi_text", outputCol="words")
    wordsData = tokenizer.transform(data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    scaler = StandardScaler(inputCol="rawFeatures", outputCol="features",
                            withStd=True, withMean=False)
    scalerModel = scaler.fit(featurizedData)
    featureD = scalerModel.transform(featurizedData)
    return featureD
def Model(Data, Tgt='Target', Indp='Nada'):
    vector_assembler = VectorAssembler(
        inputCols=Indp, outputCol='assembled_important_features')
    standard_scaler = StandardScaler(inputCol=vector_assembler.getOutputCol(),
                                     outputCol='standardized_features')
    rf = RandomForestClassifier(featuresCol=standard_scaler.getOutputCol(),
                                labelCol=Tgt)
    # letters_train, letters_test = letters.randomSplit([0.8, 0.2], seed=4)
    pipeline = Pipeline(stages=[vector_assembler, standard_scaler, rf])
    pipeline_model_rf = pipeline.fit(Data)
    return pipeline_model_rf
def kmeans(self):
    # Keep messages with a temperature value and a delay above 10 seconds,
    # sorted by key
    rdd = self.stream.filter(lambda message: float(message.temperature)) \
        .filter(lambda message: float(message.delay) > 10000) \
        .transform(lambda rdd: rdd.sortByKey())

    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame(rdd)
    df.createOrReplaceTempView('kmeans')

    assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
    final_df = assembler.transform(df)

    scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
    scaler_model = scaler.fit(final_df)
    return scaler_model.transform(final_df)
def standard_scaler(self, df, column):
    """
    Column-wise feature standardization with StandardScaler.
    """
    print('StandScalerExample')
    # Centre each feature column on its mean (no scaling to unit std)
    scaler = StandardScaler(inputCol=column,
                            outputCol=column + '_standscale',
                            withStd=False,
                            withMean=True)
    scalerModel = scaler.fit(df)
    scaledData = scalerModel.transform(df)
    return scaledData
def standardize_url_vectors(urls_and_vectors: DataFrame) -> DataFrame:
    """
    Standardizes the URLs and vectors DataFrame.

    :param urls_and_vectors: A DataFrame of URLs and vectors with columns:
        id, url, split_url, coefficients, vector.
    :return: A DataFrame of URLs and standardized vectors with columns:
        id, url, split_url, coefficients, vector.
    """
    standard_scaler = StandardScaler(inputCol="vector", outputCol="scaled_vector")
    standard_scaler_model = standard_scaler.fit(urls_and_vectors)
    return standard_scaler_model \
        .transform(urls_and_vectors) \
        .select("id", "url", "split_url", "coefficients", "scaled_vector") \
        .withColumnRenamed("scaled_vector", "vector")
def standard_scaler(input_df):
    df = input_df
    scaler = StandardScaler(
        inputCol='features',
        outputCol='features_Scaled',
        withMean=True,
        withStd=True)
    stds = scaler.fit(df)

    # Scale each feature, then expose the scaled column as 'features'
    df = stds.transform(df).drop('features')
    df = df.withColumnRenamed('features_Scaled', 'features')
    return df
def get_scaled_label_features(self):
    self.data = self.get_label_features()
    scaler = StandardScaler(inputCol='features',
                            outputCol='scaled',
                            withStd=True,
                            withMean=False)
    self.data = scaler.fit(self.data) \
        .transform(self.data) \
        .withColumnRenamed('features', 'to_drop') \
        .withColumnRenamed('scaled', 'features') \
        .drop('to_drop')
    return self.data
def scaling(dataFrame, inputColName, usr_withStd, usr_withMean):
    outputColName = "scaled " + inputColName
    assembler = VectorAssembler(inputCols=[inputColName], outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = StandardScaler(inputCol="features",
                            outputCol=outputColName,
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")

    # Unpack the one-element vector back into a float column
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' and created a new column '{1:s}'.".format(
        inputColName, outputColName))
    return scaledDF
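# A minimal usage sketch (not part of the original source): scale a single
# numeric column to zero mean and unit standard deviation. The DataFrame
# `sales_df` and the column name 'revenue' are assumptions for illustration.
scaled_sales = scaling(sales_df, 'revenue', usr_withStd=True, usr_withMean=True)
scaled_sales.select('revenue', 'scaled revenue').show(5)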
def hacker_test(spark, resources_folder):
    data = spark.read.csv(resources_folder + 'hack_data.csv', header=True, inferSchema=True)
    data.printSchema()
    data.show()
    print(data.columns)

    assembler = VectorAssembler(inputCols=[
        'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
        'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
    ], outputCol='features')
    data_assembled = assembler.transform(data)
    data_assembled.show()

    scaler = StandardScaler(inputCol='features', outputCol='scaledfeatures')
    scaler_model = scaler.fit(data_assembled)
    data_assembled_scaled = scaler_model.transform(data_assembled)
    data_assembled_scaled.show()

    data_assembled = data_assembled_scaled.select('scaledfeatures').withColumn(
        'features', data_assembled_scaled['scaledfeatures'])
    data_assembled.show()

    print("************************************* with three clusters *************************************")
    kmeans3 = KMeans(featuresCol='features', k=3, seed=10)
    model3 = kmeans3.fit(data_assembled)
    wssse3 = model3.summary.trainingCost
    print(wssse3)
    print(model3.clusterCenters())
    model3.summary.predictions.show()
    predictions3 = model3.summary.predictions
    predictions3.groupBy('prediction').count().show()
    # predictions3.agg({'prediction': 'count'}).show()

    print("************************************* with two clusters *************************************")
    kmeans2 = KMeans(featuresCol='features', k=2, seed=10)
    model2 = kmeans2.fit(data_assembled)
    wssse2 = model2.summary.trainingCost
    print(wssse2)
    print(model2.clusterCenters())
    model2.summary.predictions.show()
    predictions2 = model2.summary.predictions
    predictions2.groupBy('prediction').count().show()
def standardScalerModel(df, conf):
    """
    input: Spark DataFrame, conf [configuration params]
    return value: scaler (estimator), model (fitted StandardScalerModel)
    """
    mean = conf.get("withMean", False)
    std = conf.get("withStd", True)
    input_col = conf.get("inputCol", None)
    output_col = conf.get("outputCol", None)
    scaler = StandardScaler(inputCol=input_col, outputCol=output_col,
                            withMean=mean, withStd=std)
    model = scaler.fit(df)
    return scaler, model
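# A minimal usage sketch (not part of the original source): build the scaler
# from a configuration dict and apply the fitted model. A DataFrame
# `assembled_df` with a 'features' column is an assumption for illustration.
conf = {"withMean": True, "withStd": True,
        "inputCol": "features", "outputCol": "scaledFeatures"}
scaler, model = standardScalerModel(assembled_df, conf)
scaled_df = model.transform(assembled_df)
scaled_df.select("scaledFeatures").show(5, truncate=False)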
# Get the tf-idf features
data = tf_idf_features_quora(data)

# Get the text features
data = text_features(data)

# Combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)

# Normalize each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)

# Index labels, adding metadata to the label column.
# Fit on the whole dataset to include all labels in the index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                maxCategories=2).fit(data)

training_df, test_df = data.randomSplit([0.8, 0.2])
training_df.cache()
test_df.cache()
def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df:    A DataFrame with columns Lat, Lon, Pickup_Count and vector columns
                 Features & ScaledFeatures. Contains only data before 2015.
    test_df:     As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """
    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
        assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                    outputCol='FeaturesToScale')
        self.features_df = assembler.transform(self.features_df)
        scaler = StandardScaler(inputCol='FeaturesToScale',
                                outputCol='ScaledFeatures',
                                withStd=True, withMean=False)
        self.features_df = scaler.fit(self.features_df).transform(self.features_df)

        exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Adopt categorical features that do not have a value range of [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(
                column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
        vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
        for i in range(len(self.ONE_HOT_COLUMNS)):
            column = self.ONE_HOT_COLUMNS[i]
            if column in self.features_df.columns:
                self.features_df = self.features_df.withColumn(
                    column, self.features_df[column].cast(DoubleType()))
                encoder = OneHotEncoder(inputCol=column,
                                        outputCol=vec_category_columns[i],
                                        dropLast=False)
                self.features_df = encoder.transform(self.features_df)
        exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                       if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {
            i: self.CATEGORY_VALUES_COUNT[feature_columns[i]]
            for i in range(len(feature_columns))
            if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()}

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time > split_date).cache()

    # Compute districts with counts
    self.districts_with_counts = self.features_df \
        .groupBy([self.features_df.Lat, self.features_df.Lon]) \
        .count()
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dataFrame)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    sc.stop()