# COMMAND ----------

# MAGIC %md #4. Logistic regression - all features

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

partialPipeline = Pipeline().setStages(stages)
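# A hedged sketch (not part of the original notebook) of one way to complete this section:
# append a LogisticRegression stage to the prepared stages and fit on the income DataFrame.
# The DataFrame name `dataset` is a placeholder assumption.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
fullPipeline = Pipeline().setStages(stages + [lr])
pipelineModel = fullPipeline.fit(dataset)        # `dataset`: placeholder DataFrame
predictions = pipelineModel.transform(dataset)
predictions.select("label", "prediction", "probability").show(5)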
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer, MinMaxScaler, OneHotEncoderEstimator
from pyspark.ml.linalg import Vectors

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")
vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
minMaxScaler = MinMaxScaler(inputCol="features", outputCol="features_minmax")  # renamed to avoid shadowing the MinMaxScaler class

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, minMaxScaler])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
# OneHotEncoderEstimator takes lists of input/output columns, unlike the old OneHotEncoder
encoder = OneHotEncoderEstimator(inputCols=["classIndex"], outputCols=["categoryVec"])
vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()
label_indexer = StringIndexer(inputCol=target_variable, outputCol='label').fit(df)
df = label_indexer.transform(df)

string_indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in cats
]
pipeline0 = Pipeline(stages=string_indexer)
df_transformed = pipeline0.fit(df).transform(df)
df_transformed.cache()

# One-hot encoding for logistic regression
stages = []  # stages for the second pipeline
encoder = OneHotEncoderEstimator(
    inputCols=[string_indexer[i].getOutputCol() for i in range(0, len(string_indexer))],
    outputCols=[column + "_cat" for column in cats])
stages += [encoder]

assemblerInputs = [c + "_cat" for c in cats] + nums
assemblerInputs.remove(target_variable)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline1 = Pipeline(stages=stages)
df_transformed_logistic = pipeline1.fit(df_transformed).transform(df_transformed)

standardscaler = StandardScaler().setInputCol("features").setOutputCol("scaled_features")
df_transformed_logistic = standardscaler.fit(df_transformed_logistic).transform(df_transformed_logistic)

train, test = df_transformed_logistic.randomSplit([0.70, 0.30], seed=42)
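# A hedged follow-up (not in the original fragment): fit a logistic regression on the
# scaled features and check AUC on the held-out split. Column names follow the code above.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol="scaled_features", labelCol="label", maxIter=10)
lr_model = lr.fit(train)
predictions = lr_model.transform(test)
auc = BinaryClassificationEvaluator(labelCol="label").evaluate(predictions)
print("Test AUC: {:.3f}".format(auc))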
# Create the new columns based on the group.
encoder = OneHotEncoder(dropLast=False, inputCol="workclass_encoded", outputCol="workclass_vec")
encoded = encoder.transform(indexed)
encoded.show(2)

# In[24]:

# Encode the categorical data
categorical_variables = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native_country']
indexers = [StringIndexer(inputCol=column, outputCol=column + "-index") for column in categorical_variables]
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)
assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

# In[25]:

# Create a Pipeline.
pipeline = Pipeline(stages=indexers + [encoder, assembler])
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)
store = Store.create(args.work_dir)

# Download MNIST dataset
data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
if not os.path.exists(libsvm_path):
    subprocess.check_output(['wget', data_url, '-O', libsvm_path])

# Load dataset into a Spark DataFrame
df = spark.read.format('libsvm') \
    .option('numFeatures', '784') \
    .load(libsvm_path)

# One-hot encode labels into SparseVectors
encoder = OneHotEncoderEstimator(inputCols=['label'],
                                 outputCols=['label_vec'],
                                 dropLast=False)
model = encoder.fit(df)
train_df = model.transform(df)

# Train/test split
train_df, test_df = train_df.randomSplit([0.9, 0.1])

# Define the PyTorch model without any Horovod-specific parameters
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
    sum(price) as s2, max(price) as ma2, min(price) as mi2,
    u.gender, u.age, u.user_id, u.segment
    from sales_known s join users_known u
    where s.user_id = u.user_id
    group by u.user_id, u.gender, u.age, u.segment""")

# Detailed model description: https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
# and https://spark.apache.org/docs/latest/ml-features.html
# In short: all columns to be analyzed are assembled into a single 'features' vector column
categoricalColumns = ['gender', 'age']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index').setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"]).setHandleInvalid("keep")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='segment', outputCol='label').setHandleInvalid("keep")
stages += [label_stringIdx]

numericCols = ['c', 's1', 'ma1', 'mi1', 's2', 'ma2', 'mi2']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features").setHandleInvalid("keep")
stages += [assembler]

lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
stages += [lr]
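# A hedged continuation (not in the original snippet): wrap the stages in a Pipeline and fit
# it on the aggregated DataFrame. The DataFrame name `df` is a placeholder assumption.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)                # `df`: placeholder for the joined/aggregated data
predictions = pipeline_model.transform(df)
predictions.select('label', 'prediction', 'probability').show(5)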
# Create categorical variables for Region and StreetID
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer
from pyspark.ml import Pipeline

# Region
regionIndxr = StringIndexer().setInputCol("Region").setOutputCol("RegionInd")
# StreetID
streetIndxr = StringIndexer().setInputCol("StreetID").setOutputCol("StreetIDInd")
# One-hot encoding
ohee_catv = OneHotEncoderEstimator(inputCols=["RegionInd", "StreetIDInd"], outputCols=["Region_dum", "StreetID_dum"])

pipe_catv = Pipeline(stages=[regionIndxr, streetIndxr, ohee_catv])

basetable_train = pipe_catv.fit(data1).transform(data1)
basetable_train = basetable_train.drop("RegionInd", "StreetIDInd", "FirstSubDate", "LastComplaint", "FirstComplaint", "ChurnedAt03/02/2018", "ChurnedAt03/02/2019")
basetable_train = basetable_train.withColumnRenamed("ChurnIn6Month", "label")

# COMMAND ----------

basetable_test = pipe_catv.fit(BaseValidation).transform(BaseValidation)
basetable_test = basetable_test.drop("RegionInd", "StreetIDInd", "FirstSubDate", "LastComplaint", "FirstComplaint", "ChurnedAt03/02/2018", "ChurnedAt03/02/2019")
basetable_test = basetable_test.withColumnRenamed("ChurnIn6Month", "label")
print("Sample model input") print(flites.toPandas().sample(12)) # Create an indexer for the org categorical feature #flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights).transform(flights) indexer = StringIndexer(inputCol="org", outputCol='org_idx') # Assign index values to strings indexer = indexer.fit(flites) # Create column with index values flites = indexer.transform(flites) # Check first five records #flights_indexed.show(5) onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'], outputCols=['org_dummy', 'dow_dummy']) flites = onehot.fit(flites).transform(flites) # Create 'features' vector: 'weight_kg', 'cyl', 'type_dummy' assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features') # Consolidate predictor columns flites = assembler.transform(flites) # Check the resulting column #flites.distinct().show(8, truncate=False) # Split the data into training and testing sets flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23) #print(flights_train.toPandas().shape, flights_test.toPandas().shape)
                   predictionCol='svm_pred_2', rawPredictionCol='svm_raw_2')

# build pipeline to generate predictions from base classifiers, will be used in task 1.3
gen_base_pred_pipeline = Pipeline(stages=[nb_0, nb_1, nb_2, svm_0, svm_1, svm_2])
gen_base_pred_pipeline_model = gen_base_pred_pipeline.fit(training_set)

# task 1.2
meta_features = gen_meta_features(training_set, nb_0, nb_1, nb_2, svm_0, svm_1, svm_2)

# build onehotencoder and vectorassembler pipeline
onehot_encoder = OneHotEncoderEstimator(
    inputCols=['nb_pred_0', 'nb_pred_1', 'nb_pred_2',
               'svm_pred_0', 'svm_pred_1', 'svm_pred_2',
               'joint_pred_0', 'joint_pred_1', 'joint_pred_2'],
    outputCols=['vec{}'.format(i) for i in range(9)])
vector_assembler = VectorAssembler(
    inputCols=['vec{}'.format(i) for i in range(9)],
    outputCol='meta_features')
gen_meta_feature_pipeline = Pipeline(stages=[onehot_encoder, vector_assembler])
gen_meta_feature_pipeline_model = gen_meta_feature_pipeline.fit(meta_features)
meta_features = gen_meta_feature_pipeline_model.transform(meta_features)

# train the meta classifier
lr_model = LogisticRegression(featuresCol='meta_features', labelCol='label',
                              predictionCol='final_prediction', maxIter=20,
                              regParam=1., elasticNetParam=0)
                    'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
                    'Cat1_id', 'Cat2_id', 'Cat3_id', 'Cat4_id', 'Cat5_id', 'Cat6_id', 'Cat7_id', 'Cat8_id',
                    'Cat9_id', 'Cat10_id', 'Cat11_id', 'Cat12_id',
                    'Calendar_Year', 'Model_Year', 'Claim_Amount')

category_id = ['Cat1_id', 'Cat2_id', 'Cat3_id', 'Cat4_id', 'Cat5_id', 'Cat6_id',
               'Cat7_id', 'Cat8_id', 'Cat9_id', 'Cat10_id', 'Cat11_id', 'Cat12_id']

cat_ohe = []
for col in category_id:
    cat_ = col.replace('_id', '_ohe')
    cat_ohe.append(cat_)
    input_features.append(cat_)

data = raw_df.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
                     'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
                     'Cat1_id', 'Cat2_id', 'Cat3_id', 'Cat4_id', 'Cat5_id', 'Cat6_id', 'Cat7_id', 'Cat8_id',
                     'Cat9_id', 'Cat10_id', 'Cat11_id', 'Cat12_id',
                     'Calendar_Year', 'Model_Year', 'Claim_Amount')

encoder = OneHotEncoderEstimator(inputCols=category_id, outputCols=cat_ohe)
encoder_data = encoder.fit(data)
data_ohe = encoder_data.transform(data)

# assemble all features
from pyspark.ml.feature import VectorAssembler

all_features_assembler = VectorAssembler(inputCols=['Cat1_ohe', 'Cat2_ohe', 'Cat3_ohe', 'Cat4_ohe',
                                                    'Cat5_ohe', 'Cat6_ohe', 'Cat7_ohe', 'Cat8_ohe',
                                                    'Cat9_ohe', 'Cat10_ohe', 'Cat11_ohe', 'Cat12_ohe',
                                                    'Var1', 'Var2', 'Var3', 'Var4', 'Var5',
                                                    'Var6', 'Var7', 'Var8', 'NVVar1', 'NVVar2',
                                                    'NVVar3', 'NVVar4', 'Calendar_Year', 'Model_Year'],
                                         outputCol='features')
all_data = all_features_assembler.transform(data_ohe)
def train_model(args):
    # do not run this test for pytorch lightning below min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min supported version is {}".format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, features):
            x = features.float()
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    loss = nn.NLLLoss()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)
    torch_estimator = hvd.TorchEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=lambda input, target: loss(input, target.long()),
        input_shapes=[[-1, 1, 28, 28]],
        feature_cols=['features'],
        label_cols=['label'],
        validation=0.1,
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
# MAGIC
# MAGIC To do that we will use the `VectorAssembler` where we
# MAGIC * Set `inputCols` to the new list of feature columns
# MAGIC * Set `outputCol` to `features`
# MAGIC
# MAGIC For more information see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler" target="_blank">VectorAssembler</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler" target="_blank">VectorAssembler</a>

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

encoder = OneHotEncoderEstimator(inputCols=["seasonIndex", "yrIndex", "workingdayIndex", "weathersitIndex"],
                                 outputCols=["seasonVector", "yrVector", "workingdayVector", "weathersitVector"])

assemblerInputs = [
    "mnth", "hr", "hum", "atemp", "windspeed",                           # Our numerical features
    #"seasonIndex", "yrIndex", "workingdayIndex", "weathersitIndex",
    "seasonVector", "yrVector", "workingdayVector", "weathersitVector"   # Our new categorical features
]

vectorAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features_assembler")
scaler = StandardScaler(inputCol="features_assembler", outputCol="features")
.format("csv") \ .load(test_input) #set handle invalid to keep to account for any unseen categorical data in the test set target = ['click'] indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid="skip") for c in categoryCols ] label = StringIndexer(inputCol=target[0], outputCol="label") #formatting for logistic regression encoder = OneHotEncoderEstimator( inputCols=[indexer.getOutputCol() for indexer in indexers], outputCols=[ "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers ], dropLast=False) assembler = VectorAssembler(inputCols=encoder.getOutputCols() + integerCols, outputCol="features") lr = LogisticRegression(maxIter=10) pipeline = Pipeline(stages=indexers + [encoder, assembler, label, lr]) # fit train data with LogisticRegression model using the pipeline lr_model = pipeline.fit(train_df) # Make predictions on test data using the transform() method. - LogisticRegression.transform() will only use the 'features' column. predictions = lr_model.transform(test_df)
"workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country" ] numericCols = [ "age", "fnlwgt", "educational_num", "capital_gain", "capital_loss", "hours_per_week" ] stages = [] # Stages in the pipeline # First, we need to turn the categorical variables into one-hot encodings. # We do this in two steps: StringIndexer and OneHotEncoderEstimator for col in categoricalCols: stringIndexer = StringIndexer(inputCol=col, outputCol=col + "_index") encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[col + "_one_hot"]) stages += [stringIndexer, encoder] # Assemble all the columns into a single vector, called "features" assemblerInputs = [c + "_one_hot" for c in categoricalCols] + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] print(stages) # Create a Pipeline. pipeline = Pipeline(stages=stages) # Fit pipeline to the dataset pipelineModel = pipeline.fit(adults_df)
print("The data contains %d records after dropping records with na values." % flights.count()) # Create an indexer for carrier categorical feature indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx') # Indexer identifies categories in the data indexer_model = indexer.fit(flights) # Indexer creates a new column with numeric index values flights_indexed = indexer_model.transform(flights) # Repeat the process for the org categorical feature flites = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed) # Create an instance of the one hot encoder onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"]) # Apply the one hot encoder to the flights data onehot = onehot.fit(flites) flites = onehot.transform(flites) pd.set_option('display.max_columns', None) # all cols pd.set_option('display.width', 199) pd.set_option('display.max_colwidth', 199) # Create buckets at 3 hour intervals through the day buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol="depart", outputCol="depart_bucket") # Bucket the departure times bucketed = buckets.transform(flites)
def get_parcels_to_spark(parcels_filepath='data/EXTR_Parcel.csv'):
    # Comment out to only use initial SparkSession
    # spark = SparkSession\
    #     .builder\
    #     .master('local[4]')\
    #     .appName("Get_Parcel_Data")\
    #     .config("spark.master", "local")\
    #     .getOrCreate()

    # Initially read in pre-cleaned Pandas DataFrame into Spark DataFrame
    parcel_pd = get_parcels(parcels_filepath)
    parcel = spark.createDataFrame(parcel_pd)

    # Normalize numerical data
    numerical_cols = [
        'PcntUnusable',
        'WfntFootage',
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols, outputCol='num_features')
    parcel = numerical_assembler.transform(parcel)
    parcel = StandardScaler(
        inputCol='num_features',
        outputCol='num_features_std').fit(parcel).transform(parcel)

    # Create index and dummy_vector column names of categorical columns,
    # eventually dropping categorical and index columns
    cat_cols = [
        'Range', 'Township', 'Section', 'QuarterSection', 'Area', 'SubArea',
        'LevyCode', 'CurrentZoning', 'PresentUse', 'SqFtLot', 'WaterSystem',
        'SewerSystem', 'Access', 'Topography', 'StreetSurface',
        'InadequateParking', 'MtRainier', 'Olympics', 'Cascades',
        'Territorial', 'SeattleSkyline', 'PugetSound', 'LakeWashington',
        'SmallLakeRiverCreek', 'OtherView', 'WfntLocation', 'WfntBank',
        'WfntPoorQuality', 'WfntRestrictedAccess', 'WfntAccessRights',
        'TidelandShoreland', 'LotDepthFactor', 'TrafficNoise',
        'NbrBldgSites', 'Contamination',
    ]
    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(parcel)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    parcel = cat_pipeline.fit(parcel).transform(parcel)

    # Encode dummy_vector columns from categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(parcel)
    parcel = model.transform(parcel)

    # Drop categorical and index columns
    parcel = parcel.drop(*cat_cols)
    parcel = parcel.drop(*cat_index)
    parcel = parcel.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN']
    assembler = VectorAssembler(
        inputCols=[col for col in parcel.columns if col not in ignore],
        outputCol='parcel_features')
    parcel = assembler.transform(parcel)

    # Drop all columns that are now in the features column
    ignore.append('parcel_features')
    parcel = parcel.drop(*[col for col in parcel.columns if col not in ignore])

    # Write to parquet - not sure if I will eventually open from this, but that's the idea
    # gis.write.parquet('data/gis_parquet', mode='overwrite')

    return parcel
from pyspark.ml.feature import StringIndexer

sipclass = StringIndexer(handleInvalid='keep', inputCol='Pclass', outputCol='idxPclass').fit(dftrain)
dftrain = sipclass.transform(dftrain)
dftrain = dftrain.drop('Pclass')

# In[16]:

dftrain.show()

# In[17]:

from pyspark.ml.feature import OneHotEncoderEstimator

ohe = OneHotEncoderEstimator(handleInvalid='keep', dropLast=True,
                             inputCols=['idxPclass'], outputCols=['ohePclass']).fit(dftrain)
dftrain = ohe.transform(dftrain)
dftrain = dftrain.drop('idxPclass')
dftrain.sample(withReplacement=False, fraction=0.1).limit(20).show()

# In[18]:

from pyspark.ml.feature import VectorAssembler

va = VectorAssembler(
    inputCols=['SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass'],
    outputCol='features')
dftrain = va.transform(dftrain)
dftrain = dftrain.drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass')
dftrain.show()
# Exercise_1
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()
--------------------------------------------------
# Exercise_2
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol='duration').evaluate(predictions)
--------------------------------------------------
# Exercise_3
def basicTrainingPipeline(data=False, col_target="revenue_tickets", first_pred_day=False,
                          jarra='quinto', verbose=True, checks=False, logger=False):
    try:
        parser = SafeConfigParser()
        config_file_path = MODEL_CONFIG_FILE
        parser.read(config_file_path)

        # Model parameters (RF)
        min_samples_leaf = int(parser.get('random_forest_params', 'min_samples_leaf'))
        max_features = parser.get('random_forest_params', 'max_features')
        max_features = max_features if max_features == 'sqrt' else str(max_features)
        num_trees = int(parser.get('random_forest_params', 'num_trees'))
        max_depth = int(parser.get('random_forest_params', 'max_depth'))
        subsampling_rate = float(parser.get('random_forest_params', 'subsampling_rate'))

        # Other parameters from parser
        training_sample = float(parser.get('random_forest_params', 'training_sample'))
        model_complexity = parser.get('model_params', 'model_complexity')
        use_clustered_data_sets = parser.getboolean('model_params', 'use_clustered_data_sets')
        cols_id = [e.strip() for e in parser.get('col_names', 'cols_id').split(',')]
        col_predict = col_target + '_pred'

        # Set 1st day of predictions
        if first_pred_day is not None:
            split_value = first_pred_day
        else:
            split_value = datetime.today()
        first_pred_day = split_value.strftime('%Y-%m-%d')
        split_value = split_value.strftime('%Y-%m-%d')

        # Save parameters
        s3_save_path = parser.get('save_params', 's3_save_path')
        s3_save_pipelines_path = s3_save_path + 'pipelines/' + col_target + '/dt-execution=' + split_value + '/'

        # Connect to spark
        spark = createSparkSession(jarra=jarra, verbose=verbose, logger=logger)

        # Define name of the variable for predictions
        cols_cyclical, cols_ohe_in, cols_features, col_target, cols_id = defineFeatures(
            model_complex='first', use_clustered_data_sets=False,
            col_target=col_target, verbose=False, logger=False)

        # Add cyclical variables to features lists, OHE_out not as they are already in pipelines
        cols_cyclical_sin = [s + '_sin' for s in cols_cyclical]
        cols_cyclical_cos = [s + '_cos' for s in cols_cyclical]
        cols_cyclical_out = cols_cyclical_sin + cols_cyclical_cos
        for i in range(len(cols_features)):
            cols_features[i] = cols_features[i] + cols_cyclical_out

        # Fill with features (depending on how many models we have)
        cols_ohe_out = []
        features_list = []
        col_date = ''

        if verbose:
            logger.info(cols_ohe_in)
            logger.info('Number of partition of data df: ' + str(data.rdd.getNumPartitions()))

        # Define date filters for training set and test/pred sets of each consecutive model
        filterTrainEndList = []
        for i in range(len(cols_features)):
            filterTrainEndList.append(col(col_date) < split_value)

        # Create list with data sets for each of the consecutive models;
        # each data set has different features
        data = data.coalesce(200)
        train_data_list, train_data_basic_list = createTrainingDataLists(
            data, cols_features, cols_ohe_in, col_target, cols_id, filterTrainEndList,
            use_clustered_data_sets, training_sample, spark, verbose, logger)

        # String indexer, one hot encoder and vector assembler (creates column 'features'
        # with all the features for a given model as a vector)
        if verbose:
            logger.info('String indexer, one hot encoder and vector assembler, start')

        # Indexer: transforms string values into numeric values; the value that occurs
        # most often in the data is indexed as zero, the second as 1, etc.
        indexers = [StringIndexer(inputCol=x, outputCol=x + '_tmp', handleInvalid='keep')
                    for x in cols_ohe_in]

        # One hot encoding
        cols_ohe_in_tmp = [i + '_tmp' for i in cols_ohe_in]
        encoder = OneHotEncoderEstimator(dropLast=True, inputCols=cols_ohe_in_tmp,
                                         outputCols=cols_ohe_out, handleInvalid='keep')

        # Add to pipeline
        pipeline_tmp = indexers + [encoder]

        # Create placeholders
        assembler = []
        pipeline_tmp2 = []
        pipeline_list = []
        pipelinePrepList = []
        trainDataList = []
        pipelinePrepBasicList = []
        trainDataBasicList = []

        start = datetime.now()
        for i in range(len(train_data_list)):
            if verbose:
                logger.info('Model ' + str(i))
            features_list[i] = features_list[i] + cols_ohe_out
            assembler.append(VectorAssembler(inputCols=features_list[i], outputCol='features'))
            pipeline_tmp2.append(pipeline_tmp + [assembler[i]])
            pipeline_list.append(Pipeline(stages=pipeline_tmp2[i]))
            pipelinePrepList.append(pipeline_list[i].fit(train_data_list[i]))
            trainDataList.append(pipelinePrepList[i].transform(train_data_list[i]))
        end = datetime.now()

        if verbose:
            logger.info('First day of model training set: ' + str(trainDataList[0].toPandas().dt_flight_date_local.min()))
            logger.info('Last day of model training set: ' + str(trainDataList[0].toPandas().dt_flight_date_local.max()))
            logger.info('String indexer, one hot encoder and vector assembler, done for all models, time: ' + str(end - start))
            logger.info('Number of partition of trainDataList 0 df: ' + str(trainDataList[0].rdd.getNumPartitions()))
            logger.info('Number of partition of trainDataList 5 df: ' + str(trainDataList[5].rdd.getNumPartitions()))
            logger.info('Features list for first model (ie. for next 7 days): ')
            logger.info(features_list[0])
            logger.info('RF Model start')

        start_all = datetime.now()

        # Create placeholders
        RFModelList = []
        fitList = []
        fitBasicList = []
        for i in range(len(trainDataList)):
            start = datetime.now()
            if verbose:
                logger.info('Model ' + str(i))
            RFModelList.append(RFRegr(labelCol=col_target,
                                      featuresCol='features',
                                      numTrees=num_trees,
                                      maxDepth=max_depth,
                                      featureSubsetStrategy=max_features,
                                      subsamplingRate=subsampling_rate,
                                      minInstancesPerNode=min_samples_leaf))
            # Repartition to get an evenly distributed data set:
            trainDataList[i] = trainDataList[i].coalesce(36)
            # Fit to training set
            fitList.append(RFModelList[i].setPredictionCol(col_predict).fit(trainDataList[i]))
            end = datetime.now()
            if verbose:
                logger.info('Random Forest, ' + str(i) + ' model, time: ' + str(end - start))

        if verbose:
            logger.info('Saving data preparation and model pipelines in ' + s3_save_pipelines_path)
        for i in range(len(pipelinePrepList)):
            pipelinePrepList[i].write().overwrite().save(s3_save_pipelines_path + "data_prep_pipeline" + str(i))
            fitList[i].write().overwrite().save(s3_save_pipelines_path + "model_pipeline" + str(i))

        if checks:
            mlflow_params_extra = calcTrainingSetError(fitList, fitBasicList, trainDataList,
                                                       trainDataBasicList, cols_id, col_predict,
                                                       col_target, verbose, logger)
        else:
            mlflow_params_extra = {}

        end_all = datetime.now()
        if verbose:
            logger.info('Random Forest, all models, time: ' + str(end_all - start_all))

        max_features_int = max_features if max_features == 'sqrt' else float(max_features)
        mlflow_params = {'col_target': col_target,
                         'num_trees': num_trees,
                         'max_depth': max_depth,
                         'max_features': max_features_int,
                         'subsampling_rate': subsampling_rate,
                         'min_samples_leaf': min_samples_leaf,
                         'training_sample': training_sample,
                         'train_date_min': str(trainDataList[0].toPandas().dt_flight_date_local.min()),
                         'train_date_max': str(trainDataList[5].toPandas().dt_flight_date_local.max()),
                         'time_seconds': (end_all - start_all).total_seconds()
                         }
    except Exception:
        logger.exception("Fatal error in demand_forecast_training()")
        raise

    return pipelinePrepList, fitList, mlflow_params, mlflow_params_extra
# } UDF {
to_sparse = udf(dense_to_sparse, VectorUDT())

# COMMAND ----------

# DBTITLE 1,ML Pipeline
# Create DataFrame
df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0),
                            (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)],
                           ['categoryIndex1', 'categoryIndex2'])

# Data pipeline for feature creation
encoder = OneHotEncoderEstimator(inputCols=['categoryIndex1'], outputCols=['categoryVec1'])
assembler = VectorAssembler(inputCols=['categoryVec1', 'categoryIndex2'], outputCol='features')
# pca = PCA(
#     k=2,
#     inputCol='features',
#     outputCol='pcaFeatures'
# )
pipeline = Pipeline(stages=[encoder, assembler])
pipelineFit = pipeline.fit(df)
output = pipelineFit.transform(df)

# Transform all dense vectors to sparse
output_sparse = output.withColumn('features', to_sparse(col('features')))

# Add a row index to join results later
output_index = add_column_index(output_sparse)
label = 'salary'
numerical_cols = [
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week'
]
categorical_cols = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "sex", "native_country"
]

stages = []

# One-hot encode categorical cols
for cname in categorical_cols:
    string_idxer = StringIndexer(inputCol=cname, outputCol=cname + 'Index')
    encoder = OneHotEncoderEstimator(
        inputCols=[string_idxer.getOutputCol()],
        outputCols=[cname + 'classVec'])
    stages += [string_idxer, encoder]

# Convert labels (salary) to 0 and 1
label_idxer = StringIndexer(inputCol="salary", outputCol="label")
stages += [label_idxer]

# Standardize numerical cols
numerical_assembler = VectorAssembler(inputCols=numerical_cols, outputCol='numFeatures')
scaler = StandardScaler(inputCol='numFeatures', outputCol='norm_cols',
                        withStd=True, withMean=True)
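# A hedged continuation (assumed, not part of the original fragment): add the numeric
# assembler and scaler to the stages, combine the scaled numeric vector with the encoded
# categoricals, and run the whole pipeline. The DataFrame name `df` is a placeholder.
from pyspark.ml import Pipeline

stages += [numerical_assembler, scaler]
final_assembler = VectorAssembler(
    inputCols=[c + 'classVec' for c in categorical_cols] + ['norm_cols'],
    outputCol='features')
stages += [final_assembler]

pipeline_model = Pipeline(stages=stages).fit(df)   # `df`: placeholder DataFrame
prepared_df = pipeline_model.transform(df)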
scorecard_data_rdd = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(data_path)
scorecard_data = scorecard_data_rdd.toDF(*db_cols)

features = ["CONTROL", "ADM_RATE", "ADM_RATE_ALL", "SAT_AVG_ALL", "SATMTMID", "UGDS", "HIGHDEG",
            "TUITFTE", "COSTT4_A", "PCTFLOAN", "COMP_ORIG_YR2_RT",
            "UGDS_WHITE", "UGDS_BLACK", "UGDS_HISP", "UGDS_ASIAN", "UGDS_AIAN", "UGDS_NHPI",
            "UGDS_2MOR", "UGDS_NRA", "UGDS_UNKN", "PPTUG_EF", "COSTT4_P", "TUITIONFEE_IN",
            "TUITIONFEE_OUT", "TUITIONFEE_PROG", "INEXPFTE", "PCTPELL", "COMP_ORIG_YR3_RT",
            "LOAN_COMP_ORIG_YR3_RT", "DEATH_YR4_RT", "COMP_ORIG_YR4_RT", "AGE_ENTRY",
            "COUNT_NWNE_P10", "COUNT_WNE_P10", "MN_EARN_WNE_P10", "MD_EARN_WNE_P10",
            "COMPL_RPY_1YR_RT"]
categ_features = ["CONTROL", "HIGHDEG"]
target = "COMPL_RPY_1YR_RT"

# this preprocessing step is important only in training, so it will not be included in the pipeline.
scorecard_data_cleaned = cleanPrivacySuppressed(scorecard_data)

# select not-null target rows
scorecard_data_cleaned = scorecard_data_cleaned.where(F.col(target).isNotNull()).where(F.col(target) != np.nan)

# one-hot encoder preprocessing
hot_encoder = OneHotEncoderEstimator(inputCols=categ_features,
                                     outputCols=["{0}_ENCODED".format(colName) for colName in categ_features])

# vector assembler
model_input_features = [f for f in features if f not in categ_features and f is not target]
model_input_features.extend(["{0}_ENCODED".format(colName) for colName in categ_features])
vec_assembler = VectorAssembler(inputCols=model_input_features, outputCol="features", handleInvalid="keep")

# preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=[hot_encoder, vec_assembler])
preprocessed_data = preprocessing_pipeline.fit(scorecard_data_cleaned).transform(scorecard_data_cleaned)

# cache this preprocessing step; this is a performance optimization for the model-tuning process
preprocessed_data.cache()

# split data to train/test 80/20
train_preprocessed_data = preprocessed_data.randomSplit([.8, .2])[0]
def test_profile_sparkml_pipeline(self):
    import inspect
    import os
    import numpy
    import pandas
    import time
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv') \
        .options(header='true', inferschema='true').load(input_path)
    training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

    label = "income"
    dtypes = dict(training_data.dtypes)
    dtypes.pop(label)

    si_xvars = []
    ohe_xvars = []
    feature_cols = []
    for idx, key in enumerate(dtypes):
        if dtypes[key] == "string":
            feature_col = "-".join([key, "encoded"])
            feature_cols.append(feature_col)
            tmp_col = "-".join([key, "tmp"])
            si_xvars.append(StringIndexer(inputCol=key, outputCol=tmp_col, handleInvalid="skip"))
            ohe_xvars.append(OneHotEncoderEstimator(inputCols=[tmp_col], outputCols=[feature_col], dropLast=False))
        else:
            feature_cols.append(key)
    si_label = StringIndexer(inputCol=label, outputCol='label')
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    lr = LogisticRegression(regParam=0.001)
    pipeline = Pipeline(stages=si_xvars + ohe_xvars + [si_label, assembler, lr])

    # filter out the records which will cause error
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', buildInitialTypesSimple(test_data))
    if model_onnx is None:
        raise AssertionError("Failed to create the onnx model")
    model_path = os.path.join("tests", "profile_pipeline_model.onnx")
    with open(model_path, "wb") as f:
        f.write(model_onnx.SerializeToString())

    rec_counts = []
    spark_times = []
    runtime_times = []
    for i in range(0, 4):
        rec_counts.append(test_data.count())
        data_np = buildInputDictSimple(test_data)

        # run the model in Spark
        start = time.time()
        predicted = model.transform(test_data)
        end = time.time()
        spark_times.append(1000 * (end - start))

        # test for correctness also
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]

        # run the model in onnx runtime
        start = time.time()
        output, session = run_with_runtime(data_np, model_path)
        end = time.time()
        runtime_times.append(1000 * (end - start))

        # compare results
        _compare_expected(expected, output, session, model_path, decimal=5, onnx_shape=None)

        # each time in this loop double the number of rows
        test_data = test_data.union(test_data)

    results = pandas.DataFrame(data={
        'input_rec_count': rec_counts,
        'pyspark (ms)': spark_times,
        'onnxruntime (ms)': runtime_times
    })
    print(results)
outputCol="cont_features") #Scaler for Continuous Variables scaler = StandardScaler(inputCol="cont_features", outputCol="scaledFeatures", withStd=True, withMean=False) #Indexers for Categorical Variables Neighborhood_indexer = StringIndexer(inputCol="Neighborhood", outputCol="Neighborhood_Indexed", handleInvalid="keep") YearBuilt_indexer = StringIndexer(inputCol="YearBuilt", outputCol="YearBuilt_Indexed", handleInvalid="keep") MoSold_indexer = StringIndexer(inputCol="MoSold", outputCol="MoSold_Indexed", handleInvalid="keep") YrSold_indexer = StringIndexer(inputCol="YrSold", outputCol="YrSold_Indexed", handleInvalid="keep") CentralAir_indexer = StringIndexer(inputCol="CentralAir", outputCol="CentralAir_Indexed", handleInvalid="keep") Condition1_indexer = StringIndexer(inputCol="Condition1", outputCol="Condition1_Indexed", handleInvalid="keep") #One Hot Encoder for Indexed Variables encoder = OneHotEncoderEstimator(inputCols=["Neighborhood_Indexed", "YearBuilt_Indexed", "MoSold_Indexed", "YrSold_Indexed", "OverallQual", "OverallCond", "CentralAir_Indexed", "Condition1_Indexed"], outputCols=["Neighborhood_Indexed_Vec", "YearBuilt_Indexed_Vec", "MoSold_Indexed_Vec", "YrSold_Indexed_Vec", "OverallQual_Vec", "OverallCond_Vec", "CentralAir_Indexed_Vec", "Condition1_Indexed_Vec"]) #Feature Vector Assembler assembler = VectorAssembler( inputCols=["Neighborhood_Indexed_Vec", "YearBuilt_Indexed_Vec", "MoSold_Indexed_Vec", "YrSold_Indexed_Vec", "OverallQual_Vec", "OverallCond_Vec", "CentralAir_Indexed_Vec", "Condition1_Indexed", "scaledFeatures"], outputCol="features") #Define Linear Regression Model lr = LinearRegression(maxIter=200) # COMMAND ---------- #Define Pipeline pipeline = Pipeline(stages=[cont_assembler, scaler, Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer, CentralAir_indexer, Condition1_indexer, encoder, assembler, lr]) # COMMAND ----------
dataset_df = sqlContext.read.csv('salaries.csv', header='true', inferSchema='true')

# initializing stages of main transformation pipeline
stages = []

# list of categorical features for further one-hot encoding
cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]

# removing column with ID field
dataset_df = dataset_df.drop('_c0')

# binning numeric features by local binner udf function (specified for current dataset if needed)
dataset_df = dataset_df.withColumn('sincephd_bin', binner(dataset_df['sincephd']))
dataset_df = dataset_df.withColumn('service_bin', binner(dataset_df['service']))
dataset_df = dataset_df.withColumn('model_type', sf.lit(0))
dataset_df = dataset_df.drop('sincephd', 'service')

# one-hot encoding categorical features
for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index")
    encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()],
                                     outputCols=[feature + "_vec"])
    encoder.setDropLast(False)
    stages += [string_indexer, encoder]

assembler_inputs = [feature + "_vec" for feature in cat_features]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
stages += [assembler]

assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
stages += [assembler_final]

pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(dataset_df)
dataset_transformed = pipeline_model.transform(dataset_df)

df_transform_fin = dataset_transformed.select('features', label, 'model_type').toPandas()
train, test = train_test_split(df_transform_fin, test_size=0.3, random_state=0)
train_df = sqlContext.createDataFrame(train)
test_df = sqlContext.createDataFrame(test)

decode_dict = {}
def runModel(regressionMethodName, stationID, stationDataFrame, featureInputCols,
             normalize, splitMethod='random'):
    print("=" * 80)
    print('Station:{0}'.format(stationID))
    print('Model:{0}, Normalize:{1}, LinkFunction:{2}, train/test splitMethod:{3}'
          .format(regressionMethodName, normalize, labelLinkFunction, splitMethod))
    print(featureInputCols)

    oneHot = OneHotEncoderEstimator(
        inputCols=["hourOfDay", "dayOfWeek"],
        outputCols=["hourOfDayVector", "dayOfWeekVector"])

    stationSummaryAll = stationDataFrame.groupBy('station_id').agg(
        count('label'), sum('label'), avg("label"), stddev_pop("label"))
    stationAvg = stationSummaryAll.select('avg(label)').where(col('station_id') == stationID).collect()
    stationSum = stationSummaryAll.select('sum(label)').where(col('station_id') == stationID).collect()
    stationStd = stationSummaryAll.select('stddev_pop(label)').where(col('station_id') == stationID).collect()
    stationNonZeroCount = stationSummaryAll.select('count(label)').where(col('station_id') == stationID).collect()
    stationCount = stationSummaryAll.select('count(label)').where(col('station_id') == "None").collect()

    featureInputCols.extend(["hourOfDayVector", "dayOfWeekVector"])
    assembler = VectorAssembler(inputCols=featureInputCols, outputCol='features')

    if normalize == True:
        normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
        featureName = "normFeatures"
        regressionMethod, regressionModelParameters = selectRegressionMethod('rf', featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, normalizer, regressionMethod])
    else:
        featureName = "features"
        regressionMethod, regressionModelParameters = selectRegressionMethod('rf', featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, regressionMethod])

    trainingDates = ['2016-10-01 00:00:00', '2017-9-30 23:59:59']
    testDates = ['2017-10-01 00:00:00', '2017-10-31 23:59:59']
    dates = {'train': trainingDates, 'test': testDates}

    if splitMethod == 'random':
        # Split the data into training and test sets (40% held out for testing)
        (trainingData, testData) = stationDataFrame.randomSplit([0.6, 0.4])
    else:
        (trainingData, testData) = timeSeriesTestTrain(stationDataFrame, dates)

    # fit model and make predictions
    model = pipeline.fit(trainingData)
    predictedData = model.transform(testData)
    #predictedData.select("prediction", "label", featureName).show(5)

    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    evaluator2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    evaluator3 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="explainedVariance")
    rmse = evaluator.evaluate(predictedData)
    rSquared = evaluator2.evaluate(predictedData)
    varianceExplained = evaluator3.evaluate(predictedData)  # use the explainedVariance evaluator
    print("RMSE, R2, and variance explained on test data = {0:6.3f}, {1:6.3f}, {2:6.3f}"
          .format(rmse, rSquared, varianceExplained))
    print()

    basetime = 1541216769
    experimentTimeStamp = int((time.time() - basetime) / 6)
    experiment = {
        experimentTimeStamp: {
            "station": stationID,
            'stationNonZeroCount': stationNonZeroCount,
            'stationCount': stationCount,
            'stationSum': stationSum,
            'stationAvg': stationAvg,
            'stationStd': stationStd,
            'regressionMethodName': regressionMethodName,
            'normalize': normalize,
            'linkFunctionLabel': labelLinkFunction,
            'featureInputCols': featureInputCols,
            'rmse': rmse,
            'rSquared': rSquared,
            'varianceExplained': varianceExplained,
            'version': "Added OneHotEncode for hOD, dOW",
            'trainSplitMethod': splitMethod
        }
    }
    experiments.update(experiment)
    with open(pathFigure + "experiments.json", "w") as f:
        json.dump(experiments, f)
    return ()
df = spark.createDataFrame([
    ("dummy 0", 0, "A", "X"),
    ("dummy 1", 1, "A", "X"),
    ("dummy 2", 2, "B", "Y"),
    ("dummy 3", 3, "A", "X"),
    ("dummy 4", 4, "D", "Y"),
    ("dummy 5", 5, "B", "X"),
    ("dummy 6", 6, "C", "Z"),
], ["dummy", "i", "avar", "xvar"])

stages = []
catvars = ["avar", "xvar"]
for v in catvars:
    stages += [StringIndexer(inputCol=v, outputCol=f"i{v}")]

ohin = [f"i{v}" for v in catvars]
ohout = [f"v{v}" for v in catvars]
stages += [OneHotEncoderEstimator(inputCols=ohin, outputCols=ohout)]
stages += [VectorAssembler(inputCols=['vavar', 'vxvar', 'i'], outputCol='features')]

pip = Pipeline(stages=stages)
pipm = pip.fit(df)
dft = pipm.transform(df)
dft.show()
ratio = 1.0
counts = train.select(f'_c{label_idx}').groupBy(f'_c{label_idx}').count().collect()
higher_bound = counts[1][1]
threshold = int(ratio * float(counts[0][1]) / counts[1][1] * higher_bound)

rand_gen = lambda x: randint(0, higher_bound) if x == 0 else -1
udf_rand_gen = udf(rand_gen, IntegerType())
train = train.withColumn('rand_idx', udf_rand_gen('_c0'))
train_subsample = train.filter(train['rand_idx'] < threshold)
train_subsample = train_subsample.drop('rand_idx')
train_subsample.select(f'_c{label_idx}').groupBy(f'_c{label_idx}').count().show(n=5)

# Build the pipeline
indexers = [StringIndexer(inputCol=feature, outputCol=f'{feature}_idx', handleInvalid='keep')
            for feature in category_features]
encoder = OneHotEncoderEstimator(inputCols=[f'{feature}_idx' for feature in category_features],
                                 outputCols=[f'{feature}_vec' for feature in category_features],
                                 dropLast=False, handleInvalid='keep')
assembler = VectorAssembler(inputCols=real_features + [f'{feature}_vec' for feature in category_features],
                            outputCol='assembles')
pca = PCA(k=2, inputCol='assembles', outputCol='features')
lr = LogisticRegression(featuresCol='features', labelCol=f'_c{label_idx}')

stages = indexers
stages.append(encoder)
stages.append(assembler)
stages.append(pca)
stages.append(lr)

pipeline = Pipeline(stages=stages)
model = pipeline.fit(train_subsample)
print(model.stages[-1].coefficients)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol=f'_c{label_idx}', metricName='f1')
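# A hedged follow-up (assumed, not part of the original fragment): compute the F1 score
# of the fitted pipeline on the test split with the evaluator defined above.
f1 = evaluator.evaluate(predictions)
print(f'F1 score on test data: {f1:.4f}')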
def gis_data_to_spark(
        numFolds,
        gis_filepath='data/Parcels_for_King_County_with_Address_with_Property_Information__parcel_address_area.csv'):
    # Initially read in pre-cleaned Pandas DataFrame into Spark DataFrame
    gis_pd = get_gis_data(gis_filepath)
    gis_pd['fold'] = np.random.randint(0, numFolds, gis_pd.shape[0])
    gis = spark.createDataFrame(gis_pd)

    # Normalize numerical data
    numerical_cols = [
        'LAT', 'LON', 'LOTSQFT', 'APPRLNDVAL', 'APPR_IMPR', 'TAX_LNDVAL',
        'TAX_IMPR', 'Shape_Length', 'Shape_Area', 'value_per_area',
        'improvement_over_land'
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols, outputCol='num_features')
    gis = numerical_assembler.transform(gis)
    gis = StandardScaler(inputCol='num_features',
                         outputCol='num_features_std').fit(gis).transform(gis)

    # Create index and dummy_vector column names of categorical columns,
    # eventually dropping categorical and index columns
    cat_cols = [
        'KCTP_STATE', 'SITETYPE', 'LEVYCODE', 'NEW_CONSTR', 'TAXVAL_RSN',
        'QTS', 'SEC', 'TWP', 'RNG', 'KCA_ZONING', 'PROPTYPE', 'PREUSE_DESC'
    ]
    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(gis)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    gis = cat_pipeline.fit(gis).transform(gis)

    # Encode dummy_vector columns from categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(gis)
    gis = model.transform(gis)

    # Drop categorical and index columns
    gis = gis.drop(*cat_cols)
    gis = gis.drop(*cat_index)
    gis = gis.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN', 'MAJOR', 'MINOR', 'ADDR_FULL', 'TARGET', 'fold']
    assembler = VectorAssembler(
        inputCols=[col for col in gis.columns if col not in ignore],
        outputCol='gis_features')
    gis = assembler.transform(gis)

    # Drop all columns that are now in the features column
    ignore.append('gis_features')
    gis = gis.drop(*[col for col in gis.columns if col not in ignore])

    # Write to parquet - not sure if I will eventually open from this, but that's the idea
    # gis.write.parquet('data/gis_parquet', mode='overwrite')

    return gis
# $example on$
from pyspark.ml.feature import OneHotEncoderEstimator
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderEstimatorExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                     outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0),
                            (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)],
                           ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

spark.stop()
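# Added note: in Spark 3.0+ OneHotEncoderEstimator was renamed to OneHotEncoder, keeping the
# same multi-column inputCols/outputCols API, so the equivalent construction reads:
# from pyspark.ml.feature import OneHotEncoder
# encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
#                         outputCols=["categoryVec1", "categoryVec2"])
# model = encoder.fit(df)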