# COMMAND ----------

# MAGIC %md #4. Logistic regression - all features

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]
    
    
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

partialPipeline = Pipeline().setStages(stages)
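# A minimal, self-contained sketch (not from the original notebook) of the
# StringIndexer -> OneHotEncoderEstimator -> VectorAssembler pattern built above,
# run on a tiny toy DataFrame. Column names here are illustrative, not from the
# census data; a SparkSession named `spark` is assumed to exist.
toy_df = spark.createDataFrame(
    [("private", 39.0), ("state-gov", 50.0), ("private", 38.0)],
    ["workclass", "age"])
toy_stages = [
    StringIndexer(inputCol="workclass", outputCol="workclassIndex"),
    OneHotEncoderEstimator(inputCols=["workclassIndex"], outputCols=["workclassVec"]),
    VectorAssembler(inputCols=["workclassVec", "age"], outputCol="features"),
]
Pipeline(stages=toy_stages).fit(toy_df).transform(toy_df).select("features").show(truncate=False)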
Example #2
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer, MinMaxScaler

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

minMaxScaler = MinMaxScaler(inputCol="features", outputCol="features_minmax")

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, minMaxScaler])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer, MinMaxScaler, OneHotEncoderEstimator
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoderEstimator(inputCols=["classIndex"], outputCols=["categoryVec"])
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)


pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()
Example #3
label_indexer = StringIndexer(inputCol=target_variable,
                              outputCol='label').fit(df)
df = label_indexer.transform(df)
string_indexer = [
    StringIndexer(inputCol=column,
                  outputCol=column + "_index",
                  handleInvalid="keep") for column in cats
]
pipeline0 = Pipeline(stages=string_indexer)
df_transformed = pipeline0.fit(df).transform(df)
df_transformed.cache()

#One hot encoding for Logistic
encoder = OneHotEncoderEstimator(inputCols=[
    string_indexer[i].getOutputCol() for i in range(0, len(string_indexer))
],
                                 outputCols=[
                                     column + "_cat" for column in cats
                                 ])
stages += [encoder]
assemblerInputs = [c + "_cat" for c in cats] + nums
assemblerInputs.remove(target_variable)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
pipeline1 = Pipeline(stages=stages)
df_transformed_logistic = pipeline1.fit(df_transformed).transform(
    df_transformed)
standardscaler = StandardScaler().setInputCol("features").setOutputCol(
    "scaled_features")
df_transformed_logistic = standardscaler.fit(
    df_transformed_logistic).transform(df_transformed_logistic)
train, test = df_transformed_logistic.randomSplit([0.70, 0.30], seed=42)
# Create the new columns based on the group. 
encoder = OneHotEncoder(dropLast=False, inputCol="workclass_encoded", outputCol="workclass_vec")
encoded = encoder.transform(indexed)
encoded.show(2)


# In[24]:


# Encode the categorical data
categorical_variables = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native_country']

indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)
assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)


# In[25]:


# # Create a Pipeline.
pipeline = Pipeline(stages=indexers + [encoder, assembler])
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)
Example #5
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=['label'],
                                     outputCols=['label_vec'],
                                     dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
sum(price) as s2, max(price) as ma2, min(price) as mi2 ,u.gender, u.age, u.user_id, u.segment 
from sales_known s join users_known u 
where s.user_id = u.user_id 
group by u.user_id, u.gender, u.age, u.segment""")

#Detailed description of the model: https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
#and https://spark.apache.org/docs/latest/ml-features.html
#In short, all of the analyzed columns are assembled into a single vector column, "features"
categoricalColumns = ['gender', 'age']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol +
                                  'Index').setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"
                                                 ]).setHandleInvalid("keep")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='segment',
                                outputCol='label').setHandleInvalid("keep")
stages += [label_stringIdx]

numericCols = ['c', 's1', 'ma1', 'mi1', 's2', 'ma2', 'mi2']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs,
                            outputCol="features").setHandleInvalid("keep")
stages += [assembler]

lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
stages += [lr]
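# Hedged usage sketch (not in the original snippet): wrap the stages above in a Pipeline
# and fit it on the aggregated DataFrame produced by the spark.sql(...) query above.
# `agg_df` is a hypothetical name for that DataFrame.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(agg_df)
pipeline_model.transform(agg_df).select("label", "probability", "prediction").show(5)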
Example #7
#Create categorical variables for Region and StreetID 
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer
from pyspark.ml import Pipeline




#Region
regionIndxr = StringIndexer().setInputCol("Region").setOutputCol("RegionInd")

#StreetID
streetIndxr = StringIndexer().setInputCol("StreetID").setOutputCol("StreetIDInd")

#One-hot encoding
ohee_catv = OneHotEncoderEstimator(inputCols=["RegionInd","StreetIDInd"], outputCols=["Region_dum","StreetID_dum"])
pipe_catv = Pipeline(stages=[regionIndxr, streetIndxr, ohee_catv])

basetable_train = pipe_catv.fit(data1).transform(data1)
basetable_train = basetable_train.drop("RegionInd","StreetIDInd","FirstSubDate","LastComplaint","FirstComplaint","ChurnedAt03/02/2018","ChurnedAt03/02/2019")


basetable_train= basetable_train.withColumnRenamed("ChurnIn6Month","label")

# COMMAND ----------

basetable_test = pipe_catv.fit(BaseValidation).transform(BaseValidation)
basetable_test = basetable_test.drop("RegionInd","StreetIDInd","FirstSubDate","LastComplaint","FirstComplaint","ChurnedAt03/02/2018","ChurnedAt03/02/2019")


basetable_test= basetable_test.withColumnRenamed("ChurnIn6Month","label")
print("Sample model input")
print(flites.toPandas().sample(12))

# Create an indexer for the org categorical feature
#flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights).transform(flights)
indexer = StringIndexer(inputCol="org", outputCol='org_idx')
# Assign index values to strings
indexer = indexer.fit(flites)
# Create column with index values
flites = indexer.transform(flites)

# Check first five records
#flights_indexed.show(5)

onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'],
                                outputCols=['org_dummy', 'dow_dummy'])
flites = onehot.fit(flites).transform(flites)

# Create 'features' vector: 'km', 'org_dummy', 'dow_dummy'
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'],
                            outputCol='features')

# Consolidate predictor columns
flites = assembler.transform(flites)

# Check the resulting column
#flites.distinct().show(8, truncate=False)

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)
#print(flights_train.toPandas().shape, flights_test.toPandas().shape)
                  predictionCol='svm_pred_2',
                  rawPredictionCol='svm_raw_2')

# build pipeline to generate predictions from base classifiers, will be used in task 1.3
gen_base_pred_pipeline = Pipeline(
    stages=[nb_0, nb_1, nb_2, svm_0, svm_1, svm_2])
gen_base_pred_pipeline_model = gen_base_pred_pipeline.fit(training_set)

# task 1.2
meta_features = gen_meta_features(training_set, nb_0, nb_1, nb_2, svm_0, svm_1,
                                  svm_2)

# build onehotencoder and vectorassembler pipeline
onehot_encoder = OneHotEncoderEstimator(
    inputCols=[
        'nb_pred_0', 'nb_pred_1', 'nb_pred_2', 'svm_pred_0', 'svm_pred_1',
        'svm_pred_2', 'joint_pred_0', 'joint_pred_1', 'joint_pred_2'
    ],
    outputCols=['vec{}'.format(i) for i in range(9)])
vector_assembler = VectorAssembler(
    inputCols=['vec{}'.format(i) for i in range(9)], outputCol='meta_features')
gen_meta_feature_pipeline = Pipeline(stages=[onehot_encoder, vector_assembler])
gen_meta_feature_pipeline_model = gen_meta_feature_pipeline.fit(meta_features)
meta_features = gen_meta_feature_pipeline_model.transform(meta_features)

# train the meta classifier
lr_model = LogisticRegression(featuresCol='meta_features',
                              labelCol='label',
                              predictionCol='final_prediction',
                              maxIter=20,
                              regParam=1.,
                              elasticNetParam=0)
Example #10
                        'NVVar1','NVVar2','NVVar3','NVVar4',\
                        'Cat1_id','Cat2_id','Cat3_id','Cat4_id','Cat5_id','Cat6_id','Cat7_id','Cat8_id','Cat9_id','Cat10_id','Cat11_id','Cat12_id',\
                        'Calendar_Year','Model_Year','Claim_Amount')

category_id = ['Cat1_id','Cat2_id','Cat3_id','Cat4_id','Cat5_id','Cat6_id','Cat7_id','Cat8_id','Cat9_id','Cat10_id','Cat11_id','Cat12_id']
cat_ohe = []
for col in category_id:
    cat_ = col.replace('_id','_ohe')
    cat_ohe.append(cat_)
    input_features.append(cat_)


data = raw_df.select('Var1','Var2','Var3','Var4','Var5','Var6','Var7','Var8','NVVar1','NVVar2','NVVar3','NVVar4',
                  'Cat1_id','Cat2_id','Cat3_id','Cat4_id','Cat5_id','Cat6_id','Cat7_id','Cat8_id','Cat9_id','Cat10_id','Cat11_id','Cat12_id','Calendar_Year','Model_Year','Claim_Amount')

encoder = OneHotEncoderEstimator(inputCols=category_id, outputCols=cat_ohe)

encoder_data = encoder.fit(data)
data_ohe = encoder_data.transform(data)


# #assemble all features
from pyspark.ml.feature import VectorAssembler
all_features_assembler = VectorAssembler(inputCols=['Cat1_ohe','Cat2_ohe','Cat3_ohe','Cat4_ohe'\
                                                    ,'Cat5_ohe','Cat6_ohe','Cat7_ohe','Cat8_ohe'\
                                                    ,'Cat9_ohe','Cat10_ohe','Cat11_ohe','Cat12_ohe'\
                                                    ,'Var1','Var2','Var3','Var4','Var5',\
                                                        'Var6','Var7','Var8','NVVar1','NVVar2'\
                                                    ,'NVVar3','NVVar4','Calendar_Year','Model_Year'],\
                                                    outputCol='features')
all_data = all_features_assembler.transform(data_ohe)
def train_model(args):
    # do not run this test for pytorch lightning below the min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min support version is {}".
              format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set(
        'spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, features):
            x = features.float()
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    loss = nn.NLLLoss()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout,
                           stderr=sys.stderr,
                           prefix_output_with_timestamp=True)
    torch_estimator = hvd.TorchEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=lambda input, target: loss(input, target.long()),
        input_shapes=[[-1, 1, 28, 28]],
        feature_cols=['features'],
        label_cols=['label'],
        validation=0.1,
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
# MAGIC 
# MAGIC To do that we will use the `VectorAssembler` where we
# MAGIC   * Set `inputCols` to the new list of feature columns
# MAGIC   * Set `outputCol` to `features`
# MAGIC   
# MAGIC   
# MAGIC For more information see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler" target="_blank">VectorAssembler</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler" target="_blank">VectorAssembler</a>

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

encoder = OneHotEncoderEstimator(inputCols=["seasonIndex","yrIndex", "workingdayIndex", "weathersitIndex" ],
                                 outputCols=["seasonVector", "yrVector", "workingdayVector", "weathersitVector"])


assemblerInputs  = [
  "mnth",  "hr", "hum", "atemp", "windspeed", # Our numerical features
  #"seasonIndex", "yrIndex", "workingdayIndex", "weathersitIndex"
   "seasonVector", "yrVector", "workingdayVector", "weathersitVector"
]        # Our new categorical features

vectorAssembler = VectorAssembler(
  inputCols=assemblerInputs, 
  outputCol="features_assembler")

scaler = StandardScaler(inputCol="features_assembler", outputCol="features")
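# Hedged sketch (not in the original notebook cell): chain the encoder, assembler and
# scaler above into a Pipeline and fit it. `bikeshareDF` is a hypothetical name for the
# DataFrame that already contains the *Index columns produced by earlier StringIndexers.
from pyspark.ml import Pipeline

featurePipeline = Pipeline(stages=[encoder, vectorAssembler, scaler])
featureModel = featurePipeline.fit(bikeshareDF)
featureModel.transform(bikeshareDF).select("features").show(3, truncate=False)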
    .format("csv") \
    .load(test_input)

#set handleInvalid to "keep" to account for any unseen categorical data in the test set
target = ['click']
indexers = [
    StringIndexer(inputCol=c,
                  outputCol="{0}_indexed".format(c),
                  handleInvalid="keep") for c in categoryCols
]
label = StringIndexer(inputCol=target[0], outputCol="label")

#formatting for logistic regression
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[
        "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers
    ],
    dropLast=False)

assembler = VectorAssembler(inputCols=encoder.getOutputCols() + integerCols,
                            outputCol="features")

lr = LogisticRegression(maxIter=10)

pipeline = Pipeline(stages=indexers + [encoder, assembler, label, lr])

# fit train data with LogisticRegression model using the pipeline
lr_model = pipeline.fit(train_df)

# Make predictions on test data using the transform() method. - LogisticRegression.transform() will only use the 'features' column.
predictions = lr_model.transform(test_df)
Example #14
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country"
]

numericCols = [
    "age", "fnlwgt", "educational_num", "capital_gain", "capital_loss",
    "hours_per_week"
]

stages = []  # Stages in the pipeline

# First, we need to turn the categorical variables into one-hot encodings.
# We do this in two steps: StringIndexer and OneHotEncoderEstimator
for col in categoricalCols:
    stringIndexer = StringIndexer(inputCol=col, outputCol=col + "_index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[col + "_one_hot"])
    stages += [stringIndexer, encoder]

# Assemble all the columns into a single vector, called "features"
assemblerInputs = [c + "_one_hot" for c in categoricalCols] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

print(stages)

# Create a Pipeline.
pipeline = Pipeline(stages=stages)

# Fit pipeline to the dataset
pipelineModel = pipeline.fit(adults_df)
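# Hedged usage sketch (not in the original snippet): apply the fitted pipeline to
# produce the assembled feature vectors; column names follow the stages defined above.
prepped_df = pipelineModel.transform(adults_df)
prepped_df.select("features").show(5, truncate=False)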
Example #15
print("The data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flites = onehot.transform(flites)

pd.set_option('display.max_columns', None) # all cols
pd.set_option('display.width', 199)
pd.set_option('display.max_colwidth', 199)

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24],
                     inputCol="depart", outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flites)
Example #16
def get_parcels_to_spark(parcels_filepath='data/EXTR_Parcel.csv'):
    # Comment out to only use initial SparkSession
    # spark = SparkSession\
    # .builder\
    # .master('Local[4]')\
    # .appName("Get_Parcel_Data")\
    # .config("spark.master", "local")\
    # .getOrCreate()

    # Initially read in pre-cleaned Pandas DataFrame into Spark DataFrame
    parcel_pd = get_parcels(parcels_filepath)
    parcel = spark.createDataFrame(parcel_pd)

    # Normalize numerical data
    numerical_cols = [
        'PcntUnusable',
        'WfntFootage',
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    parcel = numerical_assembler.transform(parcel)

    parcel = StandardScaler(
        inputCol='num_features',
        outputCol='num_features_std').fit(parcel).transform(parcel)

    # Create index and dummy_vector column names of categorical columns, eventually dropping categorical and index columns
    cat_cols = [
        'Range',
        'Township',
        'Section',
        'QuarterSection',
        'Area',
        'SubArea',
        'LevyCode',
        'CurrentZoning',
        'PresentUse',
        'SqFtLot',
        'WaterSystem',
        'SewerSystem',
        'Access',
        'Topography',
        'StreetSurface',
        'InadequateParking',
        'MtRainier',
        'Olympics',
        'Cascades',
        'Territorial',
        'SeattleSkyline',
        'PugetSound',
        'LakeWashington',
        'SmallLakeRiverCreek',
        'OtherView',
        'WfntLocation',
        'WfntBank',
        'WfntPoorQuality',
        'WfntRestrictedAccess',
        'WfntAccessRights',
        'TidelandShoreland',
        'LotDepthFactor',
        'TrafficNoise',
        'NbrBldgSites',
        'Contamination',
    ]

    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(parcel)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    parcel = cat_pipeline.fit(parcel).transform(parcel)

    # Encode dummy_vector columns from categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(parcel)
    parcel = model.transform(parcel)

    # Drop categorical and index columns
    parcel = parcel.drop(*cat_cols)
    parcel = parcel.drop(*cat_index)
    parcel = parcel.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN']
    assembler = VectorAssembler(
        inputCols=[col for col in parcel.columns if col not in ignore],
        outputCol='parcel_features')
    parcel = assembler.transform(parcel)

    # Drop all columns that are now in the features column
    ignore.append('parcel_features')
    parcel = parcel.drop(*[col for col in parcel.columns if col not in ignore])

    # # Write to parquet - not sure if I will eventually open from this, but that's the idea
    # # gis.write.parquet('data/gis_parquet',mode='overwrite')

    return parcel
Example #17
from pyspark.ml.feature import StringIndexer
sipclass = StringIndexer(handleInvalid='keep',
                         inputCol='Pclass',
                         outputCol='idxPclass').fit(dftrain)
dftrain = sipclass.transform(dftrain)
dftrain = dftrain.drop('Pclass')

# In[16]:

dftrain.show()

# In[17]:

from pyspark.ml.feature import OneHotEncoderEstimator
ohe = OneHotEncoderEstimator(handleInvalid='keep',
                             dropLast=True,
                             inputCols=['idxPclass'],
                             outputCols=['ohePclass']).fit(dftrain)
dftrain = ohe.transform(dftrain)
dftrain = dftrain.drop('idxPclass')
dftrain.sample(withReplacement=False, fraction=0.1).limit(20).show()

# In[18]:

from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(
    inputCols=['SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass'],
    outputCol='features')
dftrain = va.transform(dftrain)
dftrain = dftrain.drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass')
dftrain.show()
Example #18
# Exercise_1 
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

--------------------------------------------------
# Exercise_2 
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol='duration').evaluate(predictions)

--------------------------------------------------
# Exercise_3 
Example #19
def basicTrainingPipeline(data=False,
                          col_target="revenue_tickets",
                          first_pred_day=False,
                          jarra='quinto',
                          verbose=True,
                          checks=False,
                          logger=False):
    try:
        parser = SafeConfigParser()
        config_file_path = MODEL_CONFIG_FILE
        parser.read(config_file_path)

        # Model parameters (RF)
        min_samples_leaf = int(parser.get('random_forest_params', 'min_samples_leaf'))
        max_features = parser.get('random_forest_params', 'max_features')
        max_features = max_features if max_features == 'sqrt' else str(max_features)
        num_trees = int(parser.get('random_forest_params', 'num_trees'))
        max_depth = int(parser.get('random_forest_params', 'max_depth'))
        subsampling_rate = float(parser.get('random_forest_params', 'subsampling_rate'))

        # Other parameters from parser
        training_sample = float(parser.get('random_forest_params', 'training_sample'))
        model_complexity = parser.get('model_params', 'model_complexity')
        use_clustered_data_sets = parser.getboolean('model_params', 'use_clustered_data_sets')
        cols_id = [e.strip() for e in parser.get('col_names', 'cols_id').split(',')]
        col_predict = col_target + '_pred'

        # Set 1st day of predictions
        if first_pred_day:
            split_value = first_pred_day
        else:
            split_value = datetime.today()
            first_pred_day = split_value.strftime('%Y-%m-%d')
            split_value = split_value.strftime('%Y-%m-%d')

        # Save parameters
        s3_save_path = parser.get('save_params', 's3_save_path')
        s3_save_pipelines_path = s3_save_path + 'pipelines/'+col_target+'/dt-execution=' + split_value + '/'

        # Connect to spark
        spark = createSparkSession(jarra=jarra,
                                     verbose=verbose,
                                     logger=logger)

        # Define name of the variable for predictions
        cols_cyclical, cols_ohe_in, cols_features, col_target, cols_id = defineFeatures(model_complex='first',
                                                                                        use_clustered_data_sets=False,
                                                                                        col_target=col_target,
                                                                                        verbose=False,
                                                                                        logger=False)

        # Add cyclical variables to features lists, OHE_out not as they are already in pipelines
        cols_cyclical_sin = [s + '_sin' for s in cols_cyclical]
        cols_cyclical_cos = [s + '_cos' for s in cols_cyclical]
        cols_cyclical_out = cols_cyclical_sin + cols_cyclical_cos
        for i in range(len(cols_features)):
            cols_features[i] = cols_features[i] + cols_cyclical_out

        # Fill with features (depending on how many models we have)
        cols_ohe_out = []
        features_list = []

        col_date = ''
        if verbose:
            logger.info(cols_ohe_in)
            logger.info('Number of partition of data df: ' + str(data.rdd.getNumPartitions()))

        # Define date filters for training set and test/pred sets of each consecutive models
        filterTrainEndList = []
        for i in range(len(cols_features)):
            filterTrainEndList.append(col(col_date) < split_value)

        # Create list with data sets for each of the consecutive models, each data set have different features
        data = data.coalesce(200)
        train_data_list, train_data_basic_list = createTrainingDataLists(data,
                                                                         cols_features,
                                                                         cols_ohe_in,
                                                                         col_target,
                                                                         cols_id,
                                                                         filterTrainEndList,
                                                                         use_clustered_data_sets,
                                                                         training_sample,
                                                                         spark,
                                                                         verbose,
                                                                         logger)

        # String indexer, one hot encoder and vector assembler (creates column 'features' with all the features for
        # given model as vector),
        if verbose:
            logger.info('String indexer, one hot encoder and vector assembler, start')

        # Indexer: transforms string values into numeric values, the value that occurs in data most is indexed as zero,
        # second as 1 etc.
        indexers = [StringIndexer(inputCol=x,
                                  outputCol=x+'_tmp',
                                  handleInvalid='keep')
                    for x in cols_ohe_in]
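        # Illustration (hypothetical values, not from the training data): for a column
        # containing ['b', 'a', 'b', 'c'], StringIndexer assigns 'b' -> 0.0 (most
        # frequent), then 'a' -> 1.0 and 'c' -> 2.0; handleInvalid='keep' maps labels
        # unseen at fit time to one extra index instead of raising an error.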

        # One hot encoding
        cols_ohe_in_tmp = [i + '_tmp' for i in cols_ohe_in]
        encoder = OneHotEncoderEstimator(dropLast=True,
                                         inputCols=cols_ohe_in_tmp,
                                         outputCols=cols_ohe_out,
                                         handleInvalid ='keep')


        # Add to pipeline
        pipeline_tmp=indexers + [encoder]

        # Create placeholders
        assembler = []
        pipeline_tmp2 = []
        pipeline_list = []
        pipelinePrepList = []
        trainDataList = []
        pipelinePrepBasicList = []
        trainDataBasicList = []
        start = datetime.now()
        for i in range(len(train_data_list)):
            if verbose:
                logger.info('Model ' + str(i))
            features_list[i] = features_list[i] + cols_ohe_out
            assembler.append(VectorAssembler(inputCols=features_list[i],
                                             outputCol='features'))
            pipeline_tmp2.append(pipeline_tmp + [assembler[i]])
            pipeline_list.append(Pipeline(stages=pipeline_tmp2[i]))
            pipelinePrepList.append(pipeline_list[i].fit(train_data_list[i]))
            trainDataList.append(pipelinePrepList[i].transform(train_data_list[i]))

        end = datetime.now()

        if verbose:
            logger.info('First day of model training set: ' + str(trainDataList[0].toPandas().dt_flight_date_local.min()))
            logger.info('Last day of model training set: ' + str(trainDataList[0].toPandas().dt_flight_date_local.max()))
            logger.info('String indexer, one hot encoder and vector assembler, done for all models, time: ' + str(end-start))
            logger.info('Number of partition of trainDataList 0 df: ' + str(trainDataList[0].rdd.getNumPartitions()))
            logger.info('Number of partition of trainDataList 5 df: ' + str(trainDataList[5].rdd.getNumPartitions()))
            logger.info('Features list for first model (ie. for next 7 days): ')
            logger.info(features_list[0])
            logger.info('RF Model start')

        start_all = datetime.now()

        # Create placeholders
        RFModelList = []
        fitList = []
        fitBasicList = []
        for i in range(len(trainDataList)):
            start = datetime.now()
            if verbose:
                logger.info('Model '+str(i))
            RFModelList.append(RFRegr(labelCol=col_target,
                                      featuresCol='features',
                                      numTrees=num_trees,
                                      maxDepth=max_depth,
                                      featureSubsetStrategy=max_features,
                                      subsamplingRate=subsampling_rate,
                                      minInstancesPerNode=min_samples_leaf
                                      ))

            # Repartition to get the evenly distributed data set:
            trainDataList[i] = trainDataList[i].coalesce(36)

            # Fit to training set
            fitList.append(RFModelList[i].setPredictionCol(col_predict).fit(trainDataList[i]))
            end = datetime.now()
            if verbose:
                logger.info('Random Forest, ' + str(i) + ' model, time: ' + str(end-start))
        if verbose:
            logger.info('Saving data preparation and model pipelines in ' + s3_save_pipelines_path)
        for i in range(len(pipelinePrepList)):
            pipelinePrepList[i].write().overwrite().save(s3_save_pipelines_path+"data_prep_pipeline"+str(i))
            fitList[i].write().overwrite().save(s3_save_pipelines_path+"model_pipeline"+str(i))

        if checks:
            mlflow_params_extra = calcTrainingSetError(fitList,
                                                       fitBasicList,
                                                       trainDataList,
                                                       trainDataBasicList,
                                                       cols_id,
                                                       col_predict,
                                                       col_target,
                                                       verbose,
                                                       logger)
        else:
            mlflow_params_extra = {}

        end_all = datetime.now()
        if verbose:
            logger.info('Random Forest, all models, time: ' + str(end_all-start_all))
        max_features_int = max_features if max_features == 'sqrt' else float(max_features)
        mlflow_params = {'col_target': col_target,
                         'num_trees': num_trees,
                         'max_depth': max_depth,
                         'max_features': max_features_int,
                         'subsampling_rate': subsampling_rate,
                         'min_samples_leaf': min_samples_leaf,
                         'training_sample': training_sample,
                         'train_date_min': str(trainDataList[0].toPandas().dt_flight_date_local.min()),
                         'train_date_max': str(trainDataList[5].toPandas().dt_flight_date_local.max()),
                         'time_seconds': (end_all-start_all).total_seconds()
                         }

    except Exception:
        logger.exception("Fatal error in demand_forecast_training()")
        raise
    return pipelinePrepList, fitList, mlflow_params, mlflow_params_extra
Example #20

# } UDF {

to_sparse = udf(dense_to_sparse, VectorUDT())

# COMMAND ----------

# DBTITLE 1,ML Pipeline
# Create DataFrame
df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                            (0.0, 1.0), (2.0, 0.0)],
                           ['categoryIndex1', 'categoryIndex2'])

# Data pipeline for feature creation
encoder = OneHotEncoderEstimator(inputCols=['categoryIndex1'],
                                 outputCols=['categoryVec1'])
assembler = VectorAssembler(inputCols=['categoryVec1', 'categoryIndex2'],
                            outputCol='features')
# pca = PCA(
#     k=2,
#     inputCol='features',
#     outputCol='pcaFeatures'
# )
pipeline = Pipeline(stages=[encoder, assembler])
pipelineFit = pipeline.fit(df)
output = pipelineFit.transform(df)

# Transform all dense matrix to sparse
output_sparse = output.withColumn('features', to_sparse(col('features')))
# Add a row index to join results later
output_index = add_column_index(output_sparse)
    label = 'salary'
    numerical_cols = [
        'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
        'hours_per_week'
    ]
    categorical_cols = [
        "workclass", "education", "marital_status", "occupation",
        "relationship", "race", "sex", "native_country"
    ]
    stages = []

    # One hot encode categorical cols
    for cname in categorical_cols:
        string_idxer = StringIndexer(inputCol=cname, outputCol=cname + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[string_idxer.getOutputCol()],
            outputCols=[cname + 'classVec'])
        stages += [string_idxer, encoder]

    # Convert labels (Salary) to 0 and 1
    label_idxer = StringIndexer(inputCol="salary", outputCol="label")
    stages += [label_idxer]

    # Standardize numerical cols
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='numFeatures')
    scaler = StandardScaler(inputCol='numFeatures',
                            outputCol='norm_cols',
                            withStd=True,
                            withMean=True)
Example #22
    scorecard_data_rdd =  spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(data_path)
    scorecard_data = scorecard_data_rdd.toDF(*db_cols)


    features =["CONTROL","ADM_RATE","ADM_RATE_ALL","SAT_AVG_ALL","SATMTMID","UGDS","HIGHDEG",  "TUITFTE", 
       "COSTT4_A", "PCTFLOAN","COMP_ORIG_YR2_RT", "UGDS_WHITE","UGDS_BLACK","UGDS_HISP","UGDS_ASIAN","UGDS_AIAN","UGDS_NHPI","UGDS_2MOR","UGDS_NRA","UGDS_UNKN","PPTUG_EF","COSTT4_P","TUITIONFEE_IN","TUITIONFEE_OUT","TUITIONFEE_PROG","INEXPFTE","PCTPELL","COMP_ORIG_YR3_RT","LOAN_COMP_ORIG_YR3_RT","DEATH_YR4_RT","COMP_ORIG_YR4_RT","AGE_ENTRY","COUNT_NWNE_P10","COUNT_WNE_P10","MN_EARN_WNE_P10","MD_EARN_WNE_P10","COMPL_RPY_1YR_RT"]
    categ_features = ["CONTROL","HIGHDEG"]
    target = "COMPL_RPY_1YR_RT"

    # this preprocessing step is important only in training, so it will not be included in the pipeline.
    scorecard_data_cleaned = cleanPrivacySuppressed(scorecard_data)
    # select not null target rows
    scorecard_data_cleaned =scorecard_data_cleaned.where(F.col(target).isNotNull()).where(F.col(target)!= np.nan)

    # hot encoder preprocessing
    hot_encoder = OneHotEncoderEstimator(inputCols=categ_features, outputCols=["{0}_ENCODED".format(colName) for colName in categ_features] )

    #vector assembler
    model_input_features = [f for f in features if f not in categ_features and f != target]
    model_input_features.extend(["{0}_ENCODED".format(colName) for colName in categ_features])
    vec_assembler = VectorAssembler(inputCols=model_input_features,outputCol="features",handleInvalid="keep")

    #preprocessing pipeline
    preprocessing_pipeline = Pipeline(stages=[hot_encoder,vec_assembler])
    preprocessed_data = preprocessing_pipeline.fit(scorecard_data_cleaned).transform(scorecard_data_cleaned)
    # cache this preprocessing step, this is performance optimization step for model-tunning process
    preprocessed_data.cache()


    # split data to train/test 80/20
    train_preprocessed_data = preprocessed_data.randomSplit([.8,.2])[0]
Example #23
    def test_profile_sparkml_pipeline(self):
        import inspect
        import os
        import numpy
        import pandas
        import time
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv') \
            .options(header='true', inferschema='true').load(input_path)
        training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

        label = "income"
        dtypes = dict(training_data.dtypes)
        dtypes.pop(label)

        si_xvars = []
        ohe_xvars = []
        feature_cols = []
        for idx, key in enumerate(dtypes):
            if dtypes[key] == "string":
                feature_col = "-".join([key, "encoded"])
                feature_cols.append(feature_col)

                tmp_col = "-".join([key, "tmp"])
                si_xvars.append(StringIndexer(inputCol=key, outputCol=tmp_col, handleInvalid="skip"))
                ohe_xvars.append(OneHotEncoderEstimator(inputCols=[tmp_col], outputCols=[feature_col], dropLast=False))
            else:
                feature_cols.append(key)
        si_label = StringIndexer(inputCol=label, outputCol='label')
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        lr = LogisticRegression(regParam=0.001)
        pipeline = Pipeline(stages=si_xvars + ohe_xvars + [ si_label, assembler, lr])

        # filter out the records which will cause error
        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', buildInitialTypesSimple(test_data))
        if model_onnx is None: raise AssertionError("Failed to create the onnx model")
        model_path = os.path.join("tests", "profile_pipeline_model.onnx")
        with open(model_path, "wb") as f:
            f.write(model_onnx.SerializeToString())

        rec_counts = []
        spark_times = []
        runtime_times = []
        for i in range(0, 4):
            rec_counts.append(test_data.count())
            data_np = buildInputDictSimple(test_data)
            # run the model in Spark
            start = time.time()
            predicted = model.transform(test_data)
            end = time.time()
            spark_times.append(1000*(end - start))

            # test for correctness also
            expected = [
                predicted.toPandas().label.values.astype(numpy.float32),
                predicted.toPandas().prediction.values.astype(numpy.float32),
                predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
            ]
            # run the model in onnx runtime
            start = time.time()
            output, session = run_with_runtime(data_np, model_path)
            end = time.time()
            runtime_times.append(1000*(end - start))

            # compare results
            _compare_expected(expected, output, session, model_path, decimal=5, onnx_shape=None)

            # each time in this loop double the number of rows
            test_data = test_data.union(test_data)

        results = pandas.DataFrame(data={
            'input_rec_count': rec_counts,
            'pyspark (ms)': spark_times,
            'onnxruntime (ms)': runtime_times
        })
        print(results)
    outputCol="cont_features")

#Scaler for Continuous Variables 
scaler = StandardScaler(inputCol="cont_features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

#Indexers for Categorical Variables
Neighborhood_indexer = StringIndexer(inputCol="Neighborhood", outputCol="Neighborhood_Indexed", handleInvalid="keep")
YearBuilt_indexer = StringIndexer(inputCol="YearBuilt", outputCol="YearBuilt_Indexed", handleInvalid="keep")
MoSold_indexer = StringIndexer(inputCol="MoSold", outputCol="MoSold_Indexed", handleInvalid="keep")
YrSold_indexer = StringIndexer(inputCol="YrSold", outputCol="YrSold_Indexed", handleInvalid="keep")
CentralAir_indexer = StringIndexer(inputCol="CentralAir", outputCol="CentralAir_Indexed", handleInvalid="keep")
Condition1_indexer = StringIndexer(inputCol="Condition1", outputCol="Condition1_Indexed", handleInvalid="keep")

#One Hot Encoder for Indexed Variables
encoder = OneHotEncoderEstimator(inputCols=["Neighborhood_Indexed", "YearBuilt_Indexed", "MoSold_Indexed", "YrSold_Indexed", "OverallQual", "OverallCond", "CentralAir_Indexed", "Condition1_Indexed"],
                                 outputCols=["Neighborhood_Indexed_Vec", "YearBuilt_Indexed_Vec", "MoSold_Indexed_Vec", "YrSold_Indexed_Vec", "OverallQual_Vec", "OverallCond_Vec", "CentralAir_Indexed_Vec", "Condition1_Indexed_Vec"])

#Feature Vector Assembler
assembler = VectorAssembler(
    inputCols=["Neighborhood_Indexed_Vec", "YearBuilt_Indexed_Vec", "MoSold_Indexed_Vec", "YrSold_Indexed_Vec", "OverallQual_Vec", "OverallCond_Vec", "CentralAir_Indexed_Vec", "Condition1_Indexed", "scaledFeatures"],
    outputCol="features")

#Define Linear Regression Model
lr = LinearRegression(maxIter=200)

# COMMAND ----------

#Define Pipeline
pipeline = Pipeline(stages=[cont_assembler, scaler, Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer, CentralAir_indexer, Condition1_indexer, encoder, assembler, lr])

# COMMAND ----------
Example #25
 dataset_df = sqlContext.read.csv('salaries.csv', header='true', inferSchema='true')
 # initializing stages of main transformation pipeline
 stages = []
 # list of categorical features for further hot-encoding
 cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
 # removing column with ID field
 dataset_df = dataset_df.drop('_c0')
 # binning numeric features by local binner udf function (specified for current dataset if needed)
 dataset_df = dataset_df.withColumn('sincephd_bin', binner(dataset_df['sincephd']))
 dataset_df = dataset_df.withColumn('service_bin', binner(dataset_df['service']))
 dataset_df = dataset_df.withColumn('model_type', sf.lit(0))
 dataset_df = dataset_df.drop('sincephd', 'service')
 # hot encoding categorical features
 for feature in cat_features:
     string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index")
     encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
     encoder.setDropLast(False)
     stages += [string_indexer, encoder]
 assembler_inputs = [feature + "_vec" for feature in cat_features]
 assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
 stages += [assembler]
 assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
 stages += [assembler_final]
 pipeline = Pipeline(stages=stages)
 pipeline_model = pipeline.fit(dataset_df)
 dataset_transformed = pipeline_model.transform(dataset_df)
 df_transform_fin = dataset_transformed.select('features', label, 'model_type').toPandas()
 train, test = train_test_split(df_transform_fin, test_size=0.3, random_state=0)
 train_df = sqlContext.createDataFrame(train)
 test_df = sqlContext.createDataFrame(test)
 decode_dict = {}
Example #26
def runModel(regressionMethodName,
             stationID,
             stationDataFrame,
             featureInputCols,
             normalize,
             splitMethod='random'):
    print("=" * 80)
    print('Station:{0}'.format(stationID))
    print(
        'Model:{0}, Normalize:{1}, LinkFunction:{2}, train/test splitMethod:{3}'
        .format(regressionMethodName, normalize, labelLinkFunction,
                splitMethod))
    print(featureInputCols)

    oneHot = OneHotEncoderEstimator(
        inputCols=["hourOfDay", "dayOfWeek"],
        outputCols=["hourOfDayVector", "dayOfWeekVector"])

    stationSummaryAll = stationDataFrame.groupBy('station_id').agg(
        count('label'), sum('label'), avg("label"), stddev_pop("label"))
    stationAvg = stationSummaryAll.select('avg(label)').where(
        col('station_id') == stationID).collect()
    stationSum = stationSummaryAll.select('sum(label)').where(
        col('station_id') == stationID).collect()
    stationStd = stationSummaryAll.select('stddev_pop(label)').where(
        col('station_id') == stationID).collect()
    stationNonZeroCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == stationID).collect()
    stationCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == "None").collect()

    featureInputCols.extend(["hourOfDayVector", "dayOfWeekVector"])
    assembler = VectorAssembler(inputCols=featureInputCols,
                                outputCol='features')

    if normalize == True:
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        featureName = "normFeatures"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            regressionMethodName, featureName)
        pipeline = Pipeline(
            stages=[oneHot, assembler, normalizer, regressionMethod])
    else:
        featureName = "features"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            regressionMethodName, featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, regressionMethod])

    trainingDates = ['2016-10-01 00:00:00', '2017-9-30 23:59:59']

    testDates = ['2017-10-01 00:00:00', '2017-10-31 23:59:59']

    dates = {'train': trainingDates, 'test': testDates}

    if splitMethod == 'random':
        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = stationDataFrame.randomSplit([0.6, 0.4])

    else:
        (trainingData, testData) = timeSeriesTestTrain(stationDataFrame, dates)

    #fit model and make predictions
    model = pipeline.fit(trainingData)
    predictedData = model.transform(testData)
    #predictedData.select("prediction", "label", featureName).show(5)
    predictedData
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    evaluator2 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="r2")
    evaluator3 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="explainedVariance")

    rmse = evaluator.evaluate(predictedData)
    rSquared = evaluator2.evaluate(predictedData)
    varianceExplained = evaluator3.evaluate(predictedData)

    print(
        "RMSE, R2, and variance explained on test data = {0:6.3f}, {1:6.3f}, {2:6.3f}"
        .format(rmse, rSquared, varianceExplained))
    print()
    basetime = 1541216769
    experimentTimeStamp = int((time.time() - basetime) / 6)
    experiment = {
        experimentTimeStamp: {
            "station": stationID,
            'stationNonZeroCount': stationNonZeroCount,
            'stationCount': stationCount,
            'stationSum': stationSum,
            'stationAvg': stationAvg,
            'stationStd': stationStd,
            'regressionMethodName': regressionMethodName,
            'normalize': normalize,
            'linkFunctionLabel': labelLinkFunction,
            'featureInputCols': featureInputCols,
            'rmse': rmse,
            'rSquared': rSquared,
            'varianceExplained': varianceExplained,
            'version': "Added OneHotEncode for hOD, dOW",
            'trainSplitMethod': splitMethod
        }
    }
    experiments.update(experiment)
    with open(pathFigure + "experiments.json", "w") as f:
        json.dump(experiments, f)
    return ()
Example #27
df = spark.createDataFrame([
    ("dummy 0", 0, "A", "X"),
    ("dummy 1", 1, "A", "X"),
    ("dummy 2", 2, "B", "Y"),
    ("dummy 3", 3, "A", "X"),
    ("dummy 4", 4, "D", "Y"),
    ("dummy 5", 5, "B", "X"),
    ("dummy 6", 6, "C", "Z"),
], ["dummy", "i", "avar", "xvar"])

stages = []

catvars = ["avar", "xvar"]
for v in catvars:
    stages += [StringIndexer(inputCol=v,
                             outputCol=f"i{v}")]
ohin = [f"i{v}" for v in catvars]
ohout = [f"v{v}" for v in catvars]
stages += [OneHotEncoderEstimator(inputCols=ohin,
                                  outputCols=ohout)]

stages += [VectorAssembler(inputCols=['vavar', 'vxvar', 'i'],
                           outputCol='features')]

pip = Pipeline(stages=stages)
pipm = pip.fit(df)

dft = pipm.transform(df)
dft.show()
Example #28
ratio = 1.0
counts = train.select(f'_c{label_idx}').groupBy(f'_c{label_idx}').count().collect()
higher_bound = counts[1][1]
threshold = int(ratio * float(counts[0][1]) / counts[1][1] * higher_bound)

rand_gen = lambda x: randint(0, higher_bound) if x == 0 else -1
udf_rand_gen = udf(rand_gen, IntegerType())
train = train.withColumn('rand_idx', udf_rand_gen('_c0'))
train_subsample = train.filter(train['rand_idx'] < threshold)
train_subsample = train_subsample.drop('rand_idx')

train_subsample.select(f'_c{label_idx}').groupBy(f'_c{label_idx}').count().show(n=5)

# Build the pipeline
indexers = [StringIndexer(inputCol=feature, outputCol=f'{feature}_idx',  handleInvalid='keep') for feature in category_features]
encoder = OneHotEncoderEstimator(inputCols=[f'{feature}_idx' for feature in category_features], outputCols=[f'{feature}_vec' for feature in category_features], dropLast=False, handleInvalid='keep')
assembler = VectorAssembler(inputCols=real_features+[f'{feature}_vec' for feature in category_features], outputCol='assembles')
pca = PCA(k=2, inputCol='assembles', outputCol='features')
lr = LogisticRegression(featuresCol='features', labelCol=f'_c{label_idx}')
stages = indexers
stages.append(encoder)
stages.append(assembler)
stages.append(pca)
stages.append(lr)
pipeline = Pipeline(stages=stages)

model = pipeline.fit(train_subsample)
print(model.stages[-1].coefficients)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol=f'_c{label_idx}', metricName='f1')
Example #29
def gis_data_to_spark(
    numFolds,
    gis_filepath='data/Parcels_for_King_County_with_Address_with_Property_Information__parcel_address_area.csv'
):

    # Initially read in pre-cleaned Pandas DataFrame into Spark DataFrame
    gis_pd = get_gis_data(gis_filepath)

    gis_pd['fold'] = np.random.randint(0, numFolds, gis_pd.shape[0])
    gis = spark.createDataFrame(gis_pd)

    # Normalize numerical data
    numerical_cols = [
        'LAT', 'LON', 'LOTSQFT', 'APPRLNDVAL', 'APPR_IMPR', 'TAX_LNDVAL',
        'TAX_IMPR', 'Shape_Length', 'Shape_Area', 'value_per_area',
        'improvement_over_land'
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    gis = numerical_assembler.transform(gis)

    gis = StandardScaler(inputCol='num_features',
                         outputCol='num_features_std').fit(gis).transform(gis)

    # Create index and dummy_vector column names of categorical columns, eventually dropping categorical and index columns
    cat_cols = [
        'KCTP_STATE', 'SITETYPE', 'LEVYCODE', 'NEW_CONSTR', 'TAXVAL_RSN',
        'QTS', 'SEC', 'TWP', 'RNG', 'KCA_ZONING', 'PROPTYPE', 'PREUSE_DESC'
    ]
    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(gis)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    gis = cat_pipeline.fit(gis).transform(gis)

    # Encode dummy_vector columns from categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(gis)
    gis = model.transform(gis)

    # Drop categorical and index columns
    gis = gis.drop(*cat_cols)
    gis = gis.drop(*cat_index)
    gis = gis.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN', 'MAJOR', 'MINOR', 'ADDR_FULL', 'TARGET', 'fold']
    assembler = VectorAssembler(
        inputCols=[col for col in gis.columns if col not in ignore],
        outputCol='gis_features')
    gis = assembler.transform(gis)

    # Drop all columns that are now in the features column
    ignore.append('gis_features')
    gis = gis.drop(*[col for col in gis.columns if col not in ignore])

    # Write to parquet - not sure if I will eventually open from this, but that's the idea
    # gis.write.parquet('data/gis_parquet',mode='overwrite')

    return gis
# $example on$
from pyspark.ml.feature import OneHotEncoderEstimator
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderEstimatorExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                     outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
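# Hedged sketch (an assumption, not part of the Spark example above): as the note in the
# example says, a raw string column is usually passed through StringIndexer first, and the
# resulting index column is then fed to OneHotEncoderEstimator.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("StringIndexerThenOHEExample").getOrCreate()
    raw_df = spark.createDataFrame(
        [("a",), ("b",), ("a",), ("c",)], ["category"])
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    ohe = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                 outputCols=["categoryVec"])
    Pipeline(stages=[indexer, ohe]).fit(raw_df).transform(raw_df).show()
    spark.stop()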
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                            (0.0, 1.0), (2.0, 0.0)],
                           ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"])

model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

spark.stop()