                        'location of the training dataset in the local filesystem (will be downloaded if needed)')

args = parser.parse_args()

# Initialize SparkSession
conf = SparkConf().setAppName('keras_spark_mnist').set('spark.sql.shuffle.partitions', '16')
if args.master:
    conf.setMaster(args.master)
elif args.num_proc:
    conf.setMaster('local[{}]'.format(args.num_proc))
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Setup our store for intermediate data
store = Store.create(args.work_dir)

# Download MNIST dataset
data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
if not os.path.exists(libsvm_path):
    subprocess.check_output(['wget', data_url, '-O', libsvm_path])

# Load dataset into a Spark DataFrame
df = spark.read.format('libsvm') \
    .option('numFeatures', '784') \
    .load(libsvm_path)

# One-hot encode labels into SparseVectors
encoder = OneHotEncoderEstimator(inputCols=['label'],
                                 outputCols=['label_vec'],
def train(
    max_sales: float,
    vocab: Dict[str, List[Any]],
    hp: Hyperparameters,
    work_dir: FlyteDirectory,
    train_df: pyspark.sql.DataFrame,
    working_dir: FlyteDirectory,
):
    print("==============")
    print("Model training")
    print("==============")

    # a method to determine the root mean square percentage error of the exponential of predictions
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric, expects logarithmic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))

        # compute mean excluding stores with zero denominator
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)

    def act_sigmoid_scaled(x):
        """Sigmoid scaled to the logarithm of maximum sales, scaled by 20%."""
        return tf.nn.sigmoid(x) * tf.math.log(max_sales) * 1.2

    # NOTE: exp_rmspe and act_sigmoid_scaled are defined inside this function rather than at the
    # module level: act_sigmoid_scaled must capture max_sales through a closure because an
    # activation function cannot receive it as an explicit argument, and since both are custom
    # objects, splitting them between module and function scope would be inconsistent
    all_cols = CATEGORICAL_COLS + CONTINUOUS_COLS
    CUSTOM_OBJECTS = {
        "exp_rmspe": exp_rmspe,
        "act_sigmoid_scaled": act_sigmoid_scaled,
    }

    # disable GPUs when building the model to prevent memory leaks
    if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
        # See https://github.com/tensorflow/tensorflow/issues/33168
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        K.set_session(
            tf.Session(config=tf.ConfigProto(device_count={"GPU": 0})))

    # build the Keras model
    inputs = {col: Input(shape=(1,), name=col) for col in all_cols}
    embeddings = [
        Embedding(len(vocab[col]), 10, input_length=1, name="emb_" + col)(inputs[col])
        for col in CATEGORICAL_COLS
    ]
    continuous_bn = Concatenate()([
        Reshape((1, 1), name="reshape_" + col)(inputs[col])
        for col in CONTINUOUS_COLS
    ])
    continuous_bn = BatchNormalization()(continuous_bn)

    x = Concatenate()(embeddings + [continuous_bn])
    x = Flatten()(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(500, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dropout(0.5)(x)
    # specify element-wise activation
    output = Dense(1, activation=act_sigmoid_scaled)(x)
    model = tf.keras.Model([inputs[f] for f in all_cols], output)

    # display the details of the Keras model
    model.summary()

    opt = tf.keras.optimizers.Adam(lr=hp.learning_rate, epsilon=1e-3)

    # checkpoint callback to specify the options for the returned Keras model
    ckpt_callback = BestModelCheckpoint(monitor="val_loss", mode="auto", save_freq="epoch")

    # create an object of the Store class
    store = Store.create(work_dir.remote_source)

    # 'SparkBackend' uses `horovod.spark.run` to execute the distributed training function, and
    # returns a list of results by running 'train' on every worker in the cluster
    backend = SparkBackend(
        num_proc=hp.num_proc,
        stdout=sys.stdout,
        stderr=sys.stderr,
        prefix_output_with_timestamp=True,
    )

    # define a Spark Estimator that fits Keras models to a DataFrame
    keras_estimator = hvd.KerasEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=opt,
        loss="mae",
        metrics=[exp_rmspe],
        custom_objects=CUSTOM_OBJECTS,
        feature_cols=all_cols,
        label_cols=["Sales"],
validation="Validation", batch_size=hp.batch_size, epochs=hp.epochs, verbose=2, checkpoint_callback=ckpt_callback, ) # The Estimator hides the following details: # 1. Binding Spark DataFrames to a deep learning training script # 2. Reading data into a format that can be interpreted by the training framework # 3. Distributed training using Horovod # the user would provide a Keras model to the `KerasEstimator`` # this `KerasEstimator`` will fit the data and store it in a Spark DataFrame keras_model = keras_estimator.fit(train_df).setOutputCols(["Sales_output"]) # retrieve the model training history history = keras_model.getHistory() best_val_rmspe = min(history["val_exp_rmspe"]) print("Best RMSPE: %f" % best_val_rmspe) # save the trained model keras_model.save(os.path.join(working_dir, hp.local_checkpoint_file)) print("Written checkpoint to %s" % os.path.join(working_dir, hp.local_checkpoint_file)) # the Estimator returns a Transformer representation of the trained model once training is complete return keras_model
def train_model(args):
    # do not run this test for pytorch lightning below the minimum supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min supported version is {}".format(
            pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() \
                if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epoch_end_counter = 0
            self.train_epoch_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epoch_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epoch_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print("Training ends: "
                  f"epoch_end_counter={self.epoch_end_counter}, "
                  f"train_epoch_end_counter={self.train_epoch_end_counter}, "
                  f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n")
            assert self.train_epoch_end_counter <= epochs
            assert self.epoch_end_counter == self.train_epoch_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # add EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(monitor='val_loss', mode="min",
                                     save_top_k=1, verbose=True))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   verbose=True,
                                   mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple" if args.enable_profiler else None)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
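# Illustrative entry point for the Lightning example above (a sketch: the flag names and
# defaults are assumptions inferred from the `args.*` attributes used in `train_model`, not
# copied from the original script).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='PyTorch Lightning Spark MNIST example')
    parser.add_argument('--master', help='spark master to connect to')
    parser.add_argument('--num-proc', type=int, help='number of worker processes for training')
    parser.add_argument('--batch-size', type=int, default=64, help='input batch size for training')
    parser.add_argument('--epochs', type=int, default=2, help='number of epochs to train')
    parser.add_argument('--work-dir', default='/tmp', help='temporary working directory for intermediate data')
    parser.add_argument('--data-dir', default='/tmp', help='location of the training dataset in the local filesystem')
    parser.add_argument('--enable-profiler', action='store_true', help='enable the PyTorch Lightning profiler')

    train_model(parser.parse_args())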
def train_model(args):
    # do not run this test for pytorch lightning below the minimum supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min supported version is {}".format(
            pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=['label'],
                                     outputCols=['label_vec'],
                                     dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float()
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    loss = nn.NLLLoss()

    # Train a Horovod Spark Estimator on the DataFrame
    torch_estimator = hvd.TorchEstimator(
        num_proc=args.num_proc,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=lambda input, target: loss(input, target.long()),
        input_shapes=[[-1, 1, 28, 28]],
        feature_cols=['features'],
        label_cols=['label'],
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
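# Alternative construction (a sketch, based on the SparkBackend usage in the earlier examples):
# instead of passing `num_proc` directly to `hvd.TorchEstimator`, an explicit `SparkBackend`
# can be supplied, which also lets worker output be redirected and timestamped. The helper name
# `build_torch_estimator` is hypothetical.
def build_torch_estimator(args, store, model, optimizer, loss):
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)
    return hvd.TorchEstimator(backend=backend,
                              store=store,
                              model=model,
                              optimizer=optimizer,
                              loss=lambda input, target: loss(input, target.long()),
                              input_shapes=[[-1, 1, 28, 28]],
                              feature_cols=['features'],
                              label_cols=['label'],
                              batch_size=args.batch_size,
                              epochs=args.epochs,
                              verbose=1)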