def main(args):
    # MultiWorkerMirroredStrategy creates copies of all variables in the model's
    # layers on each device across all workers.
    # If your GPUs don't support NCCL, replace "communication" with another option.
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=tf.distribute.experimental.CollectiveCommunication.NCCL)

    BATCH_SIZE_PER_REPLICA = 64
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

    with strategy.scope():
        ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat()
        options = tf.data.Options()
        options.experimental_distribute.auto_shard_policy = \
            tf.data.experimental.AutoShardPolicy.DATA
        ds_train = ds_train.with_options(options)
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = get_model(args)

    # Function for decaying the learning rate.
    # You can define any decay function you need.
    def decay(epoch):
        if epoch < 3:
            return 1e-3
        if epoch < 7:
            return 1e-4
        return 1e-5

    # Callback for printing the LR at the end of each epoch.
    class PrintLR(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print('\nLearning rate for epoch {} is {}'.format(
                epoch + 1, multi_worker_model.optimizer.lr.numpy()))

    callbacks = [
        PrintLR(),
        tf.keras.callbacks.LearningRateScheduler(decay),
    ]

    # Polyaxon: only the chief (task index 0) logs artifacts and TensorBoard data.
    if TASK_INDEX == 0:
        plx_callback = PolyaxonKerasCallback()
        plx_model_callback = PolyaxonKerasModelCheckpoint(save_weights_only=True)
        log_dir = tracking.get_tensorboard_path()
        callbacks += [
            tf.keras.callbacks.TensorBoard(log_dir=log_dir),
            plx_model_callback,
            plx_callback,
        ]

    # Keras' `model.fit()` trains the model with a specified number of epochs and
    # number of steps per epoch. Note that the numbers here are for demonstration
    # purposes only and may not produce a model of good quality.
    multi_worker_model.fit(ds_train,
                           epochs=args.epochs,
                           steps_per_epoch=70,
                           callbacks=callbacks)
    multi_worker_model.save("/tmp/model")

    if TASK_INDEX == 0:
        tracking.log_model(path="/tmp/model", framework="tensorflow")
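This excerpt assumes that `tf`, the Polyaxon `tracking` module, the `PolyaxonKerasCallback` and `PolyaxonKerasModelCheckpoint` callbacks, and the helpers `get_model` and `make_datasets_unbatched` are imported or defined earlier in the script. It also assumes a `TASK_INDEX` global identifying the current worker; in TensorFlow multi-worker jobs that index conventionally comes from the `TF_CONFIG` environment variable, as in this minimal sketch:

import json
import os

# TF_CONFIG is the standard cluster description for TensorFlow multi-worker
# jobs; "task" -> "index" identifies this worker (index 0 is the chief).
tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
TASK_INDEX = tf_config.get("task", {}).get("index", 0)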
X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1012)

# Polyaxon
tracking.log_data_ref(content=X_train, name='x_train')
tracking.log_data_ref(content=y_train, name='y_train')
tracking.log_data_ref(content=X_test, name='x_test')
tracking.log_data_ref(content=y_test, name='y_test')

rfr = RandomForestRegressor(
    n_estimators=args.n_estimators,
    max_depth=args.max_depth,
    min_samples_split=args.min_samples_split,
)
rfr.fit(X_train, y_train)

# Polyaxon
log_regressor(rfr, X_test, y_test)

# Logging the model as joblib
with tempfile.TemporaryDirectory() as d:
    model_path = os.path.join(d, "model.joblib")
    joblib.dump(rfr, model_path)
    tracking.log_model(model_path, name="model", framework="scikit-learn", versioned=False)
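Two caveats about this snippet: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so it requires an older scikit-learn release; and `log_regressor` is a helper defined elsewhere. A plausible sketch of such a helper, hypothetical rather than Polyaxon's own API, that scores the fitted model on the test split and logs the results:

from polyaxon import tracking
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hypothetical helper: evaluate the regressor on held-out data and
# log the resulting metrics to the current Polyaxon run.
def log_regressor(model, X_test, y_test):
    y_pred = model.predict(X_test)
    tracking.log_metrics(
        mae=mean_absolute_error(y_test, y_pred),
        mse=mean_squared_error(y_test, y_pred),
        r2=r2_score(y_test, y_pred),
    )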
parser.add_argument(
    '--random_state',
    type=int,
    default=33,
)
args = parser.parse_args()

# Polyaxon
tracking.init()

# Train and eval the model with the given parameters.
# Polyaxon
model_path = "model.joblib"
metrics = train_and_eval(
    model_path=model_path,
    n_neighbors=args.n_neighbors,
    leaf_size=args.leaf_size,
    metric=args.metric,
    p=args.p,
    weights=args.weights,
    test_size=args.test_size,
    random_state=args.random_state,
)

# Logging metrics to Polyaxon
print("Testing metrics: {}".format(metrics))

# Polyaxon
tracking.log_metrics(**metrics)

# Logging the model
tracking.log_model(model_path, name="iris-model", framework="scikit-learn")
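`train_and_eval` is likewise defined elsewhere in the script. Given its argument names and the `iris-model` name used when logging, it presumably fits a k-nearest-neighbors classifier on the Iris dataset; a hypothetical sketch, not the source's actual implementation:

import joblib
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hypothetical helper: fit a KNN classifier on Iris, persist it as joblib,
# and return a dict of metrics suitable for tracking.log_metrics(**metrics).
def train_and_eval(model_path, n_neighbors, leaf_size, metric, p, weights,
                   test_size, random_state):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, leaf_size=leaf_size,
                               metric=metric, p=p, weights=weights)
    clf.fit(X_train, y_train)
    joblib.dump(clf, model_path)
    return {"accuracy": accuracy_score(y_test, clf.predict(X_test))}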