def test_train_model(self):
    hvd.init()

    with self.test_session() as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.ThresholdedReLU(0.5))
        model.compile(loss=keras.losses.mean_squared_error,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))

        def generator():
            while 1:
                yield (x, y)

        # No assertions, we just need to verify that it doesn't hang
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)

def init_gpu(args, logger):
    hvd.init()
    init_logger(
        full=hvd.rank() == 0,
        args=args,
        logger=logger
    )
    if args.affinity != 'disabled':
        gpu_id = hvd.local_rank()
        affinity = set_affinity(
            gpu_id=gpu_id,
            nproc_per_node=hvd.size(),
            mode=args.affinity
        )
        logger.warning(f'{gpu_id}: thread affinity: {affinity}')

    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)

    if args.xla:
        tf.config.optimizer.set_jit(True)

def init_workers(distributed=False):
    """Initialize distributed worker"""
    rank, local_rank, n_ranks = 0, 0, 1
    if distributed:
        hvd.init()
        rank, local_rank, n_ranks = hvd.rank(), hvd.local_rank(), hvd.size()
    return rank, local_rank, n_ranks

def init_workers(distributed=False):
    if distributed:
        hvd.init()
        return SimpleNamespace(rank=hvd.rank(), size=hvd.size(),
                               local_rank=hvd.local_rank(),
                               local_size=hvd.local_size())
    else:
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1)

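# Illustrative use of init_workers() above; the dataset sharding shown here is an
# assumed, common pattern and is not taken from the original source.
import tensorflow as tf

workers = init_workers(distributed=True)
dataset = tf.data.Dataset.range(1000)
# Give each process a disjoint shard of the data based on its rank.
dataset = dataset.shard(num_shards=workers.size, index=workers.rank)
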
def __init__(self, *args, **kwargs):
    super(TfKerasTests, self).__init__(*args, **kwargs)
    warnings.simplefilter('module')
    hvd.init()
    self.config = tf.compat.v1.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.config.gpu_options.visible_device_list = str(hvd.local_rank())

def main(_):
    hvd.init()
    print("After hvd init")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # K.set_session(tf.Session(config=config))
    print("After gpu_options visible_device_list")
    tf.enable_eager_execution(config=config)

    epochs = 20
    steps_per_epoch = 2
    batch_size = 32
    num_classes = 10
    full_model = 'image'
    image_model = 'efficientnet'
    image_training_type = 'finetuning'
    text_model = 'cnn'
    combined_embeddings = 'stack'
    learning_rate = 0.005
    width = 150
    height = 150
    input_shape = (height, width, 3)
    input_size = (224, 224, 3)

    train_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/train.tfrecords'])
    print(train_tfrecord)
    val_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/val.tfrecords'])
    test_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/test.tfrecords'])

    def read_tfrecord(serialized_example):
        feature_description = {
            'image_raw': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }
        example = tf.io.parse_single_example(serialized_example, feature_description)
        input_2 = tf.image.decode_png(example['image_raw'], channels=3, dtype=tf.dtypes.uint8)
        input_2 = tf.image.resize(input_2, [600, 600])
        return (input_2, example['label'])

    train_parsed_dataset = train_tfrecord.map(read_tfrecord)
    val_parsed_dataset = val_tfrecord.map(read_tfrecord)
    test_parsed_dataset = test_tfrecord.map(read_tfrecord)

    tf.keras.backend.clear_session()

    baseModel = EfficientNetB7(weights='imagenet', include_top=True)
    probs = baseModel.layers.pop()
    top_dropout = probs.input
    headModel = layers.Dense(10, activation='softmax')(top_dropout)
    model = models.Model(inputs=baseModel.input, outputs=headModel)

    SGD = optimizers.SGD(lr=0.01, decay=4e-05, momentum=0.9)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=adapt_optimizer(SGD),
                  metrics=['accuracy'])

    train_dataset = train_parsed_dataset.batch(2).repeat()
    val_dataset = val_parsed_dataset.batch(2).repeat()
    test_dataset = test_parsed_dataset.batch(2).repeat()

    model.fit(train_dataset,
              epochs=adapt_epochs(epochs),
              steps_per_epoch=400,
              validation_data=val_dataset,
              validation_steps=100,
              verbose=(1 if hvd.rank() == 0 else 0),
              callbacks=adapt_callbacks([], True))

    if hvd.rank() == 0:
        model.save('saved_model.h5')

    if hvd.rank() == 0:
        (test_loss, test_acc) = model.evaluate(test_dataset, verbose=0, steps=1241)
        print('Test loss =', test_loss)
        print('Test acc =', test_acc)

def setup_horovod(self):
    import horovod.tensorflow.keras as hvd
    hvd.init()
    self.model = self.model_creator(self.config)
    compile_args = self.compile_args_creator(self.config)
    compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])
    self.model.compile(**compile_args)
    self.backend = "horovod"

def __init__(self, *args, **kwargs):
    super(Tf2KerasTests, self).__init__(*args, **kwargs)
    warnings.simplefilter('module')
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

def initialize_horovod():
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    return hvd.size()

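# Minimal usage sketch for initialize_horovod() above; the optimizer and base
# learning rate are illustrative assumptions. The returned world size is commonly
# used to scale the learning rate before wrapping the optimizer, as several other
# snippets in this collection do.
world_size = initialize_horovod()
opt = tf.keras.optimizers.SGD(learning_rate=0.01 * world_size)
opt = hvd.DistributedOptimizer(opt)
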
def init():
    gpu_thread_count = 2
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    hvd.init()
    if hvd.rank() == 0:
        print('PY', sys.version)
        print('TF', tf.version.VERSION)

def connect_GPU_to_horovod():
    import horovod.tensorflow.keras as hvd
    import tensorflow as tf
    tf.keras.backend.clear_session()
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

def train_hvd(learning_rate=1.0):
    # Tensorflow has given up on pickling. We need to explicitly import its modules inside workers
    from tensorflow.keras import backend as K
    from tensorflow.keras.models import Sequential
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(), hvd.size())
    model = get_model(num_classes)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = keras.optimizers.Adadelta(lr=learning_rate * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.ckpt',
                                            save_weights_only=True))

    model.fit(x_train, y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))

def setup_horovod(self):
    import horovod.tensorflow.keras as hvd
    hvd.init()
    self.model = self.model_creator(self.config)
    compile_args = self.compile_args_creator(self.config)
    compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])
    self.model.compile(**compile_args)
    self.backend = "horovod"
    self.size = hvd.size()
    self.rank = hvd.rank()
    from tensorflow.python.distribute import distribution_strategy_context as ds_context
    self.strategy = ds_context.get_strategy()

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf1", action="store_true")
    parser.add_argument(
        "--aggregation-frequency", dest="aggregation_frequency", default=0, type=int
    )
    parser.add_argument("--average-aggregated-gradients", action="store_true")
    args = parser.parse_args()

    hvd.init()

    if args.tf1:
        check_tf_1(args.aggregation_frequency, args.average_aggregated_gradients)
    else:
        check_tf_2(args.aggregation_frequency, args.average_aggregated_gradients)

def simple_fn():
    hvd.init()
    rank = hvd.rank()
    # getting the hostname by socket.gethostname() method
    hostname = socket.gethostname()
    # getting the IP address using socket.gethostbyname() method
    ip_address = socket.gethostbyname(hostname)
    print(f"hvd rank[{ip_address}]", rank)
    return rank

def setup(args, report):
    """Set up environment variables given the type of partition."""
    # Initialize Horovod
    hvd.init()

    # Set environment variable necessary to use h5py for file read/write
    os.putenv("HDF5_USE_FILE_LOCKING", "FALSE")
    os.system("export $HDF5_USE_FILE_LOCKING")

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.log_device_placement = False
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    np.random.seed(args.random_seed)
    print('Rank ' + str(hvd.rank()) + ' session configured')

def train(cls, training_rows, training_steps_per_epoch, val_rows, val_steps_per_epoch,
          epochs, gen_workers):
    """
    Trains model over training / validation data generators.
    We measure the average time taken to train & validate the model for each epoch
    """
    from tensorflow.keras import backend as K
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    hvd.init()
    model = Lenet5.get_model()

    train_imgs = ArrGenerator(img_size=np.array([training_rows, 32, 32, 3]), gen_cls=RandomArrCreator)
    train_labels = ArrGenerator(img_size=np.array([training_rows, 10]), gen_cls=RandomArrCreator)
    train_gen = DataGenerator.generate(img_gen=train_imgs, label_gen=train_labels)

    val_imgs = ArrGenerator(img_size=np.array([val_rows, 32, 32, 3]), gen_cls=RandomArrCreator)
    val_labels = ArrGenerator(img_size=np.array([val_rows, 10]), gen_cls=RandomArrCreator)
    val_gen = DataGenerator.generate(img_gen=val_imgs, label_gen=val_labels)

    opt = keras.optimizers.Adadelta()
    opt = hvd.DistributedOptimizer(opt)
    model.compile(optimizer=opt, loss="mean_squared_error", metrics=['accuracy'])

    # For training
    model.fit_generator(generator=train_gen,
                        steps_per_epoch=training_steps_per_epoch,
                        epochs=epochs,
                        validation_data=val_gen,
                        validation_steps=val_steps_per_epoch,
                        max_queue_size=20,
                        workers=gen_workers,
                        use_multiprocessing=True,
                        callbacks=[cls.time_callback])
    hvd.shutdown()
    return

def train_evaluate():
    # Initialize Horovod
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    train_dataset, eval_dataset = prepare_datasets()
    model = toy_resnet_model()

    # Wrap an optimizer in Horovod
    optimizer = hvd.DistributedOptimizer(optimizers.Adadelta())

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with loaded weights.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback()
    ]

    # Horovod: save checkpoints only on worker 0 (master) to prevent other workers from corrupting them.
    # Configure Tensorboard and Azure ML Tracking
    if hvd.rank() == 0:
        # callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS['job-dir'].value, update_freq='epoch'))

    model.fit(train_dataset,
              epochs=FLAGS.epochs,
              steps_per_epoch=1000,
              callbacks=callbacks,
              validation_data=eval_dataset,
              validation_steps=200)

def init(en_mem_growth=False, set_visible_dev=False):
    """
    This initializes the horovod package.

    :param en_mem_growth: if True, enable GPU memory growth
    :param set_visible_dev: if True, pin each process to the GPU matching its local rank
    """
    if hvd is not None:
        hvd.init()
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            if en_mem_growth:
                tf.config.experimental.set_memory_growth(gpu, True)
        if gpus and set_visible_dev:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    else:
        print("Horovod not supported on this system!")

def init_gpu(args, logger):
    hvd.init()
    init_logger(full=hvd.rank() == 0, args=args, logger=logger)
    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        affinity = set_affinity(gpu_id=gpu_id, nproc_per_node=hvd.size(), mode=args.affinity)
        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

    if args.amp:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        tf.config.optimizer.set_jit(True)

def init_hvd(args):
    if hvd:
        hvd.init()
        FORMAT = "[%%(levelname)s - P%i/%i - %%(filename)s:%%(lineno)s - %%(funcName)s] %%(message)s" % (
            hvd.rank(), hvd.size())
        # Remove all handlers associated with the root logger object.
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        logger.debug("Updated logger to print process")

    args.hvd_rank = hvd.rank() if hvd else 0
    args.hvd_size = hvd.size() if hvd else 1

def test_sparse_as_dense(self):
    hvd.init()

    with self.test_session() as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)

        model = keras.models.Sequential()
        model.add(keras.layers.Embedding(1000, 64, input_length=10))
        model.compile(loss=keras.losses.mean_squared_error, optimizer=opt)

        x = np.random.randint(1000, size=(32, 10))
        y = np.random.random((32, 10, 64))

        # No assertions, we just need to verify that it doesn't hang
        model.train_on_batch(x, y)

def handle_distribution_strategy(distribution_strategy):
    """ Create distribution strategy. """
    strategy = None
    if distribution_strategy:
        strategy = distribution_strategy
        if isinstance(distribution_strategy, dict):
            strategy = distribution_strategy.get("distribution_strategy", None)
        if isinstance(distribution_strategy, str):
            strategy = distribution_strategy.lower()
        if is_third_party_allreduce(strategy):
            if strategy == "horovod":
                import horovod.tensorflow.keras as hvd
            else:
                import byteps.tensorflow.keras as hvd
            logging.info("import {} as hvd backend.".format(strategy))
            hvd.init()
            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            compat.register_distributed_worker_setting(hvd.rank(), hvd.size(), strategy)
            if hvd.rank() != 0:
                logging.set_verbosity(logging.ERROR)
        else:
            if isinstance(distribution_strategy, str):
                strategy = distribution_utils.get_distribution_strategy(
                    distribution_strategy=distribution_strategy)
            elif isinstance(distribution_strategy, dict):
                strategy = distribution_utils.get_distribution_strategy(
                    **distribution_strategy)

    if strategy is None:
        logging.info("No distribution strategy was used.")
    else:
        try:
            logging.info(
                "Using distribution strategy: {} with num_replicas_in_sync={}".format(
                    strategy, strategy.num_replicas_in_sync))
        except Exception:
            pass
    return strategy

def main() -> None:
    """
    Start training Seq2Seq model.

    :return: None
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    gpu_list = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpu_list:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpu_list:
        print("Visible GPUs detected.")
        tf.config.experimental.set_visible_devices(gpu_list[hvd.local_rank()], "GPU")

    print("Loading input data.")
    subject_list, body_list = load_data()

    config: Seq2SeqConfig = fit_text(body_list, subject_list)
    summarizer: Seq2SeqSummarizer = Seq2SeqSummarizer(config)

    if not CONFIG.is_dev:
        if tf.io.gfile.exists(LOCAL_MODEL_WEIGHTS):
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)
    else:
        Path(CONFIG.bucket_summarization_model).mkdir(parents=True, exist_ok=True)
        if Path(LOCAL_MODEL_WEIGHTS).exists():
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)

    body_train, body_test, subject_train, subject_test = train_test_split(
        body_list, subject_list, test_size=0.2)

    print("Starting training.")
    summarizer.fit(
        body_train=body_train,
        subject_train=subject_train,
        body_test=body_test,
        subject_test=subject_test,
        epochs=int(math.ceil(100 / hvd.size())),
        batch_size=128,
    )

def main(argv=None):
    tf.reset_default_graph()

    # init horovod
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    keras.backend.set_session(tf.Session(config=config))

    image, label = _get_dataset()

    model_input = keras.layers.Input(tensor=image)
    model_output = keras.layers.Flatten(input_shape=(-1, 299, 299, 3))(model_input)
    model_output = keras.layers.Dense(5, activation='relu')(model_output)
    model = keras.models.Model(inputs=model_input, outputs=model_output)

    # Horovod: scale learning rate by the number of workers and wrap the optimizer.
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())
    opt = hvd.DistributedOptimizer(opt)

    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'],
                  target_tensors=[label])

    # callback
    t_callback = keras.callbacks.TensorBoard(log_dir='./logs')

    # fit model
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        t_callback,
    ]
    epochs = int(math.ceil(FLAGS.num_epochs / hvd.size()))
    model.fit(epochs=epochs, steps_per_epoch=FLAGS.steps_one_epoch, callbacks=callbacks)

    # save to h5
    h5file = os.path.join(FLAGS.model_path, 'model.h5')
    if hvd.rank() == 0:
        keras.models.save_model(model, h5file)

def __init__(self, config):
    self.config = config
    self.checkpoint_path = config.get_attribute('checkpoint_path')
    self.epochs = config.get_attribute('epochs')
    self.checkpoint_save_period = config.get_attribute('checkpoint_save_period')
    self.checkpoint_format = 'checkpoint-{epoch}.h5'
    self.learning_rate = config.get_attribute('learning_rate')
    self.models_train = []
    self.models_eval = []
    self.train_steps_per_epoch = 1
    self.eval_steps_per_epoch = 1
    self.resume_from_epoch = 0
    self.verbose = 1
    self.cur_epoch = 0

    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    self.verbose = 1 if hvd.rank() == 0 else 0

    origin_train_model = get_model(config, is_training=True)
    origin_eval_model = get_model(config, is_training=False)
    self.models_train.append(origin_train_model)
    self.models_eval.append(origin_eval_model)

    train_model = tf.keras.models.clone_model(origin_train_model)
    eval_model = tf.keras.models.clone_model(origin_eval_model)
    self.models_train.append(train_model)
    self.models_eval.append(eval_model)

    self.train_dataset, self.eval_dataset, self.train_dataset_distill, self.eval_dataset_distill = \
        self.build_dataset()
    self.build_train()
    self.build_eval()
    self.load_model()
    self.save_model_path = config.get_attribute('checkpoint_eval_path')
    self.callbacks = []

def train(num_epochs):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64))
    )
    dataset = dataset.repeat().shuffle(10000).batch(128)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    scaled_lr = 0.001 * hvd.size()
    opt = tf.optimizers.Adam(scaled_lr)

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                        optimizer=opt,
                        metrics=['accuracy'],
                        experimental_run_tf_function=False)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset,
                    steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks,
                    epochs=num_epochs,
                    verbose=verbose)

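# Hypothetical entry point for the train() function above; the epoch count and
# script name are illustrative. A script like this is typically launched with
# Horovod's CLI, for example `horovodrun -np 4 python tensorflow2_keras_mnist.py`.
if __name__ == '__main__':
    train(num_epochs=24)
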
def train_fn(model_bytes):
    # Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
    # with TensorFlow libraries. Use `pa` package reference to ensure it's loaded before
    # functions like `deserialize_model` which are implemented at the top level.
    # See https://jira.apache.org/jira/browse/ARROW-3346
    pa

    import atexit
    import horovod.tensorflow.keras as hvd
    from horovod.spark.task import get_available_devices
    import os
    from petastorm import make_batch_reader
    from petastorm.tf_utils import make_petastorm_dataset
    import tempfile
    import tensorflow as tf
    import tensorflow.keras.backend as K
    import shutil

    # Horovod: initialize Horovod inside the trainer.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process), if GPUs are available.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = get_available_devices()[0]
    K.set_session(tf.Session(config=config))

    # Horovod: restore from checkpoint, use hvd.load_model under the hood.
    model = deserialize_model(model_bytes, hvd.load_model)

    # Horovod: adjust learning rate based on number of processes.
    scaled_lr = K.get_value(model.optimizer.lr) * hvd.size()
    K.set_value(model.optimizer.lr, scaled_lr)

    # Horovod: print summary logs on the first worker.
    verbose = 2 if hvd.rank() == 0 else 0

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),

        # Reduce LR if the metric is not improved for 10 epochs, and stop training
        # if it has not improved for 20 epochs.
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_exp_rmspe', patience=10, verbose=verbose),
        tf.keras.callbacks.EarlyStopping(monitor='val_exp_rmspe', mode='min', patience=20, verbose=verbose),
        tf.keras.callbacks.TerminateOnNaN()
    ]

    # Model checkpoint location.
    ckpt_dir = tempfile.mkdtemp()
    ckpt_file = os.path.join(ckpt_dir, 'checkpoint.h5')
    atexit.register(lambda: shutil.rmtree(ckpt_dir))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_file, monitor='val_exp_rmspe', mode='min',
                                                            save_best_only=True))

    # Make Petastorm readers.
    with make_batch_reader('%s/train_df.parquet' % args.data_dir, num_epochs=None,
                           cur_shard=hvd.rank(), shard_count=hvd.size(),
                           hdfs_driver=PETASTORM_HDFS_DRIVER) as train_reader:
        with make_batch_reader('%s/val_df.parquet' % args.data_dir, num_epochs=None,
                               cur_shard=hvd.rank(), shard_count=hvd.size(),
                               hdfs_driver=PETASTORM_HDFS_DRIVER) as val_reader:
            # Convert readers to tf.data.Dataset.
            train_ds = make_petastorm_dataset(train_reader) \
                .apply(tf.data.experimental.unbatch()) \
                .shuffle(int(train_rows / hvd.size())) \
                .batch(args.batch_size) \
                .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

            val_ds = make_petastorm_dataset(val_reader) \
                .apply(tf.data.experimental.unbatch()) \
                .batch(args.batch_size) \
                .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

            history = model.fit(train_ds,
                                validation_data=val_ds,
                                steps_per_epoch=int(train_rows / args.batch_size / hvd.size()),
                                validation_steps=int(val_rows / args.batch_size / hvd.size()),
                                callbacks=callbacks,
                                verbose=verbose,
                                epochs=args.epochs)

    # Dataset API usage currently displays a wall of errors upon termination.
    # This global model registration ensures clean termination.
    # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
    globals()['_DATASET_FINALIZATION_HACK'] = model

    if hvd.rank() == 0:
        with open(ckpt_file, 'rb') as f:
            return history.history, f.read()

def __init__(self, timesteps, includeAux, folderI, trainLoss, includeModis, includeVGG, disLoss,
             cloud_cov=0.4, istransfer=False, img_h=256, img_width=256,
             startT='01-01-2018', endT='01-05-2019'):
    self.img_h = img_h
    self.img_w = img_width
    self.timesteps = timesteps
    self.includeModis = includeModis

    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    self.gen_schedule = ExponentialDecay(1e-4 * hvd.size(), decay_steps=10000,
                                         decay_rate=0.1, staircase=True)
    self.disc_schedule = ExponentialDecay(1e-4 * hvd.size() * 5, decay_steps=10000,
                                          decay_rate=0.1, staircase=True)
    self.istransfer = istransfer

    # self.disOp = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(1e-4 * hvd.size(), 0.5))
    # self.lstmOp = hvd.DistributedOptimizer(Adam(lr=1e-4 * hvd.size(), beta_1=0.9, beta_2=0.999, epsilon=1e-08))
    self.disOp = hvd.DistributedOptimizer(Adam(learning_rate=self.disc_schedule))
    self.lstmOp = hvd.DistributedOptimizer(Adam(learning_rate=self.gen_schedule))

    self.model_helpers = models.LSTM_GAN_MODEL(disOp=self.disOp, lstmOp=self.lstmOp,
                                               h=self.img_h, w=self.img_w,
                                               timeStep=timesteps, includeAux=includeAux,
                                               trainLoss=trainLoss, disLoss=disLoss)

    # print("GOT MODIS======", includeModis)
    if includeVGG and includeModis == 0:
        if istransfer:
            self.dataloader = dataloaders.DatasetHandling(
                self.img_w, self.img_h, no_of_timesteps=timesteps,
                startT=startT, endT=endT, cloud_cov=cloud_cov, album='foco-co-20km')
            self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = \
                self.model_helpers.lstm_gan_with_vgg_transfer(self.transferLear())
        else:
            self.dataloader = dataloaders.DatasetHandling(
                self.img_w, self.img_h, no_of_timesteps=timesteps,
                startT=startT, endT=endT, cloud_cov=cloud_cov)
            self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = \
                self.model_helpers.lstm_gan_with_vgg()
    elif not includeVGG and includeModis == 0:
        self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = \
            self.model_helpers.lstm_gan_no_vgg()
    elif includeModis == 1:
        self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = \
            self.model_helpers.lstm_gan_with_vgg_multi_modis()

    self.dirName = "/s/" + socket.gethostname() + "/a/nobackup/galileo/paahuni/" + str(folderI) + "/"

    if not includeModis == 2:
        self.img_itr = self.dataloader.get_non_random_image_iterator_new(
            batch_size=1, no_of_timesteps=self.timesteps,
            sendMetaInfo=True, includeModis=includeModis)
    else:
        self.dataloader = dataloaders.DatasetHandling(
            self.img_w, self.img_h, no_of_timesteps=timesteps,
            startT=startT, endT=endT, cloud_cov=cloud_cov)

    self.includeVGG = includeVGG

IMAGE_HEIGHT = mlctx.get_param("image_height", 128)
IMAGE_CHANNELS = mlctx.get_param("image_channels", 3)  # RGB color
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)
IMAGE_SHAPE = (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS)
EPOCHS = mlctx.get_param("epochs", 1)
BATCH_SIZE = mlctx.get_param("batch_size", 16)
# RANDOM_STATE must be a parameter for reproducibility:
RANDOM_STATE = mlctx.get_param("random_state", 1)
TEST_SIZE = mlctx.get_param("test_size", 0.2)

# kubeflow outputs/inputs
categories_map = str(mlctx.get_input("categories_map").get())
df = pd.read_csv(str(mlctx.get_input("file_categories")))

# Horovod: initialize Horovod.
hvd.init()

# if gpus found, pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

if hvd.rank() == 0:
    mlctx.logger.info(
        f"Validating paths:\nData_path:\t{DATA_PATH}\nModel_dir:\t{MODEL_DIR}\n"
    )
    mlctx.logger.info(f"Categories map:{categories_map}")

if __name__ == '__main__':
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    parser = argparse.ArgumentParser()

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(12.0 / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28