def get_connections_for_rank(self, rank):
  # Create connections dict: { conn_id: ((source_rank, target_rank), size_in_bytes) }
  assert rank < self.num_ranks
  partition_id = self.get_partition_for_rank(rank)
  replica_id = self.get_replica_for_rank(rank)

  connection_table = dict()
  for edge in self.pipeline_graph.get_edges():
    conn_id = edge.info_dict['connection_id']
    # `source` and `target` nodes of an edge are partition IDs
    if partition_id in [edge.source_node, edge.target_node]:
      size_in_bytes = edge.info_dict['number_elements'] * edge.info_dict['dtype'].size
      source_ranks = self._get_ranks_for_partition(edge.source_node)
      target_ranks = self._get_ranks_for_partition(edge.target_node)
      source_rank_for_replica = [r for r in source_ranks
                                 if self.get_replica_for_rank(r) == replica_id]
      target_rank_for_replica = [r for r in target_ranks
                                 if self.get_replica_for_rank(r) == replica_id]
      assert len(source_rank_for_replica) == 1, \
             f"[RankMapper] Incorrect set of ranks for replica = {replica_id}: {source_rank_for_replica}"
      assert len(target_rank_for_replica) == 1, \
             f"[RankMapper] Incorrect set of ranks for replica = {replica_id}: {target_rank_for_replica}"
      connection_table[conn_id] = cinfo.ConnectionInfo(
          (source_rank_for_replica[0], target_rank_for_replica[0]), size_in_bytes)
  logger.debug(f"[RankMapper] Connection table for rank {rank}: {connection_table}")
  return connection_table

def _get_model(self, model):
  logger.debug(f"Creating model for partition {self.partition_id}")
  core_model_config = self._to_model_config(self.partition_id, self.partition_graph)
  # `from_config` is a classmethod; call it on the class, not on an instance
  core_model = tf.keras.Model.from_config(core_model_config)
  set_weights(core_model, model)
  return core_model

def shuffle_with_seed(self, dataset, ds_kwargs):
  if 'seed' not in ds_kwargs or ds_kwargs['seed'] is None:
    logger.warning("Shuffling with fixed shuffle seed {}.".format(self.shuffle_seed))
    ds_kwargs['seed'] = self.shuffle_seed
  else:
    logger.debug("Shuffling with shuffle seed {}.".format(ds_kwargs['seed']))
  return dataset.shuffle(**ds_kwargs)

def load_model(filepath, compile=True, **kwargs):
  logger.debug("Load model from file: {}".format(filepath))
  keras_model = tf.keras.models.load_model(filepath, compile=compile, **kwargs)

  # FIXME load models with any type of parallelization strategy
  logger.warning("Loading model with the default `data parallel` strategy.")
  tnt_model = tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
  if compile:
    try:
      tnt_optimizer = tnt.distributed_optimizers.SynchDistributedOptimizer(
                          keras_model.optimizer, group=tnt_model.group)
      tnt_model.dist_optimizer = tnt_optimizer
      tnt_model._set_internal_optimizer(tnt_model.dist_optimizer)
      tnt_model.compiled = True
      tnt_model.done_broadcast = True

      if version_utils.tf_version_below_equal('2.1'):
        tnt_model.model._experimental_run_tf_function = False
        logger.info("Set `experimental_run_tf_function` to False.")
    except Exception:  # a bare `except` would also swallow KeyboardInterrupt
      logger.info("The loaded model was not pre-compiled.")
  tnt_model.barrier.execute()
  return tnt_model

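# Hedged usage sketch for `load_model` above (not part of the library): the
# model, optimizer and file path below are illustrative only; any Keras model
# previously saved via `model.save` should reload as a compiled `tnt.Model`.
import tensorflow as tf

example_model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
example_model.compile(optimizer="sgd", loss="mse")
example_model.save("example_model")        # TF SavedModel directory (illustrative path)
restored = load_model("example_model")     # reloaded with the data parallel strategy
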
def __init__(self, keras_callback: tf.keras.callbacks.Callback,
             parallel_strategy: tnt.ParallelStrategy) -> None:
  self.keras_callback = keras_callback
  logger.debug(f"Creating generic TNT callback of type={type(keras_callback)}")
  _construct_from_keras_object(self, keras_callback)
  self.tnt_parallel_strategy = parallel_strategy

def __init__(self, model, params):
  # params = {'optimizer', 'loss', 'metrics', 'loss_weights',
  #           'sample_loss_weights', 'weighted_metrics'}
  self._optimizer = params.get('optimizer', None)
  self._loss = self._assign_named_attributes_to_outputs(model, 'loss', params)
  self._loss_weights = self._assign_named_attributes_to_outputs(model, 'loss_weights', params)
  self._metrics = self._assign_named_attributes_to_outputs(model, 'metrics', params)
  logger.debug(f"Compile properties: losses = {self._loss}, "
               f"loss_weights = {self._loss_weights}, metrics = {self._metrics}")

def _to_parallel_callbacks(callbacks, group, parallel_strategy):
  for index, callback in enumerate(callbacks):
    logger.debug(f"[{parallel_strategy}] Preprocessing callback {callback} "
                 f"of type {type(callback)}")
    callbacks[index] = tnt.keras.callbacks.Callback(callback, parallel_strategy,
                                                    group=group)
  return callbacks

def get_pipelining_group_for_rank(self, rank):
  if rank not in self.partition_mapping:
    raise ValueError(f"Rank {rank} not found in the mapping of partition IDs "
                     f"to ranks: {self.partition_mapping}")
  replica_id = self.get_replica_for_rank(rank)
  pipeline_group_ranks = [r for r in self.partition_mapping.keys()
                          if self.get_replica_for_rank(r) == replica_id]
  logger.debug(f"[RankMapper] Pipeline group = {pipeline_group_ranks}.")
  return tnt.Group(pipeline_group_ranks)

def get_replica_group_for_rank(self, rank):
  if rank not in self.replica_mapping:
    raise ValueError(f"Rank {rank} not found in the mapping of replica IDs "
                     f"to ranks: {self.replica_mapping}")
  partition_id = self.get_partition_for_rank(rank)
  replica_group_ranks = [r for r in self.replica_mapping.keys()
                         if self.get_partition_for_rank(r) == partition_id]
  logger.debug(f"[RankMapper] Replica group = {replica_group_ranks}.")
  return tnt.Group(replica_group_ranks)

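# Plain-Python sketch (no Tarantella objects) of the two groups above for rank 0,
# assuming the partition-major layout used by this mapper with 2 partitions and
# 2 replicas: the pipeline group spans rank 0's replica, while the replica group
# spans all copies of rank 0's partition.
num_partitions = 2
ranks = range(4)
pipeline_group = [r for r in ranks if r // num_partitions == 0]   # replica of rank 0
replica_group  = [r for r in ranks if r %  num_partitions == 0]   # partition of rank 0
assert pipeline_group == [0, 1] and replica_group == [0, 2]
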
def _(self, keras_callback: tf.keras.callbacks.EarlyStopping):
  logger.debug("[DataParallel] EarlyStopping callback")
  # only the master rank should print messages
  self.verbose = keras_callback.verbose if tnt.is_group_master_rank(self.group) \
                                        else utilities.TF_verbose.SILENT.value

  # monitor the metric value averaged over all ranks instead of the local value;
  # capture the original bound method first, because a plain function stored on
  # the instance would not receive `self` automatically
  keras_get_monitor_value = self.get_monitor_value
  def _get_monitor_value(logs):
    averaged_logs = self.average_logs(logs)
    return keras_get_monitor_value(averaged_logs)
  self.get_monitor_value = _get_monitor_value

def _get_partition_compile_params(self):
  if not self.compile_properties:
    raise RuntimeError("[PartitionedModel] `model.fit` called before `model.compile`")
  logger.debug(f"[PartitionedModel] Compiled partitioned model with "
               f"losses = {self.compile_properties.loss}, "
               f"metrics = {self.compile_properties.metrics} {self.model.metrics}")
  return {'optimizer'    : self.compile_properties.optimizer,
          'loss'         : self.microbatched_model_builder.get_losses(self.compile_properties.loss),
          'loss_weights' : self.microbatched_model_builder.get_loss_weights(),
          'metrics'      : self.microbatched_model_builder.get_metrics(self.compile_properties.metrics)}

def get_microbatch_size(self, batch_size):
  if batch_size is None or batch_size == 0:
    raise ValueError("[DistributedDataset] Incorrectly defined batch size")

  if batch_size % self.num_ranks != 0:
    raise ValueError("[DistributedDataset] Batch size ({}) is not a multiple ".format(batch_size) +
                     "of the number of ranks ({})".format(self.num_ranks))

  logger.debug("Batch size ({}) is a multiple of the number of ranks ({}).".format(
               batch_size, self.num_ranks))
  return batch_size // self.num_ranks

def model_from_yaml(yaml_string, **kwargs):
  logger.debug("Load model from yaml")
  try:
    keras_model = tf.keras.models.model_from_yaml(yaml_string, **kwargs)
    # FIXME load models with any type of parallelization strategy
    logger.warning("Loading model with the default `data parallel` strategy.")
    return tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
  except Exception as e:
    raise RuntimeError("[tnt.models.model_from_yaml] Cannot load model") from e

def __init__(self, keras_callback: tf.keras.callbacks.Callback,
             aggregate_logs: bool = True,
             run_on_all_ranks: bool = True,
             group: tnt.Group = tnt.Group()) -> None:
  super().__init__(group = group)
  logger.debug(f"[DataParallelCallback] init with {keras_callback}")
  base_type.__init__(self, keras_callback)
  self.aggregate_logs = aggregate_logs
  self.run_on_all_ranks = run_on_all_ranks
  self.is_built = False
  self._distribute_callback = self._distribute_callback_default
  self.customize_callback(keras_callback)

def _(self, keras_callback: tf.keras.callbacks.ModelCheckpoint):
  logger.debug("[DataParallel] ModelCheckpoint callback")
  # only the master rank should save and thus print messages
  self.verbose = keras_callback.verbose if tnt.is_group_master_rank(self.group) \
                                        else utilities.TF_verbose.SILENT.value
  # only one checkpoint is needed (models are identical in a data parallel setting)
  self.run_on_all_ranks = False

  # disable checkpointing for all ranks except the master rank
  if not tnt.is_group_master_rank(self.group):
    self._supports_tf_logs = False
    self.save_freq = 1e20  # very large value to avoid triggering checkpointing
    self.epochs_since_last_save = 0

def setup_gpus(rank, ngpus=None):
  """Checks whether there are GPUs available on the machine and assigns one
  to the current rank.

  To make sure a specific GPU will be used by the current rank, TensorFlow is
  configured so that this particular GPU is the only one visible.
  A GPU is selected if its index within the list of available GPUs is equal to
  `rank % ngpus`. This allocation assumes that all nodes are homogeneous and
  are configured with the same number of processes (< ngpus).

  Args:
    rank: int, rank of the current process

    ngpus: int value specifying the maximum number of GPUs per node that will
    be used (`None` stands for using all GPUs available)
  """
  if ngpus is not None and ngpus <= 0:
    # disable all GPUs
    tf.config.experimental.set_visible_devices([], 'GPU')
    visible_gpus = tf.config.experimental.get_visible_devices('GPU')
    if visible_gpus and len(visible_gpus) > 0:
      sys.exit("ERROR: [rank {}] Could not disable GPUs: {} GPUs still visible"
               .format(rank, len(visible_gpus)))
  else:
    # try to use `ngpus` per node
    phys_gpus = tf_config.get_available_gpus()
    if phys_gpus and len(phys_gpus) > 0:
      if ngpus is None:  # use as many GPUs as possible
        ngpus = len(phys_gpus)
      target_gpu = rank % ngpus

      if len(phys_gpus) < ngpus:
        sys.exit("ERROR: rank {} cannot use GPU_id={} (only {} GPUs available)"
                 .format(rank, target_gpu, len(phys_gpus)))
      try:
        # memory growth has to be set only once on all available GPUs
        if target_gpu == 0:
          for gpu in phys_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # make sure only one GPU is visible per process
        tf.config.experimental.set_visible_devices(phys_gpus[target_gpu], 'GPU')
      except RuntimeError as e:
        raise RuntimeError("[Tarantella][init] Cannot configure GPUs") from e
  logger.debug("Using device: {}".format(tf.config.experimental.get_visible_devices()))

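# Hedged usage sketch for `setup_gpus` (illustrative values): with two ranks per
# node and `ngpus=2`, rank 0 is pinned to GPU 0 and rank 1 to GPU 1, following
# the `rank % ngpus` rule described in the docstring. `tnt.get_rank()` is
# assumed available here, as elsewhere in this code.
setup_gpus(rank=tnt.get_rank(), ngpus=2)
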
def _(self, keras_callback: tf.keras.callbacks.TensorBoard):
  logger.debug("[PipeliningParallel] TensorBoard callback")
  if tnt.global_tnt_config.tensorboard_on_all_devices:
    self.log_dir += f"/rank_{tnt.get_rank()}"
  else:
    # disable any data logging for all ranks except the last partition
    if not tnt.is_group_master_rank(self.group):
      self.histogram_freq = 0
      self.write_graph = False
      self.write_images = False
      self.write_steps_per_second = False
      self.update_freq = 0
      self.embeddings_freq = 0
      self.embeddings_metadata = None
      self.profile_batch = None

def _map_ranks_to_partition_and_replica_ids(self):
  nranks_dp = self.num_ranks // self.num_partitions
  partition_mapping = dict()
  replica_mapping = dict()
  for index, node in enumerate(self.pipeline_graph.get_nodes()):
    for replica_index in range(nranks_dp):
      # ranks are laid out partition-major within each replica
      first_rank_in_replica = replica_index * self.num_partitions
      rank = first_rank_in_replica + index
      partition_id = node.name
      partition_mapping[rank] = partition_id
      replica_mapping[rank] = replica_index
  logger.debug(f"[RankMapper] Mapping of ranks to partition IDs: {partition_mapping}")
  logger.debug(f"[RankMapper] Mapping of ranks to replica IDs: {replica_mapping}")
  return partition_mapping, replica_mapping

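# Standalone sketch of the partition-major rank layout computed above (plain
# Python, illustrative sizes): rank = replica_id * num_partitions + partition_index.
num_partitions, num_replicas = 2, 2
layout = {replica * num_partitions + index: (f"partition_{index}", replica)
          for replica in range(num_replicas)
          for index in range(num_partitions)}
assert layout == {0: ("partition_0", 0), 1: ("partition_1", 0),
                  2: ("partition_0", 1), 3: ("partition_1", 1)}
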
def _get_microbatch_size(rank, num_ranks, batch_size):
  if batch_size is None or batch_size == 0:
    raise ValueError("[DistributedDataset] Incorrectly defined batch size")

  microbatch_size = batch_size // num_ranks
  remaining_samples = batch_size % num_ranks

  if remaining_samples != 0:
    logger.debug(f"[Rank {tnt.get_rank()}] Batch size ({batch_size}) is not a "
                 f"multiple of the number of ranks {num_ranks}.")
    # the first `remaining_samples` ranks receive one extra sample each
    if rank < remaining_samples:
      microbatch_size = microbatch_size + 1

  logger.debug(f"[Rank {tnt.get_rank()}] Micro batch size {microbatch_size}.")
  return microbatch_size

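# Worked example of the remainder distribution above (plain Python, names are
# illustrative): a global batch of 10 samples across 4 ranks yields micro-batch
# sizes 3, 3, 2, 2, i.e. the first `batch_size % num_ranks` ranks get one extra sample.
def _example_microbatch_sizes(batch_size, num_ranks):
  base, remainder = divmod(batch_size, num_ranks)
  return [base + 1 if rank < remainder else base for rank in range(num_ranks)]

assert _example_microbatch_sizes(10, 4) == [3, 3, 2, 2]
assert sum(_example_microbatch_sizes(10, 4)) == 10   # no sample is lost
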
def _pad_dataset_if_necessary(dataset, num_samples, batch_size, min_last_batch_size):
  last_batch_size = _get_last_incomplete_batch_size(num_samples, batch_size)
  if last_batch_size == 0:
    logger.debug(f"No padding required: number of samples {num_samples} is a "
                 f"multiple of the batch size {batch_size}.")
    return dataset

  logger.info(f"Incomplete last batch in the dataset: number of samples is "
              f"{last_batch_size} (!= batch size {batch_size}).")

  if version_utils.tf_version_below_equal('2.1'):
    num_samples_multiple = num_samples - last_batch_size
    logger.warning(f"Number of samples ({num_samples}) is not a multiple of batch size. "
                   f"This use case is not supported in TF v{version_utils.current_version()}. "
                   f"Dropping the last incomplete batch from the dataset, "
                   f"and proceeding with {num_samples_multiple} samples.")
    return dataset.take(num_samples_multiple)

  if last_batch_size < min_last_batch_size:
    logger.debug(f"Padding required for the last batch: number of samples is "
                 f"{last_batch_size} (< min_batch_size {min_last_batch_size}).")

    # create a helper dataset that contains one full batch and one incomplete batch
    helper_dataset = dataset.take(min_last_batch_size + last_batch_size)
    helper_dataset = helper_dataset.batch(min_last_batch_size, drop_remainder=False)

    # if `padded_shapes` is unspecified, all dimensions of all components
    # are padded to the maximum size in the batch;
    # the second batch in `helper_dataset` will now contain
    # `min_last_batch_size - last_batch_size` default-initialized samples
    helper_dataset = helper_dataset.padded_batch(2)

    # switch back to a list of samples instead of batches
    helper_dataset = helper_dataset.unbatch().unbatch()

    # the remaining samples in the dataset are those generated through padding
    padding_samples = helper_dataset.skip(min_last_batch_size + last_batch_size)
    dataset = dataset.concatenate(padding_samples)
    logger.info(f"[Rank {tnt.get_rank()}] Dataset padded with "
                f"{min_last_batch_size - last_batch_size} samples.")
  return dataset

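# Self-contained sketch of the padding trick above on a toy dataset (assumes
# TF >= 2.2, where `padded_batch` can infer `padded_shapes`): 10 samples with
# batch_size=4 leave an incomplete last batch of 2 samples, which is padded up
# to min_last_batch_size=4 with 2 default-initialized (zero) samples.
import tensorflow as tf

toy_dataset = tf.data.Dataset.range(10)
helper = toy_dataset.take(4 + 2).batch(4)   # batches: [0, 1, 2, 3] and [4, 5]
helper = helper.padded_batch(2)             # pads [4, 5] to [4, 5, 0, 0]
helper = helper.unbatch().unbatch()         # back to individual samples
padding = helper.skip(4 + 2)                # the two zero samples from padding
padded = toy_dataset.concatenate(padding)
assert len(list(padded)) == 12              # now a multiple of the batch size
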
def _get_num_samples(dataset):
  cardinality = tf.data.experimental.cardinality(dataset)
  if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
    logger.debug("Infinite dataset detected.")
    return tf.data.experimental.INFINITE_CARDINALITY

  if cardinality != tf.data.experimental.UNKNOWN_CARDINALITY:
    logger.debug("Dataset size is %d" % (cardinality.numpy()))
    return cardinality.numpy()

  logger.debug("Unknown dataset size. Counting samples...")
  dataset_size = 0
  for _ in dataset:
    dataset_size += 1
  logger.debug("Dataset size is %d" % (dataset_size))
  return dataset_size

def _(self, keras_callback: tf.keras.callbacks.LearningRateScheduler):
  logger.debug("[DataParallel] LearningRateScheduler callback")
  if not tnt.global_tnt_config.output_on_all_devices and \
     not tnt.is_group_master_rank(self.group):
    self.verbose = 0

def on_train_batch_begin(self, batch, logs=None):
  scaling_factor = get_scaling_factor_by_iteration(batch, self._scaling_factor_table)
  if scaling_factor != self.model.optimizer.scaling_factor:
    logger.debug(f"[Rank {tnt.get_rank()}] Setting scaling factor to {scaling_factor}")
    K.set_value(self.model.optimizer.scaling_factor, scaling_factor)

def _(self, keras_callback: tf.keras.callbacks.ProgbarLogger):
  logger.debug("[DataParallel] ProgbarLogger callback")
  _customize_progbar_logger(self)

def _(self, keras_callback: tf.keras.callbacks.ReduceLROnPlateau):
  logger.debug("[DataParallel] ReduceLROnPlateau callback")
  # only the master rank should print messages
  self.verbose = keras_callback.verbose if tnt.is_group_master_rank(self.group) \
                                        else utilities.TF_verbose.SILENT.value

def _(self, keras_callback: tf.keras.callbacks.RemoteMonitor):
  logger.debug("[DataParallel] RemoteMonitor callback")

def _(self, keras_callback: tf.keras.callbacks.TerminateOnNaN):
  logger.debug("[DataParallel] TerminateOnNaN callback")

def customize_callback(self, keras_callback: tf.keras.callbacks.Callback) -> None:
  logger.debug("[DataParallel] Generic callback")

def _(self, keras_callback: tf.keras.callbacks.CSVLogger):
  logger.debug("[DataParallel] CSVLogger callback")

def _(self, keras_callback: tf.keras.callbacks.History):
  logger.debug("[DataParallel] History callback")