class Parallax(PSLoadBalancing, AllReduce):
    """Parallax strategy builder: PS for sparse variables, AllReduce for dense variables."""

    def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0):
        # Initialize both parent strategy builders.
        PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
        AllReduce.__init__(self, chunk_size)
def __init__(self, resource_spec_file, strategy_builder=None):
    set_default_autodist(self)
    self._resource_spec = ResourceSpec(resource_file=resource_spec_file)
    self._strategy_builder = strategy_builder or PSLoadBalancing()

    self._original_graph_item = None
    self._transformed_graph_item = None
    self._remapper = None
    self._built = None  # Ref to the built GraphDef

    self._cluster: Cluster = SSHCluster(self._resource_spec)  # which can also be defined by the strategy
    self._coordinator: Coordinator
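# The examples that follow all use the same pattern: build an AutoDist object from a
# resource spec file and a strategy builder, construct the TensorFlow graph under
# autodist.scope(), and execute it through autodist.create_distributed_session().
# A minimal sketch of that flow is given below; 'resource_spec.yml', the random data,
# and the two-unit Dense model are placeholders, not part of the examples.
import numpy as np
import tensorflow as tf
from autodist import AutoDist
from autodist.strategy.ps_lb_strategy import PSLoadBalancing

autodist = AutoDist('resource_spec.yml', PSLoadBalancing())  # hypothetical spec path

with tf.Graph().as_default(), autodist.scope():
    # Toy data and model purely for illustration.
    x = tf.convert_to_tensor(np.random.rand(64, 4).astype(np.float32))
    y = tf.convert_to_tensor(np.random.randint(0, 2, 64))
    model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
    optimizer = tf.keras.optimizers.SGD(0.01)

    logits = model(x, training=True)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(y, logits)
    grads = optimizer.get_gradients(loss, model.trainable_variables)
    train_op = optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # AutoDist's distributed session takes the place of tf.compat.v1.Session().
    sess = autodist.create_distributed_session()
    for _ in range(10):
        print(sess.run([loss, train_op])[0])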
def main(_):
    assert tf.version.VERSION.startswith('2.')

    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert/'

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = 'True'
    else:
        os.environ['AUTODIST_PATCH_TF'] = 'False'

    resource_spec_file = os.path.join(os.path.dirname(__file__), '../resource_spec.yml')

    if FLAGS.autodist_strategy == 'PS':
        strategy = AutoDist(resource_spec_file, PS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        strategy = AutoDist(resource_spec_file, PSLoadBalancing(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        strategy = AutoDist(resource_spec_file, PartitionedPS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'AllReduce':
        strategy = AutoDist(resource_spec_file, AllReduce(chunk_size=FLAGS.chunk_size))
    elif FLAGS.autodist_strategy == 'Parallax':
        strategy = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=FLAGS.chunk_size, local_proxy_variable=FLAGS.proxy))
    else:
        raise ValueError(
            'The strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')

    strategy.num_replicas_in_sync = strategy._resource_spec.num_gpus
    if strategy:
        print('***** Number of cores used : ', strategy.num_replicas_in_sync)

    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")
    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")
    #########################################################################

    logdir = '/tmp/logs'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    logname = 'bert_strategy_{}_node_{}_gpu_{}_patch_{}_proxy_{}'.format(
        FLAGS.autodist_strategy, node_num, gpu_num, FLAGS.autodist_patch_tf, FLAGS.proxy)

    logging.get_absl_handler().use_absl_log_file(logname, logdir)

    # start running
    run_bert_pretrain(strategy, gpu_num, node_num)
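# Both the BERT example above and the NCF example below read the resource spec only to
# count nodes and GPUs (len(resource_info['nodes']), len(resource_info['nodes'][0]['gpus'])).
# A minimal sketch of a spec that satisfies that parsing follows; only 'nodes' and 'gpus'
# are implied by the code, while 'address' and 'chief' are assumed fields.
import yaml

RESOURCE_SPEC_YAML = """
nodes:
  - address: 127.0.0.1   # assumed field
    gpus: [0, 1]
    chief: true          # assumed field
"""

resource_info = yaml.safe_load(RESOURCE_SPEC_YAML)
node_num = len(resource_info['nodes'])            # -> 1
gpu_num = len(resource_info['nodes'][0]['gpus'])  # -> 2
print(node_num, gpu_num)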
def run_ncf(FLAGS):
    """Run NCF training and eval with Keras."""
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    resource_spec_file = os.path.join(os.path.dirname(__file__), '../resource_spec.yml')
    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")
    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")

    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    local_proxy_variable = bool(FLAGS.proxy)

    if FLAGS.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file, PS(local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(resource_spec_file,
                            PSLoadBalancing(local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(resource_spec_file,
                            PartitionedPS(local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=256))
    elif FLAGS.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=256, local_proxy_variable=local_proxy_variable))
    else:
        raise ValueError(
            'The strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16",
            loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = None

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    with tf.Graph().as_default(), autodist.scope():
        (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = (
            ncf_input_pipeline.create_ncf_input_data(params, producer, input_meta_data, None))
        steps_per_epoch = None if generate_input_online else num_train_steps

        keras_model = _get_keras_model(params)

        if FLAGS.optimizer == 'adam':
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(learning_rate=params["learning_rate"])
        elif FLAGS.optimizer == 'lazyadam':
            optimizer = LazyAdam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        else:
            raise ValueError('Only adam, sgd, and lazyadam optimizers are supported')

        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer,
                loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer,
                tf.keras.mixed_precision.experimental.global_policy().loss_scale)

        return run_ncf_custom_training(
            params,
            autodist,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online,
            return_simulation=FLAGS.simulation_strategy_id is not None)
    os.path.join(os.path.dirname(__file__), 'resource_specs/r2.yml')  # single node with 1 GPU
]
strategies = [
    PS(),
    PartitionedPS(local_proxy_variable=True),
    AllReduce(chunk_size=1, all_reduce_spec='NCCL', compressor='NoneCompressor'),
    AllReduce(chunk_size=1, all_reduce_spec='NCCL', compressor='HorovodCompressor'),
    AllReduce(chunk_size=1, all_reduce_spec='RING', compressor='HorovodCompressorEF'),
    PSLoadBalancing(local_proxy_variable=True),
    Parallax(local_proxy_variable=True),
    PartitionedAR(),
    UnevenPartitionedPS(local_proxy_variable=True),
    RandomAxisPartitionAR(chunk_size=4)
]


@pytest.mark.integration
def test_all():
    combinations = itertools.product(resource_specs, strategies)
    for r, s in combinations:
        for c in cases:

            def run():
                """This wrapper will handle the AutoDist destructor and garbage collections."""
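# The body of run() is truncated above; its docstring says it handles the AutoDist
# destructor and garbage collection between test combinations. A hedged sketch of what
# such a wrapper might look like follows, modeled on the run_test utility at the end of
# this section; the signature and cleanup details are assumptions, not the actual test code.
import gc
import importlib

from autodist import AutoDist


def run_case(r, s, c):
    """Hypothetical wrapper: execute one case, then force AutoDist cleanup."""
    a = AutoDist(resource_spec_file=r, strategy_builder=s)
    try:
        case_module = importlib.import_module("cases." + c)
        case_module.main(a)
    finally:
        # Drop the reference and collect so the AutoDist destructor runs before the
        # next (resource spec, strategy, case) combination starts.
        del a
        gc.collect()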
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    if flags_obj.cnn_model == 'vgg16':
        chunk = 25
    elif flags_obj.cnn_model == 'resnet101':
        chunk = 200
    elif flags_obj.cnn_model == 'inceptionv3':
        chunk = 30
    else:
        chunk = 512

    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file, PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file, PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file, PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'The strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)

    input_fn = imagenet_preprocessing.input_fn
    drop_remainder = flags_obj.enable_xla

    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )

        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError(
                'Only resnet101, vgg16, inceptionv3, and densenet121 are supported')

        optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()

        steps_per_epoch = imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size
        train_epochs = flags_obj.train_epochs

        if flags_obj.enable_checkpoint_and_export:
            ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')

        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1

        num_eval_steps = imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size

        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scc_loss(train_target, train_output)
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))

        #####################################################################
        # Create distributed session.
        #   Instead of using the original TensorFlow session for graph execution,
        #   let's use AutoDist's distributed session, in which a computational
        #   graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        sess = autodist.create_distributed_session()
        #####################################################################

        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()

    return
import importlib

from autodist import AutoDist
from autodist.strategy.all_reduce_strategy import AllReduce
from autodist.strategy.parallax_strategy import Parallax
from autodist.strategy.partitioned_ps_strategy import PartitionedPS
from autodist.strategy.ps_lb_strategy import PSLoadBalancing
from autodist.strategy.ps_strategy import PS
from autodist.strategy.partitioned_all_reduce_strategy import PartitionedAR
from autodist.strategy.uneven_partition_ps_strategy import UnevenPartitionedPS
from autodist.strategy.random_axis_partition_all_reduce_strategy import RandomAxisPartitionAR

STRATEGIES_FOR_DISTRIBUTED_TESTS = {
    'PS': PS(sync=True),
    'PS_stale_3': PS(sync=True, staleness=3),
    'PartitionedPS': PartitionedPS(),
    'PartitionedPS_stale_3': PartitionedPS(staleness=3),
    'AllReduce': AllReduce(chunk_size=1),
    'AllReduce_2': AllReduce(chunk_size=2),
    'Parallax': Parallax(),
    'PSLoadBalancingProxy_stale_3': PSLoadBalancing(local_proxy_variable=True, staleness=3),
    'ParallaxProxy': Parallax(local_proxy_variable=True),
    'PartitionedAR': PartitionedAR(),
    'RandomAxisPartitionAR': RandomAxisPartitionAR(chunk_size=4),
    'UnevenPartitionedPS': UnevenPartitionedPS(local_proxy_variable=True)
}


def run_test(resource, strategy, case):
    print("\n>>>>>>>> Running Test: Case:{}, Strategy:{}, ResourceSpec:{} >>>>>>>>\n".format(
        case, strategy, resource))
    a = AutoDist(resource_spec_file=resource,
                 strategy_builder=STRATEGIES_FOR_DISTRIBUTED_TESTS[strategy])
    c = importlib.import_module("cases." + case)
    c.main(a)
    print('<<<<<<<<<< Test Case Finished. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
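# For completeness, a hedged sketch of how run_test might be driven from the command line;
# the argparse flags and the default case name 'c0' are illustrative, not taken from the
# actual test suite.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--resource', required=True, help='path to a resource spec YAML file')
    parser.add_argument('--strategy', default='PS',
                        choices=sorted(STRATEGIES_FOR_DISTRIBUTED_TESTS))
    parser.add_argument('--case', default='c0', help='module name under the cases/ package')
    args = parser.parse_args()
    run_test(args.resource, args.strategy, args.case)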