Example #1
    def __init__(self,
                 chunk_size=128,
                 local_proxy_variable=False,
                 sync=True,
                 staleness=0):
        PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
        AllReduce.__init__(self, chunk_size)
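This constructor chains a PS load-balancing strategy with chunked all-reduce. A minimal usage sketch, assuming the snippet belongs to a hybrid builder such as Parallax (which the later examples instantiate with the same chunk_size and local_proxy_variable arguments) and that the module path mirrors the imports shown in Example #7:

from autodist import AutoDist
from autodist.strategy.parallax_strategy import Parallax  # assumed module path

# 'resource_spec.yml' is a placeholder path for illustration only.
autodist = AutoDist('resource_spec.yml',
                    Parallax(chunk_size=128, local_proxy_variable=True))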
Example #2
    def __init__(self, resource_spec_file, strategy_builder=None):
        set_default_autodist(self)
        self._resource_spec = ResourceSpec(resource_file=resource_spec_file)
        self._strategy_builder = strategy_builder or PSLoadBalancing()

        self._original_graph_item = None
        self._transformed_graph_item = None
        self._remapper = None
        self._built = None  # Ref to the built GraphDef

        self._cluster: Cluster = SSHCluster(
            self._resource_spec)  # which can also be defined with the strategy
        self._coordinator: Coordinator
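The constructor takes a resource spec file and an optional strategy builder, defaulting to PSLoadBalancing. A minimal end-to-end sketch following the graph-mode pattern shown in Example #6; the resource_spec.yml path and the toy variable are illustrative assumptions:

import os
import tensorflow as tf
from autodist import AutoDist
from autodist.strategy.ps_lb_strategy import PSLoadBalancing

resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
autodist = AutoDist(resource_spec_file, PSLoadBalancing())

with tf.Graph().as_default(), autodist.scope():
    step = tf.Variable(0.0)
    train_op = step.assign_add(1.0)
    # AutoDist's distributed session replaces tf.compat.v1.Session (see Example #6).
    sess = autodist.create_distributed_session()
    print(sess.run(train_op))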
Example #3
File: bert.py  Project: zeta1999/autodist
def main(_):
    assert tf.version.VERSION.startswith('2.')

    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert/'

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = 'True'
    else:
        os.environ['AUTODIST_PATCH_TF'] = 'False'
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../resource_spec.yml')

    if FLAGS.autodist_strategy == 'PS':
        strategy = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        strategy = AutoDist(resource_spec_file,
                            PSLoadBalancing(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        strategy = AutoDist(resource_spec_file,
                            PartitionedPS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'AllReduce':
        strategy = AutoDist(resource_spec_file,
                            AllReduce(chunk_size=FLAGS.chunk_size))
    elif FLAGS.autodist_strategy == 'Parallax':
        strategy = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=FLAGS.chunk_size,
                     local_proxy_variable=FLAGS.proxy))
    else:
        raise ValueError(
            'autodist_strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )

    strategy.num_replicas_in_sync = strategy._resource_spec.num_gpus

    if strategy:
        print('***** Number of GPU replicas in sync: ', strategy.num_replicas_in_sync)

    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")
    #########################################################################

    logdir = '/tmp/logs'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    logname = 'bert_strategy_{}_node_{}_gpu_{}_patch_{}_proxy_{}'.format(
        FLAGS.autodist_strategy, node_num, gpu_num, FLAGS.autodist_patch_tf,
        FLAGS.proxy)

    logging.get_absl_handler().use_absl_log_file(logname, logdir)
    # start running
    run_bert_pretrain(strategy, gpu_num, node_num)
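main() reads several absl flags that bert.py defines elsewhere; a sketch of how they could be declared, with flag names taken from the snippet and defaults that are assumptions only:

from absl import app, flags

flags.DEFINE_string('model_dir', None, 'Directory for checkpoints and logs.')
flags.DEFINE_string('autodist_strategy', 'PS',
                    'One of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax.')
flags.DEFINE_boolean('autodist_patch_tf', True, 'Let AutoDist patch TensorFlow internals.')
flags.DEFINE_boolean('proxy', True, 'Use local proxy variables in PS-style strategies.')
flags.DEFINE_integer('chunk_size', 128, 'Chunk size for AllReduce/Parallax.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)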
Example #4
File: ncf.py  Project: zeta1999/autodist
def run_ncf(FLAGS):
    """Run NCF training and eval with Keras."""

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    resource_spec_file = os.path.join(
        os.path.dirname(__file__),
        '../resource_spec.yml')
    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")

    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    local_proxy_variable = bool(FLAGS.proxy)

    if FLAGS.autodist_strategy == 'PS':
        autodist = AutoDist(
            resource_spec_file, PS(
                local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(resource_spec_file, PSLoadBalancing(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(resource_spec_file, PartitionedPS(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=256))
    elif FLAGS.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(
                chunk_size=256,
                local_proxy_variable=local_proxy_variable))
    else:
        raise ValueError(
            'autodist_strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale=flags_core.get_loss_scale(
                FLAGS, default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = None

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    with tf.Graph().as_default(), autodist.scope():
        (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = (
            ncf_input_pipeline.create_ncf_input_data(params, producer, input_meta_data, None))
        steps_per_epoch = None if generate_input_online else num_train_steps
        keras_model = _get_keras_model(params)
        if FLAGS.optimizer == 'adam':
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(
                learning_rate=params["learning_rate"])
        elif FLAGS.optimizer == 'lazyadam':
            optimizer = LazyAdam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        else:
            raise ValueError('optimizer must be one of adam, sgd, lazyadam')
        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, tf.keras.mixed_precision.experimental.global_policy().loss_scale)

        return run_ncf_custom_training(
            params,
            autodist,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online,
            return_simulation=FLAGS.simulation_strategy_id is not None)
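Both bert.py and ncf.py index resource_info['nodes'] and resource_info['nodes'][0]['gpus'], so the resource spec YAML is expected to list nodes with their GPUs. A minimal sketch of such a spec, parsed the same way; field names other than nodes and gpus are assumptions about the format:

import yaml

resource_spec_text = """
nodes:
  - address: 127.0.0.1   # illustrative address
    gpus: [0, 1]
    chief: true          # assumed field marking the chief node
"""

resource_info = yaml.safe_load(resource_spec_text)
node_num = len(resource_info['nodes'])              # 1
gpu_num = len(resource_info['nodes'][0]['gpus'])    # 2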
Example #5
    os.path.join(os.path.dirname(__file__),
                 'resource_specs/r2.yml')  # single node with 1 GPU
]
strategies = [
    PS(),
    PartitionedPS(local_proxy_variable=True),
    AllReduce(chunk_size=1,
              all_reduce_spec='NCCL',
              compressor='NoneCompressor'),
    AllReduce(chunk_size=1,
              all_reduce_spec='NCCL',
              compressor='HorovodCompressor'),
    AllReduce(chunk_size=1,
              all_reduce_spec='RING',
              compressor='HorovodCompressorEF'),
    PSLoadBalancing(local_proxy_variable=True),
    Parallax(local_proxy_variable=True),
    PartitionedAR(),
    UnevenPartitionedPS(local_proxy_variable=True),
    RandomAxisPartitionAR(chunk_size=4)
]


@pytest.mark.integration
def test_all():
    combinations = itertools.product(resource_specs, strategies)
    for r, s in combinations:
        for c in cases:

            def run():
                """This wrapper will handle the AutoDist destructor and garbage collections."""
Example #6
def run(flags_obj):
    """
    Run ResNet ImageNet training and eval loop using native Keras APIs.
    Raises:
        ValueError: If fp16 is passed as it is not currently supported.
    Returns:
        Dictionary of training and eval stats.
    """

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    if flags_obj.cnn_model == 'vgg16':
        chunk = 25
    elif flags_obj.cnn_model == 'resnet101':
        chunk = 200
    elif flags_obj.cnn_model == 'inceptionv3':
        chunk = 30
    else:
        chunk = 512

    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file,
            PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file,
            PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'autodist_strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )
    #########################################################################

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)

    input_fn = imagenet_preprocessing.input_fn

    drop_remainder = flags_obj.enable_xla

    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )

        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError(
                'cnn_model must be one of resnet101, vgg16, inceptionv3, densenet121')

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                             beta_1=0.9,
                                             beta_2=0.999,
                                             epsilon=1e-08)

        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(
            train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()

        steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                           flags_obj.batch_size)
        train_epochs = flags_obj.train_epochs

        if flags_obj.enable_checkpoint_and_export:
            ckpt_full_path = os.path.join(flags_obj.model_dir,
                                          'model.ckpt-{epoch:04d}')

        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1

        num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                          flags_obj.batch_size)

        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()

        loss = scc_loss(train_target, train_output)
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))

        #####################################################################
        # Create distributed session.
        #   Instead of using the original TensorFlow session for graph execution,
        #   let's use AutoDist's distributed session, in which a computational
        #   graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        sess = autodist.create_distributed_session()
        #####################################################################

        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()

    return
Example #7
import importlib

from autodist import AutoDist
from autodist.strategy.partitioned_ps_strategy import PartitionedPS
from autodist.strategy.ps_lb_strategy import PSLoadBalancing
from autodist.strategy.ps_strategy import PS
from autodist.strategy.partitioned_all_reduce_strategy import PartitionedAR
from autodist.strategy.uneven_partition_ps_strategy import UnevenPartitionedPS
from autodist.strategy.random_axis_partition_all_reduce_strategy import RandomAxisPartitionAR
# AllReduce and Parallax are used below; module paths assumed to follow the same pattern.
from autodist.strategy.all_reduce_strategy import AllReduce
from autodist.strategy.parallax_strategy import Parallax

STRATEGIES_FOR_DISTRIBUTED_TESTS = {
    'PS': PS(sync=True),
    'PS_stale_3': PS(sync=True, staleness=3),
    'PartitionedPS': PartitionedPS(),
    'PartitionedPS_stale_3': PartitionedPS(staleness=3),
    'AllReduce': AllReduce(chunk_size=1),
    'AllReduce_2': AllReduce(chunk_size=2),
    'Parallax': Parallax(),
    'PSLoadBalancingProxy_stale_3': PSLoadBalancing(local_proxy_variable=True, staleness=3),
    'ParallaxProxy': Parallax(local_proxy_variable=True),
    'PartitionedAR': PartitionedAR(),
    'RandomAxisPartitionAR': RandomAxisPartitionAR(chunk_size=4),
    'UnevenPartitionedPS': UnevenPartitionedPS(local_proxy_variable=True)
}


def run_test(resource, strategy, case):
    print("\n>>>>>>>> Running Test: Case:{}, Strategy:{}, ResourceSpec:{} >>>>>>>>\n".format(case, strategy, resource))
    a = AutoDist(resource_spec_file=resource, strategy_builder=STRATEGIES_FOR_DISTRIBUTED_TESTS[strategy])
    c = importlib.import_module("cases." + case)
    c.main(a)
    print('<<<<<<<<<< Test Case Finished. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
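A sketch of driving run_test over resource specs, strategies, and test cases, mirroring the combination loop in Example #5; the spec path and case module name are illustrative assumptions:

import itertools

resource_specs = ['resource_specs/r2.yml']   # path taken from Example #5
cases = ['c0']                               # hypothetical module under the cases/ package

for resource, strategy_name in itertools.product(resource_specs, STRATEGIES_FOR_DISTRIBUTED_TESTS):
    for case in cases:
        run_test(resource, strategy_name, case)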