Example #1
def main(_):
    autodist = AutoDist(resource_spec_file, AllReduce(128))

    TRUE_W = 3.0
    TRUE_b = 2.0
    NUM_EXAMPLES = 1000
    EPOCHS = 10

    inputs = np.random.randn(NUM_EXAMPLES)
    noises = np.random.randn(NUM_EXAMPLES)
    outputs = inputs * TRUE_W + TRUE_b + noises

    class MyIterator:
        def initialize(self):
            return tf.zeros(1)

        def get_next(self):
            # a fake iterator: it just returns the whole dataset every time
            return inputs

    inputs_iterator = MyIterator()
    print('I am going to a scope.')
    with tf.Graph().as_default() as g, autodist.scope():
        # x = placeholder(shape=[NUM_EXAMPLES], dtype=tf.float32)

        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def train_step(batch):
            def y(x):
                return W * x + b

            def l(predicted_y, desired_y):
                return tf.reduce_mean(tf.square(predicted_y - desired_y))

            major_version, _, _ = tf.version.VERSION.split('.')
            if major_version == '1':
                optimizer = tf.train.GradientDescentOptimizer(0.01)
            else:
                optimizer = tf.optimizers.SGD(0.01)

            with tf.GradientTape() as tape:
                loss = l(y(batch), outputs)
                vs = [W, b]

                # gradients = tape.gradient(target=loss, sources=vs)
                gradients = tf.gradients(loss, vs)

                train_op = optimizer.apply_gradients(zip(gradients, vs))
            return loss, train_op, b

        fetches = train_step(inputs_iterator.get_next())
        session = autodist.create_distributed_session()
        for epoch in range(EPOCHS):
            loss_val, _, b_val = session.run(fetches)
            print('node: {}, loss: {}\nb: {}'.format(
                autodist._cluster.get_local_address(), loss_val, b_val))

    print('I am out of scope')
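
Every example on this page builds AutoDist from a resource spec file. For context, here is a minimal sketch that writes a single-node resource_spec.yml from Python; the address and GPU indices are placeholder assumptions, not values taken from any example above.

import os

# A minimal resource_spec.yml sketch; 'localhost' and the GPU indices are
# placeholders to adapt to your own cluster.
SPEC = """nodes:
  - address: localhost
    gpus: [0, 1]
"""

if not os.path.exists('resource_spec.yml'):
    with open('resource_spec.yml', 'w') as f:
        f.write(SPEC)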
Example #2
def train_main(args):
    autodist = AutoDist(resource_spec_file, Parallaxx())
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
        sparse_feature = tf.compat.v1.placeholder(tf.int32,
                                                  [graph_len, meta["feature"] - 1])
        y_ = tf.compat.v1.placeholder(tf.int32, [graph_len], name="y_")
        train_mask = tf.compat.v1.placeholder(tf.float32, [graph_len],
                                              name="train_mask")
        loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        sess = autodist.create_distributed_session()

        acc_stat = []
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            print("Before training")
            loss_val = sess.run([loss, y, y_, train_op], feed_dict=feed_dict)
            print(loss_val)
            pred_val = loss_val[1]
            true_val = loss_val[2]
            acc_val = np.equal(np.argmax(pred_val, 1),
                               true_val).astype(np.float32)
            acc_stat.append(acc_val)
            nnodes += mask.sum() + mask_eval.sum()
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Acc : ", np.mean(acc_stat), "Time : ",
                      time.time() - start)
                start = time.time()
                acc_stat = []
                if epoch >= num_epoch:
                    break
Example #3
def train_and_save():
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      'resource_spec.yml')
    autodist = AutoDist(resource_spec_file, PartitionedPS())
    (train_data,
     train_labels), (test_data,
                     test_labels) = tf.keras.datasets.imdb.load_data(
                         num_words=vocab_size)
    train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data,
                                                               value=0,
                                                               padding='post',
                                                               maxlen=256)
    test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data,
                                                              value=0,
                                                              padding='post',
                                                              maxlen=256)
    train_labels = train_labels.astype(np.float32)
    with tf.Graph().as_default(), autodist.scope():  # AutoDist code
        my_iterator = tf.compat.v1.data.Dataset.from_tensor_slices((train_data, train_labels)) \
            .shuffle(25000).batch(batch_size).repeat().make_one_shot_iterator().get_next()
        # my_iterator = MyIterator().get_next()
        model = SimpleModel()
        prev_time = time.time()
        # fetch train_op and loss
        loss_fn, train_op, gradients = model.train_fn(my_iterator)

        saver = autodist_saver()

        sess = autodist.create_distributed_session()
        for local_step in range(max_steps):
            loss, _ = sess.run(fetches=[loss_fn, train_op],
                               options=config_pb2.RunOptions(
                                   trace_level=config_pb2.RunOptions.NO_TRACE))
            if local_step % log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_sentences = batch_size * log_frequency
                wps = float(num_sentences) / elapsed_time
                print(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                    % (local_step, elapsed_time, wps, loss))
                prev_time = cur_time

        saver.save(sess, checkpoint_dir + "sentiment", global_step=local_step)
        print(sess.run(model.emb))
        print(sess.run(gradients[0]))
    print('ending...')
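
A checkpoint written this way can be restored later without AutoDist. A minimal sketch, assuming the saver exported the default meta graph next to the checkpoint files (checkpoint_dir is the snippet's constant):

import tensorflow as tf

# Load the latest checkpoint saved by autodist_saver() above; the wrapped
# saver is assumed to have written the usual .meta file alongside it.
ckpt = tf.train.latest_checkpoint(checkpoint_dir)
with tf.Graph().as_default():
    restorer = tf.compat.v1.train.import_meta_graph(ckpt + '.meta')
    with tf.compat.v1.Session() as sess:
        restorer.restore(sess, ckpt)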
Example #4
def run():
    """This wrapper handles the AutoDist destructor and garbage collection."""
    try:
        # TensorFlow also uses atexit, but running its exit functions causes some issues.
        atexit._clear()
        a = AutoDist(resource_spec_file=r,
                     strategy_builder=s)  # Fixtures in the future
        c.main(a)
    finally:
        atexit._run_exitfuncs()
Example #5
def train_criteo(model, args):
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings',
                                      'plx_dist_spec.yml')
    # resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings', 'plx_local_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)
    if args.all:
        from models.load_data import process_all_criteo_data
        dense, sparse, all_labels = process_all_criteo_data()
        dense_feature, val_dense = dense
        sparse_feature, val_sparse = sparse
        labels, val_labels = all_labels
    else:
        from models.load_data import process_sampled_criteo_data
        dense_feature, sparse_feature, labels = process_sampled_criteo_data()

    # autodist will split the feeding data
    batch_size = 128
    with tf.Graph().as_default() as g, autodist.scope():
        dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
        sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
        embed_partitioner = tf.compat.v1.fixed_size_partitioner(len(
            respec.nodes), 0) if len(respec.nodes) > 1 else None
        loss, y, opt = model(dense_input, sparse_input, y_, embed_partitioner)
        train_op = opt.minimize(loss)

        sess = autodist.create_distributed_session()

        my_feed_dict = {
            dense_input: np.empty(shape=(batch_size, 13)),
            sparse_input: np.empty(shape=(batch_size, 26)),
            y_: np.empty(shape=(batch_size, 1)),
        }

        if args.all:
            raw_log_file = os.path.join(
                os.path.split(os.path.abspath(__file__))[0], 'logs',
                'tf_plx_%s.log' % (args.model))
            print('Processing all data, log to', raw_log_file)
            log_file = open(raw_log_file, 'w')
            iterations = dense_feature.shape[0] // batch_size
            total_epoch = 32
            start_index = 0
            for ep in range(total_epoch):
                # print("iters: %d" % (lp * 1000))
                print("epoch %d" % ep)
                st_time = time.time()
                train_loss, train_acc, train_auc = [], [], []
                for it in tqdm(
                        range(iterations // 10 + (ep % 10 == 9) *
                              (iterations % 10))):
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index +
                                                 batch_size]
                    start_index += batch_size
                    if start_index + batch_size > dense_feature.shape[0]:
                        start_index = 0
                    loss_val = sess.run([loss, y, y_, train_op],
                                        feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    acc_val = np.equal(true_val, pred_val > 0.5)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                    train_auc.append(metrics.roc_auc_score(true_val, pred_val))
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                tra_auc = np.mean(train_auc)
                en_time = time.time()
                train_time = en_time - st_time
                printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                            % (tra_loss, tra_accuracy, tra_auc, train_time)
                print(printstr)
                log_file.write(printstr + '\n')
                log_file.flush()

        else:
            iteration = dense_feature.shape[0] // batch_size

            epoch = 50
            for ep in range(epoch):
                print('epoch', ep)
                if ep == 5:
                    start = time.time()
                ep_st = time.time()
                train_loss = []
                train_acc = []
                for idx in range(iteration):
                    start_index = idx * batch_size
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index +
                                                 batch_size]

                    loss_val = sess.run([loss, y, y_, train_op],
                                        feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    if pred_val.shape[1] == 1:  # for criteo case
                        acc_val = np.equal(true_val, pred_val > 0.5)
                    else:
                        acc_val = np.equal(np.argmax(pred_val, 1),
                                           np.argmax(true_val,
                                                     1)).astype(np.float32)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                ep_en = time.time()
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" %
                      (tra_loss, tra_accuracy, ep_en - ep_st))
            print('all time:', (time.time() - start))
Example #6
import os
import sys
import numpy as np
import tensorflow as tf
from autodist.strategy import PS

############################################################
# Step 1: Construct AutoDist with ResourceSpec
from autodist import AutoDist
autodist = AutoDist(
    resource_spec_file='resource_spec.yml', 
    strategy_builder=PS(local_proxy_variable=False, sync=True, staleness=3)
)
############################################################

fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
train_images = train_images[:512, :, :, None]
test_images = test_images[:512, :, :, None]
train_labels = train_labels[:512]
test_labels = test_labels[:512]
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

BATCH_SIZE = 64
EPOCHS = 1

#############################################################
# Step 2: Build with Graph mode, and put it under AutoDist scope
with tf.Graph().as_default(), autodist.scope():
    #############################################################
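    # The source snippet is cut off here. A minimal sketch of how the body
    # typically continues, modeled on Example #15 below (the dataset, model,
    # and training-loop details are assumptions, not part of the original):
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_images, train_labels)).repeat(EPOCHS).shuffle(
            len(train_images)).batch(BATCH_SIZE)
    images, labels = tf.compat.v1.data.make_one_shot_iterator(
        train_dataset).get_next()

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu',
                               input_shape=(28, 28, 1)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10)
    ])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    logits = model(images, training=True)
    loss = loss_fn(labels, logits)
    optimizer = tf.optimizers.SGD(0.01)
    grads = tf.gradients(loss, model.trainable_variables)
    train_op = optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Step 3: Run with AutoDist's distributed session instead of tf.compat.v1.Session.
    sess = autodist.create_distributed_session()
    for step in range(len(train_images) // BATCH_SIZE):
        loss_val, _ = sess.run([loss, train_op])
        print('step: {}, train_loss: {}'.format(step, loss_val))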
Example #7
import os
import time
import tensorflow as tf
import numpy as np
from absl import app
from tensorflow.core.protobuf import config_pb2

from autodist import AutoDist
from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax

resource_spec_file = os.path.join(os.path.dirname(__file__),
                                  'resource_spec.yml')
autodist = AutoDist(resource_spec_file, PartitionedPS())

vocab_size = 10000
embedding_size = 16
hidden_dim = 16
max_steps = 101
batch_size = 128
log_frequency = 100


class SimpleModel:
    def __init__(self):
        self.emb = tf.Variable(tf.random.uniform([vocab_size, embedding_size]),
                               name='emb',
                               trainable=True,
                               dtype=tf.float32)
        self.w1 = tf.Variable(tf.random.uniform([embedding_size, hidden_dim]),
                              name='w1',
                              trainable=True,
                              dtype=tf.float32)  # assumed completion; the source snippet is cut off here
Example #8
def train_and_save():
    """ Train the model and save the serialized model and its weights. """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    print('I am going to a scope.')
    with tf.Graph().as_default() as g, autodist.scope():
        x = tf.compat.v1.placeholder(shape=[NUM_EXAMPLES], dtype=tf.float64)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def y():
            return W * x + b

        def l(predicted_y, desired_y):
            return tf.reduce_mean(tf.square(predicted_y - desired_y))

        major_version, _, _ = tf.version.VERSION.split('.')
        if major_version == '1':
            optimizer = tf.train.GradientDescentOptimizer(0.01)
        else:
            optimizer = tf.optimizers.SGD(0.01)

        with tf.GradientTape() as tape:
            prediction = y()
            loss = l(prediction, outputs)
            vs = [W, b]
            gradients = tf.gradients(loss, vs)
            train_op = optimizer.apply_gradients(zip(gradients, vs))

        ops.add_to_collection(TRAIN_OP_KEY, train_op)
        fetches = [loss, train_op, b, prediction]
        feeds = [x]

        # NOTE: The AutoDist saver should be declared before the wrapped session.
        saver = autodist_saver()
        session = autodist.create_distributed_session()
        for _ in range(EPOCHS):
            loss_val, _, b_val, _ = session.run(fetches, feed_dict={feeds[0]: inputs})
            print('node: {}, loss: {}\nb: {}'.format(autodist._cluster.get_local_address(), loss_val, b_val))
        print('I am out of scope')

        inputs_info = {
            "input_data":
                saved_model.utils.build_tensor_info(feeds[0])
        }
        outputs_info = {
            "loss": saved_model.utils.build_tensor_info(fetches[0]),
            "prediction": saved_model.utils.build_tensor_info(fetches[3])
        }
        serving_signature = saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=outputs_info,
            method_name=saved_model.signature_constants.PREDICT_METHOD_NAME
        )
        signature_map = {
            saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                serving_signature,
        }
        if os.path.exists(EXPORT_DIR):
            shutil.rmtree(EXPORT_DIR)
        builder = SavedModelBuilder(EXPORT_DIR)
        builder.add_meta_graph_and_variables(
            sess=session,
            tags=[TAG_NAME],
            saver=saver,
            signature_def_map=signature_map)
        builder.save()
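
Once train_and_save() returns, the exported model can be served with plain TensorFlow; AutoDist is not needed at inference time. A minimal reload sketch, assuming the TAG_NAME, EXPORT_DIR, and signature keys defined above:

import tensorflow as tf

with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    # Load the meta graph tagged at export time and look up the serving signature.
    meta_graph = tf.compat.v1.saved_model.loader.load(sess, [TAG_NAME], EXPORT_DIR)
    signature = meta_graph.signature_def[
        tf.compat.v1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    input_name = signature.inputs['input_data'].name
    prediction_name = signature.outputs['prediction'].name
    # 'inputs' refers to the same module-level data fed during training.
    predictions = sess.run(prediction_name, feed_dict={input_name: inputs})
    print(predictions)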
Example #9
def run(flags_obj):
    """
    Run ResNet ImageNet training and eval loop using native Keras APIs.

    Raises:
        ValueError: If fp16 is passed, as it is not currently supported.
    """

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    if flags_obj.cnn_model == 'vgg16':
        chunk = 25
    elif flags_obj.cnn_model == 'resnet101':
        chunk = 200
    elif flags_obj.cnn_model == 'inceptionv3':
        chunk = 30
    else:
        chunk = 512

    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file,
            PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file,
            PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'the strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )
    #########################################################################

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)

    input_fn = imagenet_preprocessing.input_fn

    drop_remainder = flags_obj.enable_xla

    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )

        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError(
                'Unsupported cnn_model; choose from resnet101, vgg16, inceptionv3, densenet121')

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                             beta_1=0.9,
                                             beta_2=0.999,
                                             epsilon=1e-08)

        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(
            train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()

        steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                           flags_obj.batch_size)
        train_epochs = flags_obj.train_epochs

        if flags_obj.enable_checkpoint_and_export:
            ckpt_full_path = os.path.join(flags_obj.model_dir,
                                          'model.ckpt-{epoch:04d}')

        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1

        num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                          flags_obj.batch_size)

        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()

        loss = scc_loss(train_target, train_output)
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))

        #####################################################################
        # Create distributed session.
        #   Instead of using the original TensorFlow session for graph execution,
        #   let's use AutoDist's distributed session, in which a computational
        #   graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        sess = autodist.create_distributed_session()
        #####################################################################

        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()

    return
Example #10
def main(_):
    assert tf.version.VERSION.startswith('2.')

    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert/'

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = 'True'
    else:
        os.environ['AUTODIST_PATCH_TF'] = 'False'
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../resource_spec.yml')

    if FLAGS.autodist_strategy == 'PS':
        strategy = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        strategy = AutoDist(resource_spec_file,
                            PSLoadBalancing(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        strategy = AutoDist(resource_spec_file,
                            PartitionedPS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'AllReduce':
        strategy = AutoDist(resource_spec_file,
                            AllReduce(chunk_size=FLAGS.chunk_size))
    elif FLAGS.autodist_strategy == 'Parallax':
        strategy = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=FLAGS.chunk_size,
                     local_proxy_variable=FLAGS.proxy))
    else:
        raise ValueError(
            'the strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )

    strategy.num_replicas_in_sync = strategy._resource_spec.num_gpus

    print('***** Number of GPU replicas in sync:', strategy.num_replicas_in_sync)

    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")
    #########################################################################

    logdir = '/tmp/logs'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    logname = 'bert_strategy_{}_node_{}_gpu_{}_patch_{}_proxy_{}'.format(
        FLAGS.autodist_strategy, node_num, gpu_num, FLAGS.autodist_patch_tf,
        FLAGS.proxy)

    logging.get_absl_handler().use_absl_log_file(logname, logdir)
    # start running
    run_bert_pretrain(strategy, gpu_num, node_num)
Example #11
def train_main(args):
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../../examples/ctr/settings',
                                      'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    worker_device = "gpu:0"
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        with tf.device(worker_device):
            norm_adj = tf.compat.v1.sparse.placeholder(tf.float32,
                                                       name="norm_adj")
            sparse_feature = tf.compat.v1.placeholder(
                tf.int32, [graph_len, meta["feature"] - 1])
            y_ = tf.compat.v1.placeholder(tf.int32, [graph_len], name="y_")
            train_mask = tf.compat.v1.placeholder(tf.float32, [graph_len],
                                                  name="train_mask")
        loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        # init=tf.global_variables_initializer()
        # gpu_options = tf.GPUOptions(allow_growth=True)
        # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess = autodist.create_distributed_session()
        # sess.run(init)
        acc_cnt, total_cnt = 0, 0
        train_acc, train_cnt = 0, 0
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            loss_val = sess.run([loss, y, train_op], feed_dict=feed_dict)
            pred_val = loss_val[1]
            acc_val = np.equal(np.argmax(pred_val, 1),
                               g_sample.y).astype(np.float32)
            acc_cnt += (acc_val * mask_eval).sum()
            total_cnt += mask_eval.sum()
            nnodes += mask.sum() + mask_eval.sum()
            train_acc += (acc_val * mask).sum()
            train_cnt += mask.sum()
            if nnodes > meta["partition"]["nodes"][rank] // 10:
                nnodes = 0
                epoch += 1
                print("Acc : ", acc_cnt / total_cnt, train_acc / train_cnt,
                      "Time : ",
                      time.time() - start)
                # print(pred_val)
                start = time.time()
                acc_cnt, total_cnt = 0, 0
                train_acc, train_cnt = 0, 0
                if epoch >= num_epoch:
                    break
Example #12
import language_model
from autodist import AutoDist
from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax

FLAGS = flags.FLAGS
flags.DEFINE_string("logdir", "/tmp/lm1b", "Logging directory.")
flags.DEFINE_string("datadir", "/tmp/dataset/lm1b", "Data directory.")
flags.DEFINE_integer("eval_steps", 70, "Number of eval steps.")
flags.DEFINE_integer('max_steps', 1000000,
                     """Number of iterations to run for each worker.""")
flags.DEFINE_integer('log_frequency', 100,
                     """How many steps between two run-op logs.""")

resource_spec_file = os.path.join(os.path.dirname(__file__),
                                  '../resource_spec.yml')
autodist = AutoDist(resource_spec_file, PS())


def gen_lm1b_train_dataset(file_pattern, num_step):
    """
    Returns: The training dataset (tf.data.Dataset) that has been repeated
    and shuffled
    """
    file_names = []
    for file_name in glob.glob(file_pattern):
        file_names.append(file_name)
    if not file_names:
        raise ValueError('no files match pattern: %s' % file_pattern)
    # create dataset ops
    BUFFER_SIZE = 100000
Example #13
def main():
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../ctr/settings', 'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)

    def validate():
        # validate phase
        hits, ndcgs = [], []
        for idx in range(num_users):
            start_index = idx * 100
            my_feed_dict = {
                user_input: testUserInput[start_index:start_index + 100],
                item_input: testItemInput[start_index:start_index + 100],
            }
            predictions = sess.run([y], feed_dict=my_feed_dict)
            map_item_score = {
                testItemInput[start_index + i]: predictions[0][i]
                for i in range(100)
            }

            # Evaluate top rank list
            ranklist = heapq.nlargest(topK,
                                      map_item_score,
                                      key=map_item_score.get)
            hr = getHitRatio(ranklist, testItemInput[start_index])
            ndcg = getNDCG(ranklist, testItemInput[start_index])
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    from movielens import getdata
    trainData, testData = getdata('ml-25m', 'datasets')
    testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32),
                              100)
    testItemInput = testData.reshape((-1, ))
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    batch_size = 1024
    num_negatives = 4
    topK = 10
    with tf.Graph().as_default() as g, autodist.scope():
        user_input = tf.compat.v1.placeholder(tf.int32, [None])
        item_input = tf.compat.v1.placeholder(tf.int32, [None])
        y_ = tf.compat.v1.placeholder(tf.float32, [None])

        loss, y, opt = neural_mf(user_input, item_input, y_, num_users,
                                 num_items)
        train_op = opt.minimize(loss)

        # init = tf.compat.v1.global_variables_initializer()
        # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        sess = autodist.create_distributed_session()
        # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
        # sess.run(init)

        log = Logging(
            path=os.path.join(os.path.dirname(__file__), 'logs', 'tfplx.txt'))
        epoch = 7
        iterations = trainData['user_input'].shape[0] // batch_size
        start = time.time()
        for ep in range(epoch):
            ep_st = time.time()
            log.write('epoch %d' % ep)
            train_loss = []
            for idx in range(iterations):
                start_index = idx * batch_size
                my_feed_dict = {
                    user_input:
                    trainData['user_input'][start_index:start_index +
                                            batch_size],
                    item_input:
                    trainData['item_input'][start_index:start_index +
                                            batch_size],
                    y_:
                    trainData['labels'][start_index:start_index + batch_size],
                }

                loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
                train_loss.append(loss_val[0])

                # if idx % 10000 == 0:
                #     hr, ndcg = validate()
                #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
                #     log.write(printstr)

            tra_loss = np.mean(train_loss)
            ep_en = time.time()

            # validate phase
            # hr, ndcg = validate()
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss,
                                                               ep_en - ep_st)
            log.write(printstr)
        log.write('all time: %.4f' % (time.time() - start))
Example #14
import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np
import os

from tensorflow.python import debug as tf_debug
from tensorflow.python.ops import math_ops
from autodist import AutoDist
from autodist.strategy import PS

resource_spec_file = os.path.join(os.path.dirname(__file__),
                                  'resource_spec.yml')
autodist = AutoDist(resource_spec_file, PS())

tf.reset_default_graph()

# network parameters
n_input = 2000  # input size for a single sample (2000 words), train3000, test1000

# hyperparameters
batch_size = 256
eta = 0.001  # learning rate
max_epoch = 27

# 1. get data (using the same dataset as the keras example)
(train_data, train_labels), (test_data,
                             test_labels) = tf.keras.datasets.imdb.load_data(
                                 num_words=n_input)

Example #15
import os
import numpy as np
import tensorflow as tf

############################################################
# Step 1: Construct AutoDist with ResourceSpec
from autodist import AutoDist
filepath = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
autodist = AutoDist(resource_spec_file=filepath)
############################################################

fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images,
                               test_labels) = fashion_mnist.load_data()
train_images = train_images[:512, :, :, None]
test_images = test_images[:512, :, :, None]
train_labels = train_labels[:512]
test_labels = test_labels[:512]
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

BATCH_SIZE = 64
EPOCHS = 1

#############################################################
# Step 2: Build with Graph mode, and put it under AutoDist scope
with tf.Graph().as_default(), autodist.scope():
    #############################################################

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_images, train_labels)).repeat(EPOCHS).shuffle(
            len(train_images)).batch(BATCH_SIZE)  # assumed completion; the snippet is cut off here
Example #16
def test_single_process(tmp_resource_spec):
    from autodist import AutoDist
    AutoDist(resource_spec_file=tmp_resource_spec)
    with pytest.raises(NotImplementedError):
        AutoDist(resource_spec_file=tmp_resource_spec)
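
AutoDist is a per-process singleton: the first construction succeeds, and a second construction in the same process raises NotImplementedError, which is exactly what this test asserts.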
Example #17
def run_ncf(FLAGS):
    """Run NCF training and eval with Keras."""

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    resource_spec_file = os.path.join(
        os.path.dirname(__file__),
        '../resource_spec.yml')
    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    try:
        node_num = len(resource_info['nodes'])
    except KeyError:
        print("nodes need to be set in the specification file")

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except KeyError:
        print("gpus need to be set in the specification file")

    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    local_proxy_variable = bool(FLAGS.proxy)

    if FLAGS.autodist_strategy == 'PS':
        autodist = AutoDist(
            resource_spec_file, PS(
                local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(resource_spec_file, PSLoadBalancing(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(resource_spec_file, PartitionedPS(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=256))
    elif FLAGS.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(
                chunk_size=256,
                local_proxy_variable=local_proxy_variable))
    else:
        raise ValueError(
            'the strategy must be one of PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale=flags_core.get_loss_scale(
                FLAGS, default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = None

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    with tf.Graph().as_default(), autodist.scope():
        (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = (
            ncf_input_pipeline.create_ncf_input_data(params, producer, input_meta_data, None))
        steps_per_epoch = None if generate_input_online else num_train_steps
        keras_model = _get_keras_model(params)
        if FLAGS.optimizer == 'adam':
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(
                learning_rate=params["learning_rate"])
        elif FLAGS.optimizer == 'lazyadam':
            optimizer = LazyAdam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        else:
            raise ValueError('Unsupported optimizer: %s' % FLAGS.optimizer)
        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, tf.keras.mixed_precision.experimental.global_policy().loss_scale)

        return run_ncf_custom_training(
            params,
            autodist,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online,
            return_simulation=FLAGS.simulation_strategy_id is not None)
Example #18
import tensorflow.compat.v1 as tf

from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tensorflow.keras.datasets import imdb

import numpy as np
from autodist import AutoDist
from autodist.strategy import PS

autodist = AutoDist(resource_spec_file='resource_spec.yml',
                    strategy_builder=PS(local_proxy_variable=False,
                                        sync=True,
                                        staleness=1)
                    #strategy_builder=PS()
                    )

d = autodist

# network parameters
n_input = 800  # input size for a single sample (800 words)

# hyperparameters
batch_size = 128
eta = 0.001  # learning rate
max_epoch = 20

# 1. get data
(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=800)
Example #19
def run_test(resource, strategy, case):
    print("\n>>>>>>>> Running Test: Case:{}, Strategy:{}, ResourceSpec:{} >>>>>>>>\n".format(case, strategy, resource))
    a = AutoDist(resource_spec_file=resource, strategy_builder=STRATEGIES_FOR_DISTRIBUTED_TESTS[strategy])
    c = importlib.import_module("cases." + case)
    c.main(a)
    print('<<<<<<<<<< Test Case Finished. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
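
Both this harness and Example #4 assume each test case is a module exposing a main(autodist) entry point. A minimal sketch of such a case module (the module name and model are hypothetical):

# cases/linear_regression.py (hypothetical case module)
import tensorflow as tf


def main(autodist):
    with tf.Graph().as_default(), autodist.scope():
        w = tf.Variable(1.0, name='w')
        loss = tf.square(3.0 - w)
        train_op = tf.compat.v1.train.GradientDescentOptimizer(0.1).minimize(loss)
        sess = autodist.create_distributed_session()
        for _ in range(10):
            _, loss_val = sess.run([train_op, loss])
        print('final loss:', loss_val)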