Example #1
def get_tpu_addr(tpu_name=None):
    # Get the TPU's location.
    if tpu_name is not None:
        return TPUClusterResolver(tpu_name).get_master()
    if 'COLAB_TPU_ADDR' in os.environ:
        return TPUClusterResolver().get_master()
    elif 'TPU_NAME' in os.environ:
        return TPUClusterResolver(os.environ['TPU_NAME']).get_master()
Example #2
File: tflex.py Project: mylesdc/gpt2-hdf5
def get_tpu_addr(tpu_name=None):
    """Get the TPU's location."""
    if tpu_name is not None:
        return TPUClusterResolver(tpu_name).get_master()
    if "COLAB_TPU_ADDR" in os.environ:
        return TPUClusterResolver().get_master()
    elif "TPU_NAME" in os.environ:
        return TPUClusterResolver(os.environ["TPU_NAME"]).get_master()
Example #3
def q1():
    global l_returnflag_group_size
    global l_linestatus_group_size
    returnflag_groups = np.unique(l_returnflag)
    linestatus_groups = np.unique(l_linestatus)
    l_returnflag_group_size = len(returnflag_groups)
    l_linestatus_group_size = len(linestatus_groups)
    inputs = [
        tf.convert_to_tensor(l_shipdate, np.float32),
        tf.convert_to_tensor(l_returnflag, np.float32),
        tf.convert_to_tensor(l_linestatus, np.float32),
        tf.convert_to_tensor(l_quantity, np.float32),
        tf.convert_to_tensor(l_extendedprice, np.float32),
        tf.convert_to_tensor(l_discount, np.float32),
        tf.convert_to_tensor(l_tax, np.float32),
        tf.convert_to_tensor(returnflag_groups, np.float32),
        tf.convert_to_tensor(linestatus_groups, np.float32)
    ]
    tpu_computation = tpu.rewrite(q1_computation, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()
    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())
        sess.run(tf.global_variables_initializer())
        for i in range(0, 5):
            res = sess.run(tpu_computation)
        sess.run(tpu.shutdown_system())
        print(res)
        return res
Example #4
def run(size):
    a_ = []
    b_ = []
    c_ = []
    for i in range(size):
        a_.append((i * 1.0 + 4.0) * 2.5)
        b_.append((i * 1.0 + 5.0) * 2.5)
        c_.append((i * 1.0 + 6.0) * 0.1)

    inputs = [tf.constant(a_), tf.constant(b_), tf.constant(c_)]

    tpu_computation = tpu.rewrite(expression, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()

    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())
        t1 = time()
        sess.run(tf.global_variables_initializer())
        sess.run(tpu_computation)
        t2 = time()
        print(str(size) + " : " + str(t2 - t1))
        sess.run(tpu.shutdown_system())

    print('Done !')
Example #5
def main(args):
    # unpack the tensor batch to be used as the list of inputs of the TPU function
    dataset = train_input_fn()
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()

    # mark part of the graph to be run on the TPUs
    global_step_tensor, loss_tensor = tf.contrib.tpu.rewrite(tpu_computation, [features, labels])

    # utility ops
    tpu_init = tf.contrib.tpu.initialize_system()
    tpu_shutdown = tf.contrib.tpu.shutdown_system()
    variables_init = tf.global_variables_initializer()

    saver = tf.train.Saver()

    # get the TPU resource's grpc url
    # Note: when running on CMLE, args.tpu should be left as None
    tpu_grpc_url = TPUClusterResolver(tpu=args.tpu).get_master()
    sess = tf.Session(tpu_grpc_url)

    sess.run(tpu_init)
    sess.run(variables_init)

    for i in range(args.max_steps):
        # the tensor values in the TPU function are returned in a list, and the operations in the TPU function are called with no return value
        global_step, loss = sess.run([global_step_tensor, loss_tensor])

        if i % args.save_checkpoints_steps == 0:
            saver.save(sess, os.path.join(args.model_dir, 'model.ckpt'), global_step=global_step)

            tf.logging.info('global_step: {}, loss: {}'.format(global_step, loss))

    sess.run(tpu_shutdown)
Example #6
def create_labels(input_tfrecord_path,
                  output_tfrecord_path,
                  dataset_preprocess_fn,
                  embedding_fn,
                  label_fn,
                  write_fn=None,
                  batch_size=64,
                  parallel_calls=1):
    """Creates a new set of labels for a single chunk.

  Args:
    input_tfrecord_path: String with input TF Record file.
    output_tfrecord_path: String with output TF Record file.
    dataset_preprocess_fn: Preprocessing function applied to dataset.
    embedding_fn: Embedding function applied to the dataset tensor.
    label_fn: Label function applied to the embedding output (after sess.run).
    write_fn: Function to write TF Record to TF Record writer.
    batch_size: Optional integer with batch_size.
    parallel_calls: Optional integer with the number of parallel calls used
      when mapping the preprocess function over the dataset.
  """
    tf.logging.info("Input: {}\nOutput: {}".format(input_tfrecord_path,
                                                   output_tfrecord_path))
    if write_fn is None:
        write_fn = write_imagenet

    if FLAGS.tpu_name:
        cluster = TPUClusterResolver(tpu=[FLAGS.tpu_name])
    else:
        cluster = None
    config = tf.contrib.tpu.RunConfig(cluster=cluster)

    # Load the data in the chunk.
    input_dataset = tf.data.TFRecordDataset(input_tfrecord_path)
    input_dataset = input_dataset.map(dataset_preprocess_fn, parallel_calls)
    input_dataset = input_dataset.batch(batch_size)
    next_node = input_dataset.make_one_shot_iterator().get_next()
    embedding = embedding_fn(next_node)
    with tf.Session(cluster.get_master(),
                    config=config.session_config) as sess:
        with tf.python_io.TFRecordWriter(output_tfrecord_path) as writer:
            sess.run(tf.global_variables_initializer())
            while True:
                try:
                    embedded = sess.run(embedding)
                    results = label_fn(embedded)
                    write_fn(writer, results)
                except tf.errors.OutOfRangeError:
                    break
Example #7
def apply_comp(inputs):
    tpu_computation = tpu.rewrite(apply, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()

    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())
        sess.run(tf.global_variables_initializer())
        t1 = time()
        sess.run(tpu_computation)
        t2 = time()
        sess.run(tpu.shutdown_system())
    print(t2 - t1)
Example #8
def filter_sum():
    inputs = [tf.convert_to_tensor(l_quantity, np.float32)]
    tpu_computation = tpu.rewrite(filter_sum_computation, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()
    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())
        sess.run(tf.global_variables_initializer())
        for i in range(0, 5):
            res = sess.run(tpu_computation)
        sess.run(tpu.shutdown_system())
        print(res)
        return res
Example #9
def main(_):
    tpu_grpc_url = None

    if config.use_tpu:
        tpu_grpc_url = TPUClusterResolver(tpu=config.tpu_name).get_master()

    run_config = tpu.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=config.log_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True),
        tpu_config=tpu.TPUConfig(config.tpu_iterations, config.tpu_shards)
    )

    batch_size = config.batch_size * config.tpu_shards if config.use_tpu else config.batch_size
    est = Estimator(
        model_fn=estimator.model_fn,
        use_tpu=config.use_tpu,
        train_batch_size=batch_size,
        eval_batch_size=batch_size,
        params={
            "use_tpu": config.use_tpu,
            "data_dir": config.data_dir,
            "buffer_size": config.buffer_size,
            "data_format": "NCHW" if config.use_tpu else "NHWC",
            "z_dim": config.z_dim,
            "D_lr": config.d_lr,
            "G_lr": config.g_lr,
            "data_seed": config.data_shuffle_seed,
            "data_map_parallelism": config.data_map_parallelism
        },
        config=run_config
    )

    if config.train:
        est.train(
            input_fn=estimator.train_input_fn,
            max_steps=config.train_steps
        )
    if config.eval:
        est.evaluate(
            input_fn=estimator.eval_input_fn,
            steps=config.eval_steps
        )
    elif config.predict:
        est.predict(
            input_fn=lambda params: estimator.predict_input_fn(params, config.predict_class),
            predict_keys=['G2']
        )
Example #10
def group_by():
    unique_groups = np.unique(l_returnflag)
    inputs = [
        tf.convert_to_tensor(l_quantity, np.float32),
        tf.convert_to_tensor(l_returnflag, np.float32),
        tf.convert_to_tensor(unique_groups, np.float32)
    ]
    tpu_computation = tpu.rewrite(group_by_computation, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()
    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())
        sess.run(tf.global_variables_initializer())
        for i in range(0, 5):
            res = sess.run(tpu_computation)
        sess.run(tpu.shutdown_system())
        print(res)
Example #11
    def setup_model(self):
        print("*** Thawing model from JSON ***")
        with open(self.experiment_env.model_json, "r") as fptr:
            json_string = fptr.read()
        model = model_from_json(json_string)  # type:Model
        model.load_weights(self.experiment_env.final_weights)
        adam = optimizers.Adam(
            lr=self.config["RNN-train"].getfloat("initial_lr"))
        model.compile(loss='binary_crossentropy', optimizer=adam)
        model.summary()

        if self.config["RNN-train"].getboolean("use_tpu"):
            model = tf.contrib.tpu.keras_to_tpu_model(
                model,
                strategy=tf.contrib.tpu.TPUDistributionStrategy(
                    tf.contrib.cluster_resolver.TPUClusterResolver(
                        tpu=TPUClusterResolver(
                            tpu=[os.environ['TPU_NAME']]).get_master())))

        return model
Example #12
def run():
    S0 = 100.
    K0 = 100.
    r0 = 0.05
    T0 = 1.0
    v0 = 0.2

    S = tf.constant(S0)
    K = tf.constant(K0)
    r = tf.constant(r0)
    T = tf.constant(T0)
    v = tf.constant(v0)

    inputs = [S, K, r, T, v]

    tpu_computation = tpu.rewrite(blackscholes, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()

    timer(tpu_computation, tpu_grpc_url)
    print('Done !')
Example #13
def timer(inputs):
    reps = 2
    times = []

    for i in range(reps):
        t1 = time()
        tpu_computation = tpu.rewrite(blackscholes, inputs)
        tpu_grpc_url = TPUClusterResolver(
            tpu=[os.environ['TPU_NAME']]).get_master()

        with tf.Session(tpu_grpc_url) as sess:
            sess.run(tpu.initialize_system())
            sess.run(tf.global_variables_initializer())
            sess.run(tpu_computation)
            sess.run(tpu.shutdown_system())

        t2 = time()
        print(str(i) + "_ : " + str(t2 - t1))
        times.append(t2 - t1)

    print(sum(times) / reps)
Example #14
    def __init__(self,
                 N,
                 batch_size=32,
                 archive_fit_samples=64,
                 use_tpu=None,
                 log_path='logs/tensorboard'):
        self.N = N
        self.batch_size = batch_size

        self.model = None
        self.archive_fit_samples = archive_fit_samples
        self.position_archive = []

        self.tpu_grpc_url = use_tpu
        tpu_name_environ_key = 'TPU_NAME'

        # Check whether the server has a TPU
        if use_tpu is not False and tpu_name_environ_key in os.environ:
            tpu_name = os.environ[tpu_name_environ_key].strip()
            if tpu_name != "":
                self.is_tpu = True
                self.tpu_grpc_url = TPUClusterResolver(
                    tpu=[os.environ[tpu_name_environ_key]]).get_master()
        # TODO write an if condition to validate and resolve the TPU url provided

        self.__loss_functions = [
            'categorical_crossentropy', 'binary_crossentropy'
        ]

        self.model_name = time.strftime('GM{0}-%y%m%dT%H%M%S').format('%02d' %
                                                                      N)
        # print(self.model_name)

        log_path = os.path.join(log_path, self.model_name)
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        self.callback = TensorBoard(log_path)
Example #15
    def model_train(self):
        # 1. Build the graph
        train_graph = tf.Graph()
        # Set the graph to default to ensure that it is ready for training
        with train_graph.as_default():
            # Load the model inputs
            input_data_logdesignid_enc, input_data_logdesignid_dec, target_logdesignid, lr, source_sequence_length, max_source_sequence_length = self.get_model_inputs(
            )

            embedding_encoder = tf.get_variable("embedding_encoder", [
                len(self.logdesignid_int_to_vocab),
                self.embedding_size_logdesignid
            ])
            enc_embed_input_logdesignid = tf.nn.embedding_lookup(
                embedding_encoder, input_data_logdesignid_enc)
            print("[model_train] enc_embed_input_logdesignid:")
            self.print_activations(enc_embed_input_logdesignid)
            embedding_decoder = tf.get_variable("embedding_decoder", [
                len(self.logdesignid_int_to_vocab),
                self.embedding_size_logdesignid
            ])
            dec_embed_input_logdesignid = tf.nn.embedding_lookup(
                embedding_decoder, input_data_logdesignid_dec)
            print("[model_train] dec_embed_input_logdesignid:")
            self.print_activations(dec_embed_input_logdesignid)

            input_data_enc = enc_embed_input_logdesignid
            input_data_dec = dec_embed_input_logdesignid
            #            targets = input_data_enc #autoencoder: target equals to input
            #            targets = self.player_logid_test

            with tf.name_scope('seq2seq'):
                # Create the training and inference logits
                training_decoder_outputs, logits, enc_state = self.seq2seq_model(
                    input_data_enc, input_data_dec, target_logdesignid, lr,
                    source_sequence_length, max_source_sequence_length,
                    source_sequence_length, len(self.logdesignid_int_to_vocab),
                    len(self.logdesignid_int_to_vocab), self.rnn_size,
                    self.rnn_num_layers)

            masks = tf.sequence_mask(source_sequence_length,
                                     max_source_sequence_length,
                                     dtype=tf.float32,
                                     name='masks')

            with tf.name_scope("optimization"):
                # Loss function
                print('[model_train] training_logits:', logits.get_shape())
                print('[model_train] targets', target_logdesignid.get_shape())

                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=target_logdesignid, logits=logits)
                cost = (tf.reduce_sum(crossent * masks) /
                        (self.batch_size * self.maxlen))
                tf.summary.scalar('loss', cost)

                # Calculate and clip gradients
                params = tf.trainable_variables()
                gradients = tf.gradients(cost, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5)

                # Optimizer
                optimizer = tf.train.AdamOptimizer(lr)
                train_op = optimizer.apply_gradients(
                    zip(clipped_gradients, params))

        # 2. Start training
        checkpoint = self.model_file + "best_model.ckpt"
        tpu_grpc_url = TPUClusterResolver(
            tpu_names=[os.environ['TPU_NAME']]).get_master()

        with tf.Session(tpu_grpc_url, graph=train_graph) as sess:
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(self.log_file + 'train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(self.log_file + 'test')
            sess.run(tf.global_variables_initializer())

            max_batchsize = self.train_size // self.batch_size
            # for epoch_i in range(1, self.epochs + 1):
            epoch_i = 1
            test_generator = self.generator_batches(datatype='test')
            for batch_i, (pad_enc_logdesignid_batch, pad_dec_logdesignid_batch,
                          sources_lengths, train_targets_batch) in enumerate(
                              self.generator_batches(datatype='train')):

                #                    print('train_targets_batch',train_targets_batch)
                if (batch_i % max_batchsize) + 1 == max_batchsize:
                    epoch_i += 1
                    if epoch_i >= self.epochs:
                        break
                # Training step
                with tf.name_scope('loss'):
                    #                    try:
                    #                    print('train',pad_enc_logdesignid_batch,pad_dec_logdesignid_batch,train_targets_batch)
                    summary, _, loss = sess.run(
                        [merged, train_op, cost], {
                            input_data_logdesignid_enc:
                            pad_enc_logdesignid_batch,
                            input_data_logdesignid_dec:
                            pad_dec_logdesignid_batch,
                            target_logdesignid: train_targets_batch,
                            lr: self.learning_rate,
                            source_sequence_length: sources_lengths,
                        })
                    #                    except:

                    train_writer.add_summary(summary, batch_i)

                # Debug message updating us on the status of the training
                if batch_i % self.display_step == 0:

                    (pad_enc_valid_logdesignid_batch,
                     pad_dec_valid_logdesignid_batch, valid_sources_lengths,
                     valid_targets_batch) = next(test_generator)
                    # Calculate validation cost
                    #                    print('test',pad_enc_valid_logdesignid_batch,pad_dec_valid_logdesignid_batch,valid_targets_batch)
                    summary, validation_loss = sess.run(
                        [merged, cost], {
                            input_data_logdesignid_enc:
                            pad_enc_valid_logdesignid_batch,
                            input_data_logdesignid_dec:
                            pad_dec_valid_logdesignid_batch,
                            target_logdesignid: valid_targets_batch,
                            lr: self.learning_rate,
                            source_sequence_length: valid_sources_lengths,
                        })
                    test_writer.add_summary(summary, batch_i)

                    print(
                        'Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                        .format(epoch_i, self.epochs,
                                (batch_i % max_batchsize) + 1, max_batchsize,
                                loss, validation_loss))


#                if epoch_i % self.saver_step == 0 and ((batch_i % max_batchsize)+1) % max_batchsize == 0:
                if ((batch_i % max_batchsize) + 1) % self.saver_step == 0:
                    saver = tf.train.Saver()
                    saver.save(
                        sess,
                        os.path.join(
                            os.getcwd(), self.model_file + "epoch" +
                            str(epoch_i) + "batch" +
                            str((batch_i % max_batchsize) + 1) + ".ckpt"))

            # Save Model
            # saver = tf.train.Saver()
            # saver.save(sess, checkpoint)
            print('Model Trained and Saved')
Example #16
    slice3 = tf.slice(fijk, [i + d3, j, 0], [1, 1, size])
    slice4 = tf.slice(fijk, [i - d4, j, 0], [1, 1, size])

    fdo = c1 * slice1 + c2 * slice2 + c3 * slice3 + c4 * slice4
    return fdo


if __name__ == "__main__":

    dim1 = [0., 1., 2., 3., 4.]
    dim2 = [5., 6., 7., 8., 9.]

    dim3 = [10., 11., 12., 13., 14.]
    dim4 = [15., 16., 17., 18., 19.]

    fijk = tf.constant([[dim1, dim2, dim3, dim4], [dim2, dim3, dim4, dim1],
                        [dim3, dim4, dim1, dim2], [dim4, dim1, dim2, dim3]])

    i = tf.constant(1)
    j = tf.constant(1)
    dk = tf.constant(0)

    inputs = [fijk, i, j, dk]

    tpu_computation = tpu.rewrite(apply_, inputs)
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()

    run(tpu_computation, tpu_grpc_url)
    print('Done !')
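The `run` helper called above is defined elsewhere in that project and not shown in the excerpt. A minimal sketch of what it could look like, assuming it follows the same initialize/run/shutdown pattern used throughout these examples (the name and signature come from the call above; the body is an assumption):

# Hypothetical helper (not from the original project): execute a rewritten
# TPU computation once inside a session and return the result.
def run(tpu_computation, tpu_grpc_url):
    with tf.Session(tpu_grpc_url) as sess:
        sess.run(tpu.initialize_system())             # bring up the TPU system
        sess.run(tf.global_variables_initializer())
        result = sess.run(tpu_computation)            # execute the rewritten graph
        sess.run(tpu.shutdown_system())               # release the TPU
    return result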
Example #17
    def required_tpu(self):
        return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU",
    lambda: tpu_lib.TPUStrategy(TPUClusterResolver(""), steps_per_run=5),
    required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                          prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                          prefetch_on_device=False),
    required_gpus=2)

adam_optimizer_v1_fn = NamedObject("AdamV1",
Example #18
def train_and_eval():
    """Trains a network on (self) supervised data."""
    checkpoint_dir = os.path.join(FLAGS.workdir)

    if FLAGS.use_tpu:
        master = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
    else:
        master = ''

    config = tf.contrib.tpu.RunConfig(
        model_dir=checkpoint_dir,
        tf_random_seed=FLAGS.get_flag_value('random_seed', None),
        master=master,
        evaluation_master=master,
        keep_checkpoint_every_n_hours=FLAGS.get_flag_value(
            'keep_checkpoint_every_n_hours', 4),
        save_checkpoints_secs=FLAGS.get_flag_value('save_checkpoints_secs',
                                                   600),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=TPU_ITERATIONS_PER_LOOP,
            tpu_job_name=FLAGS.tpu_worker_name))

    # The global batch-sizes are passed to the TPU estimator, and it will pass
    # along the local batch size in the model_fn's `params` argument dict.
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=get_self_supervision_model(FLAGS.task),
        model_dir=checkpoint_dir,
        config=config,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.get_flag_value('eval_batch_size',
                                             FLAGS.batch_size))

    if FLAGS.run_eval:
        data_fn = functools.partial(datasets.get_data,
                                    split_name=FLAGS.get_flag_value(
                                        'val_split', 'val'),
                                    is_training=False,
                                    shuffle=False,
                                    num_epochs=1,
                                    drop_remainder=FLAGS.use_tpu)

        # Contrary to what the documentation claims, the `train` and the
        # `evaluate` functions NEED to have `max_steps` and/or `steps` set and
        # cannot make use of the iterator's end-of-input exception, so we need
        # to do some math for that here.
        num_samples = datasets.get_count(
            FLAGS.get_flag_value('val_split', 'val'))
        num_steps = num_samples // FLAGS.get_flag_value(
            'eval_batch_size', FLAGS.batch_size)
        tf.logging.info('val_steps: %d', num_steps)

        for checkpoint in tf.contrib.training.checkpoints_iterator(
                estimator.model_dir, timeout=10 * 60):

            estimator.evaluate(checkpoint_path=checkpoint,
                               input_fn=data_fn,
                               steps=num_steps)

            hub_exporter = hub.LatestModuleExporter('hub', serving_input_fn)
            hub_exporter.export(estimator,
                                os.path.join(checkpoint_dir, 'export/hub'),
                                checkpoint)

            if tf.gfile.Exists(os.path.join(FLAGS.workdir,
                                            'TRAINING_IS_DONE')):
                break

        # Evaluates the latest checkpoint on validation set.
        result = estimator.evaluate(input_fn=data_fn, steps=num_steps)
        return result

    else:
        train_data_fn = functools.partial(
            datasets.get_data,
            split_name=FLAGS.get_flag_value('train_split', 'train'),
            is_training=True,
            num_epochs=int(math.ceil(FLAGS.epochs)),
            drop_remainder=True)

        # We compute the number of steps and make use of Estimator's max_steps
        # arguments instead of relying on the Dataset's iterator to run out after
        # a number of epochs so that we can use 'fractional' epochs, which are
        # used by regression tests. (And because TPUEstimator needs it anyways.)
        num_samples = datasets.get_count(
            FLAGS.get_flag_value('train_split', 'train'))
        # Depending on whether we drop the last batch each epoch or only at the
        # very end, this should be ordered differently for rounding.
        updates_per_epoch = num_samples // FLAGS.batch_size
        num_steps = int(math.ceil(FLAGS.epochs * updates_per_epoch))
        tf.logging.info('train_steps: %d', num_steps)

        estimator.train(train_data_fn, max_steps=num_steps)
Example #19
    def required_tpu(self):
        return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU",
    lambda: tpu_lib.TPUStrategy(TPUClusterResolver("")),
    required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                          prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                          prefetch_on_device=False),
    required_gpus=2)

multi_worker_strategy_with_cpu = NamedDistribution(
Example #20
def main(args):
    # Unpack the tensor batch to be used to set up the infeed/outfeed queues.
    dataset = train_input_fn()
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()

    infeed_ops, outfeed_ops = setup_feed(features, labels, num_shards=8)

    # Wrap the tpu computation function to be run in a loop.
    def computation_loop():
        return tf.contrib.tpu.repeat(
            args.max_steps,
            partial(tpu_computation_with_infeed, batch_size=16, num_shards=8))

    # Since we are using infeed/outfeed queues, tensors are not explicitly passed in or returned.
    tpu_computation_loop = tf.contrib.tpu.batch_parallel(computation_loop,
                                                         num_shards=8)

    # utility ops
    tpu_init = tf.contrib.tpu.initialize_system()
    tpu_shutdown = tf.contrib.tpu.shutdown_system()
    variables_init = tf.global_variables_initializer()

    saver = tf.train.Saver()

    # get the TPU resource's grpc url
    # Note: when running on AI Platform, args.tpu should be left as None
    tpu_grpc_url = TPUClusterResolver(tpu=args.tpu).get_master()
    sess = tf.Session(tpu_grpc_url)

    # Use separate threads to run infeed and outfeed.
    def _run_infeed():
        for i in range(args.max_steps):
            sess.run(infeed_ops)

            if i % args.save_checkpoints_steps == 0:
                print('infeed {}'.format(i))

    def _run_outfeed():
        for i in range(args.max_steps):
            outfeed_data = sess.run(outfeed_ops)

            if i % args.save_checkpoints_steps == 0:
                print('outfeed {}'.format(i))
                print('data returned from outfeed: {}'.format(outfeed_data))

                saver.save(sess,
                           os.path.join(args.model_dir, 'model.ckpt'),
                           global_step=i)

    infeed_thread = threading.Thread(target=_run_infeed)
    outfeed_thread = threading.Thread(target=_run_outfeed)

    sess.run(tpu_init)
    sess.run(variables_init)

    infeed_thread.start()
    outfeed_thread.start()

    sess.run(tpu_computation_loop)

    infeed_thread.join()
    outfeed_thread.join()

    sess.run(tpu_shutdown)

    saver.save(sess,
               os.path.join(args.model_dir, 'model.ckpt'),
               global_step=args.max_steps)
Example #21
def train():
    def getTrainBatch():
        labels = []
        arr = np.zeros([BATCH_SIZE, MAX_WORDS_IN_REVIEW, EMBEDDING_SIZE])
        for i in range(BATCH_SIZE):
            if (i % 2 == 0):
                num = randint(0, 12499)
                labels.append([1, 0])
            else:
                num = randint(12500, 24999)
                labels.append([0, 1])
            arr[i] = training_data_embedded[num, :, :]
        return arr, labels

    # Call implementation
    glove_array, glove_dict = load_glove_embeddings()

    training_data_text = load_data()
    training_data_embedded = embedd_data(training_data_text, glove_array,
                                         glove_dict)
    input_data, labels, dropout_keep_prob, optimizer, accuracy, loss = \
        imp.define_graph()

    # tensorboard
    tf.summary.scalar("training_accuracy", accuracy)
    tf.summary.scalar("loss", loss)
    summary_op = tf.summary.merge_all()

    # saver
    all_saver = tf.train.Saver()
    tpu_grpc_url = TPUClusterResolver(
        tpu=[os.environ['TPU_NAME']]).get_master()
    sess = tf.InteractiveSession(tpu_grpc_url)
    sess.run(tpu.initialize_system())

    sess.run(tf.global_variables_initializer())

    logdir = "tensorboard/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)

    for i in range(iterations):
        batch_data, batch_labels = getTrainBatch()
        sess.run(optimizer, {
            input_data: batch_data,
            labels: batch_labels,
            dropout_keep_prob: 0.6
        })
        if (i % 50 == 0):
            loss_value, accuracy_value, summary = sess.run(
                [loss, accuracy, summary_op], {
                    input_data: batch_data,
                    labels: batch_labels
                })
            writer.add_summary(summary, i)
            print("Iteration: ", i)
            print("loss", loss_value)
            print("acc", accuracy_value)
        if (i % SAVE_FREQ == 0 and i != 0):
            if not os.path.exists(checkpoints_dir):
                os.makedirs(checkpoints_dir)
            save_path = all_saver.save(sess,
                                       checkpoints_dir + "/trained_model.ckpt",
                                       global_step=i)
            print("Saved model to %s" % save_path)
    #sess.close()
    sess.run(tpu.shutdown_system())
Example #22
File: acer.py Project: npfoss/Halite3RL
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef,
                 q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon,
                 total_timesteps, lrschedule, c, trust_region, alpha, delta):

        if USING_TPUS:
            tpu_grpc_url = TPUClusterResolver(
                tpu=[os.environ['TPU_NAME']]).get_master()
            sess = tf.Session(tpu_grpc_url)
            sess.run(tpu.initialize_system())
        else:
            sess = get_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])  # actions
        D = tf.placeholder(tf.float32, [nbatch])  # dones
        R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype,
                                             shape=(nenvs, ) + ob_space.shape)
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype,
                                              shape=(nenvs * (nsteps + 1), ) +
                                              ob_space.shape)
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

            step_model = policy(observ_placeholder=step_ob_placeholder,
                                sess=sess)
            train_model = policy(observ_placeholder=train_ob_placeholder,
                                 sess=sess)

        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("acer_model",
                               custom_getter=custom_getter,
                               reuse=True):
            polyak_model = policy(observ_placeholder=train_ob_placeholder,
                                  sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi is probability distribution parameters; to obtain a distribution that sums to 1 we need to take a softmax
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        v = tf.reduce_sum(train_model_p * train_model.q,
                          axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  #IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust region adjusted gradients wrt f, i.e. statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [
            _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev,
            norm_grads
        ]
        names_ops = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            run_ops = run_ops + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            names_ops = names_ops + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]

        def train(obs, actions, rewards, dones, mus, states, masks, steps):
            cur_lr = lr.value_steps(steps)
            td_map = {
                train_model.X: obs,
                polyak_model.X: obs,
                A: actions,
                R: rewards,
                D: dones,
                MU: mus,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks

            if USING_TPUS:
                return names_ops, sess.run(tpu.rewrite(
                    run_ops, td_map))[1:]  # strip off _train
            else:
                return names_ops, sess.run(run_ops,
                                           td_map)[1:]  # strip off _train

        def _step(observation, **kwargs):
            return step_model._evaluate(
                [step_model.action, step_model_p, step_model.state],
                observation, **kwargs)

        self.train = train
        self.save = functools.partial(save_variables,
                                      sess=sess,
                                      variables=params)
        self.train_model = train_model
        self.step_model = step_model
        self._step = _step
        self.step = self.step_model.step

        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
"""Simple scritpt from google TPU collab to measure teraflops
   [https://colab.research.google.com/notebooks/tpu.ipynb]
"""

from tensorflow.contrib import tpu
from tensorflow.contrib.cluster_resolver import TPUClusterResolver  #pylint: disable=E0611
import time
import tensorflow as tf

#tpu_address = ['node-1', 'node-2']
# Apparently multiple TPUs for a single session are not
# yet supported
tpu_address = ['node-1']

tpu_cluster = TPUClusterResolver(tpu=tpu_address).get_master()

N = 4096
COUNT = 100


def flops():
    x = tf.random_uniform([N, N])
    y = tf.random_uniform([N, N])

    def _matmul(x, y):
        return tf.tensordot(x, y, axes=[[1], [0]]), y

    return tf.reduce_sum(tpu.repeat(COUNT, _matmul, [x, y]))


tpu_ops = tpu.batch_parallel(flops, [], num_shards=8)
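The excerpt stops after building `tpu_ops`. In the referenced Colab the ops are then run and timed inside a session; a sketch of that step, assuming the usual initialize/shutdown pattern and estimating roughly 2 * N^3 FLOPs per matmul over COUNT iterations and 8 shards:

# Hypothetical continuation: run the sharded matmul loop twice (warm-up plus
# timed run) and report an approximate TFLOPS figure.
session = tf.Session(tpu_cluster)
try:
    print('Warming up...')
    session.run(tpu.initialize_system())
    session.run(tpu_ops)                 # first run includes XLA compilation
    print('Profiling...')
    start = time.time()
    session.run(tpu_ops)
    elapsed = time.time() - start
    print('{:.2f} TFLOPS'.format(2 * N ** 3 * COUNT * 8 / elapsed / 1e12))
finally:
    session.run(tpu.shutdown_system())
    session.close()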
Example #24
from bert import modeling
from bert.run_pretraining import input_fn_builder, model_fn_builder

# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]
log.info("Using TPU runtime")
USE_TPU = True
tpu_cluster_resolver = TPUClusterResolver(tpu='greek-bert',
                                          zone='us-central1-a')

# SETUP FOLDERS
with tf.Session(tpu_cluster_resolver.get_master()) as session:
    print(tpu_cluster_resolver.get_master())
    HOME_PATH = "gs://greek_bert"  # @param {type:"string"}
    MODEL_DIR = "greek_bert"  # @param {type:"string"}
    PRETRAINING_DIR = "greek_tfrecords"  # @param {type:"string"}
    VOC_FNAME = "vocab.txt"  # @param {type:"string"}

# Input data pipeline config
TRAIN_BATCH_SIZE = 256  # @param {type:"integer"}
MAX_PREDICTIONS = 75  # @param {type:"integer"}
MAX_SEQ_LENGTH = 512  # @param {type:"integer"}
MASKED_LM_PROB = 0.15  # @param
Example #25
File: sum.py Project: valohai/tpu-test
def axy_computation(a, x, y):
    return a * x + y


output_shape = [80, 80]

inputs = [
    3.0,
    tf.random_uniform(output_shape, dtype=tf.float32),
    tf.random_uniform(output_shape, dtype=tf.float32),
]

if use_tpu:
    print('Setting up TPU')
    tpu_grpc_url = TPUClusterResolver(tpu=[tpu_name]).get_master()
    computation = tpu.rewrite(axy_computation, inputs)
else:
    print(
        'TPU IS NOT ENABLED (pass a TPU name or grpc://ip:port as the TPU_NAME envvar)'
    )
    computation = tf.py_func(axy_computation, inputs, tf.float32)
    tpu_grpc_url = None

with tf.Session(tpu_grpc_url) as sess:
    if use_tpu:
        print('Running TPU initializer')
        sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    print('Running computation {}'.format(computation))
    output = sess.run(computation)
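`use_tpu` and `tpu_name` are set earlier in sum.py and are not part of the excerpt. A plausible setup, inferred from the message printed when the TPU is not enabled (an assumption, not the project's actual code):

# Hypothetical setup: read the TPU name (or a grpc://ip:port address) from the
# TPU_NAME environment variable, as the message above suggests.
import os
tpu_name = os.environ.get('TPU_NAME')
use_tpu = bool(tpu_name)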
Example #26
# export TPU_NAME=aeon
# wget --show-progress --continue -O shakespeare.txt http://www.gutenberg.org/files/100/100-0.txt

import numpy as np
import six
import tensorflow as tf
import time
import os
from tensorflow.contrib import tpu
from tensorflow.contrib.cluster_resolver import TPUClusterResolver

# This address identifies the TPU we'll use when configuring TensorFlow.
TPU_WORKER = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()

SHAKESPEARE_TXT = 'shakespeare1.txt'

tf.logging.set_verbosity(tf.logging.INFO)


def transform(txt, pad_to=None):
    # drop any non-ascii characters
    output = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
    if pad_to is not None:
        output = output[:pad_to]
        output = np.concatenate([
            np.zeros([pad_to - len(txt)], dtype=np.int32),
            output,
        ])
    return output

Example #27
  @property
  def required_tpu(self):
    return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=2),
    required_tpu=True)
tpu_strategy_one_step = NamedDistribution(
    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=1),
    required_tpu=True)
mirrored_strategy_with_one_cpu = NamedDistribution(
    "Mirrored1CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
mirrored_strategy_with_one_gpu = NamedDistribution(
    "Mirrored1GPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
    required_gpus=1)
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
Example #28
def train_and_eval():
  """Trains a network on (self) supervised data."""
  checkpoint_dir = FLAGS.get_flag_value("checkpoint", FLAGS.workdir)
  tf.gfile.MakeDirs(checkpoint_dir)

  if FLAGS.tpu_name:
    cluster = TPUClusterResolver(tpu=[FLAGS.tpu_name])
  else:
    cluster = None

  # tf.logging.info("master: %s", master)
  config = RunConfig(
      model_dir=checkpoint_dir,
      tf_random_seed=FLAGS.random_seed,
      cluster=cluster,
      keep_checkpoint_max=None,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=TPUConfig(iterations_per_loop=TPU_ITERATIONS_PER_LOOP))

  # Optionally resume from a stored checkpoint.
  if FLAGS.path_to_initial_ckpt:
    warm_start_from = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=FLAGS.path_to_initial_ckpt,
        # The square bracket is important for loading all the
        # variables from GLOBAL_VARIABLES collection.
        # See https://www.tensorflow.org/api_docs/python/tf/estimator/WarmStartSettings  # pylint: disable=line-too-long
        # section vars_to_warm_start for more details.
        vars_to_warm_start=[FLAGS.vars_to_restore]
    )
  else:
    warm_start_from = None

  # The global batch-sizes are passed to the TPU estimator, and it will pass
  # along the local batch size in the model_fn's `params` argument dict.
  estimator = TPUEstimator(
      model_fn=semi_supervised.get_model(FLAGS.task),
      model_dir=checkpoint_dir,
      config=config,
      use_tpu=FLAGS.tpu_name is not None,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.get_flag_value("eval_batch_size", FLAGS.batch_size),
      warm_start_from=warm_start_from
  )

  if FLAGS.run_eval:
    data_fn = functools.partial(
        datasets.get_data,
        split_name=FLAGS.val_split,
        preprocessing=FLAGS.get_flag_value("preprocessing_eval",
                                           FLAGS.preprocessing),
        is_training=False,
        shuffle=False,
        num_epochs=1,
        drop_remainder=True)

    # Contrary to what the documentation claims, the `train` and the
    # `evaluate` functions NEED to have `max_steps` and/or `steps` set and
    # cannot make use of the iterator's end-of-input exception, so we need
    # to do some math for that here.
    num_samples = datasets.get_count(FLAGS.val_split)
    num_steps = num_samples // FLAGS.get_flag_value("eval_batch_size",
                                                    FLAGS.batch_size)
    tf.logging.info("val_steps: %d", num_steps)

    for checkpoint in checkpoints_iterator(
        estimator.model_dir, timeout=FLAGS.eval_timeout_mins * 60):

      result_dict_val = estimator.evaluate(
          checkpoint_path=checkpoint, input_fn=data_fn, steps=num_steps)

      hub_exporter = hub.LatestModuleExporter("hub", serving_input_fn)
      hub_exporter.export(
          estimator,
          os.path.join(checkpoint_dir, "export/hub"),
          checkpoint)
      # This is here instead of using the above `checkpoints_iterator`'s
      # `timeout_fn` param, because that would wait forever on failed
      # trainers which will never create this file.
      if tf.gfile.Exists(os.path.join(FLAGS.workdir, "TRAINING_IS_DONE")):
        break

    # Evaluates the latest checkpoint on validation set.
    result_dict_val = estimator.evaluate(input_fn=data_fn, steps=num_steps)
    tf.logging.info(result_dict_val)

    # Optionally evaluates the latest checkpoint on test set.
    if FLAGS.test_split:
      data_fn = functools.partial(
          datasets.get_data,
          split_name=FLAGS.test_split,
          preprocessing=FLAGS.get_flag_value("preprocessing_eval",
                                             FLAGS.preprocessing),
          is_training=False,
          shuffle=False,
          num_epochs=1,
          drop_remainder=True)
      num_samples = datasets.get_count(FLAGS.test_split)
      num_steps = num_samples // FLAGS.get_flag_value("eval_batch_size",
                                                      FLAGS.batch_size)
      result_dict_test = estimator.evaluate(input_fn=data_fn, steps=num_steps)
      tf.logging.info(result_dict_test)
    return result_dict_val

  else:
    train_data_fn = functools.partial(
        datasets.get_data,
        split_name=FLAGS.train_split,
        preprocessing=FLAGS.preprocessing,
        is_training=True,
        num_epochs=None,  # read data indefinitely for training
        drop_remainder=True)

    # We compute the number of steps and make use of Estimator's max_steps
    # arguments instead of relying on the Dataset's iterator to run out after
    # a number of epochs so that we can use "fractional" epochs, which are
    # used by regression tests. (And because TPUEstimator needs it anyways.)
    num_samples = datasets.get_count(FLAGS.train_split)
    if FLAGS.num_supervised_examples:
      num_samples = FLAGS.num_supervised_examples
    # Depending on whether we drop the last batch each epoch or only at the
    # very end, this should be ordered differently for rounding.
    updates_per_epoch = num_samples // FLAGS.batch_size
    epochs = utils.str2intlist(FLAGS.schedule, strict_int=False)[-1]
    num_steps = int(math.ceil(epochs * updates_per_epoch))
    tf.logging.info("train_steps: %d", num_steps)

    return estimator.train(
        train_data_fn,
        max_steps=num_steps)
Example #29
def experiment(model_config):
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info("SCRIPT START")

    tf.logging.info("TPU resolver started")

    tpu_cluster_resolver = TPUClusterResolver(
        tpu=os.environ['TPU_NAME'],
        project=os.environ['PROJECT_NAME'],
        zone=os.environ['PROJECT_ZONE'])
    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_config['model_base_dir'] + os.path.sep + str(model_config["experiment_id"]),
        save_checkpoints_steps=500,
        save_summary_steps=250,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=500,
            num_shards=8,
            per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V1))  # pylint: disable=line-too-long

    tf.logging.info("Creating datasets")
    urmp_train, urmp_eval, urmp_test = [
        urmp_input.URMPInput(mode=mode,
                             data_dir=model_config['data_path'],
                             transpose_input=False,
                             use_bfloat16=model_config['use_bfloat16'])
        for mode in ['train', 'eval', 'test']
    ]

    tf.logging.info("Assigning TPUEstimator")
    # Optimize in a +supervised fashion until validation loss worsens
    separator = tpu_estimator.TPUEstimator(
        use_tpu=model_config["use_tpu"],
        model_fn=unet_separator,
        config=config,
        train_batch_size=model_config['batch_size'],
        eval_batch_size=model_config['batch_size'],
        predict_batch_size=model_config['batch_size'],
        params={
            i: model_config[i]
            for i in model_config if (i != 'batch_size' and i != 'context')
        })

    if model_config['load_model']:
        tf.logging.info("Load the model")
        current_step = estimator._load_global_step_from_checkpoint_dir(
            model_config['model_base_dir'] + os.path.sep +
            str(model_config["experiment_id"]))

    if model_config['mode'] == 'train_and_eval':
        tf.logging.info("Train the model")
        # Early stopping should go here, but it will come with TF 1.10
        separator.train(input_fn=urmp_train.input_fn,
                        steps=model_config['training_steps'])

        tf.logging.info("Supervised training finished!")
        tf.logging.info("Evaluate model")
        # Evaluate the model.
        eval_result = separator.evaluate(
            input_fn=urmp_eval.input_fn,
            steps=model_config['evaluation_steps'])
        tf.logging.info('Evaluation results: %s' % eval_result)

    elif model_config['mode'] == 'predict':
        tf.logging.info("Test results and save predicted sources:")
        predictions = separator.predict(input_fn=urmp_test.input_fn)

        for prediction in predictions:
            Test.save_prediction(prediction,
                                 estimates_path=model_config["estimates_path"],
                                 sample_rate=model_config["expected_sr"])
        Utils.concat_and_upload(
            model_config["estimates_path"], model_config['model_base_dir'] +
            os.path.sep + str(model_config["experiment_id"]))
Example #30
import tensorflow as tf
from tensorflow.contrib import tpu
from tensorflow.contrib.cluster_resolver import TPUClusterResolver


def axy_computation(a, x, y):
    return a * x + y


inputs = [
    3.0,
    tf.ones([3, 3], tf.float32),
    tf.ones([3, 3], tf.float32),
]

tpu_computation = tpu.rewrite(axy_computation, inputs)

tpu_grpc_url = TPUClusterResolver().get_master()

with tf.Session(tpu_grpc_url) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    output = sess.run(tpu_computation)

    with open("/output.txt", "w") as output_file:
        output_file.write(str(output))

    print(output)
    sess.run(tpu.shutdown_system())

print('Done!')