Example #1
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()
    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            if params.use_bert and params.bert_emb_path:
                features = dataset.get_training_input_with_bert(
                    params.input + [params.bert_emb_path], params)
            else:
                features = dataset.get_training_input(params.input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params)

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features, params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape    %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(
                v.shape.as_list())).tolist()  # multiply all dimension sizes
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=params.adam_beta1,
                                     beta2=params.adam_beta2,
                                     epsilon=params.adam_epsilon)

        if params.update_cycle == 1:
            train_op = tf.contrib.layers.optimize_loss(
                name="training",
                loss=loss,
                global_step=global_step,
                learning_rate=learning_rate,
                clip_gradients=params.clip_grad_norm or None,
                optimizer=opt,
                colocate_gradients_with_ops=True)
            zero_op = tf.no_op("zero_op")
            collect_op = tf.no_op("collect_op")
        else:
            grads_and_vars = opt.compute_gradients(
                loss, colocate_gradients_with_ops=True)
            gradients = [item[0] for item in grads_and_vars]
            variables = [item[1] for item in grads_and_vars]
            variables = utils.replicate_variables(variables)
            zero_op = utils.zero_variables(variables)
            collect_op = utils.collect_gradients(gradients, variables)

            scale = 1.0 / params.update_cycle
            gradients, variables = utils.scale_gradients(grads_and_vars, scale)

            # Gradient clipping to avoid gradient explosion
            if isinstance(params.clip_grad_norm or None, float):
                gradients, _ = tf.clip_by_global_norm(gradients,
                                                      params.clip_grad_norm)

            # Update variables
            grads_and_vars = list(zip(gradients, variables))
            with tf.control_dependencies([collect_op]):
                train_op = opt.apply_gradients(grads_and_vars, global_step)

        # Validation
        '''
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = files
            eval_input_fn = dataset.get_evaluation_input
        else:
            print("Don't evaluate")
            eval_input_fn = None
        '''
        # Add hooks
        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(
                loss
            ),  # Monitors the loss tensor and stops training if loss is NaN
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "chars": tf.shape(features["chars"]),
                    "source": tf.shape(features["source"]),
                    #"bert": tf.shape(features["bert"]),
                    "lr": learning_rate
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=tf.train.Saver(max_to_keep=params.keep_checkpoint_max,
                                     sharded=False))
        ]

        config = session_config(params)
        '''
        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: search.create_inference_graph(
                        model.get_evaluation_func(), f, params
                    ),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps
                )
            )
        '''

        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            while not sess.should_stop():
                utils.session_run(sess, zero_op)
                for i in range(1, params.update_cycle):
                    utils.session_run(sess, collect_op)
                sess.run(train_op)
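
In the update_cycle > 1 branch above, gradients are accumulated over several runs before a single parameter update: zero_op clears the accumulators, collect_op adds the current gradients, and train_op applies the collected, scaled result. A minimal sketch of that pattern with plain TF1 ops; the helper below is illustrative and is not the actual thumt.utils implementation:

import tensorflow as tf

def make_accumulation_ops(loss, opt, update_cycle):
    # Accumulate gradients over update_cycle micro-batches, then apply
    # their average (assumes dense gradients).
    grads_and_vars = opt.compute_gradients(loss)
    accums = [tf.Variable(tf.zeros_like(v), trainable=False)
              for _, v in grads_and_vars]
    zero_op = tf.group(*[a.assign(tf.zeros_like(a)) for a in accums])
    collect_op = tf.group(*[a.assign_add(g)
                            for a, (g, _) in zip(accums, grads_and_vars)
                            if g is not None])
    averaged = [(a / update_cycle, v)
                for a, (_, v) in zip(accums, grads_and_vars)]
    train_op = opt.apply_gradients(averaged)
    return zero_op, collect_op, train_op
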
Example #2
def main(args):
    if args.distribute:
        distribute.enable_distributed_training()

    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    if distribute.rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % args.model,
                      collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            features = dataset.get_training_input(params.input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        # Build model
        initializer = get_initializer(params)
        regularizer = tf.contrib.layers.l1_l2_regularizer(
            scale_l1=params.scale_l1, scale_l2=params.scale_l2)
        model = model_cls(params)
        # Create global step
        global_step = tf.train.get_or_create_global_step()
        dtype = tf.float16 if args.half else None

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer, regularizer, dtype), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)
        loss = loss + tf.losses.get_regularization_loss()

        if distribute.rank() == 0:
            print_variables()

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        opt = optimizers.MultiStepOptimizer(opt, params.update_cycle)

        if args.half:
            opt = optimizers.LossScalingOptimizer(opt, params.loss_scale)

        # Optimization
        grads_and_vars = opt.compute_gradients(
            loss, colocate_gradients_with_ops=True)

        if params.clip_grad_norm:
            grads, var_list = list(zip(*grads_and_vars))
            grads, _ = tf.clip_by_global_norm(grads, params.clip_grad_norm)
            grads_and_vars = zip(grads, var_list)

        train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

        # Validation
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset.sort_and_zip_files(files)
            eval_input_fn = dataset.get_evaluation_input
        else:
            eval_input_fn = None

        # Hooks
        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "source": tf.shape(features["source"]),
                    "target": tf.shape(features["target"])
                },
                every_n_iter=1)
        ]

        broadcast_hook = distribute.get_broadcast_hook()

        if broadcast_hook:
            train_hooks.append(broadcast_hook)

        if distribute.rank() == 0:
            # Add hooks
            save_vars = tf.trainable_variables() + [global_step]
            saver = tf.train.Saver(
                var_list=save_vars if params.only_save_trainable else None,
                max_to_keep=params.keep_checkpoint_max,
                sharded=False)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            train_hooks.append(
                hooks.MultiStepHook(tf.train.CheckpointSaverHook(
                    checkpoint_dir=params.output,
                    save_secs=params.save_checkpoint_secs or None,
                    save_steps=params.save_checkpoint_steps or None,
                    saver=saver),
                                    step=params.update_cycle))

            if eval_input_fn is not None:
                train_hooks.append(
                    hooks.MultiStepHook(hooks.EvaluationHook(
                        lambda f: inference.create_inference_graph([model], f,
                                                                   params),
                        lambda: eval_input_fn(eval_inputs, params),
                        lambda x: decode_target_ids(x, params),
                        params.output,
                        session_config(params),
                        device_list=params.device_list,
                        max_to_keep=params.keep_top_checkpoint_max,
                        eval_secs=params.eval_secs,
                        eval_steps=params.eval_steps),
                                        step=params.update_cycle))
            checkpoint_dir = params.output
        else:
            checkpoint_dir = None

        restore_op = restore_variables(args.checkpoint)

        def restore_fn(step_context):
            step_context.session.run(restore_op)

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=checkpoint_dir,
                hooks=train_hooks,
                save_checkpoint_secs=None,
                config=session_config(params)) as sess:
            # Restore pre-trained variables
            sess.run_step_fn(restore_fn)

            while not sess.should_stop():
                sess.run(train_op)
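
Example #2 differs from #1 mainly in the optimizer stack: the base Adam/LazyAdam optimizer is wrapped in optimizers.MultiStepOptimizer for gradient accumulation and, when --half is given, in optimizers.LossScalingOptimizer for fp16 training. The core of static loss scaling is small; a rough sketch assuming a constant scale factor (the real class may use dynamic scaling with overflow checks):

def scaled_grads_and_vars(opt, loss, loss_scale):
    # Scale the loss before differentiation so small fp16 gradients do not
    # underflow, then divide the gradients back to their true magnitude.
    grads_and_vars = opt.compute_gradients(loss * loss_scale)
    return [(g / loss_scale if g is not None else None, v)
            for g, v in grads_and_vars]
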
Example #3
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            features = dataset_c2f_4layers.get_training_input_and_c2f_label(
                params.input, params.c2f_input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        update_cycle = params.update_cycle
        features, init_op = cache.cache_features(features, update_cycle)

        # Build model
        initializer = get_initializer(params)
        regularizer = tf.contrib.layers.l1_l2_regularizer(
            scale_l1=params.scale_l1, scale_l2=params.scale_l2)
        model = model_cls(params)
        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer, regularizer), features,
            params.device_list)
        if len(sharded_losses) > 1:
            losses_mle, losses_l1, losses_l2, losses_l3, losses_l4 = sharded_losses
            loss_mle = tf.add_n(losses_mle) / len(losses_mle)
            loss_l1 = tf.add_n(losses_l1) / len(losses_l1)
            loss_l2 = tf.add_n(losses_l2) / len(losses_l2)
            loss_l3 = tf.add_n(losses_l3) / len(losses_l3)
            loss_l4 = tf.add_n(losses_l4) / len(losses_l4)
        else:
            loss_mle, loss_l1, loss_l2, loss_l3, loss_l4 = sharded_losses[0]
        loss = loss_mle + (loss_l1 + loss_l2 + loss_l3 + loss_l4
                           ) / 4.0 + tf.losses.get_regularization_loss()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape    %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)
        restore_op = restore_variables(args.checkpoint)

        # Validation
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset_c2f_4layers.sort_and_zip_files(files)
            eval_input_fn = dataset_c2f_4layers.get_evaluation_input
        else:
            eval_input_fn = None

        # Add hooks
        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        multiplier = tf.convert_to_tensor([update_cycle, 1])

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss_mle": loss_mle,
                    "loss_l1": loss_l1,
                    "loss_l2": loss_l2,
                    "loss_l3": loss_l3,
                    "loss_l4": loss_l4,
                    "source": tf.shape(features["source"]) * multiplier,
                    "target": tf.shape(features["target"]) * multiplier
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: inference.create_inference_graph([model], f,
                                                               params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        #def restore_fn(step_context):
        #    step_context.session.run(restore_op)

        #def step_fn(step_context):
        #    # Bypass hook calls
        #    step_context.session.run([init_op, ops["zero_op"]])
        #    for i in range(update_cycle - 1):
        #        step_context.session.run(ops["collect_op"])

        #    return step_context.run_with_hooks(ops["train_op"])

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            # Restore pre-trained variables
            sess._tf_sess().run(restore_op)

            while not sess.should_stop():
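                # Run the cache and accumulation ops through the raw tf.Session
                # so that logging and checkpoint hooks only fire on the final
                # train_op below.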
                sess._tf_sess().run([init_op, ops["zero_op"]])
                for i in range(update_cycle - 1):
                    sess._tf_sess().run(ops["collect_op"])
                sess.run(ops["train_op"])
Example #4
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            features = dataset.get_training_input(params.input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params)
        if params.MRT:
            assert params.batch_size == 1
            features = mrt_utils.get_MRT(features, params, model)

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features, params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape    %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=params.adam_beta1,
                                     beta2=params.adam_beta2,
                                     epsilon=params.adam_epsilon)

        train_op = tf.contrib.layers.optimize_loss(
            name="training",
            loss=loss,
            global_step=global_step,
            learning_rate=learning_rate,
            clip_gradients=params.clip_grad_norm or None,
            optimizer=opt,
            colocate_gradients_with_ops=True)

        # Validation
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset.sort_and_zip_files(files)
            eval_input_fn = dataset.get_evaluation_input
        else:
            eval_input_fn = None

        # Add hooks
        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "source": tf.shape(features["source"]),
                    "target": tf.shape(features["target"])
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=tf.train.Saver(max_to_keep=params.keep_checkpoint_max,
                                     sharded=False))
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: search.create_inference_graph(
                        model.get_evaluation_func(), f, params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            while not sess.should_stop():
                sess.run(train_op)
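
Every example feeds get_learning_rate_decay(params.learning_rate, global_step, params) into the optimizer. For Transformer-style models this typically implements the "noam" warmup-then-decay schedule; a hedged reconstruction (the actual helper presumably selects among several decay schemes via params):

import tensorflow as tf

def noam_schedule(base_lr, global_step, hidden_size, warmup_steps):
    # lr = base_lr * hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = tf.cast(global_step, tf.float32) + 1.0
    return base_lr * hidden_size ** -0.5 * tf.minimum(
        step ** -0.5, step * float(warmup_steps) ** -1.5)
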
Example #5
def main(args):
    if args.distribute:
        distribute.enable_distributed_training()

    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    if not args.distribute or distribute.rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % args.model,
                      collect_params(params, model_cls.get_parameters()))

    assert 'r2l' in params.input[2]
    # Build Graph
    use_all_devices(params)
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            features = dataset.abd_get_training_input(params.input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        update_cycle = params.update_cycle
        features, init_op = cache.cache_features(features, update_cycle)

        # Build model
        initializer = get_initializer(params)
        regularizer = tf.contrib.layers.l1_l2_regularizer(
            scale_l1=params.scale_l1, scale_l2=params.scale_l2)
        model = model_cls(params)
        # Create global step
        global_step = tf.train.get_or_create_global_step()
        dtype = tf.float16 if args.fp16 else None

        if args.distribute:
            training_func = model.get_training_func(initializer, regularizer,
                                                    dtype)
            loss = training_func(features)
        else:
            # Multi-GPU setting
            sharded_losses = parallel.parallel_model(
                model.get_training_func(initializer, regularizer, dtype),
                features, params.device_list)
            loss = tf.add_n(sharded_losses) / len(sharded_losses)
            loss = loss + tf.losses.get_regularization_loss()

        # Print parameters
        if not args.distribute or distribute.rank() == 0:
            print_variables()

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(
            loss, opt, global_step,
            distribute.all_reduce if args.distribute else None, args.fp16,
            params)
        restore_op = restore_variables(args.checkpoint)

        # Validation
        if params.validation and params.references[0]:
            files = params.validation + list(params.references)
            eval_inputs = dataset.sort_and_zip_files(files)
            eval_input_fn = dataset.abd_get_evaluation_input
        else:
            eval_input_fn = None

        # Add hooks
        multiplier = tf.convert_to_tensor([update_cycle, 1])

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "source": tf.shape(features["source"]) * multiplier,
                    "target": tf.shape(features["target"]) * multiplier
                },
                every_n_iter=1)
        ]

        if args.distribute:
            train_hooks.append(distribute.get_broadcast_hook())

        config = session_config(params)

        if not args.distribute or distribute.rank() == 0:
            # Add hooks
            save_vars = tf.trainable_variables() + [global_step]
            saver = tf.train.Saver(
                var_list=save_vars if params.only_save_trainable else None,
                max_to_keep=params.keep_checkpoint_max,
                sharded=False)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            train_hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir=params.output,
                    save_secs=params.save_checkpoint_secs or None,
                    save_steps=params.save_checkpoint_steps or None,
                    saver=saver))

        if eval_input_fn is not None:
            if not args.distribute or distribute.rank() == 0:
                train_hooks.append(
                    hooks.EvaluationHook(
                        lambda f: inference.create_inference_graph([model], f,
                                                                   params),
                        lambda: eval_input_fn(eval_inputs, params),
                        lambda x: decode_target_ids(x, params),
                        params.output,
                        config,
                        params.keep_top_checkpoint_max,
                        eval_secs=params.eval_secs,
                        eval_steps=params.eval_steps))

        def restore_fn(step_context):
            step_context.session.run(restore_op)

        def step_fn(step_context):
            # Bypass hook calls
            step_context.session.run([init_op, ops["zero_op"]])
            for i in range(update_cycle - 1):
                step_context.session.run(ops["collect_op"])

            return step_context.run_with_hooks(ops["train_op"])

        # Create session, do not use default CheckpointSaverHook
        if not args.distribute or distribute.rank() == 0:
            checkpoint_dir = params.output
        else:
            checkpoint_dir = None

        with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            # Restore pre-trained variables
            sess.run_step_fn(restore_fn)

            while not sess.should_stop():
                sess.run_step_fn(step_fn)
Example #6
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    # model_cls = models.get_model(args.model)
    model_cls = transformer_cache_fixencoder.Transformer
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            features = dataset.get_training_input_src_context(
                params.input, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        features, init_op = cache.cache_features(features, params.update_cycle)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params)

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features, params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape    %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)
        restore_op = restore_variables(args.checkpoint)
        restore_trained_encoder_op = restore_encoder_variables(
            args.thumt_checkpoint)
        # Validation
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset.sort_and_zip_files_catch(files)
            eval_input_fn = dataset.get_evaluation_input_catch
        else:
            eval_input_fn = None

        # Add hooks
        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook({
                "step": global_step,
                "loss": loss,
            },
                                       every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: inference.create_inference_graph(
                        [model.get_inference_func()], f, params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        def restore_fn(step_context):
            step_context.session.run(restore_op)
            step_context.session.run(restore_trained_encoder_op)

        def step_fn(step_context):
            # Bypass hook calls
            step_context.session.run([init_op, ops["zero_op"]])
            for i in range(params.update_cycle):
                step_context.session.run(ops["collect_op"])
            step_context.session.run(ops["scale_op"])

            # ####################################
            # # print some unchanged variable
            # scale = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
            #                           "transformer/encoder/layer_0/self_attention/layer_norm/scale")
            # # scale = tf.get_variable("transformer/encoder/layer_0/self_attention/layer_norm/scale")
            # scale = step_context.session.run(scale[0])
            #
            # print(scale)
            #
            # ####################################
            # # print some changed variable
            #
            # scale = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
            #                           "transformer/context/head_to_scalar")
            # # scale = tf.get_variable("transformer/encoder/layer_0/self_attention/layer_norm/scale")
            # scale = step_context.session.run(scale[0])
            #
            # print(scale)

            return step_context.run_with_hooks(ops["train_op"])

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            # Restore pre-trained variables
            sess.run_step_fn(restore_fn)

            while not sess.should_stop():
                sess.run_step_fn(step_fn)
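
Example #6 restores a pre-trained encoder through restore_encoder_variables and, judging by the transformer_cache_fixencoder class name, keeps that encoder frozen during training. One common TF1 way to freeze a scope is to exclude its variables from the optimizer's var_list; a sketch under that assumption (the scope name is a guess, and the model class may freeze weights differently):

import tensorflow as tf

def trainable_except(scope_prefix="transformer/encoder"):
    # Return every trainable variable outside the frozen encoder scope.
    return [v for v in tf.trainable_variables()
            if not v.name.startswith(scope_prefix)]

# e.g. grads_and_vars = opt.compute_gradients(loss, var_list=trainable_except())
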
Example #7
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.checkpoint, args.model, params)
    override_parameters(params, args)

    # Build Graph
    with tf.Graph().as_default():
        model = model_cls(params)
        inputs = read_files(args.input)
        features = get_features(inputs, params)
        score_fn = model.get_evaluation_func()
        # scores = score_fn(features, params)
        # Multi-GPU setting
        scores = parallel.parallel_model(score_fn, features,
                                         params.device_list)

        sess_creator = tf.train.ChiefSessionCreator(
            config=session_config(params))

        # Load checkpoint
        tf.logging.info("Loading %s" % args.checkpoint)
        var_list = tf.train.list_variables(args.checkpoint)
        values = {}
        reader = tf.train.load_checkpoint(args.checkpoint)

        for (name, shape) in var_list:
            if not name.startswith(model_cls.get_name()):
                continue

            tensor = reader.get_tensor(name)
            values[name] = tensor

        ops = set_variables(tf.trainable_variables(), values,
                            model_cls.get_name())
        assign_op = tf.group(*ops)

        # Create session
        fb = 0
        with tf.train.MonitoredSession(session_creator=sess_creator) as sess:
            # Restore variables
            sess.run(assign_op)
            fd = tf.gfile.Open(args.rv_file, "w")
            if params.model_uncertainty:
                fm = tf.gfile.Open(args.mean_file, 'w')
                fv = tf.gfile.Open(args.var_file, 'w')
                fsm = tf.gfile.Open(args.sen_mean, 'w')
                fsv = tf.gfile.Open(args.sen_var, 'w')
                fsr = tf.gfile.Open(args.sen_rv, 'w')

            while not sess.should_stop():
                results = sess.run(scores)
                fb += 1
                message = "Finished batch %d" % fb
                tf.logging.log(tf.logging.INFO, message)
                if params.model_uncertainty:
                    rv_score = []
                    mean_score = []
                    var_score = []
                    len_score = []
                    sen_mean = []
                    sen_var = []
                    sen_rv = []
                    for result in results:
                        rv_score.append(result["rv"].tolist())
                        mean_score.append(result["mean"].tolist())
                        var_score.append(result["var"].tolist())
                        len_score.append(result["len"].tolist())
                        sen_mean.append(result["sen_mean"].tolist())
                        sen_var.append(result["sen_var"].tolist())
                        sen_rv.append(result["sen_rv"].tolist())
                    rv_score = list(itertools.chain(*rv_score))
                    mean_score = list(itertools.chain(*mean_score))
                    var_score = list(itertools.chain(*var_score))
                    len_score = list(itertools.chain(*len_score))
                    sen_mean = list(itertools.chain(*sen_mean))
                    sen_var = list(itertools.chain(*sen_var))
                    sen_rv = list(itertools.chain(*sen_rv))

                    # mean_score = results["mean"]
                    # var_score = results["var"]
                    # rv_score = results["rv"]
                    # len_score = results["len"].tolist()
                    len_score = map(int, len_score)

                    for i, l in enumerate(len_score):
                        m = mean_score[i][:l]
                        v = var_score[i][:l]
                        r = rv_score[i][:l]
                        for (i_m, i_v, i_r) in zip(m, v, r):
                            # fd.write('%f/%f/%f ' % (i_m, i_v, i_r))
                            fd.write('%f ' % i_r)
                            fm.write('%f ' % i_m)
                            fv.write('%f ' % i_v)
                        fd.write("\n")
                        fm.write("\n")
                        fv.write("\n")

                        fsm.write("{}\n".format(sen_mean[i]))
                        fsv.write("{}\n".format(sen_var[i]))
                        fsr.write("{}\n".format(sen_rv[i]))
                else:
                    results = itertools.chain(*results)
                    for value in results:
                        fd.write("%f\n" % value)

            fd.close()
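
Example #7 restores weights by hand: it reads every checkpoint tensor whose name starts with the model name and turns the collected values into assign ops with set_variables. A plausible reading of that helper, assuming it matches variables to checkpoint entries by their colon-free names:

import tensorflow as tf

def set_variables(var_list, values, prefix):
    # Assign each graph variable the checkpoint value stored under its name,
    # restricted to variables within the given name prefix.
    ops = []
    for var in var_list:
        name = var.name.split(":")[0]
        if name.startswith(prefix) and name in values:
            ops.append(tf.assign(var, values[name]))
    return ops
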
Example #8
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)

    params = default_parameters()

    params = merge_parameters(params, model_cls.get_parameters())

    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    with tf.Graph().as_default():
        features = dataset.get_training_input(params.input, params)

        update_cycle = params.update_cycle
        features, init_op = cache.cache_features(features, update_cycle)

        initializer = get_initializer(params)
        regularizer = tf.contrib.layers.l1_l2_regularizer(
            scale_l1=params.scale_l1, scale_l2=params.scale_l2)
        model = model_cls(params)
        global_step = tf.train.get_or_create_global_step()

        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer, regularizer), features,
            params.device_list)

        loss = tf.add_n(sharded_losses) / len(sharded_losses)
        loss = loss + tf.losses.get_regularization_loss()

        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape    %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        elif params.optimizer == "SGD":
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)
        restore_op = restore_variables(args.checkpoint)

        if params.validation:
            eval_sorted_keys, eval_inputs = dataset.read_eval_input_file(
                params.validation)
            eval_input_fn = dataset.get_predict_input
        else:
            eval_input_fn = None

        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        multiplier = tf.convert_to_tensor([update_cycle, 1])

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "text": tf.shape(features["text"]) * multiplier,
                    "aspect": tf.shape(features["aspect"]) * multiplier,
                    "polarity": tf.shape(features["polarity"]) * multiplier
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: inference.create_predict_graph([model], f, params
                                                             ),
                    lambda: eval_input_fn(eval_inputs, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        def restore_fn(step_context):
            step_context.session.run(restore_op)

        def step_fn(step_context):
            step_context.session.run([init_op, ops["zero_op"]])
            for i in range(update_cycle - 1):
                step_context.session.run(ops["collect_op"])

            return step_context.run_with_hooks(ops["train_op"])

        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            sess.run_step_fn(restore_fn)

            while not sess.should_stop():
                sess.run_step_fn(step_fn)