Example #1
0
    def __init__(self, encoders, decoders, learning_rate, global_step, max_gradient_norm, use_dropout=False,
                 freeze_variables=None, feed_previous=0.0, optimizer='sgd', decode_only=False,
                 len_normalization=1.0, name=None, chained_encoders=False, baseline_step=None,
                 use_baseline=True, reverse_input=False, reconstruction_decoders=False, multi_task=False,
                 **kwargs):
        """Build the sequence-to-sequence graph: input placeholders, the selected
        encoder-decoder architecture, the losses (cross-entropy, REINFORCE and
        baseline) and, unless `decode_only`, the corresponding update ops.

        :param encoders: list of encoder configuration dicts (AttrDict-like)
        :param decoders: list of decoder configuration dicts (AttrDict-like)
        :param learning_rate: learning-rate variable shared with the optimizer
        :param global_step: global training-step variable
        :param max_gradient_norm: gradients are clipped to this norm
        :param use_dropout: create per-connection keep-prob variables plus ops
            to toggle dropout on/off at run time
        :param freeze_variables: variable names excluded from gradient updates
        :param feed_previous: probability of feeding the decoder its own output
        :param optimizer: optimizer name passed to `get_optimizers`
        :param decode_only: skip creation of gradient/update ops
        :param baseline_step: step variable for the baseline-loss optimizer
        """
        self.encoders = encoders
        self.decoders = decoders
        self.temperature = self.decoders[0].temperature
        self.pred_edits = self.decoders[0].pred_edits

        self.name = name

        self.learning_rate = learning_rate
        self.global_step = global_step
        self.baseline_step = baseline_step
        self.use_baseline = use_baseline

        self.max_output_len = [decoder.max_len for decoder in decoders]
        self.max_input_len = [encoder.max_len for encoder in encoders]
        self.len_normalization = len_normalization
        self.reverse_input = reverse_input

        dropout_on = []
        dropout_off = []

        if use_dropout:
            # dropout can be configured independently for each of these connections;
            # hoisted out of the loop (the original rebuilt this list per encoder/decoder)
            dropout_names = ['rnn_input', 'rnn_output', 'rnn_state', 'initial_state', 'word', 'input_layer',
                             'output', 'attn', 'deep_layer', 'inter_layer', 'embedding']

            for encoder_or_decoder in encoders + decoders:
                # fix: loop variable renamed (was `name`, which shadowed the `name` parameter)
                for dropout_name in dropout_names:
                    value = encoder_or_decoder.get(dropout_name + '_dropout')
                    var_name = dropout_name + '_keep_prob'
                    if not value:
                        encoder_or_decoder[var_name] = 1.0
                        continue
                    # keep-prob lives in a variable so dropout can be toggled without rebuilding the graph
                    var = tf.Variable(1 - value, trainable=False, name=var_name)
                    encoder_or_decoder[var_name] = var
                    dropout_on.append(var.assign(1.0 - value))
                    dropout_off.append(var.assign(1.0))

        # single ops that switch every dropout variable on or off
        self.dropout_on = tf.group(*dropout_on)
        self.dropout_off = tf.group(*dropout_off)

        self.feed_previous = tf.constant(feed_previous, dtype=tf.float32)
        self.feed_argmax = tf.constant(True, dtype=tf.bool)  # feed with argmax or sample from softmax
        self.training = tf.placeholder(dtype=tf.bool, shape=())

        # one input placeholder per encoder (float features when `binary`, token ids otherwise)
        self.encoder_inputs = []
        self.encoder_input_length = []
        for encoder in encoders:
            shape = [None, None, encoder.embedding_size] if encoder.binary else [None, None]
            dtype = tf.float32 if encoder.binary else tf.int32
            encoder_input = tf.placeholder(dtype=dtype, shape=shape, name='encoder_{}'.format(encoder.name))
            encoder_input_length = tf.placeholder(dtype=tf.int32, shape=[None],
                                                  name='encoder_input_length_{}'.format(encoder.name))
            self.encoder_inputs.append(encoder_input)
            self.encoder_input_length.append(encoder_input_length)

        # starts with BOS, and ends with EOS
        self.targets = tuple([
            tf.placeholder(tf.int32, shape=[None, None], name='target_{}'.format(decoder.name))
            for decoder in decoders
        ])
        self.rewards = tf.placeholder(tf.float32, shape=[None, None], name='rewards')

        # pick the graph-building function matching the requested configuration
        if reconstruction_decoders:
            architecture = models.reconstruction_encoder_decoder
        elif chained_encoders and self.pred_edits:
            architecture = models.chained_encoder_decoder    # no REINFORCE for now
        elif multi_task:
            architecture = models.multi_task_encoder_decoder
        else:
            architecture = models.encoder_decoder

        tensors = architecture(encoders, decoders, self.encoder_inputs, self.targets, self.feed_previous,
                               encoder_input_length=self.encoder_input_length, feed_argmax=self.feed_argmax,
                               rewards=self.rewards, use_baseline=use_baseline, training=self.training,
                               global_step=self.global_step, **kwargs)

        self.losses, self.outputs, self.attention_weights, self.samples, self.beam_fun, self.initial_data = tensors

        self.xent_loss, self.reinforce_loss, self.baseline_loss = self.losses
        self.loss = self.xent_loss   # main loss

        optimizers = self.get_optimizers(optimizer, learning_rate)

        if not decode_only:
            get_update_ops = functools.partial(self.get_update_op, opts=optimizers,
                                               max_gradient_norm=max_gradient_norm, freeze_variables=freeze_variables)

            self.update_ops = utils.AttrDict({
                'xent': get_update_ops(self.xent_loss, global_step=self.global_step),
                'reinforce': get_update_ops(self.reinforce_loss, global_step=self.global_step),
            })

            if use_baseline:
                # the baseline optimizer uses its own step counter so it doesn't advance `global_step`
                self.update_ops['baseline'] = get_update_ops(self.baseline_loss, global_step=self.baseline_step)

        self.models = [self]
        # greedy outputs presented as a beam of size 1 (with all-zero scores)
        self.beam_outputs = tf.expand_dims(tf.argmax(self.outputs[0], axis=2), axis=1)
        self.beam_scores = tf.zeros(shape=[tf.shape(self.beam_outputs)[0], 1])
        self.beam_size = tf.placeholder(shape=(), dtype=tf.int32)
Example #2
0
def main(args=None):
    """Entry point: parse command-line arguments, merge them with the YAML
    configuration, build the translation model, then run the requested action
    (train / decode / eval / align / save / save-embedding).

    :param args: optional argument list (defaults to sys.argv[1:] via argparse)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

        if args.learning_rate is not None:
            args.reset_learning_rate = True

        # command-line parameters have higher precedence than config file
        for k, v in vars(args).items():
            if v is not None:
                config[k] = v

        # set default values for parameters that are not defined
        for k, v in default_config.items():
            config.setdefault(k, v)

    if config.score_function:
        config.score_functions = evaluation.name_mapping[config.score_function]

    if args.crash_test:
        config.max_train_size = 0

    if not config.debug:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable TensorFlow's debugging logs
    decoding_mode = any(arg is not None for arg in (args.decode, args.eval, args.align))

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train or args.save or args.save_embedding, (
        'you need to specify at least one action (decode, eval, align, or train)')
    assert not (args.average and args.ensemble)

    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path, 'w') as dest_file:
            content = config_file.read()
            # rewrite `model_dir` so the copied config points at this model directory
            content = re.sub(r'model_dir:.*?\n', 'model_dir: {}\n'.format(config.model_dir), content,
                             flags=re.MULTILINE)
            dest_file.write(content)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory (for reproducibility)
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if args.train and not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename), arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n  {}'.format('\n  '.join(config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except Exception:  # fix: bare `except` also swallowed SystemExit/KeyboardInterrupt
        pass  # best effort: not a git repository, or git unavailable

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug('  {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    for task in tasks:
        # parameters undefined at the task level default to their global value
        for parameter, value in config.items():
            task.setdefault(parameter, value)

        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]

        # likewise, encoder/decoder parameters default to the task-level value
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

        if args.max_len:
            args.max_input_len = args.max_len
        if args.max_output_len:   # override decoder's max len
            task.decoders[0].max_len = args.max_output_len
        if args.max_input_len:    # override encoder's max len
            task.encoders[0].max_len = args.max_input_len

    config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

    # setting random seeds (draw them randomly when unspecified, but log them
    # so a run can be reproduced)
    if config.seed is None:
        config.seed = random.randrange(sys.maxsize)
    if config.tf_seed is None:
        config.tf_seed = random.randrange(sys.maxsize)
    utils.log('python random seed: {}'.format(config.seed))
    utils.log('tf random seed:     {}'.format(config.tf_seed))
    random.seed(config.seed)
    tf.set_random_seed(config.tf_seed)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0

    # hide other GPUs so that TensorFlow won't use memory on them
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(device_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(minval=-config.weight_scale, maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)

        # exempt from creating gradient ops
        config.decode_only = decoding_mode

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    # not counting parameters created by training algorithm (e.g. Adam)
    variables = [var for var in tf.global_variables() if not var.name.startswith('gradients')]
    utils.log('model parameters ({})'.format(len(variables)))
    parameter_count = 0
    for var in sorted(variables, key=lambda var: var.name):
        utils.log('  {} {}'.format(var.name, var.get_shape()))
        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        # average every global variable across the given sessions into main_sess
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        params = {'variable_mapping': config.variable_mapping, 'reverse_mapping': config.reverse_mapping,
                  'rnn_lm_model_dir': None, 'rnn_mt_model_dir': None,
                  'rnn_lm_cell_name': None, 'origin_model_ckpt': None}
        if config.ensemble and len(config.checkpoints) > 1:
            model.initialize(config.checkpoints, **params)
        elif config.average and len(config.checkpoints) > 1:
            # load each checkpoint in its own session, then average the weights
            model.initialize(reset=True)
            sessions = [tf.Session(config=tf_config) for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_, checkpoints=[checkpoint], **params)
            average_checkpoints(sess, sessions)
        elif not config.checkpoints and decoding_mode and os.path.isfile(best_checkpoint + '.index'):
            # fix: the original tested `best_checkpoint + '.index'` twice (duplicated condition)
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            model.initialize([best_checkpoint], **params)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(**config)

        if config.output is not None:
            dirname = os.path.dirname(config.output)
            if dirname:
                os.makedirs(dirname, exist_ok=True)

        try:
            if args.save:
                model.save()
            elif args.save_embedding:
                output_dir = "." if config.embedding_output_dir is None else config.embedding_output_dir
                model.save_embedding(output_dir)
            elif args.decode is not None:
                # NOTE(review): this tests `config.align`, not `args.align` — presumably it
                # enables alignment output during decoding whenever an align setting exists;
                # confirm whether `args.align` was intended
                if config.align is not None:
                    config.align = True
                model.decode(**config)
            elif args.eval is not None:
                model.evaluate(on_dev=False, **config)
            elif args.align is not None:
                model.align(**config)
            elif args.train:
                model.train(**config)
        except KeyboardInterrupt:
            sys.exit()
Example #3
0
    def __init__(self,
                 encoders,
                 decoders,
                 checkpoint_dir,
                 learning_rate,
                 learning_rate_decay_factor,
                 batch_size,
                 keep_best=1,
                 dev_prefix=None,
                 name=None,
                 ref_ext=None,
                 pred_edits=False,
                 dual_output=False,
                 binary=None,
                 truncate_lines=True,
                 ensemble=False,
                 checkpoints=None,
                 beam_size=1,
                 len_normalization=1,
                 lexicon=None,
                 debug=False,
                 **kwargs):
        """Set up data bookkeeping (extensions, length limits, vocabularies,
        filenames) and build the underlying Seq2SeqModel(s)."""
        self.batch_size = batch_size
        self.debug = debug

        # each component's extension falls back to its name; record its
        # character-level and binary settings
        for component in encoders + decoders:
            component.ext = component.ext or component.name
        self.character_level = {component.ext: component.character_level
                                for component in encoders + decoders}
        self.binary = [component.get('binary', False) for component in encoders + decoders]

        self.encoders, self.decoders = encoders, decoders
        self.char_output = decoders[0].character_level

        self.src_ext = [encoder.ext for encoder in encoders]
        self.trg_ext = [decoder.ext for decoder in decoders]
        self.extensions = self.src_ext + self.trg_ext

        self.ref_ext = ref_ext
        if ref_ext is not None:
            self.binary.append(False)

        self.pred_edits = pred_edits
        self.dual_output = dual_output
        self.dev_prefix = dev_prefix
        self.name = name

        self.max_input_len = [encoder.max_len for encoder in encoders]
        self.max_output_len = [decoder.max_len for decoder in decoders]
        self.beam_size = beam_size

        if truncate_lines:
            # seq2seq.get_batch will truncate long lines itself
            self.max_len = None
        else:
            # the line reader drops lines longer than these per-extension limits
            self.max_len = dict(zip(self.extensions, self.max_input_len + self.max_output_len))

        self.learning_rate = tf.Variable(learning_rate, trainable=False,
                                         name='learning_rate', dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        with tf.device('/cpu:0'):
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            self.baseline_step = tf.Variable(0, trainable=False, name='baseline_step')

        self.filenames = utils.get_filenames(extensions=self.extensions,
                                             dev_prefix=dev_prefix,
                                             name=name,
                                             ref_ext=ref_ext,
                                             binary=self.binary,
                                             **kwargs)
        utils.debug('reading vocabularies')
        self.vocabs = None
        self.src_vocab, self.trg_vocab = None, None
        self.read_vocab()

        for component, vocab in zip(encoders + decoders, self.vocabs):
            if not vocab:
                continue
            limit = component.vocab_size
            if limit:
                # shrink the vocabulary in place to the configured size
                vocab.reverse[:] = vocab.reverse[:limit]
                for token, token_id in list(vocab.vocab.items()):
                    if token_id >= limit:
                        del vocab.vocab[token]
            else:
                component.vocab_size = len(vocab.reverse)

        utils.debug('creating model')

        self.models = []
        if ensemble and checkpoints is not None:
            # one sub-model per checkpoint, each under its own variable scope
            for i, _ in enumerate(checkpoints, 1):
                with tf.variable_scope('model_{}'.format(i)):
                    self.models.append(Seq2SeqModel(encoders, decoders, self.learning_rate,
                                                    self.global_step, name=name,
                                                    pred_edits=pred_edits,
                                                    dual_output=dual_output,
                                                    baseline_step=self.baseline_step,
                                                    **kwargs))
            self.seq2seq_model = self.models[0]
        else:
            self.seq2seq_model = Seq2SeqModel(encoders, decoders, self.learning_rate,
                                              self.global_step, name=name,
                                              pred_edits=pred_edits,
                                              dual_output=dual_output,
                                              baseline_step=self.baseline_step,
                                              **kwargs)
            self.models.append(self.seq2seq_model)

        self.seq2seq_model.create_beam_op(self.models, len_normalization)

        self.batch_iterator = None
        self.dev_batches = None
        self.train_size = None
        self.saver = None
        self.keep_best = keep_best
        self.checkpoint_dir = checkpoint_dir
        self.epoch = None

        self.training = utils.AttrDict()  # used to keep track of training

        self.lexicon = None
        if lexicon:
            # lexicon file: one "source target" pair per line
            with open(lexicon) as lexicon_file:
                self.lexicon = dict(line.split() for line in lexicon_file)
Example #4
0
def main(args=None):
    """Entry point for the multi-task variant: parse arguments, merge with the
    YAML configuration, propagate parameter defaults down from base level to
    tasks and their encoders/decoders, build the model, then run the requested
    action (decode / eval / align / train).

    :param args: optional argument list (defaults to sys.argv[1:] via argparse)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

        if args.learning_rate is not None:
            args.reset_learning_rate = True

        # command-line parameters have higher precedence than config file
        for k, v in vars(args).items():
            if v is not None:
                config[k] = v

        # set default values for parameters that are not defined
        for k, v in default_config.items():
            config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    logger = utils.create_logger(config.log_file if args.train else None)
    logger.setLevel(logging_level)

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except Exception:  # fix: bare `except` also swallowed SystemExit/KeyboardInterrupt
        pass  # best effort: not a git repository, or git unavailable

    # list of encoder and decoder parameter names (each encoder and decoder can have a different value
    # for those parameters)
    model_parameters = [
        'cell_size', 'layers', 'vocab_size', 'embedding_size',
        'attention_filters', 'attention_filter_length', 'use_lstm',
        'time_pooling', 'attention_window_size', 'dynamic', 'binary',
        'character_level', 'bidir', 'load_embeddings', 'pooling_avg',
        'swap_memory', 'parallel_iterations', 'input_layers',
        'residual_connections', 'attn_size'
    ]
    # TODO: independent model dir for each task
    task_parameters = [
        'data_dir', 'train_prefix', 'dev_prefix', 'vocab_prefix', 'ratio',
        'lm_file', 'learning_rate', 'learning_rate_decay_factor',
        'max_input_len', 'max_output_len', 'encoders', 'decoder'
    ]

    # in case no task is defined (standard mono-task settings), define a "main" task
    config.setdefault('tasks', [{
        'encoders': config.encoders,
        'decoder': config.decoder,
        'name': 'main',
        'ratio': 1.0
    }])
    config.tasks = [utils.AttrDict(task) for task in config.tasks]

    for task in config.tasks:
        for parameter in task_parameters:
            task.setdefault(parameter, config.get(parameter))

        if isinstance(task.dev_prefix,
                      str):  # for back-compatibility with old config files
            task.dev_prefix = [task.dev_prefix]

        # convert dicts to AttrDicts for convenience
        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoder = utils.AttrDict(task.decoder)

        for encoder_or_decoder in task.encoders + [task.decoder]:
            # move parameters all the way up from base level to encoder/decoder level:
            # default values for encoder/decoder parameters can be defined at the task level and base level
            # default values for tasks can be defined at the base level
            for parameter in model_parameters:
                if parameter in encoder_or_decoder:
                    continue
                elif parameter in task:
                    encoder_or_decoder[parameter] = task[parameter]
                else:
                    encoder_or_decoder[parameter] = config.get(parameter)

    # log parameters (task/model parameters are only logged under 'tasks')
    utils.log('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        if k == 'tasks':
            utils.log('  {:<20}\n{}'.format(k, pformat(v)))
        elif k not in model_parameters and k not in task_parameters:
            utils.log('  {:<20} {}'.format(k, pformat(v)))

    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')
        # All parameters except recurrent connexions and attention parameters are initialized with this.
        # Recurrent connexions are initialized with orthogonal matrices, and the parameters of the attention model
        # with a standard deviation of 0.001
        if config.weight_scale:
            initializer = tf.random_normal_initializer(
                stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        decode_only = args.decode is not None or args.eval or args.align  # exempt from creating gradient ops
        model = MultiTaskModel(name='main',
                               checkpoint_dir=checkpoint_dir,
                               decode_only=decode_only,
                               **config)

    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log('  {} {}'.format(var.name, var.get_shape()))

        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {}'.format(parameter_count))

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(checkpoint_dir, 'best')

        if config.ensemble and (args.eval or args.decode is not None):
            # create one session for each model in the ensemble
            # NOTE(review): `sess` is deliberately rebound to a *list* of sessions here;
            # the decode/evaluate calls below are presumably expected to accept that list
            sess = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sess, config.checkpoints):
                model.initialize(sess_, [checkpoint], reset=True)
        elif (not config.checkpoints
              and (args.eval or args.decode is not None or args.align)
              and os.path.isfile(best_checkpoint + '.index')):
            # fix: the original tested `best_checkpoint + '.index'` twice (duplicated condition)
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            model.initialize(sess, [best_checkpoint], reset=True)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            eval_output = os.path.join(config.model_dir, 'eval')
            try:
                model.train(sess, eval_output=eval_output, **config)
            except KeyboardInterrupt:
                # save a checkpoint before exiting so training can resume
                utils.log('exiting...')
                model.save(sess)
                sys.exit()
Example #5
0
    def __init__(self,
                 encoders,
                 decoders,
                 checkpoint_dir,
                 learning_rate,
                 learning_rate_decay_factor,
                 batch_size,
                 keep_best=1,
                 dev_prefix=None,
                 score_function='corpus_scores',
                 name=None,
                 ref_ext=None,
                 pred_edits=False,
                 dual_output=False,
                 binary=None,
                 **kwargs):
        """Set up data bookkeeping (extensions, length limits, vocabularies,
        filenames) and build the underlying Seq2SeqModel."""
        self.batch_size = batch_size

        # each component's extension falls back to its name; record its
        # character-level and binary settings
        for component in encoders + decoders:
            component.ext = component.ext or component.name
        self.character_level = {component.ext: component.character_level
                                for component in encoders + decoders}
        self.binary = [component.get('binary', False) for component in encoders + decoders]

        self.char_output = decoders[0].character_level

        self.src_ext = [encoder.ext for encoder in encoders]
        self.trg_ext = [decoder.ext for decoder in decoders]
        self.extensions = self.src_ext + self.trg_ext

        self.ref_ext = ref_ext
        if ref_ext is not None:
            self.binary.append(False)

        self.pred_edits = pred_edits
        self.dual_output = dual_output
        self.dev_prefix = dev_prefix
        self.name = name

        # per-extension length limits for the line readers
        self.max_input_len = [encoder.max_len for encoder in encoders]
        self.max_output_len = [decoder.max_len for decoder in decoders]
        self.max_len = dict(zip(self.extensions, self.max_input_len + self.max_output_len))

        self.learning_rate = tf.Variable(learning_rate, trainable=False,
                                         name='learning_rate', dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        with tf.device('/cpu:0'):
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            self.baseline_step = tf.Variable(0, trainable=False, name='baseline_step')

        self.filenames = utils.get_filenames(extensions=self.extensions,
                                             dev_prefix=dev_prefix,
                                             name=name,
                                             ref_ext=ref_ext,
                                             binary=self.binary,
                                             **kwargs)
        utils.debug('reading vocabularies')
        self.vocabs = None
        self.src_vocab, self.trg_vocab = None, None
        self.read_vocab()

        for component, vocab in zip(encoders + decoders, self.vocabs):
            if vocab:
                component.vocab_size = len(vocab.reverse)

        utils.debug('creating model')
        self.seq2seq_model = Seq2SeqModel(encoders,
                                          decoders,
                                          self.learning_rate,
                                          self.global_step,
                                          name=name,
                                          pred_edits=pred_edits,
                                          dual_output=dual_output,
                                          baseline_step=self.baseline_step,
                                          **kwargs)

        self.batch_iterator = None
        self.dev_batches = None
        self.train_size = None
        self.saver = None
        self.keep_best = keep_best
        self.checkpoint_dir = checkpoint_dir

        self.training = utils.AttrDict()  # used to keep track of training

        # "reversed" score functions are lower-is-better; anything without the
        # attribute defaults to higher-is-better
        try:
            self.reversed_scores = getattr(evaluation, score_function).reversed
        except AttributeError:
            self.reversed_scores = False
Example #6
0
def main(args=None):
    """Command-line entry point.

    Parses arguments and the YAML configuration, prepares the model
    directory (config/code snapshots, logging), builds the translation
    model on the requested device, then runs exactly one of the actions:
    decode, eval, align, or train.

    :param args: optional list of command-line arguments; defaults to
      sys.argv when None (standard argparse behavior)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

        # an explicit learning rate on the command line implies a reset
        if args.learning_rate is not None:
            args.reset_learning_rate = True

        # command-line parameters have higher precedence than config file
        for k, v in vars(args).items():
            if v is not None:
                config[k] = v

        # set default values for parameters that are not defined
        for k, v in default_config.items():
            config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )
    assert not (args.avg_checkpoints and args.ensemble)

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if not os.path.exists(config_path):
        shutil.copy(args.config, config_path)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # snapshot the source code into the model directory, for reproducibility
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename),
                            arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n  {}'.format('\n  '.join(
        config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash (purely informational, best effort)
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except (OSError, subprocess.CalledProcessError):
        # git not installed, or not a git checkout: skip silently.
        # (Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        pass

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug('  {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    # multi-task setups define a list of tasks; otherwise the whole config
    # acts as the single task
    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    for task in tasks:
        # task-level parameters default to the global configuration
        for parameter, value in config.items():
            task.setdefault(parameter, value)

        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]

        # encoder/decoder-level parameters default to the task configuration
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(
                    minval=-config.weight_scale, maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(
                    stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)

        config.decode_only = args.decode is not None or args.eval or args.align  # exempt from creating gradient ops

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log('  {} {}'.format(var.name, var.get_shape()))

        if not var.name.startswith(
                'gradients'
        ):  # not counting parameters created by training algorithm (e.g. Adam)
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        # overwrite each variable in `main_sess` with the element-wise
        # average of its values across all sessions
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        if ((config.ensemble or config.avg_checkpoints)
                and (args.eval or args.decode is not None)
                and len(config.checkpoints) > 1):
            # create one session for each model in the ensemble
            sessions = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess_, [checkpoint])

            if config.ensemble:
                sess = sessions
            else:
                sess = sessions[0]
                average_checkpoints(sess, sessions)
        elif (not config.checkpoints
              and (args.eval or args.decode is not None or args.align)
              and os.path.isfile(best_checkpoint + '.index')):
            # NOTE(review): the original tested `best_checkpoint + '.index'`
            # twice; the duplicate is removed here. The second test was
            # presumably meant for another checkpoint format (e.g. '.meta')
            # -- TODO confirm before relying on V1 checkpoints here.
            # In decoding and evaluation mode, unless specified otherwise
            # (by `checkpoints`), try to load the best checkpoint.
            model.initialize(sess, [best_checkpoint])
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        # exactly one action runs (enforced by the asserts above)
        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            try:
                model.train(sess=sess, **config)
            except (KeyboardInterrupt, utils.FinishedTrainingException):
                # save a final checkpoint before exiting
                utils.log('exiting...')
                model.save(sess)
                sys.exit()
# Example #7
# File: models.py -- Project: zabin10/seq2seq-2
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      encoders,
                      decoder,
                      encoder_input_length,
                      feed_previous=0.0,
                      align_encoder_id=0,
                      feed_argmax=True,
                      **kwargs):
    """
    Attentional RNN decoder, implemented as a symbolic `tf.while_loop` over
    the output time steps.

    :param decoder_inputs: int32 tensor of shape (batch_size, output_length)
    :param initial_state: initial state of the decoder (usually the final state of the encoder),
      as a float32 tensor of shape (batch_size, initial_state_size). This state is mapped to the
      correct state size for the decoder.
    :param attention_states: list of tensors of shape (batch_size, input_length, encoder_cell_size),
      the hidden states of the encoder(s) (one tensor for each encoder).
    :param encoders: configuration of the encoders
    :param decoder: configuration of the decoder
    :param encoder_input_length: list of int32 tensors of shape (batch_size,), tells for each encoder,
     the true length of each sequence in the batch (sequences in the same batch are padded to all have the same
     length).
    :param feed_previous: scalar tensor corresponding to the probability to use previous decoder output
      instead of the ground truth as input for the decoder (1 when decoding, between 0 and 1 when training)
    :param feed_argmax: boolean tensor, when True the greedy decoder outputs the word with the highest
    probability (argmax). When False, it samples a word from the probability distribution (softmax).
    :param align_encoder_id: outputs attention weights for this encoder. Also used when predicting edit operations
    (pred_edits), to specify which encoder reads the sequence to post-edit (MT).

    :return:
      outputs of the decoder as a tensor of shape (batch_size, output_length, decoder_cell_size)
      attention weights as a tensor of shape (output_length, encoders, batch_size, input_length)
    """
    # the maxout pooling below halves the dimension, hence the even-size requirement
    assert not decoder.pred_maxout_layer or decoder.cell_size % 2 == 0, 'cell size must be a multiple of 2'

    embedding_shape = [decoder.vocab_size, decoder.embedding_size]

    # optionally place the (potentially large) embedding matrix in CPU memory
    device = '/cpu:0' if decoder.embeddings_on_cpu else None
    with tf.device(device):
        embedding = get_variable('embedding_{}'.format(decoder.name),
                                 shape=embedding_shape)

    def embed(input_):
        # map symbol ids to their embedding vectors
        return tf.nn.embedding_lookup(embedding, input_)

    def get_cell(input_size=None, reuse=False, dropout=True):
        # build the (possibly multi-layer) recurrent cell of the decoder
        cells = []

        for _ in range(decoder.layers):
            if decoder.use_lstm:
                cell = CellWrapper(
                    BasicLSTMCell(decoder.cell_size, reuse=reuse))
            else:
                cell = GRUCell(decoder.cell_size, reuse=reuse)

            if dropout and decoder.use_dropout:
                cell = DropoutWrapper(
                    cell,
                    input_keep_prob=decoder.rnn_input_keep_prob,
                    output_keep_prob=decoder.rnn_output_keep_prob,
                    state_keep_prob=decoder.rnn_state_keep_prob,
                    variational_recurrent=decoder.pervasive_dropout,
                    dtype=tf.float32,
                    input_size=input_size)
            cells.append(cell)

        if len(cells) == 1:
            return cells[0]
        else:
            return CellWrapper(MultiRNNCell(cells))

    def look(state, input_, prev_weights=None, pos=None):
        # run the attention mechanism(s); returns the context vector and the
        # attention weights of encoder `align_encoder_id`
        if not decoder.attn_use_lstm_state:
            # attend over the output part of the state only
            state = state[:, -cell_output_size:]
        # previous weights / positions are only meaningful for the aligned encoder
        prev_weights_ = [
            prev_weights if i == align_encoder_id else None
            for i in range(len(encoders))
        ]
        pos_ = None
        if decoder.pred_edits:
            pos_ = [
                pos if i == align_encoder_id else None
                for i in range(len(encoders))
            ]
        if decoder.attn_prev_word:
            # condition the attention on the previously generated word
            state = tf.concat([state, input_], axis=1)

        parameters = dict(hidden_states=attention_states,
                          encoder_input_length=encoder_input_length,
                          encoders=encoders,
                          aggregation_method=decoder.aggregation_method)
        context, new_weights = multi_attention(state,
                                               pos=pos_,
                                               prev_weights=prev_weights_,
                                               **parameters)
        return context, new_weights[align_encoder_id]

    def update(state, input_, context=None, symbol=None):
        # advance the RNN state by one step
        if context is not None and decoder.rnn_feed_attn:
            input_ = tf.concat([input_, context], axis=1)
        input_size = input_.get_shape()[1]

        try:
            _, new_state = get_cell(input_size)(input_, state)
        except ValueError:  # auto_reuse doesn't work with LSTM cells
            _, new_state = get_cell(input_size, reuse=True)(input_, state)

        if decoder.skip_update and decoder.pred_edits and symbol is not None:
            # for DEL edit operations, keep the previous state unchanged
            is_del = tf.equal(symbol, utils.DEL_ID)
            new_state = tf.where(is_del, state, new_state)
        return new_state

    def update_pos(pos, symbol, max_pos=None):
        # advance the read position in the source sentence (edit-operation
        # decoding): KEEP and DEL consume a source token, INS does not
        if not decoder.pred_edits:
            return pos

        is_keep = tf.equal(symbol, utils.KEEP_ID)
        is_del = tf.equal(symbol, utils.DEL_ID)
        is_not_ins = tf.logical_or(is_keep, is_del)
        pos += tf.to_float(is_not_ins)
        if max_pos is not None:
            # clamp to the true source length
            pos = tf.minimum(pos, tf.to_float(max_pos))
        return pos

    def generate(state, input_, context):
        # compute the (pre-softmax) output logits for one time step
        if not decoder.pred_use_lstm_state:
            state = state[:, -cell_output_size:]

        projection_input = [state, context]
        if decoder.use_previous_word:
            projection_input.insert(1, input_)  # for back-compatibility

        output_ = tf.concat(projection_input, axis=1)

        if decoder.pred_deep_layer:
            output_ = dense(output_,
                            decoder.embedding_size,
                            activation=tf.tanh,
                            use_bias=True,
                            name='deep_output')
        else:
            if decoder.pred_maxout_layer:
                # maxout: linear projection, then max-pool over pairs of
                # units (halves the dimension)
                output_ = dense(output_,
                                decoder.cell_size,
                                use_bias=False,
                                name='maxout')
                output_ = tf.nn.pool(tf.expand_dims(output_, axis=2),
                                     window_shape=[2],
                                     pooling_type='MAX',
                                     padding='SAME',
                                     strides=[2])
                output_ = tf.squeeze(output_, axis=2)

            if decoder.pred_embed_proj:
                # intermediate projection to embedding size (before projecting to vocabulary size)
                # this is useful to reduce the number of parameters, and
                # to use the output embeddings for output projection (tie_embeddings parameter)
                output_ = dense(output_,
                                decoder.embedding_size,
                                use_bias=False,
                                name='softmax0')

        if decoder.tie_embeddings and (decoder.pred_embed_proj
                                       or decoder.pred_deep_layer):
            # reuse the input embedding matrix as the output projection
            bias = get_variable('softmax1/bias', shape=[decoder.vocab_size])
            output_ = tf.matmul(output_, tf.transpose(embedding)) + bias
        else:
            output_ = dense(output_,
                            output_size,
                            use_bias=True,
                            name='softmax1')
        return output_

    input_shape = tf.shape(decoder_inputs)
    batch_size = input_shape[0]
    time_steps = input_shape[1]

    output_size = decoder.vocab_size

    # these throwaway cells are only used to query the state/output sizes
    state_size = get_cell(dropout=False).state_size
    cell_output_size = get_cell(dropout=False).output_size

    time = tf.constant(0, dtype=tf.int32, name='time')

    # per-time-step accumulators for the while loop below
    outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
    samples = tf.TensorArray(dtype=tf.int64, size=time_steps)
    inputs = tf.TensorArray(dtype=tf.int64,
                            size=time_steps,
                            clear_after_read=False).unstack(
                                tf.to_int64(
                                    tf.transpose(decoder_inputs, perm=(1, 0))))
    states = tf.TensorArray(dtype=tf.float32, size=time_steps)
    weights = tf.TensorArray(dtype=tf.float32, size=time_steps)
    attns = tf.TensorArray(dtype=tf.float32, size=time_steps)

    initial_symbol = inputs.read(0)  # first symbol is BOS
    initial_input = embed(initial_symbol)
    initial_pos = tf.zeros([batch_size], tf.float32)
    initial_weights = tf.zeros(
        tf.shape(attention_states[align_encoder_id])[:2])

    if decoder.use_dropout:
        initial_state = tf.nn.dropout(
            initial_state, keep_prob=decoder.initial_state_keep_prob)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        # map the encoder's final state to the decoder's state size
        initial_state = dense(initial_state,
                              state_size,
                              use_bias=True,
                              name='initial_state_projection',
                              activation=tf.nn.tanh)

    if decoder.update_first and not decoder.rnn_feed_attn and not decoder.conditional_rnn:
        initial_state = update(initial_state,
                               initial_input,
                               context=None,
                               symbol=None)

    # pack (state, pos, weights) into one flat tensor: beam-search code
    # manipulates the loop state through this representation (beam_tensors)
    initial_data = tf.concat(
        [initial_state,
         tf.expand_dims(initial_pos, axis=1), initial_weights],
        axis=1)
    initial_state, initial_pos, initial_weights = tf.split(initial_data,
                                                           [state_size, 1, -1],
                                                           axis=1)
    initial_state.set_shape([None, state_size])
    initial_pos = initial_pos[:, 0]

    def _time_step(time, input_, input_symbol, pos, state, outputs, states,
                   weights, attns, prev_weights, samples):
        # one decoding step: (optionally) update the state, attend, generate
        # a symbol, then append everything to the TensorArrays
        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_1'):
                state = update(state, input_)

        context, new_weights = look(state,
                                    input_,
                                    pos=pos,
                                    prev_weights=prev_weights)

        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_2'):
                state = update(state, context)
        elif not decoder.generate_first:
            state = update(state, input_, context, input_symbol)

        output_ = generate(state, input_, context)

        # next input symbol: ground truth (with probability 1 - feed_previous,
        # i.e. scheduled sampling), greedy argmax, or a sampled symbol
        argmax = lambda: tf.argmax(output_, 1)
        target = lambda: inputs.read(time + 1)
        softmax = lambda: tf.squeeze(tf.multinomial(
            tf.log(tf.nn.softmax(output_)), num_samples=1),
                                     axis=1)

        predicted_symbol = tf.case(
            [(tf.logical_and(time < time_steps - 1,
                             tf.random_uniform([]) >= feed_previous), target),
             (tf.logical_not(feed_argmax), softmax)],
            default=argmax)  # default case is useful for beam-search
        predicted_symbol.set_shape([None])
        predicted_symbol = tf.stop_gradient(predicted_symbol)
        samples = samples.write(time, predicted_symbol)

        input_ = embed(predicted_symbol)
        pos = update_pos(pos, predicted_symbol,
                         encoder_input_length[align_encoder_id])

        attns = attns.write(time, context)
        weights = weights.write(time, new_weights)
        states = states.write(time, state)
        outputs = outputs.write(time, output_)

        if not decoder.conditional_rnn and decoder.generate_first:
            state = update(state, input_, context, predicted_symbol)

        return (time + 1, input_, predicted_symbol, pos, state, outputs,
                states, weights, attns, new_weights, samples)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        # NOTE(review): `loop_vars` passes (outputs, weights, states) where
        # `_time_step` expects (outputs, states, weights). This appears
        # harmless because the swapped entries are identical empty
        # TensorArrays and the unpacking below follows the body's return
        # order -- but confirm before reordering any loop variables.
        _, _, _, new_pos, new_state, outputs, states, weights, attns, new_weights, samples = tf.while_loop(
            cond=lambda time, *_: time < time_steps,
            body=_time_step,
            loop_vars=(time, initial_input, initial_symbol, initial_pos,
                       initial_state, outputs, weights, states, attns,
                       initial_weights, samples),
            parallel_iterations=decoder.parallel_iterations,
            swap_memory=decoder.swap_memory)

    outputs = outputs.stack()
    weights = weights.stack()  # batch_size, encoders, output time, input time
    states = states.stack()
    attns = attns.stack()
    samples = samples.stack()

    # flat loop state after the last step, paired with `initial_data` for
    # beam-search state management
    new_data = tf.concat(
        [new_state, tf.expand_dims(new_pos, axis=1), new_weights], axis=1)
    beam_tensors = utils.AttrDict(data=initial_data, new_data=new_data)

    # put batch_size as first dimension
    outputs = tf.transpose(outputs, perm=(1, 0, 2))
    weights = tf.transpose(weights[1:], perm=(1, 0, 2))
    states = tf.transpose(states, perm=(1, 0, 2))
    attns = tf.transpose(attns, perm=(1, 0, 2))
    samples = tf.transpose(samples)

    return outputs, weights, states, attns, beam_tensors, samples