def __init__(self, encoders, decoders, learning_rate, global_step, max_gradient_norm, use_dropout=False,
             freeze_variables=None, feed_previous=0.0, optimizer='sgd', decode_only=False,
             len_normalization=1.0, name=None, chained_encoders=False, baseline_step=None,
             use_baseline=True, reverse_input=False, reconstruction_decoders=False, multi_task=False,
             **kwargs):
    """Build the TensorFlow graph for a (possibly multi-encoder/multi-decoder) seq2seq model.

    :param encoders: list of encoder configurations (AttrDict-like, one per input sequence)
    :param decoders: list of decoder configurations (AttrDict-like, one per output sequence)
    :param learning_rate: learning rate (tensor or float) passed to the optimizers
    :param global_step: variable incremented by the xent/reinforce update ops
    :param max_gradient_norm: gradient clipping threshold used by the update ops
    :param use_dropout: if True, create keep-prob variables plus ops to toggle dropout on/off
    :param freeze_variables: variable name patterns excluded from training
    :param feed_previous: probability of feeding the decoder its own previous output
    :param optimizer: name of the optimizer ('sgd' by default)
    :param decode_only: if True, skip creation of gradient/update ops
    :param baseline_step: step variable for the REINFORCE baseline update op
    :param use_baseline: if True, also build the baseline loss update op
    :param chained_encoders / reconstruction_decoders / multi_task: select the graph architecture
    """
    self.encoders = encoders
    self.decoders = decoders
    self.temperature = self.decoders[0].temperature
    self.pred_edits = decoders[0].pred_edits
    self.name = name
    self.learning_rate = learning_rate
    self.global_step = global_step
    self.baseline_step = baseline_step
    self.use_baseline = use_baseline
    self.max_output_len = [decoder.max_len for decoder in decoders]
    self.max_input_len = [encoder.max_len for encoder in encoders]
    self.len_normalization = len_normalization
    self.reverse_input = reverse_input

    # For each encoder/decoder and each dropout site, create a non-trainable keep-prob
    # variable plus assign ops that switch dropout on (training) or off (inference).
    dropout_on = []
    dropout_off = []
    if use_dropout:
        # invariant list of dropout sites, hoisted out of the loop
        # (was rebuilt per encoder/decoder, and its loop variable shadowed the `name` parameter)
        dropout_sites = ['rnn_input', 'rnn_output', 'rnn_state', 'initial_state', 'word', 'input_layer',
                         'output', 'attn', 'deep_layer', 'inter_layer', 'embedding']
        for encoder_or_decoder in encoders + decoders:
            for site in dropout_sites:
                value = encoder_or_decoder.get(site + '_dropout')
                var_name = site + '_keep_prob'
                if not value:
                    # no dropout configured for this site: constant keep-prob of 1
                    encoder_or_decoder[var_name] = 1.0
                    continue
                var = tf.Variable(1 - value, trainable=False, name=var_name)
                encoder_or_decoder[var_name] = var
                dropout_on.append(var.assign(1.0 - value))
                dropout_off.append(var.assign(1.0))

    self.dropout_on = tf.group(*dropout_on)
    self.dropout_off = tf.group(*dropout_off)

    self.feed_previous = tf.constant(feed_previous, dtype=tf.float32)
    self.feed_argmax = tf.constant(True, dtype=tf.bool)  # feed with argmax or sample from softmax
    self.training = tf.placeholder(dtype=tf.bool, shape=())

    # one input placeholder (+ sequence-length placeholder) per encoder;
    # binary encoders take float features instead of token ids
    self.encoder_inputs = []
    self.encoder_input_length = []
    for encoder in encoders:
        shape = [None, None, encoder.embedding_size] if encoder.binary else [None, None]
        dtype = tf.float32 if encoder.binary else tf.int32
        encoder_input = tf.placeholder(dtype=dtype, shape=shape,
                                       name='encoder_{}'.format(encoder.name))
        encoder_input_length = tf.placeholder(dtype=tf.int32, shape=[None],
                                              name='encoder_input_length_{}'.format(encoder.name))
        self.encoder_inputs.append(encoder_input)
        self.encoder_input_length.append(encoder_input_length)

    # starts with BOS, and ends with EOS
    self.targets = tuple([
        tf.placeholder(tf.int32, shape=[None, None], name='target_{}'.format(decoder.name))
        for decoder in decoders
    ])
    self.rewards = tf.placeholder(tf.float32, shape=[None, None], name='rewards')

    # pick the graph-building function according to the requested architecture
    if reconstruction_decoders:
        architecture = models.reconstruction_encoder_decoder
    elif chained_encoders and self.pred_edits:
        architecture = models.chained_encoder_decoder  # no REINFORCE for now
    elif multi_task:
        architecture = models.multi_task_encoder_decoder
    else:
        architecture = models.encoder_decoder

    tensors = architecture(encoders, decoders, self.encoder_inputs, self.targets, self.feed_previous,
                           encoder_input_length=self.encoder_input_length,
                           feed_argmax=self.feed_argmax, rewards=self.rewards,
                           use_baseline=use_baseline, training=self.training,
                           global_step=self.global_step, **kwargs)

    self.losses, self.outputs, self.attention_weights, self.samples, self.beam_fun, self.initial_data = tensors
    self.xent_loss, self.reinforce_loss, self.baseline_loss = self.losses
    self.loss = self.xent_loss  # main loss

    optimizers = self.get_optimizers(optimizer, learning_rate)

    if not decode_only:
        # build one update op per loss; the baseline op advances its own step counter
        get_update_ops = functools.partial(self.get_update_op, opts=optimizers,
                                           max_gradient_norm=max_gradient_norm,
                                           freeze_variables=freeze_variables)
        self.update_ops = utils.AttrDict({
            'xent': get_update_ops(self.xent_loss, global_step=self.global_step),
            'reinforce': get_update_ops(self.reinforce_loss, global_step=self.global_step),
        })
        if use_baseline:
            self.update_ops['baseline'] = get_update_ops(self.baseline_loss,
                                                         global_step=self.baseline_step)

    self.models = [self]
    # default "beam" of size 1: greedy argmax outputs with zero scores
    self.beam_outputs = tf.expand_dims(tf.argmax(self.outputs[0], axis=2), axis=1)
    self.beam_scores = tf.zeros(shape=[tf.shape(self.beam_outputs)[0], 1])
    self.beam_size = tf.placeholder(shape=(), dtype=tf.int32)
def main(args=None):
    """Entry point: parse command line + YAML configs, build the model, then run the
    requested action (save / save-embedding / decode / eval / align / train).

    :param args: optional argument list (defaults to sys.argv via argparse)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))
    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v
    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    if config.score_function:
        config.score_functions = evaluation.name_mapping[config.score_function]
    if args.crash_test:
        config.max_train_size = 0
    if not config.debug:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable TensorFlow's debugging logs

    decoding_mode = any(arg is not None for arg in (args.decode, args.eval, args.align))

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train or args.save or args.save_embedding, (
        'you need to specify at least one action (decode, eval, align, or train)')
    assert not (args.average and args.ensemble)

    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory, rewriting its model_dir entry
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path, 'w') as dest_file:
            content = config_file.read()
            content = re.sub(r'model_dir:.*?\n', 'model_dir: {}\n'.format(config.model_dir),
                             content, flags=re.MULTILINE)
            dest_file.write(content)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if args.train and not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename), arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n {}'.format('\n '.join(config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash (best effort: not a git checkout, or git missing, is fine)
        commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except (subprocess.CalledProcessError, OSError):
        pass

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug(' {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    # propagate defaults: config -> task -> encoder/decoder
    for task in tasks:
        for parameter, value in config.items():
            task.setdefault(parameter, value)
        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)
        if args.max_len:
            args.max_input_len = args.max_len
        if args.max_output_len:  # override decoder's max len
            task.decoders[0].max_len = args.max_output_len
        if args.max_input_len:  # override encoder's max len
            task.encoders[0].max_len = args.max_input_len

    config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

    # setting random seeds
    if config.seed is None:
        config.seed = random.randrange(sys.maxsize)
    if config.tf_seed is None:
        config.tf_seed = random.randrange(sys.maxsize)
    utils.log('python random seed: {}'.format(config.seed))
    utils.log('tf random seed: {}'.format(config.tf_seed))
    random.seed(config.seed)
    tf.set_random_seed(config.tf_seed)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0

    # hide other GPUs so that TensorFlow won't use memory on them
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(device_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(minval=-config.weight_scale,
                                                            maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        config.decode_only = decoding_mode  # exempt from creating gradient ops

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    # not counting parameters created by training algorithm (e.g. Adam)
    variables = [var for var in tf.global_variables() if not var.name.startswith('gradients')]
    utils.log('model parameters ({})'.format(len(variables)))
    parameter_count = 0
    for var in sorted(variables, key=lambda var: var.name):
        utils.log(' {} {}'.format(var.name, var.get_shape()))
        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        # assign, in main_sess, each variable to the mean of its values across sessions
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')
        params = {'variable_mapping': config.variable_mapping,
                  'reverse_mapping': config.reverse_mapping,
                  'rnn_lm_model_dir': None, 'rnn_mt_model_dir': None,
                  'rnn_lm_cell_name': None, 'origin_model_ckpt': None}

        if config.ensemble and len(config.checkpoints) > 1:
            model.initialize(config.checkpoints, **params)
        elif config.average and len(config.checkpoints) > 1:
            # load each checkpoint in its own session, then average the weights
            model.initialize(reset=True)
            sessions = [tf.Session(config=tf_config) for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_, checkpoints=[checkpoint], **params)
            average_checkpoints(sess, sessions)
        elif (not config.checkpoints and decoding_mode and
              os.path.isfile(best_checkpoint + '.index')):
            # BUGFIX: condition previously tested '.index' twice (copy-paste duplicate;
            # the second test was presumably meant for the legacy '.meta' file)
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            model.initialize([best_checkpoint], **params)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(**config)

        if config.output is not None:
            dirname = os.path.dirname(config.output)
            if dirname:
                os.makedirs(dirname, exist_ok=True)

        try:
            if args.save:
                model.save()
            elif args.save_embedding:
                if config.embedding_output_dir is None:
                    output_dir = "."
                else:
                    output_dir = config.embedding_output_dir
                model.save_embedding(output_dir)
            elif args.decode is not None:
                if config.align is not None:
                    config.align = True
                model.decode(**config)
            elif args.eval is not None:
                model.evaluate(on_dev=False, **config)
            elif args.align is not None:
                model.align(**config)
            elif args.train:
                model.train(**config)
        except KeyboardInterrupt:
            sys.exit()
def __init__(self, encoders, decoders, checkpoint_dir, learning_rate, learning_rate_decay_factor,
             batch_size, keep_best=1, dev_prefix=None, name=None, ref_ext=None, pred_edits=False,
             dual_output=False, binary=None, truncate_lines=True, ensemble=False, checkpoints=None,
             beam_size=1, len_normalization=1, lexicon=None, debug=False, **kwargs):
    """Set up data handling (extensions, vocabularies, filenames) and build one or more
    Seq2SeqModel graphs (several when `ensemble` is set with a list of `checkpoints`).

    :param encoders: list of encoder configurations (AttrDict-like)
    :param decoders: list of decoder configurations (AttrDict-like)
    :param checkpoint_dir: directory where checkpoints are saved/loaded
    :param learning_rate: initial learning rate (wrapped into a tf.Variable)
    :param learning_rate_decay_factor: multiplier applied by the decay op
    :param batch_size: training batch size
    :param keep_best: number of best checkpoints to keep
    :param ref_ext: extension of an optional extra reference file (always non-binary)
    :param truncate_lines: if True, long lines are truncated at batch time; otherwise dropped
    :param ensemble: if True (with `checkpoints`), build one model per checkpoint
    :param lexicon: optional path to a word-mapping file (two whitespace-separated columns)
    """
    self.batch_size = batch_size
    self.character_level = {}
    self.binary = []
    self.debug = debug

    # each encoder/decoder gets a file extension (defaults to its name);
    # record per-extension character-level and binary flags
    for encoder_or_decoder in encoders + decoders:
        encoder_or_decoder.ext = encoder_or_decoder.ext or encoder_or_decoder.name
        self.character_level[encoder_or_decoder.ext] = encoder_or_decoder.character_level
        self.binary.append(encoder_or_decoder.get('binary', False))

    self.encoders, self.decoders = encoders, decoders

    self.char_output = decoders[0].character_level

    self.src_ext = [encoder.ext for encoder in encoders]
    self.trg_ext = [decoder.ext for decoder in decoders]
    self.extensions = self.src_ext + self.trg_ext
    self.ref_ext = ref_ext
    if self.ref_ext is not None:
        self.binary.append(False)  # the reference file is always plain text

    self.pred_edits = pred_edits
    self.dual_output = dual_output
    self.dev_prefix = dev_prefix
    self.name = name

    self.max_input_len = [encoder.max_len for encoder in encoders]
    self.max_output_len = [decoder.max_len for decoder in decoders]
    self.beam_size = beam_size

    if truncate_lines:
        self.max_len = None  # we let seq2seq.get_batch handle long lines (by truncating them)
    else:  # the line reader will drop lines that are too long
        self.max_len = dict(zip(self.extensions, self.max_input_len + self.max_output_len))

    self.learning_rate = tf.Variable(learning_rate, trainable=False, name='learning_rate',
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    # step counters are kept on CPU
    with tf.device('/cpu:0'):
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.baseline_step = tf.Variable(0, trainable=False, name='baseline_step')

    self.filenames = utils.get_filenames(extensions=self.extensions, dev_prefix=dev_prefix,
                                         name=name, ref_ext=ref_ext, binary=self.binary, **kwargs)
    utils.debug('reading vocabularies')
    self.vocabs = None
    self.src_vocab, self.trg_vocab = None, None
    self.read_vocab()

    # reconcile configured vocab sizes with the vocabularies read from disk
    for encoder_or_decoder, vocab in zip(encoders + decoders, self.vocabs):
        if vocab:
            if encoder_or_decoder.vocab_size:  # reduce vocab size
                vocab.reverse[:] = vocab.reverse[:encoder_or_decoder.vocab_size]
                for token, token_id in list(vocab.vocab.items()):
                    if token_id >= encoder_or_decoder.vocab_size:
                        del vocab.vocab[token]
            else:
                # no size configured: take the size of the vocabulary file
                encoder_or_decoder.vocab_size = len(vocab.reverse)

    utils.debug('creating model')

    self.models = []
    if ensemble and checkpoints is not None:
        # one model per checkpoint, each in its own variable scope
        for i, _ in enumerate(checkpoints, 1):
            with tf.variable_scope('model_{}'.format(i)):
                model = Seq2SeqModel(encoders, decoders, self.learning_rate, self.global_step,
                                     name=name, pred_edits=pred_edits, dual_output=dual_output,
                                     baseline_step=self.baseline_step, **kwargs)
                self.models.append(model)
        self.seq2seq_model = self.models[0]
    else:
        self.seq2seq_model = Seq2SeqModel(encoders, decoders, self.learning_rate, self.global_step,
                                          name=name, pred_edits=pred_edits, dual_output=dual_output,
                                          baseline_step=self.baseline_step, **kwargs)
        self.models.append(self.seq2seq_model)

    self.seq2seq_model.create_beam_op(self.models, len_normalization)

    self.batch_iterator = None
    self.dev_batches = None
    self.train_size = None
    self.saver = None
    self.keep_best = keep_best
    self.checkpoint_dir = checkpoint_dir
    self.epoch = None

    self.training = utils.AttrDict()  # used to keep track of training

    if lexicon:
        # lexicon file: one "source target" pair per line
        with open(lexicon) as lexicon_file:
            self.lexicon = dict(line.split() for line in lexicon_file)
    else:
        self.lexicon = None
def main(args=None):
    """Entry point (multi-task version): parse command line + YAML configs, build a
    MultiTaskModel, then run the requested action (decode / eval / align / train).

    :param args: optional argument list (defaults to sys.argv via argparse)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))
    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v
    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)')

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    logger = utils.create_logger(config.log_file if args.train else None)
    logger.setLevel(logging_level)

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash (best effort: not a git checkout, or git missing, is fine)
        commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except (subprocess.CalledProcessError, OSError):
        pass

    # list of encoder and decoder parameter names (each encoder and decoder can have a different value
    # for those parameters)
    model_parameters = [
        'cell_size', 'layers', 'vocab_size', 'embedding_size', 'attention_filters',
        'attention_filter_length', 'use_lstm', 'time_pooling', 'attention_window_size', 'dynamic',
        'binary', 'character_level', 'bidir', 'load_embeddings', 'pooling_avg', 'swap_memory',
        'parallel_iterations', 'input_layers', 'residual_connections', 'attn_size'
    ]
    # TODO: independent model dir for each task
    task_parameters = [
        'data_dir', 'train_prefix', 'dev_prefix', 'vocab_prefix', 'ratio', 'lm_file',
        'learning_rate', 'learning_rate_decay_factor', 'max_input_len', 'max_output_len',
        'encoders', 'decoder'
    ]

    # in case no task is defined (standard mono-task settings), define a "main" task
    config.setdefault('tasks', [{
        'encoders': config.encoders,
        'decoder': config.decoder,
        'name': 'main',
        'ratio': 1.0
    }])
    config.tasks = [utils.AttrDict(task) for task in config.tasks]

    for task in config.tasks:
        for parameter in task_parameters:
            task.setdefault(parameter, config.get(parameter))

        if isinstance(task.dev_prefix, str):  # for back-compatibility with old config files
            task.dev_prefix = [task.dev_prefix]

        # convert dicts to AttrDicts for convenience
        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoder = utils.AttrDict(task.decoder)

        for encoder_or_decoder in task.encoders + [task.decoder]:
            # move parameters all the way up from base level to encoder/decoder level:
            # default values for encoder/decoder parameters can be defined at the task level and base level
            # default values for tasks can be defined at the base level
            for parameter in model_parameters:
                if parameter in encoder_or_decoder:
                    continue
                elif parameter in task:
                    encoder_or_decoder[parameter] = task[parameter]
                else:
                    encoder_or_decoder[parameter] = config.get(parameter)

    # log parameters
    utils.log('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        if k == 'tasks':
            utils.log(' {:<20}\n{}'.format(k, pformat(v)))
        elif k not in model_parameters and k not in task_parameters:
            utils.log(' {:<20} {}'.format(k, pformat(v)))

    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

        # All parameters except recurrent connexions and attention parameters are initialized with this.
        # Recurrent connexions are initialized with orthogonal matrices, and the parameters of the
        # attention model with a standard deviation of 0.001
        if config.weight_scale:
            initializer = tf.random_normal_initializer(stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        decode_only = args.decode is not None or args.eval or args.align  # exempt from creating gradient ops

        model = MultiTaskModel(name='main', checkpoint_dir=checkpoint_dir,
                               decode_only=decode_only, **config)

    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log(' {} {}'.format(var.name, var.get_shape()))
        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {}'.format(parameter_count))

    tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(checkpoint_dir, 'best')

        if config.ensemble and (args.eval or args.decode is not None):
            # create one session for each model in the ensemble
            sess = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sess, config.checkpoints):
                model.initialize(sess_, [checkpoint], reset=True)
        elif (not config.checkpoints and (args.eval or args.decode is not None or args.align) and
              os.path.isfile(best_checkpoint + '.index')):
            # BUGFIX: condition previously tested '.index' twice (copy-paste duplicate;
            # the second test was presumably meant for the legacy '.meta' file)
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            model.initialize(sess, [best_checkpoint], reset=True)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            eval_output = os.path.join(config.model_dir, 'eval')
            try:
                model.train(sess, eval_output=eval_output, **config)
            except KeyboardInterrupt:
                utils.log('exiting...')
                model.save(sess)
                sys.exit()
def __init__(self, encoders, decoders, checkpoint_dir, learning_rate, learning_rate_decay_factor,
             batch_size, keep_best=1, dev_prefix=None, score_function='corpus_scores', name=None,
             ref_ext=None, pred_edits=False, dual_output=False, binary=None, **kwargs):
    """Set up data handling (extensions, vocabularies, filenames) and build the
    underlying Seq2SeqModel.

    :param encoders: list of encoder configurations (AttrDict-like)
    :param decoders: list of decoder configurations (AttrDict-like)
    :param checkpoint_dir: directory where checkpoints are saved/loaded
    :param learning_rate: initial learning rate (wrapped into a tf.Variable)
    :param learning_rate_decay_factor: multiplier applied by the decay op
    :param batch_size: training batch size
    :param keep_best: number of best checkpoints to keep
    :param score_function: name of an `evaluation` attribute; its `reversed` flag tells
        whether lower scores are better
    :param ref_ext: extension of an optional extra reference file (always non-binary)
    """
    self.batch_size = batch_size
    self.character_level = {}
    self.binary = []

    # each encoder/decoder gets a file extension (defaults to its name);
    # record per-extension character-level and binary flags
    for encoder_or_decoder in encoders + decoders:
        encoder_or_decoder.ext = encoder_or_decoder.ext or encoder_or_decoder.name
        self.character_level[encoder_or_decoder.ext] = encoder_or_decoder.character_level
        self.binary.append(encoder_or_decoder.get('binary', False))

    self.char_output = decoders[0].character_level

    self.src_ext = [encoder.ext for encoder in encoders]
    self.trg_ext = [decoder.ext for decoder in decoders]
    self.extensions = self.src_ext + self.trg_ext
    self.ref_ext = ref_ext
    if self.ref_ext is not None:
        self.binary.append(False)  # the reference file is always plain text

    self.pred_edits = pred_edits
    self.dual_output = dual_output
    self.dev_prefix = dev_prefix
    self.name = name

    self.max_input_len = [encoder.max_len for encoder in encoders]
    self.max_output_len = [decoder.max_len for decoder in decoders]
    # per-extension length limit used when reading data files
    self.max_len = dict(zip(self.extensions, self.max_input_len + self.max_output_len))

    self.learning_rate = tf.Variable(learning_rate, trainable=False, name='learning_rate',
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    # step counters are kept on CPU
    with tf.device('/cpu:0'):
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.baseline_step = tf.Variable(0, trainable=False, name='baseline_step')

    self.filenames = utils.get_filenames(extensions=self.extensions, dev_prefix=dev_prefix,
                                         name=name, ref_ext=ref_ext, binary=self.binary, **kwargs)
    utils.debug('reading vocabularies')
    self.vocabs = None
    self.src_vocab, self.trg_vocab = None, None
    self.read_vocab()

    # vocab sizes are taken from the vocabulary files read above
    for encoder_or_decoder, vocab in zip(encoders + decoders, self.vocabs):
        if vocab:
            encoder_or_decoder.vocab_size = len(vocab.reverse)

    utils.debug('creating model')
    self.seq2seq_model = Seq2SeqModel(encoders, decoders, self.learning_rate, self.global_step,
                                      name=name, pred_edits=pred_edits, dual_output=dual_output,
                                      baseline_step=self.baseline_step, **kwargs)

    self.batch_iterator = None
    self.dev_batches = None
    self.train_size = None
    self.saver = None
    self.keep_best = keep_best
    self.checkpoint_dir = checkpoint_dir

    self.training = utils.AttrDict()  # used to keep track of training

    try:
        self.reversed_scores = getattr(evaluation, score_function).reversed  # the lower the better
    except AttributeError:
        self.reversed_scores = False  # the higher the better
def main(args=None):
    """Entry point (single/multi-task version): parse command line + YAML configs,
    archive config and source code into the model directory, build the model, then run
    the requested action (decode / eval / align / train).

    :param args: optional argument list (defaults to sys.argv via argparse)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))
    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v
    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)')
    assert not (args.avg_checkpoints and args.ensemble)

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if not os.path.exists(config_path):
        shutil.copy(args.config, config_path)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename), arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n {}'.format('\n '.join(config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash (best effort: not a git checkout, or git missing, is fine)
        commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except (subprocess.CalledProcessError, OSError):
        pass

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug(' {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    # propagate defaults: config -> task -> encoder/decoder
    for task in tasks:
        for parameter, value in config.items():
            task.setdefault(parameter, value)
        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(minval=-config.weight_scale,
                                                            maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        # exempt from creating gradient ops
        config.decode_only = args.decode is not None or args.eval or args.align

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log(' {} {}'.format(var.name, var.get_shape()))
        # not counting parameters created by training algorithm (e.g. Adam)
        if not var.name.startswith('gradients'):
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        # assign, in main_sess, each variable to the mean of its values across sessions
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        if ((config.ensemble or config.avg_checkpoints) and (args.eval or args.decode is not None)
                and len(config.checkpoints) > 1):
            # create one session for each model in the ensemble
            sessions = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess_, [checkpoint])
            if config.ensemble:
                sess = sessions
            else:
                # average the checkpoints into a single session
                sess = sessions[0]
                average_checkpoints(sess, sessions)
        elif (not config.checkpoints and (args.eval or args.decode is not None or args.align) and
              os.path.isfile(best_checkpoint + '.index')):
            # BUGFIX: condition previously tested '.index' twice (copy-paste duplicate;
            # the second test was presumably meant for the legacy '.meta' file)
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            model.initialize(sess, [best_checkpoint])
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            try:
                model.train(sess=sess, **config)
            except (KeyboardInterrupt, utils.FinishedTrainingException):
                utils.log('exiting...')
                model.save(sess)
                sys.exit()
def attention_decoder(decoder_inputs, initial_state, attention_states, encoders, decoder, encoder_input_length,
                      feed_previous=0.0, align_encoder_id=0, feed_argmax=True, **kwargs):
    """
    Build the attention decoder graph: an RNN decoder with (multi-encoder) attention,
    unrolled dynamically with `tf.while_loop`, with optional scheduled sampling
    (`feed_previous`) and optional edit-operation prediction (`decoder.pred_edits`).

    :param decoder_inputs: int32 tensor of shape (batch_size, output_length)
    :param initial_state: initial state of the decoder (usually the final state of the encoder), as a float32
        tensor of shape (batch_size, initial_state_size). This state is mapped to the correct state size
        for the decoder.
    :param attention_states: list of tensors of shape (batch_size, input_length, encoder_cell_size),
        the hidden states of the encoder(s) (one tensor for each encoder).
    :param encoders: configuration of the encoders
    :param decoder: configuration of the decoder
    :param encoder_input_length: list of int32 tensors of shape (batch_size,), tells for each encoder,
        the true length of each sequence in the batch (sequences in the same batch are padded to
        all have the same length).
    :param feed_previous: scalar tensor corresponding to the probability to use previous decoder output
        instead of the ground truth as input for the decoder (1 when decoding, between 0 and 1 when training)
    :param feed_argmax: boolean tensor, when True the greedy decoder outputs the word with the highest
        probability (argmax). When False, it samples a word from the probability distribution (softmax).
    :param align_encoder_id: outputs attention weights for this encoder. Also used when predicting edit
        operations (pred_edits), to specify which encoder reads the sequence to post-edit (MT).

    :return: tuple (outputs, weights, states, attns, beam_tensors, samples)
        outputs of the decoder as a tensor of shape (batch_size, output_length, decoder_cell_size)
        attention weights as a tensor of shape (output_length, encoders, batch_size, input_length)
    """
    assert not decoder.pred_maxout_layer or decoder.cell_size % 2 == 0, 'cell size must be a multiple of 2'

    embedding_shape = [decoder.vocab_size, decoder.embedding_size]
    # optionally keep the (large) embedding matrix on CPU to save GPU memory
    device = '/cpu:0' if decoder.embeddings_on_cpu else None
    with tf.device(device):
        embedding = get_variable('embedding_{}'.format(decoder.name), shape=embedding_shape)

    def embed(input_):
        # map symbol ids to embedding vectors
        return tf.nn.embedding_lookup(embedding, input_)

    def get_cell(input_size=None, reuse=False, dropout=True):
        # build the (possibly multi-layer) decoder RNN cell, with optional dropout
        cells = []
        for _ in range(decoder.layers):
            if decoder.use_lstm:
                cell = CellWrapper(BasicLSTMCell(decoder.cell_size, reuse=reuse))
            else:
                cell = GRUCell(decoder.cell_size, reuse=reuse)

            if dropout and decoder.use_dropout:
                cell = DropoutWrapper(cell, input_keep_prob=decoder.rnn_input_keep_prob,
                                      output_keep_prob=decoder.rnn_output_keep_prob,
                                      state_keep_prob=decoder.rnn_state_keep_prob,
                                      variational_recurrent=decoder.pervasive_dropout,
                                      dtype=tf.float32, input_size=input_size)
            cells.append(cell)

        if len(cells) == 1:
            return cells[0]
        else:
            return CellWrapper(MultiRNNCell(cells))

    def look(state, input_, prev_weights=None, pos=None):
        # compute the attention context vector and new attention weights for the current state
        if not decoder.attn_use_lstm_state:
            # use only the cell output part of the state (drops the LSTM memory part)
            state = state[:, -cell_output_size:]
        # previous weights / position are only passed for the aligned encoder
        prev_weights_ = [prev_weights if i == align_encoder_id else None for i in range(len(encoders))]
        pos_ = None
        if decoder.pred_edits:
            pos_ = [pos if i == align_encoder_id else None for i in range(len(encoders))]
        if decoder.attn_prev_word:
            state = tf.concat([state, input_], axis=1)
        parameters = dict(hidden_states=attention_states, encoder_input_length=encoder_input_length,
                          encoders=encoders, aggregation_method=decoder.aggregation_method)
        context, new_weights = multi_attention(state, pos=pos_, prev_weights=prev_weights_, **parameters)
        return context, new_weights[align_encoder_id]

    def update(state, input_, context=None, symbol=None):
        # advance the RNN state by one step
        if context is not None and decoder.rnn_feed_attn:
            input_ = tf.concat([input_, context], axis=1)
        input_size = input_.get_shape()[1]
        try:
            _, new_state = get_cell(input_size)(input_, state)
        except ValueError:  # auto_reuse doesn't work with LSTM cells
            _, new_state = get_cell(input_size, reuse=True)(input_, state)

        if decoder.skip_update and decoder.pred_edits and symbol is not None:
            # on a DEL operation, keep the previous state (the deleted word is not "consumed")
            is_del = tf.equal(symbol, utils.DEL_ID)
            new_state = tf.where(is_del, state, new_state)

        return new_state

    def update_pos(pos, symbol, max_pos=None):
        # in edit-prediction mode, KEEP and DEL advance the read position in the source; INS does not
        if not decoder.pred_edits:
            return pos
        is_keep = tf.equal(symbol, utils.KEEP_ID)
        is_del = tf.equal(symbol, utils.DEL_ID)
        is_not_ins = tf.logical_or(is_keep, is_del)
        pos += tf.to_float(is_not_ins)
        if max_pos is not None:
            # clamp to the true source length
            pos = tf.minimum(pos, tf.to_float(max_pos))
        return pos

    def generate(state, input_, context):
        # project (state, word, context) to a vocabulary-sized logit vector
        if not decoder.pred_use_lstm_state:
            state = state[:, -cell_output_size:]
        projection_input = [state, context]
        if decoder.use_previous_word:
            projection_input.insert(1, input_)  # for back-compatibility

        output_ = tf.concat(projection_input, axis=1)

        if decoder.pred_deep_layer:
            output_ = dense(output_, decoder.embedding_size, activation=tf.tanh, use_bias=True, name='deep_output')
        else:
            if decoder.pred_maxout_layer:
                output_ = dense(output_, decoder.cell_size, use_bias=False, name='maxout')
                # maxout over pairs of units (requires cell_size % 2 == 0, asserted above)
                output_ = tf.nn.pool(tf.expand_dims(output_, axis=2), window_shape=[2], pooling_type='MAX',
                                     padding='SAME', strides=[2])
                output_ = tf.squeeze(output_, axis=2)

            if decoder.pred_embed_proj:
                # intermediate projection to embedding size (before projecting to vocabulary size)
                # this is useful to reduce the number of parameters, and
                # to use the output embeddings for output projection (tie_embeddings parameter)
                output_ = dense(output_, decoder.embedding_size, use_bias=False, name='softmax0')

        if decoder.tie_embeddings and (decoder.pred_embed_proj or decoder.pred_deep_layer):
            # reuse the input embedding matrix as the output projection (weight tying)
            bias = get_variable('softmax1/bias', shape=[decoder.vocab_size])
            output_ = tf.matmul(output_, tf.transpose(embedding)) + bias
        else:
            output_ = dense(output_, output_size, use_bias=True, name='softmax1')
        return output_

    input_shape = tf.shape(decoder_inputs)
    batch_size = input_shape[0]
    time_steps = input_shape[1]

    output_size = decoder.vocab_size

    state_size = get_cell(dropout=False).state_size
    cell_output_size = get_cell(dropout=False).output_size

    time = tf.constant(0, dtype=tf.int32, name='time')
    outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
    samples = tf.TensorArray(dtype=tf.int64, size=time_steps)
    # ground-truth inputs, read more than once when mixing teacher forcing and sampling
    inputs = tf.TensorArray(dtype=tf.int64, size=time_steps, clear_after_read=False).unstack(
        tf.to_int64(tf.transpose(decoder_inputs, perm=(1, 0))))
    states = tf.TensorArray(dtype=tf.float32, size=time_steps)
    weights = tf.TensorArray(dtype=tf.float32, size=time_steps)
    attns = tf.TensorArray(dtype=tf.float32, size=time_steps)

    initial_symbol = inputs.read(0)  # first symbol is BOS
    initial_input = embed(initial_symbol)
    initial_pos = tf.zeros([batch_size], tf.float32)
    initial_weights = tf.zeros(tf.shape(attention_states[align_encoder_id])[:2])

    if decoder.use_dropout:
        initial_state = tf.nn.dropout(initial_state, keep_prob=decoder.initial_state_keep_prob)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        # map the encoder's final state to the decoder's state size
        initial_state = dense(initial_state, state_size, use_bias=True, name='initial_state_projection',
                              activation=tf.nn.tanh)

    if decoder.update_first and not decoder.rnn_feed_attn and not decoder.conditional_rnn:
        initial_state = update(initial_state, initial_input, context=None, symbol=None)

    # pack (state, pos, weights) into a single tensor for the beam-search decoder,
    # then split it back so the graph below consumes the packed version
    initial_data = tf.concat([initial_state, tf.expand_dims(initial_pos, axis=1), initial_weights], axis=1)
    initial_state, initial_pos, initial_weights = tf.split(initial_data, [state_size, 1, -1], axis=1)
    initial_state.set_shape([None, state_size])
    initial_pos = initial_pos[:, 0]

    def _time_step(time, input_, input_symbol, pos, state, outputs, states, weights, attns, prev_weights, samples):
        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_1'):
                state = update(state, input_)

        context, new_weights = look(state, input_, pos=pos, prev_weights=prev_weights)

        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_2'):
                state = update(state, context)
        elif not decoder.generate_first:
            state = update(state, input_, context, input_symbol)

        output_ = generate(state, input_, context)

        # next input: ground truth (scheduled by feed_previous), sampled word, or argmax
        argmax = lambda: tf.argmax(output_, 1)
        target = lambda: inputs.read(time + 1)
        softmax = lambda: tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(output_)), num_samples=1),
                                     axis=1)

        predicted_symbol = tf.case([
            (tf.logical_and(time < time_steps - 1, tf.random_uniform([]) >= feed_previous), target),
            (tf.logical_not(feed_argmax), softmax)],
            default=argmax)  # default case is useful for beam-search

        predicted_symbol.set_shape([None])
        predicted_symbol = tf.stop_gradient(predicted_symbol)
        samples = samples.write(time, predicted_symbol)

        input_ = embed(predicted_symbol)
        pos = update_pos(pos, predicted_symbol, encoder_input_length[align_encoder_id])

        attns = attns.write(time, context)
        weights = weights.write(time, new_weights)
        states = states.write(time, state)
        outputs = outputs.write(time, output_)

        if not decoder.conditional_rnn and decoder.generate_first:
            state = update(state, input_, context, predicted_symbol)

        return (time + 1, input_, predicted_symbol, pos, state, outputs, states, weights, attns, new_weights,
                samples)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        _, _, _, new_pos, new_state, outputs, states, weights, attns, new_weights, samples = tf.while_loop(
            cond=lambda time, *_: time < time_steps,
            body=_time_step,
            # order must match `_time_step`'s parameter list exactly (previously `weights`
            # and `states` were swapped here, which only worked because the three
            # TensorArrays are created identically)
            loop_vars=(time, initial_input, initial_symbol, initial_pos, initial_state, outputs, states, weights,
                       attns, initial_weights, samples),
            parallel_iterations=decoder.parallel_iterations,
            swap_memory=decoder.swap_memory)

    outputs = outputs.stack()
    weights = weights.stack()  # batch_size, encoders, output time, input time
    states = states.stack()
    attns = attns.stack()
    samples = samples.stack()

    # packed final data for the beam-search decoder (mirrors `initial_data`)
    new_data = tf.concat([new_state, tf.expand_dims(new_pos, axis=1), new_weights], axis=1)
    beam_tensors = utils.AttrDict(data=initial_data, new_data=new_data)

    # put batch_size as first dimension
    outputs = tf.transpose(outputs, perm=(1, 0, 2))
    weights = tf.transpose(weights[1:], perm=(1, 0, 2))
    states = tf.transpose(states, perm=(1, 0, 2))
    attns = tf.transpose(attns, perm=(1, 0, 2))
    samples = tf.transpose(samples)

    return outputs, weights, states, attns, beam_tensors, samples