def score_model(source_file, target_file, scorer_settings, options):
    """Compute per-sentence losses for a sentence-pair corpus, once per model.

    Every entry of ``options`` gets its own fresh ``tf.Graph`` and
    ``tf.Session``; the model is built, its variables are initialised or
    restored, and the corpus is scored with ``nmt.calc_loss_per_sentence``.

    Args:
        source_file: file object; only its ``name`` attribute is read.
        target_file: file object; only its ``name`` attribute is read.
        scorer_settings: object providing ``minibatch_size`` and
            ``normalization_alpha``.
        options: iterable of model option objects, one per model.

    Returns:
        A list with one element per entry of ``options`` (same order), each
        being the value returned by ``nmt.calc_loss_per_sentence``.
    """

    def _score_with(option):
        # A dedicated graph per model keeps variables of different models
        # from clashing.
        graph = tf.Graph()
        with graph.as_default():
            session_config = tf.ConfigProto()
            session_config.allow_soft_placement = True
            with tf.Session(config=session_config) as sess:
                logging.info('Building model...')
                model = rnn_model.RNNModel(option)
                # The returned saver is not needed for scoring.
                model_loader.init_or_restore_variables(option, sess)

                # Read the corpus in file order with no length limit.
                corpus = TextIterator(
                    source=source_file.name,
                    target=target_file.name,
                    source_dicts=option.source_dicts,
                    target_dict=option.target_dict,
                    batch_size=scorer_settings.minibatch_size,
                    maxlen=float('inf'),
                    source_vocab_sizes=option.source_vocab_sizes,
                    target_vocab_size=option.target_vocab_size,
                    use_factor=(option.factors > 1),
                    sort_by_length=False)

                return nmt.calc_loss_per_sentence(
                    option,
                    sess,
                    corpus,
                    model,
                    normalization_alpha=scorer_settings.normalization_alpha)

    return [_score_with(option) for option in options]
def main(settings):
    """Translate ``settings.input`` into ``settings.output`` with an ensemble.

    Configures logging, opens one TensorFlow session, builds every model
    listed in ``settings.models`` under its own ``model<i>`` variable scope,
    restores its variables, and finally hands everything to
    ``inference.translate_file``.

    Args:
        settings: object providing ``verbose``, ``models``, ``input``,
            ``output``, ``beam_size``, ``n_best``, ``minibatch_size``,
            ``maxibatch_size`` and ``normalization_alpha``.
    """
    # Start logging.
    if settings.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level, format='%(levelname)s: %(message)s')

    # Create the TensorFlow session.
    session_config = tf.ConfigProto()
    session_config.allow_soft_placement = True
    session = tf.Session(config=session_config)

    # Load config file for each model; remember where to reload it from.
    configs = []
    for model_path in settings.models:
        cfg = load_config_from_json_file(model_path)
        cfg.reload = model_path
        configs.append(cfg)

    # Create the model graphs and restore their variables.
    logging.debug("Loading models\n")
    models = []

    # ============= 19/8/16 KP ============
    warning('='*20 + 'Model Config to Load')
    warning(settings.models)
    # =====================================

    for index, cfg in enumerate(configs):
        with tf.variable_scope("model%d" % index) as scope:
            if cfg.model_type == "transformer":
                model = TransformerModel(cfg)
            else:
                model = rnn_model.RNNModel(cfg)
            # The returned saver is not used for translation.
            model_loader.init_or_restore_variables(
                cfg, session, ensemble_scope=scope)
            model.sampling_utils = SamplingUtils(settings)
            models.append(model)

    # ============= 19/8/16 KP ============
    model_summary()
    # =====================================

    # TODO Ensembling is currently only supported for RNNs, so if
    # TODO len(models) > 1 then check models are all rnn

    # Translate the source file.
    translate_kwargs = dict(
        input_file=settings.input,
        output_file=settings.output,
        session=session,
        models=models,
        configs=configs,
        beam_size=settings.beam_size,
        nbest=settings.n_best,
        minibatch_size=settings.minibatch_size,
        maxibatch_size=settings.maxibatch_size,
        normalization_alpha=settings.normalization_alpha,
    )
    inference.translate_file(**translate_kwargs)
def main(settings):
    """Translate ``settings.input`` into ``settings.output`` with an ensemble.

    The graph is assembled in two passes over the model configs: first every
    model is built under its ``model<i>`` scope, then (after the optional
    smoothing variables have been added) the variables are restored under the
    same scopes.

    Args:
        settings: object providing ``models``, ``input``, ``output``,
            ``beam_size``, ``n_best``, ``minibatch_size``,
            ``maxibatch_size`` and ``normalization_alpha``.
    """
    # Create the TensorFlow session.
    session_config = tf.ConfigProto()
    session_config.allow_soft_placement = True
    session = tf.Session(config=session_config)

    # Load config file for each model; remember where to reload it from.
    configs = []
    for model_path in settings.models:
        cfg = load_config_from_json_file(model_path)
        cfg.reload = model_path
        configs.append(cfg)

    # Create the model graphs.
    logging.debug("Loading models\n")
    models = []
    for index, cfg in enumerate(configs):
        with tf.variable_scope("model%d" % index) as scope:
            if cfg.model_type == "transformer":
                built = TransformerModel(cfg)
            else:
                built = rnn_model.RNNModel(cfg)
            built.sampling_utils = SamplingUtils(settings)
            models.append(built)

    # Add smoothing variables (if the models were trained with smoothing).
    # FIXME Assumes either all models were trained with smoothing or none were.
    if configs[0].exponential_smoothing > 0.0:
        smoothing = ExponentialSmoothing(configs[0].exponential_smoothing)

    # Restore the model variables (the returned saver is discarded).
    for index, cfg in enumerate(configs):
        with tf.variable_scope("model%d" % index) as scope:
            model_loader.init_or_restore_variables(
                cfg, session, ensemble_scope=scope)

    # Swap-in the smoothed versions of the variables.
    if configs[0].exponential_smoothing > 0.0:
        session.run(fetches=smoothing.swap_ops)

    # TODO Ensembling is currently only supported for RNNs, so if
    # TODO len(models) > 1 then check models are all rnn

    # Translate the source file.
    translate_kwargs = dict(
        input_file=settings.input,
        output_file=settings.output,
        session=session,
        models=models,
        configs=configs,
        beam_size=settings.beam_size,
        nbest=settings.n_best,
        minibatch_size=settings.minibatch_size,
        maxibatch_size=settings.maxibatch_size,
        normalization_alpha=settings.normalization_alpha,
    )
    inference.translate_file(**translate_kwargs)
def main(settings):
    """
    Translates a source language file (or STDIN) into a target language file
    (or STDOUT).

    Args:
        settings: object providing ``verbose``, ``cpu``, ``models``,
            ``input``, ``output``, ``beam_size``, ``n_best``,
            ``minibatch_size``, ``maxibatch_size`` and
            ``normalization_alpha``.

    Side effects:
        Configures the root logger and sets the ``CUDA_VISIBLE_DEVICES``
        environment variable (emptied when ``settings.cpu`` is true;
        defaulted to ``"2"`` otherwise, only if it is not already set).
    """
    # Start logging.
    level = logging.DEBUG if settings.verbose else logging.INFO
    logging.basicConfig(level=level, format='%(levelname)s: %(message)s')

    # Create the TensorFlow session.
    if settings.cpu:
        logging.info("using cpu now...")
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    else:
        # Keep "2" as the historical default, but do not clobber a device
        # selection the caller already exported in the environment.
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
        tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    session = tf.Session(config=tf_config)

    # Load config file for each model.
    configs = []
    for model in settings.models:
        config = util.load_config(model)
        compat.fill_options(config)
        # Record the path so the loader restores variables from it.
        config['reload'] = model
        configs.append(argparse.Namespace(**config))

    # Create the model graphs and restore their variables.
    logging.debug("Loading models")
    models = []
    for i, config in enumerate(configs):
        # One variable scope per ensemble member.
        with tf.variable_scope("model%d" % i) as scope:
            model = rnn_model.RNNModel(config)
            # The returned saver is not needed for translation.
            model_loader.init_or_restore_variables(
                config, session, ensemble_scope=scope)
            models.append(model)
    logging.debug("Models load done.")

    # Translate the source file.
    inference.translate_file(input_file=settings.input,
                             output_file=settings.output,
                             session=session,
                             models=models,
                             configs=configs,
                             beam_size=settings.beam_size,
                             nbest=settings.n_best,
                             minibatch_size=settings.minibatch_size,
                             maxibatch_size=settings.maxibatch_size,
                             normalization_alpha=settings.normalization_alpha)
def theano_to_tensorflow_model(in_path, out_path):
    """Copy the parameters of a saved Theano model into a TensorFlow checkpoint.

    The archive at ``in_path`` is loaded with ``np.load``; an RNN model is
    built from the converted config, each archive entry is mapped to a
    TensorFlow tensor name, shapes are compared, and the values are assigned
    and saved to ``out_path``.

    Args:
        in_path: path of the saved Theano model.
        out_path: path passed to ``saver.save`` for the TensorFlow model.

    Exits the process with status 1 if a mapped pair of parameters has
    differing shapes.
    """
    theano_params = np.load(in_path)
    config = theano_to_tensorflow_config(in_path)
    name_map = construct_parameter_map(config)

    with tf.compat.v1.Session() as sess:
        logging.info('Building model...')
        rnn_model.RNNModel(config)
        # Create the 'time' variable alongside the model variables.
        tf.compat.v1.get_variable(
            'time', [],
            initializer=tf.zeros_initializer(dtype=tf.int32),
            trainable=False)
        saver = model_loader.init_or_restore_variables(config, sess)

        assigned_names = set()
        pending_assigns = []
        for theano_name in list(theano_params.keys()):
            # ignore adam parameters
            if theano_name.startswith('adam'):
                continue

            tensor_name = name_map[theano_name]
            if tensor_name is None:
                logging.info(
                    "Not saving {} because no TF equivalent".format(
                        theano_name))
                continue

            # Each TensorFlow tensor may be the target of one parameter only.
            assert tensor_name not in assigned_names
            assigned_names.add(tensor_name)

            graph = tf.compat.v1.get_default_graph()
            tensor = graph.get_tensor_by_name(tensor_name)
            tensor_shape = sess.run(tf.shape(input=tensor))
            value = theano_params[theano_name]
            value_shape = value.shape

            if list(tensor_shape) != list(value_shape):
                logging.error(
                    "Shapes do not match for {} and {}.".format(
                        tensor_name, theano_name))
                logging.error(
                    "Shape of {} is {}".format(tensor_name, tensor_shape))
                logging.error(
                    "Shape of {} is {}".format(theano_name, value_shape))
                sys.exit(1)

            pending_assigns.append(tf.compat.v1.assign(tensor, value))

        # Run all assignments in one go, then write the checkpoint.
        sess.run(pending_assigns)
        saver.save(sess, save_path=out_path)

        # Report global variables that received no value from the archive.
        global_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
        unassigned = [
            var.name for var in global_vars
            if var.name not in assigned_names
        ]
        logging.info(
            "The following TF variables were not assigned: {}".format(
                " ".join(unassigned)))
        logging.info("You should see only the 'time' variable listed")
def _load_models(self, process_id, sess):
    """Build and restore one RNN model per entry of ``self._options``.

    Args:
        process_id: identifier used only in the debug log message.
        sess: session in which the model variables are initialised or
            restored.

    Returns:
        The list of built models, in the order of ``self._options``.
    """
    logging.debug("Process '%s' - Loading models\n" % (process_id))

    # Imported here rather than at module level, as in the original.
    import tensorflow as tf

    loaded = []
    for index, model_options in enumerate(self._options):
        # Each ensemble member lives in its own "model<i>" scope.
        with tf.variable_scope("model%d" % index) as scope:
            built = rnn_model.RNNModel(model_options)
            # The returned saver is not needed here.
            model_loader.init_or_restore_variables(
                model_options, sess, ensemble_scope=scope)
            loaded.append(built)

    maxlen = self._options[0].translation_maxlen
    logging.info(
        "NOTE: Length of translations is capped to {}".format(maxlen))
    return loaded
def train(X, Y, model_type, learning_rate, file_prefix):
    """Train one classifier on ``(X, Y)`` and return its test accuracy.

    The data is split twice with ``train_test_split(test_size=0.1)`` into
    train / validation / test parts. ``model_type == 'CNNLSTM'`` trains a
    ``cnn_lstm.FullModel`` on the last 20 features; any other value trains an
    ``rnn_model.RNNModel`` on inputs transformed by ``transform_pc``, and
    dumps the fitted PCA and scaler under ``file_prefix``.

    Relies on the module-level names ``num_hidden``, ``n_epoch`` and
    ``batch_size``.

    Args:
        X: input samples.
        Y: labels.
        model_type: ``'CNNLSTM'`` or the ``method`` passed to ``RNNModel``.
        learning_rate: learning rate handed to the model constructor.
        file_prefix: directory prefix for the joblib dumps and RNN log files.

    Returns:
        The accuracy measured on the held-out test split.
    """
    class_weight = get_class_weight(Y)

    # Hold out 10% for testing, then 10% of the remainder for validation.
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.1)
    train_X, valid_X, train_Y, valid_Y = train_test_split(
        train_X, train_Y, test_size=0.1)

    started_at = time.time()

    if model_type != 'CNNLSTM':
        pca_model = PCA(n_components=0.95)
        ss_model = StandardScaler()
        # Only the training call passes training=True.
        train_X = transform_pc(train_X, pca_model, ss_model, training=True)
        valid_X = transform_pc(valid_X, pca_model, ss_model)
        test_X = transform_pc(test_X, pca_model, ss_model)
        joblib.dump(pca_model, '{}/pca.joblib'.format(file_prefix))
        joblib.dump(ss_model, '{}/ss.joblib'.format(file_prefix))

        train_data = tf.data.Dataset.from_tensor_slices(
            (train_X, train_Y)).batch(batch_size=batch_size)
        valid_data = tf.data.Dataset.from_tensor_slices(
            (valid_X, valid_Y)).batch(batch_size=batch_size)
        test_data = tf.data.Dataset.from_tensor_slices(
            (test_X, test_Y)).batch(batch_size=batch_size)

        # Build model and train
        network = rnn_model.RNNModel(
            input_shape=train_X[0].shape,
            learning_rate=learning_rate,
            num_hidden=num_hidden,
            method=model_type,
            output_size=3,
            log_files_path=file_prefix)
        network.train(train_data, valid_data, n_epoch=n_epoch,
                      class_weight=class_weight)
        # evaluate() returns a sequence; index 1 is used as the accuracy.
        val_acc = network.evaluate(test_data)[1]
    else:
        # only take the original order book columns
        train_X = train_X[:, :, -20:]
        test_X = test_X[:, :, -20:]
        valid_X = valid_X[:, :, -20:]

        network = cnn_lstm.FullModel(
            learning_rate=learning_rate,
            num_hidden=num_hidden,
            leaky_relu_alpha=0.1,
            output_size=3)
        network.train(
            train_data=(train_X, train_Y),
            class_weights=class_weight,
            valid_data=(valid_X, valid_Y),
            num_epoch=n_epoch,
            batch_size=batch_size)
        val_acc = network.evaluate(test_X, test_Y)
        print("Evaluating the model, acc:", val_acc)

    elapsed = time.time() - started_at
    print("Total training time: {0:.3f} seconds".format(elapsed))
    return val_acc
def main(argv):
    """Train an RNN model, periodically validating and testing it.

    Datasets and vocabulary come from ``dataset_preparation``; hyper-parameters
    are read from ``FLAGS``. Two ``rnn_model.RNNModel`` instances are built
    from the same arguments: one for training and one for evaluation, whose
    weights are copied from the training model before each validation.

    NOTE(review): the nesting of the checkpoint-saving, test and summary
    logging statements was reconstructed from a whitespace-collapsed source;
    confirm it against the original file.

    Args:
        argv: command-line arguments; not used in the body.
    """
    train_dataset, valid_dataset, test_dataset, vocab = \
        dataset_preparation(FLAGS.train_dir, FLAGS.valid_dir, FLAGS.test_dir)

    # Length of the vocabulary in words
    vocab_size = len(vocab)

    # Set random seed
    tf.random.set_random_seed(FLAGS.seed)

    # Build the model
    model_args = dict(
        model=FLAGS.model,
        layer_num=FLAGS.layer_num,
        chunk_size=FLAGS.chunk_size,
        vocab_size=vocab_size,
        embedding_dim=FLAGS.embed_size,
        rnn_units=FLAGS.hidden_units,
        embed_dropout=FLAGS.embed_dropout,
        input_dropout=FLAGS.input_dropout,
        dropout=FLAGS.dropout,
        rnn_dropout=FLAGS.rnn_dropout,
        w_dropout=FLAGS.w_dropout,
        w_decay=FLAGS.w_decay,
        tied=FLAGS.tied
    )
    model = rnn_model.RNNModel(training=True, batch_size=FLAGS.batch_size, **model_args)
    model.build(input_shape=(FLAGS.batch_size, None))

    # Build the model for evaluation
    eval_model = rnn_model.RNNModel(training=False, batch_size=EVAL_BATCH_SIZE, **model_args)
    eval_model.build(input_shape=(EVAL_BATCH_SIZE, None))

    # Set the learning rate scheduler
    # NOTE(review): global_step is the constant 0 here — confirm the decay
    # is expected to take effect.
    lr_schedule = tf.train.exponential_decay(
        learning_rate=FLAGS.lr,
        global_step=0,
        decay_steps=FLAGS.when,
        decay_rate=FLAGS.lr_decay,
        staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr_schedule)

    # Lowest validation perplexity seen so far and the epoch it occurred in.
    store_ppl = float('inf')
    best_epoch = None
    for epoch in range(1, FLAGS.epochs + 1):
        logging.info('=============== Epoch {} ==============='.format(epoch))
        start = time.time()

        # Name of the checkpoint files
        checkpoint_prefix = os.path.join(FLAGS.logdir, 'ckpt_{epoch}')

        # initializing the hidden state at the start of every epoch
        # initially hidden is None
        model.reset_states()

        total_train_loss = 0.0
        total_train_raw_loss = 0.0
        # Walk the training data in steps of FLAGS.seq_length along axis 1.
        for batch_n, i in enumerate(range(0, tf.shape(train_dataset)[1]-1, FLAGS.seq_length)):
            inp, target = get_batch(train_dataset, i, batch_size=FLAGS.batch_size)
            train_loss, train_raw_loss = train_step(inp, target, model, optimizer)
            total_train_loss += train_loss
            total_train_raw_loss += train_raw_loss
            # NOTE(review): training stops after 12 batches per epoch —
            # looks like a debugging limit; confirm it is intended.
            if batch_n > 10:
                break
        # Average over the number of batches actually processed.
        train_loss = total_train_loss / (batch_n + 1)
        train_raw_loss = total_train_raw_loss / (batch_n + 1)
        train_ppl = tf.exp(train_raw_loss)

        if epoch % FLAGS.measurement_store_interval == 0:
            logging.info('Begin Validation...')
            # Evaluate with the current training weights.
            eval_model.set_weights(model.get_weights())
            eval_model.reset_states()
            total_valid_loss = 0.0
            for batch_n, i in enumerate(range(0, tf.shape(valid_dataset)[1]-1, FLAGS.seq_length)):
                inp, target = get_batch(valid_dataset, i, EVAL_BATCH_SIZE)
                predictions, hidden, dropped_hidden = eval_model(inp)
                total_valid_loss += tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        target, predictions))
            valid_loss = total_valid_loss / (batch_n + 1)
            valid_ppl = tf.exp(valid_loss)

            if valid_ppl < store_ppl:
                store_ppl = valid_ppl
                best_epoch = epoch
                # Checkpoints are only written in the second half of training.
                if epoch > 0.5*FLAGS.epochs:
                    model.save_weights(checkpoint_prefix.format(epoch=epoch))

            logging.info('Begin Test...')
            eval_model.reset_states()
            total_test_loss = 0.0
            for batch_n, i in enumerate(range(0, tf.shape(test_dataset)[1]-1, FLAGS.seq_length)):
                inp, target = get_batch(test_dataset, i, EVAL_BATCH_SIZE)
                predictions, hidden, dropped_hidden = eval_model(inp)
                total_test_loss += tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        target, predictions))
            test_loss = total_test_loss / (batch_n + 1)
            # test_ppl is computed but not included in the log line below.
            test_ppl = tf.exp(test_loss)

            logging.info('\nEpoch {} | '
                         'Train Loss {:.4f} | '
                         'Train PPL {:.4f} | '
                         'Valid PPL {:.4f} | '
                         'Min Valid PPL {:.4f}| '
                         'Best Epoch {}'.format(epoch, train_loss, train_ppl, valid_ppl, store_ppl, best_epoch))
        logging.info('Time taken for a epoch {} sec\n'.format(time.time() - start))
def main(settings):
    """
    Translates a source language file (or STDIN) into a target language file
    (or STDOUT).

    All models listed in ``settings.models`` are built inside one dedicated
    graph, their variables are restored, a sampler is created according to
    ``settings.translation_strategy`` and the work is delegated to
    ``translate_utils.translate_file``.

    Args:
        settings: object providing ``models``, ``translation_maxlen``,
            ``config_str``, ``translation_strategy`` (``'beam_search'`` or
            ``'sampling'``), ``beam_size``, ``n_best``, ``input``,
            ``output``, ``normalization_alpha``, ``minibatch_size`` and
            ``maxibatch_size``.

    Raises:
        AssertionError: if ``settings.translation_strategy`` is neither
            ``'beam_search'`` nor ``'sampling'``.
    """
    # Create the TensorFlow session.
    g = tf.Graph()
    with g.as_default():
        tf_config = tf.compat.v1.ConfigProto()
        tf_config.allow_soft_placement = True
        session = tf.compat.v1.Session(config=tf_config)

        # Load config file for each model.
        configs = []
        for model in settings.models:
            config = load_config_from_json_file(model)
            setattr(config, 'reload', model)
            # The command-line maximum length overrides the stored one.
            setattr(config, 'translation_maxlen', settings.translation_maxlen)
            configs.append(config)

        # Create the model graphs.
        logging.debug("Loading models\n")
        models = []
        for i, config in enumerate(configs):
            with tf.compat.v1.variable_scope("model%d" % i) as scope:
                if config.model_type == "transformer":
                    model = TransformerModel(
                        config, consts_config_str=settings.config_str)
                else:
                    model = rnn_model.RNNModel(config)
                model.sampling_utils = SamplingUtils(settings)
                models.append(model)

        # Add smoothing variables (if the models were trained with smoothing).
        # FIXME Assumes either all models were trained with smoothing or none
        # were.
        if configs[0].exponential_smoothing > 0.0:
            smoothing = ExponentialSmoothing(configs[0].exponential_smoothing)

        # Restore the model variables.
        for i, config in enumerate(configs):
            with tf.compat.v1.variable_scope("model%d" % i) as scope:
                _ = model_loader.init_or_restore_variables(
                    config, session, ensemble_scope=scope)

        # Swap-in the smoothed versions of the variables.
        if configs[0].exponential_smoothing > 0.0:
            session.run(fetches=smoothing.swap_ops)

        max_translation_len = settings.translation_maxlen

        # Create a BeamSearchSampler / RandomSampler.
        if settings.translation_strategy == 'beam_search':
            sampler = BeamSearchSampler(models, configs, settings.beam_size)
        else:
            assert settings.translation_strategy == 'sampling'
            sampler = RandomSampler(models, configs, settings.beam_size)

        # Warn about the change from neg log probs to log probs for the RNN.
        if settings.n_best:
            model_types = [config.model_type for config in configs]
            if 'rnn' in model_types:
                # logging.warn is a deprecated alias (removed in Python
                # 3.13); logging.warning is the supported spelling.
                logging.warning(
                    'n-best scores for RNN models have changed from '
                    'positive to negative (as of commit 95793196...). '
                    'If you are using the scores for reranking etc, then '
                    'you may need to update your scripts.')

        # Translate the source file. Only the first config is passed on.
        translate_utils.translate_file(
            input_file=settings.input,
            output_file=settings.output,
            session=session,
            sampler=sampler,
            config=configs[0],
            max_translation_len=max_translation_len,
            normalization_alpha=settings.normalization_alpha,
            consts_config_str=settings.config_str,
            nbest=settings.n_best,
            minibatch_size=settings.minibatch_size,
            maxibatch_size=settings.maxibatch_size)
def train(config, sess):
    """Run the training loop described by ``config`` inside session ``sess``.

    Builds one model replica per available GPU (or a single CPU replica),
    sets up the learning-rate schedule, optimizer, optional summary writer
    and optional exponential smoothing, restores or initialises variables,
    and then iterates over the training corpus. Depending on the
    ``*_freq`` settings it periodically logs progress, prints samples and
    beam-search output, validates (with early stopping), and saves
    checkpoints together with their config and progress JSON files.

    When ``config.print_per_token_pro`` is set, only one epoch is run and
    the values returned by the updater are appended, one per line, to the
    file it names instead of being accumulated as a loss.

    Args:
        config: namespace of training options (read and, for
            ``max_epochs``, possibly modified here).
        sess: TensorFlow session used for all graph execution.

    Exits the process with status 1 on an unknown learning schedule, an
    unknown optimizer, or a factor-count mismatch in the training data.
    """
    assert (config.prior_model is not None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(tf_utils.get_available_gpus())
    num_replicas = max(1, num_gpus)

    if config.loss_function == 'MRT':
        assert config.gradient_aggregation_steps == 1
        assert config.max_sentences_per_device == 0, "MRT mode does not support sentence-based split"
        if config.max_tokens_per_device != 0:
            assert (config.samplesN * config.maxlen <= config.max_tokens_per_device), "need to make sure candidates of a sentence could be " \
                                                                                      "feed into the model"
        else:
            assert num_replicas == 1, "MRT mode does not support sentence-based split"
            assert (config.samplesN * config.maxlen <= config.token_batch_size), "need to make sure candidates of a sentence could be " \
                                                                                 "feed into the model"

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # Replicas after the first reuse the variables of the first.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init, trainable=False)

    if config.learning_schedule == "constant":
        schedule = learning_schedule.ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = learning_schedule.TransformerSchedule(
            global_step=global_step,
            dim=config.state_size,
            warmup_steps=config.warmup_steps)
    elif config.learning_schedule == "warmup-plateau-decay":
        schedule = learning_schedule.WarmupPlateauDecaySchedule(
            global_step=global_step,
            peak_learning_rate=config.learning_rate,
            warmup_steps=config.warmup_steps,
            plateau_steps=config.plateau_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=schedule.learning_rate,
                                           beta1=config.adam_beta1,
                                           beta2=config.adam_beta2,
                                           epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        # Default the summary directory to the directory of the model file.
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer,
                           global_step, writer)

    if config.exponential_smoothing > 0.0:
        smoothing = ExponentialSmoothing(config.exponential_smoothing)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    # Resume the step counter from the restored progress.
    global_step.load(progress.uidx, sess)

    if config.sample_freq:
        random_sampler = RandomSampler(
            models=[replicas[0]],
            configs=[config],
            beam_size=1)

    if config.beam_freq or config.valid_script is not None:
        beam_search_sampler = BeamSearchSampler(
            models=[replicas[0]],
            configs=[config],
            beam_size=config.beam_size)

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    # set epoch = 1 if print per-token-probability
    if config.print_per_token_pro:
        config.max_epochs = progress.eidx+1
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0))
            # The unpacking also requires x_in to be three-dimensional.
            (factors, seqLen, batch_size) = x_in.shape

            output = updater.update(
                sess, x_in, x_mask_in, y_in, y_mask_in, num_to_target,
                write_summary_for_this_batch)

            if config.print_per_token_pro == False:
                total_loss += output
            else:
                # write per-token probability into the file; the context
                # manager closes the handle even if a write fails
                with open(config.print_per_token_pro, 'a') as f:
                    f.writelines(str(pro) + '\n' for pro in output)

            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            # Update the smoothed version of the model variables.
            # To reduce the performance overhead, we only do this once every
            # N steps (the smoothing factor is adjusted accordingly).
            if config.exponential_smoothing > 0.0 and progress.uidx % smoothing.update_frequency == 0:
                sess.run(fetches=smoothing.update_ops)

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration))
                # Restart the counters for the next reporting window.
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                # Use at most the first 10 sentences of the batch.
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = translate_utils.translate_batch(
                    sess, random_sampler, x_small, x_mask_small,
                    config.translation_maxlen, 0.0)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss[0][0], num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                # Use at most the first 10 sentences of the batch.
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:,:10]
                samples = translate_utils.translate_batch(
                    sess, beam_search_sampler, x_small, x_mask_small,
                    config.translation_maxlen, config.normalization_alpha)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost/len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                if config.exponential_smoothing > 0.0:
                    # Validate with the smoothed variables swapped in, then
                    # swap back to continue training.
                    sess.run(fetches=smoothing.swap_ops)
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                    sess.run(fetches=smoothing.swap_ops)
                else:
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                        valid_ce < min(progress.history_errs)):
                    # New best validation value: reset patience and save.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    if config.exponential_smoothing > 0.0:
                        sess.run(fetches=smoothing.swap_ops)
                        score = validate_with_script(sess, beam_search_sampler)
                        sess.run(fetches=smoothing.swap_ops)
                    else:
                        score = validate_with_script(sess, beam_search_sampler)
                    need_to_save = (score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop=True
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        # Propagate a stop request from the inner loop to the epoch loop.
        if progress.estop:
            break
def calc_scores(source_file, target_file, scorer_settings, configs):
    """Score every sentence pair with each of the given models.

    With the default ``scorer_settings.normalization_alpha`` of 0.0 the
    score is the sentence-level cross entropy; otherwise it is a normalized
    variant of it.

    Args:
        source_file: file object for the source sentences (its ``name`` is
            used to open the corpus).
        target_file: file object for the target sentences (its ``name`` is
            used to open the corpus).
        scorer_settings: a ScorerSettings object.
        configs: list of Namespace objects, one model config each.

    Returns:
        A list of lists of floats: one inner list per model, in the order of
        ``configs``, holding one score per sentence pair.
    """

    def _score_with(config):
        # Each model is built in a graph and session of its own.
        graph = tf.Graph()
        with graph.as_default():
            session_config = tf.ConfigProto()
            session_config.allow_soft_placement = True
            with tf.Session(config=session_config) as sess:
                logging.info('Building model...')

                # Create the model graph.
                if config.model_type == 'transformer':
                    model = transformer.Transformer(config)
                else:
                    model = rnn_model.RNNModel(config)

                # Smoothing variables have to exist before restoring, if the
                # model was trained with smoothing.
                if config.exponential_smoothing > 0.0:
                    smoothing = ExponentialSmoothing(
                        config.exponential_smoothing)

                # Restore the model variables (the saver is not needed).
                model_loader.init_or_restore_variables(config, sess)

                # Score with the smoothed variables when they are present.
                if config.exponential_smoothing > 0.0:
                    sess.run(fetches=smoothing.swap_ops)

                # Read the corpus in file order with no length limit.
                corpus = TextIterator(
                    source=source_file.name,
                    target=target_file.name,
                    source_dicts=config.source_dicts,
                    target_dict=config.target_dict,
                    model_type=config.model_type,
                    batch_size=scorer_settings.minibatch_size,
                    maxlen=float('inf'),
                    source_vocab_sizes=config.source_vocab_sizes,
                    target_vocab_size=config.target_vocab_size,
                    use_factor=(config.factors > 1),
                    sort_by_length=False)

                cross_entropies, _ = train.calc_cross_entropy_per_sentence(
                    sess,
                    model,
                    config,
                    corpus,
                    normalization_alpha=scorer_settings.normalization_alpha)
                return cross_entropies

    return [_score_with(config) for config in configs]
def train(config, sess):
    """Run the training loop described by ``config`` inside session ``sess``.

    Builds one model replica per available GPU (or a single CPU replica),
    sets up the learning-rate schedule, optimizer and optional summary
    writer, restores or initialises variables, and iterates over the
    training corpus. Depending on the ``*_freq`` settings it periodically
    logs progress, prints samples and beam-search output, validates (with
    early stopping), and saves checkpoints together with their config and
    progress JSON files.

    Args:
        config: namespace of training options.
        sess: TensorFlow session used for all graph execution.

    Exits the process with status 1 on an unknown learning schedule, an
    unknown optimizer, or a factor-count mismatch in the training data.
    """
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # Replicas after the first reuse the variables of the first.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [],
                                  initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        # Default the summary directory to the directory of the model file.
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer,
                           global_step, writer)

    saver, progress = model_loader.init_or_restore_variables(config,
                                                             sess,
                                                             train=True)

    # Resume the step counter from the restored progress.
    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error(
                    'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after and
                 progress.uidx % config.finish_after == 0))
            # The unpacking also requires x_in to be three-dimensional.
            (factors, seqLen, batch_size) = x_in.shape

            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                # Restart the counters for the next reporting window.
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                # Use at most the first 10 sentences of the batch.
                x_small, x_mask_small, y_small = x_in[:, :, : 10], x_mask_in[:, : 10], y_in[:, : 10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                # Use at most the first 10 sentences of the batch.
                x_small, x_mask_small, y_small = x_in[:, :, : 10], x_mask_in[:, : 10], y_in[:, : 10]
                samples = model_set.beam_search(
                    sess,
                    x_small,
                    x_mask_small,
                    config.beam_size,
                    normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                        valid_ce < min(progress.history_errs)):
                    # New best validation value: reset patience and save.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        # Propagate a stop request from the inner loop to the epoch loop.
        if progress.estop:
            break
# Default location of the "<word> <count>" file used to oversample words
# during embedding pre-training.
# NOTE(review): this is a machine-specific absolute path inherited from the
# original code; pass `counts_path` to `_pretrain_embeddings` to override it.
_PRETRAIN_COUNTS_PATH = '/media/ntfs-3/EXP/MULTI/mix/zh-en/data3/glove/vocab.txt'

# Directory that receives the pre-trained embedding matrix, its vocabulary
# and the t-SNE output.
_PRETRAIN_OUTPUT_DIR = 'pre_emb'


def _pretrain_embeddings(config, sess, model,
                         counts_path=_PRETRAIN_COUNTS_PATH,
                         output_dir=_PRETRAIN_OUTPUT_DIR,
                         batch_size=1000, epochs=20, learning_rate=0.001):
    """Pre-train the embeddings of `model` and dump them to `output_dir`.

    Runs plain gradient descent on `model.loss_pre_train` for `epochs`
    passes. Every pass but the last draws its batches from an oversampled
    word list (each word repeated by its count in `counts_path`, shuffled);
    the last pass iterates the reference vocabulary once, unshuffled, and
    collects `model.pre_embedding` for every batch.

    Args:
        config: training options; reads `pretrain_vocab`, `pretrain_vectors`
            and `source_dicts[0]`.
        sess: TensorFlow session whose variables are already initialised.
        model: model replica exposing `loss_pre_train`, `pre_inputs.x`,
            `pre_inputs.y` and `pre_embedding`.
        counts_path: text file with one "<word> <count>" pair per line.
        output_dir: directory for `pre_emb.npy`, `vocab` and `tsne.npy`;
            created if missing.
        batch_size: batch size passed to `util.get_data`.
        epochs: number of passes; must be at least 1 so that the final
            collecting pass happens.
        learning_rate: gradient-descent step size.
    """
    pre_train_op = tf.train.GradientDescentOptimizer(
        learning_rate).minimize(model.loss_pre_train)

    # Load the pre-training data and the related dictionaries.
    gvocab, gvectors = util.pre_load_data(config.pretrain_vocab,
                                          config.pretrain_vectors)
    pre_vocab_list = list(gvocab.keys())

    # Oversampling: repeat every word as many times as its count.
    pre_train_list = []
    with open(counts_path, 'r', encoding='utf-8') as counts_file:
        for line in counts_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            word, count = fields
            pre_train_list.extend([word] * int(count))

    with open(config.source_dicts[0], 'r', encoding='utf-8') as dict_file:
        utf8_dict = json.load(dict_file)

    embedding_list = []
    for epoch in range(epochs):
        logging.info("epoch:{}".format(epoch))
        is_final_epoch = (epoch == epochs - 1)
        if is_final_epoch:
            source_x, source_y, _vocab = util.get_data(
                pre_vocab_list, batch_size, gvocab, gvectors, utf8_dict,
                shuffle=False)
            # The rows collected below are saved next to `pre_vocab_list`,
            # so the order returned by get_data must match it exactly.
            assert _vocab == pre_vocab_list
        else:
            source_x, source_y, _vocab = util.get_data(
                pre_train_list, batch_size, gvocab, gvectors, utf8_dict,
                shuffle=True)
        for idx, (s_x, s_y) in enumerate(zip(source_x, source_y)):
            assert len(s_x) == len(s_y), "{}, {}".format(
                len(s_x), len(s_y))
            sx, sy = util.pre_prepare_data(s_x, s_y)
            feed_dict = {
                model.pre_inputs.x: sx,
                model.pre_inputs.y: sy,
            }
            _, loss, embedding = sess.run(
                [pre_train_op, model.loss_pre_train, model.pre_embedding],
                feed_dict=feed_dict)
            if idx % 100 == 0:
                logging.info("loss:{}".format(loss))
            if is_final_epoch:
                embedding_list.append(embedding)

    # Single concatenation instead of growing the array batch by batch.
    emb = numpy.concatenate(embedding_list)

    # Create the output directory up front so that a missing directory does
    # not discard the whole pre-training run at save time.
    os.makedirs(output_dir, exist_ok=True)
    numpy.save(os.path.join(output_dir, "pre_emb.npy"), emb)
    with open(os.path.join(output_dir, "vocab"), "w",
              encoding="utf-8") as vocab_file:
        vocab_file.write("\n".join(pre_vocab_list))

    # t-SNE visualisation data. The result is not used here; the call is
    # kept because it is handed an output path.
    # NOTE(review): presumably get_tsne writes the projection to that path
    # -- confirm against util.get_tsne.
    util.get_tsne(emb, os.path.join(output_dir, "tsne.npy"))


def _save_checkpoint(sess, saver, config, progress):
    """Save a numbered checkpoint plus its config and progress JSON files."""
    saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
    write_config_to_json_file(
        config, "%s-%s" % (config.saveto, progress.uidx))
    progress_path = '{0}-{1}.progress.json'.format(
        config.saveto, progress.uidx)
    progress.save_to_json(progress_path)


def _log_greedy_samples(sess, model_set, x_in, x_mask_in, y_in,
                        num_to_target):
    """Sample from the first (up to) 10 batch items and log target/sample."""
    x_small = x_in[:, :, :, :10]
    x_mask_small = x_mask_in[:, :, :10]
    y_small = y_in[:, :10]
    samples = model_set.sample(sess, x_small, x_mask_small)
    assert len(samples) == len(x_small.T) == len(y_small.T), \
        (len(samples), x_small.shape, y_small.shape)
    # The source side is not logged (the original SOURCE logging was
    # disabled), so only the targets and samples are iterated.
    for yy, ss in zip(y_small.T, samples):
        target = util.seq2words(yy, num_to_target)
        sample = util.seq2words(ss, num_to_target)
        logging.info('TARGET: {}'.format(target))
        logging.info('SAMPLE: {}'.format(sample))


def _log_beam_samples(sess, model_set, config, x_in, x_mask_in, y_in,
                      num_to_target):
    """Beam-search the first (up to) 10 batch items and log the hypotheses."""
    x_small = x_in[:, :, :, :10]
    x_mask_small = x_mask_in[:, :, :10]
    y_small = y_in[:, :10]
    samples = model_set.beam_search(
        sess, x_small, x_mask_small, config.beam_size,
        normalization_alpha=config.normalization_alpha)
    # samples is a list with shape batch x beam x len
    assert len(samples) == len(x_small.T) == len(y_small.T), \
        (len(samples), x_small.shape, y_small.shape)
    for yy, ss in zip(y_small.T, samples):
        target = util.seq2words(yy, num_to_target)
        logging.info('TARGET: {}'.format(target))
        for i, (sample_seq, cost) in enumerate(ss):
            sample = util.seq2words(sample_seq, num_to_target)
            # NOTE(review): Len is len() of whatever util.seq2words returns;
            # if that is a joined string it counts characters, not tokens --
            # confirm this is intended.
            sample_len = len(sample)
            # An empty hypothesis must not abort training with a
            # ZeroDivisionError; report NaN as the average instead.
            avg_cost = cost / sample_len if sample_len else float('nan')
            msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                i, sample, cost, sample_len, avg_cost)
            logging.info(msg)


def train(config, sess):
    """Build the training graph and run the training loop.

    One model replica is built per available GPU (a single CPU replica when
    there is none). If `config.pretrain` is set, the embeddings of the first
    replica are pre-trained before regular training starts. The loop then
    runs until `config.max_epochs` is reached, early stopping triggers
    (`config.patience`) or `config.finish_after` updates have been made,
    periodically logging throughput, sampling, validating and checkpointing
    according to the `*_freq` options.

    Args:
        config: training options object.
        sess: TensorFlow session to build and train the model in.

    Raises:
        AssertionError: if MAP training is requested (`map_decay_c != 0`)
            without an existing `prior_model` checkpoint.
        SystemExit: on an unknown learning schedule or optimizer.
    """
    assert (config.prior_model is not None
            and tf.train.checkpoint_exists(
                os.path.abspath(config.prior_model))
            or config.map_decay_c == 0.0), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer,
                           global_step, writer)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    # Optional embedding pre-training on the first replica.
    if config.pretrain:
        logging.info("Start pre-training")
        _pretrain_embeddings(config, sess, replicas[0])

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    # Save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, _, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    # The epoch index lives on `progress` so that it is part of the saved
    # training state.
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for pre_source_sents, source_sents, target_sents in text_iterator:
            px_in, x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors,
                pre_source_sents, maxlen=None)
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after and
                 progress.uidx % config.finish_after == 0))
            # x_in is 4-dimensional; its last axis is the batch size.
            _, _, _, batch_size = x_in.shape

            loss = updater.update(sess, px_in, x_in, x_mask_in, y_in,
                                  y_mask_in, write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                _log_greedy_samples(sess, model_set, x_in, x_mask_in, y_in,
                                    num_to_target)

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                _log_beam_samples(sess, model_set, config, x_in, x_mask_in,
                                  y_in, num_to_target)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                is_best = (len(progress.history_errs) == 0 or
                           valid_ce < min(progress.history_errs))
                progress.history_errs.append(valid_ce)
                if is_best:
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        # A new best external score also resets patience.
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                _save_checkpoint(sess, saver, config, progress)

            if (config.finish_after and
                    progress.uidx % config.finish_after == 0):
                logging.info("Maximum number of updates reached")
                # Set the stop flag first so that it is part of the saved
                # progress file.
                progress.estop = True
                _save_checkpoint(sess, saver, config, progress)
                break
        if progress.estop:
            break