示例#1
0
 def train(self,
           config,
           src_file,
           tgt_file,
           src_vocab_info,
           tgt_vocab_info,
           align_file=None,
           model_path=None,
           gpuid=0):
     """Run one training round and return the produced model files."""
     if src_vocab_info['changed'] or tgt_vocab_info['changed']:
         # The vocabulary changed: rewrite the checkpoint so its embedding
         # rows match the new vocabularies before training resumes.
         model_path = checkpoint.update_vocab(
             model_path,
             os.path.join(self._output_dir, 'new_vocab_checkpoint'),
             src_vocab_info['model'],
             tgt_vocab_info['model'],
             new_src_vocab=(src_vocab_info['current']
                            if src_vocab_info['changed'] else None),
             new_tgt_vocab=(tgt_vocab_info['current']
                            if tgt_vocab_info['changed'] else None),
             mode='replace',
             session_config=tf.ConfigProto(device_count={'GPU': 0}))
     options = config['options']
     model_dir, model = self._load_model(
         model_type=options.get('model_type'),
         model_file=options.get('model'),
         model_path=model_path)
     run_config = copy.deepcopy(options.get('config', {}))
     run_config['model_dir'] = model_dir
     data_config = run_config.setdefault('data', {})
     train_config = run_config.setdefault('train', {})
     data_config['source_words_vocabulary'] = src_vocab_info['current']
     data_config['target_words_vocabulary'] = tgt_vocab_info['current']
     data_config['train_features_file'] = src_file
     data_config['train_labels_file'] = tgt_file
     if align_file is not None and os.path.exists(align_file):
         # Alignments enable guided alignment training unless the user
         # already configured a guided alignment type.
         data_config['train_alignments'] = align_file
         params = run_config.setdefault("params", {})
         params.setdefault("guided_alignment_type", "ce")
     if 'train_steps' not in train_config:
         # No explicit step budget: make a single pass over the data.
         train_config['single_pass'] = True
         train_config['train_steps'] = None
     train_config.setdefault('sample_buffer_size', -1)
     train_config.setdefault('average_last_checkpoints', 0)
     runner = onmt.Runner(model,
                          run_config,
                          num_devices=utils.count_devices(gpuid),
                          auto_config=options.get('auto_config', False))
     output_dir = runner.train()
     if output_dir != model_dir:
         # Checkpoint averaging writes to a new directory; keep the model
         # description next to the averaged checkpoint.
         shutil.copy(os.path.join(model_dir, "model_description.py"),
                     output_dir)
     return self._list_model_files(output_dir)
示例#2
0
 def _make_predict_runner(self, config, model_path):
     """Build an inference Runner from the stored run configuration."""
     model_dir, model = self._load_model(model_path=model_path)
     run_config = copy.deepcopy(config['options']['config'])
     run_config['model_dir'] = model_dir
     data_config = run_config['data']
     # Replace every vocabulary entry by its converted counterpart.
     for key in data_config:
         data_config[key] = self._convert_vocab(data_config[key])
     return onmt.Runner(model, run_config)
 def trans(self, config, model_path, input, output, gpuid=0):
     """Translate `input` to `output` using the checkpoint at `model_path`."""
     model_dir, model = self._load_model(model_path=model_path)
     run_config = copy.deepcopy(config['options']['config'])
     run_config['model_dir'] = model_dir
     data_config = run_config['data']
     # Replace every vocabulary entry by its converted counterpart.
     for key in data_config:
         data_config[key] = self._convert_vocab(data_config[key])
     runner = onmt.Runner(model, run_config)
     runner.infer(input, predictions_file=output)
示例#4
0
def init_model_and_runner(config, d, **kwargs):
    """Build a Transformer with embedding size `d` and wrap it in a Runner."""
    source_inputter = opennmt.inputters.WordEmbedder(embedding_size=d)
    target_inputter = opennmt.inputters.WordEmbedder(embedding_size=d)
    model = opennmt.models.Transformer(source_inputter, target_inputter,
                                       **kwargs)
    return opennmt.Runner(model, config, auto_config=True)
示例#5
0
def minimal_transformer_training_example():
    """Minimal OpenNMT-tf example: train or translate a base Transformer."""
    run_type = 'train'  # Either 'train' or 'translate'.
    eval_features_filepath = './toy-ende/src-val.txt'  # Source validation file.

    # See http://opennmt.net/OpenNMT-tf/configuration.html for a complete
    # specification of the configuration.
    config = {
        'model_dir': './checkpoint',
        'data': {
            'source_vocabulary': './toy-ende/src-vocab.txt',
            'target_vocabulary': './toy-ende/tgt-vocab.txt',
            'train_features_file': './toy-ende/src-train.txt',
            'train_labels_file': './toy-ende/tgt-train.txt',
            'eval_features_file': eval_features_filepath,
            'eval_labels_file': './toy-ende/tgt-val.txt',
        }
    }

    runner = onmt.Runner(onmt.models.TransformerBase(), config,
                         auto_config=True)

    if run_type == 'train':
        runner.train()
    elif run_type == 'translate':
        runner.infer(eval_features_filepath)
示例#6
0
 def _make_predict_runner(self, config, model_path):
     """Build an inference Runner, registering vocabularies in the config."""
     options = config['options']
     model_dir, model = self._load_model(
         model_type=options.get('model_type'),
         model_file=options.get('model'),
         model_path=model_path)
     run_config = copy.deepcopy(options['config'])
     run_config['model_dir'] = model_dir
     run_config.setdefault('data', {})
     run_config['data'] = self._register_vocab(config, run_config['data'])
     return onmt.Runner(model, run_config)
示例#7
0
    def _build_runner(self,
                      config,
                      src_vocab=None,
                      tgt_vocab=None,
                      src_file=None,
                      tgt_file=None,
                      align_file=None,
                      example_weights_file=None,
                      model_path=None):
        """Assemble an opennmt.Runner over a fresh temporary model dir."""
        # Start from an empty temporary model directory.
        model_dir = os.path.join(self._output_dir, "model")
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.makedirs(model_dir)

        # Seed it with the checkpoint files, when a checkpoint was given.
        if model_path is not None:
            for filename, path in _list_checkpoint_files(model_path).items():
                shutil.copy(path, os.path.join(model_dir, filename))

        # Convert vocabularies only when the caller did not provide them.
        if src_vocab is None:
            src_vocab = self._convert_vocab(
                config["vocabulary"]["source"]["path"])
        if tgt_vocab is None:
            tgt_vocab = self._convert_vocab(
                config["vocabulary"]["target"]["path"])

        options = config["options"]
        run_config = _build_run_config(
            options.get("config"),
            model_dir,
            src_vocab,
            tgt_vocab,
            src_file=src_file,
            tgt_file=tgt_file,
            align_file=align_file,
            example_weights_file=example_weights_file)
        model = opennmt.load_model(
            model_dir,
            model_file=options.get("model"),
            model_name=options.get("model_type"),
            as_builder=True)
        return opennmt.Runner(
            model,
            run_config,
            auto_config=options.get("auto_config", False),
            mixed_precision=options.get("mixed_precision", False))
示例#8
0
    def _build_runner(
        self,
        config,
        src_vocab=None,
        tgt_vocab=None,
        src_file=None,
        tgt_file=None,
        align_file=None,
        example_weights_file=None,
        model_path=None,
    ):
        """Assemble an opennmt.Runner over a fresh temporary model dir."""
        model_dir = os.path.join(self._output_dir, 'model')
        # Let TensorFlow grow GPU memory on demand instead of pre-allocating.
        os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
        # Start from an empty temporary model directory.
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.makedirs(model_dir)

        # Seed it with the checkpoint files, when a checkpoint was given.
        if model_path is not None:
            for filename, path in _list_checkpoint_files(model_path).items():
                shutil.copy(path, os.path.join(model_dir, filename))

        # Convert vocabularies only when the caller did not provide them.
        if src_vocab is None:
            src_vocab = self._convert_vocab(
                config['vocabulary']['source']['path'])
        if tgt_vocab is None:
            tgt_vocab = self._convert_vocab(
                config['vocabulary']['target']['path'])

        options = config['options']
        run_config = _build_run_config(
            options.get('config'),
            model_dir,
            src_vocab,
            tgt_vocab,
            src_file=src_file,
            tgt_file=tgt_file,
            align_file=align_file,
            example_weights_file=example_weights_file,
        )
        model = opennmt.load_model(
            model_dir,
            model_file=options.get('model'),
            model_name=options.get('model_type'),
            as_builder=True,
        )
        return opennmt.Runner(
            model,
            run_config,
            auto_config=options.get('auto_config', False),
            mixed_precision=options.get('mixed_precision', False),
        )
示例#9
0
 def train(self,
           config,
           src_file,
           tgt_file,
           src_vocab_info,
           tgt_vocab_info,
           model_path=None,
           gpuid=0):
     """Run one training round and return the files of the trained model."""
     if src_vocab_info['changed'] or tgt_vocab_info['changed']:
         # The vocabulary changed: rewrite the checkpoint so its embedding
         # rows match the new vocabularies before training resumes.
         model_path = checkpoint.update_vocab(
             model_path,
             os.path.join(self._output_dir, 'new_vocab_checkpoint'),
             src_vocab_info['model'],
             tgt_vocab_info['model'],
             new_src_vocab=(src_vocab_info['current']
                            if src_vocab_info['changed'] else None),
             new_tgt_vocab=(tgt_vocab_info['current']
                            if tgt_vocab_info['changed'] else None),
             mode='replace',
             session_config=tf.ConfigProto(device_count={'GPU': 0}))
     options = config['options']
     model_dir, model = self._load_model(
         model_type=options.get('model_type'),
         model_file=options.get('model'),
         model_path=model_path)
     run_config = copy.deepcopy(options.get('config', {}))
     run_config['model_dir'] = model_dir
     data_config = run_config.setdefault('data', {})
     train_config = run_config.setdefault('train', {})
     data_config['source_words_vocabulary'] = src_vocab_info['current']
     data_config['target_words_vocabulary'] = tgt_vocab_info['current']
     data_config['train_features_file'] = src_file
     data_config['train_labels_file'] = tgt_file
     if 'train_steps' not in train_config:
         # No explicit step budget: make a single pass over the data.
         train_config['single_pass'] = True
         train_config['train_steps'] = None
     train_config.setdefault('sample_buffer_size', -1)
     runner = onmt.Runner(model,
                          run_config,
                          num_devices=utils.count_devices(gpuid),
                          auto_config=options.get('auto_config', False))
     runner.train()
     return self._list_model_files(model_dir)
示例#10
0
 def train(self, config, src_file, tgt_file, model_path=None, gpuid=0):
     """Train on `src_file`/`tgt_file` and return the trained model files."""
     options = config['options']
     model_dir, model = self._load_model(
         model_type=options.get('model_type'),
         model_file=options.get('model'),
         model_path=model_path)
     run_config = copy.deepcopy(options['config'])
     run_config['model_dir'] = model_dir
     data_config = run_config['data']
     # Replace every vocabulary entry by its converted counterpart.
     for key in data_config:
         data_config[key] = self._convert_vocab(data_config[key])
     data_config['train_features_file'] = src_file
     data_config['train_labels_file'] = tgt_file
     train_config = run_config['train']
     if 'train_steps' not in train_config:
         # No explicit step budget: make a single pass over the data.
         train_config['single_pass'] = True
         train_config['train_steps'] = None
     train_config.setdefault('sample_buffer_size', -1)
     runner = onmt.Runner(model, run_config,
                          num_devices=utils.count_devices(gpuid))
     runner.train()
     return self._list_model_files(model_dir)
示例#11
0
 def train(self,
           config,
           src_file,
           tgt_file,
           model_path=None,
           gpuid=0):
     """Train the configured model and return the resulting model files."""
     model_dir, model = self._load_model(
         model_file=config['options']['model'], model_path=model_path)
     run_config = copy.deepcopy(config['options']['config'])
     run_config['model_dir'] = model_dir
     data_config = run_config['data']
     # Replace every vocabulary entry by its converted counterpart.
     for key in data_config:
         data_config[key] = self._convert_vocab(data_config[key])
     data_config['train_features_file'] = src_file
     data_config['train_labels_file'] = tgt_file
     train_config = run_config['train']
     if 'train_steps' not in train_config:
         # No explicit step budget: make a single pass over the data.
         train_config['single_pass'] = True
         train_config['train_steps'] = None
     onmt.Runner(model, run_config).train()
     return self._list_model_files(model_dir)
    def _build_runner(self,
                      config,
                      src_vocab=None,
                      tgt_vocab=None,
                      src_file=None,
                      tgt_file=None,
                      align_file=None,
                      model_path=None):
        """Create an opennmt.Runner over a fresh temporary model directory."""
        # Start from an empty temporary model directory.
        model_dir = os.path.join(self._output_dir, 'model')
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.makedirs(model_dir)

        # Seed it with the checkpoint files, when a checkpoint was given.
        if model_path is not None:
            for filename, path in _list_checkpoint_files(model_path).items():
                shutil.copy(path, os.path.join(model_dir, filename))

        # Convert the tokenization vocabularies unless already provided.
        if src_vocab is None:
            src_vocab = self._convert_vocab(
                config['tokenization']['source']['vocabulary'])
        if tgt_vocab is None:
            tgt_vocab = self._convert_vocab(
                config['tokenization']['target']['vocabulary'])

        options = config['options']
        run_config = _build_run_config(
            options.get('config'),
            model_dir,
            src_vocab,
            tgt_vocab,
            src_file=src_file,
            tgt_file=tgt_file,
            align_file=align_file)
        model = opennmt.load_model(
            model_dir,
            model_file=options.get('model'),
            model_name=options.get('model_type'),
            serialize_model=False)
        return opennmt.Runner(
            model,
            run_config,
            auto_config=options.get('auto_config', False))
def main():
    """Command-line entry point: train or translate with a base Transformer."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("run", choices=["train", "translate"],
                        help="Run type.")
    parser.add_argument("--src", required=True,
                        help="Path to the source file.")
    parser.add_argument("--tgt",
                        help="Path to the target file.")
    parser.add_argument("--src_vocab", required=True,
                        help="Path to the source vocabulary.")
    parser.add_argument("--tgt_vocab", required=True,
                        help="Path to the target vocabulary.")
    parser.add_argument("--model_dir", default="checkpoint",
                        help="Directory where checkpoint are written.")
    args = parser.parse_args()

    # See http://opennmt.net/OpenNMT-tf/configuration.html for a complete
    # specification of the configuration.
    config = {
        "model_dir": args.model_dir,
        "data": {
            "source_vocabulary": args.src_vocab,
            "target_vocabulary": args.tgt_vocab,
            "train_features_file": args.src,
            "train_labels_file": args.tgt,
        }
    }

    runner = onmt.Runner(onmt.models.TransformerBase(), config,
                         auto_config=True)

    if args.run == "train":
        runner.train()
    elif args.run == "translate":
        runner.infer(args.src)
示例#14
0
import opennmt
import seq_tagger_updated as SequenceTagger

# Run configuration: word and secondary-factor source vocabularies plus a
# target tag vocabulary, with separate train and eval file pairs.
config = {
    "model_dir": "run/",
    "data": {
        "train_features_file": "train_words_bitext.txt",
        "train_labels_file": "train_tags_bitext.txt",
        "eval_features_file": "valid_words_bitext.txt",
        "eval_labels_file": "valid_tags_bitext.txt",
        "source_1_vocabulary": "src-train-vocab.txt",
        "source_2_vocabulary": "src-train-tkt-vocab.txt",
        "target_vocabulary": "tgt-train-vocab.txt",
    },
}

# Catalog LSTM-CNN-CRF tagger (a custom SequenceTagger.model() could be
# substituted here instead).
model = opennmt.models.catalog.LstmCnnCrfTagger()
runner = opennmt.Runner(model, config, auto_config=True)
runner.train(num_devices=2, with_eval=True)