def run_inference(model_dir: str, epoch: Optional[int], device: int, metric: str):
    chainer.config.train = False

    if device >= 0:
        cuda.get_device(device).use()
    set_seed()

    configs = json.load(open(os.path.join(model_dir, "args")))
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"create prediction into {prediction_path}")

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = configs["num_word_vocab"]
    num_char_vocab = configs["num_char_vocab"]
    num_tag_vocab = configs["num_tag_vocab"]

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    chainer.serializers.load_npz(model_path, model)
    logger.debug(f"load {snapshot_file}")

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
    test_iterator = create_iterator(
        vocab, configs, "test", transform, return_original_sentence=True
    )

    with open(prediction_path, "w", encoding="utf-8") as file:
        for batch in test_iterator:
            batch, original_sentences = list(zip(*batch))
            in_arrays, t_arrays = converter(batch, device)
            p_arrays = model.predict(in_arrays)

            # map id arrays back to words and gold/predicted tag strings
            word_sentences, t_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], t_arrays))
            )
            _, p_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], p_arrays))
            )

            sentence_gen = zip(
                word_sentences,
                t_tag_sentences,
                p_tag_sentences,
                original_sentences,
            )  # NOQA
            for ws, ts, ps, _os in sentence_gen:
                for w, t, p, o in zip(ws, ts, ps, _os):
                    # escape literal spaces so each token stays in one column
                    w = w.replace(" ", "<WHITESPACE>")
                    o = o.replace(" ", "<WHITESPACE>")
                    # show the original surface form when normalization changed it
                    if w != o:
                        w = f"{w}({o})"
                    print(f"{w} {t} {p}", file=file)
                print(file=file)
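# The prediction file written above is CoNLL-style: one token per line in the
# form "word gold-tag predicted-tag", with a blank line between sentences.
# The tags below are purely illustrative:
#
#   Barack B-PER B-PER
#   Obama E-PER E-PER
#   visited O O
#   Hawaii S-LOC S-ORG
#
# A token is printed as "word(original)" whenever vocabulary normalization
# changed its surface form.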
def run_inference(
    model_dir: str,
    epoch: Optional[int],
    device: int,
    metric: str,
    tokenizer: str,
):
    chainer.config.train = False

    if device >= 0:
        chainer.get_device(device).use()
    set_seed()

    config = json.load(open(os.path.join(model_dir, "args")))
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"create prediction into {prediction_path}")

    vocab = Vocabulary.prepare(config)
    num_word_vocab = config["num_word_vocab"]
    num_char_vocab = config["num_char_vocab"]
    num_tag_vocab = config["num_tag_vocab"]

    model = BiLSTM_CRF(config, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    logger.debug(f"load {snapshot_file}")
    chainer.serializers.load_npz(model_path, model)

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    word_tokenizer = WordTokenizer(tokenizer=tokenizer)

    # tag sentences read from stdin, one sentence per line
    for line in sys.stdin:
        input_sentence = [str(t) for t in word_tokenizer.tokenize(line)]
        batch = transformer.transform(input_sentence, None)
        in_arr, _ = converter([batch])
        pd_arr = model.predict(in_arr)
        (_, tag_sequence), = transformer.itransform(in_arr[0], pd_arr)
        print(' '.join(f"{word}/{tag}"
                       for word, tag in zip(input_sentence, tag_sequence)))  # NOQA
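# A sketch of how this stdin loop is driven from a shell. The entry-point
# name and CLI flags here are hypothetical (the real argument parser is not
# shown in this snippet); the output format matches the print call above:
#
#   $ echo "Barack Obama visited Hawaii" | \
#       python inference.py --model ./model --device -1 --metric validation/main/fscore
#   Barack/B-PER Obama/E-PER visited/O Hawaii/S-LOC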
def run_training(config: str, device: int, seed: int):
    configs = ConfigParser.parse(config)

    if device >= 0:
        cuda.get_device(device).use()
    set_seed(seed, device)

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = max(vocab.dictionaries["word2idx"].values()) + 1
    num_char_vocab = max(vocab.dictionaries["char2idx"].values()) + 1
    num_tag_vocab = max(vocab.dictionaries["tag2idx"].values()) + 1

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform

    external_configs = configs["external"]
    if "word_vector" in external_configs:
        # overwrite the word embedding matrix with pre-trained vectors
        syn0 = model.embed_word.W.data
        _, word_dim = syn0.shape
        pre_word_dim = vocab.gensim_model.vector_size
        if word_dim != pre_word_dim:
            msg = "Mismatch vector size between model and pre-trained word vectors "  # NOQA
            msg += f"(model: \x1b[31m{word_dim}\x1b[0m"
            msg += f", pre-trained word vector: \x1b[31m{pre_word_dim}\x1b[0m)"
            raise Exception(msg)

        word2idx = vocab.dictionaries["word2idx"]
        syn0 = prepare_pretrained_word_vector(
            word2idx, vocab.gensim_model, syn0, num_word_vocab)
        model.set_pretrained_word_vectors(syn0)

    train_iterator = create_iterator(vocab, configs, "train", transform)
    valid_iterator = create_iterator(vocab, configs, "valid", transform)
    test_iterator = create_iterator(vocab, configs, "test", transform)

    if device >= 0:
        model.to_gpu(device)

    optimizer = create_optimizer(configs)
    optimizer.setup(model)
    optimizer = add_hooks(optimizer, configs)

    updater = T.StandardUpdater(
        train_iterator, optimizer, converter=converter, device=device)

    params = configs.export()
    params["num_word_vocab"] = num_word_vocab
    params["num_char_vocab"] = num_char_vocab
    params["num_tag_vocab"] = num_tag_vocab

    epoch = configs["iteration"]["epoch"]
    trigger = (epoch, "epoch")

    model_path = configs["output"]
    timestamp = datetime.datetime.now()
    timestamp_str = timestamp.isoformat()
    output_path = Path(f"{model_path}.{timestamp_str}")

    trainer = T.Trainer(updater, trigger, out=output_path)
    save_args(params, output_path)
    msg = f"Create \x1b[31m{output_path}\x1b[0m for saving model snapshots"
    logger.debug(msg)

    entries = ["epoch", "iteration", "elapsed_time", "lr", "main/loss"]
    entries += ["validation/main/loss", "validation/main/fscore"]
    entries += ["validation_1/main/loss", "validation_1/main/fscore"]

    valid_evaluator = NamedEntityEvaluator(
        valid_iterator, model, transformer.itransform, converter, device=device)
    test_evaluator = NamedEntityEvaluator(
        test_iterator, model, transformer.itransform, converter, device=device)

    epoch_trigger = (1, "epoch")
    snapshot_filename = "snapshot_epoch_{.updater.epoch:04d}"
    trainer.extend(valid_evaluator, trigger=epoch_trigger)
    trainer.extend(test_evaluator, trigger=epoch_trigger)
    trainer.extend(E.observe_lr(), trigger=epoch_trigger)
    trainer.extend(E.LogReport(trigger=epoch_trigger))
    trainer.extend(E.PrintReport(entries=entries), trigger=epoch_trigger)
    trainer.extend(E.ProgressBar(update_interval=20))
    trainer.extend(E.snapshot_object(model, filename=snapshot_filename),
                   trigger=epoch_trigger)

    if "learning_rate_decay" in params:
        logger.debug("Enable learning rate decay")
        trainer.extend(
            LearningRateDecay("lr", params["learning_rate"],
                              params["learning_rate_decay"]),
            trigger=epoch_trigger,
        )

    trainer.run()
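# `LearningRateDecay` is project code not shown in this snippet. A minimal
# sketch of a compatible Chainer extension, assuming the intended semantics
# are exponential decay of an optimizer attribute each time the trigger
# fires (this is a hypothetical reconstruction, not the actual class):
class LearningRateDecay(chainer.training.Extension):

    def __init__(self, attr, init_value, decay_rate):
        self.attr = attr                # optimizer attribute, e.g. "lr"
        self.init_value = init_value    # value the attribute starts from
        self.decay_rate = decay_rate    # multiplicative decay per epoch

    def __call__(self, trainer):
        optimizer = trainer.updater.get_optimizer("main")
        value = self.init_value * self.decay_rate ** trainer.updater.epoch
        setattr(optimizer, self.attr, value)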
import json
import logging
import pathlib

import chainer

# parse_inference_args, set_seed, select_snapshot, Vocabulary, and BiLSTM_CRF
# are project-local imports omitted from this snippet.

if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    fmt = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=fmt)

    args = parse_inference_args()
    chainer.config.train = False

    if args.device >= 0:
        chainer.cuda.get_device(args.device).use()
    set_seed()

    model_dir = pathlib.Path(args.model)
    configs = json.load(open(model_dir / 'args'))
    metric = args.metric.replace('/', '.')
    snapshot_file, prediction_path = select_snapshot(args, model_dir)
    logger.debug(f'create prediction into {prediction_path}')

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = configs['num_word_vocab']
    num_char_vocab = configs['num_char_vocab']
    num_tag_vocab = configs['num_tag_vocab']
    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)
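# `select_snapshot` is defined elsewhere in the project. A rough sketch of
# its assumed behaviour (hypothetical reconstruction): pick a snapshot by an
# explicit epoch when given, otherwise by the best value of `metric` in the
# LogReport file, matching the "snapshot_epoch_XXXX" naming used in training.
def select_snapshot(args, model_dir):
    if args.epoch is not None:
        snapshot_file = f'snapshot_epoch_{args.epoch:04d}'
    else:
        log = json.load(open(model_dir / 'log'))
        best = max(log, key=lambda entry: entry.get(args.metric, float('-inf')))
        snapshot_file = f"snapshot_epoch_{best['epoch']:04d}"
    prediction_path = model_dir / f'{snapshot_file}.prediction'
    return snapshot_file, prediction_path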
        f'- match1: \x1b[31m{match1}\x1b[0m, match2: \x1b[31m{match2}\x1b[0m'
    )  # NOQA
    return syn0


if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    fmt = '[%(name)s] %(asctime)s : %(threadName)s : %(levelname)s : %(message)s'  # NOQA
    logging.basicConfig(level=logging.DEBUG, format=fmt)

    args = parse_train_args()
    params = yaml.safe_load(open(args.config, encoding='utf-8'))

    if args.device >= 0:
        chainer.cuda.get_device(args.device).use()
    set_seed(args.seed, args.device)

    configs = ConfigParser.parse(args.config)
    config_path = Path(args.config)

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = max(vocab.dictionaries['word2idx'].values()) + 1
    num_char_vocab = max(vocab.dictionaries['char2idx'].values()) + 1
    num_tag_vocab = max(vocab.dictionaries['tag2idx'].values()) + 1

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
    external_configs = configs['external']
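# `set_seed` is project code not shown in this snippet. A minimal sketch of
# what it is assumed to do: seed every RNG in play, including CuPy when a
# GPU device is selected (hypothetical reconstruction):
import random

import numpy


def set_seed(seed=42, device=-1):
    random.seed(seed)
    numpy.random.seed(seed)
    if device >= 0:
        chainer.cuda.cupy.random.seed(seed)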