Example #1
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("iterator")
    train_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        for inst in train_data_loader.iter_instances():
            print(inst)
            return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
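A minimal sketch of how run_config might be invoked. The config keys mirror the ones the function pops; the concrete reader/model/trainer values are borrowed from the simple_tagger fixture in Example #4 below, and the data paths are hypothetical.

import json

# Hedged usage sketch: keys match what run_config pops; paths are hypothetical.
config = json.dumps({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/sequence_tagging.tsv",
    "validation_data_path": "data/sequence_tagging.tsv",
    "iterator": {"type": "basic", "batch_size": 2},
    # Omitting "model" runs the 'dataset' preview mode instead of training.
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
        },
        "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
    },
    "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
})
components = run_config(config)  # dict with "params", "dataset_reader", "vocab", "model"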
Example #2
    def test_ultra_fine_reader(self):
        reader = get_reader("entity")
        instances = ensure_list(
            reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 2)

        # Check that first instance's tokens are correct
        tokens_0 = [x.text for x in instances[0]['tokens']]
        segments_0 = list(instances[0]['segment_ids'].array)
        actual = list(zip(tokens_0, segments_0))
        expected = [('[CLS]', 0), ('the', 0), ('british', 0),
                    ('information', 0), ('commissioner', 0), ("'s", 0),
                    ('office', 0), ('invites', 0), ('[unused0]', 0), ('to', 0),
                    ('locate', 0), ('its', 0), ('add', 0), ('##ress', 0),
                    ('using', 0), ('google', 0), ('[UNK]', 0), ('.', 0),
                    ('[SEP]', 0), ('web', 1), ('users', 1), ('[SEP]', 1)]
        self.assertListEqual(actual, expected)

        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        expected_labels = [[0, 0, 0, 0, 0, 0, 1, 0, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0]]
        self.assertEqual(batch['label_ids'].numpy().tolist(), expected_labels)
Example #3
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)

        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]

        predictions.extend(batch_labels)

    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
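A hedged sketch of a call site for write_for_official_eval; the label mapping and file paths below are illustrative placeholders, not values from the original project.

# Hypothetical label mapping and paths, for illustration only.
label_ids_to_label = {0: "no_relation", 1: "relation"}
write_for_official_eval(
    model_archive_file="model.tar.gz",
    test_file="data/test.json",
    output_file="predictions.txt",
    label_ids_to_label=label_ids_to_label,
)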
Example #4
    def setUp(self):
        super().setUp()
        params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "cuda_device": -1,
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            instances=(instance for dataset in all_datasets.values()
                       for instance in dataset),
        )
        model = Model.from_params(vocab=vocab, params=params.pop("model"))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets["train"]
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR,
                                         "test_search_learning_rate")

        self.trainer = TrainerBase.from_params(
            model=model,
            serialization_dir=serialization_dir,
            iterator=iterator,
            train_data=train_data,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
Example #5
    def ensure_model_can_train_save_and_load(self, param_file: str):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].numpy(),
                            loaded_model.state_dict()[key].numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        iterator = DataIterator.from_params(params['iterator'])

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        model_dataset.index_instances(model.vocab)
        model_batch_arrays = next(iterator(model_dataset, shuffle=False))
        model_batch = arrays_to_variables(model_batch_arrays, for_training=False)
        loaded_dataset = reader.read(params['validation_data_path'])
        loaded_dataset.index_instances(loaded_model.vocab)
        loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False))
        loaded_batch = arrays_to_variables(loaded_batch_arrays, for_training=False)

        # The datasets themselves should be identical.
        for key in model_batch.keys():
            field = model_batch[key]
            if isinstance(field, dict):
                for subfield in field:
                    self.assert_fields_equal(model_batch[key][subfield],
                                             loaded_batch[key][subfield],
                                             tolerance=1e-6,
                                             name=key + '.' + subfield)
            else:
                self.assert_fields_equal(model_batch[key], loaded_batch[key], 1e-6, key)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        model_predictions = model.forward(**model_batch)
        loaded_model_predictions = loaded_model.forward(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     tolerance=1e-4,
                                     name=key)

        return model, loaded_model
Example #6
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # imported here to keep the experiment reproducible
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
Example #7
def main(config_file):
    config = Params.from_file(config_file)
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    iterator_params = config['iterator']
    iterator_keys = list(iterator_params.keys())
    for key in iterator_keys:
        if key != 'batch_size':
            del iterator_params[key]
    iterator_params['type'] = 'basic'
    iterator = DataIterator.from_params(iterator_params)
    evaluation_data_path = config['validation_data_path']

    expected_version = '1.1'
    with open(evaluation_data_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        if dataset_json['version'] != expected_version:
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        official_script_dataset = dataset_json['data']

    cuda_device = 0
    squad_eval.verbosity = 1
    model = Model.load(config, cuda_device=cuda_device)

    # Load the evaluation data
    print("Reading evaluation data from %s" % evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model._vocab)

    model.eval()
    generator = iterator(dataset, num_epochs=1, shuffle=False)
    print("Predicting best spans for the evaluation data")
    best_spans = []
    result_dict = {}
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch,
                                           cuda_device,
                                           for_training=False)
        result = model.forward(**tensor_batch)
        best_span_tensor = result['best_span']
        for i in range(best_span_tensor.size(0)):
            best_spans.append(best_span_tensor[i].data.cpu().tolist())
    for best_span, instance in zip(best_spans, dataset.instances):
        span_tokens = instance.fields['passage'].tokens[
            best_span[0]:best_span[1]]
        # We have to do some hacks to get from our tokens back to the original passage text, so
        # that our answers get scored correctly.  This could be made much easier if we kept around
        # the character offset in the original text when we tokenize things.
        span_text = fix_span_text(span_tokens,
                                  instance.metadata['original_passage'])
        question_id = instance.metadata['id']
        result_dict[question_id] = span_text
    metrics = model.get_metrics()
    official_result = squad_eval.evaluate(official_script_dataset, result_dict)
    print("Our model's metrics:", metrics)
    print("Official result:", official_result)
Example #8
    def test_wic_reader_entity_markers(self):
        reader_params = Params({
            "type": "wic",
            "entity_markers": True,
            "tokenizer_and_candidate_generator": {
                "type": "bert_tokenizer_and_candidate_generator",
                "entity_candidate_generators": {
                    "wordnet": {
                        "type":
                        "wordnet_mention_generator",
                        "entity_file":
                        "tests/fixtures/wordnet/entities_fixture.jsonl"
                    }
                },
                "entity_indexers": {
                    "wordnet": {
                        "type": "characters_tokenizer",
                        "tokenizer": {
                            "type": "word",
                            "word_splitter": {
                                "type": "just_spaces"
                            },
                        },
                        "namespace": "entity"
                    }
                },
                "bert_model_type":
                "tests/fixtures/evaluation/wic/vocab_entity_markers.txt",
                "do_lower_case": True,
            },
        })

        reader = DatasetReader.from_params(reader_params)
        instances = reader.read(FIXTURES + '/train')
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        self.assertTrue(len(batch['label_ids']) == 5)

        self.assertEqual(batch['index_a'][0].item(), 3)
        self.assertEqual(batch['index_b'][0].item(), 12)

        instance_0_text = [
            token.text for token in instances[0].fields['tokens'].tokens
        ]
        expected_instance_0_text = [
            '[CLS]', '[UNK]', '[UNK]', '[e1start]', '[UNK]', '[e1end]',
            '[UNK]', '[UNK]', '[UNK]', '.', '[SEP]', '[UNK]', '[e2start]',
            '[UNK]', '[e2end]', '[UNK]', 'over', '[UNK]', '.', '[SEP]'
        ]
        self.assertEqual(instance_0_text, expected_instance_0_text)
        self.assertEqual(instance_0_text[3], '[e1start]')
        self.assertEqual(instance_0_text[12], '[e2start]')
Example #9
    def test_reader(self):
        reader = get_reader(masked_lm_prob=0.15)

        np.random.seed(5)
        instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

        vocab = Vocabulary.from_params(Params({
            "directory_path": "tests/fixtures/bert/vocab_dir_with_entities_for_tokenizer_and_generator"
        }))
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        actual_tokens_ids = batch['tokens']['tokens']
        expected_tokens_ids = torch.tensor(
            [[16, 18, 19, 20,  1, 19, 21, 13, 17, 21,  3,  4, 12, 13, 17],
             [16,  1, 13, 17, 21,  1,  1, 13, 17,  0,  0,  0,  0,  0,  0]])

        self.assertEqual(actual_tokens_ids.tolist(), expected_tokens_ids.tolist())

        actual_entities = batch['candidates']['wordnet']['candidate_entities']['ids']
        expected_entities = torch.tensor(
            [[[29, 30],
              [31,  0],
              [31,  0]],

             [[ 0,  0],
              [ 0,  0],
              [ 0,  0]]])
        self.assertEqual(actual_entities.tolist(), expected_entities.tolist())

        expected_spans = torch.tensor(
            [[[ 1,  3],
              [ 2,  3],
              [ 5,  6]],

             [[-1, -1],
              [-1, -1],
              [-1, -1]]])
        actual_spans = batch['candidates']['wordnet']['candidate_spans']
        self.assertEqual(actual_spans.tolist(), expected_spans.tolist())

        expected_lm_labels = torch.tensor(
                [[ 0,  0,  0,  0,  0,  0, 20,  0,  0,  2,  0,  0,  0,  0,  0],
                 [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
        actual_lm_labels = batch['lm_label_ids']['lm_labels']
        self.assertEqual(actual_lm_labels.tolist(), expected_lm_labels.tolist())

        expected_segment_ids = torch.tensor(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
             [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])
        self.assertEqual(batch['segment_ids'].tolist(), expected_segment_ids.tolist())
        self.assertTrue(batch['segment_ids'].dtype == torch.long)
Example #10
def evaluate_from_args(args: argparse.Namespace):
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                         if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    csv_writer = csv.writer(args.output_file)

    keys = None
    for instance in instances:
        metrics = evaluate(model, [instance], iterator, args.cuda_device, args.batch_weight_key)

        if keys is None:
            keys = sorted(metrics.keys())
            csv_writer.writerow(['instance_id', *keys])

        instance_id = instance.fields['metadata']['id']

        values = [metrics[key] for key in keys]
        csv_writer.writerow([instance_id, *values])
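Since evaluate_from_args only reads attributes off the parsed namespace, it can be wired up as in the sketch below. The flag names are assumptions; only the attribute names (archive_file, input_file, output_file, and so on) are fixed by the function above.

import argparse
import sys

# Hedged sketch: argparse maps --output-file to args.output_file, etc.
parser = argparse.ArgumentParser()
parser.add_argument('archive_file')
parser.add_argument('input_file')
parser.add_argument('--output-file', type=argparse.FileType('w'), default=sys.stdout)
parser.add_argument('--weights-file', default=None)
parser.add_argument('--cuda-device', type=int, default=-1)
parser.add_argument('--overrides', default="")
parser.add_argument('--batch-weight-key', default="")
parser.add_argument('--extend-vocab', action='store_true')
parser.add_argument('--embedding-sources-mapping', default="")

evaluate_from_args(parser.parse_args())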
Example #11
    def setUp(self):
        super().setUp()
        params = Params({
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {
                            "tokens": {
                                "type": "embedding",
                                "embedding_dim": 5
                            }
                        }
                    },
                    "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                    }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                    "cuda_device": -1,
                    "num_epochs": 2,
                    "optimizer": "adam"
                }
            })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for dataset in all_datasets.values()
             for instance in dataset)
        )
        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets['train']
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

        self.trainer = Trainer.from_params(model,
                                           serialization_dir,
                                           iterator,
                                           train_data,
                                           params=trainer_params,
                                           validation_data=None,
                                           validation_iterator=None)
Example #12
    def test_sample(self):
        generator_params = Params.from_file(
            "kglm/tests/fixtures/training_config/kglm.json")
        params = Params.from_file(self.param_file)
        dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

        # Need instances from 'generative' reader!
        reader_params = generator_params['dataset_reader']
        reader = DatasetReader.from_params(reader_params)
        instances = list(reader.read(dataset_file))
        iterator = DataIterator.from_params(generator_params['iterator'])
        iterator.index_with(self.model.vocab)
        batch, _ = next(iterator(instances, shuffle=False))
        self.model.sample(**batch)
Example #13
def run_evaluation(evaluation_file, model_archive, random_candidates=False):

    archive = load_archive(model_archive)
    model = archive.model
    vocab = model.vocab
    params = archive.config

    model.multitask = False
    model.multitask_kg = False
    model.cuda()
    model.eval()
    for p in model.parameters():
        p.requires_grad_(False)

    reader_params = params.pop('dataset_reader')
    if reader_params['type'] == 'multitask_reader':
        reader_params = reader_params['dataset_readers']['language_modeling']

    if random_candidates:
        for k, v in reader_params['base_reader'][
                'tokenizer_and_candidate_generator'][
                    'entity_candidate_generators'].items():
            v['random_candidates'] = True

    reader = DatasetReader.from_params(Params(reader_params))

    iterator = DataIterator.from_params(
        Params({
            "type": "self_attn_bucket",
            "batch_size_schedule": "base-11gb-fp32",
            "iterator": {
                "type": "bucket",
                "batch_size": 32,
                "sorting_keys": [["tokens", "num_tokens"]],
                "max_instances_in_memory": 2500,
            }
        }))
    iterator.index_with(vocab)
    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(
            tqdm.tqdm(iterator(instances, num_epochs=1))):
        b = move_to_device(batch, 0)
        loss = model(**b)
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
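A hedged call-site sketch; both paths are hypothetical. Passing random_candidates=True rewrites the reader config so each entity candidate generator samples random candidates, giving an ablation baseline.

# Hypothetical paths, for illustration only.
run_evaluation("data/heldout.jsonl", "knowbert_model.tar.gz", random_candidates=False)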
Example #14
    def __init__(self,
                 model_archive,
                 batch_size=32,
                 masking_strategy=None,
                 wordnet_entity_file=None,
                 vocab_dir=None):

        # get bert_tokenizer_and_candidate_generator
        if os.path.isdir(model_archive):
            config = Params.from_file(
                os.path.join(model_archive, 'config.json'))
        else:
            config = _extract_config_from_archive(cached_path(model_archive))

        # look for the bert_tokenizers and candidate_generator
        candidate_generator_params = _find_key(
            config['dataset_reader'].as_dict(),
            'tokenizer_and_candidate_generator')

        if wordnet_entity_file is not None:
            candidate_generator_params['entity_candidate_generators'][
                'wordnet']['entity_file'] = wordnet_entity_file

        self.tokenizer_and_candidate_generator = TokenizerAndCandidateGenerator.from_params(
            Params(candidate_generator_params))
        self.tokenizer_and_candidate_generator.whitespace_tokenize = False

        assert masking_strategy is None or masking_strategy == 'full_mask'
        self.masking_strategy = masking_strategy

        # need bert_tokenizer_and_candidate_generator
        if vocab_dir is not None:
            vocab_params = Params({"directory_path": vocab_dir})
        else:
            vocab_params = config['vocabulary']
        self.vocab = Vocabulary.from_params(vocab_params)

        self.iterator = DataIterator.from_params(
            Params({
                "type": "basic",
                "batch_size": batch_size
            }))
        self.iterator.index_with(self.vocab)
Example #15
File: testing.py Project: zxlzr/kb
def get_wsd_reader(is_training,
                   use_bert_indexer=False,
                   wordnet_entity_file=None):
    if wordnet_entity_file is None:
        wordnet_entity_file = "tests/fixtures/wordnet/entities_cat_hat.jsonl"

    if use_bert_indexer:
        bert_fixtures = get_bert_test_fixture()
        indexer_params = bert_fixtures["indexer_params"]
    else:
        indexer_params = {"type": "single_id", "lowercase_tokens": True}

    reader_params = {
        "type": "wordnet_fine_grained",
        "wordnet_entity_file": wordnet_entity_file,
        "token_indexers": {
            "tokens": indexer_params,
        },
        "entity_indexer": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": "entity"
        },
        "is_training": is_training,
        "use_surface_form": False
    }
    reader = DatasetReader.from_params(Params(reader_params))

    vocab_params = {
        "directory_path": "tests/fixtures/wordnet/cat_hat_vocabdir"
    }
    vocab = Vocabulary.from_params(Params(vocab_params))

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    return reader, vocab, iterator
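A hedged sketch of how the returned (reader, vocab, iterator) triple is typically consumed, following the batch-grabbing pattern used throughout these tests; the fixture path is hypothetical.

reader, vocab, iterator = get_wsd_reader(is_training=True)
instances = reader.read("tests/fixtures/wordnet/wsd_dataset.json")  # hypothetical fixture
for batch in iterator(instances, num_epochs=1, shuffle=False):
    break  # grab a single indexed batch, as in the reader tests above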
Example #16
    def test_wic_reader(self):
        reader_params = Params({
            "type": "wic",
            "tokenizer_and_candidate_generator": {
                "type": "bert_tokenizer_and_candidate_generator",
                "entity_candidate_generators": {
                    "wordnet": {
                        "type":
                        "wordnet_mention_generator",
                        "entity_file":
                        "tests/fixtures/wordnet/entities_fixture.jsonl"
                    }
                },
                "entity_indexers": {
                    "wordnet": {
                        "type": "characters_tokenizer",
                        "tokenizer": {
                            "type": "word",
                            "word_splitter": {
                                "type": "just_spaces"
                            },
                        },
                        "namespace": "entity"
                    }
                },
                "bert_model_type": "tests/fixtures/bert/vocab.txt",
                "do_lower_case": True,
            },
        })

        reader = DatasetReader.from_params(reader_params)
        instances = reader.read(FIXTURES + '/train')
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        self.assertTrue(len(batch['label_ids']) == 5)

        self.assertEqual(batch['index_a'][0].item(), 3)
        self.assertEqual(batch['index_b'][0].item(), 10)
Example #17
    def test_sample(self):
        generator_params = Params.from_file(
            "kglm/tests/fixtures/training_config/kglm.no-shortlist.json")
        params = Params.from_file(self.param_file)
        dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

        # Need instances from 'generative' reader!
        reader_params = generator_params['dataset_reader']
        reader_params['mode'] = 'generative'
        reader = DatasetReader.from_params(reader_params)
        instances = list(reader.read(dataset_file))

        iterator = DataIterator.from_params(generator_params['iterator'])
        iterator.index_with(self.model.vocab)
        batch, _ = next(iterator(instances, shuffle=False))

        # Samples should match (we'll test by comparing logp)
        torch.manual_seed(123)
        logp1 = self.model.sample(**batch).get('logp', None)
        torch.manual_seed(123)
        logp2 = self.model.sample(**batch).get('logp', None)
        # Complete the check the comment above promises; torch.as_tensor
        # accepts either a float or a tensor logp.
        self.assertIsNotNone(logp1)
        self.assertTrue(torch.equal(torch.as_tensor(logp1), torch.as_tensor(logp2)))
Example #18
def get_wic_batch():
    fixtures = 'tests/fixtures/evaluation/wic'

    reader_params = Params({
        "type": "wic",
        "tokenizer_and_candidate_generator": {
            "type": "bert_tokenizer_and_candidate_generator",
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "tests/fixtures/wordnet/entities_fixture.jsonl"
                }
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {
                            "type": "just_spaces"
                        },
                    },
                    "namespace": "entity"
                }
            },
            "bert_model_type": "tests/fixtures/bert/vocab.txt",
            "do_lower_case": True,
        },
    })

    reader = DatasetReader.from_params(reader_params)
    instances = reader.read(fixtures + '/train')
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    return batch
Example #19
def knowbert_fill2(sentences, model, batcher, vocab, mask_start=0, mask_end=0, config_file=None, top=10):
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    config = Params.from_file(config_file)
    vocab_params = config['vocabulary']
    iterator.index_with(Vocabulary.from_params(vocab_params))
    instances = []
    for sent in sentences:
        token_candidates = batcher.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sent.replace('[MASK]', ' [MASK] '))
        masked_tokens = token_candidates['tokens'].copy()
        for i in range(mask_start, mask_end):
            masked_tokens[i] = '[MASK]'
        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            indices_to_mask = []
            for k, candidate_span in enumerate(candidates[candidate_key]['candidate_spans']):
                if (mask_start <= candidate_span[0] <= mask_end - 1
                        or mask_start <= candidate_span[1] <= mask_end - 1):
                    indices_to_mask.append(k)
            for ind in indices_to_mask:
                candidates[candidate_key]['candidate_entities'][ind] = ['@@MASK@@']
                candidates[candidate_key]['candidate_entity_priors'][ind] = [1.0]
            if len(indices_to_mask) == 0:
                candidates[candidate_key]['candidate_spans'].append([mask_start, mask_end-1])
                candidates[candidate_key]['candidate_entities'].append(['@@MASK@@'])
                candidates[candidate_key]['candidate_entity_priors'].append([1.0])
                candidates[candidate_key]['candidate_segment_ids'].append(0)
        fields = batcher.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        instances.append(Instance(fields))
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        print(batch['tokens']['tokens'])
        model_output = model(**batch)
        print([vocab[w] for w in batch['tokens']['tokens'][0].numpy()])
        logits, _ = model.pretraining_heads(model_output['contextual_embeddings'], model_output['pooled_output'])
        log_probs = F.log_softmax(logits, dim=-1).cpu()
        for mask_ind in range(mask_start, mask_end):
            topk = torch.topk(log_probs[0, mask_ind], top, -1)[1]
            print([vocab[t.item()] for t in topk])
Example #20
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Example #21
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Example #22
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
Example #23
    def ensure_model_can_train_save_and_load(
            self,
            param_file: str,
            tolerance: float = 1e-4,
            cuda_device: int = -1,
            gradients_to_ignore: Set[str] = None,
            overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file,
                                      save_dir,
                                      overrides=overrides)
        loaded_model = load_archive(archive_file,
                                    cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)

        # Need to duplicate params because DatasetReader.from_params will consume.
        reader_params = params['dataset_reader']
        reader_params2 = Params(copy.deepcopy(reader_params.as_dict()))

        reader = DatasetReader.from_params(reader_params)
        reader2 = DatasetReader.from_params(reader_params2)

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        seed_params = Params({
            "random_seed": 5,
            "numpy_seed": 5,
            "pytorch_seed": 5
        })
        prepare_environment(seed_params)
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        seed_params = Params({
            "random_seed": 5,
            "numpy_seed": 5,
            "pytorch_seed": 5
        })
        prepare_environment(seed_params)
        loaded_dataset = reader2.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch,
                                                      gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key,
                                     1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Example #24
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check that the loaded model's loss exists and that we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
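A hedged usage sketch of how a test class might call the helper above; the fixture paths and the ModelTestCase base class are assumptions for illustration, not part of the original example.

# Usage sketch (assumed fixture paths and base class).
class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        # set_up_model stores the config path on self.param_file.
        self.set_up_model('tests/fixtures/my_model/experiment.json',
                          'tests/fixtures/my_model/instances.jsonl')

    def test_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)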
Example #25
def predict(archive_folder, span_prediction_file, output_file, cuda_device):
    '''
    span_prediction_file (jsonl) needs at least three fields per record:
        doc_id: str, words: List[str], field: List[Tuple[start_index, end_index, type]]

    Writes output_file (jsonl) with one record per document:
        {
            'doc_id' : str,
            'pairwise_coreference_scores' : List[((s_1, e_1), (s_2, e_2), float in [0, 1] rounded to 4 digits)]
        }
    '''
    import_submodules("scirex")
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader_params.pop('type')
    dataset_reader = ScirexCoreferenceEvalReader.from_params(
        params=dataset_reader_params, field="ner")
    instances = dataset_reader.read(span_prediction_file)

    batch = Batch(instances)
    batch.index_instances(model.vocab)

    # Drop the configured batch size and use a larger one for evaluation.
    config['iterator'].pop('batch_size')
    data_iterator = DataIterator.from_params(config["iterator"],
                                             batch_size=1000)
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch,
                                               cuda_device)  # Put on GPU.
                pred = model(**batch)
                decoded = model.decode(pred)

            metadata = decoded["metadata"]
            label_prob: List[float] = [
                float(x) for x in decoded["label_probs"]
            ]
            doc_ids: List[str] = [m["doc_id"] for m in metadata]
            span_premise = [m["span_premise"] for m in metadata]
            span_hypothesis = [m["span_hypothesis"] for m in metadata]
            fields = [m["field"] for m in metadata]
            assert len(set(fields)) == 1, f"expected a single field type per batch, got {set(fields)}"

            for doc_id, span_p, span_h, p in zip(doc_ids, span_premise,
                                                 span_hypothesis, label_prob):
                if doc_id not in documents:
                    documents[doc_id] = {
                        "doc_id": doc_id,
                        "pairwise_coreference_scores": []
                    }

                documents[doc_id]["pairwise_coreference_scores"].append(
                    ((span_p[0], span_p[1]), (span_h[0], span_h[1]),
                     round(p, 4)))

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
Example #26
def train_model(params: Union[Params, Dict[str, Any]], cuda_device: int,
                serialization_dir: str, filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    # TeeLogger's signature differs across versions; fall back to the
    # two-argument form if the three-argument form is unsupported.
    try:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr, True)  # type: ignore
    except TypeError:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)
    else:
        validation_data = None

    # The vocabulary must be loaded from a pre-built directory rather than
    # created from the dataset, so `directory_path` is required here.
    vocab_params = params.pop("vocabulary", {})
    assert 'directory_path' in vocab_params, "a pre-built vocabulary (directory_path) is required"
    vocab = Vocabulary.from_params(vocab_params, None)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
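A hedged sketch of the experiment specification this entry point consumes; the top-level keys mirror the params.pop(...) calls above, but every concrete value (registered names, paths) is hypothetical.

from allennlp.common import Params

experiment = Params({
    "dataset_reader": {
        "type": "fever",  # assumed registered name for FEVERReader
        "read_settings": {"replace_gold": False, "pad_with_nearest": 0},
    },
    "train_data_path": "data/fever/train.jsonl",
    "validation_data_path": "data/fever/dev.jsonl",
    "vocabulary": {"directory_path": "data/fever/vocabulary"},
    "model": {"type": "esim"},  # assumed model type
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
})
model = train_model(experiment, cuda_device=-1,
                    serialization_dir="output/fever_run",
                    filtering="")  # filtering's role is not shown above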
Example #27
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    combine_span_and_cluster_file(span_file, cluster_file)

    test_file = 'tmp_relation_42424242.jsonl'
    with open(os.path.join(archive_folder, 'metrics.json')) as f:
        relation_threshold = json.load(f)['best_validation__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    model.prediction_mode = True
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    # Index all instances with the model's vocabulary before iteration.
    Batch(instances).index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            predicted_relations = n_ary_relations['candidates']
            scores = n_ary_relations['scores']
            try:
                metadata = output_res['n_ary_relation']['metadata'][0]
            except (KeyError, IndexError):
                continue
            doc_id = metadata['doc_id']
            coref_key_map = {
                k: i
                for i, k in metadata['document_metadata']
                ['cluster_name_to_id'].items()
            }

            for i, rel in enumerate(predicted_relations):
                predicted_relations[i] = tuple([
                    coref_key_map[k] if k in coref_key_map else None
                    for k in rel
                ])

            if doc_id not in documents:
                documents[doc_id] = {
                    'predicted_relations': [],
                    'doc_id': doc_id
                }
            scores_ = list(scores.ravel())
            if not scores_:
                warnings.warn(f"no relation scores defined for {doc_id}")
                continue
            label = [1 if x > relation_threshold else 0 for x in scores_]
            if all(l == 0 for l in label):
                decoding_mode = os.environ.get("SCIREX_RELATION_DECODING")
                if decoding_mode == "report_single_most_likely":
                    label[scores.argmax()] = 1
                elif decoding_mode == "report_probabilistically":
                    idxs_sorted_by_score = sorted(
                        range(len(label)),
                        key=lambda i: scores[i],
                        reverse=True  # highest score first
                    )
                    possible_decoding_idxs = \
                        [idxs_sorted_by_score[:i] for i in range(1, 11)]  # assuming that >10 relationships would never happen

                    def score_decoding(candidate_idxs):
                        """Likelihood of a candidate decoding under a geometric
                        distribution fit to the training distribution of
                        number-of-relationships-per-document.

                        :param candidate_idxs (List[int]): indices of the relations kept in this decoding
                        :return: likelihood of that decoding
                        """
                        score_from_n_relationships = st.geom.pmf(
                            len(candidate_idxs),
                            0.4046692607003891  # MLE from training distribution, i.e.: 1 / (1 + E[X])
                        )
                        score_from_indiv_relationships = scores[candidate_idxs]
                        return score_from_n_relationships * np.prod(
                            score_from_indiv_relationships)

                    best_decoding_idxs = max(possible_decoding_idxs,
                                             key=score_decoding)
                    for idx in best_decoding_idxs:
                        label[idx] = 1

            scores = [round(float(x), 4) for x in list(scores.ravel())]
            documents[doc_id]['predicted_relations'] += list(
                zip(predicted_relations, scores, label))

        for d in documents.values():
            predicted_relations = {}
            for r, s, l in d['predicted_relations']:
                r = tuple(r)
                if r not in predicted_relations or predicted_relations[r][0] < s:
                    predicted_relations[r] = (s, l)

            d['predicted_relations'] = [
                (r, s, l) for r, (s, l) in predicted_relations.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
Example #28
File: run_me.py Project: dav009/kb
    },
    "entity_indexer": {
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {
                "type": "just_spaces"
            },
        },
        "namespace": "entity",
    },
    "should_remap_span_indices": True,
})
reader = DatasetReader.from_params(Params(reader_params))
iterator = DataIterator.from_params(Params({
    "type": "basic",
    "batch_size": 16
}))
iterator.index_with(vocab)

from flask import Flask
from flask import request, jsonify
app = Flask(__name__)


def annotate(text):
    instances = reader.read(text)
    for batch_no, b in enumerate(
            iterator(instances, shuffle=False, num_epochs=1)):
        b['candidates'] = {
            'wiki': {
                'candidate_entities': b.pop('candidate_entities'),
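The listing above is cut off mid-function. As a hypothetical sketch of how annotate might be exposed over HTTP (the route name and response shape are assumptions, not the project's actual API):

# Assumes annotate() is completed to return a JSON-serializable result.
@app.route('/annotate', methods=['POST'])
def annotate_endpoint():
    text = request.get_json()['text']
    return jsonify(annotate(text))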
Example #29
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    trainer: :class:`~allennlp.common.registrable.Registrable`
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate upto which search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
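The _smooth helper used above is not shown in this listing; a plausible implementation, assuming the usual bias-corrected exponential moving average used by learning-rate finders, would be:

from typing import List

def _smooth(values: List[float], beta: float) -> List[float]:
    """Exponentially smooth the loss curve, with bias correction."""
    avg = 0.0
    smoothed = []
    for i, value in enumerate(values):
        avg = beta * avg + (1 - beta) * value
        smoothed.append(avg / (1 - beta ** (i + 1)))
    return smoothed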
Example #30
def main(model_dir,
         model_type,
         compression_rate,
         max_sentences,
         model_index=None):
    print(compression_rate, max_sentences)

    # Start from the given checkpoint index, if one was provided.
    # (`if model_index:` would wrongly ignore an explicit index of 0.)
    i = model_index if model_index is not None else 0

    params = Params.from_file(os.path.join(model_dir, 'model_params.json'))
    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)

    vocab = Vocabulary.from_params(
        Params({"directory_path": os.path.join(model_dir, 'vocabulary')}))

    val_iterator = DataIterator.from_params(params.pop('generator_iterator'))

    cmv_predictor = Model.from_params(params=params.pop('cmv_predictor'),
                                      vocab=vocab)
    document_embedder = Model.from_params(
        params=params.pop('document_embedder'), vocab=vocab)
    cmv_extractor = Model.from_params(params=params.pop('cmv_extractor'))

    cmv_actor_critic_params = params.pop('cmv_actor_critic', None)
    cmv_actor_critic = None
    if cmv_actor_critic_params is not None:
        cmv_actor_critic = Model.from_params(params=cmv_actor_critic_params)

    cmv_discriminator_params = params.pop('cmv_discriminator', None)
    cmv_discriminator = None
    if cmv_discriminator_params is not None:
        cmv_discriminator = Model.from_params(params=cmv_discriminator_params)

    params = dict(document_embedder=document_embedder,
                  cmv_predictor=cmv_predictor,
                  cmv_extractor=cmv_extractor,
                  cmv_actor_critic=cmv_actor_critic)
    if model_type == 'generator':
        params.update(dict(cmv_discriminator=cmv_discriminator))

    model = model_types[model_type](**params)

    data = dataset_reader.read('val', **data_params)
    data.index_instances(vocab)

    while True:
        model_filename = 'model_state_epoch_{}.th'.format(i)
        model_filename = os.path.join(os.path.join(model_dir, model_type),
                                      model_filename)

        print(model_filename)
        if not os.path.exists(model_filename):
            break

        #load the checkpoint weights, then run the validation batches through the model
        model_state = torch.load(model_filename,
                                 map_location=util.device_mapping(-1))
        model.load_state_dict(model_state)
        model.eval()

        val_generator = val_iterator(data, num_epochs=1, shuffle=False)

        model._cmv_extractor._compression_rate = compression_rate
        for batch in val_generator:
            #batch is a tensor dict
            document, mask = model._document_embedder(batch['original_post'])
            idxs, probs, gold_loss = model._cmv_extractor(
                document,
                mask,
                batch['label'],
                gold_evidence=batch['weakpoints'],
                n_abs=max_sentences)

            #extracted_sentences = extract(batch['original_post'], idxs)
            #fake_output = model._cmv_predictor(batch['response'], batch['label'], extracted_sentences)
            for bidx, e in enumerate(batch['weakpoints']):
                if int(e.ne(-1).sum()) == 0:
                    continue
                print(e.numpy().tolist())
                print(idxs[bidx].numpy().tolist())
                for idx, sentence in enumerate(
                        batch['original_post']['tokens'][bidx]):
                    o = [
                        model._cmv_predictor.vocab.get_token_from_index(
                            int(index),
                            'tokens').replace('@@end@@', '').replace(
                                '@@UNKNOWN@@', 'UNK') for index in sentence
                        if int(index)
                    ]
                    if len(o):
                        print(idx, ' '.join(o))
                print()

        #print(model._cmv_predictor.get_metrics(reset=True))
        print(model._cmv_extractor.get_metrics(reset=True))

        i += 1
        if model_index is not None:
            break
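For completeness, a hypothetical invocation of this evaluation loop; the directory layout and model_type value are assumptions based on the os.path.join calls above.

main(model_dir="output/cmv_experiment",
     model_type="generator",
     compression_rate=0.25,
     max_sentences=4,
     model_index=None)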
Example #31
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate upto which search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)


    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
Example #32
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    combine_span_and_cluster_file(span_file, cluster_file)

    test_file = 'tmp_relation_42424242.jsonl'
    with open(os.path.join(archive_folder, 'metrics.json')) as f:
        relation_threshold = json.load(f)['test__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    model.prediction_mode = True
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    # Index all instances with the model's vocabulary before iteration.
    Batch(instances).index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            predicted_relations = n_ary_relations['candidates']
            scores = n_ary_relations['scores']

            if 'metadata' not in output_res['n_ary_relation']:
                continue

            metadata = output_res['n_ary_relation']['metadata'][0]
            doc_id = metadata['doc_id']
            coref_key_map = {
                k: i
                for i, k in metadata['document_metadata']
                ['cluster_name_to_id'].items()
            }

            for i, rel in enumerate(predicted_relations):
                predicted_relations[i] = tuple([
                    coref_key_map[k] if k in coref_key_map else None
                    for k in rel
                ])

            if doc_id not in documents:
                documents[doc_id] = {
                    'predicted_relations': [],
                    'doc_id': doc_id
                }

            label = [
                1 if x > relation_threshold else 0
                for x in list(scores.ravel())
            ]
            scores = [round(float(x), 4) for x in list(scores.ravel())]
            documents[doc_id]['predicted_relations'] += list(
                zip(predicted_relations, scores, label))

        for d in documents.values():
            predicted_relations = {}
            for r, s, l in d['predicted_relations']:
                r = tuple(r)
                if r not in predicted_relations or predicted_relations[r][0] < s:
                    predicted_relations[r] = (s, l)

            d['predicted_relations'] = [
                (r, s, l) for r, (s, l) in predicted_relations.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
Example #33
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate to start the search.
    end_lr : ``float``
        Learning rate upto which search is done.
    num_batches : ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps : ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor : ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force : ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params,
                             serialization_dir,
                             recover=False,
                             force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed", None)
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for key, dataset in all_datasets.items()
                   for instance in dataset
                   if key in datasets_for_vocab_creation),
    )

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets["train"]

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError(
            "currently find-learning-rate only works with the default Trainer")
    trainer: Trainer = TrainerBase.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=None,
        params=trainer_params,
        validation_iterator=None,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info(f"Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, "lr-losses.png"))
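A hedged usage sketch for the learning-rate finder above; the config path and output directory are made-up example values.

from allennlp.common import Params

params = Params.from_file("experiments/my_model.jsonnet")
find_learning_rate_model(
    params,
    serialization_dir="output/lr_search",
    start_lr=1e-5,
    end_lr=10,
    num_batches=100,
    force=True,
)
# The resulting plot is written to output/lr_search/lr-losses.png.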
Example #34
def run_evaluation(evaluation_file,
                   model_archive_file,
                   is_wordnet_and_wiki=False):
    archive = load_archive(model_archive_file)

    params = archive.config
    vocab = Vocabulary.from_params(params.pop('vocabulary'))

    model = archive.model
    #model.cuda()
    model.eval()

    if is_wordnet_and_wiki:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "entity_indexer": {
                "type": "characters_tokenizer",
                "namespace": "entity_wiki",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    }
                }
            },
            "extra_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "should_remap_span_indices": True,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "do_lowercase": True,
                    "max_pieces": 512,
                    "pretrained_model": "bert-base-uncased",
                    "use_starting_offsets": True,
                }
            }
        })
    else:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "pretrained_model": "bert-base-uncased",
                    "do_lowercase": True,
                    "use_starting_offsets": True,
                    "max_pieces": 512,
                },
            },
            "entity_indexer": {
                "type": "characters_tokenizer",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    },
                },
                "namespace": "entity",
            },
            "should_remap_span_indices": True,
        })

    if is_wordnet_and_wiki:
        cg_params = Params({
            "type": "bert_tokenizer_and_candidate_generator",
            "bert_model_type": "bert-base-uncased",
            "do_lower_case": True,
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "namespace": "entity_wordnet",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {
                            "type": "just_spaces"
                        }
                    }
                }
            }
        })
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
            cg_params)

    reader = DatasetReader.from_params(reader_params)  # reader_params is already a Params

    iterator = DataIterator.from_params(
        Params({
            "type": "basic",
            "batch_size": 16
        }))
    iterator.index_with(vocab)

    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(
            iterator(instances, shuffle=False, num_epochs=1)):
        b = move_to_device(batch, -1)

        b['candidates'] = {
            'wiki': {
                'candidate_entities': b.pop('candidate_entities'),
                'candidate_entity_priors': b.pop('candidate_entity_prior'),
                'candidate_segment_ids': b.pop('candidate_segment_ids'),
                'candidate_spans': b.pop('candidate_spans')
            }
        }
        gold_entities = b.pop('gold_entities')
        b['gold_entities'] = {'wiki': gold_entities}

        if is_wordnet_and_wiki:
            # Re-pack the extra wordnet candidates as padded instances so they
            # can be tensorized and merged into the batch's candidate dict.
            extra_candidates = b.pop('extra_candidates')
            seq_len = b['tokens']['tokens'].shape[1]
            bbb = []
            for e in extra_candidates:
                for k in e.keys():
                    e[k]['candidate_segment_ids'] = [0] * len(
                        e[k]['candidate_spans'])
                ee = {
                    'tokens': ['[CLS]'] * seq_len,
                    'segment_ids': [0] * seq_len,
                    'candidates': e
                }
                ee_fields = candidate_generator.convert_tokens_candidates_to_fields(
                    ee)
                bbb.append(Instance(ee_fields))
            eb = Batch(bbb)
            eb.index_instances(vocab)
            padding_lengths = eb.get_padding_lengths()
            tensor_dict = eb.as_tensor_dict(padding_lengths)
            b['candidates'].update(tensor_dict['candidates'])
            bb = move_to_device(b, -1)
        else:
            bb = b

        loss = model(**bb)
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
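A hypothetical invocation of the evaluation above; both paths are placeholders, not files shipped with the project.

run_evaluation("data/aida/aida_dev.txt",
               "models/knowbert_wiki_wordnet.tar.gz",
               is_wordnet_and_wiki=True)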