def test_uses_named_inputs(self):
        """
        Tests whether the model outputs conform to the expected format.
        """
        inputs = {
                "sentence": "Angela Merkel met and spoke to her EU counterparts during the climate summit."
        }

        archive = load_archive(self.FIXTURES_ROOT /
                               'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'open-information-extraction')

        result = predictor.predict_json(inputs)

        words = result.get("words")
        assert words == ["Angela", "Merkel", "met", "and", "spoke", "to", "her", "EU", "counterparts",
                         "during", "the", "climate", "summit", "."]
        num_words = len(words)

        verbs = result.get("verbs")
        assert verbs is not None
        assert isinstance(verbs, list)

        for verb in verbs:
            tags = verb.get("tags")
            assert tags is not None
            assert isinstance(tags, list)
            assert all(isinstance(tag, str) for tag in tags)
            assert len(tags) == num_words

    def test_batch_prediction(self):
        inputs = [
                {
                        "sentence": "What kind of test succeeded on its first attempt?",
                },
                {
                        "sentence": "What kind of test succeeded on its first attempt at batch processing?",
                }
        ]

        archive = load_archive(self.FIXTURES_ROOT / 'biaffine_dependency_parser'
                               / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'biaffine-dependency-parser')

        results = predictor.predict_batch_json(inputs)
        assert len(results) == 2

        for result in results:
            sequence_length = len(result.get("words"))
            predicted_heads = result.get("predicted_heads")
            assert len(predicted_heads) == sequence_length

            predicted_dependencies = result.get("predicted_dependencies")
            assert len(predicted_dependencies) == sequence_length
            assert isinstance(predicted_dependencies, list)
            assert all(isinstance(x, str) for x in predicted_dependencies)
Example #3
    def test_batch_prediction(self):
        inputs = [
                {"sentence": "What a great test sentence."},
                {"sentence": "Here's another good, interesting one."}
        ]

        archive = load_archive(self.FIXTURES_ROOT / 'constituency_parser' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'constituency-parser')
        results = predictor.predict_batch_json(inputs)

        result = results[0]
        assert len(result["spans"]) == 21 # number of possible substrings of the sentence.
        assert len(result["class_probabilities"]) == 21
        assert result["tokens"] == ["What", "a", "great", "test", "sentence", "."]
        assert isinstance(result["trees"], str)

        for class_distribution in result["class_probabilities"]:
            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)

        result = results[1]

        assert len(result["spans"]) == 36 # number of possible substrings of the sentence.
        assert len(result["class_probabilities"]) == 36
        assert result["tokens"] == ["Here", "'s", "another", "good", ",", "interesting", "one", "."]
        assert isinstance(result["trees"], str)

        for class_distribution in result["class_probabilities"]:
            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)

    def test_uses_named_inputs(self):
        inputs = {
                "premise": "I always write unit tests for my code.",
                "hypothesis": "One time I didn't write any unit tests for my code."
        }

        archive = load_archive(self.FIXTURES_ROOT / 'decomposable_attention' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'textual-entailment')
        result = predictor.predict_json(inputs)

        # Label probs should be 3 floats that sum to one
        label_probs = result.get("label_probs")
        assert label_probs is not None
        assert isinstance(label_probs, list)
        assert len(label_probs) == 3
        assert all(isinstance(x, float) for x in label_probs)
        assert all(x >= 0 for x in label_probs)
        assert sum(label_probs) == approx(1.0)

        # Logits should be 3 floats that softmax to label_probs
        label_logits = result.get("label_logits")
        assert label_logits is not None
        assert isinstance(label_logits, list)
        assert len(label_logits) == 3
        assert all(isinstance(x, float) for x in label_logits)

        exps = [math.exp(x) for x in label_logits]
        sumexps = sum(exps)
        for e, p in zip(exps, label_probs):
            assert e / sumexps == approx(p)
Example #5
    def test_uses_named_inputs(self):
        inputs = {
                "question": "What kind of test succeeded on its first attempt?",
                "passage": "One time I was writing a unit test, and it succeeded on the first attempt."
        }

        archive = load_archive('tests/fixtures/bidaf/serialization/model.tar.gz')
        predictor = Predictor.from_archive(archive, 'machine-comprehension')

        result = predictor.predict_json(inputs)

        best_span = result.get("best_span")
        assert best_span is not None
        assert isinstance(best_span, list)
        assert len(best_span) == 2
        assert all(isinstance(x, int) for x in best_span)
        assert best_span[0] <= best_span[1]

        best_span_str = result.get("best_span_str")
        assert isinstance(best_span_str, str)
        assert best_span_str != ""

        for probs_key in ("span_start_probs", "span_end_probs"):
            probs = result.get(probs_key)
            assert probs is not None
            assert all(isinstance(x, float) for x in probs)
            assert sum(probs) == approx(1.0)
Example #6
    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'archive_test'
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        # check that model weights are the same
        keys = set(model.state_dict().keys())
        keys2 = set(model2.state_dict().keys())

        assert keys == keys2

        for key in keys:
            assert torch.equal(model.state_dict()[key], model2.state_dict()[key])

        # check that vocabularies are the same
        vocab = model.vocab
        vocab2 = model2.vocab

        assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
        assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy
Example #7
    def test_uses_named_inputs(self):
        inputs = {
                "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
        }

        archive = load_archive(self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'semantic-role-labeling')

        result = predictor.predict_json(inputs)

        words = result.get("words")
        assert words == ["The", "squirrel", "wrote", "a", "unit", "test",
                         "to", "make", "sure", "its", "nuts", "worked", "as", "designed", "."]
        num_words = len(words)

        verbs = result.get("verbs")
        assert verbs is not None
        assert isinstance(verbs, list)

        assert any(v["verb"] == "wrote" for v in verbs)
        assert any(v["verb"] == "make" for v in verbs)
        assert any(v["verb"] == "worked" for v in verbs)

        for verb in verbs:
            tags = verb.get("tags")
            assert tags is not None
            assert isinstance(tags, list)
            assert all(isinstance(tag, str) for tag in tags)
            assert len(tags) == num_words

    def __init__(self,
                 vocab: Vocabulary,
                 sentence_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 max_num_finished_states: int = None,
                 dropout: float = 0.0,
                 normalize_beam_score_by_length: bool = False,
                 checklist_cost_weight: float = 0.6,
                 dynamic_cost_weight: Dict[str, Union[int, float]] = None,
                 penalize_non_agenda_actions: bool = False,
                 initial_mml_model_file: str = None) -> None:
        super(NlvrCoverageSemanticParser, self).__init__(vocab=vocab,
                                                         sentence_embedder=sentence_embedder,
                                                         action_embedding_dim=action_embedding_dim,
                                                         encoder=encoder,
                                                         dropout=dropout)
        self._agenda_coverage = Average()
        self._decoder_trainer: DecoderTrainer[Callable[[CoverageState], torch.Tensor]] = \
                ExpectedRiskMinimization(beam_size=beam_size,
                                         normalize_by_length=normalize_beam_score_by_length,
                                         max_decoding_steps=max_decoding_steps,
                                         max_num_finished_states=max_num_finished_states)

        # Instantiating an empty NlvrWorld just to get the number of terminals.
        self._terminal_productions = set(NlvrWorld([]).terminal_productions.values())
        self._decoder_step = CoverageTransitionFunction(encoder_output_dim=self._encoder.get_output_dim(),
                                                        action_embedding_dim=action_embedding_dim,
                                                        input_attention=attention,
                                                        num_start_types=1,
                                                        activation=Activation.by_name('tanh')(),
                                                        predict_start_type_separately=False,
                                                        add_action_bias=False,
                                                        dropout=dropout)
        self._checklist_cost_weight = checklist_cost_weight
        self._dynamic_cost_wait_epochs = None
        self._dynamic_cost_rate = None
        if dynamic_cost_weight:
            self._dynamic_cost_wait_epochs = dynamic_cost_weight["wait_num_epochs"]
            self._dynamic_cost_rate = dynamic_cost_weight["rate"]
        self._penalize_non_agenda_actions = penalize_non_agenda_actions
        self._last_epoch_in_forward: int = None
        # TODO (pradeep): Checking whether file exists here to avoid raising an error when we've
        # copied a trained ERM model from a different machine and the original MML model that was
        # used to initialize it does not exist on the current machine. This may not be the best
        # solution for the problem.
        if initial_mml_model_file is not None:
            if os.path.isfile(initial_mml_model_file):
                archive = load_archive(initial_mml_model_file)
                self._initialize_weights_from_archive(archive)
            else:
                # A model file is passed, but it does not exist. This is expected to happen when
                # you're using a trained ERM model to decode. But it may also happen if the path to
                # the file is really just incorrect. So throwing a warning.
                logger.warning("MML model file for initializing weights is passed, but does not exist."
                               " This is fine if you're just decoding.")
Example #9
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    return Predictor.from_archive(archive, args.predictor)
Example #10
    def test_batch_prediction(self):
        inputs = {
                "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
        }
        archive = load_archive(self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'semantic-role-labeling')
        result = predictor.predict_batch_json([inputs, inputs])
        assert result[0] == result[1]
Example #11
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'BidafEnsemble':  # type: ignore
        # pylint: disable=arguments-differ
        if vocab:
            raise ConfigurationError("vocab should be None")

        submodels = []
        paths = params.pop("submodels")
        for path in paths:
            submodels.append(load_archive(path).model)

        return cls(submodels=submodels)
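
A hedged sketch of the params this classmethod expects; the submodel archive paths below are hypothetical.

params = Params({"submodels": ["bidaf_run1/model.tar.gz", "bidaf_run2/model.tar.gz"]})
ensemble = BidafEnsemble.from_params(vocab=None, params=params)  # vocab must be None here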
Example #12
    def test_predictor_with_direct_parser(self):
        archive_dir = self.FIXTURES_ROOT / 'semantic_parsing' / 'nlvr_direct_semantic_parser' / 'serialization'
        archive = load_archive(os.path.join(archive_dir, 'model.tar.gz'))
        predictor = Predictor.from_archive(archive, 'nlvr-parser')

        result = predictor.predict_json(self.inputs)
        assert 'logical_form' in result
        assert 'denotations' in result
        # result['denotations'] is a list corresponding to k-best logical forms, where k is 1 by
        # default.
        assert len(result['denotations'][0]) == 2  # Because there are two worlds in the input.

    def test_prediction_with_no_verbs(self):
        """
        Tests whether the model copes with sentences without verbs.
        """
        input1 = {"sentence": "Blah no verb sentence."}
        archive = load_archive(self.FIXTURES_ROOT /
                               'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'open-information-extraction')

        result = predictor.predict_json(input1)
        assert result == {'words': ['Blah', 'no', 'verb', 'sentence', '.'], 'verbs': []}
Example #14
    def test_atis_parser_batch_predicted_sql_present(self):
        inputs = [{
                "utterance": "show me flights to seattle",
        }]

        archive_path = self.FIXTURES_ROOT / 'semantic_parsing' / 'atis' / 'serialization' / 'model.tar.gz'
        archive = load_archive(archive_path)
        predictor = Predictor.from_archive(archive, 'atis-parser')

        result = predictor.predict_batch_json(inputs)
        predicted_sql_query = result[0].get("predicted_sql_query")
        assert predicted_sql_query is not None
Example #15
    def test_copynet_predictions(self):
        archive = load_archive(self.FIXTURES_ROOT / 'encoder_decoder' / 'copynet_seq2seq' /
                               'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'seq2seq')
        model = predictor._model
        end_token = model.vocab.get_token_from_index(model._end_index, model._target_namespace)
        output_dict = predictor.predict("these tokens should be copied over : hello world")
        assert len(output_dict["predictions"]) == model._beam_search.beam_size
        assert len(output_dict["predicted_tokens"]) == model._beam_search.beam_size
        for predicted_tokens in output_dict["predicted_tokens"]:
            assert all(isinstance(x, str) for x in predicted_tokens)
            assert end_token not in predicted_tokens
Example #16
    def test_answer_present_with_batch_predict(self):
        inputs = [{
                "question": "Who is 18 years old?",
                "table": "Name\tAge\nShallan\t16\nKaladin\t18"
        }]

        archive_path = self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' / 'serialization' / 'model.tar.gz'
        archive = load_archive(archive_path)
        predictor = Predictor.from_archive(archive, 'wikitables-parser')

        result = predictor.predict_batch_json(inputs)
        answer = result[0].get("answer")
        assert answer is not None
Example #17
    def test_uses_named_inputs(self):
        inputs = {
                "source": "What kind of test succeeded on its first attempt?",
        }

        archive = load_archive('tests/fixtures/encoder_decoder/simple_seq2seq/serialization/model.tar.gz')
        predictor = Predictor.from_archive(archive, 'simple_seq2seq')

        result = predictor.predict_json(inputs)

        predicted_tokens = result.get("predicted_tokens")
        assert predicted_tokens is not None
        assert isinstance(predicted_tokens, list)
        assert all(isinstance(x, str) for x in predicted_tokens)
Example #18
    def test_prediction_with_no_verbs(self):

        input1 = {"sentence": "Blah no verb sentence."}
        archive = load_archive(self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'semantic-role-labeling')
        result = predictor.predict_json(input1)
        assert result == {'words': ['Blah', 'no', 'verb', 'sentence', '.'], 'verbs': []}

        input2 = {"sentence": "This sentence has a verb."}
        results = predictor.predict_batch_json([input1, input2])
        assert results[0] == {'words': ['Blah', 'no', 'verb', 'sentence', '.'], 'verbs': []}
        assert results[1] == {'words': ['This', 'sentence', 'has', 'a', 'verb', '.'],
                              'verbs': [{'verb': 'has', 'description': 'This sentence has a verb .',
                                         'tags': ['O', 'O', 'O', 'O', 'O', 'O']}]}
Example #19
    def test_uses_named_inputs(self):
        inputs = {"document": "This is a single string document about a test. Sometimes it "
                              "contains coreferent parts."}
        archive = load_archive(self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'coreference-resolution')

        result = predictor.predict_json(inputs)
        self.assert_predict_result(result)

        document = ['This', 'is', 'a', 'single', 'string',
                    'document', 'about', 'a', 'test', '.', 'Sometimes',
                    'it', 'contains', 'coreferent', 'parts', '.']

        result_doc_words = predictor.predict_tokenized(document)
        self.assert_predict_result(result_doc_words)

    def test_uses_named_inputs(self):
        inputs = {
                "sentence": "What a great test sentence.",
        }

        archive = load_archive('tests/fixtures/constituency_parser/serialization/model.tar.gz')
        predictor = Predictor.from_archive(archive, 'constituency-parser')
        result = predictor.predict_json(inputs)

        assert len(result["spans"]) == 21 # number of possible substrings of the sentence.
        assert len(result["class_probabilities"]) == 21
        assert result["tokens"] == ["What", "a", "great", "test", "sentence", "."]
        assert isinstance(result["trees"], str)

        for class_distribution in result["class_probabilities"]:
            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)
Example #21
def _get_predictor(args: argparse.Namespace) -> Predictor:
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    if args.predictor:
        # Predictor explicitly specified, so use it
        return Predictor.from_archive(archive, args.predictor)

    # Otherwise, use the mapping
    model_type = archive.config.get("model").get("type")
    if model_type not in DEFAULT_PREDICTORS:
        raise ConfigurationError(f"No known predictor for model type {model_type}.\n"
                                 f"Specify one with the --predictor flag.")
    return Predictor.from_archive(archive, DEFAULT_PREDICTORS[model_type])
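
A hedged invocation sketch for the helper above; the Namespace fields mirror the attributes it reads, and the archive path is hypothetical.

args = argparse.Namespace(archive_file="model.tar.gz",
                          weights_file=None,
                          cuda_device=-1,
                          overrides="",
                          predictor=None)  # None falls through to the DEFAULT_PREDICTORS mapping
predictor = _get_predictor(args)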
Example #22
    def test_uses_named_inputs(self):
        inputs = {
                "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
        }

        archive = load_archive(self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'semantic-role-labeling')

        result_json = predictor.predict_json(inputs)
        self.assert_predict_result(result_json)

        words = ["The", "squirrel", "wrote", "a", "unit", "test",
                 "to", "make", "sure", "its", "nuts", "worked", "as", "designed", "."]

        result_words = predictor.predict_tokenized(words)
        self.assert_predict_result(result_words)
Example #23
    def from_path(cls, archive_path: str, predictor_name: str = None) -> 'Predictor':
        """
        Instantiate a :class:`Predictor` from an archive path.

        If you need more detailed configuration options, such as running the predictor on the GPU,
        please use `from_archive`.

        Parameters
        ----------
        archive_path : str
            The path to the archive.

        Returns
        -------
        A Predictor instance.
        """
        return Predictor.from_archive(load_archive(archive_path), predictor_name)
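
A minimal usage sketch for from_path; the archive path and the inputs are hypothetical.

predictor = Predictor.from_path("model.tar.gz", predictor_name="machine-comprehension")
result = predictor.predict_json({"question": "Who wrote it?", "passage": "Jo wrote the test."})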

    def test_predictor_uses_dataset_reader_to_determine_pos_set(self):
        # pylint: disable=protected-access
        archive = load_archive(self.FIXTURES_ROOT / 'biaffine_dependency_parser'
                               / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'biaffine-dependency-parser')

        inputs = {
                "sentence": "Dogs eat cats.",
        }
        instance_with_ud_pos = predictor._json_to_instance(inputs)
        tags = instance_with_ud_pos.fields["pos_tags"].labels
        assert tags == ['NOUN', 'VERB', 'NOUN', 'PUNCT']

        predictor._dataset_reader.use_language_specific_pos = True

        instance_with_ptb_pos = predictor._json_to_instance(inputs)
        tags = instance_with_ptb_pos.fields["pos_tags"].labels
        assert tags == ['NNS', 'VBP', 'NNS', '.']
Example #25
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
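
A hedged invocation sketch; the field names mirror the attributes evaluate_from_args reads, and all paths are hypothetical.

args = argparse.Namespace(archive_file="model.tar.gz",
                          cuda_device=-1,
                          overrides="",
                          weights_file=None,
                          input_file="dev.json",
                          output_file="metrics.json",
                          batch_weight_key="")
metrics = evaluate_from_args(args)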
Example #26
    def test_atis_parser_uses_named_inputs(self):
        inputs = {
                "utterance": "show me the flights to seattle",
        }

        archive_path = self.FIXTURES_ROOT / 'semantic_parsing' / 'atis' / 'serialization' / 'model.tar.gz'
        archive = load_archive(archive_path)
        predictor = Predictor.from_archive(archive, 'atis-parser')

        result = predictor.predict_json(inputs)
        action_sequence = result.get("best_action_sequence")
        if action_sequence:
            # An untrained model will likely get into a loop, and not produce any finished states.
            # When the model gets into a loop it will not produce any valid SQL, so we don't get
            # any actions. This basically just tests if the model runs.
            assert len(action_sequence) > 1
            assert all([isinstance(action, str) for action in action_sequence])
            predicted_sql_query = result.get("predicted_sql_query")
            assert predicted_sql_query is not None
Example #27
    def test_uses_named_inputs(self):
        inputs = {"paragraphs": [{"qas": [{"followup": "y", "yesno": "x", "question": "When was the first one?",
                                           "answers": [{"answer_start": 0, "text": "One time"}], "id": "C_q#0"},
                                          {"followup": "n", "yesno": "x", "question": "What were you doing?",
                                           "answers": [{"answer_start": 15, "text": "writing a"}], "id": "C_q#1"},
                                          {"followup": "m", "yesno": "y", "question": "How often?",
                                           "answers": [{"answer_start": 4, "text": "time I"}], "id": "C_q#2"}],
                                  "context": "One time I was writing a unit test,\
                                   and it succeeded on the first attempt."}]}

        archive = load_archive(self.FIXTURES_ROOT / 'dialog_qa' / 'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'dialog_qa')

        result = predictor.predict_json(inputs)

        best_span_str_list = result.get("best_span_str")
        for best_span_str in best_span_str_list:
            assert isinstance(best_span_str, str)
            assert best_span_str != ""

    def test_get_vocab_index_mapping(self):
        # pylint: disable=line-too-long
        mml_model_archive_file = (self.FIXTURES_ROOT / "semantic_parsing" / "nlvr_direct_semantic_parser" /
                                  "serialization" / "model.tar.gz")
        archive = load_archive(mml_model_archive_file)
        mapping = self.model._get_vocab_index_mapping(archive.model.vocab)
        expected_mapping = [(i, i) for i in range(16)]
        assert mapping == expected_mapping

        new_vocab = Vocabulary()

        def copy_token_at_index(i):
            token = self.vocab.get_token_from_index(i, "tokens")
            new_vocab.add_token_to_namespace(token, "tokens")
        copy_token_at_index(5)
        copy_token_at_index(7)
        copy_token_at_index(10)
        mapping = self.model._get_vocab_index_mapping(new_vocab)
        # Mapping of indices from model vocabulary to new vocabulary. 0 and 1 are padding and unk
        # tokens.
        assert mapping == [(0, 0), (1, 1), (5, 2), (7, 3), (10, 4)]
Example #29
    def test_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        files_to_archive = {"train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')}
        archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir / 'model.tar.gz')
        params = archive.config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert params.get('train_data_path').endswith('/fta/train_data_path')

        # The validation data path should be the same though.
        assert params.get('validation_data_path') == str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')

def make_data(input_examples_file: str,
              tables_directory: str,
              archived_model_file: str,
              output_dir: str,
              num_logical_forms: int) -> None:
    reader = WikiTablesDatasetReader(tables_directory=tables_directory,
                                     keep_if_no_dpd=True,
                                     output_agendas=True)
    dataset = reader.read(input_examples_file)
    input_lines = []
    with open(input_examples_file) as input_file:
        input_lines = input_file.readlines()
    # Note: doubled braces ({{ and }}) escape literal braces in the f-string below.
    new_tables_config = f"{{model: {{tables_directory: {tables_directory}}}}}"
    archive = load_archive(archived_model_file,
                           overrides=new_tables_config)
    model = archive.model
    model.training = False
    model._decoder_trainer._max_num_decoded_sequences = 100
    for instance, example_line in zip(dataset, input_lines):
        outputs = model.forward_on_instance(instance)
        parsed_info = reader._parse_example_line(example_line)
        example_id = parsed_info["id"]
        logical_forms = outputs["logical_form"]
        correct_logical_forms = []
        for logical_form in logical_forms:
            if model._denotation_accuracy.evaluate_logical_form(logical_form, example_line):
                correct_logical_forms.append(logical_form)
                if len(correct_logical_forms) >= num_logical_forms:
                    break
        num_found = len(correct_logical_forms)
        print(f"{num_found} found for {example_id}")
        if num_found == 0:
            continue
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_file = gzip.open(os.path.join(output_dir, f"{example_id}.gz"), "wb")
        for logical_form in correct_logical_forms:
            logical_form_line = (logical_form + "\n").encode('utf-8')
            output_file.write(logical_form_line)
        output_file.close()
Example #31
    def test_predictions_to_labeled_instances(self):
        inputs = {
            "question":
            "What kind of test succeeded on its first attempt?",
            "passage":
            "One time I was writing a unit test, and it succeeded on the first attempt.",
        }

        archive = load_archive(FIXTURES_ROOT / "rc" / "bidaf" /
                               "serialization" / "model.tar.gz")
        predictor = Predictor.from_archive(archive, "reading_comprehension")

        instance = predictor._json_to_instance(inputs)
        outputs = predictor._model.forward_on_instance(instance)
        new_instances = predictor.predictions_to_labeled_instances(
            instance, outputs)
        assert "span_start" in new_instances[0].fields
        assert "span_end" in new_instances[0].fields
        assert new_instances[0].fields["span_start"] is not None
        assert new_instances[0].fields["span_end"] is not None
        assert len(new_instances) == 1
Example #32
    def test_transferring_of_modules_ensures_type_consistency(self):

        model_archive = str(self.FIXTURES_ROOT / 'decomposable_attention' /
                            'serialization' / 'model.tar.gz')
        trained_model = load_archive(model_archive).model

        config_file = str(self.FIXTURES_ROOT / 'decomposable_attention' /
                          'experiment.json')
        model_params = Params.from_file(config_file).pop("model").as_dict(
            quiet=True)

        # Override only text_field_embedder and make it load AttendFeedForward
        model_params["text_field_embedder"] = {
            "_pretrained": {
                "archive_file": model_archive,
                "module_path": "_attend_feedforward._module"
            }
        }
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=trained_model.vocab,
                              params=Params(model_params))
Example #33
    def from_path(
        cls,
        archive_path: str,
        predictor_name: str = None,
        cuda_device: int = -1,
        dataset_reader_to_load: str = "validation",
        frozen: bool = True,
    ) -> "Predictor":
        """
        Instantiate a `Predictor` from an archive path.

        If you need more detailed configuration options, such as overrides,
        please use `from_archive`.

        # Parameters

        archive_path : `str`
            The path to the archive.
        predictor_name : `str`, optional (default=None)
            Name that the predictor is registered as, or None to use the
            predictor associated with the model.
        cuda_device : `int`, optional (default=-1)
            If `cuda_device` is >= 0, the model will be loaded onto the
            corresponding GPU. Otherwise it will be loaded onto the CPU.
        dataset_reader_to_load : `str`, optional (default="validation")
            Which dataset reader to load from the archive, either "train" or
            "validation".
        frozen : `bool`, optional (default=True)
            If we should call `model.eval()` when building the predictor.

        # Returns

        A Predictor instance.
        """
        return Predictor.from_archive(
            load_archive(archive_path, cuda_device=cuda_device),
            predictor_name,
            dataset_reader_to_load=dataset_reader_to_load,
            frozen=frozen,
        )
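
A minimal usage sketch for this newer from_path, assuming a GPU at device 0; the archive path and predictor name are hypothetical.

predictor = Predictor.from_path("model.tar.gz",
                                predictor_name="constituency_parser",
                                cuda_device=0,
                                dataset_reader_to_load="validation")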
Example #34
    def test_batch_prediction(self):
        inputs = [
            {
                "sentence": "What a great test sentence."
            },
            {
                "sentence": "Here's another good, interesting one."
            },
        ]

        archive = load_archive(FIXTURES_ROOT / "structured_prediction" /
                               "constituency_parser" / "serialization" /
                               "model.tar.gz")
        predictor = Predictor.from_archive(archive, "constituency_parser")
        results = predictor.predict_batch_json(inputs)

        result = results[0]
        assert len(result["spans"]
                   ) == 21  # number of possible substrings of the sentence.
        assert len(result["class_probabilities"]) == 21
        assert result["tokens"] == [
            "What", "a", "great", "test", "sentence", "."
        ]
        assert isinstance(result["trees"], str)

        for class_distribution in result["class_probabilities"]:
            assert sum(class_distribution) == pytest.approx(1.0, rel=1e-3)

        result = results[1]

        assert len(result["spans"]
                   ) == 36  # number of possible substrings of the sentence.
        assert len(result["class_probabilities"]) == 36
        assert result["tokens"] == [
            "Here", "'s", "another", "good", ",", "interesting", "one", "."
        ]
        assert isinstance(result["trees"], str)

        for class_distribution in result["class_probabilities"]:
            assert sum(class_distribution) == pytest.approx(1.0, rel=1e-3)
Example #35
def main(args):
    # Executing this file with no extra options runs the simple service with the bidaf test fixture
    # and the machine-comprehension predictor. There's no good reason you'd want
    # to do this, except possibly to test changes to the stock HTML.

    parser = argparse.ArgumentParser(description='Serve up a simple model')

    parser.add_argument('--archive-path', type=str, required=True, help='path to trained archive file')
    parser.add_argument('--predictor', type=str, required=True, help='name of predictor')
    parser.add_argument('--static-dir', type=str, help='serve index.html from this directory')
    parser.add_argument('--title', type=str, help='change the default page title', default="AllenNLP Demo")
    parser.add_argument('--field-name', type=str, required=True, action='append',
                        help='field names to include in the demo')
    parser.add_argument('--port', type=int, default=8000, help='port to serve the demo on')

    parser.add_argument('--include-package',
                        type=str,
                        action='append',
                        default=[],
                        help='additional packages to include')

    args = parser.parse_args(args)

    # Load modules
    for package_name in args.include_package:
        import_submodules(package_name)

    archive = load_archive(args.archive_path)
    predictor = Predictor.from_archive(archive, args.predictor)
    field_names = args.field_name

    app = make_app(predictor=predictor,
                   field_names=field_names,
                   static_dir=args.static_dir,
                   title=args.title)
    CORS(app)

    http_server = WSGIServer(('0.0.0.0', args.port), app)
    print(f"Model loaded, serving demo on port {args.port}")
    http_server.serve_forever()
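
A hedged invocation sketch for this entry point; the archive path and field names are hypothetical.

main(["--archive-path", "model.tar.gz",
      "--predictor", "machine-comprehension",
      "--field-name", "question",
      "--field-name", "passage",
      "--port", "8000"])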
Example #36
    def test_predictions_to_labeled_instances_with_naqanet(self):
        inputs = {
            "question": "What kind of test succeeded on its first attempt?",
            "passage": "One time I was writing 2 unit tests, and 1 succeeded on the first attempt.",
        }

        archive = load_archive(FIXTURES_ROOT / "naqanet" / "serialization" / "model.tar.gz")
        predictor = Predictor.from_archive(archive, "reading_comprehension")
        predictor._dataset_reader.skip_when_all_empty = False

        instance = predictor._json_to_instance(inputs)
        outputs = predictor._model.forward_on_instance(instance)
        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
        assert "number_indices" in new_instances[0].fields
        assert "answer_as_passage_spans" in new_instances[0].fields
        assert "answer_as_question_spans" in new_instances[0].fields
        assert "answer_as_add_sub_expressions" in new_instances[0].fields
        assert "answer_as_counts" in new_instances[0].fields
        assert "metadata" in new_instances[0].fields
        assert len(new_instances) == 1

        outputs["answer"]["answer_type"] = "count"
        outputs["answer"]["count"] = 2
        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
        assert new_instances[0]["answer_as_counts"][0].label == 2

        outputs["answer"]["answer_type"] = "passage_span"
        outputs["answer"]["spans"] = [[0, 8]]  # character offsets
        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
        assert new_instances[0]["answer_as_passage_spans"][0] == (0, 1)  # token indices

        outputs["answer"]["answer_type"] = "arithmetic"
        outputs["answer"]["numbers"] = [{"sign": 2}, {"sign": 0}]
        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
        assert new_instances[0]["answer_as_add_sub_expressions"][0].labels == [2, 0, 0]

        outputs["answer"]["answer_type"] = "question_span"
        outputs["answer"]["spans"] = [[0, 9]]  # character offsets
        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
        assert new_instances[0]["answer_as_question_spans"][0] == (0, 1)  # token indices
Example #37
    def __init__(
            self,
            archive_file=DEFAULT_ARCHIVE_FILE,
            cuda_device=DEFAULT_CUDA_DEVICE,
            model_file="https://convlab.blob.core.windows.net/convlab-2/new_milu(20200922)_multiwoz_all_context.tar.gz",
            context_size=3):
        """ Constructor for NLU class. """

        self.context_size = context_size
        cuda_device = 0 if torch.cuda.is_available() else DEFAULT_CUDA_DEVICE
        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")

            archive_file = cached_path(model_file)

        archive = load_archive(archive_file, cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        _special_case = [{ORTH: u"id", LEMMA: u"id"}]
        self.tokenizer.spacy.tokenizer.add_special_case(u"id", _special_case)
        with open(
                os.path.join(get_root_path(),
                             'data/multiwoz/db/postcode.json'), 'r') as f:
            token_list = json.load(f)

        for token in token_list:
            token = token.strip()
            self.tokenizer.spacy.tokenizer.add_special_case(
                token, [{
                    ORTH: token,
                    LEMMA: token,
                    POS: u'NOUN'
                }])

        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval()
Example #38
def predict(archive_file, test_file, output_file, cuda_device, score_dir):
    import_submodules("dygie")
    gold_test_data = load_json(test_file)
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    instances = dataset_reader.read(test_file)
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    iterator = DocumentIterator()
    with open(output_file, "w") as f:
        for doc, gold_data in zip(
                iterator(batch.instances, num_epochs=1, shuffle=False),
                gold_test_data):
            doc = nn_util.move_to_device(doc, cuda_device)  # Put on GPU.
            sentence_lengths = [
                len(entry["sentence"]) for entry in doc["metadata"]
            ]
            sentence_starts = np.cumsum(sentence_lengths)
            sentence_starts = np.roll(sentence_starts, 1)
            sentence_starts[0] = 0
            pred = model(**doc)
            if score_dir is not None:
                dump_scores(doc, pred, score_dir)
            decoded = model.decode(pred)
            predictions = {}
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]],
                                                       sentence_starts)
            res = {}
            res.update(gold_data)
            res.update(predictions)
            if "dataset" in res:
                del res["dataset"]
            check_lengths(res)
            encoded = json.dumps(res, default=int)
            f.write(encoded + "\n")
Example #39
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecAccSRL':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        premise_encoder_params = params.pop("premise_encoder", None)
        if premise_encoder_params is not None:
            premise_encoder = Seq2SeqEncoder.from_params(premise_encoder_params)
        else:
            premise_encoder = None

        hypothesis_encoder_params = params.pop("hypothesis_encoder", None)
        if hypothesis_encoder_params is not None:
            hypothesis_encoder = Seq2SeqEncoder.from_params(hypothesis_encoder_params)
        else:
            hypothesis_encoder = None

        srl_model_archive = params.pop('srl_model_archive', None)
        if srl_model_archive is not None:
            logger.info("Loaded pretrained SRL model from {}".format(srl_model_archive))
            archive = load_archive(srl_model_archive)
            srl_model = archive.model
        else:
            srl_model = None

        attend_feedforward = FeedForward.from_params(params.pop('attend_feedforward'))
        similarity_function = SimilarityFunction.from_params(params.pop("similarity_function"))
        compare_feedforward = FeedForward.from_params(params.pop('compare_feedforward'))
        aggregate_feedforward = FeedForward.from_params(params.pop('aggregate_feedforward'))
        initializer = InitializerApplicator.from_params(params.pop("initializer", []))

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   attend_feedforward=attend_feedforward,
                   similarity_function=similarity_function,
                   compare_feedforward=compare_feedforward,
                   aggregate_feedforward=aggregate_feedforward,
                   initializer=initializer,
                   srl_model=srl_model,
                   premise_encoder=premise_encoder,
                   hypothesis_encoder=hypothesis_encoder)

def pred(cuda_device=0,
         archive_file="/backup3/jcxu/exComp/tmp_expsc74o5pf7/model.tar.gz",
         weights_file="/backup3/jcxu/exComp/tmp_expsc74o5pf7/best.th",
         predictor='lstm-tagger',
         input_file="/backup3/jcxu/exComp/example.txt"):
    with open(input_file, 'w') as fd:
        json.dump({"sentence": "This is a useful sentence."}, fd)
        fd.write("\n")
        json.dump({"sentence": "This is a gree, blue and useful sentence."},
                  fd)
        fd.write("\n")
        json.dump({"sentence": "This is a useless sentence."}, fd)
    check_for_gpu(cuda_device)
    archive = load_archive(archive_file,
                           weights_file=weights_file,
                           cuda_device=cuda_device,
                           overrides="")
    # predictor = SentenceTaggerPredictor(archive, dataset_reader=PosDatasetReader())
    predictor = Predictor.from_archive(archive, 'sentence-tagger')

    manager = _PredictManager(predictor, input_file, None, 1, True, False)
    manager.run()
Example #41
    def test_loads_correct_dataset_reader(self):
        # This model has a different dataset reader configuration for train and validation. The
        # parameter that differs is the token indexer's namespace.
        archive = load_archive(self.FIXTURES_ROOT /
                               "simple_tagger_with_span_f1" / "serialization" /
                               "model.tar.gz")

        predictor = Predictor.from_archive(archive, "sentence-tagger")
        assert predictor._dataset_reader._token_indexers[
            "tokens"].namespace == "test_tokens"

        predictor = Predictor.from_archive(archive,
                                           "sentence-tagger",
                                           dataset_reader_to_load="train")
        assert predictor._dataset_reader._token_indexers[
            "tokens"].namespace == "tokens"

        predictor = Predictor.from_archive(archive,
                                           "sentence-tagger",
                                           dataset_reader_to_load="validation")
        assert predictor._dataset_reader._token_indexers[
            "tokens"].namespace == "test_tokens"
Example #42
def main():
    archive = load_archive(opts.model)
    predictor = Predictor.from_archive(
        archive, 'dialogue_context_hierarchical_coherence_attention_predictor')

    test_set = []
    if opts.test_pos != "NONE":
        for l in open(opts.test_pos):
            test_set.append([l.strip(), "pos"])
    if opts.test_neg != "NONE":
        for l in open(opts.test_neg):
            test_set.append([l.strip(), "neg"])

    for pair in test_set:
        inputs = {"context": pair[0].split("\t")}
        result = predictor.predict_json(inputs)
        print(result)
        label = result.get("label")
        prob = max(result.get("class_probabilities"))
        #print("Predicted label: '{}' with probability: {}".format(label, prob))
        print("RES", pair[1], label, prob,
              result.get("class_probabilities")[1])

    def test_get_vocab_index_mapping(self):
        mml_model_archive_file = (self.FIXTURES_ROOT /
                                  "nlvr_direct_semantic_parser" /
                                  "serialization" / "model.tar.gz")
        archive = load_archive(mml_model_archive_file)
        mapping = self.model._get_vocab_index_mapping(archive.model.vocab)
        expected_mapping = [(i, i) for i in range(16)]
        assert mapping == expected_mapping

        new_vocab = Vocabulary()

        def copy_token_at_index(i):
            token = self.vocab.get_token_from_index(i, "tokens")
            new_vocab.add_token_to_namespace(token, "tokens")

        copy_token_at_index(5)
        copy_token_at_index(7)
        copy_token_at_index(10)
        mapping = self.model._get_vocab_index_mapping(new_vocab)
        # Mapping of indices from model vocabulary to new vocabulary. 0 and 1 are padding and unk
        # tokens.
        assert mapping == [(0, 0), (1, 1), (5, 2), (7, 3), (10, 4)]

    def test_uses_named_inputs(self):
        inputs = {
                "source": "personx gave persony a present",
        }

        archive = load_archive(self.FIXTURES_ROOT / 'event2mind' /
                               'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'event2mind')

        result = predictor.predict_json(inputs)

        token_names = [
                'xintent_top_k_predicted_tokens',
                'xreact_top_k_predicted_tokens',
                'oreact_top_k_predicted_tokens'
        ]

        for token_name in token_names:
            all_predicted_tokens = result.get(token_name)
            for predicted_tokens in all_predicted_tokens:
                assert isinstance(predicted_tokens, list)
                assert all(isinstance(x, str) for x in predicted_tokens)
Example #45
    def __init__(self,
                 archive_file=DEFAULT_ARCHIVE_FILE,
                 cuda_device=DEFAULT_CUDA_DEVICE,
                 model_file=None):
        """ Constructor for NLU class. """
        SysPolicy.__init__(self)

        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")
            archive_file = cached_path(model_file)

        archive = load_archive(archive_file, cuda_device=cuda_device)
        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.action_decoder = MultiWozVocabActionDecoder()
        self.action_decoder.action_vocab = self.dataset_reader.action_vocab
        self.state_encoder = self.dataset_reader.state_encoder
        self.model = archive.model
        self.model.eval()
Example #46
    def __init__(self, vocab: Vocabulary, parser_model_path: str,
                 parser_hidden_size: int, parser_cuda_device: int,
                 freeze_parser: bool, pretrained_bert_model_file: str,
                 num_labels: int) -> None:
        super().__init__(vocab)
        self.bert_sc_model = SEBertForSC.from_pretrained(
            pretrained_bert_model_file,
            num_labels=num_labels,
            parser_hidden_size=parser_hidden_size)

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        self._parser = load_archive(parser_model_path,
                                    cuda_device=parser_cuda_device).model
        self._parser._head_sentinel.requires_grad = False
        for child in self._parser.children():
            for param in child.parameters():
                param.requires_grad = False
        if not freeze_parser:
            for param in self._parser.encoder.parameters():
                param.requires_grad = True
Example #47
    def __init__(self, serialization_dir, cuda_device=0) -> None:
        super(DependencyParsingEmbedding, self).__init__()

        from allennlp.models.archival import load_archive

        self.serialization_dir = serialization_dir
        self.parameter_filename = os.path.join(serialization_dir,
                                               "config.json")
        self.weights_filename = os.path.join(serialization_dir, "weights.th")
        self.cuda_device = cuda_device

        self.config = Params.from_file(self.parameter_filename)

        self.archive = load_archive(self.serialization_dir)
        self.model = self.archive.model
        self.model.eval()
        self.dataset_reader_params = self.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(
            self.dataset_reader_params)
        self.tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                           pos_tags=True,
                                           wst=True)
Example #48
    def __init__(self,
                 archive_file=DEFAULT_ARCHIVE_FILE,
                 cuda_device=DEFAULT_CUDA_DEVICE,
                 model_file=None):
        """ Constructor for NLU class. """
        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MlstNLU is specified!")
            file_path = cached_path(model_file)
            zip_ref = ZipFile(file_path, 'r')
            zip_ref.extractall(DEFAULT_DIRECTORY)
            zip_ref.close()

        archive = load_archive(archive_file,
                               cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval()
Example #49
def get_file_iface_predictor_with_archive(predictor: str,
                                          params: Params,
                                          archive: str,
                                          batch_size: int = 1):
    cuda_device = params["trainer"]["cuda_device"]

    check_for_gpu(cuda_device)
    archive = load_archive(archive, cuda_device=cuda_device)

    predictor = Predictor.from_archive(archive, predictor)

    def file_iface_predictor(input_file, output_file):

        manager = _PredictManager(predictor,
                                  input_file,
                                  output_file,
                                  batch_size,
                                  print_to_console=False,
                                  has_dataset_reader=True)
        manager.run()

    return file_iface_predictor
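
A usage sketch for the factory above, assuming a minimal Params carrying the trainer key it reads; all paths are hypothetical.

params = Params({"trainer": {"cuda_device": -1}})
run_predictor = get_file_iface_predictor_with_archive("machine-comprehension",
                                                      params,
                                                      "model.tar.gz")
run_predictor("inputs.jsonl", "outputs.jsonl")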
Example #50
    def __init__(self, archive_path, cuda_device, overrides, weights_file,
                 lookup_path, wordnet_path, am_tools_path):
        jnius_config.set_classpath(".", am_tools_path)

        # Load model
        archive = load_archive(archive_path, cuda_device, overrides,
                               weights_file)
        config = archive.config
        config.formalism = "DUMMY"
        prepare_environment(config)
        model = archive.model
        model.eval()
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))

        self.predictor = AMconllPredictor(dataset_reader,
                                          k=6,
                                          give_up=0,
                                          threads=1,
                                          model=model)

        self.formalism = AMRInterface(lookup_path, wordnet_path)
Example #51
    def test_sentences(self):
        sentence = "We investigate various contextual effects on text"
        archive = load_archive(
            'tests/pos_tagger/fixtures/model.tar.gz'
        )  # comes from the model test (when debugging, copy the output model here)
        predictor = Predictor.from_archive(archive, 'sentence-pos-tagger')
        result = predictor.predict(sentence)

        tags = result.get("tags")
        for tag in tags:
            assert tag in {
                'I-NP', 'B-NP', 'I-VP', 'B-PP', 'O', 'B-VP', 'B-SBAR', 'B-ADJP'
            }

        class_probabilities = result.get("logits")
        assert class_probabilities is not None
        assert len(class_probabilities) == len(tags)
        assert len(tags) == len(sentence.split())

        words = result.get("words")
        for i, word in enumerate(words):
            assert word == sentence.split()[i]
Example #52
    def test_aux_verb(self):
        inputs = {
            "sentence":
            "Yellowstone National Park is in the United States of America."
        }

        archive = load_archive(FIXTURES_ROOT / "structured_prediction" /
                               "srl" / "serialization" / "model.tar.gz")
        predictor = Predictor.from_archive(archive,
                                           "open_information_extraction")

        result = predictor.predict_json(inputs)

        verbs = result.get("verbs")
        assert verbs is not None
        assert isinstance(verbs, list)

        for verb in verbs:
            tags = verb.get("tags")
            assert tags is not None
            assert isinstance(tags, list)
            assert all(isinstance(tag, str) for tag in tags)
Example #53
def make_data(input_examples_file: str, tables_directory: str,
              archived_model_file: str, output_dir: str,
              num_logical_forms: int) -> None:
    reader = WikiTablesDatasetReader(tables_directory=tables_directory,
                                     keep_if_no_logical_forms=True,
                                     output_agendas=True)
    dataset = reader.read(input_examples_file)
    with open(input_examples_file) as input_file:
        input_lines = input_file.readlines()
    archive = load_archive(archived_model_file)
    model = archive.model
    # Put the model in evaluation mode (sets the flag recursively on submodules).
    model.eval()
    model._decoder_trainer._max_num_decoded_sequences = 100
    for instance, example_line in zip(dataset, input_lines):
        outputs = model.forward_on_instance(instance)
        world = instance.fields['world'].metadata
        parsed_info = util.parse_example_line(example_line)
        example_id = parsed_info["id"]
        target_list = parsed_info["target_values"]
        logical_forms = outputs["logical_form"]
        correct_logical_forms = []
        for logical_form in logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
                if len(correct_logical_forms) >= num_logical_forms:
                    break
        num_found = len(correct_logical_forms)
        print(f"{num_found} found for {example_id}")
        if num_found == 0:
            continue
        os.makedirs(output_dir, exist_ok=True)
        with gzip.open(os.path.join(output_dir, f"{example_id}.gz"),
                       "wb") as output_file:
            for logical_form in correct_logical_forms:
                output_file.write((logical_form + "\n").encode('utf-8'))
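
For reference, a hedged invocation of make_data; every path below is a placeholder:

make_data(input_examples_file="data/train.examples",
          tables_directory="data/WikiTableQuestions/",
          archived_model_file="trained/model.tar.gz",
          output_dir="searched_logical_forms/",
          num_logical_forms=10)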
Example #54
def run(port: int,
        workers: int,
        trained_models: Dict[str, str],
        static_dir: str = None) -> None:
    """Run the server programatically"""
    print("Starting a sanic server on port {}.".format(port))

    if port != 8000:
        logger.warning("The demo requires the API to be run on port 8000.")

    # This will be ``None`` if all the relevant environment variables are not defined.
    demo_db = PostgresDemoDatabase.from_environment()

    app = make_app(static_dir, demo_db)
    CORS(app)

    for predictor_name, archive_file in trained_models.items():
        archive = load_archive(archive_file)
        predictor = Predictor.from_archive(archive, predictor_name)
        app.predictors[predictor_name] = predictor

    app.run(port=port, host="0.0.0.0", workers=workers)
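
A hedged example of starting the server programmatically; the predictor name and archive path are placeholders:

run(port=8000,
    workers=1,
    trained_models={"machine-comprehension": "bidaf/model.tar.gz"},
    static_dir=None)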
Example #55
    def load(self, cuda_device: int = -1) -> Model:
        '''
        Loads the model. This does not require you to train the model if the
        `save_dir` attribute points to a folder containing a trained model.
        This is just a wrapper around the `load_archive` function.

        :param cuda_device: The device to load the model onto: -1 for the CPU
                            (the default) or a GPU id such as 0.
        :returns: The model that was saved at `self.save_dir`.
        :raises AssertionError: If the `save_dir` attribute is None.
        :raises FileNotFoundError: If the save directory does not exist.
        '''

        save_dir_err = 'Save directory was not set in the constructor of the class'
        assert self.save_dir, save_dir_err
        if self.save_dir.exists():
            archive = load_archive(self.save_dir / "model.tar.gz",
                                   cuda_device=cuda_device)
            self.model = archive.model
            return self.model
        raise FileNotFoundError('There is nothing at the save dir:\n'
                                f'{self.save_dir.resolve()}')
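
A hedged usage sketch for load. The wrapper class name is hypothetical; save_dir must already contain a model.tar.gz produced by training:

from pathlib import Path

wrapper = ModelWrapper(save_dir=Path("saved_model"))  # hypothetical class with a save_dir attribute
model = wrapper.load(cuda_device=-1)  # -1 keeps the model on the CPU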
Example #56
    def test_initialize_weights_from_archive(self):
        original_model_parameters = self.model.named_parameters()
        original_model_weights = {name: parameter.data.clone().numpy()
                                  for name, parameter in original_model_parameters}
        # pylint: disable=line-too-long
        mml_model_archive_file = (self.FIXTURES_ROOT / "semantic_parsing" / "nlvr_direct_semantic_parser" /
                                  "serialization" / "model.tar.gz")
        archive = load_archive(mml_model_archive_file)
        archived_model_parameters = archive.model.named_parameters()
        self.model._initialize_weights_from_archive(archive)
        changed_model_parameters = dict(self.model.named_parameters())
        for name, archived_parameter in archived_model_parameters:
            archived_weight = archived_parameter.data.numpy()
            original_weight = original_model_weights[name]
            changed_weight = changed_model_parameters[name].data.numpy()
            # We want to make sure that the weights in the original model have indeed been
            # changed after a call to ``_initialize_weights_from_archive``.
            with self.assertRaises(AssertionError, msg=f"{name} has not changed"):
                assert_almost_equal(original_weight, changed_weight)
            # This also includes the sentence token embedder. Those weights will be the same
            # because the two models have the same vocabulary.
            assert_almost_equal(archived_weight, changed_weight)
Example #57
    def test_get_gradients(self):
        inputs = {
            "premise": "I always write unit tests",
            "hypothesis": "One time I did not write any unit tests"
        }

        archive = load_archive(self.FIXTURES_ROOT / 'decomposable_attention' /
                               'serialization' / 'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'textual-entailment')

        instance = predictor._json_to_instance(inputs)
        outputs = predictor._model.forward_on_instance(instance)
        labeled_instances = predictor.predictions_to_labeled_instances(
            instance, outputs)
        for instance in labeled_instances:
            grads = predictor.get_gradients([instance])[0]
            assert 'grad_input_1' in grads
            assert 'grad_input_2' in grads
            assert grads['grad_input_1'] is not None
            assert grads['grad_input_2'] is not None
            assert len(grads['grad_input_1']) == 9  # 9 words in hypothesis
            assert len(grads['grad_input_2']) == 5  # 5 words in premise
Example #58
def main():
    archive = load_archive(opts.model)
    predictor = Predictor.from_archive(
        archive, 'dialogue_context_hierarchical_coherence_attention_predictor')

    # Collect positive and negative test dialogues together with their gold labels.
    test_set = []
    with open("coherence/dataset_readers/generated_dial_examples_test.pos") as pos_file:
        for line in pos_file:
            test_set.append([line.strip(), "pos"])
    with open("coherence/dataset_readers/generated_dial_examples_test.neg") as neg_file:
        for line in neg_file:
            test_set.append([line.strip(), "neg"])

    for pair in test_set:
        inputs = {"context": pair[0].split("\t")}
        result = predictor.predict_json(inputs)
        label = result.get("label")
        prob = max(result.get("class_probabilities"))
        print("RES", pair[1], label, prob)
Example #59
    def test_uses_named_inputs(self):
        inputs = {
            "text":
            "\u0628\u0639\u062f\u064a\u0646 \u0627\u0648\u0643\u064a \u0627\u0644\u062a\u0628\u0630\u064a\u0631 \u062d\u0631\u0627\u0645 \u0628\u0633 \u0628\u0639\u062f \u0627\u0644\u0628\u062e\u0644 \u0648\u0627\u0646\u064a \u0627\u062d\u0631\u0645 \u0646\u0641\u0633\u064a \u0645\u0646 \u0627\u0646\u0629 \u0641\u0644\u0648\u0633\u064a \u062a\u0631\u064a\u062d\u0646\u064a \u0648\u062a\u0639\u0632\u0646\u064a \u0645\u0627\u062a\u0639\u062a\u0642\u062f \u064a\u0627\u0643\u0627\u062a\u0628\u0646\u0627 \u0627\u0646\u0629 \u0627\u062c\u062d\u0627\u0641 \u0628\u062d\u0642 \u0627\u0644\u0646\u0641\u0633 !"
        }
        model_dir = Path(__file__, '..', '..', 'test_data', 'saved_models')
        model_file = Path(model_dir, 'model.tar.gz').resolve()
        archive = load_archive(model_file)
        predictor = Predictor.from_archive(archive, 'dialect-predictor')

        prediction_results = predictor.predict_json(inputs)
        class_probabilities = prediction_results.get('class_probabilities')
        label = prediction_results.get('label')

        assert class_probabilities is not None
        assert label is not None

        assert isinstance(class_probabilities, list)
        assert isinstance(label, str)
        assert label == 'DIAL_GLF'

        inputs = [{
            "text":
            "\u0628\u0639\u062f\u064a\u0646 \u0627\u0648\u0643\u064a \u0627\u0644\u062a\u0628\u0630\u064a\u0631 \u062d\u0631\u0627\u0645 \u0628\u0633 \u0628\u0639\u062f \u0627\u0644\u0628\u062e\u0644 \u0648\u0627\u0646\u064a \u0627\u062d\u0631\u0645 \u0646\u0641\u0633\u064a \u0645\u0646 \u0627\u0646\u0629 \u0641\u0644\u0648\u0633\u064a \u062a\u0631\u064a\u062d\u0646\u064a \u0648\u062a\u0639\u0632\u0646\u064a \u0645\u0627\u062a\u0639\u062a\u0642\u062f \u064a\u0627\u0643\u0627\u062a\u0628\u0646\u0627 \u0627\u0646\u0629 \u0627\u062c\u062d\u0627\u0641 \u0628\u062d\u0642 \u0627\u0644\u0646\u0641\u0633 !"
        }, {
            "text":
            "\u0644\u0627\u0632\u0645 \u062a\u0630\u0643\u0631\u0648\u0627 \u0627\u0633\u0645 \u0627\u0644\u0645\u0643\u062a\u0628 \u0648\u0635\u0627\u062d\u0628 \u0627\u0644\u0645\u0643\u062a\u0628 \u062d\u062a\u0649 \u064a\u0643\u0648\u0646 \u0639\u0628\u0631\u0647"
        }, {
            "text":
            "\u0627\u0644\u0644\u0647 \u064a\u0631\u062d\u0645\u0647 \u064a\u0648\u0645\u0647"
        }, {
            "text":
            "\u0628\u0633 \u0639\u0627\u064a\u0632\u0629 \u0645\u0646\u0643 \u0627\u0644\u0641\u062a\u0631\u0629 \u0627\u0644\u062c\u0627\u064a\u0629 \u0634\u0648\u064a\u0629 \u062a\u0641\u0643\u064a\u0631 \u0645\u0639\u0627\u0643 \u0644\u0627\u0639\u0628\u064a\u0646 \u0627\u062d\u0633\u0646 \u0645\u0646 \u0627\u064a \u0641\u0631\u064a\u0642 \u0641\u064a \u0627\u0644\u062f\u0648\u0631\u064a \u0648\u0627\u0644\u0633\u0646\u0629 \u062f\u064a \u062a\u0639\u0628\u0643 \u0634\u0648\u064a\u0629 \u0627\u0644\u062f\u0648\u0631\u0629 \u0627\u0644\u0627\u0645\u0645 \u0627\u0644\u0627\u0641\u0631\u064a\u0642\u064a\u0629 \u0644\u0643\u0646 \u0627\u0646\u062a \u0634\u063a\u0627\u0644 \u0643\u0648\u064a\u0633 \u0631\u0628\u0646"
        }]
        labels = ['DIAL_GLF', 'DIAL_LEV', 'MSA', 'DIAL_EGY']
        predictions = predictor.predict_batch_json(inputs)
        for index, prediction in enumerate(predictions):
            assert labels[index] == prediction.get('label')
Example #60
    def test_batch_prediction(self):
        inputs = [{
            "question":
            "What kind of test succeeded on its first attempt?",
            "passage":
            "One time I was writing a unit test, and it succeeded on the first attempt."
        }, {
            "question":
            "What kind of test succeeded on its first attempt at batch processing?",
            "passage":
            "One time I was writing a unit test, and it always failed!"
        }]

        archive = load_archive(self.FIXTURES_ROOT / 'bidaf' / 'serialization' /
                               'model.tar.gz')
        predictor = Predictor.from_archive(archive, 'machine-comprehension')

        results = predictor.predict_batch_json(inputs)
        assert len(results) == 2

        for result in results:
            best_span = result.get("best_span")
            best_span_str = result.get("best_span_str")
            start_probs = result.get("span_start_probs")
            end_probs = result.get("span_end_probs")
            assert best_span is not None
            assert isinstance(best_span, list)
            assert len(best_span) == 2
            assert all(isinstance(x, int) for x in best_span)
            assert best_span[0] <= best_span[1]

            assert isinstance(best_span_str, str)
            assert best_span_str != ""

            for probs in (start_probs, end_probs):
                assert probs is not None
                assert all(isinstance(x, float) for x in probs)
                assert sum(probs) == approx(1.0)