Example #1
 def _adaptive_grouping(self, dataset: Dataset):
     batches = []
     current_batch = []
     current_lengths = defaultdict(dict)  # type: Dict[str, Dict[str, int]]
     logger.debug("Creating adaptive groups")
     for instance in dataset.instances:
         current_batch.append(instance)
         instance_lengths = instance.get_padding_lengths()
         for field_name in instance_lengths:
             for key in instance_lengths[field_name]:
                 current_lengths[field_name][key] = max(instance_lengths[field_name][key],
                                                        current_lengths[field_name].get(key, -1))
         big_o_memory_constant = self._padding_memory_scaling(current_lengths)
         if (len(current_batch) * big_o_memory_constant > self._adaptive_memory_usage_constant
                     or len(current_batch) > self._maximum_batch_size):
             current_batch.pop()
             if logger.getEffectiveLevel() <= logging.DEBUG:
                 padding_lengths = Dataset(current_batch).get_padding_lengths()
                 logger.debug("Batch size: %d; padding: %s", len(current_batch), padding_lengths)
             batches.append(current_batch)
             current_batch = [instance]
             current_lengths = instance_lengths
     if logger.getEffectiveLevel() <= logging.DEBUG:
         padding_lengths = Dataset(current_batch).get_padding_lengths()
         logger.debug("Batch size: %d; padding: %s", len(current_batch), padding_lengths)
     batches.append(current_batch)
     return batches
Example #2
File: demo.py Project: liuz37/NLP-HW4
    def get_answer():
        # Take user input and convert to Instance
        user_context = request.args.get("context", "", type=str)
        user_question = request.args.get("question", "", type=str)
        input_instance = squad_reader.text_to_instance(
            question_text=user_question, passage_text=user_context)
        # Make a dataset from the instance
        dataset = Dataset([input_instance])
        dataset.index_instances(train_vocab)
        batch = dataset.as_tensor_dict(cuda_device=0 if cuda else -1,
                                       for_training=False)
        # Extract relevant data from batch.
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        metadata = batch.get("metadata", {})

        # Run data through model to get start and end logits.
        output_dict = model(passage, question)
        start_logits = output_dict["start_logits"]
        end_logits = output_dict["end_logits"]

        # Compute the best span
        best_span = get_best_span(start_logits, end_logits)

        # Get the string corresponding to the best span
        passage_str = metadata[0]['original_passage']
        offsets = metadata[0]['token_offsets']
        predicted_span = tuple(best_span[0].data.cpu().numpy())
        start_offset = offsets[predicted_span[0]][0]
        end_offset = offsets[predicted_span[1]][1]
        best_span_string = passage_str[start_offset:end_offset]

        # Return the best string back to the GUI
        return jsonify(answer=best_span_string)
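The handler above reads `context` and `question` from the query string and returns the predicted answer as JSON. A minimal client-side sketch, assuming the route is served at http://localhost:5000/get_answer (the URL and route name are assumptions, not taken from the project):

import requests  # hypothetical client; the server URL below is an assumption

response = requests.get(
    "http://localhost:5000/get_answer",
    params={"context": "AllenNLP is an NLP research library built on PyTorch.",
            "question": "What is AllenNLP built on?"})
print(response.json()["answer"])  # expected to print a span of the passage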
Example #3
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """

        dataset = Dataset(instances)
        dataset.index_instances(self.vocab)
        model_input = arrays_to_variables(dataset.as_array_dict(),
                                          cuda_device=cuda_device,
                                          for_training=False)
        outputs = self.decode(self.forward(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(
                    instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
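A minimal sketch of calling forward_on_instances, assuming `model` is a trained Model and `reader` is a DatasetReader whose text_to_instance signature matches the task (both names are placeholders, not part of the snippet above):

instances = [reader.text_to_instance("The first sentence."),
             reader.text_to_instance("Another, slightly longer sentence.")]
outputs = model.forward_on_instances(instances, cuda_device=-1)  # -1 runs on CPU
for instance_output in outputs:
    # One output dict per input instance, with tensors converted to numpy arrays.
    print(sorted(instance_output.keys()))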
Example #4
    def test_elmo(self):
        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo = Elmo(options_file, weight_file, 2)

        # Correctness checks are in ElmoBiLm and ScalarMix, here we just add a shallow test
        # to ensure things execute.
        indexer = ELMoTokenCharactersIndexer()
        sentences = [['The', 'sentence', '.'],
                     ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        character_ids = dataset.as_array_dict()['elmo']['character_ids']

        output = elmo(Variable(torch.from_numpy(character_ids)))
        elmo_representations = output['elmo_representations']
        mask = output['mask']

        assert len(elmo_representations) == 2
        assert list(elmo_representations[0].size()) == [2, 7, 32]
        assert list(elmo_representations[1].size()) == [2, 7, 32]
        assert list(mask.size()) == [2, 7]
Example #5
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo_bilm = _ElmoBiLm(options_file, weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
            batch_tensor = Variable(torch.from_numpy(batch['elmo']['character_ids']))
            lm_embeddings = elmo_bilm(batch_tensor)
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #6
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #7
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as data_file:
            logger.info("Reading instances in ARC jsonl format from dataset at: %s", file_path)
            for line in data_file:
                item_json = json.loads(line.strip())

                item_id = item_json["id"]
                question_text = item_json["question"]["stem"]

                choice_label_to_id = {}
                choice_text_list = []

                for choice_id, choice_item in enumerate(item_json["question"]["choices"]):
                    choice_label = choice_item["label"]
                    choice_label_to_id[choice_label] = choice_id

                    choice_text = choice_item["text"]

                    choice_text_list.append(choice_text)

                answer_id = choice_label_to_id[item_json["answerKey"]]

                instances.append(self.text_to_instance(item_id, question_text, choice_text_list, answer_id))

        return Dataset(instances)
Example #8
 def setUp(self):
     token_indexer = SingleIdTokenIndexer("tokens")
     text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                            {"tokens": token_indexer})
     self.instance = Instance({"text": text_field})
     self.dataset = Dataset([self.instance])
     super(TestVocabulary, self).setUp()
Example #9
 def _sort_dataset_by_padding(
     dataset: Dataset,
     sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
     padding_noise: float = 0.0
 ) -> Dataset:
     """
     Sorts the ``Instances`` in this ``Dataset`` by their padding lengths, using the keys in
     ``sorting_keys`` (in the order in which they are provided).  ``sorting_keys`` is a list of
     ``(field_name, padding_key)`` tuples.
     """
     instances_with_lengths = []
     for instance in dataset.instances:
         padding_lengths = cast(Dict[str, Dict[str, float]],
                                instance.get_padding_lengths())
         if padding_noise > 0.0:
             noisy_lengths = {}
             for field_name, field_lengths in padding_lengths.items():
                 noisy_lengths[field_name] = add_noise_to_dict_values(
                     field_lengths, padding_noise)
             padding_lengths = noisy_lengths
         instance_with_lengths = ([
             padding_lengths[field_name][padding_key]
             for (field_name, padding_key) in sorting_keys
         ], instance)
         instances_with_lengths.append(instance_with_lengths)
     instances_with_lengths.sort(key=lambda x: x[0])
     return Dataset([
         instance_with_lengths[-1]
         for instance_with_lengths in instances_with_lengths
     ])
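A minimal usage sketch, assuming `dataset` has already been indexed against a vocabulary and its instances contain a TextField named "text" (so "num_tokens" is one of its padding keys); the field name and padding key are assumptions about the data, not part of the function above:

sorted_dataset = _sort_dataset_by_padding(dataset,
                                          sorting_keys=[("text", "num_tokens")],
                                          padding_noise=0.1)
# Instances now come shortest-first (up to the injected noise), which keeps
# padding waste low when they are later grouped into batches.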
Example #10
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.dataset.instances):
         dataset = Dataset([instance])
         arrays = dataset.as_array_dict(dataset.get_padding_lengths(),
                                        verbose=False)
         variables = arrays_to_variables(arrays, for_training=False)
         result = self.model.forward(**variables)
         single_predictions.append(result)
     batch_arrays = self.dataset.as_array_dict(
         self.dataset.get_padding_lengths(), verbose=False)
     batch_variables = arrays_to_variables(batch_arrays, for_training=False)
     batch_predictions = self.model.forward(**batch_variables)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     # This is probably a sequence model, and our output shape has some padded
                     # elements in the batched case.  Fixing this in general is complicated;
                     # we'll just fix some easy cases that we actually have, for now.
                     num_tokens = single_predicted.size(0)
                     if batch_predicted.dim() == 1:
                         batch_predicted = batch_predicted[:num_tokens]
                     elif batch_predicted.dim() == 2:
                         batch_predicted = batch_predicted[:num_tokens, :]
                     else:
                         raise NotImplementedError
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #11
 def read(self, file_path):
     logger.info('Reading instances from file %s', file_path)
     reader = TaggedCorpusReader(*os.path.split(file_path),
                                 sep='\t',
                                 word_tokenizer=RegexpTokenizer(r'\n',
                                                                gaps=True),
                                 sent_tokenizer=BlanklineTokenizer(),
                                 para_block_reader=lambda s: [s.read()])
     return Dataset([
         self.text_to_instance(*tuple(zip(*tagged_sent)))
         for tagged_sent in reader.tagged_sents()
     ])
Example #12
    def read(self, file_path: str):
        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()
        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance
            tokenized_strings = []
            for index in range(0,
                               len(tokenized_text) - num_tokens, num_tokens):
                tokenized_strings.append(tokenized_text[index:index +
                                                        num_tokens])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        # TODO(matt): this isn't quite right, because you really want to split on sentences,
        # tokenize the sentences, add the start and end tokens per sentence, then change the tokens
        # per instance if desired.  But, we can fix that later, if someone actually wants to use
        # this for language modeling.  This is just another example of how to use the data reader
        # code, for now.
        tokenized_strings = [[self._start_token] + x + [self._end_token]
                             for x in tokenized_strings]

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        output_indexer = None  # type: Dict[str, TokenIndexer]
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                output_indexer = {name: indexer}
                break
        else:
            output_indexer = {"tokens": SingleIdTokenIndexer()}

        instances = []
        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:], output_indexer)
            instances.append(
                Instance({
                    'input_tokens': input_field,
                    'output_tokens': output_field
                }))
        return Dataset(instances)
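Each instance built above pairs the same token sequence shifted by one position: 'input_tokens' drops the last token and 'output_tokens' drops the first, so the model is trained to predict the next word. A toy illustration of the shift (plain strings, for clarity only):

tokens = ["<S>", "the", "cat", "sat", "</S>"]
input_tokens = tokens[:-1]   # ["<S>", "the", "cat", "sat"]
output_tokens = tokens[1:]   # ["the", "cat", "sat", "</S>"]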
Example #13
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    # dataset.index_instances(vocab)
    for instance in dataset.instances:
        instance.index_fields(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
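A minimal sketch of calling batch_to_ids on pre-tokenized sentences; ELMo represents each token with up to 50 character ids, so the result has shape (batch_size, max_sentence_length, 50):

batch = [["The", "sentence", "."],
         ["Another", "slightly", "longer", "sentence", "."]]
character_ids = batch_to_ids(batch)
print(character_ids.size())  # e.g. torch.Size([2, 5, 50])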
Example #14
    def read(self, file_path):
        with open(file_path, "r") as data_file:

            instances = []
            for line in data_file:
                tokens_and_tags = [
                    pair.split("###") for pair in line.strip("\n").split("\t")
                ]
                tokens = [x[0] for x in tokens_and_tags]
                tags = [x[1] for x in tokens_and_tags]

                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = TagField(tags, sequence)
                instances.append(
                    Instance({
                        'tokens': sequence,
                        'tags': sequence_tags
                    }))
        return Dataset(instances)
Example #15
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Dataset([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_dataset(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #16
 def setUp(self):
     super(IteratorTest, self).setUp()
     self.token_indexers = {"tokens": SingleIdTokenIndexer()}
     self.vocab = Vocabulary()
     self.this_index = self.vocab.add_token_to_namespace('this')
     self.is_index = self.vocab.add_token_to_namespace('is')
     self.a_index = self.vocab.add_token_to_namespace('a')
     self.sentence_index = self.vocab.add_token_to_namespace('sentence')
     self.another_index = self.vocab.add_token_to_namespace('another')
     self.yet_index = self.vocab.add_token_to_namespace('yet')
     self.very_index = self.vocab.add_token_to_namespace('very')
     self.long_index = self.vocab.add_token_to_namespace('long')
     self.instances = [
             self.create_instance(["this", "is", "a", "sentence"]),
             self.create_instance(["this", "is", "another", "sentence"]),
             self.create_instance(["yet", "another", "sentence"]),
             self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
             self.create_instance(["sentence"]),
             ]
     self.dataset = Dataset(self.instances)
Example #17
File: snli.py Project: panyang/allennlp
    def read(self, file_path: str):
        instances = []
        with open(file_path, 'r') as snli_file:
            for line in snli_file:
                example = json.loads(line)

                label = example["gold_label"]
                label_field = LabelField(label)

                premise = example["sentence1"]
                premise_field = TextField(self._tokenizer.tokenize(premise),
                                          self._token_indexers)
                hypothesis = example["sentence2"]
                hypothesis_field = TextField(
                    self._tokenizer.tokenize(hypothesis), self._token_indexers)
                instances.append(
                    Instance({
                        'label': label_field,
                        'premise': premise_field,
                        'hypothesis': hypothesis_field
                    }))
        return Dataset(instances)
Example #18
 def get_dataset(self):
     field1 = TextField(
         [Token(t) for t in ["this", "is", "a", "sentence", "."]],
         self.token_indexer)
     field2 = TextField([
         Token(t)
         for t in ["this", "is", "a", "different", "sentence", "."]
     ], self.token_indexer)
     field3 = TextField(
         [Token(t) for t in ["here", "is", "a", "sentence", "."]],
         self.token_indexer)
     field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                        self.token_indexer)
     instances = [
         Instance({
             "text1": field1,
             "text2": field2
         }),
         Instance({
             "text1": field3,
             "text2": field4
         })
     ]
     return Dataset(instances)
Example #19
File: squad.py Project: panyang/allennlp
    def read(self, file_path: str):
        # Import is here, since it isn't necessary by default.
        import nltk

        # Holds tuples of (question_text, answer_sentence_id)
        questions = []
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        for article in tqdm(dataset):
            for paragraph in article['paragraphs']:
                paragraph_id = len(self._paragraph_sentences)
                self._paragraph_sentences[paragraph_id] = []

                context_article = paragraph["context"]
                # replace newlines in the context article
                cleaned_context_article = context_article.replace("\n", "")

                # Split the cleaned_context_article into a list of sentences.
                sentences = nltk.sent_tokenize(cleaned_context_article)

                # Make a dict from span indices to sentence. The end span is
                # exclusive, and the start span is inclusive.
                span_to_sentence_index = {}
                current_index = 0
                for sentence in sentences:
                    sentence_id = len(self._sentence_to_id)
                    self._sentence_to_id[sentence] = sentence_id
                    self._id_to_sentence[sentence_id] = sentence
                    self._sentence_paragraph_map[sentence_id] = paragraph_id
                    self._paragraph_sentences[paragraph_id].append(sentence_id)

                    sentence_len = len(sentence)
                    # Need to add one to the end index to account for the
                    # trailing space after punctuation that is stripped by NLTK.
                    span_to_sentence_index[(current_index, current_index +
                                            sentence_len + 1)] = sentence
                    current_index += sentence_len + 1
                for question_answer in paragraph['qas']:
                    question_text = question_answer["question"].strip()
                    question_id = len(self._question_to_id)
                    self._question_to_id[question_text] = question_id
                    self._id_to_question[question_id] = question_text

                    # There may be multiple answer annotations, so pick the one
                    # that occurs the most.
                    candidate_answer_start_indices = Counter()  # type: Counter
                    for answer in question_answer["answers"]:
                        candidate_answer_start_indices[
                            answer["answer_start"]] += 1
                    answer_start_index, _ = candidate_answer_start_indices.most_common(
                        1)[0]

                    # Get the full sentence corresponding to the answer.
                    answer_sentence = None
                    for span_tuple in span_to_sentence_index:
                        start_span, end_span = span_tuple
                        if start_span <= answer_start_index and answer_start_index < end_span:
                            answer_sentence = span_to_sentence_index[
                                span_tuple]
                            break
                    else:  # no break
                        raise ValueError(
                            "Index of answer start was out of bounds. "
                            "This should never happen, please raise "
                            "an issue on GitHub.")

                    # Now that we have the string of the full sentence, we need to
                    # search for it in our shuffled list to get the index.
                    answer_id = self._sentence_to_id[answer_sentence]

                    # Now we can make the string representation and add this
                    # to the list of processed_rows.
                    questions.append((question_id, answer_id))
        instances = []
        logger.info("Processing questions into training instances")
        for question_id, answer_id in tqdm(questions):
            sentence_choices, correct_choice = self._get_sentence_choices(
                question_id, answer_id)
            question_text = self._id_to_question[question_id]
            sentence_fields = []  # type: List[Field]
            for sentence in sentence_choices:
                tokenized_sentence = self._tokenizer.tokenize(sentence)
                sentence_field = TextField(tokenized_sentence,
                                           self._token_indexers)
                sentence_fields.append(sentence_field)
            sentences_field = ListField(sentence_fields)
            tokenized_question = self._tokenizer.tokenize(question_text)
            question_field = TextField(tokenized_question,
                                       self._token_indexers)
            correct_sentence_field = IndexField(correct_choice,
                                                sentences_field)
            instances.append(
                Instance({
                    'question': question_field,
                    'sentences': sentences_field,
                    'correct_sentence': correct_sentence_field
                }))
        return Dataset(instances)
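The span bookkeeping above can be seen in isolation; a small self-contained sketch of mapping character-offset spans to sentences, the same way the loop over sentences does (the toy sentences and the hand-picked answer offset are illustrative only):

sentences = ["The cat sat.", "It purred."]
span_to_sentence_index = {}
current_index = 0
for sentence in sentences:
    # +1 accounts for the space that separated the sentences in the original text.
    span_to_sentence_index[(current_index, current_index + len(sentence) + 1)] = sentence
    current_index += len(sentence) + 1

answer_start_index = 15  # a character offset that falls inside the second sentence
for (start, end), sentence in span_to_sentence_index.items():
    if start <= answer_start_index < end:
        print(sentence)  # "It purred."
        break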
Example #20
 def test_instances_must_have_homogeneous_fields(self):
     instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
     instance2 = Instance({"words": TextField([Token("hello")], {})})
     with pytest.raises(ConfigurationError):
         _ = Dataset([instance1, instance2])
Example #21
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
Example #22
    def read(self, file_path: str):
        instances = []

        sentence = []  # type: List[str]
        verbal_predicates = []  # type: List[int]
        predicate_argument_labels = []  # type: List[List[str]]
        current_span_label = []  # type: List[Optional[str]]

        for root, _, files in os.walk(file_path):
            for data_file in files:
                # These are a relic of the dataset pre-processing. Every file will be duplicated
                # - one file called filename.gold_skel and one generated from the preprocessing
                # called filename.gold_conll.
                if 'gold_conll' not in data_file:
                    continue
                with codecs.open(os.path.join(root, data_file), 'r', encoding='utf8') as open_file:
                    for line in open_file:
                        line = line.strip()
                        if line == '' or line.startswith("#"):

                            # Conll format data begins and ends with lines containing a hash,
                            # which may or may not occur after an empty line. To deal with this
                            # we check if the sentence is empty or not and if it is, we just skip
                            # adding instances, because there aren't any to add.
                            if not sentence:
                                continue
                            instances.extend(self._process_sentence(sentence,
                                                                    verbal_predicates,
                                                                    predicate_argument_labels))
                            # Reset everything for the next sentence.
                            sentence = []
                            verbal_predicates = []
                            predicate_argument_labels = []
                            current_span_label = []
                            continue

                        conll_components = line.split()
                        word = conll_components[3]

                        sentence.append(word)
                        word_index = len(sentence) - 1
                        if word_index == 0:
                            # We're starting a new sentence. Here we set up a list of lists
                            # for the BIO labels for the annotation for each verb and create
                            # a temporary 'current_span_label' list for each annotation which
                            # we will use to keep track of whether we are beginning, inside of,
                            # or outside a particular span.
                            predicate_argument_labels = [[] for _ in conll_components[11:-1]]
                            current_span_label = [None for _ in conll_components[11:-1]]

                        num_annotations = len(predicate_argument_labels)
                        is_verbal_predicate = False
                        # Iterate over all verb annotations for the current sentence.
                        for annotation_index in range(num_annotations):
                            annotation = conll_components[11 + annotation_index]
                            label = annotation.strip("()*")

                            if "(" in annotation:
                                # Entering into a span for a particular semantic role label.
                                # We append the label and set the current span for this annotation.
                                bio_label = "B-" + label
                                predicate_argument_labels[annotation_index].append(bio_label)
                                current_span_label[annotation_index] = label

                            elif current_span_label[annotation_index] is not None:
                                # If there's no '(' token, but the current_span_label is not None,
                                # then we are inside a span.
                                bio_label = "I-" + current_span_label[annotation_index]
                                predicate_argument_labels[annotation_index].append(bio_label)
                            else:
                                # We're outside a span.
                                predicate_argument_labels[annotation_index].append("O")

                            # Exiting a span, so we reset the current span label for this annotation.
                            if ")" in annotation:
                                current_span_label[annotation_index] = None
                            # If any annotation contains this word as a verb predicate,
                            # we need to record its index. This also has the side effect
                            # of ordering the verbal predicates by their location in the
                            # sentence, automatically aligning them with the annotations.
                            if "(V" in annotation:
                                is_verbal_predicate = True

                        if is_verbal_predicate:
                            verbal_predicates.append(word_index)
        return Dataset(instances)
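To make the span bookkeeping in the comments above concrete, a small standalone sketch that converts one made-up column of bracketed CoNLL SRL annotations into BIO labels, using the same rules as the loop above:

# Illustration only: a hypothetical annotation column of the kind the reader parses.
annotations = ["(ARG0*", "*)", "(V*)", "(ARG1*", "*", "*)"]

bio_labels = []
current = None
for annotation in annotations:
    label = annotation.strip("()*")
    if "(" in annotation:
        # Entering a span: emit a B- tag and remember the open label.
        bio_labels.append("B-" + label)
        current = label
    elif current is not None:
        # Still inside the open span: emit an I- tag.
        bio_labels.append("I-" + current)
    else:
        # Outside any span.
        bio_labels.append("O")
    if ")" in annotation:
        # Exiting the span: forget the open label.
        current = None

print(bio_labels)  # ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1']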
Example #23
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets.append(test_data)
        datasets_in_vocab.append("test")
    else:
        test_data = None

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   Dataset([instance for dataset in all_datasets
                                            for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
Example #24
def generate_features(zipped_annotated_data, feature, feature_details, reader,
                      mithun_logger, objUofaTrainTest, dataset, length_data):
    mithun_logger.info(f"got inside generate_features")
    mithun_logger.info(f"value of feature  is:{feature}")
    mithun_logger.info(f"value of dataset  is:{dataset}")
    instances = []
    for index, (he, be, hl, bl, hw, bw, ht, hd,
                hfc) in enumerate(zipped_annotated_data):

        new_label = ""
        label = hfc

        if (dataset == "fnc"):

            if (label == "unrelated"):
                continue
            else:
                if (label == 'discuss'):
                    new_label = "NOT ENOUGH INFO"
                if (label == 'agree'):
                    new_label = "SUPPORTS"
                if (label == 'disagree'):
                    new_label = "REFUTES"

        else:
            new_label = label

        he_split = he.split(" ")
        be_split = be.split(" ")
        hl_split = hl.split(" ")
        bl_split = bl.split(" ")
        hw_split = hw.split(" ")
        bw_split = bw.split(" ")

        premise_ann = ""
        hypothesis_ann = ""

        if (feature == "plain_NER"):
            premise_ann, hypothesis_ann = objUofaTrainTest.convert_NER_form_per_sent_plain_NER(
                he_split, be_split, hl_split, bl_split, hw_split, bw_split,
                mithun_logger)
        else:
            if (feature == "smart_NER"):
                premise_ann, hypothesis_ann, found_intersection = objUofaTrainTest.convert_SMARTNER_form_per_sent(
                    he_split, be_split, hl_split, bl_split, hw_split, bw_split,
                    mithun_logger)

        #     mithun_logger.info(f"value of old label is:{label}")
        #     mithun_logger.info(f"value of new label is:{new_label}")
        #     mithun_logger.info(f"value of claim before annotation is:{hw}")
        #     mithun_logger.info(f"value of evidence before anntoation is is:{bw}")
        #     mithun_logger.info(f"value of premise_ann is:{premise_ann}")
        #     mithun_logger.info(f"value of hypothesis_ann is:{hypothesis_ann}")

        # mithun_logger.debug(f"value of old label is:{label}")
        # mithun_logger.debug(f"value of new label is:{new_label}")
        # mithun_logger.debug(f"value of claim before annotation is:{hw}")
        # mithun_logger.debug(f"value of evidence before anntoation is is:{bw}")
        # mithun_logger.debug(f"value of premise_ann is:{premise_ann}")
        # mithun_logger.debug(f"value of hypothesis_ann is:{hypothesis_ann}")

        # TODO: fix me. Not able to cleanly retrieve boolean values from the config file.
        # person_c1 = feature_details.pop('person_c1', {})
        # lower_case_tokens= feature_details.pop('lower_case_tokens', {})
        # update_embeddings= feature_details.pop('update_embeddings', {})
        # assert type(person_c1) is str
        # assert type(lower_case_tokens) is bool
        # assert type(update_embeddings) is bool
        #
        # if(lower_case_tokens):
        #     premise_ann=premise_ann.lower(),
        #     hypothesis_ann=hypothesis_ann.lower()
        #     mithun_logger.debug(f"value of premise_ann after lower case token is:{premise_ann}")
        #     mithun_logger.debug(f"value of label after lower case token  is:{hypothesis_ann}")

        instances.append(
            reader.text_to_instance(premise_ann, hypothesis_ann, new_label))

    if len(instances) == 0:
        mithun_logger.error(
            "No instances were generated from the given annotated data. "
            "Is the input correct?")
        sys.exit(1)
    mithun_logger.info(f"type of instances is :{type(instances)}")
    return Dataset(instances)
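The label conversion at the top of the loop can equivalently be written as a lookup table; a minimal sketch of the same FNC-to-FEVER mapping (the helper name is illustrative, not part of the original code):

FNC_TO_FEVER = {"discuss": "NOT ENOUGH INFO",
                "agree": "SUPPORTS",
                "disagree": "REFUTES"}

def convert_label(label, dataset):
    # Returns None for "unrelated" FNC examples, which the loop above skips.
    if dataset != "fnc":
        return label
    return FNC_TO_FEVER.get(label)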
Example #25
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.

    # 1. Primary training data.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # 2. Auxiliary training data.
    dataset_reader_aux = DatasetReader.from_params(
        params.pop('dataset_reader_aux'))
    train_data_path_aux = params.pop('train_data_path_aux')
    logger.info("Reading auxilliary training data from %s",
                train_data_path_aux)
    train_data_aux = dataset_reader_aux.read(train_data_path_aux)

    # If only using a fraction of the auxiliary data.
    aux_sample_fraction = params.pop("aux_sample_fraction", 1.0)
    if aux_sample_fraction < 1.0:
        sample_size = int(aux_sample_fraction * len(train_data_aux.instances))
        train_data_aux = Dataset(
            random.sample(train_data_aux.instances, sample_size))

    # Balance the two datasets by inflating the size of the smaller dataset to the size of the larger dataset.
    train_size = len(train_data.instances)
    aux_train_size = len(train_data_aux.instances)
    mixing_ratio = params.pop("mixing_ratio")
    # mixing_ratio = float(train_size)/aux_train_size

    if train_size > aux_train_size:  # case for PB scaffold.
        difference = train_size - aux_train_size
        aux_sample = [
            random.choice(train_data_aux.instances) for _ in range(difference)
        ]
        train_data_aux = Dataset(train_data_aux.instances + aux_sample)
        logger.info(
            "Inflating auxiliary train data from {} to {} samples".format(
                aux_train_size, len(train_data_aux.instances)))
    # else: # case for FN scaffold.
    #     difference = aux_train_size - train_size
    #     train_sample = [random.choice(train_data.instances) for _ in range(difference)]
    #     train_data = Dataset(train_data.instances + train_sample)
    #     logger.info("Inflating train data from {} to {} samples".format(
    #         train_size, len(train_data.instances)))

    all_datasets: Dict[str, Dataset] = {"train": train_data}
    all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}

    # 3. Primary validation data.
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    # 4. Auxiliary validation data.
    validation_data_path_aux = params.pop('validation_data_path_aux', None)
    if validation_data_path_aux is not None:
        logger.info("Reading auxilliary validation data from %s",
                    validation_data_path_aux)
        validation_data_aux = dataset_reader_aux.read(validation_data_path_aux)
        all_datasets_aux["validation_aux"] = validation_data_aux
    else:
        validation_data_aux = None

    # 5. Primary test data
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    # 6. Auxiliary test data
    test_data_path_aux = params.pop("test_data_path_aux", None)
    if test_data_path_aux is not None:
        logger.info("Reading auxillary test data from %s", test_data_path_aux)
        test_data_aux = dataset_reader_aux.read(test_data_path_aux)
        all_datasets_aux["test_aux"] = test_data_aux
    else:
        test_data_aux = None

    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(
        params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "Creating a vocabulary using %s data. Auxiliary also included.",
        ", ".join(datasets_for_vocab_creation))
    dataset_primary = Dataset([
        instance for key, dataset in all_datasets.items()
        for instance in dataset.instances if key in datasets_for_vocab_creation
    ])
    dataset_aux = Dataset([
        instance for key, dataset in all_datasets_aux.items()
        for instance in dataset.instances
        if key in datasets_for_vocab_creation_aux
    ])
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   dataset_primary,
                                   dataset_aux=dataset_aux)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))

    train_data.index_instances(vocab)
    train_data_aux.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    if validation_data_aux:
        validation_data_aux.index_instances(vocab)

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    trainer_params = params.pop("trainer")
    trainer = MultiTaskTrainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        iterator_aux=iterator_aux,
        train_dataset=train_data,
        train_dataset_aux=train_data_aux,
        mixing_ratio=mixing_ratio,
        cutoff_epoch=cutoff_epoch,
        validation_dataset=validation_data,
        validation_dataset_aux=validation_data_aux,
        params=trainer_params,
        files_to_archive=params.files_to_archive)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_on_test:
        test_data_aux.index_instances(vocab)
        evaluate(model,
                 test_data_aux,
                 iterator_aux,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data_aux:
        logger.info(
            "To evaluate on the auxiliary test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model