Example #1
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        # If `extend` is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary: at least @@UNKNOWN@@
        # should be present in any namespace that may contain OOV entries during indexing.
        # The `tokens` namespace will see new words but has an @@UNKNOWN@@ token; the
        # 'labels' namespace does not, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
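
To make the comment above concrete, here is a minimal standalone sketch (outside the test class) of how OOV lookups differ between a padded namespace like `tokens` and a non-padded one like `labels`; `never_seen` and `ADJ` are made-up OOV entries:

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
vocab.add_token_to_namespace('N', namespace='labels')

# 'tokens' is a padded namespace, so an OOV lookup falls back to the
# @@UNKNOWN@@ entry (index 1).
print(vocab.get_token_index('never_seen', namespace='tokens'))

# 'labels' matches the default non-padded patterns ('*labels', '*tags'),
# so it has no @@UNKNOWN@@ and an OOV lookup raises KeyError.
try:
    vocab.get_token_index('ADJ', namespace='labels')
except KeyError:
    print('OOV label raises KeyError')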
Example #2
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        vocab = Vocabulary()
        # If `extend` is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary: at least @@UNKNOWN@@
        # should be present in any namespace that may contain OOV entries during indexing.
        # The `tokens` namespace will see new words but has an @@UNKNOWN@@ token; the
        # "labels" namespace does not, so "N" and "V" must be added upfront.
        vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
        vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
        vocab.add_token_to_namespace("N", namespace="labels")
        vocab.add_token_to_namespace("V", namespace="labels")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["type"] = "from_files"
        self.params["vocabulary"]["directory"] = existing_vocab_path
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"
        assert len(tokens) == 3
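
The {"type": "from_files"} entry in this newer-style config dispatches to `Vocabulary.from_files`, so the same load can be done directly in code; a minimal sketch, reusing `existing_vocab_path` from the test above:

from allennlp.data import Vocabulary

# Equivalent to the {"type": "from_files", "directory": ...} params above.
vocab = Vocabulary.from_files(str(existing_vocab_path))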
Example #3
    def _save_vocab_to_disk(self, vocab: Vocabulary) -> str:
        """Saves the vocab to disk to reuse it between the trials

        Parameters
        ----------
        vocab
            Vocabulary to be saved to disk

        Returns
        -------
        vocab_path
            Path to the directory the vocabulary was saved to
        """
        tmp_dir = tempfile.TemporaryDirectory()
        self._created_tmp_dirs.append(tmp_dir)
        vocab_path = tmp_dir.name
        vocab.save_to_files(vocab_path)

        # Make sure that we can load the vocab successfully
        try:
            Vocabulary.from_files(vocab_path)
        except Exception as exception:
            raise ValidationError(
                f"Could not load vocab saved in '{vocab_path}'") from exception

        return vocab_path
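
The helper keeps each `TemporaryDirectory` object alive in `self._created_tmp_dirs` because the directory is deleted as soon as the object is garbage-collected. A minimal standalone round-trip without the class plumbing:

import tempfile

from allennlp.data import Vocabulary

tmp_dir = tempfile.TemporaryDirectory()  # keep this reference alive
vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")
vocab.save_to_files(tmp_dir.name)

# Reloading proves the on-disk files are complete and consistent.
reloaded = Vocabulary.from_files(tmp_dir.name)
assert (reloaded.get_token_index("hello", namespace="tokens")
        == vocab.get_token_index("hello", namespace="tokens"))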
Example #4
    def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        vocab = Vocabulary()
        vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
        vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["type"] = "extend"
        self.params["vocabulary"]["directory"] = str(existing_vocab_path)
        self.params["vocabulary"]["min_count"] = {"tokens": 3}
        train_model(self.params, extended_serialization_dir, dry_run=True)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {
            ".lock",
            "labels.txt",
            "non_padded_namespaces.txt",
            "tokens.txt",
        }

        with open(extended_vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"

        tokens.sort()
        assert tokens == [
            ".",
            "@@UNKNOWN@@",
            "animals",
            "are",
            "some_weird_token_1",
            "some_weird_token_2",
        ]

        with open(extended_vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
Example #5
def save_vocab_in_allennlp_format():
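    # Assumed module-level configuration (not shown in this snippet):
    # `filepaths_of_data_to_train_on`, `vocabword_ind_not_numbered_file_ending`,
    # `label_ind_not_numbered_file_ending`, `unk_token`, and `dir_to_save_vocab_in`.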
    first_data_filepath = filepaths_of_data_to_train_on[0]
    numless_vocab_file = first_data_filepath[:first_data_filepath.rfind(
        '.')] + vocabword_ind_not_numbered_file_ending
    numless_label_file = first_data_filepath[:first_data_filepath.rfind(
        '.')] + label_ind_not_numbered_file_ending

    vocab = Vocabulary()
    vocab.set_from_file(numless_vocab_file,
                        is_padded=True,
                        oov_token=unk_token,
                        namespace='tokens')
    vocab.set_from_file(numless_label_file,
                        is_padded=False,
                        namespace='labels')
    vocab.save_to_files(dir_to_save_vocab_in)
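
`set_from_file` reads one token per line. With `is_padded=True`, index 0 is reserved for padding, the file's lines get indices 1, 2, ..., and the file must contain the `oov_token` (which is stored as the vocabulary's internal @@UNKNOWN@@). With `is_padded=False`, lines map directly to indices 0, 1, .... A minimal sketch of matching files; the file names and contents are illustrative:

from allennlp.data import Vocabulary

with open('tokens_vocab.txt', 'w') as f:
    f.write('<unk>\ncats\nare\nanimals\n')
with open('labels_vocab.txt', 'w') as f:
    f.write('N\nV\n')

vocab = Vocabulary()
vocab.set_from_file('tokens_vocab.txt', is_padded=True,
                    oov_token='<unk>', namespace='tokens')
vocab.set_from_file('labels_vocab.txt', is_padded=False, namespace='labels')
assert vocab.get_token_index('cats', namespace='tokens') == 2  # 0 = padding, 1 = OOV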
Example #6
    def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens" : 3}
        dry_run_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                          'some_weird_token_1', 'some_weird_token_2']

        with open(extended_vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
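
This is the older-API counterpart of Example #4: earlier AllenNLP configs select extension with `directory_path` plus `extend: true` and run `dry_run_from_params`, while later releases use `type: "extend"` with a `directory` key and `train_model(..., dry_run=True)`.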
Example #7
def serialize_model(model: nn.Module, vocab: Vocabulary, output_dir: str):
    output_model_file = os.path.join(output_dir, 'model.pickle')
    vocab.save_to_files(os.path.join(output_dir, 'vocab'))
    torch.save(model.state_dict(), output_model_file)
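
A hedged sketch of the matching load path, using the same `os`, `torch`, `nn`, and `Vocabulary` imports the snippet assumes; `model_class` is a hypothetical stand-in, since the snippet does not say how the module is constructed:

def deserialize_model(model_class, output_dir: str) -> nn.Module:
    # Mirror of serialize_model: restore the vocabulary, rebuild the module
    # (assumed here to take the vocab as its only constructor argument),
    # then load the saved weights.
    vocab = Vocabulary.from_files(os.path.join(output_dir, 'vocab'))
    model = model_class(vocab)
    model.load_state_dict(torch.load(os.path.join(output_dir, 'model.pickle')))
    return model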
Example #8
import json
import argparse

from allennlp.data import Vocabulary

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ontology-path', type=str, required=True)
    parser.add_argument('--output-path', type=str, required=True)
    args = parser.parse_args()

    with open(args.ontology_path) as f:
        ontology = json.load(f)

    vocab = Vocabulary()
    vocab.add_token_to_namespace(token='None', namespace='span_labels')
    vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['args'].keys()),
                                  namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['events'].keys()),
                                  namespace='event_labels')
    vocab.save_to_files(args.output_path)
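
`save_to_files` writes `non_padded_namespaces.txt` plus one `<namespace>.txt` file per namespace (here `span_labels.txt` and `event_labels.txt`), so the output directory can be reloaded directly:

# Reload the directory the script just wrote.
vocab = Vocabulary.from_files(args.output_path)
print(vocab.get_vocab_size('span_labels'), vocab.get_vocab_size('event_labels'))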
Example #9
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        seq2vec_encoder: Seq2VecEncoder,
        seq2seq_encoder: Seq2SeqEncoder = None,
        feedforward: Optional[FeedForward] = None,
        dropout: float = None,
        num_labels: int = None,
        label_namespace: str = "labels",
        namespace: str = "tokens",
        initializer: InitializerApplicator = InitializerApplicator(),
        vocab_save_dir: str = None,
        **kwargs,
    ) -> None:

        super().__init__(vocab, **kwargs)
        self._text_field_embedder = text_field_embedder

        # Optionally persist the vocabulary so it can be reloaded outside
        # the serialization directory.
        self._vocab_save_dir = vocab_save_dir
        if self._vocab_save_dir:
            print(f"Saving vocab to {self._vocab_save_dir} ...")
            vocab.save_to_files(directory=self._vocab_save_dir)

        print("VOCAB STATISTICS:")
        vocab.print_statistics()

        if seq2seq_encoder:
            self._seq2seq_encoder = seq2seq_encoder
        else:
            self._seq2seq_encoder = None

        self._seq2vec_encoder = seq2vec_encoder
        self._feedforward = feedforward
        if feedforward is not None:
            self._classifier_input_dim = self._feedforward.get_output_dim()
        else:
            self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = None
        self._label_namespace = label_namespace
        self._namespace = namespace

        if num_labels:
            self._num_labels = num_labels
        else:
            self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)
        self._classification_layer = torch.nn.Linear(self._classifier_input_dim, self._num_labels)
        self._accuracy = CategoricalAccuracy()
        self._f1 = F1Measure(positive_label=1)
        self._loss = torch.nn.CrossEntropyLoss()
        initializer(self)
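
Saving the vocabulary from the model constructor like this is usually redundant under `allennlp train`, which already writes the vocab to `<serialization_dir>/vocabulary`; it is mainly useful when the model is instantiated outside the standard training command. Reloading later is a one-liner (the path is whatever was passed as `vocab_save_dir`):

from allennlp.data import Vocabulary

vocab = Vocabulary.from_files("/path/to/vocab_save_dir")  # hypothetical path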