def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    # If extend is False, it is the user's responsibility to make sure that the
    # dataset instances will be indexable by the provided vocabulary. At least
    # @@UNKNOWN@@ should be present in every namespace for which OOV entries
    # could be seen in the dataset during indexing. For the 'tokens' namespace,
    # new words will be seen, but 'tokens' has the @@UNKNOWN@@ token. The
    # 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    # If the vocabulary is not extended, it is the user's responsibility to make
    # sure that the dataset instances will be indexable by the provided
    # vocabulary. At least @@UNKNOWN@@ should be present in every namespace for
    # which OOV entries could be seen in the dataset during indexing. For the
    # "tokens" namespace, new words will be seen, but "tokens" has the
    # @@UNKNOWN@@ token. The "labels" namespace has no @@UNKNOWN@@, so "N" and
    # "V" must be added upfront.
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    vocab.add_token_to_namespace("N", namespace="labels")
    vocab.add_token_to_namespace("V", namespace="labels")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "from_files"
    self.params["vocabulary"]["directory"] = existing_vocab_path
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"
    assert len(tokens) == 3
def _save_vocab_to_disk(self, vocab: Vocabulary) -> str:
    """Saves the vocab to disk so it can be reused between the trials

    Parameters
    ----------
    vocab
        Vocabulary to be saved to disk

    Returns
    -------
    vocab_path
        Path to the saved vocabulary, which is a directory
    """
    tmp_dir = tempfile.TemporaryDirectory()
    # Keep a reference to the TemporaryDirectory so it is not cleaned up
    # while trials are still using it.
    self._created_tmp_dirs.append(tmp_dir)
    vocab_path = tmp_dir.name
    vocab.save_to_files(vocab_path)

    # Make sure that we can load the vocab successfully
    try:
        Vocabulary.from_files(vocab_path)
    except Exception as exception:
        raise ValidationError(f"Could not load vocab saved in '{vocab_path}'") from exception

    return vocab_path
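# A minimal standalone sketch of the same save-then-reload round trip, outside
# any class context. The token and the temporary directory are illustrative.
import tempfile

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace('example', namespace='tokens')
with tempfile.TemporaryDirectory() as vocab_path:
    # save_to_files writes one <namespace>.txt file per namespace plus
    # non_padded_namespaces.txt into the given directory.
    vocab.save_to_files(vocab_path)
    reloaded = Vocabulary.from_files(vocab_path)
    assert (reloaded.get_token_index('example', namespace='tokens')
            == vocab.get_token_index('example', namespace='tokens'))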
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "extend"
    self.params["vocabulary"]["directory"] = str(existing_vocab_path)
    self.params["vocabulary"]["min_count"] = {"tokens": 3}
    train_model(self.params, extended_serialization_dir, dry_run=True)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {
        ".lock",
        "labels.txt",
        "non_padded_namespaces.txt",
        "tokens.txt",
    }

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"

    tokens.sort()
    assert tokens == [
        ".",
        "@@UNKNOWN@@",
        "animals",
        "are",
        "some_weird_token_1",
        "some_weird_token_2",
    ]

    with open(extended_vocab_path / "labels.txt") as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ["N", "V"]
def save_vocab_in_allennlp_format():
    # filepaths_of_data_to_train_on, unk_token, the *_file_ending constants and
    # dir_to_save_vocab_in are assumed to be defined at module level.
    first_data_filepath = filepaths_of_data_to_train_on[0]
    base_filepath = first_data_filepath[:first_data_filepath.rfind('.')]
    numless_vocab_file = base_filepath + vocabword_ind_not_numbered_file_ending
    numless_label_file = base_filepath + label_ind_not_numbered_file_ending

    vocab = Vocabulary()
    vocab.set_from_file(numless_vocab_file, is_padded=True, oov_token=unk_token,
                        namespace='tokens')
    vocab.set_from_file(numless_label_file, is_padded=False, namespace='labels')
    vocab.save_to_files(dir_to_save_vocab_in)
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {"tokens": 3}
    dry_run_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ['N', 'V']
def serialize_model(model: nn.Module, vocab: Vocabulary, output_dir: str):
    # Write the vocabulary next to the pickled model weights so that both
    # artifacts can be restored together later.
    output_model_file = os.path.join(output_dir, 'model.pickle')
    vocab.save_to_files(os.path.join(output_dir, 'vocab'))
    torch.save(model.state_dict(), output_model_file)
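# A possible counterpart to serialize_model for loading the artifacts back.
# `build_model` is a hypothetical factory that reconstructs the architecture
# for the restored vocabulary; Vocabulary.from_files, torch.load and
# load_state_dict are the real APIs used here.
import os

import torch
from allennlp.data import Vocabulary


def deserialize_model(build_model, output_dir: str):
    # Restore the vocabulary that serialize_model wrote next to the weights.
    vocab = Vocabulary.from_files(os.path.join(output_dir, 'vocab'))
    # Rebuild the architecture, then load the saved weights into it.
    model = build_model(vocab)
    model.load_state_dict(torch.load(os.path.join(output_dir, 'model.pickle')))
    return model, vocab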
import json
import argparse

from allennlp.data import Vocabulary

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ontology-path', type=str, required=True)
    parser.add_argument('--output-path', type=str, required=True)
    args = parser.parse_args()

    with open(args.ontology_path) as f:
        ontology = json.load(f)

    vocab = Vocabulary()
    vocab.add_token_to_namespace(token='None', namespace='span_labels')
    vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['args'].keys()),
                                  namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['events'].keys()),
                                  namespace='event_labels')
    vocab.save_to_files(args.output_path)
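# A hypothetical follow-up to the script above: reloading the saved vocabulary.
# The 'vocab/' directory name is illustrative (whatever was passed as
# --output-path).
from allennlp.data import Vocabulary

vocab = Vocabulary.from_files('vocab/')
# save_to_files wrote one <namespace>.txt file per namespace, so both label
# namespaces round-trip.
print(vocab.get_vocab_size('span_labels'))
print(vocab.get_vocab_size('event_labels'))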
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    seq2vec_encoder: Seq2VecEncoder,
    seq2seq_encoder: Seq2SeqEncoder = None,
    feedforward: Optional[FeedForward] = None,
    dropout: float = None,
    num_labels: int = None,
    label_namespace: str = "labels",
    namespace: str = "tokens",
    initializer: InitializerApplicator = InitializerApplicator(),
    vocab_save_dir: str = None,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)
    self._text_field_embedder = text_field_embedder

    # Optionally persist the vocabulary to disk for later reuse.
    self._vocab_save_dir = vocab_save_dir
    if self._vocab_save_dir:
        print("Saving Vocab to %s ..." % self._vocab_save_dir)
        vocab.save_to_files(directory=self._vocab_save_dir)
        print("VOCAB STATISTICS:")
        vocab.print_statistics()

    if seq2seq_encoder:
        self._seq2seq_encoder = seq2seq_encoder
    else:
        self._seq2seq_encoder = None

    self._seq2vec_encoder = seq2vec_encoder
    self._feedforward = feedforward
    if feedforward is not None:
        self._classifier_input_dim = self._feedforward.get_output_dim()
    else:
        self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = None

    self._label_namespace = label_namespace
    self._namespace = namespace

    if num_labels:
        self._num_labels = num_labels
    else:
        self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)
    self._classification_layer = torch.nn.Linear(self._classifier_input_dim, self._num_labels)

    self._accuracy = CategoricalAccuracy()
    self._f1 = F1Measure(positive_label=1)
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)