Example #1
 def __init__(
     self,
     num_heads: int,
     tensor_1_dim: int,
     tensor_1_projected_dim: int = None,
     tensor_2_dim: int = None,
     tensor_2_projected_dim: int = None,
     internal_similarity: SimilarityFunction = DotProductSimilarity()
 ) -> None:
     super(MultiHeadedSimilarity, self).__init__()
     self.num_heads = num_heads
     self._internal_similarity = internal_similarity
     tensor_1_projected_dim = tensor_1_projected_dim or tensor_1_dim
     tensor_2_dim = tensor_2_dim or tensor_1_dim
     tensor_2_projected_dim = tensor_2_projected_dim or tensor_2_dim
     if tensor_1_projected_dim % num_heads != 0:
         raise ConfigurationError(
             "Projected dimension not divisible by number of heads: %d, %d"
             % (tensor_1_projected_dim, num_heads))
     if tensor_2_projected_dim % num_heads != 0:
         raise ConfigurationError(
             "Projected dimension not divisible by number of heads: %d, %d"
             % (tensor_2_projected_dim, num_heads))
     self._tensor_1_projection = Parameter(
         torch.Tensor(tensor_1_dim, tensor_1_projected_dim))
     self._tensor_2_projection = Parameter(
         torch.Tensor(tensor_2_dim, tensor_2_projected_dim))
     self.reset_parameters()
Example #2
    def __init__(self,
                 input_dim: int,
                 projection_dim: int = None,
                 similarity_function: SimilarityFunction = DotProductSimilarity(),
                 num_attention_heads: int = 1,
                 combination: str = '1,2',
                 output_dim: int = None) -> None:
        super(IntraSentenceAttentionEncoder, self).__init__()
        self._input_dim = input_dim
        if projection_dim:
            self._projection = torch.nn.Linear(input_dim, projection_dim)
        else:
            self._projection = lambda x: x
            projection_dim = input_dim
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._num_attention_heads = num_attention_heads
        if isinstance(similarity_function, MultiHeadedSimilarity):
            if num_attention_heads == 1:
                raise ConfigurationError("Similarity function has multiple heads but encoder doesn't")
            if num_attention_heads != similarity_function.num_heads:
                raise ConfigurationError("Number of heads don't match between similarity function "
                                         "and encoder: %d, %d" % (num_attention_heads,
                                                                  similarity_function.num_heads))
        elif num_attention_heads > 1:
            raise ConfigurationError("Encoder has multiple heads but similarity function doesn't")
        self._combination = combination

        combined_dim = util.get_combined_dim(combination, [input_dim, projection_dim])
        if output_dim:
            self._output_projection = Linear(combined_dim, output_dim)
            self._output_dim = output_dim
        else:
            self._output_projection = lambda x: x
            self._output_dim = combined_dim
Example #3
    def __init__(self,
                 labels: Sequence[Union[str, int]],
                 label_namespace: str = 'labels',
                 skip_indexing: bool = False,
                 num_labels: Optional[int] = None) -> None:
        self.labels = labels
        self._label_namespace = label_namespace
        self._label_ids = None
        self._maybe_warn_for_namespace(label_namespace)
        self._num_labels = num_labels

        if skip_indexing:
            if not all(isinstance(label, int) for label in labels):
                raise ConfigurationError("In order to skip indexing, your labels must be integers. "
                                         "Found labels = {}".format(labels))
            if not num_labels:
                raise ConfigurationError("In order to skip indexing, num_labels can't be None.")

            if not all(cast(int, label) < num_labels for label in labels):
                raise ConfigurationError("All labels should be < num_labels. "
                                         "Found num_labels = {} and labels = {} ".format(num_labels, labels))

            self._label_ids = labels
        else:
            if not all(isinstance(label, str) for label in labels):
                raise ConfigurationError("MultiLabelFields expects string labels if skip_indexing=False. "
                                         "Found labels: {}".format(labels))
Example #4
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tag_label: str = "ner",
                 feature_labels: Sequence[str] = (),
                 lazy: bool = False,
                 coding_scheme: str = "IOB1",
                 label_namespace: str = "labels",
                 ignore_ner_tags: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        if tag_label is not None and tag_label not in _VALID_LABELS:
            raise ConfigurationError(
                "unknown tag label type: {}".format(tag_label))
        for label in feature_labels:
            if label not in _VALID_LABELS:
                raise ConfigurationError(
                    "unknown feature label type: {}".format(label))
        if coding_scheme not in ("IOB1", "BIOUL"):
            raise ConfigurationError(
                "unknown coding_scheme: {}".format(coding_scheme))

        self.tag_label = tag_label
        self.feature_labels = set(feature_labels)
        self.coding_scheme = coding_scheme
        self.label_namespace = label_namespace
        self.ignore_ner_tags = ignore_ner_tags
Example #5
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField(
            {"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = to_bioul(
                chunk_tags) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
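Since text_to_instance takes pre-tokenized input, a minimal usage sketch looks like this (the reader instance, the import, and the tag values are illustrative assumptions, not part of the original):

# Hypothetical usage sketch, assuming `reader` is a constructed instance of this
# dataset reader with tag_label="ner" and no feature_labels.
from allennlp.data.tokenizers import Token

tokens = [Token(t) for t in ["John", "lives", "in", "Boston"]]
ner_tags = ["B-PER", "O", "O", "B-LOC"]
instance = reader.text_to_instance(tokens, ner_tags=ner_tags)
# instance.fields now contains "tokens", "metadata" and (because tag_label="ner") "tags".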
Example #6
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # os.listdir never returns None; the intent is to check for a non-empty directory.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #7
def unflatten(flat_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Given a "flattened" dict with compound keys, e.g.
        {"a.b": 0}
    unflatten it:
        {"a": {"b": 0}}
    """
    unflat: Dict[str, Any] = {}

    for compound_key, value in flat_dict.items():
        curr_dict = unflat
        parts = compound_key.split(".")
        for key in parts[:-1]:
            curr_value = curr_dict.get(key)
            if key not in curr_dict:
                curr_dict[key] = {}
                curr_dict = curr_dict[key]
            elif isinstance(curr_value, dict):
                curr_dict = curr_value
            else:
                raise ConfigurationError("flattened dictionary is invalid")
        if not isinstance(curr_dict, dict) or parts[-1] in curr_dict:
            raise ConfigurationError("flattened dictionary is invalid")
        else:
            curr_dict[parts[-1]] = value

    return unflat
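A quick usage sketch of unflatten, following the docstring's contract (the inputs are illustrative):

# Hypothetical usage sketch.
nested = unflatten({"a.b": 0, "a.c": 1, "d": 2})
assert nested == {"a": {"b": 0, "c": 1}, "d": 2}

# A key that is both a leaf and a prefix of another key is invalid:
# unflatten({"a": 1, "a.b": 2})  # raises ConfigurationError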
Example #8
    def __init__(self,
                 hidden_dim: int = 100,
                 num_perspectives: int = 20,
                 share_weights_between_directions: bool = True,
                 is_forward: bool = None,
                 with_full_match: bool = True,
                 with_maxpool_match: bool = True,
                 with_attentive_match: bool = True,
                 with_max_attentive_match: bool = True) -> None:
        super(BiMpmMatching, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_perspectives = num_perspectives
        self.is_forward = is_forward

        self.with_full_match = with_full_match
        self.with_maxpool_match = with_maxpool_match
        self.with_attentive_match = with_attentive_match
        self.with_max_attentive_match = with_max_attentive_match

        if not (with_full_match or with_maxpool_match or with_attentive_match or with_max_attentive_match):
            raise ConfigurationError("At least one of the matching method should be enabled")

        def create_parameter():  # utility function to create and initialize a parameter
            param = nn.Parameter(torch.zeros(num_perspectives, hidden_dim))
            torch.nn.init.kaiming_normal_(param)
            return param

        def share_or_create(weights_to_share):  # utility function to create or share the weights
            return weights_to_share if share_weights_between_directions else create_parameter()

        output_dim = 2  # used to calculate total output dimension, 2 is for cosine max and cosine min
        if with_full_match:
            if is_forward is None:
                raise ConfigurationError("Must specify is_forward to enable full matching")
            self.full_match_weights = create_parameter()
            self.full_match_weights_reversed = share_or_create(self.full_match_weights)
            output_dim += num_perspectives + 1

        if with_maxpool_match:
            self.maxpool_match_weights = create_parameter()
            output_dim += num_perspectives * 2

        if with_attentive_match:
            self.attentive_match_weights = create_parameter()
            self.attentive_match_weights_reversed = share_or_create(self.attentive_match_weights)
            output_dim += num_perspectives + 1

        if with_max_attentive_match:
            self.max_attentive_match_weights = create_parameter()
            self.max_attentive_match_weights_reversed = share_or_create(self.max_attentive_match_weights)
            output_dim += num_perspectives + 1

        self.output_dim = output_dim
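With num_perspectives=20 and all four matching methods enabled (which also requires passing is_forward), the output_dim bookkeeping above works out to 105; a quick check:

# Worked example of the output_dim accounting with the default num_perspectives.
num_perspectives = 20
output_dim = 2                        # cosine max and cosine min
output_dim += num_perspectives + 1    # full match
output_dim += num_perspectives * 2    # maxpool match
output_dim += num_perspectives + 1    # attentive match
output_dim += num_perspectives + 1    # max attentive match
assert output_dim == 105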
Example #9
 def __init__(self, top_k: int = 1, tie_break: bool = False) -> None:
     if top_k > 1 and tie_break:
         raise ConfigurationError(
             "Tie break in Categorical Accuracy "
             "can be done only for maximum (top_k = 1)")
     if top_k <= 0:
         raise ConfigurationError(
             "top_k passed to Categorical Accuracy must be > 0")
     self._top_k = top_k
     self._tie_break = tie_break
     self.correct_count = 0.
     self.total_count = 0.
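Assuming this is the constructor of AllenNLP's CategoricalAccuracy metric, the validation above allows either a larger top_k or tie-breaking, but not both (a hypothetical sketch):

# Hypothetical usage sketch.
metric = CategoricalAccuracy(top_k=5)          # fine
metric = CategoricalAccuracy(tie_break=True)   # fine, top_k defaults to 1
# CategoricalAccuracy(top_k=5, tie_break=True) # raises ConfigurationError
# CategoricalAccuracy(top_k=0)                 # raises ConfigurationError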
Example #10
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # os.listdir never returns None; the intent is to check for a non-empty directory.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
            get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
Example #11
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))
            return instances
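A brief sketch of the two modes the docstring describes (MyDatasetReader and the file path are illustrative assumptions):

# Hypothetical usage sketch.
eager_reader = MyDatasetReader(lazy=False)
instances = eager_reader.read("data/train.txt")      # a fully materialized list

lazy_reader = MyDatasetReader(lazy=True)
lazy_instances = lazy_reader.read("data/train.txt")  # a _LazyInstances wrapper
for instance in lazy_instances:                      # _read() runs again on each iteration
    pass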
Example #12
 def forward(self,
             text_field_input: Dict[str, torch.Tensor],
             num_wrapping_dims: int = 0) -> torch.Tensor:
     if self._token_embedders.keys() != text_field_input.keys():
         if not self._allow_unmatched_keys:
             message = "Mismatched token keys: %s and %s" % (str(
                 self._token_embedders.keys()), str(
                     text_field_input.keys()))
             raise ConfigurationError(message)
     embedded_representations = []
     keys = sorted(self._token_embedders.keys())
     for key in keys:
          # If we pre-specified a mapping explicitly, use that.
         if self._embedder_to_indexer_map is not None:
             tensors = [
                 text_field_input[indexer_key]
                 for indexer_key in self._embedder_to_indexer_map[key]
             ]
         else:
             # otherwise, we assume the mapping between indexers and embedders
             # is bijective and just use the key directly.
             tensors = [text_field_input[key]]
         # Note: need to use getattr here so that the pytorch voodoo
         # with submodules works with multiple GPUs.
         embedder = getattr(self, 'token_embedder_{}'.format(key))
         for _ in range(num_wrapping_dims):
             embedder = TimeDistributed(embedder)
         token_vectors = embedder(*tensors)
         embedded_representations.append(token_vectors)
     return torch.cat(embedded_representations, dim=-1)
Example #13
 def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
     if not params.pop('batch_first', True):
         raise ConfigurationError("Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params['batch_first'] = True
     module = self._module_class(**params.as_dict())
     return PytorchSeq2VecWrapper(module)
Example #14
    def pop_choice(self,
                   key: str,
                   choices: List[Any],
                   default_to_first_choice: bool = False) -> Any:
        """
        Gets the value of ``key`` in the ``params`` dictionary, ensuring that the value is one of
        the given choices. Note that this `pops` the key from params, modifying the dictionary,
        consistent with how parameters are processed in this codebase.

        Parameters
        ----------
        key: str
            Key to get the value from in the param dictionary
        choices: List[Any]
            A list of valid options for values corresponding to ``key``.  For example, if you're
            specifying the type of encoder to use for some part of your model, the choices might be
            the list of encoder classes we know about and can instantiate.  If the value we find in
            the param dictionary is not in ``choices``, we raise a ``ConfigurationError``, because
            the user specified an invalid value in their parameter file.
        default_to_first_choice: bool, optional (default=False)
            If this is ``True``, we allow the ``key`` to not be present in the parameter
            dictionary.  If the key is not present, we will use the first choice in the
            ``choices`` list as the value.  If this is ``False``, we raise a
            ``ConfigurationError``, because specifying the ``key`` is required (e.g., you `have` to
            specify your model class when running an experiment, but you can feel free to use
            default settings for encoders if you want).
        """
        default = choices[0] if default_to_first_choice else self.DEFAULT
        value = self.pop(key, default)
        if value not in choices:
            key_str = self.history + key
            message = '%s not in acceptable choices for %s: %s' % (
                value, key_str, str(choices))
            raise ConfigurationError(message)
        return value
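A short usage sketch under the semantics documented above (the keys and choices are illustrative):

# Hypothetical usage sketch.
params = Params({"type": "lstm", "hidden_size": 100})
encoder_type = params.pop_choice("type", ["lstm", "gru", "transformer"])
# encoder_type == "lstm", and the "type" key has been popped from params.

# A missing key with default_to_first_choice=True falls back to "lstm";
# with the default False it would raise a ConfigurationError instead:
cell_type = params.pop_choice("cell", ["lstm", "gru"], default_to_first_choice=True)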
Example #15
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 num_output_representations: int,
                 requires_grad: bool = False,
                 do_layer_norm: bool = False,
                 dropout: float = 0.5,
                 vocab_to_cache: List[str] = None,
                 module: torch.nn.Module = None) -> None:
        super(Elmo, self).__init__()

        logging.info("Initializing ELMo")
        if module is not None:
            if options_file is not None or weight_file is not None:
                raise ConfigurationError(
                    "Don't provide options_file or weight_file with module")
            self._elmo_lstm = module
        else:
            self._elmo_lstm = _ElmoBiLm(options_file,
                                        weight_file,
                                        requires_grad=requires_grad,
                                        vocab_to_cache=vocab_to_cache)
        self._has_cached_vocab = vocab_to_cache is not None
        self._dropout = Dropout(p=dropout)
        self._scalar_mixes: Any = []
        for k in range(num_output_representations):
            scalar_mix = ScalarMix(self._elmo_lstm.num_layers,
                                   do_layer_norm=do_layer_norm)
            self.add_module('scalar_mix_{}'.format(k), scalar_mix)
            self._scalar_mixes.append(scalar_mix)
Example #16
    def __init__(self,
                 input_dim: int,
                 combination: str = "x,y",
                 num_width_embeddings: int = None,
                 span_width_embedding_dim: int = None,
                 bucket_widths: bool = False,
                 use_exclusive_start_indices: bool = False) -> None:
        super().__init__()
        self._input_dim = input_dim
        self._combination = combination
        self._num_width_embeddings = num_width_embeddings
        self._bucket_widths = bucket_widths

        self._use_exclusive_start_indices = use_exclusive_start_indices
        if use_exclusive_start_indices:
            self._start_sentinel = Parameter(
                torch.randn([1, 1, int(input_dim)]))

        if num_width_embeddings is not None and span_width_embedding_dim is not None:
            self._span_width_embedding = Embedding(num_width_embeddings,
                                                   span_width_embedding_dim)
        elif not all(
            [num_width_embeddings is None, span_width_embedding_dim is None]):
            raise ConfigurationError(
                "To use a span width embedding representation, you must"
                "specify both num_width_buckets and span_width_embedding_dim.")
        else:
            self._span_width_embedding = None
Example #17
    def forward(
            self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            A tuple (state, memory) representing the initial hidden state and memory
            of the LSTM. Each tensor has shape (1, batch_size, output_dimension * 2).

        Returns
        -------
        output_sequence : PackedSequence
            The encoded sequence of shape (batch_size, sequence_length, hidden_size * 2)
        final_states: torch.Tensor
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size * 2).
        """
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers)
        elif initial_state[0].size()[0] != len(self.lstm_layers):
            raise ConfigurationError(
                "Initial states were passed to forward() but the number of "
                "initial states does not match the number of layers.")
        else:
            hidden_states = list(zip(initial_state[0].split(1, 0),
                                     initial_state[1].split(1, 0)))

        output_sequence = inputs
        final_states = []
        for i, state in enumerate(hidden_states):
            forward_layer = getattr(self, 'forward_layer_{}'.format(i))
            backward_layer = getattr(self, 'backward_layer_{}'.format(i))
            # The state is duplicated to mirror the Pytorch API for LSTMs.
            forward_output, final_forward_state = forward_layer(
                output_sequence, state)
            backward_output, final_backward_state = backward_layer(
                output_sequence, state)

            forward_output, lengths = pad_packed_sequence(forward_output,
                                                          batch_first=True)
            backward_output, _ = pad_packed_sequence(backward_output,
                                                     batch_first=True)

            output_sequence = torch.cat([forward_output, backward_output], -1)
            output_sequence = pack_padded_sequence(output_sequence,
                                                   lengths,
                                                   batch_first=True)
            final_states.append(
                [torch.cat(both_direction_states, -1)
                 for both_direction_states in zip(final_forward_state,
                                                  final_backward_state)])

        # Materialize the concatenations eagerly; appending generators here
        # would defer evaluation until after the loop variables had changed.
        final_state_tuple = tuple(torch.cat(state_list, 0)
                                  for state_list in zip(*final_states))
        return output_sequence, final_state_tuple
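A minimal sketch of driving this forward pass with a packed batch (the constructor name and arguments are assumptions about the enclosing stacked bidirectional LSTM class, not taken from the original):

# Hypothetical usage sketch.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

batch = torch.randn(2, 7, 50)   # (batch_size, sequence_length, input_dim)
packed = pack_padded_sequence(batch, [7, 5], batch_first=True)
lstm = StackedBidirectionalLstm(input_size=50, hidden_size=25, num_layers=3)
output_sequence, (state, memory) = lstm(packed)
# state and memory each have shape (num_layers, batch_size, hidden_size * 2).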
Example #18
 def _read(self, file_path: str):
     if file_path.endswith('.examples'):
         yield from self._read_examples_file(file_path)
     elif file_path.endswith('.jsonl'):
         yield from self._read_preprocessed_file(file_path)
     else:
         raise ConfigurationError(
             f"Don't know how to read filetype of {file_path}")
Example #19
    def __init__(self, index: int, sequence_field: SequenceField) -> None:
        self.sequence_index = index
        self.sequence_field = sequence_field

        if not isinstance(index, int):
            raise ConfigurationError(
                "IndexFields must be passed integer indices. "
                "Found index: {} with type: {}.".format(index, type(index)))
Example #20
 def add_subclass_to_registry(subclass: Type[T]):
     # Add to registry, raise an error if key has already been used.
     if name in registry:
         message = "Cannot register %s as %s; name already in use for %s" % (
             name, cls.__name__, registry[name].__name__)
         raise ConfigurationError(message)
     registry[name] = subclass
     return subclass
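This closure presumably lives inside a register classmethod that supplies name, cls and registry; a reconstructed sketch of that enclosing scope (not verbatim from the original):

# Hypothetical sketch of the enclosing scope.
@classmethod
def register(cls: Type[T], name: str):
    registry = Registrable._registry[cls]

    def add_subclass_to_registry(subclass: Type[T]):
        ...  # the body shown in Example #20
        return subclass

    return add_subclass_to_registry

# Usage: decorate a subclass with @MyBaseClass.register("my-name").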
Example #21
 def __init__(self, module: torch.nn.modules.RNNBase) -> None:
     # Seq2VecEncoders cannot be stateful.
     super(PytorchSeq2VecWrapper, self).__init__(stateful=False)
     self._module = module
     try:
         if not self._module.batch_first:
             raise ConfigurationError("Our encoder semantics assumes batch is always first!")
     except AttributeError:
         pass
Example #22
    def __init__(self,
                 encoder: Dict[str, int] = None,
                 byte_pairs: List[Tuple[str, str]] = None,
                 n_ctx: int = 512,
                 model_path: str = None) -> None:

        too_much_information = model_path and (encoder or byte_pairs)
        too_little_information = not model_path and not (encoder and byte_pairs)

        if too_much_information or too_little_information:
            raise ConfigurationError("must specify either model path or (encoder + byte_pairs) but not both")

        if model_path:
            model_path = cached_path(model_path)

            # Load encoder and byte_pairs from tar.gz
            with tarfile.open(model_path) as tmp:
                encoder_name = next(m.name for m in tmp.getmembers() if 'encoder_bpe' in m.name)
                encoder_info = tmp.extractfile(encoder_name)

                if encoder_info:
                    encoder = json.loads(encoder_info.read())
                else:
                    raise ConfigurationError(f"expected encoder_bpe file in archive {model_path}")

                bpe_name = next(m.name for m in tmp.getmembers() if m.name.endswith('.bpe'))
                bpe_info = tmp.extractfile(bpe_name)

                if bpe_info:
                    # First line is "version", last line is blank
                    lines = bpe_info.read().decode('utf-8').split('\n')[1:-1]
                    # Convert "b1 b2" -> (b1, b2)
                    byte_pairs = [tuple(line.split()) for line in lines]  # type: ignore
                else:
                    raise ConfigurationError(f"expected .bpe file in archive {model_path}")

        self.encoder = encoder
        self.decoder = {word_id: word for word, word_id in self.encoder.items()}

        # Compute ranks
        self.bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}

        self.cache: Dict[str, List[str]] = {}
        self.n_ctx = n_ctx
Example #23
    def __init__(self,
                 label: Union[str, int],
                 label_namespace: str = 'labels',
                 skip_indexing: bool = False) -> None:
        self.label = label
        self._label_namespace = label_namespace
        self._label_id = None
        self._maybe_warn_for_namespace(label_namespace)

        if skip_indexing:
            if not isinstance(label, int):
                raise ConfigurationError("In order to skip indexing, your labels must be integers. "
                                         "Found label = {}".format(label))
            else:
                self._label_id = label
        else:
            if not isinstance(label, str):
                raise ConfigurationError("LabelFields must be passed a string label if skip_indexing=False. "
                                         "Found label: {} with type: {}.".format(label, type(label)))
Example #24
 def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
     if token.text is None:
         raise ConfigurationError(
             'TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         # If `text_id` is set on the character token (e.g., if we're using byte encoding), we
         # will not be using the vocab for this character.
         if getattr(character, 'text_id', None) is None:
             counter[self._namespace][character.text] += 1
Example #25
 def assert_empty(self, class_name: str):
     """
     Raises a ``ConfigurationError`` if ``self.params`` is not empty.  We take ``class_name`` as
     an argument so that the error message gives some idea of where an error happened, if there
     was one.  ``class_name`` should be the name of the `calling` class, the one that got extra
     parameters (if there are any).
     """
     if self.params:
         raise ConfigurationError(
             "Extra parameters passed to {}: {}".format(
                 class_name, self.params))
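A short sketch of the intended call pattern, where assert_empty is invoked after all expected keys have been popped (the class and key names are illustrative):

# Hypothetical usage sketch.
params = Params({"hidden_size": 100, "droput": 0.2})  # note the misspelled key
hidden_size = params.pop("hidden_size")
params.assert_empty("MyModel")
# raises ConfigurationError: Extra parameters passed to MyModel: {'droput': 0.2}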
Example #26
    def __init__(self, submodels: List[Model]) -> None:
        vocab = submodels[0].vocab
        for submodel in submodels:
            if submodel.vocab != vocab:
                raise ConfigurationError("Vocabularies in ensemble differ")

        super(Ensemble, self).__init__(vocab, None)

        # Using ModuleList propagates calls to .eval() so dropout is disabled on the submodels in evaluation
        # and prediction.
        self.submodels = torch.nn.ModuleList(submodels)
Example #27
    def list_available(cls) -> List[str]:
        """List default first if it exists"""
        keys = list(Registrable._registry[cls].keys())
        default = cls.default_implementation

        if default is None:
            return keys
        elif default not in keys:
            message = "Default implementation %s is not registered" % default
            raise ConfigurationError(message)
        else:
            return [default] + [k for k in keys if k != default]
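A sketch of what the ordering guarantee means in practice (the class and registered names are illustrative):

# Hypothetical usage sketch: suppose MyEncoder.default_implementation == "lstm"
# and "cnn", "gru", "lstm" are registered. The default comes first, the rest
# keep their registry order:
assert MyEncoder.list_available() == ["lstm", "cnn", "gru"]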
Example #28
    def __init__(self, tokens: List[Token],
                 token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(
                                         tokens, [type(x) for x in tokens]))
Example #29
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'BidafEnsemble':  # type: ignore
        # pylint: disable=arguments-differ
        if vocab:
            raise ConfigurationError("vocab should be None")

        submodels = []
        paths = params.pop("submodels")
        for path in paths:
            submodels.append(load_archive(path).model)

        return cls(submodels=submodels)
Example #30
    def __init__(self,
                 labels: Union[List[str], List[int]],
                 sequence_field: SequenceField,
                 label_namespace: str = 'labels') -> None:
        self.labels = labels
        self.sequence_field = sequence_field
        self._label_namespace = label_namespace
        self._indexed_labels = None
        self._maybe_warn_for_namespace(label_namespace)
        if len(labels) != sequence_field.sequence_length():
            raise ConfigurationError(
                "Label length and sequence length "
                "don't match: %d and %d" %
                (len(labels), sequence_field.sequence_length()))

        if all([isinstance(x, int) for x in labels]):
            self._indexed_labels = labels

        elif not all([isinstance(x, str) for x in labels]):
            raise ConfigurationError(
                "SequenceLabelFields must be passed either all "
                "strings or all ints. Found labels {} with "
                "types: {}.".format(labels, [type(x) for x in labels]))