Example #1
    def from_vocab(cls,
                   vocab: data.Vocabulary,
                   char_vocab_namespace: str,
                   lemma_vocab_namespace: str,
                   embedding_dim: int,
                   input_projection_layer: base.Linear,
                   filters: List[int],
                   kernel_size: List[int],
                   stride: List[int],
                   padding: List[int],
                   dilation: List[int],
                   activations: List[allen_nn.Activation],
                   ):
        assert char_vocab_namespace in vocab.get_namespaces()
        assert lemma_vocab_namespace in vocab.get_namespaces()

        if len(filters) + 1 != len(kernel_size):
            raise checks.ConfigurationError(
                f"len(filters) ({len(filters):d}) + 1 != len(kernel_size) ({len(kernel_size):d})"
            )
        filters = filters + [vocab.get_vocab_size(lemma_vocab_namespace)]

        dilated_cnn_encoder = dilated_cnn.DilatedCnnEncoder(
            input_dim=embedding_dim + input_projection_layer.get_output_dim(),
            filters=filters,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            activations=activations,
        )
        return cls(num_embeddings=vocab.get_vocab_size(char_vocab_namespace),
                   embedding_dim=embedding_dim,
                   dilated_cnn_encoder=dilated_cnn_encoder,
                   input_projection_layer=input_projection_layer)
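
A minimal, self-contained sketch of the invariant checked above: kernel_size needs exactly one more entry than filters, because the projection onto the lemma vocabulary is appended as the final convolution's channel count (the numbers here are made up):

# Sketch of the filters/kernel_size invariant from Example #1 (made-up numbers).
filters = [512, 256]       # user-specified hidden channel counts
kernel_size = [3, 3, 3]    # one entry per convolution, including the output layer
lemma_vocab_size = 1000    # stand-in for vocab.get_vocab_size(lemma_vocab_namespace)

assert len(filters) + 1 == len(kernel_size)
filters = filters + [lemma_vocab_size]   # final layer projects onto the lemma vocab
assert len(filters) == len(kernel_size)  # now one channel count per kernel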
Example #2
File: base.py Project: ipipan/combo
    def from_vocab(
        cls,
        vocab: data.Vocabulary,
        vocab_namespace: str,
        input_dim: int,
        num_layers: int,
        hidden_dims: List[int],
        activations: Union[Activation, List[Activation]],
        dropout: Union[float, List[float]] = 0.0,
    ):
        if len(hidden_dims) + 1 != num_layers:
            raise checks.ConfigurationError(
                f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
            )

        assert vocab_namespace in vocab.get_namespaces(), \
            f"There is no {vocab_namespace} in the created vocabs; check whether this field has any values to predict!"
        hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]

        return cls(
            feedforward.FeedForward(input_dim=input_dim,
                                    num_layers=num_layers,
                                    hidden_dims=hidden_dims,
                                    activations=activations,
                                    dropout=dropout))
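
For context, a toy AllenNLP vocabulary illustrating the namespace checks these factory methods rely on (assumes allennlp is installed; the namespace name is made up):

# Toy vocabulary demonstrating get_namespaces/get_vocab_size (namespace made up).
from allennlp.data import Vocabulary

vocab = Vocabulary()
for label in ["NOUN", "VERB", "ADJ"]:
    vocab.add_token_to_namespace(label, namespace="upostag_labels")

assert "upostag_labels" in vocab.get_namespaces()
# "*labels" namespaces are non-padded by default, so the size is exactly 3.
print(vocab.get_vocab_size("upostag_labels"))  # 3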
Example #3
File: morpho.py Project: ipipan/combo
    def from_vocab(cls,
                   vocab: data.Vocabulary,
                   vocab_namespace: str,
                   input_dim: int,
                   num_layers: int,
                   hidden_dims: List[int],
                   activations: Union[Activation, List[Activation]],
                   dropout: Union[float, List[float]] = 0.0,
                   ):
        if len(hidden_dims) + 1 != num_layers:
            raise checks.ConfigurationError(
                f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
            )

        assert vocab_namespace in vocab.get_namespaces()
        hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]

        slices = dataset.get_slices_if_not_provided(vocab)

        return cls(
            feedforward_network=feedforward.FeedForward(
                input_dim=input_dim,
                num_layers=num_layers,
                hidden_dims=hidden_dims,
                activations=activations,
                dropout=dropout),
            slices=slices
        )
Example #4
    def from_config(cls,
                    embedding_dim: int,
                    vocab: data.Vocabulary,
                    dilated_cnn_encoder: dilated_cnn.DilatedCnnEncoder,
                    vocab_namespace: str = "token_characters"):
        assert vocab_namespace in vocab.get_namespaces()
        return cls(embedding_dim=embedding_dim,
                   num_embeddings=vocab.get_vocab_size(vocab_namespace),
                   dilated_cnn_encoder=dilated_cnn_encoder)
Example #5
    def from_vocab(cls, vocab: data.Vocabulary, vocab_namespace: str,
                   head_predictor: HeadPredictionModel,
                   head_projection_layer: base.Linear,
                   dependency_projection_layer: base.Linear):
        """Creates a parser combining model configuration and vocabulary data."""
        assert vocab_namespace in vocab.get_namespaces()
        relation_prediction_layer = base.Linear(
            in_features=head_projection_layer.get_output_dim() +
            dependency_projection_layer.get_output_dim(),
            out_features=vocab.get_vocab_size(vocab_namespace))
        return cls(head_predictor=head_predictor,
                   head_projection_layer=head_projection_layer,
                   dependency_projection_layer=dependency_projection_layer,
                   relation_prediction_layer=relation_prediction_layer)
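
A rough plain-PyTorch sketch of the shape logic behind relation_prediction_layer: the concatenated head and dependent projections feed a linear scorer with one output per relation label (dimensions are illustrative, not from the project config):

# Plain-PyTorch sketch of the relation scorer in Example #5 (illustrative dims).
import torch

head_dim, dep_dim, n_relations = 128, 128, 27
relation_prediction_layer = torch.nn.Linear(head_dim + dep_dim, n_relations)

head_repr = torch.randn(4, head_dim)  # batch of projected head representations
dep_repr = torch.randn(4, dep_dim)    # batch of projected dependent representations
scores = relation_prediction_layer(torch.cat([head_repr, dep_repr], dim=-1))
print(scores.shape)  # torch.Size([4, 27]) -- one logit per relation label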
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 make_feedforward: Callable,
                 span_emb_dim: int,
                 feature_size: int,
                 spans_per_word: float,
                 positive_label_weight: float = 1.0,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._namespaces = [
            entry for entry in vocab.get_namespaces()
            if "relation_labels" in entry
        ]
        self._n_labels = {
            name: vocab.get_vocab_size(name)
            for name in self._namespaces
        }

        self._mention_pruners = torch.nn.ModuleDict()
        self._relation_feedforwards = torch.nn.ModuleDict()
        self._relation_scorers = torch.nn.ModuleDict()
        self._relation_metrics = {}

        for namespace in self._namespaces:
            mention_feedforward = make_feedforward(input_dim=span_emb_dim)
            feedforward_scorer = torch.nn.Sequential(
                TimeDistributed(mention_feedforward),
                TimeDistributed(
                    torch.nn.Linear(mention_feedforward.get_output_dim(), 1)))
            self._mention_pruners[namespace] = Pruner(feedforward_scorer)

            relation_scorer_dim = 3 * span_emb_dim
            relation_feedforward = make_feedforward(
                input_dim=relation_scorer_dim)
            self._relation_feedforwards[namespace] = relation_feedforward
            relation_scorer = torch.nn.Linear(
                relation_feedforward.get_output_dim(),
                self._n_labels[namespace])
            self._relation_scorers[namespace] = relation_scorer

            self._relation_metrics[namespace] = RelationMetrics()

        self._spans_per_word = spans_per_word
        self._active_namespace = None

        self._loss = torch.nn.CrossEntropyLoss(reduction="sum",
                                               ignore_index=-1)
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 make_feedforward: Callable,
                 span_emb_dim: int,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(NERTagger, self).__init__(vocab, regularizer)

        self._namespaces = [
            entry for entry in vocab.get_namespaces() if "ner_labels" in entry
        ]

        # The number of classes determines the output dimension of the final layer.
        self._n_labels = {
            name: vocab.get_vocab_size(name)
            for name in self._namespaces
        }

        # The null label needs to be tracked when calculating the metrics.
        for namespace in self._namespaces:
            null_label = vocab.get_token_index("", namespace)
            assert null_label == 0  # If not, the dummy class won't correspond to the null label.

        # The output dim is 1 less than the number of labels because we don't score the null label;
        # we just give it a score of 0 by default.

        # Create a separate scorer and metric for each dataset we're dealing with.
        self._ner_scorers = torch.nn.ModuleDict()
        self._ner_metrics = {}

        for namespace in self._namespaces:
            mention_feedforward = make_feedforward(input_dim=span_emb_dim)
            self._ner_scorers[namespace] = torch.nn.Sequential(
                TimeDistributed(mention_feedforward),
                TimeDistributed(
                    torch.nn.Linear(mention_feedforward.get_output_dim(),
                                    self._n_labels[namespace] - 1)))

            self._ner_metrics[namespace] = NERMetrics(
                self._n_labels[namespace], null_label)

        self._active_namespace = None

        self._loss = torch.nn.CrossEntropyLoss(reduction="sum")
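
The `- 1` in the scorer's output dimension reflects the convention described in the comments: the null label is never scored directly but receives a fixed score of 0. A hedged sketch of how such scores could be combined downstream (this mirrors the comments above, not code shown here):

# Sketch of the implicit-null-score convention from Example #7.
import torch

n_labels, batch, n_spans = 5, 2, 10
logits = torch.randn(batch, n_spans, n_labels - 1)      # real labels only
null_scores = torch.zeros(batch, n_spans, 1)            # null label scored 0
full_scores = torch.cat([null_scores, logits], dim=-1)  # null label at index 0
predictions = full_scores.argmax(dim=-1)                # 0 means "no entity"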
Example #8
def get_slices_if_not_provided(vocab: allen_data.Vocabulary):
    if hasattr(vocab, "slices"):
        return vocab.slices

    if "feats_labels" in vocab.get_namespaces():
        idx2token = vocab.get_index_to_token_vocabulary("feats_labels")
        for _, v in dict(idx2token).items():  # iterate over a copy; add_token_to_namespace mutates the namespace
            if v not in ["_", "__PAD__"]:
                empty_value = v.split("=")[0] + "=None"
                vocab.add_token_to_namespace(empty_value, "feats_labels")

        slices = {}
        for idx, name in vocab.get_index_to_token_vocabulary(
                "feats_labels").items():
            # There are 2 types of features: with an assignment (Case=Acc) or without one (None).
            # Here we group their indices by name (the part before the assignment sign).
            name = name.split("=")[0]
            if name in slices:
                slices[name].append(idx)
            else:
                slices[name] = [idx]
        vocab.slices = slices
        return vocab.slices
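
A self-contained illustration of the grouping step above, using hypothetical morphological feature entries:

# Grouping logic from get_slices_if_not_provided, on made-up feats entries.
idx2token = {0: "_", 1: "Case=Acc", 2: "Case=Nom", 3: "Number=Sing", 4: "Case=None"}

slices = {}
for idx, name in idx2token.items():
    name = name.split("=")[0]               # group by the part before "="
    slices.setdefault(name, []).append(idx)
print(slices)  # {'_': [0], 'Case': [1, 2, 4], 'Number': [3]}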
Example #9
File: events.py Project: MSLars/mare
    def __init__(
            self,
            vocab: Vocabulary,
            make_feedforward: Callable,
            text_emb_dim: int,
            trigger_emb_dim: int,  # Triggers are represented via span embeddings (but can have a different width than arg spans).
            span_emb_dim: int,  # Arguments are represented via span embeddings.
            feature_size: int,
            trigger_spans_per_word: float,
            argument_spans_per_word: float,
            loss_weights: Dict[str, float],
            context_window: int = 0,
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(EventExtractor, self).__init__(vocab, regularizer)

        self._trigger_namespaces = [
            entry for entry in vocab.get_namespaces()
            if "trigger_labels" in entry
        ]
        self._argument_namespaces = [
            entry for entry in vocab.get_namespaces()
            if "argument_labels" in entry
        ]

        self._n_trigger_labels = {
            name: vocab.get_vocab_size(name)
            for name in self._trigger_namespaces
        }
        self._n_argument_labels = {
            name: vocab.get_vocab_size(name)
            for name in self._argument_namespaces
        }

        # Context window
        self._context_window = context_window  # If greater than 0, concatenate context as features.
        context_window_dim = 4 * self._context_window * text_emb_dim
        # 2 (arg context + trig context) * 2 (left context + right context) * context_window * text_emb_dim

        # Make sure the null trigger label is always 0.
        for namespace in self._trigger_namespaces:
            null_label = vocab.get_token_index("", namespace)
            assert null_label == 0  # If not, the dummy class won't correspond to the null label.

        # Create trigger scorers and pruners.
        self._trigger_scorers = torch.nn.ModuleDict()
        self._trigger_pruners = torch.nn.ModuleDict()
        for trigger_namespace in self._trigger_namespaces:
            # The trigger pruner.
            trigger_candidate_feedforward = make_feedforward(
                input_dim=trigger_emb_dim)
            self._trigger_pruners[trigger_namespace] = make_pruner(
                trigger_candidate_feedforward)
            # The trigger scorer.
            trigger_scorer_feedforward = make_feedforward(
                input_dim=trigger_emb_dim)
            self._trigger_scorers[trigger_namespace] = torch.nn.Sequential(
                TimeDistributed(trigger_scorer_feedforward),
                TimeDistributed(
                    torch.nn.Linear(
                        trigger_scorer_feedforward.get_output_dim(),
                        self._n_trigger_labels[trigger_namespace] - 1)))

        # Create argument scorers and pruners.
        self._mention_pruners = torch.nn.ModuleDict()
        self._argument_feedforwards = torch.nn.ModuleDict()
        self._argument_scorers = torch.nn.ModuleDict()
        for argument_namespace in self._argument_namespaces:
            # The argument pruner.
            mention_feedforward = make_feedforward(input_dim=span_emb_dim)
            self._mention_pruners[argument_namespace] = make_pruner(
                mention_feedforward)
            # The argument scorer. The `+ 2` is there because I include indicator features for
            # whether the trigger is before or inside the arg span.

            # set argument feedforward
            argument_feedforward_dim = trigger_emb_dim + span_emb_dim + feature_size + 2 + context_window_dim
            # feature_size + 2 = bucketed distance embedding + 2 position indicator features
            argument_feedforward = make_feedforward(
                input_dim=argument_feedforward_dim)
            self._argument_feedforwards[
                argument_namespace] = argument_feedforward
            self._argument_scorers[argument_namespace] = torch.nn.Linear(
                argument_feedforward.get_output_dim(),
                self._n_argument_labels[argument_namespace])

        # Weight on trigger labeling and argument labeling.
        self._loss_weights = loss_weights

        # Distance embeddings.
        self._num_distance_buckets = 10  # Just use 10, which is the default.
        self._distance_embedding = Embedding(
            embedding_dim=feature_size,
            num_embeddings=self._num_distance_buckets)

        self._trigger_spans_per_word = trigger_spans_per_word
        self._argument_spans_per_word = argument_spans_per_word

        # Metrics
        # Make a metric for each dataset (not each namespace).
        namespaces = self._trigger_namespaces + self._argument_namespaces
        datasets = set([x.split("__")[0] for x in namespaces])
        self._metrics = {dataset: EventMetrics() for dataset in datasets}

        self._active_namespaces = {"trigger": None, "argument": None}
        self._active_dataset = None

        # Trigger and argument loss.
        self._trigger_loss = torch.nn.CrossEntropyLoss(reduction="sum")
        self._argument_loss = torch.nn.CrossEntropyLoss(reduction="sum",
                                                        ignore_index=-1)
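
Two small, self-contained checks of the arithmetic above: the context feature width, and how namespaces are grouped into datasets (values and namespace names are made up):

# Context width and dataset grouping from Example #9 (made-up values/names).
context_window, text_emb_dim = 2, 768
# 2 mentions (trigger + argument) * 2 sides (left + right) * window * emb dim:
context_window_dim = 4 * context_window * text_emb_dim
print(context_window_dim)  # 6144

namespaces = ["ace__trigger_labels", "ace__argument_labels", "ere__trigger_labels"]
datasets = {x.split("__")[0] for x in namespaces}
print(datasets)  # {'ace', 'ere'}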
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 make_feedforward: Callable,
                 token_emb_dim: int,   # Triggers are represented via token embeddings.
                 span_emb_dim: int,    # Arguments are represented via span embeddings.
                 feature_size: int,
                 trigger_spans_per_word: float,
                 argument_spans_per_word: float,
                 loss_weights: Dict[str, float],
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(EventExtractor, self).__init__(vocab, regularizer)

        self._trigger_namespaces = [entry for entry in vocab.get_namespaces()
                                    if "trigger_labels" in entry]
        self._argument_namespaces = [entry for entry in vocab.get_namespaces()
                                     if "argument_labels" in entry]

        self._n_trigger_labels = {name: vocab.get_vocab_size(name)
                                  for name in self._trigger_namespaces}
        self._n_argument_labels = {name: vocab.get_vocab_size(name)
                                   for name in self._argument_namespaces}

        # Make sure the null trigger label is always 0.
        for namespace in self._trigger_namespaces:
            null_label = vocab.get_token_index("", namespace)
            assert null_label == 0  # If not, the dummy class won't correspond to the null label.

        # Create trigger scorers and pruners.
        self._trigger_scorers = torch.nn.ModuleDict()
        self._trigger_pruners = torch.nn.ModuleDict()
        for trigger_namespace in self._trigger_namespaces:
            # The trigger pruner.
            trigger_candidate_feedforward = make_feedforward(input_dim=token_emb_dim)
            self._trigger_pruners[trigger_namespace] = make_pruner(trigger_candidate_feedforward)
            # The trigger scorer.
            trigger_feedforward = make_feedforward(input_dim=token_emb_dim)
            self._trigger_scorers[trigger_namespace] = torch.nn.Sequential(
                TimeDistributed(trigger_feedforward),
                TimeDistributed(torch.nn.Linear(trigger_feedforward.get_output_dim(),
                                                self._n_trigger_labels[trigger_namespace] - 1)))

        # Create argument scorers and pruners.
        self._mention_pruners = torch.nn.ModuleDict()
        self._argument_feedforwards = torch.nn.ModuleDict()
        self._argument_scorers = torch.nn.ModuleDict()
        for argument_namespace in self._argument_namespaces:
            # The argument pruner.
            mention_feedforward = make_feedforward(input_dim=span_emb_dim)
            self._mention_pruners[argument_namespace] = make_pruner(mention_feedforward)
            # The argument scorer. The `+ 2` is there because I include indicator features for
            # whether the trigger is before or inside the arg span.

            # TODO(dwadden) Here
            argument_feedforward_dim = token_emb_dim + span_emb_dim + feature_size + 2
            argument_feedforward = make_feedforward(input_dim=argument_feedforward_dim)
            self._argument_feedforwards[argument_namespace] = argument_feedforward
            self._argument_scorers[argument_namespace] = torch.nn.Linear(
                argument_feedforward.get_output_dim(), self._n_argument_labels[argument_namespace])

        # Weight on trigger labeling and argument labeling.
        self._loss_weights = loss_weights

        # Distance embeddings.
        self._num_distance_buckets = 10  # Just use 10, which is the default.
        self._distance_embedding = Embedding(embedding_dim=feature_size,
                                             num_embeddings=self._num_distance_buckets)

        self._trigger_spans_per_word = trigger_spans_per_word
        self._argument_spans_per_word = argument_spans_per_word

        # Metrics
        # TODO(dwadden) Need different metrics for different namespaces.
        self._metrics = EventMetrics()

        self._active_namespaces = {"trigger": None, "argument": None}

        # Trigger and argument loss.
        self._trigger_loss = torch.nn.CrossEntropyLoss(reduction="sum")
        self._argument_loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-1)
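
A plain-PyTorch sketch of the argument-scorer input layout implied by argument_feedforward_dim (dimensions are illustrative):

# Argument-scorer input layout from Example #10 (illustrative dimensions).
import torch

token_emb_dim, span_emb_dim, feature_size = 768, 1536, 20
trigger = torch.randn(4, token_emb_dim)  # trigger token embeddings
span = torch.randn(4, span_emb_dim)      # argument span embeddings
distance = torch.randn(4, feature_size)  # stand-in for the bucketed-distance Embedding
indicators = torch.zeros(4, 2)           # trigger-before-span / trigger-inside-span flags
pair = torch.cat([trigger, span, distance, indicators], dim=-1)
assert pair.shape[-1] == token_emb_dim + span_emb_dim + feature_size + 2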