示例#1
0
    def from_params(  # type: ignore
            cls, vocab: Vocabulary,
            params: Params) -> "ElmoTokenEmbedderMultiLang":
        """Build a multilingual ELMo token embedder from ``params``.

        ``options_files`` and ``weight_files`` are required; every key of each
        (iterated as ``lang``) is registered via ``add_file_to_archive``. The
        remaining keys are optional. If ``namespace_to_cache`` is given, the
        tokens of that vocabulary namespace are handed to the constructor as
        ``vocab_to_cache``. Leftover keys are rejected by ``assert_empty``.
        """
        options_files = params.pop("options_files")
        weight_files = params.pop("weight_files")
        # Register the per-language entries: options first, then weights.
        for per_language_files in (options_files, weight_files):
            for lang in per_language_files.keys():
                per_language_files.add_file_to_archive(lang)

        requires_grad = params.pop("requires_grad", False)
        do_layer_norm = params.pop_bool("do_layer_norm", False)
        dropout = params.pop_float("dropout", 0.5)

        namespace_to_cache = params.pop("namespace_to_cache", None)
        vocab_to_cache = None
        if namespace_to_cache is not None:
            cached_namespace = vocab.get_token_to_index_vocabulary(
                namespace_to_cache)
            vocab_to_cache = list(cached_namespace.keys())

        projection_dim = params.pop_int("projection_dim", None)
        scalar_mix_parameters = params.pop("scalar_mix_parameters", None)
        aligning_files = params.pop("aligning_files", {})
        params.assert_empty(cls.__name__)

        constructor_kwargs = dict(
            options_files=options_files,
            weight_files=weight_files,
            do_layer_norm=do_layer_norm,
            dropout=dropout,
            requires_grad=requires_grad,
            projection_dim=projection_dim,
            vocab_to_cache=vocab_to_cache,
            scalar_mix_parameters=scalar_mix_parameters,
            aligning_files=aligning_files,
        )
        return cls(**constructor_kwargs)
示例#2
0
    def from_params(  # type: ignore
            cls, vocab: Vocabulary, params: Params,
            **extras) -> "ElmoTokenEmbedder":
        """Build an ELMo token embedder from ``params``.

        ``options_file`` and ``weight_file`` are required; all other keys are
        optional. If ``namespace_to_cache`` is given, the tokens of that
        vocabulary namespace are handed to the constructor as
        ``vocab_to_cache``. ``extras`` is accepted but not used here. Leftover
        keys are rejected by ``assert_empty``.
        """
        options_file = params.pop("options_file")
        weight_file = params.pop("weight_file")
        requires_grad = params.pop("requires_grad", False)
        do_layer_norm = params.pop_bool("do_layer_norm", False)
        dropout = params.pop_float("dropout", 0.5)

        namespace_to_cache = params.pop("namespace_to_cache", None)
        vocab_to_cache = None
        if namespace_to_cache is not None:
            cached_namespace = vocab.get_token_to_index_vocabulary(
                namespace_to_cache)
            vocab_to_cache = list(cached_namespace.keys())

        projection_dim = params.pop_int("projection_dim", None)
        scalar_mix_parameters = params.pop("scalar_mix_parameters", None)
        params.assert_empty(cls.__name__)

        constructor_kwargs = dict(
            options_file=options_file,
            weight_file=weight_file,
            do_layer_norm=do_layer_norm,
            dropout=dropout,
            requires_grad=requires_grad,
            projection_dim=projection_dim,
            vocab_to_cache=vocab_to_cache,
            scalar_mix_parameters=scalar_mix_parameters,
        )
        return cls(**constructor_kwargs)
示例#3
0
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     """Build an ELMo token embedder from ``params``.

     Both file keys are registered via ``add_file_to_archive`` before being
     popped. If ``namespace_to_cache`` is given, the tokens of that vocabulary
     namespace are handed to the constructor as ``vocab_to_cache``. Leftover
     keys are rejected by ``assert_empty``.
     """
     for file_key in ('options_file', 'weight_file'):
         params.add_file_to_archive(file_key)
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)

     namespace_to_cache = params.pop("namespace_to_cache", None)
     vocab_to_cache = None
     if namespace_to_cache is not None:
         cached_namespace = vocab.get_token_to_index_vocabulary(namespace_to_cache)
         vocab_to_cache = list(cached_namespace.keys())

     projection_dim = params.pop_int("projection_dim", None)
     scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
     params.assert_empty(cls.__name__)

     constructor_kwargs = dict(
         options_file=options_file,
         weight_file=weight_file,
         do_layer_norm=do_layer_norm,
         dropout=dropout,
         requires_grad=requires_grad,
         projection_dim=projection_dim,
         vocab_to_cache=vocab_to_cache,
         scalar_mix_parameters=scalar_mix_parameters,
     )
     return cls(**constructor_kwargs)
示例#4
0
 def from_params(
         cls,
         vocab: Vocabulary,
         params: Params  # type: ignore
 ) -> 'ElmoTokenEmbedderMultiLang':
     # pylint: disable=arguments-differ
     """Build a multilingual ELMo token embedder from ``params``.

     ``options_files`` and ``weight_files`` are required; every key of each
     (iterated as ``lang``) is registered via ``add_file_to_archive``. If
     ``namespace_to_cache`` is given, the tokens of that vocabulary namespace
     are handed to the constructor as ``vocab_to_cache``. Leftover keys are
     rejected by ``assert_empty``.
     """
     options_files = params.pop('options_files')
     weight_files = params.pop('weight_files')
     # Register the per-language entries: options first, then weights.
     for per_language_files in (options_files, weight_files):
         for lang in per_language_files.keys():
             per_language_files.add_file_to_archive(lang)

     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)

     namespace_to_cache = params.pop("namespace_to_cache", None)
     vocab_to_cache = None
     if namespace_to_cache is not None:
         cached_namespace = vocab.get_token_to_index_vocabulary(
             namespace_to_cache)
         vocab_to_cache = list(cached_namespace.keys())

     projection_dim = params.pop_int("projection_dim", None)
     scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
     aligning_files = params.pop('aligning_files', {})
     params.assert_empty(cls.__name__)

     constructor_kwargs = dict(
         options_files=options_files,
         weight_files=weight_files,
         do_layer_norm=do_layer_norm,
         dropout=dropout,
         requires_grad=requires_grad,
         projection_dim=projection_dim,
         vocab_to_cache=vocab_to_cache,
         scalar_mix_parameters=scalar_mix_parameters,
         aligning_files=aligning_files,
     )
     return cls(**constructor_kwargs)
示例#5
0
    def __init__(self,
                 vocab: Vocabulary,
                 beam_size: int,
                 namespace: str = 'tokens',
                 end_symbol: str = None,
                 min_steps: int = None,
                 max_steps: int = 50,
                 per_node_beam_size: int = None,
                 disallow_repeated_ngrams: int = None,
                 repeated_ngrams_exceptions: List[str] = None,
                 length_penalizer: LengthPenalizer = None,
                 coverage_penalizer: CoveragePenalizer = None) -> None:
        """Store the search settings and resolve token-based options to indices.

        ``end_symbol`` falls back to ``END_SYMBOL`` and is looked up in
        ``namespace`` to obtain the end index. ``per_node_beam_size`` falls
        back to ``beam_size``. Each token in ``repeated_ngrams_exceptions`` is
        converted to its index in ``namespace``.

        Raises
        ------
        Exception
            If a token in ``repeated_ngrams_exceptions`` is not present in the
            ``namespace`` vocabulary.
        """
        self.beam_size = beam_size
        self._end_index = vocab.get_token_index(end_symbol or END_SYMBOL,
                                                namespace)
        self.max_steps = max_steps
        self.min_steps = min_steps
        self.per_node_beam_size = per_node_beam_size or beam_size
        self.length_penalizer = length_penalizer
        self.coverage_penalizer = coverage_penalizer

        self.disallow_repeated_ngrams = disallow_repeated_ngrams
        # The exceptions are stored as vocabulary indices rather than strings.
        self.repeated_ngrams_exceptions = set()
        exception_tokens = repeated_ngrams_exceptions or []
        token_to_index = vocab.get_token_to_index_vocabulary(namespace)
        for token in exception_tokens:
            if token not in token_to_index:
                raise Exception(f'Could not add token exception {token} because {token} is not in the vocabulary')
            token_index = token_to_index[token]
            self.repeated_ngrams_exceptions.add(token_index)
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     """Build an ELMo token embedder from ``params``.

     Both file keys are registered via ``add_file_to_archive`` before being
     popped. If ``namespace_to_cache`` is given, the tokens of that vocabulary
     namespace are handed to the constructor as ``vocab_to_cache``. Leftover
     keys are rejected by ``assert_empty``.
     """
     for file_key in ('options_file', 'weight_file'):
         params.add_file_to_archive(file_key)
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)

     namespace_to_cache = params.pop("namespace_to_cache", None)
     vocab_to_cache = None
     if namespace_to_cache is not None:
         cached_namespace = vocab.get_token_to_index_vocabulary(
             namespace_to_cache)
         vocab_to_cache = list(cached_namespace.keys())

     projection_dim = params.pop_int("projection_dim", None)
     params.assert_empty(cls.__name__)

     constructor_kwargs = dict(
         options_file=options_file,
         weight_file=weight_file,
         do_layer_norm=do_layer_norm,
         dropout=dropout,
         requires_grad=requires_grad,
         projection_dim=projection_dim,
         vocab_to_cache=vocab_to_cache,
     )
     return cls(**constructor_kwargs)
 def __init__(self,
              vocab: Vocabulary,
              model: Model,
              label_namespace: str = "labels",
              positive_label: str = "HasDef"):
     """Wrap ``model`` and set up an F1 measure for ``positive_label``.

     The positive label is resolved to its index in ``label_namespace``; a
     label missing from that namespace fails the mapping lookup.
     """
     super().__init__(vocab)
     self._model = model
     positive_index = vocab.get_token_to_index_vocabulary(
         label_namespace)[positive_label]
     self._f1_measure = F1Measure(positive_index)
示例#8
0
    def __init__(self,
                 vocab: Vocabulary,
                 input_embedder: TextFieldEmbedder,
                 nli_projection_layer: FeedForward,
                 training_tasks: Any,
                 validation_tasks: Any,
                 langs_print_train: List[str] = ["en", "fr", "de", "ur", "sw"],
                 dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 feed_lang_ids: bool = True,
                 avg: bool = False) -> None:
        """Set up the projection model, its loss and its per-task metrics.

        Parameters
        ----------
        vocab : the vocabulary; the ``"labels"`` namespace defines the number
            of output classes.
        input_embedder : stored as the input embedder.
        nli_projection_layer : its output dimension must equal the size of the
            ``"labels"`` namespace. It is the only component passed to
            ``initializer``.
        training_tasks, validation_tasks : either a dict, in which case only
            its keys are kept (as a list), or any other object, which is
            stored as given. One ``CategoricalAccuracy`` is created per
            validation task.
        langs_print_train : stored as ``_langs_pring_train``; a falsy value
            falls back to the string ``"en"``.
        dropout : probability for the dropout layer.
        regularizer : forwarded to the parent constructor.
        feed_lang_ids, avg : stored as given.
        """
        super(SimpleProjectionXlm, self).__init__(vocab, regularizer)

        def _task_names(tasks: Any) -> Any:
            # isinstance (rather than an exact type comparison) also accepts
            # dict subclasses such as OrderedDict.
            return list(tasks.keys()) if isinstance(tasks, dict) else tasks

        self._avg = avg

        self._training_tasks = _task_names(training_tasks)
        self._validation_tasks = _task_names(validation_tasks)

        self._input_embedder = input_embedder

        self._label_namespace = "labels"
        self._num_labels = vocab.get_vocab_size(
            namespace=self._label_namespace)

        self._nli_projection_layer = nli_projection_layer
        # Prints the label vocabulary to stdout at construction time.
        print(
            vocab.get_token_to_index_vocabulary(
                namespace=self._label_namespace))
        assert nli_projection_layer.get_output_dim() == self._num_labels

        self._dropout = torch.nn.Dropout(p=dropout)

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self._nli_projection_layer)

        self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()

        for taskname in self._validation_tasks:
            # this will hide some metrics from tqdm, but they will still be computed
            self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
        self._nli_avg_acc = Average()

        # NOTE(review): the fallback is the string "en", not a list, so the
        # '*' test below is a substring test in that case.
        self._langs_pring_train = langs_print_train or "en"
        if '*' in self._langs_pring_train:
            # NOTE(review): str.split("") raises ValueError ("empty
            # separator"), so this branch fails for any non-empty
            # training_tasks. The intended separator cannot be determined
            # from this code -- confirm it and fix.
            self._langs_pring_train = [t.split("")[-1] for t in training_tasks]

        self._feed_lang_ids = feed_lang_ids
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
        """Build a ``ConstrainedConditionalModule`` from ``params``.

        Recognised keys (all optional): ``hard_constraints`` (iterable of
        tags), ``soft_constraints`` (mapping of tag to penalty),
        ``label_namespace`` (default ``"labels"``), ``sentence_penalty_map``
        (mapping with exactly one ``tag: penalty`` entry),
        ``constrain_crf_decoding`` and ``label_encoding``.

        Each constraint tag is resolved to the indices of every label in
        ``label_namespace`` that matches the regular expression
        ``^.*-<tag>``. The tag is interpolated into the pattern as-is and
        there is no end anchor.

        Raises
        ------
        AssertionError
            If ``sentence_penalty_map`` has more than one entry.
        ConfigurationError
            If ``constrain_crf_decoding`` is set without ``label_encoding``.
        """
        hard_constraints = params.pop("hard_constraints", [])
        soft_constraints = params.pop("soft_constraints", {})
        label_namespace = params.pop("label_namespace", "labels")
        sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
        constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
        label_encoding = params.pop("label_encoding", None)

        sentence_penalty_map = None
        if sentence_penalty_map_dict:
            assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
            tag, penalty = list(sentence_penalty_map_dict.items())[0]
            tag_index = vocab.get_token_index(tag, label_namespace)
            sentence_penalty_map = (tag_index, penalty)

        # Look the label vocabulary up once instead of once per tag.
        label_to_index = vocab.get_token_to_index_vocabulary(label_namespace)

        def _indices_for_tag(tag: str) -> List[int]:
            # Indices of all labels matching ``^.*-<tag>``, in vocabulary order.
            pattern = re.compile(rf"^.*-{tag}")
            return [index for label, index in label_to_index.items()
                    if pattern.match(label)]

        hard_constraints_to_indices: Dict[str, List[int]] = {
            tag: _indices_for_tag(tag) for tag in hard_constraints
        }

        soft_constraints = soft_constraints or {}
        soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {
            tag: (_indices_for_tag(tag), penalty)
            for tag, penalty in soft_constraints.items()
        }

        num_tags = vocab.get_vocab_size(label_namespace)
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        params.assert_empty(cls.__name__)
        return ConstrainedConditionalModule(num_tags, constraints,
                                            hard_constraints_to_indices,
                                            soft_constraints_to_indices,
                                            sentence_penalty_map)
示例#10
0
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        *,
        projection: bool = True,
        embeddings_dropout: float = 0,
        dropout: float = 0,
        verbose: Union[bool, Iterable[str]] = False,
        report_labelwise: bool = False,
        balance: bool = None,
        normalize: str = None,
        trigger_label_namespace: str = 'event_labels',
        initializer: InitializerApplicator = InitializerApplicator()
    ) -> None:
        """Assemble the embedder, encoder, optional projection and metrics.

        The number of output classes is the size of ``trigger_label_namespace``.
        When ``projection`` is true, a linear layer maps the encoder output
        dimension to that many classes; otherwise no projection is created.
        ``initializer`` is applied to the whole model at the end.
        """
        super().__init__(vocab)

        # Sub-modules.
        self._text_field_embedder = text_field_embedder
        self._encoder = encoder
        self._embeddings_dropout = Dropout(embeddings_dropout)
        self._dropout = Dropout(dropout)

        # Plain configuration values.
        self._verbose = verbose
        self._report_labelwise = report_labelwise
        self._balance = balance
        self._trigger_label_namespace = trigger_label_namespace
        self._normalize = normalize

        trigger_class_count = vocab.get_vocab_size(trigger_label_namespace)
        self._num_trigger_classes = trigger_class_count
        self._projection = (
            Linear(in_features=encoder.get_output_dim(),
                   out_features=trigger_class_count)
            if projection else None
        )

        self._accuracy = CategoricalAccuracy()
        trigger_label_vocab = vocab.get_token_to_index_vocabulary(
            self._trigger_label_namespace)
        self._labels = list(trigger_label_vocab)

        # We have two (slightly different) metric sets: char-based and token-based.
        # Char-based metrics also capture error propagated by NER.
        # Token-based metrics are computed as well to:
        #  1. Measure the difference, i.e. the error we have because of NER,
        #  2. Compare with most of the previous work, which is evaluated token-level,
        #  3. Act as a fallback if the tokenization option does not provide
        #     token-char mappings.
        self._prf_char_seqs = PrecisionRecallFScore(labels=self._labels)
        self._prf_token_seqs = PrecisionRecallFScore(labels=self._labels,
                                                     prefix='token_level/')
        self._prf_jmee = SeqEvalPrecisionRecallFScore()

        initializer(self)
示例#11
0
        def _build_role_type_mask(vocab: Vocabulary) -> torch.Tensor:
            """Build a boolean mask of which role types each event type allows.

            Rows follow the ``event_labels`` namespace and columns follow the
            ``span_labels`` namespace, both ordered by vocabulary index. An
            entry is set when the role is listed in the enclosing ``ontology``
            for that event, or when it is the first role column and the
            enclosing ``task`` is ``'argidcls'`` or ``'argidcls_noisy'``.
            """
            def by_index(entry):
                return entry[1]

            role_vocab = vocab.get_token_to_index_vocabulary(namespace='span_labels')
            all_role_types: List[str] = [
                role for role, _ in sorted(role_vocab.items(), key=by_index)
            ]

            event_vocab = vocab.get_token_to_index_vocabulary(namespace='event_labels')
            role_type_mask_list: List[List[int]] = []
            for event_type, _ in sorted(event_vocab.items(), key=by_index):
                row: List[int] = []
                for position, role in enumerate(all_role_types):
                    allowed = role in ontology['events'][event_type]['roles'].keys() or (
                        position == 0 and task in ['argidcls', 'argidcls_noisy'])
                    row.append(1 if allowed else 0)
                role_type_mask_list.append(row)

            return torch.tensor(role_type_mask_list, dtype=torch.bool)  # [num_events, num_roles]
示例#12
0
    def __init__(self,
                 vocab: Vocabulary,
                 projection_dim: int = 10,
                 xnyms: str = 'antonyms',
                 normalize=True,
                 sparse=True,
                 parallelize=False,
                 numerize_dict=True):
        """Build the x-nym lookup structures for the given vocabulary.

        The lookup function is selected from ``wordnet_lookers`` by ``xnyms``
        and applied to the ``'tokens'`` entries of the vocabulary. Three
        hand-written entries are added, the dictionary is passed through
        ``balance_complex_tuple_dict`` and, when ``numerize_dict`` is true,
        through ``numerize`` with the default token-to-index mapping. The keys
        are then padded with NaN to a common length and also grouped by key
        length.
        """
        super(XnymEmbedder, self).__init__()
        self.xnyms = xnyms
        self.S = None

        with timeit_context('creating %s-dict' % self.xnyms):
            self.vocab = vocab
            self.parallelize = parallelize

            lookup_fun = wordnet_lookers[xnyms]
            self.xnym_dict = wordnet_lookup_xnyms(
                vocab._index_to_token['tokens'], fun=lookup_fun)

            # Hand-written entries added on top of the looked-up ones.
            manual_entries = [
                (('in', 'common'), [('differ', ), ('differs', )]),
                (('equivocally', ), [('univocally', )]),
                (('micronutrients', ), [('macronutrients', )]),
            ]
            for manual_key, manual_value in manual_entries:
                self.xnym_dict[manual_key] = manual_value

            self.xnym_dict = balance_complex_tuple_dict(self.xnym_dict)

            if numerize_dict:
                self.xnym_dict = numerize(
                    self.xnym_dict, vocab.get_token_to_index_vocabulary())

            self.normalize = normalize
            self.sparse = sparse
            self.output_dim = projection_dim

            xnym_keys = list(self.xnym_dict.keys())
            longest_key = max(map(len, xnym_keys))

            # Pad every key with NaN up to the length of the longest key.
            padded_keys = [
                list(key) + [np.nan] * (longest_key - len(key))
                for key in xnym_keys
            ]
            self.xnyms_keys = np.array(padded_keys)
            self.xnyms_counterparts = self.generate_xnym_counterparts(
                self.xnym_dict.values())

            def key_length(item):
                return len(item[0])

            # groupby only merges adjacent items, so sort by the same key first.
            items_by_key_length = sorted(self.xnym_dict.items(), key=key_length)
            self.xnyms_keys_len_groups = [
                (size, list(group))
                for size, group in itertools.groupby(items_by_key_length,
                                                     key=key_length)
            ]
    def __init__(self,
                 vocab: Vocabulary,
                 span_typer: SpanTyper,
                 embed_size: int,
                 label_namespace: str = 'span_labels',
                 event_namespace: str = 'event_labels'):
        """Create the event embeddings, span extractors and feed-forward layers.

        The event embedding table has one row per entry of ``event_namespace``
        and a fixed width of 50. Span representations concatenate an endpoint
        extractor (``'x,y'``) and a self-attentive extractor, both over
        ``embed_size`` inputs. ``self._init_weights`` is applied to every
        sub-module at the end.
        """
        super(ArgumentSpanClassifier, self).__init__()

        def _two_layer_gelu(input_dim: int, hidden_dims: int) -> FeedForward:
            # All feed-forward blocks share depth, activation and dropout.
            return FeedForward(input_dim=input_dim,
                               hidden_dims=hidden_dims,
                               num_layers=2,
                               activations=nn.GELU(),
                               dropout=0.2)

        self.vocab: Vocabulary = vocab
        self.label_namespace: str = label_namespace
        self.event_namespace: str = event_namespace

        self.embed_size = embed_size
        self.event_embedding_size = 50

        event_type_count = len(
            vocab.get_token_to_index_vocabulary(namespace=event_namespace))
        self.event_embeddings: nn.Embedding = nn.Embedding(
            num_embeddings=event_type_count,
            embedding_dim=self.event_embedding_size)

        self.lexical_dropout = nn.Dropout(p=0.2)
        self.span_extractor: SpanExtractor = EndpointSpanExtractor(
            input_dim=self.embed_size, combination='x,y')
        self.attentive_span_extractor: SpanExtractor = SelfAttentiveSpanExtractor(
            embed_size)

        self.arg_affine = TimeDistributed(
            _two_layer_gelu(
                self.span_extractor.get_output_dim() +
                self.attentive_span_extractor.get_output_dim(),
                self.embed_size))
        # The trigger output is narrower by the event-embedding width.
        self.trigger_affine = _two_layer_gelu(
            self.span_extractor.get_output_dim() +
            self.attentive_span_extractor.get_output_dim(),
            self.embed_size - self.event_embedding_size)

        self.trigger_event_infusion = TimeDistributed(
            _two_layer_gelu(2 * self.embed_size, self.embed_size))

        self.span_typer: SpanTyper = span_typer

        self.apply(self._init_weights)
示例#14
0
 def test_transformers_vocabs_added_correctly(self):
     """Indexing with a pretrained-transformer indexer fills the namespace.

     After one ``tokens_to_indices`` call on a fresh ``Vocabulary``, the
     token-to-index mapping of the indexer's namespace must equal the
     HuggingFace tokenizer's ``encoder`` mapping.
     """
     namespace = "tags"
     model_name = "roberta-base"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # Here we copy the entire transformers vocab; the returned indices are
     # not needed, only the side effect on ``vocab``.
     indexer.tokens_to_indices(allennlp_tokens, vocab)
     copied_vocab = vocab.get_token_to_index_vocabulary(namespace=namespace)
     assert copied_vocab == tokenizer.encoder
示例#15
0
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 span_extractor: SpanExtractor,
                 feedforward: FeedForward,
                 ner_threshold: float = 0.65,
                 max_inner_range: float = 18,
                 metadata: List[Dict[str, Any]] = None,
                 label_namespace: str = "ner_labels",
                 regularizer: Optional[RegularizerApplicator] = None,
                 initializer: InitializerApplicator = InitializerApplicator()) -> None:
        """Set up the span scorer, metrics and loss for the tagger.

        ``_include_trigger`` records whether any label in ``label_namespace``
        contains the substring ``"trigger"``. The scorer applies
        ``feedforward`` and then a linear layer with one output per label,
        both time-distributed. ``metadata`` is accepted but not used here.
        ``initializer`` is applied to the whole model at the end.
        """
        super(NERTagger, self).__init__(vocab, regularizer)

        self._include_trigger = any(
            "trigger" in label
            for label in vocab.get_token_to_index_vocabulary(label_namespace))

        self.label_namespace = label_namespace
        self._n_labels = self.vocab.get_vocab_size(label_namespace)

        # null_label = vocab.get_token_index("", label_namespace)
        # assert null_label == 0

        self._ner_threshold = ner_threshold
        self._max_inner_range = max_inner_range
        # Placeholder; replaced by the real scorer further down.
        self._ner_scorer = torch.nn.ModuleDict()

        self._text_field_embedder = text_field_embedder

        self._span_extractor = span_extractor

        label_projection = torch.nn.Linear(feedforward.get_output_dim(),
                                           self._n_labels)
        self._ner_scorer = torch.nn.Sequential(
            TimeDistributed(feedforward),
            TimeDistributed(label_projection))

        self._relation_f1_metric = RelationMetric(
            vocab, tag_namespace=label_namespace,
        )

        self._ner_metric = NERMetrics(self._n_labels)
        self._relation_metric = SpanRelationMetric()

        self._loss = torch.nn.BCEWithLogitsLoss(reduction="sum")

        initializer(self)
示例#16
0
def get_labels(vocab: Vocabulary) -> List[str]:
    """Return the label strings stored in the vocabulary.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`

    Returns
    -------
    labels: `List[str]`
        The keys of the token-to-index mapping of the labels namespace, in
        that mapping's iteration order
    """
    label_to_index = vocab.get_token_to_index_vocabulary(
        namespace=LABELS_NAMESPACE)
    return list(label_to_index)
示例#17
0
    def __init__(self,
                 vocab: Vocabulary,
                 input_embedder: TextFieldEmbedder,
                 pooler: Seq2VecEncoder,
                 nli_projection_layer: FeedForward,
                 training_tasks: Any,
                 validation_tasks: Any,
                 dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """Set up the projection model, its loss and its per-task metrics.

        Parameters
        ----------
        vocab : the vocabulary; the ``"labels"`` namespace defines the number
            of output classes.
        input_embedder, pooler : stored as given.
        nli_projection_layer : its output dimension must equal the size of the
            ``"labels"`` namespace. It is the only component passed to
            ``initializer``.
        training_tasks, validation_tasks : either a dict, in which case only
            its keys are kept (as a list), or any other object, which is
            stored as given. One ``CategoricalAccuracy`` is created per
            validation task.
        dropout : probability for the dropout layer.
        regularizer : forwarded to the parent constructor.
        """
        super(SimpleProjectionOld, self).__init__(vocab, regularizer)

        def _task_names(tasks: Any) -> Any:
            # isinstance (rather than an exact type comparison) also accepts
            # dict subclasses such as OrderedDict.
            return list(tasks.keys()) if isinstance(tasks, dict) else tasks

        self._training_tasks = _task_names(training_tasks)
        self._validation_tasks = _task_names(validation_tasks)

        self._input_embedder = input_embedder
        self._pooler = pooler

        self._label_namespace = "labels"
        self._num_labels = vocab.get_vocab_size(
            namespace=self._label_namespace)

        self._nli_projection_layer = nli_projection_layer
        # Prints the label vocabulary to stdout at construction time.
        print(
            vocab.get_token_to_index_vocabulary(
                namespace=self._label_namespace))
        assert nli_projection_layer.get_output_dim() == self._num_labels

        self._dropout = torch.nn.Dropout(p=dropout)

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self._nli_projection_layer)

        self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()

        for taskname in self._validation_tasks:
            # this will hide some metrics from tqdm, but they will still be computed
            self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
        self._nli_avg_acc = Average()
示例#18
0
def set_labels(vocab: Vocabulary, new_labels: List[str]):
    """Replace the labels in the vocabulary with the given list of strings.

    Both mappings of the labels namespace (token-to-index and index-to-token)
    are emptied in place, then ``extend_labels`` adds the new labels.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    new_labels: `List[str]`
        The label strings to add to the vocabulary
    """
    token_to_index = vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE)
    index_to_token = vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE)
    for mapping in (token_to_index, index_to_token):
        # Snapshot the keys first: the mapping is mutated while we walk it.
        for key in list(mapping.keys()):
            del mapping[key]

    extend_labels(vocab, new_labels)
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 attend_feedforward: FeedForward,
                 similarity_function: SimilarityFunction,
                 compare_feedforward: FeedForward,
                 aggregate_feedforward: FeedForward,
                 premise_encoder: Optional[Seq2SeqEncoder] = None,
                 hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 similarity_weight: int = 30) -> None:
        """Assemble the attention model and its extra ``lambda_layer``.

        ``label_map`` ends up as a list in which position ``i`` holds the
        label whose index in the ``'labels'`` namespace is ``i``. The
        hypothesis encoder falls back to the premise encoder. Dimension
        mismatches between the embedder and ``attend_feedforward``, or between
        ``aggregate_feedforward`` and the number of labels, are reported by
        ``check_dimensions_match``.
        """
        super(DecomposableAttentionModified, self).__init__(vocab, regularizer)

        # Invert the label vocabulary into an index -> label list.
        label_to_index = vocab.get_token_to_index_vocabulary('labels')
        index_to_label = [None] * len(label_to_index)
        for label, label_index in label_to_index.items():
            index_to_label[label_index] = label
        self.label_map = index_to_label

        self._text_field_embedder = text_field_embedder
        self._attend_feedforward = TimeDistributed(attend_feedforward)
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._compare_feedforward = TimeDistributed(compare_feedforward)
        self._aggregate_feedforward = aggregate_feedforward
        self._premise_encoder = premise_encoder
        self._hypothesis_encoder = hypothesis_encoder or premise_encoder

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               attend_feedforward.get_input_dim(),
                               "text field embedding dim",
                               "attend feedforward input dim")
        check_dimensions_match(aggregate_feedforward.get_output_dim(),
                               self._num_labels,
                               "final output dimension",
                               "number of labels")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)

        # Created after ``initializer(self)`` has run, so the initializer does
        # not touch this layer; its weights are set by hand below.
        self.lambda_layer = nn.Sequential(nn.Linear(16, 1, bias=False),
                                          MyActivationFunction())

        initial_lambda_weights = [[
            0.1, 0.5, 0.5, 0.5,
            0.5, 0.1, 0.5, 0.5,
            0.5, 0.5, 0.1, 0.5,
            0.5, 0.5, 0.5, 0.9,
        ]]
        self.lambda_layer[0].weight.data = torch.tensor(initial_lambda_weights)
        self.similarity_weight = similarity_weight
 def __init__(self,
              vocab: Vocabulary,
              vocab_namespace: str,
              projection_dim: int = None,
              ignore_oov: bool = False) -> None:
     """Store the vocabulary size, optional projection and OOV index.

     A linear projection from the namespace's vocabulary size to
     ``projection_dim`` is created only when ``projection_dim`` is truthy.
     ``output_dim`` is ``projection_dim`` when truthy, otherwise the
     vocabulary size.

     Raises
     ------
     ConfigurationError
         If the vocabulary's OOV token is not in ``vocab_namespace``.
     """
     super().__init__()
     self.vocab = vocab
     self.vocab_size = vocab.get_vocab_size(vocab_namespace)
     self._projection = (
         torch.nn.Linear(self.vocab_size, projection_dim)
         if projection_dim else None
     )
     self._ignore_oov = ignore_oov

     oov_token = vocab._oov_token  # pylint: disable=protected-access
     namespace_tokens = vocab.get_token_to_index_vocabulary(vocab_namespace)
     self._oov_idx = namespace_tokens.get(oov_token)
     if self._oov_idx is None:
         raise ConfigurationError(
             "OOV token does not exist in vocabulary namespace {}".format(
                 vocab_namespace))

     self.output_dim = projection_dim or self.vocab_size
示例#21
0
    def __init__(
        self,
        vocabulary: Vocabulary,
        image_feature_size: Tuple[int, int, int] = (1024, 14, 14),
        module_channels: int = 128,
        class_projection_channels: int = 1024,
        classifier_linear_size: int = 1024,
    ):
        """Build the stem, the answer classifier and one module per program token.

        Parameters
        ----------
        vocabulary:
            Must provide an ``"answers"`` namespace (sizes the classifier
            output) and a ``"programs"`` namespace (one module per token).
        image_feature_size:
            Unpacked as ``(channels, height, width)`` of the input features.
        module_channels:
            Channel width used by the stem output and by every module that
            takes a channel argument.
        class_projection_channels:
            Output channels of the classifier's 1x1 convolution.
        classifier_linear_size:
            Width of the classifier's hidden linear layer.
        """
        super().__init__()
        self.vocabulary = vocabulary

        # Short-hand notations for convenience.
        __channels, __height, __width = image_feature_size

        # Exclude "@@UNKNOWN@@" answer token, our network will never generate this output through
        # regular forward pass. We set answer output as "@@UNKNOWN@@" when sampled programs are
        # invalid. __num_answers will be 28 for all practical purposes.
        __num_answers = len(
            vocabulary.get_index_to_token_vocabulary(namespace="answers")) - 1

        # The stem takes features from ResNet (or another feature extractor) and projects down to
        # a lower-dimensional space for sending through the Neural Module Network.
        # Both convolutions are 3x3 with padding 1.
        self.stem = nn.Sequential(
            nn.Conv2d(image_feature_size[0],
                      module_channels,
                      kernel_size=3,
                      padding=1),
            nn.ReLU(),
            nn.Conv2d(module_channels,
                      module_channels,
                      kernel_size=3,
                      padding=1),
            nn.ReLU(),
        )
        # The classifier takes output of the last module (which will be a Query or Equal module)
        # and produces a distribution over answers.
        # The 2x2 max-pool with stride 2 is why the first linear layer's input size is
        # divided by 4.
        self.classifier = nn.Sequential(
            nn.Conv2d(module_channels,
                      class_projection_channels,
                      kernel_size=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            Flatten(),
            nn.Linear(class_projection_channels * __height * __width // 4,
                      classifier_linear_size),
            nn.ReLU(),
            nn.Linear(classifier_linear_size,
                      __num_answers),  # note no softmax here
        )

        # Instantiate a module for each program token in our vocabulary.
        # NOTE(review): the values stored below are module instances (or None for "scene"),
        # not module classes as the annotation says -- hence the "type: ignore" further down.
        self._function_modules: Dict[str, Type[nn.Module]] = {}
        for program_token in vocabulary.get_token_to_index_vocabulary(
                "programs"):

            # We don't need modules for the placeholders.
            if program_token in [
                    "@@PADDING@@", "@@UNKNOWN@@", "@start@", "@end@", "unique"
            ]:
                continue

            # Figure out which module we want to use. The branches are tried in order and the
            # first match wins, so exact names are checked before the substring tests.
            if program_token == "scene":
                # "scene" is just a flag that indicates the start of a new line of reasoning
                # we set `module` to `None` because we still need the flag "scene" in forward()
                module = None
            elif program_token == "intersect":
                module = AndModule()
            elif program_token == "union":
                module = OrModule()
            elif "equal" in program_token or program_token in {
                    "less_than", "greater_than"
            }:
                module = ComparisonModule(module_channels)
            elif "query" in program_token or program_token in {
                    "exist", "count"
            }:
                module = QueryModule(module_channels)
            elif "relate" in program_token:
                module = RelateModule(module_channels)
            elif "same" in program_token:
                module = SameModule(module_channels)
            else:
                # Every remaining program token gets an attention module.
                module = AttentionModule(module_channels)

            # Add the module to our dictionary and register its parameters so it can learn
            self._function_modules[program_token] = module  # type: ignore
            self.add_module(program_token, module)

        # Cross Entropy Loss for answer classification.
        self._loss = nn.CrossEntropyLoss(reduction="none")

        # Record accuracy while training and validation.
        self._answer_accuracy = BooleanAccuracy()

        # Record average number of invalid programs per batch.
        self._average_invalid_programs = Average()
示例#22
0
文件: sciie.py 项目: almoslmi/yarx
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 context_layer: Seq2SeqEncoder,
                 relex_feedforward: FeedForward,
                 antecedent_feedforward: FeedForward,
                 feature_size: int,
                 max_span_width: int,
                 spans_per_word: float,
                 relex_spans_per_word: float,
                 max_antecedents: int,
                 mention_feedforward: FeedForward,
                 coref_mention_feedforward: FeedForward = None,
                 relex_mention_feedforward: FeedForward = None,
                 symmetric_relations: bool = False,
                 lexical_dropout: float = 0.2,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 loss_coref_weight: float = 1,
                 loss_relex_weight: float = 1,
                 loss_ner_weight: float = 1,
                 preserve_metadata: List = None,
                 relex_namespace: str = 'relation_labels') -> None:
        """Create the NER scorer, relation mention pruner and relation scorer.

        The embedder, context layer, coreference mention scorer, antecedent
        scorer, span/antecedent limits, lexical dropout, initializer and
        regularizer are forwarded unchanged to the parent constructor.

        Parameters
        ----------
        mention_feedforward:
            Feed-forward network used by the NER scorer. It is also used as
            the coreference and/or relation mention scorer whenever
            ``coref_mention_feedforward`` / ``relex_mention_feedforward`` is
            ``None``.
        relex_feedforward:
            Feed-forward network in front of the relation label projection.
        symmetric_relations, relex_spans_per_word, loss_coref_weight,
        loss_relex_weight, loss_ner_weight:
            Stored on the instance; they are not used inside this constructor.
        preserve_metadata:
            Stored on the instance; a falsy value is replaced by ``['id']``.
        relex_namespace:
            Vocabulary namespace holding the relation labels.
        """
        # Without dedicated scorers, coreference and relation extraction
        # share the NER mention scorer.
        coref_mention_feedforward = (mention_feedforward
                                     if coref_mention_feedforward is None
                                     else coref_mention_feedforward)
        relex_mention_feedforward = (mention_feedforward
                                     if relex_mention_feedforward is None
                                     else relex_mention_feedforward)

        super().__init__(vocab, text_field_embedder, context_layer,
                         coref_mention_feedforward, antecedent_feedforward,
                         feature_size, max_span_width, spans_per_word,
                         max_antecedents, lexical_dropout, initializer,
                         regularizer)

        # Plain configuration values.
        self._symmetric_relations = symmetric_relations
        self._relex_spans_per_word = relex_spans_per_word
        self._loss_coref_weight = loss_coref_weight
        self._loss_relex_weight = loss_relex_weight
        self._loss_ner_weight = loss_ner_weight
        self._preserve_metadata = preserve_metadata or ['id']
        self._relex_namespace = relex_namespace

        # Metrics for the relation extraction part.
        relation_label_names = list(
            vocab.get_token_to_index_vocabulary(relex_namespace))
        self._relex_mention_recall = RelexMentionRecall()
        self._relex_precision_recall_fscore = PrecisionRecallFScore(
            labels=relation_label_names)

        # Pruner that scores candidate relation mentions.
        relex_mention_hidden = TimeDistributed(relex_mention_feedforward)
        relex_mention_score = TimeDistributed(
            Projection(relex_mention_feedforward.get_output_dim()))
        self._relex_mention_pruner = MultiTimeDistributed(
            Pruner(Sequential(relex_mention_hidden, relex_mention_score)))

        # NER scorer: feed-forward followed by a projection onto the
        # 'ner_labels' vocabulary.
        ner_hidden = TimeDistributed(mention_feedforward)
        ner_projection = TimeDistributed(
            Projection(mention_feedforward.get_output_dim(),
                       vocab.get_vocab_size('ner_labels'),
                       with_dummy=True))
        self._ner_scorer = Sequential(ner_hidden, ner_projection)

        # Relation scorer: same layout, projecting onto the relation labels.
        relex_hidden = TimeDistributed(relex_feedforward)
        relex_projection = TimeDistributed(
            Projection(relex_feedforward.get_output_dim(),
                       vocab.get_vocab_size(relex_namespace),
                       with_dummy=True))
        self._relex_scorer = Sequential(relex_hidden, relex_projection)
    def __init__(self,
                 vocab: Vocabulary,
                 span_graph_encoder: SpanGraphEncoder,
                 span_typer: SpanTyper,
                 embed_size: int,
                 label_namespace: str = 'span_labels',
                 event_namespace: str = 'event_labels',
                 use_event_embedding: bool = True):
        """Assemble the span extractors, projection layers and span typer.

        Parameters
        ----------
        vocab:
            Vocabulary; its ``event_namespace`` entries size the event
            embedding table when ``use_event_embedding`` is true.
        span_graph_encoder:
            Encoder whose ``get_input_dim()`` fixes the hidden size of the
            argument and trigger projections.
        span_typer:
            Stored as ``self.span_typer``.
        embed_size:
            Input dimension handed to both span extractors.
        label_namespace, event_namespace:
            Vocabulary namespace names, stored on the instance.
        use_event_embedding:
            When true, a 50-dimensional event embedding table is created and
            the trigger projection is narrowed by that many dimensions.
        """
        super(SelectorArgLinking, self).__init__()

        # Configuration kept on the instance.
        self.vocab = vocab
        self.label_namespace = label_namespace
        self.event_namespace = event_namespace
        self.use_event_embedding = use_event_embedding
        self.embed_size = embed_size
        self.event_embedding_size = 50

        # ``event_embeddings`` only exists when event embeddings are enabled.
        if use_event_embedding:
            event_vocab = vocab.get_token_to_index_vocabulary(namespace=event_namespace)
            self.event_embeddings = nn.Embedding(
                num_embeddings=len(event_vocab),
                embedding_dim=self.event_embedding_size,
            )

        self.lexical_dropout = nn.Dropout(p=0.2)
        self.span_graph_encoder = span_graph_encoder

        # A span is represented by its endpoint features concatenated with a
        # self-attentive summary, hence the summed output dimensions below.
        self.span_extractor = EndpointSpanExtractor(
            input_dim=self.embed_size,
            combination='x,y',
        )
        self.attentive_span_extractor = SelfAttentiveSpanExtractor(embed_size)

        span_repr_dim = (self.span_extractor.get_output_dim()
                         + self.attentive_span_extractor.get_output_dim())
        graph_input_dim = self.span_graph_encoder.get_input_dim()
        reserved_for_event = self.event_embedding_size if use_event_embedding else 0

        self.arg_affine = TimeDistributed(FeedForward(
            input_dim=span_repr_dim,
            hidden_dims=graph_input_dim,
            num_layers=2,
            activations=nn.GELU(),
            dropout=0.2,
        ))
        # The trigger projection is narrower by the event embedding size so
        # that, presumably, trigger + event embedding matches the graph
        # encoder input -- confirm against forward().
        self.trigger_affine = FeedForward(
            input_dim=span_repr_dim,
            hidden_dims=graph_input_dim - reserved_for_event,
            num_layers=2,
            activations=nn.GELU(),
            dropout=0.2,
        )

        self.span_typer = span_typer

        # Run the custom weight initialisation over every registered submodule.
        self.apply(self._init_weights)
示例#24
0
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        doc_encoder: Seq2VecEncoder,
        const_path: str,
        tokens_namespace: str,
        use_sim: bool = True,
        use_classifier: bool = True,
    ) -> None:
        """Build the model and pre-embed every entry of the constitution.

        The constitution is read from ``const_path``; each label in the
        "labels" vocabulary (except "unmatched") must have a text entry in
        it. Every entry is tokenized, indexed with a cased BERT indexer,
        padded into batch tensors and embedded once; the detached embeddings
        and their mask are kept as ``self.const_emb`` / ``self.const_mask``.

        Parameters
        ----------
        vocab:
            Vocabulary with a "labels" namespace that contains "unmatched".
        text_field_embedder:
            Embedder applied to the indexed constitution.
        doc_encoder:
            Encoder whose output dimension sizes the similarity feed-forward.
        const_path:
            Path handed to ``JsonConverter._read_const``.
        tokens_namespace:
            Index name passed to the BERT indexer; also the key under which
            the wordpiece ids are found and fed to the embedder.
        use_sim:
            Must be true.
        use_classifier:
            Has no effect besides printing a warning when true.

        Raises
        ------
        Exception
            If ``use_sim`` is false.
        AssertionError
            If the number of labels is not ``len(constitution) + 1``.
        """
        super().__init__(vocab)
        self.vocab = vocab
        self.num_tags = vocab.get_vocab_size("labels")

        self._token_embedder = text_field_embedder
        self._doc_encoder = doc_encoder

        if not use_sim:
            raise Exception(
                "use_sim option is false, but it must be true for this to work"
            )

        if use_classifier:
            print("Warning: use_classifier option does nothing now...")

        self.use_sim = use_sim
        self.use_classifier = use_classifier

        # I actually want to use the one from the config, but not sure how to do that.
        _spacy_word_splitter = SpacyWordSplitter()
        token_indexer = PretrainedBertIndexer("bert-base-cased",
                                              do_lowercase=False,
                                              use_starting_offsets=True)

        jc = JsonConverter()
        # The second return value (links) is not needed here.
        const, _ = jc._read_const(const_path)

        # the extra 1 is for the "unmatched" label.
        print(vocab.get_token_to_index_vocabulary("labels"))
        print(const.keys())
        assert self.num_tags == len(
            const
        ) + 1, "Num tags ({}) doesn't match the size of the constitution+1 ({})".format(
            self.num_tags,
            len(const) + 1)

        if self.use_sim:
            # The indexer is called with ``tokens_namespace`` as its index
            # name, and AllenNLP's wordpiece indexer derives the offsets key
            # from that name. The former hard-coded "tokens-offsets" only
            # worked when tokens_namespace == "tokens".
            offsets_key = f"{tokens_namespace}-offsets"

            # create the constitution matrix. Every element is one of the groups.
            tagmap = self.vocab.get_index_to_token_vocabulary("labels")
            self.const_dict = {}
            indices = []
            for i in range(self.num_tags):
                tagname = tagmap[i]
                if tagname != "unmatched":
                    const_text = const[tagname]
                else:
                    # "unmatched" has no constitution text; use a placeholder.
                    const_text = "@@pad@@"

                const_toks = _spacy_word_splitter.split_words(const_text)
                # truncate so BERT is happy.
                const_toks = const_toks[:250]
                const_indices = token_indexer.tokens_to_indices(
                    const_toks, vocab, tokens_namespace)
                indices.append(const_indices)

            max_len = max(len(ind[tokens_namespace]) for ind in indices)
            max_offset_len = max(len(ind[offsets_key]) for ind in indices)

            # Zero-padded batch tensors, one row per label. The mask shares
            # the offsets width (assumes the indexer emits one mask entry per
            # offset -- TODO confirm).
            const_tensor = torch.zeros(self.num_tags, max_len).long()
            const_tensor_offsets = torch.zeros(self.num_tags,
                                               max_offset_len).long()
            const_tensor_mask = torch.zeros(self.num_tags,
                                            max_offset_len).long()
            for i, ind in enumerate(indices):
                toks = ind[tokens_namespace]
                offsets = ind[offsets_key]
                mask = ind["mask"]
                const_tensor[i, :len(toks)] = torch.LongTensor(toks)
                const_tensor_offsets[i, :len(offsets)] = torch.LongTensor(offsets)
                const_tensor_mask[i, :len(mask)] = torch.LongTensor(mask)

            const_tokens = {
                tokens_namespace: const_tensor,
                offsets_key: const_tensor_offsets,
                "mask": const_tensor_mask
            }

            print("Embedding the constitution... this could take a minute...")
            self.const_mask = util.get_text_field_mask(const_tokens)
            # The result is detached anyway, so skip building the autograd
            # graph for this one-off forward pass.
            with torch.no_grad():
                self.const_emb = self._token_embedder(const_tokens).detach()
            print("Done embedding the constitution.")

            if torch.cuda.is_available():
                self.const_emb = self.const_emb.cuda()
                self.const_mask = self.const_mask.cuda()

        self.vectorf1 = VectorF1(unmatched_index=self.vocab.get_token_index(
            "unmatched", namespace="labels"))

        self.sim_ff = TimeDistributed(
            FeedForward(doc_encoder.get_output_dim(),
                        num_layers=1,
                        hidden_dims=2,
                        activations=Activation.by_name("relu")()))
示例#25
0
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 projection_feedforward: FeedForward,
                 key_projection_feedforward: FeedForward,
                 inference_encoder: Seq2SeqEncoder,
                 link_key_encoder: Seq2SeqEncoder,
                 key_compare_feedforward: FeedForward,
                 output_feedforward: FeedForward,
                 output_logit: FeedForward,
                 dropout: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """Store the sub-networks, validate their dimensions and initialise.

        Parameters
        ----------
        vocab:
            Vocabulary; its 'labels' namespace defines the output classes.
        text_field_embedder, encoder:
            Token embedder and sentence encoder; the embedder output must
            match the encoder input.
        projection_feedforward, inference_encoder:
            Projection (input must be 4x the encoder output) and the encoder
            that consumes its output.
        key_projection_feedforward, link_key_encoder:
            Same layout for the key branch.
        key_compare_feedforward, output_feedforward, output_logit:
            Stored on the instance; no dimension check is applied to them here.
        dropout:
            Dropout probability; a falsy value disables both dropout layers.
        initializer:
            Applied to the whole model at the end of construction.
        regularizer:
            Forwarded to the parent constructor.

        Raises
        ------
        Whatever ``check_dimensions_match`` raises on a dimension mismatch.
        """
        super().__init__(vocab, regularizer)

        # ``label_map[i]`` is the label string for class index ``i``.
        label_to_index = vocab.get_token_to_index_vocabulary('labels')
        l_map = [None] * len(label_to_index)
        for lb, lb_idx in label_to_index.items():
            l_map[lb_idx] = lb
        self.label_map = l_map

        self._text_field_embedder = text_field_embedder
        self._word_embedding_dimension = text_field_embedder.get_output_dim()
        self._sentence_encoder = encoder
        self._encoded_word_dimension = self._sentence_encoder.get_output_dim()

        self._matrix_attention = DotProductMatrixAttention()
        self._projection_feedforward = projection_feedforward
        self._key_projection_feedforward = key_projection_feedforward

        self._inference_encoder = inference_encoder
        self._link_key_encoder = link_key_encoder
        self._embedded_key_dimension = self._link_key_encoder.get_output_dim()
        self._key_compare_feedforward = key_compare_feedforward

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
            self.rnn_input_dropout = InputVariationalDropout(dropout)
        else:
            self.dropout = None
            self.rnn_input_dropout = None

        self._output_feedforward = output_feedforward
        self._output_logit = output_logit

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        check_dimensions_match(encoder.get_output_dim() * 4,
                               projection_feedforward.get_input_dim(),
                               "encoder output dim",
                               "projection feedforward input")
        # The message now names the key projection; it used to repeat the
        # text of the check above, hiding which component was misconfigured.
        check_dimensions_match(encoder.get_output_dim() * 4,
                               key_projection_feedforward.get_input_dim(),
                               "encoder output dim",
                               "key projection feedforward input")
        check_dimensions_match(projection_feedforward.get_output_dim(),
                               inference_encoder.get_input_dim(),
                               "proj feedforward output dim",
                               "inference lstm input dim")
        # A second, identical key-projection/link-key check with a mislabeled
        # message used to follow this one; it could never fire and was removed.
        check_dimensions_match(key_projection_feedforward.get_output_dim(),
                               link_key_encoder.get_input_dim(),
                               "key proj feedforward output dim",
                               "link key lstm input dim")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 similarity_function: SimilarityFunction,
                 projection_feedforward: FeedForward,
                 inference_encoder: Seq2SeqEncoder,
                 output_feedforward: FeedForward,
                 output_logit: FeedForward,
                 dropout: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 similarity_weight: int = 30) -> None:
        """Wire up the sub-networks, validate dimensions and add the lambda layer.

        Parameters
        ----------
        vocab:
            Vocabulary; its 'labels' namespace defines the output classes.
        text_field_embedder, encoder:
            Token embedder and sentence encoder; the embedder output must
            match the encoder input.
        similarity_function:
            Wrapped in a ``LegacyMatrixAttention``; also printed to stdout.
        projection_feedforward, inference_encoder:
            Projection (input must be 4x the encoder output) and the encoder
            that consumes its output.
        output_feedforward, output_logit:
            Stored on the instance.
        dropout:
            Dropout probability; a falsy value disables both dropout layers.
        initializer:
            Applied to the model before ``lambda_layer`` is created, so that
            layer keeps its hand-set weights.
        regularizer:
            Forwarded to the parent constructor.
        similarity_weight:
            Stored on the instance and printed to stdout.
        """
        super().__init__(vocab, regularizer)

        # Invert the token -> index mapping into a list indexed by class id.
        token_to_index = vocab.get_token_to_index_vocabulary('labels')
        index_to_label = [None] * len(token_to_index)
        for label, index in token_to_index.items():
            index_to_label[index] = label
        self.label_map = index_to_label

        self._text_field_embedder = text_field_embedder
        self._encoder = encoder
        print(similarity_function)
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._projection_feedforward = projection_feedforward
        self._inference_encoder = inference_encoder

        # Both dropout flavours are switched on or off together.
        self.dropout = torch.nn.Dropout(dropout) if dropout else None
        self.rnn_input_dropout = (InputVariationalDropout(dropout)
                                  if dropout else None)

        self._output_feedforward = output_feedforward
        self._output_logit = output_logit
        self._num_labels = vocab.get_vocab_size(namespace="labels")

        # Fail fast when the configured components do not fit together.
        embedding_dim = text_field_embedder.get_output_dim()
        check_dimensions_match(embedding_dim, encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        check_dimensions_match(encoder.get_output_dim() * 4,
                               projection_feedforward.get_input_dim(),
                               "encoder output dim",
                               "projection feedforward input")
        check_dimensions_match(projection_feedforward.get_output_dim(),
                               inference_encoder.get_input_dim(),
                               "proj feedforward output dim",
                               "inference lstm input dim")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)

        # 16 -> 1 bias-free linear layer with fixed starting weights: three
        # repetitions of (0.1, 0.5, 0.5, 0.5, 0.5) followed by a final 0.9.
        lambda_linear = nn.Linear(16, 1, bias=False)
        self.lambda_layer = nn.Sequential(lambda_linear, MyActivationFunction())
        initial_weights = ([0.1] + [0.5] * 4) * 3 + [0.9]
        self.lambda_layer[0].weight.data = torch.tensor([initial_weights])

        self.similarity_weight = similarity_weight
        print("SIMILARITY WEIGHT BEING USED IS : {0}".format(
            self.similarity_weight))
示例#27
0
    def from_concrete(cls,
                      data_path: str,
                      cache_file: str,
                      vocab: Vocabulary,
                      ontology: Dict,
                      max_num_spans: int = 512,
                      task: str = 'argidcls',
                      sentence_mode: bool = False) -> 'ConcreteDataset':
        """Build a dataset from a directory of Concrete documents.

        One instance is produced per event. Events whose span tensor holds
        only the trigger are skipped with an info log entry.

        Parameters
        ----------
        data_path:
            Directory handed to ``load_documents_from_concrete_dir``.
        cache_file:
            HDF5 file opened read-only; the open handle is passed on to the
            returned dataset and is not closed here.
        vocab:
            Vocabulary with 'span_labels' and 'event_labels' namespaces.
        ontology:
            Mapping where ``ontology['events'][event_type]['roles']`` lists
            the roles permitted for that event type.
        max_num_spans:
            Width to which span tensors are padded (non-'emd' tasks).
        task:
            One of the task names tested below: 'argidcls',
            'argidcls-noisy', 'emd'; any other value takes the
            gold-argument branch (labelled ``argcls`` in the code).
        sentence_mode:
            When true, spans are expressed in sentence-local indices and the
            trigger's sentence id is recorded in the instance metadata.

        Returns
        -------
        An instance of ``cls`` holding the documents, instances, metadata and
        the role/type mask.
        """
        # Tasks in which candidate spans may turn out not to be arguments.
        argidcls_tasks = ('argidcls', 'argidcls-noisy')

        def _build_role_type_mask(vocab: Vocabulary) -> torch.Tensor:
            """Return a bool mask [num_events, num_roles] of permitted roles."""
            all_role_types: List[str] = [
                r
                for r, _ in sorted(vocab.get_token_to_index_vocabulary(namespace='span_labels').items(),
                                   key=lambda t: t[1])
            ]
            # Index 0 is always allowed for the argidcls tasks (presumably the
            # "no role" slot -- confirm against the vocabulary layout). The
            # noisy task used to be misspelled 'argidcls_noisy' here, so it
            # never received that entry.
            allow_first_role = task in argidcls_tasks
            role_type_mask_list: List[List[int]] = []
            for event_type, _ in sorted(vocab.get_token_to_index_vocabulary(namespace='event_labels').items(),
                                        key=lambda t: t[1]):
                allowed_roles = ontology['events'][event_type]['roles']
                role_type_mask_list.append([
                    1 if k in allowed_roles or (i == 0 and allow_first_role) else 0
                    for i, k in enumerate(all_role_types)
                ])

            return torch.tensor(role_type_mask_list, dtype=torch.bool)  # [num_events, num_roles]

        def _to_predictive_span_finder_gold(spans: List[Span]) -> Tuple[torch.Tensor, torch.Tensor]:
            """Encode spans as per-start-position end targets plus a mask."""
            sorted_spans: List[Span] = sorted(spans, key=lambda s: s.start)
            length = (sorted_spans[-1].start + 1) if sorted_spans else 0
            gold: List[int] = [0] * length
            gold_mask: List[int] = [0] * length
            for span in sorted_spans:
                gold[span.start] = span.end + 1  # shift one for null span
                gold_mask[span.start] = 1

            return (
                torch.tensor(gold, dtype=torch.long),
                torch.tensor(gold_mask, dtype=torch.bool)
            )

        def _tensorize_spans(evnt: Event) -> Tuple[torch.Tensor, torch.Tensor]:
            """Return document-level span indices [1, n, 2] and types [1, n]."""
            none_type = vocab.get_token_index(token='None', namespace='span_labels')
            trigger_span = (evnt.trigger.start, evnt.trigger.end)
            # The trigger always occupies the first slot, typed 'None'.
            span_indices: List[Tuple[int, int]] = [trigger_span]
            span_types: List[int] = [none_type]
            if task in argidcls_tasks:
                mention_list = evnt.document.argument_mentions
            else:
                mention_list = evnt.arguments
            for mention in mention_list:
                mention_span = (mention.start, mention.end)
                if mention_span == trigger_span:
                    continue
                span_indices.append(mention_span)
                arg: Optional[Argument] = evnt.find_arg_by_indices(indices=mention_span)
                span_types.append(
                    none_type if arg is None
                    else vocab.get_token_index(token=arg.role, namespace='span_labels')
                )
            return (
                torch.tensor(span_indices, dtype=torch.long).view([1, -1, 2]),
                torch.tensor(span_types, dtype=torch.long).view(1, -1)
            )

        def _tensorize_spans_sentence_level(
                evnt: Event,
                grouped_mentions: Optional[Dict[int, List[Tuple[Tuple[int, int], Span]]]]
        ) -> Tuple[int, torch.Tensor, torch.Tensor]:
            """Like ``_tensorize_spans`` but with sentence-local indices.

            Also returns the id of the sentence that contains the trigger.
            """
            none_type = vocab.get_token_index(token='None', namespace='span_labels')
            span_types: List[int] = [none_type]
            trigger_sent_ids, trigger_indices = evnt.document.global_to_local_spans(
                spans=[(evnt.trigger.start, evnt.trigger.end)]
            )
            trigger_sent_id = trigger_sent_ids[0]
            spans: List[Tuple[int, int]] = [trigger_indices[0]]
            if task in argidcls_tasks:
                # Candidates are the mentions in the trigger's sentence.
                span_list = grouped_mentions[trigger_sent_id]
            else:  # argcls
                _, arg_indices = evnt.document.global_to_local_spans(
                    spans=[(arg.start, arg.end) for arg in evnt.arguments]
                )
                span_list = [(t, evnt.arguments[i]) for i, t in enumerate(arg_indices)]
            for t, s in span_list:
                arg: Optional[Argument] = evnt.find_arg_by_indices(indices=(s.start, s.end))
                spans.append(t)
                span_types.append(
                    none_type if arg is None
                    else vocab.get_token_index(token=arg.role, namespace='span_labels')
                )
            return (
                trigger_sent_id,
                torch.tensor(spans, dtype=torch.long).view([1, -1, 2]),
                torch.tensor(span_types, dtype=torch.long).view(1, -1)
            )

        def _group_mentions_by_sentence(d: Document) -> Dict[int, List[Tuple[Tuple[int, int], Span]]]:
            """Map sentence id -> [(local span, mention), ...] for ``d``."""
            if task == 'argidcls-noisy':
                mention_list = d.argument_mentions
            elif task == 'argidcls':
                mention_list = []
                # Iterate the document passed in, not the enclosing loop's
                # variable the original code happened to capture.
                for e in d.events:
                    mention_list.extend(e.arguments)
            else:
                raise NotImplementedError
            sent_ids, span_indices = d.global_to_local_spans(spans=[(s.start, s.end) for s in mention_list])
            grouped_spans = defaultdict(list)
            for sent_id, local_span, mention in zip(sent_ids, span_indices, mention_list):
                grouped_spans[sent_id].append((local_span, mention))
            return grouped_spans

        docs: List[Document] = cls.load_documents_from_concrete_dir(dir=data_path,
                                                                    task=task)
        # Ownership of the open file passes to the dataset constructed below.
        cache = h5py.File(cache_file, mode='r')
        role_type_mask: torch.Tensor = _build_role_type_mask(vocab=vocab)  # [num_events, num_roles]
        num_events: int = len(vocab.get_token_to_index_vocabulary(namespace='event_labels'))
        num_roles: int = len(vocab.get_token_to_index_vocabulary(namespace='span_labels'))
        ins_to_event: Dict[int, Event] = {}
        instances: List[InputInstance] = []
        padding_tensor: torch.Tensor = torch.tensor(
            [[vocab.get_token_index(token='@@PADDING@@', namespace='span_labels')]],
            dtype=torch.long
        )  # [1, 1]

        metadata: Dict[int, Dict[str, Any]] = {}
        # Running instance id; named to avoid shadowing the ``id`` builtin.
        instance_id: int = 0
        for doc in tqdm(docs):
            if sentence_mode and task in argidcls_tasks:
                grouped_mentions: Optional[Dict[int, List[Tuple[Tuple[int, int], Span]]]] = \
                    _group_mentions_by_sentence(doc)
            else:
                grouped_mentions = None
            for e in doc.events:  # but RAMS only has one event per document
                ins_metadata = {}
                if sentence_mode:
                    sent_id, span_indices, span_types = _tensorize_spans_sentence_level(
                        evnt=e,
                        grouped_mentions=grouped_mentions
                    )
                    ins_metadata['sentence_id'] = sent_id
                else:
                    span_indices, span_types = _tensorize_spans(evnt=e)
                if task == 'emd':
                    gold_span_indices, gold_span_indices_mask = _to_predictive_span_finder_gold(
                        list(e.arguments)
                    )
                num_spans = span_indices.shape[1]
                if num_spans == 1:
                    # Only the trigger slot is present.
                    logger.info('Example has no arguments.')
                    continue

                event_type_index = vocab.get_token_index(token=e.kind, namespace='event_labels')
                if task == 'emd':
                    # 'emd' instances carry only the gold span targets.
                    padded_span_indices = None
                    padded_span_mask = None
                    type_mask = None
                    padded_span_types = None
                    gold_indices = gold_span_indices.view(1, -1)
                    gold_mask = gold_span_indices_mask.view(1, -1)
                else:
                    # Pad every span tensor out to ``max_num_spans`` entries.
                    num_padding = max_num_spans - num_spans
                    padded_span_indices = torch.cat([
                        span_indices,
                        torch.zeros([1, num_padding, 2], dtype=torch.long)
                    ], dim=1)
                    padded_span_mask = torch.cat([
                        torch.ones([1, num_spans], dtype=torch.bool),
                        torch.zeros([1, num_padding], dtype=torch.bool)
                    ], dim=1)
                    type_mask = role_type_mask[event_type_index, :].view(1, -1)
                    padded_span_types = torch.cat([
                        span_types,
                        padding_tensor.expand([1, num_padding])
                    ], dim=1)
                    gold_indices = None
                    gold_mask = None

                new_ins: InputInstance = InputInstance(
                    id=torch.tensor([instance_id], dtype=torch.long),
                    event_type=torch.tensor([event_type_index], dtype=torch.long),
                    span_indices=padded_span_indices,
                    span_indices_mask=padded_span_mask,
                    type_mask=type_mask,
                    span_types=padded_span_types,
                    gold_span_indices=gold_indices,
                    gold_span_indices_mask=gold_mask
                )
                instances.append(new_ins)
                ins_to_event[instance_id] = e
                metadata[instance_id] = ins_metadata
                instance_id += 1

        return cls(docs=docs,
                   ins_to_event=ins_to_event,
                   vocab=vocab,
                   cache_file=cache,
                   role_type_mask=role_type_mask,
                   num_events=num_events,
                   num_roles=num_roles,
                   instances=instances,
                   max_num_spans=max_num_spans,
                   sentence_mode=sentence_mode,
                   metadata=metadata)