def from_params(  # type: ignore
        cls, vocab: Vocabulary, params: Params) -> "ElmoTokenEmbedderMultiLang":
    options_files = params.pop("options_files")
    weight_files = params.pop("weight_files")
    for lang in options_files.keys():
        options_files.add_file_to_archive(lang)
    for lang in weight_files.keys():
        weight_files.add_file_to_archive(lang)
    requires_grad = params.pop("requires_grad", False)
    do_layer_norm = params.pop_bool("do_layer_norm", False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(
            vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop("scalar_mix_parameters", None)
    aligning_files = params.pop("aligning_files", {})
    params.assert_empty(cls.__name__)
    return cls(
        options_files=options_files,
        weight_files=weight_files,
        do_layer_norm=do_layer_norm,
        dropout=dropout,
        requires_grad=requires_grad,
        projection_dim=projection_dim,
        vocab_to_cache=vocab_to_cache,
        scalar_mix_parameters=scalar_mix_parameters,
        aligning_files=aligning_files,
    )
def from_params(  # type: ignore
        cls, vocab: Vocabulary, params: Params, **extras) -> "ElmoTokenEmbedder":
    options_file = params.pop("options_file")
    weight_file = params.pop("weight_file")
    requires_grad = params.pop("requires_grad", False)
    do_layer_norm = params.pop_bool("do_layer_norm", False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(
            vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop("scalar_mix_parameters", None)
    params.assert_empty(cls.__name__)
    return cls(
        options_file=options_file,
        weight_file=weight_file,
        do_layer_norm=do_layer_norm,
        dropout=dropout,
        requires_grad=requires_grad,
        projection_dim=projection_dim,
        vocab_to_cache=vocab_to_cache,
        scalar_mix_parameters=scalar_mix_parameters,
    )
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    params.add_file_to_archive('options_file')
    params.add_file_to_archive('weight_file')
    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
    params.assert_empty(cls.__name__)
    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache,
               scalar_mix_parameters=scalar_mix_parameters)
def from_params(cls, vocab: Vocabulary, params: Params  # type: ignore
                ) -> 'ElmoTokenEmbedderMultiLang':
    # pylint: disable=arguments-differ
    options_files = params.pop('options_files')
    weight_files = params.pop('weight_files')
    for lang in options_files.keys():
        options_files.add_file_to_archive(lang)
    for lang in weight_files.keys():
        weight_files.add_file_to_archive(lang)
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(
            vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
    aligning_files = params.pop('aligning_files', {})
    params.assert_empty(cls.__name__)
    return cls(options_files=options_files,
               weight_files=weight_files,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache,
               scalar_mix_parameters=scalar_mix_parameters,
               aligning_files=aligning_files)
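# Hedged usage sketch for the multilingual from_params variants above, not the
# library's documented API: it assumes AllenNLP's Params and Vocabulary, and the
# per-language ELMo file paths are placeholders.
from allennlp.common import Params
from allennlp.data import Vocabulary

vocab = Vocabulary()
params = Params({
    "options_files": {"en": "elmo/en/options.json", "fr": "elmo/fr/options.json"},
    "weight_files": {"en": "elmo/en/weights.hdf5", "fr": "elmo/fr/weights.hdf5"},
    "dropout": 0.3,
})
embedder = ElmoTokenEmbedderMultiLang.from_params(vocab=vocab, params=params)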
def __init__(self,
             vocab: Vocabulary,
             beam_size: int,
             namespace: str = 'tokens',
             end_symbol: str = None,
             min_steps: int = None,
             max_steps: int = 50,
             per_node_beam_size: int = None,
             disallow_repeated_ngrams: int = None,
             repeated_ngrams_exceptions: List[str] = None,
             length_penalizer: LengthPenalizer = None,
             coverage_penalizer: CoveragePenalizer = None) -> None:
    self.beam_size = beam_size
    end_symbol = end_symbol or END_SYMBOL
    self._end_index = vocab.get_token_index(end_symbol, namespace)
    self.max_steps = max_steps
    self.min_steps = min_steps
    self.per_node_beam_size = per_node_beam_size or beam_size
    self.length_penalizer = length_penalizer
    self.coverage_penalizer = coverage_penalizer

    # Convert the token exceptions to their indices.
    self.disallow_repeated_ngrams = disallow_repeated_ngrams
    self.repeated_ngrams_exceptions = set()
    repeated_ngrams_exceptions = repeated_ngrams_exceptions or []
    token_to_index = vocab.get_token_to_index_vocabulary(namespace)
    for token in repeated_ngrams_exceptions:
        if token not in token_to_index:
            raise Exception(f'Could not add token exception {token}: not in the vocabulary')
        self.repeated_ngrams_exceptions.add(token_to_index[token])
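# Hedged construction example for the search object above. The enclosing class
# is not shown in this snippet, so `BeamSearch` is a stand-in name; it blocks
# repeated trigrams during decoding except ngrams containing "." or ",",
# assuming those tokens already exist in the 'tokens' namespace.
beam = BeamSearch(
    vocab=vocab,
    beam_size=4,
    max_steps=100,
    disallow_repeated_ngrams=3,
    repeated_ngrams_exceptions=[".", ","],
)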
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    params.add_file_to_archive('options_file')
    params.add_file_to_archive('weight_file')
    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(
            vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    params.assert_empty(cls.__name__)
    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache)
def __init__(self,
             vocab: Vocabulary,
             model: Model,
             label_namespace: str = "labels",
             positive_label: str = "HasDef"):
    super().__init__(vocab)
    self._model = model
    label_vocab = vocab.get_token_to_index_vocabulary(label_namespace)
    self._f1_measure = F1Measure(label_vocab[positive_label])
def __init__(self,
             vocab: Vocabulary,
             input_embedder: TextFieldEmbedder,
             nli_projection_layer: FeedForward,
             training_tasks: Any,
             validation_tasks: Any,
             langs_print_train: List[str] = ["en", "fr", "de", "ur", "sw"],
             dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             feed_lang_ids: bool = True,
             avg: bool = False) -> None:
    super(SimpleProjectionXlm, self).__init__(vocab, regularizer)
    self._avg = avg

    if isinstance(training_tasks, dict):
        self._training_tasks = list(training_tasks.keys())
    else:
        self._training_tasks = training_tasks

    if isinstance(validation_tasks, dict):
        self._validation_tasks = list(validation_tasks.keys())
    else:
        self._validation_tasks = validation_tasks

    self._input_embedder = input_embedder

    self._label_namespace = "labels"
    self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)

    self._nli_projection_layer = nli_projection_layer
    print(vocab.get_token_to_index_vocabulary(namespace=self._label_namespace))
    assert nli_projection_layer.get_output_dim() == self._num_labels

    self._dropout = torch.nn.Dropout(p=dropout)

    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self._nli_projection_layer)

    self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()
    for taskname in self._validation_tasks:
        # This will hide some metrics from tqdm, but they will still be computed.
        self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
    self._nli_avg_acc = Average()

    # Default to English if no languages were given (a list, not a bare string).
    self._langs_print_train = langs_print_train or ["en"]
    if '*' in self._langs_print_train:
        # Assumes task names carry a language suffix after an underscore,
        # e.g. "nli_en" -> "en".
        self._langs_print_train = [t.split("_")[-1] for t in training_tasks]
    self._feed_lang_ids = feed_lang_ids
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
    hard_constraints = params.pop("hard_constraints", [])
    soft_constraints = params.pop("soft_constraints", {})
    label_namespace = params.pop("label_namespace", "labels")
    sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
    constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
    label_encoding = params.pop("label_encoding", None)
    sentence_penalty_map = None
    if sentence_penalty_map_dict:
        assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
        tag, penalty = list(sentence_penalty_map_dict.items())[0]
        tag_index = vocab.get_token_index(tag, label_namespace)
        sentence_penalty_map = (tag_index, penalty)

    hard_constraints_to_indices: Dict[str, List[int]] = {}
    for tag in hard_constraints:
        hard_constraints_to_indices[tag] = []
        for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
            if re.match(rf"^.*-{tag}", label):
                hard_constraints_to_indices[tag].append(index)

    soft_constraints = soft_constraints or {}
    soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {}
    for tag, penalty in soft_constraints.items():
        indices = []
        for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
            if re.match(rf"^.*-{tag}", label):
                indices.append(index)
        soft_constraints_to_indices[tag] = (indices, penalty)

    num_tags = vocab.get_vocab_size(label_namespace)
    if constrain_crf_decoding:
        if not label_encoding:
            raise ConfigurationError("constrain_crf_decoding is True, but "
                                     "no label_encoding was specified.")
        labels = vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(label_encoding, labels)
    else:
        constraints = None
    params.assert_empty(cls.__name__)
    return ConstrainedConditionalModule(num_tags, constraints,
                                        hard_constraints_to_indices,
                                        soft_constraints_to_indices,
                                        sentence_penalty_map)
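# Hedged configuration sketch for the from_params above. The tag names are
# illustrative and assume a BIO-style label vocabulary (labels such as "B-PER",
# which the `^.*-{tag}` patterns in the constraint-index loops match).
params = Params({
    "hard_constraints": ["PER"],
    "soft_constraints": {"ORG": -2.0},
    "label_encoding": "BIO",
    "constrain_crf_decoding": True,
})
ccm = ConstrainedConditionalModule.from_params(vocab=vocab, params=params)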
def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        *,
        projection: bool = True,
        embeddings_dropout: float = 0,
        dropout: float = 0,
        verbose: Union[bool, Iterable[str]] = False,
        report_labelwise: bool = False,
        balance: bool = None,
        normalize: str = None,
        trigger_label_namespace: str = 'event_labels',
        initializer: InitializerApplicator = InitializerApplicator()
) -> None:
    super().__init__(vocab)

    self._text_field_embedder = text_field_embedder
    self._encoder = encoder
    self._embeddings_dropout = Dropout(embeddings_dropout)
    self._dropout = Dropout(dropout)
    self._verbose = verbose
    self._report_labelwise = report_labelwise
    self._balance = balance
    self._trigger_label_namespace = trigger_label_namespace
    self._normalize = normalize

    num_trigger_classes = vocab.get_vocab_size(trigger_label_namespace)
    self._num_trigger_classes = num_trigger_classes
    if projection:
        self._projection = Linear(in_features=encoder.get_output_dim(),
                                  out_features=num_trigger_classes)
    else:
        self._projection = None

    self._accuracy = CategoricalAccuracy()

    labels = vocab.get_token_to_index_vocabulary(self._trigger_label_namespace)
    self._labels = list(labels)
    # We have two (slightly different) metric sets: char-based and token-based.
    # Char-based metrics also capture errors propagated by NER.
    # Token-based metrics are computed as well to:
    #   1. Measure the error we incur because of NER,
    #   2. Compare with most previous work, which evaluates at the token level,
    #   3. Serve as a fallback if our tokenization option does not provide
    #      token-char mappings.
    self._prf_char_seqs = PrecisionRecallFScore(labels=self._labels)
    self._prf_token_seqs = PrecisionRecallFScore(labels=self._labels,
                                                 prefix='token_level/')
    self._prf_jmee = SeqEvalPrecisionRecallFScore()

    initializer(self)
def _build_role_type_mask(vocab: Vocabulary) -> torch.Tensor:
    role_type_mask_list: List[List[int]] = []
    all_role_types: List[str] = [
        r for r, _ in sorted(
            vocab.get_token_to_index_vocabulary(namespace='span_labels').items(),
            key=lambda t: t[1])
    ]
    for event_type, _ in sorted(
            vocab.get_token_to_index_vocabulary(namespace='event_labels').items(),
            key=lambda t: t[1]):
        role_type_mask_list.append([
            (1 if k in ontology['events'][event_type]['roles'].keys()
             or (i == 0 and task in ['argidcls', 'argidcls_noisy']) else 0)
            for i, k in enumerate(all_role_types)
        ])
    return torch.tensor(role_type_mask_list, dtype=torch.bool)  # [num_events, num_roles]
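# Illustrative use of the mask built above, assuming the `ontology` and `task`
# names it references are in scope and the event/span label namespaces are
# populated; "Conflict.Attack" is a placeholder event type. Row i marks which
# role labels are admissible for event type i, in vocabulary-index order.
role_type_mask = _build_role_type_mask(vocab)  # [num_events, num_roles]
event_idx = vocab.get_token_index("Conflict.Attack", namespace="event_labels")
allowed_roles = role_type_mask[event_idx]      # bool vector over span_labels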
def __init__(self,
             vocab: Vocabulary,
             projection_dim: int = 10,
             xnyms: str = 'antonyms',
             normalize=True,
             sparse=True,
             parallelize=False,
             numerize_dict=True):
    super(XnymEmbedder, self).__init__()
    self.xnyms = xnyms
    self.S = None

    with timeit_context('creating %s-dict' % self.xnyms):
        self.vocab = vocab
        self.parallelize = parallelize

        xnyms_looker_fun = wordnet_lookers[xnyms]
        self.xnym_dict = wordnet_lookup_xnyms(vocab._index_to_token['tokens'],
                                              fun=xnyms_looker_fun)

        self.xnym_dict[('in', 'common')] = [('differ',), ('differs',)]
        self.xnym_dict[('equivocally',)] = [('univocally',)]
        self.xnym_dict[('micronutrients',)] = [('macronutrients',)]

        self.xnym_dict = balance_complex_tuple_dict(self.xnym_dict)
        if numerize_dict:
            self.xnym_dict = numerize(self.xnym_dict,
                                      vocab.get_token_to_index_vocabulary())
        # pprint.pprint(dict(zip(list(self.xnym_dict.keys())[:take],
        #                        list(self.xnym_dict.values())[:take])))

        self.normalize = normalize
        self.sparse = sparse
        self.output_dim = projection_dim

        xnym_keys = list(self.xnym_dict.keys())
        length = max(map(len, xnym_keys))
        self.xnyms_keys = np.array(
            [list(xi) + [np.nan] * (length - len(xi)) for xi in xnym_keys])
        self.xnyms_counterparts = self.generate_xnym_counterparts(self.xnym_dict.values())
        self.xnyms_keys_len_groups = [
            (l, list(g))
            for l, g in itertools.groupby(
                sorted(self.xnym_dict.items(), key=lambda x: len(x[0])),
                key=lambda x: len(x[0]))
        ]
def __init__(self,
             vocab: Vocabulary,
             span_typer: SpanTyper,
             embed_size: int,
             label_namespace: str = 'span_labels',
             event_namespace: str = 'event_labels'):
    super(ArgumentSpanClassifier, self).__init__()
    self.vocab: Vocabulary = vocab
    self.label_namespace: str = label_namespace
    self.event_namespace: str = event_namespace

    self.embed_size = embed_size
    self.event_embedding_size = 50

    self.event_embeddings: nn.Embedding = nn.Embedding(
        num_embeddings=len(vocab.get_token_to_index_vocabulary(namespace=event_namespace)),
        embedding_dim=self.event_embedding_size)
    self.lexical_dropout = nn.Dropout(p=0.2)
    self.span_extractor: SpanExtractor = EndpointSpanExtractor(
        input_dim=self.embed_size, combination='x,y')
    self.attentive_span_extractor: SpanExtractor = SelfAttentiveSpanExtractor(embed_size)
    self.arg_affine = TimeDistributed(
        FeedForward(input_dim=self.span_extractor.get_output_dim() +
                    self.attentive_span_extractor.get_output_dim(),
                    hidden_dims=self.embed_size,
                    num_layers=2,
                    activations=nn.GELU(),
                    dropout=0.2))
    self.trigger_affine = FeedForward(
        input_dim=self.span_extractor.get_output_dim() +
        self.attentive_span_extractor.get_output_dim(),
        hidden_dims=self.embed_size - self.event_embedding_size,
        num_layers=2,
        activations=nn.GELU(),
        dropout=0.2)
    self.trigger_event_infusion = TimeDistributed(
        FeedForward(input_dim=2 * self.embed_size,
                    hidden_dims=self.embed_size,
                    num_layers=2,
                    activations=nn.GELU(),
                    dropout=0.2))

    self.span_typer: SpanTyper = span_typer

    self.apply(self._init_weights)
def test_transformers_vocabs_added_correctly(self):
    namespace, model_name = "tags", "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # Here we copy the entire transformers vocab.
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             span_extractor: SpanExtractor,
             feedforward: FeedForward,
             ner_threshold: float = 0.65,
             max_inner_range: float = 18,
             metadata: List[Dict[str, Any]] = None,
             label_namespace: str = "ner_labels",
             regularizer: Optional[RegularizerApplicator] = None,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super(NERTagger, self).__init__(vocab, regularizer)

    self._include_trigger = False
    for label in vocab.get_token_to_index_vocabulary(label_namespace):
        if "trigger" in label:
            self._include_trigger = True

    self.label_namespace = label_namespace
    self._n_labels = self.vocab.get_vocab_size(label_namespace)

    # null_label = vocab.get_token_index("", label_namespace)
    # assert null_label == 0

    self._ner_threshold = ner_threshold
    self._max_inner_range = max_inner_range
    self._text_field_embedder = text_field_embedder
    self._span_extractor = span_extractor
    self._ner_scorer = torch.nn.Sequential(
        TimeDistributed(feedforward),
        TimeDistributed(torch.nn.Linear(feedforward.get_output_dim(), self._n_labels)))

    self._relation_f1_metric = RelationMetric(vocab, tag_namespace=label_namespace)
    self._ner_metric = NERMetrics(self._n_labels)
    self._relation_metric = SpanRelationMetric()

    self._loss = torch.nn.BCEWithLogitsLoss(reduction="sum")
    initializer(self)
def get_labels(vocab: Vocabulary) -> List[str]:
    """Gets the list of labels in the vocabulary.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`

    Returns
    -------
    labels: `List[str]`
        A list of label strings
    """
    return [k for k in vocab.get_token_to_index_vocabulary(namespace=LABELS_NAMESPACE)]
def __init__(self,
             vocab: Vocabulary,
             input_embedder: TextFieldEmbedder,
             pooler: Seq2VecEncoder,
             nli_projection_layer: FeedForward,
             training_tasks: Any,
             validation_tasks: Any,
             dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SimpleProjectionOld, self).__init__(vocab, regularizer)

    if isinstance(training_tasks, dict):
        self._training_tasks = list(training_tasks.keys())
    else:
        self._training_tasks = training_tasks

    if isinstance(validation_tasks, dict):
        self._validation_tasks = list(validation_tasks.keys())
    else:
        self._validation_tasks = validation_tasks

    self._input_embedder = input_embedder
    self._pooler = pooler

    self._label_namespace = "labels"
    self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)

    self._nli_projection_layer = nli_projection_layer
    print(vocab.get_token_to_index_vocabulary(namespace=self._label_namespace))
    assert nli_projection_layer.get_output_dim() == self._num_labels

    self._dropout = torch.nn.Dropout(p=dropout)

    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self._nli_projection_layer)

    self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()
    for taskname in self._validation_tasks:
        # This will hide some metrics from tqdm, but they will still be computed.
        self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
    self._nli_avg_acc = Average()
def set_labels(vocab: Vocabulary, new_labels: List[str]):
    """Resets the labels in the vocabulary with a given label string list.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    new_labels: `List[str]`
        The label strings to add to the vocabulary
    """
    for namespace_vocab in [
            vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE),
            vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE),
    ]:
        tokens = list(namespace_vocab.keys())
        for token in tokens:
            del namespace_vocab[token]

    extend_labels(vocab, new_labels)
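# Hedged round-trip sketch for the two helpers above, assuming LABELS_NAMESPACE
# names the label namespace and that extend_labels (not shown here) registers
# new tokens in insertion order.
vocab = Vocabulary()
set_labels(vocab, ["positive", "negative"])
assert get_labels(vocab) == ["positive", "negative"]
set_labels(vocab, ["yes", "no", "maybe"])  # old labels cleared, new ones added
assert get_labels(vocab) == ["yes", "no", "maybe"]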
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             attend_feedforward: FeedForward,
             similarity_function: SimilarityFunction,
             compare_feedforward: FeedForward,
             aggregate_feedforward: FeedForward,
             premise_encoder: Optional[Seq2SeqEncoder] = None,
             hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             similarity_weight: int = 30) -> None:
    super(DecomposableAttentionModified, self).__init__(vocab, regularizer)

    self.label_map = vocab.get_token_to_index_vocabulary('labels')
    label_map = [None] * len(self.label_map)
    for lb, lb_idx in self.label_map.items():
        label_map[lb_idx] = lb
    self.label_map = label_map

    self._text_field_embedder = text_field_embedder
    self._attend_feedforward = TimeDistributed(attend_feedforward)
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._compare_feedforward = TimeDistributed(compare_feedforward)
    self._aggregate_feedforward = aggregate_feedforward
    self._premise_encoder = premise_encoder
    self._hypothesis_encoder = hypothesis_encoder or premise_encoder

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           attend_feedforward.get_input_dim(),
                           "text field embedding dim",
                           "attend feedforward input dim")
    check_dimensions_match(aggregate_feedforward.get_output_dim(),
                           self._num_labels,
                           "final output dimension",
                           "number of labels")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)

    self.lambda_layer = nn.Sequential(nn.Linear(16, 1, bias=False),
                                      MyActivationFunction())
    self.lambda_layer[0].weight.data = torch.tensor([[0.1, 0.5, 0.5, 0.5,
                                                      0.5, 0.1, 0.5, 0.5,
                                                      0.5, 0.5, 0.1, 0.5,
                                                      0.5, 0.5, 0.5, 0.9]])
    self.similarity_weight = similarity_weight
def __init__(self,
             vocab: Vocabulary,
             vocab_namespace: str,
             projection_dim: int = None,
             ignore_oov: bool = False) -> None:
    super().__init__()
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size(vocab_namespace)
    if projection_dim:
        self._projection = torch.nn.Linear(self.vocab_size, projection_dim)
    else:
        self._projection = None
    self._ignore_oov = ignore_oov
    oov_token = vocab._oov_token  # pylint: disable=protected-access
    self._oov_idx = vocab.get_token_to_index_vocabulary(vocab_namespace).get(oov_token)
    if self._oov_idx is None:
        raise ConfigurationError(
            "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace))
    self.output_dim = projection_dim or self.vocab_size
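# Hedged instantiation sketch for the bag-of-word-counts embedder above. The
# enclosing class name is not shown in this snippet, so
# `BagOfWordCountsTokenEmbedder` is a stand-in; this projects the
# vocabulary-sized count vector down to 64 dimensions.
embedder = BagOfWordCountsTokenEmbedder(vocab, "tokens", projection_dim=64)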
def __init__(
    self,
    vocabulary: Vocabulary,
    image_feature_size: Tuple[int, int, int] = (1024, 14, 14),
    module_channels: int = 128,
    class_projection_channels: int = 1024,
    classifier_linear_size: int = 1024,
):
    super().__init__()
    self.vocabulary = vocabulary

    # Short-hand notations for convenience.
    __channels, __height, __width = image_feature_size

    # Exclude the "@@UNKNOWN@@" answer token; our network will never generate this
    # output through a regular forward pass. We set the answer output to
    # "@@UNKNOWN@@" when sampled programs are invalid. __num_answers will be 28
    # for all practical purposes.
    __num_answers = len(vocabulary.get_index_to_token_vocabulary(namespace="answers")) - 1

    # The stem takes features from ResNet (or another feature extractor) and
    # projects them down to a lower-dimensional space for sending through the
    # Neural Module Network.
    self.stem = nn.Sequential(
        nn.Conv2d(image_feature_size[0], module_channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(module_channels, module_channels, kernel_size=3, padding=1),
        nn.ReLU(),
    )

    # The classifier takes the output of the last module (which will be a Query
    # or Equal module) and produces a distribution over answers.
    self.classifier = nn.Sequential(
        nn.Conv2d(module_channels, class_projection_channels, kernel_size=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        Flatten(),
        nn.Linear(class_projection_channels * __height * __width // 4,
                  classifier_linear_size),
        nn.ReLU(),
        nn.Linear(classifier_linear_size, __num_answers),  # note: no softmax here
    )

    # Instantiate a module for each program token in our vocabulary.
    self._function_modules: Dict[str, Optional[nn.Module]] = {}
    for program_token in vocabulary.get_token_to_index_vocabulary("programs"):
        # We don't need modules for the placeholders.
        if program_token in ["@@PADDING@@", "@@UNKNOWN@@", "@start@", "@end@", "unique"]:
            continue

        # Figure out which module we want to use.
        if program_token == "scene":
            # "scene" is just a flag that indicates the start of a new line of
            # reasoning; we set `module` to `None` because we still need the
            # "scene" flag in forward().
            module = None
        elif program_token == "intersect":
            module = AndModule()
        elif program_token == "union":
            module = OrModule()
        elif "equal" in program_token or program_token in {"less_than", "greater_than"}:
            module = ComparisonModule(module_channels)
        elif "query" in program_token or program_token in {"exist", "count"}:
            module = QueryModule(module_channels)
        elif "relate" in program_token:
            module = RelateModule(module_channels)
        elif "same" in program_token:
            module = SameModule(module_channels)
        else:
            module = AttentionModule(module_channels)

        # Add the module to our dictionary and register its parameters so it can learn.
        self._function_modules[program_token] = module
        self.add_module(program_token, module)

    # Cross-entropy loss for answer classification.
    self._loss = nn.CrossEntropyLoss(reduction="none")

    # Record accuracy during training and validation.
    self._answer_accuracy = BooleanAccuracy()
    # Record the average number of invalid programs per batch.
    self._average_invalid_programs = Average()
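# Illustrative lookup during the forward pass (which is not shown in this
# snippet): each program token maps to its instantiated module, or to None for
# the "scene" flag. "count" is an example token handled by the dispatch above.
module = self._function_modules["count"]       # a QueryModule instance
scene_flag = self._function_modules["scene"]   # None, by construction above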
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             context_layer: Seq2SeqEncoder,
             relex_feedforward: FeedForward,
             antecedent_feedforward: FeedForward,
             feature_size: int,
             max_span_width: int,
             spans_per_word: float,
             relex_spans_per_word: float,
             max_antecedents: int,
             mention_feedforward: FeedForward,
             coref_mention_feedforward: FeedForward = None,
             relex_mention_feedforward: FeedForward = None,
             symmetric_relations: bool = False,
             lexical_dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             loss_coref_weight: float = 1,
             loss_relex_weight: float = 1,
             loss_ner_weight: float = 1,
             preserve_metadata: List = None,
             relex_namespace: str = 'relation_labels') -> None:
    # If separate coref-mention and relex-mention feedforward scorers
    # are not provided, share the one of the NER module.
    if coref_mention_feedforward is None:
        coref_mention_feedforward = mention_feedforward
    if relex_mention_feedforward is None:
        relex_mention_feedforward = mention_feedforward

    super().__init__(vocab, text_field_embedder, context_layer,
                     coref_mention_feedforward, antecedent_feedforward,
                     feature_size, max_span_width, spans_per_word,
                     max_antecedents, lexical_dropout, initializer, regularizer)

    self._symmetric_relations = symmetric_relations
    self._relex_spans_per_word = relex_spans_per_word
    self._loss_coref_weight = loss_coref_weight
    self._loss_relex_weight = loss_relex_weight
    self._loss_ner_weight = loss_ner_weight
    self._preserve_metadata = preserve_metadata or ['id']
    self._relex_namespace = relex_namespace

    relex_labels = list(vocab.get_token_to_index_vocabulary(self._relex_namespace))
    self._relex_mention_recall = RelexMentionRecall()
    self._relex_precision_recall_fscore = PrecisionRecallFScore(labels=relex_labels)

    relex_mention_scorer = Sequential(
        TimeDistributed(relex_mention_feedforward),
        TimeDistributed(Projection(relex_mention_feedforward.get_output_dim())))
    self._relex_mention_pruner = MultiTimeDistributed(Pruner(relex_mention_scorer))

    self._ner_scorer = Sequential(
        TimeDistributed(mention_feedforward),
        TimeDistributed(Projection(mention_feedforward.get_output_dim(),
                                   vocab.get_vocab_size('ner_labels'),
                                   with_dummy=True)))

    self._relex_scorer = Sequential(
        TimeDistributed(relex_feedforward),
        TimeDistributed(Projection(relex_feedforward.get_output_dim(),
                                   vocab.get_vocab_size(self._relex_namespace),
                                   with_dummy=True)))
def __init__(self,
             vocab: Vocabulary,
             span_graph_encoder: SpanGraphEncoder,
             span_typer: SpanTyper,
             embed_size: int,
             label_namespace: str = 'span_labels',
             event_namespace: str = 'event_labels',
             use_event_embedding: bool = True):
    super(SelectorArgLinking, self).__init__()
    self.vocab: Vocabulary = vocab
    self.label_namespace: str = label_namespace
    self.event_namespace: str = event_namespace
    self.use_event_embedding = use_event_embedding

    self.embed_size = embed_size
    self.event_embedding_size = 50

    # self.span_finder: SpanFinder = span_finder
    # self.span_selector: SpanSelector = span_selector
    if use_event_embedding:
        self.event_embeddings: nn.Embedding = nn.Embedding(
            num_embeddings=len(vocab.get_token_to_index_vocabulary(namespace=event_namespace)),
            embedding_dim=self.event_embedding_size)
    self.lexical_dropout = nn.Dropout(p=0.2)
    # self.contextualized_encoder: Seq2SeqEncoder = LstmSeq2SeqEncoder(
    #     bidirectional=True,
    #     input_size=embed_size,
    #     hidden_size=embed_size,
    #     num_layers=2,
    #     dropout=0.4
    # )
    self.span_graph_encoder: SpanGraphEncoder = span_graph_encoder
    self.span_extractor: SpanExtractor = EndpointSpanExtractor(
        # input_dim=self.contextualized_encoder.get_output_dim(),
        input_dim=self.embed_size,
        combination='x,y')
    self.attentive_span_extractor: SpanExtractor = SelfAttentiveSpanExtractor(embed_size)
    self.arg_affine = TimeDistributed(FeedForward(
        input_dim=self.span_extractor.get_output_dim() +
        self.attentive_span_extractor.get_output_dim(),
        hidden_dims=self.span_graph_encoder.get_input_dim(),
        num_layers=2,
        activations=nn.GELU(),
        dropout=0.2))
    self.trigger_affine = FeedForward(
        input_dim=self.span_extractor.get_output_dim() +
        self.attentive_span_extractor.get_output_dim(),
        hidden_dims=self.span_graph_encoder.get_input_dim() -
        (self.event_embedding_size if use_event_embedding else 0),
        num_layers=2,
        activations=nn.GELU(),
        dropout=0.2)
    # self.arg_affine: nn.Linear = nn.Linear(
    #     self.span_extractor.get_output_dim() + self.attentive_span_extractor.get_output_dim(),
    #     self.span_graph_encoder.get_input_dim()
    # )
    # self.trigger_affine: nn.Linear = nn.Linear(
    #     self.span_extractor.get_output_dim() + self.attentive_span_extractor.get_output_dim(),
    #     self.span_graph_encoder.get_input_dim()
    # )
    # self.trigger_event_infuse: nn.Sequential = nn.Sequential(
    #     nn.Dropout(p=0.1),
    #     nn.Linear(4 * self.span_graph_encoder.get_input_dim(), 2 * self.span_graph_encoder.get_input_dim()),
    #     nn.Dropout(p=0.1),
    #     nn.GELU(),
    #     nn.Linear(2 * self.span_graph_encoder.get_input_dim(), self.span_graph_encoder.get_input_dim()),
    #     nn.Dropout(p=0.1),
    #     nn.GELU()
    # )
    self.span_typer: SpanTyper = span_typer

    self.apply(self._init_weights)
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    doc_encoder: Seq2VecEncoder,
    const_path: str,
    tokens_namespace: str,
    use_sim: bool = True,
    use_classifier: bool = True,
) -> None:
    super().__init__(vocab)
    self.vocab = vocab
    self.num_tags = vocab.get_vocab_size("labels")

    self._token_embedder = text_field_embedder
    self._doc_encoder = doc_encoder

    if not use_sim:
        raise Exception("use_sim option is false, but it must be true for this to work")
    if use_classifier:
        print("Warning: use_classifier option does nothing now...")

    self.use_sim = use_sim
    self.use_classifier = use_classifier

    # I actually want to use the one from the config, but not sure how to do that.
    _spacy_word_splitter = SpacyWordSplitter()
    token_indexer = PretrainedBertIndexer("bert-base-cased",
                                          do_lowercase=False,
                                          use_starting_offsets=True)

    jc = JsonConverter()
    const, links = jc._read_const(const_path)

    # The extra 1 is for the "unmatched" label.
    print(vocab.get_token_to_index_vocabulary("labels"))
    print(const.keys())
    assert self.num_tags == len(const) + 1, \
        "Num tags ({}) doesn't match the size of the constitution+1 ({})".format(
            self.num_tags, len(const) + 1)

    if self.use_sim:
        # Create the constitution matrix. Every element is one of the groups.
        tagmap = self.vocab.get_index_to_token_vocabulary("labels")
        self.const_dict = {}
        indices = []
        for i in range(self.num_tags):
            tagname = tagmap[i]
            if tagname != "unmatched":
                const_text = const[tagname]
            else:
                const_text = "@@pad@@"
            const_toks = _spacy_word_splitter.split_words(const_text)
            # Truncate so BERT is happy.
            const_toks = const_toks[:250]
            const_indices = token_indexer.tokens_to_indices(const_toks, vocab, tokens_namespace)
            indices.append(const_indices)

        max_len = max(map(lambda j: len(j[tokens_namespace]), indices))
        max_offset_len = max(map(lambda j: len(j["tokens-offsets"]), indices))

        const_tensor = torch.zeros(self.num_tags, max_len).long()
        const_tensor_offsets = torch.zeros(self.num_tags, max_offset_len).long()
        const_tensor_mask = torch.zeros(self.num_tags, max_offset_len).long()
        for i, ind in enumerate(indices):
            toks = ind[tokens_namespace]
            mask = ind["mask"]
            const_tensor[i, :len(toks)] = torch.LongTensor(toks)
            const_tensor_offsets[i, :len(ind["tokens-offsets"])] = \
                torch.LongTensor(ind["tokens-offsets"])
            const_tensor_mask[i, :len(mask)] = torch.LongTensor(mask)

        const_tokens = {
            tokens_namespace: const_tensor,
            "tokens-offsets": const_tensor_offsets,
            "mask": const_tensor_mask,
        }

        print("Embedding the constitution... this could take a minute...")
        self.const_mask = util.get_text_field_mask(const_tokens)
        self.const_emb = self._token_embedder(const_tokens).detach()
        print("Done embedding the constitution.")

        if torch.cuda.is_available():
            self.const_emb = self.const_emb.cuda()
            self.const_mask = self.const_mask.cuda()

    self.vectorf1 = VectorF1(unmatched_index=self.vocab.get_token_index(
        "unmatched", namespace="labels"))

    # self.metric = F1Measure(positive_label=1)
    # self.ff = FeedForward(doc_encoder.get_output_dim(), num_layers=4,
    #                       hidden_dims=100,
    #                       activations=Activation.by_name("relu")())
    # self.tag_projection_layer = Linear(self.ff.get_output_dim(), self.num_tags)
    # self.choice_projection_layer = Linear(self.ff.get_output_dim(), 2)

    self.sim_ff = TimeDistributed(
        FeedForward(doc_encoder.get_output_dim(),
                    num_layers=1,
                    hidden_dims=2,
                    activations=Activation.by_name("relu")()))
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             projection_feedforward: FeedForward,
             key_projection_feedforward: FeedForward,
             inference_encoder: Seq2SeqEncoder,
             link_key_encoder: Seq2SeqEncoder,
             key_compare_feedforward: FeedForward,
             output_feedforward: FeedForward,
             output_logit: FeedForward,
             dropout: float = 0.5,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    # Invert the label map into an index-ordered list of label strings.
    self.label_map = vocab.get_token_to_index_vocabulary('labels')
    l_map = [None] * len(self.label_map)
    for lb, lb_idx in self.label_map.items():
        l_map[lb_idx] = lb
    self.label_map = l_map

    self._text_field_embedder = text_field_embedder
    self._word_embedding_dimension = text_field_embedder.get_output_dim()
    self._sentence_encoder = encoder
    self._encoded_word_dimension = self._sentence_encoder.get_output_dim()

    self._matrix_attention = DotProductMatrixAttention()
    self._projection_feedforward = projection_feedforward
    self._key_projection_feedforward = key_projection_feedforward

    self._inference_encoder = inference_encoder
    self._link_key_encoder = link_key_encoder
    self._embedded_key_dimension = self._link_key_encoder.get_output_dim()
    self._key_compare_feedforward = key_compare_feedforward

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
        self.rnn_input_dropout = InputVariationalDropout(dropout)
    else:
        self.dropout = None
        self.rnn_input_dropout = None

    self._output_feedforward = output_feedforward
    self._output_logit = output_logit

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")
    check_dimensions_match(encoder.get_output_dim() * 4,
                           projection_feedforward.get_input_dim(),
                           "encoder output dim", "projection feedforward input")
    check_dimensions_match(encoder.get_output_dim() * 4,
                           key_projection_feedforward.get_input_dim(),
                           "encoder output dim", "projection feedforward input")
    check_dimensions_match(projection_feedforward.get_output_dim(),
                           inference_encoder.get_input_dim(),
                           "proj feedforward output dim", "inference lstm input dim")
    check_dimensions_match(key_projection_feedforward.get_output_dim(),
                           link_key_encoder.get_input_dim(),
                           "key proj feedforward output dim", "link key lstm input dim")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             similarity_function: SimilarityFunction,
             projection_feedforward: FeedForward,
             inference_encoder: Seq2SeqEncoder,
             output_feedforward: FeedForward,
             output_logit: FeedForward,
             dropout: float = 0.5,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             similarity_weight: int = 30) -> None:
    super().__init__(vocab, regularizer)

    self.label_map = vocab.get_token_to_index_vocabulary('labels')
    label_map = [None] * len(self.label_map)
    for lb, lb_idx in self.label_map.items():
        label_map[lb_idx] = lb
    self.label_map = label_map

    self._text_field_embedder = text_field_embedder
    self._encoder = encoder

    print(similarity_function)
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._projection_feedforward = projection_feedforward

    self._inference_encoder = inference_encoder

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
        self.rnn_input_dropout = InputVariationalDropout(dropout)
    else:
        self.dropout = None
        self.rnn_input_dropout = None

    self._output_feedforward = output_feedforward
    self._output_logit = output_logit

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")
    check_dimensions_match(encoder.get_output_dim() * 4,
                           projection_feedforward.get_input_dim(),
                           "encoder output dim", "projection feedforward input")
    check_dimensions_match(projection_feedforward.get_output_dim(),
                           inference_encoder.get_input_dim(),
                           "proj feedforward output dim", "inference lstm input dim")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)

    self.lambda_layer = nn.Sequential(nn.Linear(16, 1, bias=False),
                                      MyActivationFunction())
    self.lambda_layer[0].weight.data = torch.tensor([[0.1, 0.5, 0.5, 0.5,
                                                      0.5, 0.1, 0.5, 0.5,
                                                      0.5, 0.5, 0.1, 0.5,
                                                      0.5, 0.5, 0.5, 0.9]])
    self.similarity_weight = similarity_weight
    print("SIMILARITY WEIGHT BEING USED IS : {0}".format(self.similarity_weight))
def from_concrete(cls,
                  data_path: str,
                  cache_file: str,
                  vocab: Vocabulary,
                  ontology: Dict,
                  max_num_spans: int = 512,
                  task: str = 'argidcls',
                  sentence_mode: bool = False) -> 'ConcreteDataset':

    def _build_role_type_mask(vocab: Vocabulary) -> torch.Tensor:
        role_type_mask_list: List[List[int]] = []
        all_role_types: List[str] = [
            r for r, _ in sorted(
                vocab.get_token_to_index_vocabulary(namespace='span_labels').items(),
                key=lambda t: t[1])
        ]
        for event_type, _ in sorted(
                vocab.get_token_to_index_vocabulary(namespace='event_labels').items(),
                key=lambda t: t[1]):
            role_type_mask_list.append([
                (1 if k in ontology['events'][event_type]['roles'].keys()
                 or (i == 0 and task in ['argidcls', 'argidcls-noisy']) else 0)
                for i, k in enumerate(all_role_types)
            ])
        return torch.tensor(role_type_mask_list, dtype=torch.bool)  # [num_events, num_roles]

    def _to_predictive_span_finder_gold(spans: List[Span]) -> Tuple[torch.Tensor, torch.Tensor]:
        sorted_spans: List[Span] = sorted(spans, key=lambda s: s.start)
        gold: List[int] = [0 for _ in range((sorted_spans[-1].start + 1) if len(sorted_spans) > 0 else 0)]
        gold_mask: List[int] = [0 for _ in range((sorted_spans[-1].start + 1) if len(sorted_spans) > 0 else 0)]
        for span in sorted_spans:
            gold[span.start] = span.end + 1  # shift one for null span
            gold_mask[span.start] = 1
        return (torch.tensor(gold, dtype=torch.long),
                torch.tensor(gold_mask, dtype=torch.bool))

    def _tensorize_spans(evnt: Event) -> Tuple[torch.Tensor, torch.Tensor]:
        span_indices: List[Tuple[int, int]] = [(evnt.trigger.start, evnt.trigger.end)]
        span_types: List[int] = [vocab.get_token_index(token='None', namespace='span_labels')]
        if task in ['argidcls', 'argidcls-noisy']:
            mention_list = evnt.document.argument_mentions
        else:
            mention_list = evnt.arguments
        for mention in mention_list:
            if (mention.start, mention.end) == (evnt.trigger.start, evnt.trigger.end):
                continue
            span_indices.append((mention.start, mention.end))
            arg: Optional[Argument] = evnt.find_arg_by_indices(indices=(mention.start, mention.end))
            span_types.append(
                vocab.get_token_index(token='None', namespace='span_labels')
                if arg is None
                else vocab.get_token_index(token=arg.role, namespace='span_labels'))
        return (torch.tensor(span_indices, dtype=torch.long).view([1, -1, 2]),
                torch.tensor(span_types, dtype=torch.long).view(1, -1))

    def _tensorize_spans_sentence_level(
            evnt: Event,
            grouped_mentions: Optional[Dict[int, List[Tuple[Tuple[int, int], Span]]]]
    ) -> Tuple[int, torch.Tensor, torch.Tensor]:
        span_types: List[int] = [vocab.get_token_index(token='None', namespace='span_labels')]
        trigger_sent_ids, trigger_indices = evnt.document.global_to_local_spans(
            spans=[(evnt.trigger.start, evnt.trigger.end)])
        trigger_sent_id = trigger_sent_ids[0]
        trigger_indices = trigger_indices[0]
        spans: List[Tuple[int, int]] = [trigger_indices]
        if task in ['argidcls', 'argidcls-noisy']:
            span_list = grouped_mentions[trigger_sent_id]
        else:  # argcls
            arg_sent_ids, arg_indices = evnt.document.global_to_local_spans(
                spans=[(arg.start, arg.end) for arg in evnt.arguments])
            span_list = [(t, evnt.arguments[i]) for i, t in enumerate(arg_indices)]
        for t, s in span_list:
            arg: Optional[Argument] = evnt.find_arg_by_indices(indices=(s.start, s.end))
            spans.append(t)
            span_types.append(
                vocab.get_token_index(token='None', namespace='span_labels')
                if arg is None
                else vocab.get_token_index(token=arg.role, namespace='span_labels'))
        return (trigger_sent_id,
                torch.tensor(spans, dtype=torch.long).view([1, -1, 2]),
                torch.tensor(span_types, dtype=torch.long).view(1, -1))

    def _group_mentions_by_sentence(d: Document) -> Dict[int, List[Tuple[Tuple[int, int], Span]]]:
        if task == 'argidcls-noisy':
            mention_list = d.argument_mentions
        elif task == 'argidcls':
            mention_list = []
            for e in d.events:
                mention_list.extend(e.arguments)
        else:
            raise NotImplementedError
        sent_ids, span_indices = d.global_to_local_spans(spans=[(s.start, s.end) for s in mention_list])
        grouped_spans = defaultdict(list)
        for sent_id, t_list in groupby(zip(sent_ids, span_indices, mention_list), key=lambda k: k[0]):
            ss = [(t[1], t[2]) for t in t_list]
            grouped_spans[sent_id].extend(ss)
        return grouped_spans

    docs: List[Document] = cls.load_documents_from_concrete_dir(dir=data_path, task=task)
    cache: h5py.File = h5py.File(cache_file, mode='r')
    role_type_mask: torch.Tensor = _build_role_type_mask(vocab=vocab)  # [num_events, num_roles]
    num_events: int = len(vocab.get_token_to_index_vocabulary(namespace='event_labels'))
    num_roles: int = len(vocab.get_token_to_index_vocabulary(namespace='span_labels'))
    ins_to_event: Dict[int, Event] = {}
    instances: List[InputInstance] = []
    padding_tensor: torch.Tensor = torch.tensor(
        [[vocab.get_token_index(token='@@PADDING@@', namespace='span_labels')]],
        dtype=torch.long)  # [1, 1]
    metadata: Dict[int, Dict[str, Any]] = {}
    id: int = 0
    for doc in tqdm(docs):
        # sequence_tensor: torch.Tensor = _load_cache(doc.doc_key)
        if sentence_mode and task in ['argidcls', 'argidcls-noisy']:
            grouped_mentions: Optional[Dict[int, List[Tuple[Tuple[int, int], Span]]]] = \
                _group_mentions_by_sentence(doc)
        else:
            grouped_mentions = None
        for e in doc.events:  # but RAMS only has one event per document
            ins_metadata = {}
            if sentence_mode:
                sent_id, span_indices, span_types = _tensorize_spans_sentence_level(
                    evnt=e, grouped_mentions=grouped_mentions)
                ins_metadata['sentence_id'] = sent_id
            else:
                span_indices, span_types = _tensorize_spans(evnt=e)
            if task == 'emd':
                gold_span_indices, gold_span_indices_mask = _to_predictive_span_finder_gold(
                    [arg for arg in e.arguments])
            if span_indices.shape[1] == 1:
                logger.info('Example has no arguments.')
                continue
            new_ins: InputInstance = InputInstance(
                id=torch.tensor([id], dtype=torch.long),
                # sequence_tensor=sequence_tensor.view(1, sequence_tensor.shape[0], sequence_tensor.shape[1]),
                event_type=torch.tensor(
                    [vocab.get_token_index(token=e.kind, namespace='event_labels')],
                    dtype=torch.long),
                span_indices=torch.cat([
                    span_indices,
                    torch.zeros([1, max_num_spans - span_indices.shape[1], 2], dtype=torch.long)
                ], dim=1) if task != 'emd' else None,
                span_indices_mask=torch.cat([
                    torch.ones([1, span_indices.shape[1]], dtype=torch.bool),
                    torch.zeros([1, max_num_spans - span_indices.shape[1]], dtype=torch.bool)
                ], dim=1) if task != 'emd' else None,
                type_mask=role_type_mask[
                    vocab.get_token_index(e.kind, namespace='event_labels'), :
                ].view(1, -1) if task != 'emd' else None,
                span_types=torch.cat([
                    span_types,
                    padding_tensor.expand([1, max_num_spans - span_indices.shape[1]])
                ], dim=1) if task != 'emd' else None,
                gold_span_indices=gold_span_indices.view(1, -1) if task == 'emd' else None,
                gold_span_indices_mask=gold_span_indices_mask.view(1, -1) if task == 'emd' else None)
            instances.append(new_ins)
            ins_to_event[id] = e
            metadata[id] = ins_metadata
            id += 1
    return cls(docs=docs,
               ins_to_event=ins_to_event,
               vocab=vocab,
               cache_file=cache,
               role_type_mask=role_type_mask,
               num_events=num_events,
               num_roles=num_roles,
               instances=instances,
               max_num_spans=max_num_spans,
               sentence_mode=sentence_mode,
               metadata=metadata)
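# Hedged usage sketch for from_concrete above; the paths and the ontology dict
# are placeholders, and `vocab` is assumed to already contain the
# 'event_labels' and 'span_labels' namespaces the builder reads from.
dataset = ConcreteDataset.from_concrete(
    data_path='data/rams/train',
    cache_file='data/rams/train_features.h5',
    vocab=vocab,
    ontology=ontology,
    task='argidcls',
    sentence_mode=True,
)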