def _process_sentence(self, sent: Sentence, dataset: str):
    """
    Build the dictionary of AllenNLP fields for a single sentence.

    The sentence tokens are normalized with `self._normalize_word`, every span
    produced by `enumerate_spans` (bounded by `self._max_span_width`) becomes a
    `SpanField`, and each annotation layer present on `sent` is converted into
    a label field.

    Parameters
    ----------
    sent : Sentence
        The sentence to convert. Reads `sent.text`, and checks `sent.ner`,
        `sent.cluster_dict`, `sent.relations` and `sent.events` against `None`
        to decide which label fields to emit.
    dataset : str
        Dataset name; used as the prefix of the NER, relation, trigger and
        argument label namespaces (`"{dataset}__..."`). Coreference labels use
        the shared `"coref_labels"` namespace instead.

    Returns
    -------
    dict
        Always contains `"text"` and `"spans"`. Depending on which annotations
        are not `None`, it also contains `"ner_labels"`, `"coref_labels"`,
        `"relation_labels"`, `"trigger_labels"` and `"argument_labels"`.
    """
    tokens = [self._normalize_word(raw_word) for raw_word in sent.text]
    text_field = TextField([Token(tok) for tok in tokens], self._token_indexers)

    # One SpanField per candidate span, plus the matching (start, end) tuples
    # that the `_process_*` helpers receive.
    candidate_spans = [
        SpanField(left, right, text_field)
        for left, right in enumerate_spans(tokens, max_span_width=self._max_span_width)
    ]
    span_field = ListField(candidate_spans)
    span_tuples = [(field.span_start, field.span_end) for field in candidate_spans]

    fields = {"text": text_field, "spans": span_field}

    # NOTE: NER and coref labels are stored as a `ListField` of `LabelField`s
    # rather than as a `SequenceLabelField` over the span field, because
    # `as_tensor_dict()` fails on the latter representation (an AllenNLP API
    # limitation).
    if sent.ner is not None:
        ner_namespace = f"{dataset}__ner_labels"
        fields["ner_labels"] = ListField([
            LabelField(label, label_namespace=ner_namespace)
            for label in self._process_ner(span_tuples, sent)
        ])

    if sent.cluster_dict is not None:
        # Coref labels are ints, so they bypass vocabulary indexing.
        fields["coref_labels"] = ListField([
            LabelField(label, label_namespace="coref_labels", skip_indexing=True)
            for label in self._process_coref(span_tuples, sent)
        ])

    if sent.relations is not None:
        rel_labels, rel_indices = self._process_relations(span_tuples, sent)
        fields["relation_labels"] = AdjacencyField(
            indices=rel_indices,
            sequence_field=span_field,
            labels=rel_labels,
            label_namespace=f"{dataset}__relation_labels")

    if sent.events is not None:
        trig_labels, arg_labels, arg_indices = self._process_events(span_tuples, sent)
        fields["trigger_labels"] = SequenceLabelField(
            trig_labels, text_field, label_namespace=f"{dataset}__trigger_labels")
        # Arguments relate a token (row) to a span (column).
        fields["argument_labels"] = AdjacencyFieldAssym(
            indices=arg_indices,
            row_field=text_field,
            col_field=span_field,
            labels=arg_labels,
            label_namespace=f"{dataset}__argument_labels")

    return fields
def text_to_instance(self,
                     sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     cluster_dict,
                     trigger_dict,
                     argument_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int,
                     tree: Dict[str, Any],
                     syntax_dict: Dict[Tuple[int, int], str],
                     children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     dep_children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     tf_dict: Dict[Tuple[int, int], Any]):
    """
    Convert one annotated sentence into an AllenNLP `Instance`.

    Every label dictionary is read with `[]` for *every* candidate key (each
    enumerated span, each span pair, each token pair, ...), so each one must
    return a value for keys that carry no annotation.
    NOTE(review): presumably these are `defaultdict`s built by the caller --
    confirm.

    Parameters
    ----------
    sentence : tokens of the sentence; normalized with `self._normalize_word`.
    ner_dict : `(start, end)` span -> NER label; `''` is treated as "no entity"
        when deriving the binary `span_labels`.
    relation_dict : `((start, end), (start, end))` span pair -> relation label;
        falsy labels are skipped.
    cluster_dict : `(start, end)` span -> coreference label.
    trigger_dict : token index -> trigger label.
    argument_dict : `(token index, (start, end))` -> argument label; falsy
        labels are skipped.
    doc_key, dataset, sentence_num, start_ix, end_ix, tree : stored in the
        metadata field only.
    groups : tokens used for the `"text"` field of the instance (all other
        fields are built over `sentence`). Presumably the sentence together
        with surrounding context -- verify against the caller.
    syntax_dict : `(start, end)` span -> syntax label.
    children_dict : `(start, end)` span -> list of child spans. Must have the
        same number of entries as `syntax_dict`. Child spans are assumed to be
        hashable tuples, as the annotation states.
    dep_children_dict : `(i, j)` token pair -> label; falsy labels are skipped.
        (The value annotation in the signature is kept from the original.)
    tf_dict : `(i, j)` token pair -> feature; falsy features are skipped.

    Returns
    -------
    Instance with fields `text`, `spans`, `ner_labels`, `coref_labels`,
    `trigger_labels`, `argument_labels`, `relation_labels`, `metadata`,
    `span_labels`, `ner_sequence_labels`, `syntax_labels`, `span_children`,
    `span_tree_labels`, `dep_span_children` and `tf`.

    Raises
    ------
    AssertionError
        If `syntax_dict` and `children_dict` differ in length.
    """
    sentence = [self._normalize_word(word) for word in sentence]
    text_field = TextField([Token(word) for word in sentence], self._token_indexers)
    text_field_with_context = TextField([Token(word) for word in groups],
                                        self._token_indexers)

    # Token-level NER labels (in addition to the span-level labels below).
    ner_sequence_labels = self._generate_ner_label(sentence, ner_dict)
    ner_sequence_label_field = SequenceLabelField(
        ner_sequence_labels, text_field, label_namespace="ner_sequence_labels")

    # Put together the metadata.
    metadata = dict(sentence=sentence,
                    ner_dict=ner_dict,
                    relation_dict=relation_dict,
                    cluster_dict=cluster_dict,
                    trigger_dict=trigger_dict,
                    argument_dict=argument_dict,
                    doc_key=doc_key,
                    dataset=dataset,
                    groups=groups,
                    start_ix=start_ix,
                    end_ix=end_ix,
                    sentence_num=sentence_num,
                    seq_dict=ner_sequence_labels,
                    tree=tree,
                    syntax_dict=syntax_dict,
                    children_dict=children_dict,
                    dep_children_dict=dep_children_dict)
    metadata_field = MetadataField(metadata)

    # Trigger labels. One label per token in the input.
    token_trigger_labels = [trigger_dict[i] for i in range(len(text_field))]
    trigger_label_field = SequenceLabelField(
        token_trigger_labels, text_field, label_namespace="trigger_labels")

    # Per-span labels: NER, binary "is an entity", coref, syntax, and whether
    # the span is in the tree.
    spans = []
    raw_spans = []
    span_ner_labels = []
    span_labels = []
    span_coref_labels = []
    span_syntax_labels = []
    span_tree_labels = []

    assert len(syntax_dict) == len(children_dict)

    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        span_ix = (start, end)
        in_tree = self._is_span_in_tree(span_ix, syntax_dict, children_dict)
        span_tree_labels.append('1' if in_tree else '')
        ner_label = ner_dict[span_ix]
        span_ner_labels.append(ner_label)
        span_labels.append('' if ner_label == '' else '1')
        span_coref_labels.append(cluster_dict[span_ix])
        spans.append(SpanField(start, end, text_field))
        span_syntax_labels.append(syntax_dict[span_ix])
        raw_spans.append(span_ix)

    span_field = ListField(spans)

    # Position of each span inside `span_field`. Built once so resolving the
    # children below is a dict lookup instead of a linear `list.index` scan
    # per child. First occurrence wins, matching `list.index`.
    span_to_index = {}
    for position, span in enumerate(raw_spans):
        span_to_index.setdefault(span, position)

    # For each span, the indices (into `span_field`) of its children. A child
    # that is not among the enumerated spans, or a span with no children at
    # all, is represented by index -1.
    span_children_labels = []
    for span in raw_spans:
        children = children_dict[span]
        if len(children) == 0:
            child_positions = [-1]
        else:
            child_positions = [span_to_index.get(child, -1) for child in children]
        span_children_labels.append(
            ListField([IndexField(position, span_field)
                       for position in child_positions]))

    # Token-pair labels and features. Only non-null values are stored.
    n_tokens = len(sentence)
    dep_adjs = []
    dep_adjs_indices = []
    tf_indices = []
    tf_features = []
    for i in range(n_tokens):
        for j in range(n_tokens):
            token_pair = (i, j)
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)
            feature = tf_dict[token_pair]
            if feature:
                tf_indices.append(token_pair)
                tf_features.append(feature)

    ner_label_field = SequenceLabelField(span_ner_labels, span_field,
                                         label_namespace="ner_labels")
    coref_label_field = SequenceLabelField(span_coref_labels, span_field,
                                           label_namespace="coref_labels")
    span_label_field = SequenceLabelField(span_labels, span_field,
                                          label_namespace="span_labels")

    # Relations: adjacency over (span, span) pairs. Only store non-null values.
    n_spans = len(spans)
    span_tuples = [(span.span_start, span.span_end) for span in spans]
    relations = []
    relation_indices = []
    for i in range(n_spans):
        for j in range(n_spans):
            relation_label = relation_dict[(span_tuples[i], span_tuples[j])]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

    relation_label_field = AdjacencyField(
        indices=relation_indices, sequence_field=span_field, labels=relations,
        label_namespace="relation_labels")

    # Event arguments: adjacency over (token, span) pairs. By convention the
    # token is the trigger and the span is the argument.
    arguments = []
    argument_indices = []
    for i in range(n_tokens):
        for j in range(n_spans):
            argument_label = argument_dict[(i, span_tuples[j])]
            if argument_label:
                argument_indices.append((i, j))
                arguments.append(argument_label)

    argument_label_field = AdjacencyFieldAssym(
        indices=argument_indices, row_field=text_field, col_field=span_field,
        labels=arguments, label_namespace="argument_labels")

    # Syntax.
    span_syntax_field = SequenceLabelField(
        span_syntax_labels, span_field, label_namespace="span_syntax_labels")
    span_children_field = ListField(span_children_labels)
    span_tree_field = SequenceLabelField(
        span_tree_labels, span_field, label_namespace="span_tree_labels")

    dep_span_children_field = AdjacencyField(
        indices=dep_adjs_indices, sequence_field=text_field, labels=dep_adjs,
        label_namespace="dep_adj_labels")

    tf_field = AdjacencyField(
        indices=tf_indices, sequence_field=text_field, labels=tf_features,
        label_namespace="tf_labels")

    # Pull it all together.
    fields = dict(text=text_field_with_context,
                  spans=span_field,
                  ner_labels=ner_label_field,
                  coref_labels=coref_label_field,
                  trigger_labels=trigger_label_field,
                  argument_labels=argument_label_field,
                  relation_labels=relation_label_field,
                  metadata=metadata_field,
                  span_labels=span_label_field,
                  ner_sequence_labels=ner_sequence_label_field,
                  syntax_labels=span_syntax_field,
                  span_children=span_children_field,
                  span_tree_labels=span_tree_field,
                  dep_span_children=dep_span_children_field,
                  tf=tf_field)

    return Instance(fields)
def text_to_instance(self,
                     sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     cluster_dict,
                     trigger_dict,
                     argument_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int):
    """
    Convert one annotated sentence into an AllenNLP `Instance`.

    Each label dictionary is indexed with `[]` for every candidate key, so it
    must yield a value even for keys without an annotation.
    NOTE(review): presumably these are `defaultdict`s -- confirm with caller.

    Parameters
    ----------
    sentence : tokens of the sentence; normalized with `self._normalize_word`.
    ner_dict : `(start, end)` span -> NER label.
    relation_dict : `(span, span)` pair -> relation label; falsy labels are
        skipped.
    cluster_dict : `(start, end)` span -> coreference label.
    trigger_dict : token index -> trigger label.
    argument_dict : `(token index, span)` -> argument label; falsy labels are
        skipped.
    doc_key, dataset, sentence_num, start_ix, end_ix : stored in the metadata
        field only.
    groups : tokens used to build the `"text"` field of the instance; the
        label fields are built over `sentence`.

    Returns
    -------
    Instance with fields `text`, `spans`, `ner_labels`, `coref_labels`,
    `trigger_labels`, `argument_labels`, `relation_labels` and `metadata`.
    """
    sentence = [self._normalize_word(raw_word) for raw_word in sentence]
    text_field = TextField([Token(tok) for tok in sentence], self._token_indexers)
    text_field_with_context = TextField([Token(tok) for tok in groups],
                                        self._token_indexers)

    # Everything the caller passed in, kept alongside the tensors.
    metadata_field = MetadataField({
        "sentence": sentence,
        "ner_dict": ner_dict,
        "relation_dict": relation_dict,
        "cluster_dict": cluster_dict,
        "trigger_dict": trigger_dict,
        "argument_dict": argument_dict,
        "doc_key": doc_key,
        "dataset": dataset,
        "groups": groups,
        "start_ix": start_ix,
        "end_ix": end_ix,
        "sentence_num": sentence_num,
    })

    # One trigger label per token.
    trigger_label_field = SequenceLabelField(
        [trigger_dict[tok_ix] for tok_ix in range(len(text_field))],
        text_field,
        label_namespace="trigger_labels")

    # Candidate spans with their NER and coref labels.
    span_fields = []
    ner_per_span = []
    coref_per_span = []
    for left, right in enumerate_spans(sentence, max_span_width=self._max_span_width):
        span_key = (left, right)
        ner_per_span.append(ner_dict[span_key])
        coref_per_span.append(cluster_dict[span_key])
        span_fields.append(SpanField(left, right, text_field))

    span_field = ListField(span_fields)
    ner_label_field = SequenceLabelField(ner_per_span, span_field,
                                         label_namespace="ner_labels")
    coref_label_field = SequenceLabelField(coref_per_span, span_field,
                                           label_namespace="coref_labels")

    span_tuples = [(field.span_start, field.span_end) for field in span_fields]

    # Relations between span pairs; only non-null labels are kept.
    relation_labels = []
    relation_positions = []
    for row, head_span in enumerate(span_tuples):
        for col, tail_span in enumerate(span_tuples):
            label = relation_dict[(head_span, tail_span)]
            if label:
                relation_positions.append((row, col))
                relation_labels.append(label)

    relation_label_field = AdjacencyField(
        indices=relation_positions,
        sequence_field=span_field,
        labels=relation_labels,
        label_namespace="relation_labels")

    # Event arguments between a token (the trigger, by convention) and a span
    # (the argument); only non-null labels are kept.
    argument_labels = []
    argument_positions = []
    for tok_ix in range(len(sentence)):
        for col, arg_span in enumerate(span_tuples):
            label = argument_dict[(tok_ix, arg_span)]
            if label:
                argument_positions.append((tok_ix, col))
                argument_labels.append(label)

    argument_label_field = AdjacencyFieldAssym(
        indices=argument_positions,
        row_field=text_field,
        col_field=span_field,
        labels=argument_labels,
        label_namespace="argument_labels")

    return Instance({
        "text": text_field_with_context,
        "spans": span_field,
        "ner_labels": ner_label_field,
        "coref_labels": coref_label_field,
        "trigger_labels": trigger_label_field,
        "argument_labels": argument_label_field,
        "relation_labels": relation_label_field,
        "metadata": metadata_field,
    })