def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = iob1_to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = iob1_to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(coded_ner, sequence)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence)

    return Instance(instance_fields)
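# Usage sketch (not part of the reader itself): `reader` below is a hypothetical,
# fully configured instance of this dataset reader, and the tags are illustrative
# IOB1 labels. With coding_scheme == "BIOUL" they are recoded before being stored.
#
#     tokens = [Token(t) for t in ["U.N.", "official", "Ekeus", "heads", "for", "Baghdad"]]
#     ner_tags = ["I-ORG", "O", "I-PER", "O", "O", "I-LOC"]
#     instance = reader.text_to_instance(tokens, ner_tags=ner_tags)
#     # instance.fields now holds 'tokens' and 'metadata', plus 'tags' if tag_label == 'ner'.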
def _conll_rows_to_sentence(self, conll_rows: List[str]) -> ACESentence:
    sentence: List[str] = []
    span_labels: List[List[str]] = []
    current_span_labels: List[Optional[str]] = []

    # Cluster id -> List of (start_index, end_index) spans.
    clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
    # Cluster id -> List of start_indices which are open for this id.
    coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

    for index, row in enumerate(conll_rows):
        conll_components = row.split()
        # The word is in column 1; columns 2..-2 hold span annotations and
        # the last column holds the coreference annotation.
        word = conll_components[1]

        # Lazily initialise one label list per span-annotation column.
        if not span_labels:
            span_labels = [[] for _ in conll_components[2:-1]]
            current_span_labels = [None for _ in conll_components[2:-1]]
        self._process_span_annotations_for_word(annotations=conll_components[2:-1],
                                                span_labels=span_labels,
                                                current_span_labels=current_span_labels)

        # Process coref
        self._process_coref_span_annotations_for_word(conll_components[-1],
                                                      index,
                                                      clusters,
                                                      coref_stacks)
        sentence.append(word)

    mention_tags = iob1_to_bioul(span_labels[0])

    # Process coref clusters
    coref_span_tuples: Set[TypedSpan] = {(cluster_id, span)
                                         for cluster_id, span_list in clusters.items()
                                         for span in span_list}

    # Reformat the labels to keep only the last token of each head.
    # Cf. the paper: we model relations between the last tokens of heads.
    last_head_token_relations = []
    bioul_relations = []
    for relation_frame in span_labels[1:]:
        bioul_relation_frame = iob1_to_bioul(relation_frame)
        reformatted_frame = []
        for annotation in bioul_relation_frame:
            if annotation[:2] in ["L-", "U-"]:
                reformatted_frame.append(annotation[2:])
            else:
                reformatted_frame.append("*")
        last_head_token_relations.append(reformatted_frame)
        bioul_relations.append(bioul_relation_frame)

    return ACESentence(sentence, mention_tags, bioul_relations,
                       last_head_token_relations, coref_span_tuples)
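# A worked example of the recoding above (tag values illustrative).
# `iob1_to_bioul` turns IOB1 spans into BIOUL, and only the last token of each
# head span (the "L-"/"U-" tags) keeps its label in `last_head_token_relations`;
# every other position becomes "*":
#
#     iob1_to_bioul(["I-ARG1", "I-ARG1", "O", "I-ARG2"])
#     # -> ["B-ARG1", "L-ARG1", "O", "U-ARG2"]
#     # reformatted frame: ["*", "ARG1", "*", "ARG2"]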
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternating divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in zip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]
                sequence = TextField(tokens, self._token_indexers)

                instance_fields: Dict[str, Field] = {'tokens': sequence}

                # Recode the labels if necessary.
                if self.coding_scheme == "BIOUL":
                    coded_chunks = iob1_to_bioul(chunk_tags)
                    coded_ner = iob1_to_bioul(ner_tags)
                else:
                    # the default IOB1
                    coded_chunks = chunk_tags
                    coded_ner = ner_tags

                # Add "feature labels" to instance
                if 'pos' in self.feature_labels:
                    instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
                if 'chunk' in self.feature_labels:
                    instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
                if 'ner' in self.feature_labels:
                    instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

                # Add "tag label" to instance
                if self.tag_label == 'ner':
                    instance_fields['tags'] = SequenceLabelField(coded_ner, sequence)
                elif self.tag_label == 'pos':
                    instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
                elif self.tag_label == 'chunk':
                    instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence)

                yield Instance(instance_fields)
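# Input layout assumed by the `zip(*fields)` unpacking above: four
# whitespace-separated columns per token line (word, POS tag, chunk tag,
# NER tag), with divider lines between sentences, e.g.
#
#     U.N.     NNP I-NP I-ORG
#     official NN  I-NP O
#
# A usage sketch (`reader` is a hypothetical configured instance, and the
# path is illustrative):
#
#     for instance in reader._read("/path/to/eng.train"):
#         print(instance.fields["tokens"])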
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading NER instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.named_entities:
            tags = ["O" for _ in tokens]
        else:
            tags = sentence.named_entities
        if self._coding_scheme == "BIOUL":
            tags = iob1_to_bioul(tags)
        yield self.text_to_instance(tokens, tags)
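# Iteration sketch (assumptions: `reader` is a hypothetical configured instance
# of this OntoNotes NER reader, and the path points at a directory of
# CoNLL-formatted OntoNotes files):
#
#     for instance in reader._read("/path/to/conll-formatted-ontonotes"):
#         tokens = instance.fields["tokens"].tokens
#         tags = instance.fields["tags"].labels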