def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    Build an ``Instance`` from `pre-tokenized` input (this class has no
    tokenizer of its own).

    The instance always carries a ``tokens`` text field and a ``metadata``
    field holding the token texts under ``"words"``.  Chunk and NER tags are
    passed through ``to_bioul`` when ``self.coding_scheme`` is ``"BIOUL"``.
    Every label type listed in ``self.feature_labels`` is attached as its own
    sequence-label field, and the label type named by ``self.tag_label`` is
    attached as ``tags`` when the corresponding tags were supplied.

    Raises ``ConfigurationError`` if a label type requested in
    ``self.feature_labels`` was not passed in.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        'metadata': MetadataField({"words": [x.text for x in tokens]}),
    }

    # Recode the labels if necessary; otherwise keep them as given
    # (the default IOB1).
    coded_chunks, coded_ner = chunk_tags, ner_tags
    if self.coding_scheme == "BIOUL":
        if chunk_tags is not None:
            coded_chunks = to_bioul(chunk_tags)
        if ner_tags is not None:
            coded_ner = to_bioul(ner_tags)

    # Add "feature labels" to instance, in a fixed order so that the first
    # missing label type is the one reported.
    feature_table = (
        ('pos', pos_tags, 'pos_tags',
         "Dataset reader was specified to use pos_tags as "
         "features. Pass them to text_to_instance."),
        ('chunk', coded_chunks, 'chunk_tags',
         "Dataset reader was specified to use chunk tags as "
         "features. Pass them to text_to_instance."),
        ('ner', coded_ner, 'ner_tags',
         "Dataset reader was specified to use NER tags as "
         " features. Pass them to text_to_instance."),
    )
    for feature_name, labels, field_name, complaint in feature_table:
        if feature_name not in self.feature_labels:
            continue
        if labels is None:
            raise ConfigurationError(complaint)
        instance_fields[field_name] = SequenceLabelField(labels, sequence, field_name)

    # Add "tag label" to instance.
    tag_sources = {'ner': coded_ner, 'pos': pos_tags, 'chunk': coded_chunks}
    target_labels = tag_sources.get(self.tag_label)
    if target_labels is not None:
        instance_fields['tags'] = SequenceLabelField(
            target_labels, sequence, self.label_namespace)

    return Instance(instance_fields)
def from_bio_to_bioul(bio_fp: Path, bioul_fp: Path) -> None:
    '''
    Convert a CONLL-like BIO file into the same format with BIOUL labels.

    :NOTE: This also removes lines that start with `#` and changes the
           Sentiment labels with the following dictionary:
           `{'positive': 'POS', 'neutral': 'NEU', 'negative': 'NEG'}`

    :param bio_fp: File path to the data that is in CONLL like format:
                   one `TOKEN LABEL` pair per line, where sentences are split
                   by empty new lines. The label format is in BIO =
                   Beginning of, inside of, outside.
    :param bioul_fp: File path to save the data that is in `bio_fp` to this
                     file but in BIOUL
    :raises KeyError: If a label carries a sentiment tag that is not one of
                      `positive`, `neutral` or `negative`.
    :raises ValueError: If a non-comment, non-empty line does not split into
                        exactly a token and a label.
    '''
    sentiment_tag_convert = {
        'positive': 'POS',
        'neutral': 'NEU',
        'negative': 'NEG'
    }

    def _write_sentence(out_file, tokens, labels) -> None:
        # Recode one sentence to BIOUL, shorten its sentiment tags and write
        # it out as `TOKEN LABEL` lines.
        converted = []
        for label in to_bioul(labels, encoding='BIO'):
            if '-' in label:
                bio_tag, sentiment_tag = label.split('-')
                label = f'{bio_tag}-{sentiment_tag_convert[sentiment_tag]}'
            converted.append(label)
        # All labels are converted before anything is written so that a bad
        # label never leaves a half-written sentence behind.
        for token, label in zip(tokens, converted):
            out_file.write(f'{token} {label}\n')

    with bioul_fp.open('w+') as bioul_file, bio_fp.open('r') as bio_file:
        tokens = []
        labels = []
        for line in bio_file:
            if not line.strip():
                # An empty line closes the current sentence.
                _write_sentence(bioul_file, tokens, labels)
                bioul_file.write('\n')
                tokens = []
                labels = []
            elif not line.startswith('#'):
                token, label = line.split()
                tokens.append(token)
                labels.append(label)
        # The file may end without a trailing empty line; flush what is left.
        if tokens:
            _write_sentence(bioul_file, tokens, labels)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    Turn `pre-tokenized` input into an ``Instance``; no tokenizer is
    available in this class.

    Chunk and NER tags are converted with ``to_bioul`` (from
    ``self._original_coding_scheme``) when ``self.coding_scheme`` is
    ``"BIOUL"``.  Label types named in ``self.feature_labels`` become extra
    sequence-label fields, and the label type named by ``self.tag_label``
    becomes the ``tags`` field when it was supplied.

    Raises ``ConfigurationError`` when a requested feature label type is
    missing.
    """
    # pylint: disable=arguments-differ

    def _recode(raw_tags):
        # Leave the tags untouched unless BIOUL was requested and tags exist.
        if self.coding_scheme != "BIOUL" or raw_tags is None:
            return raw_tags
        return to_bioul(raw_tags, encoding=self._original_coding_scheme)

    sequence = TextField(tokens, self._token_indexers)
    word_texts = [x.text for x in tokens]
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": word_texts})

    # Recode the labels if necessary (the default is IOB1).
    coded_chunks = _recode(chunk_tags)
    coded_ner = _recode(ner_tags)

    # Add "feature labels" to instance.
    requested = self.feature_labels
    if 'pos' in requested:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in requested:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in requested:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     " features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance: the first (and only possible) match wins.
    for label_kind, labels in (('ner', coded_ner), ('pos', pos_tags), ('chunk', coded_chunks)):
        if self.tag_label == label_kind and labels is not None:
            instance_fields['tags'] = SequenceLabelField(labels, sequence, self.label_namespace)
            break

    return Instance(instance_fields)
def text_to_instance(  # type: ignore
    self,
    tokens: List[Token],
    ner_tags: List[str] = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    The returned instance has a ``tokens`` text field and a ``metadata``
    field with the token texts (``"words"``) and the, possibly recoded, tags
    (``"tags"``, ``None`` when no tags were given).  A ``tags``
    sequence-label field is added only when ``ner_tags`` is provided, so
    unlabeled input can be turned into an instance as well.
    """
    sequence = TextField(tokens, self._token_indexers)

    # Recode the labels if necessary.
    if ner_tags is not None and self.coding_scheme == "BIOUL":
        coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme)
    else:
        # the default IOB1 (or no tags at all)
        coded_ner = ner_tags

    instance_fields: Dict[str, Field] = {
        "tokens": sequence,
        "metadata": MetadataField({
            "words": [x.text for x in tokens],
            "tags": coded_ner,
        }),
    }

    # ``ner_tags`` defaults to None; building a label field from None would
    # fail, so the field is only attached when there is something to label.
    if coded_ner is not None:
        instance_fields["tags"] = SequenceLabelField(coded_ner, sequence,
                                                     self.label_namespace)
    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None) -> Instance:
    """
    Assemble an ``Instance`` from already tokenized text plus optional POS
    and chunk tags.

    Chunk tags are converted with ``to_bioul`` when ``self.label_encoding``
    is ``"BIOUL"``.  Label types listed in ``self.feature_labels`` are added
    as feature fields; the label type named by ``self.tag_label`` is added as
    ``tags`` when it was supplied.

    Raises ``ConfigurationError`` when a requested feature label type is
    missing.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    word_texts = [x.text for x in tokens]
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        'metadata': MetadataField({"words": word_texts}),
    }

    # Recode the labels if necessary (the default is BIO).
    must_recode = self.label_encoding == "BIOUL" and chunk_tags is not None
    if must_recode:
        coded_chunks = to_bioul(chunk_tags,
                                encoding=self._original_label_encoding)
    else:
        coded_chunks = chunk_tags

    # Add "feature labels" to instance.
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use pos_tags as "
                "features. Pass them to text_to_instance.")
        pos_feature = SequenceLabelField(pos_tags, sequence, "pos_tags")
        instance_fields['pos_tags'] = pos_feature
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(
                "Dataset reader was specified to use chunk tags as "
                "features. Pass them to text_to_instance.")
        chunk_feature = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
        instance_fields['chunk_tags'] = chunk_feature

    # Add "tag label" to instance.  Chunk targets use their own namespace.
    target = self.tag_label
    if target == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(
            pos_tags, sequence, self.label_namespace)
    elif target == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(
            coded_chunks, sequence,
            label_namespace=self._chunk_label_namespace)

    return Instance(instance_fields)
def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        pos_tags: List[str] = None,
        chunk_tags: List[str] = None) -> Instance:
    """
    Create an ``Instance`` from `pre-tokenized` input, since this class does
    not tokenize.

    Chunk tags are recoded with ``to_bioul`` when ``self.coding_scheme`` is
    ``"BIOUL"``.  Requested feature labels (``self.feature_labels``) are added
    as separate fields and the label type selected by ``self.tag_label`` is
    added as ``tags`` if present.

    Raises ``ConfigurationError`` when a requested feature label type was not
    passed in.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    instance_fields["metadata"] = MetadataField(
        {"words": [x.text for x in tokens]})

    def _attach_feature(field_name, labels, complaint):
        # Add one feature field, insisting that its labels were supplied.
        if labels is None:
            raise ConfigurationError(complaint)
        instance_fields[field_name] = SequenceLabelField(
            labels, sequence, field_name)

    # Recode the labels if necessary (the default is BIO).
    coded_chunks = chunk_tags
    if self.coding_scheme == "BIOUL" and chunk_tags is not None:
        coded_chunks = to_bioul(chunk_tags,
                                encoding=self._original_coding_scheme)

    # Add "feature labels" to instance.
    if "pos" in self.feature_labels:
        _attach_feature(
            "pos_tags", pos_tags,
            "Dataset reader was specified to use pos_tags as "
            "features. Pass them to text_to_instance.")
    if "chunk" in self.feature_labels:
        _attach_feature(
            "chunk_tags", coded_chunks,
            "Dataset reader was specified to use chunk tags as "
            "features. Pass them to text_to_instance.")

    # Add "tag label" to instance.
    if self.tag_label == "pos":
        target_labels = pos_tags
    elif self.tag_label == "chunk":
        target_labels = coded_chunks
    else:
        target_labels = None
    if target_labels is not None:
        instance_fields["tags"] = SequenceLabelField(
            target_labels, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     ner_tags: List[str] = None) -> Instance:
    """
    Build an ``Instance`` from `pre-tokenized` input (there is no tokenizer
    in this class).

    The instance holds the ``tokens`` field and a ``metadata`` field with the
    token texts.  When ``ner_tags`` is given it is added as ``tags``, after
    conversion from BIO with ``to_bioul`` if ``self._coding_scheme`` is
    ``"BIOUL"``.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        'metadata': MetadataField({"words": [x.text for x in tokens]}),
    }

    # Without tags there is nothing more to add.
    if ner_tags is None:
        return Instance(instance_fields)

    # Add "tag label" to instance.
    if self._coding_scheme == "BIOUL":
        ner_tags = to_bioul(ner_tags, encoding="BIO")
    instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
def offsets_from_tags(
    doc: Doc,
    tags: List[str],
    label_encoding: Optional[str] = "BIOUL",
    only_token_spans: bool = False,
) -> List[Dict]:
    """Converts BIOUL or BIO tags to offsets

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    tags
        A list of BIOUL or BIO tags
    label_encoding
        The label encoding of the tags: BIOUL or BIO. Only "BIO" triggers a
        conversion; any other value leaves the tags untouched.
    only_token_spans
        If True, offsets contains only token index references. Default is False

    Returns
    -------
    offsets
        A list of dicts with start and end character/token index with respect to the doc and the span label:
        `{"start": int, "end": int, "start_token": int, "end_token": int, "label": str}`

    Raises
    ------
    ValueError
        If the number of tags differs from the number of tokens in `doc`.
    """
    # spacy's biluo_tags_to_offsets surprisingly does not check this ...
    if len(doc) != len(tags):
        raise ValueError(
            "Number of tokens and tags must be the same, "
            f"but len({list(doc)}) != len({tags})"
        )

    if label_encoding == "BIO":
        tags = to_bioul(tags, encoding="BIO")

    offsets = []
    for start, end, label in biluo_tags_to_offsets(doc, tags):
        # Map the character offsets back to a span to get the token indices.
        span = doc.char_span(start, end)
        data = {
            "start_token": span.start,
            "end_token": span.end,
            "label": label,
        }
        if not only_token_spans:
            data.update({"start": start, "end": end})
        offsets.append(data)

    return offsets
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     chunk_tags: List[str] = None) -> Instance:
    """
    Build an ``Instance`` from `pre-tokenized` input (no tokenizer lives in
    this class).

    Without ``chunk_tags`` only the ``tokens`` field is produced.  Otherwise
    the tags are clipped with ``self.clip_chunks_by_max_length``, converted
    to BIOUL when configured, turned into segmental tags (``tags``), and the
    segment boundaries are recorded in ``seg_starts``, ``seg_ends`` and
    ``seg_map``.
    """
    # pylint: disable=arguments-differ
    sentence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sentence}

    if chunk_tags is None:
        return Instance(instance_fields)

    chunk_tags = self.clip_chunks_by_max_length(chunk_tags)

    # Recode the labels if necessary.
    bio_to_bioul = (self.coding_scheme == "BIOUL"
                    and self._original_coding_scheme == "BIO")
    if bio_to_bioul:
        chunk_tags = to_bioul(chunk_tags,
                              encoding=self._original_coding_scheme)

    # O is normally ignored, but here it counts as a valid span label.
    # Consecutive O tokens have no reason to form one span, so every O
    # becomes its own unit-length span, U-O.
    chunk_tags = [tag if tag != 'O' else 'U-O' for tag in chunk_tags]

    tags, namespace = self.convert_bioul_to_segmental(chunk_tags)
    tag_field = SequenceLabelField(tags, sentence, namespace)
    instance_fields["tags"] = tag_field

    seg_starts = []
    seg_ends = []
    seg_map = []
    seg_count = 0
    for position, tag in enumerate(chunk_tags):
        # B- and U- open a segment; L- and U- close the one currently open.
        if tag.startswith(('B-', 'U-')):
            start = position
            seg_starts.append(IndexField(start, sentence))
        if tag.startswith(('L-', 'U-')):
            end = position
            assert end - start < self._max_span_width
            seg_ends.append(IndexField(end, sentence))
            # Every token of the segment points at the segment's index.
            seg_map.extend(IndexField(seg_count, tag_field)
                           for _ in range(start, end + 1))
            seg_count += 1

    instance_fields['seg_ends'] = ListField(seg_ends)
    instance_fields['seg_starts'] = ListField(seg_starts)
    instance_fields['seg_map'] = ListField(seg_map)
    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     filename: str,
                     tokens: List[Token],
                     ner_tags: List[str] = None,
                     weights: List[float] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    The instance carries the ``tokens`` field, a ``metadata`` field with the
    token texts, and a ``dataset`` metadata field holding ``filename``.  NER
    tags are recoded with ``to_bioul`` when ``self.coding_scheme`` is
    ``"BIOUL"``, added as the ``ner_tags`` feature if ``'ner'`` is in
    ``self.feature_labels``, and added as ``tags`` if ``self.tag_label`` is
    ``'ner'`` and tags were supplied.

    ``weights`` is accepted for interface compatibility but is not used when
    building the instance.

    Raises ``ConfigurationError`` if NER tags are requested as a feature but
    were not passed in.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        "metadata": MetadataField({"words": [x.text for x in tokens]})
    }

    # NOTE: the previous `weight = weights[0]` was never read and raised
    # IndexError for empty input, so no weight is derived here any more.
    instance_fields["dataset"] = MetadataField(filename)

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL" and ner_tags is not None:
        coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme)
    else:
        # the default IOB1 (or no tags at all)
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(
                "Dataset reader was specified to use NER tags as "
                " features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(
            coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(
            coded_ner, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    Create an ``Instance`` from `pre-tokenized` input; this class has no
    tokenizer.

    ``tokens`` and ``metadata`` (token texts) are always present.  ``tags``
    is added when ``ner_tags`` is given, converted from BIO with ``to_bioul``
    if ``self._coding_scheme`` is ``"BIOUL"``.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    word_texts = [x.text for x in tokens]
    instance_fields = {
        u'tokens': sequence,
        u"metadata": MetadataField({u"words": word_texts}),
    }

    # Add "tag label" to instance.
    if ner_tags is not None:
        wants_bioul = self._coding_scheme == u"BIOUL"
        final_tags = to_bioul(ner_tags, encoding=u"BIO") if wants_bioul else ner_tags
        instance_fields[u'tags'] = SequenceLabelField(final_tags, sequence)

    return Instance(instance_fields)
def text_to_instance(  # type: ignore
    self,
    tokens: List[Token],
    ner_tags: List[str] = None,
) -> Instance:
    """
    Wrap `pre-tokenized` input in an ``Instance``; tokenization is not done
    by this class.

    Fields: ``tokens``, ``metadata`` (token texts under ``"words"``) and,
    when ``ner_tags`` is supplied, ``tags``.  Tags are converted from BIO
    with ``to_bioul`` when ``self._coding_scheme`` is ``"BIOUL"``.
    """
    sequence = TextField(tokens, self._token_indexers)
    metadata = MetadataField({"words": [x.text for x in tokens]})

    instance_fields: Dict[str, Field] = {}
    instance_fields.update(tokens=sequence, metadata=metadata)

    # Add "tag label" to instance.
    has_tags = ner_tags is not None
    if has_tags and self._coding_scheme == "BIOUL":
        ner_tags = to_bioul(ner_tags, encoding="BIO")
    if has_tags:
        instance_fields["tags"] = SequenceLabelField(ner_tags, sequence)

    return Instance(instance_fields)
def text_to_instance(self,
                     tokens: List[Token],
                     tags: Optional[List[str]] = None) -> Instance:
    """
    Produce an ``Instance`` from `pre-tokenized` input, as this class does
    not tokenize.

    The instance contains ``tokens`` and a ``metadata`` field with the token
    texts.  If ``tags`` is given it is added as ``tags`` in
    ``self.label_namespace``, recoded with ``to_bioul`` when
    ``self.coding_scheme`` is ``"BIOUL"``.
    """
    sequence = TextField(tokens, self._token_indexers)

    # Metadata field
    metadata_field = MetadataField({"words": [x.text for x in tokens]})
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        'metadata': metadata_field,
    }

    if tags is None:
        return Instance(instance_fields)

    if self.coding_scheme == "BIOUL":
        tags = to_bioul(tag_sequence=tags,
                        encoding=self._original_coding_scheme)
    instance_fields['tags'] = SequenceLabelField(tags, sequence,
                                                 self.label_namespace)
    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     ner_tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    Besides ``tokens`` and ``metadata`` the instance contains:

    - ``spans`` / ``span_labels``: every span yielded by ``enumerate_spans``
      (bounded by ``self._max_span_width``), labelled with its gold entity
      type or ``'O'`` when it is not a gold span;
    - ``gold_spans`` / ``gold_span_labels``: only the spans returned by
      ``_extract_spans`` and their labels;
    - ``tags``: the token-level tags, when ``self.tag_label`` is ``'ner'``.

    NOTE: despite its ``None`` default, ``ner_tags`` is required in practice:
    its length is compared against ``tokens`` and the spans are extracted
    from it unconditionally.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}

    if self.coding_scheme == "BIOUL":
        coded_ner = (to_bioul(ner_tags, encoding=self._original_coding_scheme)
                     if ner_tags is not None else None)
    else:
        # the default IOB1
        coded_ner = ner_tags

    # TODO:
    # ner_tags -> spans of NE
    # return something like spans, span_labels
    # ("O" if span not in golden_spans, "PER", "LOC"... otherwise)
    assert len(ner_tags) == len(tokens), \
        "sentence:%s but ner_tags:%s" % (str(tokens), str(ner_tags))

    # ner_gold_spans: Dict[tuple(startid, endid), str(entity_type)]
    ner_gold_spans = _extract_spans(ner_tags)

    # All candidate spans, each labelled with its gold type or 'O'.
    spans: List[Field] = []
    span_labels: List[str] = []
    for start, end in enumerate_spans(ner_tags, offset=0,
                                      max_span_width=self._max_span_width):
        span_labels.append(ner_gold_spans.get((start, end), 'O'))
        spans.append(SpanField(start, end, sequence))

    # Gold spans only; the metadata copy additionally leaves out 'O' spans.
    gold_spans: List[Field] = []
    gold_span_labels: List[str] = []
    _dict_gold_spans = {}
    for ky, val in ner_gold_spans.items():
        gold_span_labels.append(val)
        gold_spans.append(SpanField(ky[0], ky[1], sequence))
        if val != 'O':
            _dict_gold_spans[ky] = val

    instance_fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        "gold_spans": _dict_gold_spans,
    })

    assert len(spans) == len(span_labels), "span length not equal to span label length..."

    # a list of (start, end) tuples...
    span_field = ListField(spans)

    # contains all possible spans and their tags
    instance_fields['spans'] = span_field
    instance_fields['span_labels'] = SequenceLabelField(span_labels, span_field, "span_tags")

    # only contain gold_spans and their tags
    # e.g. (0,0,O), (1,1,O), (2,3,PER), (4,4,O) for 'I am Donald Trump .'
    gold_span_field = ListField(gold_spans)
    instance_fields['gold_spans'] = gold_span_field
    instance_fields['gold_span_labels'] = SequenceLabelField(gold_span_labels,
                                                             gold_span_field, "span_tags")

    # Add "tag label" to instance
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, 'token_tags')

    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    Build an ``Instance`` from `pre-tokenized` input (no tokenizer exists in
    this class) whose ``tags`` field is a list of per-token score arrays.

    Tags are first recoded according to ``self.coding_scheme`` (``"BIOUL"``
    via ``to_bioul``, ``"B"`` as binary mentions, anything else unchanged),
    and any tag ending in ``"-"`` is replaced by ``"O"``.  For each tag an
    array over ``self.alltags`` is produced: the given tag gets 0 and all
    others -10000, except that ``"O"`` tags yield an all-zero array when
    ``self.strategy`` is ``"uniform"``.

    Raises ``ConfigurationError`` if an ``"O"`` tag is met while
    ``self.strategy`` is neither ``"trust_labels"`` nor ``"uniform"``.
    """
    # Recode the labels if necessary.
    scheme = self.coding_scheme
    if scheme == "BIOUL":
        coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme)
    elif scheme == "B":
        # convert to binary mentions.
        coded_ner = ["O" if t == "O" else "U-MNT" for t in ner_tags]
    else:
        # the default IOB1
        coded_ner = ner_tags

    # A tag that ends in a dash is treated as "O".
    coded_ner = ["O" if t[-1] == "-" else t for t in coded_ner]

    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    words = [x.text for x in tokens]
    metadata = MetadataField({"words": words, "orig_tags": coded_ner})
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        "metadata": metadata,
        "donotuse": SequenceLabelField(coded_ner, sequence, label_namespace="labels"),
    }

    tag_marginals = []
    for tag in coded_ner:
        if tag == "O" and self.strategy != "trust_labels":
            if self.strategy != "uniform":
                raise ConfigurationError("Unknown strategy: " + self.strategy)
            # this strategy will express ignorance over all possibilities.
            tag_marginal = np.zeros(len(self.alltags))
        else:
            # Either a non-"O" tag (always fully trusted) or the
            # "trust_labels" strategy, which believes the tags completely.
            tag_marginal = np.zeros(len(self.alltags)) - 10000
            tag_marginal[self.alltags[tag]] = 0
        tag_marginals.append(ArrayField(tag_marginal))

    instance_fields['tags'] = ListField(tag_marginals)
    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None,
                     target_verb_lemma: str = None,
                     target_verb_position: int = None,
                     verb_sense: str = None,
                     legal_args: List[str] = None,
                     verb_annotation: List[str] = None,
                     parse: str = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    The instance contains the ``tokens`` field, a ``metadata`` field with all
    raw annotations, a ``verb_indicator`` array marking the position of the
    target verb, optional feature fields for the label types listed in
    ``self.feature_labels``, and a ``tags`` field selected by
    ``self.tag_label`` (``'srl'``, ``'pos'`` or ``'chunk'``).

    SRL tags are derived from ``verb_annotation`` with
    ``get_bio_from_spans``; chunk and NER tags are recoded with ``to_bioul``
    when ``self.coding_scheme`` is ``"BIOUL"``.

    Raises ``ConfigurationError`` if a label type requested in
    ``self.feature_labels`` was not passed in.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    words = [x.text for x in tokens]
    instance_fields["metadata"] = MetadataField({
        "words": words,  # used in ai2's srl model
        "pos_tags": pos_tags,
        "chunk_tags": chunk_tags,
        # Previously this entry was filled with ``chunk_tags`` by mistake.
        "ner_tags": ner_tags,
        "target_verb_lemma": target_verb_lemma,
        "target_verb_position": target_verb_position,
        "verb_annotation": verb_annotation,
        "verb_sense": verb_sense,
        "legal_args": legal_args,
        "verb": target_verb_lemma,  # used in ai2's srl model
        "parse": parse  # for constraints for the dev set srl
    })

    # This is the position of the gold verb predicate.
    # We may or may not use it (the model might predict the predicate),
    # but the reader always sends it.
    # TODO Allennlp uses SequenceFeatureField for a indicator vector of the
    # verb position (Find this)
    verb_indicator = np.zeros(len(tokens))
    # Indexing a numpy array with None adds an axis and would set *every*
    # position to 1.0, so only mark the verb when a position is known.
    if target_verb_position is not None:
        verb_indicator[target_verb_position] = 1.0
    instance_fields["verb_indicator"] = ArrayField(array=verb_indicator)

    # everyone follows the default IOB2 == BIO format here
    coded_srl = get_bio_from_spans(verb_annotation,
                                   year=self.year,
                                   core_args_only=self.core_args_only)
    coded_chunks = chunk_tags
    coded_ner = ner_tags

    if self.coding_scheme == "BIOUL":
        if chunk_tags is not None:
            coded_chunks = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme)
        if ner_tags is not None:
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme)

    # Add "feature labels" to instance
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use pos_tags as "
                "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(
            pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(
                "Dataset reader was specified to use chunk tags as "
                "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(
            coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(
                "Dataset reader was specified to use NER tags as "
                " features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(
            coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == 'srl' and coded_srl is not None:
        instance_fields['tags'] = SequenceLabelField(
            coded_srl, sequence, self.label_namespace)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(
            pos_tags, sequence, self.label_namespace)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(
            coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
def _read_dataset(self,
                  file_path: str,
                  count_only: bool = False,
                  keep_idx: Optional[Set[int]] = None):
    """
    Yield instances from the file_path.

    Parameters
    ----------
    file_path: str, required
        The path to the data file.
    count_only: bool, optional (default=``False``)
        If True, no instances are built; a dummy ``1`` is yielded per
        sentence instead. Handy for counting the instances in the data file
        quickly, since creating instances is relatively expensive.
    keep_idx: Set[int], optional (default=``None``)
        If not None, only yield instances whose index is in this set.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        if count_only:
            logger.info("Counting instances in file at: %s", file_path)
        else:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

        index = 0
        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Divider chunks carry no words; only sentence chunks matter.
            if is_divider:
                continue

            if count_only:
                yield 1
                continue

            # Skip sentences that were not asked for, but keep counting them.
            if keep_idx is not None and index not in keep_idx:
                index += 1
                continue

            rows = [line.strip().split() for line in lines]
            # unzipping trick returns tuples, but our Fields need lists
            tokens, _, _, ner_tags = (list(column) for column in zip(*rows))

            # Contextualize the tokens if a Contextualizer was provided.
            # TODO (nfliu): How can we make this batched?
            # Would make contextualizers that use the GPU much faster.
            token_representations = None
            if self._contextualizer:
                token_representations = self._contextualizer([tokens])[0]

            # Recode the labels if necessary.
            coded_ner = ner_tags
            if self._label_encoding == "BIOUL" and ner_tags is not None:
                coded_ner = to_bioul(ner_tags)

            yield self.text_to_instance(tokens,
                                        token_representations,
                                        coded_ner)
            index += 1