def test_dataset_path_iterator(self):
    """``dataset_path_iterator`` should yield exactly the two fixture files."""
    # One example file is expected in each of the two fixture sub-directories.
    wanted = [
        str(CONLL_PATH / subdir / "example.gold_conll")
        for subdir in ("subdomain", "subdomain2")
    ]

    found = list(Ontonotes().dataset_path_iterator(CONLL_PATH))

    # Checking the length as well as the set catches duplicated paths, which
    # a set comparison alone would hide.
    assert len(found) == len(wanted)
    assert set(found) == set(wanted)
示例#2
0
 def _ontonotes_subset(
         ontonotes_reader: Ontonotes, file_path: str,
         domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.

     Every path yielded by ``ontonotes_reader.dataset_path_iterator(file_path)``
     is considered in order. If ``domain_identifier`` is ``None`` all of them
     are read; otherwise only paths that contain the identifier as a directory
     component (i.e. ``/<domain_identifier>/``) are read. The sentences of each
     selected file are yielded lazily via ``ontonotes_reader.sentence_iterator``.

     ontonotes_reader: object providing ``dataset_path_iterator`` and
         ``sentence_iterator``.
     file_path: root location handed to ``dataset_path_iterator``.
     domain_identifier: directory name to filter on, or ``None`` for no filter.
     """
     import os

     domain_marker = None if domain_identifier is None else f"/{domain_identifier}/"
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         if domain_marker is not None:
             # Compare against a "/"-separated view of the path so the filter
             # also matches paths built with the platform's native separator
             # (e.g. backslashes on Windows). This is a no-op where os.sep
             # is already "/".
             if domain_marker not in conll_file.replace(os.sep, "/"):
                 continue
         # The reader is always given the original, un-normalized path.
         yield from ontonotes_reader.sentence_iterator(conll_file)