def to_input_samples(self, fold: Optional[str] = None) -> List[InputSample]:
    """Convert the CoNLL files matching ``self.glob_pattern`` into input samples.

    Every path produced by ``self.files_path.glob(self.glob_pattern)`` is read
    as UTF-8 text, passed to ``conll_ner_to_docs`` and each resulting doc is
    turned into an ``InputSample`` via ``InputSample.from_spacy_doc``.

    :param fold: Optional substring filter. When given (and non-empty), only
        files whose *name* contains this string are processed.
    :return: The samples from all processed files, in one flat list.
    :raises FileNotFoundError: If no file matches both the glob pattern and
        the ``fold`` filter.
    """
    files_found = False
    input_samples = []

    # Glob yields matches in a filesystem-dependent order; sorting keeps the
    # order of the returned samples reproducible across runs and machines.
    for file_path in sorted(self.files_path.glob(self.glob_pattern)):
        if fold and fold not in file_path.name:
            continue

        files_found = True
        with open(file_path, encoding="utf-8") as file:
            text = file.read()

        output_docs = conll_ner_to_docs(input_data=text, n_sents=None, no_print=True)
        for doc in tqdm(output_docs, f"Processing doc for file {file_path.name}"):
            input_samples.append(InputSample.from_spacy_doc(doc=doc))

    if not files_found:
        raise FileNotFoundError(
            f"No files found for pattern {self.glob_pattern} and fold {fold}"
        )

    return input_samples
def test_cli_converters_conll_ner_to_docs():
    """Check ``conll_ner_to_docs`` on the same sentence in several column layouts.

    The fixture repeats one eight-token sentence five times after a
    ``-DOCSTART-`` line, varying the separator (tab or space) and the number
    of columns (two, three or four). With ``n_sents=10`` the test expects a
    single doc holding five sentences and ten entities.
    """
    conll_lines = [
        "-DOCSTART- -X- O O",
        "",
        # Two columns, tab-separated.
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        # Two columns, space-separated.
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        # Three columns, space-separated.
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        # Four columns, space-separated.
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        # Four columns, tab-separated.
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(conll_lines)

    converted_docs = list(conll_ner_to_docs(input_data, n_sents=10))
    assert len(converted_docs) == 1

    as_json = docs_to_json(converted_docs)
    assert as_json["id"] == 0
    assert len(as_json["paragraphs"]) == 1

    sentences = as_json["paragraphs"][0]["sentences"]
    assert len(sentences) == 5

    expected_orths = ["I", "like", "London", "and", "New", "York", "City", "."]
    for sentence in sentences:
        tokens = sentence["tokens"]
        assert len(tokens) == 8
        assert [token["orth"] for token in tokens] == expected_orths

    entities = converted_docs[0].ents
    assert len(entities) == 10
    for entity in entities:
        assert entity.text in ("New York City", "London")