示例#1
0
    def to_input_samples(self,
                         fold: Optional[str] = None) -> List[InputSample]:
        """Read the matching CoNLL files and convert them to input samples.

        Every entry under ``self.files_path`` that matches
        ``self.glob_pattern`` is read as UTF-8 text and handed to
        ``conll_ner_to_docs``; each doc it produces is converted with
        ``InputSample.from_spacy_doc``.

        :param fold: Optional substring filter. When given (and non-empty),
            only files whose name contains it are processed.
        :return: The samples from all processed files, files being visited
            in sorted path order.
        :raises FileNotFoundError: If no file matches both the glob pattern
            and the fold filter.
        """
        # Path.glob yields entries in arbitrary, filesystem-dependent order;
        # sorting keeps the order of the returned samples reproducible.
        matching_files = sorted(
            file_path
            for file_path in self.files_path.glob(self.glob_pattern)
            if not fold or fold in file_path.name
        )

        # Fail before doing any parsing work when nothing was selected.
        if not matching_files:
            raise FileNotFoundError(
                f"No files found for pattern {self.glob_pattern} and fold {fold}"
            )

        input_samples = []
        for file_path in matching_files:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

            output_docs = conll_ner_to_docs(input_data=text,
                                            n_sents=None,
                                            no_print=True)
            input_samples.extend(
                InputSample.from_spacy_doc(doc=doc)
                for doc in tqdm(
                    output_docs,
                    desc=f"Processing doc for file {file_path.name}")
            )

        return input_samples
def test_from_spacy_doc():
    """Check that ``InputSample.from_spacy_doc`` converts a parsed sentence.

    The sentence is processed with the ``en_core_web_sm`` pipeline; the
    resulting sample must expose a first span of type ``PERSON`` and tag the
    sixth token as ``U-PERSON`` with every other token tagged ``O``.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline("Nice to meet you Mr. Perkins.")
    expected_tags = ["O", "O", "O", "O", "O", "U-PERSON", "O"]

    converted = InputSample.from_spacy_doc(parsed)

    first_span = converted.spans[0]
    assert first_span.entity_type == "PERSON"
    assert converted.tags == expected_tags