Exemplo n.º 1
0
    def to_input_samples(self,
                         fold: Optional[str] = None) -> List[InputSample]:
        """Read the matching CoNLL files and convert them to input samples.

        Every path yielded by ``self.files_path.glob(self.glob_pattern)`` is
        read as UTF-8 text, parsed with ``conll_ner_to_docs`` and each
        resulting doc is converted with ``InputSample.from_spacy_doc``.

        :param fold: Optional substring filter. When given (and non-empty),
            only files whose name contains this string are read.
        :return: The samples of all files that were read, in the order the
            glob yields the files.
        :raises FileNotFoundError: If no file matched both the glob pattern
            and the ``fold`` filter.
        """
        files_found = False
        input_samples = []
        for file_path in self.files_path.glob(self.glob_pattern):
            if fold and fold not in file_path.name:
                continue

            # Set as soon as a file passes the filter, even if it later
            # turns out to contain no docs.
            files_found = True
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

            output_docs = conll_ner_to_docs(input_data=text,
                                            n_sents=None,
                                            no_print=True)
            input_samples.extend(
                InputSample.from_spacy_doc(doc=doc)
                for doc in tqdm(output_docs,
                                f"Processing doc for file {file_path.name}"))

        if not files_found:
            raise FileNotFoundError(
                f"No files found for pattern {self.glob_pattern} and fold {fold}"
            )

        return input_samples
Exemplo n.º 2
0
def test_cli_converters_conll_ner_to_docs():
    """Convert one CoNLL NER document written in five column layouts.

    The same eight-token sentence is repeated five times, each time with a
    different number of columns and a different separator. The test expects
    a single doc with five identical sentences and ten entities.
    """
    doc_marker = "-DOCSTART- -X- O O"
    sentence_blocks = [
        # Two columns, tab-separated.
        [
            "I\tO",
            "like\tO",
            "London\tB-GPE",
            "and\tO",
            "New\tB-GPE",
            "York\tI-GPE",
            "City\tI-GPE",
            ".\tO",
        ],
        # Two columns, space-separated.
        [
            "I O",
            "like O",
            "London B-GPE",
            "and O",
            "New B-GPE",
            "York I-GPE",
            "City I-GPE",
            ". O",
        ],
        # Three columns, space-separated.
        [
            "I PRP O",
            "like VBP O",
            "London NNP B-GPE",
            "and CC O",
            "New NNP B-GPE",
            "York NNP I-GPE",
            "City NNP I-GPE",
            ". . O",
        ],
        # Four columns, space-separated, with "_" in the third column.
        [
            "I PRP _ O",
            "like VBP _ O",
            "London NNP _ B-GPE",
            "and CC _ O",
            "New NNP _ B-GPE",
            "York NNP _ I-GPE",
            "City NNP _ I-GPE",
            ". . _ O",
        ],
        # Four columns, tab-separated, with "_" in the third column.
        [
            "I\tPRP\t_\tO",
            "like\tVBP\t_\tO",
            "London\tNNP\t_\tB-GPE",
            "and\tCC\t_\tO",
            "New\tNNP\t_\tB-GPE",
            "York\tNNP\t_\tI-GPE",
            "City\tNNP\t_\tI-GPE",
            ".\t.\t_\tO",
        ],
    ]
    # The document marker and the sentences are separated by one empty line.
    chunks = [doc_marker] + ["\n".join(block) for block in sentence_blocks]
    input_data = "\n\n".join(chunks)

    converted_docs = list(conll_ner_to_docs(input_data, n_sents=10))
    assert len(converted_docs) == 1

    converted = docs_to_json(converted_docs)
    assert converted["id"] == 0
    paragraphs = converted["paragraphs"]
    assert len(paragraphs) == 1
    sentences = paragraphs[0]["sentences"]
    assert len(sentences) == 5

    expected_orths = ["I", "like", "London", "and", "New", "York", "City", "."]
    for sent in sentences:
        tokens = sent["tokens"]
        assert len(tokens) == 8
        assert [t["orth"] for t in tokens] == expected_orths

    doc = converted_docs[0]
    assert len(doc.ents) == 10
    expected_entity_texts = ("New York City", "London")
    for ent in doc.ents:
        assert ent.text in expected_entity_texts