Example #1
# Convert v2-style JSON training data that has no "ner" fields and check that
# the resulting docs carry no NER annotation. "en_vocab" is a pytest fixture
# from spaCy's own test suite.
from spacy.tokens import Doc
from spacy.training import Example
from spacy.training.converters import json_to_docs


def test_json_to_docs_no_ner(en_vocab):
    data = [
        {
            "id": 1,
            "paragraphs": [
                {
                    "sentences": [
                        {
                            "tokens": [
                                {"dep": "nn", "head": 1, "tag": "NNP", "orth": "Ms."},
                                {"dep": "nsubj", "head": 1, "tag": "NNP", "orth": "Haag"},
                                {"dep": "ROOT", "head": 0, "tag": "VBZ", "orth": "plays"},
                                {"dep": "dobj", "head": -1, "tag": "NNP", "orth": "Elianti"},
                                {"dep": "punct", "head": -2, "tag": ".", "orth": "."},
                            ]
                        }
                    ]
                }
            ],
        }
    ]
    docs = list(json_to_docs(data))
    assert len(docs) == 1
    doc = docs[0]
    # No "ner" fields in the input, so the doc has no entity annotation at
    # all: ent_iob == 0 means "missing", not "O".
    assert not doc.has_annotation("ENT_IOB")
    for token in doc:
        assert token.ent_iob == 0
    # Align a fresh, unannotated doc against the converted doc; with no NER
    # on the reference, every aligned tag comes back as None.
    eg = Example(
        Doc(
            doc.vocab,
            words=[w.text for w in doc],
            spaces=[bool(w.whitespace_) for w in doc],
        ),
        doc,
    )
    ner_tags = eg.get_aligned_ner()
    assert ner_tags == [None, None, None, None, None]
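
A minimal standalone sketch of the same conversion (assuming spaCy v3; the one-token document is made up for illustration). Missing NER shows up as ent_iob == 0, i.e. ent_iob_ == "", on each token:

from spacy.training.converters import json_to_docs

minimal = [
    {
        "id": 0,
        "paragraphs": [
            {
                "sentences": [
                    {"tokens": [{"dep": "ROOT", "head": 0, "tag": "VBZ", "orth": "plays"}]}
                ]
            }
        ],
    }
]
doc = list(json_to_docs(minimal))[0]
print([(t.text, t.tag_, t.ent_iob_) for t in doc])  # e.g. [('plays', 'VBZ', '')]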
Example #2
# Read a v2-style JSON training file and return a list of Example objects
# whose docs are bound to the given pipeline's vocab.
import srsly

from spacy.tokens import DocBin
from spacy.training import Example
from spacy.training.converters import json_to_docs


def json_path_to_examples(data_path, NLP):
    data = srsly.read_json(data_path)
    # There is no good way to convert with a specified vocab, so convert
    # first, then reload through a DocBin with the right vocab.
    docs = json_to_docs(data)
    docbin = DocBin()
    for doc in docs:
        docbin.add(doc)
    docs = docbin.get_docs(NLP.vocab)
    examples = [Example(NLP.make_doc(doc.text), doc) for doc in docs]
    return examples
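
A usage sketch for the helper above, assuming "corpus.json" (a hypothetical path) holds data in spaCy's v2 JSON training format:

import spacy

nlp = spacy.blank("en")  # any pipeline works; only its tokenizer and vocab are used
examples = json_path_to_examples("corpus.json", nlp)  # path is hypothetical
print(f"loaded {len(examples)} examples")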
Example #3
# Regression test for spaCy issue #4402: a JSON-converted corpus must
# round-trip through DocBin and Corpus, and split cleanly into sentences.
from spacy.lang.en import English
from spacy.tokens import DocBin
from spacy.training import Corpus
from spacy.training.converters import json_to_docs
from spacy.util import make_tempdir


def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        # The corpus yields one Example per paragraph.
        assert len(train_data) == 2

        # Each paragraph contains two sentences, so splitting on sentence
        # boundaries doubles the count.
        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4
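
Example.split_sents is what performs the per-sentence division above. A minimal isolated sketch (assuming spaCy v3), with a hand-built two-sentence reference doc:

from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example

nlp = English()
words = ["Hello", ".", "Bye", "."]
# Mark two sentences explicitly via sent_starts.
ref = Doc(nlp.vocab, words=words, sent_starts=[True, False, True, False])
eg = Example(nlp.make_doc(ref.text), ref)
assert len(eg.split_sents()) == 2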