示例#1
0
def load_test_with_spacy(ddt):
    from spacy.cli.converters import conllu2json

    conll_path = os.path.join(ddt.dataset_dir, '{}.{}{}'.format(ddt.dataset_name, 'test', ddt.file_extension))

    file_as_json = {}
    with open(conll_path, 'r') as file:
        file_as_string = file.read()
        file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter=No", "")
        file_as_json = conllu2json(file_as_string)
    return read_json_object(file_as_json)
示例#2
0
文件: ddt.py 项目: smaakage85/danlp
    def load_with_spacy(self):
        """
        Loads the DDT with spaCy. 
        
        This function converts the conllu files to json in the spaCy format.

        :return: GoldCorpus

        .. note:: Not using jsonl because of:
            https://github.com/explosion/spaCy/issues/3523
        """
        import srsly
        from spacy.cli.converters import conllu2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        for part in ['train', 'dev', 'test']:
            conll_path = os.path.join(
                self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part,
                                                   self.file_extension))
            json_path = os.path.join(self.dataset_dir,
                                     "ddt.{}.json".format(part))

            if not os.path.isfile(
                    json_path):  # Convert the conllu files to json
                with open(conll_path, 'r') as file:
                    file_as_string = file.read()
                    file_as_string = file_as_string.replace(
                        "name=", "").replace("|SpaceAfter=No", "")
                    file_as_json = conllu2json(file_as_string)

                    srsly.write_json(json_path, file_as_json)

        train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
        dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")

        assert os.path.isfile(train_json_path)
        assert os.path.isfile(dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
示例#3
0
def test_cli_converters_conllu2json():
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conllu2json(input_data, n_sents=1)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"]
    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
    assert [t["head"] for t in tokens] == [1, 2, -1, 0]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]