Пример #1
0
def parse(options, filename, annotated_sentences, tmp_dir):
    """Write the tagged sentences as CoNLL and run the parser on them.

    The sentences are first converted to a tagged CoNLL file inside
    *tmp_dir*; the parser jar named by ``options.malt`` is then run on that
    file in a Java subprocess, with both stdout and stderr redirected to a
    log file.

    Args:
        options: Object providing ``malt`` (path to the parser jar, ``~`` is
            expanded) and ``parsing_model`` (only its base name is passed to
            the parser via ``-c``).
        filename: Name of the data file; used to derive the names of the
            intermediate, output and log files.
        annotated_sentences: Tagged sentences, handed unchanged to
            ``tagged_to_tagged_conll``.
        tmp_dir: Directory in which all files are created; also passed to
            the parser as its ``-w`` argument.

    Returns:
        The file name of the parser output.

    Raises:
        SystemExit: If the parser process exits with a non-zero return code.
    """
    tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
    parsed_filename = output_filename(tmp_dir, filename, "conll")
    log_filename = output_filename(tmp_dir, filename, "log")

    # The parser command line is dependent on the input and
    # output files, so we build that one for each data file
    parser_cmdline = ["java", "-Xmx2000m",
                      "-jar", os.path.expanduser(options.malt),
                      "-m", "parse",
                      "-i", tagged_conll_filename,
                      "-o", parsed_filename,
                      "-w", tmp_dir,
                      "-c", os.path.basename(options.parsing_model)]

    # Conversion from .tag file to tagged.conll (input format for the parser).
    # The context manager guarantees the file is flushed and closed before
    # the parser reads it, even if the conversion raises.
    with open(tagged_conll_filename, "w", encoding="utf-8") as tagged_conll_file:
        tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)

    # Run the parser, capturing everything it prints in the log file
    with open(log_filename, "w", encoding="utf-8") as log_file:
        returncode = Popen(parser_cmdline, stdout=log_file, stderr=log_file).wait()

    if returncode:
        sys.exit("Parsing failed! Log file may contain more information: %s" % log_filename)

    return parsed_filename
Пример #2
0
    def test_verb(self):
        """Check the tagged CoNLL output for a short sentence with a verb.

        Each input token is a list of four fields obtained by splitting on
        spaces; the third field holds a tag followed by ``|``-separated
        features.  The expected output has one tab-separated line per token:
        a zero-based index, the first two input fields, the part of the
        third field before the first ``|``, the fourth field, and the rest
        of the third field, followed by one empty line after the sentence.
        """
        annotated_sentences = [[
            ('Jag jag PRON|Case=Nom|Definite=Def|Gender=Com|Number=Sing PN|UTR|SIN|DEF|SUB'
             ).split(" "),
            ('har ha VERB|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act VB|PRS|AKT'
             ).split(" "),
            ('en en DET|Definite=Ind|Gender=Com|Number=Sing DT|UTR|SIN|IND'
             ).split(" "),
            ('dröm dröm NOUN|Case=Nom|Definite=Ind|Gender=Com|Number=Sing NN|UTR|SIN|IND|NOM'
             ).split(" "),
            ('. . PUNCT|_ MAD').split(" "),
        ]]

        # Explicit UTF-8: the data contains non-ASCII text ("dröm"), so the
        # test must not depend on the locale's default encoding.
        with tempfile.TemporaryFile(mode="w+", encoding="utf-8") as outfile:
            conll.tagged_to_tagged_conll(annotated_sentences, outfile)
            outfile.seek(0)
            file_data = outfile.read()

        # NOTE: the columns in the expected text are separated by tab characters.
        self.assertEqual(
            file_data.splitlines(),
            textwrap.dedent("""
            0	Jag	jag	PRON	PN|UTR|SIN|DEF|SUB	Case=Nom|Definite=Def|Gender=Com|Number=Sing
            1	har	ha	VERB	VB|PRS|AKT	Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act
            2	en	en	DET	DT|UTR|SIN|IND	Definite=Ind|Gender=Com|Number=Sing
            3	dröm	dröm	NOUN	NN|UTR|SIN|IND|NOM	Case=Nom|Definite=Ind|Gender=Com|Number=Sing
            4	.	.	PUNCT	MAD	_

        """).lstrip().splitlines())
Пример #3
0
def parse(options, filename, annotated_sentences, tmp_dir):
    """Convert tagged sentences to CoNLL and parse them in a Java subprocess.

    A tagged CoNLL file is written to *tmp_dir* from *annotated_sentences*,
    after which the jar named by ``options.malt`` is invoked on it.  The
    subprocess's stdout and stderr both go to a log file in *tmp_dir*.

    Args:
        options: Object providing ``malt`` (path to the parser jar, ``~`` is
            expanded) and ``parsing_model`` (only its base name is passed to
            the parser via ``-c``).
        filename: Name of the data file; the intermediate, output and log
            file names are derived from it.
        annotated_sentences: Tagged sentences, handed unchanged to
            ``tagged_to_tagged_conll``.
        tmp_dir: Directory in which all files are created; also passed to
            the parser as its ``-w`` argument.

    Returns:
        The file name of the parser output.

    Raises:
        SystemExit: If the parser process exits with a non-zero return code.
    """
    tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
    parsed_filename = output_filename(tmp_dir, filename, "conll")
    log_filename = output_filename(tmp_dir, filename, "log")

    # The parser command line is dependent on the input and
    # output files, so we build that one for each data file
    parser_cmdline = [
        "java", "-Xmx2000m", "-jar",
        os.path.expanduser(options.malt), "-m", "parse", "-i",
        tagged_conll_filename, "-o", parsed_filename, "-w", tmp_dir, "-c",
        os.path.basename(options.parsing_model)
    ]

    # Conversion from .tag file to tagged.conll (input format for the parser).
    # Using a context manager ensures the file is flushed and closed before
    # the parser reads it, even if the conversion raises.
    with open(tagged_conll_filename, "w",
              encoding="utf-8") as tagged_conll_file:
        tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)

    # Run the parser, capturing everything it prints in the log file
    with open(log_filename, "w", encoding="utf-8") as log_file:
        returncode = Popen(parser_cmdline, stdout=log_file,
                           stderr=log_file).wait()

    if returncode:
        sys.exit("Parsing failed! See log file: %s" % log_filename)

    return parsed_filename
Пример #4
0
    def test_verb(self):
        """Verify the tagged CoNLL lines written for a sentence with a verb.

        Input tokens are four-field lists (split on spaces) whose third
        field is a tag followed by ``|``-separated features.  The expected
        output is one tab-separated line per token: a zero-based index, the
        first two input fields, the part of the third field before the first
        ``|``, the fourth field, and the remainder of the third field, with
        one empty line closing the sentence.
        """
        annotated_sentences = [[
            ('Jag jag PRON|Case=Nom|Definite=Def|Gender=Com|Number=Sing PN|UTR|SIN|DEF|SUB').split(" "),
            ('har ha VERB|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act VB|PRS|AKT').split(" "),
            ('en en DET|Definite=Ind|Gender=Com|Number=Sing DT|UTR|SIN|IND').split(" "),
            ('dröm dröm NOUN|Case=Nom|Definite=Ind|Gender=Com|Number=Sing NN|UTR|SIN|IND|NOM').split(" "),
            ('. . PUNCT|_ MAD').split(" "),
        ]]

        # Explicit UTF-8: the data contains non-ASCII text ("dröm"), so the
        # test must not depend on the locale's default encoding.
        with tempfile.TemporaryFile(mode="w+", encoding="utf-8") as outfile:
            conll.tagged_to_tagged_conll(annotated_sentences, outfile)
            outfile.seek(0)
            file_data = outfile.read()

        # NOTE: the columns in the expected text are separated by tab characters.
        self.assertEqual(file_data.splitlines(), textwrap.dedent("""
            0	Jag	jag	PRON	PN|UTR|SIN|DEF|SUB	Case=Nom|Definite=Def|Gender=Com|Number=Sing
            1	har	ha	VERB	VB|PRS|AKT	Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act
            2	en	en	DET	DT|UTR|SIN|IND	Definite=Ind|Gender=Com|Number=Sing
            3	dröm	dröm	NOUN	NN|UTR|SIN|IND|NOM	Case=Nom|Definite=Ind|Gender=Com|Number=Sing
            4	.	.	PUNCT	MAD	_

        """).lstrip().splitlines())