Exemplo n.º 1
0
 def test_parse_sent_count(self):
     """Parse the poc-turtle corpus with a progress bar and check the sentence count.

     The parser is run with no reference file and no option bits set; the
     parse metrics it returns must report 12 sentences.
     """
     pr = LGInprocParser()
     bar = TextProgress(total=12, desc="Overal progress")
     pm, _ = pr.parse("tests/test-data/dict/poc-turtle",
                      "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                      f"{self.tmp_dir}/poc-turtle.txt.ull", None, 0, bar)
     # The previous assertion compared the literal 12 with itself and could
     # never fail; check the count actually produced by the parser instead.
     self.assertEqual(12, pm.sentences)
Exemplo n.º 2
0
    def test_parse_file_with_api(self):
        """Check that the in-process and API parsers agree on the poc-english corpus.

        Both parsers are called with the same dictionary, corpus, output path,
        reference (None) and option bits. The parse quality of each run is
        printed for inspection, and the returned parse metrics must be equal.
        """
        options = BIT_NO_LWALL | BIT_NO_PERIOD | BIT_STRIP | BIT_RM_DIR | BIT_LOC_LANG | BIT_PARSE_QUALITY \
                  | BIT_EXISTING_DICT

        lgp = LGInprocParser()
        api = LGApiParser()

        # Named `dictionary` rather than `dict` so the builtin is not shadowed.
        dictionary = "en"
        corp = "tests/test-data/corpora/poc-english/poc_english.txt"
        outp = "/var/tmp/temp"
        reff = None

        m1, q1 = lgp.parse(dictionary, corp, outp, reff, options)
        m2, q2 = api.parse(dictionary, corp, outp, reff, options)

        print(f"q1=\n{q1.parse_quality_str(q1)}\n")
        print(f"q2=\n{q2.parse_quality_str(q2)}\n")

        # assertEqual reports both operands on failure, unlike assertTrue(a == b).
        self.assertEqual(m1, m2)
Exemplo n.º 3
0
 def test_parse_file_not_found(self):
     """parse() must raise FileNotFoundError for these arguments.

     The fourth argument ('poc-horse.txt') is presumably the missing file
     that triggers the error -- confirm against the test data.
     """
     # Build the parser outside the assertRaises block so that only the
     # parse() call itself can satisfy the expected exception.
     pr = LGInprocParser()
     with self.assertRaises(FileNotFoundError):
         pr.parse("tests/test-data/dict/poc-turtle",
                  "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                  f"{self.tmp_dir}/poc-turtle.txt.ull",
                  "tests/test-data/corpora/poc-turtle/poc-horse.txt",
                  BIT_PARSE_QUALITY)
Exemplo n.º 4
0
 def test_parse_batch_ps_output(self):
     """The postscript text in lg_post_output must yield exactly 12 sentences."""
     parser = LGInprocParser()
     parsed = parser._parse_batch_ps_output(lg_post_output, 0)
     num_sent = len(parsed)
     # Build the failure message up front; it is only shown if the check fails.
     failure_msg = "'parse_batch_ps_output()' returns '{}' instead of '{}'".format(
         num_sent, 12)
     self.assertEqual(num_sent, 12, failure_msg)
Exemplo n.º 5
0
 def test_parse_batch_ps_output_explosion(self):
     """The 'combinatorial explosion' postscript sample must yield 4 sentences."""
     parser = LGInprocParser(verbosity=0)
     parsed = parser._parse_batch_ps_output(lg_post_explosion, 0)
     num_sent = len(parsed)
     # Build the failure message up front; it is only shown if the check fails.
     failure_msg = "'parse_batch_ps_output()' returns '{}' instead of '{}'".format(
         num_sent, 4)
     self.assertEqual(num_sent, 4, failure_msg)
Exemplo n.º 6
0
    def test_parse_invalid_file_format(self):
        """parse() must raise LGParseError when the corpus file is also passed
        as the fourth argument (the slot other calls use for the reference file).
        """
        # Build the parser outside the assertRaises block so that only the
        # parse() call itself can satisfy the expected exception.
        pr = LGInprocParser()
        with self.assertRaises(LGParseError):
            pr.parse("tests/test-data/dict/poc-turtle",
                     "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                     f"{self.tmp_dir}/poc-turtle-01.txt.ull",
                     "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                     BIT_PARSE_QUALITY)
Exemplo n.º 7
0
 def test_parse_file_not_found(self):
     """parse() must raise FileNotFoundError for these arguments.

     The fourth argument ('poc-horse.txt') is presumably the missing file
     that triggers the error -- confirm against the test data.
     """
     # Build the parser outside the assertRaises block so that only the
     # parse() call itself can satisfy the expected exception.
     pr = LGInprocParser()
     with self.assertRaises(FileNotFoundError):
         pr.parse("tests/test-data/dict/poc-turtle",
                  "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                  "/var/tmp/parse",
                  "tests/test-data/corpora/poc-turtle/poc-horse.txt",
                  BIT_PARSE_QUALITY)
Exemplo n.º 8
0
    def test_parse_invalid_ref_file(self):
        """parse() must raise LGParseError when the poc-english corpus is
        paired with the poc-turtle expected-parses file as its fourth argument.
        """
        # Build the parser outside the assertRaises block so that only the
        # parse() call itself can satisfy the expected exception.
        pr = LGInprocParser()
        with self.assertRaises(LGParseError):
            pr.parse(
                "tests/test-data/dict/poc-turtle",
                "tests/test-data/corpora/poc-english/poc_english.txt",
                f"{self.tmp_dir}/poc_english.txt.ull",
                "tests/test-data/parses/poc-turtle-mst/poc-turtle-parses-expected.txt",
                BIT_PARSE_QUALITY)
Exemplo n.º 9
0
    def test_second_linkage_issue(self):
        """The raw postscript text in GCB-NQ.txt.raw must split into 229 sentences."""
        raw_path = "tests/test-data/second-linkage-test/GCB-NQ.txt.raw"
        with open(raw_path) as raw_file:
            raw_text = raw_file.read()

        option_bits = BIT_EXISTING_DICT | BIT_NO_LWALL | BIT_NO_PERIOD | BIT_STRIP

        parsed = LGInprocParser()._parse_batch_ps_output(raw_text, option_bits)

        self.assertEqual(229, len(parsed))
Exemplo n.º 10
0
    def test_max_sentence_len(self):
        """With max_sentence_len=3, 2 sentences are parsed and 19 are skipped."""
        parser = LGInprocParser()

        corpus_path = "tests/test-data/sentence-skip-test/issue-184.txt"
        output_path = f"{self.tmp_dir}/issue-184.ull"
        option_bits = (BIT_EXISTING_DICT | BIT_NO_LWALL | BIT_NO_PERIOD
                       | BIT_STRIP)

        # No reference file is supplied (fourth argument is None).
        metrics, _quality = parser.parse("en", corpus_path, output_path, None,
                                         option_bits, max_sentence_len=3)

        self.assertEqual(2, metrics.sentences)
        self.assertEqual(19, metrics.skipped_sentences)
Exemplo n.º 11
0
    def test_parse_batch_ps_output_sharp(self):
        """The 'sharp sign token suffix' postscript sample must yield one sentence."""
        parser = LGInprocParser(verbosity=1)
        parsed = parser._parse_batch_ps_output(sharp_sign_linkage, 0)
        num_sent = len(parsed)
        # Build the failure message up front; it is only shown if the check fails.
        failure_msg = "'parse_batch_ps_output()' returns '{}' instead of '{}'".format(
            num_sent, 1)
        self.assertEqual(num_sent, 1, failure_msg)

        # Dump the single parsed sentence for manual inspection.
        first = parsed[0]
        print(first.text)
        print(first.linkages)
Exemplo n.º 12
0
    def test_parse_invalid_ref_file(self):
        """parse() must raise LGParseError when the poc-english corpus is
        paired with the poc-turtle expected-parses file as its fourth argument.
        """
        pr = LGInprocParser()
        # The previous try/except printed any exception and then passed, so the
        # test could never fail. Assert the expected error explicitly instead.
        with self.assertRaises(LGParseError):
            pr.parse(
                "tests/test-data/dict/poc-turtle",
                "tests/test-data/corpora/poc-english/poc_english.txt",
                "/var/tmp/parse",
                "tests/test-data/parses/poc-turtle-mst/poc-turtle-parses-expected.txt",
                BIT_PARSE_QUALITY)
Exemplo n.º 13
0
    def test_parse_batch_ps_output_explosion(self):
        """The explosion_no_linkages sample must yield one sentence with one linkage."""
        expected_text = (
            "But there still remained all the damage that had been done that day , and the king "
            "had nothing with which to pay for this.")
        parser = LGInprocParser()

        # Echo the fixture so it is visible in the test output.
        print(explosion_no_linkages)

        parsed = parser._parse_batch_ps_output(explosion_no_linkages, 0)

        self.assertEqual(1, len(parsed))
        only_sentence = parsed[0]
        self.assertEqual(expected_text, only_sentence.text)
        self.assertEqual(1, len(only_sentence.linkages))
Exemplo n.º 14
0
    def test_parse_invalid_file_format(self):
        """parse() must raise LGParseError when the corpus file is also passed
        as the fourth argument (the slot other calls use for the reference file).
        """
        # Build the parser outside the assertRaises block so that only the
        # parse() call itself can satisfy the expected exception.
        pr = LGInprocParser()
        with self.assertRaises(LGParseError):
            pr.parse("tests/test-data/dict/poc-turtle",
                     "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                     "/var/tmp/parse",
                     "tests/test-data/corpora/poc-turtle/poc-turtle.txt",
                     BIT_PARSE_QUALITY)
Exemplo n.º 15
0
    def test_parseability_coinsedence(self):
        """Check that the poc-english corpus gives the same results as a single
        file and when split into multiple files.

        Both runs use the "en" dictionary and the same destination directory;
        each is compared against its own reference. The two metric objects and
        the two quality objects must be equal, and the single-file run must
        report 100.00% parseability, 0.00% completely unparsed and 100.00%
        completely parsed.
        """
        lang_dict = "en"
        single_corpus = handle_path_string("tests/test-data/corpora/poc-english/poc_english.txt")
        multi_corpus = handle_path_string("tests/test-data/corpora/poc-english-multi")
        dest = handle_path_string("/var/tmp/test_parseability_coinsedence")
        self.create_path(dest)

        single_ref = handle_path_string("tests/test-data/parses/poc-english-ref/poc_english.txt.ull")
        multi_ref = handle_path_string("tests/test-data/parses/poc-english-multi-ref")

        tester = GrammarTester(grmr, tmpl, limit, LGInprocParser())

        # Both runs use the shared option bits plus BIT_EXISTING_DICT.
        run_options = opts | BIT_EXISTING_DICT
        pm1, pq1 = tester.test(lang_dict, single_corpus, dest, single_ref, run_options)
        pm2, pq2 = tester.test(lang_dict, multi_corpus, dest, multi_ref, run_options)

        self.assertEqual(pm1, pm2)
        self.assertEqual(pq1, pq2)

        self.assertEqual("100.00%", pm1.parseability_str(pm1).strip())
        self.assertEqual("0.00%", pm1.completely_unparsed_str(pm1).strip())
        self.assertEqual("100.00%", pm1.completely_parsed_str(pm1).strip())
Exemplo n.º 16
0
    def test_parseability_multi_file(self):
        """Test the multi-file poc-english corpus with the poc-turtle dictionary.

        No reference is supplied. The run must report 88 sentences, 2.46%
        parseability and 90.91% completely unparsed.
        """
        turtle_dict = handle_path_string("tests/test-data/dict/poc-turtle")
        corpus_dir = handle_path_string("tests/test-data/corpora/poc-english-multi")
        dest = handle_path_string("/var/tmp/test_parseability_multi_file")
        self.create_path(dest)

        reference = None

        tester = GrammarTester(grmr, tmpl, limit, LGInprocParser())
        metrics, _quality = tester.test(turtle_dict, corpus_dir, dest, reference,
                                        (opts | BIT_EXISTING_DICT))

        self.assertEqual(88, metrics.sentences)
        self.assertEqual("2.46%", metrics.parseability_str(metrics).strip())
        self.assertEqual("90.91%", metrics.completely_unparsed_str(metrics).strip())
Exemplo n.º 17
0
    def test_parse_batch_ps_output_explosion_merged_sentences(self):
        """The merged_ps_parses sample must split into two sentences, each with one linkage."""
        first_text = (
            "There the train was coming mother was holding Jem's hand Dog Monday was licking it everybody "
            "was saying good-bye the train was in !")
        first_linkage = (
            "[([There])([the])([train])([was])([coming])([mother])([was])([holding])([Jem's])([hand])"
            "([Dog])([Monday])([was])([licking])([it])([everybody])([was])([saying])([good-bye])([the])"
            "([train])([was])([in])([!])][][0]")
        parser = LGInprocParser()

        # Echo the fixture so it is visible in the test output.
        print(merged_ps_parses)

        parsed = parser._parse_batch_ps_output(merged_ps_parses, 0)

        self.assertEqual(2, len(parsed))
        self.assertEqual(first_text, parsed[0].text)
        self.assertEqual(first_linkage, parsed[0].linkages[0])
        self.assertEqual("They had gone.", parsed[1].text)
        self.assertEqual(1, len(parsed[0].linkages))
        self.assertEqual(1, len(parsed[1].linkages))
Exemplo n.º 18
0
    def test_min_word_count(self):
        """Check token counting and the effect of min_word_count on parsing.

        update_token_counts() must report 48 tokens in total, with "isa" and
        "has" counted 6 times and "tuna" twice. Parsing with min_word_count=1
        must then give 12 sentences and none skipped; with min_word_count=2,
        10 sentences and 2 skipped.
        """
        corpus_file_path = "tests/test-data/corpora/poc-turtle/poc-turtle-dot-separated.txt"
        option_bits = BIT_EXISTING_DICT | BIT_NO_LWALL | BIT_NO_PERIOD | BIT_STRIP

        token_counts = {}
        total_count = update_token_counts(corpus_file_path, token_counts,
                                          option_bits)

        self.assertEqual(48, total_count)
        self.assertEqual(6, token_counts.get("isa", 0))
        self.assertEqual(6, token_counts.get("has", 0))
        self.assertEqual(2, token_counts.get("tuna", 0))

        dict_path = "tests/test-data/dict/poc-turtle"
        parser = LGInprocParser()

        # Both runs write to the same output file, named after the corpus file.
        metrics, _quality = parser.parse(
            dict_path,
            corpus_file_path,
            f"{self.tmp_dir}/{os.path.basename(corpus_file_path)}",
            None,
            option_bits,
            min_word_count=1,
            token_counts=token_counts)

        self.assertEqual(12, metrics.sentences)
        self.assertEqual(0, metrics.skipped_sentences)

        metrics, _quality = parser.parse(
            dict_path,
            corpus_file_path,
            f"{self.tmp_dir}/{os.path.basename(corpus_file_path)}",
            None,
            option_bits,
            min_word_count=2,
            token_counts=token_counts)

        self.assertEqual(10, metrics.sentences)
        self.assertEqual(2, metrics.skipped_sentences)
Exemplo n.º 19
0
    def test_test(self):
        """Run GrammarTester.test() with settings defined outside this method
        and expect 88 sentences in the returned metrics.
        """
        parser = LGInprocParser()

        # Show the inputs taken from the enclosing scope, one per line.
        print(dict, corp, dest, ref, sep="\n")

        tester = GrammarTester(grmr, tmpl, limit, parser)
        metrics, _quality = tester.test(dict, corp, dest, ref, opts)

        print(metrics.text(metrics))

        self.assertEqual(88, metrics.sentences)
Exemplo n.º 20
0
    def test_stop_tokens(self):
        """Check how stop_tokens changes the parsed/skipped sentence counts.

        The poc-turtle corpus is parsed three times with the same dictionary,
        reference file and option bits: without stop tokens (12 parsed,
        0 skipped), with "isa" (6 parsed, 6 skipped) and with "tuna herring"
        (8 parsed, 4 skipped). Each run writes to its own output file.
        """
        dict_path = "tests/test-data/dict/poc-turtle"
        corpus_path = "tests/test-data/corpora/poc-turtle/poc-turtle.txt"
        ref_path = "tests/test-data/parses/poc-turtle-mst/poc-turtle-parses-expected.txt"
        option_bits = (BIT_PARSE_QUALITY | BIT_EXISTING_DICT | BIT_NO_LWALL
                       | BIT_NO_PERIOD | BIT_STRIP)

        # (output file name, extra parse() keyword arguments,
        #  expected parsed sentences, expected skipped sentences)
        cases = (
            ("poc-turtle-02.txt.ull", {}, 12, 0),
            ("poc-turtle-03.txt.ull", {"stop_tokens": "isa"}, 6, 6),
            ("poc-turtle-04.txt.ull", {"stop_tokens": "tuna herring"}, 8, 4),
        )

        parser = LGInprocParser()
        for out_name, extra_kwargs, expected_parsed, expected_skipped in cases:
            metrics, _quality = parser.parse(
                dict_path,
                corpus_path,
                f"{self.tmp_dir}/{out_name}",
                ref_path,
                option_bits,
                **extra_kwargs)

            self.assertEqual(expected_parsed, metrics.sentences)
            self.assertEqual(expected_skipped, metrics.skipped_sentences)
 def setUp(self):
     """Create an LGInprocParser and store it on self.parser."""
     self.parser = LGInprocParser()