Пример #1
0
 def test_get_passage_hypercontext_complex_xpath(self):
     """Passages stay reachable after re-parsing an exported cross-level range.

     The range ``pr.1-1.2`` of ``self.text_complex`` is serialised to a
     string and loaded into a fresh ``CapitainsCtsText`` that reuses the
     original citation scheme. Two passage lookups and the level-2
     reference listing are then checked against that new text.
     """
     failure_note = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

     exported_range = self.text_complex.getTextualNode(Reference("pr.1-1.2"))
     reparsed = CapitainsCtsText(
         resource=exported_range.tostring(encoding=str),
         citation=self.text_complex.citation,
     )

     # Notes are dropped from this export only; the next one keeps them.
     preface = reparsed.getTextualNode(Reference("pr.1"), simple=True)
     preface_plain = preface.export(
         output=Mimetypes.PLAINTEXT, exclude=["tei:note"]
     ).strip()
     self.assertIn("Pervincis tandem", preface_plain, failure_note)

     second_line = reparsed.getTextualNode(Reference("1.2"), simple=True)
     self.assertEqual(
         second_line.export(output=Mimetypes.PLAINTEXT).strip(),
         "lusimus quos in Suebae gratiam virgunculae,",
         failure_note,
     )

     listed = [str(reff) for reff in reparsed.getValidReff(level=2)]
     self.assertEqual(listed, ["pr.1", "1.1", "1.2"], failure_note)
Пример #2
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Every ``*.xml`` file found three directory levels below
    ``~/cltk_data/greek/text/greek_text_first1kgreek/data`` (paths containing
    ``__cts__`` are skipped) is loaded as a ``CapitainsCtsText``. The
    passages at the deepest citation level are exported as plain text with
    ``tei:note`` excluded, concatenated, and written to
    ``~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/<stem>.txt``.

    :raises FileNotFoundError: if the glob pattern matches no XML file.
    """
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError('No XML files match {}'.format(xml_dir))
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # splitext removes only the real extension; str.rstrip('.xml') would
        # also eat any trailing '.', 'x', 'm' or 'l' characters of the stem.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + '.txt')

        # The corpus is Greek text, so do not rely on the locale encoding.
        # NOTE(review): assumes the corpus XML files are UTF-8 - confirm.
        with open(xml_path, encoding='utf-8') as file_open:
            text = CapitainsCtsText(resource=file_open)
            plain_text = ''.join(
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"])
                for ref in text.getReffs(level=len(text.citation))
            )

        with open(new_plaintext_path, 'w', encoding='utf-8') as file_open:
            file_open.write(plain_text)
Пример #3
0
    def test_get_passage_hyper_context_double_slash_xpath(self):
        """Re-parsed excerpts of ``self.seneca`` still resolve their passages.

        Part one exports the range ``1-10`` through ``export`` and part two
        serialises the single passage ``1`` through ``tostring``. Each result
        is loaded into a new ``CapitainsCtsText`` that reuses the Seneca
        citation scheme, then checked for passage text and level-1 references.
        """
        failure_note = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

        # Part 1: a ten-line range, serialised with export().
        ten_lines = self.seneca.getTextualNode(Reference("1-10"))
        reparsed = CapitainsCtsText(
            resource=ten_lines.export(output=Mimetypes.XML.Std),
            citation=self.seneca.citation,
        )

        opening = reparsed.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            opening.export(output=Mimetypes.PLAINTEXT,
                           exclude=["tei:note"]).strip(),
            "Di coniugales tuque genialis tori,",
            failure_note,
        )

        closing = reparsed.getTextualNode(Reference("10"), simple=True)
        self.assertEqual(
            closing.export(output=Mimetypes.PLAINTEXT).strip(),
            "aversa superis regna manesque impios",
            failure_note,
        )

        self.assertEqual(
            [str(reff) for reff in reparsed.getValidReff(level=1)],
            [str(number) for number in range(1, 11)],
            failure_note,
        )

        # Part 2: a single passage, serialised with tostring().
        one_line = self.seneca.getTextualNode(Reference("1"))
        reparsed = CapitainsCtsText(
            resource=one_line.tostring(encoding=str),
            citation=self.seneca.citation,
        )

        opening = reparsed.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            opening.export(output=Mimetypes.PLAINTEXT,
                           exclude=["tei:note"]).strip(),
            "Di coniugales tuque genialis tori,",
            failure_note,
        )

        self.assertEqual(
            [str(reff) for reff in reparsed.getValidReff(level=1)],
            ["1"],
            failure_note,
        )
Пример #4
0
    def test_passage_extraction_fail_when_reffs_are_found(self):
        """Every reference listed at the deepest level must be extractable.

        Regression test drawn from
        https://github.com/PerseusDL/canonical-latinLit/issues/226
        (the passage that used to fail was 5.1).
        """
        with open("tests/testing_data/texts/extraction_issue.xml") as xml_file:
            root = etree.parse(xml_file).getroot()
            document = CapitainsCtsText(resource=root)
            deepest_level = len(document.citation)

            extracted = []
            for reference in document.getReffs(level=deepest_level):
                try:
                    node = document.getTextualNode(reference)
                except IndexError:
                    raise Exception("Unable to extract %s " % reference)
                extracted.append(node)

            exported = []
            for node in extracted:
                rendered = node.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                exported.append(rendered.strip())

            self.assertIn(
                "NUNC et praedictos et regni sorte sequentes",
                exported,
                "The text of 5.1 should be in plaintext",
            )
Пример #5
0
 def build_texts(self, text):
     """Write the deepest-level passages of one TEI file to a plain-text file.

     The file is parsed with ``etree.parse`` and wrapped in a
     ``CapitainsCtsText``. Each reference at the deepest citation level is
     exported as stripped plain text with ``tei:note`` excluded. When
     ``self.cites`` is ``True`` each passage is prefixed with
     ``#<reference>#`` on its own line. Passages are joined by blank lines
     and written to ``<self.dest>text/<name>.txt``.

     :param text: '/'-separated path of the XML file. Its last component,
         with every ``.xml`` occurrence removed, names the output file.
     """
     interactive_text = CapitainsCtsText(
         resource=etree.parse(text).getroot())
     reffs = interactive_text.getReffs(level=len(interactive_text.citation))
     passages = [
         interactive_text.getTextualNode(passage) for passage in reffs
     ]
     plaintext = [
         r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip()
         for r in passages
     ]
     if self.cites is True:
         # format() gives the same result as '+' for str references and
         # also accepts reference objects that are not plain str.
         plaintext = [
             '#{}#\n{}'.format(reff, body)
             for reff, body in zip(reffs, plaintext)
         ]
     out_path = '{}text/{}.txt'.format(
         self.dest,
         text.split('/')[-1].replace('.xml', ''))
     # Explicit UTF-8: passage text is not guaranteed to be encodable in
     # the platform's default encoding.
     with open(out_path, mode='w', encoding='utf-8') as f:
         f.write('\n\n'.join(plaintext))
Пример #6
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Every ``*.xml`` file found three directory levels below
    ``<cltk data dir>/greek/text/greek_text_first1kgreek/data`` (paths
    containing ``__cts__`` are skipped) is loaded as a ``CapitainsCtsText``.
    The passages at the deepest citation level are exported as plain text
    with ``tei:note`` excluded, concatenated, and written to
    ``<cltk data dir>/greek/text/greek_text_first1kgreek_plaintext/<stem>.txt``.

    :raises FileNotFoundError: if the glob pattern matches no XML file.
    """
    xml_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        )
        raise FileNotFoundError('No XML files match {}'.format(xml_dir))
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # splitext removes only the real extension; str.rstrip('.xml') would
        # also eat any trailing '.', 'x', 'm' or 'l' characters of the stem.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + '.txt')

        # The corpus is Greek text, so do not rely on the locale encoding.
        # NOTE(review): assumes the corpus XML files are UTF-8 - confirm.
        with open(xml_path, encoding='utf-8') as file_open:
            text = CapitainsCtsText(resource=file_open)
            plain_text = ''.join(
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"])
                for ref in text.getReffs(level=len(text.citation))
            )

        with open(new_plaintext_path, 'w', encoding='utf-8') as file_open:
            file_open.write(plain_text)
Пример #7
0
    def test_get_Passage_context_no_double_slash(self):
        """Passages exported from ``self.TEI`` stay addressable once re-parsed.

        Four excerpts are serialised with ``tostring`` and loaded into new
        ``CapitainsCtsText`` objects that reuse the original citation
        scheme: a single reference, a range under one parent, a range across
        parents at the same level, and a range whose ends sit at different
        levels. Passage text and level-3 reference listings are checked on
        the re-parsed texts.
        """
        base_note = "Ensure passage finding with context is fully TEI / Capitains compliant "
        note_single = base_note + "(One reference CapitainsCtsPassage)"
        note_same_parent = base_note + "(Same level same parent range CapitainsCtsPassage)"
        note_same_level = base_note + "(Same level range CapitainsCtsPassage)"
        note_diff_level = base_note + "(Different level range CapitainsCtsPassage)"

        def reparse(span):
            # Serialise the requested excerpt and load it as a new text.
            excerpt = self.TEI.getTextualNode(Reference(span))
            return CapitainsCtsText(
                resource=excerpt.tostring(encoding=str),
                citation=self.TEI.citation,
            )

        def plain(document, ref):
            # Stripped plain-text export of one passage of ``document``.
            node = document.getTextualNode(Reference(ref), simple=True)
            return node.export(output=Mimetypes.PLAINTEXT).strip()

        def level_three(document):
            return [str(reff) for reff in document.getValidReff(level=3)]

        line_pr_2 = "tum, ut de illis queri non possit quisquis de se bene"
        line_pr_3 = "senserit, cum salva infimarum quoque personarum re-"
        line_1_1_6 = "Rari post cineres habent poetae."

        # Expected level-3 references, grouped by parent.
        preface_2_to_22 = ["1.pr.%d" % n for n in range(2, 23)]
        poem_one = ["1.1.%d" % n for n in range(1, 7)]
        poem_two = ["1.2.%d" % n for n in range(1, 9)]

        # Single reference.
        text = reparse("1.pr.2")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, note_single)

        # Range inside one parent.
        text = reparse("1.pr.2-1.pr.7")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, note_same_parent)
        self.assertEqual(plain(text, "1.pr.3"), line_pr_3, note_same_parent)
        self.assertEqual(
            level_three(text),
            ["1.pr.%d" % n for n in range(2, 8)],
            note_same_parent,
        )

        # Range across parents, both ends at the same level.
        text = reparse("1.pr.2-1.1.6")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, note_same_level)
        self.assertEqual(plain(text, "1.1.6"), line_1_1_6, note_same_level)
        self.assertEqual(
            level_three(text),
            preface_2_to_22 + poem_one,
            note_same_level,
        )

        # Range whose end reference is one level shallower than its start.
        text = reparse("1.pr.2-1.2")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, note_diff_level)
        self.assertEqual(plain(text, "1.1.6"), line_1_1_6, note_diff_level)
        self.assertEqual(
            level_three(text),
            preface_2_to_22 + poem_one + poem_two,
            note_diff_level,
        )
Пример #8
0
from lxml.etree import tostring

# Open the sample file and hand the IO object to the `resource` argument
# to build a Text object.
with open("./tests/testing_data/examples/text.martial.xml") as f:
    text = CapitainsCtsText(resource=f)

# Text objects have a `citation` property, and len(text.citation) is the
# depth of the citation scheme. For this sample that is 3 (Book, Poem,
# Line), so the loop visits every reference at the deepest level.
for ref in text.getReffs(level=len(text.citation)):
    # A reference can be passed in several forms, including a list of
    # strings. `simple=True` returns a lightweight object that contains
    # only the targeted node.
    psg = text.getTextualNode(subreference=ref, simple=True)
    # Export the passage as plain text, leaving out <note> nodes, and
    # print it tab-separated after its reference.
    plain = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
    print("\t".join((ref, plain)))

"""
Running the loop above prints something like the following:

    1.pr.1	Spero me secutum in libellis meis tale temperamen-
    1.pr.2	tum, ut de illis queri non possit quisquis de se bene
    1.pr.3	senserit, cum salva infimarum quoque personarum re-
    1.pr.4	verentia ludant; quae adeo antiquis auctoribus defuit, ut
    1.pr.5	nominibus non tantum veris abusi sint, sed et magnis.
    1.pr.6	Mihi fama vilius constet et probetur in me novissimum

"""