Python CapitainsCtsText.getTextualNode примеры, MyCapytain.resources.texts.local.capitains.cts.CapitainsCtsText.getTextualNode Python примеры использования

Пример #1

0

Показать файл

Файл: commonTests.py Проект: rillian/MyCapytain

 def test_get_passage_hypercontext_complex_xpath(self):
     """Re-parse an exported range of ``self.text_complex`` and query it.

     The range ``pr.1-1.2`` is serialised to a string, loaded as a new
     CapitainsCtsText that reuses the original citation, and then queried
     for two single passages and for its level-2 references.
     """
     failure_msg = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

     excerpt = self.text_complex.getTextualNode(Reference("pr.1-1.2"))
     text = CapitainsCtsText(
         resource=excerpt.tostring(encoding=str),
         citation=self.text_complex.citation,
     )

     # Passage pr.1, exported without its tei:note children.
     preface = text.getTextualNode(Reference("pr.1"), simple=True)
     preface_plain = preface.export(
         output=Mimetypes.PLAINTEXT, exclude=["tei:note"]
     ).strip()
     self.assertIn("Pervincis tandem", preface_plain, failure_msg)

     # Passage 1.2, exported as-is.
     second_line = text.getTextualNode(Reference("1.2"), simple=True)
     self.assertEqual(
         second_line.export(output=Mimetypes.PLAINTEXT).strip(),
         "lusimus quos in Suebae gratiam virgunculae,",
         failure_msg,
     )

     # References listed at level 2 of the re-parsed excerpt.
     found_refs = [str(ref) for ref in text.getValidReff(level=2)]
     self.assertEqual(found_refs, ["pr.1", "1.1", "1.2"], failure_msg)

Пример #2

0

Показать файл

Файл: tei.py Проект: TylerKirby/cltk

def onekgreek_tei_xml_to_text_capitains():
    """Use the MyCapytain library to convert First1KGreek TEI XML to plaintext.

    Every ``*.xml`` file matching
    ``~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml`` whose
    path does not contain ``__cts__`` is loaded with ``CapitainsCtsText``.
    The passages at the deepest citation level are exported as plain text
    (``tei:note`` excluded), concatenated, and written to
    ``~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/<name>.txt``.

    :raises FileNotFoundError: if no XML file matches the corpus pattern.
    """
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # Output directory for the plaintext files.
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # os.path.splitext removes exactly one extension. The previous
        # str.rstrip('.xml') stripped any trailing run of the characters
        # '.', 'x', 'm' and 'l', which could eat the end of the file stem.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        xml_name = stem + '.txt'

        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            text_lines = []
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_lines.append(psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]))
        plain_text = ''.join(text_lines)

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)

Пример #3

0

Показать файл

Файл: commonTests.py Проект: rillian/MyCapytain

    def test_get_passage_hyper_context_double_slash_xpath(self):
        """Re-parse exported excerpts of ``self.seneca`` and query them.

        Two excerpts are checked: the range ``1-10``, exported through
        ``export(output=Mimetypes.XML.Std)``, and the single passage ``1``,
        serialised with ``tostring``. Each is loaded as a new
        CapitainsCtsText that reuses the original citation.
        """
        failure_msg = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
        opening_line = "Di coniugales tuque genialis tori,"

        # First pass: the range 1-10.
        ranged = self.seneca.getTextualNode(Reference("1-10"))
        text = CapitainsCtsText(
            resource=ranged.export(output=Mimetypes.XML.Std),
            citation=self.seneca.citation,
        )
        line_one = text.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            line_one.export(output=Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip(),
            opening_line,
            failure_msg,
        )
        line_ten = text.getTextualNode(Reference("10"), simple=True)
        self.assertEqual(
            line_ten.export(output=Mimetypes.PLAINTEXT).strip(),
            "aversa superis regna manesque impios",
            failure_msg,
        )
        self.assertEqual(
            [str(ref) for ref in text.getValidReff(level=1)],
            [str(number) for number in range(1, 11)],
            failure_msg,
        )

        # Second pass: passage 1 on its own.
        single = self.seneca.getTextualNode(Reference("1"))
        text = CapitainsCtsText(
            resource=single.tostring(encoding=str),
            citation=self.seneca.citation,
        )
        line_one = text.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            line_one.export(output=Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip(),
            opening_line,
            failure_msg,
        )
        self.assertEqual(
            [str(ref) for ref in text.getValidReff(level=1)],
            ["1"],
            failure_msg,
        )

Пример #4

0

Показать файл

    def test_passage_extraction_fail_when_reffs_are_found(self):
        """ This issue is drawn from https://github.com/PerseusDL/canonical-latinLit/issues/226

        Every reference listed at the deepest citation level must be
        retrievable with getTextualNode.
        """
        with open("tests/testing_data/texts/extraction_issue.xml") as xml_file:
            interactive_text = CapitainsCtsText(resource=etree.parse(xml_file).getroot())
            depth = len(interactive_text.citation)
            reffs = interactive_text.getReffs(level=depth)

            extracted = []
            # The failing passage was 5.1
            for reference in reffs:
                try:
                    node = interactive_text.getTextualNode(reference)
                except IndexError:
                    raise Exception("Unable to extract %s " % reference)
                extracted.append(node)

            # Plain-text export of every passage, notes excluded.
            plaintext = []
            for node in extracted:
                exported = node.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plaintext.append(exported.strip())

            self.assertIn(
                "NUNC et praedictos et regni sorte sequentes",
                plaintext,
                "The text of 5.1 should be in plaintext",
            )

Пример #5

0

Показать файл

 def build_texts(self, text):
     """Export the deepest-level passages of a TEI file to a text file.

     :param text: path of the XML file. It is handed to ``etree.parse``, and
         its last ``/``-separated component, with ``.xml`` removed, names
         the output file.

     The passages are exported as plain text without ``tei:note`` and
     written, separated by blank lines, to ``{self.dest}text/<name>.txt``.
     When ``self.cites`` is ``True`` each passage is preceded by a
     ``#<reference>#`` line.
     """
     root = etree.parse(text).getroot()
     interactive_text = CapitainsCtsText(resource=root)
     depth = len(interactive_text.citation)
     reffs = interactive_text.getReffs(level=depth)

     # Fetch every passage first, then export them all.
     passages = []
     for passage in reffs:
         passages.append(interactive_text.getTextualNode(passage))
     plaintext = []
     for node in passages:
         exported = node.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
         plaintext.append(exported.strip())

     if self.cites is True:
         plaintext = [
             '#' + reffs[index] + '#\n' + body
             for index, body in enumerate(plaintext)
         ]

     base_name = text.split('/')[-1].replace('.xml', '')
     out_path = '{}text/{}.txt'.format(self.dest, base_name)
     with open(out_path, mode='w') as out_file:
         out_file.write('\n\n'.join(plaintext))

Пример #6

0

Показать файл

Файл: tei.py Проект: gymnosophist/pharr_format

def onekgreek_tei_xml_to_text_capitains():
    """Use the MyCapytain library to convert First1KGreek TEI XML to plaintext.

    Every ``*.xml`` file matching
    ``<cltk data dir>/greek/text/greek_text_first1kgreek/data/*/*/*.xml``
    whose path does not contain ``__cts__`` is loaded with
    ``CapitainsCtsText``. The passages at the deepest citation level are
    exported as plain text (``tei:note`` excluded), concatenated, and written
    to ``<cltk data dir>/greek/text/greek_text_first1kgreek_plaintext/<name>.txt``.
    The data directory is whatever ``get_cltk_data_dir()`` returns.

    :raises FileNotFoundError: if no XML file matches the corpus pattern.
    """
    xml_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # Output directory for the plaintext files.
    new_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # os.path.splitext removes exactly one extension. The previous
        # str.rstrip('.xml') stripped any trailing run of the characters
        # '.', 'x', 'm' and 'l', which could eat the end of the file stem.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        xml_name = stem + '.txt'

        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            text_lines = []
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_lines.append(
                    psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]))
        plain_text = ''.join(text_lines)

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)

Пример #7

0

Показать файл

Файл: commonTests.py Проект: rillian/MyCapytain

    def test_get_Passage_context_no_double_slash(self):
        """ Check that re-parsed excerpts of ``self.TEI`` still resolve their passages and references """

        def reparse(reference):
            # Serialise the requested passage or range and load it as a
            # standalone text that reuses the original citation.
            node = self.TEI.getTextualNode(Reference(reference))
            return CapitainsCtsText(resource=node.tostring(encoding=str),
                                    citation=self.TEI.citation)

        def plain(source, reference):
            # Stripped plain-text export of a single passage.
            node = source.getTextualNode(Reference(reference), simple=True)
            return node.export(output=Mimetypes.PLAINTEXT).strip()

        def level_three(source):
            # References found at level 3, as strings.
            return [str(ref) for ref in source.getValidReff(level=3)]

        second_preface_line = "tum, ut de illis queri non possit quisquis de se bene"
        preface_2_to_22 = ["1.pr.%d" % number for number in range(2, 23)]
        poem_one = ["1.1.%d" % number for number in range(1, 7)]
        poem_two = ["1.2.%d" % number for number in range(1, 9)]

        # Single reference.
        text = reparse("1.pr.2")
        self.assertEqual(
            plain(text, "1.pr.2"),
            second_preface_line,
            "Ensure passage finding with context is fully TEI / Capitains compliant (One reference CapitainsCtsPassage)",
        )

        # Range whose two ends share a parent.
        same_parent_msg = "Ensure passage finding with context is fully TEI / Capitains compliant (Same level same parent range CapitainsCtsPassage)"
        text = reparse("1.pr.2-1.pr.7")
        self.assertEqual(plain(text, "1.pr.2"), second_preface_line, same_parent_msg)
        self.assertEqual(
            plain(text, "1.pr.3"),
            "senserit, cum salva infimarum quoque personarum re-",
            same_parent_msg,
        )
        self.assertEqual(
            level_three(text),
            ["1.pr.%d" % number for number in range(2, 8)],
            same_parent_msg,
        )

        # Range whose two ends are at the same level under different parents.
        same_level_msg = "Ensure passage finding with context is fully TEI / Capitains compliant (Same level range CapitainsCtsPassage)"
        text = reparse("1.pr.2-1.1.6")
        self.assertEqual(plain(text, "1.pr.2"), second_preface_line, same_level_msg)
        self.assertEqual(
            plain(text, "1.1.6"),
            "Rari post cineres habent poetae.",
            same_level_msg,
        )
        self.assertEqual(
            level_three(text),
            preface_2_to_22 + poem_one,
            same_level_msg,
        )

        # Range whose two ends are at different levels.
        different_level_msg = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
        text = reparse("1.pr.2-1.2")
        self.assertEqual(plain(text, "1.pr.2"), second_preface_line, different_level_msg)
        self.assertEqual(
            plain(text, "1.1.6"),
            "Rari post cineres habent poetae.",
            different_level_msg,
        )
        self.assertEqual(
            level_three(text),
            preface_2_to_22 + poem_one + poem_two,
            different_level_msg,
        )

Пример #8

0

Показать файл

from lxml.etree import tostring

# Open the sample file; the open handle itself is passed as the `resource`
# argument of the text object.
with open("./tests/testing_data/examples/text.martial.xml") as xml_file:
    text = CapitainsCtsText(resource=xml_file)

# len(text.citation) gives the depth of the citation scheme
# (for this sample that would be 3: Book, Poem, Line).
deepest_level = len(text.citation)

for ref in text.getReffs(level=deepest_level):
    # Fetch the passage for this reference. `simple=True` asks for a plain
    # object that only holds the targeted node.
    psg = text.getTextualNode(subreference=ref, simple=True)
    # Export the passage as plain text, leaving out <note> nodes, and print
    # it next to its reference, separated by a tab.
    columns = [ref, psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])]
    print("\t".join(columns))

"""
Running this prints something like the following:

    1.pr.1	Spero me secutum in libellis meis tale temperamen-
    1.pr.2	tum, ut de illis queri non possit quisquis de se bene
    1.pr.3	senserit, cum salva infimarum quoque personarum re-
    1.pr.4	verentia ludant; quae adeo antiquis auctoribus defuit, ut
    1.pr.5	nominibus non tantum veris abusi sint, sed et magnis.
    1.pr.6	Mihi fama vilius constet et probetur in me novissimum

"""

Python CapitainsCtsText.getTextualNode примеры использования