Пример #1
0
 def testCitationSetters(self):
     """Exercise the ``citation`` setter of a CapitainsCtsText.

     Checks the citation passed to the constructor, its replacement by an
     empty Citation, and its replacement by a Citation carrying a refsDecl.
     """
     blank = Citation()
     div_level = Citation(
         name="ahah",
         refsDecl="/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
         child=None)
     initial = Citation(
         name="ahah",
         refsDecl="/tei:TEI/tei:text/tei:body/tei:div/tei:z[@n='$1']",
         child=None)
     with open("tests/testing_data/texts/sample.xml", "rb") as sample:
         text = CapitainsCtsText(resource=sample, citation=initial)

     # Original setting: the object given to the constructor is returned
     self.assertIs(text.citation, initial)

     # Simple replacement by another Citation object
     text.citation = blank
     self.assertIs(text.citation, blank)

     # Replacement by a Citation with a refsDecl: every derived field is checked
     text.citation = div_level
     self.assertEqual(text.citation.name, "ahah")
     self.assertEqual(text.citation.child, None)
     self.assertEqual(
         text.citation.refsDecl,
         "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']")
     self.assertEqual(
         text.citation.scope,
         "/tei:TEI/tei:text/tei:body/tei:div")
     self.assertEqual(text.citation.xpath, "/tei:div[@n='?']")
Пример #2
0
    def test_node_collision(self):
        """ Test unique_passage against two citation schemes applied to the same frame

        The same unit is reused for both scenarios and flushed after each new text is loaded.
        """
        unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
        scenarios = (
            # XPaths relying on the descendant axis (//): expected to be reported as colliding
            (
                "/tei:TEI/tei:text/tei:body//tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div[@n='$1']//tei:div[@n='$2']",
                [False],
                "Wrong citation with node collision should fail",
            ),
            # XPaths relying on direct children only: expected to pass
            (
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
                [True],
                "Right citation with node collision should success",
            ),
        )

        for first_level, second_level, expected, message in scenarios:
            document = self.frame.format(
                first_level, second_level, 1, 1, 2, 3, 1, 1, 2)
            unit.xml = etree.ElementTree(etree.fromstring(document)).getroot()
            unit.Text = CapitainsCtsText(resource=unit.xml)
            unit.flush()
            self.assertEqual(list(unit.unique_passage()), expected, message)
Пример #3
0
 def test_get_passage_hypercontext_complex_xpath(self):
     """Rebuild a text from the range pr.1-1.2 and query passages inside it.

     The range is serialised to a string, parsed again with the citation of
     the source text, and then checked for two passages and for the list of
     level 2 references.
     """
     failure = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

     excerpt = self.text_complex.getTextualNode(Reference("pr.1-1.2"))
     excerpt_xml = excerpt.tostring(encoding=str)
     text = CapitainsCtsText(resource=excerpt_xml,
                             citation=self.text_complex.citation)

     preface = text.getTextualNode(Reference("pr.1"), simple=True)
     preface_plain = preface.export(output=Mimetypes.PLAINTEXT,
                                    exclude=["tei:note"]).strip()
     self.assertIn("Pervincis tandem", preface_plain, failure)

     second_line = text.getTextualNode(Reference("1.2"), simple=True)
     self.assertEqual(
         second_line.export(output=Mimetypes.PLAINTEXT).strip(),
         "lusimus quos in Suebae gratiam virgunculae,",
         failure)

     level_two = [str(ref) for ref in text.getValidReff(level=2)]
     self.assertEqual(level_two, ["pr.1", "1.1", "1.2"], failure)
Пример #4
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Every ``*.xml`` file found under
    ``~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/`` whose path
    does not contain ``__cts__`` is parsed with ``CapitainsCtsText``. The
    passages at the deepest citation level are exported as plain text
    (``tei:note`` excluded), concatenated, and written to
    ``~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/<name>.txt``.

    :raises FileNotFoundError: if no XML file of the corpus is found.
    """
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError('1K Greek corpus not installed.')
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir; exist_ok avoids the check-then-create race of isdir() + makedirs()
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # splitext() removes only the extension. str.rstrip('.xml') strips any
        # trailing '.', 'x', 'm' or 'l' character and would mangle names whose
        # stem ends with one of them (e.g. 'html.xml' -> 'ht').
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + '.txt')

        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            text_lines = [
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"])
                for ref in text.getReffs(level=len(text.citation))
            ]

        # Explicit UTF-8: the exported text is Greek and must not depend on the locale encoding
        with open(new_plaintext_path, 'w', encoding='utf-8') as file_open:
            file_open.write(''.join(text_lines))
Пример #5
0
 def test_Text_text_function(self, simple):
     """Check the plain-text export of a text rebuilt from passage 1.

     :param simple: forwarded as the ``simple`` argument of ``getTextualNode``
     """
     passage = self.seneca.getTextualNode(Reference("1"), simple=simple)
     passage_xml = passage.tostring(encoding=str)
     rebuilt = CapitainsCtsText(resource=passage_xml,
                                citation=self.seneca.citation)
     exported = rebuilt.export(output=Mimetypes.PLAINTEXT,
                               exclude=["tei:note"])
     self.assertEqual(
         exported.strip(),
         "Di coniugales tuque genialis tori,",
         "Ensure text methods works on CtsTextMetadata object")
Пример #6
0
    def test_warning(self):
        """ Check that getValidReff in debug mode warns about duplicate references at each level """
        with open("tests/testing_data/texts/duplicate_references.xml") as xml:
            text = CapitainsCtsText(resource=xml)

        with warnings.catch_warnings(record=True) as caught:
            # Without the "always" filter, repeated warnings could be silenced
            warnings.simplefilter("always")
            for level in (1, 2, 3):
                text.getValidReff(level=level, _debug=True)

        self.assertEqual(len(caught), 3, "There should be warning on each level")

        last_category = caught[-1].category
        self.assertEqual(
            issubclass(last_category, MyCapytain.errors.DuplicateReference),
            True, "Warning should be DuplicateReference")

        first_message = str(caught[0].message)
        self.assertEqual(first_message, "1",
                         "Warning message should be list of duplicate")
Пример #7
0
 def testURN(self):
     """ Check that the urn given to the constructor is exposed by the text"""
     expected_urn = "urn:cts:latinLit:phi1294.phi002.perseus-lat2"
     tei = CapitainsCtsText(resource=self.TEI.xml, urn=expected_urn)
     self.assertEqual(str(tei.urn), expected_urn)
Пример #8
0
    def parsable(self):
        """ Check that the text is parsable (as XML), then ingest it through MyCapytain.

        When the parent check succeeds but MyCapytain raises MissingRefsDecl, the message is logged,
        stored in ``capitains_errors`` and False is yielded. The status of the parent check is always
        yielded last.

        .. note:: Override super(parsable) and add CapiTainS Ingesting to it
        """
        parent_status = next(super(CTSText_TestUnit, self).parsable())
        if parent_status is not True:
            self.Text = None
        else:
            try:
                self.Text = CapitainsCtsText(resource=self.xml.getroot())
            except MissingRefsDecl as error:
                message = str(error)
                self.Text = None
                self.log(message)
                self.capitains_errors.append(message)
                yield False
        yield parent_status
Пример #9
0
 def build_texts(self, text):
     """Export the deepest-level passages of an XML file to a plain-text file.

     Passages are exported without ``tei:note``, stripped, optionally
     prefixed with their reference (when ``self.cites`` is True), joined by
     blank lines and written to ``<self.dest>text/<file name>.txt``.

     :param text: path of the XML file to convert
     """
     parsed = CapitainsCtsText(resource=etree.parse(text).getroot())
     reffs = parsed.getReffs(level=len(parsed.citation))
     nodes = [parsed.getTextualNode(ref) for ref in reffs]
     plaintext = [
         node.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip()
         for node in nodes
     ]

     if self.cites is True:
         # Each passage is preceded by its reference between '#' marks
         plaintext = [
             '#' + ref + '#\n' + body for ref, body in zip(reffs, plaintext)
         ]

     file_stem = text.split('/')[-1].replace('.xml', '')
     target = '{}text/{}.txt'.format(self.dest, file_stem)
     with open(target, mode='w') as f:
         f.write('\n\n'.join(plaintext))
Пример #10
0
    def test_passage_extraction_fail_when_reffs_are_found(self):
        """ Every reference found at the deepest level must be extractable as a passage

        This issue is drawn from https://github.com/PerseusDL/canonical-latinLit/issues/226
        """
        with open("tests/testing_data/texts/extraction_issue.xml") as text:
            parsed = CapitainsCtsText(resource=etree.parse(text).getroot())
            deepest_level = len(parsed.citation)

            passages = []
            # The failing passage was 5.1
            for reff in parsed.getReffs(level=deepest_level):
                try:
                    node = parsed.getTextualNode(reff)
                except IndexError:
                    raise Exception("Unable to extract %s " % reff)
                passages.append(node)

            plaintext = [
                node.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip()
                for node in passages
            ]
            self.assertIn(
                "NUNC et praedictos et regni sorte sequentes", plaintext,
                "The text of 5.1 should be in plaintext"
            )
Пример #11
0
    def test_empty_ref_warning(self):
        """ Check that getValidReff in debug mode warns about empty references at each level """
        levels = (1, 2, 3)
        with open("tests/testing_data/texts/empty_references.xml") as xml:
            text = CapitainsCtsText(resource=xml)

        with warnings.catch_warnings(record=True) as caught:
            # Without the "always" filter, repeated warnings could be silenced
            warnings.simplefilter("always")
            for level in levels:
                text.getValidReff(level=level, _debug=True)

        self.assertEqual(len(caught), 3, "There should be warning on each level")

        last_category = caught[-1].category
        self.assertEqual(
            issubclass(last_category, MyCapytain.errors.EmptyReference), True,
            "Warning should be EmptyReference")

        expected_messages = [
            "1 empty reference(s) at citation level {}".format(level)
            for level in levels
        ]
        self.assertEqual(
            [str(warning.message) for warning in caught],
            expected_messages,
            "Warning message should indicate number of references and the level at which they occur"
        )
Пример #12
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Every ``*.xml`` file found under
    ``<cltk data dir>/greek/text/greek_text_first1kgreek/data/*/*/`` whose
    path does not contain ``__cts__`` is parsed with ``CapitainsCtsText``.
    The passages at the deepest citation level are exported as plain text
    (``tei:note`` excluded), concatenated, and written to
    ``<cltk data dir>/greek/text/greek_text_first1kgreek_plaintext/<name>.txt``.

    :raises FileNotFoundError: if no XML file of the corpus is found.
    """
    xml_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        )
        raise FileNotFoundError('1K Greek corpus not installed.')
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir; exist_ok avoids the check-then-create race of isdir() + makedirs()
    new_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # splitext() removes only the extension. str.rstrip('.xml') strips any
        # trailing '.', 'x', 'm' or 'l' character and would mangle names whose
        # stem ends with one of them (e.g. 'html.xml' -> 'ht').
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + '.txt')

        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            text_lines = [
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"])
                for ref in text.getReffs(level=len(text.citation))
            ]

        # Explicit UTF-8: the exported text is Greek and must not depend on the locale encoding
        with open(new_plaintext_path, 'w', encoding='utf-8') as file_open:
            file_open.write(''.join(text_lines))
Пример #13
0
    def test_get_passage_hyper_context_double_slash_xpath(self):
        """ Rebuild texts from a range (1-10) and from a single passage (1), then query them

        Each excerpt is serialised, parsed again with the citation of the source text, and checked
        for passage content and for the list of level 1 references.
        """
        failure = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

        # Range 1-10, serialised through export()
        excerpt = self.seneca.getTextualNode(Reference("1-10"))
        excerpt_xml = excerpt.export(output=Mimetypes.XML.Std)
        text = CapitainsCtsText(resource=excerpt_xml,
                                citation=self.seneca.citation)

        first_line = text.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            first_line.export(output=Mimetypes.PLAINTEXT,
                              exclude=["tei:note"]).strip(),
            "Di coniugales tuque genialis tori,",
            failure
        )
        tenth_line = text.getTextualNode(Reference("10"), simple=True)
        self.assertEqual(
            tenth_line.export(output=Mimetypes.PLAINTEXT).strip(),
            "aversa superis regna manesque impios",
            failure
        )
        self.assertEqual(
            [str(ref) for ref in text.getValidReff(level=1)],
            ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
            failure
        )

        # Single passage 1, serialised through tostring()
        excerpt = self.seneca.getTextualNode(Reference("1"))
        excerpt_xml = excerpt.tostring(encoding=str)
        text = CapitainsCtsText(resource=excerpt_xml,
                                citation=self.seneca.citation)

        first_line = text.getTextualNode(Reference("1"), simple=True)
        self.assertEqual(
            first_line.export(output=Mimetypes.PLAINTEXT,
                              exclude=["tei:note"]).strip(),
            "Di coniugales tuque genialis tori,",
            failure
        )
        self.assertEqual(
            [str(ref) for ref in text.getValidReff(level=1)], ["1"],
            failure
        )
Пример #14
0
    def _get_citable_text(self, fileid):
        """
        Open a file of the corpus and parse it as a citable text.

        Parameters
        ----------
        fileid: str
            The file identifier of the file to read; it is joined to
            ``self._root`` to obtain the location to open

        Returns
        -------
        CapitainsCtsText object

        """
        location = self._root.join(fileid)
        with open(location) as handle:
            return CapitainsCtsText(resource=handle)
Пример #15
0
 def test_illegal_characters_pass(self):
     """ Test that forbidden passes when there are no illegal characters"""
     document = self.frame.format(
         "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
         "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
         0, 1, "q", "b", "105v", "1", "2")

     unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
     unit.xml = etree.ElementTree(etree.fromstring(document)).getroot()
     unit.Text = CapitainsCtsText(resource=unit.xml)
     unit.flush()

     # passages() must run first: it is the step which fills unit.forbiddens
     passage_results = list(unit.passages())
     self.assertEqual(passage_results, [True, True], "Passages are found")

     unit.test_status['passages'] = True
     forbidden_results = list(unit.forbidden())
     self.assertEqual(
         forbidden_results, [True],
         "Illegal characters should pass if no forbidden characters")
     self.assertEqual(
         unit.forbiddens, [],
         "All passage IDs containing forbidden characters should be stored."
     )
Пример #16
0
    def test_illegal_characters_fail(self):
        """ Test that illegal characters are detected, logged and stored"""
        document = self.frame.format(
            "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
            "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
            "0 1", "a.b", "d-d", "@", "7", "1", "2")

        unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
        unit.xml = etree.ElementTree(etree.fromstring(document)).getroot()
        unit.Text = CapitainsCtsText(resource=unit.xml)
        unit.flush()

        # passages() must run first: it is the step which fills unit.forbiddens
        passage_results = list(unit.passages())
        self.assertEqual(passage_results, [True, True], "Passages are found")

        forbidden_results = list(unit.forbidden())
        self.assertEqual(forbidden_results, [False], "Illegal character should fail")
        self.assertIn(
            ">>>>>> Reference with forbidden characters found: '0 1', '0 1.a.b', '0 1.d-d', '0 1.@'",
            unit.logs)
        self.assertCountEqual(
            unit.forbiddens, ["'0 1'", "'0 1.a.b'", "'0 1.d-d'", "'0 1.@'"],
            "All passage IDs containing forbidden characters should be stored."
        )
Пример #17
0
class CTSText_TestUnit(TESTUnit):
    """ CTS testing object

    :param path: Path to the file
    :type path: basestring
    :param countwords: Count the number of words and log it if necessary
    :type countwords: bool

    :cvar tests: Contains the list of methods to be run against the text
    :type tests: [str]
    :cvar readable: Human friendly string associated to object methods
    :type readable: dict

    :ivar inv: List of URN retrieved in metadata. Used to check the availability of metadata for the text
    :type inv: [str]
    :ivar scheme: Scheme to be used to check the
    :type scheme: str
    :ivar Text: Text object according to MyCapytains parsing. Used to find passages
    :type Text: MyCapytain.resources.text.local.Text

    Shared variables with parent class:

    :ivar path: Path for the resource
    :type path: str
    :ivar xml: XML resource, parsed in python. Used to do general checking
    :type xml: lxml._etree.Element

    .. note:: All method in CTSText_TestUnit.tests ( "parsable", "has_urn", "naming_convention", "refsDecl", "passages", \
    "unique_passage", "inventory" ) yield at least one boolean (might be more) which represents the success of it.
    """

    tests = [
        # Parsing the XML
        "parsable",
        # Retrieving the URN (requires parsable)
        "has_urn", 'language',
        # Requires has_urn
        "inventory", "naming_convention",
        # Requires parsable
        "refsDecl", "passages", "unique_passage", "duplicate", "forbidden", "empty"
    ]
    breaks = [
        "parsable",
        "refsDecl",
        "passages"
    ]
    readable = {
        "parsable": "File parsing",
        "refsDecl": "RefsDecl parsing",
        "passages": "Passage level parsing",
        "duplicate": "Duplicate passages",
        "forbidden": "Forbidden characters",
        "epidoc": "Epidoc DTD validation",
        "tei": "TEI DTD Validation",
        "auto_rng": "Automatic RNG validation",
        "local_file": "Custom local RNG validation",
        "has_urn": "URN informations",
        "naming_convention": "Naming conventions",
        "inventory": "Available in inventory",
        "unique_passage": "Unique nodes found by XPath",
        "count_words": "Word Counting",
        "language": "Correct xml:lang attribute",
        "empty": "Empty References"
    }
    splitter = re.compile(r'\S+', re.MULTILINE)

    def __init__(self, path, countwords=False, timeout=30, *args, **kwargs):
        """ Set the default state of the unit, then delegate to the parent constructor

        :param path: Path to the file, forwarded to the parent constructor
        :param countwords: Count the number of words and log it if necessary
        :type countwords: bool
        :param timeout: Number of seconds stored in ``self.timeout``
        :type timeout: int
        """
        # Options given by the caller
        self.countwords = countwords
        self.timeout = timeout

        # Test configuration, not known yet at construction time
        self.scheme = None
        self.guidelines = None
        self.rng = None
        self.inv = []

        # Parsed resources
        self.xml = None
        self.Text = None

        # Results collected while the tests run
        self.count = 0
        self.lang = ''
        self.citation = []
        self.duplicates = []
        self.forbiddens = []
        self.empties = []
        self.capitains_errors = []
        self.dtd_errors = []
        self.test_status = defaultdict(bool)

        super(CTSText_TestUnit, self).__init__(path, *args, **kwargs)

    def parsable(self):
        """ Check that the text is parsable (as XML) and ingest it through MyCapytain then.

        If the parent check succeeds, the parsed root is loaded as a CapitainsCtsText. A MissingRefsDecl
        raised by this loading is logged, recorded in ``capitains_errors`` and reported by yielding
        False. In every case the status of the parent check is the last value yielded.

        .. note:: Override super(parsable) and add CapiTainS Ingesting to it
        """
        xml_status = next(super(CTSText_TestUnit, self).parsable())

        if xml_status is not True:
            self.Text = None
        else:
            try:
                self.Text = CapitainsCtsText(resource=self.xml.getroot())
            except MissingRefsDecl as missing:
                reason = str(missing)
                self.Text = None
                self.log(reason)
                self.capitains_errors.append(reason)
                yield False

        yield xml_status

    def refsDecl(self):
        """ Check that the text contains refsDecl informations

        Yields False when no text was loaded or when its citation is empty; otherwise logs the number
        of citation levels and yields True.
        """
        text = self.Text
        # In 1.0.1, MyCapytain actually create an empty citation by default
        if not text or text.citation.isEmpty():
            yield False
            return

        self.log("{} citation's level found".format(len(text.citation)))
        yield True

    def run_rng(self, rng_path):
        """ Run the RNG through JingTrang

        The Jing jar is run in a java subprocess which is killed after ``self.timeout`` seconds. Each
        issue found in the standard output is logged and stored in ``self.dtd_errors``.

        :param rng_path: Path to the RelaxNG file to run against the XML to test
        :returns: Iterator of booleans. False is yielded when the subprocess communication fails or
            when the timeout expired; unless the communication failed, the last value tells whether
            both outputs of the subprocess were empty.
        """
        test = subprocess.Popen(
            ["java", "-Duser.country=US",  "-Duser.language=en", "-jar", TESTUnit.JING, rng_path, self.path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=False
        )
        # Bytes defaults: both values are decoded below, which a list default would not support
        out = b""
        error = b""
        failed = False
        timer = Timer(self.timeout, test.kill)
        try:
            timer.start()
            out, error = test.communicate()
        except Exception as E:
            failed = True
            self.error(E)
            yield False
        finally:
            # A timer which is not running anymore has fired, i.e. the process was killed.
            # Thread.isAlive() was removed in Python 3.9, is_alive() is the supported spelling.
            if not timer.is_alive():
                self.log("Timeout on RelaxNG")
                yield False
            timer.cancel()

        if failed:
            # Nothing was read from the process: do not report an empty (successful) output
            return

        # This is to deal with Travis printing a message about the _JAVA_OPTIONS when a java command is run
        # Travis printing this command resulted in this test not passing
        out = '\n'.join([x for x in out.decode().split('\n') if '_JAVA_OPTIONS' not in x]).encode()
        error = '\n'.join([x for x in error.decode().split('\n') if '_JAVA_OPTIONS' not in x]).encode()

        if len(out) > 0:
            for issue in TESTUnit.rng_logs(out):
                self.log(issue)
                self.dtd_errors.append(issue)
        yield len(out) == 0 and len(error) == 0

    def auto_rng(self):
        """ Validate the file against every schema declared in its ``xml-model`` processing instructions

        The ``href`` of each instruction is either a URL, in which case the schema is retrieved with
        ``get_remote_rng``, or a path resolved against the directory of the tested file. A local schema
        which does not exist is recorded in ``self.dtd_errors`` and reported with False.
        """
        document = parse(self.path)
        base_dir = os.path.dirname(os.path.abspath(self.path))

        # A file can have multiple schema
        for instruction in document.xpath("/processing-instruction('xml-model')"):
            href = instruction.attrib["href"]
            if validators.url(href):
                schema_path = self.get_remote_rng(href)
            else:
                schema_path = os.path.abspath(os.path.join(base_dir, href))
                if not os.path.isfile(schema_path):
                    self.dtd_errors.append("No RNG was found at " + schema_path)
                    yield False
                    continue
            yield from self.run_rng(schema_path)

    def get_remote_rng(self, url):
        """ Given a valid URL, downloads the RNG from the given URL and returns the filepath and name

        The schema is cached in the ``.rngs`` directory (relative to the current working directory) under
        the MD5 hex digest of its URL. While a download is running, a ``<digest>.rng-indownload`` marker
        file exists; a call which finds this marker waits for the final file instead of downloading again.

        :param url: the URL of the RNG
        :return: filepath and name where the RNG was saved
        :raises EnvironmentError: if the final file does not appear while waiting for another download
        """
        # If the file is remote, have a file-system approved name
        # The md5 hash seems like a good option
        digest = md5(url.encode()).hexdigest()

        # We have a name for the rng file but also for the in-download marker
        makedirs(".rngs", exist_ok=True)
        stable_local = os.path.join(".rngs", digest+".rng")
        stable_local_downloading = os.path.join(".rngs", digest+".rng-indownload")

        # The rng was already downloaded: use it directly
        if os.path.exists(stable_local):
            return stable_local
        # We check if the in-download proof file is shown here
        # Until the in-download marker is there, we need to wait
        elif os.path.exists(stable_local_downloading):
            # One second per iteration, self.timeout iterations at most
            waited = self.timeout
            while not os.path.exists(stable_local):
                time.sleep(1)
                waited -= 1
                if waited < 0:
                    # Maybe we can wait more ?
                    raise EnvironmentError("The download of the RNG took too long")
        else:
            with open(stable_local_downloading, "w") as f:
                f.write("Downloading...")
            try:
                data = requests.get(url)
                data.raise_for_status()
                with open(stable_local_downloading, "w") as f:
                    f.write(data.text)
                shutil.move(stable_local_downloading, stable_local)
            except BaseException:
                # A marker left behind by a failed or interrupted download would make every later
                # call wait and then raise: remove it before propagating the error.
                if os.path.exists(stable_local_downloading):
                    os.remove(stable_local_downloading)
                raise

        return stable_local

    def epidoc(self):
        """ Validate the original file with the Epidoc RNG (``TESTUnit.EPIDOC``) through ``run_rng``
        """
        yield from self.run_rng(TESTUnit.EPIDOC)

    def tei(self):
        """ Validate the original file with the TEI RNG (``TESTUnit.TEI_ALL``) through ``run_rng``
        """
        yield from self.run_rng(TESTUnit.TEI_ALL)

    def local_file(self):
        """ Validate the original file with the RNG stored in ``self.rng`` through ``run_rng``
        """
        yield from self.run_rng(self.rng)

    def passages(self):
        """ Check that passages are available at each citation level.

        While the references of each level are retrieved, the warnings raised by MyCapytain are recorded
        to fill ``self.duplicates`` and ``self.empties``, and the references holding a forbidden
        character are stored in ``self.forbiddens``. ``self.citation`` receives one
        ``(level index, number of passages, citation name)`` tuple per level checked.

        Yields one boolean per level checked: True when at least one passage was found. The iteration
        stops at the first level which is empty or which raises an exception. A single False is yielded
        when there is no text or when its citation has no refsDecl.
        """
        if self.Text and self.Text.citation.refsDecl:
            citations = [c.name for c in self.Text.citation]
            for i in range(0, len(self.Text.citation)):
                try:
                    with warnings.catch_warnings(record=True) as warning_record:
                        # Cause all warnings to always be triggered.
                        warnings.simplefilter("always")
                        passages = self.Text.getValidReff(level=i+1, _debug=True)
                        # Keep what follows the i-th dot, i.e. the identifier of the current level only
                        ids = [ref.split(".", i)[-1] for ref in passages]
                        # One search over all identifiers joined: tells if the detailed check is needed
                        space_in_passage = TESTUnit.FORBIDDEN_CHAR.search("".join(ids))
                        len_passage = len(passages)
                        status = len_passage > 0
                        self.log(str(len_passage) + " found")
                        self.citation.append((i, len_passage, citations[i]))
                        for record in warning_record:
                            # The message of a DuplicateReference is split on ", " into single references
                            if record.category == DuplicateReference:
                                self.duplicates += sorted(str(record.message).split(", "))
                            # The message of an EmptyReference is stored as is
                            if record.category == EmptyReference:
                                self.empties += [str(record.message)]
                        if space_in_passage and space_in_passage is not None:
                            # The identifier is tested, the complete reference (quoted) is stored
                            self.forbiddens += ["'{}'".format(n)
                                                for ref, n in zip(ids, passages)
                                                if TESTUnit.FORBIDDEN_CHAR.search(ref)]
                        if status is False:
                            # No passage at this level: the deeper levels are not checked
                            yield status
                            break
                        yield status
                except Exception as E:
                    self.error(E)
                    self.log("Error when searching passages at level {0}".format(i+1))
                    yield False
                    break
        else:
            yield False

    def duplicate(self):
        """ Report the duplicate references stored in ``self.duplicates``

        Yields False (after logging them) when duplicates were stored, False when the ``passages``
        status is False, True otherwise.
        """
        found = self.duplicates
        if found:
            self.log("Duplicate references found : {0}".format(", ".join(found)))
            yield False
            return
        yield self.test_status['passages'] is not False

    def forbidden(self):
        """ Report the references with forbidden characters stored in ``self.forbiddens``

        Yields False (after logging them) when such references were stored, False when the ``passages``
        status is False, True otherwise.
        """
        found = self.forbiddens
        if found:
            self.log("Reference with forbidden characters found: {0}".format(", ".join(found)))
            yield False
            return
        yield self.test_status['passages'] is not False

    def empty(self):
        """ Report the empty references stored in ``self.empties``

        Yields False (after logging them) when empty references were stored, False when the ``passages``
        status is False, True otherwise.
        """
        found = self.empties
        if found:
            self.log("Empty references found : {0}".format(", ".join(found)))
            yield False
            return
        yield self.test_status['passages'] is not False

    def unique_passage(self):
        """ Check that citation scheme do not collide (eg. Where text:1 would be the same node as text:1.1)

        The refsDecl of every citation level is turned into an XPath and run against the XML of the text.
        Yields True when no node is matched more than once, False otherwise or when any error occurs.
        """
        try:
            matched = []
            for citation in self.Text.citation:
                xpath = MyCapytain.common.reference.REFERENCE_REPLACER.sub(
                    r"\1",
                    citation.refsDecl
                )
                matched.extend(self.Text.xml.xpath(xpath, namespaces=TESTUnit.NS))

            # A node matched twice appears only once in the set
            all_unique = len(matched) == len(set(matched))
            if all_unique:
                yield True
            else:
                self.log("Some node are found twice")
                yield False
        except Exception:
            yield False

    def has_urn(self):
        """ Test that a file has its urn according to CapiTainS Guidelines in its scheme

        The URN is read from the ``n`` attribute of the first matching node, or from its ``base`` when
        ``n`` is missing, and is stored in ``self.urn``. Yields False when there is no XML, when no node
        holds a URN, or when the URN is incomplete, carries a reference or has empty members.
        """
        if self.xml is None:
            yield False
            return

        if self.guidelines == "2.tei":
            on_body = self.xml.xpath("//tei:text/tei:body[starts-with(@n, 'urn:cts:')]", namespaces=TESTUnit.NS)
            on_text = self.xml.xpath("//tei:text[starts-with(@xml:base, 'urn:cts:')]", namespaces=TESTUnit.NS)
            holders = on_body + on_text
        else:
            holders = []
            for div_type in ("edition", "translation", "commentary"):
                holders += self.xml.xpath(
                    "//tei:body/tei:div[@type='{}' and starts-with(@n, 'urn:cts:')]".format(div_type),
                    namespaces=TESTUnit.NS
                )

        if len(holders) == 0:
            yield False
            return

        holder = holders[0]
        urn_string = holder.get("n") or holder.base
        urn = MyCapytain.common.reference.URN(urn_string)
        missing_members = [
            key for key in ['namespace', 'work', 'version', 'textgroup']
            if getattr(urn, key) is None or len(getattr(urn, key)) == 0
        ]

        status = True
        if len(urn) < 5:
            status = False
            self.log("Incomplete URN")
        elif urn.reference:
            status = False
            self.log("Reference not accepted in URN")
        elif len(missing_members) > 0:
            status = False
            self.log("Elements of URN are empty: {}".format(", ".join(sorted(missing_members))))

        # The URN string is stored even when it was found invalid
        self.urn = urn_string
        yield status

    def naming_convention(self):
        """ Check the naming convention of the file

        Yields True when the last colon-separated part of ``self.urn`` is contained in ``self.path``,
        False otherwise or when no URN is known.
        """
        urn = self.urn
        if not urn:
            yield False
            return
        expected_name = urn.rsplit(":", 1)[-1]
        yield expected_name in self.path

    def inventory(self):
        """ Check that the URN of the file is available in the inventory

        Yields True when ``self.urn`` is a member of ``self.inv``, False otherwise or when either the URN
        or the inventory is empty.
        """
        if not (self.urn and self.inv):
            yield False
            return
        yield self.urn in self.inv

    def count_words(self):
        """ Count the words of the text

        Nothing is counted unless the "passages" check has succeeded
        (``self.test_status["passages"]``). Otherwise the plain text export
        (tei:note and tei:teiHeader excluded) is tokenized with the class
        level ``splitter`` pattern, the number of matches is stored in
        ``self.count`` and logged.

        Yields a single boolean: True when at least one word was found.
        """
        if not self.test_status["passages"]:
            yield False
            return

        plaintext = self.Text.export(Mimetypes.PLAINTEXT, exclude=["tei:note", "tei:teiHeader"])
        words = type(self).splitter.findall(plaintext)
        self.count = len(words)

        self.log("{} has {} words".format(self.urn, self.count))
        yield self.count > 0

    def language(self):
        """ Check that an xml:lang attribute is set on the node holding the URN

        The inspected node depends on ``self.guidelines``:

        - "2.epidoc": the first tei:div of type edition, translation or
          commentary under tei:text/tei:body whose @n starts with "urn:cts:"
        - "2.tei": the first tei:body whose @n starts with "urn:cts:" or,
          when there is none, the first tei:text whose @xml:base does
        - any other value: no node is inspected

        ``self.lang`` receives the attribute value, or "UNK" when no node was
        found or the attribute is missing or empty.

        Yields a single boolean: True when a language was found.
        """
        if self.guidelines == "2.epidoc":
            urns_holding_node = self.xml.xpath(
                "//tei:text/tei:body/tei:div"
                "[@type='edition' or @type='translation' or @type='commentary']"
                "[starts-with(@n, 'urn:cts:')]",
                namespaces=TESTUnit.NS
            )
        elif self.guidelines == "2.tei":
            urns_holding_node = self.xml.xpath("//tei:text/tei:body[starts-with(@n, 'urn:cts:')]", namespaces=TESTUnit.NS) + \
                    self.xml.xpath("//tei:text[starts-with(@xml:base, 'urn:cts:')]", namespaces=TESTUnit.NS)
        else:
            # Unknown guidelines: there is no node to inspect, which is
            # reported as a failure below instead of relying on a NameError
            urns_holding_node = []

        if urns_holding_node:
            self.lang = urns_holding_node[0].get('{http://www.w3.org/XML/1998/namespace}lang')
        else:
            self.lang = ''

        # A missing attribute (None) and an empty one are both failures
        if not self.lang:
            self.lang = 'UNK'
            yield False
        else:
            yield True

    def test(self, scheme, guidelines, rng=None, inventory=None):
        """ Run the checks on the file and report them one by one

        The checks are the ones listed in ``CTSText_TestUnit.tests``, followed
        by "count_words" when ``self.countwords`` is set and preceded by the
        scheme check when ``scheme`` is one of "tei", "epidoc", "auto_rng" or
        "local_file". When a check listed in ``self.breaks`` fails, every
        remaining check is reported as failed with empty logs and the run
        stops.

        :param scheme: Name of the scheme check to run first
        :type scheme: str
        :param guidelines: Guidelines identifier, stored as ``self.guidelines``
        :param rng: Stored as ``self.rng``
        :param inventory: URNs to be matched against
        :type inventory: list
        :returns: Iterator containing human readable test name, boolean status and logs
        :rtype: iterator(str, bool, list(str))
        """
        if inventory is not None:
            self.inv = inventory

        # Work on a copy so the class level list is never modified
        tests = list(CTSText_TestUnit.tests)
        if self.countwords:
            tests.append("count_words")
        if scheme in ("tei", "epidoc", "auto_rng", "local_file"):
            tests.insert(0, scheme)

        self.scheme = scheme
        self.guidelines = guidelines
        self.rng = rng

        if environ.get("HOOKTEST_DEBUG", False):
            print("Starting %s " % self.path)

        for position, test in enumerate(tests):
            if environ.get("HOOKTEST_DEBUG", False):
                print("\t Testing %s " % test)

            # A check succeeds when none of the values it yields is False
            outcomes = list(getattr(self, test)())
            status = False not in outcomes
            self.test_status[test] = status
            yield (CTSText_TestUnit.readable[test], status, self.logs)

            if test in self.breaks and not status:
                # Blocking failure: report what is left as failed and stop
                for skipped in tests[position + 1:]:
                    self.test_status[skipped] = False
                    yield (CTSText_TestUnit.readable[skipped], False, [])
                break
            self.flush()
Пример #18
0
    def test_get_Passage_context_no_double_slash(self):
        """ Passages serialized and parsed again as texts must still resolve their references """
        msg_single = "Ensure passage finding with context is fully TEI / Capitains compliant (One reference CapitainsCtsPassage)"
        msg_same_parent = "Ensure passage finding with context is fully TEI / Capitains compliant (Same level same parent range CapitainsCtsPassage)"
        msg_same_level = "Ensure passage finding with context is fully TEI / Capitains compliant (Same level range CapitainsCtsPassage)"
        msg_other_level = "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"

        line_pr_2 = "tum, ut de illis queri non possit quisquis de se bene"
        line_pr_3 = "senserit, cum salva infimarum quoque personarum re-"
        line_1_6 = "Rari post cineres habent poetae."

        # Level 3 references expected in the extracted texts, grouped by parent
        preface = [
            "1.pr.2", "1.pr.3", "1.pr.4", "1.pr.5", "1.pr.6", "1.pr.7",
            "1.pr.8", "1.pr.9", "1.pr.10", "1.pr.11", "1.pr.12", "1.pr.13",
            "1.pr.14", "1.pr.15", "1.pr.16", "1.pr.17", "1.pr.18", "1.pr.19",
            "1.pr.20", "1.pr.21", "1.pr.22",
        ]
        first_poem = ["1.1.1", "1.1.2", "1.1.3", "1.1.4", "1.1.5", "1.1.6"]
        second_poem = [
            '1.2.1', '1.2.2', '1.2.3', '1.2.4',
            '1.2.5', '1.2.6', '1.2.7', '1.2.8',
        ]

        def reparse(reference):
            # Serialize the passage, then load it back as a text of its own
            passage = self.TEI.getTextualNode(Reference(reference))
            serialized = passage.tostring(encoding=str)
            return CapitainsCtsText(resource=serialized,
                                    citation=self.TEI.citation)

        def plain(source, reference):
            # Stripped plain text of a single passage of the given text
            node = source.getTextualNode(Reference(reference), simple=True)
            return node.export(output=Mimetypes.PLAINTEXT).strip()

        def level_three(source):
            return [str(found) for found in source.getValidReff(level=3)]

        # Single reference
        text = reparse("1.pr.2")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, msg_single)

        # Range whose both ends share the same parent
        text = reparse("1.pr.2-1.pr.7")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, msg_same_parent)
        self.assertEqual(plain(text, "1.pr.3"), line_pr_3, msg_same_parent)
        self.assertEqual(level_three(text), preface[:6], msg_same_parent)

        # Range whose ends are at the same level under different parents
        text = reparse("1.pr.2-1.1.6")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, msg_same_level)
        self.assertEqual(plain(text, "1.1.6"), line_1_6, msg_same_level)
        self.assertEqual(level_three(text), preface + first_poem, msg_same_level)

        # Range whose ends are at different levels
        text = reparse("1.pr.2-1.2")
        self.assertEqual(plain(text, "1.pr.2"), line_pr_2, msg_other_level)
        self.assertEqual(plain(text, "1.1.6"), line_1_6, msg_other_level)
        self.assertEqual(
            level_three(text),
            preface + first_poem + second_poem,
            msg_other_level
        )
Пример #19
0
 def test_wrong_main_scope(self):
     """ Building a text from sample2.xml and calling test() must raise RefsDeclError """
     sample_path = "tests/testing_data/texts/sample2.xml"
     with open(sample_path, "rb") as handle, \
             self.assertRaises(MyCapytain.errors.RefsDeclError):
         text = CapitainsCtsText(resource=handle)
         text.test()
Пример #20
0
    def parse(self, resource):
        """ Parse a list of directories and read them into a collection

        For every folder, textgroup metadata is read from
        ``<folder>/data/*/__cts__.xml`` and work metadata from the
        ``__cts__.xml`` files one directory below. Each text of a work is then
        looked up on disk, its citation scheme is read from the file itself,
        and the text is appended to ``self.texts`` when that citation is not
        empty. Failures are logged; only an undispatched textgroup can raise,
        and only when ``RAISE_ON_UNDISPATCHED`` is True.

        :param resource: List of folders
        :return: An inventory resource and a list of CtsTextMetadata metadata-objects
        """
        for folder in resource:
            # One __cts__.xml per textgroup directory
            textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
            for __cts__ in textgroups:
                try:
                    with io.open(__cts__) as __xml__:
                        textgroup = XmlCtsTextgroupMetadata.parse(
                            resource=__xml__
                        )
                        tg_urn = str(textgroup.urn)
                    # Merge into an already known textgroup, otherwise hand the
                    # new one over to the dispatcher
                    if tg_urn in self.inventory:
                        self.inventory[tg_urn].update(textgroup)
                    else:
                        self.dispatcher.dispatch(textgroup, path=__cts__)

                    # Work metadata files sit one directory below the textgroup file
                    for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                        with io.open(__subcts__) as __xml__:
                            work = XmlCtsWorkMetadata.parse(
                                resource=__xml__,
                                parent=self.inventory[tg_urn]
                            )
                            work_urn = str(work.urn)
                            if work_urn in self.inventory[tg_urn].works:
                                self.inventory[work_urn].update(work)

                        for __textkey__ in work.texts:
                            __text__ = self.inventory[__textkey__]
                            # Expected location of the text file:
                            # <work directory>/<textgroup>.<work>.<version>.xml
                            __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                                directory=os.path.dirname(__subcts__),
                                textgroup=__text__.urn.textgroup,
                                work=__text__.urn.work,
                                version=__text__.urn.version
                            )
                            if os.path.isfile(__text__.path):
                                try:
                                    with io.open(__text__.path) as f:
                                        t = CapitainsCtsText(resource=self.xmlparse(f))
                                        cites = list()
                                        # The citation levels are walked in reverse
                                        # iteration order so that each level built can
                                        # take the previously built one as its child.
                                        # Single quotes in xpath and scope are swapped
                                        # for double quotes.
                                        for cite in [c for c in t.citation][::-1]:
                                            if len(cites) >= 1:
                                                cites.append(XmlCtsCitation(
                                                    xpath=cite.xpath.replace("'", '"'),
                                                    scope=cite.scope.replace("'", '"'),
                                                    name=cite.name,
                                                    child=cites[-1]
                                                ))
                                            else:
                                                # First level built: no child is passed
                                                cites.append(XmlCtsCitation(
                                                    xpath=cite.xpath.replace("'", '"'),
                                                    scope=cite.scope.replace("'", '"'),
                                                    name=cite.name
                                                ))
                                        del t
                                    # The last level built is the first one yielded
                                    # when iterating t.citation
                                    __text__.citation = cites[-1]
                                    self.logger.info("%s has been parsed ", __text__.path)
                                    if __text__.citation.isEmpty() is False:
                                        self.texts.append(__text__)
                                    else:
                                        self.logger.error("%s has no passages", __text__.path)
                                except Exception:
                                    # Any failure while reading or converting this text
                                    # is logged and the text is left out of self.texts
                                    self.logger.error(
                                        "%s does not accept parsing at some level (most probably citation) ",
                                        __text__.path
                                    )
                            else:
                                self.logger.error("%s is not present", __text__.path)
                except UndispatchedTextError as E:
                    self.logger.error("Error dispatching %s ", __cts__)
                    # Re-raised only when the class is configured to do so
                    if self.RAISE_ON_UNDISPATCHED is True:
                        raise E
                except Exception as E:
                    # Any other error is logged and the loop moves on to the
                    # next textgroup file
                    self.logger.error("Error parsing %s ", __cts__)

        return self.inventory, self.texts
Пример #21
0
#  We import the correct classes from the local module
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from MyCapytain.common.constants import Mimetypes, XPATH_NAMESPACES
from lxml.etree import tostring

#  The sample file is opened and the IO instance is given as the resource
#  argument of the Text object
with open("./tests/testing_data/examples/text.martial.xml") as f:
    text = CapitainsCtsText(resource=f)

# Text objects have a citation property and len(Citation(...)) gives the depth
# of the citation scheme: for this sample that would be 3 (Book, Poem, Line)
depth = len(text.citation)

for ref in text.getReffs(level=depth):
    # A Passage object is retrieved for each reference that was found.
    # The reference can be passed in many ways, including as a list of strings.
    # simple=True asks for a fairly simple object that has only the targeted
    # node inside of it
    psg = text.getTextualNode(subreference=ref, simple=True)
    # The passage is exported as plain text, <note> nodes being left out
    content = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
    print("\t".join([ref, content]))

"""
You'll print something like the following :

    1.pr.1	Spero me secutum in libellis meis tale temperamen-
    1.pr.2	tum, ut de illis queri non possit quisquis de se bene
    1.pr.3	senserit, cum salva infimarum quoque personarum re-
    1.pr.4	verentia ludant; quae adeo antiquis auctoribus defuit, ut
    1.pr.5	nominibus non tantum veris abusi sint, sed et magnis.
    1.pr.6	Mihi fama vilius constet et probetur in me novissimum