Python get_xml_from_unicode示例，digipal.utils.get_xml_from_unicode Python示例

示例#1

0

显示文件

文件： dpxml.py 项目： suzypiat/digipal

    def val(self):
        '''Parse an XML file and optionally validate it against a DTD.

        Arguments are read from self.cargs:
            cargs[1]: path of the XML file to parse
            cargs[2]: (optional) path of a DTD file to validate against

        DTD validation errors and XML syntax errors are printed, not raised.
        Raises CommandError if the XML path is missing.
        '''
        if len(self.cargs) < 2:
            # NOTE(review): the message says 'Convert' although this is
            # val(); looks copied from another command - confirm.
            raise CommandError('Convert requires 1 arguments')

        xml_path = self.cargs[1]
        val_path = self.cargs[2] if len(self.cargs) > 2 else None

        xml_string = utils.readFile(xml_path)

        import lxml.etree as ET
        try:
            dom = dputils.get_xml_from_unicode(xml_string)

            if val_path:
                # The DTD is parsed when the object is constructed, so the
                # file handle can be released right away instead of leaking.
                with open(val_path, 'rb') as dtd_file:
                    dtd = ET.DTD(dtd_file)

                if not dtd.validate(dom):
                    for error in dtd.error_log.filter_from_errors():
                        print(error)

        except ET.XMLSyntaxError as e:
            print(u'XML Syntax Error %s' % e)

示例#2

0

显示文件

文件： dpxml.py 项目： MCadeStewart/digipal

    def val(self):
        '''Check that an XML file is well-formed and, optionally, DTD-valid.

        Reads its arguments from self.cargs: cargs[1] is the path of the XML
        file; cargs[2], when present, is the path of a DTD file used to
        validate the parsed document.

        Validation errors and XML syntax errors are printed, not raised.
        Raises CommandError when no XML path was supplied.
        '''
        if len(self.cargs) < 2:
            # NOTE(review): wording mentions 'Convert' in a method named
            # val() - confirm whether the message should be updated.
            raise CommandError('Convert requires 1 arguments')

        xml_path = self.cargs[1]
        val_path = None
        if len(self.cargs) > 2:
            val_path = self.cargs[2]

        xml_string = utils.readFile(xml_path)

        import lxml.etree as ET
        try:
            dom = dputils.get_xml_from_unicode(xml_string)

            if val_path:
                # Use a context manager so the DTD file is always closed,
                # even if parsing the DTD fails.
                with open(val_path, 'rb') as dtd_file:
                    dtd = ET.DTD(dtd_file)

                valid = dtd.validate(dom)
                if not valid:
                    for error in dtd.error_log.filter_from_errors():
                        print(error)

        except ET.XMLSyntaxError as e:
            print(u'XML Syntax Error %s' % e)

示例#3

0

显示文件

文件： dptext.py 项目： suzypiat/digipal

    def command_upload(self):
        r'''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription
        '''
        # Stores the content of an XML file in a TextContentXML record.
        # Arguments come from self.args: [1] XML file path, [2] and [3] the
        # values passed to get_textcontentxml() to locate the record, and
        # an optional [4] XPath selecting the element to upload.
        # The docstring is raw so the backslashes of the example path are
        # kept literally (otherwise '\r' becomes a carriage return).
        if len(self.args) < 4:
            print('upload requires 3 arguments')
            return

        xml_path, ip_id, content_type_name = self.args[1:4]

        xpath = None
        if len(self.args) > 4:
            xpath = self.args[4]

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print('ERROR: could not find record (%s, %s)' % (
                ip_id, content_type_name))
            return

        # II load the file and convert it
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if len(els) > 0:
                root = els[0]
            else:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            # Serialise the first matching element. The previous code
            # passed the lxml.etree module here instead of the element.
            content = dputils.get_unicode_from_xml(root, remove_root=True)
        else:
            content = xml_string

        # Abort without saving if this numeric character reference is
        # present in the content.
        if '&#361;' in content:
            print('Numeric entity')
            exit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print('Uploaded %s into record #%s' % (
            filesizeformat(tcx.get_length()), tcx.id))

示例#4

0

显示文件

    def command_upload(self):
        r'''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription
        '''
        # Reads an XML file and saves its content (or the part selected by
        # the optional XPath argument) into a TextContentXML record.
        # self.args layout: [1] file path, [2] and [3] record lookup values
        # handed to get_textcontentxml(), [4] optional XPath.
        # Raw docstring: keeps the Windows-style example path intact
        # ('\r' would otherwise be read as a carriage return).
        if len(self.args) < 4:
            print('upload requires 3 arguments')
            return

        xml_path, ip_id, content_type_name = self.args[1:4]
        xpath = self.args[4] if len(self.args) > 4 else None

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print('ERROR: could not find record (%s, %s)' % (ip_id, content_type_name))
            return

        # II load the file and convert it
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if len(els) == 0:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            # Only the first match is uploaded. Pass the matched element;
            # the earlier code mistakenly passed the lxml.etree module.
            content = dputils.get_unicode_from_xml(els[0], remove_root=True)
        else:
            content = xml_string

        # Stop before saving when this numeric character reference is found.
        if '&#361;' in content:
            print('Numeric entity')
            exit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print('Uploaded %s into record #%s' % (filesizeformat(tcx.get_length()), tcx.id))

示例#5

0

显示文件

文件： viewer.py 项目： MCadeStewart/digipal

def get_text_elements_from_content(content):
    '''Return the element ids of the marked-up elements in content.

    content is parsed leniently (ishtml=True) inside an added root element,
    and every element carrying a data-dpt attribute is converted with
    get_elementid_from_xml_element(). Falsy ids are skipped.
    An empty list is returned when content is empty.
    '''
    if not content:
        return []

    tree = utils.get_xml_from_unicode(content, ishtml=True, add_root=True)

    # Lazily compute one id per matching element, in document order
    candidate_ids = (
        get_elementid_from_xml_element(node)
        for node in tree.findall("//*[@data-dpt]")
    )
    return [candidate for candidate in candidate_ids if candidate]

示例#6

0

显示文件

def get_text_elements_from_content(content):
    '''List the marked-up elements of a unit of text.

    Returns a list of [elementid, label] pairs, one per element that has
    a data-dpt attribute and a truthy element id, in the order the
    elements are found. Example of a pair:
    [ [["", "clause"], ["type", "address"]], 'address (clause)' ]

    A single idcount dict is shared across all calls to
    get_elementid_from_xml_element(); presumably that is how ids (and
    therefore labels) are kept unique - verify against the helper.
    '''
    pairs = []
    if not content:
        return pairs

    counters = {}
    tree = utils.get_xml_from_unicode(content, ishtml=True, add_root=True)

    for node in tree.findall("//*[@data-dpt]"):
        element_id = get_elementid_from_xml_element(node, idcount=counters)
        if not element_id:
            continue
        label = get_label_from_elementid(element_id)
        pairs.append([element_id, label])

    return pairs

示例#7

0

显示文件

文件： viewer.py 项目： jdemaris/digipal

def get_text_elements_from_content(content):
    '''Collect [elementid, label] pairs for the marked-up elements of a text.

    Every element with a data-dpt attribute is looked up in the parsed
    content; those yielding a truthy element id contribute one pair, e.g.
    [ [["", "clause"], ["type", "address"]], 'address (clause)' ]
    Pairs follow the order in which the elements are found.

    The same idcount dict is passed to get_elementid_from_xml_element()
    for every element; presumably this keeps each id and label unique in
    the result - confirm in the helper.
    '''
    if not content:
        return []

    seen_counts = {}
    document = utils.get_xml_from_unicode(
        content, ishtml=True, add_root=True)

    # Lazy pipeline: the id of an element is computed, then (if truthy)
    # its label, before moving on to the next element.
    element_ids = (
        get_elementid_from_xml_element(match, idcount=seen_counts)
        for match in document.findall("//*[@data-dpt]")
    )
    return [
        [element_id, get_label_from_elementid(element_id)]
        for element_id in element_ids
        if element_id
    ]