def val(self):
    '''Parse an XML file and optionally validate it against a DTD.

    Arguments are read from self.cargs:
        cargs[1]: path of the XML file (required)
        cargs[2]: path of a DTD file (optional)

    XML syntax errors and DTD validation errors are printed.
    Nothing is returned.

    Raises CommandError if the XML path is missing.
    '''
    # NOTE(review): the message says 'Convert' although this is val();
    # it looks copy-pasted. Left untouched here - confirm and reword.
    if len(self.cargs) < 2:
        raise CommandError('Convert requires 1 arguments')

    xml_path = self.cargs[1]
    val_path = self.cargs[2] if len(self.cargs) > 2 else None

    xml_string = utils.readFile(xml_path)

    import lxml.etree as ET
    try:
        dom = dputils.get_xml_from_unicode(xml_string)

        if val_path:
            # The DTD is parsed by the constructor, so the file can be
            # closed straight away instead of leaking the handle.
            with open(val_path, 'rb') as dtd_file:
                dtd = ET.DTD(dtd_file)

            if not dtd.validate(dom):
                for error in dtd.error_log.filter_from_errors():
                    print(error)
    except ET.XMLSyntaxError as e:
        print(u'XML Syntax Error %s' % e)
def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]

    pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Reads the file at XML_PATH and saves its content into the
    TextContentXML record identified by (IP_ID, CONTENT_TYPE).
    If XPATH is given, only the first element matching it is uploaded.

    Arguments are read from self.args[1:]. Prints a message and returns
    early if arguments are missing or the record cannot be found.

    Raises Exception if XPATH is given but matches nothing.
    '''
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the TextContentXML record (or create it)
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' %
              (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if not els:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Serialise the matched element. The previous code passed the
        # lxml.etree *module* here instead of the element.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # NOTE(review): the message says 'Numeric entity' but the literal is
    # the character itself; it may originally have been an entity string.
    # Confirm against the repository before changing.
    if 'ũ' in content:
        print('Numeric entity')
        exit()

    # IV save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' % (filesizeformat(
        tcx.get_length()), tcx.id))
def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]

    pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Loads the XML file at XML_PATH and stores its content in the
    TextContentXML record looked up with (IP_ID, CONTENT_TYPE).
    When XPATH is supplied, only the first matching element is stored.

    Arguments come from self.args[1:]. A message is printed and the
    method returns early when arguments are missing or no record is found.

    Raises Exception when XPATH is supplied but nothing matches it.
    '''
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the TextContentXML record (or create it)
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' %
              (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if not els:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Pass the matched element to the serialiser; the earlier code
        # mistakenly handed over the lxml.etree module itself.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # NOTE(review): 'Numeric entity' is reported, yet the literal tested
    # is a plain character; possibly it was an entity string originally.
    # Confirm against the repository before changing.
    if 'ũ' in content:
        print('Numeric entity')
        exit()

    # IV save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' %
          (filesizeformat(tcx.get_length()), tcx.id))
def get_text_elements_from_content(content):
    '''
    Returns the list of element ids found in a unit of text.

    Every element carrying a data-dpt attribute is passed to
    get_elementid_from_xml_element(); falsy ids are skipped.
    Returns an empty list if content is empty.
    '''
    if not content:
        return []

    tree = utils.get_xml_from_unicode(content, ishtml=True, add_root=True)
    candidate_ids = (
        get_elementid_from_xml_element(node)
        for node in tree.findall("//*[@data-dpt]")
    )
    return [element_id for element_id in candidate_ids if element_id]
def get_text_elements_from_content(content):
    '''
    Lists the marked-up elements of a unit of text as
    [elementid, label] pairs, in document order.

    Example of one pair:
        [
            [["", "clause"], ["type", "address"]],
            'address (clause)'
        ]

    Each elementid and each label is meant to appear only once in
    the result. Returns an empty list if content is empty.
    '''
    if not content:
        return []

    pairs = []
    # One dict shared by every get_elementid_from_xml_element() call
    # for this unit of text.
    seen_counts = {}
    tree = utils.get_xml_from_unicode(content, ishtml=True, add_root=True)

    for node in tree.findall("//*[@data-dpt]"):
        element_id = get_elementid_from_xml_element(node, idcount=seen_counts)
        if not element_id:
            continue
        pairs.append([element_id, get_label_from_elementid(element_id)])

    return pairs