def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]

    Example:
        pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Reads the file at XML_PATH and stores its content in the record returned
    by self.get_textcontentxml(IP_ID, CONTENT_TYPE). When XPATH is given,
    only the first element matching it is stored.

    Prints a message and returns without saving if fewer than three
    arguments follow the command name or if no record is found.
    Raises Exception if XPATH is given but matches nothing.
    '''
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    # self.args[0] is the command name itself
    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the TextContentXML record
    # (presumably created by get_textcontentxml if missing -- TODO confirm)
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' % (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if len(els) == 0:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Serialise the matched element. The previous code passed the
        # lxml.etree module here instead of the element.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # NOTE(review): the message mentions a numeric entity but the test looks
    # for the character itself -- confirm which one is intended.
    if 'ũ' in content:
        print('Numeric entity')
        exit()

    # IV the conversion of the XML tags and attributes to HTML-TEI is
    # currently disabled; the content is saved as it is.

    # save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' % (filesizeformat(tcx.get_length()), tcx.id))
def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]

    Example:
        pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Reads the file at XML_PATH and stores its content in the record returned
    by self.get_textcontentxml(IP_ID, CONTENT_TYPE). When XPATH is given,
    only the first element matching it is stored.

    Prints a message and returns without saving if fewer than three
    arguments follow the command name or if no record is found.
    Raises Exception if XPATH is given but matches nothing.
    '''
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    # self.args[0] is the command name itself
    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the TextContentXML record
    # (presumably created by get_textcontentxml if missing -- TODO confirm)
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' % (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if len(els) == 0:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Serialise the matched element. The previous code passed the
        # lxml.etree module here instead of the element.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # NOTE(review): the message mentions a numeric entity but the test looks
    # for the character itself -- confirm which one is intended.
    if 'ũ' in content:
        print('Numeric entity')
        exit()

    # IV the conversion of the XML tags and attributes to HTML-TEI is
    # currently disabled; the content is saved as it is.

    # save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' % (filesizeformat(tcx.get_length()), tcx.id))
def validate_input(self):
    '''Raises an exception if the LESS file contains @import and not the
    KEYWORD_ALLOW_IMPORT keyword.

    The check is best effort: if self.infile.name cannot be read, no
    validation takes place and nothing is raised.
    '''
    from digipal import utils

    try:
        content = utils.read_file(self.infile.name)
    except Exception:
        # Best effort: an unreadable input is simply not validated.
        # Unlike a bare "except:", this lets KeyboardInterrupt and
        # SystemExit propagate.
        content = None

    if content and '@import' in content and KEYWORD_ALLOW_IMPORT not in content:
        raise Exception(
            '@import not supported in LESS file as changes in nested LESS files are not detected by django-compressor (%s).' % self.infile.name)
def validate_input(self):
    '''Raises an exception if the LESS file contains @import and not the
    KEYWORD_ALLOW_IMPORT keyword.

    The check is best effort: if self.infile.name cannot be read, no
    validation takes place and nothing is raised.
    '''
    from digipal import utils

    try:
        content = utils.read_file(self.infile.name)
    except Exception:
        # Best effort: an unreadable input is simply not validated.
        # Unlike a bare "except:", this lets KeyboardInterrupt and
        # SystemExit propagate.
        content = None

    if content and '@import' in content and KEYWORD_ALLOW_IMPORT not in content:
        raise Exception(
            '@import not supported in LESS file as changes in nested LESS files are not detected by django-compressor (%s).' % self.infile.name)
def md2cms(self):
    '''Pass every markdown file of the digipal documentation to
    update_cms_page().

    update_cms_page() is first called for the 'doc' slug with draft=True.
    Then each .md file found under doc.get_doc_root_path('digipal')
    (restricted by self.options['filter']) is printed, read and handed to
    doc.get_doc_from_md(); when that returns something, its content is
    wrapped in a "mddoc" div and passed to update_cms_page() together with
    its title and the 'doc' slug.
    '''
    from digipal.views import doc

    parent_slug = 'doc'
    self.update_cms_page(parent_slug, draft=True)

    md_paths = utils.get_all_files_under(
        doc.get_doc_root_path('digipal'),
        file_types='f',
        filters=self.options['filter'],
        extensions='md',
        can_return_root=True
    )

    for md_path in md_paths:
        print(md_path)

        doc_info = doc.get_doc_from_md(utils.read_file(md_path))
        if not doc_info:
            # nothing could be extracted from this file
            continue

        html = u'<div class="mddoc">%s</div>' % doc_info['content']
        cms_page = self.update_cms_page(doc_info['title'], html, parent_slug)
        if cms_page:
            print(' => # %s (%s)' % (cms_page.id, cms_page.slug))
def read_static_file(web_path): from mezzanine.conf import settings from django.conf.urls.static import static file_path = '%s/%s' % (settings.STATIC_ROOT, re.sub('^' + re.escape(settings.STATIC_URL), '', web_path)) file_path = re.sub(ur'\?.*$', '', file_path) from digipal.utils import read_file ret = read_file(file_path) # src: url('../junicode/Junicode.eot?#iefix') format('embedded-opentype'), url('../junicode/Junicode.woff') format('woff'), url('../junicode/Junicode.ttf') format('truetype'), url('../junicode/Junicode.svg#Junicode') format('svg'); #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"src: url('/static/digipal_text/junicode/Junicode.ttf');", ret) #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"", ret) ret = re.sub(ur"(?musi)@font-face\s*{[^}]*}", ur"", ret) return ret
def read_static_file(web_path): from mezzanine.conf import settings from django.conf.urls.static import static file_path = '%s/%s' % (settings.STATIC_ROOT, re.sub('^'+re.escape(settings.STATIC_URL), '', web_path)) file_path = re.sub(ur'\?.*$', '', file_path) from digipal.utils import read_file ret = read_file(file_path) # src: url('../junicode/Junicode.eot?#iefix') format('embedded-opentype'), url('../junicode/Junicode.woff') format('woff'), url('../junicode/Junicode.ttf') format('truetype'), url('../junicode/Junicode.svg#Junicode') format('svg'); #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"src: url('/static/digipal_text/junicode/Junicode.ttf');", ret) #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"", ret) ret = re.sub(ur"(?musi)@font-face\s*{[^}]*}", ur"", ret) return ret
def readFile(filepath):
    '''Thin wrapper around digipal.utils.read_file(): returns whatever
    that function returns for filepath.'''
    from digipal import utils as dputils
    content = dputils.read_file(filepath)
    return content
def get_md_from_html(html_file_path): info = {'files': [], 'md': '', 'title': ''} from digipal.utils import read_file import os path = html_file_path html = read_file(path) # convert to HTML DOM from bs4 import BeautifulSoup soup = BeautifulSoup(html) # extract the main title title = 'untitled' if soup.head and soup.head.title: title = soup.head.title.string # special case for Confluence webpage title = title.replace( ' - DigiPal - Confluence - Digital Humanities', '').strip() # extract the body soup = soup.body # special case for Confluence webpage for e in soup.find_all('div', attrs={'class': 'wiki-content'}): soup = e break # remove any line breaks within the <ul>s for tag in soup.find_all('ul'): tag_markup = unicode(tag) tag_markup = re.sub(ur'(?musi)<p>|</p>', ur' ', tag_markup) tag_markup = re.sub(ur'(?musi)\s+', ur' ', tag_markup) tag.replace_with(BeautifulSoup(tag_markup).ul) # images # <img src="./collections_files/col-management.png"> # ![](/digipal/static/doc/col-management.png?raw=true) # copy the image file # convert the tag import digipal import shutil static_path = os.path.join(digipal.__path__[0], 'static', 'doc') for tag in soup.find_all('img'): file_name = re.sub('.*?([^/?]*)($|\?|#)', ur'\1', tag['src']) img_src = os.path.join(os.path.dirname(path), tag['src']) img_dst = os.path.join(static_path, file_name) imgmd = '![](/static/doc/%s?raw=true)' % file_name tag.replace_with(imgmd) shutil.copyfile(img_src, img_dst) info['files'].append(img_dst) # convert <li>s for tag in soup.find_all('li'): prefix = '' for parent in tag.parents: if parent.name in ('ul', 'ol'): if not prefix: if parent.name == 'ul': prefix = '* ' if parent.name == 'ol': prefix = '%s. 
' % ( len([s for s in tag.previous_siblings if s.name == 'li']) + 1) else: prefix = '#SPACE#' + prefix if prefix: tag.insert(0, prefix) # serialise into a string ret = unicode(soup) # print ret.encode('utf-8', 'ignore') # Preserve the spaces and line breaks in <pre> tags pattern = re.compile(ur'(?musi)<pre>(.*?)</pre>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '#CR#```#CR#%s#CR#```#CR#' % m.group( 1).replace('\n', '#CR#').replace(' ', '#SPACE#') ret = ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # strip all unnecessary spaces #ret = re.sub(ur'(?musi)>\s+', ur'>', ret) #ret = re.sub(ur'(?musi)\s+<', ur'<', ret) ret = re.sub(ur'\s+', ur' ', ret) # convert <hx> to # for i in range(1, 5): ret = re.sub(ur'(?musi)<h%s[^>]*>(.*?)</h%s>' % (i, i), ur'\n%s \1\n' % ('#' * i,), ret) if 1: # convert <p> to paragraphs ret = re.sub(ur'(?musi)<p>(.*?)</p>\s*', ur'\1\n\n', ret) # convert strike-through ret = re.sub(ur'(?musi)<s>(.*?)</s>', ur'~~\1~~', ret) # convert italics ret = re.sub(ur'(?musi)<em>(.*?)</em>', ur'_\1_', ret) # convert <strong> ret = re.sub(ur'(?musi)<strong>(.*?)</strong>', ur'**\1**', ret) # convert <a href=""> #ret = re.sub(ur'(?musi)<a>(.*?)</a>', ur'[]()', ret) pattern = re.compile(ur'(?musi)<a.*?href="([^"]*)".*?>(.*?)</a>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '' if m.group(2): # if this is a link to a confluence page, convert it to a local # link href = get_local_doc_url(m.group(1)) replacement = '[%s](%s)' % (m.group(2), href) ret = ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # convert <blockquote> #ret = re.sub(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>', ur'\n> \1\n', ret) pattern = re.compile(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '%s\n\n' % re.sub( ur'(?musi)^\s*', ur'> ', m.group(1)) ret = 
ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # convert <pre> #ret = re.sub(ur'(?musi)<pre>\s*(.*?)\s*</pre>', ur'\n```\n\1\n```\n', ret) # add line break before bullet points ret = re.sub(ur'\s*<li>', ur'\n', ret) # add line break after block of bullet points # (only if not nested into another block) ret = re.sub(ur'\s*</ul>(?!\s*</li>)', ur'\n\n', ret) ret = re.sub(ur'#SPACE#', ur' ', ret) ret = re.sub(ur'#CR#', ur'\n', ret) # remove remaining tags ret = re.sub(ur'<[^>]*>', ur'', ret) ret = u'# %s\n%s' % (title, ret) info['md'] = ret info['title'] = title return info
def get_md_from_html(html_file_path): info = {'files': [], 'md': '', 'title': ''} from digipal.utils import read_file import os path = html_file_path html = read_file(path) # convert to HTML DOM from bs4 import BeautifulSoup soup = BeautifulSoup(html) # extract the main title title = 'untitled' if soup.head and soup.head.title: title = soup.head.title.string # special case for Confluence webpage title = title.replace(' - DigiPal - Confluence - Digital Humanities', '').strip() # extract the body soup = soup.body # special case for Confluence webpage for e in soup.find_all('div', attrs={'class': 'wiki-content'}): soup = e break # remove any line breaks within the <ul>s for tag in soup.find_all('ul'): tag_markup = unicode(tag) tag_markup = re.sub(ur'(?musi)<p>|</p>', ur' ', tag_markup) tag_markup = re.sub(ur'(?musi)\s+', ur' ', tag_markup) tag.replace_with(BeautifulSoup(tag_markup).ul) # images # <img src="./collections_files/col-management.png"> # ![](/digipal/static/doc/col-management.png?raw=true) # copy the image file # convert the tag import digipal import shutil static_path = os.path.join(digipal.__path__[0], 'static', 'doc') for tag in soup.find_all('img'): file_name = re.sub('.*?([^/?]*)($|\?|#)', ur'\1', tag['src']) img_src = os.path.join(os.path.dirname(path), tag['src']) img_dst = os.path.join(static_path, file_name) imgmd = '![](/static/doc/%s?raw=true)' % file_name tag.replace_with(imgmd) shutil.copyfile(img_src, img_dst) info['files'].append(img_dst) # convert <li>s for tag in soup.find_all('li'): prefix = '' for parent in tag.parents: if parent.name in ('ul', 'ol'): if not prefix: if parent.name == 'ul': prefix = '* ' if parent.name == 'ol': prefix = '%s. 
' % (len([ s for s in tag.previous_siblings if s.name == 'li' ]) + 1) else: prefix = '#SPACE#' + prefix if prefix: tag.insert(0, prefix) # serialise into a string ret = unicode(soup) # print ret.encode('utf-8', 'ignore') # Preserve the spaces and line breaks in <pre> tags pattern = re.compile(ur'(?musi)<pre>(.*?)</pre>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '#CR#```#CR#%s#CR#```#CR#' % m.group(1).replace( '\n', '#CR#').replace(' ', '#SPACE#') ret = ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # strip all unnecessary spaces #ret = re.sub(ur'(?musi)>\s+', ur'>', ret) #ret = re.sub(ur'(?musi)\s+<', ur'<', ret) ret = re.sub(ur'\s+', ur' ', ret) # convert <hx> to # for i in range(1, 5): ret = re.sub(ur'(?musi)<h%s[^>]*>(.*?)</h%s>' % (i, i), ur'\n%s \1\n' % ('#' * i, ), ret) if 1: # convert <p> to paragraphs ret = re.sub(ur'(?musi)<p>(.*?)</p>\s*', ur'\1\n\n', ret) # convert strike-through ret = re.sub(ur'(?musi)<s>(.*?)</s>', ur'~~\1~~', ret) # convert italics ret = re.sub(ur'(?musi)<em>(.*?)</em>', ur'_\1_', ret) # convert <strong> ret = re.sub(ur'(?musi)<strong>(.*?)</strong>', ur'**\1**', ret) # convert <a href=""> #ret = re.sub(ur'(?musi)<a>(.*?)</a>', ur'[]()', ret) pattern = re.compile(ur'(?musi)<a.*?href="([^"]*)".*?>(.*?)</a>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '' if m.group(2): # if this is a link to a confluence page, convert it to a local # link href = get_local_doc_url(m.group(1)) replacement = '[%s](%s)' % (m.group(2), href) ret = ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # convert <blockquote> #ret = re.sub(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>', ur'\n> \1\n', ret) pattern = re.compile(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>') pos = 1 while True: m = pattern.search(ret, pos - 1) if not m: break replacement = '%s\n\n' % re.sub(ur'(?musi)^\s*', ur'> ', m.group(1)) ret 
= ret[:m.start(0)] + replacement + ret[m.end(0):] pos = m.start(0) + len(replacement) # convert <pre> #ret = re.sub(ur'(?musi)<pre>\s*(.*?)\s*</pre>', ur'\n```\n\1\n```\n', ret) # add line break before bullet points ret = re.sub(ur'\s*<li>', ur'\n', ret) # add line break after block of bullet points # (only if not nested into another block) ret = re.sub(ur'\s*</ul>(?!\s*</li>)', ur'\n\n', ret) ret = re.sub(ur'#SPACE#', ur' ', ret) ret = re.sub(ur'#CR#', ur'\n', ret) # remove remaining tags ret = re.sub(ur'<[^>]*>', ur'', ret) ret = u'# %s\n%s' % (title, ret) info['md'] = ret info['title'] = title return info