Пример #1
0
    def command_upload(self):
        '''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription
        '''
        if len(self.args) < 4:
            print 'upload requires 3 arguments'
            return

        xml_path, ip_id, content_type_name = self.args[1:4]

        xpath = None
        if len(self.args) > 4:
            xpath = self.args[4]

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print 'ERROR: could not find record (%s, %s)' % (ip_id,
                                                             content_type_name)
            return

        # II load the file and convert it
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if len(els) > 0:
                root = els[0]
            else:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            from lxml import etree
            #content = etree.tostring(root, encoding="UTF-8")
            content = dputils.get_unicode_from_xml(etree, remove_root=True)
        else:
            content = xml_string
#         print type(root)
#         print dir(root)
#         content = str(root)

        if '&#361;' in content:
            print 'Numeric entity'
            exit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print 'Uploaded %s into record #%s' % (filesizeformat(
            tcx.get_length()), tcx.id)
Пример #2
0
    def command_upload(self):
        '''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription
        '''
        if len(self.args) < 4:
            print 'upload requires 3 arguments'
            return

        xml_path, ip_id, content_type_name = self.args[1:4]

        xpath = None
        if len(self.args) > 4:
            xpath = self.args[4]

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print 'ERROR: could not find record (%s, %s)' % (ip_id, content_type_name)
            return

        # II load the file and convert it
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if len(els) > 0:
                root = els[0]
            else:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            from lxml import etree
            #content = etree.tostring(root, encoding="UTF-8")
            content = dputils.get_unicode_from_xml(etree, remove_root=True)
        else:
            content = xml_string
#         print type(root)
#         print dir(root)
#         content = str(root)

        if '&#361;' in content:
            print 'Numeric entity'
            exit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print 'Uploaded %s into record #%s' % (filesizeformat(tcx.get_length()), tcx.id)
Пример #3
0
 def validate_input(self):
     '''Raises an exception if the LESS file contains @import and not the
         KEYWORD_ALLOW_IMPORT keyword.

         The file is read from self.infile.name. If it cannot be read, or if
         it is empty, nothing is checked and no exception is raised.'''
     from digipal import utils
     content = None
     try:
         content = utils.read_file(self.infile.name)
     except Exception:
         # Best effort: an unreadable file is simply not validated.
         # Exception (rather than a bare except) lets KeyboardInterrupt and
         # SystemExit propagate.
         pass
     if content and '@import' in content and KEYWORD_ALLOW_IMPORT not in content:
         raise Exception('@import not supported in LESS file as changes in nested LESS files are not detected by django-compressor (%s).' % self.infile.name)
Пример #4
0
 def validate_input(self):
     '''Raises an exception if the LESS file contains @import and not the
         KEYWORD_ALLOW_IMPORT keyword.

         The file is read from self.infile.name. If it cannot be read, or if
         it is empty, nothing is checked and no exception is raised.'''
     from digipal import utils
     content = None
     try:
         content = utils.read_file(self.infile.name)
     except Exception:
         # Best effort: an unreadable file is simply not validated.
         # Exception (rather than a bare except) lets KeyboardInterrupt and
         # SystemExit propagate.
         pass
     if content and '@import' in content and KEYWORD_ALLOW_IMPORT not in content:
         raise Exception(
             '@import not supported in LESS file as changes in nested LESS files are not detected by django-compressor (%s).'
             % self.infile.name)
Пример #5
0
 def md2cms(self):
     from digipal.views import doc
     
     doc_slug = 'doc'
     self.update_cms_page(doc_slug, draft=True)
     
     for path in utils.get_all_files_under(doc.get_doc_root_path('digipal'), file_types='f', filters=self.options['filter'], extensions='md', can_return_root=True):
         print path
         info = doc.get_doc_from_md(utils.read_file(path))
         page = None
         if info:
             content = u'<div class="mddoc">%s</div>' % info['content']
             page = self.update_cms_page(info['title'], content, doc_slug)
         if page:
             print '  => # %s (%s)' % (page.id, page.slug)
Пример #6
0
 def md2cms(self):
     from digipal.views import doc
     
     doc_slug = 'doc'
     self.update_cms_page(doc_slug, draft=True)
     
     for path in utils.get_all_files_under(doc.get_doc_root_path('digipal'), file_types='f', filters=self.options['filter'], extensions='md', can_return_root=True):
         print path
         info = doc.get_doc_from_md(utils.read_file(path))
         page = None
         if info:
             content = u'<div class="mddoc">%s</div>' % info['content']
             page = self.update_cms_page(info['title'], content, doc_slug)
         if page:
             print '  => # %s (%s)' % (page.id, page.slug)
Пример #7
0
def read_static_file(web_path):
    from mezzanine.conf import settings
    from django.conf.urls.static import static

    file_path = '%s/%s' % (settings.STATIC_ROOT,
                           re.sub('^' + re.escape(settings.STATIC_URL), '',
                                  web_path))
    file_path = re.sub(ur'\?.*$', '', file_path)

    from digipal.utils import read_file
    ret = read_file(file_path)

    # src: url('../junicode/Junicode.eot?#iefix') format('embedded-opentype'),  url('../junicode/Junicode.woff') format('woff'), url('../junicode/Junicode.ttf')  format('truetype'), url('../junicode/Junicode.svg#Junicode') format('svg');
    #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"src: url('/static/digipal_text/junicode/Junicode.ttf');", ret)
    #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"", ret)
    ret = re.sub(ur"(?musi)@font-face\s*{[^}]*}", ur"", ret)

    return ret
Пример #8
0
def read_static_file(web_path):
    from mezzanine.conf import settings
    from django.conf.urls.static import static
    
    file_path = '%s/%s' % (settings.STATIC_ROOT, re.sub('^'+re.escape(settings.STATIC_URL), '', web_path))
    file_path = re.sub(ur'\?.*$', '', file_path)
    
    from digipal.utils import read_file
    ret = read_file(file_path)
    
    # src: url('../junicode/Junicode.eot?#iefix') format('embedded-opentype'),  url('../junicode/Junicode.woff') format('woff'), url('../junicode/Junicode.ttf')  format('truetype'), url('../junicode/Junicode.svg#Junicode') format('svg');
    #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"src: url('/static/digipal_text/junicode/Junicode.ttf');", ret)
    #ret = re.sub(ur"(?musi)src:[^;]*junicode*[^;]*;", ur"", ret)
    ret = re.sub(ur"(?musi)@font-face\s*{[^}]*}", ur"", ret)

    return ret

    
    
Пример #9
0
def readFile(filepath):
    '''Returns the result of digipal.utils.read_file() for filepath.'''
    from digipal import utils as digipal_utils
    content = digipal_utils.read_file(filepath)
    return content
Пример #10
0
def get_md_from_html(html_file_path):
    info = {'files': [], 'md': '', 'title': ''}
    from digipal.utils import read_file
    import os

    path = html_file_path

    html = read_file(path)

    # convert to HTML DOM
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)

    # extract the main title
    title = 'untitled'
    if soup.head and soup.head.title:
        title = soup.head.title.string
        # special case for Confluence webpage
        title = title.replace(
            ' - DigiPal - Confluence - Digital Humanities', '').strip()

    # extract the body
    soup = soup.body
    # special case for Confluence webpage
    for e in soup.find_all('div', attrs={'class': 'wiki-content'}):
        soup = e
        break

    # remove any line breaks within the <ul>s
    for tag in soup.find_all('ul'):
        tag_markup = unicode(tag)
        tag_markup = re.sub(ur'(?musi)<p>|</p>', ur' ', tag_markup)
        tag_markup = re.sub(ur'(?musi)\s+', ur' ', tag_markup)
        tag.replace_with(BeautifulSoup(tag_markup).ul)

    # images
    # <img src="./collections_files/col-management.png">
    # ![](/digipal/static/doc/col-management.png?raw=true)
    # copy the image file
    # convert the tag
    import digipal
    import shutil
    static_path = os.path.join(digipal.__path__[0], 'static', 'doc')
    for tag in soup.find_all('img'):
        file_name = re.sub('.*?([^/?]*)($|\?|#)', ur'\1', tag['src'])
        img_src = os.path.join(os.path.dirname(path), tag['src'])
        img_dst = os.path.join(static_path, file_name)
        imgmd = '![](/static/doc/%s?raw=true)' % file_name
        tag.replace_with(imgmd)
        shutil.copyfile(img_src, img_dst)
        info['files'].append(img_dst)

    # convert <li>s
    for tag in soup.find_all('li'):
        prefix = ''
        for parent in tag.parents:
            if parent.name in ('ul', 'ol'):
                if not prefix:
                    if parent.name == 'ul':
                        prefix = '* '
                    if parent.name == 'ol':
                        prefix = '%s. ' % (
                            len([s for s in tag.previous_siblings if s.name == 'li']) + 1)
                else:
                    prefix = '#SPACE#' + prefix
        if prefix:
            tag.insert(0, prefix)

    # serialise into a string
    ret = unicode(soup)

    # print ret.encode('utf-8', 'ignore')

    # Preserve the spaces and line breaks in <pre> tags
    pattern = re.compile(ur'(?musi)<pre>(.*?)</pre>')
    pos = 1
    while True:
        m = pattern.search(ret, pos - 1)
        if not m:
            break

        replacement = '#CR#```#CR#%s#CR#```#CR#' % m.group(
            1).replace('\n', '#CR#').replace(' ', '#SPACE#')
        ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
        pos = m.start(0) + len(replacement)

    # strip all unnecessary spaces
    #ret = re.sub(ur'(?musi)>\s+', ur'>', ret)
    #ret = re.sub(ur'(?musi)\s+<', ur'<', ret)
    ret = re.sub(ur'\s+', ur' ', ret)

    # convert <hx> to #
    for i in range(1, 5):
        ret = re.sub(ur'(?musi)<h%s[^>]*>(.*?)</h%s>' %
                     (i, i), ur'\n%s \1\n' % ('#' * i,), ret)

    if 1:
        # convert <p> to paragraphs
        ret = re.sub(ur'(?musi)<p>(.*?)</p>\s*', ur'\1\n\n', ret)

        # convert strike-through
        ret = re.sub(ur'(?musi)<s>(.*?)</s>', ur'~~\1~~', ret)

        # convert italics
        ret = re.sub(ur'(?musi)<em>(.*?)</em>', ur'_\1_', ret)

        # convert <strong>
        ret = re.sub(ur'(?musi)<strong>(.*?)</strong>', ur'**\1**', ret)

        # convert <a href="">
        #ret = re.sub(ur'(?musi)<a>(.*?)</a>', ur'[]()', ret)
        pattern = re.compile(ur'(?musi)<a.*?href="([^"]*)".*?>(.*?)</a>')
        pos = 1
        while True:
            m = pattern.search(ret, pos - 1)
            if not m:
                break

            replacement = ''
            if m.group(2):
                # if this is a link to a confluence page, convert it to a local
                # link
                href = get_local_doc_url(m.group(1))
                replacement = '[%s](%s)' % (m.group(2), href)

            ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
            pos = m.start(0) + len(replacement)

        # convert <blockquote>
        #ret = re.sub(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>', ur'\n> \1\n', ret)
        pattern = re.compile(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>')
        pos = 1
        while True:
            m = pattern.search(ret, pos - 1)
            if not m:
                break

            replacement = '%s\n\n' % re.sub(
                ur'(?musi)^\s*', ur'> ', m.group(1))
            ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
            pos = m.start(0) + len(replacement)

        # convert <pre>
        #ret = re.sub(ur'(?musi)<pre>\s*(.*?)\s*</pre>', ur'\n```\n\1\n```\n', ret)

        # add line break before bullet points
        ret = re.sub(ur'\s*<li>', ur'\n', ret)
        # add line break after block of bullet points
        # (only if not nested into another block)
        ret = re.sub(ur'\s*</ul>(?!\s*</li>)', ur'\n\n', ret)

        ret = re.sub(ur'#SPACE#', ur' ', ret)
        ret = re.sub(ur'#CR#', ur'\n', ret)

        # remove remaining tags
        ret = re.sub(ur'<[^>]*>', ur'', ret)

        ret = u'# %s\n%s' % (title, ret)

    info['md'] = ret
    info['title'] = title

    return info
Пример #11
0
def readFile(filepath):
    '''Returns the result of digipal.utils.read_file() for filepath.'''
    from digipal import utils as digipal_utils

    content = digipal_utils.read_file(filepath)
    return content
Пример #12
0
def get_md_from_html(html_file_path):
    info = {'files': [], 'md': '', 'title': ''}
    from digipal.utils import read_file
    import os

    path = html_file_path

    html = read_file(path)

    # convert to HTML DOM
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)

    # extract the main title
    title = 'untitled'
    if soup.head and soup.head.title:
        title = soup.head.title.string
        # special case for Confluence webpage
        title = title.replace(' - DigiPal - Confluence - Digital Humanities',
                              '').strip()

    # extract the body
    soup = soup.body
    # special case for Confluence webpage
    for e in soup.find_all('div', attrs={'class': 'wiki-content'}):
        soup = e
        break

    # remove any line breaks within the <ul>s
    for tag in soup.find_all('ul'):
        tag_markup = unicode(tag)
        tag_markup = re.sub(ur'(?musi)<p>|</p>', ur' ', tag_markup)
        tag_markup = re.sub(ur'(?musi)\s+', ur' ', tag_markup)
        tag.replace_with(BeautifulSoup(tag_markup).ul)

    # images
    # <img src="./collections_files/col-management.png">
    # ![](/digipal/static/doc/col-management.png?raw=true)
    # copy the image file
    # convert the tag
    import digipal
    import shutil
    static_path = os.path.join(digipal.__path__[0], 'static', 'doc')
    for tag in soup.find_all('img'):
        file_name = re.sub('.*?([^/?]*)($|\?|#)', ur'\1', tag['src'])
        img_src = os.path.join(os.path.dirname(path), tag['src'])
        img_dst = os.path.join(static_path, file_name)
        imgmd = '![](/static/doc/%s?raw=true)' % file_name
        tag.replace_with(imgmd)
        shutil.copyfile(img_src, img_dst)
        info['files'].append(img_dst)

    # convert <li>s
    for tag in soup.find_all('li'):
        prefix = ''
        for parent in tag.parents:
            if parent.name in ('ul', 'ol'):
                if not prefix:
                    if parent.name == 'ul':
                        prefix = '* '
                    if parent.name == 'ol':
                        prefix = '%s. ' % (len([
                            s for s in tag.previous_siblings if s.name == 'li'
                        ]) + 1)
                else:
                    prefix = '#SPACE#' + prefix
        if prefix:
            tag.insert(0, prefix)

    # serialise into a string
    ret = unicode(soup)

    # print ret.encode('utf-8', 'ignore')

    # Preserve the spaces and line breaks in <pre> tags
    pattern = re.compile(ur'(?musi)<pre>(.*?)</pre>')
    pos = 1
    while True:
        m = pattern.search(ret, pos - 1)
        if not m:
            break

        replacement = '#CR#```#CR#%s#CR#```#CR#' % m.group(1).replace(
            '\n', '#CR#').replace(' ', '#SPACE#')
        ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
        pos = m.start(0) + len(replacement)

    # strip all unnecessary spaces
    #ret = re.sub(ur'(?musi)>\s+', ur'>', ret)
    #ret = re.sub(ur'(?musi)\s+<', ur'<', ret)
    ret = re.sub(ur'\s+', ur' ', ret)

    # convert <hx> to #
    for i in range(1, 5):
        ret = re.sub(ur'(?musi)<h%s[^>]*>(.*?)</h%s>' % (i, i),
                     ur'\n%s \1\n' % ('#' * i, ), ret)

    if 1:
        # convert <p> to paragraphs
        ret = re.sub(ur'(?musi)<p>(.*?)</p>\s*', ur'\1\n\n', ret)

        # convert strike-through
        ret = re.sub(ur'(?musi)<s>(.*?)</s>', ur'~~\1~~', ret)

        # convert italics
        ret = re.sub(ur'(?musi)<em>(.*?)</em>', ur'_\1_', ret)

        # convert <strong>
        ret = re.sub(ur'(?musi)<strong>(.*?)</strong>', ur'**\1**', ret)

        # convert <a href="">
        #ret = re.sub(ur'(?musi)<a>(.*?)</a>', ur'[]()', ret)
        pattern = re.compile(ur'(?musi)<a.*?href="([^"]*)".*?>(.*?)</a>')
        pos = 1
        while True:
            m = pattern.search(ret, pos - 1)
            if not m:
                break

            replacement = ''
            if m.group(2):
                # if this is a link to a confluence page, convert it to a local
                # link
                href = get_local_doc_url(m.group(1))
                replacement = '[%s](%s)' % (m.group(2), href)

            ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
            pos = m.start(0) + len(replacement)

        # convert <blockquote>
        #ret = re.sub(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>', ur'\n> \1\n', ret)
        pattern = re.compile(ur'(?musi)<blockquote>\s*(.*?)\s*</blockquote>')
        pos = 1
        while True:
            m = pattern.search(ret, pos - 1)
            if not m:
                break

            replacement = '%s\n\n' % re.sub(ur'(?musi)^\s*', ur'> ',
                                            m.group(1))
            ret = ret[:m.start(0)] + replacement + ret[m.end(0):]
            pos = m.start(0) + len(replacement)

        # convert <pre>
        #ret = re.sub(ur'(?musi)<pre>\s*(.*?)\s*</pre>', ur'\n```\n\1\n```\n', ret)

        # add line break before bullet points
        ret = re.sub(ur'\s*<li>', ur'\n', ret)
        # add line break after block of bullet points
        # (only if not nested into another block)
        ret = re.sub(ur'\s*</ul>(?!\s*</li>)', ur'\n\n', ret)

        ret = re.sub(ur'#SPACE#', ur' ', ret)
        ret = re.sub(ur'#CR#', ur'\n', ret)

        # remove remaining tags
        ret = re.sub(ur'<[^>]*>', ur'', ret)

        ret = u'# %s\n%s' % (title, ret)

    info['md'] = ret
    info['title'] = title

    return info