Exemplo n.º 1
0
    def convert(self):
        if len(self.cargs) < 3:
            raise CommandError("Convert requires 2 arguments")

        xml_path = self.cargs[1]
        xslt_path = self.cargs[2]
        out_file = self.cargs[3] if len(self.cargs) > 3 else None

        xml_string = utils.readFile(xml_path)
        xml_string = re.sub(ur"\bxmlns=", ur"xmlns2=", xml_string)

        # TODO: remove this hack, only for odt conversion
        # position 33% is like 'super' style
        xml_string = re.sub(ur'"-33%', ur'"sub', xml_string)
        xml_string = re.sub(ur'"33%', ur'"super', xml_string)

        xslt_string = utils.readFile(xslt_path)

        # replacements in the XSLT
        comments, xslt_string = self.parse_xslt_directives(xslt_string, xml_string)

        ret = str(dputils.get_xslt_transform(xml_string, xslt_string))

        if out_file:
            dputils.write_file(out_file, str(comments) + ret, encoding=None)
        else:
            print str(comments) + ret

        return ret
Exemplo n.º 2
0
    def command_download(self):
        ret = ur''

        recordid = self.args[1]
        unitid = ''
        if len(self.args) > 2: unitid = self.args[2]
        from digipal_text.models import TextContentXML
        from digipal_text.views.viewer import get_fragment_extent, get_all_units
        text_content_xml = TextContentXML.objects.get(id=recordid)
        content = text_content_xml.content

        suffix = ''
        if unitid:
            suffix = '-unit'
            units = get_all_units(content, 'entry')
            for unit in units:
                if unit['unitid'] == unitid:
                    ret = ur'<root>%s</root>' % unit['content']
        else:
            ret = content

        import regex

        if ret is None:
            ret = u''

        # print repr(ret)
        file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix)
        from digipal.utils import write_file
        write_file(file_name, ret)
        print 'Written file %s ' % file_name
Exemplo n.º 3
0
    def convert(self):
        if len(self.cargs) < 3:
            raise CommandError('Convert requires 2 arguments')

        xml_path = self.cargs[1]
        xslt_path = self.cargs[2]
        out_file = self.cargs[3] if len(self.cargs) > 3 else None 

        xml_string = utils.readFile(xml_path)
        xml_string = re.sub(ur'\bxmlns=', ur'xmlns2=', xml_string)

        # TODO: remove this hack, only for odt conversion
        # position 33% is like 'super' style
        xml_string = re.sub(ur'"-33%', ur'"sub', xml_string)
        xml_string = re.sub(ur'"33%', ur'"super', xml_string)

        xslt_string = utils.readFile(xslt_path)

        # replacements in the XSLT
        comments, xslt_string = self.parse_xslt_directives(xslt_string, xml_string)

        ret = str(dputils.get_xslt_transform(xml_string, xslt_string))

        if out_file:
            dputils.write_file(out_file, str(comments) + ret, encoding=None)
        else:
            print str(comments) + ret

        return ret
Exemplo n.º 4
0
    def command_autoconvert(self):
        dry = self.is_dry_run()

        from digipal_text.models import TextContentXML, TextAnnotation
        from digipal_text.views import viewer
        before = ur''
        after = ur''
        total = 0
        converted = 0
        for tcx in TextContentXML.objects.filter(
                text_content__type__slug='transcription').order_by('id'):
            total += 1
            content = tcx.content
            if not content: continue
            tcx.convert()
            if content != tcx.content:
                converted += 1
                text_name = u'#%s: %s [length diff = %s]' % (
                    tcx.id, tcx, abs(len(content) - len(tcx.content)))
                print text_name

                before += u'\n\n'
                before += text_name
                before += u'\n\n'
                before += content.replace('\r', '\n')

                after += u'\n\n'
                after += text_name
                after += u'\n\n'
                after += tcx.content.replace('\r', '\n')

                if 0:
                    html = ''
                    from difflib import HtmlDiff
                    diff = HtmlDiff(tabsize=2)
                    d = diff.make_table([content], [tcx.content])

                    html += u'<h2>%s</h2>' % text_name
                    html += d

                if not dry:
                    tcx.save()

                #break

            #tcx.save()

        dputils.write_file('before.txt', before)
        dputils.write_file('after.txt', after)

        print '%s converted out of %s texts' % (converted, total)

        if dry:
            print 'DRY RUN: no data was changed in the database.'
Exemplo n.º 5
0
    def command_download(self):
        ret = ur''

        recordid = self.args[1]
        unitid = ''
        if len(self.args) > 2: unitid = self.args[2]
        from digipal_text.models import TextContentXML
        from digipal_text.views.viewer import get_fragment_extent, get_all_units
        text_content_xml = TextContentXML.objects.get(id=recordid)
        content = text_content_xml.content
        

        suffix = ''
        if unitid:
            suffix = '-unit'
            units = get_all_units(content, 'entry')
            for unit in units:
                if unit['unitid'] == unitid:
                    ret = ur'<root>%s</root>' % unit['content']
        else:
            ret = content

        import regex

        if ret is None:
            ret = u''

        # ret = regex.sub(ur'(?musi)<span data-dpt="abbr">.*?</span>(<span data-dpt="exp">)', ur'\1', ret)

        # ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="su[pb]">(.*?)</span>', ur'\1', ret)
        # ret = regex.sub(ur'(?musi)<i>(.*?)</i>', ur'\1', ret)

        # print repr(ret)

#         for it in regex.findall('<span data-dpt="hi" data-dpt-rend="su[pb]">.*?</span>', ret):
#             print repr(it)

        # for it in regex.findall(ur'(?musi)qu[i1][i1]', ret):
        #    print repr(it)
        if 0:
            ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sup">([^<]+)</span>', ur'<sup>\1</sup>', ret)
            ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sub">([^<]+)</span>', ur'<sub>\1</sub>', ret)
            ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="ms"></span>', ur'<br/>', ret)
            ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="prj"></span>', ur'<lb/>', ret)
            ret = regex.sub(ur'(?musi)<span data-dpt="abbr">(.*?)</span>', ur'<abbr>\1</abbr>', ret)
            ret = regex.sub(ur'(?musi)<span data-dpt="exp">(.*?)</span>', ur'<exp>\1</exp>', ret)

        # print repr(ret)
        file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix)
        from digipal.utils import write_file
        write_file(file_name, ret)
        print 'Written file %s ' % file_name
Exemplo n.º 6
0
    def html2md(self):
        """Convert HTML files into Markdown files in the 'digipal' doc folder.

        self.args[1] is the path under which files with an 'html' or 'htm'
        extension are looked up; self.options['filter'] is passed on to
        utils.get_all_files_under() as its filters.

        Each file is converted with doc.get_md_from_html() and written to
        <doc root>/<slugified title>.md. Files whose target path contains
        'confluence-workbox' are skipped. The source path, the target path
        and the entries of info['files'] are printed for each written file.

        Exits the process with status 1 if the path argument is missing.
        """
        import sys

        if len(self.args) < 2:
            print 'ERROR: missing path. Check help.'
            # sys.exit(1) rather than the site builtin exit(): the latter
            # is meant for the interactive shell and reported success (0)
            sys.exit(1)

        root_path = self.args[1]

        from digipal.views import doc
        from django.utils.text import slugify

        html_paths = utils.get_all_files_under(
            root_path, file_types='f', filters=self.options['filter'],
            extensions=['html', 'htm'], can_return_root=True)
        for html_path in html_paths:
            info = doc.get_md_from_html(html_path)
            target = os.path.join(
                doc.get_doc_root_path('digipal'),
                slugify(info['title'])) + '.md'
            if 'confluence-workbox' in target:
                continue
            utils.write_file(target, info['md'])
            print '%s\n  => %s' % (html_path, target)
            for f in info['files']:
                print '   + %s' % f
Exemplo n.º 7
0
    def html2md(self):
        """Convert HTML files into Markdown files in the 'digipal' doc folder.

        self.args[1] is the path under which files with an 'html' or 'htm'
        extension are looked up; self.options['filter'] is passed on to
        utils.get_all_files_under() as its filters.

        Each file is converted with doc.get_md_from_html() and written to
        <doc root>/<slugified title>.md. Files whose target path contains
        'confluence-workbox' are skipped. The source path, the target path
        and the entries of info['files'] are printed for each written file.

        Exits the process with status 1 if the path argument is missing.
        """
        import sys

        if len(self.args) < 2:
            print 'ERROR: missing path. Check help.'
            # sys.exit(1) rather than the site builtin exit(): the latter
            # is meant for the interactive shell and reported success (0)
            sys.exit(1)

        source_root = self.args[1]

        from digipal.views import doc
        from django.utils.text import slugify

        # the loop variable is kept distinct from the root path argument
        for source_path in utils.get_all_files_under(
                source_root, file_types='f', filters=self.options['filter'],
                extensions=['html', 'htm'], can_return_root=True):
            info = doc.get_md_from_html(source_path)
            target_dir = doc.get_doc_root_path('digipal')
            target = os.path.join(target_dir, slugify(info['title'])) + '.md'
            if 'confluence-workbox' in target:
                continue
            utils.write_file(target, info['md'])
            print '%s\n  => %s' % (source_path, target)
            for f in info['files']:
                print '   + %s' % f