示例#1
0
def dotest(_, testnum, expnum=None):
    """Parse one test input, format it as JMdict XML and compare to expected.

    _       -- The test-case object (plays the role of 'self').  It must
               provide indexable .indata and .expdata attributes and a
               .fail() method (presumably a unittest.TestCase).
    testnum -- Key into _.indata selecting the text to parse.
    expnum  -- Key into _.expdata selecting the expected XML.  When None
               (the default) 'testnum' is used for the expected data too.

    The test is failed, with the diff in the message, when
    fmtxml.entr_diff() reports a difference between the expected and the
    generated XML.
    """
    e = edparse.entr(_.indata[testnum])
    xml = fmtxml.entr(e, compat="jmdict")
    # Test against None explicitly so that a falsy but valid key (e.g. 0)
    # given for 'expnum' is not silently replaced by 'testnum'.
    expected = _.expdata[testnum if expnum is None else expnum]
    diff = fmtxml.entr_diff(expected, xml)
    if diff:
        # Use fail() rather than failIf(): the latter is a deprecated
        # unittest alias that no longer exists in Python 3.12+.
        # The one-element tuple keeps the formatting correct even if
        # 'diff' itself happens to be a tuple.
        _.fail("\nDiff: '%s'" % (diff,))
示例#2
0
def rt(_, seq):
    """Round-trip the entry with sequence number 'seq' through serialize.

    Test round trip from entry object through serialize.serialize,
    serialize.unserialize, back to object.  The input and output objects
    are compared by converting both to xml and comparing the text.
    (Watch out for order problems.)

    _   -- The test-case object (plays the role of 'self'); must provide
           assertTrue() and assertEqual().
    seq -- Sequence number of the entry to read from the database.
    """
    # FIXME: reading database too slow, too volatile.
    #   read from a test xml file instead.
    if not Cursor: globalSetup()
    # FIXME: don't hardwire corpus (aka src).
    sql = "SELECT id FROM entr WHERE seq=%s AND src=1"
    elist, r = jdb.entrList(Cursor, sql, [seq], ret_tuple=1)
    # Fail with a readable message instead of an IndexError on elist[0]
    # when the query matched nothing.
    _.assertTrue(elist, "No entry found for seq %s" % (seq,))
    e1 = elist[0]
    jdb.augment_xrefs(Cursor, r['xref'])
    s = serialize.serialize(e1)
    e2 = serialize.unserialize(s)
    f1 = fmtxml.entr(e1)
    # Sanity check to detect empty entry.  assertTrue() replaces the
    # deprecated assert_() alias, which was removed in Python 3.12.
    _.assertTrue(len(f1) > 40)
    f2 = fmtxml.entr(e2)
    _.assertEqual(f1, f2)
示例#3
0
def write_log(entrs, outf):
    """Write the xml for each entry in 'entrs' to 'outf' as utf-8 bytes.

    entrs -- Iterable of entry objects.  Each must have a .src attribute
             and may have a ._grp list whose items have a .kw attribute.
    outf  -- Output file object; it is given utf-8 encoded bytes, so it
             must accept bytes (e.g. a file opened in binary mode).

    Before an entry is written, the corpus definition for its .src is
    written the first time that .src is seen in this call, and the
    definition of each of its groups is written unless that group's
    jdb.KW.GRP record already carries a 'written' attribute.
    """
    # FIXME: this code is similar to code in extrs2xml.py,
    #  should factor out into a common library.

    # NOTE: every piece of output is terminated with b"\n" (not "\n"):
    # concatenating a str onto the encoded bytes raises TypeError on
    # Python 3, while a bytes literal behaves the same on Python 2 and 3.
    corpora = set()
    # Generate xml for each entry and write it to the output file.
    for e in entrs:
        if e.src not in corpora:
            txt = '\n'.join(fmtxml.corpus([e.src]))
            outf.write(txt.encode('utf-8') + b"\n")
            corpora.add(e.src)
        grp = getattr(e, '_grp', [])
        for g in grp:
            gob = jdb.KW.GRP[g.kw]
            # The 'written' marker is stored on the shared keyword record
            # itself, so it outlives this call: a group definition is not
            # emitted again by later calls that see the same record.
            if not hasattr(gob, 'written'):
                gob.written = True
                txt = '\n'.join(fmtxml.grpdef(gob))
                outf.write(txt.encode('utf-8') + b"\n")
        txt = fmtxml.entr(e, compat=None, genhists=True)
        outf.write(txt.encode('utf-8') + b"\n")
示例#4
0
def entr(entr, xslfile=None, xslt=[], want_utf8=False):
    """Return Edict2 formatted text for an entry (slow but simple).

    entr -- A jmdictdb Entr object, or a string containing the xml
      of an Entr object, or None.
    xslfile -- Name of an xslt file (default 'edict2.xsl').  If the name
      contains any path separator characters, it will be used as is.
      Otherwise it will be taken as a plain filename and searched for
      on the Python search path (sys.path).  Either way, the resulting
      file will be converted to a lxml.etree.XSLT transform object and
      applied to the xml from 'entr' (if 'entr' was not None.)
      'xslfile' is only consulted when 'xslt' holds no transform.
    xslt -- May be None, an empty list, or a list of one item which
      is a lxml.etree.XSLT transform object that will be applied to
      the 'entr' xml.  If an empty list, the xslt file given by
      'xslfile' will be converted to a transform and saved in it (for
      use in subsequent calls).  If None, 'xslfile' will be converted
      to a transform and not saved.
    want_utf8 -- If false, a unicode text string is returned.  If
      true, a utf-8 encoded byte string is returned.

    Returns None when 'entr' is None (or otherwise false).
    """
    # NOTE(review): the mutable default for 'xslt' is deliberate -- it is
    # the cache for the transform.  A consequence is that once it has been
    # filled, a different 'xslfile' given on a later call that relies on
    # the default 'xslt' is ignored.

    if xslt:
        transform = xslt[0]
    else:
        if not xslfile: xslfile = 'edict2.xsl'
        # Read the xsl file.
        if '/' not in xslfile and '\\' not in xslfile:
            xsldir = jdb.find_in_syspath(xslfile)
            xslfile = xsldir + '/' + xslfile
        xsldoc = lxml.etree.parse(xslfile)
        transform = lxml.etree.XSLT(xsldoc)
        # Cache the transform in the caller's list (or in the default
        # value of the 'xslt' parameter).  Nothing can be saved when
        # 'xslt' is None; previously that case raised TypeError here.
        if xslt is not None:
            xslt[:] = [transform]
    if not entr:
        return None
    if not isinstance(entr, str):
        xml = fmtxml.entr(entr, compat='jmdict')
    else:
        xml = entr
    # Replace entities ("&name;" becomes plain "name").
    # NOTE(review): this also rewrites the predefined XML entities such
    # as "&amp;" and "&lt;" -- confirm that is intended.
    xml = re.sub(r'&([a-zA-Z0-9-]+);', r'\1', xml)
    xml = "<JMdict>%s</JMdict>" % xml
    # Apply the xsl to the xml; str() of the result gives unicode text.
    edicttxt = str(transform(etree.parse(StringIO(xml)))).rstrip('\n\r')
    if want_utf8:  # Convert the unicode text to utf-8 bytes.
        edicttxt = edicttxt.encode('utf-8')
    return edicttxt
示例#5
0
def write_entrs(cur, entrs, raw, corpora, opts, outf):
    """Format each entry in 'entrs' as xml and write it to 'outf'.

    cur     -- Database cursor handed to jdb.augment_xrefs().
    entrs   -- Iterable of entry objects to write.
    raw     -- Mapping whose 'xref' item holds the xrefs to augment.
    corpora -- Set of .src values whose corpus definition has already
               been written; updated in place.
    opts    -- Options object; opts.compat is passed to fmtxml.entr()
               and, when true, suppresses corpus and group definitions.
    outf    -- Output file object; receives text, one "\n" terminated
               chunk per definition or entry.
    """
    # Xrefs can only be formatted as xml once they have been augmented
    # with the reading and kanji text of their targets.
    jdb.augment_xrefs(cur, raw['xref'])

    def emit(text):
        outf.write(text + "\n")

    t0 = time.time()
    for entry in entrs:
        if not opts.compat:
            # First time this corpus is seen: write its definition.
            if entry.src not in corpora:
                emit('\n'.join(fmtxml.corpus([entry.src])))
                corpora.add(entry.src)
            # Write the definition of any group not yet marked as written.
            for member in getattr(entry, '_grp', []):
                kwrec = jdb.KW.GRP[member.kw]
                if hasattr(kwrec, 'written'):
                    continue
                kwrec.written = True
                emit('\n'.join(fmtxml.grpdef(kwrec)))
        emit(fmtxml.entr(entry, compat=opts.compat, genhists=True))
    if Debug:
        elapsed = time.time() - t0
        print("Time: %s (fmt)" % elapsed, file=sys.stderr)
示例#6
0
def main (args, opts):
        """Fetch the requested entries and render them with 'entr.jinja'.

        The entries are selected by the form's 'e' and 'q' values, sorted,
        augmented with xref and sound data, optionally formatted as text
        according to the form's 'disp' value, and passed to the page
        template.  'args' and 'opts' are not used here.
        """
        jdb.reset_encoding (sys.stdout, 'utf-8')
        errs = []
        try:
            form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
        except Exception as exc:
            jmcgi.err_page ([str (exc)])
        entries = jmcgi.get_entrs (cur, form.getlist ('e'),
                                   form.getlist ('q'), errs)
        if errs: jmcgi.err_page (errs)

          # Give every entry a .SEQKR attribute holding the kanji and
          # reading of the newest (most recently edited) entry that
          # shares its sequence number.
        seqkr_decorate (entries)

        def sortkey (entr):
              # Ordering: first by the kanji/reading of the newest entry
              # of the same sequence number (.SEQKR), which keeps entries
              # whose kanji or readings were edited next to their earlier
              # versions while still ordering everything by kanji/reading.
              # Then by (.src, .seq) in case different seqs have the same
              # SEQKR.  Within one sequence number entries go from most
              # to least recently edited, by the timestamp of their last
              # history record (entr._hist[*].dt is a datetime.datetime
              # instance), and finally by descending id.
            kanj, rdng = entr.SEQKR[0], entr.SEQKR[1]
            src, seq = entr.src, entr.seq
            edited = entr._hist[-1].dt.timestamp() if entr._hist else 0
            return (kanj, rdng, src, seq, -edited, -entr.id)

        entries.sort (key=sortkey)

        for entr in entries:
            for sens in entr._sens:
                if hasattr (sens, '_xref'):
                    jdb.augment_xrefs (cur, sens._xref)
                if hasattr (sens, '_xrer'):
                    jdb.augment_xrefs (cur, sens._xrer, 1)
            if hasattr (entr, '_snd'):
                jdb.augment_snds (cur, entr._snd)
        cur.close()

          # Pick the text formatter for the requested display mode; any
          # other mode gets an empty text for every entry.
        disp = form.getfirst ('disp')
        formatters = {
            'xml':  lambda x: fmtxml.entr (x),
            'jm':   lambda x: fmtxml.entr (x, compat='jmdict'),
            'jmne': lambda x: fmtxml.entr (x, compat='jmnedict'),
            'jel':  lambda x: fmtjel.entr (x),
            'ed':   lambda x: xslfmt.entr (x),
            }
        fmt = formatters.get (disp, lambda x: '')
        etxts = [fmt (x) for x in entries]

        jmcgi.htmlprep (entries)
        jmcgi.add_encodings (entries)    # For kanjidic entries.
        if disp == 'ed':
            etxts = [jmcgi.txt2html (txt) for txt in etxts]
        jmcgi.add_filtered_xrefs (entries, rem_unap=True)

        if errs: jmcgi.err_page (errs)

        paired = list(zip(entries, etxts))
        jmcgi.jinja_page ('entr.jinja',
                        entries=paired, disp=disp,
                        svc=svc, dbg=dbg, sid=sid, session=sess, cfg=cfg,
                        parms=parms, this_page='entr.py')