def dotest(_, testnum, expnum=None):
    # Parse one test input, format the resulting entry as JMdict-compatible
    # xml and fail the running test if it differs from the expected xml.
    #   _ -- The running test case object.  It must provide the
    #       indexable attributes .indata and .expdata and a .fail()
    #       method.
    #   testnum -- Key of the input text in _.indata.
    #   expnum -- Key of the expected xml in _.expdata.  If None,
    #       'testnum' is used for both lookups.
    entry = edparse.entr(_.indata[testnum])
    xml = fmtxml.entr(entry, compat="jmdict")
    # Compare against None explicitly so that a falsy but valid key
    # (e.g. 0) is not silently replaced by 'testnum'.
    expected = _.expdata[testnum if expnum is None else expnum]
    diff = fmtxml.entr_diff(expected, xml)
    if diff:
        # .fail() is used rather than the deprecated .failIf() alias,
        # which no longer exists in current unittest versions.
        _.fail("\nDiff: '%s'" % (diff,))
def rt(_, seq):
    # Test a round trip from an entry object through
    # serialize.serialize and serialize.unserialize back to an
    # object.  The input and output objects are compared by
    # converting both to xml and comparing the text.  (Watch out
    # for order problems.)
    #   _ -- The running test case object (provides the assert methods).
    #   seq -- Sequence number of the entry to read from the database.
    # FIXME: reading database is too slow, too volatile.
    #   Read from a test xml file instead.
    if not Cursor:
        globalSetup()
    # FIXME: don't hardwire corpus (aka src).
    sql = "SELECT id FROM entr WHERE seq=%s AND src=1"
    elist, raw = jdb.entrList(Cursor, sql, [seq], ret_tuple=1)
    # Report a missing entry as a test failure with a useful message
    # instead of an IndexError on the subscript below.
    _.assertTrue(elist, "No entry found for seq %s in corpus 1" % (seq,))
    before = elist[0]
    jdb.augment_xrefs(Cursor, raw['xref'])
    after = serialize.unserialize(serialize.serialize(before))
    xml_before = fmtxml.entr(before)
    # Sanity check to detect an empty entry.
    _.assertTrue(len(xml_before) > 40)
    xml_after = fmtxml.entr(after)
    _.assertEqual(xml_before, xml_after)
def write_log(entrs, outf):
    # Write the xml for each entry in 'entrs' to 'outf' as utf-8
    # encoded bytes.  The first time a corpus (e.src) is seen during
    # this call its corpus definition is written first, and the
    # first time a group record is seen its group definition is
    # written first.
    #   entrs -- Iterable of entry objects.
    #   outf -- Output file object; it receives encoded bytes.
    # Note that "already written" for groups is remembered by setting
    # a .written attribute on the shared jdb.KW.GRP record, so it
    # persists beyond this call.
    # FIXME: this code is similar to code in extrs2xml.py,
    #   should factor out into a common library.

    def emit(text):
        # Append the newline before encoding: concatenating the
        # encoded bytes with a str "\n" raises TypeError on Python 3.
        outf.write((text + "\n").encode('utf-8'))

    corpora = set()
    for e in entrs:
        if e.src not in corpora:
            emit('\n'.join(fmtxml.corpus([e.src])))
            corpora.add(e.src)
        for g in getattr(e, '_grp', []):
            gob = jdb.KW.GRP[g.kw]
            if not hasattr(gob, 'written'):
                gob.written = True
                emit('\n'.join(fmtxml.grpdef(gob)))
        emit(fmtxml.entr(e, compat=None, genhists=True))
def entr(entr, xslfile=None, xslt=[], want_utf8=False):
    # A slow but simple way to get Edict2 formatted text for an entry.
    #   entr -- A jmdictdb Entr object, or a string containing the xml
    #       of an Entr object, or None.
    #   xslfile -- Name of an xslt file.  If the name contains any path
    #       separator characters it is used as is.  Otherwise it is
    #       taken as a plain filename and looked up with
    #       jdb.find_in_syspath().  Either way the resulting file is
    #       converted to a lxml.etree.XSLT transform object and applied
    #       to the xml from 'entr' (if 'entr' was not None).  Defaults
    #       to 'edict2.xsl'.  Ignored when 'xslt' already holds a
    #       transform.
    #   xslt -- May be None, an empty list, or a list of one item which
    #       is a lxml.etree.XSLT transform object that will be applied
    #       to the 'entr' xml.  If an empty list, the xslt file given
    #       by 'xslfile' is converted to a transform and saved in the
    #       list (for use in subsequent calls).  If None, 'xslfile' is
    #       converted to a transform and not saved.
    #       NOTE: the mutable default is deliberate; it is the cache
    #       shared by all calls that do not pass 'xslt'.
    #   want_utf8 -- If false, a unicode text string is returned.  If
    #       true, a utf-8 encoded byte string is returned.
    # Returns the formatted text, or None if 'entr' was empty or None.
    # (The transform is still loaded and cached in that case.)
    if xslt:
        transform = xslt[0]
    else:
        if not xslfile:
            xslfile = 'edict2.xsl'
        if '/' not in xslfile and '\\' not in xslfile:
            # Presumably find_in_syspath() returns the directory that
            # contains the file -- confirm against jdb.
            xsldir = jdb.find_in_syspath(xslfile)
            xslfile = xsldir + '/' + xslfile
        transform = lxml.etree.XSLT(lxml.etree.parse(xslfile))
        # Cache the transform only when the caller gave us a list to
        # cache it in; 'xslt=None' means "build but do not save" and
        # must not be subscripted.
        if xslt is not None:
            xslt[:] = [transform]

    if not entr:
        return None
    if isinstance(entr, str):
        xml = entr
    else:
        xml = fmtxml.entr(entr, compat='jmdict')
    # Replace each entity reference with its bare name.
    xml = re.sub(r'&([a-zA-Z0-9-]+);', r'\1', xml)
    xml = "<JMdict>%s</JMdict>" % xml
    # Apply the xsl to the xml and drop trailing line terminators.
    edicttxt = str(transform(etree.parse(StringIO(xml)))).rstrip('\n\r')
    if want_utf8:
        # Convert the unicode text to utf-8 encoded bytes.
        edicttxt = edicttxt.encode('utf-8')
    return edicttxt
def write_entrs(cur, entrs, raw, corpora, opts, outf):
    # Format each entry in 'entrs' as xml and write it, followed by a
    # newline, to 'outf'.
    #   cur -- Database cursor, passed to jdb.augment_xrefs().
    #   entrs -- Iterable of entry objects to write.
    #   raw -- Indexable object whose 'xref' item holds the xrefs to
    #       augment.
    #   corpora -- Set of corpus ids (e.src values) already written;
    #       it is updated in place.
    #   opts -- Options object; opts.compat selects the output format
    #       and, when true, suppresses corpus and group definitions.
    #   outf -- Output file object; it receives text.

    # Xrefs must be augmented before formatting so that the target
    # reading and kanji text are available to the xml formatter.
    jdb.augment_xrefs(cur, raw['xref'])

    began = time.time()
    for entry in entrs:
        if not opts.compat:
            # Emit each corpus definition once, ahead of the first
            # entry that belongs to it.
            src = entry.src
            if src not in corpora:
                outf.write('\n'.join(fmtxml.corpus([src])) + "\n")
                corpora.add(src)
            # Likewise emit each group definition once; the marker is
            # kept on the shared jdb.KW.GRP record itself.
            for member in getattr(entry, '_grp', []):
                grprec = jdb.KW.GRP[member.kw]
                if hasattr(grprec, 'written'):
                    continue
                grprec.written = True
                outf.write('\n'.join(fmtxml.grpdef(grprec)) + "\n")
        entry_xml = fmtxml.entr(entry, compat=opts.compat, genhists=True)
        outf.write(entry_xml + "\n")
    if Debug:
        elapsed = time.time() - began
        print("Time: %s (fmt)" % elapsed, file=sys.stderr)
def main(args, opts):
    # CGI entry point for the entry display page: fetch the requested
    # entries, order them, generate the requested alternate text form
    # for each, and render the 'entr.jinja' template.
    #   args, opts -- Not used by this function.
    jdb.reset_encoding(sys.stdout, 'utf-8')
    errs = []
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as exc:
        jmcgi.err_page([str(exc)])

    entries = jmcgi.get_entrs(cur, form.getlist('e'), form.getlist('q'), errs)
    if errs:
        jmcgi.err_page(errs)

    # Give every entry a .SEQKR attribute holding the kanji and
    # reading of the newest (most recently edited) entry that has
    # the same sequence number.
    seqkr_decorate(entries)

    def sort_key(entry):
        # Entries sharing a sequence number (.src, .seq) are grouped
        # together and the groups are ordered by the kanji/reading of
        # their newest member, so an entry whose kanji or reading was
        # edited stays next to its earlier versions.  Within a group
        # the order is newest history timestamp first, then highest id
        # first.  .src and .seq are in the key in case different seqs
        # have the same SEQKR.
        if entry._hist:
            # ._hist[*].dt is a datetime.datetime instance.
            newest = entry._hist[-1].dt.timestamp()
        else:
            newest = 0
        return (entry.SEQKR[0], entry.SEQKR[1],
                entry.src, entry.seq,
                -newest, -entry.id)

    entries.sort(key=sort_key)

    for entry in entries:
        for sens in entry._sens:
            if hasattr(sens, '_xref'):
                jdb.augment_xrefs(cur, sens._xref)
            if hasattr(sens, '_xrer'):
                jdb.augment_xrefs(cur, sens._xrer, 1)
        if hasattr(entry, '_snd'):
            jdb.augment_snds(cur, entry._snd)
    cur.close()

    # Map each recognised 'disp' value to its formatter.  Lambdas keep
    # the formatter lookup lazy, so only the selected one is touched.
    disp = form.getfirst('disp')
    formatters = {
        'xml': lambda ent: fmtxml.entr(ent),
        'jm': lambda ent: fmtxml.entr(ent, compat='jmdict'),
        'jmne': lambda ent: fmtxml.entr(ent, compat='jmnedict'),
        'jel': lambda ent: fmtjel.entr(ent),
        'ed': lambda ent: xslfmt.entr(ent),
    }
    formatter = formatters.get(disp)
    if formatter is None:
        # Unknown or missing 'disp': pair every entry with empty text.
        etxts = ['' for _ in entries]
    else:
        etxts = [formatter(ent) for ent in entries]

    jmcgi.htmlprep(entries)
    jmcgi.add_encodings(entries)  # For kanjidic entries.
    if disp == 'ed':
        etxts = [jmcgi.txt2html(text) for text in etxts]
    jmcgi.add_filtered_xrefs(entries, rem_unap=True)

    if errs:
        jmcgi.err_page(errs)

    jmcgi.jinja_page('entr.jinja',
                     entries=list(zip(entries, etxts)), disp=disp,
                     svc=svc, dbg=dbg, sid=sid, session=sess, cfg=cfg,
                     parms=parms, this_page='entr.py')