def main(args, opts):
    """Dump the sound volume / file / clip tables to stdout as JMaudio XML.

    args -- Positional command line arguments (not used here).
    opts -- Parsed command line options; 'encoding' and 'database' are
        read directly and the database connection options are extracted
        from it with jdb.dbopts().

    The DTD text is printed first, then a <JMaudio> root element that
    contains each row of 'sndvol', each followed by its 'sndfile' rows,
    each of those followed in turn by its 'snd' rows.
    """
    dtd_name = "dtd-audio.xml"

    def emit(lines):
        # Each fmtxml formatter returns a sequence of text lines.
        print("\n".join(lines))

    jdb.reset_encoding(sys.stdout, opts.encoding)
    dtd_dir = jdb.find_in_syspath(dtd_name)
    dtd_path = dtd_dir + "/" + dtd_name
    print(jdb.get_dtd(dtd_path, "JMaudio", opts.encoding))
    print("<JMaudio>")

    cursor = jdb.dbOpen(opts.database, **jdb.dbopts(opts))
    for volume in jdb.dbread(cursor, "SELECT * FROM sndvol"):
        emit(fmtxml.sndvols([volume]))
        # Files (selections) belonging to this volume.
        files = jdb.dbread(
            cursor, "SELECT * FROM sndfile s WHERE s.vol=%s", [volume.id])
        for sndfile in files:
            emit(fmtxml.sndsels([sndfile]))
            # Clips belonging to this file.
            clips = jdb.dbread(
                cursor, "SELECT * FROM snd c WHERE c.file=%s", [sndfile.id])
            for clip in clips:
                emit(fmtxml.sndclips([clip]))

    print('</JMaudio>')
def entr(entr, xslfile=None, xslt=[], want_utf8=False):
    """Format an entry as text by applying an XSLT transform to its XML.

    A slow but simple way to get an Edict2 formatted text for an entry.

    entr -- A jmdictdb Entr object, or a string containing the xml
        of an Entr object, or None (or any other false value).
    xslfile -- Name of an xslt file; defaults to 'edict2.xsl'.  If the
        name contains a path separator character ('/' or '\\') it is
        used as is.  Otherwise it is taken as a plain filename and
        searched for on the Python search path (sys.path).  Either
        way, the resulting file is converted to a lxml.etree.XSLT
        transform object.  It is only consulted when no transform is
        already available in 'xslt'.
    xslt -- May be None, an empty list, or a list of one item which
        is a lxml.etree.XSLT transform object that will be applied to
        the 'entr' xml.  If an empty list, the xslt file given by
        'xslfile' is converted to a transform and saved in the list
        (for use in subsequent calls).  If None, 'xslfile' is converted
        to a transform that is used for this call only and not saved.
    want_utf8 -- If false, a unicode text string (str) is returned.
        If true, the text is returned utf-8 encoded (bytes).

    Returns the formatted text with trailing newlines stripped, or
    None if 'entr' is false.
    """
    # NOTE: the mutable default value of 'xslt' is deliberate; it is
    # the cache that holds the transform between calls.  Consequently,
    # once the default cache is filled, 'xslfile' is ignored by later
    # calls that rely on the default.
    #
    # When the caller passes None there is nothing to save into, so
    # use a throw-away list.  (Slice-assigning into None raised a
    # TypeError.)
    transforms = [] if xslt is None else xslt
    if not transforms:
        if not xslfile:
            xslfile = 'edict2.xsl'
        # Locate the xsl file on sys.path if given as a bare filename.
        if '/' not in xslfile and '\\' not in xslfile:
            xsldir = jdb.find_in_syspath(xslfile)
            xslfile = xsldir + '/' + xslfile
        xsldoc = lxml.etree.parse(xslfile)
        # Generate a transform and store it in place so that a
        # caller-supplied (or the default) list retains it.
        transforms[:] = [lxml.etree.XSLT(xsldoc)]

    if not entr:
        # Nothing to format; return before encoding so that
        # 'want_utf8' can never be applied to None.
        return None

    if isinstance(entr, str):
        xml = entr
    else:
        xml = fmtxml.entr(entr, compat='jmdict')
    # Replace entity references ("&xx;") with their bare names ("xx").
    xml = re.sub(r'&([a-zA-Z0-9-]+);', r'\1', xml)
    xml = "<JMdict>%s</JMdict>" % xml
    # Apply the xsl to the xml; str() of the result is unicode text.
    edicttxt = str(transforms[0](etree.parse(StringIO(xml)))).rstrip('\n\r')
    if want_utf8:
        # Convert the unicode text to utf-8 encoded bytes.
        edicttxt = edicttxt.encode('utf-8')
    return edicttxt
def _open_output(args):
    """Return sys.stdout if 'args' is empty, else args[0] opened for writing."""
    if len(args) == 0:
        return sys.stdout
    return open(args[0], "w")


def main(args, opts):
    """Write database entries as XML to a file or to stdout.

    args -- Positional command line arguments.  If empty, output goes
        to sys.stdout; otherwise args[0] is the output filename.
    opts -- Parsed command line options.  The attributes read here are:
        debug, database (plus whatever jdb.dbopts() extracts), root,
        compat, nodtd, encoding, seqfile, corpus, begin, count and
        blocksize.  'opts.root' is assigned a default if not set.

    Entries are read in blocks of 'opts.blocksize', either by the
    sequence numbers listed in 'opts.seqfile' or, otherwise, in
    (src, seq, id) order starting from the '--begin' entry or from the
    first entry of the selected corpora.  Each block is handed to
    write_entrs().  Progress and a final count go to stderr.

    Exits (status 1) if the starting entry can't be found.
    """
    global Debug
    Debug = opts.debug

    # Open the database.  jdb.dbopts() extracts the db-related
    # options from the command line options in 'opts'.
    cur = jdb.dbOpen(opts.database, **jdb.dbopts(opts))

    # If no "--root" option was supplied, choose a default based
    # on the value of the "--compat" option.
    if not opts.root:
        if opts.compat in ('jmnedict', 'jmneold'):
            opts.root = 'JMnedict'
        else:
            opts.root = 'JMdict'

    outf = None
    try:
        if not opts.nodtd:
            # Choose a dtd to use based on the "--compat" option.
            # The dtd file is expected to be located somewhere in the
            # pythonpath (sys.path) directories.
            dtd_by_compat = {
                'jmdict': "dtd-jmdict.xml",
                'jmdicthist': "dtd-jmdict.xml",
                'jmnedict': "dtd-jmnedict.xml",
                'jmneold': "dtd-jmneold.xml",
            }
            dtd = dtd_by_compat.get(opts.compat, "dtd-jmdict-ex.xml")
            dtd_dir = jdb.find_in_syspath(dtd)
            dtdfn = dtd_dir + "/" + dtd     # Fully qualified dtd file name.

            # jdb.get_dtd() reads the dtd text, and replaces the root
            # element name and encoding with the values supplied
            # in the arguments.
            dtdtxt = jdb.get_dtd(dtdfn, opts.root, opts.encoding)
            outf = _open_output(args)
            jdb.reset_encoding(outf, opts.encoding)
            outf.write(dtdtxt)

        if opts.seqfile:
            #FIXME: we should read these incrementally.
            if opts.seqfile == '-':
                seqtxt = sys.stdin.read()
            else:
                with open(opts.seqfile) as f:
                    seqtxt = f.read()
            # seq# separated by sp or nl.
            entrlist = [int(x) for x in seqtxt.split()]

        # Turn the "--corpus" option value into a string that can be
        # and'ed into a SQL WHERE clause to restrict the results to
        # the specified corpora.
        corp_terms = parse_corpus_opt(opts.corpus, 'e.src')

        # If the output file was not opened in the dtd section
        # above, open it now.  We postpone opening it until the
        # last possible moment to avoid creating it and then
        # bombing because there was a typo in the input or dtd
        # filename, etc.
        # FIXME: Should do a "write" function that opens the
        #  file just before writing.
        # NOTE(review): jdb.reset_encoding() is not applied to the
        #  output when it is opened here (the --nodtd case); confirm
        #  whether that is intended.
        if outf is None:
            outf = _open_output(args)

        # If compat (jmdict or jmnedict), restrict the xml to Active
        # entries only.
        if opts.compat:
            whr_act = " AND NOT unap AND stat=" + str(jdb.KW.STAT['A'].id)
        else:
            whr_act = ""

        if opts.begin:
            # If a "--begin" sequence number was given, we need to read
            # the entr record so we can get the src id number.  Complain
            # and exit if not found.  Complain if more than one entry
            # with the requested seq number exists.  More than one may be
            # found since the same sequence number may exist in different
            # corpora, or in the same corpus if an entry was edited.
            #
            #FIXME: no way to select from multiple entries with same seq
            # number.  Might want just the stat="A" entries for example.
            sql = "SELECT id,seq,src FROM entr e WHERE seq=%s%s%s ORDER BY src" \
                  % (int(opts.begin), corp_terms, whr_act)
            if Debug:
                print(sql, file=sys.stderr)
            start = time.time()
            rs = jdb.dbread(cur, sql)
            if Debug:
                print("Time: %s (init read)" % (time.time() - start),
                      file=sys.stderr)
            if not rs:
                print("No entry with seq '%s' found" % opts.begin,
                      file=sys.stderr)
                sys.exit(1)
            if len(rs) > 1:
                print("Multiple entries having seq '%s' found, results "
                      "may not be as expected. Consider using -s to "
                      "restrict to a single corpus." % (opts.begin),
                      file=sys.stderr)
            lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id

        if not opts.begin and not opts.seqfile:
            # If no "--begin" option, remove the " AND" from the front of
            # the 'corp_terms' string.  Read the first entry (by seq number)
            # in the requested corpora.
            cc = corp_terms[4:] if corp_terms else 'True'
            sql = "SELECT id,seq,src FROM entr e WHERE %s%s ORDER BY src,seq LIMIT 1" % (
                cc, whr_act)
            start = time.time()
            if Debug:
                print(sql, file=sys.stderr)
            rs = jdb.dbread(cur, sql)
            if Debug:
                print("Time: %s (init read)" % (time.time() - start),
                      file=sys.stderr)
            if not rs:
                # Nothing matches the selection criteria; exit cleanly
                # (as for a missing "--begin" entry) rather than fail
                # with an IndexError on rs[0] below.
                print("No entries found", file=sys.stderr)
                sys.exit(1)
            lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id

        # Add an enclosing root element only if we are also including
        # a DTD (ie, producing a full XML file).  Otherwise, the file
        # generated will just be a list of <entr> elements.
        if not opts.nodtd:
            if opts.compat:
                # Add a date comment...
                today = time.strftime("%Y-%m-%d", time.localtime())
                outf.write("<!-- %s created: %s -->\n" % (opts.root, today))
            outf.write('<%s>\n' % opts.root)

        entrlist_loc = 0
        count = opts.count
        done = 0
        blksize = opts.blocksize
        corpora = set()

        while count is None or count > 0:
            if opts.seqfile:
                # Take the next 'blksize' sequence numbers from the list.
                seqnums = tuple(entrlist[entrlist_loc:entrlist_loc + blksize])
                if not seqnums:
                    break
                entrlist_loc += blksize
                #FIXME: need detection of non-existent seq#s.
                sql = "SELECT id FROM entr e WHERE seq IN %s" + corp_terms + whr_act
                sql_args = [seqnums]
            else:
                # In this loop we read blocks of 'blksize' entries.  Each
                # block read is ordered by entr src (i.e. corpus), seq, and
                # id.  The block to read is specified in WHERE clause which
                # is effectively:
                #   WHERE ((e.src=lastsrc AND e.seq=lastseq AND e.id>=lastid)
                #           OR (e.src=lastsrc AND e.seq>lastseq)
                #           OR e.src>lastsrc)
                # where (lastsrc, lastseq) are from the last entry in the
                # last block read and 'lastid' is one more than that entry's
                # id.  (For the first block they are the values, unadjusted,
                # of the starting entry so that it is included.)
                whr = ("WHERE ((e.src=%%s AND e.seq=%%s AND e.id>=%%s) "
                       "OR (e.src=%%s AND e.seq>%%s) "
                       "OR e.src>%%s) %s%s" % (corp_terms, whr_act))
                limit = blksize if count is None else min(blksize, count)
                sql = ("SELECT e.id FROM entr e"
                       " %s ORDER BY src,seq,id LIMIT %d" % (whr, limit))
                # The following args will be substituted for the "%s"
                # placeholders remaining in the sql above, in
                # jdb.entrFind().
                sql_args = [lastsrc, lastseq, lastid,
                            lastsrc, lastseq, lastsrc]

            # Create a temporary table of id numbers and give that to
            # jdb.entrList().  This is an order of magnitude faster than
            # giving the above sql directly to entrList().
            if Debug:
                print(sql, sql_args, file=sys.stderr)
            start = time.time()
            tmptbl = jdb.entrFind(cur, sql, sql_args)
            mid = time.time()
            entrs, raw = jdb.entrList(cur, tmptbl, None,
                                      ord="src,seq,id", ret_tuple=True)
            end = time.time()
            if Debug:
                print("read %d entries" % len(entrs), file=sys.stderr)
                print("Time: %s (entrFind), %s (entrList)"
                      % (mid - start, end - mid), file=sys.stderr)
            if not entrs:
                break
            write_entrs(cur, entrs, raw, corpora, opts, outf)

            # Update the 'last*' variables for the next time through
            # the loop.  Also, decrement 'count', if we are counting.
            lastsrc = entrs[-1].src
            lastseq = entrs[-1].seq
            lastid = entrs[-1].id + 1
            if count is not None:
                count -= blksize
            done += len(entrs)
            if not Debug:
                sys.stderr.write('.')
            else:
                print("%d entries written" % done, file=sys.stderr)

        if not opts.nodtd:
            # Close the root element.  (write(), not writelines(): the
            # argument is a single string, not a sequence of lines.)
            outf.write('</%s>\n' % opts.root)
    finally:
        # Close (and thereby flush) the output if it is a file we
        # opened, including when leaving via sys.exit() or an error.
        if outf is not None and outf is not sys.stdout:
            outf.close()

    if not Debug:
        sys.stderr.write('\n')
    print("Wrote %d entries" % done, file=sys.stderr)