Exemplo n.º 1
0
def p_entr_1(p):
    '''entr : preentr'''
      # NOTE(review): the string above looks like the grammar
      # production for this rule function (yacc-style parser), so
      # it is presumably read at runtime -- do not reword it.
    p.lexer.begin('INITIAL')
    entry = p[1]
      # Readings and kanji each carry their own, independent Freq
      # objects; merge_freqs() merges the values they have in common.
    merge_freqs (entry)
      # The foreign key ids have to be set first because
      # mk_restrs() below needs them.
    jdb.setkeys (entry, None)
      # At this point the reading and sense restrictions are plain
      # lists of text strings naming the *allowed* readings or
      # kanji.  mk_restrs() rewrites them in the canonical form,
      # which records the index numbers of the *disallowed*
      # readings or kanji.  Each row below is:
      #   (restriction list name, restricted items, restricting items)
    restr_specs = (
        ("_RESTR", '_rdng', '_kanj'),
        ("_STAGK", '_sens', '_kanj'),
        ("_STAGR", '_sens', '_rdng'),
        )
    for listname, from_attr, to_attr in restr_specs:
          # A pair is only processed when the entry has both lists.
        if not (hasattr (entry, from_attr) and hasattr (entry, to_attr)):
            continue
        err = mk_restrs (listname,
                         getattr (entry, from_attr),
                         getattr (entry, to_attr))
        if err:
            perror (p, err, loc=False)
      # The returned entry may carry an _XREF list on its senses,
      # but those xref records are incomplete.  Database access is
      # not assumed to be available while parsing, so the xrefs
      # cannot be looked up here to find the target entry id
      # numbers, to validate that the kanji and reading (if given)
      # are unique, to check that the target senses exist, etc.
      # The caller is expected to resolve the xrefs (with something
      # like jdb.resolv_xref()) before using the object.
    p[0] = entry
Exemplo n.º 2
0
def main (args, opts):
        """Parse the examples file named by args[0] and write its entries.

        Every entry yielded by parse_ex() is given the corpus id and
        sequential keys (starting at 1) and is written with
        pgi.wrentr(); pgi.finalize() is called once the loop ends.
        When opts.noaction is set the file is still parsed, but
        nothing is written and pgi.initialize() is not called.

        args -- Sequence whose first item is the path of the input
            file, which is read as utf-8.
        opts -- Options object.  The attributes read here are logfile,
            encoding, verbose, corpus, tempdir, noaction, begin, count,
            output and keep.

        Side effects: sets the module globals Opts, KW (also jdb.KW)
        and msg, and writes progress dots to stdout.
        """
        global msg
        global Opts; Opts = opts
        global KW; jdb.KW = KW = jdb.Kwds (jdb.std_csv_dir())

          # Create a globally accessible function, msg(), that has
          # 'logfile' and 'opts.verbose' already bound and which is
          # called elsewhere whenever a message needs to be written
          # to the logfile.  The logfile is deliberately left open
          # because msg() keeps using it.
        logfile = sys.stderr
        if opts.logfile:
            logfile = open (opts.logfile, "w", encoding=opts.encoding)
        def msg (message): _msg (logfile, opts.verbose, message)

        fin = ABPairReader (args[0], encoding='utf-8')
          # Index 8 of the os.stat() result is st_mtime.
          # FIXME: following gives localtime, change to utc or lt+tz.
        mtime = datetime.date.fromtimestamp(os.stat(args[0])[8])
        corpid, corprec \
            = pgi.parse_corpus_opt (opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

          # The work files are only needed when something is actually
          # written, so they are initialized once, and only in that
          # case.  (Previously pgi.initialize() was also called
          # unconditionally just before this, and that first result
          # was either overwritten here or never used.)
        tmpfiles = None
        if not opts.noaction:
            tmpfiles = pgi.initialize (opts.tempdir)
            if corprec: pgi.wrcorp (corprec, tmpfiles)

        for eid, entr in enumerate (parse_ex (fin, opts.begin)):
            if not opts.noaction:
                entr.src = corpid
                jdb.setkeys (entr, eid+1)
                pgi.wrentr (entr, tmpfiles)
              # Progress indicator: one dot for the first entry and
              # then one for every 2000th entry after it.
            if not (eid % 2000):
                sys.stdout.write ('.'); sys.stdout.flush()
              # Stop once opts.count entries have been handled (a
              # false opts.count means no limit).
            if opts.count and eid+1 >= opts.count: break
        sys.stdout.write ('\n')
        if not opts.noaction: pgi.finalize (tmpfiles, opts.output, not opts.keep)
Exemplo n.º 3
0
 def test_000030(_):
     # Scenario: rdng 1 is deleted from the edited entry.
     parent, edited = _.getpair(3000010)
     del edited._rdng[0]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # The xref is expected in the returned list and no longer
     # in the first sense's _xrer list.
     expected = [Xref(3000020, 1, 1, 3, 3000010, 1, 1, None, None)]
     _.assertEqual(returned, expected)
     remaining = edited._sens[0]._xrer
     _.assertEqual(remaining, [])
Exemplo n.º 4
0
 def test_000120(_):
     # Scenario: kanj 1 is deleted from the edited entry.
     parent, edited = _.getpair(3000050)
     del edited._kanj[0]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # The xref is expected in the returned list and no longer
     # in the first sense's _xrer list.
     expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 1, None)]
     _.assertEqual(returned, expected)
     remaining = edited._sens[0]._xrer
     _.assertEqual(remaining, [])
Exemplo n.º 5
0
 def test_000020(_):
     # Scenario: rdng 1 and rdng 2 trade places.
     parent, edited = _.getpair(3000010)
     edited._rdng = edited._rdng[::-1]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # Nothing is expected in the returned list; the xref stays on
     # the first sense with rdng number 2.
     _.assertEqual(returned, [])
     remaining = edited._sens[0]._xrer
     expected = [Xref(3000020, 1, 1, 3, 3000010, 1, 2, None, None)]
     _.assertEqual(remaining, expected)
Exemplo n.º 6
0
 def test_000090(_):
     # Scenario: the kanj's trade places.
     parent, edited = _.getpair(3000050)
     edited._kanj = edited._kanj[::-1]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # Nothing is expected in the returned list; the xref stays on
     # the first sense with kanj number 2.
     _.assertEqual(returned, [])
     remaining = edited._sens[0]._xrer
     expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 2, None)]
     _.assertEqual(remaining, expected)
Exemplo n.º 7
0
def main(args, opts):
    """Parse the JMdict-style XML file args[0] and write it out via pgi.

    The items yielded by jmxml.Jmparser.parse_xmlfile() are dispatched
    on their type: 'entry' items get sequential keys (starting at 1)
    and are written with pgi.wrentr(), 'corpus' and 'grpdef' items are
    written with pgi.wrcorp() / pgi.wrgrpdef(), and the 'root' item is
    used to work out the default corpus.  pgi.finalize() is called
    once all items have been processed.

    args -- Sequence whose first item is the path of the XML file.
    opts -- Options object.  The attributes read here are database,
        lang, tempdir, logfile, encoding, begin, count, extract,
        sequence (a pair: initial value, increment), corpus, output
        and keep.

    Side effects: sets the module global KW (and jdb.KW when no
    database is given) and writes progress dots to stdout.  The input
    file, and the logfile if one was opened here, are closed before
    returning, also when an exception is raised.
    """
    global KW

    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    #FIXME: we open the xml file with utf-8 encoding even though
    # its encoding may be given within the file and may be different.
    xmlfile = open(args[0], encoding='utf-8')
    logfile = sys.stderr
    try:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if opts.logfile:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)
        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(inpf,
                                                opts.begin,
                                                opts.count,
                                                opts.extract,
                                                xlang,
                                                toptag=True,
                                                seqnum_init=opts.sequence[0],
                                                seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                # Progress indicator: one dot for the first entry and
                # then one for every 1800th entry after it.
                if not ((eid - 1) % 1800):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                # NOTE(review): 'corpid' is only bound by the 'root'
                # branch below, and only when parse_corpus_opt() did
                # not raise KeyError.  An entry without its own 'src'
                # that arrives before that happens fails here with
                # UnboundLocalError -- confirm whether a fallback or
                # a clearer error is wanted.
                if not getattr(entr, 'src', None): entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # Deliberately ignored, as in the original code; see
                    # the note about 'corpid' above.
                    pass
                else:
                    if corprec: pgi.wrcorp(corprec, tmpfiles)

        sys.stdout.write('\n')
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
    finally:
        # Release the files opened above.  sys.stderr is not ours to
        # close.
        try:
            if logfile is not sys.stderr: logfile.close()
        finally:
            xmlfile.close()
Exemplo n.º 8
0
 def test_000140(_):
     # Scenario: the kanj's trade places and rdng 1 is deleted.
     parent, edited = _.getpair(3000050)
     edited._kanj = edited._kanj[::-1]
     del edited._rdng[0]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # The kanj number in the expected xref is still 1, not 2:
     # realign() handles rdng before kanj.  (Compare with
     # test_000130.)
     expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 1, None)]
     _.assertEqual(returned, expected)
     remaining = edited._sens[0]._xrer
     _.assertEqual(remaining, [])
Exemplo n.º 9
0
 def test_000130(_):
     # Scenario: the rdng's trade places and kanj 1 is deleted.
     parent, edited = _.getpair(3000050)
     edited._rdng = edited._rdng[::-1]
     del edited._kanj[0]
     jdb.setkeys(edited)
     returned = realign_xrers(edited, parent)
     # The rdng number in the expected xref has moved from 1 to 2:
     # realign() handles rdng before kanj.  (Compare with
     # test_000140.)
     expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 2, 1, None)]
     _.assertEqual(returned, expected)
     remaining = edited._sens[0]._xrer
     _.assertEqual(remaining, [])
Exemplo n.º 10
0
def parse_xmlfile(infn, srcid, workfiles, start, count, langs):
    """Parse a kanjidic XML file and write its <character> entries.

    The file is parsed incrementally with ElementTree.iterparse().
    For every completed <character> element, do_chr() builds the
    entry object, jdb.setkeys() assigns it the next sequential key
    (starting at 1) and pgi.wrentr() writes it to 'workfiles'.

    infn -- Path of the XML file; it is read as utf-8.
    srcid -- Passed through to do_chr().
    workfiles -- Passed through to pgi.wrentr().
    start -- Line number; <character> elements whose opening tag is
        on an earlier line are skipped.  None means no skipping.
    count -- Maximum number of entries to process.  None means no
        limit.
    langs -- Passed through to do_chr().

    Returns the text of the header's <date_of_creation> element, or
    None if no <header> element was seen.

    Side effects: updates the module global Lineno (used for warning
    messages created by warn()) and writes progress output to stderr.
    """

    global Lineno

    # Stays None when the file has no <header> element, so that the
    # 'return' at the end cannot hit an unbound local.
    xmldate = None

    # The 'with' block makes sure the input file is closed again,
    # also when we leave early or an exception is raised.
    with open(infn, encoding='utf-8') as rawfile:
        inpf = LnFile(rawfile)
        context = iter(ElementTree.iterparse(inpf, ("start", "end")))
        # The first event gives us the root element, which is needed
        # further down to discard already-processed elements.
        event, root = next(context)
        if start and start > 1:
            print("Skipping initial entries...", file=sys.stderr)
        cntr = 0        # Number of entries processed so far.
        for event, elem in context:

            # We get here every time a tag is opened (event
            # will be "start") or closed (event will be "end").
            # "elem" is an object containing the element, which
            # will be empty when event is "start" and will contain
            # all the element's attributes and child elements
            # when event is "end".  elem.tag is the name of the
            # tag.

            if elem.tag == "character" and event == "start":

                # When we encounter a <character> tag, save the
                # line number it is on.

                Lineno = inpf.lineno  # For warning messages created by warn().

                # Stop if we have already processed the number of
                # entries requested in the -c option.  "cntr" stays
                # 0 for as long as entries are being skipped.

                if count is not None and cntr >= count: break

            if elem.tag == 'header' and event == 'end':
                xmldate = (elem.find('date_of_creation')).text
                if (elem.find ('file_version')).text != '4' or \
                   (elem.find ('database_version')).text != KANJIDIC_VERSION:
                    warn('Kanjidic XML version is %s but we expected %s.'
                         '\nThis program may or may not work on this file.' %
                         (elem.find('database_version').text, KANJIDIC_VERSION))

            # Otherwise we are processing characters, so we want
            # to handle the <character> "end" events but we are
            # not interested in anything else.

            if elem.tag != "character" or event != "end": continue

            # If we haven't reached the starting line number
            # (given by the -b option) yet, then don't process
            # this entry, but we still need to clear the parsed
            # entry before continuing in order to avoid excessive
            # memory consumption.

            if start is None or Lineno >= start:

                # Count this entry; announce the start of parsing
                # when it is the first one processed.

                cntr += 1
                if cntr == 1: print("Parsing...", file=sys.stderr)

                # Process and write this entry.

                entr = do_chr(elem, srcid, langs)
                jdb.setkeys(entr, cntr)
                pgi.wrentr(entr, workfiles)

                # A progress bar.  The modulo number is picked
                # to provide slightly less than 80 dots for a full
                # kanjidic2 file.

                if (cntr - 1) % 166 == 0: sys.stderr.write(".")

            # We no longer need the parsed xml info for this
            # item so dump it to reduce memory consumption.

            root.clear()

    return xmldate