예제 #1
0
def parse_xrefs(txt, sens):
    """Parse an xref specification and attach the results to 'sens'.

    'txt' must start with an alphabetic xref type designator that is
    separated from the rest of the text by colons and/or spaces.  The
    rest is a list of xref targets separated by commas or spaces.  Each
    target consists of one to three parts separated by U+30FB
    (katakana middle dot): kanji, reading and sense number.

    For every target an Xrslv object is created and the collected
    objects are appended to 'sens._xrslv' (the list is created if
    'sens' has no non-empty '_xrslv' attribute yet).  Nothing is
    returned.

    Raises ParseError if 'txt' or one of its targets is malformed.
    """
    # Following regex is used to allow any xref type designator
    # separated from the xref text by either or both colon or spaces.
    # On a match re.split() yields ['', <type>, <rest>]; otherwise it
    # yields the single-item list [txt].
    p = re.split(r'^(?:([a-zA-Z]+)(?:[: ]+))', txt)
    if len(p) != 3:
        raise ParseError('Xref "%s", bad format' % txt)
    typ, xtxt = p[1:3]
    xtyp = jdb.KW.XREF[typ.lower()].id
    xrsvs = []
    for n, x in enumerate(re.split(r'[, ]', xtxt)):
        if not x: continue
        # str.split() always returns at least one item so only the
        # upper bound needs checking.
        krs = x.split('\u30FB')
        if len(krs) > 3:
            raise ParseError('Xref "%s", bad format' % x)

        # 'krs' has 1, 2, or 3 items.  Using "x" to indicate a non-
        # existent item, the valid arrangements of kanji, reading,
        # and sense number are:
        #   Kxx, KRx, KRS, KSx, Rxx RSx
        # or rephrased in terms of what part of the xref can be in
        # what item:
        #    [0]:KR, [1]:RS, [2]:S

        ktxt = None
        rtxt = None
        tsens = None
        # The index is named 'pos' (not 'n') so that it does not clobber
        # the outer loop's 'n', which is needed below for 'ord'.
        for pos, v in enumerate(krs):
            if pos == 0:  # v is K or R
                if jdb.jstr_reb(v): rtxt = v
                else: ktxt = v
            elif pos == 1:  # v is R or S (if [0] was K) or S (if [0] was R)
                if v.isdigit(): tsens = int(v)
                elif jdb.jstr_reb(v):
                    if rtxt:
                        raise ParseError(
                            'Xref "%s", two reading parts present' % x)
                    rtxt = v
                else:
                    raise ParseError('Xref "%s", two kanji parts present' % x)
            else:  # v is S ([1] must have been R)
                if not v.isdigit():
                    raise ParseError('Xref "%s", "%s" is not a sense number' %
                                     (x, v))
                # Compare against None so that a leading sense number of
                # 0 is still recognized as already present.
                if tsens is not None:
                    raise ParseError('Xref "%s", has two sense numbers' % x)
                tsens = int(v)
        # 'ord' is the 1-based position of this xref in the split list.
        xrsvs.append(
            Xrslv(typ=xtyp, ord=n + 1, ktxt=ktxt, rtxt=rtxt, tsens=tsens))
    if xrsvs:
        if not getattr(sens, '_xrslv', None): sens._xrslv = []
        sens._xrslv.extend(xrsvs)
예제 #2
0
파일: jmxml.py 프로젝트: cobysy/jmdictdb
 def do_rdngs(self, elems, entr, fmap):
     """Build Rdng objects from reading elements and attach them to 'entr'.

     elems -- Iterable of elements, each expected to contain a 'reb'
         child; if None the method returns immediately.
     entr -- Entry object.  Its '_kanj' list (if any) is passed on to
         do_restr(); the resulting readings are stored in 'entr._rdng'
         when at least one was created.
     fmap -- Mapping indexed by the tuples returned by parse_freq();
         item [0] of each value is a list to which the reading is
         added.

     Elements whose 'reb' text duplicates an earlier one are skipped
     with a warning.  Non-kana text only produces a warning.
     """
     if elems is None: return
     kanjs = getattr(entr, '_kanj', [])
     rdngs = []
     dupchk = {}
     for idx, elem in enumerate(elems):
         txt = elem.find('reb').text
         if not jdb.unique(txt, dupchk):
             self.warn("Duplicate reb text: '%s'" % txt)
             continue
         if not jdb.jstr_reb(txt):
             self.warn("reb text '%s' not kana." % txt)
         # The reading number is the element's 1-based position in
         # 'elems', so skipped duplicates leave gaps in the numbering.
         rdng = jdb.Rdng(rdng=idx + 1, txt=txt)
         self.do_kws(elem.findall('re_inf'), rdng, '_inf', 'RINF')
         for x in elem.findall('re_pri'):
             freqtuple = self.parse_freq(x.text, "re_pri")
             if not freqtuple: continue
             rlist = fmap[freqtuple][0]
             if not jdb.isin(rdng, rlist): rlist.append(rdng)
             else: self.freq_warn("Duplicate", rdng, None, x.text)
         nokanji = elem.find('re_nokanji')
         self.do_restr(elem.findall('re_restr'), rdng, kanjs, 'restr',
                       nokanji)
         self.do_audio(elem.findall("audio"), rdng, jdb.Rdngsnd)
         rdngs.append(rdng)
     if rdngs: entr._rdng = rdngs
예제 #3
0
def create_entr(cursor, parsed):
    """Build an Entr object from a parsed wwwjdict submission.

    From the dictionary of wwwjdict submission values in 'parsed'
    this creates the same kind of data that cgi/edform.py creates
    internally to send to the edform.tal template: an Entr object with
    some attached extra data ('ktxt', 'rtxt' and 'stxt' attributes).
    The object is returned to the caller (who will serialize it and
    write it to a file).

    Raises ParseError if an amended entry cannot be read from the
    database.
    """
    if parsed['subtype'] != 'new':  # == 'amend'
        seqnum = parsed['seqnum']
        problems = []
        # FIXME: following assumes seqnum is an entry in jmdict.
        found = jmcgi.get_entrs(cursor,
                                None, [seqnum],
                                problems,
                                active=True,
                                corpus='jmdict')
        if problems:
            print('\n'.join(problems))
        if not found:
            raise ParseError("Unable to get entry seq# %s from database" %
                             seqnum)
        entr = found[0]
    else:
        entr = jdb.Entr()
        entr.src = jdb.KW.SRC['jmdict'].id

    # Sort the headwords into readings and kanji; explicitly given
    # kana always go after the readings found among the headwords.
    kanji_texts = []
    reading_texts = []
    for headword in parsed.get('headw', []):
        bucket = reading_texts if jdb.jstr_reb(headword) else kanji_texts
        bucket.append(headword)
    reading_texts.extend(parsed.get('kana', []))

    ktxt = ';'.join(kanji_texts)
    rtxt = ';'.join(reading_texts)
    stxt = ' / '.join(parsed.get('english', []))
    pos = ','.join(parsed.get('pos', []))
    misc = ','.join(parsed.get('misc', []))
    xref = ','.join(parsed.get('crossref', []))

    #FIXME: Note that including pos, xref. et.al. can break
    # a sense parse that would otherwise be ok.  Maybe if the
    # parse fails, we should try again without this stuff,
    # and if that works, append this stuff as "unparsable"-
    # tagged extra text.
    # However, senses other than the first may have this
    # information embedded in the text and it seems a bit
    # much to try pulling it out...
    leading = []
    if pos:
        leading.append('(' + pos + ')')
    if xref:
        leading.append('(See ' + xref + ')')
    if misc:
        leading.append('(' + misc + ')')
    if leading:
        # A single space separates the tag groups from the gloss text.
        stxt = ''.join(leading) + ' ' + stxt

    #FIXME:  What do about 'date', 'entlangnam' fields?
    # I don't think we care about 'sendNotJS'.

    entr.ktxt, entr.rtxt, entr.stxt = reformat(ktxt, rtxt, stxt, entr)
    return entr
예제 #4
0
 def t_TEXT(self, t):
     '[^;\uFF1B\\[\u3000 \\t\\r\\n\\f]+'
     # NOTE(review): the string above is this rule's token pattern, not
     # documentation (presumably consumed by a PLY-style lexer -- confirm),
     # so its value must not change.  The "[" is written with a doubled
     # backslash because "\[" is an invalid escape sequence in a non-raw
     # string literal; the resulting string is the same.
     #
     # Classify it as kanji, reading (kana), or ordinary
     # text and return token accordingly.
     m = jdb.jstr_classify(t.value)
     if jdb.jstr_reb(m):
         t.type = 'RTEXT'
     elif not jdb.jstr_gloss(m):
         t.type = 'KTEXT'
     # Otherwise (gloss text) the token type is left as it was.
     return t
예제 #5
0
 def t_TAGLIST_TEXT(self, t):
     '[^;\uFF1B:=,\u3001\\/\\.\\#\uFF0F\u30FB\\[\\] \\t\\r\\n\\f]+'
     # NOTE(review): the string above is this rule's token pattern, not
     # documentation (presumably consumed by a PLY-style lexer -- confirm).
     #
     # The token text is cleaned up with qcleanup() and then classified:
     # kana text becomes an RTEXT token, text that is neither kana nor
     # gloss text becomes a KTEXT token, and gloss text keeps the type
     # the token already had.
     t.value = qcleanup(t.value)
     classification = jdb.jstr_classify(t.value)
     if jdb.jstr_reb(classification):
         t.type = 'RTEXT'
     elif not jdb.jstr_gloss(classification):
         t.type = 'KTEXT'
     return t
예제 #6
0
def mkentr(jtxt, etxt):
    """Return an Entr representing the "A" line of an example sentence.

    jtxt -- Japanese text; stored as the entry's single reading if
        jdb.jstr_reb() accepts it, otherwise as its single kanji.
    etxt -- Text stored as the only gloss of the entry's only sense.

    The value of the module global 'Lnnum' is recorded (as a string)
    in the entry's 'srcnote'.
    """
    global Lnnum
    entry = jdb.Entr(stat=KW.STAT_A, unap=False)
    entry.srcnote = str(Lnnum)
    if jdb.jstr_reb(jtxt):
        entry._rdng = [jdb.Rdng(txt=jtxt)]
    else:
        entry._kanj = [jdb.Kanj(txt=jtxt)]
    gloss = jdb.Gloss(txt=etxt, ginf=KW.GINF_equ, lang=KW.LANG_eng)
    entry._sens = [jdb.Sens(_gloss=[gloss])]
    return entry
예제 #7
0
파일: exparse.py 프로젝트: cobysy/jmdictdb
def mkentr (jtxt, etxt, kwds):
        """Return an Entr representing the "A" line of an example sentence.

        jtxt -- Japanese text; stored as the entry's single reading if
            jdb.jstr_reb() accepts it, otherwise as its single kanji.
        etxt -- Text stored as the only gloss of the entry's only sense.
        kwds -- Sequence of items, each consisting of a kw id number
            optionally followed by a note string.  The ids become the
            sense's Misc items; the notes, joined with "; ", become
            the sense's note (None if there are no notes).

        The value of the module global 'Lnnum' is recorded (as a
        string) in the entry's 'srcnote'.
        """
        global Lnnum
        entry = jdb.Entr (stat=KW.STAT_A, unap=False)
        entry.srcnote = str (Lnnum)

        kw_ids = [item[0] for item in kwds]
        note_text = "; ".join (item[1] for item in kwds if len (item) > 1)
        if not note_text:
            note_text = None

        if jdb.jstr_reb (jtxt):
            entry._rdng = [jdb.Rdng (txt=jtxt)]
        else:
            entry._kanj = [jdb.Kanj (txt=jtxt)]

        gloss = jdb.Gloss (lang=KW.LANG_eng, ginf=KW.GINF_equ, txt=etxt)
        miscs = [jdb.Misc (kw=kw_id) for kw_id in kw_ids]
        entry._sens = [jdb.Sens (notes=note_text, _gloss=[gloss], _misc=miscs)]
        return entry
예제 #8
0
def check_for_warnings(cur, entr, parent_seq, chklist):
    """Fill 'chklist' with cautions about the entry 'entr'.

    cur -- Passed through to find_similar().
    entr -- Entry object to check.
    parent_seq -- Used by find_similar() to exclude other entries with
        the same seq# from being flagged as having duplicate kanji or
        readings.
    chklist -- Dict that is updated in place.  Keys that may be set:
        'dups'    -- result of find_similar(), if true.
        'norebs'  -- True if a jmdict entry has no readings.
        'invkebs' -- comma separated kanji texts rejected by
                     jdb.jstr_keb().
        'invrebs' -- comma separated reading texts rejected by
                     jdb.jstr_reb().
        'nopos'   -- comma separated numbers of the senses of a jmdict
                     entry that have no part-of-speech.
        'jpgloss' -- glosses containing characters in the range
                     U+FF01 - U+FF5D, with the offending characters.
        Every key with a false value (including any that were already
        in 'chklist' on entry) is removed before returning.

    Nothing is returned.
    """
    kanjs = getattr(entr, '_kanj', [])
    rdngs = getattr(entr, '_rdng', [])
    senses = getattr(entr, '_sens', [])

    # Look for other entries that have the same kanji or reading.
    # These will be shown as cautions at the top of the confirmation
    # form in hopes of reducing submissions of words already in
    # the database.
    dups = find_similar(cur, kanjs, rdngs, entr.src, parent_seq)
    if dups: chklist['dups'] = dups

    is_jmdict = entr.src == jdb.KW.SRC['jmdict'].id

    # FIXME: IS-190.
    if not rdngs and is_jmdict:
        chklist['norebs'] = True

    # FIXME: Should pass list of the kanj/rdng text rather than
    #   a pre-joined string so that page can present the list as
    #   it wishes.
    chklist['invkebs'] = ", ".join(k.txt for k in kanjs
                                   if not jdb.jstr_keb(k.txt))
    chklist['invrebs'] = ", ".join(r.txt for r in rdngs
                                   if not jdb.jstr_reb(r.txt))
    # FIXME: IS-190.
    if is_jmdict:
        # Use getattr() like the other attribute accesses here so that
        # a sense without any '_pos' attribute is reported rather than
        # aborting the check.
        chklist['nopos'] = ", ".join(
            str(n + 1) for n, x in enumerate(senses)
            if not getattr(x, '_pos', None))

    # Change text in edconf.tal if charset changed.
    fullwidth = re.compile('[\uFF01-\uFF5D]')
    jpgloss = []
    for n, s in enumerate(senses):
        for m, g in enumerate(getattr(s, '_gloss', [])):
            # Scan each gloss once and reuse the result for both the
            # test and the message.
            found = fullwidth.findall(g.txt)
            if found:
                jpgloss.append("%d.%d: %s" % (
                    n + 1, m + 1, '"' + '", "'.join(found) + '"'))
    chklist['jpgloss'] = ", ".join(jpgloss)

    # Remove any empty warnings so that if there are no warnings,
    # 'chklist' itself will be empty and no warning span element
    # will be produced by the template (which otherwise will
    # contain a <hr/> even if there are no other warnings.)
    for k in list(chklist):
        if not chklist[k]: del chklist[k]
예제 #9
0
파일: exparse.py 프로젝트: cobysy/jmdictdb
def parsebitem (s, n, jtxt):
        """Parse one item of an example sentence "B" line.

        s -- Text of the item.  Its form is a kanji or kana text
            optionally followed, in this order, by "(reading)", any
            number of "[sense-number]" groups, "{text-in-A-line}" and
            a "~" marker.
        n -- Number of the item; used only in error messages.
        jtxt -- Text of the "A" line; any "{...}" text must occur in it.

        Returns a 5-tuple (ktxt, rtxt, sens, atxt, prio) where
          ktxt -- kanji text, or None if the item's main text is kana
              only (the text is then returned as 'rtxt').
          rtxt -- reading text or None.
          sens -- list of all the sense numbers (as digit strings) in
              the order given, or None if there were none.
          atxt -- the text between the braces or None.
          prio -- True if the "~" marker was present, else False.

        Raises ParseError if the item does not match the expected form,
        the reading is not kana, both texts are kana, or the "{...}"
        text does not occur in 'jtxt'.
        """
          # The sense numbers are captured as one group around the whole
          # run of "[n]" items.  A repeated capturing group "(\[\d+\])*"
          # would retain only its last repetition and silently drop all
          # but the last sense number.
        mo = re.search (r'^([^([{]+)(\((\S+)\))?((?:\[\d+\])*)(\{(\S+)\})?(~)?\s*$', s)
        if not mo:
            raise ParseError ("\"B\" line parse error in item %d: '%s'" % (n, s))

        ktxt,rtxt,sens,atxt,prio = mo.group (1,3,4,6,7)

        if rtxt and not jdb.jstr_reb (rtxt):
            raise ParseError ("Expected kana in item %d: '%s'" % (n, rtxt))
        if kana_only (ktxt):
            if rtxt: raise ParseError ("Double kana in item %d: '%s', '%s'" % (n, ktxt, rtxt))
            rtxt = ktxt;  ktxt = None

          # 'sens' is "" when the item has no "[n]" groups; callers get
          # None in that case.
        sens = re.findall (r'\d+', sens) or None

        if atxt and atxt not in jtxt:
            raise ParseError ("{%s} not in A line in item %d" % (atxt, n))
        return ktxt, rtxt, sens, atxt, bool (prio)
예제 #10
0
def parse_stags(tag, sens, kanjs, rdngs):
    """Apply the sense restrictions named in 'tag' to 'sens'.

    tag -- Comma separated list of reading and/or kanji texts.
    sens -- Sense object handed to jdb.txt2restr() together with the
        '_stagr' / '_stagk' attribute names.
    kanjs, rdngs -- The kanji and reading lists the texts are resolved
        against by jdb.txt2restr().

    Raises ParseError if a word is neither a reading nor a kanji text,
    or if jdb.txt2restr() reports texts it could not match.  Readings
    are processed (and reported) before kanji.
    """
    reading_words = []
    kanji_words = []
    for raw in tag.split(','):
        word = raw.strip()
        if jdb.jstr_reb(word):
            reading_words.append(word)
        elif jdb.jstr_keb(word):
            kanji_words.append(word)
        else:
            raise ParseError(
                'stagx restriction word neither reading or kanji: "%s"' % word)

    passes = (
        (reading_words, rdngs, '_stagr', 'Stagr text not in readings: "%s"'),
        (kanji_words, kanjs, '_stagk', 'Stagk text not in kanji: "%s"'),
    )
    for words, targets, attr, message in passes:
        unmatched = []
        jdb.txt2restr(words, sens, targets, attr, bad=unmatched)
        if unmatched:
            raise ParseError(message % '","'.join(unmatched))
예제 #11
0
 def check (_, expected, testtext):
     """Assert that jdb.jstr_reb(testtext) equals 'expected'.

     '_' is the instance the method is called on; its assertEqual()
     method performs the comparison.
     """
     actual = jdb.jstr_reb (testtext)
     _.assertEqual (actual, expected)
예제 #12
0
def kana_only(txt):
    """Return a true value if 'txt' has the KANA flag but not the KANJI flag.

    The result is the falsy value of the KANA test itself when that
    flag is not set, otherwise a bool.
    """
    # NOTE(review): assumes jdb.jstr_reb() returns a value that can be
    # bit-tested against jdb.KANA and jdb.KANJI -- TODO confirm.
    flags = jdb.jstr_reb(txt)
    kana_bits = flags & jdb.KANA
    if not kana_bits:
        return kana_bits
    return not (flags & jdb.KANJI)