def parse_xrefs(txt, sens):
    """Parse an xref specification string and attach the results to `sens`.

    `txt` must start with an alphabetic xref type designator followed by
    a colon and/or spaces, then one or more xrefs separated by commas
    or spaces.  Each xref consists of one to three parts joined by
    U+30FB (katakana middle dot): kanji, reading and sense number, in
    one of the arrangements described in the comment inside the loop.

    One `Xrslv` object is built per xref and the collected objects are
    appended to `sens._xrslv` (the list is created when it is missing
    or empty).  The `ord` of each object is the 1-based position of its
    xref within the separator-split text; empty fragments produced by
    adjacent separators are skipped but still counted.

    Raises ParseError when `txt` or one of its xrefs is malformed.
    NOTE(review): an unrecognised type designator fails inside the
    `jdb.KW.XREF` lookup, presumably with a KeyError -- confirm.
    """
    # Following regex is used to allow any xref type designator
    # separated from the xref text by either or both colon or spaces.
    pieces = re.split(r'^(?:([a-zA-Z]+)(?:[: ]+))', txt)
    if len(pieces) != 3:
        raise ParseError('Xref "%s", bad format' % txt)
    typ, xtxt = pieces[1:3]
    xtyp = jdb.KW.XREF[typ.lower()].id

    xrsvs = []
    for n, x in enumerate(re.split(r'[, ]', xtxt)):
        if not x:
            continue
        krs = x.split('\u30FB')
        if len(krs) > 3 or len(krs) == 0:
            raise ParseError('Xref "%s", bad format' % x)

        # 'krs' has 1, 2, or 3 items.  Using "x" to indicate a non-
        # existent item, the valid arrangements of kanji, reading,
        # and sense number are:
        #   Kxx, KRx, KRS, KSx, Rxx RSx
        # or rephrased in terms of what part of the xref can be in
        # what item:
        #   [0]:KR, [1]:RS, [2]:S
        ktxt = None
        rtxt = None
        tsens = None
        # The part index gets its own name: reusing 'n' here would
        # clobber the xref position that 'ord' is computed from below.
        for part_idx, v in enumerate(krs):
            if part_idx == 0:
                # v is K or R
                if jdb.jstr_reb(v):
                    rtxt = v
                else:
                    ktxt = v
            elif part_idx == 1:
                # v is R or S (if item 0 was K) or S (if item 0 was R)
                if v.isdigit():
                    tsens = int(v)
                elif jdb.jstr_reb(v):
                    if rtxt:
                        raise ParseError(
                            'Xref "%s", two reading parts present' % x)
                    rtxt = v
                else:
                    raise ParseError(
                        'Xref "%s", two kanji parts present' % x)
            else:
                # v is S (item 1 must have been R)
                if not v.isdigit():
                    raise ParseError(
                        'Xref "%s", "%s" is not a sense number' % (x, v))
                # Compare against None so that an earlier sense number
                # of 0 is still recognised as already present.
                if tsens is not None:
                    raise ParseError(
                        'Xref "%s", has two sense numbers' % x)
                tsens = int(v)

        # Every malformed arrangement raised above, so reaching this
        # point means the xref parsed cleanly.
        xrsvs.append(
            Xrslv(typ=xtyp, ord=n + 1, ktxt=ktxt, rtxt=rtxt, tsens=tsens))

    if xrsvs:
        if not getattr(sens, '_xrslv', None):
            sens._xrslv = []
        sens._xrslv.extend(xrsvs)
def do_rdngs(self, elems, entr, fmap):
    """Build reading objects from `elems` and store them on `entr`.

    elems -- iterable of reading elements, or None (nothing is done).
        Each element must support find()/findall() and contain a
        'reb' child whose text is the reading.
    entr -- entry object; its '_kanj' list (if any) is passed to
        do_restr() and, when at least one reading was built, its
        '_rdng' attribute is replaced with the new list.
    fmap -- mapping indexed by the tuple returned from parse_freq();
        item [0] of each value is a list that collects the readings
        carrying that frequency tag.

    Readings with text already seen in this call are reported through
    self.warn() and skipped.  Non-kana text is reported too, but the
    reading is still kept.
    """
    if elems is None:
        return
    kanjs = getattr(entr, '_kanj', [])
    rdngs = []
    dupchk = {}
    for idx, elem in enumerate(elems):
        txt = elem.find('reb').text
        if not jdb.unique(txt, dupchk):
            self.warn("Duplicate reb text: '%s'" % txt)
            continue
        if not jdb.jstr_reb(txt):
            self.warn("reb text '%s' not kana." % txt)
        # The reading number follows the element's position in 'elems',
        # so skipped duplicates leave gaps in the numbering.
        rdng = jdb.Rdng(rdng=idx + 1, txt=txt)
        self.do_kws(elem.findall('re_inf'), rdng, '_inf', 'RINF')
        for pri in elem.findall('re_pri'):
            freqtuple = self.parse_freq(pri.text, "re_pri")
            if not freqtuple:
                continue
            rlist = fmap[freqtuple][0]
            if jdb.isin(rdng, rlist):
                self.freq_warn("Duplicate", rdng, None, pri.text)
            else:
                rlist.append(rdng)
        nokanji = elem.find('re_nokanji')
        self.do_restr(elem.findall('re_restr'), rdng, kanjs, 'restr', nokanji)
        self.do_audio(elem.findall("audio"), rdng, jdb.Rdngsnd)
        rdngs.append(rdng)
    if rdngs:
        entr._rdng = rdngs
def create_entr(cursor, parsed):
    """Turn a parsed wwwjdic submission into an Entr object.

    From the dictionary of wwwjdict submission values in 'parsed' we
    create the same kind of data that cgi/edform.py creates internally
    to send to the edform.tal template: an Entr object with some
    attached extra data (the 'ktxt', 'rtxt' and 'stxt' attributes).
    This object is returned to the caller (who will serialize it and
    write it to a file).

    Raises ParseError when an amended entry cannot be read from the
    database.
    """
    if parsed['subtype'] == 'new':
        entr = jdb.Entr()
        entr.src = jdb.KW.SRC['jmdict'].id
    else:
        # subtype == 'amend'
        seqnum = parsed['seqnum']
        errs = []
        # FIXME: following assumes seqnum is an entry in jmdict.
        found = jmcgi.get_entrs(cursor, None, [seqnum], errs,
                                active=True, corpus='jmdict')
        if errs:
            print('\n'.join(errs))
        if not found:
            raise ParseError("Unable to get entry seq# %s from database" % seqnum)
        entr = found[0]

    # Headwords are sorted into readings and kanji; explicitly given
    # kana are then added after the headword readings.
    kanj, rdng = [], []
    for headword in parsed.get('headw', []):
        (rdng if jdb.jstr_reb(headword) else kanj).append(headword)
    rdng.extend(parsed.get('kana', []))

    pos = ','.join(parsed.get('pos', []))
    misc = ','.join(parsed.get('misc', []))
    xref = ','.join(parsed.get('crossref', []))

    #FIXME: Note that including pos, xref. et.al. can break
    #  a sense parse that would otherwise be ok.  Maybe if the
    #  parse fails, we should try again without this stuff,
    #  and if that works, append this stuff as "unparsable"-
    #  tagged extra text.
    #  However, senses other than the first may have this
    #  information embedded in the text and it seems a bit
    #  much to try pulling it out...
    prefix = []
    if pos:
        prefix.append('(' + pos + ')')
    if xref:
        prefix.append('(See ' + xref + ')')
    if misc:
        prefix.append('(' + misc + ')')
    if prefix:
        # A single space separates the tag prefix from the glosses.
        prefix.append(' ')
    stxt = ''.join(prefix) + ' / '.join(parsed.get('english', []))

    #FIXME: What do about 'date', 'entlangnam' fields?
    #  I don't think we care about 'sendNotJS'.
    entr.ktxt, entr.rtxt, entr.stxt = reformat(
        ';'.join(kanj), ';'.join(rdng), stxt, entr)
    return entr
def t_TEXT(self, t):
    '[^;\uFF1B\\[\u3000 \\t\\r\\n\\f]+'
    # NOTE: the string above is the token's regular expression, not
    # prose documentation (presumably consumed by a PLY-style lexer),
    # so it must stay the first statement and must not be reworded.
    # Every backslash meant for the regex engine is doubled so the
    # literal contains no invalid string escapes.
    #
    # Classify the matched text as kanji, reading (kana), or ordinary
    # text and return the token accordingly.  Gloss text keeps the
    # token type it arrived with.
    classification = jdb.jstr_classify(t.value)
    if jdb.jstr_reb(classification):
        t.type = 'RTEXT'
    elif not jdb.jstr_gloss(classification):
        t.type = 'KTEXT'
    return t
def t_TAGLIST_TEXT(self, t):
    '[^;\uFF1B:=,\u3001\\/\\.\\#\uFF0F\u30FB\\[\\] \\t\\r\\n\\f]+'
    # NOTE: the string above is the token's regular expression, not
    # prose documentation (presumably consumed by a PLY-style lexer),
    # so it must stay the first statement and must not be reworded.
    #
    # The matched text is first passed through qcleanup(); the cleaned
    # value is then classified as kanji, reading (kana), or ordinary
    # text and the token type set accordingly.  Gloss text keeps the
    # token type it arrived with.
    t.value = qcleanup(t.value)
    classification = jdb.jstr_classify(t.value)
    if jdb.jstr_reb(classification):
        t.type = 'RTEXT'
    elif not jdb.jstr_gloss(classification):
        t.type = 'KTEXT'
    return t
def mkentr(jtxt, etxt):
    """Create an entry object for the "A" line of an example sentence.

    jtxt -- the Japanese text; stored as the entry's single reading
        when jdb.jstr_reb() accepts it, otherwise as its single kanji.
    etxt -- the English text; stored as the only gloss of the only
        sense.

    The entry's 'srcnote' is the string form of the module global
    'Lnnum' (presumably the current input line number -- confirm).
    """
    global Lnnum
    entry = jdb.Entr(stat=KW.STAT_A, unap=False)
    entry.srcnote = str(Lnnum)
    if jdb.jstr_reb(jtxt):
        entry._rdng = [jdb.Rdng(txt=jtxt)]
    else:
        entry._kanj = [jdb.Kanj(txt=jtxt)]
    gloss = jdb.Gloss(txt=etxt, ginf=KW.GINF_equ, lang=KW.LANG_eng)
    entry._sens = [jdb.Sens(_gloss=[gloss])]
    return entry
def mkentr (jtxt, etxt, kwds):
    """Create an entry object for the "A" line of an example sentence.

    jtxt -- the Japanese text; stored as the entry's single reading
        when jdb.jstr_reb() accepts it, otherwise as its single kanji.
    etxt -- the English text; stored as the only gloss of the only
        sense.
    kwds -- sequence of items, each consisting of a kw id number and
        optionally a note string.  Every id becomes a Misc object on
        the sense; the notes are joined with "; " into the sense's
        'notes' (None when there are no notes).

    The entry's 'srcnote' is the string form of the module global
    'Lnnum' (presumably the current input line number -- confirm).
    """
    global Lnnum
    entry = jdb.Entr(stat=KW.STAT_A, unap=False)
    entry.srcnote = str(Lnnum)

    kw_ids = [item[0] for item in kwds]
    joined_notes = "; ".join([item[1] for item in kwds if len(item) > 1])
    sens_note = joined_notes if joined_notes else None

    if jdb.jstr_reb(jtxt):
        entry._rdng = [jdb.Rdng(txt=jtxt)]
    else:
        entry._kanj = [jdb.Kanj(txt=jtxt)]

    gloss = jdb.Gloss(lang=KW.LANG_eng, ginf=KW.GINF_equ, txt=etxt)
    miscs = [jdb.Misc(kw=kw_id) for kw_id in kw_ids]
    entry._sens = [jdb.Sens(notes=sens_note, _gloss=[gloss], _misc=miscs)]
    return entry
def check_for_warnings(cur, entr, parent_seq, chklist):
    """Fill 'chklist' with the warnings that apply to 'entr'.

    'chklist' is a dict that is modified in place; nothing is returned.
    Keys that may be set:
      dups    -- result of find_similar(): other entries that have the
                 same kanji or reading.  These will be shown as cautions
                 at the top of the confirmation form in hopes of
                 reducing submissions of words already in the database.
      norebs  -- True for a jmdict entry without readings.
      invkebs -- kanji texts rejected by jdb.jstr_keb().
      invrebs -- reading texts rejected by jdb.jstr_reb().
      nopos   -- 1-based numbers of jmdict senses with no '_pos'.
      jpgloss -- characters in the range U+FF01..U+FF5D found in
                 glosses, labelled "sense.gloss".

    'parent_seq' is used by find_similar() to exclude other entries
    with the same seq# from being flagged as having duplicate kanji
    or readings.
    """
    kanjs = getattr(entr, '_kanj', [])
    rdngs = getattr(entr, '_rdng', [])
    dups = find_similar(cur, kanjs, rdngs, entr.src, parent_seq)
    if dups:
        chklist['dups'] = dups

    # FIXME: IS-190.
    if not getattr(entr, '_rdng', None):
        if entr.src == jdb.KW.SRC['jmdict'].id:
            chklist['norebs'] = True

    # FIXME: Should pass list of the kanj/rdng text rather than
    #  a pre-joined string so that page can present the list as
    #  it wishes.
    bad_kanji = [k.txt for k in getattr(entr, '_kanj', [])
                 if not jdb.jstr_keb(k.txt)]
    chklist['invkebs'] = ", ".join(bad_kanji)
    bad_readings = [r.txt for r in getattr(entr, '_rdng', [])
                    if not jdb.jstr_reb(r.txt)]
    chklist['invrebs'] = ", ".join(bad_readings)

    # FIXME: IS-190.
    if entr.src == jdb.KW.SRC['jmdict'].id:
        posless = [str(num)
                   for num, sens in enumerate(getattr(entr, '_sens', []), 1)
                   if not sens._pos]
        chklist['nopos'] = ", ".join(posless)

    # Change text in edconf.tal if charset changed.
    flagged = []
    for sens_num, sens in enumerate(getattr(entr, '_sens', []), 1):
        for gloss_num, gloss in enumerate(getattr(sens, '_gloss', []), 1):
            hits = re.findall('[\uFF01-\uFF5D]', gloss.txt)
            if not hits:
                continue
            quoted = '"' + '", "'.join(hits) + '"'
            flagged.append("%d.%d: %s" % (sens_num, gloss_num, quoted))
    chklist['jpgloss'] = ", ".join(flagged)

    # Remove any empty warnings so that if there are no warnings,
    # 'chklist' itself will be empty and no warning span element
    # will be produced by the template (which otherwise will
    # contain a <hr/> even if there are no other warnings.)
    for key in [k for k in chklist.keys() if not chklist[k]]:
        del chklist[key]
def parsebitem (s, n, jtxt):
    """Parse one item of a "B" line into its components.

    The expected item layout, as encoded in the regex below, is
        text(reading)[sense]...{atext}~
    where every part after the leading text is optional and "[sense]"
    (digits only) may be repeated.

    s    -- the item text.
    n    -- item number, used only in error messages.
    jtxt -- text of the "A" line; a "{atext}" part must occur in it.

    Returns the tuple (ktxt, rtxt, sens, atxt, prio):
      ktxt -- leading text, or None when that text is kana only (it
              is then returned as 'rtxt' instead).
      rtxt -- the reading, or None.
      sens -- list of the sense number strings from all "[n]" parts,
              or None when there are none.
      atxt -- text between the braces, or None.
      prio -- True when the item ends with "~", else False.

    Raises ParseError when the item does not match the layout, when
    the parenthesised reading is not kana, when both the leading text
    and the reading are kana, or when 'atxt' is not found in 'jtxt'.
    """
    # The "[n]" parts are matched by a non-capturing group repeated
    # inside a single capturing group: a repeated capturing group only
    # retains its last repetition, which would drop all but the final
    # sense number of e.g. "[1][2]".
    mo = re.search(
        r'^([^([{]+)(\((\S+)\))?((?:\[\d+\])*)(\{(\S+)\})?(~)?\s*$', s)
    if not mo:
        raise ParseError ("\"B\" line parse error in item %d: '%s'" % (n, s))
    ktxt, rtxt, sens, atxt, prio = mo.group(1, 3, 4, 6, 7)

    if rtxt and not jdb.jstr_reb(rtxt):
        raise ParseError ("Expected kana in item %d: '%s'" % (n, rtxt))
    if kana_only(ktxt):
        if rtxt:
            raise ParseError ("Double kana in item %d: '%s', '%s'" % (n, ktxt, rtxt))
        rtxt, ktxt = ktxt, None

    if sens:
        # "[1][2]" -> "[1[2" -> ["", "1", "2"] -> ["1", "2"]
        sens = [num for num in sens.replace(']', '').split('[') if num]
    else:
        # The group matches the empty string when no "[n]" part is
        # present; report that as None.
        sens = None

    if atxt and jtxt.find(atxt) < 0:
        raise ParseError ("{%s} not in A line in item %d" % (atxt, n))
    return ktxt, rtxt, sens, atxt, bool(prio)
def parse_stags(tag, sens, kanjs, rdngs):
    """Apply the restriction words listed in 'tag' to 'sens'.

    tag   -- comma-separated words; surrounding whitespace is ignored.
    sens  -- sense object handed to jdb.txt2restr().
    kanjs -- the kanji items the kanji words are checked against.
    rdngs -- the reading items the reading words are checked against.

    Words accepted by jdb.jstr_reb() are applied via jdb.txt2restr()
    under '_stagr' against 'rdngs'; words accepted by jdb.jstr_keb()
    under '_stagk' against 'kanjs'.

    Raises ParseError for a word that is neither reading nor kanji,
    or when jdb.txt2restr() reports words it could not match.
    """
    reading_words = []
    kanji_words = []
    for word in (piece.strip() for piece in tag.split(',')):
        if jdb.jstr_reb(word):
            reading_words.append(word)
            continue
        if jdb.jstr_keb(word):
            kanji_words.append(word)
            continue
        raise ParseError(
            'stagx restriction word neither reading or kanji: "%s"' % word)

    # Readings are processed before kanji; the first failure raises.
    passes = (
        (reading_words, rdngs, '_stagr', 'Stagr text not in readings: "%s"'),
        (kanji_words, kanjs, '_stagk', 'Stagk text not in kanji: "%s"'),
    )
    for words, items, attr, message in passes:
        unmatched = []
        jdb.txt2restr(words, sens, items, attr, bad=unmatched)
        if unmatched:
            raise ParseError(message % '","'.join(unmatched))
def check (_, expected, testtext):
    # Test helper: assert that jdb.jstr_reb() returns 'expected' for
    # 'testtext'.  '_' is the object providing assertEqual()
    # (presumably the unittest.TestCase instance -- confirm).
    _.assertEqual (jdb.jstr_reb (testtext), expected)
def kana_only(txt):
    """Return a true value when 'txt' has the KANA bit but not KANJI.

    The result is the falsy value of `bits & jdb.KANA` when the KANA
    bit is clear, otherwise a bool telling whether the KANJI bit is
    clear as well.

    NOTE(review): this treats the value returned by jdb.jstr_reb() as
    a bit mask comparable with jdb.KANA / jdb.KANJI -- confirm that
    jstr_reb() returns classification bits here and not a plain bool.
    """
    bits = jdb.jstr_reb(txt)
    kana_bits = bits & jdb.KANA
    if not kana_bits:
        return kana_bits
    return not (bits & jdb.KANJI)