def p_tagitem_10(p):
    '''tagitem : TEXT EQL jrefs'''
    # NOTE: the docstring above is the grammar production for this rule and
    # must stay exactly as written; documentation lives in comments instead.
    #
    # Handles a "tag=jrefs" item.  p[1] is the tag text and p[3] is an
    # iterable of jrefs, each a 4-sequence (dotlist, slist, seq, corpus).
    # p[0] is set to a list holding one entry per jref:
    #   ['XREF', tag, rtxt, ktxt, slist, seq, corpus]  if tag is an XREF keyword
    #   ['RESTR', rtxt, ktxt]                          if tag is 'restr'
    #   the result of tag_eql_text (p, tag, txt)       otherwise
    # where ktxt is the text for which jdb.jstr_keb() is true (kanji) and
    # rtxt is the other (reading) text; either may be None.
    tag = p[1]
    taglist = []

    # 'tag' does not change inside the loop, so decide once whether it names
    # an xref type rather than rebuilding the keyword list for every jref.
    #FIXME: instead of using XREF kw's directly, do we want to
    #  change to an lsrc syntax like, "xref=cf:..."
    #  (possibly keeping "see" and "ant" as direct keywords)?
    is_xref = any(tag == x.kw for x in jdb.KW.recs('XREF'))

    for dotlist, slist, seq, corpus in p[3]:
        if is_xref:
            nparts = len(dotlist)
            if nparts == 0:
                rtxt = ktxt = None
            elif nparts == 1:
                # A lone string is either the kanji or the reading.
                if jdb.jstr_keb(dotlist[0]):
                    rtxt, ktxt = None, dotlist[0]
                else:
                    rtxt, ktxt = dotlist[0], None
            elif nparts == 2:
                # Two strings are taken as kanji first, reading second.
                ktxt, rtxt = dotlist[0], dotlist[1]
            else:
                perror("No more than one kanji and one reading string "
                       "can be given in an xref.")
                # Presumably perror() raises; if it ever returns, skip
                # this jref without adding anything, as before.
                continue
            taglist.append(['XREF', tag, rtxt, ktxt, slist, seq, corpus])
            continue

        # The full 'jref' syntax is only used by xrefs (above)
        # so if we get here, complain if the 'jref' item has
        # any xref-specific elements.
        if seq or corpus or slist:
            perror("Seq number, corpus, or a sense list can only be given with xref tags")

        # Xrefs are also the only construct that uses the middot character
        # syntactically.  Since we don't have an xref, then the middots are
        # just characters in the text, so put the original text string back
        # together.
        txt = u'\u30FB'.join(dotlist)
        if tag == 'restr':
            if jdb.jstr_keb(txt):
                taglist.append(['RESTR', None, txt])
            else:
                taglist.append(['RESTR', txt, None])
        else:
            # This must be a tag=QTEXT construct.
            taglist.append(tag_eql_text(p, tag, txt))

    p[0] = taglist
def p_tagitem_4(p):
    '''tagitem : QTEXT'''
    # NOTE: the docstring above is the grammar production for this rule and
    # must stay exactly as written; documentation lives in comments instead.
    #
    # A bare QTEXT item becomes a single RESTR entry.  p[0] is set to a
    # one-element list: [['RESTR', None, txt]] when jdb.jstr_keb (txt) is
    # true, otherwise [['RESTR', txt, None]].

    # Drop the first and last characters of the token (presumably the
    # enclosing quote marks) before cleaning it up.
    #FIXME: why isn't a QTEXT already cleaned up by jellex?
    unquoted = p[1][1:-1]
    text = jellex.qcleanup(unquoted)

    #FIXME: we should check for ascii text here and treat
    #  that as TEXT above.
    is_kanji = jdb.jstr_keb(text)
    restr = ['RESTR', None, text] if is_kanji else ['RESTR', text, None]
    p[0] = [restr]
def parse_kitem(ktxt, tags, fmap):
    """Build a Kanj object from kanji text and its attached tags.

    ktxt -- Kanji text; must satisfy jdb.jstr_keb().
    tags -- Iterable of tag values; falsy items are skipped.  Each
        remaining tag is resolved with lookup_tag() restricted to the
        'KINF' and 'FREQ' types.
    fmap -- Mapping updated in place for FREQ tags: the new Kanj is
        appended to fmap[<lookup result minus its first item>][1].

    Returns the new Kanj.  Raises ParseError when 'ktxt' is not kanji
    or when a tag cannot be resolved.
    """
    if not jdb.jstr_keb(ktxt):
        raise ParseError('Kanji field not kanji: "%s".' % ktxt)

    kanj = Kanj(txt=ktxt)
    for tag in tags:
        if not tag:
            continue
        found = lookup_tag(tag, ['KINF', 'FREQ'])
        if not found:
            raise ParseError('Unknown tag "%s" on kanji "%s"' % (tag, ktxt))

        # NOTE(review): the exact shape of lookup_tag()'s result is not
        # visible here; its first item is unpacked as (type, value).
        tagtyp, tagval = found[0]
        if tagtyp == 'KINF':
            kanj._inf.append(Kinf(kw=tagval))
        elif tagtyp == 'FREQ':
            fmap[found[1:]][1].append(kanj)
    return kanj
def check_for_warnings(cur, entr, parent_seq, chklist):
    """Add caution items for entry 'entr' to the dict 'chklist' (in place).

    Keys that may be set (any whose value ends up empty are removed again
    before returning, so 'chklist' only keeps real warnings):
      'dups'    -- result of find_similar(): other entries that have the
                   same kanji or reading.
      'norebs'  -- True if a jmdict entry has no readings.
      'invkebs' -- comma-separated kanji texts failing jdb.jstr_keb().
      'invrebs' -- comma-separated reading texts failing jdb.jstr_reb().
      'nopos'   -- (jmdict only) 1-based numbers of senses with no _pos.
      'jpgloss' -- "sense.gloss: chars" items for glosses that contain
                   characters in the range U+FF01..U+FF5D.

    'parent_seq' is used by find_similar() to exclude other entries
    with the same seq# from being flagged as having duplicate kanji
    or readings.
    """
    def items(obj, attr):
        # Optional list attribute; a missing one counts as empty.
        return getattr(obj, attr, [])

    # Look for other entries that have the same kanji or reading.
    # These will be shown as cautions at the top of the confirmation
    # form in hopes of reducing submissions of words already in
    # the database.
    dups = find_similar(cur, items(entr, '_kanj'), items(entr, '_rdng'),
                        entr.src, parent_seq)
    if dups:
        chklist['dups'] = dups

    # FIXME: IS-190.
    if not items(entr, '_rdng') and entr.src == jdb.KW.SRC['jmdict'].id:
        chklist['norebs'] = True

    # FIXME: Should pass list of the kanj/rdng text rather than
    #  a pre-joined string so that page can present the list as
    #  it wishes.
    bad_kebs = [k.txt for k in items(entr, '_kanj') if not jdb.jstr_keb(k.txt)]
    chklist['invkebs'] = ", ".join(bad_kebs)
    bad_rebs = [r.txt for r in items(entr, '_rdng') if not jdb.jstr_reb(r.txt)]
    chklist['invrebs'] = ", ".join(bad_rebs)

    # FIXME: IS-190.
    if entr.src == jdb.KW.SRC['jmdict'].id:
        posless = [str(num)
                   for num, sens in enumerate(items(entr, '_sens'), start=1)
                   if not sens._pos]
        chklist['nopos'] = ", ".join(posless)

    # Change text in edconf.tal if charset changed.
    charset = '[\uFF01-\uFF5D]'
    flagged = []
    for snum, sens in enumerate(items(entr, '_sens'), start=1):
        for gnum, gloss in enumerate(items(sens, '_gloss'), start=1):
            hits = re.findall(charset, gloss.txt)
            if hits:
                quoted = '"' + '", "'.join(hits) + '"'
                flagged.append("%d.%d: %s" % (snum, gnum, quoted))
    chklist['jpgloss'] = ", ".join(flagged)

    # Remove any empty warnings so that if there are no warnings,
    # 'chklist' itself will be empty and no warning span element
    # will be produced by the template (which otherwise will
    # contain a <hr/> even if there are no other warnings.)
    for key in [k for k in chklist.keys() if not chklist[k]]:
        del chklist[key]
def do_kanjs(self, elems, entr, fmap):
    """Create Kanj objects from kanji elements and attach them to 'entr'.

    elems -- Iterable of elements, each with a 'keb' child and optional
        'ke_inf' / 'ke_pri' children, or None (nothing is done).
    entr -- Entry object; its '_kanj' attribute is set to the list of
        created Kanj objects, but only when that list is non-empty.
    fmap -- Mapping keyed by the value self.parse_freq() returns; item
        [1] of each value is a list to which the Kanj is added.

    An element whose keb text was already seen is reported with
    self.warn() and skipped.  Text that fails jdb.jstr_keb() is also
    reported but still processed.  Each Kanj is numbered by the 1-based
    position of its element in 'elems' (skipped elements included).
    """
    if elems is None:
        return

    seen = {}
    kanj_list = []
    for position, elem in enumerate(elems, start=1):
        keb = elem.find('keb').text
        if not jdb.unique(keb, seen):
            self.warn("Duplicate keb text: '%s'" % keb)
            continue
        if not jdb.jstr_keb(keb):
            self.warn("keb text '%s' not kanji." % keb)

        kanj = jdb.Kanj(kanj=position, txt=keb)
        self.do_kws(elem.findall('ke_inf'), kanj, '_inf', 'KINF')

        for pri in elem.findall('ke_pri'):
            freqtuple = self.parse_freq(pri.text, "ke_pri")
            if not freqtuple:
                continue
            holders = fmap[freqtuple][1]
            if jdb.isin(kanj, holders):
                self.freq_warn("Duplicate", None, kanj, pri.text)
            else:
                holders.append(kanj)

        kanj_list.append(kanj)

    if kanj_list:
        entr._kanj = kanj_list
def parse_stags(tag, sens, kanjs, rdngs):
    """Apply a comma-separated restriction list to sense 'sens'.

    tag -- String of comma-separated words; surrounding whitespace on
        each word is ignored.
    sens -- Sense object handed to jdb.txt2restr() for both passes.
    kanjs, rdngs -- The kanji and reading lists the words are matched
        against.

    Each word is classified as a reading (jdb.jstr_reb(), tried first)
    or as kanji (jdb.jstr_keb()).  The reading words are then passed to
    jdb.txt2restr() with attribute name '_stagr', followed by the kanji
    words with '_stagk'.

    Raises ParseError if a word is neither reading nor kanji, or if
    jdb.txt2restr() reports (through its 'bad' list) texts it could not
    match.  Returns None.
    """
    reading_words, kanji_words = [], []
    for piece in tag.split(','):
        piece = piece.strip()
        if jdb.jstr_reb(piece):
            reading_words.append(piece)
            continue
        if jdb.jstr_keb(piece):
            kanji_words.append(piece)
            continue
        raise ParseError(
            'stagx restriction word neither reading or kanji: "%s"' % piece)

    # Readings are checked before kanji, so a bad reading is the error
    # reported when both kinds are wrong.
    passes = (
        (reading_words, rdngs, '_stagr', 'Stagr text not in readings: "%s"'),
        (kanji_words, kanjs, '_stagk', 'Stagk text not in kanji: "%s"'),
    )
    for words, targets, attr, message in passes:
        unmatched = []
        jdb.txt2restr(words, sens, targets, attr, bad=unmatched)
        if unmatched:
            raise ParseError(message % '","'.join(unmatched))
def check(_, expected, testtext):
    """Assert that jdb.jstr_keb(testtext) equals 'expected'.

    '_' is the object supplying assertEqual() (presumably a
    unittest.TestCase instance -- confirm against callers).
    """
    actual = jdb.jstr_keb(testtext)
    _.assertEqual(actual, expected)