def merge_freqs(entr):
    """Combine independently parsed freq tags of an entry's readings and kanji.

    Code that constructs Entr objects by parsing a textual entry
    description generally parses the freq (a.k.a. prio) tags of each
    reading and kanji individually.  Before the entry is used, those
    independent tags must be combined so that a rdng/kanj pair with the
    same freq tag points to a single Freq object; this function does
    that merging.

    The entry's Rdng and Kanj objects are expected to carry a temporary
    attribute named "_FREQ" holding a list of 2-tuples, each made of the
    freq table kw id number and the freq value.  Every "_FREQ" attribute
    found is deleted here, and the collected data is handed to
    jdb.make_freq_objs(), which is relied on to create the equivalent,
    properly linked Freq objects.

    Returns the value returned by jdb.make_freq_objs() (its error list).
    """
    # One entry per unique freq (kw, value) tuple.  Each value is a pair
    # of lists: slot 0 collects the Rdng objects the freq spec applies
    # to, slot 1 collects the Kanj objects it applies to.
    by_tag = defaultdict(lambda: [[], []])

    def gather(items, slot):
        # Record every item under each of its freq tags, then drop the
        # temporary attribute from the item.
        for item in items:
            for tag in getattr(item, '_FREQ', []):
                members = by_tag[tag][slot]
                # jdb.isin() is used rather than "in" so that, per the
                # original notes, membership is decided with "is"
                # instead of "==".
                if not jdb.isin(item, members):
                    members.append(item)
            if hasattr(item, '_FREQ'):
                del item._FREQ

    # Readings are processed before kanji, as in the original ordering.
    gather(getattr(entr, '_rdng', []), 0)
    gather(getattr(entr, '_kanj', []), 1)

    # All combinations of readings with kanji are turned into Freq
    # objects by make_freq_objs().
    return jdb.make_freq_objs(by_tag, entr)
def entr(text, simple=False):
    """Parse a textual entry description and return the resulting Entr object.

    'text' must consist of a kanji/reading part and a sense part
    separated by the first "/" in the string.  Spaces, tabs and
    ideographic spaces (U+3000) around that separator are discarded.

    Parameters:
      text   -- the entry description to parse.
      simple -- accepted for interface compatibility; this function
                does not use it.

    Returns the Entr object built from the parsed kanji and readings,
    after parse_spart() and jdb.make_freq_objs() have processed it.

    Raises ParseError if the "/" separator is missing, or if
    jdb.make_freq_objs() reports a problem with the freq tags (only
    the first reported problem is raised).
    """
    # Shared between the kanji/reading parser, the sense parser and
    # make_freq_objs(): maps a freq key to a pair of lists.
    fmap = collections.defaultdict(lambda: ([list(), list()]))

    # 'maxsplit' is passed by keyword: passing it positionally is
    # deprecated since Python 3.13.
    try:
        krtxt, stxt = re.split('[ \t\u3000]*/[ \t\u3000]*', text, maxsplit=1)
    except ValueError:
        # No "/" present, so re.split() returned a single item and the
        # two-name unpacking failed.
        raise ParseError('Missing KR-S separator, "/"') from None

    kanjs, rdngs = parse_jppart(krtxt, fmap)
    entry = Entr(_kanj=kanjs, _rdng=rdngs)
    # The return value was never used by the original code; the call is
    # kept for whatever it does to 'entry' and 'fmap'.
    parse_spart(stxt.lstrip(), entry, fmap)

    errs = jdb.make_freq_objs(fmap, entry)
    for errtyp, r, k, kw, val in errs:
        # Raising inside the loop means only the first error is reported.
        raise ParseError("%s freq tag(s) %s%s in %s%s%s"
                         % (errtyp, KW.FREQ[kw].kw, val, k or '',
                            '\u30FB' if k and r else '', r or ''))
    return entry
def do_entr(self, elem, seq, xlit=False, xlang=None, corp_dict=None, grpdefs=None):
    """
    Create an entr object from a parsed ElementTree entry element.

    Parameters:
      elem      -- the parsed ElementTree entry element.
      seq       -- sequence number for the entry.  If false (None, 0),
                   it is read from the entry's <ent_seq> element, which
                   must then exist and hold a positive integer.
      xlit, xlang -- passed through unchanged to self.do_senss().
      corp_dict -- mapping from corpus name to an object with an .id
                   attribute; only consulted when the entry has an
                   <ent_corp> element.
      grpdefs   -- passed through unchanged to self.do_groups().

    Returns the new entr object.

    Raises ParseError when <ent_seq> is needed but missing, empty,
    non-numeric or not positive, or when the "status" attribute is not
    a key of self.XKW.STAT.

    Note that the entry object returned is different from one read
    from the database in the following respects:
    * The 'entr' record will have no .src (aka corpus) attribute
      if there is no <ent_corp> element in the entry.  In this
      case the .src attribute is expected to be added by the
      caller.  If there is a <ent_corp> element, it will be used
      to find a corpus in 'corp_dict', which in turn will provide
      an id number used in .src.
    * Items in sense's _xref list are unresolved xrefs, not
      resolved xrefs as in a database entr object.
      jdb.resolv_xref() or similar can be used to resolve the
      xrefs.
    * Attributes will be missing if the corresponding xml
      information is not present.  For example, if a particular
      entry has no <k_ele> elements, the entr object will not
      have a '._kanj' attribute.  In an entr object read from
      the database, it will have a '._kanj' attribute with a
      value of [].
    * The entr object does not have many of the foreign key
      attributes: gloss.gloss, xref.xref, <anything>.entr, etc.
      However, it does have rdng.rdng, kanj.kanj, and sens.sens
      attributes since these are required when adding restr,
      stagr, stagk, and freq objects.
    """
    XKW = self.XKW
    entr = jdb.Entr()

    if not seq:
        elemseq = elem.find('ent_seq')
        if elemseq is None:
            raise ParseError("No <ent_seq> element found")
        try:
            # TypeError covers an empty <ent_seq/>, whose .text is None.
            seq = int(elemseq.text)
        except (TypeError, ValueError):
            # Report the offending <ent_seq> text, not the text of the
            # enclosing entry element.
            raise ParseError("Invalid 'ent_seq' value, '%s'" % elemseq.text)
        if seq <= 0:
            raise ParseError("Invalid 'ent_seq' value, '%s'" % elemseq.text)
    entr.seq = seq

    # Optional integer attributes of the entry element.
    entr_id = elem.get('id')
    if entr_id is not None:
        entr.id = int(entr_id)
    dfrm = elem.get('dfrm')
    if dfrm is not None:
        entr.dfrm = int(dfrm)

    # A missing or empty "status" attribute falls back to the id of the
    # 'A' status keyword; either value is then looked up in XKW.STAT.
    stat = elem.get('status') or jdb.KW.STAT['A'].id
    try:
        stat = XKW.STAT[stat].id
    except KeyError:
        raise ParseError("Invalid <status> element value, '%s'" % stat)
    entr.stat = stat
    entr.unap = elem.get('appr') == 'n'

    corpname = elem.findtext('ent_corp')
    if corpname is not None:
        # NOTE(review): raises KeyError for a corpus name missing from
        # 'corp_dict', and TypeError if 'corp_dict' is None -- confirm
        # callers always supply it when <ent_corp> can occur.
        entr.src = corp_dict[corpname].id

    # 'fmap' is filled in by do_kanjs()/do_rdngs() and then handed to
    # make_freq_objs(); each value is a pair of lists.
    fmap = defaultdict(lambda: ([], []))
    self.do_kanjs(elem.findall('k_ele'), entr, fmap)
    self.do_rdngs(elem.findall('r_ele'), entr, fmap)
    if fmap:
        freq_errs = jdb.make_freq_objs(fmap, entr)
        # Freq problems are reported through freq_warn() rather than
        # raised, so processing of the entry continues.
        for typ, r, k, kw, val in freq_errs:
            kwstr = XKW.FREQ[kw].kw + str(val)
            self.freq_warn(typ, r, k, kwstr)

    # Both <sense> and <trans> elements are handled by do_senss().
    self.do_senss(elem.findall('sense'), entr, xlit, xlang)
    self.do_senss(elem.findall('trans'), entr, xlit, xlang)
    self.do_info(elem.findall("info"), entr)
    self.do_audio(elem.findall("audio"), entr, jdb.Entrsnd)
    self.do_groups(elem.findall("group"), entr, grpdefs)
    return entr