def markall(e, trigs, verbose=True, rank=None):
    """Annotate bibitems with the values produced by matching triggers.

    :param e: mapping of bibitem key -> ``(entry_type, fields)`` pair; it is
        updated in place (each touched key is rebound to a new pair holding a
        copy of its fields) and also returned.
    :param trigs: collection of trigger objects. Each is read for ``.field``
        and ``.cls`` (unpacked here as a ``(field, type)`` pair), and the first
        trigger of every group is called with the candidate keys and the
        title-word index.
    :param verbose: when true, print summary counts to stdout.
    :param rank: optional callable mapping a type label to a comparable rank.
        When given, a field that already has a value is only overwritten by a
        type of strictly higher rank.
    :return: the same mapping ``e``.
    """
    # The set of fields the triggers can fill in.
    target_fields = {trig.field for trig in trigs}

    # Candidates are the bibitems lacking at least one of those fields.
    candidates = {
        key: (kind, fields)
        for key, (kind, fields) in e.items()
        if not all(name in fields for name in target_fields)
    }
    candidate_keys = set(candidates)

    # Index: word occurring in a title -> keys of the candidates whose title
    # contains that word.
    keys_by_word = defaultdict(set)
    for key, (_, fields) in candidates.items():
        for word in wrds(fields.get('title', '')):
            keys_by_word[word].add(key)

    # matches[key][trigger.cls] -> list of triggers that fired for that key.
    # Only the first trigger of a group is evaluated; every trigger of the
    # group is then credited with the match (presumably because grouped
    # triggers share the same clauses -- confirm against Trigger.group).
    matches = defaultdict(lambda: defaultdict(list))
    for _, group in Trigger.group(trigs):
        for key in group[0](candidate_keys, keys_by_word):
            for trig in group:
                matches[key][trig.cls].append(trig)

    for key, by_class in matches.items():
        kind, fields = e[key]
        updated = dict(fields.items())

        # Ascending by number of matching triggers, so the class with the
        # most matching triggers is handled last and wins the field.
        ordered = list(by_class.items())
        ordered.sort(key=lambda item: len(item[1]))

        for (field, type_), fired in ordered:
            if rank and field in updated:
                # Only replace an assigned type if something better comes
                # along; anything from ' (comp' onwards in the stored value is
                # ignored when ranking.
                current = updated[field].split(' (comp')[0]
                if rank(current) >= rank(type_):
                    continue
            updated[field] = Trigger.format(type_, fired)

        e[key] = (kind, updated)

    if verbose:
        print("trigs", len(trigs))
        print("label classes", len(target_fields))
        print("unlabeled refs", len(candidates))
        print("updates", len(matches))
    return e
def markall(e, trigs, verbose=True, rank=None):
    """Run the triggers over incompletely labelled bibitems and record results.

    ``e`` maps bibitem keys to ``(entry_type, fields)`` pairs. Every bibitem
    missing at least one field named by a trigger is matched against the
    triggers via the words of its ``title`` field. For each bibitem that
    matched, a copy of its fields is updated with ``Trigger.format(type,
    triggers)`` per matched ``(field, type)`` class and stored back into ``e``.

    :param e: mapping of key -> ``(entry_type, fields)``; modified in place.
    :param trigs: triggers providing ``.field`` and ``.cls`` and callable as
        ``trigger(keys, word_index)``.
    :param verbose: print summary statistics when true.
    :param rank: optional callable ranking a type label; if supplied, an
        existing field value is kept unless the new type ranks strictly higher.
    :return: ``e`` itself.
    """
    # NOTE(review): an equivalent definition of ``markall`` sits directly above
    # this one; being the later binding, this is the one callers get. Consider
    # removing one of the two.

    # Fields that some trigger relates to.
    labelled_fields = set()
    for trigger in trigs:
        labelled_fields.add(trigger.field)

    # Bibitems that lack any of the potentially triggered fields.
    unlabelled = {}
    for ref_key, (ref_type, ref_fields) in e.items():
        if any(name not in ref_fields for name in labelled_fields):
            unlabelled[ref_key] = (ref_type, ref_fields)
    unlabelled_keys = set(unlabelled.keys())

    # Map each title word to the set of bibitem keys whose title has it.
    word_index = defaultdict(set)
    for ref_key, (ref_type, ref_fields) in unlabelled.items():
        title = ref_fields.get('title', '')
        for token in wrds(title):
            word_index[token].add(ref_key)

    # updates[key][(field, type)] collects the triggers that apply to key.
    updates = defaultdict(lambda: defaultdict(list))
    for _clauses, grouped in Trigger.group(trigs):
        # The group's first trigger decides which keys match.
        hit_keys = grouped[0](unlabelled_keys, word_index)
        for ref_key in hit_keys:
            for trigger in grouped:
                updates[ref_key][trigger.cls].append(trigger)

    for ref_key, triggers_by_cls in updates.items():
        ref_type, ref_fields = e[ref_key]
        new_fields = {name: value for name, value in ref_fields.items()}
        # Process classes from fewest to most matching triggers so that the
        # class with the biggest number of matching triggers is applied last.
        by_support = sorted(
            triggers_by_cls.items(), key=lambda pair: len(pair[1]))
        for (field, type_), matching in by_support:
            # With a ranking in place, keep the already assigned value unless
            # the new type is strictly better. The stored value is compared
            # without whatever follows ' (comp'.
            if (rank and field in new_fields
                    and rank(new_fields[field].split(' (comp')[0]) >= rank(type_)):
                continue
            new_fields[field] = Trigger.format(type_, matching)
        e[ref_key] = (ref_type, new_fields)

    if verbose:
        report = (
            ("trigs", trigs),
            ("label classes", labelled_fields),
            ("unlabeled refs", unlabelled),
            ("updates", updates),
        )
        for label, collection in report:
            print(label, len(collection))
    return e