def find_rules_in_flr_bit(mode, bit):
    """Extract rule entries from one delimiter-separated chunk of an FLR.

    `bit` is a bytes chunk from a Logical Ruleset document; rules are
    extracted via the module-level `find_rules` helper.  Chunks whose
    text starts with a "Rule N" heading are expected to contain exactly
    one rule; anything else triggers a warning (printed under `warnx()`),
    except for two known-bad rules (2389/0, 2386/0) that are whitelisted.

    Returns the list of rule-data dicts found in the chunk.  Note that
    rules found in a chunk *without* a "Rule N" heading are warned about
    but deliberately not returned.
    """
    found = list(find_rules(mode, bit))
    warning = None
    ret = []
    if regex.match(b'\n*Rule [0-9]', bit):
        # Looks like a proper rule chunk: keep everything found.
        for data in found:
            ret.append(data)
        if len(found) > 1:
            # Rule 2389/0 is a known case of multiple rules in one chunk.
            if b'Rule 2389/0' not in bit:
                warning = 'got multiple rules in FLR bit'
        elif len(found) == 0:
            # Rule 2386/0 is a known case where extraction finds nothing.
            if b'Rule 2386/0' not in bit:
                warning = 'got no rules in FLR bit'
    else:
        if len(found) > 0:
            # A rule matched inside a chunk that doesn't start like one;
            # warn but drop it (not appended to ret).
            warning = 'got rule in weird FLR bit'
    if warning is not None:
        with warnx():
            print('full: {{{')
            print(util.highlight_spaces(decode(bit)))
            print('}}}')
            print('^-', warning)
    return ret
def find_stdformat_rules(text, expect_history=False):
    """Yield rule-data dicts parsed from `text` (bytes) in the standard format.

    Matches entries with the module-level `fsfr_regex` pattern.  Each yielded
    dict has keys: number, revnum, title, header, extra, text, annotations,
    history.  If `expect_history` is true, entries whose history section is
    missing or empty trigger a warning (except for a few known rule numbers).
    """
    # Peek at the first rule number so per-rule text fixups can be applied
    # before the main parse.
    m = regex.search(b'Rule ([0-9]+)', text)
    early_rulenum = None
    if m:
        early_rulenum = m.group(1)
        action = FIXUPS_FOR_RULENUM.get(early_rulenum)
        if action:
            text = action(text)
    # fix CFJ annotations missing brackets altogether
    text = regex.sub(b'\n(CFJ [0-9]+[^\n]*:.*?)(?=\n\n)', br'\n[\1]', text,
                     flags=regex.S)
    # --
    for m in fsfr_regex.finditer(text):
        #print('yaeh', text[m.end():m.end()+100])
        g = m.groupdict()
        full = g['full']
        history = None
        if g['history'] is not None:
            thehist = decode(g['thehist'])
            # Strip a non-report section once injected by an old scam.
            thehist = regex.sub(
                'The following section is not a portion of the report:.*', '',
                thehist, flags=regex.S)  # lol, old scam
            history = list(split_history(thehist))
        if expect_history and not history:
            # A few rules are known to legitimately lack history.
            if early_rulenum not in {b'2385', b'2119', b'2001'}:
                with warnx():
                    print(repr(g))
                    print('full: {{{')
                    print(util.highlight_spaces(decode(text)))
                    print('}}}')
                    print('^- no history in this FLR entry')
        rtext = g['text'] or b''
        inumber = int(g['number'])
        extraheader = g['extraheader'] or b''
        if extraheader:
            # Old-format entries fold extra header material into the text;
            # fix_oldformat_header splits it back out.
            _extratitle, rtext, extraheader = fix_oldformat_header(
                inumber, rtext, extraheader.rstrip())
        data = {
            'number': inumber,
            'revnum': decode(g['revnum']) if g['revnum'] else None,
            'title': decode(g['title']) if g['title'] else None,
            'header': decode(g['header']),
            'extra': decode(extraheader) if extraheader else None,
            'text': decode(rtext),
            'annotations': decode(g['annotations']) or None,
            'history': history,
        }
        yield data
def detect_renumberings():
    """Detect and auto-merge latents that represent a revision renumbering.

    Groups all latents (from module-global `all_latents`) by their signature
    with the leading element dropped; when several latents share a reduced
    signature and each appears in a distinct entry, the strictly most recent
    one wins and the others are merged into it (their entries marked no_link).
    Ambiguous groups are reported under `warnx()`.
    """
    latents_by_reduced_sig = defaultdict(list)
    for latent in all_latents:
        # Skip latents without a usable tuple signature, and creations.
        if latent.sig is None or isinstance(
                latent.sig, str) or latent.sig == ('0', 'create'):
            continue
        rsig = latent.sig[1:]
        if rsig[-1] == 'cleaning':
            continue
        latents_by_reduced_sig[rsig].append(latent)
    for rsig, latents in latents_by_reduced_sig.items():
        if len(latents) <= 1:
            continue
        # potential renumbering
        # are they all in different entries?
        entries = [an._entry for latent in latents for an in latent.ans]
        if len(entries) == len(set(entries)):
            # yes, so... find the most recent one
            # [(date, latent)]
            last_seen = sorted(
                (max(an._entry.date_lower_bound() for an in latent.ans),
                 latent) for latent in latents)
            # is it strictly more recent?
            if last_seen[-1][0] > last_seen[-2][0]:
                winnerdate, winner = last_seen[-1]
                for loserdate, loser in last_seen[:-1]:
                    for an in loser.ans:
                        an._entry.no_link = True
                    loser.merge_into(winner)
                #print('xxx', some_number)
                continue
            else:
                why_not = "can't find newest"
        else:
            # this may not actually be a problem
            #why_not = "duplicates were seen in one entry"
            continue
        # NOTE(review): `rule` is not defined in this function — presumably a
        # module-level global set by the caller; verify before relying on it.
        with warnx():
            print(
                "in rule %s, couldn't autofix revision renumbering because %s:"
                % (
                    rule.numbers,
                    why_not,
                ))
            for latent in latents:
                print('--')
                latent.print_sig_and_texts()
            print('***')
def add_guessed_numbers(entry):
    """Fill in `_guessed_num`/`_guessed_revnum` on each annotation of `entry`.

    Two passes: a forward pass propagates the last explicitly seen rule
    number/revnum onto following annotations; a backward pass fills remaining
    gaps from the entry's own rule number, warning (under `warnx()`) when the
    two directions disagree.  Indeterminate annotations reset the propagation.
    """
    data = entry.data
    meta = entry.meta
    history = data['history']
    # propagate cur_num and revnum forward
    cur_num = None
    revnum = None
    for an in entry.ans:
        if an.is_indeterminate:
            # Reset: nothing can be trusted across an indeterminate point.
            cur_num = None
            revnum = None
            an._guessed_num = None
            an._guessed_revnum = None
            continue
        if an.cur_num is not None:
            cur_num = an.cur_num
        an._guessed_num = cur_num
        if an.revnum is not None:
            revnum = an.revnum
        an._guessed_revnum = revnum
    # propagate prev_num backward
    prev_num = data['number']
    ans = entry.ans
    for i in range(len(ans) - 1, -1, -1):
        an = ans[i]
        if an.is_indeterminate:
            prev_num = None
            continue
        if an._guessed_num is None:
            an._guessed_num = prev_num
        elif (prev_num is not None and an._guessed_num != prev_num
              and not (an._guessed_num == 155 and prev_num == 115)):
            # 155/115 is a whitelisted known mismatch; anything else is
            # reported as a forward/backward disagreement.
            with warnx():
                print('in %s:' % (entry.meta['path'], ))
                print(
                    'disagreement about rule number (on [%d]: going backwards: %r; going forwards: %r) for annotation set:'
                    % (i, prev_num, an._guessed_num))
                for j, an2 in enumerate(entry.ans):
                    print('[%d] %r' % (j, an2))
        if an.num_changed:
            # Walking backwards across a renumbering: earlier annotations
            # belong to the previous number.
            prev_num = an.prev_num
def break_cycles():
    # break cycles - turns out doesn't actually happen, but I had to write
    # this code to figure that out, so may as well keep it
    """Break cycles in the latent `nexts` ordering graph.

    Runs an iterative (explicit-stack) DFS from every latent in the
    module-global `all_latents`.  When a back-edge is found, the cycle is
    reported under `warnx()` and the offending edge is removed from both
    directions (`nexts`/`prevs`).  Uses per-latent `stackidx`/`seen`
    bookkeeping attributes; `seen` is presumably initialized elsewhere —
    TODO confirm.
    """
    stack = []
    for latent in all_latents:
        # Work on a copy so edges can be removed while iterating.
        latent.nextlist = list(latent.nexts)
    for j, latent0 in enumerate(all_latents):
        stack = [[latent0, 0]]
        latent0.stackidx = 0
        while stack:
            latent, nexti = stack[-1]
            if nexti == len(latent.nextlist):
                # Fully explored: pop and clear the on-stack marker.
                latent.stackidx = None
                stack.pop()
                continue
            assert latent in all_latents  # XXX
            next = latent.nextlist[nexti]
            stack[-1][1] = nexti + 1
            if next.seen:
                if next.stackidx is not None:
                    # got a cycle
                    with warnx():
                        print('Got cycle in annotation ordering:')
                        for xlatent, _ in stack[next.stackidx:]:
                            print('--')
                            xlatent.print_sig_and_texts()
                        print('***')
                    # arbitrarily choose the last link to break since it's
                    # easier - TODO if there are real cycles, do it better
                    latent.nextlist.remove(next)
                    latent.nexts.remove(next)
                    next.prevs.remove(latent)
                    # Compensate the saved next-indices for the removal.
                    stack[-1][1] -= 1
                    stack[next.stackidx][1] -= 1
            else:
                next.seen = True
                next.stackidx = len(stack)
                stack.append([next, 0])
def walk_doc(metadata, text):
    """Walk one document (bytes) and yield `{'meta': ..., 'data': ...}` items.

    If `text` looks like a (Full/Short) Logical Ruleset, it is split into
    delimiter-separated chunks and each chunk parsed with
    `find_rules_in_flr_bit`; RCS log messages by 'comex' are additionally
    mined for explicit repeal annotations, and a `no_rules_except` marker is
    emitted listing every rule number seen.  Any trailing material after the
    ruleset is handled by recursing.  Non-ruleset documents fall through to a
    plain `find_rules` scan.
    """
    new_metadata = metadata.copy()
    if 'rcslog' in new_metadata:
        # RCS bookkeeping is consumed here, not propagated downstream.
        del new_metadata['rcslog']
        del new_metadata['rcsauthor']
    assert isinstance(text, bytes)
    #print(metadata['path'])
    m = regex.match(b'(.{,2048}\n)?THE (FULL |SHORT |)LOGICAL RULESET\n\n',
                    text, regex.S)
    if m:
        # this is a ruleset!
        lr_start = m.end()
        n = regex.search(b'\nEND OF THE [^ ]* LOGICAL RULESET', text,
                         pos=lr_start)
        if n:
            lr_end = n.end()
        else:
            lr_end = len(text)
        ruleset = m.group(0)
        # Chunks are separated by long dashed/equals divider lines.
        ruleset_bits = regex.split(
            b'\n------------------------------+|====================+\n',
            text[lr_start:lr_end])
        have_rulenums = []
        mode = find_rules_mode_for_path(metadata['path'])
        for datas in map(partial(find_rules_in_flr_bit, mode), ruleset_bits):
            for data in datas:
                if data['number'] is not None:
                    have_rulenums.append(data['number'])
                yield {'meta': new_metadata, 'data': data}
        # explicit repeal annotations in RCS?
        if 'rcslog' in metadata and metadata['rcsauthor'] == 'comex':
            # split by semicolon, but not semicolons in parens
            logs = regex.findall(br';\s*((?:\([^\)]*\)|[^;\(]+)*)',
                                 metadata['rcslog'])
            for log in logs:
                log = log.strip()
                if log in {
                        b'formatting', b'update xrefs',
                        b'lots of formatting fixes'
                }:
                    continue  # old stuff I put in
                n = regex.match(b'Rule ([0-9]+) (?:\([^\)]*\) )?repealed', log)
                if not n:
                    raise Exception('unknown RCS annotation %r' % log)
                number = int(n.group(1))
                # Synthesize a text-less entry carrying only the repeal
                # history line.
                yield {
                    'meta': new_metadata,
                    'data': {
                        'number': number,
                        'revnum': None,
                        'title': None,
                        'header': None,
                        'extra': None,
                        'text': None,
                        'annotations': None,
                        'history': [decode(log)],
                    }
                }
        # repeals?
        yield {
            'meta': new_metadata,
            'data': {
                'no_rules_except': have_rulenums
            }
        }
        # handle any remaining data
        rest = text[lr_end:].lstrip()
        if rest:
            yield from walk_doc(metadata, rest)
        return
    elif b'THE RULES OF INTERNOMIC' in text:
        # this is ... a fake ruleset!
        return
    else:
        # not a ruleset
        if 'rcslog' in metadata and 'current_flr.txt,v' in metadata['path']:
            with warnx():
                print(repr(text))
                print("this should be a flr but doesn't match")
        for data in find_rules(find_rules_mode_for_path(metadata['path']),
                               text):
            yield {'meta': new_metadata, 'data': data}
def handle_revnum_clashes(full):
    """Resolve latents that share a revision number.

    First indexes all live latents (module-global `latent_by_sig`) by a
    representative revnum, preferring annotations from linkable entries.
    Signature-less latents are then merged into the unique latent with the
    same revnum, or matched by normalized text, or deleted/reported under
    `warnx()`.  When `full` is true, remaining same-revnum groups are
    resolved via the REVNUM_FIXES disposition table ('merge'/'kill'/'allow').

    NOTE(review): `some_number` and `rule` are free names here — presumably
    module globals set by the caller; verify.
    """
    latents_by_revnum = defaultdict(list)
    for latent in latent_by_sig.values():
        if latent.dead:
            continue
        try:
            # Prefer a revnum from an annotation whose entry is linkable.
            revnum = next(an.revnum for an in latent.ans
                          if not an._entry.no_link and an.revnum is not None)
        except StopIteration:
            revnum = next(an.revnum for an in latent.ans)
        if revnum is not None:
            latents_by_revnum[revnum].append(latent)
    for latent in list(all_latents):
        if latent.sig is None:
            an = next(iter(latent.ans))
            if an.revnum is not None:
                proper = latents_by_revnum[an.revnum]
                if len(proper) == 1:
                    latent.merge_into(proper[0])
                    continue
                if len(proper) == 0:
                    # eh... I guess there's nothing for it
                    proper.append(latent)
                    continue
                # ...can we find a text match?
                text = an._entry.normalized_text
                text_matches = [
                    olatent for olatent in proper
                    if any(oan._entry.normalized_text == text
                           for oan in olatent.ans)
                ]
                if len(text_matches) == 1:
                    latent.merge_into(text_matches[0])
                    continue
                # ...can we find a text match with some other revnum?
                oans = (oan for olatent in latent_by_sig.values()
                        if not olatent.dead for oan in olatent.ans
                        if oan._entry.normalized_text == text)
                try:
                    oan = next(oans)
                except StopIteration:
                    pass
                else:
                    #print('other text match, so ignore')
                    #print('matched:', oan)
                    latent.delete()
                    continue
                # ...
                with warnx():
                    print(
                        "in rule %d: couldn't find something to merge this into:"
                        % (some_number, ))
                    print(an)
                    print(an._entry)
                    print('possibilities: (%d)' % (len(proper), ))
                    for olatent in proper:
                        print(olatent)
                        print('--')
    if not full:
        return
    for revnum, latents in latents_by_revnum.items():
        if len(latents) <= 1:
            continue
        dispositions = [
            REVNUM_FIXES.get((some_number, latent.sig)) for latent in latents
        ]
        nones = [
            latent for (disposition, latent) in zip(dispositions, latents)
            if disposition is None
        ]
        if len(nones) > 1:
            # are they all seen in some single entry? then keep all
            if functools.reduce(set.intersection,
                                (latent.ans for latent in nones)):
                continue
            # is all but one from nolink?
            with warnx():
                print('Duplicate revnums for rule %s:' % (rule.numbers, ))
                for latent in latents:
                    print('--')
                    latent.print_sig_and_texts()
                print('***')
            continue
        for disposition, latent in zip(dispositions, latents):
            if disposition == 'merge':
                latent.merge_into(nones[0])
            elif disposition == 'kill':
                latent.delete()
            elif disposition == 'allow':
                pass
            elif disposition is None:
                pass
            else:
                # Fixed: the original '? %r' % (disposition, latent) raised
                # TypeError (two args for one %r) instead of this message.
                raise Exception('? %r %r' % (disposition, latent))
def do_stragglers(rules, unowned_entries):
    """Attach entries that no rule claimed (`unowned_entries`) to `rules`.

    Entries are grouped by rule number and normalized text.  If no existing
    rule has that number (or for the special-cased rule 1741), a fresh Rule
    is created for the whole group.  Otherwise candidates are narrowed by
    text match and by creation date; a unique survivor adopts the entry,
    anything else is reported under `warnx()`.

    Mutates `rules` (a set) and the matched Rule objects in place.
    """
    rules_by_text = defaultdict(set)
    rules_by_number = defaultdict(set)
    for rule in rules:
        for entry in rule.entries:
            rules_by_text[entry.normalized_text].add(rule)
        for number in rule.numbers:
            rules_by_number[number].add(rule)
    unowned_by_number_and_text = defaultdict(lambda: defaultdict(set))
    for entry in unowned_entries:
        unowned_by_number_and_text[entry.data['number']][
            entry.normalized_text].add(entry)
    for number, by_text in unowned_by_number_and_text.items():
        nrules = rules_by_number[number]
        if len(nrules) == 0 or number in {1741}:
            # no history for this rule at all; just assume they're all one rule
            # rule 1741: the unanchored entry is a different rule from the
            # anchored one
            new_rule = Rule()
            new_rule.numbers.add(number)
            for entries in by_text.values():
                new_rule.entries.extend(entries)
            rules.add(new_rule)
            continue
        for normalized_text, entries in by_text.items():
            trules = nrules
            if len(trules) > 1:
                # Multiple rules share the number: narrow by exact text.
                trules = trules.intersection(rules_by_text[normalized_text])
            for entry in entries:
                drules = trules
                edate = entry.meta.get('date')
                if edate is not None and number not in {430}:
                    # rule 430: zefram_rules_text says "ca. Sep. 13 1993",
                    # but it was published on Sep. 8
                    edate = util.datetime_from_timestamp(edate).date()
                    # Rules created after the entry's date can't be it.
                    drules = [
                        rule for rule in trules
                        if not rule.definitely_created_after(edate)
                    ]
                drules = list(drules)
                if len(drules) == 1:
                    rule = next(iter(drules))
                    rule.entries.append(entry)
                else:
                    with warnx():
                        print('could not match entry (and copies) to rule:')
                        print(next(iter(entries)))
                        print('date:', entry.date())
                        for i, rule in enumerate(drules):
                            print('***** candidate %d/%d:' %
                                  (i + 1, len(drules)))
                            print(rule)
                            for oentry in rule.entries:
                                print('--')
                                print(oentry)
                                #for an in oentry.ans: print(an)
                        if not drules:
                            print(
                                '***** no candidates! (%d by number alone, but enacted too late)'
                                % (len(nrules), ))
                        print('====')
                    # One report per text group is enough.
                    break
def split_into_rules_with_number_timeline(rule_entries):
    """Group entries into Rule objects using dated creation/renumber anchors.

    Builds, per guessed rule number, a timeline of dates at which a rule with
    that number was created or renumbered.  Every dated creation/renumber
    annotation defines an "anchor" (subject to ANCHOR_OVERRIDES); other dated
    annotations are attached to the most recent anchor at or before their
    date.  Anchors co-occurring in one entry are unified into a single Rule.

    Returns `(all_rules, unowned_entries)` where `unowned_entries` are the
    entries that acquired no anchor at all.
    """
    by_num_and_numbered_date = defaultdict(lambda: defaultdict(list))
    for entry in rule_entries:
        entry.anchors = []
        for an in entry.ans:
            if (an.is_create or an.num_changed
                ) and an.date is not None and an._guessed_num is not None:
                anchor = (an._guessed_num, an.proposal_num or an.date)
                anchor = ANCHOR_OVERRIDES.get(anchor, anchor)
                by_num_and_numbered_date[an._guessed_num][an.date].append(
                    (entry, anchor))
                entry.anchors.append(anchor)
    timeline_by_num = defaultdict(list)
    for number, by_numbered_date in by_num_and_numbered_date.items():
        timeline_by_num[number] = sorted(by_numbered_date.keys())
    for entry in rule_entries:
        already_anchored = False
        prev_gn = None
        for an in entry.ans:
            # A renumbering, indeterminacy, or change of guessed number
            # invalidates the current anchor.
            if an.num_changed or an.is_indeterminate or an._guessed_num != prev_gn:
                already_anchored = False
            prev_gn = an._guessed_num
            if (an.is_create or an.num_changed) and an.date is not None:
                already_anchored = True
            if already_anchored:
                continue
            if an._guessed_num is not None and an.date is not None:
                number = an._guessed_num
                timeline = timeline_by_num[number]
                if len(timeline) == 0:
                    # eh, assume there weren't multiple copies of this rule
                    entry.anchors.append((number, None))
                    already_anchored = True
                    continue
                i = bisect.bisect_right(timeline, an.date)
                if i == 0 and number not in {1741}:
                    with warnx():
                        print(
                            'Annotation comes before all anchors for rule %d: %s'
                            % (number, an))
                        print('from %s' % entry.meta['path'])
                        print('earliest is at %s: %s' %
                              (timeline[0],
                               by_num_and_numbered_date[number][timeline[0]]))
                        for an in entry.ans:
                            print(an)
                    continue
                if i == len(timeline):
                    i -= 1
                # assume the rule stayed the same?
                date = timeline[i]
                oentry, anchor = by_num_and_numbered_date[
                    an._guessed_num][date][-1]
                entry.anchors.append(anchor)
                already_anchored = True

    def print_timeline(num):
        # Debug helper: dump the anchor timeline for one rule number.
        print('timeline for %s:' % (num, ))
        for date in timeline_by_num[num]:
            print('- %s' % (date, ))
            for entry in by_num_and_numbered_date[num][date][0]:
                print(entry)
                for an in entry.ans:
                    print(an)

    #print_timeline(1051); die
    anchor_to_rule = {}
    all_rules = set()
    rule_id = 0
    # unify multiple anchors in the same entry
    for entry in rule_entries:
        first_info = None
        for i, anchor in enumerate(entry.anchors):
            rule = anchor_to_rule.get(anchor)
            if rule is None:
                rule = Rule()
                rule.anchors.append(anchor)
                all_rules.add(rule)
                anchor_to_rule[anchor] = rule
            if i == 0:
                first_rule = rule
            else:
                if rule is not first_rule:
                    # Merge: redirect all of rule's anchors to first_rule.
                    for anchor in rule.anchors:
                        anchor_to_rule[anchor] = first_rule
                    all_rules.remove(rule)
                    first_rule.anchors.extend(rule.anchors)
    unowned_entries = []
    for entry in rule_entries:
        if entry.anchors:
            rule = anchor_to_rule[entry.anchors[0]]
            rule.entries.append(entry)
            for an in entry.ans:
                if an._guessed_num is not None:
                    rule.numbers.add(an._guessed_num)
            assert_(entry.data['number'] in rule.numbers)
        else:
            unowned_entries.append(entry)
    return all_rules, unowned_entries
def identify_same(entries):
    """Deduplicate rule entries, grouping identical texts as variants.

    Entries are bucketed by (number, revnum); within a bucket, entries whose
    text matches (via module-level `text_match`) become `.variants` of one
    representative.  Entries with revnum None are matched against numbered
    buckets, assigned a fudged revnum ('0' or from FUDGE_REVNUMS) when
    possible, and otherwise reported under `warnx()`.  Finally the
    highest-`quality` variant becomes each bucket's representative.

    Returns the flat list of representative entries across all rule numbers.
    """

    def add_with_revnum(entry):
        # Insert `entry` into its (number, revnum) bucket, either as a
        # variant of a text-matching existing entry or as a new head.
        number = entry.data['number']
        revnum = entry.data['revnum']
        existings = by_number_and_revnum[number][revnum]
        for existing in existings:
            # lol, performance
            if text_match(existing, entry):
                existing.variants.append(entry)
                break
        else:
            entry.variants = [entry]
            existings.append(entry)

    by_number_and_revnum = defaultdict(lambda: defaultdict(list))
    for entry in entries:
        add_with_revnum(entry)
    # deal with revnum=None entries
    for number, by_revnum in by_number_and_revnum.items():
        nones = by_revnum[None]
        del by_revnum[None]
        unmatched_nones = []
        for entry in nones:
            # Try to fold each revnum-less entry into any numbered bucket
            # with matching text.
            for existing in (existing for xentries in by_revnum.values()
                             for existing in xentries):
                if text_match(existing, entry):
                    existing.variants.append(entry)
                    break
            else:
                unmatched_nones.append(entry)
        still_unmatched_nones = []
        for entry in unmatched_nones:
            number = entry.data['number']
            still_unmatched_nones.append(entry)
        if not still_unmatched_nones:
            continue
        # Can we guess ("fudge") revnums for the leftovers?
        fudge_revnums = None
        if len(still_unmatched_nones) == 1 and len(by_revnum) == 0:
            fudge_revnums = ['0']
        elif number in FUDGE_REVNUMS:
            fudge_revnums = FUDGE_REVNUMS[number]
        if fudge_revnums is not None:
            if len(fudge_revnums) != len(still_unmatched_nones):
                print('...bad fudge_revnums length!')
            else:
                for fudge_revnum, entry in zip(fudge_revnums,
                                               still_unmatched_nones):
                    entry.data['revnum'] = fudge_revnum
                    add_with_revnum(entry)
                continue
        with warnx():
            print('Orphan texts for rule %d:' % (entry.data['number'], ))
            for entry in still_unmatched_nones:
                print(entry.data['text'])
                print('variants:%d meta:%s' % (
                    len(entry.variants),
                    entry.meta,
                ))
                print('last annotation:%s' % (entry.ans[-1], ))
                print('==')
            print('Here are all the numbered texts I have for that rule:')
            have_any = False
            for revnum, xentries in sorted(by_revnum.items(),
                                           key=lambda tup: revnum_key(tup[0])):
                if revnum is None:
                    continue
                print('--')
                for existing in xentries:
                    print('revnum: %s header: %s' %
                          (existing.data['revnum'], existing.data['header']))
                    print(existing.data['text'])
                    print('variants:%d meta:%s' % (
                        len(existing.variants),
                        existing.meta,
                    ))
                have_any = True
            if not have_any:
                print('(none)')
    # Pick the best-quality variant as each bucket's representative.
    for by_revnum in by_number_and_revnum.values():
        for revnum, xentries in by_revnum.items():
            if revnum is None:
                continue
            for i, entry in enumerate(xentries):
                best = max(entry.variants, key=quality)
                if best is not entry:
                    best.variants = entry.variants
                    del entry.variants
                    xentries[i] = best
    # Fixed: the original comprehension iterated the leftover loop variable
    # `by_revnum` (the *last* number's buckets only), dropping every other
    # rule number's representatives.
    return [
        existing for by_revnum in by_number_and_revnum.values()
        for xentries in by_revnum.values() for existing in xentries
    ]