# Snippets collected from the xd crossword-corpus scripts.  Each function
# originally lived in its own script; project helpers such as xdfile.corpus(),
# utils.get_args(), metadb.append_row(), find_similar_to(), and the
# BLOCK_CHAR / UNKNOWN_CHAR / EOL constants are assumed to be imported from
# the surrounding project.

def get_all_words():
    ret = {}  # ["ANSWER"] -> number of uses
    for xd in xdfile.corpus():
        for pos, clue, answer in xd.clues:
            ret[answer] = ret.get(answer, 0) + 1
    return ret

def main(): args = get_args("save all clues in simple .tsv") outf = open_output("clues.tsv") outf.write_row("PublicationId Date Answer Clue".split()) for xd in corpus(*args.inputs): pubid = xd.publication_id() dt = xd.date() for pos, clue, answer in xd.clues: outf.write_row((pubid or "", dt or "", answer, clue))
def main():
    args = get_args(desc="find similar grids")
    g_corpus = [x for x in corpus()]

    outf = open_output()
    outf.write(xd_similar_header)

    for fn, contents in find_files(*args.inputs, strip_toplevel=False):
        needle = xdfile(contents.decode("utf-8"), fn)
        for pct, a, b in find_similar_to(needle, g_corpus):
            outf.write(xd_similar_row(a, b, pct))

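# xd_similar_header and xd_similar_row are formatting helpers defined
# elsewhere in the project.  A minimal sketch consistent with how they are
# called above (the column names are an assumption, not the project's
# actual schema):
xd_similar_header = "xdid1\txdid2\tmatch_pct\n"

def xd_similar_row(xd1, xd2, pct):
    # one tab-separated line per pair of similar grids
    return "%s\t%s\t%d\n" % (xd1.xdid(), xd2.xdid(), pct)
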
def get_duplicate_puzzles():
    dupgrids = {}
    grids = {}
    for xd in xdfile.corpus():
        g = EOL.join(xd.grid)
        if g not in grids:
            grids[g] = [xd]
        else:
            grids[g].append(xd)
            dupgrids[g] = grids[g]
    return dupgrids.values()

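# Hedged usage sketch (not in the original source): print each group of
# puzzles that share an identical grid.  Assumes xd.filename is populated,
# as in most_used_grids() below.
def print_duplicate_puzzles():
    for xds in get_duplicate_puzzles():
        print("identical grids: %s" % " ".join(xd.filename for xd in xds))
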
def main():
    args = utils.get_args(desc='find grid templates')

    templates = set()
    for xd in xdfile.corpus():
        tmpl = tuple(''.join(x if x == BLOCK_CHAR else UNKNOWN_CHAR for x in L)
                     for L in xd.grid)
        templates.add(tmpl)

    print(len(templates), 'templates')

    for input_source in args.inputs:
        for fn, contents in utils.find_files(input_source, ext='.xd'):
            xd = xdfile.xdfile(contents.decode('utf-8'), fn)
            for i, T in enumerate(templates):
                griddedxd = fit_template(T, xd)
                if griddedxd:
                    with open(args.output + ('-t%s.xd' % i), 'w') as fp:
                        fp.write(griddedxd.to_unicode())

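# get_blank_grid(), used by the next two functions, is defined elsewhere in
# the project.  A minimal sketch, reusing the BLOCK_CHAR / UNKNOWN_CHAR
# convention from the template code above: blank every fill cell so only the
# block pattern remains (a multi-line string, since callers split it with
# splitlines()).
def get_blank_grid(xd):
    return EOL.join(''.join(c if c == BLOCK_CHAR else UNKNOWN_CHAR for c in row)
                    for row in xd.grid)
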
def most_used_grids(n=1):
    all_grids = {}
    for xd in xdfile.corpus():
        empty = get_blank_grid(xd)
        if empty not in all_grids:
            all_grids[empty] = []
        all_grids[empty].append(xd.filename)

    print("%s distinct grids" % len(all_grids))

    most_used = sorted(all_grids.items(), key=lambda x: -len(x[1]))
    for k, v in most_used[0:n]:
        print("used %s times" % len(v))
        gridlines = k.splitlines()
        # zip_longest (izip_longest in Python 2) pads the shorter column
        for g, u in itertools.zip_longest(gridlines, sorted(v)):
            print("%15s %s" % (u or "", g or ""))
        print()

def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # (pubid, year) -> list of xd
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # might be an empty date, or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # might be an empty date, or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats for each day-of-week
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()     # [editor_name] -> number of xd
            formats = Counter()     # ["15x15 RS"] -> number of xd
            nexisting = 0           # TODO
            nxd = len(byweekday[weekday])
            public_xdids = []       # empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # return the most common item, annotated with its count
                # unless it accounts for all comp_value uses
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    # deduce similarity type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                              (pubid, year, weekday,
                               mainformat, maineditor, maincopyright,
                               nexisting, nxd, npublic,
                               reprints, touchups, redones,
                               copies, themecopies))

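# dow_from_date() is defined elsewhere in the project.  A minimal sketch,
# assuming ISO "YYYY-MM-DD" values in the Date header (the real helper may
# accept more formats):
import datetime

def dow_from_date(datestr):
    try:
        dt = datetime.datetime.strptime(datestr, '%Y-%m-%d')
    except (TypeError, ValueError):
        return None  # empty date, or only a year
    return ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][dt.weekday()]
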
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False,
                   help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100,
                   help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        # find similar grids (pct, xd) for the mainxd in the corpus,
        # sorted by pct.  Takes about 1 second per xd.
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0],
                               reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers,
        # other possibilities.  these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ca for ca in load_clues().get(bc, [])
                                   if ca.answer == mainanswer and ca.date < maindate]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row("gxd/similar", [
            mainxd.xdid(),  # xdid
            int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
            nstaleclues,    # reused_clues
            nstaleanswers,  # reused_answers
            ntotalclues,    # total_clues
            " ".join(("%s=%s" % (xd2.xdid(), pct))
                     for pct, xd1, xd2 in similar_grids),  # matches
        ])

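# ClueAnswer is defined elsewhere in the project.  Judging from the attribute
# accesses above (.pubid, .date, .answer, .clue), it is approximately this
# namedtuple (a hedged sketch, not the project's actual definition):
from collections import namedtuple
ClueAnswer = namedtuple('ClueAnswer', ['pubid', 'date', 'answer', 'clue'])
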
def find_grid(fn):
    # open() replaces the Python 2 builtin file()
    needle = get_blank_grid(xdfile(open(fn).read()))
    return [xd for xd in xdfile.corpus() if needle == get_blank_grid(xd)]

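# Hedged usage sketch (the filename is hypothetical): list every puzzle in
# the corpus whose block pattern matches the given .xd file.
#
#   for xd in find_grid("crosswords/nyt-2010-01-01.xd"):
#       print(xd.filename)
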
if __name__ == "__main__":
    all_words = get_all_words()
    print("%d unique words. most used words:" % len(all_words))
    for word, num_uses in sorted(all_words.items(), key=lambda x: -x[1])[0:10]:
        print(num_uses, word)
    print()

    groups = {"small": 0, "medium": 0, "large": 0, "huge": 0}
    for xd in xdfile.corpus():
        if xd.width() < 14 and xd.height() < 14:
            groups["small"] += 1
        elif xd.width() < 17 and xd.height() < 17:
            groups["medium"] += 1
        elif xd.width() < 25 and xd.height() < 25:
            groups["large"] += 1
        else:
            print(xd)
            groups["huge"] += 1

    for k, v in groups.items():
        print("%s %s" % (k, v))
    print()