def pubyear_html(pub, year):
    """Build the HTML body for a /pub/<pub><year> page.

    Emits a header row, then one GridCalendar month per month of the span,
    each followed by clickable rows for puzzles that have similar earlier
    puzzles.

    pub: publication id prefix (e.g. 'nyt').
    year: a year string ('1994') or a decade string ending in 's' ('1990s').
    Returns the legend plus the calendars table as one HTML string.
    """
    calendars_html = '<table class="puzzles">'
    colnames = [year] + pubyear_header
    calendars_html += html.table_row(colnames, colnames, tag='th')

    # write out /pub/nyt199x
    # c_grids: date string (YYYY-MM-DD) -> {'title', 'class', optional 'link'}
    # for puzzles that have at least one similar earlier puzzle
    c_grids = {}
    # utils.info('Generating meta for {pub}{year}'.format(**locals()))
    for row in sorted(metadb.xd_similar(pub + year)):
        dt = utils.parse_iso8601(row.xdid)
        dt2 = utils.parse_iso8601(row.match_xdid)
        if not dt or not dt2:
            continue
        if dt < dt2:  # keep only the later puzzle of each similar pair
            continue
        # entry is created even for 0% matches so the date still renders
        if dt not in c_grids:
            c_grids[dt] = {'title': '', 'class': ''}
        if row.match_pct == 0:
            continue
        c_grids[dt]['link'] = '/pub/' + row.xdid
        matchxdid = row.match_xdid
        aut1 = metadb.get_author(row.xdid) or ''
        aut2 = metadb.get_author(matchxdid) or ''
        pct = row.match_pct
        similargrids = '(%s%%) %s [%s]\n' % (pct, aut2, matchxdid)
        c_grids[dt]["title"] += similargrids
        ##deduce_similarity_type
        c_grids[dt]["class"] += ret_classes(aut1, aut2, pct)

    c_grids_b = {}  # dates with a puzzle but no similarity entry
    # Generate grids for available puzzles
    for row in metadb.xd_puzzles(pub + year):
        if row.Date and row.Date not in c_grids_b and row.Date not in c_grids:
            # add styles only for those that are not similar etc.
            c_grids_b[row.Date] = {
                'title': '',
                'class': 'privxd' if int(row.Date[:4]) > 1965 else 'pubxd',
            }

    # Generate calendars over the union of both date maps
    z = c_grids.copy()
    z.update(c_grids_b)

    if year[-1] == 's':  # decade
        from_year = int(year[:4])
        to_year = int(year[:4]) + 10
    else:
        from_year = int(year)
        to_year = int(year) + 1

    # FIX: loop variable renamed from 'year' to 'yr' — the original rebound
    # the str parameter 'year' to an int mid-function (shadowing), then
    # redundantly re-converted it with int(year) below.
    for yr in range(from_year, to_year):
        for month in range(1, 13):
            dups_table = []
            for dt, d in c_grids.items():
                if not dt.startswith("%s-%02d" % (yr, month)):
                    continue
                row_dict = {}  # Map row and style
                xdid = pub + dt
                puzmd = metadb.xd_puzzle(xdid)
                if not puzmd:
                    continue
                row_dict['class'] = d['class']
                row_dict['tag_params'] = {
                    'onclick': 'location.href=\'/pub/%s\'' % xdid,
                    'class': d['class'] + ' hrefrow puzrow',
                }
                row_dict['row'] = [
                    xdid, puzmd.Date, puzmd.Size, puzmd.Title, puzmd.Author,
                    puzmd.Editor, puzmd.Copyright, puzmd.A1_D1,
                    d["title"].replace("\n", "<br/>")
                ]
                dups_table.append(row_dict)
            # rowspan spans the calendar cell plus one row per duplicate
            calendars_html += '<tr class="calendar"><td class="calendar" rowspan="%s">' % (len(dups_table) + 1)
            calendars_html += html.GridCalendar(z).formatmonth(yr, month) + '</td></tr>'
            for r in sorted(dups_table, key=lambda x: x['row'][1]):  # sort by Date
                calendars_html += html.table_row(r["row"], pubyear_header, tag_params=r['tag_params'])

    calendars_html += '</table>'
    ret = '''%s <div class="calendars">%s</div> <hr/>''' % (legend, calendars_html)
    return ret
def main():
    """Tally per-(publication, year, weekday) stats over the whole corpus and
    write one row per combination into the "pub/stats" metadb table."""
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    # (publication_id, year) -> list of xd puzzles in that bucket
    pubyears = {}  # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    # rebuild from scratch: clear all old stats rows before appending new ones
    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        # bucket similarity rows (>= 25% match) by weekday of the source puzzle
        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0
            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                # puzzles through 1965, or explicitly listed ids, count as public
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1
                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1
                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1
                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # Process counter comparing with comp_value: return the most
                # common item, annotated with its count unless it accounts for
                # all puzzles (num == comp_value); '' for an empty counter.
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            # dominant editor/copyright/format for this weekday
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0  # NOTE(review): never incremented below — always written as 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue
                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                # ISO date strings compare correctly lexicographically; classify
                # only when the matched puzzle predates this one so each pair is
                # counted once
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats", (pubid, year, weekday, mainformat, maineditor, maincopyright, nexisting, nxd, npublic, reprints, touchups, redones, copies, themecopies))
def main():
    """Generate a 'deep clue' analysis page (pub/deep/<xdid>/index.html) for
    every .xd puzzle given on the command line.

    For each clue of each puzzle, the page shows: prior uses of the same clue,
    alternative answers seen for it, and other clues seen for the same answer.
    """
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()
    # NOTE(review): result unused in this function, but parse_tsv reads the
    # file (side effect / sanity check), so the call is kept
    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xds_todo = []
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        xds_todo.append(xd)

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        # NOTE(review): matches and xddates are computed but never read below
        matches = metadb.xd_similar(mainxdid)
        xddates = {}
        xddates[mainxdid] = mainxd.date()  # Dict to store XD dates for further sort
        # FIX: removed unused 'html_grids = {}' local

        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        # table header for the deep-clues table
        dcl_html = '<tr>'
        dcl_html += '<th></th>'
        dcl_html += '<th>Clue</th>'
        dcl_html += '<th>ANSWERs</th>'
        dcl_html += '<th>Alt. clue possibilities</th>'
        dcl_html += '</tr>'

        # FIX: removed dead pre-loop 'deepcl_html = []' — it was unconditionally
        # rebound at the top of every loop iteration and never read outside it
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue
            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)
            deepcl_html = []  # cells of this clue's table row
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            deepcl_html.append('<td class="other-uses">')
            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                deepcl_html.append('<a href="/pub/clue/%s">%s [x%s]</a>' % (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                deepcl_html.append(mainclue)
            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca, add_total=False))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')
            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                deepcl_html.append(other_clues)
                nstaleanswers += 1
            deepcl_html.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Process deepclues: grid image on the left, clue table on the right
        diff_h = '<div class="main-container">'
        diff_h += grid_to_html(mainxd)
        diff_h += mktag('table', 'deepclues') + dcl_html + mktag('/table')
        diff_h += '</div>'

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h, title='Deep clue analysis for ' + mainxdid)
def pubyear_html(pub, year):
    """Render the /pub/<pub><year> page body: a header row, a GridCalendar for
    each month in the span, and one clickable row per puzzle that has similar
    earlier puzzles.  'year' is either a year ('1994') or a decade ('1990s')."""
    chunks = ['<table class="puzzles">']
    colnames = [year] + pubyear_header
    chunks.append(html.table_row(colnames, colnames, tag='th'))

    # Similarity cells keyed by the later puzzle's date (YYYY-MM-DD).
    similar_by_date = {}
    for row in sorted(metadb.xd_similar(pub + year)):
        date_a = utils.parse_iso8601(row.xdid)
        date_b = utils.parse_iso8601(row.match_xdid)
        # skip unparseable pairs, and keep only the later puzzle of each pair
        if not (date_a and date_b) or date_a < date_b:
            continue
        # the cell is created even for 0% matches so the date still renders
        cell = similar_by_date.setdefault(date_a, {'title': '', 'class': ''})
        if row.match_pct == 0:
            continue
        cell['link'] = '/pub/' + row.xdid
        author_a = metadb.get_author(row.xdid) or ''
        author_b = metadb.get_author(row.match_xdid) or ''
        cell["title"] += '(%s%%) %s [%s]\n' % (row.match_pct, author_b, row.match_xdid)
        cell["class"] += ret_classes(author_a, author_b, row.match_pct)

    # Dates that have a puzzle but no similarity entry get a plain style.
    plain_by_date = {}
    for row in metadb.xd_puzzles(pub + year):
        d = row.Date
        if d and d not in plain_by_date and d not in similar_by_date:
            plain_by_date[d] = {
                'title': '',
                'class': 'privxd' if int(d[:4]) > 1965 else 'pubxd',
            }

    # Calendars draw from the union of both date maps.
    all_dates = similar_by_date.copy()
    all_dates.update(plain_by_date)

    if year[-1] == 's':  # decade page spans ten years
        from_year = int(year[:4])
        to_year = from_year + 10
    else:
        from_year = int(year)
        to_year = from_year + 1

    for yr in range(from_year, to_year):
        for month in range(1, 13):
            prefix = "%s-%02d" % (yr, month)
            dup_rows = []
            for dt, cell in similar_by_date.items():
                if not dt.startswith(prefix):
                    continue
                xdid = pub + dt
                puzmd = metadb.xd_puzzle(xdid)
                if not puzmd:
                    continue
                dup_rows.append({
                    'class': cell['class'],
                    'tag_params': {
                        'onclick': 'location.href=\'/pub/%s\'' % xdid,
                        'class': cell['class'] + ' hrefrow puzrow',
                    },
                    'row': [
                        xdid, puzmd.Date, puzmd.Size, puzmd.Title,
                        puzmd.Author, puzmd.Editor, puzmd.Copyright,
                        puzmd.A1_D1, cell["title"].replace("\n", "<br/>")
                    ],
                })
            # the calendar cell spans itself plus one row per duplicate
            chunks.append('<tr class="calendar"><td class="calendar" rowspan="%s">' % (len(dup_rows) + 1))
            chunks.append(html.GridCalendar(all_dates).formatmonth(int(yr), month) + '</td></tr>')
            for r in sorted(dup_rows, key=lambda x: x['row'][1]):  # by Date
                chunks.append(html.table_row(r["row"], pubyear_header, tag_params=r['tag_params']))

    chunks.append('</table>')
    return '''%s <div class="calendars">%s</div> <hr/>''' % (legend, ''.join(chunks))
def main():
    """Tally per-(publication, year, weekday) stats over the whole corpus and
    write one row per combination into the "pub/stats" metadb table."""
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    # (publication_id, year) -> list of xd puzzles in that bucket
    pubyears = {}  # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    # rebuild from scratch: clear all old stats rows before appending new ones
    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        # bucket similarity rows (>= 25% match) by weekday of the source puzzle
        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0
            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                # puzzles through 1965, or explicitly listed ids, count as public
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1
                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1
                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1
                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # Process counter comparing with comp_value: return the most
                # common item, annotated with its count unless it accounts for
                # all puzzles (num == comp_value); '' for an empty counter.
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            # dominant editor/copyright/format for this weekday
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0  # NOTE(review): never incremented below — always written as 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue
                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                # ISO date strings compare correctly lexicographically; classify
                # only when the matched puzzle predates this one so each pair is
                # counted once
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats", (pubid, year, weekday, mainformat, maineditor, maincopyright, nexisting, nxd, npublic, reprints, touchups, redones, copies, themecopies))