def main(filename):
    """Print the puzzle stored in ``filename`` as a JSON object.

    The object has ``title`` and ``by`` (taken from the Title/Author headers,
    ``'unknown'`` when absent) and ``clues``, a list of records holding the
    direction ``d``, number ``n``, grid position ``x``/``y``, answer ``a`` and
    clue text ``c``.
    """
    # Read through a context manager so the handle is closed deterministically.
    with open(filename) as fp:
        xd = xdfile(fp.read())

    title = 'unknown'
    author = 'unknown'
    for h in xd.headers:
        if h[0] == 'Title':
            title = h[1]
        elif h[0] == 'Author':
            author = h[1]

    puzzle = {'title': title, 'by': author}
    number_index = number_grid(xd.grid)

    cluelist = []
    for (d, n), c, a in xd.clues:
        n = int(n)
        # The position table is indexed by clue number minus one.
        xy = number_index[n - 1]
        cluelist.append({'d': d, 'n': n, 'x': xy[0], 'y': xy[1], 'a': a, 'c': c})
    puzzle['clues'] = cluelist

    # A single-argument print() call behaves the same on Python 2 and 3.
    print(json.dumps(puzzle))
def parse_ujson(content):
    """Build an xdfile from the JSON text of a ujson puzzle.

    Headers come from the POSSIBLE_META_DATA keys present in the document,
    grid rows from ``Solution['Line1']`` .. ``Solution['Line<Height>']`` and
    clues from the ``AcrossClue`` / ``DownClue`` strings, which hold one
    ``number|text`` entry per line.
    """
    data = json.loads(content)

    # init crossword
    height = int(data['Height'])
    xd = xdfile.xdfile()

    # add meta data
    for key in POSSIBLE_META_DATA:
        value = data.get(key, None)
        if value:
            xd.headers.append((key, unquote(value).decode("utf-8")))

    # add puzzle: spaces in a solution line become block cells
    for row_number in range(1, height + 1):
        raw_row = data['Solution']['Line' + str(row_number)]
        xd.grid.append("".join(raw_row.replace(' ', xdfile.BLOCK_CHAR)))

    # add clues
    layout = data['Layout']
    for clue_type in ('Across', 'Down'):
        entries = data[clue_type + 'Clue'].split(os.linesep)
        for entry in entries:
            number, text = entry.split('|')
            solution = _get_solution(number, clue_type[0], layout, xd.grid)
            position = (clue_type[0], int(number))
            clue_text = unquote(text).decode("utf-8").strip()
            xd.clues.append((position, clue_text, solution))
            assert solution

    return xd
def main():
    """Reclue each input grid with the clues of every other publication.

    For each .xd input that has a grid, the identifying headers are replaced
    and the grid is reclued once per publication other than its own.  Fully
    reclued grids are mutated, reclued again and written out; the rest are
    recorded in ``remix.tsv``.  The log is written to ``remix.log``.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()
    all_clues = load_clues()

    # TSV lines are collected here and joined once at the end.
    tsv_lines = [
        COLUMN_SEPARATOR.join(["grid_xdid", "clues_pubid", "num_missing"]) + EOL
    ]

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        fake_author = "%s %s" % (random.choice(fake_first),
                                 random.choice(fake_last))
        xd.set_header("Author", fake_author)
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing != 0:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    tsv_lines.append(
                        COLUMN_SEPARATOR.join(
                            [xd.xdid(), pubid, str(nmissing)]) + EOL)
                    continue

                # Every clue was replaced: mutate until at least 100
                # mutations have been applied, then reclue once more.
                nmutated = 0
                while nmutated < 100:
                    nmutated += mutate(xd, pub_clues)
                nmissing = reclue(xd, pub_clues)
                info("%s missing %d clues after %d mutations"
                     % (outfn, nmissing, nmutated))

                remixed.add(pubid)
                outf.write_file(outfn, xd.to_unicode())
            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(parse_pathname(fn).base + ".xd",
                                contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", "".join(tsv_lines))
def main(fn):
    """Print the .xd puzzle at ``fn`` as an ipuz JSON document on stdout."""
    with open(fn, 'r') as fp:
        xd = xdfile.xdfile(fp.read(), fn)

    ipuz = {
        "version": "http://ipuz.org/v1",
        "kind": ["http://ipuz.org/crossword#1"],
        "dimensions": {"width": xd.width(), "height": xd.height()},
        "title": '',
    }
    # Headers are copied in with lower-cased names and may override "title".
    for name, value in xd.headers.items():
        ipuz[name.lower()] = value

    # Numbering grid: None everywhere except where an answer starts.
    numbering = [[None] * xd.width() for _ in range(xd.height())]
    for _direction, cluenum, _answer, r, c in xd.iteranswers_full():
        numbering[r][c] = cluenum
    ipuz["puzzle"] = numbering

    def clues_starting_with(letter):
        # (pos[1], html clue) pairs for positions beginning with ``letter``.
        return [(pos[1], markup_to_html(clue))
                for pos, clue, _answer in xd.iterclues()
                if pos.startswith(letter)]

    ipuz["clues"] = {
        "Across": clues_starting_with('A'),
        "Down": clues_starting_with('D'),
    }

    ipuz["solution"] = list(map(list, xd.grid))

    print(json.dumps(ipuz))
def xd_clues(filename):
    """Print a placeholder clue line for every across and down answer.

    Cells are numbered in reading order.  A non-``#`` cell starts an across
    light when its left neighbour is the edge or ``#`` and its right neighbour
    is a non-``#`` cell; down lights are detected the same way vertically.
    Output lines look like ``A1. xxx ~ ANSWER``; all across entries are
    printed first, then all down entries.
    """
    # Read through a context manager so the handle is closed deterministically.
    with open(filename) as fp:
        xd = xdfile(fp.read())

    grid = xd.grid
    if not grid:
        # Nothing to number; avoids an IndexError on grid[0].
        return
    maxx = len(grid[0])
    maxy = len(grid)

    next_n = 1
    across = []
    down = []
    for y in range(maxy):
        for x in range(maxx):
            light = grid[y][x] != '#'
            start_of_xlight = (light and
                               (x == 0 or grid[y][x - 1] == '#') and
                               (x + 1 < maxx and grid[y][x + 1] != '#'))
            start_of_ylight = (light and
                               (y == 0 or grid[y - 1][x] == '#') and
                               (y + 1 < maxy and grid[y + 1][x] != '#'))
            if start_of_xlight or start_of_ylight:
                if start_of_xlight:
                    across.append((next_n, answer_at(grid, (x, y), 'A')))
                if start_of_ylight:
                    down.append((next_n, answer_at(grid, (x, y), 'D')))
                # One number is shared by an across and a down light that
                # start in the same cell.
                next_n += 1

    # Single-argument print() calls behave the same on Python 2 and 3.
    for n, ans in across:
        print('A%d. xxx ~ %s' % (n, ans))
    for n, ans in down:
        print('D%d. xxx ~ %s' % (n, ans))
def main():
    """Clean the headers of every input .xd puzzle and update its metadata row."""
    args = utils.get_args(desc='outputs cleaned puzzle metadata rows')

    for source in args.inputs:
        xd_files = utils.find_files(source, ext='.xd')
        for path, raw in xd_files:
            text = raw.decode('utf-8')
            puzzle = xdfile.xdfile(text, path)
            clean_headers(puzzle)
            metadb.update_puzzles_row(puzzle)
def parse_uxml(content, filename):
    """Parse a uxml crossword document into an xdfile.

    ``content`` is decoded as UTF-8, then cp1252, and otherwise used as given.
    Known defects of the format are patched up textually before parsing; if
    the document still fails to parse, the first complete top-level element
    found by regex is parsed instead.

    Returns an xdfile whose headers, grid rows (``Width`` cells each, ``-``
    mapped to ``xdfile.BLOCK_CHAR``) and clues are taken from the
    ``//crossword`` element.
    """
    POSSIBLE_META_DATA = ['Title', 'Author', 'Editor', 'Copyright', 'Category']

    try:
        content = content.decode("utf-8")
    except (UnicodeDecodeError, AttributeError):
        try:
            content = content.decode("cp1252")
        except (UnicodeDecodeError, AttributeError):
            pass  # last ditch effort, just try the original string

    # NOTE(review): as received, several of these substitutions replaced a
    # string with itself (or inserted a raw double quote into an attribute),
    # which cannot repair the XML.  They now emit the matching XML entities;
    # confirm against the upstream version of this parser.
    # '&' must be escaped first so the entities added below are left alone.
    content = content.replace("&", "&amp;")
    content = content.replace('"<"', '"&lt;"')
    content = content.replace("''", '&quot;')
    content = content.replace("\x12", "'")  # ^R seems to be '
    content = content.replace("\x05", "'")  # ^E seems to be junk
    content = re.sub(r'=""(\S)', r'="&quot;\1', content)  # one case has c=""foo"".  sheesh
    content = re.sub(r'(\.)""', r'\1&quot;"', content)

    try:
        root = etree.fromstring(content)
    except Exception:
        # Retry with only the first complete top-level element.
        xml = re.search(r"<(\w+).*?</\1>", content, flags=re.DOTALL).group()
        root = etree.fromstring(xml)

    # init crossword
    cols = int(root.xpath('//crossword/Width')[0].attrib['v'])
    xd = xdfile.xdfile()

    # add meta data: a missing element or missing/empty 'v' is skipped
    for item in POSSIBLE_META_DATA:
        elems = root.xpath('//crossword/' + item)
        if not elems:
            continue
        text = elems[0].attrib.get('v')
        if text:
            xd.headers.append((item, unquote(text)))

    # add puzzle
    all_answers = root.xpath('//crossword/AllAnswer')[0].attrib['v']
    all_answers = all_answers.replace('-', xdfile.BLOCK_CHAR)
    # range() rejects a zero step, so a Width of 0 fails fast instead of
    # looping forever.
    for index in range(0, len(all_answers), cols):
        xd.grid.append(u"".join(all_answers[index:index + cols]))

    # add clues (iterating the element yields its children directly)
    for clue_type in ('across', 'down'):
        for clue in root.xpath('//crossword/' + clue_type)[0]:
            number = int(clue.attrib['cn'])
            text = udecode(clue.attrib['c'].strip())
            solution = clue.attrib['a'].strip()
            xd.clues.append(((clue_type[0].upper(), number), text, solution))

    return xd
def main():
    """Write the clue index page and one page per selected boiled clue.

    Pages are made for every boiled clue of the input .xd files, for the 100
    most used boiled clues and for the 100 boiled clues with the most distinct
    answers.  The index goes to ``pub/clue/index.html``.
    """
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()

    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (
        len(clues()), len(boiled_clues))

    bcs = [(len(v), bc, answers_from(v)) for bc, v in boiled_clues.items()]

    nreused = len([bc for n, bc, _ in bcs if n > 1])
    # An empty corpus has nothing to take a percentage of; report 0% rather
    # than raising ZeroDivisionError.
    pct_reused = nreused * 100 / len(boiled_clues) if boiled_clues else 0
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (
        nreused, pct_reused)

    cluepages_to_make = set()

    # add all boiled clues from all input .xd files
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        progress(fn)
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        for pos, mainclue, mainanswer in xd.iterclues():
            cluepages_to_make.add(boil(mainclue))

    # add top 100 most used boiled clues from corpus
    biggest_clues += '<h2>Most used clues</h2>'
    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")
    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))
    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")
    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        # Rows whose clue mentions one of these words get the "theme" class.
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))
    most_ambig += '</table>'

    for bc in cluepages_to_make:
        contents = mkwww_cluepage(bc)
        if contents:
            outf.write_html('pub/clue/%s/index.html' % bc, contents, title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
def parse_uxml(content, filename):
    """Parse a uxml crossword document into an xdfile named ``filename``.

    ``content`` is decoded as UTF-8, then cp1252, and otherwise used as given.
    It is passed through ``escape(..., xml_escape_table)`` before parsing, and
    extracted header and clue text is passed back through
    ``escape(..., rev_xml_escape_table)``.  If the document fails to parse,
    the first complete top-level element found by regex is parsed instead.
    """
    POSSIBLE_META_DATA = ['Title', 'Author', 'Editor', 'Copyright', 'Category']

    try:
        content = content.decode("utf-8")
    except (UnicodeDecodeError, AttributeError):
        try:
            content = content.decode("cp1252")
        except (UnicodeDecodeError, AttributeError):
            pass  # last ditch effort, just try the original string

    content = escape(content, xml_escape_table)
    # Replace double quotes: an attribute written as c=""foo"" becomes
    # c="&quot;foo&quot;" so it is well-formed XML.
    # NOTE(review): as received, the replacement reproduced the matched text
    # unchanged; the entity form is restored here - confirm against upstream.
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'="&quot;\2&quot;"', content)

    try:
        root = etree.fromstring(content.encode("utf-8"))
    except Exception:  # TODO: catch the specific exception
        # Retry with only the first complete top-level element.
        xml = re.search(r"<(\w+).*?</\1>", content, flags=re.DOTALL).group()
        root = etree.fromstring(xml)

    # init crossword
    cols = int(root.xpath('//crossword/Width')[0].attrib['v'])
    xd = xdfile.xdfile('', filename)

    # add meta data
    for item in POSSIBLE_META_DATA:
        elem = root.xpath('//crossword/' + item)
        if elem:
            text = elem[0].attrib['v']
            if text:
                text = escape(text, rev_xml_escape_table)
                xd.set_header(item, unquote(text))

    # add puzzle
    all_answers = root.xpath('//crossword/AllAnswer')[0].attrib['v']
    all_answers = all_answers.replace('-', xdfile.BLOCK_CHAR)
    # range() rejects a zero step, so a Width of 0 fails fast instead of
    # looping forever.
    for index in range(0, len(all_answers), cols):
        xd.grid.append("".join(all_answers[index:index + cols]))

    # add clues (iterating the element yields its children directly)
    for clue_type in ('across', 'down'):
        for clue in root.xpath('//crossword/' + clue_type)[0]:
            number = int(clue.attrib['cn'])
            text = udecode(clue.attrib['c'].strip())
            text = escape(text, rev_xml_escape_table)
            solution = clue.attrib['a'].strip()
            xd.clues.append(((clue_type[0].upper(), number), text, solution))

    return xd
def parse_uxml(content, filename):
    """Parse a uxml crossword document into an xdfile named ``filename``.

    ``content`` is decoded as UTF-8, then cp1252, and otherwise used as given.
    It is passed through ``escape(..., xml_escape_table)`` before parsing, and
    extracted header and clue text is passed back through
    ``escape(..., rev_xml_escape_table)``.  If the document fails to parse,
    the first complete top-level element found by regex is parsed instead.
    """
    POSSIBLE_META_DATA = ["Title", "Author", "Editor", "Copyright", "Category"]

    try:
        content = content.decode("utf-8")
    except (UnicodeDecodeError, AttributeError):
        try:
            content = content.decode("cp1252")
        except (UnicodeDecodeError, AttributeError):
            pass  # last ditch effort, just try the original string

    content = escape(content, xml_escape_table)
    # Replace double quotes: an attribute written as c=""foo"" becomes
    # c="&quot;foo&quot;" so it is well-formed XML.
    # NOTE(review): as received, the replacement reproduced the matched text
    # unchanged; the entity form is restored here - confirm against upstream.
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'="&quot;\2&quot;"', content)

    try:
        root = etree.fromstring(content.encode("utf-8"))
    except Exception:  # TODO: catch the specific exception
        # Retry with only the first complete top-level element.
        xml = re.search(r"<(\w+).*?</\1>", content, flags=re.DOTALL).group()
        root = etree.fromstring(xml)

    # init crossword
    cols = int(root.xpath("//crossword/Width")[0].attrib["v"])
    xd = xdfile.xdfile("", filename)

    # add meta data
    for item in POSSIBLE_META_DATA:
        elem = root.xpath("//crossword/" + item)
        if elem:
            text = elem[0].attrib["v"]
            if text:
                text = escape(text, rev_xml_escape_table)
                xd.set_header(item, unquote(text))

    # add puzzle
    all_answers = root.xpath("//crossword/AllAnswer")[0].attrib["v"]
    all_answers = all_answers.replace("-", xdfile.BLOCK_CHAR)
    # range() rejects a zero step, so a Width of 0 fails fast instead of
    # looping forever.
    for index in range(0, len(all_answers), cols):
        xd.grid.append("".join(all_answers[index : index + cols]))

    # add clues (iterating the element yields its children directly)
    for clue_type in ("across", "down"):
        for clue in root.xpath("//crossword/" + clue_type)[0]:
            number = int(clue.attrib["cn"])
            text = udecode(clue.attrib["c"].strip())
            text = escape(text, rev_xml_escape_table)
            solution = clue.attrib["a"].strip()
            xd.clues.append(((clue_type[0].upper(), number), text, solution))

    return xd
def main():
    """Remix input grids with clues borrowed from other publications.

    Each .xd input with a grid gets replacement Title/Editor/Author/
    Copyright/Date headers and is reclued against every publication except
    its own.  A grid with no missing clues is mutated, reclued again and
    written out; otherwise the miss count is added to ``remix.tsv``.  The
    log is written to ``remix.log``.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()
    all_clues = load_clues()

    def tsv_line(fields):
        # One row of remix.tsv, terminator included.
        return COLUMN_SEPARATOR.join(fields) + EOL

    tsv_parts = [tsv_line(["grid_xdid", "clues_pubid", "num_missing"])]

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        first_name = random.choice(fake_first)
        last_name = random.choice(fake_last)
        xd.set_header("Author", "%s %s" % (first_name, last_name))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    message = "%s missing %d clues after %d mutations" % (
                        outfn, nmissing, nmutated)
                    info(message)

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    tsv_parts.append(
                        tsv_line([xd.xdid(), pubid, str(nmissing)]))
            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                original_name = parse_pathname(fn).base + ".xd"
                outf.write_file(original_name, contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", "".join(tsv_parts))
def main():
    """Write one similarity row for each corpus grid similar to an input puzzle."""
    args = get_args(desc="find similar grids")

    # Materialise the corpus once so it can be searched for every input.
    g_corpus = list(corpus())

    outf = open_output()
    outf.write(xd_similar_header)

    for path, raw in find_files(*args.inputs, strip_toplevel=False):
        needle = xdfile(raw.decode("utf-8"), path)
        matches = find_similar_to(needle, g_corpus)
        for pct, first, second in matches:
            outf.write(xd_similar_row(first, second, pct))
def xd_to_puz(filename, filename_out):
    """Convert the .xd puzzle at ``filename`` and save it to ``filename_out``.

    Title and author come from the Title/Author headers (``'unknown'`` when
    absent).  ``#`` grid cells become blocks; every other cell gets its grid
    character as the solution.  Each clue is printed as it is copied.
    """
    # Read through a context manager so the handle is closed deterministically.
    with open(filename) as fp:
        xd = xdfile(fp.read())

    grid = xd.grid
    maxx = len(grid[0])
    maxy = len(grid)

    puzzle = crossword.Crossword(maxx, maxy)

    title = 'unknown'
    author = 'unknown'
    for h in xd.headers:
        if h[0] == 'Title':
            title = h[1]
        if h[0] == 'Author':
            author = h[1]
    puzzle.meta.creator = author
    puzzle.meta.title = title

    for (d, n), c, a in xd.clues:
        if d == 'A':
            puzzle.clues.across[int(n)] = c
        else:
            puzzle.clues.down[int(n)] = c

    for direction, number, clue in puzzle.clues.all():
        print(direction, number, clue)

    for y in range(maxy):
        for x in range(maxx):
            ch = grid[y][x]
            if ch != '#':
                puzzle[y][x].cell = " "
                puzzle[y][x].solution = ch
            else:
                puzzle[y][x].cell = "."
                puzzle[y][x].block = None
                puzzle[y][x].solution = None

    puz = crossword.to_puz(puzzle)
    # Player fill: keep '.' where the solution has it, '-' everywhere else.
    puz.fill = ''.join([x if x == '.' else '-' for x in puz.solution])
    puz.save(filename_out)
def main():
    """Print the potential grid and per-answer matches for every input puzzle.

    Answers are listed in ascending order of match count; the matching words
    themselves are shown only when there are fewer than ten.
    """
    args = utils.get_args(desc='show grid potentials')
    wordlist = grid_potentials.get_wordlist()

    def match_count(item):
        # item is (key, (pattern, matches))
        return len(item[1][1])

    for source in args.inputs:
        for path, raw in xdfile.utils.find_files(source, ext='.xd'):
            xd = xdfile.xdfile(raw.decode('utf-8'), path)
            pots, answers = grid_potentials.get_potentials(xd, wordlist)
            print_potential_grid(xd, pots)

            for key, (pattern, matches) in sorted(answers.items(), key=match_count):
                if len(matches) >= 10:
                    print(key, pattern, len(matches))
                else:
                    print(key, pattern, len(matches), ' '.join(matches))
def main():
    """Fit every input puzzle to every distinct grid template in the corpus.

    A template is a grid with each non-block cell replaced by UNKNOWN_CHAR.
    Each successful fit is written to ``<args.output>-t<i>.xd`` where ``i``
    is the template's position in the iteration over the template set.
    """
    args = utils.get_args(desc='find grid templates')

    def template_of(grid):
        # Keep block cells, blank out everything else.
        rows = []
        for line in grid:
            cells = [cell if cell == BLOCK_CHAR else UNKNOWN_CHAR for cell in line]
            rows.append(''.join(cells))
        return tuple(rows)

    templates = set()
    for corpus_xd in xdfile.corpus():
        templates.add(template_of(corpus_xd.grid))

    print(len(templates), 'templates')

    for source in args.inputs:
        for path, raw in utils.find_files(source, ext='.xd'):
            xd = xdfile.xdfile(raw.decode('utf-8'), path)
            for index, template in enumerate(templates):
                fitted = fit_template(template, xd)
                if not fitted:
                    continue
                out_path = args.output + ('-t%s.xd' % index)
                with open(out_path, 'w') as fp:
                    fp.write(fitted.to_unicode())
def main():
    """Print potential statistics per puzzle, then a list sorted by mean.

    For each input .xd with at least one non-empty potential cell, a line
    with the mean, population standard deviation and population variance of
    the cell potential sizes, plus the smallest answer match count, is
    printed.  After all inputs, the same puzzles are listed again in
    descending order of mean.
    """
    global args
    args = xdfile.utils.get_args(desc='show sorted list of grid potentials')
    wordlist = get_wordlist()

    print("filename mean stdev var min_answer")

    # Accumulated across ALL inputs.  This used to be re-created for every
    # file, so the sorted listing below never held more than the last puzzle.
    all_pots = []

    for input_source in args.inputs:
        for fn, contents in xdfile.utils.find_files(input_source, ext='.xd'):
            xd = xdfile.xdfile(contents.decode('utf-8'), fn)
            pots, answers = get_potentials(xd, wordlist)

            unfixed = [len(ch) for row2 in pots for ch in row2 if ch]
            if not unfixed:
                # No statistics can be computed from an empty sample.
                continue

            all_pots.append((xd, pots, answers, unfixed))
            matches_list = [x[1] for x in answers.values()]
            print("%s %.02f %.02f %.02f %d" % (
                xd, mean(unfixed), pstdev(unfixed), pvariance(unfixed),
                min(len(x) for x in matches_list)))

    print('\n--')
    for xd, pots, answers, unfixed in sorted(all_pots, key=lambda r: mean(r[3]), reverse=True):
        print("%20s %.02f %.02f %.02f" % (
            xd, mean(unfixed), pstdev(unfixed), pvariance(unfixed)))
def main():
    """Write a deep-clue analysis page for every input puzzle.

    Each page holds the puzzle grid and a table with one row per clue: its
    position, the clue (linked when ``prev_uses`` reports earlier uses), the
    other answers seen for the clue, and other clues for the answer.  Pages
    are written to ``pub/deep/<xdid>/index.html``.
    """
    args = utils.get_args(
        'generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xds_todo = [xdfile.xdfile(contents.decode('utf-8'), fn)
                for fn, contents in find_files(*args.inputs, ext='.xd')]

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        matches = metadb.xd_similar(mainxdid)

        # Dict to store XD dates for further sort
        xddates = {mainxdid: mainxd.date()}
        html_grids = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        # Table markup is collected piecewise and joined once at the end.
        table_parts = [
            '<tr>',
            '<th></th>',
            '<th>Clue</th>',
            '<th>ANSWERs</th>',
            '<th>Alt. clue possibilities</th>',
            '</tr>',
        ]

        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue

            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            cells = ['<td class="pos">%s.</td>' % pos]

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                else:
                    pub_uses.setdefault(clueans.pubid, set()).add(clueans)

            # 'other uses' column
            cells.append('<td class="other-uses">')
            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                cells.append('<a href="/pub/clue/%s">%s [x%s]</a>'
                             % (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                cells.append(mainclue)
            cells.append('</td>')

            # 'other answers' column
            cells.append('<td class="other-answers">')
            cells.append(html_select_options(poss_answers,
                                             strmaker=lambda ca: ca.answer,
                                             force_top=mainca,
                                             add_total=False))
            cells.append('</td>')

            # 'other clues' column
            cells.append('<td class="other-clues">')
            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                cells.append(other_clues)
                nstaleanswers += 1
            cells.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # Quick and dirty - to be replaced
            table_parts.append('<tr>' + ' '.join(cells) + '</tr>')

        dcl_html = ''.join(table_parts)

        # Process deepclues
        diff_h = ''.join([
            '<div class="main-container">',
            grid_to_html(mainxd),
            mktag('table', 'deepclues') + dcl_html + mktag('/table'),
            '</div>',
        ])

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h,
                        title='Deep clue analysis for ' + mainxdid)
def xd_lint(filename):
    """
    Check some rules about xd files:

    - Filling in the grid using answers alone results in the same grid
    - All numbered locations have corresponding clues
    - All clues have answers

    Every problem is printed prefixed with ``filename``.  The process exits
    with status 1 if any problem was reported and with status 0 otherwise.
    """
    error = False
    # Read through a context manager so the handle is closed deterministically.
    with open(filename) as fp:
        xd = xdfile(fp.read())

    grid = xd.grid
    maxx = len(grid[0])
    maxy = len(grid)

    # Starts as all blocks and is filled in from the answers below.
    filled = [['#'] * maxx for _ in range(maxy)]

    direction = {'A': 'across', 'D': 'down'}

    across = {}
    down = {}
    for (d, n), c, a in xd.clues:
        if d == 'A':
            across[int(n)] = (c, a)
        else:
            down[int(n)] = (c, a)
        if not c:
            print('%s: error: no clue provided for %s %s' % (filename, n, direction[d]))
            error = True
        if not a:
            print('%s: error: no answer provided for %s %s' % (filename, n, direction[d]))
            error = True

    # Number the grid in reading order and check every light has a clue.
    number_index = {}
    next_n = 1
    for y in range(maxy):
        for x in range(maxx):
            light = grid[y][x] != '#'
            start_of_xlight = (light and
                               (x == 0 or grid[y][x - 1] == '#') and
                               (x + 1 < maxx and grid[y][x + 1] != '#'))
            start_of_ylight = (light and
                               (y == 0 or grid[y - 1][x] == '#') and
                               (y + 1 < maxy and grid[y + 1][x] != '#'))
            if start_of_xlight and not across.get(next_n):
                print('%s: error: missing clue for %d %s' % (filename, next_n, direction['A']))
                error = True
            if start_of_ylight and not down.get(next_n):
                print('%s: error: missing clue for %d %s' % (filename, next_n, direction['D']))
                error = True
            if start_of_xlight or start_of_ylight:
                number_index[next_n] = (x, y)
                next_n += 1

    # Write every answer into the blank grid at its numbered position.
    for (d, n), c, a in xd.clues:
        n = int(n)
        if n not in number_index:
            print('%s: error: clue %s %s does not correspond to a grid location' % (
                filename, n, direction[d]))
            error = True
            continue
        x, y = number_index[n]
        for i, letter in enumerate(a):
            xp, yp = x, y
            if d == 'A':
                xp = x + i
            else:
                yp = y + i
            if xp >= maxx or yp >= maxy:
                print('%s: error: clue %s %s extends beyond the grid' % (
                    filename, n, direction[d]))
                # This report used to leave the error flag untouched, so the
                # run could still end with "All checks passed."  Report once
                # per clue; every later letter overflows as well.
                error = True
                break
            filled[yp][xp] = letter

    filled = [''.join(x) for x in filled]
    for i, line in enumerate(filled):
        if line != grid[i]:
            print('%s: error: grids do not match on line %d' % (filename, i + 1))
            print('line: %s' % line)
            print('grid: %s' % grid[i])
            error = True
            break

    if error:
        sys.exit(1)

    print('All checks passed.')
    sys.exit(0)
def xd_to_html(filename, answers=False):
    """Print an HTML page rendering the .xd puzzle stored in ``filename``.

    The page has the across and down clue lists and a grid of 28px cells
    numbered in reading order.  When ``answers`` is true each light cell
    also contains its solution letter.  The page script is initialised with
    ``filename`` minus its extension.
    """
    import os  # local import: only needed for the extension split below

    # Read through a context manager so the handle is closed deterministically.
    with open(filename) as fp:
        xd = xdfile(fp.read())

    title = 'unknown'
    # splitext keeps dots in directory names; split(".")[0] turned a path
    # such as "./puz.xd" into an empty string.
    filename_noext = os.path.splitext(filename)[0]
    for h in xd.headers:
        if h[0] == 'Title':
            title = h[1]

    # Fragments are collected and joined once instead of growing one string.
    parts = [
        ''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"'''
        ''' "http://www.w3.org/TR/REC-html40/loose.dtd">'''
        ''' <html> <head> <title>''' + title + '''</title>'''
        ''' <link rel="stylesheet" type="text/css" href="style.css"/>'''
        ''' <script type="text/javascript" src="lib/jquery.min.js"></script>'''
        ''' <script type="text/javascript" src="lib/jquery.hotkeys.js"></script>'''
        ''' <script type="text/javascript" src="lib/jquery.cookie.js"></script>'''
        ''' <script type="text/javascript" src="xd.js"></script>'''
        ''' <script type="text/javascript" src="crossword.js"></script>'''
        ''' <script type="text/javascript">'''
        ''' $(function() { var crossword;'''
        ''' crossword = new Crossw1rd('container');'''
        ''' crossword.init("''' + filename_noext + '''"); });'''
        ''' </script> </head> <body> <div id="container"> '''
    ]

    grid = xd.grid
    maxx = len(grid[0])
    maxy = len(grid)

    # Floor division keeps the pixel sizes integral on Python 3 as well.
    pane_height = maxy * 28 // 2 - 20

    parts.append('''<div id="cross1wrd" style="height:%spx;width:%spx">''' % (
        maxy * 28 + 6, 200 + maxx * 28 + 20))

    parts.append('''<div class="clues" style="height:%spx;width:%spx;">''' % (
        maxy * 28, 200))
    parts.append('''<h4 class="cluelabel">Across</h4>''')
    parts.append('''<div class="across scroll-pane" style="height:%spx;">''' % (
        pane_height))
    for (d, n), c, a in xd.clues:
        if d == 'A':
            parts.append('''<p class="c%s%s">%s. %s</p>''' % (d, n, n, c))
    parts.append('''</div>''')
    parts.append('''<h4 class="cluelabel">Down</h4>''')
    parts.append('''<div class="down scroll-pane" style="height:%spx;">''' % (
        pane_height))
    for (d, n), c, a in xd.clues:
        if d == 'D':
            parts.append('''<p class="c%s%s">%s. %s</p>''' % (d, n, n, c))
    parts.append('''</div>''')
    parts.append('''</div>''')

    parts.append('''<div class="grid" style="height:%spx;width:%spx;">\n''' % (
        maxy * 28, maxx * 28))

    next_n = 1
    for y in range(maxy):
        parts.append('''<div class="row">''')
        for x in range(maxx):
            light = grid[y][x] != '#'
            start_of_xlight = (light and
                               (x == 0 or grid[y][x - 1] == '#') and
                               (x + 1 < maxx and grid[y][x + 1] != '#'))
            start_of_ylight = (light and
                               (y == 0 or grid[y - 1][x] == '#') and
                               (y + 1 < maxy and grid[y + 1][x] != '#'))
            num = ""
            if start_of_xlight or start_of_ylight:
                num = next_n
                next_n += 1

            letter_span = ""
            if answers and light:
                letter_span = '''<span class="letter">%s</span>''' % (
                    grid[y][x])
            parts.append('''<div%s><span class="num">%s</span>%s</div>''' % (
                ' class="blank"' if not light else "", num, letter_span))
        parts.append('''</div>\n''')
    parts.append('''</div>''')

    parts.append('''</div></div></body></html>''')

    # A single-argument print() call behaves the same on Python 2 and 3.
    print(''.join(parts))
def parse_xwordinfo(content, filename):
    """Parse an XWord Info HTML page (UTF-8 bytes) into an xdfile.

    Sets the Title, Author, Editor and Copyright headers, a Date header when
    the subtitle or title contains a date such as "January 1, 2020", and
    Rebus/Special headers when the grid uses rebus cells or shaded/circled
    cells.  Clues are added by ``_process_clues`` or ``_process_uniclues``.

    Raises XWordInfoParseError when no clue container is found.
    """
    content = content.decode('utf-8')

    REBUS_LONG_HANDS = {
        'NINE': '9',
        'EIGHT': '8',
        'SEVEN': '7',
        'SIX': '6',
        'FIVE': '5',
        'FOUR': '4',
        'THREE': '3',
        'TWO': '2',
        'ONE': '1',
        'ZERO': '0',
        'AUGHT': '0',
        'AMPERSAND': '&',
        'AND': '&',
        'ASTERISK': '*',
        'PERCENT': '%',
        'STAR': '*',
        'AT': '@',
        'DOLLAR': '$',
        'PLUS': '+',
        'CENT': 'c',
        # 'DASH': '-',
        # 'DOT': '●'
    }
    rsh = 'zyxwvutsrqponmlkjihgfedcba♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠Фθиλπφя+&%$@?*0987654321'
    # Shorthand symbols are handed out from the END of this list via pop().
    REBUS_SHORT_HANDS = list(rsh)

    # Inline HTML formatting tags are rewritten to brace markup.
    content = content.replace("<b>", "{*")
    content = content.replace("</b>", "*}")
    content = content.replace("<i>", "{/")
    content = content.replace("</i>", "/}")
    content = content.replace("<em>", "{/")
    content = content.replace("</em>", "/}")
    content = content.replace("<u>", "{_")
    content = content.replace("</u>", "_}")
    content = content.replace("<strike>", "{-")
    content = content.replace("</strike>", "-}")
    content = content.replace("’", "'")
    content = content.replace('“', '"')
    # content = content.replace('–', '-')

    if "CPHContent_" in content:
        xwiprefix = '#CPHContent_'
    else:
        xwiprefix = '#'

    root = html.fromstring(content)
    ## debug("ROOT: %s" % root)

    special_type = ''
    rebus = {}
    rebus_order = []
    xd = xdfile.xdfile('', filename)

    # get crossword info
    title = root.cssselect('#PuzTitle')[0].text.strip()
    try:
        subtitle = root.cssselect(xwiprefix + 'SubTitle')[0].text.strip()
        subtitle = ' [%s]' % subtitle
    except (IndexError, AttributeError):
        # No subtitle element, or an element without text.
        subtitle = ""

    author = root.cssselect('.aegrid div')[1].text.strip()
    editor = root.cssselect('.aegrid div')[3].text.strip()
    copyright = root.cssselect(xwiprefix + 'Copyright')[0].text.strip()

    xd.set_header("Title", '%s%s' % (title, subtitle))
    xd.set_header("Author", author)
    xd.set_header("Editor", editor)
    xd.set_header("Copyright", copyright)

    # nyt title normally has date as e.g. January 1, 2020
    # Raw string: "\s" and "\d" are not valid escapes in a plain literal.
    date_re = r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}"
    try:
        m = re.search(date_re, subtitle if subtitle else title)
        date_string = m.group(0)
        date = datetime.strptime(date_string, "%B %d, %Y")
        xd.set_header("Date", date.strftime("%Y-%m-%d"))
    except (AttributeError, ValueError):
        # No date found (m is None) or not a real calendar date: the Date
        # header is optional, so carry on without it.
        pass

    _process_notes(xd, xwiprefix, root)  # add header for notes, if any

    puzzle_table = root.cssselect(xwiprefix + 'PuzTable tr') or root.cssselect(
        '#PuzTable tr')

    for row in puzzle_table:
        row_data = ""
        for cell in row.cssselect('td'):
            # check if the cell is special - with a shade or a circle
            cell_class = cell.get('class')
            cell_type = ''
            if cell_class == 'shade':
                cell_type = 'shaded'
            elif cell_class == 'bigcircle':
                cell_type = 'circle'

            letter = cell.cssselect('div.letter')
            letter = (len(letter) and letter[0].text) or xdfile.BLOCK_CHAR

            # handle rebuses
            if letter == xdfile.BLOCK_CHAR:
                subst = cell.cssselect('div.subst2')
                subst = (len(subst) and subst[0].text) or ''
                if not subst:
                    subst = cell.cssselect('div.subst')
                    if subst:
                        if title in SPLIT_REBUS_TITLES:
                            subst = "/".join(list(subst[0].text))
                        else:
                            subst = subst[0].text
                    else:
                        # check if color rebus
                        cell_string = etree.tostring(cell).decode('utf-8')
                        m = re.search("background-color:([A-Z]+);", cell_string)
                        if m:
                            subst = m.group(1)
                        else:
                            subst = ''

                if subst:
                    if subst not in rebus:
                        if subst in REBUS_LONG_HANDS:
                            rebus_val = REBUS_LONG_HANDS[subst]
                            # Reserve the symbol so pop() cannot reuse it.
                            if rebus_val in REBUS_SHORT_HANDS:
                                REBUS_SHORT_HANDS.remove(rebus_val)
                        else:
                            rebus_val = REBUS_SHORT_HANDS.pop()
                        rebus[subst] = rebus_val
                        rebus_order.append(subst)
                    letter = rebus[subst]

            if cell_type:
                # the special cell's letter should be represented in lower case
                letter = letter.lower()
                if not special_type:
                    # hopefully there shouldn't be both shades and circles in
                    # the same puzzle - if that is the case, only the first
                    # value seen is put in the header
                    special_type = cell_type
            row_data += letter
        xd.grid.append(row_data)

    if rebus:
        rebus_pairs = ["%s=%s" % (rebus[x], x.upper()) for x in rebus_order]
        xd.set_header("Rebus", ','.join(rebus_pairs))
    if special_type:
        xd.set_header("Special", special_type)

    across_div = root.cssselect('#ACluesPan') or root.cssselect(xwiprefix + 'ACluesPan')
    down_div = root.cssselect('#DCluesPan') or root.cssselect(xwiprefix + 'DCluesPan')

    if across_div and down_div:  # normal puzzle
        _process_clues(xd, 'A', across_div)  # add across clues
        _process_clues(xd, 'D', down_div)  # add down clues
    elif across_div:  # uniclue puzzle?
        _process_uniclues(xd, across_div)
    else:
        raise XWordInfoParseError("No clue divs found.")

    return xd
def main():
    """Append a summary row to gxd/similar for each not-yet-analyzed puzzle.

    Inputs whose xdid is already in ``gxd/similar`` are skipped.  For the
    rest (at most ``--limit`` of them) the corpus is searched for similar
    grids, each clue is looked up for other uses, and one row is appended
    with the combined similarity percentage, the three counters and the
    list of matches.
    """
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    # NOTE(review): --all is parsed but never consulted below; already
    # analyzed puzzles are always skipped.
    p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    limit = int(args.limit)
    num_processed = 0

    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        # Stop BEFORE the similarity search once the limit is reached; the
        # check used to come after it, so one search result per run was
        # computed and then thrown away.
        if num_processed >= limit:
            break
        num_processed += 1

        # find similar grids (pct, xd) for the mainxd in the corpus.
        # Takes about 1 second per xd.  sorted by pct.
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0], reverse=True)

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        # NOTE(review): nothing below increments these three counters, so the
        # summary row always reports 0 for them.
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, [])
                        if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            "gxd/similar",
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
def parse_xwordinfo(content, filename):
    """Parse an XWordInfo HTML page into an xdfile.

    Args:
        content: the page as UTF-8 encoded bytes.
        filename: name handed to the resulting xdfile.

    Returns:
        The xdfile with Title/Author/Editor headers, notes, grid, optional
        Rebus/Special headers, and clues (added by ``_fetch_clues``).
    """
    content = content.decode('utf-8')

    # Spelled-out rebus entries that have a natural one-character shorthand.
    REBUS_LONG_HANDS = {
        'NINE': '9', 'EIGHT': '8', 'SEVEN': '7', 'SIX': '6', 'FIVE': '5',
        'FOUR': '4', 'THREE': '3', 'TWO': '2', 'ONE': '1', 'ZERO': '0',
        'AUGHT': '0', 'AMPERSAND': '&', 'AND': '&', 'ASTERISK': '*',
        'PERCENT': '%', 'STAR': '*', 'AT': '@', 'DOLLAR': '$', 'PLUS': '+',
        'CENT': 'c',
        # 'DASH': '-',
        # 'DOT': '●'
    }
    rsh = 'zyxwvutsrqponmlkjihgfedcba♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠Фθиλπφя+&%$@?*0987654321'
    REBUS_SHORT_HANDS = list(rsh)  # still-unused shorthands; consumed from the end

    # Rewrite inline HTML emphasis as xd markup and straighten curly quotes.
    markup_rewrites = (
        ("<b>", "{*"), ("</b>", "*}"),
        ("<i>", "{/"), ("</i>", "/}"),
        ("<em>", "{/"), ("</em>", "/}"),
        ("<u>", "{_"), ("</u>", "_}"),
        ("<strike>", "{-"), ("</strike>", "-}"),
        ("’", "'"),
        ('“', '"'),
    )
    for old, new in markup_rewrites:
        content = content.replace(old, new)

    # Element ids carry a "CPHContent_" prefix on some page versions.
    xwiprefix = '#CPHContent_' if "CPHContent_" in content else '#'

    root = html.fromstring(content)

    special_type = ''
    rebus = {}        # [rebus text] -> shorthand character used in the grid
    rebus_order = []  # rebus texts in order of first appearance
    xd = xdfile.xdfile('', filename)

    # get crossword info
    title = root.cssselect(xwiprefix + 'TitleLabel')[0].text.strip()
    try:
        subtitle = ' [%s]' % root.cssselect(xwiprefix + 'SubTitleLabel')[0].text.strip()
    except (IndexError, AttributeError):
        # no subtitle element, or an element without text
        subtitle = ""

    try:
        xd.notes = stringify_children(root.cssselect(xwiprefix + 'NotepadDiv')[0])
    except Exception as e:
        xd.notes = ""
        debug('Exception %s' % e)

    xd.set_header("Title", '%s%s' % (title, subtitle))
    xd.set_header("Author", root.cssselect(xwiprefix + 'AuthorLabel')[0].text.strip())
    xd.set_header("Editor", root.cssselect(xwiprefix + 'EditorLabel')[0].text.strip())

    notes = xd.notes.replace("<br/>", "\n")
    notes = notes.replace("<b>Notepad:</b>", "\n")
    # Non-breaking spaces become line breaks.  Written as an escape so the
    # invisible character cannot be mistaken for (or replaced by) an ordinary
    # space, which would put every word of the notes on its own line.
    notes = notes.replace("\u00a0", "\n")
    xd.notes = notes.strip()

    puzzle_table = root.cssselect(xwiprefix + 'PuzTable tr') or root.cssselect('#PuzTable tr')

    for row in puzzle_table:
        row_data = ""
        for cell in row.cssselect('td'):
            # check if the cell is special - with a shade or a circle
            cell_class = cell.get('class')
            if cell_class == 'bigshade':
                cell_type = 'shaded'
            elif cell_class == 'bigcircle':
                cell_type = 'circle'
            else:
                cell_type = ''

            letter_divs = cell.cssselect('div.letter')
            letter = (letter_divs[0].text if letter_divs else None) or xdfile.BLOCK_CHAR

            # handle rebuses: a cell without a plain letter may carry a substitution
            if letter == xdfile.BLOCK_CHAR:
                subst2_divs = cell.cssselect('div.subst2')
                subst = (subst2_divs[0].text if subst2_divs else None) or ''
                if not subst:
                    subst_divs = cell.cssselect('div.subst')
                    if not subst_divs:
                        subst = ''
                    elif title in SPLIT_REBUS_TITLES:
                        subst = "/".join(list(subst_divs[0].text))
                    else:
                        subst = subst_divs[0].text

                if subst:
                    if subst not in rebus:
                        preferred = REBUS_LONG_HANDS.get(subst)
                        if preferred in REBUS_SHORT_HANDS:
                            REBUS_SHORT_HANDS.remove(preferred)
                            rebus_val = preferred
                        else:
                            # No natural shorthand, or it is already taken by
                            # another rebus (e.g. STAR and ASTERISK both want
                            # '*'): take a fresh one so the grid stays
                            # unambiguous.
                            rebus_val = REBUS_SHORT_HANDS.pop()
                        rebus[subst] = rebus_val
                        rebus_order.append(subst)
                    letter = rebus[subst]

            if cell_type:
                # the special cell's letter should be represented in lower case
                letter = letter.lower()
                if not special_type:
                    # A puzzle is not expected to mix shades and circles; if
                    # it does, the type of the first special cell is the one
                    # recorded in the header.
                    special_type = cell_type

            row_data += letter
        xd.grid.append(row_data)

    if len(rebus):
        # From here on `rebus` is a list of "shorthand=TEXT" strings; that
        # list (or the empty dict when there is no rebus) is what
        # _fetch_clues receives below.
        rebus = ["%s=%s" % (rebus[x], x.upper()) for x in rebus_order]
        xd.set_header("Rebus", ','.join(rebus))
    if special_type:
        xd.set_header("Special", special_type)

    # add clues
    _fetch_clues(xd, 'A', root, xwiprefix + 'AcrossClues', rebus)
    _fetch_clues(xd, 'D', root, xwiprefix + 'DownClues', rebus)

    return xd
def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*.

    ``str.lstrip(prefix)`` is not a substitute: it strips any leading
    characters drawn from the *set* of characters in ``prefix``
    (``"By Byron".lstrip("By ")`` gives ``"ron"``).
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def parse_puz(contents):
    """Parse the contents of a .puz file into an xdfile.

    Headers are taken from the puzzle metadata (renamed through
    ``hdr_renames`` and ordered by ``hdr_order``), with extra splitting of
    combined "author / editor" fields and of Washington Post style author
    lines.  The grid is rebuilt row by row, and answers are derived by
    numbering the grid so that each clue can be paired with its answer.

    Raises:
        KeyError: if a clue number from the .puz has no matching grid entry.
    """
    puz_object = puz.load(contents)
    puzzle = crossword.from_puz(puz_object)

    # Maps each solution value to the single grid character that represents it.
    grid_dict = dict(zip(string.ascii_uppercase, string.ascii_uppercase))

    xd = xdfile.xdfile()

    md = {hdr_renames.get(k.lower(), k): v for k, v in puzzle.meta() if v}

    if " / " in md.get("author", ""):
        author, editor = md["author"].split(" / ", 1)
        md["author"] = _strip_prefix(author.strip(), "By ")
        md["editor"] = _strip_prefix(editor.strip(), "Edited by ")

    if "Washington Post" in md.get("copyright", ""):
        # Author line looks like "<date> - <title> By <author>, Edited by <editor>".
        a = md.get("author", "")
        if " - " in a:
            datestr, rest = a.split(" - ", 1)
            md["date"] = reparse_date(datestr)
            if " By " in rest:
                md["title"], rest = rest.split(" By ", 1)
            else:
                md["title"], rest = rest.split(" by ", 1)

            # Test for the same separator that is used to split, so a line
            # without the comma falls through instead of failing to unpack.
            if ", Edited by " in rest:
                md["author"], md["editor"] = rest.split(", Edited by ", 1)
            elif ", edited by " in rest:
                md["author"], md["editor"] = rest.split(", edited by ", 1)
            else:
                md["author"] = rest

        md["copyright"] = _strip_prefix(md["copyright"], "Copyright")

    for k, v in sorted(md.items(), key=lambda x: hdr_order.index(x[0])):
        if v:
            k = k[0].upper() + k[1:].lower()
            v = decode(v.strip())
            v = v.replace(u"© ", "")
            xd.headers.append((k, v))

    answers = {}   # e.g. "A1" / "D1" -> answer string read off the grid
    clue_num = 1

    for r, row in enumerate(puzzle):
        rowstr = ""
        for c, cell in enumerate(row):
            if puzzle.block is None and cell.solution == '.':
                rowstr += xdfile.BLOCK_CHAR
            elif puzzle.block == cell.solution:
                rowstr += xdfile.BLOCK_CHAR
            elif cell == puzzle.empty:
                rowstr += "."
            else:
                if cell.solution not in grid_dict:
                    # unknown value: treat as a rebus and give it a shorthand
                    grid_dict[cell.solution] = rebus_shorthands.pop()

                rowstr += grid_dict[cell.solution]

                # compute number shown in box
                new_clue = False
                if is_block(puzzle, c - 1, r):  # across clue start
                    j = 0
                    answer = ""
                    while not is_block(puzzle, c + j, r):
                        answer += puzzle[c + j, r].solution
                        j += 1

                    if len(answer) > 1:
                        new_clue = True
                        answers["A" + str(clue_num)] = answer

                if is_block(puzzle, c, r - 1):  # down clue start
                    j = 0
                    answer = ""
                    while not is_block(puzzle, c, r + j):
                        answer += puzzle[c, r + j].solution
                        j += 1

                    if len(answer) > 1:
                        new_clue = True
                        answers["D" + str(clue_num)] = answer

                if new_clue:
                    clue_num += 1
        xd.grid.append(rowstr)

    for number, clue in puzzle.clues.across():
        xd.clues.append((("A", number), decode(clue), answers["A" + str(number)]))

    for number, clue in puzzle.clues.down():
        xd.clues.append((("D", number), decode(clue), answers["D" + str(number)]))

    return xd
def parse_ccxml(data, filename):
    """Parse Crossword Compiler XML into an xdfile.

    Args:
        data: the XML document as bytes (decoded as UTF-8, bad bytes replaced).
        filename: name handed to the resulting xdfile.

    Returns:
        The populated xdfile, or None when the document has no
        ``crossword/grid`` element.

    Raises:
        Exception: whatever ``etree.fromstring`` raised, after logging it,
            when the (pre-processed) document is not parseable.
    """
    content = data.decode('utf-8', errors='replace')
    content = escape(content, xml_escape_table)
    content = consecutive(content)
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'=""\2""', content)  # Replace double quotes
    content_xml = content.encode('utf-8')

    ns = {'puzzle': 'http://crossword.info/xml/rectangular-puzzle'}
    try:
        root = etree.fromstring(content_xml)
    except Exception as e:
        error('Exception %s' % e)
        error(content)
        # The bare name `exit` that used to stand here was never called, so
        # execution fell through to an unbound `root`.  Propagate the real
        # parse error instead.
        raise

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)
    if not grid:
        return None

    grid = grid[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile('', filename)

    # add metadata
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        # drop the "{namespace}" part of the tag name
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())
        title = escape(title, rev_xml_escape_table)
        if text:
            text = escape(text, rev_xml_escape_table)
            xd.set_header(HEADER_RENAMES.get(title, title), text)

    # add puzzle: start from an all-blank grid and fill in each listed cell
    puzzle = [[" "] * cols for _ in range(rows)]

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1  # attributes are 1-based
        y = int(cell.attrib['y']) - 1
        value = cell.attrib.get('solution')
        if cell.attrib.get('type') == 'block':
            value = xdfile.BLOCK_CHAR
        if value is not None:
            # A cell with neither a solution nor a block type keeps its
            # blank, rather than inheriting the previous cell's value.
            puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        title_el = clues.xpath('./puzzle:title', namespaces=ns)[0]
        # Direction is the first ASCII letter of the clue list's title text,
        # uppercased (tostring returns bytes here, hence chr()).
        title_bytes = etree.tostring(title_el, method='text').upper()
        direction = "".join(chr(x) for x in title_bytes
                            if chr(x) in string.ascii_uppercase)[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            text = escape(text, rev_xml_escape_table)
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((direction, number), text, solution))

    return xd
#!/usr/bin/env python
# Builds the HTML for the corpus index page: one list item per publication
# (taken from the metadata files named on the command line), ordered by
# number of crosswords, largest first.

import time
import sys
import os.path

import mkwww
import xdfile

outlines = []  # (num_xd, html list item) per publication
total_xd = 0
for metafn in sys.argv[1:]:
    # `open` in a with-block instead of the Python-2-only `file()` builtin,
    # so the handle is closed promptly and the script also runs on Python 3.
    with open(metafn) as metafile:
        pubxd = xdfile.xdfile(metafile.read(), metafn)

    num_xd = int(pubxd.get_header("num_xd"))
    total_xd += num_xd
    years = pubxd.get_header("years")
    pubid = metafn.split("/")[-2]  # name of the directory holding the metadata file
    outlines.append((num_xd, '<li><a href="{pubid}"><b>{pubid}</b></a>: {num_xd} crosswords from {years}</li>'.format(
        pubid=pubid, num_xd=num_xd, years=years)))

out = mkwww.html_header.format(title=time.strftime("xd corpus grid similarity results [%Y-%m-%d]"))
out += "The xd corpus has %d crosswords total:" % total_xd
out += "<ul>"
out += "\n".join(L for n, L in sorted(outlines, reverse=True))
out += "</ul>"
out += '<a href="xd-xdiffs.zip">xd-xdiffs.zip</a> (7MB) has raw data for all puzzles that are at least 25% similar.  Source code for using <a href="https://github.com/century-arcade/xd">the .xd format is available on Github.</a><br/>'
out += mkwww.html_footer
def parse_puz(contents, filename):
    """Parse the bytes of a .puz file into an xdfile.

    Args:
        contents: raw bytes of the .puz file.
        filename: name handed to the resulting xdfile and used in warnings.

    Returns:
        The populated xdfile (headers, optional Rebus header, grid, clues).

    Raises:
        xdfile.PuzzleParseError: if the data is not a valid .puz file.
        xdfile.IncompletePuzzleParse: if a clue number has no answer in the grid.
        AssertionError: if the rebuilt grid size differs from the puzzle's.
    """
    # Characters available to stand in for rebus cells; consumed from the end.
    rebus_shorthands = list(
        "⚷⚳♇♆⛢♄♃♂♁♀☿♹♸♷♶♵♴♳⅘⅗⅖⅕♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠+&%$@?*zyxwvutsrqponmlkjihgfedcba0987654321"
    )

    try:
        puzobj = puz.load(contents)
        puzzle = crossword.from_puz(puzobj)
    except puz.PuzzleFormatError as e:
        emsg = e.message
        # Sniff on the raw bytes: the data is frequently not valid UTF-8, and
        # decoding it here would replace this error with a UnicodeDecodeError.
        if b"<html>" in contents.lower():
            emsg += " (looks like html)"
        raise xdfile.PuzzleParseError(emsg) from e

    # Maps each solution value to the single grid character that represents it.
    grid_dict = dict(zip(string.ascii_uppercase, string.ascii_uppercase))

    xd = xdfile.xdfile('', filename)

    xd.set_header("Author", puzobj.author)
    xd.set_header("Copyright", puzobj.copyright)
    xd.set_header("Notes", puzobj.notes)
    # drop control characters from the postscript
    xd.set_header("Postscript", "".join(x for x in puzobj.postscript if ord(x) >= ord(' ')))
    xd.set_header("Preamble", puzobj.preamble)
    xd.set_header("Title", puzobj.title)

    used_rebuses = {}  # [puz_rebus_gridvalue_as_string] -> our_rebus_gridvalue
    rebus = {}  # [our_rebus_gridvalue] -> full_cell

    # Rebus information does not change while the grid is read, so look it up
    # once instead of once per cell.
    reb = puzobj.rebus()
    has_rebus = reb.has_rebus()
    rebus_squares = set(reb.get_rebus_squares()) if has_rebus else set()

    if has_rebus:
        grbs = puzobj.extensions[b"GRBS"]
        if sum(x for x in grbs if x != 0) > 0:  # check for an actual rebus
            for pair in puzobj.extensions[b"RTBL"].decode("cp1252").split(";"):
                pair = pair.strip()
                if not pair:
                    continue
                # maxsplit=1 so a ':' inside the rebus text is kept
                key, value = pair.split(":", 1)
                rebuskey = rebus_shorthands.pop()
                used_rebuses[key] = rebuskey
                rebus[rebuskey] = decode(value)

            rebustr = xdfile.REBUS_SEP.join(
                "%s=%s" % (k, v) for k, v in sorted(rebus.items()))
            xd.set_header("Rebus", rebustr)

    for r, row in enumerate(puzzle):
        rowstr = ""
        for c, cell in enumerate(row):
            if puzzle.block is None and cell.solution == '.':
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == puzzle.block:
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == ':':
                rowstr += xdfile.OPEN_CHAR
            elif cell == puzzle.empty:
                rowstr += xdfile.UNKNOWN_CHAR
            else:
                n = r * puzobj.width + c  # index of this cell in the rebus table
                if has_rebus and n in rebus_squares:
                    ch = str(reb.table[n] - 1)
                    rowstr += used_rebuses[ch]
                    # expand the cell to its full rebus text
                    cell.solution = rebus[used_rebuses[ch]]
                else:
                    ch = cell.solution
                    if ch not in grid_dict:
                        if ch in rebus_shorthands:
                            cellch = ch
                            rebus_shorthands.remove(ch)
                            warn("%s: unknown grid character '%s', assuming rebus of itself"
                                 % (filename, ch))
                        else:
                            cellch = rebus_shorthands.pop()
                            warn("%s: unknown grid character '%s', assuming rebus (as '%s')"
                                 % (filename, ch, cellch))

                        # `or ""` guards against a missing Rebus header
                        xd.set_header(
                            "Rebus",
                            (xd.get_header("Rebus") or "") + " %s=%s" % (cellch, ch))

                        grid_dict[ch] = cellch
                    rowstr += grid_dict[ch]

        xd.grid.append(rowstr)

    assert xd.size() == (puzzle.width, puzzle.height), "non-matching grid sizes"

    # clues
    answers = {}  # e.g. "A1" / "D1" -> answer read off the grid

    for posdir, posnum, answer in xd.iteranswers():
        answers[posdir[0] + str(posnum)] = answer

    try:
        for number, clue in puzzle.clues.across():
            cluenum = "A" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue number doesn't match grid: " + cluenum)
            xd.clues.append((("A", number), decode(clue), answers.get(cluenum, "")))

        # xd.append_clue_break()

        for number, clue in puzzle.clues.down():
            cluenum = "D" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue doesn't match grid: " + cluenum)
            xd.clues.append((("D", number), decode(clue), answers.get(cluenum, "")))
    except KeyError as e:
        raise xdfile.IncompletePuzzleParse(xd, "Clue doesn't match grid: " + str(e))

    return xd
def parse_xwordinfo(content, filename):
    """Parse an XWordInfo HTML page into an xdfile.

    Args:
        content: the page as UTF-8 encoded bytes.
        filename: name handed to the resulting xdfile.

    Returns:
        The xdfile with Title/Author/Editor headers, notes, grid, optional
        Rebus/Special headers, and clues (added by ``_fetch_clues``).
    """
    content = content.decode('utf-8')

    # Spelled-out rebus entries that have a natural one-character shorthand.
    REBUS_LONG_HANDS = {
        'NINE': '9', 'EIGHT': '8', 'SEVEN': '7', 'SIX': '6', 'FIVE': '5',
        'FOUR': '4', 'THREE': '3', 'TWO': '2', 'ONE': '1', 'ZERO': '0',
        'AUGHT': '0', 'AMPERSAND': '&', 'AND': '&', 'ASTERISK': '*',
        'PERCENT': '%', 'STAR': '*', 'AT': '@', 'DOLLAR': '$', 'PLUS': '+',
        'CENT': 'c',
        # 'DASH': '-',
        # 'DOT': '●'
    }
    rsh = 'zyxwvutsrqponmlkjihgfedcba♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠Фθиλπφя+&%$@?*0987654321'
    REBUS_SHORT_HANDS = list(rsh)  # still-unused shorthands; consumed from the end

    # Rewrite inline HTML emphasis as xd markup and straighten curly quotes.
    markup_rewrites = (
        ("<b>", "{*"), ("</b>", "*}"),
        ("<i>", "{/"), ("</i>", "/}"),
        ("<em>", "{/"), ("</em>", "/}"),
        ("<u>", "{_"), ("</u>", "_}"),
        ("<strike>", "{-"), ("</strike>", "-}"),
        ("’", "'"),
        ('“', '"'),
    )
    for old, new in markup_rewrites:
        content = content.replace(old, new)

    # Element ids carry a "CPHContent_" prefix on some page versions.
    xwiprefix = '#CPHContent_' if "CPHContent_" in content else '#'

    root = html.fromstring(content)

    special_type = ''
    rebus = {}        # [rebus text] -> shorthand character used in the grid
    rebus_order = []  # rebus texts in order of first appearance
    xd = xdfile.xdfile('', filename)

    # get crossword info
    title = root.cssselect(xwiprefix + 'TitleLabel')[0].text.strip()
    try:
        subtitle = ' [%s]' % root.cssselect(xwiprefix + 'SubTitleLabel')[0].text.strip()
    except (IndexError, AttributeError):
        # no subtitle element, or an element without text
        subtitle = ""

    try:
        xd.notes = stringify_children(root.cssselect(xwiprefix + 'NotepadDiv')[0])
    except Exception as e:
        xd.notes = ""
        debug('Exception %s' % e)

    xd.set_header("Title", '%s%s' % (title, subtitle))
    xd.set_header("Author", root.cssselect(xwiprefix + 'AuthorLabel')[0].text.strip())
    xd.set_header("Editor", root.cssselect(xwiprefix + 'EditorLabel')[0].text.strip())

    notes = xd.notes.replace("<br/>", "\n")
    notes = notes.replace("<b>Notepad:</b>", "\n")
    # Non-breaking spaces become line breaks.  Written as an escape so the
    # invisible character cannot be mistaken for (or replaced by) an ordinary
    # space, which would put every word of the notes on its own line.
    notes = notes.replace("\u00a0", "\n")
    xd.notes = notes.strip()

    puzzle_table = root.cssselect(xwiprefix + 'PuzTable tr') or root.cssselect('#PuzTable tr')

    for row in puzzle_table:
        row_data = ""
        for cell in row.cssselect('td'):
            # check if the cell is special - with a shade or a circle
            cell_class = cell.get('class')
            if cell_class == 'bigshade':
                cell_type = 'shaded'
            elif cell_class == 'bigcircle':
                cell_type = 'circle'
            else:
                cell_type = ''

            letter_divs = cell.cssselect('div.letter')
            letter = (letter_divs[0].text if letter_divs else None) or xdfile.BLOCK_CHAR

            # handle rebuses: a cell without a plain letter may carry a substitution
            if letter == xdfile.BLOCK_CHAR:
                subst2_divs = cell.cssselect('div.subst2')
                subst = (subst2_divs[0].text if subst2_divs else None) or ''
                if not subst:
                    subst_divs = cell.cssselect('div.subst')
                    if not subst_divs:
                        subst = ''
                    elif title in SPLIT_REBUS_TITLES:
                        subst = "/".join(list(subst_divs[0].text))
                    else:
                        subst = subst_divs[0].text

                if subst:
                    if subst not in rebus:
                        preferred = REBUS_LONG_HANDS.get(subst)
                        if preferred in REBUS_SHORT_HANDS:
                            REBUS_SHORT_HANDS.remove(preferred)
                            rebus_val = preferred
                        else:
                            # No natural shorthand, or it is already taken by
                            # another rebus (e.g. STAR and ASTERISK both want
                            # '*'): take a fresh one so the grid stays
                            # unambiguous.
                            rebus_val = REBUS_SHORT_HANDS.pop()
                        rebus[subst] = rebus_val
                        rebus_order.append(subst)
                    letter = rebus[subst]

            if cell_type:
                # the special cell's letter should be represented in lower case
                letter = letter.lower()
                if not special_type:
                    # A puzzle is not expected to mix shades and circles; if
                    # it does, the type of the first special cell is the one
                    # recorded in the header.
                    special_type = cell_type

            row_data += letter
        xd.grid.append(row_data)

    if len(rebus):
        # From here on `rebus` is a list of "shorthand=TEXT" strings; that
        # list (or the empty dict when there is no rebus) is what
        # _fetch_clues receives below.
        rebus = ["%s=%s" % (rebus[x], x.upper()) for x in rebus_order]
        xd.set_header("Rebus", ','.join(rebus))
    if special_type:
        xd.set_header("Special", special_type)

    # add clues
    _fetch_clues(xd, 'A', root, xwiprefix + 'AcrossClues', rebus)
    _fetch_clues(xd, 'D', root, xwiprefix + 'DownClues', rebus)

    return xd
def reload(self):
    """Re-read ``self.source`` as an xdfile and expose its clues as the rows."""
    import xdfile

    source = self.source
    parsed = xdfile.xdfile(xd_contents=source.read_text(), filename=source)
    self.xd = parsed
    self.rows = parsed.clues
def parse_puz(contents, filename):
    """Parse the bytes of a .puz file into an xdfile.

    Args:
        contents: raw bytes of the .puz file.
        filename: name handed to the resulting xdfile and used in warnings.

    Returns:
        The populated xdfile (headers, optional Rebus header, grid, clues).

    Raises:
        xdfile.PuzzleParseError: if the data is not a valid .puz file.
        xdfile.IncompletePuzzleParse: if a clue number has no answer in the grid.
        AssertionError: if the rebuilt grid size differs from the puzzle's.
    """
    # Characters available to stand in for rebus cells; consumed from the end.
    rebus_shorthands = list(
        "⚷⚳♇♆⛢♄♃♂♁♀☿♹♸♷♶♵♴♳⅘⅗⅖⅕♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠+&%$@?*zyxwvutsrqponmlkjihgfedcba0987654321"
    )

    try:
        puzobj = puz.load(contents)
        puzzle = crossword.from_puz(puzobj)
    except puz.PuzzleFormatError as e:
        emsg = e.message
        # Sniff on the raw bytes: the data is frequently not valid UTF-8, and
        # decoding it here would replace this error with a UnicodeDecodeError.
        if b"<html>" in contents.lower():
            emsg += " (looks like html)"
        raise xdfile.PuzzleParseError(emsg) from e

    # Maps each solution value to the single grid character that represents it.
    grid_dict = dict(zip(string.ascii_uppercase, string.ascii_uppercase))

    xd = xdfile.xdfile('', filename)

    xd.set_header("Author", puzobj.author)
    xd.set_header("Copyright", puzobj.copyright)
    xd.set_header("Notes", puzobj.notes)
    # drop control characters from the postscript
    xd.set_header("Postscript", "".join(x for x in puzobj.postscript if ord(x) >= ord(' ')))
    xd.set_header("Preamble", puzobj.preamble)
    xd.set_header("Title", puzobj.title)

    used_rebuses = {}  # [puz_rebus_gridvalue_as_string] -> our_rebus_gridvalue
    rebus = {}  # [our_rebus_gridvalue] -> full_cell

    # Rebus information does not change while the grid is read, so look it up
    # once instead of once per cell.
    reb = puzobj.rebus()
    has_rebus = reb.has_rebus()
    rebus_squares = set(reb.get_rebus_squares()) if has_rebus else set()

    if has_rebus:
        grbs = puzobj.extensions[b"GRBS"]
        if sum(x for x in grbs if x != 0) > 0:  # check for an actual rebus
            for pair in puzobj.extensions[b"RTBL"].decode("cp1252").split(";"):
                pair = pair.strip()
                if not pair:
                    continue
                # maxsplit=1 so a ':' inside the rebus text is kept
                key, value = pair.split(":", 1)
                rebuskey = rebus_shorthands.pop()
                used_rebuses[key] = rebuskey
                rebus[rebuskey] = decode(value)

            rebustr = xdfile.REBUS_SEP.join(
                "%s=%s" % (k, v) for k, v in sorted(rebus.items()))
            xd.set_header("Rebus", rebustr)

    for r, row in enumerate(puzzle):
        rowstr = ""
        for c, cell in enumerate(row):
            if puzzle.block is None and cell.solution == '.':
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == puzzle.block:
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == ':':
                rowstr += xdfile.OPEN_CHAR
            elif cell == puzzle.empty:
                rowstr += xdfile.UNKNOWN_CHAR
            else:
                n = r * puzobj.width + c  # index of this cell in the rebus table
                if has_rebus and n in rebus_squares:
                    ch = str(reb.table[n] - 1)
                    rowstr += used_rebuses[ch]
                    # expand the cell to its full rebus text
                    cell.solution = rebus[used_rebuses[ch]]
                else:
                    ch = cell.solution
                    if ch not in grid_dict:
                        if ch in rebus_shorthands:
                            cellch = ch
                            rebus_shorthands.remove(ch)
                            warn("%s: unknown grid character '%s', assuming rebus of itself"
                                 % (filename, ch))
                        else:
                            cellch = rebus_shorthands.pop()
                            warn("%s: unknown grid character '%s', assuming rebus (as '%s')"
                                 % (filename, ch, cellch))

                        # `or ""` guards against a missing Rebus header
                        xd.set_header(
                            "Rebus",
                            (xd.get_header("Rebus") or "") + " %s=%s" % (cellch, ch))

                        grid_dict[ch] = cellch
                    rowstr += grid_dict[ch]

        xd.grid.append(rowstr)

    assert xd.size() == (puzzle.width, puzzle.height), "non-matching grid sizes"

    # clues
    answers = {}  # e.g. "A1" / "D1" -> answer read off the grid

    for posdir, posnum, answer in xd.iteranswers():
        answers[posdir[0] + str(posnum)] = answer

    try:
        for number, clue in puzzle.clues.across():
            cluenum = "A" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue number doesn't match grid: " + cluenum)
            xd.clues.append((("A", number), decode(clue), answers.get(cluenum, "")))

        # xd.append_clue_break()

        for number, clue in puzzle.clues.down():
            cluenum = "D" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue doesn't match grid: " + cluenum)
            xd.clues.append((("D", number), decode(clue), answers.get(cluenum, "")))
    except KeyError as e:
        raise xdfile.IncompletePuzzleParse(xd, "Clue doesn't match grid: " + str(e))

    return xd
def xd_from_grid(grid):
    """Wrap *grid* in a minimal xd document whose Creator header is the requesting client's IP."""
    client_ip = cherrypy.request.remote.ip
    xd_text = "Creator: %s\n\n\n%s" % (client_ip, grid)
    return xdfile.xdfile(xd_text)
def main():
    """Analyze input .xd puzzles and append one summary row each to gxd/similar.

    For every ``.xd`` file found under the input paths, the puzzle is parsed,
    similar grids are looked up in the corpus, prior uses of each clue/answer
    are scanned, and a summary row is appended to ``gxd/similar``.

    Puzzles already present in ``gxd/similar`` are skipped unless ``--all``
    is given.  At most ``--limit`` puzzles are analyzed per run.
    """
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument('-a', '--all', default=False,
                   help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l', '--limit', default=100,
                   help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()  # not used below; kept for whatever side effects opening has

    limit = int(args.limit)  # parse once instead of on every iteration
    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        # Skip reprocessing .xd that are already in similar.tsv, unless the
        # user explicitly asked for everything with --all.
        if not args.all and mainxd.xdid() in prev_similar:
            continue

        # Enforce the limit *before* the expensive similarity search so the
        # puzzle that trips the limit does not cost a wasted corpus scan.
        num_processed += 1
        if num_processed > limit:
            break

        # Find similar grids (pct, xd1, xd2) for mainxd in the corpus, sorted
        # by pct descending.  Takes about 1 second per xd.
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0], reverse=True)

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                         for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # Go over each clue/answer, find all other uses, other answers, other
        # possibilities.
        # NOTE(review): none of the three counters below is ever incremented
        # in this function, so the summary columns built from them are always
        # 0; the per-clue results (poss_answers, pub_uses, uses, stale_answer)
        # are computed but not consumed.  Confirm the intended accounting.
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    pub_uses.setdefault(clueans.pubid, set()).add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # earlier uses of this boiled clue with the same answer
                    clue_usages = [
                        ca for ca in load_clues().get(bc, [])
                        if ca.answer == mainanswer and ca.date < maindate
                    ]
                    if not clue_usages:
                        continue

                    stale_answer = True
                    # sorted()[-1] (not max) so ties resolve exactly as before
                    latest = sorted(clue_usages, key=lambda u: u.date or "z")[-1]
                    # When the boiled clue was used more than once, keep only
                    # the clue text (no xdid) of the most recent use.
                    uses.append((latest.clue if nuses > 1 else latest, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            'gxd/similar',
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct))
                         for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
return out if __name__ == "__main__": OUTPUT_DIR = sys.argv[1] pubid = OUTPUT_DIR.split("/")[-1] if len(sys.argv) > 2: similar_txts = sys.argv[2:] else: similar_txts = [ "crosswords/%s/similar.txt" % pubid ] os.makedirs(OUTPUT_DIR) pubxd = xdfile.xdfile(file("crosswords/%s/meta.txt" % pubid).read()) # just to parse some cached metadata left_index_list = { } # [(olderfn, newerfn)] -> (pct, index_line) right_index_list = { } # [(olderfn, newerfn)] -> (pct, index_line) for inputfn in similar_txts: for line in file(inputfn).read().splitlines(): if not line: continue parts = line.strip().split(' ', 2) if len(parts) == 2: fn1, fn2 = parts elif len(parts) == 3: fn1, fn2, rest = parts else: print "ERROR in %s: %s" % (inputfn, line) continue
def parse_ipuz(contents, filename):
    """Parse the bytes of an .ipuz file into an xdfile.

    Args:
        contents: UTF-8 encoded bytes of the ipuz document.
        filename: name handed to the resulting xdfile; also searched for a
            date (via ``parse_date_from_filename``) and used in warnings.

    Returns:
        The populated xdfile (headers, grid, clues).

    Raises:
        xdfile.IncompletePuzzleParse: if a clue number has no answer in the grid.
        AssertionError: if the rebuilt grid size differs from the puzzle's.
    """
    # Characters available to stand in for rebus cells; consumed from the end.
    rebus_shorthands = list(
        "⚷⚳♇♆⛢♄♃♂♁♀☿♹♸♷♶♵♴♳⅘⅗⅖⅕♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠+&%$@?*zyxwvutsrqponmlkjihgfedcba0987654321"
    )

    ipuz_dict = ipuz.read(contents.decode("utf-8"))
    puzzle = crossword.from_ipuz(ipuz_dict)

    # Maps each solution value to the single grid character that represents it.
    grid_dict = dict(zip(string.ascii_uppercase, string.ascii_uppercase))

    xd = xdfile.xdfile('', filename)

    xd.set_header("Author", puzzle.meta.creator)
    xd.set_header("Editor", puzzle.meta.contributor)
    xd.set_header("Copyright", puzzle.meta.rights)
    dt = parse_date_from_filename(parse_pathname(filename).base)
    if dt:
        xd.set_header("Date", dt)
    xd.set_header("Notes", puzzle.meta.description)
    xd.set_header("Title", puzzle.meta.title)

    for r, row in enumerate(puzzle):
        rowstr = ""
        for c, cell in enumerate(row):
            if puzzle.block is None and cell.solution == '#':
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == puzzle.block:
                rowstr += xdfile.BLOCK_CHAR
            elif cell.solution == ':':
                rowstr += xdfile.OPEN_CHAR
            elif cell == puzzle.empty:
                rowstr += xdfile.UNKNOWN_CHAR
            else:
                ch = cell.solution
                if ch not in grid_dict:
                    if ch in rebus_shorthands:
                        cellch = ch
                        rebus_shorthands.remove(ch)
                        warn("%s: unknown grid character '%s', assuming rebus of itself"
                             % (filename, ch))
                    else:
                        cellch = rebus_shorthands.pop()
                        warn("%s: unknown grid character '%s', assuming rebus (as '%s')"
                             % (filename, ch, cellch))

                    # `or ""` guards against a missing Rebus header
                    xd.set_header(
                        "Rebus",
                        (xd.get_header("Rebus") or "") + " %s=%s" % (cellch, ch))

                    grid_dict[ch] = cellch
                rowstr += grid_dict[ch]

        xd.grid.append(rowstr)

    assert xd.size() == (puzzle.width, puzzle.height), "non-matching grid sizes"

    # clues
    answers = {}  # e.g. "A1" / "D1" -> answer read off the grid

    for posdir, posnum, answer in xd.iteranswers():
        answers[posdir[0] + str(posnum)] = answer

    try:
        for number, clue in puzzle.clues.across():
            cluenum = "A" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue number doesn't match grid: " + cluenum)
            xd.clues.append((("A", number), decode(clue), answers.get(cluenum, "")))

        # xd.append_clue_break()

        for number, clue in puzzle.clues.down():
            cluenum = "D" + str(number)
            if cluenum not in answers:
                raise xdfile.IncompletePuzzleParse(
                    xd, "Clue doesn't match grid: " + cluenum)
            xd.clues.append((("D", number), decode(clue), answers.get(cluenum, "")))
    except KeyError as e:
        raise xdfile.IncompletePuzzleParse(xd, "Clue doesn't match grid: " + str(e))

    return xd
def parse_ccxml(data, filename):
    """Parse Crossword Compiler XML into an xdfile.

    Args:
        data: the XML document as bytes (decoded as UTF-8, bad bytes replaced).
        filename: name handed to the resulting xdfile.

    Returns:
        The populated xdfile, or None when the document has no
        ``crossword/grid`` element.

    Raises:
        Exception: whatever ``etree.fromstring`` raised, after logging it,
            when the (pre-processed) document is not parseable.
    """
    content = data.decode('utf-8', errors='replace')
    content = escape(content, xml_escape_table)
    content = consecutive(content)
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'=""\2""', content)  # Replace double quotes
    content_xml = content.encode('utf-8')

    ns = {'puzzle': 'http://crossword.info/xml/rectangular-puzzle'}
    try:
        root = etree.fromstring(content_xml)
    except Exception as e:
        error('Exception %s' % e)
        error(content)
        # The bare name `exit` that used to stand here was never called, so
        # execution fell through to an unbound `root`.  Propagate the real
        # parse error instead.
        raise

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)
    if not grid:
        return None

    grid = grid[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile('', filename)

    # add metadata
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        # drop the "{namespace}" part of the tag name
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())
        title = escape(title, rev_xml_escape_table)
        if text:
            text = escape(text, rev_xml_escape_table)
            xd.set_header(HEADER_RENAMES.get(title, title), text)

    # add puzzle: start from an all-blank grid and fill in each listed cell
    puzzle = [[" "] * cols for _ in range(rows)]

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1  # attributes are 1-based
        y = int(cell.attrib['y']) - 1
        value = cell.attrib.get('solution')
        if cell.attrib.get('type') == 'block':
            value = xdfile.BLOCK_CHAR
        if value is not None:
            # A cell with neither a solution nor a block type keeps its
            # blank, rather than inheriting the previous cell's value.
            puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        title_el = clues.xpath('./puzzle:title', namespaces=ns)[0]
        # Direction is the first ASCII letter of the clue list's title text,
        # uppercased (tostring returns bytes here, hence chr()).
        title_bytes = etree.tostring(title_el, method='text').upper()
        direction = "".join(chr(x) for x in title_bytes
                            if chr(x) in string.ascii_uppercase)[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            text = escape(text, rev_xml_escape_table)
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((direction, number), text, solution))

    return xd
def main(fn):
    """Print the HTML rendering of every .xd file named on the command line.

    Args:
        fn: accepted for backward compatibility only; it is not used.  The
            files processed are always those listed in ``sys.argv[1:]``.
    """
    for path in sys.argv[1:]:
        # read inside a with-block so each file is closed before the next one
        with open(path) as xd_file:
            contents = xd_file.read()
        print(to_html(xdfile.xdfile(contents, path)))
def parse_ccxml(content):
    """Parse Crossword Compiler XML into an xdfile.

    Inline HTML emphasis tags are first rewritten as xd markup, then the
    document is parsed; metadata elements become headers, grid cells fill
    the grid, and each clue is paired with its solution via ``get_solution``.

    Args:
        content: the XML document text.

    Returns:
        The populated xdfile.
    """
    markup_rewrites = (
        ("<b>", "{*"), ("</b>", "*}"),
        ("<i>", "{/"), ("</i>", "/}"),
        ("<em>", "{/"), ("</em>", "/}"),
        ("<u>", "{_"), ("</u>", "_}"),
        ("<strike>", "{-"), ("</strike>", "-}"),
    )
    for old, new in markup_rewrites:
        content = content.replace(old, new)

    ns = {'puzzle': 'http://crossword.info/xml/rectangular-puzzle'}
    root = etree.fromstring(content)

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile()

    # add metadata
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        # drop the "{namespace}" part of the tag name
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())
        if text:
            xd.headers.append((title, text))

    # add puzzle: start from an all-blank grid and fill in each listed cell
    puzzle = [[" "] * cols for _ in range(rows)]

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1  # attributes are 1-based
        y = int(cell.attrib['y']) - 1
        value = cell.attrib.get('solution')
        if cell.attrib.get('type') == 'block':
            value = xdfile.BLOCK_CHAR
        if value is not None:
            # A cell with neither a solution nor a block type keeps its
            # blank, rather than inheriting the previous cell's value.
            puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        title_el = clues.xpath('./puzzle:title', namespaces=ns)[0]
        title_text = etree.tostring(title_el, method='text')
        if not isinstance(title_text, str):
            # Python 3: tostring returned bytes; iterate characters, not ints
            title_text = title_text.decode('utf-8', 'replace')
        # Direction is the first ASCII letter of the title text, uppercased.
        direction = "".join(x for x in title_text.upper()
                            if x in string.ascii_uppercase)[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((direction, number), text, solution))

    return xd
def parse_puz(contents, filename):
    """Parse the contents of a .puz file into an xdfile.

    Args:
        contents: raw contents of the .puz file.
        filename: name of the file; anything not ending in ``.puz``
            (case-insensitive) is ignored.

    Returns:
        The populated xdfile, or None when *filename* is not a ``.puz`` file.

    Raises:
        KeyError: if a clue number from the .puz has no matching grid entry.
    """
    # Characters available to stand in for rebus cells; consumed from the end.
    rebus_shorthands = list(u"♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠Фθиλπφя+&%$@?*zyxwvutsrqponmlkjihgfedcba0987654321")

    if not filename.lower().endswith('.puz'):
        return None

    puz_object = puz.load(contents)
    puzzle = crossword.from_puz(puz_object)

    # Maps each solution value to the single grid character that represents it.
    grid_dict = dict(zip(string.ascii_uppercase, string.ascii_uppercase))

    xd = xdfile.xdfile()

    md = {k.lower(): v for k, v in puzzle.meta() if v}

    # The creator field may combine author and editor in several ways.
    author = md.get("creator", "")
    if " / " in author:
        author, editor = author.split(" / ")
    else:
        editor = ""

    author = author.strip()
    editor = editor.strip()

    for editsep in ["edited by ", "ed. "]:
        i = author.lower().find(editsep)
        if i == 0:
            # "<editsep><editor>,<author>": everything after the separator is
            # kept as the editor, and the second comma-separated part (when
            # there is one) becomes the author.
            editor = author[len(editsep):]
            parts = editor.split(",")
            if len(parts) > 1:
                author = parts[1]
        elif i > 0 and not editor:
            # "<author> <editsep><editor>"; left untouched when an editor was
            # already found.
            editor = author[i + len(editsep):]
            author = author[:i]

    author = author.strip()
    editor = editor.strip()

    while author.lower().startswith("by "):
        author = author[3:]

    if author and author[-1] in ",.":
        author = author[:-1]

    md["creator"] = author
    md["editor"] = editor

    for k, v in sorted(md.items(), key=lambda x: hdr_order.index(x[0])):
        if v:
            k = k[0].upper() + k[1:].lower()
            v = decode(v.strip())
            v = v.replace(u"©", "(c)")
            xd.headers.append((k, v))

    answers = {}   # e.g. "A1" / "D1" -> answer string read off the grid
    clue_num = 1

    for r, row in enumerate(puzzle):
        rowstr = ""
        for c, cell in enumerate(row):
            if puzzle.block is None and cell.solution == '.':
                rowstr += xdfile.BLOCK_CHAR
            elif puzzle.block == cell.solution:
                rowstr += xdfile.BLOCK_CHAR
            elif cell == puzzle.empty:
                rowstr += "."
            else:
                if cell.solution not in grid_dict:
                    # unknown value: treat as a rebus and give it a shorthand
                    grid_dict[cell.solution] = rebus_shorthands.pop()

                rowstr += grid_dict[cell.solution]

                # compute number shown in box
                new_clue = False
                if is_block(puzzle, c - 1, r):  # across clue start
                    j = 0
                    answer = ""
                    while not is_block(puzzle, c + j, r):
                        answer += puzzle[c + j, r].solution
                        j += 1

                    if len(answer) > 1:
                        new_clue = True
                        answers["A" + str(clue_num)] = answer

                if is_block(puzzle, c, r - 1):  # down clue start
                    j = 0
                    answer = ""
                    while not is_block(puzzle, c, r + j):
                        answer += puzzle[c, r + j].solution
                        j += 1

                    if len(answer) > 1:
                        new_clue = True
                        answers["D" + str(clue_num)] = answer

                if new_clue:
                    clue_num += 1
        xd.grid.append(rowstr)

    for number, clue in puzzle.clues.across():
        xd.clues.append((("A", number), decode(clue), answers["A" + str(number)]))

    for number, clue in puzzle.clues.down():
        xd.clues.append((("D", number), decode(clue), answers["D" + str(number)]))

    return xd