Пример #1
0
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join(
        ["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header(
            "Author",
            "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" %
                         (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))

                    missing_tsv += COLUMN_SEPARATOR.join(
                        [xd.xdid(), pubid, str(nmissing)]) + EOL

            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(
                    parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
Пример #2
0
def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)

            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
Пример #3
0
def main():
    p = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    p.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else "misc"
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Пример #4
0
def main():
    global all_uses
    args = get_args('create word pages and index')
    outf = open_output()

    all_uses = {}

    for ca in clues():
        if ca.answer not in all_uses:
            all_uses[ca.answer] = []
        all_uses[ca.answer].append(ca)

    h = '<li>%d different words</li>' % len(all_uses)

    h += '<h2>Most used words</h2>'
    h += '<table class="clues most-used-words">'
    h += th("word", "# uses", "clues used with this answer")

    wordpages_to_make = set(args.inputs)

    for answer, uses in sorted(all_uses.items(), reverse=True, key=lambda x: len(x[1]))[:100]:
        wordpages_to_make.add(answer)
        h += td(mkhref(answer.upper(), answer.upper()),
                len(uses),
                html_select_options(uses, strmaker=lambda ca: ca.clue))

    h += '</table>'

    for word in wordpages_to_make:
        outf.write_html('word/%s/index.html' % word.upper(), mkwww_wordpage(word), title=word)

    outf.write_html('word/index.html', h, title="Words")
Пример #5
0
def main():
    args = get_args('aggregates all .log files into one .html')
    outwww = open_output()
    log_html = ''
    for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
        log_html += '\n\n<h2>%s</h2><pre>%s</pre>' % (fn, cgi.escape(contents.decode("utf-8")))

    datestr = iso8601()
    outwww.write_html("logs.html", log_html, title="logs for " + datestr)
Пример #6
0
def main():
    args = utils.get_args()
    outf = utils.open_output()

    for htmlfn, contents in utils.find_files(*args.inputs):
        basepagename = utils.parse_pathname(htmlfn).base

        wrappeddiv = '<div class="text">' + contents.decode('utf-8') + '</div>'
        outf.write_html('%s/index.html' % basepagename, wrappeddiv)
Пример #7
0
def main():
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()

    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (len(clues()), len(boiled_clues))

    bcs = [ (len(v), bc, answers_from(v)) for bc, v in boiled_clues.items() ]

    nreused = len([bc for n, bc, _ in bcs if n > 1])
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (nreused, nreused*100/len(boiled_clues))

    cluepages_to_make = set()

    # add all boiled clues from all input .xd files
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        progress(fn)
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        for pos, mainclue, mainanswer in xd.iterclues():
            cluepages_to_make.add(boil(mainclue))


    # add top 100 most used boiled clues from corpus
    biggest_clues += '<h2>Most used clues</h2>'

    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")
    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))

    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")

    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))

    most_ambig += '</table>'

    for bc in cluepages_to_make:
        contents = mkwww_cluepage(bc)
        if contents:
            outf.write_html('pub/clue/%s/index.html' % bc, contents, title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
Пример #8
0
def main():
    args = get_args("save all clues in simple .tsv")
    outf = open_output("clues.tsv")

    outf.write_row("PublicationId Date Answer Clue".split())
    for xd in corpus(*args.inputs):
        pubid = xd.publication_id()
        dt = xd.date()
        for pos, clue, answer in xd.clues:
            outf.write_row((pubid or "", dt or "", answer, clue))
Пример #9
0
def main():
    args = get_args("save all clues in simple .tsv")
    outf = open_output("clues.tsv")

    outf.write_row("PublicationId Date Answer Clue".split())
    for xd in corpus(*args.inputs):
        pubid = xd.publication_id()
        dt = xd.date()
        for pos, clue, answer in xd.clues:
            outf.write_row((pubid or "", dt or "", answer, clue))
Пример #10
0
def main():
    args = utils.get_args()

    outf = utils.open_output()  # should be .zip

    outf.log = False
    outf.toplevel = 'xd'
    outf.write_file('README', open('doc/zip-README').read())
    outf.write_file('puzzles.tsv', open('pub/puzzles.tsv').read())
    outf.write_file('stats.tsv', open('pub/stats.tsv').read())
    outf.write_file('similar.tsv', open('gxd/similar.tsv').read())
Пример #11
0
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join([ "grid_xdid", "clues_pubid", "num_missing" ]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))

                    missing_tsv += COLUMN_SEPARATOR.join([ xd.xdid(), pubid, str(nmissing) ]) + EOL

            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
Пример #12
0
def main():
    args = get_args(desc="find similar grids")
    g_corpus = [x for x in corpus()]

    outf = open_output()

    outf.write(xd_similar_header)

    for fn, contents in find_files(*args.inputs, strip_toplevel=False):
        needle = xdfile(contents.decode("utf-8"), fn)
        for pct, a, b in find_similar_to(needle, g_corpus):
            outf.write(xd_similar_row(a, b, pct))
Пример #13
0
def main():
    args = utils.get_args()
    outf = utils.open_output()  # should be .zip

    outf.log = False
    outf.toplevel = 'xd'
    outf.write_file('README', open('doc/zip-README').read())

    for fn, contents in sorted(utils.find_files(*args.inputs, ext='.xd')):
        xdid = utils.parse_xdid(fn)
        if metadb.is_public(xdid):
            outf.write_file(utils.strip_toplevel(fn), contents)
Пример #14
0
def main():
    args = utils.get_args('make clues.tsv files')
    outf = utils.open_output()  # should be .zip

    outf.log = False
    outf.toplevel = 'xd'
    outf.write_file('README', open('doc/zip-README').read())

    all_clues = [(ca.pubid, str(xdfile.year_from_date(ca.date)), ca.answer, ca.clue) for ca in xdfile.clues()]

    clues_tsv = ''

    clues_tsv += '\t'.join("pubid year answer clue".split()) + '\n'
    clues_tsv += '\n'.join('\t'.join(cluerow) for cluerow in sorted(all_clues))
    outf.write_file('clues.tsv', clues_tsv)
Пример #15
0
def main():
    args = utils.get_args('make clues.tsv files')
    outf = utils.open_output()  # should be .zip

    outf.log = False
    outf.toplevel = 'xd'
    outf.write_file('README', open('doc/zip-README').read())

    all_clues = [(ca.pubid, str(xdfile.year_from_date(ca.date)), ca.answer,
                  ca.clue) for ca in xdfile.clues()]

    clues_tsv = ''

    clues_tsv += '\t'.join("pubid year answer clue".split()) + '\n'
    clues_tsv += '\n'.join('\t'.join(cluerow) for cluerow in sorted(all_clues))
    outf.write_file('clues.tsv', clues_tsv)
Пример #16
0
def main():
    args = get_args('aggregates all .log files')
    outf = open_output()

    s3 = boto3.resource('s3')
    s3path = "logs/"
    # bucket = conn.get_bucket(s3path)
    bucket = s3.Bucket(os.environ['DOMAIN'])

    for obj in sorted(bucket.objects.all(), key=lambda x: x.last_modified):
        # last_modified
        if s3path in obj.key:
            print("Name: %s LastModified:%s" % (obj.key.encode('utf-8'), obj.last_modified))

    for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
        outf.write_file(fn, contents.decode("utf-8"))
Пример #17
0
def main():
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()

    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (len(clues()), len(boiled_clues))

    bcs = [ (len(v), bc, answers_from(v)) for bc, v in boiled_clues.items() ]

    nreused = len([bc for n, bc, _ in bcs if n > 1])
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (nreused, nreused*100/len(boiled_clues))

    cluepages_to_make = set()

    biggest_clues += '<h2>Most used clues</h2>'

    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")
    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))

    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")

    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))

    most_ambig += '</table>'

    for bc in cluepages_to_make:
        outf.write_html('pub/clue/%s/index.html' % bc, mkwww_cluepage(bc), title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
Пример #18
0
def main():
    args = get_args('aggregates all .log files')
    outf = open_output()

    s3 = boto3.resource('s3')
    s3path = "logs/"
    # bucket = conn.get_bucket(s3path)
    bucket = s3.Bucket(os.environ['DOMAIN'])

    for obj in sorted(bucket.objects.all(), key=lambda x: x.last_modified):
        # last_modified
        if s3path in obj.key:
            print("Name: %s LastModified:%s" %
                  (obj.key.encode('utf-8'), obj.last_modified))

    for fn, contents, dt in sorted(find_files_with_time(*args.inputs,
                                                        ext=".log"),
                                   key=lambda x: x[2]):  # earliest first
        outf.write_file(fn, contents.decode("utf-8"))
Пример #19
0
def main():
    args = get_args('parse downloaded emails')
    outf = open_output()

    sources_tsv = ''
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]

        if not upload_src:
            continue

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))

        summary("%s puzzles from %s" % (len(email_files), upload_src))

            if len(puzdata) > 1000 and len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))

                outf.write_file(puzfn, puzdata)

        # generate receipt row, send receipt email

        if email_sources_tsv:
            xd_send_email(upload_src,
                    fromaddr='*****@*****.**',
                    subject='Upload successful: %d files received' % len(email_sources_tsv),
                    body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src,
                    fromaddr='*****@*****.**',
                    subject='Upload error',
                    body='No puzzle files received')
Пример #20
0
def main():
    p = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*',
                         parse_pathname(fn).base,
                         flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else 'misc'
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Пример #21
0
def main():
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
    xdids_todo = [ parse_pathname(fn).base for fn in args.inputs ]
    if not xdids_todo:
        xdids_todo = [ xdid for xdid, matches in metadb.get_similar_grids().items() if matches ]

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            continue

        matches = metadb.get_similar_grids().get(mainxdid, [])

        xddates = {}
        xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        poss_answers = [] # TODO:
        pub_uses = {}  # [pubid] -> set(ClueAnswer)

        dcl_html = ''
        deepcl_html = [] # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            deepcl_html = [] # Temporary to be replaced late
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            stale = False
            deepcl_html.append('<td class="other-uses">')

            if len(pub_uses) > 0:
                sortable_uses = []
                for pubid, uses in pub_uses.items():
                    # show the earliest unboiled clue
                    for u in sorted(uses, key=lambda x: x.date or ""):
                        # only show those published earlier
                        if u.date and u.date <= mainxd.date():
                            if pubid == mainxdid and u.date == mainxd.date():
                                pass
                            else:
                                stale = True
                                sortable_uses.append((u.date, u, 1))

                deepcl_html.append(html_select([ (clue, nuses) for dt, clue, nuses in sorted(sortable_uses, key=lambda x: x[0], reverse=True) ], top_option=mainclue))

            else:
                deepcl_html.append('<div class="original">%s</div>' % esc(mainclue))

            deepcl_html.append('</td>')

            # add 'other answers' to clues_html

            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < mainxd.date() ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                            uses.append((ca, nuses))

                if uses:
                    deepcl_html.append(html_select(uses))

            deepcl_html.append('</td>')  # end 'other-clues'

            if stale_answer:
                nstaleanswers += 1
            if stale:
                nstaleclues += 1
            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Store in list to make further formatting as html table easier
        mainxd = xdfile.get_xd(mainxdid)
        if mainxd:
            html_grids[mainxdid] = grid_diff_html(mainxd)

        # Add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            diff_h = mktag('div','fullgrid main') + '%s.&nbsp;' %pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner='&nbsp;~&nbsp;' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l
        # Process for all matches
        for xdid in matches:
            xd = xdfile.get_xd(xdid)
            if not xd:
                continue
            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div','fullgrid') + '%s.&nbsp;' %pos
                # Sometimes can return clue == None
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainxd.get_clue(pos) or '', clue)
                if sm.ratio() < 0.50:
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]
                diff_h += mktag('span', tagclass=(answer == mainxd.get_answer(pos)) and 'match' or 'diff', inner='&nbsp;~&nbsp;' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag('table') + mktag('tr')
        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            # Wrap into table
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        # Process deepclues
        diff_h += mktag('table') + dcl_html + mktag('/table')

        diff_h += mktag('/table')
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h,
                    title='Deep clue comparison for ' + mainxdid)
Пример #22
0
def main():
    p = utils.args_parser(desc="generate pubyear svg and pubyear pages")
    p.add_argument('-p',
                   '--pubonly',
                   action="store_true",
                   default=False,
                   help='only output root map')
    args = utils.get_args(parser=p)
    outf = utils.open_output()

    pubyears = defaultdict(list)
    pubyears_idx = defaultdict(list)
    # years_idx = []
    for r in metadb.read_rows('pub/stats'):
        y = r.year or '0000'
        pubyear = r.pubid + str(y)
        pubyears[pubyear].append(r)
        if y not in pubyears_idx[r.pubid]:
            pubyears_idx[r.pubid].append(y)
        # if r.year not in years_idx:
        #    years_idx.append(r.year)

    # Making collapsed decades depends on args
    allyears = []
    for i in range(DECADE_SKIP_START // 10, DECADE_SKIP_END // 10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend(
        [str(y) for y in range(DECADE_SKIP_END + 10,
                               date.today().year + 1)])

    html_out = []
    html_out.append(
        '<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>'
    )
    html_out.append(legend)  # See definition above
    html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">')

    # Table header with years \ decades
    year_header = gen_year_header(allyears)
    html_out.extend(year_header)

    pubs_total = {}
    for pubid in pubyears_idx:
        pubs_total[pubid] = len(metadb.xd_puzzles(pubid))

    # sort rows by number of puzzles
    sorted_pubs = sorted(pubs_total.keys(),
                         key=lambda pubid: pubs_total[pubid],
                         reverse=True)
    for pub in args.inputs or sorted_pubs:
        if pubs_total[pub] < 20:
            continue

        # Process each pub in index
        pubobj = metadb.xd_publications().get(pub)
        if pubobj:
            pubname = pubobj.PublicationName or pubobj.PublisherName
        else:
            pubname = pub
        html_out.append('<tr><td class="header">{}</td>'.format(
            html.mkhref(pubname, 'pub/' + pub)))

        for year in sorted(allyears):
            html_out.append('<td class="year_widget">')
            py_td = td_for_pubyear(pubyears, pub, year)
            if py_td:
                html_out.append(py_td)
                if not args.pubonly:
                    outf.write_html(
                        'pub/{pub}{year}/index.html'.format(**locals()),
                        pubyear_html(pub, year),
                        "{pubname}, {year}".format(**locals()))
            else:
                # otherwise
                width = svg_w if 's' not in year else svg_w * decade_scale
                html_out.append(
                    pys.format(w=width,
                               h=svg_h,
                               title='',
                               classes='notexists',
                               body=''))

            html_out.append('</td>')

        # Add totals + publishers
        html_out.append('<td class="header">{}</td>'.format(pubs_total[pub]))
        html_out.append('<td class="header">{}</td>'.format(
            html.mkhref(pubname, 'pub/' + pub)))
        html_out.append('</tr>')

    html_out.extend(year_header)
    html_out.append('</table>')
    total_xd = len(metadb.xd_puzzles())
    outf.write_html('index.html', "".join(html_out),
                    "Comparison of %s published crossword grids" % total_xd)
Пример #23
0
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]

    pubyears = {} # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)

            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
            def process_counter(count, comp_value):
                # Process counter comparing with comp_value
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            #
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2): # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                                  (pubid, year, weekday,
                                      mainformat, maineditor, maincopyright,
                                      nexisting, nxd, npublic,
                                      reprints, touchups, redones,
                                      copies, themecopies))
Пример #24
0
def main():
    args = utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
    xdids_todo = {}

    for row in metadb.xd_similar_all():
        if row.xdid not in xdids_todo:
            xdids_todo[row.xdid] = []

        xdids_todo[row.xdid].append(row)


    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        matches = xdids_todo[mainxdid]
        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {}
        xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}
        # Store in list to make further formatting as html table easier
        html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

        # Add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            diff_h = mktag('div','fullgrid main') + '%s.&nbsp;' %pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner='&nbsp;~&nbsp;' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # Process for all matches
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            # Continue if can't load xdid
            if not xd:
                continue
            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div','fullgrid') + '%s.&nbsp;' %pos
                if not clue:
                    continue
                # Sometimes can return clue == None
                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                debug('MCLUE: %s [%s]' % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                diff_h += mktag('span', tagclass=tagclass, inner='&nbsp;~&nbsp;' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag('table') + mktag('tr')
        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            # Wrap into table
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        diff_h += mktag('/table')

        outf.write_html('pub/%s/index.html' % mainxdid, diff_h, title='Comparison for ' + mainxdid)
Пример #25
0
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            "gxd/similar",
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
Пример #26
0
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)

            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit()
                        and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
            def process_counter(count, comp_value):
                # Process counter comparing with comp_value
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            #
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row(
                "pub/stats", (pubid, year, weekday, mainformat, maineditor,
                              maincopyright, nexisting, nxd, npublic, reprints,
                              touchups, redones, copies, themecopies))
Пример #27
0
def main():
    p = utils.args_parser(desc="generate pubyear svg and pubyear pages")
    p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map')
    args = utils.get_args(parser=p)
    outf = utils.open_output()

    pubyears = defaultdict(list)
    pubyears_idx = defaultdict(list)
    # years_idx = []
    for r in metadb.read_rows('pub/stats'):
        y = r.year or '0000'
        pubyear = r.pubid + str(y)
        pubyears[pubyear].append(r)
        if y not in pubyears_idx[r.pubid]:
            pubyears_idx[r.pubid].append(y)
        # if r.year not in years_idx:
        #    years_idx.append(r.year)

    # Making collapsed decades depends on args
    allyears = []
    for i in range(DECADE_SKIP_START//10, DECADE_SKIP_END//10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend([ str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1) ])

    html_out = []
    html_out.append('<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>')
    html_out.append(legend) # See definition above
    html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">')

    # Table header with years \ decades
    year_header = gen_year_header(allyears)
    html_out.extend(year_header)

    pubs_total = {}
    for pubid in pubyears_idx:
        pubs_total[pubid] = len(metadb.xd_puzzles(pubid))

    # sort rows by number of puzzles
    sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True)
    for pub in args.inputs or sorted_pubs:
        if pubs_total[pub] < 20:
            continue

        # Process each pub in index
        pubobj = metadb.xd_publications().get(pub)
        if pubobj:
            pubname = pubobj.PublicationName or pubobj.PublisherName
        else:
            pubname = pub
        html_out.append('<tr><td class="header">{}</td>'.format(html.mkhref(pubname, pub)))

        for year in sorted(allyears):
            html_out.append('<td class="year_widget">')
            py_td = td_for_pubyear(pubyears, pub, year)
            if py_td:
                html_out.append(py_td)
                if not args.pubonly:
                    outf.write_html('pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year),
                                    "{pubname}, {year}".format(**locals()))
            else:
                # otherwise
                width = svg_w if 's' not in year else svg_w*decade_scale
                html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body=''))

            html_out.append('</td>')

        # Add totals + publishers
        html_out.append('<td class="header">{}</td>'.format(pubs_total[pub]))
        html_out.append('<td class="header">{}</td>'.format(html.mkhref(pubname, pub)))
        html_out.append('</tr>')


    html_out.extend(year_header)
    html_out.append('</table>')
    total_xd = len(metadb.xd_puzzles())
    outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)
Пример #28
0
def main():
    args = utils.get_args("generates .html diffs for all puzzles in similar.tsv")
    outf = utils.open_output()

    similars = utils.parse_tsv("gxd/similar.tsv", "Similar")
    xdids_todo = args.inputs or [xdid for xdid, matches in metadb.get_similar_grids().items() if matches]
    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            continue

        matches = metadb.get_similar_grids().get(mainxdid, [])

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}
        # Store in list to make further formatting as html table easier
        html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

        # Add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            diff_h = mktag("div", "fullgrid main") + "%s.&nbsp;" % pos
            diff_h += mainclue
            diff_h += mktag("span", tagclass="main", inner="&nbsp;~&nbsp;" + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l
        # Process for all matches
        for xdid in matches:
            xd = xdfile.get_xd(xdid)
            # Continue if can't load xdid
            if not xd:
                continue
            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag("div", "fullgrid") + "%s.&nbsp;" % pos
                # Sometimes can return clue == None
                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == " ", mainclue or "", clue)
                debug("MCLUE: %s [%s]" % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == "equal":
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                tagclass = "match" if mainclue or answer == mainxd.get_answer(pos) else "diff"
                diff_h += mktag("span", tagclass=tagclass, inner="&nbsp;~&nbsp;" + answer.upper())
                diff_h += mktag("/div")
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag("table") + mktag("tr")
        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            # Wrap into table
            diff_h += mktag("td") + html_grids[w] + mktag("/td")
        diff_h += mktag("/tr")
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag("tr")
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag("td") + html_clues[w][i] + mktag("/td")
            diff_h += mktag("/tr")
        diff_h += mktag("/table")
        outf.write_html("pub/%s/index.html" % mainxdid, diff_h, title="Comparison for " + mainxdid)
Пример #29
0
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)

                debug("downloading '%s' from '%s'" % (fn, url))

                response = urllib.request.urlopen(url)
                content = response.read()

                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                error('%s [%s] %s: %s' % (xdid, err.code, err.reason, url))
            except Exception as e:
                error(str(e))

            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV, "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
Пример #30
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" % (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime,
                            ReceivedTime,
                            ExternalSource,
                            InternalSource,
                            SourceFilename,
                            xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
Пример #31
0
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        #        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        dates_to_get = get_ungotten_dates(pubid, from_date, to_date,
                                          int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)
        dates_to_get = dates_to_get[0:10] + dates_to_get[-10:]

        summary(
            "*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them"
            % (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date,
               len(dates_to_get)))
        most_recent[pubid] = str(
            download_puzzles(outf, puzsrc, pubid, dates_to_get))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

#    if sources_tsv:
#        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV,
             "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
Пример #32
0
def main():
    p = utils.args_parser(
        desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument(
        '-a',
        '--all',
        default=False,
        help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l',
                   '--limit',
                   default=100,
                   help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv
        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0],
                               reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, [])
                        if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            'gxd/similar',
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(
                    pct / 100.0
                    for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct))
                         for pct, xd1, xd2 in similar_grids)  # matches
            ])
Пример #33
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd':
        [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright',
                   default=None,
                   help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc',
                   default=None,
                   help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc',
                   default=None,
                   help='Value for receipts.InternalSource')
    p.add_argument('--pubid',
                   default=None,
                   help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source,
                                                         ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(
                    input_source, strip_toplevel=False),
                                           reverse=True,
                                           key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(
                        input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(
                    input_source).filename

                already_received = metadb.check_already_received(
                    ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn(
                            'previously received this same file under multiple xdids:'
                            + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn),
                                                      ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource,
                                               SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(
                                xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" %
                                  (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__,
                                                       str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" %
                              (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime, ReceivedTime, ExternalSource,
                            InternalSource, SourceFilename, xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
Пример #34
0
#!/usr/bin/env python3

# Usage:
#   $0 -o wwwroot/ gxd/redirects.tsv

from xdfile import html, utils

args = utils.get_args()
outf = utils.open_output()

for tsvfn, contents in utils.find_files(*args.inputs):
    for row in utils.parse_tsv_data(contents.decode('utf-8'), "Redirect"):
        outf.write_file(row.SourcePath, html.redirect_page(row.DestURL))

Пример #35
0
def main():
    args = utils.get_args(
        'generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xds_todo = []
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        xds_todo.append(xd)

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        matches = metadb.xd_similar(mainxdid)

        xddates = {}
        xddates[mainxdid] = mainxd.date(
        )  # Dict to store XD dates for further sort
        html_grids = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        dcl_html = '<tr>'
        dcl_html += '<th></th>'
        dcl_html += '<th>Clue</th>'
        dcl_html += '<th>ANSWERs</th>'
        dcl_html += '<th>Alt. clue possibilities</th>'
        dcl_html += '</tr>'

        deepcl_html = []  # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue

            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            deepcl_html = []  # Temporary to be replaced late
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            deepcl_html.append('<td class="other-uses">')

            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                deepcl_html.append('<a href="/pub/clue/%s">%s [x%s]</a>' %
                                   (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                deepcl_html.append(mainclue)

            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(
                html_select_options(poss_answers,
                                    strmaker=lambda ca: ca.answer,
                                    force_top=mainca,
                                    add_total=False))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                deepcl_html.append(other_clues)
                nstaleanswers += 1

            deepcl_html.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Process deepclues
        diff_h = '<div class="main-container">'
        diff_h += grid_to_html(mainxd)
        diff_h += mktag('table', 'deepclues') + dcl_html + mktag('/table')
        diff_h += '</div>'

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid,
                        diff_h,
                        title='Deep clue analysis for ' + mainxdid)