示例#1
0
文件: xdstats.py 项目: rbairwell/xd
def get_all_words():
    ret = {}  # ["ANSWER"] = number of uses
    for xd in xdfile.corpus():
        for pos, clue, answer in xd.clues:
            ret[answer] = ret.get(answer, 0) + 1

    return ret
示例#2
0
文件: enumclues.py 项目: rbairwell/xd
def main():
    args = get_args("save all clues in simple .tsv")
    outf = open_output("clues.tsv")

    outf.write_row("PublicationId Date Answer Clue".split())
    for xd in corpus(*args.inputs):
        pubid = xd.publication_id()
        dt = xd.date()
        for pos, clue, answer in xd.clues:
            outf.write_row((pubid or "", dt or "", answer, clue))
示例#3
0
def main():
    args = get_args("save all clues in simple .tsv")
    outf = open_output("clues.tsv")

    outf.write_row("PublicationId Date Answer Clue".split())
    for xd in corpus(*args.inputs):
        pubid = xd.publication_id()
        dt = xd.date()
        for pos, clue, answer in xd.clues:
            outf.write_row((pubid or "", dt or "", answer, clue))
示例#4
0
def main():
    args = get_args(desc="find similar grids")
    g_corpus = [x for x in corpus()]

    outf = open_output()

    outf.write(xd_similar_header)

    for fn, contents in find_files(*args.inputs, strip_toplevel=False):
        needle = xdfile(contents.decode("utf-8"), fn)
        for pct, a, b in find_similar_to(needle, g_corpus):
            outf.write(xd_similar_row(a, b, pct))
示例#5
0
文件: xdstats.py 项目: rbairwell/xd
def get_duplicate_puzzles():
    dupgrids = {}
    grids = {}
    for xd in xdfile.corpus():
        g = EOL.join(xd.grid)
        if g not in grids:
            grids[g] = [xd]
        else:
            grids[g].append(xd)
            dupgrids[g] = grids[g]

    return dupgrids.values()
示例#6
0
def main():
    args = utils.get_args(desc='find grid templates')

    templates = set()

    for xd in xdfile.corpus():
        tmpl = tuple(''.join(x if x == BLOCK_CHAR else UNKNOWN_CHAR for x in L) for L in xd.grid)
        templates.add(tmpl)

    print(len(templates), 'templates')

    for input_source in args.inputs:
        for fn, contents in utils.find_files(input_source, ext='.xd'):
            xd = xdfile.xdfile(contents.decode('utf-8'), fn)
            for i, T in enumerate(templates):
                griddedxd = fit_template(T, xd)
                if griddedxd:
                    with open(args.output + ('-t%s.xd' % i), 'w') as fp:
                        fp.write(griddedxd.to_unicode())
示例#7
0
文件: xdstats.py 项目: rbairwell/xd
def most_used_grids(n=1):

    all_grids = {}
    for xd in xdfile.corpus():
        empty = get_blank_grid(xd)

        if empty not in all_grids:
            all_grids[empty] = []

        all_grids[empty].append(xd.filename)

    print("%s distinct grids" % len(all_grids))

    most_used = sorted(all_grids.items(), key=lambda x: -len(x[1]))

    for k, v in most_used[0:n]:
        print("used %s times" % len(v))
        gridlines = k.splitlines()
        for g, u in itertools.izip_longest(gridlines, sorted(v)):
            print("%15s    %s" % (u or "", g or ""))
        print()
示例#8
0
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]

    pubyears = {} # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)

            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
            def process_counter(count, comp_value):
                # Process counter comparing with comp_value
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            #
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2): # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                                  (pubid, year, weekday,
                                      mainformat, maineditor, maincopyright,
                                      nexisting, nxd, npublic,
                                      reprints, touchups, redones,
                                      copies, themecopies))
示例#9
0
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            "gxd/similar",
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
示例#10
0
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)

            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit()
                        and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
            def process_counter(count, comp_value):
                # Process counter comparing with comp_value
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            #
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    ##deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row(
                "pub/stats", (pubid, year, weekday, mainformat, maineditor,
                              maincopyright, nexisting, nxd, npublic, reprints,
                              touchups, redones, copies, themecopies))
示例#11
0
文件: xdstats.py 项目: rbairwell/xd
def find_grid(fn):
    needle = get_blank_grid(xdfile(file(fn).read()))

    return [xd for xd in xdfile.corpus() if needle == get_blank_grid(xd)]
示例#12
0
文件: xdstats.py 项目: rbairwell/xd
            dupgrids[g] = grids[g]

    return dupgrids.values()


if __name__ == "__main__":
    # import sys

    all_words = get_all_words()
    print("%d unique words.  most used words:" % len(all_words))
    for word, num_uses in sorted(all_words.items(), key=lambda x: -x[1])[0:10]:
        print num_uses, word

    print
    groups = {"small": 0, "medium":0, "large":0, "huge":0}
    for xd in xdfile.corpus():
        if xd.width() < 14 and xd.height() < 14:
            groups["small"] += 1
        elif xd.height() < 17 and xd.height() < 17:
            groups["medium"] += 1
        elif xd.height() < 25 and xd.height() < 25:
            groups["large"] += 1
        else:
            print xd
            groups["huge"] += 1

    for k, v in groups.items():
        print("%s %s" % (k,v))

    print
示例#13
0
def main():
    p = utils.args_parser(
        desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument(
        '-a',
        '--all',
        default=False,
        help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l',
                   '--limit',
                   default=100,
                   help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv
        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0],
                               reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, [])
                        if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            'gxd/similar',
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(
                    pct / 100.0
                    for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct))
                         for pct, xd1, xd2 in similar_grids)  # matches
            ])