Example #1
def get_shelf_path(xd, pubid, mdtext):
    '''Return the shelf path (relative filing path) for puzzle xd,
    or None if its publication cannot be determined.'''
    if not pubid:
        pubid = find_pubid(mdtext)

    if pubid:
        publ = metadb.xd_publications()[pubid]
    else:
        publ = get_publication(xd)
        if publ:
            pubid = publ.PublicationAbbr
        else:
            return None

    if not pubid:
        utils.warn("unknown pubid for '%s'" % xd.filename)
        return None

    publisher = publ.PublisherAbbr

    num = xd.get_header('Number')
    if num:
        return "%s/%s-%03d" % (publisher or pubid, pubid, int(num))

    dt = xd.get_header("Date")
    if not dt:
        utils.warn("neither Number nor Date for '%s'" % xd.filename)
        return 'misc/' + xd.filename

    year = xdfile.year_from_date(dt)
    # e.g. "<PublisherAbbr>/<year>/<pubid><date>"
    return "%s/%s/%s%s" % (publisher, year, pubid, dt)
Example #2
def find_pubid(rowstr):
    '''rowstr is a concatenation of all metadata fields.
    Returns the pubid on a unique regex match, or None if the regex file
    is missing or the match is absent or ambiguous.
    '''
    try:
        with open(PUBREGEX_TSV, 'r') as fp:
            regexes = utils.parse_tsv_data(fp.read())
    except FileNotFoundError:
        utils.error("file does not exist: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    # collect every pubid whose regex matches the metadata string
    matching = set()
    for r in regexes:
        if re.search(r['regex'], rowstr, flags=re.IGNORECASE):
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
        return None

    if len(matching) > 1:
        utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(matching)))
        return None

    return matching.pop()
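A hedged usage sketch, assuming PUBREGEX_TSV holds pubid/regex rows (the column names match the lookups above; the sample values are invented):

# pub_regex.tsv (hypothetical contents):
#   pubid   regex
#   nyt     New York Times
#   lat     L\.?A\.? Times
pubid = find_pubid("The New York Times / Will Shortz / 2016-01-01")
# -> 'nyt' on a unique match; None if no regex (or more than one) matches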
Example #3
def clean_headers(xd):
    # remove known unwanted header fields, log unknown headers
    for hdr in list(xd.headers.keys()):
        if hdr in ["Source", "Identifier", "Acquired", "Issued", "Category"]:
            xd.set_header(hdr, None)
        else:
            if hdr.lower() not in xdfile.HEADER_ORDER:
                utils.warn("%s: '%s' header not known: '%s'" %
                           (xd.filename, hdr, xd.headers[hdr]))

    # clean Author and Editor headers; fall back to Creator for Author
    author = xd.get_header("Author") or ""
    if not author and xd.get_header("Creator"):
        author = xd.get_header("Creator")
        xd.set_header("Creator", None)

    editor = xd.get_header("Editor") or ""

    newauthor, neweditor = clean_author(author, editor)

    if newauthor != author:
        xd.set_header("Author" + CLEAN_SUFFIX, newauthor)

    if neweditor != editor:
        xd.set_header("Editor" + CLEAN_SUFFIX, neweditor)

    # clean Title header
    title = xd.get_header("Title") or ""
    newtitle = clean_title(title)

    if newtitle != title:
        xd.set_header("Title" + CLEAN_SUFFIX, newtitle)

    # create Date header
    dt = xd.get_header("Date")

    ## try getting Date from filename
    if not dt:
        try:
            d = utils.parse_date_from_filename(xd.filename)
            if d:
                dt = d.strftime("%Y-%m-%d")
        except Exception as e:
            utils.error(str(e))
            if args.debug:
                raise

    ## try getting Date from copyright
    if not dt:
        rights = xd.get_header("Copyright") or ""
        dt = find_date(rights)

    if dt:
        xd.set_header("Date", dt)
Example #4
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)

                debug("downloading '%s' from '%s'" % (fn, url))

                content = urllib.request.urlopen(url).read()
                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
                # record the source only for successful downloads
                sources_tsv += xd_sources_row(fn, url, todaystr)
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                # URLError has no .code attribute, so look it up defensively
                error('%s [%s] %s: %s' % (xdid, getattr(err, 'code', ''), err.reason, url))
            except Exception as e:
                error(str(e))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as fp:
            fp.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
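The loop above relies on get_dates_between to enumerate candidate publication dates. A sketch under assumed semantics (freq is a step in days and both bounds are datetime.date; the project's real helper may differ):

import datetime

def get_dates_between(from_date, to_date, freq=1):
    # assumed semantics: every freq-th day after from_date, up to to_date
    ndays = (to_date - from_date).days
    return [from_date + datetime.timedelta(days=i)
            for i in range(1, ndays + 1) if i % freq == 0]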
Example #5
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_ungotten_dates(pubid, from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)
        # fetch only the 10 oldest and 10 newest of the missing dates
        dates_to_get = all_dates_to_get[:10] + all_dates_to_get[-10:]

        summary("*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them"
                % (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date, len(dates_to_get)))
        most_recent[pubid] = str(download_puzzles(outf, puzsrc, pubid, dates_to_get))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as fp:
            fp.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
Example #6
def main():
    args = utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    # group the similar-grid rows by the xdid of the main puzzle
    xdids_todo = {}
    for row in metadb.xd_similar_all():
        xdids_todo.setdefault(row.xdid, []).append(row)

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        matches = xdids_todo[mainxdid]
        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {mainxdid: mainxd.date()}  # xdid -> date, for sorting below
        html_grids = {}
        html_clues = {}
        # store grids and clues per xdid for the html table layout below
        html_grids[mainxdid] = grid_diff_html(mainxd)

        # add the clue list for the main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
            diff_h += mktag('/div')
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # Process for all matches
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            # Continue if can't load xdid
            if not xd:
                continue
            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                if not clue:
                    continue
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos
                # get_clue_for_answer() can return None
                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                debug('MCLUE: %s [%s]' % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    # too different to diff: show the whole clue as-is
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                # answer is a match if the clue matched above or the answer is unchanged
                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                diff_h += mktag('span', tagclass=tagclass, inner='&nbsp;~&nbsp;' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # wrap the grids into the first table row, sorted by date
        diff_h = mktag('table') + mktag('tr')
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')
        # then one table row per clue position
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        diff_h += mktag('/table')

        outf.write_html('pub/%s/index.html' % mainxdid, diff_h, title='Comparison for ' + mainxdid)
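The html is assembled with the small mktag helper used throughout this example. It is defined elsewhere in the project; a sketch consistent with the calls above (opening tag with optional class, immediately closed when inner text is given, bare closing tag for '/name'):

def mktag(tagname, tagclass='', inner=''):
    # '/div'-style arguments emit a bare closing tag
    if tagname.startswith('/'):
        return '<%s>' % tagname
    out = '<%s' % tagname
    if tagclass:
        out += ' class="%s"' % tagclass
    out += '>'
    if inner:
        # when inner text is supplied, close the tag immediately
        out += inner + '</%s>' % tagname
    return out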