Example #1
def find_pubid(rowstr):
    '''rowstr is a concatenation of all metadata fields.
    Returns None if the regex file is missing or no unique match is found.
    '''
    try:
        with open(PUBREGEX_TSV, 'r') as fp:
            regexes = utils.parse_tsv_data(fp.read())
    except FileNotFoundError:
        utils.error("File does not exist: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    matching = set()
    for r in regexes:
        if re.search(r['regex'], rowstr, flags=re.IGNORECASE):
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
        return None

    if len(matching) > 1:
        utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(matching)))
        return None

    return matching.pop()
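
A minimal usage sketch (the metadata values and resulting pubid are hypothetical; PUBREGEX_TSV, utils, and re come from the surrounding xdfile module):

# Hypothetical metadata row: the fields are simply concatenated into one string.
rowstr = " ".join(["The Daily Example", "2016-01-01", "A. Setter"])
pubid = find_pubid(rowstr)  # a pubid such as "nyt" on a unique match, else None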
Example #2
def pubyear_html(pubyears=None, skip_decades=None):
    """
    skip_decades defaults to { 'start': 1910, 'end': 1970 }
    """
    global g_all_pubyears
    if not g_all_pubyears:
        g_all_pubyears = utils.parse_tsv_data(
            open("pub/pubyears.tsv").read(), "pubyear")

    # Read similarity data to shade the widget backgrounds
    similar_d = defaultdict(dict)
    for xdid, v in utils.parse_tsv('gxd/similar.tsv', "similar").items():
        xd_split = utils.split_xdid(xdid)
        if xd_split:
            pubid, year, mon, day = xd_split
            if year in similar_d[pubid]:
                similar_d[pubid][year].append(int(v.similar_grid_pct))
            else:
                similar_d[pubid][year] = [int(v.similar_grid_pct)]

    b = []  # Body

    # Build the list of collapsed decades from the args
    skip_decades = skip_decades if skip_decades else {
        'start': 1910,
        'end': 1970
    }
    allyears = []
    for i in range(skip_decades['start'] // 10, skip_decades['end'] // 10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend([
        str(y) for y in range(skip_decades['end'] + 10,
                              date.today().year + 1)
    ])

    pubs = defaultdict(dict)
    # generate widget for each year
    for dowl in g_all_pubyears:
        dow = {}
        pubid, year, total = dowl[:3]
        hint = ''
        for d, v in zip(utils.WEEKDAYS, dowl[3:]):
            dow[d] = {'count': int(v) // 2, 'class': ''}
            dow[d]['class'] = 'red' if d == 'Sun' else 'ord'
            hint += '%s - %s\n' % (d, v)
        hint += 'Total: %s\n' % (total)
        # Define fill class based on average similarity
        fill_class = None  # default fill class for widget
        if year in similar_d[pubid]:
            s_avg = sum(similar_d[pubid][year]) / len(similar_d[pubid][year])
            hint += 'Avg similarity: %.2f%%' % (s_avg)
            # Highlight widgets whose average similarity is at least 10%
            fill_class = 'similar10' if s_avg >= 10 else None

        # Collect the per-year blocks used to build the table below
        pubs[pubid][year] = {
            'dow_data': dow,
            'widget': year_widget(dow, total, fill_class),
            'hint': hint,
            'total': int(total),
        }
    # Process for all decades
    for dec_year in [x for x in allyears if 's' in x]:
        for pubid in pubs:
            year_key = dec_year[:-2]  # strip the trailing "0s" to get the decade prefix
            total = 0
            for yf in [x for x in pubs[pubid] if year_key in x]:
                total += pubs[pubid][yf]['total']
            hint = 'Total: %s' % (total)
            if total > 0:
                pubs[pubid][dec_year] = {
                    'widget': decade_widget(total),
                    'hint': hint,
                    'total': int(total),
                }

    # main table
    b.append('<table class="pubyears">')
    yhdr = ['&nbsp;'] + [split_year(y) for y in allyears]
    yhdr.append("all")
    b.append(
        td_with_class(*yhdr,
                      classes=get_pubheader_classes(*yhdr),
                      rowclass="pubyearhead",
                      tag="th"))
    b.append(tr_empty())

    # Process each pubid sorted by earliest year
    for pubid in sorted(pubs, key=lambda x: min(pubs[x])):
        pub = metadb.xd_publications().get(pubid)
        pubname = pub.PublicationName if pub else ''
        # Publication id goes in the first column
        b.append(mktag('tr'))
        b.append(mktag('td', 'pub'))
        b.append(mkcell(
            space_with_nbsp(pubname or pubid),
            "/pub/" + pubid,
        ))
        b.append(mktag('/td'))

        # Process each year not collapsed into decade
        for yi in allyears:
            if yi in pubs[pubid] and pubs[pubid][yi]['total'] > 0:
                b.append(mktag('td', 'this'))
                # Put link directly to year or to decade
                href = "/pub/%s%s" % (
                    pubid,
                    yi) if 's' not in yi else "/pub/%s/index.html#%s" % (
                        pubid, yi[:-1])
                b.append(
                    mkcell(pubs[pubid][yi]['widget'],
                           href=href,
                           title=pubs[pubid][yi]['hint']))
                b.append(mktag('/td'))
            else:
                b.append(mktag('td', 'block'))
                b.append('&nbsp;')
                b.append(mktag('/td'))

        b.append(mktag('td'))
        b.append(
            str(sum([pubs[pubid][x]['total'] for x in pubs[pubid].keys()])))
        b.append(mktag('/td'))
        b.append(mktag('/tr'))

    b.append(mktag('/table'))
    return (" ".join(b))
Example #3
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" % (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime,
                            ReceivedTime,
                            ExternalSource,
                            InternalSource,
                            SourceFilename,
                            xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
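
A hypothetical invocation, written in the Usage-comment style of the last example (output name, pubid, and input archive are all illustrative):

#   $0 -o crosswords.zip --pubid xyz --intsrc batch2016 download/batch2016.zip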
Example #4
#!/usr/bin/env python3

# Usage:
#   $0 -o wwwroot/ gxd/redirects.tsv

from xdfile import html, utils

args = utils.get_args()
outf = utils.open_output()

for tsvfn, contents in utils.find_files(*args.inputs):
    for row in utils.parse_tsv_data(contents.decode('utf-8'), "Redirect"):
        outf.write_file(row.SourcePath, html.redirect_page(row.DestURL))
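
The loop assumes each Redirect row has SourcePath and DestURL columns; an illustrative (not authoritative) gxd/redirects.tsv layout:

SourcePath	DestURL
old/xd1999.html	/pub/xyz/index.html#1999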
