def main():
    """Split the input archives into one .zip per filename prefix.

    Every non-empty file found under the inputs is routed to
    ``<args.output>/<prefix>.zip``, where the prefix is the leading run of
    2-4 letters of the file's base name (lower-cased) when it is followed by
    a digit or dash and then a digit; anything else goes to ``misc.zip``.
    Each zip also receives a ``sources.tsv`` listing the files written to it.
    """
    parser = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    parser.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=parser)

    # The return value is not used below; the call is kept as-is.
    outf = open_output()

    # Fall back to the base name of the first input when no --source is given.
    source = args.source or parse_pathname(args.inputs[0]).base

    prefix_re = re.compile(r"^([a-z]{2,4})[\-0-9]{1}\d.*", flags=re.IGNORECASE)

    # prefix -> (output zip, list of sources.tsv rows for that zip)
    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            matched = prefix_re.match(parse_pathname(fn).base)
            prefix = matched.group(1).lower() if matched else "misc"

            entry = subzips.get(prefix)
            if entry is None:
                zip_path = os.path.join(args.output, prefix + ".zip")
                entry = (xdfile.utils.OutputZipFile(zip_path), [])
                subzips[prefix] = entry
            zf, rows = entry

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            rows.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, rows in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(rows))
def main():
    """Copy every non-empty input file to the output and catalog it.

    Each file is written under its path with the top-level component
    stripped, and a sources row is recorded for it using ``--source`` when
    given, otherwise the input it came from.  Finally ``<outbase>.tsv`` (the
    catalog) and ``<outbase>.log`` (the collected log) are written, where
    ``outbase`` is the base name of ``args.output``.
    """
    parser = args_parser('catalog source files and create source.tsv')
    parser.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=parser)

    info("importing from %s" % args.source)

    outf = open_output()
    rows = []

    for input_source in args.inputs:
        # An explicit --source overrides the per-input default.
        external_source = args.source or input_source

        for fn, contents, dt in find_files_with_time(input_source):
            if not len(contents):
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            rows.append(xd_sources_row(fn, external_source, iso8601(dt)))

    info("%s files cataloged" % len(rows))

    outbase = parse_pathname(args.output).base
    catalog = xd_sources_header + "".join(rows)
    outf.write_file("%s.tsv" % outbase, catalog)
    # Fetch the log only after the catalog is written, as before.
    outf.write_file("%s.log" % outbase, get_log())
def main():
    """Extract puzzle files from downloaded emails and send a receipt per email.

    For each input email that has a ``From`` header, the files produced by
    ``generate_email_files`` are logged; those whose size is strictly between
    1000 and 100000 bytes are written to the output and recorded as source
    rows.  The sender then gets either a success receipt listing the accepted
    rows or an error email when nothing was accepted.
    """
    args = get_args('parse downloaded emails')
    outf = open_output()

    # NOTE(review): accumulated across emails but never written out in the
    # code visible here -- confirm whether a sources.tsv write is missing.
    sources_tsv = ''

    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]
        if not upload_src:
            continue

        accepted_rows = []
        email_files = generate_email_files(msg)

        for puzfn, puzdata, puzdt in email_files:
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
            # NOTE(review): this summary is emitted once per file, not once
            # per email -- kept as in the original.
            summary("%s puzzles from %s" % (len(email_files), upload_src))

            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            if 1000 < len(puzdata) < 100000:
                accepted_rows.append(
                    xd_sources_row(puzfn, upload_src, iso8601(puzdt)))
                outf.write_file(puzfn, puzdata)

        # generate receipt row, send receipt email
        if not accepted_rows:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload error',
                          body='No puzzle files received')
            continue

        xd_send_email(upload_src,
                      fromaddr='*****@*****.**',
                      subject='Upload successful: %d files received' % len(accepted_rows),
                      body="These files were received:\n" + "\n".join(accepted_rows))
        sources_tsv += "".join(accepted_rows)
def main():
    """Process a huge puzzles archive into separate per-prefix .zip files.

    Non-empty files from every input are grouped by the 2-4 letter prefix of
    their base name (case-insensitive, lower-cased; ``misc`` when the name
    does not match the expected ``<letters><digit-or-dash><digit>...``
    shape).  Each group is written to ``<args.output>/<prefix>.zip`` together
    with a ``sources.tsv`` describing the files placed in that zip.
    """
    argp = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    argp.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=argp)

    # Return value is unused in this function; the call itself is preserved.
    outf = open_output()

    if args.source:
        source = args.source
    else:
        # Default the external source to the first input's base name.
        source = parse_pathname(args.inputs[0]).base

    # Maps prefix -> (zip writer, accumulated sources.tsv rows).
    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            base = parse_pathname(fn).base
            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*', base,
                         flags=re.IGNORECASE)
            if m:
                prefix = m.group(1).lower()
            else:
                prefix = 'misc'

            try:
                zf, sources = subzips[prefix]
            except KeyError:
                # First file for this prefix: open its zip lazily.
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def main():
    """Download puzzles published since each source's last recorded download.

    For every row returned by ``metadb.xd_recent_downloads()``, the dates
    between the recorded date and today are computed (stepping by the
    source's ``freq``), and each date's puzzle is fetched from the source's
    ``urlfmt`` and written to the output.  Sources that are unknown, or whose
    ``urlfmt`` is empty or starts with ``#``, are skipped with a warning.

    Afterwards ``sources.tsv`` is written to the output when at least one
    puzzle was downloaded, and ``metadb.RECENT_DOWNLOADS_TSV`` is rewritten on
    the filesystem with the most recent successful download per publication.

    Download failures are logged via ``error`` and do not abort the run.
    """
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    puzzle_sources = xd_puzzle_sources()

    # Rows for sources.tsv; only successfully downloaded files are recorded.
    source_rows = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]
        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s"
                % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)

                debug("downloading '%s' from '%s'" % (fn, url))

                # Close the HTTP response as soon as the body is read.
                with urllib.request.urlopen(url) as response:
                    content = response.read()

                outf.write_file(fn, content)
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                # Only HTTPError carries .code; a plain URLError (DNS failure,
                # refused connection, ...) has just .reason, so reading
                # err.code directly would raise inside this handler.
                error('%s [%s] %s: %s'
                      % (xdid, getattr(err, 'code', None), err.reason, url))
            except Exception as e:
                error(str(e))
            else:
                most_recent[pubid] = todaystr
                # Record the source row only for files actually written, so
                # sources.tsv never lists a failed download.
                source_rows.append(xd_sources_row(fn, url, todaystr))

    new_recents_tsv = [xd_recent_download(k, v) for k, v in most_recent.items()]

    if source_rows:
        outf.write_file("sources.tsv", xd_sources_header + "".join(source_rows))

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as recents_file:
            recents_file.write(xd_recents_header + "".join(sorted(new_recents_tsv)))