def parser_main(): (bytes_to_process, files_to_process, files_to_ignore) = find_files_to_process() for path in files_to_ignore: print "Unparseable filename: {0}".format(os.path.basename(path)) print "Files to process: {0}".format(len(files_to_process)) print "Bytes to process: {0}".format(pretty_bytes(bytes_to_process)) print "Continue?" user_input = raw_input() if not 'yes'.startswith(user_input.lower()): return transactions = {} failed_lines = file(os.path.join(DATA_DIR, 'failed_lines.out'), 'w') failed_files = file(os.path.join(DATA_DIR, 'failed_files.out'), 'w') begin_time = time.time() for files_processed, ((filepath, import_date, filesize), bytes_processed) in enumerate(files_to_process, start=1): try: print print "Parsing {0}".format(os.path.basename(filepath)) file_transactions = parse_file(filepath, import_date) for (award_id, t) in file_transactions: if award_id not in transactions: transactions[award_id] = t except UnicodeDecodeError, error: log_error(db, filepath, "Unable to parse file: {0}".format(unicode(error))) except KeyboardInterrupt: break
def confirm_download_schedule(schedule): """Reports the total number of bytes and total number of files to download. Also lists the inaccessible files (based on HEAD response). Then asks user to confirm downloading. """ def content_length(tpl): return tpl[2][1] def status_code(tpl): return tpl[2][0] def href(tpl): return tpl[0] def is_OK(tpl): return status_code(tpl) == 200 def not_OK(tpl): return status_code(tpl) != 200 increment = lambda x, _: x + 1 file_count = ( schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0)) bytes_to_download = ( schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum) inaccessible_files = (schedule >> stream.filter(not_OK) >> list) if len(inaccessible_files) > 0: print print "Some files are inaccessible:" for (idx, sched) in enumerate(inaccessible_files): print "%d: %d %s" % (idx, status_code(sched), href(sched)) if bytes_to_download > 0: print print "Need to download %s in %d files." % ( pretty_bytes(bytes_to_download), file_count) print print "Are you sure you want to continue? [Y/n]" user_input = raw_input("> ") return (user_input.upper() in ("", "Y", "YES")) else: print print "Nothing to download." return False
def confirm_download_schedule(schedule): """Reports the total number of bytes and total number of files to download. Also lists the inaccessible files (based on HEAD response). Then asks user to confirm downloading. """ def content_length(tpl): return tpl[2][1] def status_code(tpl): return tpl[2][0] def href(tpl): return tpl[0] def is_OK(tpl): return status_code(tpl) == 200 def not_OK(tpl): return status_code(tpl) != 200 increment = lambda x, _: x + 1 file_count = schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0) bytes_to_download = schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum inaccessible_files = schedule >> stream.filter(not_OK) >> list if len(inaccessible_files) > 0: print print "Some files are inaccessible:" for (idx, sched) in enumerate(inaccessible_files): print "%d: %d %s" % (idx, status_code(sched), href(sched)) if bytes_to_download > 0: print print "Need to download %s in %d files." % (pretty_bytes(bytes_to_download), file_count) print print "Are you sure you want to continue? [Y/n]" user_input = raw_input("> ") return user_input.upper() in ("", "Y", "YES") else: print print "Nothing to download." return False
def offer_resume(): schedule_path = schedule_file_path() if os.path.exists(schedule_path): try: schedule = restore_schedule() except EOFError: print "Deleting corrupt download schedule." os.remove(schedule_path) return False size = os.path.getsize(schedule_path) print "A download schedule file exists:" print " %s (%s)" % (schedule_path, pretty_bytes(size)) print "You can either resume or delete this schedule." print "What would you like to do?" user_input = raw_input("[r]esume or [d]elete> ") if "DELETE".startswith(user_input.upper()): os.remove(schedule_path) return False else: return True else: return False
def offer_resume(): schedule_path = schedule_file_path() if os.path.exists(schedule_path): try: schedule = restore_schedule() except EOFError: print "Deleting corrupt download schedule." os.remove(schedule_path) return False size = os.path.getsize(schedule_path) print "A download schedule file exists:" print " %s (%s)" % (schedule_path, pretty_bytes(size)) print "You can either resume or delete this schedule." print "What would you like to do?" user_input = raw_input("[r]esume or [d]elete> ") if 'DELETE'.startswith(user_input.upper()): os.remove(schedule_path) return False else: return True else: return False
transactions[award_id] = t except UnicodeDecodeError, error: log_error(db, filepath, "Unable to parse file: {0}".format(unicode(error))) except KeyboardInterrupt: break now_time = time.time() bytes_per_second = bytes_processed / max(now_time - begin_time, 1) bytes_processed_pct = bytes_processed * 100 / bytes_to_process eta_seconds = (bytes_to_process - bytes_processed) / max( bytes_per_second, 1) print "{0}/{1} ({2}%), {3}/s, ETA {4}".format( pretty_bytes(bytes_processed), pretty_bytes(bytes_to_process), bytes_processed_pct, pretty_bytes(bytes_per_second), pretty_seconds(eta_seconds)) failed_lines.close() failed_files.close() print "Dumping awards dictionary..." with file(os.path.join(DATA_DIR, 'cfda_awards.out.bin'), 'wb') as outf: pickle.dump(transactions, outf) def fix_prefix(prefix): for stem in [ 'VA', 'DHS', 'HUD', 'USAID', 'DOJ', 'USTREAS', 'DOE', 'DOI', 'IMLS', 'DOC'
for (award_id, t) in file_transactions: if award_id not in transactions: transactions[award_id] = t except UnicodeDecodeError, error: log_error(db, filepath, "Unable to parse file: {0}".format(unicode(error))) except KeyboardInterrupt: break now_time = time.time() bytes_per_second = bytes_processed / max(now_time - begin_time, 1) bytes_processed_pct = bytes_processed * 100 / bytes_to_process eta_seconds = (bytes_to_process - bytes_processed) / max(bytes_per_second, 1) print "{0}/{1} ({2}%), {3}/s, ETA {4}".format( pretty_bytes(bytes_processed), pretty_bytes(bytes_to_process), bytes_processed_pct, pretty_bytes(bytes_per_second), pretty_seconds(eta_seconds)) failed_lines.close() failed_files.close() print "Dumping awards dictionary..." with file(os.path.join(DATA_DIR, 'cfda_awards.out.bin'), 'wb') as outf: pickle.dump(transactions, outf) def fix_prefix(prefix): for stem in ['VA', 'DHS', 'HUD', 'USAID', 'DOJ', 'USTREAS', 'DOE', 'DOI', 'IMLS', 'DOC']: