# Standard-library imports used by the entry points below. The billy-specific
# names referenced here (base_arg_parser, settings, db, COMMAND_MODULES,
# import_command_module, BaseCommand, the scraper helpers, and ScrapeError)
# come from the billy package and are assumed to be imported alongside these.
import argparse
import datetime as dt
import importlib
import json
import logging
import os
import pdb
import sys
import traceback


def main():
    parser = argparse.ArgumentParser(description='generic billy util',
                                     parents=[base_arg_parser])
    subparsers = parser.add_subparsers(dest='subcommand')

    # import command plugins
    for mod in COMMAND_MODULES:
        import_command_module(mod)

    # instantiate all subcommands
    subcommands = {}
    for SubcommandCls in BaseCommand.subcommands:
        subcommands[SubcommandCls.name] = SubcommandCls(subparsers)

    # parse arguments, update settings, then run the appropriate function
    args = parser.parse_args()
    settings.update(args)
    subcommands[args.subcommand].handle(args)
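
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of billy): a minimal subcommand plugin
# for the dispatcher above. It implements just the interface main() exercises:
# a `name` attribute, a constructor that receives the argparse subparsers
# object, and a handle(args) method. It assumes that subclassing BaseCommand
# is what populates BaseCommand.subcommands, as the loop in main() suggests;
# the class, command name, and flag below are hypothetical.
# ---------------------------------------------------------------------------
class ExampleCommand(BaseCommand):
    name = 'example'
    help = 'print the active billy settings (illustrative only)'

    def __init__(self, subparsers):
        # register an 'example' subparser so `billy-util example` parses
        self.subparser = subparsers.add_parser(self.name, help=self.help)
        self.subparser.add_argument('--verbose', action='store_true',
                                    default=False,
                                    help='print all settings, not a summary')

    def handle(self, args):
        # settings has already been updated from args by main()
        if args.verbose:
            print(settings)
        else:
            print('settings loaded for subcommand %s' % self.name)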
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str,
                            help='scraper module (e.g. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')

        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const',
                              dest='types', const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action='append_const', const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                # chained assignment: keep the whole exc_info tuple as well as
                # the traceback object, which is reused when re-raising below
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather than let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #   -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
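
# ---------------------------------------------------------------------------
# Illustrative usage (derived from the arguments defined above; 'nc' and the
# session value are placeholders, not guaranteed to exist):
#
#   billy-update nc                           # default: scrape, import, report
#   billy-update nc --bills --scrape -s 2011  # scrape only bills, one session
#   billy-update nc --session-list            # print module.session_list()
#
# Standard entry-point guard, assuming this module may also be run directly
# (e.g. via `python -m`) rather than only through a console-script wrapper.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    main()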