Exemplo n.º 1
0
    def __init__(self,
                 metadata,
                 no_cache=False,
                 output_dir=None,
                 strict_validation=None,
                 **kwargs):
        """
        Set up a Scraper on top of the underlying scrapelib object.

        :param metadata: metadata for this scraper
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        :param kwargs: forwarded to the parent constructor; any of
            cache_dir, error_dir, timeout, requests_per_minute,
            retry_attempts and retry_wait_seconds that are missing are
            filled in before the call
        :raises Exception: if the instance lacks an attribute listed in
            settings.BILLY_LEVEL_FIELDS for its level
        """

        # caching: no_cache always wins, otherwise only fill in a default
        if no_cache or 'cache_dir' not in kwargs:
            if no_cache:
                kwargs['cache_dir'] = None
            else:
                kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

        # remaining scrapelib options, paired with the settings attribute
        # supplying the fallback (None means the fallback is literally None)
        fallbacks = (
            ('error_dir', 'BILLY_ERROR_DIR'),
            ('timeout', 'SCRAPELIB_TIMEOUT'),
            ('requests_per_minute', None),
            ('retry_attempts', 'SCRAPELIB_RETRY_ATTEMPTS'),
            ('retry_wait_seconds', 'SCRAPELIB_RETRY_WAIT_SECONDS'),
        )
        for option, setting_name in fallbacks:
            if option in kwargs:
                # caller supplied a value explicitly; leave it alone
                continue
            if setting_name is None:
                kwargs[option] = None
            else:
                kwargs[option] = getattr(settings, setting_name)

        super(Scraper, self).__init__(**kwargs)

        # every scraper must carry the attributes required for its level
        for required in settings.BILLY_LEVEL_FIELDS[self.level]:
            if hasattr(self, required):
                continue
            raise Exception('%s scrapers must have a %s attribute' %
                            (self.level, required))

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods bound to the shared "billy" logger
        billy_logger = logging.getLogger("billy")
        self.logger = billy_logger
        self.log = billy_logger.info
        self.debug = billy_logger.debug
        self.warning = billy_logger.warning
Exemplo n.º 2
0
    def __init__(self,
                 metadata,
                 output_dir=None,
                 strict_validation=None,
                 fastmode=False):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use; it is created if it
            does not already exist
        :param strict_validation: exit immediately if validation fails
        :param fastmode: if True, requests_per_minute is set to 0 and
            cache_write_only is set to False
        :raises OSError: if output_dir cannot be created
        """
        super(Scraper, self).__init__()

        # scrapelib overrides
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        # if scraper uses dryscrape, set up session
        if settings.USES_DRYSCRAPE:
            dryscrape.start_xvfb()
            self.session = dryscrape.Session()

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        # The isdir() check on its own is racy: another process can create
        # the directory between the check and makedirs(). An OSError is
        # therefore only propagated if the directory still does not exist.
        if not os.path.isdir(self.output_dir):
            try:
                os.makedirs(self.output_dir)
            except OSError:
                if not os.path.isdir(self.output_dir):
                    raise

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
Exemplo n.º 3
0
    def __init__(self,
                 metadata,
                 output_dir=None,
                 strict_validation=None,
                 fastmode=False,
                 **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use; it is created if it
            does not already exist
        :param strict_validation: exit immediately if validation fails
        :param fastmode: if True, requests_per_minute is forced to 0 and
            cache_write_only to False
        :param kwargs: forwarded to the parent constructor; the keys
            cache_obj, requests_per_minute, timeout, retry_attempts and
            retry_wait_seconds are always overwritten from settings
        :raises OSError: if output_dir cannot be created
        """

        # configure underlying scrapelib object
        # NOTE: these assignments are unconditional, so caller-supplied
        # values for the same keys are replaced.
        kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
        kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
        kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs['requests_per_minute'] = 0
            kwargs['cache_write_only'] = False

        super(Scraper, self).__init__(**kwargs)

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        # The isdir() check on its own is racy: another process can create
        # the directory between the check and makedirs(). An OSError is
        # therefore only propagated if the directory still does not exist.
        if not os.path.isdir(self.output_dir):
            try:
                os.makedirs(self.output_dir)
            except OSError:
                if not os.path.isdir(self.output_dir):
                    raise

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
Exemplo n.º 4
0
def main():
    """
    Command-line entry point: scrape data for one state and save it to disk.

    Parses the command line, imports the state's scraper module to obtain
    its ``metadata``, creates the state's output directory under
    ``settings.BILLY_DATA_DIR``, validates and writes the metadata to
    ``state_metadata.json``, then runs one scraper per requested data type.

    :raises ScrapeError: if none of --bills, --legislators, --committees,
        --votes, --events or --alldata was given
    :raises OSError: if the output directory cannot be created for a reason
        other than it already existing
    """
    # local import: only needed for the EEXIST check below
    import errno

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    # the trailing space keeps the two adjacent literals from running
    # together in the --help output
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # args.verbose is not declared above; presumably it comes from
    # base_arg_parser -- verify against that parser
    configure_logging(args.verbose, args.state)

    # make output dir (an already-existing directory is not an error)
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # validate metadata; a validation failure is only logged as a warning
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        with open(schema_path) as schema_file:
            schema = json.load(schema_file)

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: '
                                           + str(e))

    # write metadata
    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    # -s/-t use action='append' without a default, so either may be None.
    # Each metadata term is a dict (it is indexed with 'sessions'), so the
    # requested term names are matched against its 'name' entry.
    sessions = list(args.sessions or [])
    if args.terms:
        for term in metadata['terms']:
            if term['name'] in args.terms:
                sessions.extend(term['sessions'])
    args.sessions = set(sessions)

    # determine chambers (default: both)
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    # NOTE: --alldata does not switch on events; those still require
    # an explicit --events
    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
Exemplo n.º 5
0
def main():
    """
    Command-line entry point for updating billy data.

    Parses the command line, imports the requested scraper module, merges
    the module's settings and the command-line settings into ``settings``,
    works out which terms/sessions/chambers/types/actions apply, and then
    runs the selected actions ('scrape', 'import', 'report',
    'session-list').

    A ScrapeError raised anywhere inside is logged at critical level and
    the process exits with status 1.
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        # argument groups, used to organise the options below
        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        # NOTE(review): the help text below says PDB although the flag
        # selects ipdb
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        # each of these flags appends its own name to a shared list
        # (args.chambers / args.types / args.actions); the list stays None
        # when no flag of that family is given
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        # (dest names are upper-case setting names; they reach settings via
        # settings.update(args) below)
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # pick the post-mortem debugger: pdb by default, replaced by pudb
        # and/or ipdb when requested and importable (ipdb is checked last,
        # so it wins if both are requested and available)
        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                # print the traceback, then drop into the chosen debugger
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        # (only the path is computed here; no directory is created in this
        # function)
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                # derive the terms from the sessions that were requested
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                # dedup terms
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        # still nothing: fall back to the last session of the last term
        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        # default: run every step except 'session-list'
        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            # NOTE(review): 'alldata' matches no entry of the `order` tuple
            # used by the scrape loop below, so that loop ignores it;
            # whether _do_imports/_do_reports use it is not visible here
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            # events and speeches are only added when the module's
            # metadata lists them in feature_flags
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        # log a summary of what is about to run
        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        # run record; saved to db.billy_runs further down
        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            # a ValueError from validation is only logged as a warning
            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                # NOTE(review): the file object is never closed explicitly
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            # run_record is shared with exec_record, so entries appended
            # below show up in exec_record['run_record'] as well
            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            # lex holds the exception that aborted scraping (if any),
            # exc_traceback its traceback
            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                # chained assignment: _traceback keeps the whole exc_info
                # triple, exc_traceback only the traceback object
                _traceback = _, _, exc_traceback = sys.exc_info()
                # stype is the scraper type that was running when it failed
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            # replace raw exception objects in the run record with a dict
            # of type name, message and formatted traceback
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        # NOTE(review): relies on the exception having a
                        # .message attribute (Python 2 style) -- confirm
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                # scraping failed: record the run if the import step was
                # requested, then re-raise
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        # Python 2 three-expression raise: raises lex with
                        # the traceback captured above
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing # to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                # NOTE(review): bare raise outside an except clause; this
                # presumably relies on Python 2 keeping the last handled
                # exception available for re-raise -- confirm
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        # print the sessions reported by the module, one per line
        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
Exemplo n.º 6
0
    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
    try:
        os.makedirs(args.output_dir)
    except OSError, e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(
            os.path.split(__file__)[0], '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError, e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            if term in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])