def import_data(self, data_items):
    """Import a batch of dicts and tally the outcome of each one.

    Every item produced by ``self._prepare_imports(data_items)`` is passed
    to ``self.import_item``, which returns the resulting object id and the
    action taken. The id is remembered in ``self.json_to_db_id`` under the
    item's JSON id, and the action is counted.

    Returns a one-entry dict keyed by ``self._type`` whose value holds the
    per-action counts, the ids touched per action (under ``'records'``),
    and the ``'start'`` / ``'end'`` timestamps of the run.
    """
    action_names = ('insert', 'update', 'noop')

    # counters first, then the start stamp, then the per-action id lists
    record = {name: 0 for name in action_names}
    record['start'] = utcnow()
    record['records'] = {name: [] for name in action_names}

    touched = record['records']
    for json_id, item in self._prepare_imports(data_items):
        db_id, action = self.import_item(item)
        self.json_to_db_id[json_id] = db_id
        touched[action].append(db_id)
        record[action] = record[action] + 1

    # everything is loaded now, so inter-object resolution can run
    self.postimport()

    record['end'] = utcnow()
    return {self._type: record}
def do_handle(self, args, other, juris):
    """Work out which scrapers to run, execute the requested actions, report.

    ``other`` is the leftover argument list: scraper names, each optionally
    followed by ``key=value`` options that apply to the scraper named just
    before them. When ``other`` is empty every scraper available on
    ``juris`` is selected with no options.

    ``args.actions`` is replaced in place with ``ALL_ACTIONS`` when it is
    empty. The report is passed to ``save_report`` only when ``'import'``
    is one of the actions; if an action raises, the exception and traceback
    are added to the report first and the exception is re-raised.

    Returns the report dict. Raises ``CommandError`` when the jurisdiction
    has no scrapers, when an option appears before any scraper name, or
    when a name is not a scraper of the jurisdiction.
    """
    available_scrapers = getattr(juris, 'scrapers', {})
    if not available_scrapers:
        raise CommandError('no scrapers defined on jurisdiction')

    if not other:
        # nothing requested explicitly: run them all, without options
        scrapers = {key: {} for key in available_scrapers.keys()}
    else:
        # expected shape: (scraper_name (key=value)*)+
        scrapers = OrderedDict()
        current = None
        for arg in other:
            if '=' not in arg:
                if arg not in juris.scrapers:
                    raise CommandError(
                        'no such scraper: module={} scraper={}'.format(args.module, arg))
                current = arg
                scrapers[current] = {}
                continue
            if not current:
                raise CommandError('argument {} before scraper name'.format(arg))
            option, _, value = arg.partition('=')
            scrapers[current][option] = value

    # args is mutated on purpose so the defaulted actions travel with it
    if not args.actions:
        args.actions = ALL_ACTIONS

    if 'import' in args.actions:
        django.setup()

    # show the plan before doing any work
    plan = {'module': args.module, 'actions': args.actions, 'scrapers': scrapers}
    report = {'plan': plan, 'start': utils.utcnow()}
    print_report(report)

    self.check_session_list(juris)

    try:
        if 'scrape' in args.actions:
            report['scrape'] = self.do_scrape(juris, args, scrapers)
        if 'import' in args.actions:
            report['import'] = self.do_import(juris, args)
        report['success'] = True
    except Exception as exc:
        report.update(success=False, exception=exc, traceback=traceback.format_exc())
        if 'import' in args.actions:
            save_report(report, juris.jurisdiction_id)
        raise
    else:
        if 'import' in args.actions:
            save_report(report, juris.jurisdiction_id)

    print_report(report)
    return report
def save_report(report, jurisdiction):
    """Store a run report as a RunPlan row plus its scrape and import rows.

    ``report['end']`` is set to the current time as a side effect.
    ``jurisdiction`` is used as the ``jurisdiction_id`` of the RunPlan.
    One scraper row (with its per-type scraped counts) is created for each
    entry under ``report['scrape']``. One imported-objects row is created
    for each type under ``report['import']`` that has a non-zero insert,
    update or noop count.
    """
    from pupa.models import RunPlan

    # stamp the end time on the report itself
    report['end'] = utils.utcnow()

    plan_fields = {
        'jurisdiction_id': jurisdiction,
        'success': report['success'],
        'start_time': report['start'],
        'end_time': report['end'],
        'exception': report.get('exception', ''),
        'traceback': report.get('traceback', ''),
    }
    plan = RunPlan.objects.create(**plan_fields)

    for scraper_name, details in report.get('scrape', {}).items():
        # options are stored flattened as "k=v k=v ..."
        scraper_options = report['plan']['scrapers'].get(scraper_name, {})
        flattened = ' '.join(
            '{k}={v}'.format(k=k, v=v) for k, v in scraper_options.items())
        scrape_row = plan.scrapers.create(
            scraper=scraper_name,
            args=flattened,
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, count in details['objects'].items():
            scrape_row.scraped_objects.create(object_type=object_type, count=count)

    for object_type, changes in report.get('import', {}).items():
        # types that saw no activity at all are not recorded
        if not (changes['insert'] or changes['update'] or changes['noop']):
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )
def handle(self, args, other):
    """Resolve the jurisdiction, plan the scrapers, run the actions, report.

    The jurisdiction is looked up from ``args.module``. ``other`` is the
    leftover argument list: scraper names, each optionally followed by
    ``key=value`` options that apply to the scraper named just before them.
    When ``other`` is empty every scraper available on the jurisdiction is
    selected with no options.

    ``args.actions`` is replaced in place with ``ALL_ACTIONS`` when it is
    empty. When ``'import'`` is one of the actions the report is saved, and
    on success also forwarded; if an action raises, the exception and
    traceback are added to the report first and the exception is re-raised.

    Returns the report dict. Raises ``CommandError`` when the jurisdiction
    has no scrapers, when an option appears before any scraper name, or
    when a name is not a scraper of the jurisdiction.
    """
    juris = self.get_jurisdiction(args.module)

    available_scrapers = getattr(juris, 'scrapers', {})
    if not available_scrapers:
        raise CommandError('no scrapers defined on jurisdiction')

    if not other:
        # nothing requested explicitly: run them all, without options
        scrapers = {key: {} for key in available_scrapers.keys()}
    else:
        # expected shape: (scraper_name (key=value)*)+
        scrapers = OrderedDict()
        current = None
        for arg in other:
            if '=' not in arg:
                if arg not in juris.scrapers:
                    raise CommandError(
                        'no such scraper: module={} scraper={}'.format(args.module, arg))
                current = arg
                scrapers[current] = {}
                continue
            if not current:
                raise CommandError('argument {} before scraper name'.format(arg))
            option, _, value = arg.partition('=')
            scrapers[current][option] = value

    # args is mutated on purpose so the defaulted actions travel with it
    if not args.actions:
        args.actions = ALL_ACTIONS

    # show the plan before doing any work
    plan = {'module': args.module, 'actions': args.actions, 'scrapers': scrapers}
    report = {'plan': plan, 'start': utils.utcnow()}
    print_report(report)

    self.check_session_list(juris)

    try:
        if 'scrape' in args.actions:
            report['scrape'] = self.do_scrape(juris, args, scrapers)
        if 'import' in args.actions:
            report['import'] = self.do_import(juris, args)
        report['success'] = True
    except Exception as exc:
        report.update(success=False, exception=exc, traceback=traceback.format_exc())
        if 'import' in args.actions:
            save_report(report, juris.jurisdiction_id)
        raise
    else:
        # NOTE(review): forwarding is assumed to sit alongside saving under
        # the import check — confirm against the original layout
        if 'import' in args.actions:
            save_report(report, juris.jurisdiction_id)
            forward_report(report, juris.jurisdiction_id)

    print_report(report)
    return report
def do_scrape(self, **kwargs):
    """Run ``self.scrape`` and save every object it produces.

    ``kwargs`` are passed straight through to ``self.scrape``. Each value
    it yields is saved with ``self.save_object``; a value that has
    ``__iter__`` is treated as a group and its members are saved one by
    one. ``self.output_names`` is reset at the start of the run.

    Returns a record with ``'start'`` / ``'end'`` timestamps, the
    ``'skipped'`` count (``self.skipped`` if set, else 0) and, under
    ``'objects'``, the number of names in ``self.output_names`` per type.
    Raises ``ScrapeError`` when ``self.output_names`` is empty afterwards.
    """
    self.output_names = defaultdict(set)
    record = {'objects': defaultdict(int)}
    record['start'] = utils.utcnow()

    # scrape() may return None, which counts as "nothing scraped"
    for scraped in self.scrape(**kwargs) or []:
        batch = scraped if hasattr(scraped, '__iter__') else (scraped,)
        for item in batch:
            self.save_object(item)

    record['end'] = utils.utcnow()
    record['skipped'] = getattr(self, 'skipped', 0)

    if not self.output_names:
        raise ScrapeError('no objects returned from scrape')

    counts = record['objects']
    for kind, names in self.output_names.items():
        counts[kind] += len(names)
    return record
def do_scrape(self, **kwargs):
    """Run ``self.scrape``, save what it produces, and summarise the run.

    ``kwargs`` are passed straight through to ``self.scrape``. Each value
    it yields is saved with ``self.save_object``; a value that has
    ``__iter__`` is treated as a group and its members are saved one by
    one. ``self.output_names`` is reset at the start of the run.

    Returns a record with ``'start'`` / ``'end'`` timestamps, the
    ``'skipped'`` count (``self.skipped`` if set, else 0) and, under
    ``'objects'``, the number of names in ``self.output_names`` per type.
    Raises ``ScrapeError`` naming this scraper's class when
    ``self.output_names`` is empty afterwards.
    """
    self.output_names = defaultdict(set)
    record = {'objects': defaultdict(int)}
    record['start'] = utils.utcnow()

    # scrape() may return None, which counts as "nothing scraped"
    for scraped in self.scrape(**kwargs) or []:
        if not hasattr(scraped, '__iter__'):
            self.save_object(scraped)
            continue
        for member in scraped:
            self.save_object(member)

    record['end'] = utils.utcnow()
    record['skipped'] = getattr(self, 'skipped', 0)

    if not self.output_names:
        raise ScrapeError(
            'no objects returned from {} scrape'.format(self.__class__.__name__))

    per_type = record['objects']
    for kind, names in self.output_names.items():
        per_type[kind] += len(names)
    return record
def save_report(report, jurisdiction):
    """Store a run report as a RunPlan row, unless the jurisdiction is unknown.

    ``report['end']`` is set to the current time as a side effect, even
    when nothing is saved. If no Jurisdiction row with primary key
    ``jurisdiction`` exists, a warning is logged on the ``"pupa"`` logger
    and the function returns without creating anything.

    Otherwise a RunPlan is created, followed by one scraper row (with its
    per-type scraped counts) for each entry under ``report['scrape']`` and
    one imported-objects row for each type under ``report['import']`` that
    has a non-zero insert, update or noop count.
    """
    from pupa.models import RunPlan
    from opencivicdata.core.models import Jurisdiction as JurisdictionModel

    # stamp the end time on the report itself
    report['end'] = utils.utcnow()

    # A failure on the very first run leaves no jurisdiction row to attach
    # a RunPlan to, so saving is skipped until one good run has happened.
    try:
        JurisdictionModel.objects.get(pk=jurisdiction)
    except JurisdictionModel.DoesNotExist:
        logging.getLogger("pupa").warning(
            'could not save RunPlan, no successful runs of {} yet'.format(jurisdiction))
        return

    plan_fields = {
        'jurisdiction_id': jurisdiction,
        'success': report['success'],
        'start_time': report['start'],
        'end_time': report['end'],
        'exception': report.get('exception', ''),
        'traceback': report.get('traceback', ''),
    }
    plan = RunPlan.objects.create(**plan_fields)

    for scraper_name, details in report.get('scrape', {}).items():
        # options are stored flattened as "k=v k=v ..."
        scraper_options = report['plan']['scrapers'].get(scraper_name, {})
        flattened = ' '.join(
            '{k}={v}'.format(k=k, v=v) for k, v in scraper_options.items())
        scrape_row = plan.scrapers.create(
            scraper=scraper_name,
            args=flattened,
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, count in details['objects'].items():
            scrape_row.scraped_objects.create(object_type=object_type, count=count)

    for object_type, changes in report.get('import', {}).items():
        # types that saw no activity at all are not recorded
        if not (changes['insert'] or changes['update'] or changes['noop']):
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )
def save_report(report, jurisdiction):
    """Persist a run report, skipping jurisdictions that do not exist yet.

    Sets ``report['end']`` to the current time first, whether or not
    anything is saved. When no Jurisdiction row has ``jurisdiction`` as its
    primary key, a warning goes to the ``"pupa"`` logger and nothing is
    created.

    Otherwise creates the RunPlan, then a scraper row with per-type scraped
    counts for each entry under ``report['scrape']``, then an
    imported-objects row for each type under ``report['import']`` whose
    insert, update and noop counts are not all zero.
    """
    from pupa.models import RunPlan
    from opencivicdata.core.models import Jurisdiction as JurisdictionModel

    # the end time is recorded on the report itself
    report['end'] = utils.utcnow()

    # If the first run ever fails, the jurisdiction has not been created,
    # so there is nothing to hang a RunPlan on; wait for one good run.
    try:
        JurisdictionModel.objects.get(pk=jurisdiction)
    except JurisdictionModel.DoesNotExist:
        message = 'could not save RunPlan, no successful runs of {} yet'.format(
            jurisdiction)
        pupa_logger = logging.getLogger("pupa")
        pupa_logger.warning(message)
        return

    plan = RunPlan.objects.create(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )

    scrape_results = report.get('scrape', {})
    for scraper_name, details in scrape_results.items():
        configured = report['plan']['scrapers'].get(scraper_name, {})
        # scraper options are stored as a single "k=v k=v ..." string
        pairs = ['{k}={v}'.format(k=k, v=v) for k, v in configured.items()]
        scrape_row = plan.scrapers.create(
            scraper=scraper_name,
            args=' '.join(pairs),
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, count in details['objects'].items():
            scrape_row.scraped_objects.create(object_type=object_type, count=count)

    import_results = report.get('import', {})
    for object_type, changes in import_results.items():
        had_activity = changes['insert'] or changes['update'] or changes['noop']
        if had_activity:
            plan.imported_objects.create(
                object_type=object_type,
                insert_count=changes['insert'],
                update_count=changes['update'],
                noop_count=changes['noop'],
                start_time=changes['start'],
                end_time=changes['end'],
            )
def save_report(report, jurisdiction):
    """Persist a run report as a RunPlan with scrape and import details.

    Sets ``report['end']`` to the current time first. ``jurisdiction`` is
    used as the RunPlan's ``jurisdiction_id``. Creates a scraper row with
    per-type scraped counts for each entry under ``report['scrape']``, and
    an imported-objects row for each type under ``report['import']`` whose
    insert, update and noop counts are not all zero.
    """
    from pupa.models import RunPlan

    # the end time is recorded on the report itself
    report['end'] = utils.utcnow()

    plan = RunPlan.objects.create(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )

    scrape_results = report.get('scrape', {})
    for scraper_name, details in scrape_results.items():
        configured = report['plan']['scrapers'].get(scraper_name, {})
        # scraper options are stored as a single "k=v k=v ..." string
        pairs = ['{k}={v}'.format(k=k, v=v) for k, v in configured.items()]
        scrape_row = plan.scrapers.create(
            scraper=scraper_name,
            args=' '.join(pairs),
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, count in details['objects'].items():
            scrape_row.scraped_objects.create(object_type=object_type, count=count)

    import_results = report.get('import', {})
    for object_type, changes in import_results.items():
        had_activity = changes['insert'] or changes['update'] or changes['noop']
        if had_activity:
            plan.imported_objects.create(
                object_type=object_type,
                insert_count=changes['insert'],
                update_count=changes['update'],
                noop_count=changes['noop'],
                start_time=changes['start'],
                end_time=changes['end'],
            )