def command(self):
    """Download ONS data for the selected period and load it via the API.

    Option values on ``self.options`` are converted in place (``days`` to
    ``int``, dates through ``self.parse_date``, months through
    ``self.parse_month``) and ``ons_cache_dir`` falls back to
    ``ONS_DEFAULT_CACHE_PATH``. The first period option that is set, checked
    in the order flexible range, month, months-since, all-time, selects the
    ``OnsData`` download call. The resulting files go through
    ``OnsImporter`` into ``OnsLoader`` and the ``StatsList`` report is
    logged.

    Anything raised along the way is logged with a traceback and re-raised.
    """
    from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
    from ckanext.dgu.ons.importer import OnsImporter
    from ckanext.dgu.ons.loader import OnsLoader

    ApiCommand.command(self)
    log = logging.getLogger(__name__)
    try:
        opts = self.options

        # Convert raw option values; options that are unset stay untouched.
        if opts.days:
            opts.days = int(opts.days)
        for date_attr in ('start_date', 'end_date'):
            raw_date = getattr(opts, date_attr)
            if raw_date:
                setattr(opts, date_attr, self.parse_date(raw_date))
        for month_attr in ('month', 'months_since'):
            raw_month = getattr(opts, month_attr)
            if raw_month:
                setattr(opts, month_attr, self.parse_month(raw_month))
        if not opts.ons_cache_dir:
            opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        # Pick the download strategy from the first period option given.
        if opts.days or opts.start_date or opts.end_date:
            flexible_args = {
                'days': opts.days,
                'start_date': opts.start_date,
                'end_date': opts.end_date,
                'ons_cache_dir': opts.ons_cache_dir,
            }
            data_filepaths = OnsData.download_flexible(**flexible_args)
        elif opts.month:
            data_filepaths = OnsData.download_month(
                year=opts.month.year,
                month=opts.month.month)
        elif opts.months_since:
            data_filepaths = OnsData.download_months_since(
                year=opts.months_since.year,
                month=opts.months_since.month,
                force_download=opts.force_download)
        elif opts.all_time:
            data_filepaths = OnsData.download_all(
                force_download=opts.force_download)
        else:
            # NOTE(review): parser.error is presumably expected not to
            # return; otherwise data_filepaths is unbound below - confirm.
            self.parser.error('Please specify a time period')

        # Optional restriction of the import to a single publisher.
        filter_ = {'publisher': opts.publisher} if opts.publisher else {}

        stats = StatsList()
        importer = OnsImporter(
            filepaths=data_filepaths,
            ckanclient=self.client,
            stats=stats,
            filter_=filter_)
        OnsLoader(self.client, stats).load_packages(importer.pkg_dict())
        log.info('Summary:\n' + stats.report())
    except:
        # Deliberately catch-all: any problem gets logged, then propagates.
        log.exception('ONS Loader exception')
        raise
def command(self):
    """Download ONS data for the requested period and load it.

    Runs the ``ApiCommand`` and ``XmlRpcCommand`` setup, converts the
    ``days`` / ``start_date`` / ``end_date`` options in place, defaults
    ``ons_cache_dir`` to ``ONS_DEFAULT_CACHE_PATH`` and then downloads
    either a flexible date range or everything (``all_time``). The
    downloaded files are imported with ``OnsImporter`` and loaded with
    ``OnsLoader``.
    """
    ApiCommand.command(self)
    XmlRpcCommand.command(self)

    opts = self.options

    # Convert the raw option values; unset options are left as they are.
    if opts.days:
        opts.days = int(opts.days)
    for date_attr in ('start_date', 'end_date'):
        raw_date = getattr(opts, date_attr)
        if raw_date:
            setattr(opts, date_attr, self.parse_date(raw_date))
    if not opts.ons_cache_dir:
        opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

    wants_range = opts.days or opts.start_date or opts.end_date
    if wants_range:
        range_args = {
            'days': opts.days,
            'start_date': opts.start_date,
            'end_date': opts.end_date,
            'ons_cache_dir': opts.ons_cache_dir,
        }
        data_filepaths = OnsData.download_flexible(**range_args)
    elif opts.all_time:
        data_filepaths = OnsData.download_all()
    else:
        # NOTE(review): parser.error is presumably expected not to return;
        # otherwise data_filepaths is unbound below - confirm.
        self.parser.error('Please specify a time period')

    importer = OnsImporter(
        filepaths=data_filepaths,
        xmlrpc_settings=self.xmlrpc_settings)
    OnsLoader(self.client).load_packages(importer.pkg_dict())
def command(self):
    """Download ONS data for the requested period and load it.

    Runs the ``ApiCommand`` and ``XmlRpcCommand`` setup, converts the
    ``days`` / ``start_date`` / ``end_date`` options in place and then
    downloads either a flexible date range or everything (``all_time``).
    The downloaded files are imported with ``OnsImporter`` and loaded with
    ``OnsLoader``.

    When no ``ons_cache_dir`` option is given it is defaulted to
    ``ONS_DEFAULT_CACHE_PATH`` so that an empty value is never forwarded to
    ``OnsData.download_flexible``.
    """
    # Function-scope import so the default cache path is always in scope.
    from ckanext.dgu.ons.downloader import ONS_DEFAULT_CACHE_PATH

    ApiCommand.command(self)
    XmlRpcCommand.command(self)

    options = self.options

    # Convert the raw option values; unset options are left as they are.
    if options.days:
        options.days = int(options.days)
    if options.start_date:
        options.start_date = self.parse_date(options.start_date)
    if options.end_date:
        options.end_date = self.parse_date(options.end_date)

    # Without this an unset option would be passed on explicitly as the
    # cache directory for the flexible download.
    if not options.ons_cache_dir:
        options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

    if options.days or options.start_date or options.end_date:
        data_filepaths = OnsData.download_flexible(
            days=options.days,
            start_date=options.start_date,
            end_date=options.end_date,
            ons_cache_dir=options.ons_cache_dir,
        )
    elif options.all_time:
        data_filepaths = OnsData.download_all()
    else:
        # NOTE(review): parser.error is presumably expected not to return;
        # otherwise data_filepaths is unbound below - confirm.
        self.parser.error("Please specify a time period")

    importer = OnsImporter(
        filepaths=data_filepaths,
        xmlrpc_settings=self.xmlrpc_settings,
    )
    loader = OnsLoader(self.client)
    loader.load_packages(importer.pkg_dict())
def command(self):
    """Download ONS data for the selected period and load it via the API.

    Option values on ``self.options`` are converted in place (``days`` to
    ``int``, dates through ``self.parse_date``, months through
    ``self.parse_month``) and ``ons_cache_dir`` falls back to
    ``ONS_DEFAULT_CACHE_PATH``. The first period option that is set, checked
    in the order flexible range, month, months-since, all-time, selects the
    ``OnsData`` download call; the files are then imported with
    ``OnsImporter`` and loaded with ``OnsLoader``.
    """
    from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
    from ckanext.dgu.ons.importer import OnsImporter
    from ckanext.dgu.ons.loader import OnsLoader

    ApiCommand.command(self)

    opts = self.options

    # Each entry is (option name, converter); the lambdas keep the lookup
    # of the parsing method lazy, so it only happens for options in use.
    conversions = (
        ('days', lambda raw: int(raw)),
        ('start_date', lambda raw: self.parse_date(raw)),
        ('end_date', lambda raw: self.parse_date(raw)),
        ('month', lambda raw: self.parse_month(raw)),
        ('months_since', lambda raw: self.parse_month(raw)),
    )
    for option_name, convert in conversions:
        raw_value = getattr(opts, option_name)
        if raw_value:
            setattr(opts, option_name, convert(raw_value))

    if not opts.ons_cache_dir:
        opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

    # Pick the download strategy from the first period option given.
    if opts.days or opts.start_date or opts.end_date:
        data_filepaths = OnsData.download_flexible(
            days=opts.days,
            start_date=opts.start_date,
            end_date=opts.end_date,
            ons_cache_dir=opts.ons_cache_dir)
    elif opts.month:
        chosen_month = opts.month
        data_filepaths = OnsData.download_month(
            year=chosen_month.year,
            month=chosen_month.month)
    elif opts.months_since:
        first_month = opts.months_since
        data_filepaths = OnsData.download_months_since(
            year=first_month.year,
            month=first_month.month,
            force_download=opts.force_download)
    elif opts.all_time:
        data_filepaths = OnsData.download_all(
            force_download=opts.force_download)
    else:
        # NOTE(review): parser.error is presumably expected not to return;
        # otherwise data_filepaths is unbound below - confirm.
        self.parser.error('Please specify a time period')

    importer = OnsImporter(filepaths=data_filepaths, ckanclient=self.client)
    OnsLoader(self.client).load_packages(importer.pkg_dict())
def add_missing_publisher(self):
    """Give ONSHUB datasets that have no publisher one, from their notes.

    Searches for active ONSHUB packages without groups (at most 100 rows),
    re-fetches each through ``self.loader._get_package`` and looks for a
    ``Source agency: ...`` line in the package notes. The agency is mapped
    to a publisher with ``OnsImporter._source_to_publisher_`` and written
    to ``pkg['groups']``; unless ``self.dry_run`` is set the package is
    saved with ``self.client.package_entity_put``. Every outcome is
    recorded in a ``StatsList`` whose report is printed at the end.
    """
    stats = StatsList()

    search_params = {
        'q': 'external_reference:ONSHUB !groups:["" TO *]',
        'sort': 'name asc',
        'fq': ' +site_id:"dgu" +state:active',
        'wt': 'json',
        'rows': 100,
        'escape_q': False,
    }
    res = self.client.action('package_search', **search_params)
    log.info('ONSHUB datasets missing publisher: %i', res['count'])

    source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

    for search_hit in res['results']:
        # The solr result is not the right sort of package dictionary,
        # so fetch the real one through the API.
        pkg = self.loader._get_package(search_hit['name'])

        if pkg['groups']:
            log.error(stats.add('Package had a publisher', pkg['name']))
            continue

        agency_match = source_agency_re.search(pkg['notes'])
        if not agency_match:
            log.error(
                stats.add('Could not match source agency', pkg['name']))
            continue

        # Map the agency named in the notes onto a publisher.
        source_agency = agency_match.group(1)
        publisher_name = OnsImporter._source_to_publisher_(
            source_agency, self.client)
        if not publisher_name:
            unmapped_msg = 'Could not map source agency %s' % source_agency
            log.error(stats.add(unmapped_msg, pkg['name']))
            continue

        pkg['groups'] = [publisher_name]

        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status,
                           self.client.last_message))
                write_error_msg = ('Error writing to publisher over API %s'
                                   % self.client.last_status)
                stats.add(write_error_msg, pkg['name'])
                continue

        stats.add('Added publisher %s' % publisher_name, pkg['name'])

    # A parenthesised single value prints the same text as the statement
    # form of print.
    print(stats.report())
    if self.dry_run:
        print('NB: No packages changed - dry run.')
def command(self):
    """Download ONS data for the requested period and load it.

    Runs the ``ApiCommand`` and ``XmlRpcCommand`` setup, converts the
    ``days`` / ``start_date`` / ``end_date`` options in place, defaults
    ``ons_cache_dir`` to ``ONS_DEFAULT_CACHE_PATH`` and then downloads
    either a flexible date range or everything (``all_time``). The
    downloaded files are imported with ``OnsImporter`` and loaded with
    ``OnsLoader``.
    """
    from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
    from ckanext.dgu.ons.importer import OnsImporter
    from ckanext.dgu.ons.loader import OnsLoader

    ApiCommand.command(self)
    XmlRpcCommand.command(self)

    options = self.options

    # Each entry is (option name, converter); the lambdas keep the lookup
    # of the parsing method lazy, so it only happens for options in use.
    conversions = (
        ('days', lambda raw: int(raw)),
        ('start_date', lambda raw: self.parse_date(raw)),
        ('end_date', lambda raw: self.parse_date(raw)),
    )
    for option_name, convert in conversions:
        raw_value = getattr(options, option_name)
        if raw_value:
            setattr(options, option_name, convert(raw_value))

    if not options.ons_cache_dir:
        options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

    if options.days or options.start_date or options.end_date:
        data_filepaths = OnsData.download_flexible(
            days=options.days,
            start_date=options.start_date,
            end_date=options.end_date,
            ons_cache_dir=options.ons_cache_dir)
    elif options.all_time:
        data_filepaths = OnsData.download_all()
    else:
        # NOTE(review): parser.error is presumably expected not to return;
        # otherwise data_filepaths is unbound below - confirm.
        self.parser.error('Please specify a time period')

    package_source = OnsImporter(
        filepaths=data_filepaths,
        xmlrpc_settings=self.xmlrpc_settings)
    package_loader = OnsLoader(self.client)
    package_loader.load_packages(package_source.pkg_dict())
def add_missing_publisher(self):
    """Assign a publisher to ONSHUB datasets that currently have none.

    Runs a ``package_search`` for active ONSHUB packages without groups
    (at most 100 rows). Each hit is re-read with
    ``self.loader._get_package``; the ``Source agency: ...`` line of its
    notes is mapped to a publisher by
    ``OnsImporter._source_to_publisher_`` and stored in ``pkg['groups']``.
    The package is written back with ``self.client.package_entity_put``
    unless ``self.dry_run`` is set. Outcomes are collected in a
    ``StatsList`` and its report is printed when the loop finishes.
    """
    stats = StatsList()

    def log_failure(reason, dataset):
        # Record the failure category in the stats and log the same line.
        log.error(stats.add(reason, dataset['name']))

    res = self.client.action(
        'package_search',
        q='external_reference:ONSHUB !groups:["" TO *]',
        sort='name asc',
        fq=' +site_id:"dgu" +state:active',
        wt='json',
        rows=100,
        escape_q=False,
    )
    log.info('ONSHUB datasets missing publisher: %i', res['count'])
    agency_pattern = re.compile('^Source agency: (.*)$', re.MULTILINE)

    for solr_pkg in res['results']:
        # The solr data_dict is not the correct sort of package dictionary
        # so get the package via the API instead.
        pkg = self.loader._get_package(solr_pkg['name'])

        if pkg['groups']:
            log_failure('Package had a publisher', pkg)
            continue

        found = agency_pattern.search(pkg['notes'])
        if not found:
            log_failure('Could not match source agency', pkg)
            continue

        # Find the equivalent publisher for the agency in the notes.
        source_agency = found.groups()[0]
        publisher_name = OnsImporter._source_to_publisher_(source_agency,
                                                           self.client)
        if not publisher_name:
            log_failure('Could not map source agency %s' % source_agency,
                        pkg)
            continue

        pkg['groups'] = [publisher_name]

        if self.dry_run:
            saved = True
        else:
            try:
                self.client.package_entity_put(pkg)
                saved = True
            except CkanApiError:
                api_status = self.client.last_status
                log.error('Error (%s) adding publisher over API: %s' %
                          (api_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' %
                          self.client.last_status, pkg['name'])
                saved = False

        if saved:
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

    # A parenthesised single value prints the same text as the statement
    # form of print.
    print(stats.report())
    if self.dry_run:
        print('NB: No packages changed - dry run.')
def command(ckan_api_url):
    """Check that every known ONS source agency maps to a publisher.

    Each non-blank line of the pasted source list, apart from the section
    headings in ``pasted_lines_to_ignore``, is passed to
    ``OnsImporter._source_to_publisher_``. Names for which no publisher is
    returned are logged as errors, and a final summary reports how many of
    the checked names failed.

    :param ckan_api_url: base location handed to ``CkanClient``.
    """
    from ckanext.dgu.ons.importer import OnsImporter

    # sources pasted here from
    # http://www.statistics.gov.uk/hub/statistics-producers/index.html
    sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
Health
Health and Safety Executive
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
HM Revenue and Customs
HM Treasury
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Public Health England
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Other statistics producers
Civil Aviation Authority
Child Maintenance and Enforcement Commission
Health and Social Care Information Centre
Higher Education Statistics Agency
Independent Police Complaints Commission
NHS England
Scottish Consortium for Learning Disability
International statistics organisations
Eurostat
'''
    # These are extra sources seen in the past ONS data, picked up from
    # the ons_merge_duplicates tool:
    sources += '''
Cancer Registry Northern Ireland
Welsh Assembly Government
'''
    pasted_lines_to_ignore = ('Government Statistical Departments',
                              'Other statistics producers',
                              'International statistics organisations',
                              )
    ckanclient = CkanClient(base_location=ckan_api_url)

    num_errors = 0
    num_checked = 0
    for line in sources.split('\n'):
        # Strip once and use the stripped name everywhere, so a heading
        # with stray surrounding whitespace is still recognised.
        source = line.strip()
        if not source or source in pasted_lines_to_ignore:
            continue
        num_checked += 1
        publisher = OnsImporter._source_to_publisher_(source, ckanclient)
        if not publisher:
            log.error('Publisher not found: %s', source)
            num_errors += 1

    # Report only the names that were really looked up, not the blank and
    # heading lines that were skipped.
    log.info('Completed with %i errors from %i sources',
             num_errors, num_checked)
def command(self):
    """Fetch ONS data for the chosen period and load the packages.

    The period options on ``self.options`` are converted in place
    (``days`` with ``int``, dates with ``self.parse_date``, months with
    ``self.parse_month``) and a missing ``ons_cache_dir`` is replaced by
    ``ONS_DEFAULT_CACHE_PATH``. Exactly one ``OnsData`` download call is
    made, chosen by the first period option set: flexible range, month,
    months-since, then all-time. The files are imported by ``OnsImporter``
    (optionally filtered by publisher), loaded by ``OnsLoader`` and the
    ``StatsList`` summary is logged.

    Whatever goes wrong is logged with its traceback and then re-raised.
    """
    from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
    from ckanext.dgu.ons.importer import OnsImporter
    from ckanext.dgu.ons.loader import OnsLoader

    ApiCommand.command(self)
    log = logging.getLogger(__name__)
    try:
        options = self.options

        # Each entry is (option name, converter); the lambdas keep the
        # lookup of the parsing method lazy, so it only happens for
        # options that are actually set.
        conversions = (
            ('days', lambda raw: int(raw)),
            ('start_date', lambda raw: self.parse_date(raw)),
            ('end_date', lambda raw: self.parse_date(raw)),
            ('month', lambda raw: self.parse_month(raw)),
            ('months_since', lambda raw: self.parse_month(raw)),
        )
        for option_name, convert in conversions:
            raw_value = getattr(options, option_name)
            if raw_value:
                setattr(options, option_name, convert(raw_value))

        if not options.ons_cache_dir:
            options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        if options.days or options.start_date or options.end_date:
            data_filepaths = OnsData.download_flexible(
                days=options.days,
                start_date=options.start_date,
                end_date=options.end_date,
                ons_cache_dir=options.ons_cache_dir)
        elif options.month:
            chosen_month = options.month
            data_filepaths = OnsData.download_month(
                year=chosen_month.year,
                month=chosen_month.month)
        elif options.months_since:
            first_month = options.months_since
            data_filepaths = OnsData.download_months_since(
                year=first_month.year,
                month=first_month.month,
                force_download=options.force_download)
        elif options.all_time:
            data_filepaths = OnsData.download_all(
                force_download=options.force_download)
        else:
            # NOTE(review): parser.error is presumably expected not to
            # return; otherwise data_filepaths is unbound below - confirm.
            self.parser.error('Please specify a time period')

        # Only restrict the import when a publisher option was supplied.
        filter_ = {}
        if options.publisher:
            filter_.update(publisher=options.publisher)

        stats = StatsList()
        importer_kwargs = {
            'filepaths': data_filepaths,
            'ckanclient': self.client,
            'stats': stats,
            'filter_': filter_,
        }
        importer = OnsImporter(**importer_kwargs)
        loader = OnsLoader(self.client, stats)
        loader.load_packages(importer.pkg_dict())
        log.info('Summary:\n' + stats.report())
    except:
        # Deliberately catch-all: any problem gets logged, then propagates.
        log.exception('ONS Loader exception')
        raise
def command(ckan_api_url):
    """Check that every known ONS source agency maps to a publisher.

    Each non-blank line of the pasted source list, apart from the section
    headings in ``pasted_lines_to_ignore``, is passed to
    ``OnsImporter._source_to_publisher_``. Names for which no publisher is
    returned are logged as errors, and a final summary reports how many of
    the checked names failed.

    :param ckan_api_url: base location handed to ``CkanClient``.
    """
    from ckanext.dgu.ons.importer import OnsImporter

    # sources pasted here from
    # http://www.statistics.gov.uk/hub/statistics-producers/index.html
    sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
Health
Health and Safety Executive
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
HM Revenue and Customs
HM Treasury
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Public Health England
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Cancer Registry (Northern Ireland)
Civil Aviation Authority
Child Maintenance and Enforcement Commission
Health and Social Care Information Centre
Higher Education Statistics Agency
Independent Police Complaints Commission
NHS England
Scottish Consortium for Learning Disability
Student Loans Company
Eurostat
'''
    # These are extra sources seen in the past ONS data, picked up from
    # the ons_merge_duplicates tool:
    sources += '''
Cancer Registry Northern Ireland
Welsh Assembly Government
'''
    pasted_lines_to_ignore = (
        'Government Statistical Departments',
        'Other statistics producers',
        'International statistics organisations',
    )
    ckanclient = CkanClient(base_location=ckan_api_url)

    num_errors = 0
    num_checked = 0
    for line in sources.split('\n'):
        # Strip once and use the stripped name everywhere, so a heading
        # with stray surrounding whitespace is still recognised.
        source = line.strip()
        if not source or source in pasted_lines_to_ignore:
            continue
        num_checked += 1
        publisher = OnsImporter._source_to_publisher_(source, ckanclient)
        if not publisher:
            log.error('Publisher not found: %s', source)
            num_errors += 1

    # Report only the names that were really looked up, not the blank and
    # heading lines that were skipped.
    log.info('Completed with %i errors from %i sources',
             num_errors, num_checked)
def command(self):
    """Verify that each listed ONS source agency maps to a publisher.

    Loads the configuration, then passes every non-blank line of the
    pasted source list to ``OnsImporter._source_to_publisher``.

    :raises AssertionError: carrying the source name, for the first source
        that has no publisher.
    """
    self._load_config()
    import logging
    log = logging.getLogger(__name__)
    from ckanext.dgu.ons.importer import OnsImporter

    # sources pasted here from
    # http://www.statistics.gov.uk/hub/statistics-producers/index.html
    sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Child Maintenance and Enforcement Commission
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
HM Revenue and Customs
HM Treasury
Health
Health and Safety Executive
Health and Social Care Information Centre
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Civil Aviation Authority
Higher Education Statistics Agency
Eurostat
'''
    # Iterating the string itself would yield single characters, so split
    # it into lines and drop the blank ones first.
    source_names = [line.strip()
                    for line in sources.split('\n')
                    if line.strip()]

    for source in source_names:
        publisher = OnsImporter._source_to_publisher(source)
        if not publisher:
            # Raised explicitly (rather than via ``assert``) so the check
            # is still performed when Python runs with -O.
            raise AssertionError(source)

    log.info('Completed successfully for %i sources', len(source_names))