示例#1
0
    def command(self):
        """Download ONS data for the requested period and load it into CKAN.

        ``ApiCommand.command`` is invoked first. The time period is then
        taken from ``self.options``, in this order of precedence:

        * ``days`` / ``start_date`` / ``end_date`` ->
          ``OnsData.download_flexible``
        * ``month`` -> ``OnsData.download_month``
        * ``months_since`` -> ``OnsData.download_months_since``
        * ``all_time`` -> ``OnsData.download_all``

        If none of these is set, ``self.parser.error`` is called. When the
        ``publisher`` option is set it is handed to the importer as a filter.
        A summary from the shared ``StatsList`` is logged at the end.

        Raises:
            Exception: any error raised while parsing options, downloading,
                importing or loading is logged with its traceback and then
                re-raised unchanged.
        """
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)
        log = logging.getLogger(__name__)

        try:
            # Convert the raw option values, but only those actually given.
            if self.options.days:
                self.options.days = int(self.options.days)
            if self.options.start_date:
                self.options.start_date = self.parse_date(self.options.start_date)
            if self.options.end_date:
                self.options.end_date = self.parse_date(self.options.end_date)
            if self.options.month:
                self.options.month = self.parse_month(self.options.month)
            if self.options.months_since:
                self.options.months_since = self.parse_month(self.options.months_since)
            if not self.options.ons_cache_dir:
                self.options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

            if self.options.days or \
                self.options.start_date or \
                self.options.end_date:
                data_filepaths = OnsData.download_flexible(
                    days=self.options.days,
                    start_date=self.options.start_date,
                    end_date=self.options.end_date,
                    ons_cache_dir=self.options.ons_cache_dir)

            elif self.options.month:
                data_filepaths = OnsData.download_month(year=self.options.month.year,
                                                        month=self.options.month.month)
            elif self.options.months_since:
                data_filepaths = OnsData.download_months_since(
                    year=self.options.months_since.year,
                    month=self.options.months_since.month,
                    force_download=self.options.force_download)
            elif self.options.all_time:
                data_filepaths = OnsData.download_all(force_download=self.options.force_download)
            else:
                self.parser.error('Please specify a time period')

            filter_ = {}
            if self.options.publisher:
                filter_['publisher'] = self.options.publisher

            # The same StatsList is shared by importer and loader so that the
            # final report covers both stages.
            stats = StatsList()
            importer = OnsImporter(filepaths=data_filepaths,
                                   ckanclient=self.client, stats=stats,
                                   filter_=filter_)
            loader = OnsLoader(self.client, stats)

            loader.load_packages(importer.pkg_dict())
            log.info('Summary:\n%s', stats.report())
        except Exception:
            # Any problem, make sure it gets logged. ``Exception`` rather than
            # a bare ``except`` so that SystemExit (e.g. from an optparse-style
            # ``parser.error``) and KeyboardInterrupt propagate without being
            # reported as loader failures.
            log.exception('ONS Loader exception')
            raise
示例#2
0
    def command(self):
        """Fetch ONS data files for the chosen period and load the packages.

        Runs ``ApiCommand.command`` and ``XmlRpcCommand.command`` first. The
        period comes from ``self.options``: any of ``days``, ``start_date``
        or ``end_date`` selects ``OnsData.download_flexible``; otherwise
        ``all_time`` selects ``OnsData.download_all``; with none of them set,
        ``self.parser.error`` is called.
        """
        ApiCommand.command(self)
        XmlRpcCommand.command(self)

        opts = self.options

        # Normalise the supplied option values in place.
        if opts.days:
            opts.days = int(opts.days)
        for date_option in ('start_date', 'end_date'):
            raw_date = getattr(opts, date_option)
            if raw_date:
                setattr(opts, date_option, self.parse_date(raw_date))
        if not opts.ons_cache_dir:
            opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        wants_range = opts.days or opts.start_date or opts.end_date
        if wants_range:
            data_filepaths = OnsData.download_flexible(
                days=opts.days,
                start_date=opts.start_date,
                end_date=opts.end_date,
                ons_cache_dir=opts.ons_cache_dir)
        elif opts.all_time:
            data_filepaths = OnsData.download_all()
        else:
            self.parser.error('Please specify a time period')

        package_importer = OnsImporter(filepaths=data_filepaths,
                                       xmlrpc_settings=self.xmlrpc_settings)
        OnsLoader(self.client).load_packages(package_importer.pkg_dict())
示例#3
0
    def command(self):
        """Load ONS data for the period selected on the command line.

        After the ``ApiCommand`` and ``XmlRpcCommand`` base steps, the
        ``days``, ``start_date`` and ``end_date`` options are converted (when
        supplied) and used for a flexible download. With none of them set,
        ``all_time`` triggers a full download; otherwise
        ``self.parser.error`` is called.
        """
        ApiCommand.command(self)
        XmlRpcCommand.command(self)

        opts = self.options

        # Only convert values that were actually supplied.
        if opts.days:
            opts.days = int(opts.days)
        if opts.start_date:
            opts.start_date = self.parse_date(opts.start_date)
        if opts.end_date:
            opts.end_date = self.parse_date(opts.end_date)

        if not (opts.days or opts.start_date or opts.end_date):
            # No explicit range: fall back to "everything" or complain.
            if opts.all_time:
                data_filepaths = OnsData.download_all()
            else:
                self.parser.error("Please specify a time period")
        else:
            data_filepaths = OnsData.download_flexible(
                days=opts.days,
                start_date=opts.start_date,
                end_date=opts.end_date,
                ons_cache_dir=opts.ons_cache_dir,
            )

        pkg_importer = OnsImporter(
            filepaths=data_filepaths,
            xmlrpc_settings=self.xmlrpc_settings,
        )
        pkg_loader = OnsLoader(self.client)
        pkg_loader.load_packages(pkg_importer.pkg_dict())
示例#4
0
    def command(self):
        """Download ONS data for the requested period and load it via the API.

        The period is chosen from ``self.options`` in this order: a flexible
        range (``days`` / ``start_date`` / ``end_date``), a single ``month``,
        ``months_since``, then ``all_time``. If none is set,
        ``self.parser.error`` is called.
        """
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)

        opts = self.options

        if opts.days:
            opts.days = int(opts.days)
        # (option name, name of the method on self that parses it)
        parsed_options = (
            ('start_date', 'parse_date'),
            ('end_date', 'parse_date'),
            ('month', 'parse_month'),
            ('months_since', 'parse_month'),
        )
        for option_name, parser_name in parsed_options:
            raw_value = getattr(opts, option_name)
            if raw_value:
                setattr(opts, option_name, getattr(self, parser_name)(raw_value))
        if not opts.ons_cache_dir:
            opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        flexible_range = opts.days or opts.start_date or opts.end_date
        if flexible_range:
            data_filepaths = OnsData.download_flexible(
                days=opts.days,
                start_date=opts.start_date,
                end_date=opts.end_date,
                ons_cache_dir=opts.ons_cache_dir)
        elif opts.month:
            chosen_month = opts.month
            data_filepaths = OnsData.download_month(year=chosen_month.year,
                                                    month=chosen_month.month)
        elif opts.months_since:
            first_month = opts.months_since
            data_filepaths = OnsData.download_months_since(
                year=first_month.year,
                month=first_month.month,
                force_download=opts.force_download)
        elif opts.all_time:
            data_filepaths = OnsData.download_all(
                force_download=opts.force_download)
        else:
            self.parser.error('Please specify a time period')

        pkg_importer = OnsImporter(filepaths=data_filepaths,
                                   ckanclient=self.client)
        OnsLoader(self.client).load_packages(pkg_importer.pkg_dict())
示例#5
0
    def add_missing_publisher(self):
        """Assign a publisher to ONSHUB datasets that have no group.

        Searches for active ONSHUB packages without groups, reads the
        "Source agency: ..." line from each package's notes, maps it to a
        publisher with ``OnsImporter._source_to_publisher_`` and writes the
        package back with that publisher as its only group. Nothing is
        written when ``self.dry_run`` is set. Every outcome is recorded in a
        ``StatsList`` whose report is printed at the end.

        NOTE(review): the search asks for ``rows=100`` while the logged count
        is ``res['count']``, so presumably at most 100 datasets are handled
        per run - confirm whether repeated runs are intended.
        """
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='external_reference:ONSHUB !groups:["" TO *]',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile(r'^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            # Notes may be None/empty; searching None would raise TypeError,
            # so treat it as "no source agency line" instead.
            match = source_agency_re.search(pkg['notes'] or '')
            if not match:
                log.error(
                    stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.group(1)
            publisher_name = OnsImporter._source_to_publisher_(
                source_agency, self.client)
            if not publisher_name:
                log.error(
                    stats.add('Could not map source agency %s' % source_agency,
                              pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
示例#6
0
    def command(self):
        """Download ONS data for the selected period and load its packages.

        Runs the ``ApiCommand`` and ``XmlRpcCommand`` base steps, converts the
        ``days`` / ``start_date`` / ``end_date`` options that were supplied
        and defaults the cache directory. A flexible download is used when
        any of those three is set, a full download when ``all_time`` is set,
        and ``self.parser.error`` is called otherwise.
        """
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)
        XmlRpcCommand.command(self)

        opts = self.options

        if opts.days:
            opts.days = int(opts.days)
        if opts.start_date:
            opts.start_date = self.parse_date(opts.start_date)
        if opts.end_date:
            opts.end_date = self.parse_date(opts.end_date)
        if not opts.ons_cache_dir:
            opts.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        if any((opts.days, opts.start_date, opts.end_date)):
            flexible_kwargs = dict(
                days=opts.days,
                start_date=opts.start_date,
                end_date=opts.end_date,
                ons_cache_dir=opts.ons_cache_dir,
            )
            data_filepaths = OnsData.download_flexible(**flexible_kwargs)
        elif opts.all_time:
            data_filepaths = OnsData.download_all()
        else:
            self.parser.error('Please specify a time period')

        ons_importer = OnsImporter(filepaths=data_filepaths,
                                   xmlrpc_settings=self.xmlrpc_settings)
        ons_loader = OnsLoader(self.client)
        packages = ons_importer.pkg_dict()
        ons_loader.load_packages(packages)
    def add_missing_publisher(self):
        """Give ONSHUB datasets that lack a publisher the matching one.

        For every package returned by the search (active ONSHUB packages with
        no groups), the full package is fetched through ``self.loader``, the
        "Source agency: ..." line is extracted from its notes and mapped to a
        publisher by ``OnsImporter._source_to_publisher_``. The package is
        then saved with that publisher as its only group, unless
        ``self.dry_run`` is set. Each outcome is recorded in a ``StatsList``
        and the report is printed at the end.

        NOTE(review): only ``rows=100`` results are requested although the
        logged total is ``res['count']`` - confirm that handling at most 100
        datasets per run is intended.
        """
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='external_reference:ONSHUB !groups:["" TO *]',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile(r'^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            # A package whose notes are None would make search() raise
            # TypeError; fall back to an empty string so it is reported as
            # "Could not match source agency" like any other non-match.
            match = source_agency_re.search(pkg['notes'] or '')
            if not match:
                log.error(stats.add('Could not match source agency',
                                    pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.group(1)
            publisher_name = OnsImporter._source_to_publisher_(source_agency,
                                                               self.client)
            if not publisher_name:
                log.error(stats.add(
                    'Could not map source agency %s' % source_agency,
                    pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add('Error writing to publisher over API %s' %
                              self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        # print(...) with one argument is valid on both Python 2 and 3.
        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
def command(ckan_api_url):
    """Check that every known ONS source name maps to a CKAN publisher.

    Each non-blank line of the pasted source list (minus the section
    headings in ``pasted_lines_to_ignore``) is passed to
    ``OnsImporter._source_to_publisher_`` together with a ``CkanClient``
    built for ``ckan_api_url``. Names that yield no publisher are logged as
    errors, and a final summary logs the error count against the number of
    names actually checked.

    Args:
        ckan_api_url: passed to ``CkanClient`` as ``base_location``.
    """
    from ckanext.dgu.ons.importer import OnsImporter
    # sources pasted here from http://www.statistics.gov.uk/hub/statistics-producers/index.html
    sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
Health
Health and Safety Executive
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
HM Revenue and Customs
HM Treasury
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Public Health England
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Other statistics producers
Civil Aviation Authority
Child Maintenance and Enforcement Commission
Health and Social Care Information Centre
Higher Education Statistics Agency
Independent Police Complaints Commission
NHS England
Scottish Consortium for Learning Disability
International statistics organisations
Eurostat
'''
    # These are extra sources seen in the past ONS data, picked up from
    # the ons_merge_duplicates tool:
    sources += '''
Cancer Registry Northern Ireland
Welsh Assembly Government
        '''
    pasted_lines_to_ignore = ('Government Statistical Departments',
                              'Other statistics producers',
                              'International statistics organisations',
                              )
    ckanclient = CkanClient(base_location=ckan_api_url)

    # Strip every pasted line once, then drop blanks and section headings, so
    # that the ignore test and the final count both work on clean names.
    stripped_lines = [line.strip() for line in sources.split('\n')]
    source_names = [name for name in stripped_lines
                    if name and name not in pasted_lines_to_ignore]

    num_errors = 0
    for source in source_names:
        publisher = OnsImporter._source_to_publisher_(source, ckanclient)
        if not publisher:
            log.error('Publisher not found: %s', source)
            num_errors += 1
    # Report the number of names checked, not the raw line count (which would
    # include blank lines and ignored headings).
    log.info('Completed with %i errors from %i sources', num_errors,
             len(source_names))
示例#9
0
    def command(self):
        """Download ONS data for the requested period and load it into CKAN.

        After ``ApiCommand.command`` has run, the supplied options are
        converted (``days`` to int, dates via ``self.parse_date``, months via
        ``self.parse_month``) and the cache directory defaults to
        ``ONS_DEFAULT_CACHE_PATH``. The download method is chosen in this
        order of precedence:

        * ``days`` / ``start_date`` / ``end_date`` ->
          ``OnsData.download_flexible``
        * ``month`` -> ``OnsData.download_month``
        * ``months_since`` -> ``OnsData.download_months_since``
        * ``all_time`` -> ``OnsData.download_all``

        With none of these set, ``self.parser.error`` is called. The
        ``publisher`` option, when set, is passed to the importer as a
        filter. A summary of the shared ``StatsList`` is logged at the end.

        Raises:
            Exception: any error raised during the run is logged with its
                traceback and then re-raised unchanged.
        """
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)
        log = logging.getLogger(__name__)

        try:
            if self.options.days:
                self.options.days = int(self.options.days)
            if self.options.start_date:
                self.options.start_date = self.parse_date(
                    self.options.start_date)
            if self.options.end_date:
                self.options.end_date = self.parse_date(self.options.end_date)
            if self.options.month:
                self.options.month = self.parse_month(self.options.month)
            if self.options.months_since:
                self.options.months_since = self.parse_month(
                    self.options.months_since)
            if not self.options.ons_cache_dir:
                self.options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

            if self.options.days or \
                self.options.start_date or \
                self.options.end_date:
                data_filepaths = OnsData.download_flexible(
                    days=self.options.days,
                    start_date=self.options.start_date,
                    end_date=self.options.end_date,
                    ons_cache_dir=self.options.ons_cache_dir)

            elif self.options.month:
                data_filepaths = OnsData.download_month(
                    year=self.options.month.year,
                    month=self.options.month.month)
            elif self.options.months_since:
                data_filepaths = OnsData.download_months_since(
                    year=self.options.months_since.year,
                    month=self.options.months_since.month,
                    force_download=self.options.force_download)
            elif self.options.all_time:
                data_filepaths = OnsData.download_all(
                    force_download=self.options.force_download)
            else:
                self.parser.error('Please specify a time period')

            filter_ = {}
            if self.options.publisher:
                filter_['publisher'] = self.options.publisher

            # One StatsList is shared by the importer and the loader so the
            # summary below covers both.
            stats = StatsList()
            importer = OnsImporter(filepaths=data_filepaths,
                                   ckanclient=self.client,
                                   stats=stats,
                                   filter_=filter_)
            loader = OnsLoader(self.client, stats)

            loader.load_packages(importer.pkg_dict())
            log.info('Summary:\n%s', stats.report())
        except Exception:
            # Any problem, make sure it gets logged. Catching ``Exception``
            # instead of everything lets SystemExit (e.g. raised by an
            # optparse-style ``parser.error``) and KeyboardInterrupt pass
            # through without being logged as loader failures.
            log.exception('ONS Loader exception')
            raise
示例#10
0
def command(ckan_api_url):
    """Check that each known ONS source name resolves to a CKAN publisher.

    The pasted source list is split into lines; blank lines and the section
    headings in ``pasted_lines_to_ignore`` are dropped. Every remaining name
    is passed to ``OnsImporter._source_to_publisher_`` with a ``CkanClient``
    built for ``ckan_api_url``. Unmapped names are logged as errors and a
    final summary logs the error count against the number of names checked.

    Args:
        ckan_api_url: passed to ``CkanClient`` as ``base_location``.
    """
    from ckanext.dgu.ons.importer import OnsImporter
    # sources pasted here from http://www.statistics.gov.uk/hub/statistics-producers/index.html
    sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
Health
Health and Safety Executive
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
HM Revenue and Customs
HM Treasury
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Public Health England
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Cancer Registry (Northern Ireland)
Civil Aviation Authority
Child Maintenance and Enforcement Commission
Health and Social Care Information Centre
Higher Education Statistics Agency
Independent Police Complaints Commission
NHS England
Scottish Consortium for Learning Disability
Student Loans Company
Eurostat
'''
    # These are extra sources seen in the past ONS data, picked up from
    # the ons_merge_duplicates tool:
    sources += '''
Cancer Registry Northern Ireland
Welsh Assembly Government
        '''
    pasted_lines_to_ignore = (
        'Government Statistical Departments',
        'Other statistics producers',
        'International statistics organisations',
    )
    ckanclient = CkanClient(base_location=ckan_api_url)

    # Work on stripped names throughout: the ignore test then also matches
    # headings that carry stray whitespace, and the final count excludes
    # blank lines and headings.
    source_names = []
    for line in sources.split('\n'):
        name = line.strip()
        if name and name not in pasted_lines_to_ignore:
            source_names.append(name)

    num_errors = 0
    for source in source_names:
        publisher = OnsImporter._source_to_publisher_(source, ckanclient)
        if not publisher:
            log.error('Publisher not found: %s', source)
            num_errors += 1
    log.info('Completed with %i errors from %i sources', num_errors,
             len(source_names))
示例#11
0
    def command(self):
        self._load_config()
        log = __import__('logging').getLogger(__name__)

        from ckanext.dgu.ons.importer import OnsImporter
        # sources pasted here from http://www.statistics.gov.uk/hub/statistics-producers/index.html
        sources = '''
Agri-Food and Biosciences Institute
Agriculture and Rural Development (Northern Ireland)
Business, Innovation and Skills
Cabinet Office
Child Maintenance and Enforcement Commission
Communities and Local Government
Culture, Media and Sport
Defence
Education
Education (Northern Ireland)
Employment and Learning (Northern Ireland)
Energy and Climate Change
Enterprise, Trade and Investment (Northern Ireland)
Environment (Northern Ireland)
Environment, Food and Rural Affairs
Food Standards Agency
Forestry Commission
HM Revenue and Customs
HM Treasury
Health
Health and Safety Executive
Health and Social Care Information Centre
Health Protection Agency
Health, Social Service and Public Safety (Northern Ireland)
Home Office
ISD Scotland (part of NHS National Services Scotland)
International Development
Justice
Justice (Northern Ireland)
Marine Management Organisation
National Records of Scotland
National Treatment Agency
Northern Ireland Statistics and Research Agency
Office for National Statistics
Office for Rail Regulation
Office for Standards in Education, Children\'s Services and Skills
Office of Qualifications and Examinations Regulation
Office of the First and Deputy First Minister
Passenger Focus
Police Service of Northern Ireland (PSNI)
Regional Development (Northern Ireland)
Scottish Government
Social Development (Northern Ireland)
Transport
Welsh Government
Work and Pensions
Civil Aviation Authority
Higher Education Statistics Agency
Eurostat
'''
        for source in sources:
            publisher = OnsImporter._source_to_publisher(source)
            assert publisher, source
        import pdb; pdb.set_trace()
        log.info('Completed successfully for %i sources', len(sources))