def test_parse(self):
    """name_munge should slugify dataset titles as expected."""
    cases = [
        ('Annual Report', 'annual_report'),
        ('Annual Report: 2006', 'annual_report-2006'),
    ]
    for title, expected in cases:
        assert_equal(name_munge(title), expected)
def record_2_package(self, item):
    """Convert a single ONS Hub feed item into a CKAN package dict.

    item -- dict of feed fields ('title', 'guid', 'pubDate', 'link',
            'description' and the 'hub:*' extension fields).
    Returns a package dict ready for create/update.
    Raises RowParseError when the item's GUID is malformed.
    """
    assert isinstance(item, dict)

    # Split "Series Title: release" into the series title and the release.
    title, release = self._split_title(item['title'])
    munged_title = schema.name_munge(title)

    publisher_name = self._source_to_publisher(item['hub:source-agency'])
    if publisher_name:
        publishers = [publisher_name]
    else:
        publishers = []
        log.warn('Did not find publisher for source-agency: %s',
                 item['hub:source-agency'])

    # Resources: the GUID must carry the known prefix, which is stripped.
    guid = item['guid'] or None
    if guid:
        if not guid.startswith(guid_prefix):
            raise RowParseError('GUID did not start with prefix %r: %r' %
                                (guid_prefix, guid))
        guid = guid[len(guid_prefix):]
        if 'http' in guid:
            raise RowParseError(
                'GUID de-prefixed should not have \'http\' in it still: %r' %
                (guid))
    download_url = item.get('link', None)

    # Notes: the description plus a selection of labelled hub fields.
    notes_list = []
    if item['description']:
        notes_list.append(item['description'])
    for column, name in [('hub:source-agency', 'Source agency'),
                         ('hub:designation', 'Designation'),
                         ('hub:language', 'Language'),
                         ('hub:altTitle', 'Alternative title'),
                         ]:
        if item[column]:
            notes_list.append('%s: %s' % (name, item[column]))
    notes = '\n\n'.join(notes_list)

    extras = {
        'geographic_coverage': u'',
        'external_reference': u'',
        'temporal_granularity': u'',
        'date_updated': u'',
        'precision': u'',
        'geographic_granularity': u'',
        'temporal_coverage-from': u'',
        'temporal_coverage-to': u'',
        'national_statistic': u'',
        'update_frequency': u'',
        'date_released': u'',
        'categories': u'',
        'series': u'',
        }

    date_released = u''
    if item['pubDate']:
        date_released = date.parse(item["pubDate"])
        if date_released.qualifier:
            log.warn('Could not read format of publication (release) date: %r'
                     % item["pubDate"])
        extras['date_released'] = date_released.isoformat()
    extras['categories'] = item['hub:theme']
    extras['geographic_coverage'] = self._parse_geographic_coverage(
        item['hub:coverage'])
    # NOTE(review): the original tested
    #   designation == 'National Statistics' or designation == 'National Statistics'
    # i.e. the same value twice. Collapsed to a single comparison (identical
    # behaviour) - confirm whether a second designation string was intended.
    if item['hub:designation'] == 'National Statistics':
        extras['national_statistic'] = 'yes'
    else:
        extras['national_statistic'] = 'no'
    extras['geographic_granularity'] = item['hub:geographic-breakdown']
    extras['external_reference'] = u'ONSHUB'
    extras['series'] = title if release else u''

    # Guess the update frequency from the title/description text.
    # Hoisted out of the loop: item_info is loop-invariant.
    item_info = ('%s %s' % (item['title'], item['description'])).lower()
    for update_frequency_suggestion in schema.update_frequency_options:
        if update_frequency_suggestion in item_info:
            extras['update_frequency'] = update_frequency_suggestion
        elif update_frequency_suggestion.endswith('ly'):
            # BUGFIX: was .rstrip('ly'), which strips any trailing run of
            # 'l'/'y' characters (e.g. 'annually' -> 'annua'), not just the
            # 'ly' suffix. Slice off exactly the last two characters.
            if update_frequency_suggestion[:-2] in item_info:
                extras['update_frequency'] = update_frequency_suggestion
    extras['import_source'] = 'ONS-%s' % self._current_filename

    # BUGFIX: guard publish-date. When the item has no pubDate,
    # date_released is u'' and has no .as_datetime(), so the original
    # raised AttributeError here.
    if date_released:
        publish_date = date_released.as_datetime().strftime('%Y-%m-%d')
    else:
        publish_date = u''
    resources = [{
        'url': download_url,
        'description': release,
        'hub-id': guid,
        'publish-date': publish_date,
        }]

    # Assemble the package dict.
    pkg_dict = {
        'name': munged_title,
        'title': title,
        'version': None,
        'url': None,
        'maintainer': None,
        'maintainer_email': None,
        'notes': notes,
        'license_id': self._crown_license_id,
        'tags': [],  # post-filled below
        'groups': publishers,
        'resources': resources,
        'extras': extras,
        }

    # Tags: suggester output plus keywords from the hub fields, munged,
    # de-duplicated via the set, and kept only when longer than one char.
    tags = schema.TagSuggester.suggest_tags(pkg_dict)
    for keyword in item['hub:ipsv'].split(';') + \
            item['hub:keywords'].split(';') + \
            item['hub:nscl'].split(';'):
        tag = schema.tag_munge(keyword)
        if tag and len(tag) > 1:
            tags.add(tag)
    pkg_dict['tags'] = sorted(tags)

    return pkg_dict
def test_parse(self):
    """Each title should munge to its expected dataset name."""
    expected_data = (
        ('Annual Report', 'annual_report'),
        ('Annual Report: 2006', 'annual_report-2006'),
    )
    for raw_title, munged in expected_data:
        actual = name_munge(raw_title)
        assert_equal(actual, munged)
def record_2_package(self, item):
    """Convert a single ONS Hub feed item into a CKAN package dict.

    item -- dict of feed fields ('title', 'guid', 'pubDate', 'link',
            'description' and the 'hub:*' extension fields).
    Returns a package dict ready for create/update.
    Raises RowParseError when the item's GUID is malformed.
    """
    assert isinstance(item, dict)

    # Split "Series Title: release" into the series title and the release.
    title, release = self._split_title(item['title'])
    munged_title = schema.name_munge(title)

    publisher_name = self._source_to_publisher(item['hub:source-agency'])
    if publisher_name:
        publishers = [publisher_name]
    else:
        publishers = []
        log.warn('Did not find publisher for source-agency: %s',
                 item['hub:source-agency'])

    # Resources: the GUID must carry the known prefix, which is stripped.
    guid = item['guid'] or None
    if guid:
        if not guid.startswith(guid_prefix):
            raise RowParseError('GUID did not start with prefix %r: %r' %
                                (guid_prefix, guid))
        guid = guid[len(guid_prefix):]
        if 'http' in guid:
            raise RowParseError(
                'GUID de-prefixed should not have \'http\' in it still: %r' %
                (guid))
    download_url = item.get('link', None)

    # Notes: the description plus a selection of labelled hub fields.
    notes_list = []
    if item['description']:
        notes_list.append(item['description'])
    for column, name in [('hub:source-agency', 'Source agency'),
                         ('hub:designation', 'Designation'),
                         ('hub:language', 'Language'),
                         ('hub:altTitle', 'Alternative title'),
                         ]:
        if item[column]:
            notes_list.append('%s: %s' % (name, item[column]))
    notes = '\n\n'.join(notes_list)

    extras = {
        'geographic_coverage': u'',
        'external_reference': u'',
        'temporal_granularity': u'',
        'date_updated': u'',
        'precision': u'',
        'geographic_granularity': u'',
        'temporal_coverage-from': u'',
        'temporal_coverage-to': u'',
        'national_statistic': u'',
        'update_frequency': u'',
        'date_released': u'',
        'categories': u'',
        'series': u'',
        }

    date_released = u''
    if item['pubDate']:
        date_released = date.parse(item["pubDate"])
        if date_released.qualifier:
            log.warn('Could not read format of publication (release) date: %r'
                     % item["pubDate"])
        extras['date_released'] = date_released.isoformat()
    extras['categories'] = item['hub:theme']
    extras['geographic_coverage'] = self._parse_geographic_coverage(
        item['hub:coverage'])
    # NOTE(review): the original tested
    #   designation == 'National Statistics' or designation == 'National Statistics'
    # i.e. the same value twice. Collapsed to a single comparison (identical
    # behaviour) - confirm whether a second designation string was intended.
    if item['hub:designation'] == 'National Statistics':
        extras['national_statistic'] = 'yes'
    else:
        extras['national_statistic'] = 'no'
    extras['geographic_granularity'] = item['hub:geographic-breakdown']
    extras['external_reference'] = u'ONSHUB'
    extras['series'] = title if release else u''

    # Guess the update frequency from the title/description text.
    # Hoisted out of the loop: item_info is loop-invariant.
    item_info = ('%s %s' % (item['title'], item['description'])).lower()
    for update_frequency_suggestion in schema.update_frequency_options:
        if update_frequency_suggestion in item_info:
            extras['update_frequency'] = update_frequency_suggestion
        elif update_frequency_suggestion.endswith('ly'):
            # BUGFIX: was .rstrip('ly'), which strips any trailing run of
            # 'l'/'y' characters (e.g. 'annually' -> 'annua'), not just the
            # 'ly' suffix. Slice off exactly the last two characters.
            if update_frequency_suggestion[:-2] in item_info:
                extras['update_frequency'] = update_frequency_suggestion
    extras['import_source'] = 'ONS-%s' % self._current_filename

    resources = [{
        'url': download_url,
        'description': release,
        'hub-id': guid,
        }]

    # Assemble the package dict.
    pkg_dict = {
        'name': munged_title,
        'title': title,
        'version': None,
        'url': None,
        'maintainer': None,
        'maintainer_email': None,
        'notes': notes,
        'license_id': self._crown_license_id,
        'tags': [],  # post-filled below
        'groups': publishers,
        'resources': resources,
        'extras': extras,
        }

    # Tags: suggester output plus keywords from the hub fields, munged,
    # de-duplicated via the set, and kept only when longer than one char.
    tags = schema.TagSuggester.suggest_tags(pkg_dict)
    for keyword in item['hub:ipsv'].split(';') + \
            item['hub:keywords'].split(';') + \
            item['hub:nscl'].split(';'):
        tag = schema.tag_munge(keyword)
        if tag and len(tag) > 1:
            tags.add(tag)
    pkg_dict['tags'] = sorted(tags)

    return pkg_dict