def test_munge(self):
    expected_data = [
        ('pollution', 'pollution'),
        ('fish pollution', 'fish-pollution'),
        ('dosh$money', 'doshmoney'),
        ('under_score', 'under-score'),
    ]
    for str_, tag in expected_data:
        result_tag = tag_munge(str_)
        assert_equal(result_tag, tag)
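# A minimal sketch of a tag_munge implementation consistent with the expectations
# tested above: lower-case, spaces and underscores become hyphens, and any other
# punctuation is dropped. Illustrative only; the real schema.tag_munge may differ.
import re

def tag_munge_sketch(keyword):
    tag = keyword.lower().replace(' ', '-').replace('_', '-')
    return re.sub(r'[^a-z0-9-]', '', tag)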
def __init__(self):
    themes_filepath = os.path.abspath(
        os.path.join(__file__, '../../themes.json'))
    assert os.path.exists(themes_filepath), themes_filepath
    log.debug('Reading themes.json')
    with codecs.open(themes_filepath, encoding='utf8') as f:
        themes_json = f.read()
    themes_list = json.loads(themes_json)
    self.data = {}
    self.topic_words = {}  # topic:theme_name
    self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
    self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
    self.gemet = {}  # gemet_keyword:theme_name
    self.ons = {}  # ons_keyword:theme_name
    self.lga_functions = {}  # LGA functions extra
    self.lga_services = {}  # LGA services extra
    for theme_dict in themes_list:
        name = theme_dict.get('stored_as') or theme_dict['title']
        for key in ('topics', 'gemet', 'nscl', 'ons',
                    'lga_functions', 'lga_services'):
            if key in theme_dict:
                assert isinstance(theme_dict[key], list), (name, key)
        for topic in theme_dict['topics']:
            words = [normalize_token(word) for word in split_words(topic)]
            if len(words) == 1:
                self.topic_words[words[0]] = name
            elif len(words) == 2:
                self.topic_bigrams[tuple(words)] = name
            elif len(words) == 3:
                self.topic_trigrams[tuple(words)] = name
            else:
                assert 0, 'Too many words in topic: %s' % topic
        for gemet_keyword in theme_dict.get('gemet', []):
            self.gemet[normalize_keyword(gemet_keyword)] = name
        for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
            self.ons[tag_munge(ons_keyword)] = name
        for function_id in theme_dict.get('lga_functions', []):
            self.lga_functions[function_id] = name
        for service_id in theme_dict.get('lga_services', []):
            self.lga_services[service_id] = name
        self.data[name] = theme_dict
    self.topic_words_set = self.topic_words.viewkeys()  # can do set-like operations on it
    self.topic_bigrams_set = self.topic_bigrams.viewkeys()
    self.topic_trigrams_set = self.topic_trigrams.viewkeys()
def __init__(self):
    themes_filepath = os.path.abspath(
        os.path.join(__file__, '../../themes.json'))
    assert os.path.exists(themes_filepath), themes_filepath
    log.debug('Reading themes.json')
    with codecs.open(themes_filepath, encoding='utf8') as f:
        themes_json = f.read()
    themes_list = json.loads(themes_json)
    self.data = {}
    self.topic_words = {}  # topic:theme_name
    self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
    self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
    self.gemet = {}  # gemet_keyword:theme_name
    self.ons = {}  # ons_keyword:theme_name
    self.la_function = {}  # LA functions extra
    self.la_service = {}  # LA services extra
    self.odc = {}  # OpenDataCommunities.org theme extra
    for theme_dict in themes_list:
        name = theme_dict.get('stored_as') or theme_dict['title']
        for key in ('topics', 'gemet', 'nscl', 'ons',
                    'la_function', 'la_service', 'odc'):
            if key in theme_dict:
                assert isinstance(theme_dict[key], list), (name, key)
        for topic in theme_dict['topics']:
            words = [normalize_token(word) for word in split_words(topic)]
            if len(words) == 1:
                self.topic_words[words[0]] = name
            elif len(words) == 2:
                self.topic_bigrams[tuple(words)] = name
            elif len(words) == 3:
                self.topic_trigrams[tuple(words)] = name
            else:
                assert 0, 'Too many words in topic: %s' % topic
        for gemet_keyword in theme_dict.get('gemet', []):
            self.gemet[normalize_keyword(gemet_keyword)] = name
        for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
            self.ons[tag_munge(ons_keyword)] = name
        for function_id in theme_dict.get('la_functions', []):
            self.la_function[function_id] = name
        for service_id in theme_dict.get('la_service', []):
            self.la_service[service_id] = name
        for keyword in theme_dict.get('odc', []):
            self.odc[keyword] = name
        self.data[name] = theme_dict
    self.topic_words_set = self.topic_words.viewkeys()  # can do set-like operations on it
    self.topic_bigrams_set = self.topic_bigrams.viewkeys()
    self.topic_trigrams_set = self.topic_trigrams.viewkeys()
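# Illustrative only: the shape of a themes.json entry that the readers above
# expect. Every value here is hypothetical; the real file lists the data.gov.uk
# themes, each with topics of up to three words plus keyword lists mapped onto it.
example_theme_entry = {
    'title': 'Environment',
    'stored_as': 'Environment',  # optional; 'title' is used when absent
    'topics': ['pollution', 'air quality', 'flood risk management'],
    'gemet': ['environment'],
    'nscl': ['Agriculture and Environment'],
    'ons': [],
    # optional LA function/service and OpenDataCommunities ('odc') lists may
    # also be present; they are read by the loops above
}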
def __init__(self):
    self.data = {}
    self.topic_words = {}  # topic:theme_name
    self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
    self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
    self.gemet = {}  # gemet_keyword:theme_name
    self.ons = {}  # ons_keyword:theme_name
    self.la_function = {}  # LA functions extra
    self.la_service = {}  # LA services extra
    self.odc = {}  # OpenDataCommunities.org theme extra
    context = {'model': model}
    terms = get_action('taxonomy_term_list')(context, {'name': 'dgu-themes'})
    for term in terms:
        theme_dict = term['extras']
        theme_dict['title'] = name = term['label']
        theme_dict['description'] = term['description']
        for key in ('topics', 'gemet', 'nscl', 'ons',
                    'la_function', 'la_service', 'odc'):
            if key in theme_dict:
                assert isinstance(theme_dict[key], list), (name, key)
        for topic in theme_dict['topics']:
            words = [normalize_token(word) for word in split_words(topic)]
            if len(words) == 1:
                self.topic_words[words[0]] = name
            elif len(words) == 2:
                self.topic_bigrams[tuple(words)] = name
            elif len(words) == 3:
                self.topic_trigrams[tuple(words)] = name
            else:
                assert 0, 'Too many words in topic: %s' % topic
        for gemet_keyword in theme_dict.get('gemet', []):
            self.gemet[normalize_keyword(gemet_keyword)] = name
        for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
            self.ons[tag_munge(ons_keyword)] = name
        for function_id in theme_dict.get('la_functions', []):
            self.la_function[function_id] = name
        for service_id in theme_dict.get('la_service', []):
            self.la_service[service_id] = name
        for keyword in theme_dict.get('odc', []):
            self.odc[keyword] = name
        self.data[name] = theme_dict
    self.topic_words_set = self.topic_words.viewkeys()  # can do set-like operations on it
    self.topic_bigrams_set = self.topic_bigrams.viewkeys()
    self.topic_trigrams_set = self.topic_trigrams.viewkeys()
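# Illustrative only: how the lookup tables built above might be queried to match
# free text against theme topics. match_topics_sketch is a hypothetical helper;
# the real scoring code in this module may differ. It reuses normalize_token and
# split_words, the same helpers used when the tables were built.
def match_topics_sketch(text):
    themes = Themes.instance()
    words = [normalize_token(w) for w in split_words(text)]
    matches = []
    # single-word topics
    for w in words:
        if w in themes.topic_words_set:
            matches.append((w, themes.topic_words[w]))
    # two- and three-word topics, via a sliding window over the normalized words
    for n, ngram_dict in ((2, themes.topic_bigrams), (3, themes.topic_trigrams)):
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i + n])
            if ngram in ngram_dict:
                matches.append((' '.join(ngram), ngram_dict[ngram]))
    return matches  # list of (matched topic, theme name) pairs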
def score_by_ons_theme(pkg, scores):
    # There are 11 'Old ONS themes', e.g. 'Agriculture and Environment' and
    # 'Business and Energy':
    # http://www.statistics.gov.uk/hub/browse-by-theme/index.html
    #
    # and there are set to be 4 'New ONS themes', e.g. 'Business, Trade and
    # Industry', which break down further and which we need to look at too:
    # http://digitalpublishing.ons.gov.uk/2013/12/05/no-longer-taxing-we-hope/
    if pkg['extras'].get('external_reference') != 'ONSHUB':
        return
    themes = Themes.instance()
    for tag in pkg['tags']:
        tag = tag_munge(tag)
        if tag in themes.ons:
            theme = themes.ons[tag]
            reason = '%s matched ONS keyword' % tag
            score = 10
            scores[theme].append((score, reason))
            log.debug(' %s %s %s' % (theme, score, reason))
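# Illustrative usage, assuming `scores` maps theme_name -> [(score, reason), ...],
# e.g. a collections.defaultdict(list), which is what the append above implies.
# The package values are hypothetical; whether the tag scores depends on the ONS
# keywords configured for each theme.
from collections import defaultdict

pkg = {'extras': {'external_reference': 'ONSHUB'},
       'tags': ['agriculture-and-environment']}
scores = defaultdict(list)
score_by_ons_theme(pkg, scores)
# if the munged tag matches a configured ONS keyword, scores now holds
# {theme_name: [(10, '<tag> matched ONS keyword')]}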
            elif len(words) == 2:
                topic_dict = self.topic_bigrams
                key = tuple(words)
            elif len(words) == 3:
                topic_dict = self.topic_trigrams
                key = tuple(words)
            else:
                assert 0, 'Too many words in topic: %s' % topic
            if key not in topic_dict:
                topic_dict[key] = []
            topic_dict[key].append(name)
        for gemet_keyword in theme_dict.get('gemet', []):
            self.gemet[normalize_keyword(gemet_keyword)] = name
        for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
            self.ons[tag_munge(ons_keyword)] = name
        for function_id in theme_dict.get('la_functions', []):
            self.la_function[function_id] = name
        for service_id in theme_dict.get('la_service', []):
            self.la_service[service_id] = name
        for keyword in theme_dict.get('odc', []):
            self.odc[keyword] = name
        self.data[name] = theme_dict
    self.topic_words_set = self.topic_words.viewkeys()  # can do set-like operations on it
    self.topic_bigrams_set = self.topic_bigrams.viewkeys()
    self.topic_trigrams_set = self.topic_trigrams.viewkeys()


def normalize_text(text):
    words = [normalize_token(w) for w in split_words(text)]
    words_without_stopwords = [word for word in words
def record_2_package(self, item):
    """Convert an ONS hub record (a dict of feed fields) into a CKAN package dict."""
    assert isinstance(item, dict)

    # process item
    title, release = self._split_title(item['title'])
    munged_title = schema.name_munge(title)
    publisher_name = self._source_to_publisher(item['hub:source-agency'])
    if publisher_name:
        publishers = [publisher_name]
    else:
        publishers = []
        log.warn('Did not find publisher for source-agency: %s',
                 item['hub:source-agency'])

    # Resources
    guid = item['guid'] or None
    if guid:
        if not guid.startswith(guid_prefix):
            raise RowParseError('GUID did not start with prefix %r: %r' %
                                (guid_prefix, guid))
        guid = guid[len(guid_prefix):]
        if 'http' in guid:
            raise RowParseError('GUID de-prefixed should not have \'http\' '
                                'in it still: %r' % guid)
    existing_resource = None
    download_url = item.get('link', None)

    notes_list = []
    if item['description']:
        notes_list.append(item['description'])
    for column, name in [('hub:source-agency', 'Source agency'),
                         ('hub:designation', 'Designation'),
                         ('hub:language', 'Language'),
                         ('hub:altTitle', 'Alternative title'),
                         ]:
        if item[column]:
            notes_list.append('%s: %s' % (name, item[column]))
    notes = '\n\n'.join(notes_list)

    extras = {
        'geographic_coverage': u'',
        'external_reference': u'',
        'temporal_granularity': u'',
        'date_updated': u'',
        'precision': u'',
        'geographic_granularity': u'',
        'temporal_coverage-from': u'',
        'temporal_coverage-to': u'',
        'national_statistic': u'',
        'update_frequency': u'',
        'date_released': u'',
        'categories': u'',
        'series': u'',
    }
    date_released = u''
    if item['pubDate']:
        date_released = date.parse(item['pubDate'])
        if date_released.qualifier:
            log.warn('Could not read format of publication (release) date: %r'
                     % item['pubDate'])
    extras['date_released'] = date_released.isoformat()
    extras['categories'] = item['hub:theme']
    extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
    extras['national_statistic'] = ('yes' if item['hub:designation'] == 'National Statistics'
                                    else 'no')
    extras['geographic_granularity'] = item['hub:geographic-breakdown']
    extras['external_reference'] = u'ONSHUB'
    extras['series'] = title if release else u''
    for update_frequency_suggestion in schema.update_frequency_options:
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        if update_frequency_suggestion in item_info:
            extras['update_frequency'] = update_frequency_suggestion
        elif update_frequency_suggestion.endswith('ly'):
            if update_frequency_suggestion.rstrip('ly') in item_info:
                extras['update_frequency'] = update_frequency_suggestion
    extras['import_source'] = 'ONS-%s' % self._current_filename

    resources = [{
        'url': download_url,
        'description': release,
        'hub-id': guid,
        'publish-date': date_released.as_datetime().strftime('%Y-%m-%d'),
    }]

    # update package
    pkg_dict = {
        'name': munged_title,
        'title': title,
        'version': None,
        'url': None,
        'maintainer': None,
        'maintainer_email': None,
        'notes': notes,
        'license_id': self._crown_license_id,
        'tags': [],  # post-filled
        'groups': publishers,
        'resources': resources,
        'extras': extras,
    }

    tags = schema.TagSuggester.suggest_tags(pkg_dict)
    for keyword in item['hub:ipsv'].split(';') + \
            item['hub:keywords'].split(';') + \
            item['hub:nscl'].split(';'):
        tag = schema.tag_munge(keyword)
        if tag and len(tag) > 1:
            tags.add(tag)
    tags = list(tags)
    tags.sort()
    pkg_dict['tags'] = tags

    return pkg_dict
def record_2_package(self, item):
    """Convert an ONS hub record (a dict of feed fields) into a CKAN package dict."""
    assert isinstance(item, dict)

    # process item
    title, release = self._split_title(item['title'])
    munged_title = schema.name_munge(title)
    publisher_name = self._source_to_publisher(item['hub:source-agency'])
    if publisher_name:
        publishers = [publisher_name]
    else:
        publishers = []
        log.warn('Did not find publisher for source-agency: %s',
                 item['hub:source-agency'])

    # Resources
    guid = item['guid'] or None
    if guid:
        if not guid.startswith(guid_prefix):
            raise RowParseError('GUID did not start with prefix %r: %r' %
                                (guid_prefix, guid))
        guid = guid[len(guid_prefix):]
        if 'http' in guid:
            raise RowParseError('GUID de-prefixed should not have \'http\' '
                                'in it still: %r' % guid)
    existing_resource = None
    download_url = item.get('link', None)

    notes_list = []
    if item['description']:
        notes_list.append(item['description'])
    for column, name in [('hub:source-agency', 'Source agency'),
                         ('hub:designation', 'Designation'),
                         ('hub:language', 'Language'),
                         ('hub:altTitle', 'Alternative title'),
                         ]:
        if item[column]:
            notes_list.append('%s: %s' % (name, item[column]))
    notes = '\n\n'.join(notes_list)

    extras = {
        'geographic_coverage': u'',
        'external_reference': u'',
        'temporal_granularity': u'',
        'date_updated': u'',
        'precision': u'',
        'geographic_granularity': u'',
        'temporal_coverage-from': u'',
        'temporal_coverage-to': u'',
        'national_statistic': u'',
        'update_frequency': u'',
        'date_released': u'',
        'categories': u'',
        'series': u'',
    }
    date_released = u''
    if item['pubDate']:
        date_released = date.parse(item['pubDate'])
        if date_released.qualifier:
            log.warn('Could not read format of publication (release) date: %r'
                     % item['pubDate'])
    extras['date_released'] = date_released.isoformat()
    extras['categories'] = item['hub:theme']
    extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
    extras['national_statistic'] = ('yes' if item['hub:designation'] == 'National Statistics'
                                    else 'no')
    extras['geographic_granularity'] = item['hub:geographic-breakdown']
    extras['external_reference'] = u'ONSHUB'
    extras['series'] = title if release else u''
    for update_frequency_suggestion in schema.update_frequency_options:
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        if update_frequency_suggestion in item_info:
            extras['update_frequency'] = update_frequency_suggestion
        elif update_frequency_suggestion.endswith('ly'):
            if update_frequency_suggestion.rstrip('ly') in item_info:
                extras['update_frequency'] = update_frequency_suggestion
    extras['import_source'] = 'ONS-%s' % self._current_filename

    resources = [{
        'url': download_url,
        'description': release,
        'hub-id': guid,
    }]

    # update package
    pkg_dict = {
        'name': munged_title,
        'title': title,
        'version': None,
        'url': None,
        'maintainer': None,
        'maintainer_email': None,
        'notes': notes,
        'license_id': self._crown_license_id,
        'tags': [],  # post-filled
        'groups': publishers,
        'resources': resources,
        'extras': extras,
    }

    tags = schema.TagSuggester.suggest_tags(pkg_dict)
    for keyword in item['hub:ipsv'].split(';') + \
            item['hub:keywords'].split(';') + \
            item['hub:nscl'].split(';'):
        tag = schema.tag_munge(keyword)
        if tag and len(tag) > 1:
            tags.add(tag)
    tags = list(tags)
    tags.sort()
    pkg_dict['tags'] = tags

    return pkg_dict
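# Illustrative only: the shape of the `item` dict that record_2_package expects.
# All keys below are read by the method above; the values are hypothetical, and
# a real 'guid' would need to start with the module's guid_prefix.
example_item = {
    'title': 'Some Statistical Series - January 2013',
    'guid': 'http://www.statistics.gov.uk/hub/id/119-12345',
    'link': 'http://www.ons.gov.uk/ons/some-release.html',
    'description': 'Monthly update of the series.',
    'pubDate': 'Tue, 15 Jan 2013 09:30:00 GMT',
    'hub:source-agency': 'Office for National Statistics',
    'hub:designation': 'National Statistics',
    'hub:language': 'English',
    'hub:altTitle': '',
    'hub:theme': 'Economy',
    'hub:coverage': 'UK',
    'hub:geographic-breakdown': 'UK and GB',
    'hub:ipsv': 'Economics and finance',
    'hub:keywords': 'economy;gdp',
    'hub:nscl': 'Economy',
}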