def licenses(filename):
    '''Feed the licenses from a JSON file.

    ``filename`` may be a local path or an http(s) URL; the JSON payload is
    expected to be a list of license objects. When the payload is non-empty,
    the existing ``License`` collection is dropped and rebuilt from it.
    Finally the default license is (re)created if missing.
    '''
    # Load the license list either from a remote URL or a local file.
    if filename.startswith('http'):
        json_licenses = requests.get(filename).json()
    else:
        with open(filename) as fp:
            json_licenses = json.load(fp)
    # Only wipe the collection when there is actually something to load.
    if json_licenses:
        log.info('Dropping existing licenses')
        License.drop_collection()
    for json_license in json_licenses:
        # Collect the flags whose source field is truthy in the JSON payload.
        flags = [flag for field, flag in FLAGS_MAP.items()
                 if json_license.get(field, False)]
        # `lic` (not `license`) to avoid shadowing the builtin.
        lic = License.objects.create(
            id=json_license['id'],
            title=json_license['title'],
            url=json_license['url'] or None,
            maintainer=json_license['maintainer'] or None,
            flags=flags,
            active=json_license.get('active', False),
        )
        log.info('Added license "%s"', lic.title)
    # Ensure the default license always exists, even for an empty feed.
    try:
        License.objects.get(id=DEFAULT_LICENSE['id'])
    except License.DoesNotExist:
        License.objects.create(**DEFAULT_LICENSE)
        log.info('Added license "%s"', DEFAULT_LICENSE['title'])
    log.info('Done')
def test_prioritize_title_over_alternate_title(self):
    """An exact title match wins over a license listing it as alternate title."""
    shared_title = faker.sentence()
    expected = LicenseFactory(title=shared_title)
    LicenseFactory(alternate_titles=[shared_title])
    guessed = License.guess(shared_title)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def license(self):
    """Return the License matching ``self.license_id``, or None if absent."""
    matching = License.objects(id=self.license_id)
    return matching.first()
def test_match_by_alternate_title_with_low_edit_distance(self):
    """A guess one letter away from an alternate title still matches."""
    expected = LicenseFactory(alternate_titles=['License'])
    guessed = License.guess('Licence')
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_match_by_alternate_title_with_mismatching_case(self):
    """Alternate-title matching ignores letter case."""
    expected = LicenseFactory(alternate_titles=['License ODBl'])
    guessed = License.guess('License ODBL')
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_imatch_by_alternate_url(self):
    """Matching on an alternate URL is case-insensitive."""
    caps_url = '%s/CAPS.php' % faker.uri()
    expected = LicenseFactory(alternate_urls=[caps_url])
    guessed = License.guess(caps_url)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_exact_match_by_title_with_spaces(self):
    """Leading/trailing whitespace around the title is ignored."""
    expected = LicenseFactory()
    guessed = License.guess(' {0} '.format(expected.title))
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_exact_match_by_title_with_spaces(self):
    """Leading/trailing whitespace around the title is ignored."""
    expected = LicenseFactory()
    guessed = License.guess(' {0} '.format(expected.title))
    self.assertIsInstance(guessed, License)
    self.assertEqual(expected.id, guessed.id)
def test_empty_string(self):
    """Guessing from an empty string yields no license."""
    assert License.guess('') is None
def test_match_by_alternate_title_with_extra_inner_space(self):
    """Extra inner whitespace in the guess should still match an alternate title."""
    license = LicenseFactory(alternate_titles=['License ODBl'])
    # NOTE(review): the trailing comment claims 2 spaces but the literal shows
    # one -- the extra space may have been lost in a reformat; verify.
    found = License.guess('License ODBl')  # 2 spaces instead of 1
    assert isinstance(found, License)
    assert license.id == found.id
def test_exact_match_by_alternate_title_with_spaces(self):
    """Leading/trailing whitespace is ignored when matching an alternate title."""
    alt_title = faker.sentence()
    expected = LicenseFactory(alternate_titles=[alt_title])
    guessed = License.guess(' {0} '.format(alt_title))
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_exact_match_by_title(self):
    """A verbatim title resolves to its license."""
    expected = LicenseFactory()
    guessed = License.guess(expected.title)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_exact_match_by_alternate_url(self):
    """A verbatim alternate URL resolves to its license."""
    alt_url = faker.uri()
    expected = LicenseFactory(alternate_urls=[alt_url])
    guessed = License.guess(alt_url)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_not_found_with_default(self):
    """When nothing matches, the provided default license is returned."""
    fallback = LicenseFactory()
    guessed = License.guess('should not be found', default=fallback)
    self.assertEqual(guessed.id, fallback.id)
def test_multiple_strings(self):
    """Among several candidate strings, the first one that matches wins."""
    expected = LicenseFactory()
    guessed = License.guess('should not match', expected.id)
    self.assertIsInstance(guessed, License)
    self.assertEqual(expected.id, guessed.id)
def test_empty_string(self):
    """Guessing from an empty string yields no license."""
    self.assertIsNone(License.guess(''))
def test_imatch_by_id(self):
    """Matching on the license identifier is case-insensitive."""
    expected = LicenseFactory(id='CAPS-ID')
    guessed = License.guess(expected.id)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_match_by_title_with_extra_inner_space(self):
    """Extra inner whitespace in the guess should still match the title."""
    license = LicenseFactory(title='License ODBl')
    # NOTE(review): the trailing comment claims 2 spaces but the literal shows
    # one -- the extra space may have been lost in a reformat; verify.
    found = License.guess('License ODBl')  # 2 spaces instead of 1
    self.assertIsInstance(found, License)
    self.assertEqual(license.id, found.id)
def test_none(self):
    """Guessing from None yields no license."""
    assert License.guess(None) is None
def test_not_found_with_default(self):
    """When nothing matches, the provided default license is returned."""
    fallback = LicenseFactory()
    guessed = License.guess('should not be found', default=fallback)
    assert guessed.id == fallback.id
def test_multiple_strings(self):
    """Among several candidate strings, the first one that matches wins."""
    expected = LicenseFactory()
    guessed = License.guess('should not match', expected.id)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_imatch_by_url(self):
    """Matching on the license URL is case-insensitive."""
    caps_url = '%s/CAPS.php' % faker.uri()
    expected = LicenseFactory(url=caps_url)
    guessed = License.guess(expected.url)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def remote_datasets(self):
    '''Iterate over the remote CKAN catalog, yielding harvested datasets.

    Fetches the full package list, then for each package pulls its details
    and maps them onto a (new or previously harvested) ``Dataset``. After
    yielding each dataset, follower relationships are synchronized.
    Python 2 code (uses ``print`` statements).
    '''
    response = self.get('package_list')
    for name in response['result']:
        details = self.get('package_show', {'id': name})['result']
        dataset = self.get_harvested(Dataset, details['id'])
        # Core attributes
        dataset.slug = details['name']
        dataset.title = details['title']
        dataset.description = details.get('notes', 'No description')
        # Fall back to the 'notspecified' license when the remote id is unknown.
        dataset.license = License.objects(id=details['license_id']).first() or License.objects.get(id='notspecified')
        dataset.tags = [tag['name'].lower() for tag in details['tags']]
        dataset.frequency = self.map('frequency', details) or 'unknown'
        dataset.created_at = parse(details['metadata_created'])
        dataset.last_modified = parse(details['metadata_modified'])
        # Territorial coverage: stored as an extra, then optionally converted
        # to a spatial coverage (best effort -- failures are only printed).
        if any_field(details, 'territorial_coverage', 'territorial_coverage_granularity'):
            coverage = TerritorialCoverage(
                codes=[code.strip() for code in details.get('territorial_coverage', '').split(',') if code.strip()],
                granularity=self.map('territorial_coverage_granularity', details),
            )
            dataset.extras['territorial_coverage'] = coverage
            try:
                dataset.spatial = territorial_to_spatial(dataset)
            except Exception as e:
                print 'Error while processing spatial coverage for {0}:'.format(dataset.title), e
        # Temporal coverage: requires both bounds to be present.
        # NOTE(review): bare except silently maps any failure to a log line.
        if all_field(details, 'temporal_coverage_from', 'temporal_coverage_to'):
            try:
                dataset.temporal_coverage = db.DateRange(
                    start=daterange_start(details.get('temporal_coverage_from')),
                    end=daterange_end(details.get('temporal_coverage_to')),
                )
            except:
                log.error('Unable to parse temporal coverage for dataset %s', details['id'])
        # Organization
        if details.get('organization'):
            dataset.organization = self.get_harvested(Organization, details['organization']['id'], False)
        else:
            # Need to fetch user from roles
            roles = self.get('roles_show', {'domain_object': name})['result']['roles']
            for role in roles:
                if role['role'] == 'admin' and role['context'] == 'Package':
                    dataset.owner = self.get_harvested(User, role['user_id'])
                    break
        # Supplier
        if details.get('supplier_id'):
            dataset.supplier = self.get_harvested(Organization, details['supplier_id'], False)
        # Remote URL
        if details.get('url'):
            dataset.extras['remote_url'] = details['url']
        # Extras: remote extras may be remapped onto dataset attributes via
        # the harvester mapping; anything unmapped lands in dataset.extras.
        if 'extras' in details:
            extra_mapping = self.harvester.mapping.get('from_extras', {})
            for extra in details['extras']:
                if extra['key'] in self.harvester.mapping:
                    value = self.harvester.mapping[extra['key']].get(extra['value'])
                else:
                    value = extra['value']
                if extra['key'] in extra_mapping:
                    setattr(dataset, extra_mapping[extra['key']], value)
                else:
                    dataset.extras[extra['key']] = value
        # Resources: matched by UUID against already-harvested resources.
        for res in details['resources']:
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except:
                # NOTE(review): bare except -- presumably guards UUID() parse
                # errors, but it also hides any get_by failure.
                log.error('Unable to parse resource %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.url = res['url']
            resource.description = res.get('description')
            resource.format = res.get('format')
            resource.hash = res.get('hash')
            resource.created = parse(res['created'])
            resource.modified = parse(res['revision_timestamp'])
            resource.published = resource.published or resource.created
        yield dataset
        # After the caller has persisted the dataset, mirror its followers.
        if dataset.id:
            followers = self.get('dataset_follower_list', {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], False)
                if user:
                    follow, created = FollowDataset.objects.get_or_create(follower=user, following=dataset)
def test_match_by_url_scheme_mismatch(self):
    """An http guess matches a license registered with an https URL."""
    expected = LicenseFactory(url='https://example.com/license')
    guessed = License.guess('http://example.com/license')
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_match_by_alternate_url_scheme_slash_mismatch(self):
    """Scheme and trailing-slash differences still match an alternate URL."""
    registered_url = 'https://example.com/license'
    expected = LicenseFactory(alternate_urls=[registered_url])
    guessed = License.guess('http://example.com/license/')
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_exact_match_by_title_with_mismatch_slug(self):
    """Title matching works even when the slug differs from the title."""
    expected = LicenseFactory(title="Licence Ouverte v2", slug="licence-2")
    guessed = License.guess(expected.title)
    assert isinstance(guessed, License)
    assert guessed.id == expected.id
def test_no_with_multiple_alternate_titles_from_different_licences(self):
    """A guess ambiguous across several licenses' alternate titles yields None."""
    LicenseFactory(alternate_titles=['Licence Ouverte v2'])
    LicenseFactory(alternate_titles=['Licence Ouverte v2.0'])
    assert License.guess('Licence Ouverte v2.0') is None
def test_not_found(self):
    """An unknown guess with no default yields None."""
    self.assertIsNone(License.guess('should not be found'))
def process(self, item):
    '''Build a Dataset from the OpenDataSoft catalog entry carried by *item*.

    Skips (via HarvestSkipException) datasets without records, and INSPIRE
    datasets unless the 'inspire' feature is enabled. Maps metadata, keywords
    and themes, resolves the organization (creating it if unknown), guesses
    the license, and attaches exportable resources and extra files.
    '''
    ods_dataset = item.kwargs['dataset']
    dataset_id = ods_dataset['datasetid']
    ods_metadata = ods_dataset['metas']
    ods_interopmetas = ods_dataset.get('interop_metas', {})
    # A dataset without records has nothing worth harvesting.
    if not ods_dataset.get('has_records'):
        msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
        raise HarvestSkipException(msg)
    # INSPIRE datasets are skipped unless explicitly enabled for this source.
    if 'inspire' in ods_interopmetas and not self.has_feature('inspire'):
        msg = 'Dataset {datasetid} has INSPIRE metadata'
        raise HarvestSkipException(msg.format(**ods_dataset))
    dataset = self.get_dataset(item.remote_id)
    dataset.title = ods_metadata['title']
    dataset.frequency = 'unknown'
    description = ods_metadata.get('description', '').strip()
    dataset.description = parse_html(description)
    dataset.private = False
    # Detect Organization
    try:
        organization_acronym = ods_metadata['publisher']
    except KeyError:
        pass
    else:
        orgObj = Organization.objects(acronym=organization_acronym).first()
        if orgObj:
            dataset.organization = orgObj.id
        else:
            # Unknown publisher: create a minimal Organization on the fly,
            # using the acronym for name and description as placeholders.
            orgObj = Organization()
            orgObj.acronym = organization_acronym
            orgObj.name = organization_acronym
            orgObj.description = organization_acronym
            orgObj.save()
            dataset.organization = orgObj.id
    # Tags: merge keywords and comma-separated themes (themes lowercased).
    tags = set()
    if 'keyword' in ods_metadata:
        if isinstance(ods_metadata['keyword'], list):
            tags |= set(ods_metadata['keyword'])
        else:
            tags.add(ods_metadata['keyword'])
    if 'theme' in ods_metadata:
        if isinstance(ods_metadata['theme'], list):
            for theme in ods_metadata['theme']:
                tags.update([t.strip().lower() for t in theme.split(',')])
        else:
            themes = ods_metadata['theme'].split(',')
            tags.update([t.strip().lower() for t in themes])
    dataset.tags = list(tags)
    # The source hostname is always appended as a tag.
    dataset.tags.append(urlparse(self.source.url).hostname)
    # Detect license
    default_license = dataset.license or License.default()
    license_id = ods_metadata.get('license')
    dataset.license = License.guess(license_id, self.LICENSES.get(license_id), default=default_license)
    self.process_resources(dataset, ods_dataset, ('csv', 'json'))
    # Geo datasets additionally expose geojson, and shapefiles when small
    # enough (see SHAPEFILE_RECORDS_LIMIT).
    if 'geo' in ods_dataset['features']:
        exports = ['geojson']
        if ods_metadata['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
            exports.append('shp')
        self.process_resources(dataset, ods_dataset, exports)
    self.process_extra_files(dataset, ods_dataset, 'alternative_export')
    self.process_extra_files(dataset, ods_dataset, 'attachment')
    # Provenance extras used by the ODS integration.
    dataset.extras['ods:url'] = self.explore_url(dataset_id)
    dataset.extras['harvest:name'] = self.source.name
    if 'references' in ods_metadata:
        dataset.extras['ods:references'] = ods_metadata['references']
    dataset.extras['ods:has_records'] = ods_dataset['has_records']
    dataset.extras['ods:geo'] = 'geo' in ods_dataset['features']
    return dataset
def test_none(self):
    """Guessing from None yields no license."""
    self.assertIsNone(License.guess(None))
def test_not_found(self):
    """An unknown guess with no default yields None."""
    assert License.guess('should not be found') is None
def test_exact_match_by_title(self):
    """A verbatim title resolves to its license."""
    expected = LicenseFactory()
    guessed = License.guess(expected.title)
    self.assertIsInstance(guessed, License)
    self.assertEqual(expected.id, guessed.id)
def test_match_by_title_with_low_edit_distance(self):
    """A guess one letter away from the title still matches."""
    expected = LicenseFactory(title='License')
    guessed = License.guess('Licence')
    self.assertIsInstance(guessed, License)
    self.assertEqual(expected.id, guessed.id)
def test_match_by_title_with_mismatching_case(self):
    """Title matching ignores letter case."""
    expected = LicenseFactory(title='License ODBl')
    guessed = License.guess('License ODBL')
    self.assertIsInstance(guessed, License)
    self.assertEqual(expected.id, guessed.id)
def process(self, item):
    '''Build a Dataset from a validated CKAN ``package_show`` payload.

    Skips packages with no resources, maps core attributes, interprets
    well-known extras (spatial, temporal bounds), and syncs resources of
    allowed types. Python 2 code (uses ``print`` statements for unhandled
    extras).
    '''
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)
    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']
    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)
    dataset = self.get_dataset(item.remote_id)
    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    # NOTE(review): unlike the sibling harvester, no 'notspecified' fallback
    # here -- an unknown license_id leaves dataset.license as None; verify.
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags']]
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.extras['ckan:name'] = data['name']
    temporal_start, temporal_end = None, None
    spatial_geom = None
    # Interpret well-known CKAN extras; temporal_* are consumed (continue),
    # everything else is also copied verbatim into dataset.extras.
    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
            print 'spatial-text', extra['value']
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
            print 'spatial-uri', extra['value']
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            print 'temporal_start', extra['value']
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            print 'temporal_end', extra['value']
            temporal_end = daterange_end(extra['value'])
            continue
        # else:
        #     print extra['key'], extra['value']
        dataset.extras[extra['key']] = extra['value']
    # Normalize any polygon into a MultiPolygon geometry.
    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            # NOTE(review): exception is constructed but never raised, so an
            # unsupported geometry falls through with `coordinates` unbound
            # (NameError on first occurrence) -- likely missing `raise`.
            HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }
    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )
    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']
    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except:
            # NOTE(review): bare except -- presumably guards UUID() parse
            # errors, but it also hides any get_by failure.
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api' else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset