def get_value(self):
    """Return the highest reuses metric among visible organizations.

    Falls back to 0 when no visible organization has any reuse.
    """
    leader = (Organization.objects(metrics__reuses__gt=0)
              .visible()
              .order_by('-metrics.reuses')
              .first())
    # first() yields None on an empty queryset
    return leader.metrics.get('reuses', 0) if leader else 0
def get_value(self):
    """Return the reuses count of the most reused visible organization (0 if none)."""
    candidates = Organization.objects(metrics__reuses__gt=0).visible()
    top = candidates.order_by('-metrics.reuses').first()
    if not top:
        return 0
    return top.metrics.get('reuses', 0)
def test_purge_organizations(self):
    """Deleting then purging an organization must remove its logo, its
    search-index references and its dataset back-references."""
    with self.autoindex():
        org = Organization.objects.create(name='delete me', description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)
        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(url_for('api.organization_logo', org=org),
                             {'file': (file, 'test.png')}, json=False)
        self.assert200(response)
        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)
        tasks.purge_organizations()
        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])
        # The dataset survives but its organization reference is cleared
        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)
        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)
        # The reindexed document must no longer reference the organization
        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
def get_context(self):
    """Extend the base context with the user's organizations and
    propagate the displayed user to every navbar entry."""
    context = super(UserView, self).get_context()
    memberships = Organization.objects(members__user=self.user)
    context['organizations'] = memberships
    # Each navbar entry builds its URL with the current user
    for entry in navbar.items:
        entry._args = {'user': self.user}
    return context
def mongo_search(cls, args):
    """Search non-deleted organizations in MongoDB.

    Returns a (page queryset, total count) tuple honoring the
    requested filters, sort order and pagination.
    """
    queryset = OrgApiParser.parse_filters(
        Organization.objects(deleted=None), args)
    # Sort priority: explicit sort, then text score when a full-text
    # query is present, then the default ordering.
    sort = cls.parse_sort(args['sort'])
    if not sort:
        sort = '$text_score' if args['q'] else None
    if not sort:
        sort = DEFAULT_SORTING
    skipped = (args['page'] - 1) * args['page_size']
    page = queryset.order_by(sort).skip(skipped).limit(args['page_size'])
    return page, queryset.count()
def climate_change_challenge():
    """Render the Climate Change Challenge page with its partner
    organizations and most-followed badged datasets."""
    c3_partners = Organization.objects(slug__in=C3_PARTNERS)
    c3_datasets = Dataset.objects(badges__kind=C3).visible()
    c3_datasets = c3_datasets.order_by('-metrics.followers')
    return theme.render('c3.html',
                        partners=c3_partners,
                        datasets=c3_datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
def climate_change_challenge():
    """Render the C3 page: partner organizations plus visible datasets
    carrying the C3 badge, most followed first."""
    badged = Dataset.objects(badges__kind=C3)
    datasets = badged.visible().order_by('-metrics.followers')
    partners = Organization.objects(slug__in=C3_PARTNERS)
    return theme.render(
        'c3.html',
        partners=partners,
        datasets=datasets,
        badge=C3,
        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
def test_purge_organizations(self):
    """Purging a deleted organization must drop its logo and transfers,
    detach it from OAuth clients, and clear dataset back-references."""
    org = Organization.objects.create(name='delete me', description='XXX')
    resources = [ResourceFactory() for _ in range(2)]
    dataset = DatasetFactory(resources=resources, organization=org)
    # Upload organization's logo
    file = create_test_image()
    user = AdminFactory()
    self.login(user)
    response = self.post(
        url_for('api.organization_logo', org=org),
        {'file': (file, 'test.png')}, json=False)
    self.assert200(response)
    # Transfers in both directions must disappear with the organization
    transfer_to_org = Transfer.objects.create(
        owner=user,
        recipient=org,
        subject=dataset,
        comment='comment',
    )
    transfer_from_org = Transfer.objects.create(
        owner=org,
        recipient=user,
        subject=dataset,
        comment='comment',
    )
    # The OAuth client itself survives, only its org link is cleared
    oauth_client = OAuth2Client.objects.create(
        name='test-client',
        owner=user,
        organization=org,
        redirect_uris=['https://test.org/callback'],
    )
    # Delete organization
    response = self.delete(url_for('api.organization', org=org))
    self.assert204(response)
    tasks.purge_organizations()
    oauth_client.reload()
    assert oauth_client.organization is None
    assert Transfer.objects.filter(id=transfer_from_org.id).count() == 0
    assert Transfer.objects.filter(id=transfer_to_org.id).count() == 0
    # Check organization's logo is deleted
    self.assertEqual(list(storages.avatars.list_files()), [])
    dataset = Dataset.objects(id=dataset.id).first()
    self.assertIsNone(dataset.organization)
    organization = Organization.objects(name='delete me').first()
    self.assertIsNone(organization)
def purge_organizations():
    """Permanently remove every organization flagged as deleted,
    together with its followers, activities and metrics."""
    for org in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(org))
        # Drop followers of this organization
        FollowOrg.objects(following=org).delete()
        # Drop activities referencing or produced by it
        Activity.objects(related_to=org).delete()
        Activity.objects(organization=org).delete()
        # Drop its stored metrics
        Metrics.objects(object_id=org.id).delete()
        org.delete()
def purge_organizations(self):
    """Definitively delete organizations marked as deleted, cleaning up
    their followers, activities and metrics first."""
    deleted_orgs = Organization.objects(deleted__ne=None)
    for org in deleted_orgs:
        log.info('Purging organization "{0}"'.format(org))
        # Followers
        FollowOrg.objects(following=org).delete()
        # Activities (both directions)
        Activity.objects(related_to=org).delete()
        Activity.objects(organization=org).delete()
        # Metrics
        Metrics.objects(object_id=org.id).delete()
        # Finally the organization document itself
        org.delete()
def purge_organizations(self):
    """Permanently delete soft-deleted organizations and reindex the
    datasets that referenced them."""
    for org in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(org))
        # Drop followers
        Follow.objects(following=org).delete()
        # Drop related activities
        Activity.objects(related_to=org).delete()
        Activity.objects(organization=org).delete()
        # Drop metrics
        Metrics.objects(object_id=org.id).delete()
        # Remember the datasets before the reference disappears
        dataset_ids = [
            dataset.id for dataset in Dataset.objects(organization=org)
        ]
        org.delete()
        # Reindex the datasets that were linked to the organization
        for dataset in Dataset.objects(id__in=dataset_ids):
            reindex(dataset)
def serialize(cls, reuse):
    """Build the search-index document for a reuse.

    By default use the ``to_dict`` method and exclude ``_id``, ``_cls``
    and ``owner`` fields.  The returned dict is the exact document
    shape expected by the search backend — do not rename keys.
    """
    # Resolve linked datasets, fetching only id/title without dereferencing
    datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
    datasets = list(datasets.only('id', 'title').no_dereference())
    organization = None
    owner = None
    # A reuse belongs to either an organization or an individual owner
    if reuse.organization:
        organization = Organization.objects(
            id=reuse.organization.id).first()
    elif reuse.owner:
        owner = User.objects(id=reuse.owner.id).first()
    return {
        'title': reuse.title,
        'description': reuse.description,
        'url': reuse.url,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        'type': reuse.type,
        'topic': reuse.topic,
        'tags': reuse.tags,
        # Tags are duplicated to feed the tag completion field
        'tag_suggest': reuse.tags,
        'badges': [badge.kind for badge in reuse.badges],
        'created': to_iso_datetime(reuse.created_at),
        'last_modified': to_iso_datetime(reuse.last_modified),
        'dataset': [{
            'id': str(d.id),
            'title': d.title
        } for d in datasets],
        'metrics': reuse.metrics,
        'featured': reuse.featured,
        'extras': reuse.extras,
        # Completion payload used by the suggest endpoint
        'reuse_suggest': {
            'input': cls.completer_tokenize(reuse.title) + [reuse.id],
            'output': str(reuse.id),
            'payload': {
                'title': reuse.title,
                'slug': reuse.slug,
                'image_url': reuse.image(500, external=True),
            },
        },
    }
def test_purge_organizations(self):
    """Purging must remove an already soft-deleted organization and
    clear the reference from its datasets and the search index."""
    with self.autoindex():
        # The organization is created pre-deleted so purge picks it up
        org = Organization.objects.create(
            name='delete me', deleted='2016-01-01', description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)
        tasks.purge_organizations()
        # The dataset survives but loses its organization reference
        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)
        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)
        # The indexed document must also drop the reference
        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
def serialize(cls, reuse):
    """Build the search-index document for a reuse.

    By default use the ``to_dict`` method and exclude ``_id``, ``_cls``
    and ``owner`` fields.  The dict returned here is indexed as-is; key
    names are part of the search mapping contract.
    """
    # Fetch linked datasets with only id/title, no dereferencing
    datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
    datasets = list(datasets.only('id', 'title').no_dereference())
    organization = None
    owner = None
    # A reuse is published either by an organization or by a user
    if reuse.organization:
        organization = Organization.objects(id=reuse.organization.id).first()
    elif reuse.owner:
        owner = User.objects(id=reuse.owner.id).first()
    return {
        'title': reuse.title,
        'description': reuse.description,
        'url': reuse.url,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        'type': reuse.type,
        'tags': reuse.tags,
        # Duplicated to feed the tag completion field
        'tag_suggest': reuse.tags,
        'badges': [badge.kind for badge in reuse.badges],
        # Dates serialized without timezone information
        'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': reuse.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
        'dataset': [{
            'id': str(d.id),
            'title': d.title
        } for d in datasets],
        'metrics': reuse.metrics,
        'featured': reuse.featured,
        'extras': reuse.extras,
        # Completion payload consumed by the suggest endpoint
        'reuse_suggest': {
            'input': cls.completer_tokenize(reuse.title) + [reuse.id],
            'output': str(reuse.id),
            'payload': {
                'title': reuse.title,
                'slug': reuse.slug,
                'image_url': reuse.image(40, external=True),
            },
        },
    }
def test_purge_organizations(self):
    """Purging must remove a soft-deleted organization and clear the
    reference on its datasets, both in MongoDB and in the index."""
    with self.autoindex():
        # Created pre-deleted so purge_organizations picks it up
        org = Organization.objects.create(name='delete me',
                                          deleted='2016-01-01',
                                          description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)
        tasks.purge_organizations()
        # The dataset remains but its organization reference is cleared
        dataset = Dataset.objects(id=dataset.id).first()
        self.assertEqual(dataset.organization, None)
        organization = Organization.objects(name='delete me').first()
        self.assertEqual(organization, None)
        # The indexed document stores an empty string once purged
        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertEqual(indexed_dataset.organization, '')
def get_context(self):
    """Extend the base context with the organizations the user belongs to."""
    context = super(UserView, self).get_context()
    memberships = Organization.objects(members__user=self.user)
    context['organizations'] = memberships
    return context
def serialize(cls, dataset):
    """Build the search-index document for a dataset.

    Returns a dict matching the index mapping: core metadata, a
    ``dataset_suggest`` completion payload, and optional temporal and
    spatial coverage sections used to weight the suggestions.
    """
    organization = None
    owner = None
    image_url = None
    spatial_weight = DEFAULT_SPATIAL_WEIGHT
    temporal_weight = DEFAULT_TEMPORAL_WEIGHT
    # A dataset is published either by an organization or a user;
    # the producer's image illustrates the completion payload.
    if dataset.organization:
        organization = Organization.objects(id=dataset.organization.id).first()
        image_url = organization.logo(40, external=True)
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
        image_url = owner.avatar(40, external=True)
    certified = organization and organization.certified
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': getattr(dataset.license, 'id', None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        # Tags duplicated to feed the tag completion field
        'tag_suggest': dataset.tags,
        'resources': [
            {
                'title': r.title,
                'description': r.description,
                'format': r.format,
                'type': r.type,
            }
            for r in dataset.resources],
        'format_suggest': [r.format.lower()
                           for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        # Completion payload consumed by the suggest endpoint
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) + [str(dataset.id)],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'acronym': dataset.acronym,
                'image_url': image_url,
            },
        },
        # Dates serialized without timezone information
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime(
            '%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'featured': dataset.featured,
        'from_certified': certified,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        # Coverage boundaries are stored as ordinal day numbers;
        # longer coverage (in years) yields more suggestion weight,
        # capped at MAX_TEMPORAL_WEIGHT.
        start = dataset.temporal_coverage.start.toordinal()
        end = dataset.temporal_coverage.end.toordinal()
        temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
        document.update({
            'temporal_coverage': {'start': start, 'end': end},
            'temporal_weight': temporal_weight,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parents are indexed by id only (no label) for filtering
        geozones.extend([{'id': p} for p in parents])
        # Finer administrative coverage yields a higher weight
        spatial_weight = ADMIN_LEVEL_MAX / coverage_level
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
            'spatial_weight': spatial_weight,
        })
    document['dataset_suggest']['weight'] = cls.get_suggest_weight(
        temporal_weight, spatial_weight, dataset.featured)
    # The acronym is an additional completion entry point
    if dataset.acronym:
        document['dataset_suggest']['input'].append(dataset.acronym)
    return document
def get_value(self):
    """Return the highest datasets metric among visible organizations.

    Returns 0 when no visible organization has any dataset — the
    original code called ``.metrics`` on the ``None`` returned by
    ``first()`` on an empty queryset, raising AttributeError.
    """
    org = (Organization.objects(metrics__datasets__gt=0)
           .visible()
           .order_by('-metrics.datasets')
           .first())
    # first() yields None on an empty queryset
    return org.metrics.get('datasets', 0) if org else 0
def set_g_user_orgs():
    """Expose the authenticated user's organizations on the request globals."""
    if not current_user.is_authenticated():
        return
    g.user_organizations = Organization.objects(
        members__user=current_user.id)
def process(self, item):
    """Convert one harvested OpenDataSoft dataset into a udata Dataset.

    Skips record-less datasets and INSPIRE datasets when the feature is
    disabled.  Returns the (possibly new) Dataset, unsaved.
    """
    ods_dataset = item.kwargs['dataset']
    dataset_id = ods_dataset['datasetid']
    ods_metadata = ods_dataset['metas']
    ods_interopmetas = ods_dataset.get('interop_metas', {})
    # Nothing to harvest for an empty dataset
    if not ods_dataset.get('has_records'):
        msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
        raise HarvestSkipException(msg)
    # INSPIRE datasets are only processed when the feature is enabled
    if 'inspire' in ods_interopmetas and not self.has_feature('inspire'):
        msg = 'Dataset {datasetid} has INSPIRE metadata'
        raise HarvestSkipException(msg.format(**ods_dataset))
    dataset = self.get_dataset(item.remote_id)
    dataset.title = ods_metadata['title']
    dataset.frequency = 'unknown'
    description = ods_metadata.get('description', '').strip()
    dataset.description = parse_html(description)
    dataset.private = False
    # Detect Organization
    try:
        organization_acronym = ods_metadata['publisher']
    except KeyError:
        # No publisher: leave the dataset's organization untouched
        pass
    else:
        orgObj = Organization.objects(acronym=organization_acronym).first()
        if orgObj:
            dataset.organization = orgObj
        else:
            # Auto-create a minimal organization from the acronym;
            # name and description are placeholders to be curated later
            orgObj = Organization()
            orgObj.acronym = organization_acronym
            orgObj.name = organization_acronym
            orgObj.description = organization_acronym
            orgObj.save()
            dataset.organization = orgObj
    # Merge keywords and comma-separated themes into a tag set
    tags = set()
    if 'keyword' in ods_metadata:
        if isinstance(ods_metadata['keyword'], list):
            tags |= set(ods_metadata['keyword'])
        else:
            tags.add(ods_metadata['keyword'])
    if 'theme' in ods_metadata:
        if isinstance(ods_metadata['theme'], list):
            for theme in ods_metadata['theme']:
                tags.update([t.strip().lower() for t in theme.split(',')])
        else:
            themes = ods_metadata['theme'].split(',')
            tags.update([t.strip().lower() for t in themes])
    dataset.tags = list(tags)
    # Tag the source portal's hostname for provenance
    dataset.tags.append(urlparse(self.source.url).hostname)
    # Detect license
    default_license = dataset.license or License.default()
    license_id = ods_metadata.get('license')
    dataset.license = License.guess(license_id,
                                    self.LICENSES.get(license_id),
                                    default=default_license)
    self.process_resources(dataset, ods_dataset, ('csv', 'json'))
    if 'geo' in ods_dataset['features']:
        exports = ['geojson']
        # Shapefile export is only offered below the records threshold
        if ods_metadata['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
            exports.append('shp')
        self.process_resources(dataset, ods_dataset, exports)
    self.process_extra_files(dataset, ods_dataset, 'alternative_export')
    self.process_extra_files(dataset, ods_dataset, 'attachment')
    dataset.extras['ods:url'] = self.explore_url(dataset_id)
    dataset.extras['harvest:name'] = self.source.name
    if 'references' in ods_metadata:
        dataset.extras['ods:references'] = ods_metadata['references']
    dataset.extras['ods:has_records'] = ods_dataset['has_records']
    dataset.extras['ods:geo'] = 'geo' in ods_dataset['features']
    return dataset
def inject_organization_needs(sender, identity):
    """Grant an OrganizationNeed for each membership of the current user."""
    if not current_user.is_authenticated():
        return
    user = current_user._get_current_object()
    for org in Organization.objects(members__user=current_user.id):
        # Resolve the membership to get the user's role in this org
        membership = get_by(org.members, 'user', user)
        identity.provides.add(OrganizationNeed(membership.role, org.id))
def serialize(cls, dataset):
    """Build the search-index document for a dataset.

    Returns a flat dict matching the index mapping, with a nested
    ``organization`` summary and optional temporal/spatial sections.
    """
    organization = None
    owner = None
    # A dataset is published either by an organization or by a user;
    # only the organization gets a denormalized summary in the index.
    if dataset.organization:
        org = Organization.objects(id=dataset.organization.id).first()
        organization = {
            'id': str(org.id),
            'name': org.name,
            # Booleans flattened to 0/1 for the index
            'public_service': 1 if org.public_service else 0,
            'followers': org.metrics.get('followers', 0)
        }
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
    document = {
        'id': str(dataset.id),
        'title': dataset.title,
        'description': dataset.description,
        'acronym': dataset.acronym or None,
        'url': dataset.display_url,
        'tags': dataset.tags,
        'license': getattr(dataset.license, 'id', None),
        'badges': [badge.kind for badge in dataset.badges],
        'frequency': dataset.frequency,
        'created_at': to_iso_datetime(dataset.created_at),
        'views': dataset.metrics.get('views', 0),
        'followers': dataset.metrics.get('followers', 0),
        'reuses': dataset.metrics.get('reuses', 0),
        'featured': 1 if dataset.featured else 0,
        'resources_count': len(dataset.resources),
        'organization': organization,
        'owner': str(owner.id) if owner else None,
        'format': [r.format.lower()
                   for r in dataset.resources
                   if r.format],
        'schema': [r.schema.get('name')
                   for r in dataset.resources
                   if r.schema]
    }
    # Extras are copied verbatim, datetimes converted to ISO strings
    extras = {}
    for key, value in dataset.extras.items():
        extras[key] = to_iso_datetime(value) if isinstance(
            value, datetime.datetime) else value
    document.update({'extras': extras})
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        start = to_iso_datetime(dataset.temporal_coverage.start)
        end = to_iso_datetime(dataset.temporal_coverage.end)
        document.update({
            'temporal_coverage_start': start,
            'temporal_coverage_end': end,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parent zones are indexed by id only, for filtering
        geozones.extend([{'id': p} for p in parents])
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
        })
    return document
def get_value(self):
    """Return the followers count of the most followed visible organization."""
    leader = Organization.objects(metrics__followers__gt=0).visible()
    leader = leader.order_by('-metrics.followers').first()
    if not leader:
        return 0
    return leader.metrics['followers']
def get_value(self):
    """Return the highest followers metric among visible organizations.

    Returns 0 when no visible organization has any follower — the
    original code called ``.metrics`` on the ``None`` returned by
    ``first()`` on an empty queryset, raising AttributeError.
    """
    org = (Organization.objects(metrics__followers__gt=0)
           .visible()
           .order_by('-metrics.followers')
           .first())
    # first() yields None on an empty queryset
    return org.metrics.get('followers', 0) if org else 0
def get_value(self):
    """Return the highest datasets metric among visible organizations.

    Returns 0 when no visible organization has any dataset — the
    original code dereferenced the ``None`` returned by ``first()``
    on an empty queryset, raising AttributeError.
    """
    org = Organization.objects(metrics__datasets__gt=0).visible().order_by(
        '-metrics.datasets').first()
    # first() yields None on an empty queryset
    return org.metrics.get('datasets', 0) if org else 0
def get_value(self):
    """Return the highest followers metric among visible organizations.

    Returns 0 when no visible organization has any follower — the
    original code dereferenced the ``None`` returned by ``first()``
    on an empty queryset, raising AttributeError.
    """
    org = Organization.objects(metrics__followers__gt=0).visible(
        ).order_by('-metrics.followers').first()
    # first() yields None on an empty queryset
    return org.metrics.get('followers', 0) if org else 0
def get_value(self):
    """Followers count of the most followed visible organization (0 if none)."""
    candidates = Organization.objects(metrics__followers__gt=0).visible()
    top = candidates.order_by('-metrics.followers').first()
    return top.metrics['followers'] if top else 0
def get_value(self):
    """Datasets count of the visible organization holding the most datasets."""
    candidates = Organization.objects(metrics__datasets__gt=0).visible()
    top = candidates.order_by('-metrics.datasets').first()
    if not top:
        return 0
    return top.metrics['datasets']
def get_value(self):
    """Number of organizations carrying the public service badge."""
    public_services = Organization.objects(badges__kind=PUBLIC_SERVICE)
    return public_services.count()
def get_value(self):
    """Highest datasets metric among visible organizations (0 if none)."""
    leader = (Organization.objects(metrics__datasets__gt=0)
              .visible()
              .order_by('-metrics.datasets')
              .first())
    if not leader:
        return 0
    return leader.metrics['datasets']
def get_context(self):
    """Add the user's organizations to the context and propagate the
    displayed user to every navbar entry."""
    context = super(UserView, self).get_context()
    memberships = Organization.objects(members__user=self.user)
    context["organizations"] = memberships
    # Each navbar entry builds its URL against the displayed user
    for entry in navbar.items:
        entry._args = {"user": self.user}
    return context
def serialize(cls, dataset):
    """Build the search-index document for a dataset.

    Returns a dict matching the index mapping: core metadata, a
    ``dataset_suggest`` completion payload, and optional temporal and
    spatial coverage sections used to weight the suggestions.
    """
    organization = None
    owner = None
    image_url = None
    spatial_weight = DEFAULT_SPATIAL_WEIGHT
    temporal_weight = DEFAULT_TEMPORAL_WEIGHT
    # A dataset is published either by an organization or a user;
    # the producer's image illustrates the completion payload.
    if dataset.organization:
        organization = Organization.objects(
            id=dataset.organization.id).first()
        image_url = organization.logo(40, external=True)
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
        image_url = owner.avatar(40, external=True)
    certified = organization and organization.certified
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': getattr(dataset.license, 'id', None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        # Tags duplicated to feed the tag completion field
        'tag_suggest': dataset.tags,
        'resources': [{
            'title': r.title,
            'description': r.description,
            'format': r.format,
        } for r in dataset.resources],
        'format_suggest': [r.format.lower()
                           for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        # Completion payload consumed by the suggest endpoint
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) + [dataset.id],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'acronym': dataset.acronym,
                'image_url': image_url,
            },
        },
        # Dates serialized without timezone information
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'featured': dataset.featured,
        'from_certified': certified,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        # Coverage boundaries as ordinal day numbers; longer coverage
        # (in years) yields more weight, capped at MAX_TEMPORAL_WEIGHT
        start = dataset.temporal_coverage.start.toordinal()
        end = dataset.temporal_coverage.end.toordinal()
        temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
        document.update({
            'temporal_coverage': {
                'start': start,
                'end': end
            },
            'temporal_weight': temporal_weight,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parent zones are indexed by id only, for filtering
        geozones.extend([{'id': p} for p in parents])
        # Finer administrative coverage yields a higher weight
        spatial_weight = ADMIN_LEVEL_MAX / coverage_level
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
            'spatial_weight': spatial_weight,
        })
    document['dataset_suggest']['weight'] = cls.get_suggest_weight(
        temporal_weight, spatial_weight, dataset.featured)
    # The acronym is an additional completion entry point
    if dataset.acronym:
        document['dataset_suggest']['input'].append(dataset.acronym)
    return document
def process(self, item):
    """Convert one harvested CKAN package into a udata Dataset.

    Fetches the package, maps its metadata, organization, license,
    extras and resources, and prunes resources removed upstream.
    Returns the (possibly new) Dataset, unsaved.

    Fixes over the previous revision:
    - the ``frequency`` extra used a Python 2 ``print`` statement
      (a syntax error under Python 3); it now logs via ``log.debug``
      like the other unhandled-extra branches;
    - ``type(data) == list`` replaced with ``isinstance``.
    """
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)
    # Some CKAN versions wrap the validated result in a list
    if isinstance(data, list):
        data = data[0]
    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']
    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)
    dataset = self.get_dataset(item.remote_id)
    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])
    # Detect Org: reuse an existing organization matching the acronym,
    # otherwise auto-create a minimal one from the CKAN metadata
    organization_acronym = data['organization']['name']
    orgObj = Organization.objects(acronym=organization_acronym).first()
    if orgObj:
        dataset.organization = orgObj
    else:
        orgObj = Organization()
        orgObj.acronym = organization_acronym
        orgObj.name = data['organization']['title']
        orgObj.description = data['organization']['description']
        orgObj.save()
        dataset.organization = orgObj
    # Detect license
    default_license = self.harvest_config.get('license', License.default())
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    # Tag the source portal's hostname for provenance
    dataset.tags.append(urlparse(self.source.url).hostname)
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.frequency = 'unknown'
    dataset.extras['ckan:name'] = data['name']
    temporal_start, temporal_end = None, None
    spatial_geom = None
    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            # Parsed for validation; the geometry itself is currently
            # not mapped onto the dataset
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
        # Update frequency
        elif extra['key'] == 'frequency':
            log.debug('frequency %s', extra['value'])
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            temporal_end = daterange_end(extra['value'])
            continue
        dataset.extras[extra['key']] = extra['value']
    # We don't want spatial to be added on harvester
    if self.harvest_config.get('geozones', False):
        dataset.spatial = SpatialCoverage()
        dataset.spatial.zones = []
        for zone in self.harvest_config.get('geozones'):
            geo_zone = GeoZone.objects.get(id=zone)
            dataset.spatial.zones.append(geo_zone)
    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )
    # Remote URL: keep the upstream URL when valid, otherwise fall
    # back to the harvester's canonical dataset URL
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['remote_url'] = self.dataset_url(data['name'])
            dataset.extras['ckan:source'] = data['url']
        else:
            dataset.extras['remote_url'] = url
    dataset.extras['harvest:name'] = self.source.name
    current_resources = [
        str(resource.id) for resource in dataset.resources
    ]
    fetched_resources = []
    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        # Ignore invalid Resources
        try:
            url = uris.validate(res['url'])
        except uris.ValidationError:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        fetched_resources.append(str(res['id']))
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    # Clean up old resources removed from source
    for resource_id in current_resources:
        if resource_id not in fetched_resources:
            try:
                resource = get_by(dataset.resources, 'id',
                                  UUID(resource_id))
            except Exception:
                log.error('Unable to parse resource ID %s', resource_id)
                continue
            else:
                if resource and not self.dryrun:
                    dataset.resources.remove(resource)
    return dataset