def merge_name(self, request, queryset_tags):
    """Admin action: merge the selected keywords into a single new one.

    On POST ('apply'), create (or reuse) the Tag named by the form, attach
    it to every dataset currently carrying one of the selected keywords and
    republish each dataset on CKAN. On the first CKAN failure the new tag
    is detached from the failing dataset and the merge is aborted (datasets
    already pushed keep the new tag). On success the old keywords are
    deleted. On GET (or invalid form) the confirmation template is rendered.
    """
    datasets = Dataset.objects.filter(
        keywords__in=queryset_tags).distinct()

    if 'apply' in request.POST:
        form = NewKeywordForm(request.POST)
        if form.is_valid():
            error = False
            name = form.cleaned_data.get('new_name')
            tag, _ = Tag.objects.get_or_create(name=name)
            # WIP
            for dataset in datasets:
                dataset.keywords.add(tag)
                ckan_id = str(dataset.ckan_id)
                qs_dataset_keywords = dataset.keywords.all().exclude(
                    id__in=queryset_tags)
                # Keep the dataset's other keywords and append the new tag.
                tags = [{'name': k.name} for k in qs_dataset_keywords]
                tags.append({'name': tag.name})
                # Lazy logger arguments: formatting only happens if emitted.
                logger.info('Update dataset %d with tags: %s',
                            dataset.pk, tags)
                try:
                    CkanHandler.publish_dataset(id=ckan_id, tags=tags)
                except CkanBaseError as e:
                    logger.exception(e)
                    error = True
                    # Roll back the new tag on the failing dataset and stop.
                    dataset.keywords.remove(tag)
                    break

            if error:
                messages.error(request, (
                    "Une erreur est survenue. "
                    "Veuillez contacter l'administrateur de la plateforme."))
            else:
                # Merge succeeded: drop every old keyword except the new tag.
                queryset_tags.exclude(pk=tag.pk).delete()
                messages.info(
                    request, "La mise à jour est effectuée avec succès.")
            return HttpResponseRedirect(request.get_full_path())
    else:  # request.GET
        form = NewKeywordForm()

    # then
    template_html = 'admin/idgo_admin/taggit_merge_name.html'
    context = {'form': form, 'tags': queryset_tags, 'datasets': datasets}
    return render(request, template_html, context=context)
def handle(self, *args, **options):
    """Management command: push every dataset's keywords to its CKAN package.

    Iterates all datasets that have at least one keyword and republishes
    each one on CKAN with its current tag list. Errors are deliberately
    logged and skipped so a single bad dataset does not stop the batch.
    """
    dataset_qs = Dataset.default.filter(
        keywords__isnull=False).distinct().order_by('id')
    total = dataset_qs.count()
    # `enumerate` replaces the original hand-rolled counter.
    for count, dataset in enumerate(dataset_qs, start=1):
        qs_dataset_keywords = dataset.keywords.all()
        ckan_id = str(dataset.ckan_id)
        # Lazy logger arguments: only formatted when the record is emitted.
        logger.info(
            "[%d/%d] - Synchronize Dataset %d (%s) with tags: '%s'.",
            count, total, dataset.pk, ckan_id,
            "', '".join(k.name for k in qs_dataset_keywords))
        try:
            CkanHandler.publish_dataset(id=ckan_id, tags=[{
                'name': k.name
            } for k in qs_dataset_keywords])
        except Exception as e:
            # Best-effort batch: log and continue (broad catch is deliberate).
            logger.exception(e)
            logger.warning("Error was ignored.")
def save(self, *args, harvest=True, **kwargs):
    """Save the RemoteCsw instance and harvest its CSW catalogue.

    Steps:
      (1) delete datasets no longer synchronized (update case) or validate
          the CSW URL (creation case);
      (2) save the instance;
      (3) when ``harvest`` is true and this is an update, harvest the remote
          catalogue inside a transaction, creating/updating a Dataset (and
          its Resources) per remote package, pushing metadata to GeoNetwork
          and publishing new packages on CKAN. On any failure, freshly
          created CKAN packages and GeoNetwork records are purged and
          CriticalError is raised.
    """
    Category = apps.get_model(app_label='idgo_admin', model_name='Category')
    Dataset = apps.get_model(app_label='idgo_admin', model_name='Dataset')
    License = apps.get_model(app_label='idgo_admin', model_name='License')
    Resource = apps.get_model(app_label='idgo_admin', model_name='Resource')
    ResourceFormats = apps.get_model(app_label='idgo_admin', model_name='ResourceFormats')

    # (1) Delete the datasets that are no longer synchronized
    previous = self.pk and RemoteCsw.objects.get(pk=self.pk)
    if previous:
        for dataset in Dataset.harvested_csw.filter(
                remote_instance=previous):
            dataset.delete()
    else:
        # On creation, check that the CSW URL is valid
        try:
            with CswBaseHandler(self.url):
                pass
        except CswBaseError as e:
            raise ValidationError(e.__str__(), code='url')

    # (2) Save the instance
    super().save(*args, **kwargs)

    # (3) Create/update the synchronized datasets
    # Retrieve from the `stack` the user performing the operation
    editor = None
    for entry in inspect.stack():
        try:
            editor = entry[0].f_locals['request'].user._wrapped
        except (KeyError, AttributeError):
            continue
        break

    if not previous:
        return

    if harvest:
        # Then harvest the catalogue
        try:
            ckan_ids = []
            geonet_ids = []
            with transaction.atomic():
                with CswBaseHandler(self.url) as csw:
                    packages = csw.get_packages(
                        xml=self.getrecords or None)
                for package in packages:
                    if not package['type'] == 'dataset':
                        continue
                    geonet_id = package['id']

                    # NOTE(review): this first assignment is dead code — it
                    # is immediately overwritten by the line below.
                    update_frequency = dict(Dataset.FREQUENCY_CHOICES).get(
                        package.get('frequency'), 'unknown')
                    update_frequency = package.get('frequency')
                    if not (update_frequency and update_frequency in dict(
                            Dataset.FREQUENCY_CHOICES).keys()):
                        update_frequency = 'unknown'

                    # Parse the three optional dates; a malformed value is
                    # logged and treated as missing.
                    date_creation = package.get('dataset_creation_date', None)
                    if date_creation:
                        try:
                            date_creation = datetime.strptime(
                                date_creation, ISOFORMAT_DATE)
                        except ValueError as e:
                            logger.warning(e)
                            date_creation = None
                    date_modification = package.get(
                        'dataset_modification_date', None)
                    if date_modification:
                        try:
                            date_modification = datetime.strptime(
                                date_modification, ISOFORMAT_DATE)
                        except ValueError as e:
                            logger.warning(e)
                            date_modification = None
                    date_publication = package.get(
                        'dataset_publication_date', None)
                    if date_publication:
                        try:
                            date_publication = datetime.strptime(
                                date_publication, ISOFORMAT_DATE)
                        except ValueError as e:
                            logger.warning(e)
                            date_publication = None

                    # License: match on slug, title or alternate titles,
                    # falling back to the configured default, then to any.
                    license_titles = package.get('license_titles')
                    filters = [
                        Q(slug__in=license_titles),
                        Q(title__in=license_titles),
                        Q(alternate_titles__overlap=license_titles),
                    ]
                    license = License.objects.filter(reduce(
                        ior, filters)).distinct().first()
                    if not license:
                        try:
                            license = License.objects.get(
                                slug=settings.DEFAULTS_VALUES.get(
                                    'LICENSE'))
                        except License.DoesNotExist:
                            license = License.objects.first()

                    # Push the metadata record to GeoNetwork
                    if not geonet.get_record(geonet_id):
                        try:
                            geonet.create_record(geonet_id, package['xml'])
                        except Exception as e:
                            logger.warning(
                                'La création de la fiche de métadonnées a échoué.'
                            )
                            logger.error(e)
                        else:
                            geonet_ids.append(geonet_id)
                            geonet.publish(
                                geonet_id)  # Always publish the record
                    else:
                        try:
                            geonet.update_record(geonet_id, package['xml'])
                        except Exception as e:
                            logger.warning(
                                'La mise à jour de la fiche de métadonnées a échoué.'
                            )
                            logger.error(e)

                    # Create/update the Dataset; the slug gets a short
                    # random prefix to avoid collisions.
                    slug = 'sync{}-{}'.format(
                        str(uuid.uuid4())[:7].lower(),
                        slugify(geonet_id))[:100]
                    kvp = {
                        'slug': slug,
                        'title': package.get('title'),
                        'description': package.get('notes'),
                        'date_creation': date_creation and date_creation.date(),
                        'date_modification': date_modification and date_modification.date(),
                        'date_publication': date_publication and date_publication.date(),
                        'editor': editor,
                        'license': license,
                        'owner_email': self.organisation.email or DEFAULT_CONTACT_EMAIL,
                        'owner_name': self.organisation.legal_name or DEFAULT_PLATFORM_NAME,
                        'organisation': self.organisation,
                        'published': not package.get('private'),
                        'remote_instance': self,
                        'remote_dataset': geonet_id,
                        'update_frequency': update_frequency,
                        'bbox': package.get('bbox'),
                        # broadcaster_email
                        # broadcaster_name
                        # data_type
                        # geocover
                        'geonet_id': geonet_id,
                        # granularity
                        # thumbnail
                        # support
                    }
                    dataset, created = Dataset.harvested_csw.update_or_create(
                        **kvp)
                    if created:
                        ckan_ids.append(dataset.ckan_id)

                    # Categories: match by slug, name, ISO topic (direct or
                    # via the reverse choices mapping) or alternate titles.
                    categories_name = [
                        m['name'] for m in package.get('groups', [])
                    ]
                    iso_topic_reverse = dict(
                        (v, k) for k, v in Category._meta.fields[5].choices)
                    filters = [
                        Q(slug__in=categories_name),
                        Q(name__in=categories_name),
                        Q(iso_topic__in=[
                            m['name'] for m in package.get('groups', [])
                        ]),
                        Q(iso_topic__in=[
                            iso_topic_reverse.get(name)
                            for name in categories_name
                        ]),
                        Q(alternate_titles__overlap=categories_name),
                    ]
                    categories = Category.objects.filter(
                        reduce(ior, filters)).distinct()
                    if categories:
                        dataset.categories.set(categories, clear=True)

                    # Keywords: reset on update, then re-add from the package.
                    if not created:
                        dataset.keywords.clear()
                    keywords = [
                        tag['display_name'] for tag in package.get('tags')
                    ]
                    dataset.keywords.add(*keywords)
                    dataset.save(current_user=None, synchronize=True,
                                 activate=False)

                    for resource in package.get('resources', []):
                        # NOTE(review): `uuid.uuid4()` cannot raise
                        # ValueError, so this except clause is dead; and a
                        # fresh random id means the `Resource.objects.get`
                        # below will virtually always create rather than
                        # update — TODO confirm this is intentional.
                        try:
                            ckan_id = uuid.uuid4()
                        except ValueError as e:
                            logger.exception(e)
                            logger.error(
                                "I can't crash here, so I do not pay any attention to this error."
                            )
                            continue
                        # Resolve the resource format from protocol and/or
                        # mimetype; an empty filter list raises TypeError,
                        # also handled as "no format".
                        filters = []
                        protocol = resource.get('protocol')
                        protocol and filters.append(Q(protocol=protocol))
                        mimetype = resource.get('mimetype')
                        mimetype and filters.append(
                            Q(mimetype__overlap=[mimetype]))
                        try:
                            format_type = ResourceFormats.objects.get(
                                reduce(iand, filters))
                        except (ResourceFormats.MultipleObjectsReturned,
                                ResourceFormats.DoesNotExist, TypeError):
                            format_type = None
                        kvp = {
                            'ckan_id': ckan_id,
                            'dataset': dataset,
                            'format_type': format_type,
                            'title': resource['name'] or resource['url'],
                            'referenced_url': resource['url'],
                        }
                        try:
                            resource = Resource.objects.get(
                                ckan_id=ckan_id)
                        except Resource.DoesNotExist:
                            resource = Resource.default.create(save_opts={
                                'current_user': editor,
                                'synchronize': True
                            }, **kvp)
                        else:
                            for k, v in kvp.items():
                                setattr(resource, k, v)
                            resource.save(current_user=editor,
                                          synchronize=True)
        except Exception as e:
            # Harvest failed: purge everything created during this run.
            for id in ckan_ids:
                logger.warning(
                    'Delete CKAN package : {id}.'.format(id=str(id)))
                CkanHandler.purge_dataset(str(id))
            for id in geonet_ids:
                logger.warning('Delete MD : {id}.'.format(id=str(id)))
                geonet.delete_record(id)
            logger.error(e)
            raise CriticalError()
        else:
            # Harvest succeeded: activate the newly created CKAN packages.
            for id in ckan_ids:
                CkanHandler.publish_dataset(id=str(id), state='active')
def save(self, *args, **kwargs):
    """Save the RemoteCkan instance and harvest the selected organisations.

    Steps:
      (1) delete datasets belonging to organisations removed from
          ``sync_with`` (update case) or validate the CKAN URL (creation);
      (2) save the instance;
      (3) for each organisation in ``sync_with``, harvest its packages
          inside a transaction, creating/updating a Dataset (and its
          Resources) per remote package. On any failure, freshly harvested
          CKAN packages are purged and CriticalError is raised.
    """
    Category = apps.get_model(app_label='idgo_admin', model_name='Category')
    Dataset = apps.get_model(app_label='idgo_admin', model_name='Dataset')
    License = apps.get_model(app_label='idgo_admin', model_name='License')
    Resource = apps.get_model(app_label='idgo_admin', model_name='Resource')
    ResourceFormats = apps.get_model(app_label='idgo_admin', model_name='ResourceFormats')

    # (1) Delete the datasets that are no longer synchronized
    previous = self.pk and RemoteCkan.objects.get(pk=self.pk)
    if previous:
        # Organisations present before but removed from the new selection.
        remote_organisation__in = [
            x for x in (previous.sync_with or [])
            if x not in (self.sync_with or [])
        ]
        filter = {
            'remote_instance': previous,
            'remote_organisation__in': remote_organisation__in,
        }
        # TODO: 'Dataset.harvested_ckan.filter(**filter).delete()' ne fonctionne pas
        for dataset in Dataset.harvested_ckan.filter(**filter):
            dataset.delete()
    else:
        # On creation, check that the CKAN URL is valid
        try:
            with CkanBaseHandler(self.url):
                pass
        except CkanBaseError as e:
            raise ValidationError(e.__str__(), code='url')

    # (2) Save the instance
    super().save(*args, **kwargs)

    # (3) Create/update the synchronized datasets
    # Retrieve from the `stack` the user performing the operation,
    # falling back to the default platform user.
    editor = User.objects.get(pk=DEFAULT_USER_ID)
    for entry in inspect.stack():
        try:
            editor = entry[0].f_locals['request'].user._wrapped
        except (KeyError, AttributeError):
            continue
        break

    # Then harvest the catalogue
    if self.sync_with:
        try:
            ckan_ids = []
            with transaction.atomic():
                # TODO: Factoriser
                for value in self.sync_with:
                    with CkanBaseHandler(self.url) as ckan:
                        ckan_organisation = ckan.get_organisation(
                            value, include_datasets=True,
                            include_groups=True, include_tags=True)
                    if not ckan_organisation.get('package_count', 0):
                        continue
                    for package in ckan_organisation.get('packages'):
                        if not package['state'] == 'active' \
                                or not package['type'] == 'dataset':
                            continue
                        # Re-fetch the full package detail.
                        with CkanBaseHandler(self.url) as ckan:
                            package = ckan.get_package(package['id'])
                        ckan_id = uuid.UUID(package['id'])

                        # NOTE(review): this first assignment is dead code —
                        # it is immediately overwritten by the line below.
                        update_frequency = dict(
                            Dataset.FREQUENCY_CHOICES).get(
                                package.get('frequency'), 'unknown')
                        update_frequency = package.get('frequency')
                        if not (update_frequency and update_frequency in
                                dict(Dataset.FREQUENCY_CHOICES).keys()):
                            update_frequency = 'unknown'

                        metadata_created = package.get(
                            'metadata_created', None)
                        if metadata_created:
                            metadata_created = datetime.strptime(
                                metadata_created, ISOFORMAT_DATETIME)
                        metadata_modified = package.get(
                            'metadata_modified', None)
                        if metadata_modified:
                            metadata_modified = datetime.strptime(
                                metadata_modified, ISOFORMAT_DATETIME)

                        # Map the remote license through MappingLicence,
                        # falling back to the 'other-at' license.
                        try:
                            mapping_licence = MappingLicence.objects.get(
                                remote_ckan=self,
                                slug=package.get('license_id'))
                        except MappingLicence.DoesNotExist:
                            try:
                                license = License.objects.get(
                                    slug='other-at')
                            # NOTE(review): this clause looks wrong —
                            # `License.objects.get` raises
                            # `License.DoesNotExist`, not
                            # `MappingLicence.DoesNotExist`; TODO confirm.
                            except MappingLicence.DoesNotExist:
                                license = None
                            else:
                                logger.warning("'{}' non trouvé".format(
                                    package.get('license_id')))
                                # NOTE(review): `mapping_licence` is unbound
                                # on this path (its lookup just failed) —
                                # this would raise NameError; TODO confirm
                                # against upstream source.
                                license = mapping_licence.licence

                        slug = 'sync{}-{}'.format(
                            str(uuid.uuid4())[:7].lower(),
                            package.get('name'))[:100]
                        kvp = {
                            'slug': slug,
                            'title': package.get('title'),
                            'description': package.get('notes'),
                            'date_creation': metadata_created and metadata_created.date(),
                            'date_modification': metadata_modified and metadata_modified.date(),
                            # date_publication
                            'editor': editor,
                            'license': license,
                            'owner_email': self.organisation.email or DEFAULT_CONTACT_EMAIL,
                            'owner_name': self.organisation.legal_name or DEFAULT_PLATFORM_NAME,
                            'organisation': self.organisation,
                            'published': not package.get('private'),
                            'remote_instance': self,
                            'remote_dataset': ckan_id,
                            'remote_organisation': value,
                            'update_frequency': update_frequency,
                            # bbox
                            # broadcaster_email
                            # broadcaster_name
                            # data_type
                            # geocover
                            # geonet_id
                            # granularity
                            # thumbnail
                            # support
                        }
                        dataset, created = Dataset.harvested_ckan.update_or_create(
                            **kvp)
                        mapping_categories = MappingCategory.objects.filter(
                            remote_ckan=self, slug__in=[
                                m['name'] for m in package.get('groups', [])
                            ])
                        if mapping_categories:
                            # NOTE(review): direct assignment to a
                            # many-to-many was removed in Django 1.10+;
                            # sibling code uses `.set(...)` — TODO confirm
                            # the Django version in use.
                            dataset.categories = set(
                                mc.category for mc in mapping_categories)
                        # Keywords: reset on update, then re-add.
                        if not created:
                            dataset.keywords.clear()
                        keywords = [
                            tag['display_name']
                            for tag in package.get('tags')
                        ]
                        dataset.keywords.add(*keywords)
                        dataset.save(current_user=None, synchronize=True,
                                     activate=False)
                        ckan_ids.append(dataset.ckan_id)

                        for resource in package.get('resources', []):
                            try:
                                ckan_id = uuid.UUID(resource['id'])
                            except ValueError as e:
                                logger.exception(e)
                                logger.error(
                                    "I can't crash here, so I do not pay any attention to this error."
                                )
                                continue
                            # Resolve the resource format from the CKAN
                            # format string; unresolved means no format.
                            try:
                                ckan_format = resource['format'].upper()
                                format_type = ResourceFormats.objects.get(
                                    ckan_format=ckan_format)
                            except (ResourceFormats.
                                    MultipleObjectsReturned,
                                    ResourceFormats.DoesNotExist,
                                    TypeError) as e:
                                logger.exception(e)
                                logger.error(
                                    "I can't crash here, so I do not pay any attention to this error."
                                )
                                format_type = None
                            kvp = {
                                'ckan_id': ckan_id,
                                'dataset': dataset,
                                'format_type': format_type,
                                'title': resource['name'],
                                'referenced_url': resource['url'],
                            }
                            # Create the Resource, or update it in place
                            # when it already exists.
                            try:
                                resource = Resource.objects.get(
                                    ckan_id=ckan_id)
                            except Resource.DoesNotExist:
                                resource = Resource.default.create(
                                    save_opts={
                                        'current_user': None,
                                        'synchronize': True
                                    }, **kvp)
                            else:
                                for k, v in kvp.items():
                                    setattr(resource, k, v)
                                resource.save(current_user=None,
                                              synchronize=True)
        except Exception as e:
            # Harvest failed: purge everything created during this run.
            for id in ckan_ids:
                CkanHandler.purge_dataset(str(id))
            logger.error(e)
            raise CriticalError()
        else:
            # Harvest succeeded: activate the harvested CKAN packages.
            for id in ckan_ids:
                CkanHandler.publish_dataset(id=str(id), state='active')
def synchronize(self, with_user=None, activate=None):
    """Synchronize the dataset with the CKAN instance.

    Builds the full CKAN package payload from the dataset's fields and
    publishes it, either as the given user (``with_user``, via that user's
    API key) or as the platform itself. ``activate`` (when not None)
    forces the package state to 'active'/'deleted'.
    """
    # CKAN package identifier:
    id = self.ckan_id and str(self.ckan_id) or None
    # If the value is `None`, this is a creation.

    # Package properties
    # ==================
    datatype = [item.slug for item in self.data_type.all()]

    date_creation = self.date_creation and str(self.date_creation) or ''
    date_modification = self.date_modification and str(
        self.date_modification) or ''
    date_publication = self.date_publication and str(
        self.date_publication) or ''

    broadcaster_name = self.broadcaster_name or \
        self.support and self.support.name or DEFAULT_PLATFORM_NAME
    broadcaster_email = self.broadcaster_email or \
        self.support and self.support.email or DEFAULT_CONTACT_EMAIL

    geocover = self.geocover or ''
    granularity = self.granularity and self.granularity.slug or ''

    # Only send a license id CKAN actually knows about.
    licenses = [license['id'] for license in CkanHandler.get_licenses()]
    if self.license and self.license.ckan_id in licenses:
        license_id = self.license.ckan_id
    else:
        license_id = ''

    ows = False
    Resource = apps.get_model(app_label='idgo_admin', model_name='Resource')
    for resource in Resource.objects.filter(dataset=self):
        # NOTE(review): this keeps only the LAST resource's flag; if the
        # intent is "any resource exposes OGC services", this should
        # accumulate (e.g. `ows = ows or ...`) — TODO confirm.
        ows = resource.ogc_services

    spatial = self.bbox and self.bbox.geojson or ''
    support = self.support and self.support.slug or ''
    tags = [{'name': keyword.name} for keyword in self.keywords.all()]

    try:
        thumbnail = urljoin(DOMAIN_NAME, self.thumbnail.url)
    except ValueError:
        thumbnail = ''

    # Check whether the dataset is the particular case of a
    # harvested CKAN/CSW/DCAT dataset
    remote_url = None
    if ENABLE_CKAN_HARVESTER:
        # (1) CKAN
        RemoteCkanDataset = apps.get_model(
            app_label='idgo_admin', model_name='RemoteCkanDataset')
        try:
            remote_dataset = RemoteCkanDataset.objects.get(dataset=self)
        except RemoteCkanDataset.DoesNotExist:
            pass
        else:
            remote_url = remote_dataset.url
    if ENABLE_CSW_HARVESTER:
        # (2) CSW
        RemoteCswDataset = apps.get_model(
            app_label='idgo_admin', model_name='RemoteCswDataset')
        try:
            remote_dataset = RemoteCswDataset.objects.get(dataset=self)
        except RemoteCswDataset.DoesNotExist:
            pass
        else:
            remote_url = remote_dataset.url
    if ENABLE_DCAT_HARVESTER:
        # (3) DCAT
        RemoteDcatDataset = apps.get_model(
            app_label='idgo_admin', model_name='RemoteDcatDataset')
        try:
            remote_dataset = RemoteDcatDataset.objects.get(dataset=self)
        except RemoteDcatDataset.DoesNotExist:
            pass
        else:
            remote_url = remote_dataset.url

    data = {
        'author': self.owner_name,
        'author_email': self.owner_email,
        'datatype': datatype,
        'dataset_creation_date': date_creation,
        'dataset_modification_date': date_modification,
        'dataset_publication_date': date_publication,
        'frequency': self.update_frequency or 'unknown',
        'geocover': geocover,
        'granularity': granularity,
        'groups': [],
        'inspire_url': self.geonet_url,
        'license_id': license_id,
        'maintainer': broadcaster_name,
        'maintainer_email': broadcaster_email,
        'name': self.slug,
        'notes': self.description,
        'owner_org': str(self.organisation.ckan_id),
        'ows': str(ows),  # IMPORTANT
        'private': self.private,
        'remote_url': remote_url or '',
        'spatial': spatial,
        'support': support,
        'tags': tags,
        'title': self.title,
        'thumbnail': thumbnail,
        'url': '',  # IMPORTANT
    }

    if activate is not None:
        data['state'] = activate and 'active' or 'deleted'

    # Synchronize the categories:
    for category in self.categories.all():
        data['groups'].append({'name': category.slug})

    organisation_id = str(self.organisation.ckan_id)
    # Synchronize the organisation; if the organisation
    # does not exist it has to be created
    ckan_organisation = CkanHandler.get_organisation(organisation_id)
    if not ckan_organisation:
        CkanHandler.add_organisation(self.organisation)
    # and if the organisation is deactivated it has to be activated
    elif ckan_organisation.get('state') == 'deleted':
        CkanHandler.activate_organisation(organisation_id)

    if with_user:
        username = with_user.username
        # TODO: Doing this systematically is heavy — see how to improve it
        CkanHandler.add_user_to_organisation(username, organisation_id)
        for category in self.categories.all():
            category_id = str(category.ckan_id)
            CkanHandler.add_user_to_group(username, category_id)
        #
        # Publish as the user, with the user's own API key.
        apikey = CkanHandler.get_user(username)['apikey']
        with CkanUserHandler(apikey=apikey) as ckan_user:
            return ckan_user.publish_dataset(id=id, **data)
    else:
        return CkanHandler.publish_dataset(id=id, **data)