def get_dataset(self, remote_id):
    '''Get or create a dataset given its remote ID (and its source)'''
    dataset = Dataset.objects(__raw__={
        'extras.harvest:remote_id': remote_id,
        'extras.harvest:domain': self.source.domain,
    }).first()
    return dataset or Dataset()

def get_context(self):
    context = super(OrganizationDetailView, self).get_context()
    can_edit = EditOrganizationPermission(self.organization)
    can_view = OrganizationPrivatePermission(self.organization)
    if self.organization.deleted and not can_view.can():
        abort(410)
    datasets = Dataset.objects(organization=self.organization).order_by(
        '-temporal_coverage.end', '-metrics.reuses', '-metrics.followers'
    ).visible()
    reuses = Reuse.objects(organization=self.organization).order_by(
        '-metrics.reuses', '-metrics.followers'
    ).visible()
    followers = (Follow.objects.followers(self.organization)
                 .order_by('follower.fullname'))
    context.update({
        'reuses': reuses.paginate(1, self.page_size),
        'datasets': datasets.paginate(1, self.page_size),
        'followers': followers,
        'can_edit': can_edit,
        'can_view': can_view,
        'private_reuses': (
            list(Reuse.objects(organization=self.object).hidden())
            if can_view else []),
        'private_datasets': (
            list(Dataset.objects(organization=self.object).hidden())
            if can_view else []),
    })
    return context

def from_organizations(self, user, *organizations):
    from udata.models import Dataset, Reuse  # Circular imports.
    Qs = db.Q()
    for dataset in Dataset.objects(owner=user).visible():
        Qs |= db.Q(subject=dataset)
    for org in organizations:
        for dataset in Dataset.objects(organization=org).visible():
            Qs |= db.Q(subject=dataset)
    for reuse in Reuse.objects.owned_by(*[user.id] + list(organizations)):
        Qs |= db.Q(subject=reuse)
    return self(Qs)

def get_dataset(self, remote_id):
    '''Get or create a dataset given its remote ID (and its source)

    We also match on `source_id` so the lookup is independent
    of the source domain.
    '''
    dataset = Dataset.objects(__raw__={
        'extras.harvest:remote_id': remote_id,
        '$or': [
            {'extras.harvest:domain': self.source.domain},
            {'extras.harvest:source_id': str(self.source.id)},
        ],
    }).first()
    return dataset or Dataset()

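# For context, a hedged sketch (not udata's actual harvest code) of the
# extras this lookup matches on. The key names appear in the harvest tests
# later in this collection; `source` and the helper itself are illustrative
# assumptions:
def mark_harvested(dataset, source, remote_id):
    dataset.extras['harvest:remote_id'] = remote_id       # ID on the remote source
    dataset.extras['harvest:domain'] = source.domain      # legacy, domain-bound key
    dataset.extras['harvest:source_id'] = str(source.id)  # domain-independent key
    return dataset
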
def get(self, level):
    pipeline = [
        {'$project': {'territory': '$spatial.territories'}},
        {'$unwind': '$territory'},
        {'$match': {'territory.level': level}},
        {'$group': {'_id': '$territory.id', 'count': {'$sum': 1}}},
    ]
    features = []
    for row in (Dataset.objects(spatial__territories__level=level)
                .visible().aggregate(*pipeline)):
        territory = Territory.objects.get(id=row['_id'])
        features.append({
            'id': str(territory.id),
            'type': 'Feature',
            'geometry': territory.geom,
            'properties': {
                'name': territory.name,
                'code': territory.code,
                'level': territory.level,
                'datasets': row['count'],
            },
        })
    return {
        'type': 'FeatureCollection',
        'features': features,
    }

def openfield16():
    datasets = (Dataset.objects(badges__kind=OPENFIELD16).visible()
                .order_by('-metrics.followers'))
    return theme.render('openfield16.html',
                        datasets=datasets,
                        badge=OPENFIELD16,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)

def test_purge_organizations(self):
    with self.autoindex():
        org = Organization.objects.create(name='delete me',
                                          description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)

        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(
            url_for('api.organization_logo', org=org),
            {'file': (file, 'test.png')}, json=False)
        self.assert200(response)

        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)

        tasks.purge_organizations()

        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)

def remote_reuses(self):
    # dataset_ids = (d.ext['harvest'].remote_id
    #                for d in Dataset.objects(ext__harvest__harvester=self.harvester.id))
    # response = self.get('package_list')
    # for dataset_id in response['result']:
    for dataset in Dataset.objects(
            ext__harvest__harvester=self.harvester.id).timeout(False):
        try:
            resp = self.get('related_list',
                            {'id': dataset.ext['harvest'].remote_id})
        except Exception:
            log.error('Unable to parse reuse for dataset %s', dataset.id)
            continue
        for details in resp['result']:
            reuse_url = details['url']
            urlhash = Reuse.hash_url(reuse_url)
            reuse, _ = Reuse.objects.get_or_create(urlhash=urlhash,
                                                   auto_save=False)
            reuse.url = reuse_url
            reuse.title = details['title']
            reuse.description = details['description']
            reuse.type = details['type']
            # reuse.url = details['url']
            reuse.image_url = details.get('image_url')
            reuse.featured = bool(details.get('featured', False))
            reuse.created_at = parse(details['created'])
            if details.get('owner_id'):
                reuse.owner = self.get_harvested(User, details['owner_id'])
            if dataset not in reuse.datasets:
                reuse.datasets.append(dataset)
            for tag in dataset.tags:
                if tag not in reuse.tags:
                    reuse.tags.append(tag)
            yield reuse

def nec_mergitur():
    datasets = (Dataset.objects(badges__kind=NECMERGITUR).visible()
                .order_by('-metrics.followers'))
    return theme.render('nec_mergitur.html',
                        datasets=datasets,
                        badge=NECMERGITUR,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)

def get_context(self):
    context = super(OrganizationDetailView, self).get_context()
    org_id = str(self.organization.id)
    datasets, supplied_datasets, reuses = search.multiquery(
        search.SearchQuery(Dataset, sort='-created',
                           organization=org_id, page_size=9),
        search.SearchQuery(Dataset, sort='-created',
                           supplier=org_id, page_size=9),
        search.SearchQuery(Reuse, sort='-created',
                           organization=org_id, page_size=9),
    )
    followers = (FollowOrg.objects.followers(self.organization)
                 .order_by('follower.fullname'))
    can_edit = EditOrganizationPermission(self.organization.id)
    context.update({
        'reuses': reuses,
        'datasets': datasets,
        'supplied_datasets': supplied_datasets,
        'followers': followers[:self.nb_followers],
        'can_edit': can_edit,
    })
    if can_edit:
        context.update({
            'private_reuses': list(
                Reuse.objects(organization=self.object, private=True)),
            'private_datasets': list(
                Dataset.objects(organization=self.object, private=True)),
        })
    return context

def get(self, level):
    '''List each zone for a given level with their datasets count'''
    level = GeoLevel.objects.get_or_404(id=level)
    features = []
    for zone in GeoZone.objects(level=level.id):
        # fetch nested levels IDs
        ids = GeoZone.objects(parents=zone.id).only('id').distinct('id')
        ids.append(zone.id)
        # Count datasets in zone
        nb_datasets = Dataset.objects(spatial__zones__in=ids).count()
        features.append({
            'id': zone.id,
            'type': 'Feature',
            'geometry': zone.geom,
            'properties': {
                'name': _(zone.name),
                'code': zone.code,
                'level': zone.level,
                'datasets': nb_datasets,
            },
        })
    return {
        'type': 'FeatureCollection',
        'features': features,
    }

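# Shape of one entry in the FeatureCollection returned above (field names
# come from the code; the values shown are hypothetical placeholders):
#
#     {
#         'id': 'fr:departement:33',
#         'type': 'Feature',
#         'geometry': {...},
#         'properties': {'name': 'Gironde', 'code': '33',
#                        'level': 'fr:departement', 'datasets': 42},
#     }
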
def get_value(self):
    ids = itertools.chain(*[
        [r.id for r in d.resources]
        for d in (Dataset.objects(organization=self.target)
                  .only('resources') or [])
    ])
    return int(Metrics.objects(object_id__in=ids, level='daily')
               .sum('values.nb_uniq_visitors'))

def get_context(self):
    context = super(OrganizationIssuesView, self).get_context()
    datasets = Dataset.objects(organization=self.organization)
    reuses = Reuse.objects(organization=self.organization)
    ids = [o.id for o in list(datasets) + list(reuses)]
    context['issues'] = Issue.objects(subject__in=ids)
    return context

def check_availability(self):
    from udata.models import Dataset  # Circular imports.
    # Performance: only check the first 20 datasets for now.
    return chain(*[
        dataset.check_availability()
        for dataset in Dataset.objects(organization=self).visible()[:20]
    ])

def migrate(db):
    log.info('Processing resources.')

    datasets = Dataset.objects().no_cache().timeout(False)
    for dataset in datasets:
        save_res = False
        for resource in dataset.resources:
            if resource.url.startswith('https://static.data.gouv.fr'):
                parsed = urlparse(resource.url)
                # Note: str.strip() treats its argument as a character set,
                # so path.strip('/resource/') would also eat matching
                # characters at the edges of the filename itself; remove
                # the prefix explicitly instead.
                fs_name = parsed.path.replace('/resource/', '', 1)
                resource.fs_filename = fs_name
                save_res = True
        if save_res:
            try:
                dataset.save()
            except Exception as e:
                log.warning(e)

    log.info('Processing community resources.')

    community_resources = CommunityResource.objects().no_cache().timeout(False)
    for community_resource in community_resources:
        parsed = urlparse(community_resource.url)
        fs_name = parsed.path.replace('/resource/', '', 1)
        community_resource.fs_filename = fs_name
        try:
            community_resource.save()
        except Exception as e:
            log.warning(e)

    log.info('Completed.')

def aggregate_datasets_daily(org, day):
    keys = ['datasets_{0}'.format(k) for k in KEYS]
    ids = [d.id for d in Dataset.objects(organization=org).only('id')]
    metrics = Metrics.objects(object_id__in=ids, level='daily',
                              date=day.isoformat())
    values = [int(metrics.sum('values.{0}'.format(k))) for k in KEYS]
    return Metrics.objects.update_daily(org, day, **dict(zip(keys, values)))

def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info(f'Purging organization {organization}')
        # Remove followers
        Follow.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove transfers
        Transfer.objects(recipient=organization).delete()
        Transfer.objects(owner=organization).delete()
        # Store datasets for later reindexation
        d_ids = [d.id for d in Dataset.objects(organization=organization)]
        # Remove organization's logo in all sizes
        if organization.logo.filename is not None:
            storage = storages.avatars
            storage.delete(organization.logo.filename)
            storage.delete(organization.logo.original)
            for thumbnail in organization.logo.thumbnails.values():
                storage.delete(thumbnail)
        # Remove
        organization.delete()
        # Reindex the datasets that were linked to the organization
        for id in d_ids:
            reindex(Dataset.__name__, str(id))

def test_default(self):
    org = OrganizationFactory()
    source = HarvestSourceFactory(backend='factory', organization=org)

    with self.assert_emit(signals.before_harvest_job,
                          signals.after_harvest_job):
        self.action(source.slug)

    source.reload()

    self.assertEqual(len(HarvestJob.objects(source=source)), 1)

    job = source.get_last_job()
    self.assertEqual(job.status, 'done')
    self.assertEqual(job.errors, [])
    self.assertIsNotNone(job.started)
    self.assertIsNotNone(job.ended)
    self.assertEqual(len(job.items), COUNT)

    for item in job.items:
        self.assertEqual(item.status, 'done')
        self.assertEqual(item.errors, [])
        self.assertIsNotNone(item.started)
        self.assertIsNotNone(item.ended)
        self.assertIsNotNone(item.dataset)

        dataset = item.dataset
        self.assertIsNotNone(Dataset.objects(id=dataset.id).first())
        self.assertEqual(dataset.organization, org)
        self.assertIn('harvest:remote_id', dataset.extras)
        self.assertIn('harvest:last_update', dataset.extras)
        self.assertIn('harvest:source_id', dataset.extras)

    self.assertEqual(len(HarvestJob.objects), 1)
    self.assertEqual(len(Dataset.objects), COUNT)

def test_default(self):
    org = OrganizationFactory()
    source = HarvestSourceFactory(backend='factory', organization=org)

    with assert_emit(signals.before_harvest_job, signals.after_harvest_job):
        self.action(source.slug)

    source.reload()

    assert len(HarvestJob.objects(source=source)) == 1

    job = source.get_last_job()
    assert job.status == 'done'
    assert job.errors == []
    assert job.started is not None
    assert job.ended is not None
    assert len(job.items) == COUNT

    for item in job.items:
        assert item.status == 'done'
        assert item.errors == []
        assert item.started is not None
        assert item.ended is not None
        assert item.dataset is not None

        dataset = item.dataset
        assert Dataset.objects(id=dataset.id).first() is not None
        assert dataset.organization == org
        assert 'harvest:remote_id' in dataset.extras
        assert 'harvest:last_update' in dataset.extras
        assert 'harvest:source_id' in dataset.extras

    assert len(HarvestJob.objects) == 1
    assert len(Dataset.objects) == COUNT

def get_value(self):
    ids = [
        d.id for d in
        (Dataset.objects(organization=self.target).only('id') or [])
    ]
    return int(Metrics.objects(object_id__in=ids, level='daily')
               .sum('values.nb_uniq_visitors'))

def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(organization))
        # Remove followers
        Follow.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove metrics
        Metrics.objects(object_id=organization.id).delete()
        # Store datasets for later reindexation
        d_ids = [d.id for d in Dataset.objects(organization=organization)]
        # Remove
        organization.delete()
        # Reindex the datasets that were linked to the organization
        for dataset in Dataset.objects(id__in=d_ids):
            reindex(dataset)

def climate_change_challenge():
    partners = Organization.objects(slug__in=C3_PARTNERS)
    datasets = (Dataset.objects(badges__kind=C3).visible()
                .order_by('-metrics.followers'))
    return theme.render('c3.html',
                        partners=partners,
                        datasets=datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)

def dadosGovOld_API(org_slug, file_id):
    format = ('json' if request.args.get('format', 'xml').lower() == 'json'
              else 'xml')
    dataset = Dataset.objects(
        __raw__={'extras.harvest:remote_id': file_id}).first()
    if dataset:
        for resource in dataset.resources:
            if resource.format == format:
                return redirect(resource.url)
    # Everything else returns 404
    return abort(404)

def mongo_search(cls, args):
    datasets = Dataset.objects(archived=None, deleted=None, private=False)
    datasets = DatasetApiParser.parse_filters(datasets, args)
    sort = (cls.parse_sort(args['sort'])
            or ('$text_score' if args['q'] else None)
            or DEFAULT_SORTING)
    offset = (args['page'] - 1) * args['page_size']
    return (datasets.order_by(sort).skip(offset).limit(args['page_size']),
            datasets.count())

def c3_badges(filename):
    '''Toggle C3 badges from a list of dataset titles'''
    with open(filename, 'r') as titles:
        user = User.objects(first_name='Etalab', last_name='Bot').first()
        badge = DatasetBadge(kind=C3, created_by=user)
        for title in titles:
            title = title.decode('utf-8').strip(u'\n')
            if title.startswith(u'*'):
                continue
            slug = slugify.slugify(title.lower())
            dataset = (Dataset.objects(title=title).first()
                       or Dataset.objects(slug=slug).first())
            if dataset is None:
                log.info(u'{title} not found'.format(title=title))
            else:
                dataset.badges.append(badge)
                dataset.save()
    log.info('Done')

def render_territory(territory):
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return abort(404)

    is_present_territory = territory.valid_at(date.today())

    # Retrieve the present territory if not presently valid.
    present_territory = None
    if not is_present_territory:
        present_territory = GeoZone.objects.valid_at(date.today()).get(
            level=territory.level, ancestors__contains=territory.id)

    # Only display dynamic datasets for present territories.
    base_datasets = []
    if is_present_territory:
        DATASETS = TERRITORY_DATASETS[territory.level_code]
        base_dataset_classes = sorted(DATASETS.values(), key=lambda a: a.order)
        base_datasets = [
            base_dataset_class(territory)
            for base_dataset_class in base_dataset_classes
        ]

    territories = [territory]

    # Deal with territories with ancestors.
    for ancestor_object in territory.ancestors_objects:
        territories.append(ancestor_object)

    # Retrieve all datasets, then split between those optionally owned
    # by an org for that zone and the others. We need to know if the
    # current user has datasets for that zone in order to display a
    # custom message to ease the conversion.
    datasets = Dataset.objects(spatial__zones__in=territories).visible()
    # Retrieving datasets from old regions.
    territory_datasets = []
    other_datasets = []
    editable_datasets = []
    if datasets:
        for dataset in datasets:
            if (dataset.organization
                    and territory.id == dataset.organization.zone):
                territory_datasets.append(dataset)
            else:
                other_datasets.append(dataset)
            editable_datasets.append(current_user.is_authenticated
                                     and DatasetEditPermission(dataset).can())

    context = {
        'territory': territory,
        'present_territory': present_territory,
        'base_datasets': base_datasets,
        'other_datasets': other_datasets,
        'has_pertinent_datasets': any(editable_datasets),
        'territory_datasets': territory_datasets,
    }
    template = 'territories/{level_name}.html'.format(
        level_name=territory.level_name)
    return theme.render(template, **context)

def resource_redirect(id):
    '''
    Redirect to the latest version of a resource given its identifier.
    '''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        resource = get_by(dataset.resources, 'id', id)
    else:
        resource = CommunityResource.objects(id=id).first()
    return redirect(resource.url.strip()) if resource else abort(404)

def rdf_catalog_format(org, format):
    if org.deleted:
        abort(410)
    params = multi_to_dict(request.args)
    page = int(params.get('page', 1))
    page_size = int(params.get('page_size', 100))
    datasets = Dataset.objects(organization=org).visible().paginate(
        page, page_size)
    catalog = build_org_catalog(org, datasets, format=format)
    return graph_response(catalog, format)

def test_purge_organizations(self):
    org = Organization.objects.create(name='delete me', description='XXX')
    resources = [ResourceFactory() for _ in range(2)]
    dataset = DatasetFactory(resources=resources, organization=org)

    # Upload organization's logo
    file = create_test_image()
    user = AdminFactory()
    self.login(user)
    response = self.post(
        url_for('api.organization_logo', org=org),
        {'file': (file, 'test.png')}, json=False)
    self.assert200(response)

    transfer_to_org = Transfer.objects.create(
        owner=user,
        recipient=org,
        subject=dataset,
        comment='comment',
    )
    transfer_from_org = Transfer.objects.create(
        owner=org,
        recipient=user,
        subject=dataset,
        comment='comment',
    )
    oauth_client = OAuth2Client.objects.create(
        name='test-client',
        owner=user,
        organization=org,
        redirect_uris=['https://test.org/callback'],
    )

    # Delete organization
    response = self.delete(url_for('api.organization', org=org))
    self.assert204(response)

    tasks.purge_organizations()

    oauth_client.reload()
    assert oauth_client.organization is None
    assert Transfer.objects.filter(id=transfer_from_org.id).count() == 0
    assert Transfer.objects.filter(id=transfer_to_org.id).count() == 0

    # Check organization's logo is deleted
    self.assertEqual(list(storages.avatars.list_files()), [])

    dataset = Dataset.objects(id=dataset.id).first()
    self.assertIsNone(dataset.organization)

    organization = Organization.objects(name='delete me').first()
    self.assertIsNone(organization)

def attach(domain, filename):
    '''Attach existing datasets to their harvest remote id before harvesting.

    The expected CSV file format is the following:
    - a column with header "local" and the local IDs or slugs
    - a column with header "remote" and the remote IDs

    The delimiter should be ";". Column order and extra columns do not matter.
    '''
    count = 0
    errors = 0
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=b';', quotechar=b'"')
        for row in reader:
            try:
                dataset = Dataset.objects.get(id=ObjectId(row['local']))
            except Exception:  # noqa (Never stop on failure)
                log.warning('Unable to attach dataset : %s', row['local'])
                errors += 1
                continue

            # Detach previously attached dataset
            Dataset.objects(**{
                'extras__harvest:domain': domain,
                'extras__harvest:remote_id': row['remote'],
            }).update(**{
                'unset__extras__harvest:domain': True,
                'unset__extras__harvest:remote_id': True,
            })

            dataset.extras['harvest:domain'] = domain
            dataset.extras['harvest:remote_id'] = row['remote']
            dataset.last_modified = datetime.now()
            dataset.save()
            count += 1
    return AttachResult(count, errors)

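# A minimal CSV accepted by attach() per the docstring above. The IDs are
# hypothetical placeholders; note that, despite the docstring mentioning
# slugs, the code resolves the "local" column through ObjectId():
#
#     local;remote
#     5b7361d8b7cd35629a81f374;remote-dataset-1
#     5b7361d8b7cd35629a81f375;remote-dataset-2
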
def explore():
    recent_datasets = list(Dataset.objects.visible()
                           .order_by('-date').limit(9))
    recent_reuses = list(Reuse.objects.order_by('-date').limit(9))
    featured_datasets = list(Dataset.objects(featured=True).visible()
                             .order_by('-date').limit(15))
    featured_reuses = list(Reuse.objects(featured=True)
                           .order_by('-date').limit(15))
    return render('explore.html',
                  recent_datasets=recent_datasets,
                  recent_reuses=recent_reuses,
                  featured_datasets=featured_datasets,
                  featured_reuses=featured_reuses)

def purge_datasets():
    for dataset in Dataset.objects(deleted__ne=None):
        log.info('Purging dataset "{0}"'.format(dataset))
        # Remove followers
        FollowDataset.objects(following=dataset).delete()
        # Remove issues
        DatasetIssue.objects(subject=dataset).delete()
        # Remove activity
        Activity.objects(related_to=dataset).delete()
        # Remove metrics
        Metrics.objects(object_id=dataset.id).delete()
        # Remove
        dataset.delete()

def missing_datasets_warning(job_items, source):
    job_datasets = [item.dataset.id for item in job_items]
    domain_harvested_datasets = Dataset.objects(__raw__={
        'extras.harvest:domain': source.domain,
        'private': False,
        'deleted': None,
    }).all()

    missing_datasets = []
    for dataset in domain_harvested_datasets:
        if dataset.id not in job_datasets:
            dataset.private = True
            missing_datasets.append(dataset)
            dataset.save()

    if missing_datasets:
        org_recipients = [
            member.user.email
            for member in source.organization.members
            if member.role == 'admin'
        ]
        admin_role = Role.objects.filter(name='admin').first()
        recipients = [
            user.email
            for user in User.objects.filter(roles=admin_role).all()
        ]
        # recipients = list(set(org_recipients + recipients))
        subject = 'Relatório harvesting dados.gov - {}.'.format(source)
        context = {
            'subject': subject,
            'harvester': source,
            'datasets': missing_datasets,
            'server': current_app.config.get('SERVER_NAME'),
        }
        msg = Message(subject=subject,
                      sender='*****@*****.**',
                      recipients=org_recipients,
                      cc=['*****@*****.**'],
                      bcc=recipients)
        msg.body = theme.render('mail/harvester_warning.txt', **context)
        msg.html = theme.render('mail/harvester_warning.html', **context)
        mail = current_app.extensions.get('mail')
        try:
            mail.send(msg)
        except Exception:
            # Ignore mail delivery failures.
            pass

def get_context(self):
    context = super(OrganizationDetailView, self).get_context()
    datasets = (Dataset.objects(organization=self.organization)
                .visible().order_by('-created'))
    supplied_datasets = (Dataset.objects(supplier=self.organization)
                         .visible().order_by('-created'))
    reuses = (Reuse.objects(organization=self.organization)
              .visible().order_by('-created'))
    followers = (FollowOrg.objects.followers(self.organization)
                 .order_by('follower.fullname'))
    can_edit = EditOrganizationPermission(self.organization)
    can_view = OrganizationPrivatePermission(self.organization)
    context.update({
        'reuses': reuses.paginate(1, self.page_size),
        'datasets': datasets.paginate(1, self.page_size),
        'supplied_datasets': supplied_datasets[:self.page_size],
        'followers': followers[:self.nb_followers],
        'can_edit': can_edit,
        'can_view': can_view,
        'private_reuses': (
            list(Reuse.objects(organization=self.object).hidden())
            if can_view else []),
        'private_datasets': (
            list(Dataset.objects(organization=self.object).hidden())
            if can_view else []),
    })
    return context

def handle_downloads(self, row, day):
    if 'url' in row:
        try:
            hashed_url = hash_url(row['url'])
            data = (
                Dataset.objects(resources__urlhash=hashed_url).first()
                or CommunityResource.objects(urlhash=hashed_url).first()
            )
            if isinstance(data, Dataset):
                dataset = data
                resource = get_by(dataset.resources, 'urlhash', hashed_url)
                log.debug('Found resource download: %s', resource.url)
                self.count(resource, day, row)
                metric = ResourceViews(resource)
                metric.compute()
                # Use the MongoDB positional operator ($)
                cmd = 'set__resources__S__metrics__{0}'.format(metric.name)
                qs = Dataset.objects(id=dataset.id, resources__id=resource.id)
                qs.update(**{cmd: metric.value})
                if dataset.organization:
                    OrgResourcesDownloads(dataset.organization).compute()
            elif isinstance(data, CommunityResource):
                resource = data
                log.debug('Found community resource download: %s',
                          resource.url)
                self.count(resource, day, row)
                metric = CommunityResourceViews(resource)
                metric.compute()
                resource.metrics[metric.name] = metric.value
                resource.save()
        except Exception:
            log.exception('Unable to count download for %s', row['url'])
    if 'subtable' in row:
        for subrow in row['subtable']:
            self.handle_downloads(subrow, day)

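# In MongoEngine update keys, the uppercase S maps to MongoDB's positional
# operator ($). The qs.update(...) call above is therefore roughly this raw
# operation (a sketch: the collection name and the 'views' metric name are
# assumptions, not taken from the code above):
#
#     db.dataset.update_one(
#         {'_id': dataset.id, 'resources._id': resource.id},
#         {'$set': {'resources.$.metrics.views': metric.value}})
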
def get_context(self):
    context = super(OrganizationDetailView, self).get_context()
    can_edit = EditOrganizationPermission(self.organization)
    can_view = OrganizationPrivatePermission(self.organization)
    if self.organization.deleted and not can_view.can():
        abort(410)
    datasets = Dataset.objects(organization=self.organization).visible()
    reuses = Reuse.objects(organization=self.organization).visible()
    followers = (FollowOrg.objects.followers(self.organization)
                 .order_by("follower.fullname"))
    context.update({
        "reuses": reuses.paginate(1, self.page_size),
        "datasets": datasets.paginate(1, self.page_size),
        "followers": followers,
        "can_edit": can_edit,
        "can_view": can_view,
        "private_reuses": (
            list(Reuse.objects(organization=self.object).hidden())
            if can_view else []),
        "private_datasets": (
            list(Dataset.objects(organization=self.object).hidden())
            if can_view else []),
    })
    return context

def test_purge_organizations(self):
    with self.autoindex():
        org = Organization.objects.create(
            name='delete me', deleted='2016-01-01', description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)

        tasks.purge_organizations()

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)

def serialize(cls, reuse):
    """By default use the ``to_dict`` method
    and exclude ``_id``, ``_cls`` and ``owner`` fields.
    """
    datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
    datasets = list(datasets.only('id', 'title').no_dereference())
    organization = None
    owner = None
    if reuse.organization:
        organization = Organization.objects(id=reuse.organization.id).first()
    elif reuse.owner:
        owner = User.objects(id=reuse.owner.id).first()
    return {
        'title': reuse.title,
        'description': reuse.description,
        'url': reuse.url,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        'type': reuse.type,
        'tags': reuse.tags,
        'tag_suggest': reuse.tags,
        'badges': [badge.kind for badge in reuse.badges],
        'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': reuse.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
        'dataset': [{
            'id': str(d.id),
            'title': d.title,
        } for d in datasets],
        'metrics': reuse.metrics,
        'featured': reuse.featured,
        'extras': reuse.extras,
        'reuse_suggest': {
            'input': cls.completer_tokenize(reuse.title) + [reuse.id],
            'output': str(reuse.id),
            'payload': {
                'title': reuse.title,
                'slug': reuse.slug,
                'image_url': reuse.image(40, external=True),
            },
        },
    }

def get_context(self):
    context = super(OrganizationDetailView, self).get_context()
    org_id = str(self.organization.id)
    datasets, supplied_datasets, reuses = multiquery(
        SearchQuery(DatasetSearch, sort='-created',
                    organization=org_id, page_size=9),
        SearchQuery(DatasetSearch, sort='-created',
                    supplier=org_id, page_size=9),
        SearchQuery(ReuseSearch, sort='-created',
                    organization=org_id, page_size=9),
    )
    context.update({
        'reuses': reuses,
        'datasets': datasets,
        'supplied_datasets': supplied_datasets,
        'private_reuses': list(
            Reuse.objects(organization=self.object, private=True)),
        'private_datasets': list(
            Dataset.objects(organization=self.object, private=True)),
        'can_edit': EditOrganizationPermission(self.organization.id),
    })
    return context

def test_attach_does_not_duplicate(self):
    attached_datasets = []
    for i in range(2):
        dataset = DatasetFactory.build()
        dataset.extras['harvest:domain'] = 'test.org'
        dataset.extras['harvest:remote_id'] = str(i)
        dataset.last_modified = datetime.now()
        dataset.save()
        attached_datasets.append(dataset)

    datasets = DatasetFactory.create_batch(3)

    with NamedTemporaryFile() as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=['local', 'remote'],
                                delimiter=b';',
                                quotechar=b'"')
        writer.writeheader()
        for index, dataset in enumerate(datasets):
            writer.writerow({
                'local': str(dataset.id),
                'remote': str(index),
            })
        csvfile.flush()

        result = actions.attach('test.org', csvfile.name)

    dbcount = Dataset.objects(**{
        'extras__harvest:remote_id__exists': True,
    }).count()
    self.assertEqual(result.success, len(datasets))
    self.assertEqual(dbcount, result.success)
    for index, dataset in enumerate(datasets):
        dataset.reload()
        self.assertEqual(dataset.extras['harvest:domain'], 'test.org')
        self.assertEqual(dataset.extras['harvest:remote_id'], str(index))

def get_value(self):
    return Dataset.objects(owner=self.user).visible().count()