def query_token(self, token, token_type_hint, client):
    '''
    Find the token of `client` matching the `token` string.

    `token_type_hint` is only honoured when it names one of the two
    token fields (`access_token` or `refresh_token`). Any other value
    (including an empty one) triggers a lookup on both fields, so that a
    caller-supplied hint can never be used as an arbitrary query field.

    Returns the first matching token or `None`.
    '''
    qs = OAuth2Token.objects(client=client)
    if token_type_hint in ('access_token', 'refresh_token'):
        qs = qs(**{token_type_hint: token})
    else:
        qs = qs(db.Q(access_token=token) | db.Q(refresh_token=token))
    return qs.first()
def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Matching steps, stopping at the first one giving a result:

    1. exact match on identifier, slug, URL or alternate URLs
    2. a single license whose slug is within `MAX_DISTANCE`
       (Damerau-Levenshtein) of the slugified text
    3. a single license having at least one alternate title whose
       slugified form is within `MAX_DISTANCE` of the slugified text

    Returns the matching license, or `None` when `text` is empty or when
    no unambiguous match is found.
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use slug as it normalizes the string
    license = qs(
        db.Q(id__iexact=text)
        | db.Q(slug=slug)
        | db.Q(url__iexact=text)
        | db.Q(alternate_urls__iexact=text)
    ).first()

    if license is None:
        # Try to single match the slug with a low
        # Damerau-Levenshtein distance
        candidates = [
            candidate for candidate in cls.objects
            if rdlevenshtein(candidate.slug, slug) <= MAX_DISTANCE
        ]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]

    if license is None:
        # Try to single match the alternate titles with a low
        # Damerau-Levenshtein distance.
        # Each license is kept at most once, even when several of its
        # alternate titles match: otherwise a single license would be
        # counted several times and wrongly considered ambiguous.
        candidates = [
            candidate for candidate in cls.objects
            if any(
                rdlevenshtein(cls.slug.slugify(title), slug) <= MAX_DISTANCE
                for title in candidate.alternate_titles
            )
        ]
        # If more than one license matches, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]

    return license
def get(self):
    '''Fetch site activity, optionally filtered by user or org.'''
    args = activity_parser.parse_args()
    activities = Activity.objects

    organization = args['organization']
    if organization:
        involves_org = (db.Q(organization=organization)
                        | db.Q(related_to=organization))
        activities = activities(involves_org)

    actor = args['user']
    if actor:
        activities = activities(actor=actor)

    page = activities.order_by('-created_at')
    page = page.paginate(args['page'], args['page_size'])

    # Drop the activities whose `related_to` reference can not be resolved.
    # A (possibly incomplete) result is always returned: the error is only
    # logged (ie. visible in sentry, silent for the user).
    # This can happen when someone manually deletes an object in DB
    # (ie. without a proper purge).
    resolvable = []
    for activity in page.queryset.items:
        try:
            activity.related_to
        except DoesNotExist as e:
            log.error(e, exc_info=True)
            continue
        resolvable.append(activity)
    page.queryset.items = resolvable
    return page
def valid_at(self, at):
    '''
    Limit the current QuerySet to the zones valid at a given date.

    A zone matches when it has no validity at all, or when `at` is
    on or after its start (if any) and strictly before its end (if any).
    '''
    unbounded = (db.Q(validity=None)
                 | db.Q(validity__start=None, validity__end=None))
    within_range = db.Q(validity__end__gt=at, validity__start__lte=at)
    started_without_end = db.Q(validity__start__lte=at, validity__end=None)
    ending_without_start = db.Q(validity__start=None, validity__end__gt=at)
    return self(
        unbounded | within_range | started_without_end | ending_without_start
    )
def query_token(self, token, token_type_hint, client):
    '''
    Find the token of `client` matching the `token` string.

    The lookup is restricted to the field named by `token_type_hint`
    when it is `access_token` or `refresh_token`; otherwise both fields
    are searched. Returns the first match or `None`.
    '''
    client_tokens = OAuth2Token.objects(client=client)
    if token_type_hint == 'access_token':
        matching = client_tokens.filter(access_token=token)
    elif token_type_hint == 'refresh_token':
        matching = client_tokens.filter(refresh_token=token)
    else:
        either_field = db.Q(access_token=token) | db.Q(refresh_token=token)
        matching = client_tokens(either_field)
    return matching.first()
def from_organizations(self, user, *organizations):
    '''
    Limit the current QuerySet to the documents whose `subject` is
    a visible dataset or a reuse owned by `user` or by one of the
    `organizations`.

    Returns an empty queryset when there is no such dataset or reuse.
    '''
    from udata.models import Dataset, Reuse  # Circular imports.

    subjects = list(Dataset.objects(owner=user).visible())
    for org in organizations:
        subjects.extend(Dataset.objects(organization=org).visible())
    subjects.extend(Reuse.objects.owned_by(user.id, *organizations))

    if not subjects:
        # An empty Q object applies no filter at all: without this guard
        # every document would be returned instead of none.
        return self.none()

    Qs = db.Q()
    for subject in subjects:
        Qs |= db.Q(subject=subject)
    return self(Qs)
def check_for_territories(query):
    '''
    Return the `fr/town` geozones matching `query`.

    A 5 digits query is matched against codes and postal codes,
    anything else against names. An empty queryset is returned when the
    query is shorter than 4 characters or when territories are disabled.
    Results are sorted by population then area (biggest first).
    '''
    if not query or len(query) < 4:
        return GeoZone.objects.none()
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return GeoZone.objects.none()

    towns = GeoZone.objects(level='fr/town')
    looks_like_code = len(query) == 5 and query.isdigit()
    if looks_like_code:
        # Match both INSEE and postal codes.
        criteria = db.Q(code=query) | db.Q(keys__postal__contains=query)
    else:
        # Check names starting with query or exact match.
        criteria = db.Q(name__istartswith=query) | db.Q(name__iexact=query)
    return towns(criteria).order_by('-population', '-area')
def get(self):
    '''Fetch site activity, optionally filtered by user or org.'''
    args = activity_parser.parse_args()
    activities = Activity.objects

    organization = args['organization']
    if organization:
        involves_org = (db.Q(organization=organization)
                        | db.Q(related_to=organization))
        activities = activities(involves_org)

    actor = args['user']
    if actor:
        activities = activities(actor=actor)

    # Most recent first
    latest_first = activities.order_by('-created_at')
    return latest_first.paginate(args['page'], args['page_size'])
def to_python(self, value):
    '''
    Resolve `value` into a model instance.

    The value is first looked up as a slug (raw or quoted). When the
    found object has a different slug than `value`, a `LazyRedirect` to
    the quoted slug is returned. Without a slug match, `value` is
    looked up as an identifier.

    When nothing matches, a `LazyRedirect` to the latest known slug is
    returned if the model tracks redirected slugs and has one for
    `value`; otherwise the `NotFound` exception is returned (not raised).
    '''
    try:
        quoted = self.quote(value)
        by_slug = db.Q(slug=value) | db.Q(slug=quoted)
        obj = self.model.objects(by_slug).get()
    except (InvalidQueryError, self.model.DoesNotExist):
        # If the model doesn't have a slug or matching slug doesn't exist.
        obj = None

    # `obj` is only set when the slug lookup succeeded
    if obj is not None and obj.slug != value:
        return LazyRedirect(quoted)

    try:
        return obj or self.model.objects.get_or_404(id=value)
    except NotFound as e:
        if self.has_redirected_slug:
            latest = self.model.slug.latest(value)
            if latest:
                return LazyRedirect(latest)
        # The exception is handed back to the caller instead of being raised
        return e
def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Matching steps, stopping at the first one giving a result:

    1. exact match on identifier, slug, URL or alternate URLs
    2. if the text is a valid URL, partial match on its host and path
       (ignoring scheme and trailing slash)
    3. a single license whose slug is within `MAX_DISTANCE`
       (Damerau-Levenshtein) of the slugified text
    4. a single license whose lower-cased title is within `MAX_DISTANCE`
       of the text
    5. a single license matching through its slugified alternate titles

    Returns `None` when `text` is empty or nothing matches unambiguously.
    '''
    if not text:
        return

    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use slug as it normalizes the string
    qs = cls.objects

    exact = (db.Q(id__iexact=text)
             | db.Q(slug=slug)
             | db.Q(url__iexact=text)
             | db.Q(alternate_urls__iexact=text))
    license = qs(exact).first()
    if license is not None:
        return license

    # If we're dealing with an URL, let's try some specific stuff
    # like getting rid of trailing slash and scheme mismatch
    try:
        url = validate_url(text)
    except ValidationError:
        pass
    else:
        parts = urlparse(url)
        trimmed_path = parts.path.rstrip('/')
        location = f'{parts.netloc}{trimmed_path}'
        license = qs(db.Q(url__icontains=location)
                     | db.Q(alternate_urls__contains=location)).first()
    if license is not None:
        return license

    # Try to single match `slug` with a low Damerau-Levenshtein distance.
    # If there is more than one match, we cannot determine
    # which one is closer to safely choose between candidates
    candidates = [
        known for known in cls.objects
        if rdlevenshtein(known.slug, slug) <= MAX_DISTANCE
    ]
    if len(candidates) == 1:
        return candidates[0]

    # Try to single match `title` with a low Damerau-Levenshtein distance
    # (same ambiguity rule as above)
    candidates = [
        known for known in cls.objects
        if rdlevenshtein(known.title.lower(), text) <= MAX_DISTANCE
    ]
    if len(candidates) == 1:
        return candidates[0]

    # Try to single match `alternate_titles` with a low
    # Damerau-Levenshtein distance.
    # A license is listed once per matching alternate title, hence the
    # `set` to count distinct licenses.
    candidates = [
        known
        for known in cls.objects
        for title in known.alternate_titles
        if rdlevenshtein(cls.slug.slugify(title), slug) <= MAX_DISTANCE
    ]
    if len(set(candidates)) == 1:
        return candidates[0]

    return None
def check_for_territories(query):
    """
    Return the geozones of the handled levels matching the `query`.

    An empty list is returned when the query is empty, when territories
    are not activated or when the query can not match any handled level.
    Otherwise a queryset is returned.

    Results are sorted by population and area (biggest first).
    """
    if not query or not current_app.config.get('ACTIVATE_TERRITORIES'):
        return []

    query = query.lower()
    size = len(query)
    numeric = query.isdigit()
    corsican = query.startswith(('2a', '2b'))

    combined = db.Q()
    for level in current_app.config.get('HANDLED_LEVELS'):
        if level == 'country':
            continue  # Level not fully handled yet.

        if level == 'fr:departement' and (
                (size == 2 and (numeric or corsican))  # Counties + Corsica.
                or (size == 3 and numeric)):  # French DROM-COM.
            criteria = db.Q(code=query)
        elif level == 'fr:commune' and size == 5 and (numeric or corsican):
            # INSEE code then postal codes with Corsica exceptions.
            criteria = db.Q(code=query) | db.Q(keys__postal__contains=query)
        elif size >= 4:
            # Check names starting with query or exact match.
            criteria = (db.Q(name__istartswith=query)
                        | db.Q(name__iexact=query))
        else:
            continue

        # Meta Q object, ready to be passed to a queryset.
        combined |= db.Q(level=level) & criteria

    if combined.empty:
        return []

    # Sort matching results by population and area.
    return GeoZone.objects(combined).order_by('-population', '-area')
def hidden(self):
    '''
    Limit the current QuerySet to the hidden documents: those that are
    private, have no item in `datasets`, or have `deleted` set.
    '''
    is_private = db.Q(private=True)
    without_datasets = db.Q(datasets__0__exists=False)
    is_deleted = db.Q(deleted__ne=None)
    return self(is_private | without_datasets | is_deleted)
def get_value(self):
    '''
    Count the visible datasets having the target as organization
    or as supplier.
    '''
    org = self.target
    owned_or_supplied = db.Q(organization=org) | db.Q(supplier=org)
    datasets = Dataset.objects(owned_or_supplied).visible()
    return datasets.count()
def filter_activities(self, qs):
    '''
    Limit `qs` to the activities having the organization either as
    `organization` or as `related_to`.
    '''
    return qs(
        db.Q(organization=self.organization)
        | db.Q(related_to=self.organization)
    )
def valid_at(self, valid_date):
    '''
    Limit the current QuerySet to the zones valid at a given date.

    A zone matches when it has no validity, or when `valid_date` is on
    or after its validity start and strictly before its validity end.
    '''
    within_range = db.Q(validity__end__gt=valid_date,
                        validity__start__lte=valid_date)
    unbounded = db.Q(validity=None)
    return self(within_range | unbounded)
def process(self, item):
    '''
    Build or update the dataset matching a harvested item.

    The remote package is fetched with the `package_show` action,
    validated against `self.schema` then mapped onto the dataset:
    core attributes, license, tags, extras (spatial and temporal
    coverage, frequency), remote URL and resources.

    Raises `HarvestSkipException` when the remote package has no
    resource and `HarvestException` when the spatial geometry is
    neither a `Polygon` nor a `MultiPolygon`.

    Returns the dataset (it is not saved here).
    '''
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']

    # Skip if no resource (missing, empty or null `resources`)
    if not data.get('resources'):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect license
    default_license = dataset.license or License.default()
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom, spatial_zone = None, None

    for extra in data['extras']:
        key = extra['key']
        value = extra['value']
        if value is None or (isinstance(value, str) and not value.strip()):
            # Skip empty extras
            continue
        elif key == 'spatial':
            # GeoJSON representation
            # (only Polygon and MultiPolygon are supported below)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
            qs = qs.valid_at(datetime.now())
            if qs.count() == 1:
                spatial_zone = qs.first()
            else:
                # No match or ambiguous match: keep the raw value
                dataset.extras['ckan:spatial-text'] = value
                log.debug('spatial-text value not handled: %s', value)
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            dataset.extras['ckan:spatial-uri'] = value
            log.debug('spatial-uri value not handled: %s', value)
        elif key == 'frequency':
            # Update frequency
            freq = frequency_from_rdf(value)
            if freq:
                dataset.frequency = freq
            elif value in UPDATE_FREQUENCIES:
                dataset.frequency = value
            else:
                dataset.extras['ckan:frequency'] = value
                log.debug('frequency value not handled: %s', value)
        # Temporal coverage start
        elif key == 'temporal_start':
            temporal_start = daterange_start(value)
        # Temporal coverage end
        elif key == 'temporal_end':
            temporal_end = daterange_end(value)
        else:
            dataset.extras[extra['key']] = value

    if spatial_geom or spatial_zone:
        dataset.spatial = SpatialCoverage()

    if spatial_zone:
        dataset.spatial.zones = [spatial_zone]

    if spatial_geom:
        # The geometry is always stored as a MultiPolygon
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    # The temporal coverage is only set when both bounds are known
    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    dataset.extras['remote_url'] = self.dataset_url(data['name'])
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['ckan:source'] = data['url']
        else:
            # use declared `url` as `remote_url` if any
            dataset.extras['remote_url'] = url

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created

    return dataset
def hidden(self):
    '''
    Limit the current QuerySet to the hidden documents: those that are
    private, have no item in `resources`, or have `deleted` or
    `archived` set.
    '''
    is_private = db.Q(private=True)
    without_resources = db.Q(resources__0__exists=False)
    is_deleted = db.Q(deleted__ne=None)
    is_archived = db.Q(archived__ne=None)
    return self(is_private | without_resources | is_deleted | is_archived)
def owned_by(self, *owners):
    '''
    Limit the current QuerySet to the documents having one of `owners`
    either as `owner` or as `organization`.
    '''
    ownership = db.Q()
    for owner in owners:
        held_by_owner = db.Q(owner=owner) | db.Q(organization=owner)
        ownership = ownership | held_by_owner
    return self(ownership)