def determine_version_filter(version=None, resource_ids=None, resource_ids_and_versions=None):
    '''
    Determine and return the elasticsearch-dsl filter which can filter on the version extracted
    from the given parameters.

    :param version: the version to filter on across all resources
    :param resource_ids: the resources to search
    :param resource_ids_and_versions: a dict of resource ids -> versions providing resource
                                      specific versions for search
    :return: an elasticsearch-dsl object
    '''
    if not resource_ids_and_versions:
        # default the version to now if necessary
        if version is None:
            version = to_timestamp(datetime.now())
        # just use a single version filter if we don't have any resource specific versions
        return create_version_query(version)
    else:
        # run through the resource specific versions provided and ensure they're rounded down
        indexes_and_versions = {}
        for resource_id in resource_ids:
            target_version = resource_ids_and_versions[resource_id]
            if target_version is None:
                raise toolkit.ValidationError(
                    u"Valid version not given for {}".format(resource_id))
            index = prefix_resource(resource_id)
            rounded_version = common.SEARCH_HELPER.get_rounded_versions(
                [index], target_version)[index]
            indexes_and_versions[index] = rounded_version
        return create_index_specific_version_filter(indexes_and_versions)
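# A minimal usage sketch, not part of the source: shows the two paths through
# determine_version_filter. The resource ids and versions used here are hypothetical.
def _example_determine_version_filter():
    # a single version applied across all searched resources
    filter_all = determine_version_filter(version=1531440000000)
    # resource specific versions; every searched resource id must have an entry
    filter_specific = determine_version_filter(
        resource_ids=[u'resource-a', u'resource-b'],
        resource_ids_and_versions={u'resource-a': 1531440000000,
                                   u'resource-b': 1546300800000})
    return filter_all, filter_specific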
def datastore_delete(resource_id, context, version=None):
    '''
    Deletes the resource from the datastore. In reality the resource data is maintained in its
    index but the latest version of all records is set to an empty record. This means that the
    old data is still accessible to ensure searches using versions before the deletion still
    work but searches at the latest version or later will return no records.

    The deletion work is done by an rq background job and therefore this is an async action.

    :param resource_id: the id of the resource to delete
    :param context: the context dict from the action call
    :param version: the version to mark the deletion at
    :return: a dict containing info about the background job that is doing the delete
    '''
    # if the requested deletion version is missing, default to now
    if version is None:
        version = to_timestamp(datetime.now())

    if is_resource_read_only(resource_id):
        raise toolkit.ValidationError(u'This resource has been marked as read only')

    # queue the job
    resource = toolkit.get_action(u'resource_show')(context, {u'id': resource_id})
    job = queue_deletion(resource, version)
    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
    }
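# Usage sketch, not from the source: the notify hook below calls this action the same way, so
# a deletion at the current time could be queued like this (the resource id is hypothetical).
def _example_queue_delete(context):
    return toolkit.get_action(u'datastore_delete')(context,
                                                   {u'resource_id': u'my-resource-id'})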
def test_to_timestamp():
    # create a UTC timezone class so that we don't have to use any external libs just for this
    # test
    class UTC(tzinfo):
        def utcoffset(self, dt):
            return timedelta(0)

        def tzname(self, dt):
            return u'UTC'

        def dst(self, dt):
            return timedelta(0)

    utc = UTC()

    # check that dates are treated as utc
    assert to_timestamp(datetime.strptime(u'19700101', u'%Y%m%d').replace(tzinfo=utc)) == 0
    # check a later date too
    assert to_timestamp(
        datetime.strptime(u'20180713', u'%Y%m%d').replace(tzinfo=utc)) == 1531440000000
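# A sketch of the same checks, assuming a Python 3 only environment: the stdlib provides a
# ready-made UTC tzinfo, so the helper class above isn't needed.
def test_to_timestamp_py3():
    from datetime import timezone
    # the epoch itself maps to 0, and to_timestamp returns milliseconds since the epoch
    assert to_timestamp(datetime(1970, 1, 1, tzinfo=timezone.utc)) == 0
    assert to_timestamp(datetime(2018, 7, 13, tzinfo=timezone.utc)) == 1531440000000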
def datastore_count(resource_ids=None, version=None):
    '''
    Count the records available at the given version across the given resources. If no
    resources are given, all resources are counted; if no version is given, the current
    timestamp is used.

    :param resource_ids: the list of resource ids to count over (default: all resources)
    :param version: the version to count at (default: now)
    :return: the number of records
    '''
    if version is None:
        version = to_timestamp(datetime.now())
    if resource_ids is None:
        resource_ids = [u'*']
    indexes = [get_public_alias_name(resource_id) for resource_id in resource_ids]
    search = Search(using=common.ES_CLIENT,
                    index=indexes).filter(create_version_query(version))
    return search.count()
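# Usage sketch, not from the source: count everything at the current version, and one
# hypothetical resource at a fixed version.
def _example_counts():
    total_now = datastore_count()
    one_resource = datastore_count(resource_ids=[u'my-resource-id'], version=1531440000000)
    return total_now, one_resource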
def notify(self, entity, operation):
    '''
    Respond to changes to model objects. We use this hook to ensure any new data is imported
    into the versioned datastore and to make sure the privacy settings on the data are up to
    date. We're only interested in:

        - resource deletions
        - new resources
        - resources that have had changes to their URL
        - packages that have changed

    :param entity: the entity that has changed
    :param operation: the operation undertaken on the object. This will be one of the options
                      from the DomainObjectOperation enum.
    '''
    if isinstance(entity, model.Package) and operation == DomainObjectOperation.changed:
        # if a package is the target entity and it's been changed ensure the privacy is applied
        # correctly to its resource indexes
        update_resources_privacy(entity)
    elif isinstance(entity, model.Resource):
        context = {u'model': model, u'ignore_auth': True}
        data_dict = {u'resource_id': entity.id}

        if operation == DomainObjectOperation.deleted:
            toolkit.get_action(u'datastore_delete')(context, data_dict)
        else:
            do_upsert = False

            if operation == DomainObjectOperation.new:
                # datastore_create returns True when the resource looks like it's ingestible
                do_upsert = toolkit.get_action(u'datastore_create')(context, data_dict)
            elif operation == DomainObjectOperation.changed:
                # always try the upsert if the resource has changed
                do_upsert = True

            if do_upsert:
                # use the revision version as the version
                data_dict[u'version'] = to_timestamp(entity.revision.timestamp)
                # use replace to overwrite the existing data (this is what users would expect)
                data_dict[u'replace'] = True
                try:
                    toolkit.get_action(u'datastore_upsert')(context, data_dict)
                except (ReadOnlyResourceException, InvalidVersionException):
                    # this is fine, just swallow
                    pass
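# Context sketch, an assumption rather than something stated in the source: notify has the
# signature of CKAN's IDomainObjectModification hook, so it would typically sit on a plugin
# class wired up roughly like this.
import ckan.plugins as plugins

class _ExampleVersionedDatastorePlugin(plugins.SingletonPlugin):
    plugins.implements(plugins.IDomainObjectModification, inherit=True)
    # notify(self, entity, operation) as defined above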
def datastore_upsert(resource_id, replace, context, original_data_dict, version=None):
    '''
    Main data ingestion function for the datastore. The URL on the given resource will be used
    to retrieve and then ingest data or, if provided, records will be ingested directly from
    the request.

    Data is ingested using an rq background job and therefore this is an async action.

    :param resource_id: the resource to ingest the data into
    :param replace: whether to replace the data already in the resource or append to it
    :param context: the context dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :param version: the version of the new data, can be None (default) but if not must be newer
                    than the latest version of the resource
    :return: information about the background job that is handling the ingestion
    '''
    # this comes through as junk if it's not removed before validating. This happens because
    # the data dict is flattened during validation, but why this happens is unclear.
    records = original_data_dict.get(u'records', None)

    if is_resource_read_only(resource_id):
        raise ReadOnlyResourceException(u'This resource has been marked as read only')

    if version is None:
        version = to_timestamp(datetime.now())

    # check that the version is valid
    if not check_version_is_valid(resource_id, version):
        raise InvalidVersionException(u'The new version must be newer than current version')

    # get the current user
    user = toolkit.get_action(u'user_show')(context, {u'id': context[u'user']})

    # queue the resource import job
    resource = toolkit.get_action(u'resource_show')(context, {u'id': resource_id})
    job = queue_import(resource, version, replace, records, user[u'apikey'])

    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
    }
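# Usage sketch, not from the source: queue an ingest of inline records at the current time,
# replacing the existing data. The action name matches how it is invoked from the notify hook
# above; the resource id and records are hypothetical.
def _example_upsert(context):
    return toolkit.get_action(u'datastore_upsert')(context, {
        u'resource_id': u'my-resource-id',
        u'replace': True,
        u'records': [{u'name': u'Aardvark'}, {u'name': u'Zebra'}],
    })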
def datastore_queue_download(email_address, context, query=None, query_version=None,
                             version=None, resource_ids=None, resource_ids_and_versions=None,
                             separate_files=True, format=u'csv', ignore_empty_fields=True):
    '''
    Starts a download of the data found by the given query parameters. This download is created
    asynchronously using the rq background job queue and a link to the results is emailed to
    the given email address when complete.

    :param email_address: the email address to send the download link to
    :param context: the context dict from the action call
    :param query: the query dict. If None (default) then an empty query is used
    :param query_version: the version of the query schema the query is using. If None (default)
                          then the latest query schema version is used
    :param version: the version to search the data at. If None (default) the current time is
                    used
    :param resource_ids: the list of resources to search. If None (default) then all the
                         resources the user has access to are queried. If a list of resources
                         are passed then any resources not accessible to the user will be
                         removed before querying
    :param resource_ids_and_versions: a dict of resources and versions to search each of them
                                      at. This allows precise searching of each resource at a
                                      specific version. If None (default) then the resource_ids
                                      parameter is used together with the version parameter. If
                                      this parameter is provided though, it takes priority over
                                      the resource_ids and version parameters.
    :param separate_files: whether to split the results into a file per resource or just put
                           all results in one file. The default is True - split results into a
                           file per resource.
    :param format: the format to download the data in. The default is csv.
    :param ignore_empty_fields: whether to ignore fields with no data in them in the result set
                                and not write them into the download file(s). Default: True.
    :return: a dict containing info about the background job that is doing the downloading and
             the download id
    '''
    if resource_ids_and_versions is None:
        resource_ids_and_versions = {}
    else:
        # use the resource_ids_and_versions dict first over the resource_ids and version params
        resource_ids = list(resource_ids_and_versions.keys())

    # figure out which resources should be searched
    resource_ids = get_available_datastore_resources(context, resource_ids)
    if not resource_ids:
        raise toolkit.ValidationError(
            u"The requested resources aren't accessible to this user")

    rounded_resource_ids_and_versions = {}
    # see if a version was provided, we'll use it if a resource id we're searching doesn't have
    # a directly assigned version (i.e. it was absent from the resource_ids_and_versions dict,
    # or that parameter wasn't provided)
    if version is None:
        version = to_timestamp(datetime.now())
    for resource_id in resource_ids:
        # try to get the target version from the passed resource_ids_and_versions dict, but if
        # it's not in there, default to the version variable
        target_version = resource_ids_and_versions.get(resource_id, version)
        index = prefix_resource(resource_id)
        # round the version down to ensure we search the exact version requested
        rounded_version = common.SEARCH_HELPER.get_rounded_versions(
            [index], target_version)[index]
        if rounded_version is not None:
            # resource ids without a rounded version are skipped
            rounded_resource_ids_and_versions[resource_id] = rounded_version

    # setup the query
    if query is None:
        query = {}
    if query_version is None:
        query_version = get_latest_query_version()
    validate_query(query, query_version)
    search = translate_query(query, query_version)
    query_hash = hash_query(query, query_version)

    options = {
        u'separate_files': separate_files,
        u'format': format,
        u'ignore_empty_fields': ignore_empty_fields,
    }
    download = DatastoreDownload(query_hash=query_hash, query=query,
                                 query_version=query_version,
                                 resource_ids_and_versions=rounded_resource_ids_and_versions,
                                 state=u'queued', options=options)
    download.save()

    job = queue_download(email_address, download.id, query_hash, query, query_version,
                         search.to_dict(), rounded_resource_ids_and_versions, separate_files,
                         format, ignore_empty_fields)

    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
        u'download_id': download.id,
    }
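# Usage sketch, assuming this function is registered as a CKAN action under the same name:
# queue a single-file CSV download of one hypothetical resource at a specific version, with
# the link emailed on completion.
def _example_queue_download(context):
    return toolkit.get_action(u'datastore_queue_download')(context, {
        u'email_address': u'someone@example.com',
        u'resource_ids_and_versions': {u'my-resource-id': 1531440000000},
        u'separate_files': False,
    })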
def datastore_guess_fields(context, query=None, query_version=None, version=None,
                           resource_ids=None, resource_ids_and_versions=None, size=10,
                           ignore_groups=None):
    '''
    Guesses the fields that are most relevant to show with the given query.

    If only one resource is included in the search then the requested number of fields from the
    resource at the required version are returned in ingest order if the details are available.

    If multiple resources are queried, the most common fields across the resources under search
    are returned. The fields are grouped together in an attempt to match the same field name in
    different cases across different resources. The most common {size} groups are returned.

    The groups returned are ordered firstly by the number of resources they appear in, in
    descending order, then, if there are ties, by the number of records the group finds, again
    in descending order.

    :param context: the context dict from the action call
    :param query: the query
    :param query_version: the query schema version
    :param version: the version to search at
    :param resource_ids: the resource ids to search in
    :param resource_ids_and_versions: a dict of resource ids -> versions to search at
    :param size: the number of groups to return
    :param ignore_groups: a list of groups to ignore from the results (default: None)
    :return: a list of groups
    '''
    # provide some more complex defaults for some parameters if necessary
    if query is None:
        query = {}
    if query_version is None:
        query_version = get_latest_query_version()
    ignore_groups = (set(g.lower() for g in ignore_groups)
                     if ignore_groups is not None else set())

    try:
        # validate and translate the query into an elasticsearch-dsl Search object
        validate_query(query, query_version)
        search = translate_query(query, query_version)
    except (jsonschema.ValidationError, InvalidQuerySchemaVersionError) as e:
        raise toolkit.ValidationError(e.message)

    # figure out which resources we're searching
    resource_ids, skipped_resource_ids = determine_resources_to_search(
        context, resource_ids, resource_ids_and_versions)
    if not resource_ids:
        raise toolkit.ValidationError(
            u"The requested resources aren't accessible to this user")

    if version is None:
        version = to_timestamp(datetime.now())

    # add the version filter necessary given the parameters and the resources we're searching
    version_filter = determine_version_filter(version, resource_ids,
                                              resource_ids_and_versions)
    search = search.filter(version_filter)

    # add the size parameter, we don't want any records back
    search = search.extra(size=0)

    resource_ids = find_searched_resources(search, resource_ids)

    all_fields = get_all_fields(resource_ids)
    for group in ignore_groups:
        all_fields.ignore(group)

    # allow plugins to modify the fields object
    for plugin in PluginImplementations(IVersionedDatastore):
        all_fields = plugin.datastore_modify_guess_fields(resource_ids, all_fields)

    if len(resource_ids) == 1:
        resource_id = resource_ids[0]
        if resource_ids_and_versions is None:
            up_to_version = version
        else:
            up_to_version = resource_ids_and_versions[resource_id]
        return get_single_resource_fields(all_fields, resource_id, up_to_version, search, size)
    else:
        size = max(0, min(size, 25))
        return select_fields(all_fields, search, size)
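# Usage sketch, not from the source: fetch the five most relevant field groups for an empty
# query over two hypothetical resources, skipping an unwanted group.
def _example_guess_fields(context):
    return datastore_guess_fields(context,
                                  resource_ids=[u'resource-a', u'resource-b'],
                                  size=5,
                                  ignore_groups=[u'notes'])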
def datastore_search(context, data_dict, original_data_dict):
    '''
    Searches the datastore using a query schema similar to the standard CKAN datastore query
    schema, but with versioning.

    :param context: the context dict from the action call
    :param data_dict: the data_dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :return: a dict including the search results amongst other things
    '''
    original_data_dict, data_dict, version, search = create_search(context, data_dict,
                                                                   original_data_dict)
    resource_id = data_dict[u'resource_id']
    index_name = prefix_resource(resource_id)

    # if the version is None, default it to the current timestamp
    if version is None:
        version = to_timestamp(datetime.now())

    # add the version filter to the query
    search = search.filter(create_version_query(version))

    # if the run query option is false (default to true if not present) then just return the
    # query we would have run against elasticsearch instead of actually running it. This is
    # useful for running the query outside of ckan, for example on a tile server.
    if not data_dict.get(u'run_query', True):
        return {
            u'indexes': [index_name],
            u'search': search.to_dict(),
        }
    else:
        result = run_search(search, [index_name])

        # allow other extensions implementing our interface to modify the result
        for plugin in PluginImplementations(IVersionedDatastore):
            result = plugin.datastore_modify_result(context, original_data_dict, data_dict,
                                                    result)

        # add the actual result object to the context in case the caller is an extension and
        # they have used one of the interface hooks to alter the search object and include,
        # for example, an aggregation
        context[u'versioned_datastore_query_result'] = result

        # get the fields
        mapping, fields = get_fields(resource_id, version)
        # allow other extensions implementing our interface to modify the field definitions
        for plugin in PluginImplementations(IVersionedDatastore):
            fields = plugin.datastore_modify_fields(resource_id, mapping, fields)

        query_for_logging = {}
        for key in _query_log_keys:
            if data_dict.get(key, None):
                query_for_logging[key] = data_dict[key]
        log_query(query_for_logging, u'basicsearch')

        # return a dictionary containing the results and other details
        return {
            u'total': result.hits.total,
            u'records': [hit.data.to_dict() for hit in result],
            u'facets': format_facets(result.aggs.to_dict()),
            u'fields': fields,
            u'raw_fields': mapping[u'mappings'][DOC_TYPE][u'properties'][u'data']
                           [u'properties'],
            u'after': get_last_after(result.hits),
            u'_backend': u'versioned-datastore',
        }
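# Usage sketch, not from the source: with run_query set to False the action returns the
# translated elasticsearch query instead of executing it, which is handy for replaying the
# query outside of ckan. Assumes the action is registered as u'datastore_search' and that the
# free-text u'q' parameter from the standard datastore schema is supported; the resource id is
# hypothetical.
def _example_dry_run_search(context):
    return toolkit.get_action(u'datastore_search')(context, {
        u'resource_id': u'my-resource-id',
        u'q': u'banana',
        u'run_query': False,
    })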
def datastore_search_raw(resource_id, context, data_dict, original_data_dict, search=None,
                         version=None, raw_result=False, include_version=True):
    '''
    Searches the datastore using a raw elasticsearch query.

    :param resource_id: the id of the resource to search
    :param context: the context dict from the action call
    :param data_dict: the data_dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :param search: the elasticsearch query to run
    :param version: the version of the data to query against
    :param raw_result: whether to return the result as a raw elasticsearch result, or format
                       it in the same way as a normal datastore_search call would
    :param include_version: whether to include the version in the search or not
    :return: a dict containing the results of the search
    '''
    if search is None:
        search = {}
    if version is None:
        version = to_timestamp(datetime.now())
    index_name = prefix_resource(resource_id)
    search = Search.from_dict(search)

    try:
        # the user has asked for a raw result and for the version filter not to be included
        if raw_result and not include_version:
            version = None

        # run the query passing the version which will either be the requested version, the
        # current timestamp or None if no version filter should be included in the search
        result = run_search(search, index_name, version)

        if raw_result:
            return result.to_dict()

        # allow other extensions implementing our interface to modify the result object
        for plugin in PluginImplementations(IVersionedDatastore):
            result = plugin.datastore_modify_result(context, original_data_dict, data_dict,
                                                    result)

        # add the actual result object to the context in case the caller is an extension and
        # they have used one of the interface hooks to alter the search object and include,
        # for example, an aggregation
        context[u'versioned_datastore_query_result'] = result

        # get the fields
        mapping, fields = get_fields(resource_id, version)
        # allow other extensions implementing our interface to modify the field definitions
        for plugin in PluginImplementations(IVersionedDatastore):
            fields = plugin.datastore_modify_fields(resource_id, mapping, fields)

        # return a dictionary containing the results and other details
        return {
            u'total': result.hits.total,
            u'records': [hit.data.to_dict() for hit in result],
            u'facets': format_facets(result.aggs.to_dict()),
            u'fields': fields,
            u'raw_fields': mapping[u'mappings'][DOC_TYPE][u'properties'][u'data']
                           [u'properties'],
            u'after': get_last_after(result.hits),
            u'_backend': u'versioned-datastore',
        }
    except RequestError as e:
        raise toolkit.ValidationError(str(e))
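# Usage sketch, not from the source: run a raw elasticsearch terms aggregation at the latest
# version and get the unformatted elasticsearch response back. The resource id and the
# u'data.genus' field path are hypothetical (record fields live under u'data' per the mapping
# used above); empty dicts stand in for the action data_dicts since raw_result skips the
# plugin hooks that use them.
def _example_raw_aggregation(context):
    raw_query = {
        u'size': 0,
        u'aggs': {u'genera': {u'terms': {u'field': u'data.genus'}}},
    }
    return datastore_search_raw(u'my-resource-id', context, {}, {}, search=raw_query,
                                raw_result=True)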