def run_asynchronous_job(job, job_id, job_key, input):
    if not scheduler.running:
        scheduler.start()
    try:
        db.add_pending_job(job_id, job_key, **input)
    except sa.exc.IntegrityError:
        error_string = 'job_id {} already exists'.format(job_id)
        return json.dumps({"error": error_string}), 409, headers
    scheduler.add_job(RunNowTrigger(), job, [job_id, input], None)

    return job_status(job_id=job_id, show_job_key=True, ignore_auth=True)
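# `RunNowTrigger` is not defined in this excerpt. The sketch below is an
# assumption about its shape: a small custom trigger for the older
# APScheduler (2.x) interface that fires exactly once, immediately, so the
# job added above starts straight away. The real project may define it
# differently.
import datetime


class RunNowTrigger(object):
    '''Assumed shape of a run-once trigger (sketch, not the real class).'''

    def __init__(self):
        self.run = False

    def get_next_fire_time(self, start_date):
        # Fire now on the first call; returning None afterwards tells the
        # scheduler there are no further runs, so the job is dropped.
        if not self.run:
            self.run = True
            return datetime.datetime.now()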
def run_synchronous_job(job, job_id, job_key, input):
    try:
        db.add_pending_job(job_id, job_key, **input)
    except sa.exc.IntegrityError:
        error_string = 'job_id {} already exists'.format(job_id)
        return json.dumps({"error": error_string}), 409, headers
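# Both entry points above rely on the database to detect duplicate
# submissions: `db.add_pending_job` is assumed to INSERT a row keyed on
# job_id, so a second call with the same id violates the primary-key
# constraint and raises sqlalchemy.exc.IntegrityError, which the callers
# turn into a 409 response. A self-contained sketch of that mechanism (the
# table name and columns here are illustrative assumptions, not the
# project's real schema):
import sqlalchemy as sa


def _add_pending_job_sketch(engine, job_id, job_key):
    metadata = sa.MetaData()
    jobs = sa.Table(
        'jobs_sketch', metadata,
        sa.Column('job_id', sa.UnicodeText, primary_key=True),
        sa.Column('job_key', sa.UnicodeText),
        sa.Column('status', sa.UnicodeText),
    )
    metadata.create_all(engine)
    with engine.begin() as conn:
        # A second call with the same job_id raises sa.exc.IntegrityError
        # here.
        conn.execute(jobs.insert().values(
            job_id=job_id, job_key=job_key, status='pending'))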
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status

    (datapusher called this function 'push_to_datastore')
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set-up logging to the db
    handler = StoringHandler(job_id, input)
    level = logging.DEBUG
    handler.setLevel(level)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    except (JobError, ObjectNotFound) as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # download resource
    tmp_file, file_hash = _download_resource_data(resource, data, api_key,
                                                  logger)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info('Ignoring resource - the file hash hasn\'t changed: '
                    '{hash}.'.format(hash=file_hash))
        return
    logger.info('File hash: {}'.format(file_hash))
    resource['hash'] = file_hash

    def direct_load():
        fields = loader.load_csv(
            tmp_file.name,
            resource_id=resource['id'],
            resource_alias=resource['name'],
            mimetype=resource.get('format'),
            logger=logger)
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        job_dict['status'] = 'running_but_viewable'
        callback_xloader_hook(result_url=input['result_url'],
                              api_key=api_key,
                              job_dict=job_dict)
        logger.info('Data now available to users: {}'
                    .format(resource_ckan_url))
        loader.create_column_indexes(
            fields=fields,
            resource_id=resource['id'],
            logger=logger)
        update_resource(resource={'id': resource['id'],
                                  'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'
                    .format(resource['hash']))

    def messytables_load():
        try:
            loader.load_table(tmp_file.name,
                              resource_id=resource['id'],
                              resource_alias=resource['name'],
                              mimetype=resource.get('format'),
                              logger=logger)
        except JobError as e:
            logger.error('Error during messytables load: {}'.format(e))
            raise
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        logger.info('Finished loading with messytables')
        update_resource(resource={'id': resource['id'],
                                  'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'
                    .format(resource['hash']))

    # Load it
    logger.info('Loading CSV')
    just_load_with_messytables = asbool(config.get(
        'ckanext.xloader.just_load_with_messytables', False))
    logger.info("'Just load with messytables' mode is: {}".format(
        just_load_with_messytables))
    try:
        if just_load_with_messytables:
            messytables_load()
        else:
            try:
                direct_load()
            except JobError as e:
                logger.warning('Load using COPY failed: {}'.format(e))
                logger.info('Trying again with messytables')
                messytables_load()
    except FileCouldNotBeLoadedError as e:
        logger.warning('Loading excerpt for this format not supported.')
        logger.error('Loading file raised an error: {}'.format(e))
        raise JobError('Loading file raised an error: {}'.format(e))
    tmp_file.close()

    logger.info('Express Load completed')
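# `StoringHandler` (used above to set up logging to the db) is not defined
# in this excerpt. The sketch below is an assumption about its role: a
# logging.Handler subclass that persists each record against its job so the
# job-status API can report progress later. `db.add_logs` is a hypothetical
# helper standing in for whatever storage call the real db module provides.
import logging


class StoringHandler(logging.Handler):
    '''Assumed shape of the per-job db logging handler (sketch only).'''

    def __init__(self, task_id, input):
        logging.Handler.__init__(self)
        self.task_id = task_id
        self.input = input

    def emit(self, record):
        # Store the formatted message and its level keyed by the job id.
        # (db.add_logs is an assumed helper, not a confirmed API.)
        db.add_logs(self.task_id, self.format(record), record.levelname)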