def get_or_get_and_create_output_dataset(transform, input_dataset):
    """Return a writable DataSet for the transform's output.

    Looks the output data set up in Stagecraft; when it does not exist
    yet, creates it, copying selected settings from the input data set.
    """
    group = transform['output'].get('data-group', input_dataset['data_group'])
    data_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )

    existing = admin_api.get_data_set(group, data_type)
    if not existing:
        # Settings the new output data set inherits from the input one.
        new_config = {
            'data_type': data_type,
            'data_group': group,
            'bearer_token': input_dataset['bearer_token'],
            'realtime': input_dataset['realtime'],
            'published': input_dataset['published'],
            'max_age_expected': input_dataset['max_age_expected'],
        }
        # Only carry capped_size over when it is present and truthy.
        if input_dataset.get('capped_size'):
            new_config['capped_size'] = input_dataset['capped_size']
        existing = admin_api.create_data_set(new_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        group,
        data_type,
        token=existing['bearer_token'],
    )
def get_or_get_and_create_output_dataset(transform, input_dataset):
    """Fetch the transform's output data set, creating it if missing.

    The output group defaults to the input data set's group; the type
    always comes from the transform. Returns a write-capable DataSet.
    """
    output_group = transform['output'].get(
        'data-group', input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )

    output_data_set_config = admin_api.get_data_set(output_group, output_type)
    if not output_data_set_config:
        # Build the creation payload from the input data set's settings.
        data_set_config = {
            'data_type': output_type,
            'data_group': output_group,
        }
        for key in ('bearer_token', 'realtime', 'published',
                    'max_age_expected'):
            data_set_config[key] = input_dataset[key]
        # capped_size is optional: copy it only when present and truthy.
        if input_dataset.get('capped_size'):
            data_set_config['capped_size'] = input_dataset['capped_size']
        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
def get_module_choices():
    """Return (id, name) choice pairs for the module-type select box.

    Always starts with a blank choice. Skips the Stagecraft call when
    the TESTING environment variable is set; in DEBUG mode a connection
    failure is tolerated and the blank choice alone is returned.
    """
    choices = [('', '')]
    if getenv('TESTING', False):
        return choices
    try:
        # Unauthenticated client — listing module types needs no token.
        client = AdminAPI(app.config['STAGECRAFT_HOST'], None)
        choices.extend(
            (module_type['id'], module_type['name'])
            for module_type in client.list_module_types()
        )
    except requests.ConnectionError:
        # Outside DEBUG a missing Stagecraft is a hard failure.
        if not app.config['DEBUG']:
            raise
    return choices
def entrypoint(dataset_id, earliest, latest):
    """
    For the given parameters, query stagecraft for transformations to
    run, and dispatch tasks to the appropriate workers.
    """
    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )

    transforms = admin_api.get_data_set_transforms(dataset_id)
    data_set_config = admin_api.get_data_set_by_name(dataset_id)

    # One worker task per configured transform.
    for transform in transforms:
        app.send_task(
            'backdrop.transformers.dispatch.run_transform',
            args=(data_set_config, transform, earliest, latest),
        )
def entrypoint(dataset_id, earliest, latest):
    """
    For the given parameters, query stagecraft for transformations to
    run, and dispatch tasks to the appropriate workers.
    """
    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )

    transforms = admin_api.get_data_set_transforms(dataset_id)
    data_set_config = admin_api.get_data_set_by_name(dataset_id)

    for transform in transforms:
        # Fan each transform out to its worker and count the dispatch.
        task_args = (data_set_config, transform, earliest, latest)
        app.send_task(
            'backdrop.transformers.dispatch.run_transform',
            args=task_args
        )
        stats_client.incr('dispatch')
def compute(new_data, transform, data_set_config):
    """Build per-dashboard "latest value" records from new_data.

    Returns one document per published dashboard the data set appears
    on, each carrying the most recent value; returns [] when the new
    data is older than what is already stored (back-filling).
    """
    # Sort the new data by timestamp and use the latest data point.
    new_data.sort(key=lambda item: item['_timestamp'], reverse=True)
    latest_datum = new_data[0]

    # Only continue if we are not back filling data. The original body
    # was a no-op `pass`, which let stale back-fill data fall through
    # and overwrite the stored latest values — return early instead.
    if not is_latest_data(data_set_config, transform, latest_datum):
        return []

    # Input data won't have a unique key for each type of value.
    # E.g. completion rate and digital takeup are both "rate".
    # Use the data_type as the value key in the output, and map
    # the data_type to the expected key to get the value.
    value_key = data_type_to_value_mappings[data_set_config['data_type']]

    # A dataset may be present on multiple dashboards. Produce a
    # latest value for each published dashboard, keyed by slug.
    admin_api = AdminAPI(config.STAGECRAFT_URL, config.STAGECRAFT_OAUTH_TOKEN)
    latest_values = []
    configs = admin_api.get_data_set_dashboard(data_set_config['name'])

    # New dataset name convention uses underscores. Use the str method:
    # string.replace() is Python-2-only (removed from the module in 3.x).
    data_type = data_set_config['data_type'].replace('-', '_')

    for dashboard_config in configs:
        if (dashboard_config['published']
                and latest_datum[value_key] is not None):
            slug = dashboard_config['slug']
            latest_values.append({
                # encode_id derives the document id from slug + type.
                '_id': encode_id(slug, data_type),
                'dashboard_slug': slug,
                data_type: latest_datum[value_key],
                '_timestamp': latest_datum['_timestamp'],
                'service_id': slug,
            })
    return latest_values
def compute(new_data, transform, data_set_config):
    """Produce the latest-value documents for every published dashboard.

    Sorts new_data newest-first, takes the most recent datum, and emits
    one record per published dashboard slug. Returns [] when the datum
    is older than stored data (i.e. a back-fill).
    """
    # Sort the new data by timestamp and use the latest data point.
    new_data.sort(key=lambda item: item['_timestamp'], reverse=True)
    latest_datum = new_data[0]

    # Only continue if we are not back filling data. Previously this
    # branch was `pass` (a no-op), so back-filled data still overwrote
    # the stored latest values; bail out with an empty result instead.
    if not is_latest_data(data_set_config, transform, latest_datum):
        return []

    # Input data won't have a unique key for each type of value.
    # E.g. completion rate and digital takeup are both "rate".
    # Use the data_type as the value key in the output, and map
    # the data_type to the expected key to get the value.
    value_key = data_type_to_value_mappings[data_set_config['data_type']]

    # A dataset may be present on multiple dashboards. Produce a
    # latest value for each published dashboard, keyed by slug.
    admin_api = AdminAPI(config.STAGECRAFT_URL, config.STAGECRAFT_OAUTH_TOKEN)
    latest_values = []
    configs = admin_api.get_data_set_dashboard(data_set_config['name'])

    # New dataset name convention uses underscores. str.replace works on
    # Python 2 and 3; string.replace() only exists on Python 2.
    data_type = data_set_config['data_type'].replace('-', '_')

    for dashboard_config in configs:
        if (dashboard_config['published']
                and latest_datum[value_key] is not None):
            slug = dashboard_config['slug']
            latest_values.append({
                '_id': encode_id(slug, data_type),
                'dashboard_slug': slug,
                data_type: latest_datum[value_key],
                '_timestamp': latest_datum['_timestamp'],
                'service_id': slug,
            })
    return latest_values
def get_or_get_and_create_output_dataset(transform, input_dataset):
    """Return a writable DataSet for the transform's output.

    Looks the output data set up in Stagecraft; if missing, creates it
    as a copy of the input data set's config with the output group and
    type substituted in.
    """
    output_group = transform['output'].get(
        'data-group', input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )

    output_data_set_config = admin_api.get_data_set(output_group, output_type)
    if not output_data_set_config:
        # Clone the input config then override group/type. The original
        # `dict(a.items() + b.items())` is Python-2-only: dict views in
        # Python 3 do not support `+`.
        data_set_config = dict(input_dataset)
        data_set_config.update({
            'data_type': output_type,
            'data_group': output_group,
        })
        # Drop the input's name so it is not copied to the new data set;
        # pop() with a default avoids a KeyError if 'name' is absent.
        data_set_config.pop('name', None)
        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
'name': "number_of_digital_transactions", 'ignore': 'quarterly' }, { 'name': "number_of_transactions", 'ignore': 'quarterly' }, { 'name': "total_cost", 'ignore': 'quarterly' }, ] ADDITIONAL_FIELDS = ["end_at", "period", "service_id", "type"] admin_api = AdminAPI(config.STAGECRAFT_URL, config.STAGECRAFT_OAUTH_TOKEN) def _get_latest_data_point(sorted_data, data_point_name): def _use_data_point(data_point, name, ignore): should_not_be_ignored = (ignore != data_point['type']) return should_not_be_ignored name = data_point_name['name'] ignore = data_point_name['ignore'] # sorted_data should be pre sorted so # the first returned is always the most recent for data_point in sorted_data: if _use_data_point(data_point, name, ignore): return data_point