def get_or_get_and_create_output_dataset(transform, input_dataset):
    """Return a writeable DataSet for the transform's output, creating the
    data set in Stagecraft (based on the input data set's settings) if it
    does not already exist.
    """
    output_group = transform['output'].get(
        'data-group', input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )
    output_data_set_config = admin_api.get_data_set(output_group, output_type)

    if not output_data_set_config:
        data_set_config = {
            'data_type': output_type,
            'data_group': output_group,
            'bearer_token': input_dataset['bearer_token'],
            'realtime': input_dataset['realtime'],
            'published': input_dataset['published'],
            'max_age_expected': input_dataset['max_age_expected'],
        }
        if 'capped_size' in input_dataset and input_dataset['capped_size']:
            data_set_config['capped_size'] = input_dataset['capped_size']
        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest))

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'],
                                          transform,
                                          data_set_config)

    if 'additionalFields' in transform['options']:
        additionalFields = transform['options']['additionalFields']
        transformed_data = [
            merge_additional_fields(datum, additionalFields)
            for datum in transformed_data
        ]

    output_data_set = get_or_get_and_create_output_dataset(
        transform, data_set_config)
    output_data_set.post(transformed_data)

    stats_client.incr('run_transform.success')
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    earliest = parse_time_as_utc(earliest)
    latest = parse_time_as_utc(latest)

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest))

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'],
                                          transform,
                                          data_set_config)

    if 'additionalFields' in transform['options']:
        additionalFields = transform['options']['additionalFields']
        transformed_data = [
            merge_additional_fields(datum, additionalFields)
            for datum in transformed_data
        ]

    output_data_set = get_or_get_and_create_output_dataset(
        transform, data_set_config)
    output_data_set.post(transformed_data)

    stats_client.incr('run_transform.success')
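# Hedged sketch of the parse_time_as_utc helper used above. It is not shown
# in this excerpt; the assumption is that it accepts either an ISO 8601
# string or a datetime and returns a timezone-aware datetime normalised to
# UTC.
from datetime import datetime

import pytz
from dateutil import parser as date_parser


def parse_time_as_utc(timestamp):
    # Accept datetimes as-is; parse anything else as an ISO 8601 string.
    if not isinstance(timestamp, datetime):
        timestamp = date_parser.parse(timestamp)
    # Treat naive datetimes as already being in UTC.
    if timestamp.tzinfo is None:
        return timestamp.replace(tzinfo=pytz.UTC)
    return timestamp.astimezone(pytz.UTC)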
def is_latest_data(data_set_config, transform, latest_datum,
                   additional_read_params=None):
    """Read from backdrop to determine whether new data is the latest."""
    if additional_read_params is None:
        additional_read_params = {}

    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    transform_params = transform.get('query_parameters', {})
    generated_read_params = _get_read_params(
        transform_params, latest_datum['_timestamp'])
    read_params = dict(generated_read_params, **additional_read_params)

    existing_data = data_set.get(query_parameters=read_params)
    if existing_data['data']:
        if existing_data['data'][0]['_timestamp'] > latest_datum['_timestamp']:
            return False

    return True
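# Illustrative (assumed) usage of is_latest_data: post_if_latest is a
# hypothetical helper, not part of the original code, showing how a caller
# might gate a write so stale data is not pushed over newer records.
def post_if_latest(data_set_config, transform, output_data_set, new_data):
    if not new_data:
        return
    # Pick the newest datum we are about to write.
    latest_datum = max(new_data, key=lambda datum: datum['_timestamp'])
    # Only post if backdrop does not already hold something more recent.
    if is_latest_data(data_set_config, transform, latest_datum):
        output_data_set.post(new_data)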
def main(credentials, data_set_config, query, options, start_at, end_at):
    client = gapy.client.from_secrets_file(
        credentials['CLIENT_SECRETS'],
        storage_path=credentials['STORAGE_PATH'],
        http_client=HttpWithBackoff(),
    )
    ga_query = parse_query(query)
    collapse_key = "pageTitle"

    (start, middle, end) = get_date()

    data = client.query.get(
        ga_query['id'],
        start,
        end,
        [ga_query['metric']],
        ga_query['dimensions'],
        ga_query.get('filters'),
    )

    collapsed_data = sum_data(
        data, ga_query['metric'], collapse_key, (start, middle, end), 500)
    trended_data = get_trends(collapsed_data)
    flattened_data = flatten_data_and_assign_ids(trended_data)

    data_set = DataSet.from_config(data_set_config)
    data_set.empty_data_set()
    data_set.post(flattened_data)
def _get_pp_data(self, dataset_name, value, filter_by=None,
                 filter_by_prefix=None):
    dataset = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                          settings.DATA_GROUP,
                                          dataset_name)
    query_parameters = {
        'group_by': 'pagePath',
        'period': 'day',
        'start_at': self.start_date,
        'end_at': self.end_date,
        'collect': value,
    }
    if filter_by:
        query_parameters['filter_by'] = 'pagePath:' + filter_by
    elif filter_by_prefix:
        query_parameters['filter_by_prefix'] = 'pagePath:' + filter_by_prefix

    logger.debug('Getting {0} data with params {1}'.format(
        dataset_name, query_parameters))

    json_data = dataset.get(query_parameters)
    return json_data.get('data', [])
def send_records_for(self, query, to):
    data_set = DataSet.from_config(to)

    visitor_count = self._realtime.query(query)
    record = self._create_record(visitor_count, query.get('filters', ''))

    data_set.post(record)
def save_aggregated_results(self, results):
    data_set = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                           settings.DATA_GROUP,
                                           settings.RESULTS_DATASET,
                                           token=self.pp_token)
    enriched_results = [self._enrich_mandatory_pp_fields(result)
                        for result in results]
    logger.info('Posting data to Performance Platform')
    data_set.post(enriched_results)
def main(credentials, data_set_config, query, options, start_at, end_at,
         filename=None):
    nuke_local_database()
    if filename is not None:
        with open(filename, 'r') as f:
            save_raw_data(f)
    else:
        save_raw_data(download_url(get_latest_csv_url(INDEX_URL)))
    aggregate_and_save()

    data_set = DataSet.from_config(data_set_config)
    data_set.empty_data_set()
    push_aggregates(data_set)
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest))

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'],
                                          transform,
                                          data_set_config)

    output_data_set = get_or_get_and_create_output_dataset(
        transform, data_set_config)
    output_data_set.post(transformed_data)
def main(credentials, data_set_config, query, options, start_at, end_at):
    client = create_client(credentials)
    collapse_key = "pageTitle"

    (start, middle, end) = get_date()

    data = query_ga(client, end, query, start)

    collapsed_data = sum_data(
        data, query['metric'], collapse_key, (start, middle, end), 500)
    trended_data = get_trends(collapsed_data)
    flattened_data = flatten_data_and_assign_ids(trended_data)

    data_set = DataSet.from_config(data_set_config)
    data_set.empty_data_set()
    data_set.post(flattened_data)
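# Hedged sketch of create_client, assuming it simply wraps the gapy client
# construction shown in the earlier version of main above (from_secrets_file
# with a retrying HTTP client). HttpWithBackoff is assumed to be importable
# from the surrounding collector module.
import gapy.client


def create_client(credentials):
    return gapy.client.from_secrets_file(
        credentials['CLIENT_SECRETS'],
        storage_path=credentials['STORAGE_PATH'],
        http_client=HttpWithBackoff(),
    )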
def get_or_get_and_create_output_dataset(transform, input_dataset):
    output_group = transform['output'].get('data-group',
                                           input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )
    output_data_set_config = admin_api.get_data_set(output_group, output_type)

    if not output_data_set_config:
        # Base the new data set's config on the input data set, overriding
        # group and type; the input data set's name is not carried over.
        data_set_config = dict(input_dataset,
                               data_type=output_type,
                               data_group=output_group)
        del data_set_config['name']
        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
def get_old_data(data_group_name, data_type_name):
    data_set_client = client.from_group_and_type(get_qualified_backdrop_url(),
                                                 data_group_name,
                                                 data_type_name)
    return data_set_client.get().json()['data']
def post_new_data(data_group_name, data_type_name, bearer_token, data):
    data_set_client = client.from_group_and_type(get_qualified_backdrop_url(),
                                                 data_group_name,
                                                 data_type_name,
                                                 token=bearer_token)
    return data_set_client.post(data)
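# Illustrative (assumed) usage: get_old_data and post_new_data can be paired
# to copy one backdrop data set's contents into another. copy_data_set is a
# hypothetical helper, not part of the original code.
def copy_data_set(old_group, old_type, new_group, new_type, bearer_token):
    data = get_old_data(old_group, old_type)
    return post_new_data(new_group, new_type, bearer_token, data)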
def push_stats_to_data_set(pingdom_stats, check_name, data_set_config):
    data_set = DataSet.from_config(data_set_config)
    data_set.post(
        [convert_from_pingdom_to_performanceplatform(stat, check_name)
         for stat in pingdom_stats])
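# Illustrative (assumed) usage of push_stats_to_data_set. The keys below are
# an assumption about the dict consumed by DataSet.from_config: a full
# data-set URL, a bearer token and a dry-run flag.
example_data_set_config = {
    'url': 'https://www.performance.service.gov.uk/data/my-group/my-type',
    'token': 'example-bearer-token',
    'dry_run': True,  # avoid a real write in this illustration
}
# An empty list stands in for stats fetched from the Pingdom API elsewhere.
push_stats_to_data_set([], 'example-check', example_data_set_config)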
def __init__(self, target_data_set_config, options):
    self.data_set_client = DataSet.from_config(target_data_set_config)
    self.chunk_size = options.get('chunk-size', 100)
    self.empty_data_set = options.get('empty-data-set', False)
def __init__(self, target_data_set_config, options):
    self.data_set_client = DataSet.from_config(target_data_set_config)
    self.chunk_size = options.get('chunk-size', 100)
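# Hedged sketch of how the options captured in the constructors above might
# be used by a push method: records are written in chunks of chunk_size,
# after optionally emptying the target data set (as configured in the longer
# __init__ variant). This method is an assumption, not the original code.
def push(self, records):
    if getattr(self, 'empty_data_set', False):
        self.data_set_client.empty_data_set()
    for start in range(0, len(records), self.chunk_size):
        self.data_set_client.post(records[start:start + self.chunk_size])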