Example #1
def get_or_get_and_create_output_dataset(transform, input_dataset):
    output_group = transform['output'].get('data-group',
                                           input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )
    output_data_set_config = admin_api.get_data_set(output_group, output_type)
    if not output_data_set_config:
        data_set_config = {
            'data_type': output_type,
            'data_group': output_group,
            'bearer_token': input_dataset['bearer_token'],
            'realtime': input_dataset['realtime'],
            'published': input_dataset['published'],
            'max_age_expected': input_dataset['max_age_expected'],
        }

        if 'capped_size' in input_dataset and input_dataset['capped_size']:
            data_set_config['capped_size'] = input_dataset['capped_size']

        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
Example #2
def get_or_get_and_create_output_dataset(transform, input_dataset):
    output_group = transform['output'].get(
        'data-group', input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )
    output_data_set_config = admin_api.get_data_set(output_group, output_type)
    if not output_data_set_config:
        data_set_config = {
            'data_type': output_type,
            'data_group': output_group,
            'bearer_token': input_dataset['bearer_token'],
            'realtime': input_dataset['realtime'],
            'published': input_dataset['published'],
            'max_age_expected': input_dataset['max_age_expected'],
        }

        if 'capped_size' in input_dataset and input_dataset['capped_size']:
            data_set_config['capped_size'] = input_dataset['capped_size']

        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
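For context, a minimal sketch of how this helper might be called. The key names are inferred from the lookups above; the concrete values are hypothetical.

# Hypothetical inputs, shaped after the keys the function reads.
transform = {
    'output': {'data-type': 'summaries'},  # 'data-group' falls back to the input's
}
input_dataset = {
    'data_group': 'transactions',
    'bearer_token': 'example-bearer-token',
    'realtime': False,
    'published': True,
    'max_age_expected': 86400,
}
output_data_set = get_or_get_and_create_output_dataset(transform, input_dataset)
output_data_set.post([{'_id': 'example', 'count': 1}])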
Example #3
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest))

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'], transform,
                                          data_set_config)

    if 'additionalFields' in transform['options']:
        additionalFields = transform['options']['additionalFields']
        transformed_data = [
            merge_additional_fields(datum, additionalFields)
            for datum in transformed_data
        ]

    output_data_set = get_or_get_and_create_output_dataset(
        transform, data_set_config)
    output_data_set.post(transformed_data)

    stats_client.incr('run_transform.success')
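merge_additional_fields is not shown in the listing. A plausible sketch, assuming it overlays the configured static fields onto each transformed datum (an assumption, not the project's confirmed helper):

def merge_additional_fields(datum, additional_fields):
    # Copy the datum and overlay the static fields from the
    # transform's 'additionalFields' option.
    merged = dict(datum)
    merged.update(additional_fields)
    return merged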
Example #4
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    earliest = parse_time_as_utc(earliest)
    latest = parse_time_as_utc(latest)

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest)
    )

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'],
                                          transform,
                                          data_set_config)

    if 'additionalFields' in transform['options']:
        additionalFields = transform['options']['additionalFields']
        transformed_data = [
            merge_additional_fields(datum, additionalFields)
            for datum in transformed_data
        ]

    output_data_set = get_or_get_and_create_output_dataset(
        transform,
        data_set_config)
    output_data_set.post(transformed_data)

    stats_client.incr('run_transform.success')
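This variant normalises earliest and latest before querying. parse_time_as_utc is not shown; a plausible sketch, assuming ISO 8601 or datetime input and python-dateutil (an assumption, not necessarily the project's helper):

from datetime import datetime, timezone

from dateutil import parser

def parse_time_as_utc(timestamp):
    # Accept a datetime or an ISO 8601 string and return an aware
    # UTC datetime. Sketch only; the real helper may differ.
    if not isinstance(timestamp, datetime):
        timestamp = parser.parse(timestamp)
    if timestamp.tzinfo is None:
        return timestamp.replace(tzinfo=timezone.utc)
    return timestamp.astimezone(timezone.utc)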
Example #5
def is_latest_data(data_set_config,
                   transform,
                   latest_datum,
                   additional_read_params=None):
    """
    Read from backdrop to determine if new data is the latest.
    """

    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type']
    )

    transform_params = transform.get('query_parameters', {})
    generated_read_params = _get_read_params(
        transform_params, latest_datum['_timestamp'])
    read_params = dict(generated_read_params)
    read_params.update(additional_read_params or {})
    existing_data = data_set.get(query_parameters=read_params)

    if existing_data['data']:
        if existing_data['data'][0]['_timestamp'] > latest_datum['_timestamp']:
            return False

    return True
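The [0] indexing above only works if the read returns the newest matching record first. _get_read_params is not shown; a plausible sketch, assuming Backdrop-style sort_by and limit query parameters:

def _get_read_params(transform_params, latest_timestamp):
    # Assumed helper: reuse the transform's query parameters and ask
    # for the single newest record at or after the new datum's time.
    read_params = dict(transform_params)
    read_params.update({
        'start_at': latest_timestamp,
        'sort_by': '_timestamp:descending',
        'limit': 1,
    })
    return read_params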
Example #6
def main(credentials, data_set_config, query, options, start_at, end_at):
    client = gapy.client.from_secrets_file(
        credentials['CLIENT_SECRETS'],
        storage_path=credentials['STORAGE_PATH'],
        http_client=HttpWithBackoff(),
    )

    ga_query = parse_query(query)

    collapse_key = "pageTitle"

    (start, middle, end) = get_date()

    data = client.query.get(
        ga_query['id'],
        start,
        end,
        [ga_query['metric']],
        ga_query['dimensions'],
        ga_query['filters'] if 'filters' in ga_query else None
    )

    collapsed_data = sum_data(data, ga_query['metric'], collapse_key,
                              (start, middle, end), 500)
    trended_data = get_trends(collapsed_data)
    flattened_data = flatten_data_and_assign_ids(trended_data)

    data_set = DataSet.from_config(data_set_config)

    data_set.empty_data_set()
    data_set.post(flattened_data)
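empty_data_set() followed by post() replaces the data set's contents wholesale, so a crash between the two calls leaves it empty. A slightly more defensive ordering, sketched under that assumption:

# Only clear the old records once the replacement data exists.
if flattened_data:
    data_set.empty_data_set()
    data_set.post(flattened_data)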
Example #7
    def _get_pp_data(self,
                     dataset_name,
                     value,
                     filter_by=None,
                     filter_by_prefix=None):
        dataset = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                              settings.DATA_GROUP,
                                              dataset_name)
        query_parameters = {
            'group_by': 'pagePath',
            'period': 'day',
            'start_at': self.start_date,
            'end_at': self.end_date,
            'collect': value,
        }
        if filter_by:
            query_parameters['filter_by'] = 'pagePath:' + filter_by
        elif filter_by_prefix:
            query_parameters[
                'filter_by_prefix'] = 'pagePath:' + filter_by_prefix

        logger.debug('Getting {0} data with params {1}'.format(
            dataset_name, query_parameters))
        json_data = dataset.get(query_parameters)

        if 'data' in json_data:
            return json_data['data']
        else:
            return []
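As an illustration, a call such as self._get_pp_data('page-statistics', 'uniquePageviews', filter_by='/vat-rates') (dataset name, metric and path are hypothetical) would send query parameters like:

query_parameters = {
    'group_by': 'pagePath',
    'period': 'day',
    'start_at': '2014-01-01T00:00:00Z',  # self.start_date
    'end_at': '2014-01-08T00:00:00Z',    # self.end_date
    'collect': 'uniquePageviews',
    'filter_by': 'pagePath:/vat-rates',
}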
Example #8
    def send_records_for(self, query, to):
        data_set = DataSet.from_config(to)

        visitor_count = self._realtime.query(query)

        record = self._create_record(visitor_count, query.get('filters', ''))

        data_set.post(record)
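A hedged usage sketch; collector, the realtime query shape and the data set config values are assumptions inferred from how they are used above:

collector.send_records_for(
    {'filters': 'ga:pagePath==/example-page'},
    to={
        'url': 'https://example.performance.service.gov.uk/data/group/type',
        'token': 'example-bearer-token',
        'dry_run': False,
    },
)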
Example #9
    def save_aggregated_results(self, results):
        data_set = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                               settings.DATA_GROUP,
                                               settings.RESULTS_DATASET,
                                               token=self.pp_token)
        enriched_results = [self._enrich_mandatory_pp_fields(result)
                            for result in results]
        logger.info('Posting data to Performance Platform')
        data_set.post(enriched_results)
Example #10
    def send_records_for(self, query, to):
        data_set = DataSet.from_config(to)

        visitor_count = self._realtime.query(query)

        record = self._create_record(visitor_count,
                                     query.get('filters', ''))

        data_set.post(record)
Example #11
    def save_aggregated_results(self, results):
        data_set = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                               settings.DATA_GROUP,
                                               settings.RESULTS_DATASET,
                                               token=self.pp_token)
        enriched_results = [
            self._enrich_mandatory_pp_fields(result) for result in results
        ]
        logger.info('Posting data to Performance Platform')
        data_set.post(enriched_results)
Example #12
def main(credentials, data_set_config, query, options, start_at, end_at,
         filename=None):

    nuke_local_database()

    if filename is not None:
        with open(filename, 'r') as f:
            save_raw_data(f)
    else:
        save_raw_data(download_url(get_latest_csv_url(INDEX_URL)))

    aggregate_and_save()

    data_set = DataSet.from_config(data_set_config)
    data_set.empty_data_set()
    push_aggregates(data_set)
Example #13
def run_transform(data_set_config, transform, earliest, latest):
    data_set = DataSet.from_group_and_type(
        config.BACKDROP_READ_URL,
        data_set_config['data_group'],
        data_set_config['data_type'],
    )

    data = data_set.get(
        query_parameters=get_query_parameters(transform, earliest, latest))

    transform_function = get_transform_function(transform)
    transformed_data = transform_function(data['data'], transform,
                                          data_set_config)

    output_data_set = get_or_get_and_create_output_dataset(
        transform, data_set_config)
    output_data_set.post(transformed_data)
Example #14
def main(credentials, data_set_config, query, options, start_at, end_at):
    client = create_client(credentials)

    collapse_key = "pageTitle"

    (start, middle, end) = get_date()

    data = query_ga(client, end, query, start)

    collapsed_data = sum_data(data, query['metric'], collapse_key,
                              (start, middle, end), 500)
    trended_data = get_trends(collapsed_data)
    flattened_data = flatten_data_and_assign_ids(trended_data)

    data_set = DataSet.from_config(data_set_config)

    data_set.empty_data_set()
    data_set.post(flattened_data)
Example #15
def main(credentials, data_set_config, query, options, start_at, end_at):
    client = create_client(credentials)

    collapse_key = "pageTitle"

    (start, middle, end) = get_date()

    data = query_ga(client, end, query, start)

    collapsed_data = sum_data(data, query['metric'], collapse_key,
                              (start, middle, end), 500)
    trended_data = get_trends(collapsed_data)
    flattened_data = flatten_data_and_assign_ids(trended_data)

    data_set = DataSet.from_config(data_set_config)

    data_set.empty_data_set()
    data_set.post(flattened_data)
Example #16
def get_or_get_and_create_output_dataset(transform, input_dataset):
    output_group = transform['output'].get('data-group',
                                           input_dataset['data_group'])
    output_type = transform['output']['data-type']

    admin_api = AdminAPI(
        config.STAGECRAFT_URL,
        config.STAGECRAFT_OAUTH_TOKEN,
    )
    output_data_set_config = admin_api.get_data_set(output_group, output_type)
    if not output_data_set_config:
        data_set_config = dict(input_dataset,
                               data_type=output_type,
                               data_group=output_group)
        del data_set_config['name']
        output_data_set_config = admin_api.create_data_set(data_set_config)

    return DataSet.from_group_and_type(
        config.BACKDROP_WRITE_URL,
        output_group,
        output_type,
        token=output_data_set_config['bearer_token'],
    )
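A small demonstration of the merge-and-override semantics this variant relies on (the extra field names are hypothetical):

input_dataset = {'name': 'old-name', 'data_group': 'old-group',
                 'data_type': 'old-type', 'bearer_token': 'tok'}
merged = dict(input_dataset, data_type='new-type', data_group='new-group')
del merged['name']
# merged == {'data_group': 'new-group', 'data_type': 'new-type',
#            'bearer_token': 'tok'}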
Example #17
    def _get_pp_data(self, dataset_name, value,
                     filter_by=None, filter_by_prefix=None):
        dataset = DataSet.from_group_and_type(settings.DATA_DOMAIN,
                                              settings.DATA_GROUP,
                                              dataset_name)
        query_parameters = {
            'group_by': 'pagePath',
            'period': 'day',
            'start_at': self.start_date,
            'end_at': self.end_date,
            'collect': value,
        }
        if filter_by:
            query_parameters['filter_by'] = 'pagePath:' + filter_by
        elif filter_by_prefix:
            query_parameters['filter_by_prefix'] = 'pagePath:' + filter_by_prefix

        logger.debug('Getting {0} data with params {1}'.format(dataset_name, query_parameters))
        json_data = dataset.get(query_parameters)

        if 'data' in json_data:
            return json_data['data']
        else:
            return []
Example #18
def get_old_data(data_group_name, data_type_name):
    data_set_client = client.from_group_and_type(get_qualified_backdrop_url(),
                                                 data_group_name,
                                                 data_type_name)
    return data_set_client.get().json()['data']
Example #19
def post_new_data(data_group_name, data_type_name, bearer_token, data):
    data_set_client = client.from_group_and_type(get_qualified_backdrop_url(),
                                                 data_group_name,
                                                 data_type_name,
                                                 token=bearer_token)
    return data_set_client.post(data)
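Together, Examples #18 and #19 support a simple copy from one data set to another. A minimal sketch; the group, type and token values are hypothetical, and which server-generated fields need stripping before re-posting is an assumption:

old_data = get_old_data('transactions', 'quarterly')
for datum in old_data:
    # Drop fields Backdrop may add on read before re-posting.
    datum.pop('_updated_at', None)
post_new_data('transactions', 'quarterly-v2', 'example-bearer-token', old_data)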
Example #20
def push_stats_to_data_set(pingdom_stats, check_name, data_set_config):
    data_set = DataSet.from_config(data_set_config)
    data_set.post([
        convert_from_pingdom_to_performanceplatform(thing, check_name)
        for thing in pingdom_stats
    ])
Example #21
    def __init__(self, target_data_set_config, options):
        self.data_set_client = DataSet.from_config(target_data_set_config)
        self.chunk_size = options.get('chunk-size', 100)
        self.empty_data_set = options.get('empty-data-set', False)
Example #22
    def __init__(self, target_data_set_config, options):
        self.data_set_client = DataSet.from_config(target_data_set_config)
        self.chunk_size = options.get('chunk-size', 100)
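The chunk-size and empty-data-set options suggest batched writes. A sketch of how the rest of such a class might use them, extending Example #21's __init__ (the push method and its behaviour are assumptions):

    def push(self, records):
        # Optionally clear the target first, then post in batches of
        # self.chunk_size to keep request bodies small.
        if self.empty_data_set:
            self.data_set_client.empty_data_set()
        for i in range(0, len(records), self.chunk_size):
            self.data_set_client.post(records[i:i + self.chunk_size])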