def __init__(self, config, reports_definition):
    """Keep the report definitions and create the Google Analytics client.

    Args:
        config: Tap configuration, handed to GAClient unchanged.
        reports_definition: The report definitions this helper works on.
    """
    # The GAClient is the source of the valid (dimension, metric) names
    # and of their data types.
    self.client = GAClient(config)
    self.reports_definition = reports_definition
class ReportsHelper:
    """Turn report definitions into a Singer catalog and validate them."""

    def __init__(self, config, reports_definition):
        """Keep the report definitions and create the Google Analytics client.

        Args:
            config: Tap configuration, handed to GAClient unchanged.
            reports_definition: Iterable of report definitions; each one is
                read through its 'name', 'dimensions' and 'metrics' keys.
        """
        self.reports_definition = reports_definition

        # The GAClient is the source of the valid (dimension, metric) names
        # and of their data types.
        self.client = GAClient(config)

    @staticmethod
    def _attribute_metadata(attribute, ga_type):
        """Build the catalog metadata entry of one schema attribute.

        The custom `ga_type` keyword ('dimension' or 'metric') is stored so
        that the attribute can later be routed to the right list when the
        report query is generated.
        """
        return {
            "metadata": {
                "inclusion": "automatic",
                "selected-by-default": True,
                "ga_type": ga_type
            },
            "breadcrumb": ["properties", attribute]
        }

    def generate_catalog(self):
        """
        Generate the catalog based on the reports definition.

        Assumptions:
        + All streams and attributes are automatically included.
        + Every dimension is also a key property of its stream.
        + Every schema attribute carries a custom metadata keyword
          `ga_type: dimension | metric`. It is needed afterwards to build
          the query sent to the GA API, because dimensions and metrics are
          sent as separate lists of attributes.
        + The {report_start_date, report_end_date} params of the report
          query are added to every schema, so the date range of a record
          is known even when 'ga:date' is not a requested dimension.
          When 'ga:date' is not one of the dimensions, those two
          attributes are added as keys as well. For example, for user
          stats by device or by source they become part of the key that
          uniquely identifies the generated stats, so rows can be
          identified and updated over overlapping runs of the tap.
        + The data type of every attribute is looked up through the
          GAClient.

        Returns:
            A dict of the form {'streams': [...]} holding one catalog
            entry per report definition.
        """
        catalog_entries = []

        for report in self.reports_definition:
            stream_name = report['name']
            properties = {}
            key_properties = []
            attribute_metadata = []

            # Dimensions are not nullable and all of them are keys.
            # Also remember whether a date is one of the dimensions.
            has_date_dimension = False
            for ga_name in report['dimensions']:
                if ga_name == 'ga:date':
                    has_date_dimension = True

                data_type = self.client.lookup_data_type('dimension', ga_name)
                attribute = ga_name.replace("ga:", "ga_")

                properties[attribute] = {"type": [data_type]}
                key_properties.append(attribute)
                attribute_metadata.append(
                    self._attribute_metadata(attribute, 'dimension'))

            # Metrics are allowed to have null values and are never keys.
            for ga_name in report['metrics']:
                data_type = self.client.lookup_data_type('metric', ga_name)
                attribute = ga_name.replace("ga:", "ga_")

                properties[attribute] = {"type": ["null", data_type]}
                attribute_metadata.append(
                    self._attribute_metadata(attribute, 'metric'))

            # The date range params of the report query are always part of
            # the schema ...
            properties['report_start_date'] = {"type": ["string"]}
            properties['report_end_date'] = {"type": ["string"]}

            # ... and become keys when 'ga:date' is not a dimension.
            if not has_date_dimension:
                key_properties.extend(
                    ('report_start_date', 'report_end_date'))

            # The stream level metadata (empty breadcrumb) goes first so
            # that everything is neatly organized in the catalog.
            stream_metadata = {
                "metadata": {
                    "inclusion": "automatic",
                    "table-key-properties": key_properties
                },
                "breadcrumb": []
            }

            catalog_entries.append({
                'stream': stream_name,
                'tap_stream_id': stream_name,
                'schema': {
                    "type": ["null", "object"],
                    "additionalProperties": False,
                    "properties": properties
                },
                'metadata': [stream_metadata] + attribute_metadata
            })

        return {'streams': catalog_entries}

    def validate(self):
        """Check every report definition and exit with status 1 on a problem.

        A report must provide 'name', 'dimensions' and 'metrics', request
        between 1 and 10 metrics and at most 7 dimensions, and use only
        names accepted by validate_dimensions() / validate_metrics().
        """
        required_keys = ('name', 'dimensions', 'metrics')

        for report in self.reports_definition:
            try:
                name, dimensions, metrics = (
                    report[key] for key in required_keys)
            except KeyError:
                LOGGER.critical("Report definition is missing one of the required properties (name, dimensions, metrics)")
                sys.exit(1)

            # Check that not too many metrics && dimensions are requested
            metric_count = len(metrics)
            if metric_count == 0:
                LOGGER.critical("'{}' has no metrics defined. GA reports must specify at least one metric.".format(name))
                sys.exit(1)
            elif metric_count > 10:
                LOGGER.critical("'{}' has too many metrics defined. GA reports can have maximum 10 metrics.".format(name))
                sys.exit(1)

            if len(dimensions) > 7:
                LOGGER.critical("'{}' has too many dimensions defined. GA reports can have maximum 7 dimensions.".format(name))
                sys.exit(1)

            self.validate_dimensions(dimensions)
            self.validate_metrics(metrics)

            # ToDo: also check that the given metrics can be used together
            #       with the given dimensions. Not all dimensions and
            #       metrics can be queried together; only certain ones
            #       form valid combinations.

    def validate_dimensions(self, dimensions):
        """Exit with status 1 on the first name that is not a GA dimension.

        Names starting with one of the custom prefixes are accepted as is;
        any other name has to be present in the client's dimensions_ref.
        """
        custom_prefixes = (
            'ga:dimension', 'ga:customVarName', 'ga:customVarValue')

        for dimension in dimensions:
            if dimension.startswith(custom_prefixes) \
                    or dimension in self.client.dimensions_ref:
                continue

            LOGGER.critical("'{}' is not a valid Google Analytics dimension".format(dimension))
            LOGGER.info("For details see https://developers.google.com/analytics/devguides/reporting/core/dimsmets")
            sys.exit(1)

    def validate_metrics(self, metrics):
        """Exit with status 1 on the first name that is not a GA metric.

        Goal metrics, search goal conversion rates and custom / calculated
        metrics are accepted by their name pattern; any other name has to
        be present in the client's metrics_ref.
        """
        goal_suffixes = (
            'Starts', 'Completions', 'Value',
            'ConversionRate', 'Abandons', 'AbandonRate')

        for metric in metrics:
            # Custom metrics {ga:goalXXStarts, ga:goalXXValue, ... }
            is_goal = metric.startswith('ga:goal') \
                and metric.endswith(goal_suffixes)
            # Custom metrics ga:searchGoalXXConversionRate
            is_search_goal = metric.startswith('ga:searchGoal') \
                and metric.endswith('ConversionRate')
            is_custom = metric.startswith(('ga:metric', 'ga:calcMetric'))

            if is_goal or is_search_goal or is_custom \
                    or metric in self.client.metrics_ref:
                continue

            LOGGER.critical("'{}' is not a valid Google Analytics metric".format(metric))
            LOGGER.info("For details see https://developers.google.com/analytics/devguides/reporting/core/dimsmets")
            sys.exit(1)

    @staticmethod
    def get_report_definition(stream):
        """Rebuild a report definition from a catalog stream entry.

        Every schema property is sorted into 'dimensions' or 'metrics'
        according to the `ga_type` stored in its metadata; properties with
        any other `ga_type` are left out.

        Returns:
            A dict with the keys 'name', 'dimensions' and 'metrics'.
        """
        dimensions = []
        metrics = []
        report = {
            "name": stream['tap_stream_id'],
            "dimensions": dimensions,
            "metrics": metrics
        }

        metadata_map = singer.metadata.to_map(stream['metadata'])

        for attribute in stream['schema']['properties']:
            ga_type = singer.metadata.get(
                metadata_map, ('properties', attribute), "ga_type")

            if ga_type == 'dimension':
                dimensions.append(attribute)
            elif ga_type == 'metric':
                metrics.append(attribute)

        return report
def __init__(self, config, reports_definition):
    """Keep the report definitions and create the Google Analytics client.

    Args:
        config: Tap configuration; its "view_id" entry is passed to
            GAClient as the first argument, followed by the config itself.
        reports_definition: The report definitions this helper works on.
    """
    self.reports_definition = reports_definition

    # The GAClient is the source of the valid (dimension, metric) names
    # and of their data types. Its third argument is a fresh, empty dict.
    view_id = config.get("view_id")
    self.client = GAClient(view_id, config, {})
def sync(config, state, catalog):
    """Sync all selected streams of the catalog and write Singer messages.

    For every selected stream the report is fetched first; the schema and
    the records are only written once fetching succeeded.

    Error handling per stream:
    + invalid report definition, rate limit and quota errors: the stream
      is skipped and, after all streams were processed, the process exits
      with status 1.
    + authentication and unknown errors: the process exits with status 1
      immediately.

    Args:
        config: Tap configuration, handed to GAClient.
        state: Not used by this function.
        catalog: Catalog dict whose 'streams' entries are synced.
    """
    had_errors = False
    selected = get_selected_streams(catalog)
    ga_client = GAClient(config)

    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        schema = stream['schema']
        metadata_map = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(
            metadata_map, (), "table-key-properties")

        if stream_id not in selected:
            LOGGER.info('Skipping unselected stream: ' + stream_id)
            continue

        LOGGER.info('Syncing stream: ' + stream_id)

        try:
            report_definition = ReportsHelper.get_report_definition(stream)
            results = ga_client.process_stream(report_definition)

            # The schema message is written only after we are sure that
            # the records could be fetched without errors
            singer.write_schema(stream_id, schema, key_properties)
            singer.write_records(stream_id, results)
        except TapGaInvalidArgumentError as err:
            had_errors = True
            message = "Skipping stream: '{}' due to invalid report definition.".format(stream_id)
            LOGGER.error(message)
            LOGGER.debug("Error: '{}'.".format(err))
        except TapGaRateLimitError as err:
            had_errors = True
            message = "Skipping stream: '{}' due to Rate Limit Errors.".format(stream_id)
            LOGGER.error(message)
            LOGGER.debug("Error: '{}'.".format(err))
        except TapGaQuotaExceededError as err:
            had_errors = True
            message = "Skipping stream: '{}' due to Quota Exceeded Errors.".format(stream_id)
            LOGGER.error(message)
            LOGGER.debug("Error: '{}'.".format(err))
        except TapGaAuthenticationError as err:
            message = "Stopping execution while processing '{}' due to Authentication Errors.".format(stream_id)
            LOGGER.error(message)
            LOGGER.debug("Error: '{}'.".format(err))
            sys.exit(1)
        except TapGaUnknownError as err:
            message = "Stopping execution while processing '{}' due to Unknown Errors.".format(stream_id)
            LOGGER.error(message)
            LOGGER.debug("Error: '{}'.".format(err))
            sys.exit(1)

    # Skipped streams still make the whole run fail with status 1
    if had_errors:
        sys.exit(1)
def sync(config, state, catalog):
    """Sync all selected streams of the catalog and write Singer messages.

    For every selected stream the schema is written first, then the pages
    yielded by the client are written as records. Whenever a page comes
    with a date, that date is stored as the bookmark of the stream and a
    STATE message is emitted.

    Error handling per stream:
    + invalid report definition, rate limit and quota errors: the stream
      is skipped and, after all streams were processed, the process exits
      with status 1.
    + authentication and unknown errors: the process exits with status 1
      immediately.

    Args:
        config: Tap configuration, handed to GAClient.
        state: Mapping of stream id to the bookmark to resume from; may be
            empty or None. It is not modified.
        catalog: Catalog dict whose 'streams' entries are synced.
    """
    errors_encountered = False
    selected_stream_ids = get_selected_streams(catalog)
    client = GAClient(config)

    # A STATE message replaces the previously emitted state as a whole, so
    # every message has to carry the bookmarks of ALL streams. Work on a
    # copy in order to leave the caller's state untouched.
    bookmarks = dict(state) if state else {}

    # Remember the start date the client was created with, so that a
    # stream without a bookmark does not inherit the bookmark of the
    # stream that was synced before it.
    not_set = object()
    default_start_date = getattr(client, 'start_date', not_set)

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = update_key_properties(stream_schema, stream_metadata)

        if stream_id not in selected_stream_ids:
            LOGGER.info('Skipping unselected stream: ' + stream_id)
            continue

        # Resume from the stream's own bookmark, otherwise fall back to
        # the start date the client was originally configured with.
        if stream_id in bookmarks:
            client.start_date = bookmarks[stream_id]
        elif default_start_date is not not_set:
            client.start_date = default_start_date

        LOGGER.info('Syncing stream: ' + stream_id)

        try:
            singer.write_schema(stream_id, stream_schema, key_properties)
            report_definition = ReportsHelper.get_report_definition(stream)

            for page, date in client.process_stream(report_definition):
                singer.write_records(stream_id, page)

                if date is not None:
                    # we need to update all dates that are not "golden",
                    # even if it's the start date
                    bookmarks[stream_id] = date
                    singer.write_state(bookmarks)
        except TapGaInvalidArgumentError as e:
            errors_encountered = True
            LOGGER.error(
                "Skipping stream: '{}' due to invalid report definition.".
                format(stream_id))
            LOGGER.debug("Error: '{}'.".format(e))
        except TapGaRateLimitError as e:
            errors_encountered = True
            LOGGER.error(
                "Skipping stream: '{}' due to Rate Limit Errors.".format(
                    stream_id))
            LOGGER.debug("Error: '{}'.".format(e))
        except TapGaQuotaExceededError as e:
            errors_encountered = True
            LOGGER.error(
                "Skipping stream: '{}' due to Quota Exceeded Errors.".
                format(stream_id))
            LOGGER.debug("Error: '{}'.".format(e))
        except TapGaAuthenticationError as e:
            LOGGER.error(
                "Stopping execution while processing '{}' due to Authentication Errors."
                .format(stream_id))
            LOGGER.debug("Error: '{}'.".format(e))
            sys.exit(1)
        except TapGaUnknownError as e:
            LOGGER.error(
                "Stopping execution while processing '{}' due to Unknown Errors."
                .format(stream_id))
            LOGGER.debug("Error: '{}'.".format(e))
            sys.exit(1)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)