Example #1
    def read(self):
        if self.report_type == "ANALYTICS":
            entity_ids = self.get_active_entity_ids()

            # Ceiling division so an exact multiple of the chunk size
            # does not report an extra job
            total_jobs = -(-len(entity_ids) // MAX_ENTITY_IDS_PER_JOB)
            logger.info(f"Processing a total of {total_jobs} jobs")

            data = []
            for chunk_entity_ids in split_list(
                    entity_ids, MAX_ENTITY_IDS_PER_JOB * MAX_CONCURRENT_JOBS):
                job_ids = self.get_job_ids(chunk_entity_ids)
                data += self.get_analytics_report(job_ids)

        elif self.report_type == "REACH":
            data = self.get_reach_report()

        elif self.report_type == "ENTITY":
            if self.entity == "CARD":
                data = self.get_cards_report()
            else:
                data = self.get_campaign_management_report()

        def result_generator():
            for record in data:
                yield self.add_request_or_period_dates(record)

        yield JSONStream("results_" + self.account.id, result_generator())
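The examples on this page all rely on a split_list chunking helper and, in the snippet above, on MAX_ENTITY_IDS_PER_JOB / MAX_CONCURRENT_JOBS constants that are never shown. A minimal sketch of what such a helper could look like, assuming the Ads API's 20-ids-per-stats-job limit; the constant values and the generator shape are assumptions, not part of the original source:

# Assumed constants: at most 20 entity ids per async stats job, and a rough
# cap on how many jobs are queued per batch (the outer chunk in the example
# above is MAX_ENTITY_IDS_PER_JOB * MAX_CONCURRENT_JOBS ids wide).
MAX_ENTITY_IDS_PER_JOB = 20
MAX_CONCURRENT_JOBS = 100

def split_list(items, chunk_size):
    """Yield successive chunk_size-sized slices of items."""
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]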
Example #2
def post_queued_async_jobs(
    client,
    account_id,
    report_name,
    report_entity,
    entity_ids,
    report_granularity,
    report_segment,
    metric_groups,
    placement,
    start_time,
    end_time,
    country_id,
    platform_id,
):
    queued_job_ids = []
    # Split entity_ids into chunks of 20 (the per-job limit) and POST one
    # async queued_job per chunk; enumerate keeps the chunk counter in sync
    for chunk, chunk_ids in enumerate(split_list(entity_ids, 20)):
        # POST async_queued_job for report entity chunk_ids
        # Reference: https://developer.twitter.com/en/docs/ads/analytics/api-reference/asynchronous#post-stats-jobs-accounts-account-id
        LOGGER.info("Report: {} - POST ASYNC queued_job, chunk#: {}".format(
            report_name, chunk))
        queued_job_path = "stats/jobs/accounts/{account_id}".replace(
            "{account_id}", account_id)
        queued_job_params = {
            # Required params
            "entity": report_entity,
            "entity_ids": ",".join(map(str, chunk_ids)),
            "metric_groups": ",".join(map(str, metric_groups)),
            "placement": placement,
            "granularity": report_granularity,
            "start_time": start_time,
            "end_time": end_time,
            # Optional params
            "segmentation_type": report_segment,
            "country": country_id,
            "platform": platform_id,
        }
        LOGGER.info("Report: {} - queued_job POST URL: {}/{}/{}".format(
            report_name, ADS_API_URL, API_VERSION, queued_job_path))
        LOGGER.info("Report: {} - queued_job params: {}".format(
            report_name, queued_job_params))

        # POST queued_job: asynchronous job
        queued_job = post_resource("queued_job", client, queued_job_path,
                                   queued_job_params)

        queued_job_data = queued_job.get("data")
        queued_job_id = queued_job_data.get("id_str")
        queued_job_ids.append(queued_job_id)
        LOGGER.info(
            "queued_job_ids = {}".format(queued_job_ids))  # COMMENT OUT
        # End: for chunk_ids in entity_ids
    return queued_job_ids
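The post_resource wrapper used above is not shown. Assuming client is a twitter_ads.client.Client, a minimal sketch of such a wrapper could POST through the SDK's Request object and return the parsed JSON body; the helper name, the API_VERSION value, and the lack of error handling are assumptions, not the tap's actual implementation:

from twitter_ads.http import Request

API_VERSION = "11"  # assumed; use whichever Ads API version the account targets

def post_resource(resource_name, client, path, params):
    """Hypothetical sketch: POST to the Ads API and return the parsed JSON dict."""
    resource = "/{}/{}".format(API_VERSION, path)
    response = Request(client, "post", resource, params=params).perform()
    # response.body is the decoded JSON payload, e.g. {"data": {...}, "request": {...}}
    return response.body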
Example #3
    def get_job_ids(self, entity_ids):
        """
        Step 2 of 'ANALYTICS' report generation process:
        Create asynchronous analytics jobs and return their ids for progress tracking
        Documentation: https://developer.twitter.com/en/docs/ads/analytics/api-reference/asynchronous
        """

        return [
            ENTITY_OBJECTS[self.entity].queue_async_stats_job(
                self.account,
                chunk_entity_ids,
                self.metric_groups,
                granularity=self.granularity,
                placement=self.placement,
                start_time=self.start_date,
                end_time=self.end_date,
                segmentation_type=self.segmentation_type,
                platform=self.platform,
                country=self.country,
            ).id for chunk_entity_ids in split_list(entity_ids,
                                                    MAX_ENTITY_IDS_PER_JOB)
        ]
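ENTITY_OBJECTS above is a mapping from the connector's entity names to the SDK classes that expose queue_async_stats_job. A plausible sketch of that mapping, assuming the standard twitter_ads entity classes; the exact keys and the class set are assumptions:

from twitter_ads.campaign import Campaign, FundingInstrument, LineItem
from twitter_ads.creative import PromotedTweet

# Hypothetical mapping; the real connector may cover more or fewer entities.
ENTITY_OBJECTS = {
    "CAMPAIGN": Campaign,
    "FUNDING_INSTRUMENT": FundingInstrument,
    "LINE_ITEM": LineItem,
    "PROMOTED_TWEET": PromotedTweet,
}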
Example #4
    def get_reach_report(self):
        """
        Get 'REACH' report through the 'Reach and Average Frequency' endpoint of Twitter Ads API.
        Documentation: https://developer.twitter.com/en/docs/ads/analytics/api-reference/reach
        """

        resource = f"/{API_VERSION}/stats/accounts/{self.account.id}/reach/{self.entity.lower()}s"
        entity_ids = self.get_active_entity_ids()

        for chunk_entity_ids in split_list(entity_ids, MAX_ENTITY_IDS_PER_JOB):
            try:
                params = {
                    "account_id": self.account.id,
                    f"{self.entity.lower()}_ids": ",".join(entity_ids),
                    "start_time": self.start_date.strftime(API_DATEFORMAT),
                    "end_time": self.end_date.strftime(API_DATEFORMAT),
                }
                request = Request(self.client, "get", resource, params=params)
                yield from Cursor(None, request)
            except Exception:
                ex_type, ex, tb = sys.exc_info()
                logger.warning(
                    f"Failed to ingest reach report chunk with error: {ex}. "
                    f"Traceback: {''.join(traceback.format_tb(tb))}"
                )
Example #5
# see: https://dev.twitter.com/ads/analytics/metrics-and-segmentation
metric_groups = [METRIC_GROUP.BILLING]

# fetching stats on the instance
line_items[0].stats(metric_groups)

# fetching stats for multiple line items
ids = list(map(lambda x: x.id, line_items))
if not ids:
    print('Error: at least one line item id must be provided for entity_ids')
    sys.exit()

sync_data = []
# Sync/Async endpoint can handle max 20 entity IDs per request
# so split the ids list into multiple requests
for chunk_ids in split_list(ids, 20):
    sync_data.append(LineItem.all_stats(account, chunk_ids, metric_groups))

print(sync_data)

# create async stats jobs and get job ids
queued_job_ids = []
for chunk_ids in split_list(ids, 20):
    queued_job_ids.append(
        LineItem.queue_async_stats_job(account, chunk_ids, metric_groups).id)

print(queued_job_ids)

# let the job complete
seconds = 30
time.sleep(seconds)
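After the sleep, the queued jobs would typically be resolved into data. As far as I recall, the twitter-ads SDK exposes async_stats_job_result and async_stats_job_data helpers for this step; treat the exact call shape below as an assumption rather than a verbatim continuation of the snippet above:

# check the queued jobs and download each completed result
async_stats_job_results = LineItem.async_stats_job_result(
    account, job_ids=queued_job_ids)

async_data = []
for result in async_stats_job_results:
    # each completed job exposes a URL pointing at the gzipped JSON result
    async_data.append(LineItem.async_stats_job_data(account, url=result.url))

print(async_data)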
Example #6
def sync_endpoint(
    client,
    state,
    start_date,
    stream_name,
    endpoint_config,
    tap_config,
    account_id=None,
    parent_ids=None,
    child_streams=None,
):
    # endpoint_config variables
    path = endpoint_config.get("path")
    LOGGER.info("Stream: {} - endpoint_config: {}".format(
        stream_name, endpoint_config))
    id_fields = endpoint_config.get("key_properties", [])
    parent_id_field = next(iter(id_fields), None)  # first ID field
    params = endpoint_config.get("params", {})
    bookmark_field = next(iter(endpoint_config.get("replication_keys", [])),
                          None)
    datetime_format = endpoint_config.get("datetime_format")
    sub_types = endpoint_config.get("sub_types", ["none"])
    children = endpoint_config.get("children")

    if parent_ids is None:
        parent_ids = []
    if child_streams is None:
        child_streams = []

    # tap config variables
    # Twitter Ads does not accept True/False as boolean, must be true/false
    with_deleted = tap_config.get("with_deleted", "true")
    country_codes = tap_config.get("country_codes", "").replace(" ", "")
    country_code_list = country_codes.split(",")
    LOGGER.info(
        "country_code_list = {}".format(country_code_list))  # COMMENT OUT
    if sub_types == ["{country_code_list}"]:
        sub_types = country_code_list
    LOGGER.info("sub_types = {}".format(sub_types))  # COMMENT OUT

    # Bookmark datetimes
    last_datetime = get_bookmark(state, stream_name, start_date)
    last_dttm = strptime_to_utc(last_datetime)

    # NOTE: Risk of syncing indefinitely and never getting bookmark
    max_bookmark_value = None

    total_records = 0
    # Loop through sub_types (for tweets endpoint), all other endpoints loop once
    for sub_type in sub_types:
        LOGGER.info("sub_type = {}".format(sub_type))  # COMMENT OUT

        # Reset params and path for each sub_type
        new_params = {}
        params = endpoint_config.get("params", {})
        path = endpoint_config.get("path")

        # Replace keys/ids in path and params
        add_account_id = False  # Initial default
        if "{account_id}" in path:
            add_account_id = True
            path = path.replace("{account_id}", account_id)

        if parent_ids:
            parent_id_list = ",".join(map(str, parent_ids))
            path = path.replace("{parent_ids}", parent_id_list)
        for key, val in list(params.items()):
            new_val = val
            if isinstance(val, str):
                # Chain the replacements so a value containing several
                # placeholders gets all of them substituted
                if key == "with_deleted":
                    new_val = new_val.replace("{with_deleted}", with_deleted)
                if "{account_ids}" in val:
                    new_val = new_val.replace("{account_ids}", account_id)
                if "{parent_ids}" in val:
                    new_val = new_val.replace("{parent_ids}", parent_id_list)
                if "{start_date}" in val:
                    new_val = new_val.replace("{start_date}", start_date)
                if "{country_codes}" in val:
                    new_val = new_val.replace("{country_codes}", country_codes)
                if "{sub_type}" in val:
                    new_val = new_val.replace("{sub_type}", sub_type)
            new_params[key] = new_val
        LOGGER.info("Stream: {} - Request URL: {}/{}/{}".format(
            stream_name, ADS_API_URL, API_VERSION, path))
        LOGGER.info("Stream: {} - Request params: {}".format(
            stream_name, new_params))

        # API Call
        cursor = get_resource(stream_name, client, path, new_params)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()

        i = 0
        with metrics.record_counter(stream_name) as counter:
            # Loop thru cursor records, break out if no more data or bookmark_value < last_dttm
            for record in cursor:
                # Get dictionary for record
                record_dict = obj_to_dict(record)
                if not record_dict:
                    # Finish looping
                    LOGGER.info(
                        "Stream: {} - Finished Looping, no more data".format(
                            stream_name))
                    break

                # Get record's bookmark_value
                # All bookmarked requests are sorted by updated_at descending
                #   'sort_by': ['updated_at-desc']
                # The first record is the max_bookmark_value
                if bookmark_field:
                    bookmark_value_str = record_dict.get(bookmark_field)
                    if bookmark_value_str:
                        # Tweets use a different datetime format: '%a %b %d %H:%M:%S %z %Y'
                        if datetime_format:
                            bookmark_value = datetime.strptime(
                                record_dict.get(bookmark_field),
                                datetime_format)
                        # Other bookmarked endpoints use normal UTC format
                        else:
                            bookmark_value = strptime_to_utc(
                                record_dict.get(bookmark_field))
                        # If first record, set max_bookmark_value
                        if i == 0:
                            max_bookmark_dttm = bookmark_value
                            max_bookmark_value = max_bookmark_dttm.strftime(
                                "%Y-%m-%dT%H:%M:%S%z")
                            LOGGER.info(
                                "Stream: {} - max_bookmark_value: {}".format(
                                    stream_name, max_bookmark_value))
                    else:
                        # pylint: disable=line-too-long
                        LOGGER.info(
                            "Stream: {} - NO BOOKMARK, bookmark_field: {}, record: {}"
                            .format(stream_name, bookmark_field, record_dict))
                        # pylint: enable=line-too-long
                        bookmark_value = last_dttm
                    if bookmark_value < last_dttm:
                        # Finish looping
                        LOGGER.info(
                            "Stream: {} - Finished, bookmark value < last datetime"
                            .format(stream_name))
                        break
                else:
                    bookmark_value = last_dttm

                # Check for PK fields
                for key in id_fields:
                    if not record_dict.get(key):
                        LOGGER.info(
                            "Stream: {} - Missing key {} in record: {}".format(
                                stream_name, key, record))

                # Transform record from transform.py
                prepared_record = transform_record(stream_name, record_dict)

                # Add account_id to record
                if add_account_id:
                    prepared_record["account_id"] = account_id

                write_record(stream_name,
                             prepared_record,
                             time_extracted=time_extracted)
                counter.increment()

                # Append parent_id to parent_ids
                parent_id = record_dict.get(parent_id_field)
                parent_ids.append(parent_id)

                # Increment counters
                i = i + 1
                total_records = total_records + 1

                # End: for record in cursor
            # End: with metrics as counter

        # Loop through children and chunks of parent_ids
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                if child_stream_name in child_streams:
                    update_currently_syncing(state, child_stream_name)
                    # pylint: disable=line-too-long
                    LOGGER.info(
                        "Child Stream: {} - START Syncing, parent_stream: {}, account_id: {}"
                        .format(child_stream_name, stream_name, account_id))

                    total_child_records = 0
                    child_total_records = 0
                    # parent_id_limit: max list size for parent_ids
                    parent_id_limit = child_endpoint_config.get(
                        "parent_ids_limit", 1)
                    chunk = 0  # chunk number
                    # Make chunks of parent_ids
                    for chunk_ids in split_list(parent_ids, parent_id_limit):
                        # pylint: disable=line-too-long
                        LOGGER.info(
                            "Child Stream: {} - Syncing, chunk#: {}, parent_stream: {}, parent chunk_ids: {}"
                            .format(child_stream_name, chunk, stream_name,
                                    chunk_ids))
                        # pylint: enable=line-too-long

                        child_total_records = sync_endpoint(
                            client=client,
                            state=state,
                            start_date=start_date,
                            stream_name=child_stream_name,
                            endpoint_config=child_endpoint_config,
                            tap_config=tap_config,
                            account_id=account_id,
                            parent_ids=chunk_ids,
                            child_streams=child_streams,
                        )

                        # pylint: disable=line-too-long
                        LOGGER.info(
                            "Child Stream: {} - Finished chunk#: {}, parent_stream: {}"
                            .format(child_stream_name, chunk, stream_name))
                        # pylint: enable=line-too-long
                        total_child_records = total_child_records + child_total_records
                        chunk = chunk + 1
                        # End: for chunk_ids in split_list(parent_ids, parent_id_limit)

                    # pylint: disable=line-too-long
                    LOGGER.info(
                        "Child Stream: {} - FINISHED Syncing, parent_stream: {}, account_id: {}"
                        .format(child_stream_name, stream_name, account_id))
                    # pylint: enable=line-too-long
                    LOGGER.info("Child Stream: {} - total_records: {}".format(
                        child_stream_name, total_child_records))
                    update_currently_syncing(state, stream_name)
                    # End: if child_stream_name in child_streams
                # End: for child_stream_name in children.items()
            # End: if children

        # pylint: disable=line-too-long
        LOGGER.info(
            "Stream: {}, Account ID: {} - FINISHED Sub Type: {}, Total Sub Type Records: {}"
            .format(stream_name, account_id, sub_type, i))
        # pylint: enable=line-too-long
        # End: for sub_type in sub_types

    # Update the state with the max_bookmark_value for the stream
    if bookmark_field and max_bookmark_value:
        write_bookmark(state, stream_name, max_bookmark_value)

    return total_records
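The get_bookmark / write_bookmark helpers used above are not shown. A minimal sketch of what singer-style bookmark helpers could look like, assuming the state is a plain dict keyed by stream name; the helper bodies are assumptions, not the tap's actual code:

import singer

def get_bookmark(state, stream_name, default):
    """Return the last saved bookmark for the stream, or the default start date."""
    return state.get("bookmarks", {}).get(stream_name, default)

def write_bookmark(state, stream_name, value):
    """Save the new bookmark and emit the updated state record."""
    state.setdefault("bookmarks", {})[stream_name] = value
    singer.write_state(state)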