Example #1
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    spreadsheet_id = config.get('spreadsheet_id')

    # Get selected_streams from the catalog, based on the state's last_stream
    #   last_stream = the stream being synced when a previous load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # FILE_METADATA
    file_metadata = {}
    stream_name = 'file_metadata'
    file_metadata_config = STREAMS.get(stream_name)

    # GET file_metadata
    LOGGER.info('GET file_metadata')
    file_metadata, time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=file_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)
    # Transform file_metadata
    LOGGER.info('Transform file_metadata')
    file_metadata_tf = transform_file_metadata(file_metadata)
    # LOGGER.info('file_metadata_tf = {}'.format(file_metadata_tf))

    # Check whether the file has changed; if not, return (back to __init__)
    last_datetime = strptime_to_utc(
        get_bookmark(state, stream_name, start_date))
    this_datetime = strptime_to_utc(file_metadata.get('modifiedTime'))
    LOGGER.info('last_datetime = {}, this_datetime = {}'.format(
        last_datetime, this_datetime))
    if this_datetime <= last_datetime:
        LOGGER.info(
            'this_datetime <= last_datetime, FILE NOT CHANGED. EXITING.')
        # Update file_metadata bookmark
        write_bookmark(state, 'file_metadata', strftime(this_datetime))
        return
    # Sync file_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state,
                file_metadata_tf, time_extracted)
    # file_metadata bookmark is updated at the end of sync

    # SPREADSHEET_METADATA
    spreadsheet_metadata = {}
    stream_name = 'spreadsheet_metadata'
    spreadsheet_metadata_config = STREAMS.get(stream_name)

    # GET spreadsheet_metadata
    LOGGER.info('GET spreadsheet_metadata')
    spreadsheet_metadata, ss_time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=spreadsheet_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)

    # Transform spreadsheet_metadata
    LOGGER.info('Transform spreadsheet_metadata')
    spreadsheet_metadata_tf = transform_spreadsheet_metadata(
        spreadsheet_metadata)

    # Sync spreadsheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state,
                spreadsheet_metadata_tf, ss_time_extracted)

    # SHEET_METADATA and SHEET_DATA
    sheets = spreadsheet_metadata.get('sheets')
    sheet_metadata = []
    sheets_loaded = []
    sheets_loaded_config = STREAMS['sheets_loaded']
    if sheets:
        # Loop thru sheets (worksheet tabs) in spreadsheet
        for sheet in sheets:
            sheet_title = sheet.get('properties', {}).get('title')
            sheet_id = sheet.get('properties', {}).get('sheetId')

            # GET sheet_metadata and columns
            sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id,
                                                       client)
            # LOGGER.info('sheet_schema: {}'.format(sheet_schema))

            # SKIP empty sheets (where sheet_schema and columns are None)
            if not sheet_schema or not columns:
                LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
            else:
                # Transform sheet_metadata
                sheet_metadata_tf = transform_sheet_metadata(
                    spreadsheet_id, sheet, columns)
                # LOGGER.info('sheet_metadata_tf = {}'.format(sheet_metadata_tf))
                sheet_metadata.append(sheet_metadata_tf)

                # SHEET_DATA
                # Should this worksheet tab be synced?
                if sheet_title in selected_streams:
                    LOGGER.info('STARTED Syncing Sheet {}'.format(sheet_title))
                    update_currently_syncing(state, sheet_title)
                    selected_fields = get_selected_fields(catalog, sheet_title)
                    LOGGER.info('Stream: {}, selected_fields: {}'.format(
                        sheet_title, selected_fields))
                    write_schema(catalog, sheet_title)

                    # Emit a Singer ACTIVATE_VERSION message before the initial sync (but not on
                    # subsequent syncs) and every time after each sheet sync is complete.
                    # This forces hard deletes on the data downstream if fewer records are sent.
                    # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
                    last_integer = int(get_bookmark(state, sheet_title, 0))
                    activate_version = int(time.time() * 1000)
                    activate_version_message = singer.ActivateVersionMessage(
                        stream=sheet_title, version=activate_version)
                    if last_integer == 0:
                        # initial load, send activate_version before AND after data sync
                        singer.write_message(activate_version_message)
                        LOGGER.info(
                            'INITIAL SYNC, Stream: {}, Activate Version: {}'.
                            format(sheet_title, activate_version))

                    # Determine max range of columns and rows for "paging" through the data
                    sheet_last_col_index = 1
                    sheet_last_col_letter = 'A'
                    for col in columns:
                        col_index = col.get('columnIndex')
                        col_letter = col.get('columnLetter')
                        if col_index > sheet_last_col_index:
                            sheet_last_col_index = col_index
                            sheet_last_col_letter = col_letter
                    sheet_max_row = sheet.get('properties').get(
                        'gridProperties', {}).get('rowCount')

                    # Initialize paging for 1st batch
                    is_last_row = False
                    batch_rows = 200
                    from_row = 2
                    if sheet_max_row < batch_rows:
                        to_row = sheet_max_row
                    else:
                        to_row = batch_rows

                    # Loop thru batches (each having 200 rows of data)
                    while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
                        range_rows = 'A{}:{}{}'.format(from_row,
                                                       sheet_last_col_letter,
                                                       to_row)

                        # GET sheets_loaded for a worksheet tab
                        sheet_data, time_extracted = get_data(
                            stream_name='sheets_loaded',
                            endpoint_config=sheets_loaded_config,
                            client=client,
                            sheet_title=sheet_title,
                            spreadsheet_id=spreadsheet_id,
                            range_rows=range_rows)
                        # Data is returned as a list of arrays, an array of values for each row
                        sheet_data_rows = sheet_data.get('values', [])

                        # Transform batch of rows to JSON with keys for each column
                        sheet_data_tf, row_num = transform_sheet_data(
                            spreadsheet_id=spreadsheet_id,
                            sheet_id=sheet_id,
                            sheet_title=sheet_title,
                            from_row=from_row,
                            columns=columns,
                            sheet_data_rows=sheet_data_rows)
                        if row_num < to_row:
                            is_last_row = True

                        # Process records, send batch of records to target
                        record_count = process_records(
                            catalog=catalog,
                            stream_name=sheet_title,
                            records=sheet_data_tf,
                            time_extracted=ss_time_extracted,
                            version=activate_version)
                        LOGGER.info('Sheet: {}, records processed: {}'.format(
                            sheet_title, record_count))

                        # Update paging from/to_row for next batch
                        from_row = to_row + 1
                        if to_row + batch_rows > sheet_max_row:
                            to_row = sheet_max_row
                        else:
                            to_row = to_row + batch_rows

                    # End of Stream: Send Activate Version and update State
                    singer.write_message(activate_version_message)
                    write_bookmark(state, sheet_title, activate_version)
                    LOGGER.info(
                        'COMPLETE SYNC, Stream: {}, Activate Version: {}'.
                        format(sheet_title, activate_version))
                    LOGGER.info(
                        'FINISHED Syncing Sheet {}, Total Rows: {}'.format(
                            sheet_title,
                            row_num - 2))  # data rows start at row 2 (row 1 is the header)
                    update_currently_syncing(state, None)

                    # SHEETS_LOADED
                    # Add sheet to sheets_loaded
                    sheet_loaded = {}
                    sheet_loaded['spreadsheetId'] = spreadsheet_id
                    sheet_loaded['sheetId'] = sheet_id
                    sheet_loaded['title'] = sheet_title
                    sheet_loaded['loadDate'] = strftime(utils.now())
                    sheet_loaded['lastRowNumber'] = row_num
                    sheets_loaded.append(sheet_loaded)

    stream_name = 'sheet_metadata'
    # Sync sheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheet_metadata)

    stream_name = 'sheets_loaded'
    # Sync sheets_loaded if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded)

    # Update file_metadata bookmark
    write_bookmark(state, 'file_metadata', strftime(this_datetime))

    return
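
A minimal standalone sketch of the row-batching arithmetic used above (ignoring the early exit when a batch comes back short), assuming a fixed batch size of 200 and a precomputed last-column letter; the helper name and parameters are illustrative, not part of the tap.

def iter_row_ranges(sheet_max_row, last_col_letter, batch_rows=200):
    """Yield A1-notation ranges covering data rows 2..sheet_max_row in batches."""
    from_row = 2
    to_row = min(batch_rows, sheet_max_row)
    while from_row < sheet_max_row and to_row <= sheet_max_row:
        yield 'A{}:{}{}'.format(from_row, last_col_letter, to_row)
        from_row = to_row + 1
        to_row = min(to_row + batch_rows, sheet_max_row)

# A 450-row sheet whose last column is D produces three batches:
# list(iter_row_ranges(450, 'D')) == ['A2:D200', 'A201:D400', 'A401:D450']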
Example #2
 def test_small_years(self):
     self.assertEqual(u.strftime(dt(90, 1, 1, tzinfo=pytz.UTC)),
                      '0090-01-01T00:00:00.000000Z')
Example #3
def append_times_to_dates(item, date_fields):
    if date_fields:
        for date_field in date_fields:
            if item.get(date_field):
                item[date_field] = utils.strftime(
                    utils.strptime_with_tz(item[date_field]))
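
Illustrative usage of the helper above, assuming `utils` is singer-python's singer.utils (whose strptime_with_tz/strftime pair parses a date string and re-emits it as a full UTC ISO-8601 timestamp):

item = {'id': 1, 'closed_date': '2019-06-11', 'updated_at': None}
append_times_to_dates(item, ['closed_date', 'updated_at'])
# item['closed_date'] is now a full UTC timestamp string, e.g.
# '2019-06-11T00:00:00.000000Z'; empty or missing date fields are left untouched.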
Example #4
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    for rec in sf.query(catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(
                stream=(
                    stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        replication_key_value = replication_key and singer_utils.strptime_with_tz(rec[replication_key])

        if sf.pk_chunking:
            if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    'JobHighestBookmarkSeen',
                    singer_utils.strftime(chunked_bookmark))
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(
                state,
                catalog_entry['tap_stream_id'],
                replication_key,
                rec[replication_key])
            singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(
            state,
            catalog_entry['tap_stream_id'],
            replication_key,
            singer_utils.strftime(chunked_bookmark))
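
A small self-contained sketch of the bookmark rule applied above when pk_chunking is enabled: only advance the highest-seen bookmark for replication-key values inside the window (previous bookmark, sync start time]. The names and plain datetimes here are illustrative, not Singer or Salesforce APIs.

from datetime import datetime, timezone

def advance_bookmark(current_bookmark, record_value, sync_start_time):
    """Return the new highest-seen bookmark, ignoring out-of-range values."""
    if current_bookmark < record_value <= sync_start_time:
        return record_value
    return current_bookmark

old = datetime(2020, 1, 1, tzinfo=timezone.utc)
start = datetime(2020, 2, 1, tzinfo=timezone.utc)
mid = datetime(2020, 1, 15, tzinfo=timezone.utc)
late = datetime(2020, 3, 1, tzinfo=timezone.utc)
assert advance_bookmark(old, mid, start) == mid    # inside the window: advances
assert advance_bookmark(old, late, start) == old   # after sync start: ignored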
Example #5
 def test_round_trip(self):
     now = dt.utcnow().replace(tzinfo=pytz.UTC)
     dtime = u.strftime(now)
     pdtime = u.strptime_to_utc(dtime)
     fdtime = u.strftime(pdtime)
     self.assertEqual(dtime, fdtime)
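
For reference, a plain-datetime equivalent of the round trip above, assuming singer-python's canonical UTC format '%Y-%m-%dT%H:%M:%S.%fZ' (the format suggested by the expected value in Example #2):

from datetime import datetime, timezone

FMT = '%Y-%m-%dT%H:%M:%S.%fZ'
now = datetime.now(timezone.utc)
text = now.strftime(FMT)                                            # format
parsed = datetime.strptime(text, FMT).replace(tzinfo=timezone.utc)  # parse back to UTC
assert parsed.strftime(FMT) == text                                 # round trip is lossless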
Example #6
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        date_window_size = float(
            Context.config.get("date_window_size", DATE_WINDOW_SIZE))

        # Page through till the end of the resultset
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. Why has been lost to the mists of time but we
            # think it has something to do with how the API treats
            # microseconds on its date windows. Maybe it's possible to
            # drop data due to rounding errors or something like that?
            updated_at_max = updated_at_min + datetime.timedelta(
                days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time
            while True:
                status_key = self.status_key or "status"
                query_params = self.get_query_params(since_id, status_key,
                                                     updated_at_min,
                                                     updated_at_max)

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError(
                            "obj.id < since_id: {} < {}".format(
                                obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page has
                # less than the request size limits you set.
                if len(objects) < self.results_per_page:
                    # Save the updated_at_max as our bookmark as we've synced all rows up in our
                    # window and can move forward. Also remove the since_id because we want to
                    # restart at 1.
                    Context.state.get('bookmarks',
                                      {}).get(self.name,
                                              {}).pop('since_id', None)
                    self.update_bookmark(utils.strftime(updated_at_max))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError(
                        "{} is not the max id in objects ({})".format(
                            objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max
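
A minimal sketch of the date-window pagination above, stripped of the API calls and since_id bookkeeping; the default window size here is illustrative, while the microsecond truncation of the stop time mirrors the code in this example.

import datetime

def iter_date_windows(updated_at_min, stop_time, window_days=1.0):
    """Yield (updated_at_min, updated_at_max) windows up to stop_time."""
    stop_time = stop_time.replace(microsecond=0)
    while updated_at_min < stop_time:
        updated_at_max = updated_at_min + datetime.timedelta(days=window_days)
        if updated_at_max > stop_time:
            updated_at_max = stop_time
        yield updated_at_min, updated_at_max
        updated_at_min = updated_at_max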
Example #7
def sync_endpoint(client,
                  catalog,
                  state,
                  start_date,
                  stream_name,
                  path,
                  endpoint_config,
                  static_params,
                  bookmark_query_field=None,
                  bookmark_field=None,
                  bookmark_type=None,
                  data_key=None,
                  id_fields=None,
                  selected_streams=None,
                  parent=None,
                  parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None

    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime

    end_dttm = utils.now()
    end_dt = end_dttm.date()
    start_dttm = end_dttm
    start_dt = end_dt

    if bookmark_query_field:
        if bookmark_type == 'datetime':
            start_dttm = strptime_to_utc(last_datetime)
            start_dt = start_dttm.date()
            start_dt_str = strftime(start_dttm)[0:10]
    # date_list provides one date for each date in range
    # Most endpoints, without a bookmark query field, will have a single date (today)
    # Clicks endpoint will have a date for each day from bookmark to today
    date_list = [
        str(start_dt + timedelta(days=x))
        for x in range((end_dt - start_dt).days + 1)
    ]
    endpoint_total = 0
    total_records = 0
    limit = 1000  # PageSize (default for API is 100)
    for bookmark_date in date_list:
        page = 1
        offset = 0
        total_records = 0
        if stream_name == 'clicks':
            LOGGER.info('Stream: {}, Syncing bookmark_date = {}'.format(
                stream_name, bookmark_date))
        next_url = '{}/{}.json'.format(client.base_url, path)
        while next_url:
            # Squash params to query-string params
            params = {
                "PageSize": limit,
                **static_params  # adds in endpoint specific, sort, filter params
            }

            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = bookmark_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer

            if page == 1 and params != {}:
                param_string = '&'.join([
                    '%s=%s' % (key, value) for (key, value) in params.items()
                ])
                querystring = param_string.replace(
                    '<parent_id>', str(parent_id)).replace(
                        '<last_datetime>',
                        strptime_to_utc(last_datetime).strftime(
                            '%Y-%m-%dT%H:%M:%SZ'))
            else:
                querystring = None
            LOGGER.info('URL for Stream {}: {}{}'.format(
                stream_name, next_url,
                '?{}'.format(querystring) if querystring else ''))

            # API request data
            data = {}
            data = client.get(url=next_url,
                              path=path,
                              params=querystring,
                              endpoint=stream_name)

            # time_extracted: datetime when the data was extracted from the API
            time_extracted = utils.now()
            if not data:
                total_records = 0
                break  # No data results

            # Get pagination details
            api_total = int(data.get('@total', '0'))
            page_size = int(data.get('@pagesize', '0'))
            if page_size:
                if page_size > limit:
                    limit = page_size
            next_page_uri = data.get('@nextpageuri', None)
            if next_page_uri:
                next_url = '{}{}'.format(BASE_URL, next_page_uri)
            else:
                next_url = None

            # Break out of the loop if the response has only pagination details (no records)
            #   or no data_key in data
            #   company_information and report_metadata do not have pagination details
            if stream_name not in ('company_information', 'report_metadata'):
                # catalog_items has a bug where api_total is always 0
                if (stream_name != 'catalog_items') and (
                        api_total == 0) and (not next_url):
                    break
                if data_key not in data:
                    break

            # Transform data with transform_json from transform.py
            # The data_key identifies the array/list of records below the <root> element
            # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
            transformed_data = []  # initialize the record list
            data_list = []
            data_dict = {}
            if isinstance(data, list) and data_key not in data:
                data_list = data
                data_dict[data_key] = data_list
                transformed_data = transform_json(data_dict, stream_name,
                                                  data_key)
            elif isinstance(data, dict) and data_key not in data:
                data_list.append(data)
                data_dict[data_key] = data_list
                transformed_data = transform_json(data_dict, stream_name,
                                                  data_key)
            else:
                transformed_data = transform_json(data, stream_name, data_key)

            # LOGGER.info('transformed_data = {}'.format(transformed_data)) # TESTING, comment out
            if not transformed_data:
                LOGGER.info('No transformed data for data = {}'.format(data))
                total_records = 0
                break  # No data results

            # Verify key id_fields are present
            for record in transformed_data:
                for key in id_fields:
                    if not record.get(key):
                        LOGGER.info(
                            'Stream: {}, Missing key {} in record: {}'.format(
                                stream_name, key, record))
                        raise RuntimeError

            # Process records and get the max_bookmark_value and record_count for the set of records
            max_bookmark_value, record_count = process_records(
                catalog=catalog,
                stream_name=stream_name,
                records=transformed_data,
                time_extracted=time_extracted,
                bookmark_field=bookmark_field,
                bookmark_type=bookmark_type,
                max_bookmark_value=max_bookmark_value,
                last_datetime=last_datetime,
                last_integer=last_integer,
                parent=parent,
                parent_id=parent_id)
            LOGGER.info('Stream {}, batch processed {} records'.format(
                stream_name, record_count))

            # Loop through the parent batch records for each child object (if the child stream is selected)
            children = endpoint_config.get('children')
            if children:
                for child_stream_name, child_endpoint_config in children.items():
                    if child_stream_name in selected_streams:
                        write_schema(catalog, child_stream_name)
                        # For each parent record
                        for record in transformed_data:
                            # Set parent_id: prefer the 'id' key property,
                            # otherwise use the first key property
                            parent_id_field = id_fields[0]
                            if 'id' in id_fields:
                                parent_id_field = 'id'
                            parent_id = record.get(parent_id_field)

                            # sync_endpoint for child
                            LOGGER.info(
                                'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'
                                .format(child_stream_name, stream_name, parent_id))
                            child_path = child_endpoint_config.get(
                                'path',
                                child_stream_name).format(str(parent_id))
                            child_bookmark_field = next(
                                iter(
                                    child_endpoint_config.get(
                                        'replication_keys', [])), None)
                            child_total_records = sync_endpoint(
                                client=client,
                                catalog=catalog,
                                state=state,
                                start_date=start_date,
                                stream_name=child_stream_name,
                                path=child_path,
                                endpoint_config=child_endpoint_config,
                                static_params=child_endpoint_config.get(
                                    'params', {}),
                                bookmark_query_field=child_endpoint_config.get(
                                    'bookmark_query_field'),
                                bookmark_field=child_bookmark_field,
                                bookmark_type=child_endpoint_config.get(
                                    'bookmark_type'),
                                data_key=child_endpoint_config.get(
                                    'data_key', 'results'),
                                id_fields=child_endpoint_config.get(
                                    'key_properties'),
                                selected_streams=selected_streams,
                                parent=child_endpoint_config.get('parent'),
                                parent_id=parent_id)
                            LOGGER.info(
                                'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'
                                .format(child_stream_name, parent_id, child_total_records))

            # Update the state with the max_bookmark_value for the stream
            if bookmark_field:
                write_bookmark(state, stream_name, max_bookmark_value)

            # Adjust total_records w/ record_count, if needed
            if record_count > total_records:
                total_records = total_records + record_count
            else:
                total_records = api_total

            # to_rec: to record; ending record for the batch page
            to_rec = offset + limit
            if to_rec > total_records:
                to_rec = total_records

            LOGGER.info(
                'Synced Stream: {}, page: {}, {} to {} of total records: {}'.
                format(stream_name, page, offset, to_rec, total_records))
            # Pagination: increment the offset by the limit (batch-size) and page
            offset = offset + limit
            page = page + 1
        endpoint_total = endpoint_total + total_records
    # Return total_records (for all pages)
    return endpoint_total
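
A compact sketch of the page/offset bookkeeping above (to_rec is only used for the "X to Y of total" log line); the helper itself is illustrative, not part of the tap.

def page_bounds(page, limit, total_records):
    """Return (offset, to_rec) for a 1-based page number of size limit."""
    offset = (page - 1) * limit
    to_rec = min(offset + limit, total_records)
    return offset, to_rec

# page_bounds(1, 1000, 2500) == (0, 1000)
# page_bounds(3, 1000, 2500) == (2000, 2500)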
Example #8
def unix_seconds_to_datetime(value):
    return strftime(
        datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc))
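
Quick illustrative checks for the helper above, assuming `strftime` is singer-python's UTC formatter (ISO-8601 with microseconds and a trailing Z, per Example #2):

# unix_seconds_to_datetime(0)            -> '1970-01-01T00:00:00.000000Z'
# unix_seconds_to_datetime('1559347200') -> '2019-06-01T00:00:00.000000Z'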
Example #9
    def _query_recur(
            self,
            query,
            catalog_entry,
            start_date_str,
            end_date=None,
            retries=MAX_RETRIES):
        params = {"q": query}
        url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        if end_date is None:
            end_date = singer_utils.now()

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".format(
                    catalog_entry['stream']))

        retryable = False
        try:
            while True:
                resp = self.sf._make_request('GET', url, headers=headers, params=params)
                resp_json = resp.json()

                for rec in resp_json.get('records'):
                    yield rec

                next_records_url = resp_json.get('nextRecordsUrl')

                if next_records_url is None:
                    break
                else:
                    url = "{}{}".format(self.sf.instance_url, next_records_url)

        except HTTPError as ex:
            response = ex.response.json()
            if isinstance(response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range,
                    catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping.")

            query = self.sf._build_query_string(catalog_entry, singer_utils.strftime(start_date),
                                                singer_utils.strftime(end_date))
            for record in self._query_recur(
                    query,
                    catalog_entry,
                    start_date_str,
                    end_date,
                    retries - 1):
                yield record
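
The retry path above halves the query window from the end; a brief self-contained sketch of that arithmetic on plain datetimes (illustrative names, no Salesforce calls):

from datetime import datetime, timezone

def halve_window_end(start_date, end_date):
    """Pull end_date halfway back toward start_date; refuse a 0-day range."""
    half_day_range = (end_date - start_date) // 2
    if half_day_range.days == 0:
        raise ValueError("0 day range would retry forever")
    return end_date - half_day_range

start = datetime(2020, 1, 1, tzinfo=timezone.utc)
end = datetime(2020, 1, 9, tzinfo=timezone.utc)
assert halve_window_end(start, end) == datetime(2020, 1, 5, tzinfo=timezone.utc)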
Example #10
def unix_milliseconds_to_datetime(value):
    return strftime(
        datetime.datetime.fromtimestamp(
            float(value) / 1000.0, datetime.timezone.utc))
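
As with Example #8 but for millisecond timestamps; the value is divided by 1000.0 before building the UTC datetime (again assuming singer-python's strftime format):

# unix_milliseconds_to_datetime('1559347200000') -> '2019-06-01T00:00:00.000000Z'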
Example #11
def string_to_datetime(value):
    try:
        return strftime(strptime_to_utc(value))
    except Exception as ex:
        LOGGER.warning("%s, (%s)", ex, value)
        return None
Example #12
def string_to_datetime(value):
    try:
        return strftime(pendulum.parse(value))
    except Exception:
        return None
Example #13
def sync_generic_basic_endpoint(sdk_client, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream,
                sdk_client.client_customer_id)

    start_index = 0
    selector = {
        'fields': field_list,
        'paging': {
            'startIndex': str(start_index),
            'numberResults': str(PAGE_SIZE)
        }
    }

    while True:
        page = get_page(sdk_client, selector, stream, start_index)
        if page['totalNumEntries'] > GOOGLE_MAX_START_INDEX:
            raise Exception(
                "Too many {} ({} > {}) for customer {}".format(
                    stream, page['totalNumEntries'], GOOGLE_MAX_START_INDEX,
                    sdk_client.client_customer_id))

        if 'entries' in page:
            with metrics.record_counter(stream) as counter:
                time_extracted = utils.now()

                with Transformer(
                        singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                ) as bumble_bee:
                    for obj in page['entries']:
                        obj['_sdc_customer_id'] = sdk_client.client_customer_id

                        bumble_bee.pre_hook = transform_pre_hook
                        # At this point the `record` is wrong because of
                        # the comment below.
                        record = bumble_bee.transform(obj, discovered_schema)
                        # retransform `startDate` and `endDate` if this is
                        # campaigns as the transformer doesn't currently
                        # have support for dates
                        #
                        # This will cause a column split in the warehouse
                        # if we ever change to a `date` type so be wary.
                        if stream == 'campaigns':
                            # [API Docs][1] state that these fields are
                            # formatted as dates using `YYYYMMDD` and that
                            # when they're added they default to the
                            # timezone of the parent account. The
                            # description is not super clear so we're
                            # making some assumptions at this point: 1.
                            # The timezone of the parent account is the
                            # timezone that the date string is in and 2.
                            # That there's no way to create a campaign
                            # in a different time zone (if there is, we
                            # don't appear to have a way to retrieve it).
                            #
                            # We grab the parent account's timezone from
                            # the ManagedCustomerService and cast these to
                            # that timezone and then format them as if
                            # they were datetimes which is not quite
                            # accurate but is the best we can currently
                            # do.
                            #
                            # [1]: https://developers.google.com/adwords/api/docs/reference/v201809/CampaignService.Campaign#startdate
                            parent_account_tz_str = get_and_cache_parent_account_tz_str(
                                sdk_client)
                            if record.get('startDate'):
                                naive_date = datetime.datetime.strptime(
                                    obj['startDate'], '%Y%m%d')
                                utc_date = pytz.timezone(
                                    parent_account_tz_str).localize(
                                        naive_date).astimezone(tz=pytz.UTC)
                                record['startDate'] = utils.strftime(utc_date)
                            if record.get('endDate'):
                                naive_date = datetime.datetime.strptime(
                                    obj['endDate'], '%Y%m%d')
                                utc_date = pytz.timezone(
                                    parent_account_tz_str).localize(
                                        naive_date).astimezone(tz=pytz.UTC)
                                record['endDate'] = utils.strftime(utc_date)

                        singer.write_record(stream,
                                            record,
                                            time_extracted=time_extracted)
                        counter.increment()

        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break
    LOGGER.info("Done syncing %s for customer_id %s", stream,
                sdk_client.client_customer_id)
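
The campaigns date handling above, isolated: parse the YYYYMMDD string as a naive date, localize it to the (assumed) parent-account timezone, then convert to UTC. A self-contained sketch using pytz; the timezone value is only an example.

import datetime
import pytz

def adwords_date_to_utc(date_str, account_tz_str):
    """Interpret a YYYYMMDD string as midnight in the account's timezone, returned in UTC."""
    naive_date = datetime.datetime.strptime(date_str, '%Y%m%d')
    return pytz.timezone(account_tz_str).localize(naive_date).astimezone(pytz.UTC)

# adwords_date_to_utc('20190601', 'America/New_York')
# -> 2019-06-01 04:00:00+00:00 (midnight Eastern is 04:00 UTC during DST)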
Example #14
    },
    "ad_groups": {
        'primary_keys': ["id"],
        'service_name': 'AdGroupService'
    },
    "ads": {
        'primary_keys': ["adGroupId"],
        'service_name': 'AdGroupAdService'
    },
    "accounts": {
        'primary_keys': ["customerId"],
        'service_name': 'ManagedCustomerService'
    }
}

REPORT_RUN_DATETIME = utils.strftime(utils.now())

VERIFIED_REPORTS = frozenset([
    'ACCOUNT_PERFORMANCE_REPORT',
    'ADGROUP_PERFORMANCE_REPORT',
    # 'AD_CUSTOMIZERS_FEED_ITEM_REPORT',
    'AD_PERFORMANCE_REPORT',
    'AGE_RANGE_PERFORMANCE_REPORT',
    'AUDIENCE_PERFORMANCE_REPORT',
    # 'AUTOMATIC_PLACEMENTS_PERFORMANCE_REPORT',
    # 'BID_GOAL_PERFORMANCE_REPORT',
    #'BUDGET_PERFORMANCE_REPORT',                       -- does NOT allow for querying by date range
    'CALL_METRICS_CALL_DETAILS_REPORT',
    #'CAMPAIGN_AD_SCHEDULE_TARGET_REPORT',
    #'CAMPAIGN_CRITERIA_REPORT',
    #'CAMPAIGN_GROUP_PERFORMANCE_REPORT',