def sync(self, state):
    """Incrementally sync satisfaction_ratings using a sliding date window.

    Reads the stream bookmark from ``state``, then repeatedly queries the
    Zendesk satisfaction_ratings endpoint over ``search_window_size``-second
    windows up to one minute before "now". The window shrinks (halves) when a
    response would exceed 50k records and grows back (doubles) after
    successful requests, so bookmarks can still be written under high volume.

    Args:
        state: Singer state dict; mutated via ``self.update_bookmark`` and
            flushed with ``singer.write_state`` after each window.

    Yields:
        (stream, satisfaction_rating) tuples for records whose ``updated_at``
        falls inside the current window.
    """
    bookmark = self.get_bookmark(state)
    original_search_window_size = int(
        self.config.get('search_window_size', DEFAULT_SEARCH_WINDOW_SIZE))
    search_window_size = original_search_window_size
    # We subtract a second here because the API seems to compare
    # start_time with a >, but we typically prefer a >= behavior.
    # Also, the start_time query parameter filters based off of
    # created_at, but zendesk support confirmed with us that
    # satisfaction_ratings are immutable so that created_at ==
    # updated_at.
    start = bookmark - datetime.timedelta(seconds=1)
    end = start + datetime.timedelta(seconds=search_window_size)
    # Stop one minute short of "now" to avoid racing very recent writes.
    sync_end = singer.utils.now() - datetime.timedelta(minutes=1)
    # BUGFIX: was int(sync_end.strftime('%s')). '%s' is a non-standard glibc
    # extension (missing on some platforms) and interprets the datetime in
    # the machine's LOCAL timezone, ignoring the attached UTC tzinfo.
    # timestamp() honors tzinfo and is portable.
    epoch_sync_end = int(sync_end.timestamp())
    parsed_sync_end = singer.strftime(sync_end, "%Y-%m-%dT%H:%M:%SZ")

    while start < sync_end:
        epoch_start = int(start.timestamp())
        parsed_start = singer.strftime(start, "%Y-%m-%dT%H:%M:%SZ")
        epoch_end = int(end.timestamp())
        parsed_end = singer.strftime(end, "%Y-%m-%dT%H:%M:%SZ")

        LOGGER.info("Querying for satisfaction ratings between %s and %s",
                    parsed_start, min(parsed_end, parsed_sync_end))
        satisfaction_ratings = self.client.satisfaction_ratings(
            start_time=epoch_start, end_time=min(epoch_end, epoch_sync_end))

        # NB: We've observed that the tap can sync 50k records in ~15
        # minutes, due to this, the tap will adjust the time range
        # dynamically to ensure bookmarks are able to be written in
        # cases of high volume.
        if satisfaction_ratings.count > 50000:
            search_window_size = search_window_size // 2
            end = start + datetime.timedelta(seconds=search_window_size)
            LOGGER.info(
                "satisfaction_ratings - Detected Search API response size for this window is too large (> 50k). Cutting search window in half to %s seconds.",
                search_window_size)
            continue

        for satisfaction_rating in satisfaction_ratings:
            assert parsed_start <= satisfaction_rating.updated_at, \
                "satisfaction_ratings - Record found before date window start. Details: window start ({}) is not less than or equal to updated_at ({})".format(
                    parsed_start, satisfaction_rating.updated_at)
            if bookmark < utils.strptime_with_tz(
                    satisfaction_rating.updated_at) <= end:
                # NB: We don't trust that the records come back ordered by
                # updated_at (we've observed out-of-order records),
                # so we can't save state until we've seen all records
                self.update_bookmark(state, satisfaction_rating.updated_at)
            if parsed_start <= satisfaction_rating.updated_at <= parsed_end:
                yield (self.stream, satisfaction_rating)

        # Grow the window back toward its configured size after a successful
        # request, so a transient spike doesn't pin us at tiny windows.
        if search_window_size <= original_search_window_size // 2:
            search_window_size = search_window_size * 2
            LOGGER.info(
                "Successfully requested records. Doubling search window to %s seconds",
                search_window_size)
        singer.write_state(state)
        # Overlap windows by one second to preserve the >= semantics above.
        start = end - datetime.timedelta(seconds=1)
        end = start + datetime.timedelta(seconds=search_window_size)
def sync(self, state):
    """Incrementally sync users via the Zendesk Search API over date windows.

    Queries users whose ``updated_at`` falls in a sliding window between the
    saved bookmark and one minute before "now". Because the Search API caps
    results at 1000 records, the window halves (down to 1 second) when a
    response is too large; after a clean window it doubles back toward the
    configured size. Windows that return records older than the window start
    (eventual-consistency artifacts) are retried for up to 30 minutes.

    Args:
        state: Singer state dict; mutated via ``self.update_bookmark`` and
            flushed with ``singer.write_state`` after each window.

    Yields:
        (stream, user) tuples for users updated within the current window.

    Raises:
        Exception: if more than 1000 users fall within a single second.
        AssertionError: if out-of-window records persist after 60 retries.
    """
    original_search_window_size = int(
        self.config.get('search_window_size', DEFAULT_SEARCH_WINDOW_SIZE))
    search_window_size = original_search_window_size
    bookmark = self.get_bookmark(state)
    # Back up one second so the window is inclusive of the bookmark itself.
    start = bookmark - datetime.timedelta(seconds=1)
    end = start + datetime.timedelta(seconds=search_window_size)
    sync_end = singer.utils.now() - datetime.timedelta(minutes=1)
    parsed_sync_end = singer.strftime(sync_end, "%Y-%m-%dT%H:%M:%SZ")

    # ASSUMPTION: updated_at value always comes back in utc
    num_retries = 0
    while start < sync_end:
        parsed_start = singer.strftime(start, "%Y-%m-%dT%H:%M:%SZ")
        # String min() is safe here: both are fixed-width ISO-8601 UTC, so
        # lexicographic order equals chronological order.
        parsed_end = min(singer.strftime(end, "%Y-%m-%dT%H:%M:%SZ"),
                         parsed_sync_end)
        LOGGER.info("Querying for users between %s and %s",
                    parsed_start, parsed_end)
        users = self.client.search("", updated_after=parsed_start,
                                   updated_before=parsed_end, type="user")

        # NB: Zendesk will return an error on the 1001st record, so we
        # need to check total response size before iterating
        # See: https://develop.zendesk.com/hc/en-us/articles/360022563994--BREAKING-New-Search-API-Result-Limits
        if users.count > 1000:
            if search_window_size > 1:
                search_window_size = search_window_size // 2
                end = start + datetime.timedelta(seconds=search_window_size)
                LOGGER.info(
                    "users - Detected Search API response size too large. Cutting search window in half to %s seconds.",
                    search_window_size)
                continue
            raise Exception(
                "users - Unable to get all users within minimum window of a single second ({}), found {} users within this timestamp. Zendesk can only provide a maximum of 1000 users per request. See: https://develop.zendesk.com/hc/en-us/articles/360022563994--BREAKING-New-Search-API-Result-Limits".format(
                    parsed_start, users.count))

        # Consume the records to account for dates lower than window start.
        # (Was an unnecessary comprehension; list() is the idiomatic
        # materialization and needs no pylint disable.)
        users = list(users)

        if not all(parsed_start <= user.updated_at for user in users):
            # Only retry up to 30 minutes (60 attempts at 30 seconds each)
            if num_retries < 60:
                LOGGER.info(
                    "users - Record found before date window start. Waiting 30 seconds, then retrying window for consistency. (Retry #%s)",
                    num_retries + 1)
                time.sleep(30)
                num_retries += 1
                continue
            raise AssertionError(
                "users - Record found before date window start and did not resolve after 30 minutes of retrying. Details: window start ({}) is not less than or equal to updated_at value(s) {}".format(
                    parsed_start,
                    [str(user.updated_at) for user in users
                     if user.updated_at < parsed_start]))

        # If we make it here, all quality checks have passed. Reset retry count.
        num_retries = 0
        for user in users:
            if parsed_start <= user.updated_at <= parsed_end:
                yield (self.stream, user)
        self.update_bookmark(state, parsed_end)

        # Assumes that the for loop got everything
        singer.write_state(state)
        if search_window_size <= original_search_window_size // 2:
            search_window_size = search_window_size * 2
            LOGGER.info(
                "Successfully requested records. Doubling search window to %s seconds",
                search_window_size)
        start = end - datetime.timedelta(seconds=1)
        end = start + datetime.timedelta(seconds=search_window_size)
def transform_datetime_string(dts):
    """Normalize a datetime string to a Singer-formatted UTC timestamp.

    Naive datetimes are assumed to already be in UTC; timezone-aware
    datetimes are converted to UTC before formatting.
    """
    dt = dateutil.parser.parse(dts)
    dt = (dt.replace(tzinfo=timezone.utc)
          if dt.tzinfo is None
          else dt.astimezone(timezone.utc))
    return singer.strftime(dt)
def populate_simple_table(dynamodb):
    """Seed the 'simple_table' DynamoDB table with 50 deterministic items.

    Each item gets a sequential integer ``id``, a random string, and a
    ``date_field`` starting at 2018-01-01 UTC and advancing 5 days per item.
    Blocks until the table exists and its first global secondary index has
    finished backfilling (status ACTIVE).
    """
    print('\nPopulating table: simple_table')
    item_count = 50
    table = dynamodb.Table('simple_table')
    table.wait_until_exists()
    base_dt = datetime.datetime(2018, 1, 1, 0, 0, 0, 0,
                                tzinfo=datetime.timezone.utc)
    for idx in range(item_count):
        table.put_item(
            Item={
                "id": idx,
                "string_field": random_string_generator(),
                "date_field": singer.strftime(
                    base_dt + datetime.timedelta(days=(5 * idx))),
            }
        )
    # Poll until the global secondary index reports ACTIVE (backfill done).
    while (not table.global_secondary_indexes
           or table.global_secondary_indexes[0]['IndexStatus'] != 'ACTIVE'):
        print('Waiting for index to backfill...')
        time.sleep(5)
        table.reload()
    print('Added {} items to table: simple_table'.format(item_count))