def __init__(self):
    self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)
    # Set the request timeout
    self.request_timeout = get_request_timeout()
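# A minimal sketch of what `get_request_timeout` above is assumed to do: read
# an optional "request_timeout" key from the tap config and fall back to a
# default. The `REQUEST_TIMEOUT` constant and the config key name are
# assumptions for illustration, not confirmed from this codebase.
REQUEST_TIMEOUT = 300  # hypothetical default, in seconds

def get_request_timeout():
    # Treat a missing or empty config value as "use the default".
    config_timeout = Context.config.get('request_timeout')
    return float(config_timeout) if config_timeout else REQUEST_TIMEOUT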
def get_selected_parents():
    for parent_stream in ['orders', 'customers', 'products', 'custom_collections']:
        if Context.is_selected(parent_stream):
            yield Context.stream_objects[parent_stream]()
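# Illustrative use of `get_selected_parents`: a child stream can walk every
# selected parent and page through its records via the parent's `get_objects`
# generator (defined below). The wiring here is a sketch, not the tap's
# confirmed child-stream implementation.
def example_sync_child_records():
    for parent in get_selected_parents():
        for parent_obj in parent.get_objects():
            LOGGER.info("Fetched %s record with id %s", parent.name, parent_obj.id)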
def get_objects(self):
    updated_at_min = self.get_bookmark()

    stop_time = singer.utils.now().replace(microsecond=0)
    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
    results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

    # Page through till the end of the resultset
    while updated_at_min < stop_time:
        # Bookmarking can also occur on the since_id
        since_id = self.get_since_id() or 1
        if since_id != 1:
            LOGGER.info("Resuming sync from since_id %d", since_id)

        # It's important that `updated_at_min` has microseconds
        # truncated. Why has been lost to the mists of time but we
        # think it has something to do with how the API treats
        # microseconds on its date windows. Maybe it's possible to
        # drop data due to rounding errors or something like that?
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time

        while True:
            status_key = self.status_key or "status"
            query_params = {
                "since_id": since_id,
                "updated_at_min": updated_at_min,
                "updated_at_max": updated_at_max,
                "limit": results_per_page,
                status_key: "any"
            }

            with metrics.http_request_timer(self.name):
                objects = self.call_api(query_params)

            for obj in objects:
                if obj.id < since_id:
                    # This verifies the api behavior expectation we
                    # have that all results actually honor the
                    # since_id parameter.
                    raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                        obj.id, since_id))
                yield obj

            # You know you're at the end when the current page has
            # less than the request size limits you set.
            if len(objects) < results_per_page:
                # Save the updated_at_max as our bookmark as we've synced all rows up in our
                # window and can move forward. Also remove the since_id because we want to
                # restart at 1.
                Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                self.update_bookmark(utils.strftime(updated_at_max))
                break

            if objects[-1].id != max([o.id for o in objects]):
                # This verifies the api behavior expectation we have
                # that all pages are internally ordered by the
                # `since_id`.
                raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                    objects[-1].id, max([o.id for o in objects])))
            since_id = objects[-1].id

            # Put since_id into the state.
            self.update_bookmark(since_id, bookmark_key='since_id')

        updated_at_min = updated_at_max
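# For orientation, the state layout that `get_since_id`, `update_bookmark`,
# and the `pop('since_id', None)` call above operate on is assumed to follow
# the standard Singer bookmark shape; the key names and values here are
# illustrative only.
EXAMPLE_STATE = {
    "bookmarks": {
        "orders": {
            "updated_at": "2023-01-01T00:00:00.000000Z",  # replication-key bookmark
            "since_id": 450789469,                        # mid-window resume point
        }
    }
}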
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0
            Context.durations[stream["tap_stream_id"]] = None

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets sync'd first
    currently_sync_stream_name = Context.state.get('bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_start_time = time.time()
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()
        stream.schema = catalog_entry['schema']

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        if Context.config.get("use_async", False) and stream.async_available:
            Context.counts[stream_id] = stream.sync_async()
        else:
            with Transformer() as transformer:
                for rec in stream.sync():
                    extraction_time = singer.utils.now()
                    record_metadata = metadata.to_map(catalog_entry['metadata'])
                    rec = transformer.transform(rec, stream.schema, record_metadata)
                    singer.write_record(stream_id, rec, time_extracted=extraction_time)
                    Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

        stream_job_duration = time.strftime("%H:%M:%S",
                                            time.gmtime(time.time() - stream_start_time))
        Context.durations[stream_id] = stream_job_duration

    div = "-" * 50
    info_msg = "\n{d}".format(d=div)
    info_msg += "\nShop: {}".format(Context.config['shop'])
    info_msg += "\n{d}\n".format(d=div)
    for stream_id, stream_count in Context.counts.items():
        info_msg += "\n{}: {}".format(stream_id, stream_count)
        info_msg += "\nDuration: {}".format(Context.durations[stream_id])
    info_msg += "\n{d}\n".format(d=div)
    LOGGER.info(info_msg)
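# A minimal sketch of `shuffle_streams`, assuming it reorders the catalog so
# the interrupted stream is resumed first while keeping the relative order of
# the remaining streams; the tap's real helper may differ.
def shuffle_streams(stream_name):
    resumed = [s for s in Context.catalog["streams"] if s["tap_stream_id"] == stream_name]
    others = [s for s in Context.catalog["streams"] if s["tap_stream_id"] != stream_name]
    Context.catalog["streams"] = resumed + others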
def get_objects(self):
    updated_at_min = self.get_bookmark()
    stop_time = singer.utils.now().replace(microsecond=0)

    # Retrieve at most one year of data; otherwise log that further
    # incremental imports are needed.
    diff_days = (stop_time - updated_at_min).days
    yearly = False
    if diff_days > 365:
        yearly = True
        stop_time = updated_at_min + datetime.timedelta(days=365)
        LOGGER.info("This import will only import the first year of historical data. "
                    "You need to trigger further incremental imports to get the missing rows.")

    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
    results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

    # Page through till the end of the resultset
    while updated_at_min < stop_time:
        # Bookmarking can also occur on the since_id
        since_id = self.get_since_id() or 1
        if since_id != 1:
            LOGGER.info("Resuming sync from since_id %d", since_id)

        # It's important that `updated_at_min` has microseconds
        # truncated. Why has been lost to the mists of time but we
        # think it has something to do with how the API treats
        # microseconds on its date windows. Maybe it's possible to
        # drop data due to rounding errors or something like that?
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time

        singer.log_info("getting from %s - %s", updated_at_min, updated_at_max)
        min_filter_key = self.get_min_replication_key()
        max_filter_key = self.get_max_replication_key()
        while True:
            status_key = self.status_key or "status"
            query_params = {
                "since_id": since_id,
                min_filter_key: updated_at_min,
                max_filter_key: updated_at_max,
                "limit": results_per_page,
            }
            if self.add_status:
                query_params[status_key] = "any"

            with metrics.http_request_timer(self.name):
                objects = self.call_api(query_params)

            for obj in objects:
                if obj.id < since_id:
                    # This verifies the api behavior expectation we
                    # have that all results actually honor the
                    # since_id parameter.
                    raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                        obj.id, since_id))
                yield obj

            # You know you're at the end when the current page has
            # less than the request size limits you set.
            singer.log_info(f"Got {len(objects)} records")
            if len(objects) < results_per_page:
                # Save the updated_at_max as our bookmark as we've synced all rows up in our
                # window and can move forward. Also remove the since_id because we want to
                # restart at 1.
                Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                state_val = updated_at_max
                if self.skip_day:
                    state_val = state_val + datetime.timedelta(days=1)
                self.update_bookmark(utils.strftime(state_val))
                break

            if objects[-1].id != max([o.id for o in objects]):
                # This verifies the api behavior expectation we have
                # that all pages are internally ordered by the
                # `since_id`.
                raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                    objects[-1].id, max([o.id for o in objects])))
            since_id = objects[-1].id

            # Put since_id into the state.
            self.update_bookmark(since_id, bookmark_key='since_id')

        updated_at_min = updated_at_max + datetime.timedelta(seconds=1)
        if self.skip_day:
            updated_at_min = updated_at_min + datetime.timedelta(days=1)

    if yearly:
        LOGGER.info("This import only imported one year of historical data. "
                    "Please trigger further incremental imports to get the missing rows.")
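# A worked example (hypothetical dates) of the window arithmetic above with
# date_window_size=30 and skip_day=False: each new window starts one second
# past the previous `updated_at_max` so the boundary record is not re-read,
# and the whole run is capped at 365 days from the starting bookmark.
def example_windows():
    start = datetime.datetime(2022, 1, 1)        # bookmark
    stop = start + datetime.timedelta(days=365)  # capped stop_time
    window = datetime.timedelta(days=30)
    cursor = start
    while cursor < stop:
        upper = min(cursor + window, stop)
        LOGGER.info("window %s - %s", cursor, upper)
        cursor = upper + datetime.timedelta(seconds=1)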
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets sync'd first
    currently_sync_stream_name = Context.state.get('bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            try:
                for rec in stream.sync():
                    extraction_time = singer.utils.now()
                    record_schema = catalog_entry['schema']
                    record_metadata = metadata.to_map(catalog_entry['metadata'])
                    rec = transformer.transform(rec, record_schema, record_metadata)
                    singer.write_record(stream_id, rec, time_extracted=extraction_time)
                    Context.counts[stream_id] += 1
            except pyactiveresource.connection.ResourceNotFound as exc:
                raise ShopifyError(exc, 'Ensure shop is entered correctly') from exc
            except pyactiveresource.connection.UnauthorizedAccess as exc:
                raise ShopifyError(exc, 'Invalid access token - Re-authorize the connection') \
                    from exc
            except pyactiveresource.connection.ConnectionError as exc:
                msg = ''
                try:
                    body_json = exc.response.body.decode()
                    body = json.loads(body_json)
                    msg = body.get('errors')
                finally:
                    raise ShopifyError(exc, msg) from exc
            except Exception as exc:
                raise ShopifyError(exc) from exc

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
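# A minimal sketch of a `ShopifyError` compatible with the handlers above,
# assuming it simply pairs the original exception type with an optional
# human-readable hint; the tap's real exception class may carry more context.
class ShopifyError(Exception):
    def __init__(self, error, msg=''):
        super().__init__('{}\n{}'.format(error.__class__.__name__, msg))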
def __init__(self):
    self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)