Example #1
    def __init__(self):
        self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # set request timeout
        self.request_timeout = get_request_timeout()
Example #2
def get_selected_parents():
    for parent_stream in [
            'orders', 'customers', 'products', 'custom_collections'
    ]:
        if Context.is_selected(parent_stream):
            yield Context.stream_objects[parent_stream]()
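
A minimal sketch, not taken from the tap itself, of how a child stream could fan out over these selected parents; it relies only on the get_objects generator shown in Examples #3 and #5 and the id attribute used there:

def get_parent_ids():
    # Walk every selected parent stream and yield the id of each of its
    # objects, e.g. so a child stream can issue one API call per parent.
    for parent in get_selected_parents():
        for parent_obj in parent.get_objects():
            yield parent_obj.id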
Example #3
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        date_window_size = float(
            Context.config.get("date_window_size", DATE_WINDOW_SIZE))
        results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # Page through till the end of the resultset
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. Why has been lost to the mists of time but we
            # think it has something to do with how the API treats
            # microseconds on its date windows. Maybe it's possible to
            # drop data due to rounding errors or something like that?
            updated_at_max = updated_at_min + datetime.timedelta(
                days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time
            while True:
                status_key = self.status_key or "status"
                query_params = {
                    "since_id": since_id,
                    "updated_at_min": updated_at_min,
                    "updated_at_max": updated_at_max,
                    "limit": results_per_page,
                    status_key: "any"
                }

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError(
                            "obj.id < since_id: {} < {}".format(
                                obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page has
                # less than the request size limits you set.
                if len(objects) < results_per_page:
                    # Save the updated_at_max as our bookmark as we've synced all rows up in our
                    # window and can move forward. Also remove the since_id because we want to
                    # restart at 1.
                    Context.state.get('bookmarks',
                                      {}).get(self.name,
                                              {}).pop('since_id', None)
                    self.update_bookmark(utils.strftime(updated_at_max))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError(
                        "{} is not the max id in objects ({})".format(
                            objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max
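
For orientation, a minimal sketch of the sync() method that Examples #4 and #6 consume, assuming it simply drains this generator and that the objects returned by call_api expose pyactiveresource's to_dict() serializer:

    def sync(self):
        # Serialize each API object into a plain dict for the Singer writer;
        # to_dict() is assumed to come from the pyactiveresource base class.
        for obj in self.get_objects():
            yield obj.to_dict()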
Example #4
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0
            Context.durations[stream["tap_stream_id"]] = None

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets sync'd first
    currently_sync_stream_name = Context.state.get('bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_start_time = time.time()
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()
        stream.schema = catalog_entry['schema']

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        if Context.config.get("use_async", False) and stream.async_available:
            Context.counts[stream_id] = stream.sync_async()
        else:
            with Transformer() as transformer:
                for rec in stream.sync():
                    extraction_time = singer.utils.now()
                    record_metadata = metadata.to_map(catalog_entry['metadata'])
                    rec = transformer.transform(rec, stream.schema, record_metadata)
                    singer.write_record(stream_id,
                                        rec,
                                        time_extracted=extraction_time)
                    Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)
        stream_job_duration = time.strftime("%H:%M:%S", time.gmtime(time.time() - stream_start_time))
        Context.durations[stream_id] = stream_job_duration

    div = "-"*50
    info_msg = "\n{d}".format(d=div)
    info_msg += "\nShop: {}".format(Context.config['shop'])
    info_msg += "\n{d}\n".format(d=div)
    for stream_id, stream_count in Context.counts.items():
        info_msg += "\n{}: {}".format(stream_id, stream_count)
        info_msg += "\nDuration: {}".format(Context.durations[stream_id])
    info_msg += "\n{d}\n".format(d=div)
    LOGGER.info(info_msg)
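
A hedged sketch of what shuffle_streams(currently_sync_stream_name) plausibly does, based only on the comment above: rotate Context.catalog['streams'] so the interrupted stream is processed first. The real helper may differ.

def shuffle_streams(stream_name):
    # Locate the interrupted stream in the catalog ...
    matching_index = 0
    for i, catalog_entry in enumerate(Context.catalog['streams']):
        if catalog_entry['tap_stream_id'] == stream_name:
            matching_index = i
    # ... and rotate the list so that stream (and everything after it) runs first.
    streams = Context.catalog['streams']
    Context.catalog['streams'] = streams[matching_index:] + streams[:matching_index]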
Example #5
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        # Retrieve at most one year of data; log that further incremental syncs are needed.
        diff_days = (stop_time - updated_at_min).days
        yearly = False
        if diff_days > 365:
            yearly = True
            stop_time = updated_at_min + datetime.timedelta(days=365)
            LOGGER.info("This import will only import the first year of historical data. "
                        "You need to trigger further incremental imports to get the missing rows.")

        date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
        results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # Page through till the end of the resultset
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. Why has been lost to the mists of time but we
            # think it has something to do with how the API treats
            # microseconds on its date windows. Maybe it's possible to
            # drop data due to rounding errors or something like that?
            updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time

            singer.log_info("getting from %s - %s", updated_at_min,
                            updated_at_max)

            min_filer_key = self.get_min_replication_key()
            max_filer_key = self.get_max_replication_key()

            while True:
                status_key = self.status_key or "status"
                query_params = {
                    "since_id": since_id,
                    min_filer_key: updated_at_min,
                    max_filer_key: updated_at_max,
                    "limit": results_per_page,
                }

                if self.add_status:
                    query_params[status_key] = "any"

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                            obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page has
                # less than the request size limits you set.
                singer.log_info(f"Got {len(objects)} records")
                if len(objects) < results_per_page:
                    # Save the updated_at_max as our bookmark as we've synced all rows up in our
                    # window and can move forward. Also remove the since_id because we want to
                    # restart at 1.
                    Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                    state_val = updated_at_max
                    if self.skip_day:
                        state_val = state_val + datetime.timedelta(days=1)
                    self.update_bookmark(utils.strftime(state_val))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                        objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max + datetime.timedelta(seconds=1)

            if self.skip_day:
                updated_at_min = updated_at_min + datetime.timedelta(days=1)

        if yearly:
            LOGGER.info("This import only imported one year of historical data. "
                        "Please trigger further incremental data to get the missing rows.")
Example #6
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets sync'd first
    currently_sync_stream_name = Context.state.get(
        'bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            try:
                for rec in stream.sync():
                    extraction_time = singer.utils.now()
                    record_schema = catalog_entry['schema']
                    record_metadata = metadata.to_map(
                        catalog_entry['metadata'])
                    rec = transformer.transform(rec, record_schema,
                                                record_metadata)
                    singer.write_record(stream_id,
                                        rec,
                                        time_extracted=extraction_time)
                    Context.counts[stream_id] += 1
            except pyactiveresource.connection.ResourceNotFound as exc:
                raise ShopifyError(exc,
                                   'Ensure shop is entered correctly') from exc
            except pyactiveresource.connection.UnauthorizedAccess as exc:
                raise ShopifyError(exc, 'Invalid access token - Re-authorize the connection') \
                    from exc
            except pyactiveresource.connection.ConnectionError as exc:
                msg = ''
                try:
                    body_json = exc.response.body.decode()
                    body = json.loads(body_json)
                    msg = body.get('errors')
                finally:
                    raise ShopifyError(exc, msg) from exc
            except Exception as exc:
                raise ShopifyError(exc) from exc

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
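
Finally, a hedged sketch of the kind of main() entry point that would populate Context and invoke sync(); the required config keys listed here are assumptions for illustration, not taken from this listing:

def main():
    # Parse --config / --state / --catalog the standard singer-python way.
    args = singer.utils.parse_args(["shop", "api_key", "start_date"])  # assumed keys
    Context.config = args.config
    Context.state = args.state or {}
    # The sync code above indexes the catalog as a dict, so convert it.
    Context.catalog = args.catalog.to_dict() if args.catalog else {"streams": []}
    sync()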
Example #7
    def __init__(self):
        self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)