Example #1
def get_metafields(parent_object, since_id, parent_replication_object,
                   timeout):
    # set timeout
    parent_replication_object.set_timeout(timeout)
    # This call results in an HTTP request - the parent object never has a
    # cache of this data so we have to issue that request.
    return parent_object.metafields(
        limit=Context.get_results_per_page(RESULTS_PER_PAGE),
        since_id=since_id)
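
Example #1 only fetches a single page of metafields; the caller is expected to advance since_id itself and stop when a short page comes back, the same pattern the get_objects examples below use. A minimal driver sketch under that assumption, reusing get_metafields, Context, and RESULTS_PER_PAGE from above (the iterate_metafields name itself is hypothetical, not part of the tap):

def iterate_metafields(parent_object, parent_replication_object, timeout):
    # Start from the lowest id and page until the API returns a short page.
    since_id = 1
    results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)
    while True:
        page = get_metafields(parent_object, since_id,
                              parent_replication_object, timeout)
        for metafield in page:
            yield metafield
        # A page smaller than the requested limit means there is nothing left.
        if len(page) < results_per_page:
            break
        # Pages are ordered by id, so resume from the last id seen.
        since_id = page[-1].id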
Example #2
File: base.py  Project: sakrafd/tap-shopify
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        date_window_size = float(
            Context.config.get("date_window_size", DATE_WINDOW_SIZE))
        results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # Page through till the end of the resultset
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. Why has been lost to the mists of time but we
            # think it has something to do with how the API treats
            # microseconds on its date windows. Maybe it's possible to
            # drop data due to rounding errors or something like that?
            updated_at_max = updated_at_min + datetime.timedelta(
                days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time
            while True:
                status_key = self.status_key or "status"
                query_params = {
                    "since_id": since_id,
                    "updated_at_min": updated_at_min,
                    "updated_at_max": updated_at_max,
                    "limit": results_per_page,
                    status_key: "any"
                }

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError(
                            "obj.id < since_id: {} < {}".format(
                                obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page returns
                # fewer results than the page size you requested.
                if len(objects) < results_per_page:
                    # Save the updated_at_max as our bookmark as we've synced all rows up in our
                    # window and can move forward. Also remove the since_id because we want to
                    # restart at 1.
                    Context.state.get('bookmarks',
                                      {}).get(self.name,
                                              {}).pop('since_id', None)
                    self.update_bookmark(utils.strftime(updated_at_max))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError(
                        "{} is not the max id in objects ({})".format(
                            objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max
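
Every example resolves its page size through Context.get_results_per_page(RESULTS_PER_PAGE), passing the module-level constant as the default. The project's actual implementation is not shown in this listing; a plausible sketch, assuming it simply prefers a results_per_page entry from the tap config and falls back to the supplied default (the value 175 is only an illustrative default):

RESULTS_PER_PAGE = 175  # illustrative default page size, not the tap's real value

class Context:
    config = {}
    state = {}

    @classmethod
    def get_results_per_page(cls, default):
        # Prefer an explicit "results_per_page" setting from the tap config;
        # fall back to the default supplied by the caller.
        try:
            return int(cls.config.get("results_per_page", default))
        except (TypeError, ValueError):
            return default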
Example #3
    def __init__(self):
        self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)
Example #4
    def __init__(self):
        self.results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # set request timeout
        self.request_timeout = get_request_timeout()
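
Example #4 extends #3 by also resolving an HTTP request timeout. get_request_timeout is not shown in this listing; a hedged sketch, assuming it reads a request_timeout key from the tap config with a fallback default (both the key name and the 300-second value are assumptions for illustration):

REQUEST_TIMEOUT = 300  # assumed default timeout in seconds

def get_request_timeout():
    # Use the configured "request_timeout" when present and non-empty;
    # otherwise fall back to the default above.
    configured = Context.config.get("request_timeout")
    return float(configured) if configured else float(REQUEST_TIMEOUT)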
Example #5
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        # Retrieve at most one year of data; if more is needed, log that
        # further incremental syncs are required.
        diff_days = (stop_time - updated_at_min).days
        yearly = False
        if diff_days > 365:
            yearly = True
            stop_time = updated_at_min + datetime.timedelta(days=365)
            LOGGER.info("This import will only import the first year of historical data. "
                        "You need to trigger further incremental imports to get the missing rows.")

        date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
        results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # Page through till the end of the resultset
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. Why has been lost to the mists of time but we
            # think it has something to do with how the API treats
            # microseconds on its date windows. Maybe it's possible to
            # drop data due to rounding errors or something like that?
            updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time

            singer.log_info("getting from %s - %s", updated_at_min,
                            updated_at_max)

            min_filer_key = self.get_min_replication_key()
            max_filer_key = self.get_max_replication_key()

            while True:
                status_key = self.status_key or "status"
                query_params = {
                    "since_id": since_id,
                    min_filer_key: updated_at_min,
                    max_filer_key: updated_at_max,
                    "limit": results_per_page,
                }

                if self.add_status:
                    query_params[status_key] = "any"

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                            obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page returns
                # fewer results than the page size you requested.
                singer.log_info(f"Got {len(objects)} records")
                if len(objects) < results_per_page:
                    # Save the updated_at_max as our bookmark as we've synced all rows up in our
                    # window and can move forward. Also remove the since_id because we want to
                    # restart at 1.
                    Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                    state_val = updated_at_max
                    if self.skip_day:
                        state_val = state_val + datetime.timedelta(days=1)
                    self.update_bookmark(utils.strftime(state_val))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                        objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max + datetime.timedelta(seconds=1)

            if self.skip_day:
                updated_at_min = updated_at_min + datetime.timedelta(days=1)

        if yearly:
            LOGGER.info("This import only imported one year of historical data. "
                        "Please trigger further incremental data to get the missing rows.")