示例#1
0
async def sync_catalog_entry(sf, catalog_entry, state):
    """Sync one catalog entry, running the blocking sync in an executor.

    Writes the current state and the stream's schema, then either resumes
    a previously recorded bulk job (when a ``JobID`` bookmark exists for
    the stream) or starts a fresh sync. Streams that are not selected are
    skipped without emitting anything.

    Args:
        sf: Client object passed through unchanged to
            ``resume_syncing_bulk_query`` / ``sync_stream``.
        catalog_entry: Catalog entry dict. Reads ``stream``,
            ``stream_alias`` (optional), ``tap_stream_id``, ``metadata``
            and ``schema``.
        state: Singer state dict. Bookmarks for this stream are updated
            in place.
    """
    stream_version = get_stream_version(catalog_entry, state)
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    stream_name = catalog_entry["tap_stream_id"]
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    # Build the metadata map once; the empty-tuple breadcrumb holds the
    # stream-level (root) metadata.
    mdata = metadata.to_map(catalog_entry['metadata'])
    root_metadata = mdata.get((), {})
    replication_key = root_metadata.get('replication-key')

    if not stream_is_selected(mdata):
        LOGGER.info("%s: Skipping - not selected", stream_name)
        return

    LOGGER.info("%s: Starting", stream_name)

    singer.write_state(state)
    key_properties = root_metadata.get('table-key-properties')
    singer.write_schema(stream, catalog_entry['schema'], key_properties,
                        replication_key, stream_alias)

    loop = asyncio.get_event_loop()

    job_id = singer.get_bookmark(state, stream_name, 'JobID')
    if job_id:
        with metrics.record_counter(stream) as counter:
            LOGGER.info(
                "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                job_id)
            # Resuming a sync should clear out the remaining state once finished
            await loop.run_in_executor(None, resume_syncing_bulk_query, sf,
                                       catalog_entry, job_id, state, counter)
            LOGGER.info("Completed sync for %s", stream_name)

            # Looked up after the resumed job returns so that we operate on
            # whatever bookmark dict the job left in the state.
            stream_bookmarks = state.get('bookmarks', {}).get(stream_name, {})
            stream_bookmarks.pop('JobID', None)
            stream_bookmarks.pop('BatchIDs', None)
            bookmark = stream_bookmarks.pop('JobHighestBookmarkSeen', None)
            existing_bookmark = stream_bookmarks.pop(replication_key, None)
            # If the job recorded no highest bookmark (e.g. it failed before
            # producing rows), keep the existing bookmark instead of
            # overwriting it with None.
            state = singer.write_bookmark(state, stream_name, replication_key,
                                          bookmark or existing_bookmark)
            singer.write_state(state)
    else:
        state_msg_threshold = CONFIG.get('state_message_threshold', 1000)

        # Tables with a replication_key or an empty bookmark will emit an
        # activate_version at the beginning of their sync
        bookmark_is_empty = state.get('bookmarks', {}).get(stream_name) is None

        if replication_key or bookmark_is_empty:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state, stream_name, 'version',
                                          stream_version)
        await loop.run_in_executor(None, sync_stream, sf, catalog_entry, state,
                                   state_msg_threshold)
        LOGGER.info("Completed sync for %s", stream_name)
def do_sync(sf, catalog, state):
    """Sync every selected stream in the catalog, resuming where possible.

    If ``state["current_stream"]`` is set, streams listed before it are
    skipped as already synced and the sync picks up at that stream. For
    each synced stream the state and schema are written, then either a
    previously recorded bulk job is resumed (``JobID`` bookmark present)
    or a fresh sync is started. ``current_stream`` is cleared at the end.

    Args:
        sf: Client object passed through unchanged to
            ``resume_syncing_bulk_query`` / ``sync_stream``.
        catalog: Catalog dict; iterates ``catalog["streams"]``.
        state: Singer state dict; updated in place and written out as
            state messages.
    """
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        # Build the metadata map once per stream; the empty-tuple breadcrumb
        # holds the stream-level (root) metadata.
        mdata = metadata.to_map(catalog_entry['metadata'])
        root_metadata = mdata.get((), {})
        replication_key = root_metadata.get('replication-key')

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                # Clear the marker so every following stream is synced.
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = root_metadata.get('table-key-properties')
        singer.write_schema(
            stream,
            catalog_entry['schema'],
            key_properties,
            replication_key,
            stream_alias)

        job_id = singer.get_bookmark(state, stream_name, 'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)
                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                # Looked up after the resumed job returns so that we operate
                # on whatever bookmark dict the job left in the state.
                stream_bookmarks = state.get('bookmarks', {}).get(stream_name, {})
                stream_bookmarks.pop('JobID', None)
                stream_bookmarks.pop('BatchIDs', None)
                bookmark = stream_bookmarks.pop('JobHighestBookmarkSeen', None)
                existing_bookmark = stream_bookmarks.pop(replication_key, None)
                state = singer.write_bookmark(
                    state,
                    stream_name,
                    replication_key,
                    bookmark or existing_bookmark)  # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(stream_name) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              stream_name,
                                              'version',
                                              stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
示例#3
0
def do_sync(sf, catalog, state):
    """Sync every selected stream in the catalog, resuming where possible.

    If ``state["current_stream"]`` is set, streams listed before it are
    skipped as already synced and the sync picks up at that stream. For
    each synced stream the state and schema are written, then either a
    previously recorded bulk job is resumed (``JobID`` bookmark present)
    or a fresh sync is started. ``current_stream`` is cleared at the end.

    Args:
        sf: Client object passed through unchanged to
            ``resume_syncing_bulk_query`` / ``sync_stream``.
        catalog: Catalog dict; iterates ``catalog["streams"]``. Each entry
            supplies ``replication_key`` (optional) and ``key_properties``
            at its top level.
        state: Singer state dict; updated in place and written out as
            state messages.
    """
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)
        replication_key = catalog_entry.get('replication_key')

        mdata = metadata.to_map(catalog_entry['metadata'])
        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                # Clear the marker so every following stream is synced.
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        singer.write_schema(stream, catalog_entry['schema'],
                            catalog_entry['key_properties'], replication_key,
                            stream_alias)

        job_id = singer.get_bookmark(state, stream_name, 'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info("Resuming sync for stream: %s", stream_name)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id,
                                                    state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                            counter.value)

                # Looked up after the resumed job returns so that we operate
                # on whatever bookmark dict the job left in the state.
                stream_bookmarks = state.get('bookmarks', {}).get(stream_name, {})
                stream_bookmarks.pop('JobID', None)
                stream_bookmarks.pop('BatchIDs', None)
                bookmark = stream_bookmarks.pop('JobHighestBookmarkSeen', None)
                existing_bookmark = stream_bookmarks.pop(replication_key, None)
                # If the job recorded no highest bookmark (e.g. it failed
                # before producing rows), keep the existing bookmark instead
                # of overwriting it with None.
                state = singer.write_bookmark(state, stream_name,
                                              replication_key,
                                              bookmark or existing_bookmark)
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(stream_name) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state, stream_name, 'version',
                                              stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                        counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")