Exemplo n.º 1
0
    def sync(self):
        """Yield the records of every parent, resuming at a bookmarked parent.

        ``on_window_started`` is called first. Parent IDs are taken in the
        order produced by ``_sort_parent_ids_by_created``. Before the records
        of a parent are fetched, that parent's ID is written to the
        ``parent_id`` bookmark and the state is emitted. Once every parent has
        been processed the ``parent_id`` bookmark is cleared and
        ``on_window_finished`` is called.
        """
        self.on_window_started()
        parent_stream = self.parent_class(self.client, self.config, self.state)

        # A leftover 'parent_id' bookmark marks where an earlier run stopped.
        resume_from = singer.get_bookmark(self.state, self.stream_id,
                                          'parent_id')
        ordered_parents = self._sort_parent_ids_by_created(
            self.get_parent_ids(parent_stream))
        pending_ids = [entry['id'] for entry in ordered_parents]

        if resume_from and resume_from in pending_ids:
            # NB: The bookmarked parent itself is synced again. That causes
            # some rework, but guarantees no records are missed if the tap
            # was interrupted part-way through that parent.
            # - If there's too much data to sync all parents in a single run,
            #   this API is not appropriate for that data set.
            pending_ids = dropwhile(
                lambda candidate: candidate != resume_from, pending_ids)

        for current_id in pending_ids:
            # Persist the position before fetching this parent's records.
            singer.write_bookmark(self.state, self.stream_id, "parent_id",
                                  current_id)
            singer.write_state(self.state)
            yield from self.get_records([current_id])

        singer.clear_bookmark(self.state, self.stream_id, "parent_id")
        self.on_window_finished()
Exemplo n.º 2
0
 def on_window_finished(self):
     """Copy the ``window_end`` bookmark to ``last_record`` and emit state.

     The ``window_end`` bookmark is cleared after its value has been copied.
     """
     state, stream_id = self.state, self.stream_id
     finished_window_end = singer.get_bookmark(state, stream_id, 'window_end')
     singer.write_bookmark(state, stream_id, 'last_record',
                           finished_window_end)
     singer.clear_bookmark(state, stream_id, 'window_end')
     singer.write_state(state)
Exemplo n.º 3
0
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Run a full-table sync of one stream and write its version messages.

    Bookmark keys not produced by ``generate_bookmark_keys`` are cleared
    first. An ACTIVATE_VERSION message is written up front only while the
    initial full table sync has not completed (and the state does not hold
    an explicit ``None`` version), and again unconditionally once the query
    has been synced. When ``sync_is_resumable`` reports true, a primary-key
    clause is generated from the state. The ``max_pk_values`` and
    ``last_pk_fetched`` bookmarks are cleared after the sync.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   stream_id, state)

    stream_bookmark = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmark

    full_table_done = singer.get_bookmark(state, stream_id,
                                          'initial_full_table_complete')
    version_in_state = singer.get_bookmark(state, stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    version_explicitly_none = has_version_key and version_in_state is None
    if not (full_table_done or version_explicitly_none):
        singer.write_message(activate_version_message)

    resumable = sync_is_resumable(mysql_conn, catalog_entry)
    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        if resumable:
            LOGGER.info(
                "Full table sync is resumable based on primary key definition, will replicate incrementally"
            )
            state = update_incremental_full_table_state(
                catalog_entry, state, cur)
            pk_clause = generate_pk_clause(catalog_entry, state)

        select_sql = select_sql + pk_clause

        # Best effort: if the temp table cannot be created, the failure is
        # logged and the plain SELECT built above is used instead.
        try:
            select_sql = _create_temp_table(mysql_conn, catalog_entry,
                                            columns, pk_clause)
        except Exception as ex:
            logging.warning("creating temp table failed: {}".format(
                str(ex)))

        query_params = {}
        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, query_params)

    # clear max pk value and last pk fetched upon successful sync
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
Exemplo n.º 4
0
def whitelist_bookmark_keys(bookmark_key_set, tap_stream_id, state):
    """Clear every bookmark of a stream whose key is not in the whitelist.

    Args:
        bookmark_key_set: Collection of bookmark keys that may stay.
        tap_stream_id: Stream whose bookmarks are inspected.
        state: State dict; bookmarks are read from ``state['bookmarks']``.
    """
    stream_bookmarks = state.get('bookmarks', {}).get(tap_stream_id, {})

    # Collect the keys before clearing anything, so the bookmark mapping is
    # not being iterated while bookmarks are removed.
    stale_keys = [
        key for key in stream_bookmarks.keys()
        if key not in bookmark_key_set
    ]

    for key in stale_keys:
        singer.clear_bookmark(state, tap_stream_id, key)
Exemplo n.º 5
0
 def on_window_finished(self):
     """Roll the window forward: ``window_end`` becomes ``window_start``.

     The ``window_end`` bookmark is cleared afterwards and the state is
     emitted.
     """
     state, stream_id = self.state, self.stream_id
     # The end of the finished window is the start of the next one.
     next_window_start = singer.get_bookmark(state, stream_id, "window_end")
     singer.write_bookmark(state, stream_id, "window_start",
                           next_window_start)
     singer.clear_bookmark(state, stream_id, "window_end")
     singer.write_state(state)
Exemplo n.º 6
0
def sync_table(snowflake_conn, catalog_entry, state, columns):
    """Sync table incrementally.

    The ``replication-key`` from the stream's root metadata is compared with
    the ``replication_key`` bookmark. When they match, the bookmarked
    ``replication_key_value`` becomes an inclusive lower bound for the query.
    Otherwise the bookmark is reset to the metadata key and the stored value
    is cleared. The stream version is bookmarked, an ACTIVATE_VERSION message
    is written, and the SELECT is handed to ``common.sync_query``.

    Args:
        snowflake_conn: Object providing ``connect_with_backoff()``.
        catalog_entry: Catalog entry of the stream being synced.
        state: Singer state dict; bookmarks are updated on it.
        columns: Columns passed to ``common.generate_select_sql``.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        # The replication key changed: the old bookmark value belongs to a
        # different column, so it is dropped.
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    singer.write_message(activate_version_message)

    with snowflake_conn.connect_with_backoff() as open_conn:
        with open_conn.cursor() as cur:
            # The base query is built once, here; it was previously also
            # built before the connection was opened and then discarded.
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                # NOTE(review): the bookmark value is formatted straight into
                # the SQL text instead of being passed through `params`;
                # confirm the state file is trusted input.
                # pylint: disable=duplicate-string-formatting-argument
                select_sql += ' WHERE "{}" >= \'{}\' ORDER BY "{}" ASC'.format(
                    replication_key_metadata, replication_key_value,
                    replication_key_metadata)

            elif replication_key_metadata is not None:
                select_sql += ' ORDER BY "{}" ASC'.format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
Exemplo n.º 7
0
def sync(config, state, catalog):
    """Sync every selected stream of the catalog and emit Singer messages.

    For each selected stream this marks it as currently syncing, writes its
    schema, transforms and writes every record yielded by the stream object,
    then clears the stream's ``cursor`` bookmark and emits state. After the
    last stream, ``currently_syncing`` is reset to ``None``.

    Args:
        config: Tap configuration; ``token`` and ``company_id`` are read.
        state: Singer state dict.
        catalog: Catalog providing ``get_selected_streams(state)``.
    """
    client = ZenefitsClient(config['token'])
    company_id = config['company_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Staring sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            for record in stream_obj.sync(company_id=company_id):
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                # Lazy %-formatting: the record is only rendered when the
                # INFO level is actually enabled.
                LOGGER.info("Writing record: %s", transformed_record)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )

            # The stream finished, so its pagination cursor is dropped.
            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Exemplo n.º 8
0
    def sync(self, state, stream_schema, stream_metadata, config, transformer):
        """Sync the stream page by page, bookmarking the pagination cursor.

        Records whose replication-key value is below the start time are
        skipped. The start time is the replication-key bookmark, falling back
        to ``config['start_date']``. After each page the ``cursor`` bookmark
        is written and state emitted. When all pages are done, ``sync_start``
        and ``cursor`` are cleared and the replication-key bookmark is set to
        the time recorded in ``sync_start``.

        Returns:
            The updated state.
        """
        stream_id = self.tap_stream_id
        bookmark_field = self.replication_key

        start_time = singer.get_bookmark(state, stream_id, bookmark_field,
                                         config['start_date'])

        # Reuse a 'sync_start' left in state; otherwise the sync starts now.
        now_text = singer.utils.strftime(
            singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)
        sync_start_bookmark = singer.get_bookmark(state, stream_id,
                                                  'sync_start', now_text)
        state = singer.write_bookmark(state, stream_id, 'sync_start',
                                      sync_start_bookmark)

        bookmarked_cursor = singer.get_bookmark(state, stream_id, 'cursor')
        pages = self.get_pages_safe(state, bookmarked_cursor, start_time)

        for page, cursor in pages:
            for record in page:
                if not record[bookmark_field] >= start_time:
                    continue
                singer.write_record(
                    stream_id,
                    transformer.transform(record, stream_schema,
                                          stream_metadata),
                )
            state = singer.write_bookmark(state, stream_id, 'cursor', cursor)
            singer.write_state(state)

        for finished_key in ('sync_start', 'cursor'):
            state = singer.clear_bookmark(state, stream_id, finished_key)
        state = singer.write_bookmark(state, stream_id, bookmark_field,
                                      sync_start_bookmark)
        singer.write_state(state)
        return state
Exemplo n.º 9
0
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    """Full-table sync of one stream, followed by a STATE message.

    Writes the schema message, syncs the table under the stream version from
    ``common.get_stream_version``, clears the ``version`` bookmark, sets
    ``initial_full_table_complete`` and emits a deep copy of the state.
    """
    stream_id = catalog_entry.tap_stream_id
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    version = common.get_stream_version(stream_id, state)
    full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, stream_id, "version")
    state = singer.write_bookmark(
        state, stream_id, "initial_full_table_complete", True)

    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
Exemplo n.º 10
0
def sync_table(mysql_conn,
               catalog_entry,
               state,
               columns,
               original_state_file=''):
    """Incrementally sync one stream using its replication key.

    The ``replication-key`` from the stream's root metadata is compared with
    the ``replication_key`` bookmark. When they match, the bookmarked
    ``replication_key_value`` is used as an inclusive lower bound. Otherwise
    the bookmark is set to the metadata key and the stored value is cleared.
    The ACTIVATE_VERSION message is written for a stream named
    ``<database>_<stream>``. ``original_state_file`` is forwarded unchanged
    to ``common.sync_query``.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    root_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    replication_key_metadata = root_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state, stream_id,
                                                'replication_key')

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(state, stream_id,
                                                    'replication_key_value')
    else:
        replication_key_value = None
        state = singer.write_bookmark(state, stream_id, 'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(stream_id, state)
    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

    destination_stream = '%s_%s' % (common.get_database_name(catalog_entry),
                                    catalog_entry.stream)
    singer.write_message(
        singer.ActivateVersionMessage(stream=destination_stream,
                                      version=stream_version))

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        # NOTE(review): when there is no bookmarked value the query runs
        # without an ORDER BY - confirm that is intended.
        if replication_key_value is not None:
            key_schema = catalog_entry.schema.properties[
                replication_key_metadata]
            if key_schema.format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key_metadata, replication_key_metadata)
            params['replication_key_value'] = replication_key_value

        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, params, original_state_file)
Exemplo n.º 11
0
def do_sync_full_table(mssql_conn, config, catalog_entry, state, columns):
    """Full-table sync of one stream, followed by a STATE message.

    The ``mssql_conn`` argument is replaced by a new ``MSSQLConnection``
    built from ``config`` before the sync. After the sync the ``version``
    bookmark is cleared, ``initial_full_table_complete`` is set and a deep
    copy of the state is emitted.
    """
    stream_id = catalog_entry.tap_stream_id

    # The result is not used in this function; the call is kept unchanged.
    key_properties = common.get_key_properties(catalog_entry)
    # The connection passed in is discarded in favour of a new one.
    mssql_conn = MSSQLConnection(config)

    write_schema_message(catalog_entry)

    version = common.get_stream_version(stream_id, state)
    full_table.sync_table(mssql_conn, config, catalog_entry, state, columns,
                          version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, stream_id, "version")
    state = singer.write_bookmark(
        state, stream_id, "initial_full_table_complete", True)

    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
Exemplo n.º 12
0
def sync_query(config, state, stream):
    """Sync a table through its configured queries and return the row count.

    A new stream version is chosen unless a ``last_evaluated_key`` bookmark
    is present, in which case the bookmarked version is reused. Every item of
    every result from ``query_table`` is deserialized and written as a RECORD
    message. On completion the ``last_evaluated_key`` bookmark is cleared,
    ``initial_full_table_complete`` is set and the version is written.

    Returns:
        Number of items written.
    """
    table_name = stream['tap_stream_id']

    # Read the version before overwriting it, to detect a first run.
    bookmarked_version = singer.get_bookmark(state, table_name, 'version')
    first_run = bookmarked_version is None

    # The last run was interrupted if a last_evaluated_key bookmark remains.
    leftover_key = singer.get_bookmark(state, table_name,
                                       'last_evaluated_key')
    was_interrupted = leftover_key is not None

    # Pick a new table version only if the last run wasn't interrupted.
    stream_version = (bookmarked_version
                      if was_interrupted else int(time.time() * 1000))

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    queries = metadata.get(metadata.to_map(stream['metadata']), (), "queries")

    rows_saved = 0
    deserializer = Deserializer()

    for result in query_table(table_name, queries, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            singer.write_message(
                singer.RecordMessage(stream=table_name,
                                     record=record,
                                     version=stream_version))

        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name,
                                          'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')
    state = singer.write_bookmark(state, table_name,
                                  'initial_full_table_complete', True)
    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
Exemplo n.º 13
0
def sync_table(mssql_conn, config, catalog_entry, state, columns):
    """Incrementally sync one stream using its replication key.

    The ``mssql_conn`` argument is replaced by a new ``MSSQLConnection``
    built from ``config``. The ``replication-key`` from the stream's root
    metadata is compared with the ``replication_key`` bookmark. When they
    match, the bookmarked ``replication_key_value`` is used as an inclusive
    lower bound. Otherwise the bookmark is set to the metadata key and the
    stored value is cleared. Rows are ordered by the replication key whenever
    one is configured.
    """
    # The connection passed in is discarded in favour of a new one.
    mssql_conn = MSSQLConnection(config)

    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    root_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    replication_key_metadata = root_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state, stream_id,
                                                "replication_key")

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(state, stream_id,
                                                    "replication_key_value")
    else:
        replication_key_value = None
        state = singer.write_bookmark(state, stream_id, "replication_key",
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, stream_id,
                                      "replication_key_value")

    stream_version = common.get_stream_version(stream_id, state)
    state = singer.write_bookmark(state, stream_id, "version", stream_version)

    singer.write_message(
        singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                      version=stream_version))
    LOGGER.info("Beginning SQL")

    with connect_with_backoff(mssql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            key_schema = catalog_entry.schema.properties[
                replication_key_metadata]
            if key_schema.format == "date-time":
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE "{}" >= %(replication_key_value)s ORDER BY "{}" ASC'.format(
                replication_key_metadata, replication_key_metadata)
            params["replication_key_value"] = replication_key_value
        elif replication_key_metadata is not None:
            select_sql += ' ORDER BY "{}" ASC'.format(
                replication_key_metadata)

        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, params)
Exemplo n.º 14
0
def do_sync_full_table(mssql_conn, config, catalog_entry, state, columns):
    """Full-table sync of one stream, followed by a STATE message.

    Writes the schema message, syncs the table under the stream version from
    ``common.get_stream_version``, clears the ``version`` bookmark, sets
    ``initial_full_table_complete`` and emits a deep copy of the state.
    ``config`` is accepted but not read in this function.
    """
    stream_id = catalog_entry.tap_stream_id
    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)

    # The result is not used in this function; the call is kept unchanged.
    key_properties = common.get_key_properties(catalog_entry)

    write_schema_message(catalog_entry)

    version = common.get_stream_version(stream_id, state)
    full_table.sync_table(mssql_conn, catalog_entry, state, columns, version)

    singer.clear_bookmark(state, stream_id, 'version')
    state = singer.write_bookmark(state, stream_id,
                                  'initial_full_table_complete', True)

    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
Exemplo n.º 15
0
def sync_table(mysql_conn, config, catalog_entry, state, columns,
               stream_version):
    """Run a full-table sync of one stream and write its version messages.

    Bookmark keys not produced by ``generate_bookmark_keys`` are cleared
    first. The ACTIVATE_VERSION message targets a stream named
    ``<database>_<stream>``. It is written up front only while the initial
    full table sync has not completed (and the state does not hold an
    explicit ``None`` version), and again unconditionally after the query has
    been synced. ``config`` is accepted but not read in this function.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   stream_id, state)

    stream_bookmark = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmark

    full_table_done = singer.get_bookmark(state, stream_id,
                                          'initial_full_table_complete')
    version_in_state = singer.get_bookmark(state, stream_id, 'version')

    destination_stream = '%s_%s' % (common.get_database_name(catalog_entry),
                                    catalog_entry.stream)
    activate_version_message = singer.ActivateVersionMessage(
        stream=destination_stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    version_explicitly_none = has_version_key and version_in_state is None
    if not (full_table_done or version_explicitly_none):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        query_params = {}
        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, query_params)

    # clear max pk value and last pk fetched upon successful sync
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
Exemplo n.º 16
0
def sync_table(connection, catalog_entry, state, columns):
    """Yield the Singer messages of an incremental sync of one stream.

    The first message is an ACTIVATE_VERSION for the stream version; the
    rest are whatever ``common.sync_query`` yields. The effective replication
    key is the bookmarked ``replication_key`` when one is set, otherwise the
    ``replication-key`` from the stream's root metadata. The bookmarked
    ``replication_key_value`` is only used when the metadata key and the
    bookmarked key are equal.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    root_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    replication_key_metadata = root_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state, stream_id,
                                                'replication_key')

    # NOTE(review): a key already stored in state takes precedence over the
    # metadata key - confirm this is intended when the catalog key changes.
    replication_key = replication_key_state or replication_key_metadata

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(state, stream_id,
                                                    'replication_key_value')
    else:
        replication_key_value = None
        state = singer.write_bookmark(state, stream_id, 'replication_key',
                                      replication_key)
        state = singer.clear_bookmark(state, stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(stream_id, state)
    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

    yield singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                        version=stream_version)

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            key_schema = catalog_entry.schema.properties[replication_key]
            if key_schema.format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key, replication_key)
            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select_sql += ' ORDER BY `{}` ASC'.format(replication_key)

        yield from common.sync_query(cursor, catalog_entry, state,
                                     select_sql, columns, stream_version,
                                     params)
Exemplo n.º 17
0
    def get_pages_safe(self, state, bookmarked_cursor, start_time):
        """Yield everything ``get_pages`` yields, clearing the cursor on failure.

        If ``get_pages`` raises ``RuntimeError`` or ``RequestException``, the
        ``cursor`` bookmark is cleared, the state is emitted and the
        exception is re-raised.
        """
        try:
            yield from self.get_pages(bookmarked_cursor, start_time)
        except (RuntimeError, RequestException):
            # NB> If we get a non-retryable error we should delete the
            # pagination cursor bookmark before re-raising the exception.
            stream_id = self.tap_stream_id
            LOGGER.fatal(
                "Received fatal exception during syncing of stream %s, Clearing cursor bookmark.",
                stream_id)

            cleared_state = singer.clear_bookmark(state, stream_id, 'cursor')
            singer.write_state(cleared_state)
            raise
Exemplo n.º 18
0
def clean_state_for_report(config, state, tap_stream_id):
    """Convert a stream-level ``last_report_date`` bookmark to per-view ones.

    When the stream has a truthy ``last_report_date`` bookmark, a bookmark
    ``{'last_report_date': 'YYYY-MM-DD'}`` is written under each view ID from
    ``get_view_ids(config)``, the stream-level bookmark is cleared and the
    state is emitted. Otherwise the state is returned untouched.

    Returns:
        The (possibly updated) state.
    """
    legacy_value = get_bookmark(state, tap_stream_id, 'last_report_date')
    if not legacy_value:
        return state

    legacy_date = utils.strptime_to_utc(legacy_value)
    LOGGER.info("%s - Converting state to multi-profile format.",
                tap_stream_id)

    for view_id in get_view_ids(config):
        per_view_bookmark = {
            'last_report_date': legacy_date.strftime("%Y-%m-%d")
        }
        state = singer.write_bookmark(state, tap_stream_id, view_id,
                                      per_view_bookmark)

    state = singer.clear_bookmark(state, tap_stream_id, 'last_report_date')
    singer.write_state(state)
    return state
Exemplo n.º 19
0
def sync(config, state, catalog):
    """Sync every selected stream of the catalog and emit Singer messages.

    For each selected stream this marks it as currently syncing, writes its
    schema and requests records between the stream's replication-key bookmark
    (falling back to ``config['start_date']``) and the current time. Every
    record is transformed and written. After the last stream,
    ``currently_syncing`` is reset to ``None``.
    """
    client = PeekClient(config['token'])
    partner_id = config['partner_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Staring sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            # Sync window: from the bookmark (or configured start) to now.
            window_start = singer.get_bookmark(state, tap_stream_id,
                                               replication_key,
                                               config['start_date'])
            window_end = singer.utils.strftime(
                singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)

            records = stream_obj.sync(partner_id=partner_id,
                                      start_date=window_start,
                                      end_date=window_end)
            for record in records:
                LOGGER.info(f"Writing record: {record}")
                singer.write_record(
                    tap_stream_id,
                    transformer.transform(record, stream_schema,
                                          stream_metadata),
                )

            # NOTE(review): 'start_date' is cleared here but this function
            # never writes it - confirm which bookmark key is meant.
            state = singer.clear_bookmark(state, tap_stream_id, 'start_date')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Exemplo n.º 20
0
def sync(config, state, catalog):
    """Sync every selected stream of the catalog and emit Singer messages.

    Template: ``CLIENT_CLASS_NAME``, ``CLIENT_PARAMETERS_HERE`` and
    ``BOOKMARK_KEY`` are placeholders to be replaced per tap.

    For each selected stream this marks it as currently syncing, writes its
    schema, transforms and writes every record yielded by the stream object,
    then clears the ``BOOKMARK_KEY`` bookmark and emits state. After the last
    stream, ``currently_syncing`` is reset to ``None``.
    """
    # Any client required PARAMETERS to hit the endpoint
    client = CLIENT_CLASS_NAME(CLIENT_PARAMETERS_HERE)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Staring sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(
                tap_stream_id,
                stream_schema,
                stream_obj.key_properties,
                stream.replication_key
            )

            # The stream object already holds the client created above, so
            # no second client is constructed per stream.
            for record in stream_obj.sync(CLIENT_PARAMETERS_HERE):
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                # Lazy %-formatting: the record is only rendered when the
                # INFO level is actually enabled.
                LOGGER.info("Writing record: %s", transformed_record)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )

            # If there is a Bookmark or state based key to store
            state = singer.clear_bookmark(
                state, tap_stream_id, BOOKMARK_KEY)
            # write_state is given the state value only.
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Exemplo n.º 21
0
    def sync(self, state, stream_schema, stream_metadata, config, transformer):
        """Sync the stream page by page, bookmarking the pagination cursor.

        Records whose ``updated_at`` is below ``config['start_date']`` are
        skipped. After each page the ``cursor`` bookmark is written and the
        state emitted. Once all pages are done the ``cursor`` bookmark is
        cleared.

        Returns:
            The updated state.
        """
        stream_id = self.tap_stream_id
        start_time = config['start_date']
        bookmarked_cursor = singer.get_bookmark(state, stream_id, 'cursor')
        pages = self.get_pages_safe(state, bookmarked_cursor, start_time)

        for page, cursor in pages:
            for record in page:
                if not record['updated_at'] >= start_time:
                    continue
                singer.write_record(
                    stream_id,
                    transformer.transform(record, stream_schema,
                                          stream_metadata),
                )
            # Remember the position after each fully written page.
            singer.write_bookmark(state, stream_id, 'cursor', cursor)
            singer.write_state(state)

        state = singer.clear_bookmark(state, stream_id, 'cursor')
        singer.write_state(state)
        return state
Exemplo n.º 22
0
def sync_collection(client, stream, state, projection):
    """Run a full-table sync of one collection, resuming from bookmarks if present.

    Args:
        client: indexed by database name to obtain the database handle
            (presumably a pymongo client -- confirm against the caller).
        stream: catalog stream dict; the keys 'tap_stream_id', 'metadata',
            'stream' and 'schema' are read here.
        state: Singer state dict; bookmarks for this stream are read from it
            and written back to it.
        projection: passed as the second positional argument to
            ``collection.find`` and included in the query log when truthy.

    All output goes through ``singer.write_message``; nothing is returned.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # Before writing the table version to state, check if we had one to begin with.
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # The last run was interrupted if there is a last_id_fetched bookmark.
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # Reuse the bookmarked version when resuming; otherwise pick a new
    # version from the current time in milliseconds.
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    # A deep copy is emitted so later bookmark writes do not alter this message.
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark: rebuild the typed value from its string form
        # and the bookmarked type name.
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        # No bookmark: ask the helper for the collection's max id.
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined; the value is stored
        # as a string alongside its class name.
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    # Upper bound on _id is max_id_value.  When resuming, the lower bound is
    # inclusive ($gte), so the last fetched document is read again.
    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        # One extraction timestamp is shared by every record of this run.
        time_extracted = utils.now()
        start_time = time.time()

        # Fall back to an empty object schema when the stream carries none.
        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            # A truthy result from row_to_schema triggers a new SCHEMA message
            # (presumably it signals that the row changed `schema` -- confirm
            # in common.row_to_schema).
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            # Accumulate the time spent in the schema step for this stream.
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            # Track the last emitted _id (as string + class name) in state.
            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            # Emit a STATE message every UPDATE_BOOKMARK_PERIOD rows.
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # Clear the max id and last id fetched bookmarks upon successful sync.
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    # Always emitted at the end, in addition to the first-run emission above.
    singer.write_message(activate_version_message)

    LOGGER.info('Syncd {} records for {}'.format(rows_saved, tap_stream_id))
Exemplo n.º 23
0
def _sync_stream(client, stream, transformer, config, state, catalog,
                 **kwargs):
    """Sync a single stream page by page, then each record's substreams.

    Args:
        client: forwarded to ``stream_obj.sync`` and to ``_sync_streams``.
        stream: stream class; instantiated with no arguments here.
        transformer: object exposing ``transform(record, schema, metadata)``.
        config: tap configuration; ``config['start_date']`` is the default
            for the replication-key bookmark.
        state: Singer state dict.
        catalog: catalog providing ``get_stream(tap_stream_id)``.
        **kwargs: ``record`` is forwarded to ``stream_obj.sync``;
            ``substreams`` (a mapping) has its values synced once per
            emitted record via ``_sync_streams``.

    After each page the ``offset`` bookmark (and, for INCREMENTAL streams,
    the replication-key bookmark) is written and state is emitted.  The
    ``offset`` bookmark is cleared once every page has been processed.
    """
    parent_record = kwargs.get('record', None)
    substreams = kwargs.get('substreams')
    tap_stream_id = stream.tap_stream_id

    stream_obj = stream()
    stream_catalog = catalog.get_stream(stream.tap_stream_id)
    replication_key = stream_obj.replication_key
    stream_schema = stream_catalog.schema.to_dict()
    stream_metadata = metadata.to_map(stream_catalog.metadata)
    stream_obj.update_replication_method(
        metadata.get(stream_metadata, (), 'replication-method'))

    LOGGER.debug('Starting sync for stream: %s', tap_stream_id)
    state = singer.set_currently_syncing(state, tap_stream_id)
    singer.write_state(state)

    # The schema is written only the first time a stream id is seen.
    if tap_stream_id not in schemas_written:
        singer.write_schema(tap_stream_id, stream_schema,
                            stream_obj.key_properties, stream.replication_key)
        schemas_written.append(tap_stream_id)

    start_date = singer.get_bookmark(state, tap_stream_id, replication_key,
                                     config['start_date'])
    offset = singer.get_bookmark(state, tap_stream_id, 'offset', 0)

    high_water_mark = start_date
    pages = stream_obj.sync(client,
                            config,
                            state,
                            record=parent_record,
                            start_date=start_date,
                            offset=offset)
    for page, cursor in pages:
        for row in page:
            transformed = transformer.transform(row, stream_schema,
                                                stream_metadata)
            singer.write_record(tap_stream_id,
                                transformed,
                                time_extracted=singer.utils.now())

            if stream_obj.replication_method == 'INCREMENTAL':
                # Track the largest replication value seen so far.
                candidate = deep_get(row, replication_key)
                if candidate and candidate > high_water_mark:
                    high_water_mark = candidate

            if substreams:
                # Child streams are synced with the current row as parent.
                _sync_streams(client,
                              substreams.values(),
                              transformer,
                              config,
                              state,
                              catalog,
                              record=row,
                              start_date=start_date)

        # Page finished: checkpoint the paging position and, when
        # applicable, the replication high-water mark.
        state = singer.write_bookmark(state, tap_stream_id, 'offset', cursor)
        if stream_obj.replication_method == 'INCREMENTAL':
            state = singer.write_bookmark(state, tap_stream_id,
                                          replication_key, high_water_mark)
        singer.write_state(state)

    state = singer.clear_bookmark(state, tap_stream_id, 'offset')
    singer.write_state(state)
Exemplo n.º 24
0
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Full-table sync of one table, bounded by max PK when PKs auto-increment.

    Args:
        mysql_conn: connection object handed to ``connect_with_backoff``.
        catalog_entry: catalog entry of the table being synced.
        state: Singer state dict; bookmarks are read and updated.
        columns: columns to select.
        stream_version: version used for the ACTIVATE_VERSION message and
            forwarded to ``common.sync_query``.

    An ACTIVATE_VERSION message is written up front on the initial
    replication and always once more after the query has been synced.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry), stream_id, state)

    stream_bookmarks = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmarks

    full_table_done = singer.get_bookmark(state, stream_id, 'initial_full_table_complete')
    bookmarked_version = singer.get_bookmark(state, stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message at the
    # beginning so the records show up right away. It is skipped when the
    # initial sync already completed, or when a 'version' key is present in
    # the bookmark but holds None.
    version_is_null = has_version_key and bookmarked_version is None
    if not (full_table_done or version_is_null):
        singer.write_message(activate_version_message)

    auto_incrementing = pks_are_auto_incrementing(mysql_conn, catalog_entry)

    with connect_with_backoff(mysql_conn) as open_conn, open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        if auto_incrementing:
            LOGGER.info("Detected auto-incrementing primary key(s) - will replicate incrementally")

            # Prefer the bookmarked max PK; otherwise look it up now.
            max_pk_values = singer.get_bookmark(state, stream_id, 'max_pk_values')
            if not max_pk_values:
                max_pk_values = get_max_pk_values(cur, catalog_entry)

            if max_pk_values:
                state = singer.write_bookmark(state, stream_id, 'max_pk_values', max_pk_values)
                select_sql += generate_pk_clause(catalog_entry, state)
            else:
                LOGGER.info("No max value for auto-incrementing PK found for table {}".format(catalog_entry.table))

        common.sync_query(cur,
                          catalog_entry,
                          state,
                          select_sql,
                          columns,
                          stream_version,
                          {})

    # Clear max pk value and last pk fetched upon successful sync.
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
Exemplo n.º 25
0
def sync_table(mysql_conn, catalog_entry, state, columns, limit=None):
    """Incrementally sync a table ordered by its replication key.

    Args:
        mysql_conn: connection object handed to ``connect_with_backoff``.
        catalog_entry: catalog entry of the table being synced.
        state: Singer state dict; bookmarks are read and updated.
        columns: columns to select.
        limit: optional row cap per query.  When set, the query is repeated
            until a batch returns fewer rows than ``limit``.

    Each pass re-reads the bookmarks, writes an ACTIVATE_VERSION message and
    runs one query through ``common.sync_query``.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    stream_metadata = metadata.to_map(catalog_entry.metadata).get((), {})

    while True:
        key_column = stream_metadata.get('replication-key')
        bookmarked_column = singer.get_bookmark(state, stream_id, 'replication_key')

        if key_column != bookmarked_column:
            # The replication key differs from the bookmarked one: record
            # the new key and drop the value stored for the old one.
            state = singer.write_bookmark(state, stream_id, 'replication_key', key_column)
            state = singer.clear_bookmark(state, stream_id, 'replication_key_value')
            resume_value = None
        else:
            resume_value = singer.get_bookmark(state, stream_id, 'replication_key_value')

        stream_version = common.get_stream_version(stream_id, state)
        state = singer.write_bookmark(state, stream_id, 'version', stream_version)

        singer.write_message(
            singer.ActivateVersionMessage(
                stream=catalog_entry.stream,
                version=stream_version
            )
        )

        with connect_with_backoff(mysql_conn) as open_conn:
            with open_conn.cursor() as cur:
                select_sql = common.generate_select_sql(catalog_entry, columns)
                params = {}

                if resume_value is not None:
                    # date-time bookmarks are parsed before being bound.
                    if catalog_entry.schema.properties[key_column].format == 'date-time':
                        resume_value = pendulum.parse(resume_value)

                    select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                        key_column,
                        key_column)

                    params['replication_key_value'] = resume_value
                elif key_column is not None:
                    select_sql += ' ORDER BY `{}` ASC'.format(key_column)

                if limit:
                    select_sql += ' LIMIT {}'.format(limit)

                num_rows = common.sync_query(cur,
                                             catalog_entry,
                                             state,
                                             select_sql,
                                             columns,
                                             stream_version,
                                             params)

                # Stop when no limit applies or the batch came back short.
                if limit is None or num_rows < limit:
                    break
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Full-table sync of a collection, resuming an interrupted run when the
    state holds a last_id_fetched bookmark
    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', stream_id)

    # Read these before the version is written to state below.
    previous_version = singer.get_bookmark(state, stream_id, 'version')
    first_run = previous_version is None

    # A last_id_fetched bookmark means the previous run was interrupted:
    # keep its version; otherwise pick a fresh, time-based one.
    last_id_fetched = singer.get_bookmark(state, stream_id, 'last_id_fetched')
    if last_id_fetched is None:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = previous_version

    state = singer.write_bookmark(state, stream_id, 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # On the initial replication the ACTIVATE_VERSION message goes out
    # first so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    bookmarked_max = singer.get_bookmark(state, stream_id, 'max_id_value')
    if bookmarked_max:
        # Rebuild the typed id from its bookmarked string and type name.
        max_id_value = common.string_to_class(
            bookmarked_max,
            singer.get_bookmark(state, stream_id, 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    if max_id_value:
        # Persist the upper bound as a string plus its class name.
        max_id_type = max_id_value.__class__.__name__
        state = singer.write_bookmark(
            state, stream_id, 'max_id_value',
            common.class_to_string(max_id_value, max_id_type))
        state = singer.write_bookmark(state, stream_id, 'max_id_type',
                                      max_id_type)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        # Inclusive lower bound: the last fetched document is read again.
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream_id, 'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream_id, dict(find=find_filter))

    with collection.find({'_id': find_filter},
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for rows_saved, row in enumerate(cursor, start=1):
            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))

            # Remember the id just emitted, as string plus class name.
            row_id_type = row['_id'].__class__.__name__
            state = singer.write_bookmark(
                state, stream_id, 'last_id_fetched',
                common.class_to_string(row['_id'], row_id_type))
            state = singer.write_bookmark(state, stream_id,
                                          'last_id_fetched_type', row_id_type)

            # Emit a STATE message every UPDATE_BOOKMARK_PERIOD rows.
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream_id] += rows_saved
        common.TIMES[stream_id] += time.time() - start_time

    # The resume bookmarks are dropped once the sync has succeeded.
    for finished_key in ('max_id_value', 'max_id_type', 'last_id_fetched',
                         'last_id_fetched_type'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_bookmark(state, stream_id, 'initial_full_table_complete',
                          True)

    singer.write_message(activate_version_message)

    LOGGER.info('Syncd %s records for %s', rows_saved, stream_id)
Exemplo n.º 27
0
def sync(config, state, catalog):
    """Sync every selected stream of the catalog and emit Singer messages.

    Args:
        config: tap configuration; passed to ``SquareClient`` and read for
            ``config['start_date']`` as the default start bookmark.
        state: Singer state dict; bookmarks are read and updated.
        catalog: catalog providing ``get_selected_streams(state)``.

    Three paths exist per stream: the 'shifts' stream, streams whose
    ``replication_method`` is 'INCREMENTAL', and everything else.
    Nothing is returned.
    """
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            # Stream implementations are looked up by id in STREAMS.
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Staring sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            # Start from the replication-key bookmark, or the configured
            # start date when no bookmark exists.
            start_time = singer.get_bookmark(state, tap_stream_id,
                                             replication_key,
                                             config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id,
                                                    'cursor')

            if tap_stream_id == 'shifts':
                replication_key = stream_obj.replication_key

                # Reuse a stored 'sync_start' bookmark if there is one;
                # otherwise use the current time, formatted as a string.
                sync_start_bookmark = singer.get_bookmark(
                    state, tap_stream_id, 'sync_start',
                    singer.utils.strftime(
                        singer.utils.now(),
                        format_str=singer.utils.DATETIME_PARSE))
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    sync_start_bookmark,
                )
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        # Records below the start bookmark are filtered out
                        # here, on the tap side.
                        if record[replication_key] >= start_time:
                            transformed_record = transformer.transform(
                                record,
                                stream_schema,
                                stream_metadata,
                            )
                            singer.write_record(
                                tap_stream_id,
                                transformed_record,
                            )
                    # Checkpoint the paging cursor after every page.
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    singer.write_state(state)

                # All pages done: the sync start time becomes the new
                # replication-key bookmark and 'sync_start' is removed.
                state = singer.clear_bookmark(state, tap_stream_id,
                                              'sync_start')
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    replication_key,
                    sync_start_bookmark,
                )
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                replication_key = stream_obj.replication_key
                max_record_value = start_time
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        transformed_record = transformer.transform(
                            record, stream_schema, stream_metadata)
                        singer.write_record(
                            tap_stream_id,
                            transformed_record,
                        )
                        # NOTE(review): the comparison reads the raw record
                        # while the stored value comes from the transformed
                        # record -- confirm this asymmetry is intended.
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[
                                replication_key]

                    # Checkpoint both the paging cursor and the largest
                    # replication value seen so far after every page.
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  replication_key,
                                                  max_record_value)
                    singer.write_state(state)

            else:
                # Remaining streams: sync() is iterated record by record and
                # no bookmark is written inside the loop.
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata)
                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )
            # The paging cursor is cleared once the stream has finished.
            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    # Every stream is done: unset currently_syncing and emit final state.
    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Exemplo n.º 28
0
def generate_messages(con, config, catalog, state):
    """Yield Singer messages for every stream of the resolved catalog.

    Args:
        con: database connection, forwarded to the helper and sync functions.
        config: tap configuration; only forwarded to ``binlog.sync_table``.
        catalog: input catalog; replaced by ``resolve_catalog(con, catalog,
            state)`` before iteration.
        state: Singer state dict; bookmarks are read and updated.

    Yields:
        STATE and SCHEMA messages created here, plus every message yielded
        by the ``sync_table`` function of the selected replication method.

    Raises:
        Exception: for a LOG_BASED view, or for a replication method other
            than INCREMENTAL, LOG_BASED or FULL_TABLE.
    """
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        # Stream-level metadata is stored under the empty breadcrumb ().
        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        # Views and tables keep their key properties under different keys.
        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [replication_key])

                for message in incremental.sync_table(con, catalog_entry,
                                                      state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication",
                            catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                # Both binlog bookmarks present: continue from the binlog.
                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(
                        catalog_entry, columns)

                    for message in binlog.sync_table(con, config,
                                                     catalog_entry, state,
                                                     columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    # The binlog position is fetched BEFORE the full-table
                    # sync and bookmarked only after it has finished.
                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(
                        con)

                    stream_version = common.get_stream_version(
                        catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version', stream_version)

                    for message in full_table.sync_table(
                            con, catalog_entry, state, columns,
                            stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file', log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                stream_version = common.get_stream_version(
                    catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Exemplo n.º 29
0
def sync_table(client, stream, state, stream_version, columns):
    """Replicate one collection in full, emitting Singer messages.

    Documents are read in descending ``_id`` order, bounded above by the
    ``max_id_value`` bookmark (captured on the first run of a sync). The
    ``last_id_fetched`` bookmark is updated after every record so that an
    interrupted sync can resume with the documents it has not emitted yet.
    Both bookmarks are cleared once the cursor is exhausted.

    Args:
        client: Database client, indexed by database name and then by
            collection name.
        stream: Catalog entry dict providing ``tap_stream_id``, ``stream``
            and ``metadata``.
        state: Singer state dict; bookmarks are read from and written to it.
        stream_version: Version attached to each record and to the
            ACTIVATE_VERSION message.
        columns: Container of field names; only these keys are kept from
            each document.
    """
    tap_stream_id = stream['tap_stream_id']

    common.whitelist_bookmark_keys(generate_bookmark_keys(stream),
                                   tap_stream_id, state)

    mdata = metadata.to_map(stream['metadata'])
    stream_metadata = mdata.get(())

    database_name = stream_metadata['database-name']

    db = client[database_name]
    collection = db[stream['stream']]

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['stream'], version=stream_version)

    initial_full_table_complete = singer.get_bookmark(
        state, tap_stream_id, 'initial_full_table_complete')

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete:
        singer.write_message(activate_version_message)

    # Reuse the upper bound of an interrupted sync if there is one, so the
    # resumed run covers the same id range as the run it continues.
    max_id_value = singer.get_bookmark(
        state, tap_stream_id,
        'max_id_value') or get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, tap_stream_id,
                                          'last_id_fetched')

    state = singer.write_bookmark(state, tap_stream_id,
                                  'max_id_value', max_id_value)

    find_filter = {'$lte': objectid.ObjectId(max_id_value)}

    if last_id_fetched:
        # The cursor walks _id values downwards, so everything at or above
        # last_id_fetched has already been emitted; the remaining documents
        # are the ones strictly below it.
        find_filter['$lt'] = objectid.ObjectId(last_id_fetched)

    LOGGER.info("Starting full table replication for table %s.%s",
                database_name, stream['stream'])

    with metrics.record_counter(None) as counter:
        with collection.find({'_id': find_filter},
                             sort=[("_id", pymongo.DESCENDING)]) as cursor:
            rows_saved = 0

            time_extracted = utils.now()

            for row in cursor:
                rows_saved += 1

                # Keep only the selected fields of the document.
                whitelisted_row = {
                    k: v
                    for k, v in row.items() if k in columns
                }
                record_message = common.row_to_singer_record(
                    stream, whitelisted_row, stream_version, time_extracted)

                singer.write_message(record_message)
                # Keep the record-count metric in step with what was emitted.
                counter.increment()

                state = singer.write_bookmark(state, tap_stream_id,
                                              'last_id_fetched',
                                              str(row['_id']))

                # Checkpoint periodically so an interrupted run can resume.
                if rows_saved % 1000 == 0:
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, tap_stream_id, 'max_id_value')
    singer.clear_bookmark(state, tap_stream_id, 'last_id_fetched')

    singer.write_message(activate_version_message)