Example #1
def sync(config, state, catalog):
    for catalog_entry in catalog.streams:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

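        # Emit a state message to indicate that we've started this stream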
        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)
        _emit(singer.StateMessage(value=state))
        if catalog_entry.is_view:
            key_properties = metadata.to_map(catalog_entry.metadata).get(
                (), {}).get('view-key-properties', [])
        else:
            key_properties = metadata.to_map(catalog_entry.metadata).get(
                (), {}).get('table-key-properties', [])

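        # Emit a SCHEMA message before we sync any records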
        _emit(
            singer.SchemaMessage(stream=catalog_entry.stream,
                                 schema=catalog_entry.schema.to_dict(),
                                 key_properties=key_properties,
                                 bookmark_properties=replication_key))
        with metrics.job_timer("sync_table") as timer:
            timer.tags["schema"] = catalog_entry.database
            timer.tags["table"] = catalog_entry.table
            _sync_table(config, state, catalog_entry)
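
    # Finished with all streams: clear currently_syncing and emit a final state message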
    state = singer.set_currently_syncing(state, None)
    _emit(singer.StateMessage(value=state))
Example #2
    def wait_for_export(self, stream_type, export_id):
        # Poll the export status until it reaches a finalized state or
        # the job timeout elapses.
        with metrics.job_timer('Export {} for {}'.format(
                export_id, stream_type)):
            timeout_time = pendulum.utcnow().add(seconds=self.job_timeout)
            while pendulum.utcnow() < timeout_time:
                status = self.poll_export(stream_type, export_id)
                singer.log_info("export %s status is %s", export_id, status)

                if status == "Created":
                    # If the status is created, the export has been made but
                    # not started, so enqueue the export.
                    self.enqueue_export(stream_type, export_id)

                elif status in ["Cancelled", "Failed"]:
                    # Cancelled and failed exports fail the current sync.
                    raise ExportFailed(status)

                elif status == "Completed":
                    return True

                time.sleep(self.poll_interval)

        raise ExportFailed("Export timed out after {} minutes".format(
            self.job_timeout / 60))
Example #3
def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):
    if binlog_catalog.streams:
        for stream in binlog_catalog.streams:
            write_schema_message(stream)

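        # Time the binlog sync across all selected streams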
        with metrics.job_timer('sync_binlog') as timer:
            binlog.sync_binlog_stream(mysql_conn, config, binlog_catalog.streams, state)
Example #4
async def poll_report(client, account_id, report_name, start_date, end_date, request_id):
    download_url = None
    with metrics.job_timer('generate_report'):
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job {}/{} - {} - from {} to {}'.format(
                i,
                MAX_NUM_REPORT_POLLS,
                report_name,
                start_date,
                end_date))
            response = client.PollGenerateReport(request_id)
            if response.Status == 'Error':
                LOGGER.warning(
                        'Error polling {} for account {} with request id {}'
                        .format(report_name, account_id, request_id))
                return False, None
            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info('No results for report: {} - from {} to {}'.format(
                        report_name,
                        start_date,
                        end_date))
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info('Generating report timed out: {} - from {} to {}'.format(
                        report_name,
                        start_date,
                        end_date))
            else:
                await asyncio.sleep(REPORT_POLL_SLEEP)

    return True, download_url
Example #5
async def poll_report(client, account_id, report_name, start_date, end_date,
                      request_id):
    # Get download_url of generated report
    download_url = None
    with metrics.job_timer('generate_report'):
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job %s/%s - %s - from %s to %s', i,
                        MAX_NUM_REPORT_POLLS, report_name, start_date,
                        end_date)
            # Backoff does not work directly inside an async method, so a separate helper method handles the retries.
            response = generate_poll_report(client, request_id)
            if response.Status == 'Error':
                LOGGER.warning(
                    'Error polling %s for account %s with request id %s',
                    report_name, account_id, request_id)
                return False, None
            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info('No results for report: %s - from %s to %s',
                                report_name, start_date, end_date)
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info('Generating report timed out: %s - from %s to %s',
                            report_name, start_date, end_date)
            else:
                await asyncio.sleep(REPORT_POLL_SLEEP)

    return True, download_url
Example #6
def sync_stream(mssql_conn, catalog, config, state):
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

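        # Emit a state message to indicate that we've started this stream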
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            if replication_method == 'INCREMENTAL':
                optional_limit = config.get('incremental_limit')
                # do_sync_incremental(mssql_conn, catalog_entry, state, columns, optional_limit)
            elif replication_method == 'LOG_BASED':
                pass
                # do_sync_historical_binlog(mssql_conn, config, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mssql_conn, config, catalog_entry, state,
                                   columns)
Example #7
def generate_messages(conn, db_schema, catalog, state):
    catalog = resolve.resolve_catalog(discover_catalog(conn, db_schema),
                                      catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)
        catalog_md = metadata.to_map(catalog_entry.metadata)

        if catalog_md.get((), {}).get('is-view'):
            key_properties = catalog_md.get((), {}).get('view-key-properties')
        else:
            key_properties = catalog_md.get((), {}).get('table-key-properties')
        bookmark_properties = catalog_md.get((), {}).get('replication-key')

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(stream=catalog_entry.stream,
                                   schema=catalog_entry.schema.to_dict(),
                                   key_properties=key_properties,
                                   bookmark_properties=bookmark_properties)

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

    # If we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Example #8
    def __iter__(self):
        for params in self.job_params():
            with metrics.job_timer('insights'):
                job = self.run_job(params)

            min_date_start_for_job = None
            count = 0
            for obj in job.get_result():
                count += 1
                rec = obj.export_all_data()
                if not min_date_start_for_job or rec[
                        'date_stop'] < min_date_start_for_job:
                    min_date_start_for_job = rec['date_stop']
                yield {'record': rec}
            LOGGER.info('Got %d results for insights job', count)

            # when min_date_start_for_job stays None, we should
            # still update the bookmark using 'until' in time_ranges
            if min_date_start_for_job is None:
                for time_range in params['time_ranges']:
                    if time_range['until']:
                        min_date_start_for_job = time_range['until']
            yield {
                'state':
                advance_bookmark(self, self.bookmark_key,
                                 min_date_start_for_job)
            }  # pylint: disable=no-member
Example #9
def poll_report(client, report_name, start_date, end_date, request_id):
    download_url = None
    with metrics.job_timer('generate_report'):
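        # Poll until the report completes, fails, or MAX_NUM_REPORT_POLLS is reached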
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job {}/{} - {} - from {} to {}'.format(
                i, MAX_NUM_REPORT_POLLS, report_name, start_date, end_date))
            response = client.PollGenerateReport(request_id)
            if response.Status == 'Error':
                raise Exception('Error running {} report'.format(report_name))
            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info(
                        'No results for report: {} - from {} to {}'.format(
                            report_name, start_date, end_date))
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info(
                    'Generating report timed out: {} - from {} to {}'.format(
                        report_name, start_date, end_date))
            else:
                time.sleep(REPORT_POLL_SLEEP)

    return download_url
Example #10
def sync_oplog_streams(client, streams, state):
    if streams:
        for stream in streams:
            write_schema_message(stream)

        with metrics.job_timer('sync_oplog') as timer:
            oplog.sync_oplog_stream(client, streams, state)
Example #11
def __sync_endpoint(req_state):
    # Top level variables
    endpoint_total = 0

    with metrics.job_timer('endpoint_duration'):

        LOGGER.info('{}: STARTED Syncing stream'.format(req_state.stream_name))
        singer_ops.update_currently_syncing(req_state.state,
                                            req_state.stream_name)

        # Publish schema to singer
        singer_ops.write_schema(req_state.catalog, req_state.stream_name)
        LOGGER.info('{}: Processing date window, {} to {}'.format(
            req_state.stream_name, req_state.last_date, req_state.end_date))

        if req_state.stream_name in ALL_RECORDS_STREAMS:
            endpoint_total = __process_all_records_data_stream(req_state)

        elif req_state.stream_name == 'periodic_data_standardized':
            endpoint_total = __process_standardized_data_stream(req_state)

        elif req_state.stream_name == 'periodic_data_calculated':
            endpoint_total = __process_periodic_data_calcs(req_state)

        else:
            # data_items, investment_transactions
            endpoint_total = __process_incremental_stream(req_state)

        singer_ops.update_currently_syncing(req_state.state, None)
        LOGGER.info('{}: FINISHED Syncing Stream, total_records: {}'.format(
            req_state.stream_name, endpoint_total))

    LOGGER.info('sync.py: sync complete')

    return endpoint_total
Example #12
def sync_non_binlog_streams(mssql_conn, non_binlog_catalog, config, state):
    mssql_conn = MSSQLConnection(config)

    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                "There are no columns selected for stream %s, skipping it.",
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get("replication-method")
        replication_key = md_map.get((), {}).get("replication-key")
        primary_keys = md_map.get((), {}).get("table-key-properties")
        LOGGER.info(
            f"Table {catalog_entry.table} proposes {replication_method} sync")
        if replication_method == "INCREMENTAL" and not replication_key:
            LOGGER.info(
                f"No replication key for {catalog_entry.table}, using full table replication"
            )
            replication_method = "FULL_TABLE"
        if replication_method == "INCREMENTAL" and not primary_keys:
            LOGGER.info(
                f"No primary key for {catalog_entry.table}, using full table replication"
            )
            replication_method = "FULL_TABLE"
        LOGGER.info(
            f"Table {catalog_entry.table} will use {replication_method} sync")

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer("sync_table") as timer:
            timer.tags["database"] = database_name
            timer.tags["table"] = catalog_entry.table

            if replication_method == "INCREMENTAL":
                LOGGER.info(f"syncing {catalog_entry.table} incrementally")
                do_sync_incremental(mssql_conn, config, catalog_entry, state,
                                    columns)
            elif replication_method == "FULL_TABLE":
                LOGGER.info(f"syncing {catalog_entry.table} full table")
                do_sync_full_table(mssql_conn, config, catalog_entry, state,
                                   columns)
            else:
                raise Exception(
                    "only INCREMENTAL and FULL TABLE replication methods are supported"
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #13
def do_sync_incremental(con, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)
    key_properties = get_key_properties(catalog_entry)

    if not catalog_entry.replication_key:
        raise Exception("Cannot use INCREMENTAL replication for table ({}) without a replication key.".format(catalog_entry.stream))

    singer.write_schema(catalog_entry.stream,
                        catalog_entry.schema.to_dict(),
                        key_properties,
                        [catalog_entry.replication_key])

    with metrics.job_timer('sync_table') as timer:
        timer.tags['table'] = catalog_entry.table
        sync_incremental.sync_table(con, catalog_entry, state, columns)
Example #14
def generate_messages(con, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'replication_key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=key_properties,
            bookmark_properties=replication_key
        )

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                for message in incremental.sync_table(con, catalog_entry, state):
                    yield message
            elif replication_method == 'FULL_TABLE':
                for message in full_table.sync_table(con, catalog_entry, state):
                    yield message
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Example #15
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        select_clause = stream_metadata.get('custom-select-clause')

        if not select_clause:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                stream['tap_stream_id'])
            continue

        columns = [c.strip(' ') for c in select_clause.split(',')]
        columns.append('_id')

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')

        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, columns)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(
                    stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version,
                                      columns)

                state = singer.write_bookmark(state, stream['tap_stream_id'],
                                              'initial_full_table_complete',
                                              True)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported (you passed {replication_method})"
                )

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #16
def sync_non_binlog_streams(mysql_conn,
                            non_binlog_catalog,
                            config,
                            state,
                            original_state_file=''):
    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            log_engine(mysql_conn, catalog_entry)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(mysql_conn, catalog_entry, state, columns,
                                    original_state_file)
            elif replication_method == 'LOG_BASED':
                do_sync_historical_binlog(mysql_conn, config, catalog_entry,
                                          state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mysql_conn, config, catalog_entry, state,
                                   columns)
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #17
def sync_traditional_stream(client: MongoClient, stream: Dict, state: Dict):
    """
    Sync given stream
    Args:
        client: MongoDB client instance
        stream: stream to sync
        state: state
    """
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')

    if replication_method not in {INCREMENTAL_METHOD, FULL_TABLE_METHOD}:
        raise InvalidReplicationMethodException(replication_method,
                                                'replication method needs to be either FULL_TABLE or INCREMENTAL')

    database_name = metadata.get(md_map, (), 'database-name')

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        collection = client[database_name][stream["table_name"]]

        if replication_method == 'FULL_TABLE':
            full_table.sync_collection(collection, stream, state)
        else:
            incremental.sync_collection(collection, stream, state)

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #18
def sync_streams(snowflake_conn, catalog, state):
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method',
                                                    "FULL_TABLE")

        database_name = common.get_database_name(catalog_entry)
        schema_name = common.get_schema_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            LOGGER.info('Beginning to sync %s.%s.%s', database_name,
                        schema_name, catalog_entry.table)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(snowflake_conn, catalog_entry, state,
                                    columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(snowflake_conn, catalog_entry, state,
                                   columns)
            else:
                raise Exception(
                    'Only INCREMENTAL and FULL TABLE replication methods are supported'
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #19
def sync_log_based_streams(client: MongoClient,
                           log_based_streams: List[Dict],
                           database_name: str,
                           state: Dict,
                           update_buffer_size: Optional[int],
                           await_time_ms: Optional[int]
                           ):
    """
    Sync log_based streams all at once by listening for database-level change stream events.
    Args:
        client: MongoDB client instance
        log_based_streams:  list of streams to sync
        database_name: name of the database to sync from
        state: state dictionary
        update_buffer_size: the size of buffer used to hold detected updates
        await_time_ms: the maximum time in milliseconds for the log-based sync to wait for changes before exiting
    """
    if not log_based_streams:
        return

    streams = streams_list_to_dict(log_based_streams)

    for tap_stream_id, stream in streams.items():
        common.COUNTS[tap_stream_id] = 0
        common.TIMES[tap_stream_id] = 0
        common.SCHEMA_COUNT[tap_stream_id] = 0
        common.SCHEMA_TIMES[tap_stream_id] = 0

        state = clear_state_on_replication_change(stream, state)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        write_schema_message(stream)
        common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        update_buffer_size = update_buffer_size or change_streams.MIN_UPDATE_BUFFER_LENGTH
        await_time_ms = await_time_ms or change_streams.DEFAULT_AWAIT_TIME_MS

        change_streams.sync_database(client[database_name], streams, state, update_buffer_size, await_time_ms)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #20
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
Example #21
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:

        if not catalog_entry.is_selected():
            continue

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties,
            bookmark_properties=[REPLICATION_KEY]
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
Example #22
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        blacklisted_fields = stream_metadata.get('blacklisted-fields')
        blacklist = [c.strip(' ') for c in blacklisted_fields.split(',')]
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')

        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, blacklist)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(
                    stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version,
                                      blacklist)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported (you passed {replication_method})"
                )

            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'initial_full_table_complete', True)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #23
def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [replication_key])

                for message in incremental.sync_table(con, catalog_entry,
                                                      state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication",
                            catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(
                        catalog_entry, columns)

                    for message in binlog.sync_table(con, config,
                                                     catalog_entry, state,
                                                     columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(
                        con)

                    stream_version = common.get_stream_version(
                        catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version', stream_version)

                    for message in full_table.sync_table(
                            con, catalog_entry, state, columns,
                            stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file', log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                stream_version = common.get_stream_version(
                    catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Example #24
def sync_bulk_obj(client,
                  catalog,
                  state,
                  start_date,
                  stream_name,
                  bulk_page_size,
                  activity_type=None):
    LOGGER.info('{} - Starting export'.format(stream_name))

    stream = catalog.get_stream(stream_name)
    if activity_type:
        updated_at_field = 'CreatedAt'
    else:
        updated_at_field = 'UpdatedAt'

    last_bookmark = get_bulk_bookmark(state, stream_name)
    last_date_raw = last_bookmark.get('datetime', start_date)
    last_date = pendulum.parse(last_date_raw).to_datetime_string()
    last_sync_id = last_bookmark.get('sync_id')
    last_offset = last_bookmark.get('offset')

    if last_sync_id:
        LOGGER.info('{} - Resuming previous export: {}'.format(
            stream_name, last_sync_id))
        try:
            last_date = stream_export(client,
                                      state,
                                      catalog,
                                      stream_name,
                                      last_sync_id,
                                      updated_at_field,
                                      bulk_page_size,
                                      last_date,
                                      offset=last_offset)
        except HTTPError as e:
            if e.response.status_code in [404, 410]:
                LOGGER.info('{} - Previous export expired: {}'.format(
                    stream_name, last_sync_id))
            else:
                raise

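    # Collect the selected fields for the export from the stream metadata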
    fields = {}
    obj_meta = None
    for meta in stream.metadata:
        if not meta['breadcrumb']:
            obj_meta = meta['metadata']
        elif meta['metadata'].get('selected', True) or \
             meta['metadata'].get('inclusion', 'available') == 'automatic':
            field_name = meta['breadcrumb'][1]
            fields[field_name] = meta['metadata']['tap-eloqua.statement']

    num_fields = len(fields.values())
    if num_fields > 250:
        LOGGER.error(
            '{} - Exports can only have 250 fields selected. {} are selected.'.
            format(stream_name, num_fields))
    else:
        LOGGER.info('{} - Syncing {} fields'.format(stream_name, num_fields))

    language_obj = obj_meta['tap-eloqua.query-language-name']

    _filter = "'{{" + language_obj + "." + updated_at_field + "}}' >= '" + last_date + "'"

    if activity_type is not None:
        _filter += " AND '{{Activity.Type}}' = '" + activity_type + "'"

    params = {
        'name': 'Singer Sync - ' + datetime.utcnow().isoformat(),
        'fields': fields,
        'filter': _filter,
        'areSystemTimestampsInUTC': True
    }

    if activity_type:
        url_obj = 'activities'
    elif obj_meta['tap-eloqua.id']:
        url_obj = 'customObjects/' + obj_meta['tap-eloqua.id']
    else:
        url_obj = stream_name

    with metrics.job_timer('bulk_export'):
        data = client.post('/api/bulk/2.0/{}/exports'.format(url_obj),
                           json=params,
                           endpoint='export_create_def')

        data = client.post('/api/bulk/2.0/syncs',
                           json={'syncedInstanceUri': data['uri']},
                           endpoint='export_create_sync')

        sync_id = re.match(r'/syncs/([0-9]+)', data['uri']).groups()[0]

        LOGGER.info('{} - Created export - {}'.format(stream_name, sync_id))

        sleep = 0
        start_time = time.time()
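        # Poll the sync status until success, a terminal failure, or the retry deadline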
        while True:
            data = client.get('/api/bulk/2.0/syncs/{}'.format(sync_id),
                              endpoint='export_sync_poll')

            status = data['status']
            if status == 'success':
                break
            elif status not in ['pending', 'active']:
                message = '{} - status: {}, exporting failed'.format(
                    stream_name, status)
                LOGGER.error(message)
                raise Exception(message)
            elif (time.time() - start_time) > MAX_RETRY_ELAPSED_TIME:
                message = '{} - export deadline exceeded ({} secs)'.format(
                    stream_name, MAX_RETRY_ELAPSED_TIME)
                LOGGER.error(message)
                raise Exception(message)

            sleep = next_sleep_interval(sleep)
            LOGGER.info('{} - status: {}, sleeping for {} seconds'.format(
                stream_name, status, sleep))
            time.sleep(sleep)

    stream_export(client, state, catalog, stream_name, sync_id,
                  updated_at_field, bulk_page_size, last_date)
Example #25
    def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata):
        """
        Write all `table_batch`s associated with the given `schema` and `records` to remote.

        :param connection: remote connection, type left to be determined by implementing class
        :param root_table_name: string
        :param schema: SingerStreamSchema
        :param key_properties: [string, ...]
        :param records: [{...}, ...]
        :param metadata: additional metadata needed by implementing class
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        with self._set_timer_tags(metrics.job_timer(),
                                  'batch',
                                  (root_table_name,)):
            with self._set_counter_tags(metrics.record_counter(None),
                                        'batch_rows_persisted',
                                        (root_table_name,)) as batch_counter:
                self.LOGGER.info('Writing batch with {} records for `{}` with `key_properties`: `{}`'.format(
                    len(records),
                    root_table_name,
                    key_properties
                ))

                for table_batch in denest.to_table_batches(schema, key_properties, records):
                    table_batch['streamed_schema']['path'] = (root_table_name,) + \
                                                             table_batch['streamed_schema']['path']

                    with self._set_timer_tags(metrics.job_timer(),
                                              'table',
                                              table_batch['streamed_schema']['path']) as table_batch_timer:
                        with self._set_counter_tags(metrics.record_counter(None),
                                                    'table_rows_persisted',
                                                    table_batch['streamed_schema']['path']) as table_batch_counter:
                            self.LOGGER.info('Writing table batch schema for `{}`...'.format(
                                table_batch['streamed_schema']['path']
                            ))

                            remote_schema = self.upsert_table_helper(connection,
                                                                     table_batch['streamed_schema'],
                                                                     metadata)

                            self._set_metrics_tags__table(table_batch_timer, remote_schema['name'])
                            self._set_metrics_tags__table(table_batch_counter, remote_schema['name'])

                            self.LOGGER.info('Writing table batch with {} rows for `{}`...'.format(
                                len(table_batch['records']),
                                table_batch['streamed_schema']['path']
                            ))

                            batch_rows_persisted = self.write_table_batch(
                                connection,
                                {'remote_schema': remote_schema,
                                 'records': self._serialize_table_records(remote_schema,
                                                                          table_batch['streamed_schema'],
                                                                          table_batch['records'])},
                                metadata)

                            table_batch_counter.increment(batch_rows_persisted)
                            batch_counter.increment(batch_rows_persisted)

                return {
                    'records_persisted': len(records),
                    'rows_persisted': batch_counter.value
                }
Example #26
    def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True, set to false to disable logging of table level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(),
                                  'upsert_table_schema',
                                  table_path) as timer:

            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path, _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name, schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable, types
            column_paths_seen = set()
            single_type_columns = []

            for column_path, column_schema in schema['schema']['properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append((column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            for m in mappings:
                if not m['from'] in column_paths_seen:
                    single_type_columns.append((m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(column_schema)

                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                                table_name,
                                column_path,
                                canonicalized_column_name,
                                msg,
                                _duration_millis(upsert_table_helper__start__column)))

                ## NEW COLUMN
                if not column_path in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"
                    ### NON EMPTY TABLE
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                                column_path,
                                table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    column_schema)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                    continue
                ### NULLABLE SCHEMAS MATCH
                ###  New column _is not_ nullable, existing column _is_
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                    continue

                ### NULL COMPATIBILITY
                ###  New column _is_ nullable, existing column is _not_
                non_null_original_column = [m for m in mappings if
                                            m['from'] == column_path and json_schema.shorthand(
                                                m) == json_schema.shorthand(column_schema)]
                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection,
                                              table_name,
                                              canonicalized_column_name)
                    self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    mappings = [m for m in mappings if not (m['from'] == column_path and json_schema.shorthand(
                        m) == json_schema.shorthand(column_schema))]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ###  New column matches existing column path, but the types are incompatible
                duplicate_paths = [m for m in mappings if m['from'] == column_path]

                if 1 == len(duplicate_paths):
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name, existing_column_name)

                    ## Update existing properties
                    mappings = [m for m in mappings if m['from'] != column_path]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                               existing_mapping,
                                                                                               mappings)

                    mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            existing_column_new_normalized_name,
                                            json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(connection,
                                    table_name,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection,
                                        table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection,
                                     table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name
                    )

                ## REST MULTI TYPE
                elif 1 < len(duplicate_paths):
                    ## Add new column
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path
                    )

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                            column_path,
                            canonicalized_column_name,
                            table_name
                        ))

                log_message(upsert_table_helper__column)

            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)
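
The splitting branch above depends on a migrate_column helper to copy values out of the incompatibly typed column before that column is dropped. A minimal sketch of what such a helper might look like for a PostgreSQL-backed target, assuming a psycopg2-style connection (the SQL and identifiers here are assumptions, not the target's actual implementation):

from psycopg2 import sql

def migrate_column(connection, table_name, from_column, to_column):
    # Copy every value from the old, incompatibly typed column into the
    # newly added nullable column; the caller drops the old column afterwards,
    # so the copy must happen first.
    with connection.cursor() as cur:
        cur.execute(sql.SQL('UPDATE {table} SET {to_col} = {from_col}').format(
            table=sql.Identifier(table_name),
            to_col=sql.Identifier(to_column),
            from_col=sql.Identifier(from_column)))
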
Example #27
0
def sync_bulk_obj(client, catalog, state, start_date, stream_name, bulk_page_size, activity_type=None, end_date=None):
    LOGGER.info('{} - Starting export'.format(stream_name))

    stream = catalog.get_stream(stream_name)
    if activity_type:
        updated_at_field_name = 'ActivityDate'
        updated_at_field = 'CreatedAt'
    else:
        updated_at_field = 'UpdatedAt'
        updated_at_field_name = updated_at_field

    last_bookmark = get_bulk_bookmark(state, stream_name)
    last_date_raw = last_bookmark.get('datetime', start_date)
    last_date = pendulum.parse(last_date_raw).to_datetime_string()
    last_sync_id = last_bookmark.get('sync_id')
    last_offset = last_bookmark.get('offset')

    if last_sync_id:
        LOGGER.info('{} - Resuming previous export: {}'.format(stream_name, last_sync_id))
        try:
            last_date = stream_export(client,
                                      state,
                                      catalog,
                                      stream_name,
                                      last_sync_id,
                                      updated_at_field_name,
                                      bulk_page_size,
                                      last_date,
                                      offset=last_offset)
        except HTTPError as e:
            if e.response.status_code in [404, 410]:
                LOGGER.info('{} - Previous export expired: {}'.format(stream_name, last_sync_id))
            else:
                raise

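    # Build the export field list from the Singer catalog metadata: the
    # stream-level entry (empty breadcrumb) carries the object's metadata,
    # while field-level entries are included when selected (default True)
    # or marked as automatic.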
    fields = {}
    obj_meta = None
    for meta in stream.metadata:
        if not meta['breadcrumb']:
            obj_meta = meta['metadata']
        elif meta['metadata'].get('selected', True) or \
             meta['metadata'].get('inclusion', 'available') == 'automatic':
            field_name = meta['breadcrumb'][1]
            fields[field_name] = meta['metadata']['tap-eloqua.statement']

    num_fields = len(fields)
    if num_fields > 250:
        raise Exception('{} - Exports can only have 250 fields selected. {} are selected.'.format(
            stream_name, num_fields))

    LOGGER.info('{} - Syncing {} fields'.format(stream_name, num_fields))

    language_obj = obj_meta['tap-eloqua.query-language-name']

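    # Build the Eloqua Bulk API filter expression over the bookmark field,
    # e.g. "'{{<object>.UpdatedAt}}' >= '<last synced datetime>'".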
    _filter = "'{{" + language_obj + "." + updated_at_field + "}}' >= '" + last_date + "'"

    if end_date:
        _filter += " AND '{{" + language_obj + "." + updated_at_field + "}}' < '" + end_date.to_datetime_string() + "'"

    if activity_type is not None:
        _filter += " AND '{{Activity.Type}}' = '" + activity_type + "'"
        # NB: We observed shuffled data when Activity.CreatedAt was specified twice in the query.
        #     The key 'CreatedAt' is synthetic, so add it in after the export. (PR #19)
        fields.pop('CreatedAt', None)

    params = {
        'name': 'Singer Sync - ' + datetime.utcnow().isoformat(),
        'fields': fields,
        'filter': _filter,
        'areSystemTimestampsInUTC': True,
        'autoDeleteDuration': 'P3D',
        'dataRetentionDuration': 'P3D'  # 3 days in ISO-8601 duration notation; can be increased up to 14 days
    }

    if activity_type:
        url_obj = 'activities'
    elif obj_meta['tap-eloqua.id']:
        url_obj = 'customObjects/' + obj_meta['tap-eloqua.id']
    else:
        url_obj = stream_name

    with metrics.job_timer('bulk_export'):
        log_string = "{} - Creating bulk export from {}".format(stream_name,
                                                                last_date)
        if end_date:
            log_string += " to {}".format(end_date.to_datetime_string())
        LOGGER.info(log_string)

        data = client.post(
            '/api/bulk/2.0/{}/exports'.format(url_obj),
            json=params,
            endpoint='export_create_def')

        data = client.post(
            '/api/bulk/2.0/syncs',
            json={
                'syncedInstanceUri': data['uri']
            },
            endpoint='export_create_sync')

        sync_id = re.match(r'/syncs/([0-9]+)', data['uri']).groups()[0]

        LOGGER.info('{} - Created export - {}'.format(stream_name, sync_id))

        write_bulk_bookmark(state, stream_name, sync_id, 0, last_date_raw)

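        # Poll the sync until it succeeds, fails, or exceeds
        # MAX_RETRY_ELAPSED_TIME, backing off between polls.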
        sleep = 0
        start_time = time.time()
        while True:
            data = client.get(
                '/api/bulk/2.0/syncs/{}'.format(sync_id),
                endpoint='export_sync_poll')

            status = data['status']
            if status == 'success':
                break
            elif status not in ['pending', 'active']:
                message = '{} - status: {}, exporting failed'.format(
                        stream_name,
                        status)
                LOGGER.error(message)
                raise Exception(message)
            elif (time.time() - start_time) > MAX_RETRY_ELAPSED_TIME:
                message = '{} - export deadline exceeded ({} secs)'.format(
                        stream_name,
                        MAX_RETRY_ELAPSED_TIME)
                LOGGER.error(message)
                raise Exception(message)

            sleep = next_sleep_interval(sleep)
            LOGGER.info('{} - status: {}, sleeping for {} seconds'.format(
                        stream_name,
                        status,
                        sleep))
            time.sleep(sleep)

    # Check record count
    data = client.get(
        '/api/bulk/2.0/syncs/{}/logs'.format(sync_id),
        endpoint='export_sync_poll')

    success_message = "Successfully exported members to csv file."
    export_success_log = next((i for i in data.get("items", [])
                               if i.get("message") == success_message), None)
    if export_success_log:
        record_count = export_success_log["count"]
        LOGGER.info("Sync id {} contains {} records.".format(sync_id, record_count))
        if activity_type and record_count >= 5000000:
            raise ActivityExportTooLarge("Export too large, retrying with smaller window.")

    stream_export(client,
                  state,
                  catalog,
                  stream_name,
                  sync_id,
                  updated_at_field_name,
                  bulk_page_size,
                  last_date,
                  activity_type=activity_type)
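
The polling loop above relies on a next_sleep_interval helper that is not shown here. A plausible sketch, assuming exponential backoff with jitter capped at a fixed maximum (the constants and exact behaviour are assumptions, not necessarily this tap's code):

import random

MIN_RETRY_INTERVAL = 2    # seconds (assumed)
MAX_RETRY_INTERVAL = 300  # seconds (assumed)

def next_sleep_interval(previous_sleep_interval):
    # Roughly double the previous wait, add jitter, and never exceed the cap.
    min_interval = previous_sleep_interval or MIN_RETRY_INTERVAL
    max_interval = previous_sleep_interval * 2 or MIN_RETRY_INTERVAL
    return min(MAX_RETRY_INTERVAL, random.randint(min_interval, max_interval))
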
Example #28
0
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id,
                                       'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id,
                                       'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id,
                                           collection_oplog_ts)

                full_table.sync_collection(client, stream, state,
                                           stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state,
                                       stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state,
                                        stream_projection)
        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication "
                "methods are supported (you passed {})".format(
                    replication_method))

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
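
sync_stream above calls a clear_state_on_replication_change helper before emitting its first state message. A minimal sketch of what such a helper could do, assuming the last replication method is tracked as a bookmark on the stream (this is an assumption, not necessarily the tap's actual implementation):

import singer
from singer import metadata

LOGGER = singer.get_logger()

def clear_state_on_replication_change(stream, state):
    tap_stream_id = stream['tap_stream_id']
    md_map = metadata.to_map(stream['metadata'])
    current_method = metadata.get(md_map, (), 'replication-method')

    last_method = singer.get_bookmark(state, tap_stream_id,
                                      'last_replication_method')
    if last_method is not None and last_method != current_method:
        # Changing replication method invalidates the old bookmarks, so drop
        # them and force a fresh full sync of the collection.
        LOGGER.info('Replication method changed from %s to %s; resetting state for %s',
                    last_method, current_method, tap_stream_id)
        state.get('bookmarks', {}).pop(tap_stream_id, None)

    return singer.write_bookmark(state, tap_stream_id,
                                 'last_replication_method', current_method)
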