Example #1
def get_max_pk_values(cursor, catalog_entry):
    """Get actual max primary key values from database"""
    database_name = common.get_database_name(catalog_entry)
    escaped_db = common.escape(database_name)
    escaped_table = common.escape(catalog_entry.table)

    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    sql = """SELECT {}
               FROM {}.{}
              ORDER BY {}
              LIMIT 1
    """

    select_column_clause = ', '.join(escaped_columns)
    # Sort every PK column descending so the single row fetched below
    # carries the maximum observed primary key values.
    order_column_clause = ', '.join([pk + ' DESC' for pk in escaped_columns])

    cursor.execute(
        sql.format(select_column_clause, escaped_db, escaped_table,
                   order_column_clause))
    result = cursor.fetchone()

    if result:
        max_pk_values = dict(zip(key_properties, result))
    else:
        max_pk_values = {}

    return max_pk_values
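
A minimal usage sketch, assuming a DB-API style connection; `conn` and `catalog_entry` are placeholders for objects the tap itself constructs during connection setup and discovery:

# Hypothetical usage -- `conn` and `catalog_entry` are placeholders for
# objects produced by the tap's own connection and discovery machinery.
with conn.cursor() as cursor:
    max_pk_values = get_max_pk_values(cursor, catalog_entry)

# For a table with primary key `id`, the result looks like {'id': 41022};
# an empty table yields {}.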
Example #2
def sync_streams(snowflake_conn, catalog, state):
    """Sync every selected stream in the catalog, emitting state as we go"""
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method',
                                                    'FULL_TABLE')

        database_name = common.get_database_name(catalog_entry)
        schema_name = common.get_schema_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            LOGGER.info('Beginning to sync %s.%s.%s', database_name,
                        schema_name, catalog_entry.table)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(snowflake_conn, catalog_entry, state,
                                    columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(snowflake_conn, catalog_entry, state,
                                   columns)
            else:
                raise Exception(
                    'Only INCREMENTAL and FULL_TABLE replication methods are supported'
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
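
A hedged sketch of how sync_streams might be driven from a tap's entry point; singer.utils.parse_args is part of singer-python, while REQUIRED_CONFIG_KEYS, open_connection, and do_discover stand in for the tap's own helpers:

# Hypothetical driver -- everything except sync_streams and
# singer.utils.parse_args is a placeholder for tap-specific code.
args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS)
snowflake_conn = open_connection(args.config)
catalog = args.catalog or do_discover(snowflake_conn)
state = args.state or {}
sync_streams(snowflake_conn, catalog, state)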
Example #3
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Build a catalog of the requested streams narrowed to their selected columns"""
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        discovered_table = discovered_catalog.get_stream(
            catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning(
                'Database %s table %s was selected but does not exist',
                database_name, catalog_entry.table)
            continue

        # Keep every selected property, plus the replication key even if
        # the user did not explicitly select it.
        selected = {
            k
            for k, v in catalog_entry.schema.properties.items()
            if common.property_is_selected(catalog_entry, k)
            or k == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(tap_stream_id=catalog_entry.tap_stream_id,
                         metadata=catalog_entry.metadata,
                         stream=catalog_entry.tap_stream_id,
                         table=catalog_entry.table,
                         schema=Schema(
                             type='object',
                             properties={
                                 col: discovered_table.schema.properties[col]
                                 for col in columns
                             })))

    return result
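
A hedged sketch of calling resolve_catalog with the user's selections; stream_is_selected is a stand-in for however the tap reads the 'selected' metadata flag:

# Hypothetical -- discovered_catalog comes from the tap's discovery step;
# stream_is_selected is a placeholder for the tap's selection check.
streams_to_sync = [entry for entry in user_catalog.streams
                   if stream_is_selected(metadata.to_map(entry.metadata))]
resolved = resolve_catalog(discovered_catalog, streams_to_sync)
for stream in resolved.streams:
    LOGGER.info('Will sync %s with columns %s', stream.tap_stream_id,
                sorted(stream.schema.properties))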