def update_bookmark(record_message, replication_method, catalog_entry, state):
    """Advance the stream's bookmark in *state* after emitting a record.

    For FULL_TABLE / LOG_BASED streams, records the primary-key values of
    the last emitted row (so an interrupted full sync can resume). For
    INCREMENTAL streams, re-writes the replication key and stores the
    record's value for it. Returns the (mutated) state dict.
    """
    stream_id = catalog_entry.tap_stream_id

    if replication_method in ('FULL_TABLE', 'LOG_BASED'):
        key_properties = get_key_properties(catalog_entry)

        # Only track resume position while a max_pk_values bookmark exists
        # (i.e. an interruptible full-table sync is in progress).
        if singer.get_bookmark(state, stream_id, 'max_pk_values'):
            last_pk_fetched = {
                name: value
                for name, value in record_message.record.items()
                if name in key_properties
            }
            state = singer.write_bookmark(state, stream_id,
                                          'last_pk_fetched', last_pk_fetched)

    elif replication_method == 'INCREMENTAL':
        replication_key = singer.get_bookmark(state, stream_id,
                                              'replication_key')
        if replication_key is not None:
            state = singer.write_bookmark(state, stream_id,
                                          'replication_key', replication_key)
            state = singer.write_bookmark(state, stream_id,
                                          'replication_key_value',
                                          record_message.record[replication_key])

    return state
# Example 2
def calculate_hashkey_sql(catalog_entry):
    """Build the SQL hash expression over the stream's business key.

    The hash covers the primary-key columns plus the system updated-at
    timestamp column. Returns whatever ``_join_hashes_sql`` produces for
    that column set.
    """
    key_properties = get_key_properties(catalog_entry)

    schema = catalog_entry.schema.to_dict()['properties']

    # Key columns keep their declared schema; the system timestamp column
    # is added explicitly since it may not appear in the catalog schema.
    properties = {k: schema[k] for k in key_properties}
    properties['_sys_updated_at'] = {'type': ['string'], 'format': 'date-time'}

    # NOTE: removed the unused local `keys = set(key_properties) | {SYS_UPDATED_AT}`
    # from the original — it was computed and never read.
    return _join_hashes_sql(properties)
# Example 3
def calculate_hashdiff_sql(catalog_entry):
    """Build the SQL hash expression over the stream's non-key payload columns.

    Excludes the key properties and any bookkeeping columns whose names
    start with '_sys', '_sdc', or '_is'. Columns are sorted by name so the
    generated expression — and therefore the hashdiff — is deterministic.
    """
    key_properties = get_key_properties(catalog_entry)
    schema = catalog_entry.schema.to_dict()['properties']

    # Bug fix: the original wrapped sorted(...) in set(...), which threw the
    # ordering away again and left the column order at the mercy of set
    # iteration order — a nondeterministic hashdiff. Sort AFTER the set
    # difference instead.
    candidate_columns = sorted(set(schema) - set(key_properties))

    # The original compared against ('_is'), which is just the string '_is'
    # (parens without a comma do not make a tuple); written plainly here.
    keys = [
        name for name in candidate_columns
        if name[0:4] not in ('_sys', '_sdc') and name[0:3] != '_is'
    ]

    properties = {k: schema[k] for k in keys}

    return _join_hashes_sql(properties)
# Example 4
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params):
    """Execute *select_sql* and stream each row as a Singer RECORD message.

    Updates bookmarks in *state* as rows are emitted (last_pk_fetched for
    FULL_TABLE/LOG_BASED while a max_pk_values bookmark is active;
    replication_key/_value for INCREMENTAL), emits a STATE message every
    1000 rows and a final one after the cursor is exhausted.
    """
    replication_key = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key')

    # mogrify only renders the query for logging; execution happens below.
    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    LOGGER.info('Running %s', query_string)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    # Perf fix: these are per-stream values, not per-row — the original
    # rebuilt the metadata map and key-property list on every row.
    md_map = metadata.to_map(catalog_entry.metadata)
    stream_metadata = md_map.get((), {})
    replication_method = stream_metadata.get('replication-method')
    key_properties = get_key_properties(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version,
                                                  row,
                                                  columns,
                                                  time_extracted)
            singer.write_message(record_message)

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                max_pk_values = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'max_pk_values')

                # Track the PK of the last emitted row so an interrupted
                # full-table sync can resume where it left off.
                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])

            # Periodic STATE message so progress survives a restart.
            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
# Example 5
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
    """Convert one database row into a singer.RecordMessage.

    Coerces each column value to a JSON-friendly form based on its Python
    type, its catalog schema type/format, and its DB column type from
    *db_column_map*, then appends computed hashkey/hashdiff system columns.

    NOTE(review): *db_column_map* is assumed to map column name -> DB/binlog
    column type (compared against FIELD_TYPE constants) — confirm with caller.
    """
    row_to_persist = {}

    LOGGER.debug('Schema properties: %s', catalog_entry.schema.properties)
    LOGGER.debug('Event columns: %s', db_column_map)

    key_properties = get_key_properties(catalog_entry)

    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        property_format = catalog_entry.schema.properties[column_name].format
        db_column_type = db_column_map.get(column_name)

        if isinstance(val, datetime.datetime):
            if db_column_type in MYSQL_TIMESTAMP_TYPES:
                # The mysql-replication library creates datetimes from TIMESTAMP columns using fromtimestamp which
                # will use the local timezone thus we must set tzinfo accordingly See:
                # https://github.com/noplay/python-mysql-replication/blob/master/pymysqlreplication/row_event.py#L143
                # -L145
                timezone = tzlocal.get_localzone()
                local_datetime = timezone.localize(val)
                utc_datetime = local_datetime.astimezone(pytz.UTC)
                row_to_persist[column_name] = utc_datetime.isoformat()
            else:
                row_to_persist[column_name] = val.isoformat()

        elif isinstance(val, datetime.date):
            # Dates become midnight-UTC timestamps.
            row_to_persist[column_name] = val.isoformat() + 'T00:00:00+00:00'

        elif isinstance(val, datetime.timedelta):
            if property_format == 'time':
                # this should convert time column into 'HH:MM:SS' formatted string
                row_to_persist[column_name] = str(val)
            else:
                # Otherwise treat the timedelta as an offset from the epoch
                # and emit an ISO timestamp with an explicit UTC offset.
                timedelta_from_epoch = datetime.datetime.utcfromtimestamp(
                    0) + val
                row_to_persist[column_name] = timedelta_from_epoch.isoformat(
                ) + '+00:00'

        elif db_column_type == FIELD_TYPE.JSON:
            # val may contain bytes; normalize before serializing.
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))

        elif isinstance(val, bytes):
            if column_name == 'additional_info':
                # Additional_info has a bad header in it
                # (hex-encode, skip the first 5 hex chars, cap at 65535 chars).
                row_to_persist[column_name] = codecs.encode(
                    val, 'hex').decode('utf-8')[5:2**16 - 1]
            else:
                # Hex-encode raw bytes, capped at 65535 chars — presumably a
                # target column-size limit; confirm against the loader.
                row_to_persist[column_name] = codecs.encode(
                    val, 'hex').decode('utf-8')[:2**16 - 1]

        elif 'boolean' in property_type or property_type == 'boolean':
            # property_type may be a list of JSON-schema types or a bare string.
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                # BIT columns arrive as ints/bytes; any nonzero bit is True.
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation
        elif val is not None and (column_name.startswith('html')
                                  or db_column_type == 'longtext'):
            # Truncate large text columns to 65535 chars.
            row_to_persist[column_name] = val[:2**16 - 1]
        else:
            row_to_persist[column_name] = val

    # Append system hash columns computed over the coerced row.
    row_to_persist[SYS_HASHKEY] = calculate_hashkey(row_to_persist,
                                                    key_properties)
    row_to_persist[SYS_HASHDIFF] = calculate_hashdiff(row_to_persist,
                                                      key_properties)

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)