def update_bookmark(record_message, replication_method, catalog_entry, state):
    if replication_method in ('FULL_TABLE', 'LOG_BASED'):
        key_properties = get_key_properties(catalog_entry)

        max_pk_values = singer.get_bookmark(state,
                                            catalog_entry.tap_stream_id,
                                            'max_pk_values')
        if max_pk_values:
            last_pk_fetched = {k: v
                               for k, v in record_message.record.items()
                               if k in key_properties}

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'last_pk_fetched',
                                          last_pk_fetched)

    elif replication_method == 'INCREMENTAL':
        replication_key = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'replication_key')
        if replication_key is not None:
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key',
                                          replication_key)

            state = singer.write_bookmark(
                state,
                catalog_entry.tap_stream_id,
                'replication_key_value',
                record_message.record[replication_key])

    return state
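# A minimal sketch of `get_key_properties`, which the functions in this
# module rely on but which is defined elsewhere. It assumes the common
# Singer convention of reading 'table-key-properties' from the stream-level
# metadata entry; the actual implementation may differ.
def get_key_properties(catalog_entry):
    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})
    return stream_metadata.get('table-key-properties', [])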
def calculate_hashkey_sql(catalog_entry):
    key_properties = get_key_properties(catalog_entry)
    schema = catalog_entry.schema.to_dict()['properties']

    # The hashkey covers the primary-key columns plus the system
    # updated-at timestamp.
    properties = {k: schema[k] for k in key_properties}
    properties['_sys_updated_at'] = {'type': ['string'],
                                     'format': 'date-time'}

    return _join_hashes_sql(properties)
def calculate_hashdiff_sql(catalog_entry):
    key_properties = get_key_properties(catalog_entry)
    schema = catalog_entry.schema.to_dict()['properties']

    # The hashdiff covers every non-key, non-system column. Sort after the
    # set difference so the column order, and therefore the hash, is
    # deterministic.
    keys = sorted(k for k in set(schema) - set(key_properties)
                  if not k.startswith(('_sys', '_sdc', '_is')))

    properties = {k: schema[k] for k in keys}

    return _join_hashes_sql(properties)
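# A minimal sketch of `_join_hashes_sql` (defined elsewhere in this
# codebase), shown only to illustrate the intent of the two functions above.
# It assumes the hash is computed in MySQL by concatenating the columns with
# an unambiguous separator and digesting with SHA2; the real implementation
# may differ.
def _join_hashes_sql(properties):
    # Iterate in the caller's insertion order so the expression, and hence
    # the hash, is stable for a given column set.
    parts = ["COALESCE(CAST(`{}` AS CHAR), '')".format(name)
             for name in properties]
    return "SHA2(CONCAT_WS('||', {}), 256)".format(', '.join(parts))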
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    LOGGER.info('Running %s', query_string)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    # Resolve the replication method once; it does not change per row.
    md_map = metadata.to_map(catalog_entry.metadata)
    stream_metadata = md_map.get((), {})
    replication_method = stream_metadata.get('replication-method')

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version,
                                                  row,
                                                  columns,
                                                  time_extracted)
            singer.write_message(record_message)

            # Advance the stream bookmark via the shared helper.
            state = update_bookmark(record_message, replication_method,
                                    catalog_entry, state)

            # Flush state periodically so an interrupted sync can resume.
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
    row_to_persist = {}
    LOGGER.debug('Schema properties: %s', catalog_entry.schema.properties)
    LOGGER.debug('Event columns: %s', db_column_map)

    key_properties = get_key_properties(catalog_entry)

    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        property_format = catalog_entry.schema.properties[column_name].format
        db_column_type = db_column_map.get(column_name)

        if isinstance(val, datetime.datetime):
            if db_column_type in MYSQL_TIMESTAMP_TYPES:
                # The mysql-replication library creates datetimes from
                # TIMESTAMP columns using fromtimestamp, which uses the local
                # timezone, so we must set tzinfo accordingly. See:
                # https://github.com/noplay/python-mysql-replication/blob/master/pymysqlreplication/row_event.py#L143-L145
                timezone = tzlocal.get_localzone()
                local_datetime = timezone.localize(val)
                utc_datetime = local_datetime.astimezone(pytz.UTC)
                row_to_persist[column_name] = utc_datetime.isoformat()
            else:
                row_to_persist[column_name] = val.isoformat()

        elif isinstance(val, datetime.date):
            row_to_persist[column_name] = val.isoformat() + 'T00:00:00+00:00'

        elif isinstance(val, datetime.timedelta):
            if property_format == 'time':
                # Convert a TIME column into an 'HH:MM:SS'-formatted string.
                row_to_persist[column_name] = str(val)
            else:
                timedelta_from_epoch = (datetime.datetime.utcfromtimestamp(0)
                                        + val)
                row_to_persist[column_name] = (timedelta_from_epoch.isoformat()
                                               + '+00:00')

        elif db_column_type == FIELD_TYPE.JSON:
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))

        elif isinstance(val, bytes):
            hex_value = codecs.encode(val, 'hex').decode('utf-8')
            if column_name == 'additional_info':
                # additional_info has a bad header, so skip its first five
                # characters.
                row_to_persist[column_name] = hex_value[5:2**16 - 1]
            else:
                row_to_persist[column_name] = hex_value[:2**16 - 1]

        elif 'boolean' in property_type or property_type == 'boolean':
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation

        elif val is not None and (column_name.startswith('html')
                                  or db_column_type == 'longtext'):
            # Truncate oversized text columns to a 65,535-character limit.
            row_to_persist[column_name] = val[:2**16 - 1]

        else:
            row_to_persist[column_name] = val

    row_to_persist[SYS_HASHKEY] = calculate_hashkey(row_to_persist,
                                                    key_properties)
    row_to_persist[SYS_HASHDIFF] = calculate_hashdiff(row_to_persist,
                                                      key_properties)

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
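# Minimal sketches of `calculate_hashkey` and `calculate_hashdiff`, the
# Python-side counterparts of the *_sql helpers above. They are defined
# elsewhere in this codebase, so these are illustrative assumptions: a
# SHA-256 digest over the same column sets the SQL variants hash.
import hashlib


def _join_hashes(values):
    # Join with the same '||' separator assumed in the SQL sketch, treating
    # NULL as the empty string, and digest the result.
    joined = '||'.join('' if v is None else str(v) for v in values)
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()


def calculate_hashkey(record, key_properties):
    # Key columns plus the system updated-at value, mirroring
    # calculate_hashkey_sql.
    return _join_hashes([record.get(k) for k in key_properties]
                        + [record.get('_sys_updated_at')])


def calculate_hashdiff(record, key_properties):
    # Non-key, non-system columns, sorted for a deterministic digest,
    # mirroring calculate_hashdiff_sql.
    columns = sorted(k for k in set(record) - set(key_properties)
                     if not k.startswith(('_sys', '_sdc', '_is')))
    return _join_hashes([record[k] for k in columns])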