def test_no_change(self):
    row = {
        "a_str": "hello",
        "a_list": ["foo", "bar", 1, 2, {"name": "nick"}],
        "an_object": {
            "a_nested_str": "baz",
            "a_nested_list": [1, 2, "hi"]
        }
    }
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)
    self.assertFalse(changed)

    # another row that looks the same keeps changed false
    changed = common.row_to_schema(schema, row)
    self.assertFalse(changed)

    # a different looking row makes the schema change
    row = {"a_str": "hello", "a_date": bson.timestamp.Timestamp(1565897157, 1)}
    changed = common.row_to_schema(schema, row)
    self.assertTrue(changed)

    # the same (different) row again sets changed back to false
    changed = common.row_to_schema(schema, row)
    self.assertFalse(changed)
def test_decimal_and_date(self):
    date_row = {"a_field": bson.timestamp.Timestamp(1565897157, 1)}
    decimal_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed_date = common.row_to_schema(schema, date_row)
    changed_decimal = common.row_to_schema(schema, decimal_row)

    expected = {
        "type": "object",
        "properties": {
            "a_field": {
                "anyOf": [
                    {"type": "string", "format": "date-time"},
                    {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
                    {}
                ]
            }
        }
    }

    self.assertTrue(changed_date)
    self.assertTrue(changed_decimal)
    self.assertEqual(expected, schema)
def test_nested_data(self):
    date_row = {"foo": {"a_field": bson.timestamp.Timestamp(1565897157, 1)}}
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, date_row)

    expected = {
        "type": "object",
        "properties": {
            "foo": {
                "anyOf": [
                    {
                        "type": "object",
                        "properties": {
                            "a_field": {
                                "anyOf": [
                                    {"type": "string", "format": "date-time"},
                                    {}
                                ]
                            }
                        }
                    },
                    {}
                ]
            }
        }
    }

    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def write_schema(schema, row, stream):
    schema_build_start_time = time.time()
    if common.row_to_schema(schema, row):
        singer.write_message(singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=schema,
            key_properties=['_id']))
        common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
    common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time
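A hypothetical reuse sketch, not the tap's current wiring: the per-row schema block repeated inside the sync_collection loops below mirrors this helper, so the loop body could call it instead. The names schema, stream, stream_version, time_extracted, and cursor are assumed to come from that function's scope, and write_schema is assumed to be importable there.

for row in cursor:
    # sketch only: build/emit the schema via the shared helper, then emit the record
    write_schema(schema, row, stream)
    record_message = common.row_to_singer_record(stream, row, stream_version, time_extracted)
    singer.write_message(record_message)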
def get_collection_schema_from_rows(collection, config):
    LOGGER.info('Getting schema from collection data: %s.%s',
                collection.database.name,
                collection.name)

    schema = {
        'type': 'object',
        'properties': {
            '_id': {
                'type': ['null', 'string']
            }
        }
    }

    limit = config.get('discovery_row_limit', 1000)
    with collection.find().sort('_id', -1).limit(limit) as cursor:
        for row in cursor:
            try:
                common.row_to_schema(schema, row)
            except Exception as e:
                LOGGER.critical(e)

    return schema
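A minimal usage sketch for the discovery helper above, assuming a reachable MongoDB instance; the connection URI, database, and collection names below are illustrative, and 'discovery_row_limit' is the config key the function reads.

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')  # illustrative URI
collection = client['my_database']['my_collection']        # illustrative names
schema = get_collection_schema_from_rows(collection, {'discovery_row_limit': 500})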
def test_float_then_float(self):
    float_row = {"a_field": 1.34}
    float_row_2 = {"a_field": 1.34}
    schema = {"type": "object", "properties": {}}

    changed_float = common.row_to_schema(schema, float_row)
    changed_float_2 = common.row_to_schema(schema, float_row_2)

    expected = {
        "type": "object",
        "properties": {
            "a_field": {
                "anyOf": [{"type": "number"}, {}]
            }
        }
    }

    self.assertTrue(changed_float)
    self.assertFalse(changed_float_2)
    self.assertEqual(expected, schema)
def test_decimal_then_decimal(self):
    decimal_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    decimal_row_2 = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed_decimal = common.row_to_schema(schema, decimal_row)
    changed_decimal_2 = common.row_to_schema(schema, decimal_row_2)

    expected = {
        "type": "object",
        "properties": {
            "a_field": {
                "anyOf": [{"type": "number", "multipleOf": decimal.Decimal('1e-34')}, {}]
            }
        }
    }

    self.assertTrue(changed_decimal)
    self.assertFalse(changed_decimal_2)
    self.assertEqual(expected, schema)
def test_decimal_then_float(self):
    decimal_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    float_row = {"a_field": 1.34}
    schema = {"type": "object", "properties": {}}

    changed_decimal = common.row_to_schema(schema, decimal_row)
    changed_float = common.row_to_schema(schema, float_row)

    expected = {
        "type": "object",
        "properties": {
            "a_field": {
                "anyOf": [{"type": "number"}, {}]
            }
        }
    }

    self.assertTrue(changed_decimal)
    self.assertTrue(changed_float)
    self.assertEqual(expected, schema)
def test_simple_float(self):
    row = {"a_float": 1.34}
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)

    expected = {
        "type": "object",
        "properties": {
            "a_float": {
                "anyOf": [{"type": "number"}, {}]
            }
        }
    }

    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def test_simple_decimal(self):
    row = {"a_decimal": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)

    expected = {
        "type": "object",
        "properties": {
            "a_decimal": {
                "anyOf": [{"type": "number", "multipleOf": decimal.Decimal('1e-34')}, {}]
            }
        }
    }

    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def test_array_multiple_types(self):
    row = {
        "foo": [
            bson.timestamp.Timestamp(1565897157, 1),
            bson.Decimal128(decimal.Decimal('1.34'))
        ]
    }
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)

    expected = {
        "type": "object",
        "properties": {
            "foo": {
                "anyOf": [
                    {
                        "type": "array",
                        "items": {
                            "anyOf": [
                                {"type": "string", "format": "date-time"},
                                {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
                                {}
                            ]
                        }
                    },
                    {}
                ]
            }
        }
    }

    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(state,
                                      stream['tap_stream_id'],
                                      'max_id_value',
                                      common.class_to_string(max_id_value,
                                                             max_id_value.__class__.__name__))
        state = singer.write_bookmark(state,
                                      stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}

    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'],
        find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)
            singer.write_message(record_message)

            state = singer.write_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched',
                                          common.class_to_string(row['_id'],
                                                                 row['_id'].__class__.__name__))
            state = singer.write_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id))
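As a worked illustration of the query assembled above (values illustrative): on a fresh run only the upper bound is present, and on a resumed run the last fetched _id is added as a lower bound.

# fresh run:    collection.find({'_id': {'$lte': max_id_value}})
# resumed run:  collection.find({'_id': {'$lte': max_id_value, '$gte': last_id_fetched}})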
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})
    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)

    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(stream, row, stream_version, time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #     emit gen_schema
            #     schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
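update_bookmark is called in the loop above but not shown in this excerpt. Below is a hedged sketch of what it would need to do, inferred from the replication_key_value and replication_key_type bookmarks this function reads; the real helper may differ.

def update_bookmark(row, state, tap_stream_id, replication_key_name):
    # Sketch only: remember the newest replication key value and its type so the
    # next run can resume with the $gte filter built above.
    replication_key_value = row.get(replication_key_name)
    if replication_key_value is not None:
        singer.write_bookmark(state, tap_stream_id, 'replication_key_value',
                              common.class_to_string(replication_key_value,
                                                     replication_key_value.__class__.__name__))
        singer.write_bookmark(state, tap_stream_id, 'replication_key_type',
                              replication_key_value.__class__.__name__)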
def test_array_nested(self):
    row = {
        "foo": [
            [
                bson.timestamp.Timestamp(1565897157, 1),
                bson.Decimal128(decimal.Decimal('1.34'))
            ],
            {
                "bar": bson.timestamp.Timestamp(1565897157, 1),
                "bat": bson.Decimal128(decimal.Decimal('1.34'))
            }
        ]
    }
    row_2 = {
        "bar": "1",
        "foo": [
            ["bob", "roger"],
            {
                "bar": "bob",
                "bat": "roger"
            }
        ]
    }
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)
    changed_2 = common.row_to_schema(schema, row_2)

    expected = {
        "type": "object",
        "properties": {
            "foo": {
                "anyOf": [
                    {
                        "type": "array",
                        "items": {
                            "anyOf": [
                                {
                                    "type": "array",
                                    "items": {
                                        "anyOf": [
                                            {"type": "string", "format": "date-time"},
                                            {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
                                            {}
                                        ]
                                    }
                                },
                                {
                                    "type": "object",
                                    "properties": {
                                        "bar": {
                                            "anyOf": [
                                                {"type": "string", "format": "date-time"},
                                                {}
                                            ]
                                        },
                                        "bat": {
                                            "anyOf": [
                                                {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
                                                {}
                                            ]
                                        }
                                    }
                                },
                                {}
                            ]
                        }
                    },
                    {}
                ]
            }
        }
    }

    singer_row = {k: common.transform_value(v, [k]) for k, v in row_2.items()
                  if type(v) not in [bson.min_key.MinKey, bson.max_key.MaxKey]}

    decimal.getcontext().prec = 100000
    validate(instance=singer_row, schema=schema)

    self.assertTrue(changed)
    self.assertFalse(changed_2)
    self.assertEqual(expected, schema)