def sync_collection(
        collection: Collection,
        stream: Dict,
        state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally, using the stream's
    replication key as the bookmark.

    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    # default the top-level breadcrumb to {} so a stream without () metadata
    # doesn't raise AttributeError on the chained .get()
    replication_key_name = metadata.to_map(stream['metadata']).get((), {}).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'], dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'], replication_key_name)

            # flush state periodically so an interrupted run can resume
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
def sync_collection(client, stream, state, projection):
    """
    Full-table sync of a MongoDB collection.

    Emits a SCHEMA message whenever a row widens the inferred schema,
    one RECORD message per row, and periodic STATE messages so an
    interrupted run can resume from the last fetched _id.

    Args:
        client: MongoDB client instance
        stream: stream dictionary
        state: state dictionary
        projection: optional find() projection limiting returned fields
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, tap_stream_id, 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, tap_stream_id, 'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, tap_stream_id, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, tap_stream_id, 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, tap_stream_id, 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, tap_stream_id, 'max_id_value')
        max_id_type = singer.get_bookmark(state, tap_stream_id, 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, tap_stream_id, 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state,
            tap_stream_id,
            'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(
            state,
            tap_stream_id,
            'max_id_type',
            max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}

    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state, tap_stream_id, 'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                # schema widened by this row: re-emit SCHEMA before the record
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)
            singer.write_message(record_message)

            state = singer.write_bookmark(
                state,
                tap_stream_id,
                'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(
                state,
                tap_stream_id,
                'last_id_fetched_type',
                row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, tap_stream_id, 'max_id_value')
    singer.clear_bookmark(state, tap_stream_id, 'max_id_type')
    singer.clear_bookmark(state, tap_stream_id, 'last_id_fetched')
    singer.clear_bookmark(state, tap_stream_id, 'last_id_fetched_type')

    state = singer.write_bookmark(state, tap_stream_id, 'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    # lazy %-args avoid building the message when INFO logging is disabled
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
def test_string_to_class_with_unsupported_type_raises_exception(self):
    """An unrecognized type name must raise UnsupportedKeyTypeException."""
    unknown_type_name = 'some random type'
    with self.assertRaises(UnsupportedKeyTypeException):
        common.string_to_class(1, unknown_type_name)
def sync_collection(client, stream, state, projection):
    """
    Incrementally syncs a MongoDB collection, using the stream's
    replication key as the bookmark.

    Args:
        client: MongoDB client instance
        stream: stream dictionary
        state: state dictionary
        projection: optional find() projection limiting returned fields
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, tap_stream_id, 'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, tap_stream_id, 'version')

    state = singer.write_bookmark(state, tap_stream_id, 'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query ($gte includes the row equal to the bookmarked value)
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)

    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                # schema widened by this row: re-emit SCHEMA before the record
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            # flush state periodically so an interrupted run can resume
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    singer.write_message(activate_version_message)
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
def test_string_to_class_with_Timestamp(self):
    """A 'seconds.increment' string should parse to the matching Timestamp."""
    serialized = '3000.0'
    expected = Timestamp(3000, 0)
    self.assertEqual(expected, common.string_to_class(serialized, 'Timestamp'))
def test_string_to_class_with_ObjectId(self):
    """A 24-hex-digit string should parse to the matching ObjectId."""
    hex_id = '0123456789ab0123456789ab'
    expected = ObjectId(hex_id)
    self.assertEqual(expected, common.string_to_class(hex_id, 'ObjectId'))
def test_string_to_class_with_formatted_utc_datetime(self):
    """An ISO-8601 UTC string should parse to an aware datetime."""
    serialized = '2020-05-10T12:01:50.000000Z'
    expected = datetime(2020, 5, 10, 12, 1, 50, tzinfo=tzutc())
    self.assertEqual(expected, common.string_to_class(serialized, 'datetime'))
def test_string_to_class_with_UUID(self):
    """A canonical UUID string should parse to the matching uuid.UUID."""
    serialized = '123e4567-e89b-12d3-a456-426652340000'
    expected = uuid.UUID(serialized)
    self.assertEqual(expected, common.string_to_class(serialized, 'UUID'))
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Sync collection records with full table replication

    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    LOGGER.info('Starting full table sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    # pick a new table version if last run wasn't interrupted
    if singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') is not None:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = common.string_to_class(
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'),
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state,
            stream['tap_stream_id'],
            'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(
            state,
            stream['tap_stream_id'],
            'max_id_type',
            max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}

    if last_id_fetched:
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'], dict(find=find_filter))

    with collection.find({'_id': find_filter},
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            rows_saved += 1

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))

            state = singer.write_bookmark(
                state,
                stream['tap_stream_id'],
                'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(
                state,
                stream['tap_stream_id'],
                'last_id_fetched_type',
                row['_id'].__class__.__name__)

            # flush state periodically so an interrupted run can resume
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    # capture the returned state, matching every other bookmark write above
    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])