def sync(  # noqa: WPS210, WPS213
    wp: WordpressReviews,
    catalog: Catalog,
) -> None:
    """Emit a schema and all review rows for every selected stream.

    Arguments:
        wp {WordpressReviews} -- WordpressReviews client
        catalog {Catalog} -- Stream catalog
    """
    LOGGER.info('Sync')

    # A stream counts as selected when its schema file carries the
    # key-value pair "selected": true. The catalog is queried with an
    # empty state.
    for entry in catalog.get_selected_streams({}):
        stream_id = entry.tap_stream_id
        LOGGER.info(f'Syncing stream: {stream_id}')

        # The schema message goes out before any record of the stream.
        singer.write_schema(
            stream_name=stream_id,
            schema=entry.schema.to_dict(),
            key_properties=entry.key_properties,
        )

        # wp.reviews() yields the rows; each one is stamped with the
        # current UTC time at the moment it is written.
        for review in wp.reviews():
            extracted_at = datetime.now(timezone.utc)
            singer.write_record(
                stream_id,
                review,
                time_extracted=extracted_at,
            )
def sync(config: Dict[str, Any], state: Dict[str, Any], catalog: Catalog) -> None:
    """Sync every selected top-level stream, then activate table versions.

    Substreams are skipped in the main loop. Each record yielded by a
    stream's ``sync`` carries its own ``tap_stream_id``, presumably so a
    parent stream can emit records for its substreams as well
    (TODO confirm against the Stream/Substream classes).

    Arguments:
        config -- Tap configuration, passed through to ``prepare_stream``
        state -- Tap state; rebound to the value returned by
            ``handle_record`` and written out with ``write_state``
        catalog -- Stream catalog
    """
    # For looking up Catalog-configured streams more efficiently
    # later. Singer stores catalog entries as a list and iterates
    # over it with .get_stream()
    stream_defs: Dict[str, Union["Stream", "Substream"]] = {}
    stream_versions: Dict[str, Optional[int]] = {}

    check_dependency_conflicts(catalog)

    for stream in catalog.get_selected_streams(state):
        if is_substream(AVAILABLE_STREAMS[stream.tap_stream_id]):
            LOGGER.info(
                'Skipping substream "%s" until parent stream is reached',
                stream.tap_stream_id,
            )
            continue

        LOGGER.info("Syncing stream: %s", stream.tap_stream_id)
        # prepare_stream receives both lookup dicts; the entries read
        # below are presumably filled in by it (TODO confirm).
        filter_datetime = prepare_stream(stream.tap_stream_id, stream_defs,
                                         stream_versions, catalog, config,
                                         state)
        stream_def = stream_defs[stream.tap_stream_id]

        LOGGER.info("Querying since: %s", filter_datetime)

        for tap_stream_id, record in stream_def.sync(
                filter_datetime):  # type: ignore
            # The record's own stream id selects the definition and
            # version, which may differ from the stream being iterated.
            state = handle_record(
                tap_stream_id,
                record,
                stream_defs[tap_stream_id],
                stream_versions[tap_stream_id],
                state,
            )

        # NOTE(review): state is written once per stream here; confirm
        # it was not meant to be written after every record.
        write_state(state)

        for substream_def in stream_def.substreams:  # type: ignore
            if not substream_def.is_selected:
                continue

            # All substreams are necessarily FULL_TABLE and thus have a version,
            # so write their ACTIVATE_VERSION messages without check.
            write_activate_version(
                substream_def.tap_stream_id,
                stream_versions[substream_def.tap_stream_id],
            )

        # The parent stream only gets an ACTIVATE_VERSION message when a
        # version was recorded for it.
        if stream_versions[stream_def.tap_stream_id] is not None:
            write_activate_version(
                stream_def.tap_stream_id,
                stream_versions[stream_def.tap_stream_id],
            )

    # Nothing is being synced anymore; record that in the final state.
    state = set_currently_syncing(state, None)
    write_state(state)
def sync(
    basecone: Basecone,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    For each selected stream: mark it as currently syncing in the state,
    write its schema, and hand every row produced by the matching client
    method to ``sync_record``.

    Arguments:
        basecone {Basecone} -- Basecone client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date (not read inside this function)
    """
    LOGGER.info('Sync')
    # The f prefix is required; without it the literal text "{state}"
    # is logged instead of the state itself.
    LOGGER.debug(f'Current state:\n{state}')

    # Only selected streams are synced, whether a stream is selected is
    # determined by whether the key-value: "selected": true is in the schema
    # file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )
        LOGGER.info(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a method of the same name on the Basecone
        # client, e.g. the stream `foo` is served by `basecone.foo`.
        tap_data: Callable = getattr(basecone, stream.tap_stream_id)

        # The tap_data method yields rows of data from the API.
        # The state of the stream is used as kwargs for the method,
        # e.g. if the state of the stream has a key 'start_date', it will
        # be passed as start_date='2021-01-01T00:00:00+0000'.
        for row in tap_data(**stream_state):
            sync_record(stream, row, state)
def test_one_selected_stream(self):
    """Only the entry whose metadata marks it as selected is returned."""
    selected_entry = CatalogEntry(
        tap_stream_id='a',
        schema=Schema(),
        metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
    )
    catalog = Catalog([
        selected_entry,
        CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
        CatalogEntry(tap_stream_id='c', schema=Schema(), metadata=[]),
    ])
    state = {}

    selected_streams = catalog.get_selected_streams(state)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(list(selected_streams), [selected_entry])
def sync(config: Dict[str, Any], state: Dict[str, Any], catalog: Catalog) -> None:
    """Sync data from tap source.

    For each selected stream the schema is written, records are fetched in
    batches and written out, and the stream's bookmark is stored in
    ``state`` under the stream id before the state is emitted.

    Arguments:
        config -- Tap configuration; ``access_token`` and ``page_size`` are
            used to build the API client
        state -- Tap state, keyed by stream id. It is updated in place so
            that every emitted STATE message carries the bookmarks of all
            streams rather than only the one being synced.
        catalog -- Stream catalog
    """
    client = Client(config["access_token"], config["page_size"])

    # Loop over selected streams in catalog
    for selected_stream in catalog.get_selected_streams(state):
        stream_id = selected_stream.tap_stream_id
        LOGGER.info("Syncing stream: %s", stream_id)

        bookmark_column = selected_stream.replication_key
        replication_method = (
            ReplicationMethod[selected_stream.replication_method]
            if selected_stream.replication_method
            else None
        )
        last_bookmark = state.get(stream_id)

        singer.write_schema(
            stream_name=stream_id,
            schema=selected_stream.schema.to_dict(),
            key_properties=selected_stream.key_properties,
            bookmark_properties=[bookmark_column] if bookmark_column else None,
        )

        stream = streams.get(stream_id)
        # For unsorted data the running maximum starts from the previous
        # bookmark, but only for incremental replication.
        max_bookmark = (
            last_bookmark
            if replication_method == ReplicationMethod.INCREMENTAL
            else None
        )

        for records in stream().get_records(
            client,
            config,
            bookmark_column,
            last_bookmark,
            replication_method,
        ):
            if not records:
                continue

            # write one or more rows to the stream:
            for record in records:
                modified_record = Stream.convert_dates_to_rfc3339(
                    record,
                    selected_stream.schema,
                )
                singer.write_record(
                    stream_id,
                    modified_record,
                    time_extracted=datetime.now(timezone.utc),
                )

            if not bookmark_column:
                continue

            if stream.replication_key_is_sorted:
                # Sorted data: the last row of the batch holds the latest
                # value. Merge it into the full state so bookmarks of other
                # streams are not dropped from the emitted STATE message.
                state[stream_id] = records[-1][bookmark_column]
                singer.write_state(state)
            else:
                # Unsorted data: keep the maximum until all batches are
                # written.
                local_max_bookmark = max(
                    row[bookmark_column] for row in records
                )
                max_bookmark = (
                    max(max_bookmark, local_max_bookmark)
                    if max_bookmark
                    else local_max_bookmark
                )

        # Skip the write when nothing was seen, so an existing bookmark is
        # never replaced by None.
        if (
            bookmark_column
            and not stream.replication_key_is_sorted
            and max_bookmark is not None
        ):
            state[stream_id] = max_bookmark
            singer.write_state(state)
def test_resumes_currently_syncing_stream(self):
    """The stream named by state['currently_syncing'] comes first."""
    selected_entry_a = CatalogEntry(
        tap_stream_id='a',
        schema=Schema(),
        metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
    )
    selected_entry_c = CatalogEntry(
        tap_stream_id='c',
        schema=Schema(),
        metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
    )
    catalog = Catalog([
        selected_entry_a,
        CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
        selected_entry_c,
    ])
    state = {'currently_syncing': 'c'}

    selected_streams = catalog.get_selected_streams(state)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(list(selected_streams)[0], selected_entry_c)
def sync(config: Dict[str, Any], state: Dict[str, Any], catalog: Catalog) -> None:
    """Sync every selected top-level stream and write the resulting state.

    Arguments:
        config -- Tap configuration, passed through to ``prepare_stream``
        state -- Tap state; rebound to the value returned by
            ``handle_record`` and written out with ``write_state``
        catalog -- Stream catalog
    """
    # Singer keeps catalog entries in a list and walks it on each
    # .get_stream() call; these dicts give direct lookup by stream id.
    stream_versions: Dict[str, Optional[int]] = {}
    stream_defs: Dict[str, Union["Stream", "Substream"]] = {}

    check_dependency_conflicts(catalog)

    for entry in catalog.get_selected_streams(state):
        stream_id = entry.tap_stream_id

        if is_substream(AVAILABLE_STREAMS[stream_id]):
            LOGGER.info(
                'Skipping substream "%s" until parent stream is reached',
                stream_id,
            )
            continue

        LOGGER.info("Syncing stream: %s", stream_id)
        filter_datetime = prepare_stream(
            stream_id,
            stream_defs,
            stream_versions,
            catalog,
            config,
            state,
        )
        parent_def = stream_defs[stream_id]
        LOGGER.info("Querying since: %s", filter_datetime)

        # Each yielded pair names the stream the record belongs to, so
        # the definition and version are looked up per record.
        for record_stream_id, record in parent_def.sync(filter_datetime):  # type: ignore
            record_def = stream_defs[record_stream_id]
            record_version = stream_versions[record_stream_id]
            state = handle_record(
                record_stream_id,
                record,
                record_def,
                record_version,
                state,
            )

        # NOTE(review): state is written once per stream here; confirm
        # it was not meant to be written after every record.
        write_state(state)

    # Nothing is being synced anymore; record that in the final state.
    state = set_currently_syncing(state, None)
    write_state(state)
def sync(  # noqa: WPS210, WPS213
    wp: WordPressSupportForums,
    catalog: Catalog,
) -> None:
    """Emit a schema and all rows for every selected stream.

    Arguments:
        wp {WordPressSupportForums} -- WordPressSupportForums client
        catalog {Catalog} -- Stream catalog
    """
    LOGGER.info('Sync')

    # A stream counts as selected when its schema file carries the
    # key-value pair "selected": true. The catalog is queried with an
    # empty state.
    for entry in catalog.get_selected_streams({}):
        stream_id = entry.tap_stream_id
        LOGGER.info(f'Syncing stream: {stream_id}')

        # The schema message goes out before any record of the stream.
        singer.write_schema(
            stream_name=stream_id,
            schema=entry.schema.to_dict(),
            key_properties=entry.key_properties,
        )

        # The client exposes one method per stream, named after the
        # stream id: e.g. the stream `mysql` is served by `wp.mysql`.
        fetch_rows: Callable = getattr(wp, stream_id)

        # Each yielded row is stamped with the current UTC time at the
        # moment it is written.
        for row in fetch_rows():
            extracted_at = datetime.now(timezone.utc)
            singer.write_record(
                stream_id,
                row,
                time_extracted=extracted_at,
            )
def sync(  # noqa: WPS210, WPS213
    twinfield: Twinfield,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    For each selected stream: mark it as currently syncing in the state,
    write its schema, then write every row produced by the matching
    client method and update the stream's bookmark from that row.

    Arguments:
        twinfield {Twinfield} -- Twinfield client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date (not read inside this function)
    """
    LOGGER.info('Sync')
    # The f prefix is required; without it the literal text "{state}"
    # is logged instead of the state itself.
    LOGGER.debug(f'Current state:\n{state}')

    # Only selected streams are synced, whether a stream is selected is
    # determined by whether the key-value: "selected": true is in the schema
    # file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )
        LOGGER.debug(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a method of the same name on the Twinfield
        # client, e.g. the stream `foo` is served by `twinfield.foo`.
        tap_data: Callable = getattr(twinfield, stream.tap_stream_id)

        # The tap_data method yields rows of data from the API.
        # The state of the stream is used as kwargs for the method,
        # e.g. if the state of the stream has a key 'start_date', it will
        # be passed as start_date='2021-01-01T00:00:00+0000'.
        for row in tap_data(**stream_state):
            # Write a row to the stream
            singer.write_record(
                stream.tap_stream_id,
                row,
                time_extracted=datetime.now(timezone.utc),
            )

            bookmark: Optional[str] = tools.get_bookmark_value(
                stream.tap_stream_id,
                row,
            )

            # Update bookmark
            tools.update_bookmark(stream, bookmark, state)

            # Push buffered output to stdout right away.
            # NOTE(review): flushing per row; confirm a per-stream flush
            # was not intended.
            sys.stdout.flush()