def sync(config, streams, state, catalog, assume_sorted=True, max_page=None,
         auth_method="basic", raw=False, filter_by_schema=True):
    """
    Sync the streams that were selected.

    - assume_sorted: Assume the data is sorted and stop the sync as soon as a
      record with an index/datetime/timestamp greater than the end value is
      detected.
    - max_page: Stop after this number of API calls has been made.
    - auth_method: HTTP auth method (basic, no_auth, digest)
    - raw: Output raw JSON records to stdout
    - filter_by_schema: When True, check the extracted records against the
      schema; undefined or mismatched fields are not written out.
    """
    start_process_at = datetime.datetime.now()
    remaining_streams = get_streams_to_sync(streams, state)
    selected_streams = get_selected_streams(remaining_streams, catalog)
    if len(selected_streams) < 1:
        raise Exception("No Streams selected, please check that you have a " +
                        "schema selected in your catalog")

    LOGGER.info("Starting sync. Will sync these streams: %s" %
                [stream.tap_stream_id for stream in selected_streams])

    for stream in selected_streams:
        LOGGER.info("%s Start sync" % stream.tap_stream_id)

        singer.set_currently_syncing(state, stream.tap_stream_id)
        if raw is False:
            singer.write_state(state)

        try:
            state = sync_rows(config, state, stream.tap_stream_id,
                              max_page=max_page,
                              auth_method=auth_method,
                              assume_sorted=assume_sorted,
                              raw_output=raw,
                              filter_by_schema=filter_by_schema)
        except Exception as e:
            LOGGER.critical(e)
            raise e

        bookmark_type = get_bookmark_type(config)
        last_update = state["bookmarks"][stream.tap_stream_id]["last_update"]
        if bookmark_type == "timestamp":
            last_update = str(last_update) + " (" + str(
                datetime.datetime.fromtimestamp(last_update)) + ")"
        LOGGER.info("%s End sync" % stream.tap_stream_id)
        LOGGER.info("%s Last record's %s: %s" %
                    (stream.tap_stream_id, bookmark_type, last_update))

    end_process_at = datetime.datetime.now()
    LOGGER.info("Completed sync at %s" % str(end_process_at))
    LOGGER.info("Process duration: " + str(end_process_at - start_process_at))
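# A minimal invocation sketch for the sync() above. The config/streams/catalog
# values shown here are illustrative placeholders (in a real tap they come from
# the CLI arguments and discovery), not values the function above prescribes.
config = {"url": "https://api.example.com/records?page={current_page}"}  # hypothetical config
streams = ["records"]                                                     # hypothetical stream list
state = {"bookmarks": {}}
catalog = None  # a singer Catalog built from discovery in a real run

sync(config, streams, state, catalog,
     assume_sorted=False,   # scan every page even if records arrive unsorted
     max_page=10,           # stop after 10 API calls
     auth_method="digest",  # HTTP digest auth
     raw=True,              # emit raw JSON records to stdout
     filter_by_schema=False)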
def do_sync(self):
    logger.debug('Starting sync')

    # resuming when currently_syncing within state
    resume_from_stream = False
    if self.state and 'currently_syncing' in self.state:
        resume_from_stream = self.state['currently_syncing']

    for stream in self.streams:
        stream.tap = self

        if resume_from_stream:
            if stream.schema == resume_from_stream:
                logger.info('Resuming from {}'.format(resume_from_stream))
                resume_from_stream = False
            else:
                logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                continue

        # stream state, from state/bookmark or start_date
        stream.set_initial_state(self.state, self.config['start_date'])

        # currently syncing
        if stream.state_field:
            set_currently_syncing(self.state, stream.schema)
            self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                               str(stream.initial_state))
            singer.write_state(self.state)

        # schema
        stream.write_schema()

        # paginate
        while stream.has_data():

            with singer.metrics.http_request_timer(stream.schema) as timer:
                try:
                    response = self.execute_stream_request(stream)
                except (ConnectionError, RequestException) as e:
                    raise e
                timer.tags[singer.metrics.Tag.http_status_code] = response.status_code

            self.validate_response(response)
            self.rate_throttling(response)
            stream.paginate(response)

            # records with metrics
            with singer.metrics.record_counter(stream.schema) as counter:
                with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                    for row in self.iterate_response(response):
                        row = stream.process_row(row)

                        if not row:  # in case of a non-empty response with an empty element
                            continue

                        row = optimus_prime.transform(row, stream.get_schema())
                        if stream.write_record(row):
                            counter.increment()
                        stream.update_state(row)

        # update state / bookmarking only when supported by stream
        if stream.state_field:
            self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                               str(stream.earliest_state))
            singer.write_state(self.state)

    # clear currently_syncing
    try:
        del self.state['currently_syncing']
    except KeyError as e:
        pass
    singer.write_state(self.state)
def update_currently_syncing(state, stream_name):
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)
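# A brief usage sketch for the helper above, assuming the singer-python package
# is installed and update_currently_syncing() is in scope. The state dict and
# stream name are illustrative, not taken from any particular tap.
import singer

state = {"bookmarks": {"users": {"updated_at": "2021-01-01T00:00:00Z"}}}

update_currently_syncing(state, "users")   # sets state["currently_syncing"] = "users" and emits a STATE message
update_currently_syncing(state, None)      # removes the key and emits a STATE message again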
def sync(  # noqa: WPS210, WPS213
    twinfield: Twinfield,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    Arguments:
        twinfield {Twinfield} -- Twinfield client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date
    """
    LOGGER.info('Sync')
    LOGGER.debug(f'Current state:\n{state}')

    # For every stream in the catalog: only selected streams are synced.
    # Whether a stream is selected is determined by whether the key-value
    # "selected": true is in the schema file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )

        LOGGER.debug(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a corresponding method on the Twinfield client:
        # the stream's tap_stream_id is looked up as an attribute of the client.
        tap_data: Callable = getattr(twinfield, stream.tap_stream_id)

        # The tap_data method yields rows of data from the API.
        # The state of the stream is used as kwargs for the method.
        # E.g. if the state of the stream has a key 'start_date', it will be
        # used in the method as start_date='2021-01-01T00:00:00+0000'
        for row in tap_data(**stream_state):

            # Write a row to the stream
            singer.write_record(
                stream.tap_stream_id,
                row,
                time_extracted=datetime.now(timezone.utc),
            )

            bookmark: Optional[str] = tools.get_bookmark_value(
                stream.tap_stream_id,
                row,
            )

            # Update bookmark
            tools.update_bookmark(stream, bookmark, state)

        sys.stdout.flush()
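# Illustrative only: a stand-in for a client method that the loop above would
# discover via getattr(twinfield, stream.tap_stream_id). The method name, the
# fields it yields, and the stream_state values are hypothetical; the point is
# the contract that the stream state dict is unpacked as keyword arguments.
from typing import Iterator


def general_ledger_details(start_date: str = '2021-01-01T00:00:00+0000') -> Iterator[dict]:
    """Yield rows newer than start_date, mirroring the tap_data contract."""
    yield {'id': '1', 'modified': start_date}


stream_state = {'start_date': '2021-06-01T00:00:00+0000'}
for row in general_ledger_details(**stream_state):
    print(row)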
def do_sync(client, config, catalog, state):
    """
    Translate metadata into a set of metrics and dimensions and call out
    to sync to generate the required reports.
    """
    selected_streams = catalog.get_selected_streams(state)

    for stream in selected_streams:
        # Transform state for this report to new format before proceeding
        state = clean_state_for_report(config, state, stream.tap_stream_id)

        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        singer.write_state(state)

        metrics = []
        dimensions = []
        mdata = metadata.to_map(stream.metadata)
        for field_path, field_mdata in mdata.items():
            if field_path == tuple():
                continue
            if field_mdata.get('inclusion') == 'unsupported':
                continue
            _, field_name = field_path
            if field_mdata.get('inclusion') == 'automatic' or \
               field_mdata.get('selected') or \
               (field_mdata.get('selected-by-default') and field_mdata.get('selected') is None):
                if field_mdata.get('behavior') == 'METRIC':
                    metrics.append(field_name)
                elif field_mdata.get('behavior') == 'DIMENSION':
                    dimensions.append(field_name)

        view_ids = get_view_ids(config)
        # NB: Resume from previous view for this report, dropping all
        # views before it to keep streams moving forward
        current_view = state.get('currently_syncing_view')
        if current_view:
            if current_view in view_ids:
                view_not_current = functools.partial(lambda cv, v: v != cv, current_view)
                view_ids = list(itertools.dropwhile(view_not_current, view_ids))
            else:
                state.pop('currently_syncing_view', None)

        reports_per_view = [{"profile_id": view_id,
                             "name": stream.stream,
                             "id": stream.tap_stream_id,
                             "metrics": metrics,
                             "dimensions": dimensions}
                            for view_id in view_ids]

        end_date = get_end_date(config)

        schema = stream.schema.to_dict()

        singer.write_schema(stream.stream, schema, stream.key_properties)

        for report in reports_per_view:
            state['currently_syncing_view'] = report['profile_id']
            singer.write_state(state)

            is_historical_sync, start_date = get_start_date(
                config, report['profile_id'], state, report['id'])

            sync_report(client, schema, report, start_date, end_date, state, is_historical_sync)

        state.pop('currently_syncing_view', None)
        singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def sync(config, state, catalog):
    # Any client required PARAMETERS to hit the endpoint
    client = IreckonuClient(config)

    run_id = int(time.time())
    pipeline_start = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
    pipeline_start_time = time.perf_counter()
    stream_comments = []
    total_records = 0

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            batch_start = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
            start_time = time.perf_counter()
            record_count = 0

            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info("Starting sync for stream: %s", tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(
                tap_stream_id,
                stream_schema,
                stream_obj.key_properties,
                stream.replication_key,
            )

            try:
                for record in stream_obj.sync(config["start_date"], config["hotel_codes"]):
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata
                    )
                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )
                    record_count += 1
                    total_records += 1

                # If there is a Bookmark or state based key to store
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    "Last Run Date",
                    datetime.strftime(datetime.today(), "%Y-%m-%d"),
                )
                singer.write_state(state)

                batch_stop = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
                AuditLogs.write_audit_log(
                    run_id=run_id,
                    stream_name=tap_stream_id,
                    batch_start=batch_start,
                    batch_end=batch_stop,
                    records_synced=record_count,
                    run_time=(time.perf_counter() - start_time),
                )
            except Exception as e:
                stream_comments.append(f"{tap_stream_id.upper()}: {e}")
                batch_stop = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
                AuditLogs.write_audit_log(
                    run_id=run_id,
                    stream_name=tap_stream_id,
                    batch_start=batch_start,
                    batch_end=batch_stop,
                    records_synced=record_count,
                    run_time=(time.perf_counter() - start_time),
                    comments=e,
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)

    # Comment out for local runs
    if config["slack_notifications"] == True:
        SlackMessenger.send_message(
            run_id=run_id,
            start_time=pipeline_start,
            run_time=(time.perf_counter() - pipeline_start_time),
            record_count=total_records,
            comments="\n".join(stream_comments),
        )
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id, 'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id, 'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id, collection_oplog_ts)

                full_table.sync_collection(client, stream, state, stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state, stream_projection)

        else:
            raise Exception("only FULL_TABLE, LOG_BASED, and INCREMENTAL replication "
                            "methods are supported (you passed {})".format(replication_method))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def _sync_stream(client, stream, transformer, config, state, catalog, **kwargs):
    record = kwargs.get('record', None)
    substreams = kwargs.get('substreams')

    tap_stream_id = stream.tap_stream_id
    stream_obj = stream()
    stream_catalog = catalog.get_stream(stream.tap_stream_id)
    replication_key = stream_obj.replication_key
    stream_schema = stream_catalog.schema.to_dict()
    stream_metadata = metadata.to_map(stream_catalog.metadata)
    replication_method = metadata.get(stream_metadata, (), 'replication-method')
    stream_obj.update_replication_method(replication_method)

    LOGGER.debug('Starting sync for stream: %s', tap_stream_id)

    state = singer.set_currently_syncing(state, tap_stream_id)
    singer.write_state(state)

    # Only write schema once
    if tap_stream_id not in schemas_written:
        singer.write_schema(tap_stream_id, stream_schema, stream_obj.key_properties,
                            stream.replication_key)
        schemas_written.append(tap_stream_id)

    start_date = singer.get_bookmark(state, tap_stream_id, replication_key, config['start_date'])
    offset = singer.get_bookmark(state, tap_stream_id, 'offset', 0)

    max_record_value = start_date

    for page, cursor in stream_obj.sync(client, config, state, record=record,
                                        start_date=start_date, offset=offset):
        for record in page:
            transformed_record = transformer.transform(record, stream_schema, stream_metadata)
            time_extracted = singer.utils.now()
            singer.write_record(tap_stream_id, transformed_record, time_extracted=time_extracted)

            if stream_obj.replication_method == 'INCREMENTAL':
                current_replication_value = deep_get(record, replication_key)
                if current_replication_value \
                        and current_replication_value > max_record_value:
                    max_record_value = current_replication_value

            if substreams:
                _sync_streams(client, substreams.values(), transformer, config,
                              state, catalog, record=record, start_date=start_date)

        state = singer.write_bookmark(state, tap_stream_id, 'offset', cursor)

        if stream_obj.replication_method == 'INCREMENTAL':
            state = singer.write_bookmark(state, tap_stream_id, replication_key, max_record_value)

        singer.write_state(state)

    state = singer.clear_bookmark(state, tap_stream_id, 'offset')
    singer.write_state(state)
def run_sync(cls, config, properties, arguments):
    streams = properties['streams']

    for stream in streams:
        table = re.sub('[^0-9a-zA-Z_]+', '_', stream['table_name']).lower()
        schema = stream['metadata']
        properties = {}

        # Clean field names
        for f_name in schema['properties'].keys():
            clean_f_name = re.sub('[^0-9a-zA-Z_]+', '_', f_name).lower()
            properties[clean_f_name] = schema['properties'][f_name]

        schema['properties'] = properties

        if table != 'relations' and schema['selected']:
            STATE.update(singer.set_currently_syncing(STATE, table))
            singer.write_state(STATE)

            response = Airtable.get_response(config['base_id'], schema["name"])

            if response.json().get('records'):
                response_records = response.json().get('records')
                clean_response_records = []

                for response_record in response_records:
                    fields = {}

                    # Clean field names
                    for f_name in response_record['fields'].keys():
                        clean_f_name = re.sub('[^0-9a-zA-Z_]+', '_', f_name).lower()
                        fields[clean_f_name] = response_record['fields'][f_name]

                    response_record['fields'] = fields
                    clean_response_records.append(response_record)

                records = JsonUtils.match_record_with_keys(schema,
                                                           clean_response_records,
                                                           config['remove_emojis'])

                singer.write_schema(table, schema, 'id')
                singer.write_records(table, records)

                offset = response.json().get("offset")

                while offset:
                    response = Airtable.get_response(config['base_id'], schema["name"], offset)

                    if response.json().get('records'):
                        records = JsonUtils.match_record_with_keys(schema,
                                                                   response.json().get('records'),
                                                                   config['remove_emojis'])

                    singer.write_records(table, records)
                    offset = response.json().get("offset")

    relations_table = {
        "name": "relations",
        "properties": {
            "id": {"type": ["null", "string"]},
            "relation1": {"type": ["null", "string"]},
            "relation2": {"type": ["null", "string"]}
        }
    }

    singer.write_schema('relations', relations_table, 'id')
    singer.write_records('relations', Relations.get_records())

    STATE.update(singer.set_currently_syncing(STATE, None))
    singer.write_state(STATE)

    LOGGER.info("Sync completed")
def do_sync(config, state, stream):
    singer.set_currently_syncing(state, stream.tap_stream_id)
    singer.write_state(state)

    client = bigquery.Client()
    metadata = stream.metadata[0]["metadata"]
    tap_stream_id = stream.tap_stream_id

    inclusive_start = True
    start_datetime = singer.get_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME)
    if start_datetime:
        if not config.get("start_always_inclusive"):
            inclusive_start = False
    else:
        start_datetime = config.get("start_datetime")
    start_datetime = dateutil.parser.parse(start_datetime).strftime(
        "%Y-%m-%d %H:%M:%S.%f")

    # Guard against an unset end_datetime so the keys dict below is always valid
    end_datetime = None
    if config.get("end_datetime"):
        end_datetime = dateutil.parser.parse(
            config.get("end_datetime")).strftime("%Y-%m-%d %H:%M:%S.%f")

    singer.write_schema(tap_stream_id, stream.schema.to_dict(), stream.key_properties)

    keys = {
        "table": metadata["table"],
        "columns": metadata["columns"],
        "datetime_key": metadata.get("datetime_key"),
        "start_datetime": start_datetime,
        "end_datetime": end_datetime
    }

    limit = config.get("limit", None)
    query = _build_query(keys, metadata.get("filters", []), inclusive_start, limit=limit)
    query_job = client.query(query)

    properties = stream.schema.properties
    last_update = start_datetime

    LOGGER.info("Running query:\n    %s" % query)

    extract_tstamp = datetime.datetime.utcnow()
    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)

    with metrics.record_counter(tap_stream_id) as counter:
        for row in query_job:
            record = {}
            for key in properties.keys():
                prop = properties[key]

                if key in [LEGACY_TIMESTAMP, EXTRACT_TIMESTAMP, BATCH_TIMESTAMP]:
                    continue

                if row[key] is None:
                    if prop.type[0] != "null":
                        raise ValueError("NULL value not allowed by the schema")
                    else:
                        record[key] = None
                elif prop.format == "date-time":
                    if type(row[key]) == str:
                        r = dateutil.parser.parse(row[key])
                    elif type(row[key]) == datetime.date:
                        r = datetime.datetime(year=row[key].year,
                                              month=row[key].month,
                                              day=row[key].day)
                    elif type(row[key]) == datetime.datetime:
                        r = row[key]
                    record[key] = r.isoformat()
                elif prop.type[1] == "string":
                    record[key] = str(row[key])
                elif prop.type[1] == "number":
                    record[key] = Decimal(row[key])
                elif prop.type[1] == "integer":
                    record[key] = int(row[key])
                else:
                    record[key] = row[key]

            if LEGACY_TIMESTAMP in properties.keys():
                record[LEGACY_TIMESTAMP] = int(round(time.time() * 1000))
            if EXTRACT_TIMESTAMP in properties.keys():
                record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

            singer.write_record(stream.stream, record)

            last_update = record[keys["datetime_key"]]
            counter.increment()

    state = singer.write_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME, last_update)

    singer.write_state(state)
def sync(config, state, catalog):
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(
                tap_stream_id,
                stream_schema,
                stream_obj.key_properties,
                stream.replication_key
            )

            start_time = singer.get_bookmark(state, tap_stream_id, replication_key, config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id, 'cursor')

            if tap_stream_id == 'shifts':
                replication_key = stream_obj.replication_key

                sync_start_bookmark = singer.get_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    singer.utils.strftime(
                        singer.utils.now(),
                        format_str=singer.utils.DATETIME_PARSE)
                )
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    sync_start_bookmark,
                )

                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        if record[replication_key] >= start_time:
                            transformed_record = transformer.transform(
                                record, stream_schema, stream_metadata,
                            )
                            singer.write_record(
                                tap_stream_id,
                                transformed_record,
                            )
                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    singer.write_state(state)

                state = singer.clear_bookmark(state, tap_stream_id, 'sync_start')
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    replication_key,
                    sync_start_bookmark,
                )
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                replication_key = stream_obj.replication_key
                max_record_value = start_time

                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                        singer.write_record(
                            tap_stream_id,
                            transformed_record,
                        )
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[replication_key]

                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id, replication_key, max_record_value)
                    singer.write_state(state)

            else:
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )

            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def do_sync(self):
    logger.debug('Starting sync')

    # resuming when currently_syncing within state
    resume_from_stream = False
    if self.state and 'currently_syncing' in self.state:
        resume_from_stream = self.state['currently_syncing']

    for stream in self.streams:
        stream.tap = self

        if resume_from_stream:
            if stream.schema == resume_from_stream:
                logger.info('Resuming from {}'.format(resume_from_stream))
                resume_from_stream = False
            else:
                logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                continue

        # stream state, from state/bookmark or start_date
        stream.set_initial_state(self.state, self.config['start_date'])

        # currently syncing
        if stream.state_field:
            set_currently_syncing(self.state, stream.schema)
            self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                               str(stream.initial_state))
            singer.write_state(self.state)

        # schema
        stream.write_schema()

        if stream.id_list:
            # see if we want to iterate over a list of deal_ids
            for deal_id in stream.get_deal_ids(self):
                is_last_id = False
                if deal_id == stream.these_deals[-1]:
                    # find out if this is last deal_id in the current set
                    is_last_id = True

                # if last page of deals, more_items in collection will be False
                # Need to set it to True to get deal_id pagination for the first deal on the last page
                if deal_id == stream.these_deals[0]:
                    stream.more_items_in_collection = True

                stream.update_endpoint(deal_id)
                stream.start = 0  # set back to zero for each new deal_id
                self.do_paginate(stream)

                if not is_last_id:
                    # set back to True for pagination of next deal_id request
                    stream.more_items_in_collection = True
                elif is_last_id and stream.more_ids_to_get:
                    # need to get the next batch of deal_ids
                    stream.more_items_in_collection = True
                    stream.start = stream.next_start
                else:
                    stream.more_items_in_collection = False

            # set the attribution window so that the bookmark will reflect the new initial_state for the next sync
            stream.earliest_state = stream.stream_start.subtract(hours=3)
        else:
            # paginate
            self.do_paginate(stream)

        # update state / bookmarking only when supported by stream
        if stream.state_field:
            self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                               str(stream.earliest_state))
            singer.write_state(self.state)

    # clear currently_syncing
    try:
        del self.state['currently_syncing']
    except KeyError as e:
        pass
    singer.write_state(self.state)
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = Client(config)

    # Check if there are existing bookmarks, if not create a new one
    state['bookmarks'] = state.get('bookmarks', {})

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']
        report_definition = Report.get_report_definition(stream)

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (), "table-key-properties")

        if stream_id in selected_stream_ids:
            start_date = utils.strptime_to_utc(
                get_bookmark(
                    state,
                    stream_id,
                    'last_report_date',
                    default=config['start_date'].strftime('%Y-%m-%d')))
            start_date = start_date - timedelta(days=config.get('lookback_days', 15))
            end_date = config['end_date']
            date_interval = config['date_batching']
            segment_id = config.get('segment_id', None)

            LOGGER.info(f'Syncing stream: {stream_id}')
            LOGGER.info(
                f'Will sync data from {start_date.isoformat()} until {end_date.isoformat()}')

            # Sets the currently syncing stream in state
            singer.set_currently_syncing(state, stream_id)

            # Writes the schema for the current stream
            singer.write_schema(stream_id, stream_schema, key_properties)

            for start_date, end_date in batch_report_dates(start_date, end_date, date_interval):
                LOGGER.info(
                    f'Request for {start_date.isoformat()} to {end_date.isoformat()} started.')
                start = timer()
                try:
                    results = client.process_stream(start_date, end_date,
                                                    report_definition, segment_id)

                    # Writes individual items from results array as records
                    singer.write_records(stream_id, results)

                    # Updates the stream bookmark with the latest report timestamp
                    singer.write_bookmark(state, stream_id, 'last_report_date',
                                          end_date.strftime("%Y-%m-%d"))
                    singer.write_state(state)
                except GaInvalidArgumentError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to invalid report definition.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaRateLimitError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Rate Limit Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaQuotaExceededError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Quota Exceeded Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaAuthenticationError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Authentication Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                except GaUnknownError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Unknown Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                end = timer()
                LOGGER.info(
                    f'Request for {start_date.isoformat()} to {end_date.isoformat()} finished in {(end - start):.2f}.')

            singer.set_currently_syncing(state, '')
            singer.write_state(state)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [replication_key])

                for message in incremental.sync_table(con, catalog_entry, state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication", catalog_entry.stream)

                log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
                log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

                yield generate_schema_message(catalog_entry, key_properties, [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(catalog_entry, columns)

                    for message in binlog.sync_table(con, config, catalog_entry, state, columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(con)

                    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version',
                                                  stream_version)

                    for message in full_table.sync_table(con, catalog_entry, state,
                                                         columns, stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file',
                                                  log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos',
                                                  log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [])

                stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))