def setup_tap_from_state(self):
    '''
    Resume the current stream from its bookmark in the state, when there is one.

    When the state holds a bookmark for `self.stream_id`, `self.start_record_id` is set to it
    and a "resuming" message is logged. When the bookmark lookup raises KeyError, a plain
    "syncing" message is logged instead.
    '''
    try:
        # A missing key at any level of this lookup means there is no bookmark to resume from
        stream_bookmarks = self.state['bookmarks'][self.stream_id]
        self.start_record_id = stream_bookmarks[TYPO_RECORD_ID_PROPERTY]

        if self.record_limit and self.record_limit != OPTION_DISABLED:
            limit_suffix = ' / Remaining Record Limit: `{}`'.format(
                self.record_limit)
        else:
            limit_suffix = ''

        resume_template = (
            'Syncing stream `{}`. Resuming from provided state file. Start Typo record id: {}. '
            'Records per page: {}{}.')
        log_info(
            resume_template.format(self.stream_id, self.start_record_id,
                                   self.records_per_page, limit_suffix))
    except KeyError:
        # Only mention the record limit when one has been configured
        if self.record_limit != OPTION_DISABLED:
            limit_suffix = ' Record limit: {}.'.format(self.record_limit)
        else:
            limit_suffix = ''

        log_info('Syncing stream `{}`. Records per page: {}.{}'.format(
            self.stream_id, self.records_per_page, limit_suffix))
def __init__(self, config, state=None, catalog=None):
    '''
    Build the tap from its config, an optional state and an optional catalog.

    `config` must contain `cluster_api_endpoint`, `api_key` and `api_secret`; every other
    key read here is optional. When no catalog is provided, one is obtained through
    `self.get_catalog()`.
    '''
    self.config = config.copy()
    self.state = state.copy() if state else {}
    self.token = None

    # Required connection settings
    self.base_url = config['cluster_api_endpoint']
    self.api_key = config['api_key']
    self.api_secret = config['api_secret']

    # Optional stream selection settings (audit_id is optional)
    self.repository = config.get('repository')
    self.dataset = config.get('dataset')
    self.audit_id = config.get('audit_id')

    # Optional paging / output settings and their defaults
    self.start_record_id = OPTION_DISABLED
    self.records_per_page = config.get('records_per_page', 100)
    self.record_limit = config.get('record_limit', OPTION_DISABLED)
    self.output_rfc3339_datetime = config.get('output_rfc3339_datetime',
                                              False)

    # Stream properties, filled in when a stream starts syncing
    self.key_properties = None
    self.schema = None
    self.stream_id = None

    # The catalog is resolved last, once all the settings above are in place
    if not catalog:
        log_info('Discovering catalog')
        self.catalog = self.get_catalog()
    else:
        log_info('Loading catalog from provided file')
        self.catalog = catalog
def get_page(self, repository, dataset, audit_id, page_number):
    '''
    Request a single page of results from the Typo API.

    Results are read from the audit endpoint when `audit_id` is not None and from the
    dataset endpoint otherwise. When a start record id is set, only records with a
    greater Typo id are requested.

    Returns a `(data, eof)` tuple, where `eof` is True when the response headers do not
    advertise a next page. Logs the response message and exits with status 1 when the
    response status is not 200.
    '''
    log_info('Fetching page {}.'.format(page_number))

    if audit_id is None:
        results_url = '{}/repositories/{}/datasets/{}/results'.format(
            self.base_url, repository, dataset)
    else:
        results_url = '{}/repositories/{}/datasets/{}/audits/{}/results'.format(
            self.base_url, repository, dataset, audit_id)

    # Only filter by record id when resuming from a bookmark
    bookmark_filter = ('&__typo_id=gt:{}'.format(self.start_record_id)
                       if self.start_record_id != OPTION_DISABLED else '')

    request_url = '{}?per_page={}&page={}{}'.format(
        results_url, self.records_per_page, page_number, bookmark_filter)
    status, headers, data = self.api_get_request(request_url)

    if status != 200:
        log_error(data['message'])
        sys.exit(1)

    # The last page is the one whose Link header has no rel="next" entry
    eof = 'Link' not in headers or '; rel="next"' not in headers['Link']

    return data, eof
def main():
    '''
    Entry point: parse the command line arguments and run the tap in Discover or Sync mode.

    Exits with status 1 when the arguments cannot be parsed.
    '''
    try:
        args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    except Exception as exception:  # pylint: disable=W0703
        log_critical(exception)
        sys.exit(1)

    config = args.config

    log_info('Starting in Discover Mode.'
             if args.discover else 'Starting in Sync Mode.')

    provided_catalog = args.catalog.to_dict() if args.catalog else None
    tap = TapTypo(catalog=provided_catalog, config=config, state=args.state)

    if args.discover:
        tap.discover()
        log_info('Discover Mode completed.')
        return

    # Catalog mode is enabled whenever a catalog was passed on the command line
    tap.sync(args.catalog is not None)
    log_info('Sync Mode completed.')
def sync(self, catalog_mode=False):
    '''
    Sync streams from the catalog, fetching their data from Typo and sending it to stdout.

    In catalog mode, every stream of the catalog whose `tap_stream_id` is among the ones
    returned by `self.get_selected_streams()` is synced; the others are skipped with a log
    message.

    Outside catalog mode, the single stream identified by the `repository`, `dataset` and
    (optional) `audit_id` config parameters is synced. Nothing is synced, and a message is
    logged, when the repository or the dataset is missing or when no stream in the catalog
    matches them.
    '''
    if catalog_mode:
        selected_streams = self.get_selected_streams()

        for stream in self.catalog['streams']:
            tap_stream_id = stream['tap_stream_id']

            if tap_stream_id not in selected_streams:
                log_info(
                    'Skipped stream `{}`: stream not selected for syncing.'
                    .format(tap_stream_id))
                continue

            self.sync_stream(stream)

        return

    # Both the repository and the dataset are needed to identify a stream, so bail out
    # as soon as either of them is missing rather than building a stream id out of None.
    if not self.repository or not self.dataset:
        log_info(
            'Nothing to do as not running in catalog mode and repository, dataset and/or audit_id weren\'t specified in the config file.'
        )
        return

    stream_id = get_tap_stream_id(self.repository, self.dataset,
                                  self.audit_id)

    # First stream of the catalog matching the configured parameters, if any
    matching_stream = next(
        (stream for stream in self.catalog['streams']
         if stream['tap_stream_id'] == stream_id), None)

    if matching_stream is None:
        log_info(
            'Nothing do to. Cannot find a stream for the provided repository, dataset and audit_id config parameters.'
        )
        return

    self.sync_stream(matching_stream)
def sync_stream(self, stream):
    '''
    Sync a single stream of the catalog.

    Writes the current state and the stream schema, then fetches the stream results page by
    page. For every result, a record message is written followed by a state message
    bookmarking the record id. Syncing stops when the last page has been processed or when
    the configured record limit is reached.

    When `output_rfc3339_datetime` is enabled, fields whose metadata declares a
    `datetime-format` are parsed with that format and re-written as RFC 3339. Fields that
    are absent or null in a record are left as they are.
    '''
    self.key_properties = stream['key_properties']
    self.schema = stream['schema']
    self.stream_id = stream['tap_stream_id']
    self.start_record_id = OPTION_DISABLED

    stream_metadata = singer.metadata.to_map(stream['metadata'])
    repository = singer.metadata.get(stream_metadata, (), 'repository')
    dataset = singer.metadata.get(stream_metadata, (), 'dataset')
    audit_id = singer.metadata.get(stream_metadata, (), 'audit_id')

    self.setup_tap_from_state()

    # Output state and schema
    singer.write_state(self.state)
    singer.write_schema(self.stream_id,
                        self.schema,
                        self.key_properties,
                        bookmark_properties=BOOKMARK_PROPERTIES)

    # Get the fields that will need rfc3339 transformations, as {field name: datetime format}.
    # This stays empty when the option is disabled, so no per-record check is needed later.
    rfc3339_fields_format = {}
    if self.output_rfc3339_datetime:
        # NOTE: Non-empty tuples represent field metadata; the field name is their second item
        rfc3339_fields_format = {
            field_path[1]: field_metadata['datetime-format']
            for field_path, field_metadata in stream_metadata.items()
            if field_path and 'datetime-format' in field_metadata
        }

    eof = False
    record_count = 0
    record_limit_reached = False
    page_number = 1

    while not eof and not record_limit_reached:
        data, eof = self.get_page(repository, dataset, audit_id, page_number)

        for record in data['data']['records']:
            record_count += 1

            # Inserting output results from Typo
            record_data = record['record']
            record_data['__typo_result'] = (
                'Error' if record['has_errors'] else 'OK')
            record_data[TYPO_RECORD_ID_PROPERTY] = record['id']

            # Transform the datetime fields into rfc3339
            for field_name, field_format in rfc3339_fields_format.items():
                original_value = record_data.get(field_name)

                # There is nothing to convert when the record has no value for the field
                if original_value is None:
                    continue

                parsed_datetime = datetime.strptime(original_value,
                                                    field_format)
                record_data[field_name] = rfc3339(parsed_datetime)

            # Output record
            singer.write_record(self.stream_id, record_data)

            # Bookmark the record right after it has been written
            self.state = singer.write_bookmark(self.state, self.stream_id,
                                               TYPO_RECORD_ID_PROPERTY,
                                               record['id'])
            singer.write_state(self.state)

            if (self.record_limit != OPTION_DISABLED
                    and record_count == self.record_limit):
                record_limit_reached = True
                log_info(
                    'Record limit reached. Finishing syncing for stream `{}`.'
                    .format(self.stream_id))
                break

        page_number += 1

    if eof:
        log_info(
            'Finished syncing all available data for stream `{}`.'.format(
                self.stream_id))