Пример #1
0
    def setup_tap_from_state(self):
        '''
        Looks into the state for a bookmark corresponding to the current stream, if found,
        it sets up the tap to start from the bookmark.

        The bookmark is read from
        `self.state['bookmarks'][self.stream_id][TYPO_RECORD_ID_PROPERTY]` and
        stored in `self.start_record_id`. When no bookmark is found,
        `self.start_record_id` is left untouched. In both cases a message
        describing the sync settings is logged.
        '''
        # A falsy record limit (e.g. 0 or None) or OPTION_DISABLED means that
        # no limit is reported, consistently in both log messages below.
        has_record_limit = (bool(self.record_limit)
                            and self.record_limit != OPTION_DISABLED)

        try:
            # Only the lookup is guarded, so that a KeyError raised anywhere
            # else is never mistaken for a missing bookmark.
            bookmark = self.state['bookmarks'][self.stream_id][
                TYPO_RECORD_ID_PROPERTY]
        except KeyError:
            log_info('Syncing stream `{}`. Records per page: {}.{}'.format(
                self.stream_id, self.records_per_page,
                ' Record limit: {}.'.format(self.record_limit)
                if has_record_limit else ''))
            return

        self.start_record_id = bookmark

        record_limit_log_message = ''
        if has_record_limit:
            record_limit_log_message = ' / Remaining Record Limit: `{}`'.format(
                self.record_limit)

        log_info((
            'Syncing stream `{}`. Resuming from provided state file. Start Typo record id: {}. '
            + 'Records per page: {}{}.').format(self.stream_id,
                                                self.start_record_id,
                                                self.records_per_page,
                                                record_limit_log_message))
Пример #2
0
    def __init__(self, config, state=None, catalog=None):
        '''
        Builds the tap from its configuration.

        `config` provides the connection settings and the optional sync
        options, `state` is the optional state to resume from and `catalog`
        is the optional catalog to use. When no catalog is given, it is
        obtained from `self.get_catalog()`.
        '''
        self.config = config.copy()
        self.state = {} if not state else state.copy()
        self.token = None

        # Mandatory connection settings: a missing key raises KeyError
        self.base_url = config['cluster_api_endpoint']
        self.api_key = config['api_key']
        self.api_secret = config['api_secret']

        # Optional stream selection (audit_id is optional)
        self.repository = config.get('repository')
        self.dataset = config.get('dataset')
        self.audit_id = config.get('audit_id')

        # Sync options, with their defaults
        self.start_record_id = OPTION_DISABLED
        self.records_per_page = config.get('records_per_page', 100)
        self.record_limit = config.get('record_limit', OPTION_DISABLED)
        self.output_rfc3339_datetime = config.get('output_rfc3339_datetime',
                                                  False)

        # Stream properties, filled in when a stream is being synced
        self.key_properties = None
        self.schema = None
        self.stream_id = None

        if not catalog:
            log_info('Discovering catalog')
            self.catalog = self.get_catalog()
        else:
            log_info('Loading catalog from provided file')
            self.catalog = catalog
Пример #3
0
    def get_page(self, repository, dataset, audit_id, page_number):
        '''
        Fetches one page of results from the Typo API

        The results of the given audit are requested when `audit_id` is not
        None, otherwise the results of the dataset are requested. When
        `self.start_record_id` is not OPTION_DISABLED, only records with a
        Typo id greater than it are requested.

        Returns a `(data, eof)` tuple, where `data` is the payload returned by
        `self.api_get_request` and `eof` is True when the `Link` response
        header does not advertise a next page.

        Logs an error and exits the program with status 1 when the response
        status is not 200.
        '''
        log_info('Fetching page {}.'.format(page_number))

        results_url = '{}/repositories/{}/datasets/{}'.format(
            self.base_url, repository, dataset)
        if audit_id is not None:
            results_url += '/audits/{}'.format(audit_id)
        results_url += '/results'

        # Get request
        query = 'per_page={}&page={}'.format(self.records_per_page,
                                             page_number)
        if self.start_record_id != OPTION_DISABLED:
            query += '&__typo_id=gt:{}'.format(self.start_record_id)

        status, headers, data = self.api_get_request('{}?{}'.format(
            results_url, query))

        # Check Status
        if status != 200:
            # The error payload may not be a mapping with a 'message' entry;
            # fall back to a generic message instead of failing on the lookup.
            try:
                error_message = data['message']
            except (KeyError, TypeError, IndexError):
                error_message = (
                    'Request to the Typo API failed with status code {}.'
                    .format(status))
            log_error(error_message)
            sys.exit(1)

        has_next_page = 'Link' in headers and '; rel="next"' in headers['Link']
        return data, not has_next_page
Пример #4
0
def main():
    '''
    Program entry point: parses the command line arguments, builds the tap
    and runs it in either discover or sync mode.

    Exits with status 1 when the arguments cannot be parsed.
    '''
    # Parse command line arguments
    try:
        args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    except Exception as exception:  # pylint: disable=W0703
        log_critical(exception)
        sys.exit(1)

    config = args.config
    discover_mode = args.discover

    log_info('Starting in Discover Mode.'
             if discover_mode else 'Starting in Sync Mode.')

    catalog_dict = args.catalog.to_dict() if args.catalog else None
    tap = TapTypo(catalog=catalog_dict, config=config, state=args.state)

    if discover_mode:
        tap.discover()
        log_info('Discover Mode completed.')
        return

    # Catalog mode is enabled as soon as a catalog was provided
    tap.sync(args.catalog is not None)
    log_info('Sync Mode completed.')
Пример #5
0
    def sync(self, catalog_mode=False):
        '''
        Parse every stream in the catalog, fetch data from Typo and send to stdout

        In catalog mode, every stream of the catalog that is returned by
        `self.get_selected_streams()` is synced and the other ones are
        skipped. Otherwise, the single stream identified by the repository,
        dataset and audit_id of the configuration is synced, if the catalog
        contains it.
        '''
        if catalog_mode:
            selected_streams = self.get_selected_streams()
            for stream in self.catalog['streams']:
                if stream['tap_stream_id'] not in selected_streams:
                    log_info(
                        'Skipped stream `{}`: stream not selected for syncing.'
                        .format(stream['tap_stream_id']))
                    continue
                self.sync_stream(stream)
            return

        # Both the repository and the dataset are needed to identify a stream
        # (only audit_id is optional), so stop as soon as either is missing.
        if not self.repository or not self.dataset:
            log_info(
                'Nothing to do as not running in catalog mode and repository, dataset and/or audit_id weren\'t specified in the config file.'
            )
            return

        stream_id = get_tap_stream_id(self.repository, self.dataset,
                                      self.audit_id)

        # First catalog stream matching the configured stream id, if any
        matching_stream = next(
            (stream for stream in self.catalog['streams']
             if stream['tap_stream_id'] == stream_id), None)

        if matching_stream is None:
            log_info(
                'Nothing do to. Cannot find a stream for the provided repository, dataset and audit_id config parameters.'
            )
            return

        self.sync_stream(matching_stream)
Пример #6
0
    def sync_stream(self, stream):
        '''
        Syncs a single stream of the catalog.

        Writes the current state and the stream schema, then fetches the
        stream results page by page. Every record is written together with its
        Typo result (`Error` or `OK`) and its Typo record id, and the bookmark
        in the state is updated and written after each record.

        `stream` is a catalog entry providing `key_properties`, `schema`,
        `tap_stream_id` and `metadata`. The repository, dataset and audit_id
        to query are read from the top-level metadata of the stream.

        Syncing stops after the last page or once `self.record_limit` records
        have been written, when a record limit is enabled.
        '''
        self.key_properties = stream['key_properties']
        self.schema = stream['schema']
        self.stream_id = stream['tap_stream_id']
        # Reset the starting point so that a bookmark loaded for a previously
        # synced stream is not reused for this one.
        self.start_record_id = OPTION_DISABLED

        stream_metadata = singer.metadata.to_map(stream['metadata'])
        repository = singer.metadata.get(stream_metadata, (), 'repository')
        dataset = singer.metadata.get(stream_metadata, (), 'dataset')
        audit_id = singer.metadata.get(stream_metadata, (), 'audit_id')

        self.setup_tap_from_state()

        # Output state and schema
        singer.write_state(self.state)
        singer.write_schema(self.stream_id,
                            self.schema,
                            self.key_properties,
                            bookmark_properties=BOOKMARK_PROPERTIES)

        # Get the fields that will need rfc3339 transformations, mapped to the
        # datetime format of their original values.
        rfc3339_fields_format = {}
        if self.output_rfc3339_datetime:
            # NOTE: Only non-empty tuples represent field metadata; the field
            # name is the second item of the tuple.
            rfc3339_fields_format = {
                field_path[1]: field_metadata['datetime-format']
                for field_path, field_metadata in stream_metadata.items()
                if field_path and 'datetime-format' in field_metadata
            }

        record_limit_enabled = self.record_limit != OPTION_DISABLED

        eof = False
        record_count = 0
        record_limit_reached = False
        page_number = 1

        while not eof and not record_limit_reached:
            data, eof = self.get_page(repository, dataset, audit_id,
                                      page_number)

            for record in data['data']['records']:
                record_count += 1

                # Inserting output results from Typo
                record_data = record['record']
                record_data['__typo_result'] = (
                    'Error' if record['has_errors'] else 'OK')
                record_data[TYPO_RECORD_ID_PROPERTY] = record['id']

                # Transform the selected fields into rfc3339. The mapping is
                # empty when the rfc3339 output is disabled.
                for field_name, field_format in rfc3339_fields_format.items():
                    original_value = record_data.get(field_name)
                    if original_value is None:
                        # A missing or null value has nothing to convert and
                        # must not abort the sync.
                        continue
                    parsed_datetime = datetime.strptime(
                        original_value, field_format)
                    record_data[field_name] = rfc3339(parsed_datetime)

                # Output record
                singer.write_record(self.stream_id, record_data)

                # Bookmark the record that was just written
                self.state = singer.write_bookmark(self.state, self.stream_id,
                                                   TYPO_RECORD_ID_PROPERTY,
                                                   record['id'])
                singer.write_state(self.state)

                if record_limit_enabled and record_count == self.record_limit:
                    record_limit_reached = True
                    log_info(
                        'Record limit reached. Finishing syncing for stream `{}`.'
                        .format(self.stream_id))
                    break

            page_number += 1

        if eof:
            log_info(
                'Finished syncing all available data for stream `{}`.'.format(
                    self.stream_id))