Example 1
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {
            'type': 'string'
        },
        '_s3_source_file': {
            'type': 'string'
        },
        '_s3_source_lineno': {
            'type': 'integer'
        },
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
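
For reference, a minimal sketch of what this function returns, assuming the sampled files expose a single column named id that is inferred as an integer (the column name and its type are illustrative, not taken from the source):

# Hypothetical return value, assuming one sampled column "id" inferred as integer.
example_schema = {
    'type': 'object',
    'properties': {
        'id': {'type': 'integer'},                 # inferred from the sampled rows
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    },
}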
Example 2
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # rows are zero-indexed and line 1 is the header, hence +2
            '_s3_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_name, to_write)
        except BrokenPipeError as bpe:
            logger.error(
                f'Pipe to loader broke after {records_synced} records were written from {s3_file}: troubled line was {row}'
            )
            raise bpe

        records_synced += 1

    return records_synced
Example 3
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # rows are zero-indexed and line 1 is the header, hence +2
            '_s3_source_lineno': records_synced + 2
        }

        to_write = [{**conversion.convert_row(row, schema), **metadata}]
        singer.write_records(table_name, to_write)
        records_synced += 1

    return records_synced
Example 4
def list_files_in_bucket(config, bucket, search_prefix=None):
    s3_client = boto3.client('s3')

    s3_objects = []

    max_results = 1000
    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    result = s3_client.list_objects_v2(**args)

    # 'Contents' is absent when the listing returns no objects.
    s3_objects += result.get('Contents', [])
    next_continuation_token = result.get('NextContinuationToken')

    while next_continuation_token is not None:
        logger.debug('Continuing pagination with token "{}".'.format(
            next_continuation_token))

        continuation_args = args.copy()
        continuation_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**continuation_args)

        s3_objects += result.get('Contents', [])
        next_continuation_token = result.get('NextContinuationToken')

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
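
boto3 also ships a built-in paginator that replaces the hand-rolled ContinuationToken loop above; a minimal alternative sketch using it (an option, not the code the tap itself uses):

import boto3

def list_files_in_bucket_paginated(bucket, search_prefix=None):
    # Same listing as above, but boto3's paginator handles continuation tokens.
    s3_client = boto3.client('s3')
    args = {'Bucket': bucket}
    if search_prefix is not None:
        args['Prefix'] = search_prefix

    s3_objects = []
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(**args):
        # 'Contents' is absent on pages with no objects.
        s3_objects += page.get('Contents', [])
    return s3_objects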
Example 5
def do_sync(args):
    logger.info('Starting sync.')

    config = tap_s3_csv.config.load(args.config)
    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    logger.info('Done syncing.')
Example 6
def load(filename):
    config = {}

    try:
        with open(filename) as handle:
            config = json.load(handle)
    except Exception:
        logger.fatal("Failed to decode config file. Is it valid json?")
        raise RuntimeError

    CONFIG_CONTRACT(config)

    return config
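
A hypothetical minimal config covering the keys read elsewhere in these examples (bucket, start_date, and a tables list with name, pattern, key_properties and an optional search_prefix); CONFIG_CONTRACT may require additional fields, and every value below is illustrative:

example_config = {
    "bucket": "my-source-bucket",
    "start_date": "2021-01-01T00:00:00Z",
    "tables": [
        {
            "name": "orders",
            "pattern": "exports/orders-.*\\.csv",
            "key_properties": ["id"],
            "search_prefix": "exports/",   # optional; narrows the S3 listing
        }
    ],
}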
Example 7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-c', '--config', help='Config file', required=True)
    parser.add_argument('-s', '--state', help='State file')

    args = parser.parse_args()

    try:
        do_sync(args)
    except RuntimeError:
        logger.fatal("Run failed.")
        exit(1)
Example 8
def load_state(filename):
    state = {}

    if filename is None:
        return state

    try:
        with open(filename) as handle:
            state = json.load(handle)
    except Exception:
        logger.fatal("Failed to decode state file. Is it valid json?")
        raise RuntimeError

    return state
Example 9
def get_input_files_for_table(config, table_spec, modified_since=None):
    bucket = config['bucket']

    to_return = []
    pattern = table_spec['pattern']
    matcher = re.compile(pattern)

    logger.debug('Checking bucket "{}" for keys matching "{}"'.format(
        bucket, pattern))

    s3_objects = list_files_in_bucket(config, bucket,
                                      table_spec.get('search_prefix'))

    for s3_object in s3_objects:
        key = s3_object['Key']
        last_modified = s3_object['LastModified']

        logger.debug('Last modified: {}'.format(last_modified))

        if (matcher.search(key) and
            (modified_since is None or modified_since < last_modified)):
            logger.debug('Will download key "{}"'.format(key))
            to_return.append({'key': key, 'last_modified': last_modified})
        else:
            logger.debug('Will not download key "{}"'.format(key))

    to_return = sorted(to_return, key=lambda item: item['last_modified'])

    return to_return
Example 10
def convert_row(row, schema):
    to_return = {}

    for key, value in row.items():
        field_schema = schema['properties'][key]
        datatype = field_schema.get('_conversion_type', 'string')

        logger.debug('Converting {} value {} to {}'.format(
            key, value, datatype))
        converted, _ = convert(value, datatype)

        to_return[key] = converted

    return to_return
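
The convert helper called above is not shown in these examples; the following is only an assumed sketch of its contract as inferred from the call site, mapping a (value, datatype) pair to (converted_value, success_flag), and not the actual tap_s3_csv implementation:

def convert(value, datatype):
    # Hypothetical sketch: convert a raw CSV string to the target type and
    # report whether the conversion succeeded.
    try:
        if datatype == 'integer':
            return int(value), True
        if datatype == 'number':
            return float(value), True
        return str(value), True
    except (TypeError, ValueError):
        # Fall back to the raw value when conversion fails.
        return value, False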
Example 11
def load(filename):
    config = {}

    try:
        with open(filename) as handle:
            config = json.load(handle)
            # 'tables' may be supplied as a JSON-encoded string; decode it if so.
            if isinstance(config['tables'], str):
                config['tables'] = json.loads(config['tables'])
    except Exception:
        logger.fatal("Failed to decode config file. Is it valid json?")
        raise RuntimeError

    CONFIG_CONTRACT(config)

    return config
Example 12
def do_sync(args):
    logger.info('Starting sync.')

    meltano_config = load_json_file(args.config)
    bucket_files_definition = meltano_config.get("bucket_files_definition",
                                                 None)
    if bucket_files_definition:
        if os.path.isfile(bucket_files_definition):
            config = tap_s3_csv.config.load(bucket_files_definition)
        else:
            logger.error("tap_s3_csv: '{}' file not found".format(
                bucket_files_definition))
            exit(1)
    else:
        check_config(CONFIG, REQUIRED_CONFIG_KEYS)
        csv_files = CONFIG['files']

    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    state = {'COMPLETED': True}
    singer.write_state(state)

    logger.info('Done syncing.')
Example 13
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    logger.info('Sampling {} ({} records, every {}th record).'.format(
        s3_path, max_records, sample_rate))

    samples = []

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_path)

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    logger.info('Sampled {} records.'.format(len(samples)))

    return samples
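
The sampling rule in isolation: with sample_rate=5 and max_records=3, the loop keeps rows 0, 5 and 10 and then stops. A self-contained sketch of the same arithmetic (the range stands in for the row iterator):

rows = range(100)                       # stand-in for the row iterator
sample_rate, max_records = 5, 3
samples = [row for i, row in enumerate(rows) if i % sample_rate == 0][:max_records]
assert samples == [0, 5, 10]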
Example 14
def sync_table(config, state, table_spec):
    table_name = table_spec['name']
    modified_since = dateutil.parser.parse(
        state.get(table_name, {}).get('modified_since') or
        config['start_date'])

    logger.info('Syncing table "{}".'.format(table_name))
    logger.info('Getting files modified since {}.'.format(modified_since))

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    logger.info('Found {} files to be synced.'
                .format(len(s3_files)))

    if not s3_files:
        return state

    inferred_schema = get_sampled_schema_for_table(config, table_spec)
    override_schema = {'properties': table_spec.get('schema_overrides', {})}
    schema = merge_dicts(
        inferred_schema,
        override_schema)

    singer.write_schema(
        table_name,
        schema,
        key_properties=table_spec['key_properties'])

    records_streamed = 0

    for s3_file in s3_files:
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, schema)

        state[table_name] = {
            'modified_since': s3_file['last_modified'].isoformat()
        }

        singer.write_state(state)

    logger.info('Wrote {} records for table "{}".'
                .format(records_streamed, table_name))

    return state
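
A small sketch of the state this loop emits after each file, assuming a single table named orders (table name and timestamp are illustrative); on the next run, sync_table parses modified_since back with dateutil and only downloads files modified after it:

example_state = {
    'orders': {
        # LastModified of the last file synced, in ISO 8601 form
        'modified_since': '2021-06-01T12:34:56+00:00',
    },
}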
Example 15
def retry_handler(details):
    logger.info("Received retryable error -- Retry %s/%s",
                details['tries'], MAX_RETRIES)