def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
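merge_dicts is called above (and again in sync_table below) but is not defined in this section; a minimal sketch, assuming a recursive merge in which values from the second dict win on key collisions:

def merge_dicts(first, second):
    # Sketch of the assumed helper: recursively merge two dicts,
    # letting values from `second` override `first` on conflicts.
    to_return = first.copy()

    for key in second:
        if (key in first
                and isinstance(first[key], dict)
                and isinstance(second[key], dict)):
            to_return[key] = merge_dicts(first[key], second[key])
        else:
            to_return[key] = second[key]

    return to_return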
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_name, to_write)
        except BrokenPipeError as bpe:
            logger.error(
                f'Pipe to loader broke after {records_synced} records were written '
                f'from {s3_file}: troubled line was {row}')
            raise bpe

        records_synced += 1

    return records_synced
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        to_write = [{**conversion.convert_row(row, schema), **metadata}]

        singer.write_records(table_name, to_write)
        records_synced += 1

    return records_synced
def list_files_in_bucket(config, bucket, search_prefix=None):
    s3_client = boto3.client('s3')

    s3_objects = []

    max_results = 1000
    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    result = s3_client.list_objects_v2(**args)

    # 'Contents' is absent when the listing returns no matching keys
    s3_objects += result.get('Contents', [])
    next_continuation_token = result.get('NextContinuationToken')

    while next_continuation_token is not None:
        logger.debug('Continuing pagination with token "{}".'.format(
            next_continuation_token))

        continuation_args = args.copy()
        continuation_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**continuation_args)

        s3_objects += result.get('Contents', [])
        next_continuation_token = result.get('NextContinuationToken')

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
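boto3 also ships a built-in paginator that encapsulates the ContinuationToken bookkeeping done by hand above; a sketch of an equivalent listing using it (the function name here is illustrative, not part of the tap):

def list_files_in_bucket_paginated(bucket, search_prefix=None):
    # Hypothetical alternative to the loop above: let boto3's paginator
    # walk the list_objects_v2 pages for us.
    s3_client = boto3.client('s3')

    kwargs = {'Bucket': bucket}
    if search_prefix is not None:
        kwargs['Prefix'] = search_prefix

    s3_objects = []
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(**kwargs):
        # Empty pages carry no 'Contents' key
        s3_objects += page.get('Contents', [])

    return s3_objects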
def do_sync(args):
    logger.info('Starting sync.')

    config = tap_s3_csv.config.load(args.config)
    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    logger.info('Done syncing.')
def load(filename):
    config = {}

    try:
        with open(filename) as handle:
            config = json.load(handle)
    except Exception:
        logger.fatal("Failed to decode config file. Is it valid json?")
        raise RuntimeError

    CONFIG_CONTRACT(config)

    return config
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='Config file', required=True)
    parser.add_argument('-s', '--state', help='State file')
    args = parser.parse_args()

    try:
        do_sync(args)
    except RuntimeError:
        logger.fatal("Run failed.")
        exit(1)
def load_state(filename):
    state = {}

    if filename is None:
        return state

    try:
        with open(filename) as handle:
            state = json.load(handle)
    except Exception:
        logger.fatal("Failed to decode state file. Is it valid json?")
        raise RuntimeError

    return state
def get_input_files_for_table(config, table_spec, modified_since=None):
    bucket = config['bucket']
    to_return = []

    pattern = table_spec['pattern']
    matcher = re.compile(pattern)

    logger.debug('Checking bucket "{}" for keys matching "{}"'.format(
        bucket, pattern))

    s3_objects = list_files_in_bucket(config, bucket,
                                      table_spec.get('search_prefix'))

    for s3_object in s3_objects:
        key = s3_object['Key']
        last_modified = s3_object['LastModified']
        logger.debug('Last modified: {}'.format(last_modified))

        if (matcher.search(key) and
                (modified_since is None or modified_since < last_modified)):
            logger.debug('Will download key "{}"'.format(key))
            to_return.append({'key': key, 'last_modified': last_modified})
        else:
            logger.debug('Will not download key "{}"'.format(key))

    to_return = sorted(to_return, key=lambda item: item['last_modified'])

    return to_return
def convert_row(row, schema):
    to_return = {}

    for key, value in row.items():
        field_schema = schema['properties'][key]
        datatype = field_schema.get('_conversion_type', 'string')

        logger.debug('Converting {} value {} to {}'.format(
            key, value, datatype))

        converted, _ = convert(value, datatype)
        to_return[key] = converted

    return to_return
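convert is used above but not shown in this section; a minimal sketch, assuming it returns a (converted_value, success_flag) tuple and falls back to the raw string when casting fails (this is an assumption, not the tap's actual implementation):

def convert(value, datatype='string'):
    # Sketch only: try to cast the raw CSV string to the requested type
    # and report whether the cast succeeded.
    if datatype == 'integer':
        try:
            return int(value), True
        except (TypeError, ValueError):
            return value, False
    if datatype == 'number':
        try:
            return float(value), True
        except (TypeError, ValueError):
            return value, False
    # Default: leave the value as a string.
    return value, True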
def load(filename):
    config = {}

    try:
        with open(filename) as handle:
            config = json.load(handle)
            # 'tables' may arrive as a JSON-encoded string; decode it only
            # in that case so an already-parsed value passes through untouched
            if isinstance(config['tables'], str):
                config['tables'] = json.loads(config['tables'])
    except Exception:
        logger.fatal("Failed to decode config file. Is it valid json?")
        raise RuntimeError

    CONFIG_CONTRACT(config)

    return config
def do_sync(args):
    logger.info('Starting sync.')

    meltano_config = load_json_file(args.config)
    bucket_files_definition = meltano_config.get("bucket_files_definition", None)

    if bucket_files_definition:
        if os.path.isfile(bucket_files_definition):
            config = tap_s3_csv.config.load(bucket_files_definition)
        else:
            logger.error("tap_s3_csv: '{}' file not found".format(
                bucket_files_definition))
            exit(1)
    else:
        check_config(CONFIG, REQUIRED_CONFIG_KEYS)
        csv_files = CONFIG['files']

    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    state = {'COMPLETED': True}
    singer.write_state(state)

    logger.info('Done syncing.')
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    logger.info('Sampling {} ({} records, every {}th record).'.format(
        s3_path, max_records, sample_rate))

    samples = []

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_path)

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    logger.info('Sampled {} records.'.format(len(samples)))

    return samples
def sync_table(config, state, table_spec):
    table_name = table_spec['name']
    modified_since = dateutil.parser.parse(
        state.get(table_name, {}).get('modified_since') or
        config['start_date'])

    logger.info('Syncing table "{}".'.format(table_name))
    logger.info('Getting files modified since {}.'.format(modified_since))

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    logger.info('Found {} files to be synced.'.format(len(s3_files)))

    if not s3_files:
        return state

    inferred_schema = get_sampled_schema_for_table(config, table_spec)
    override_schema = {'properties': table_spec.get('schema_overrides', {})}
    schema = merge_dicts(inferred_schema, override_schema)

    singer.write_schema(
        table_name,
        schema,
        key_properties=table_spec['key_properties'])

    records_streamed = 0

    for s3_file in s3_files:
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, schema)

        state[table_name] = {
            'modified_since': s3_file['last_modified'].isoformat()
        }

        singer.write_state(state)

    logger.info('Wrote {} records for table "{}".'.format(
        records_streamed, table_name))

    return state
def retry_handler(details):
    logger.info("Received retryable error -- Retry %s/%s",
                details['tries'], MAX_RETRIES)
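The details dict here matches the shape the backoff library passes to its event handlers. A sketch of how retry_handler might be wired up with backoff.on_exception; the wrapped function, the MAX_RETRIES value, and the choice of ClientError as the retryable exception are illustrative assumptions, not taken from the tap's source:

import backoff
import botocore.exceptions

MAX_RETRIES = 5  # illustrative value


@backoff.on_exception(backoff.expo,
                      botocore.exceptions.ClientError,
                      max_tries=MAX_RETRIES,
                      on_backoff=retry_handler)
def get_object_with_retries(s3_client, bucket, key):
    # Hypothetical wrapper: retry transient S3 errors with exponential
    # backoff, logging each attempt via retry_handler.
    return s3_client.get_object(Bucket=bucket, Key=key)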