def do_sync(args):
    logger.info('Starting sync.')

    # Meltano-style entry point: the file passed as args.config may point at a
    # separate "bucket files definition" holding the tap's own configuration.
    meltano_config = load_json_file(args.config)
    bucket_files_definition = meltano_config.get("bucket_files_definition", None)

    if bucket_files_definition:
        if os.path.isfile(bucket_files_definition):
            config = tap_s3_csv.config.load(bucket_files_definition)
        else:
            logger.error("tap_s3_csv: '{}' file not found".format(
                bucket_files_definition))
            exit(1)
    else:
        # No bucket files definition: validate the global CONFIG instead.
        check_config(CONFIG, REQUIRED_CONFIG_KEYS)
        csv_files = CONFIG['files']

    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    state = {'COMPLETED': True}
    singer.write_state(state)

    logger.info('Done syncing.')
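# For illustration only: a minimal shape for the JSON file this do_sync variant loads.
# The "bucket_files_definition" key is the one read above; the file name it points to
# is a hypothetical example.
example_meltano_config = {
    "bucket_files_definition": "bucket_files_definition.json"
}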
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)
    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
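# For illustration: the kind of schema the function above returns for a hypothetical
# CSV with "id" and "name" columns. The three _s3_source_* properties are always
# added by the code; the data properties and their types are assumed here, since
# they come from conversion.generate_schema on the sampled rows.
example_sampled_schema = {
    'type': 'object',
    'properties': {
        'id': {'type': 'integer'},        # assumed, inferred from samples
        'name': {'type': 'string'},       # assumed, inferred from samples
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    }
}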
def list_files_in_bucket(config, bucket, search_prefix=None):
    s3_client = boto3.client('s3')

    s3_objects = []
    max_results = 1000

    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    result = s3_client.list_objects_v2(**args)
    s3_objects += result['Contents']
    next_continuation_token = result.get('NextContinuationToken')

    # list_objects_v2 returns at most 1000 keys per call, so follow the
    # continuation token until the listing is exhausted.
    while next_continuation_token is not None:
        logger.debug('Continuing pagination with token "{}".'.format(
            next_continuation_token))

        continuation_args = args.copy()
        continuation_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**continuation_args)
        s3_objects += result['Contents']
        next_continuation_token = result.get('NextContinuationToken')

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
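# Hedged usage sketch: list objects under an assumed prefix and log basic details.
# 'Key', 'Size', and 'LastModified' are standard fields of list_objects_v2 results;
# the prefix value and the helper name are hypothetical.
def _log_bucket_contents(config):
    for s3_object in list_files_in_bucket(config, config['bucket'],
                                          search_prefix='exports/'):
        logger.debug('%s (%s bytes, modified %s)',
                     s3_object['Key'], s3_object['Size'], s3_object['LastModified'])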
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_name, to_write)
        except BrokenPipeError as bpe:
            logger.error(
                f'Pipe to loader broke after {records_synced} records were written '
                f'from {s3_file}: troubled line was {row}')
            raise bpe

        records_synced += 1

    return records_synced
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        to_write = [{**conversion.convert_row(row, schema), **metadata}]

        singer.write_records(table_name, to_write)
        records_synced += 1

    return records_synced
def do_sync(args):
    logger.info('Starting sync.')

    config = tap_s3_csv.config.load(args.config)
    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    logger.info('Done syncing.')
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    logger.info('Sampling {} ({} records, every {}th record).'.format(
        s3_path, max_records, sample_rate))

    samples = []

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_path)

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    logger.info('Sampled {} records.'.format(len(samples)))

    return samples
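# Hedged usage sketch: sample every 5th row of an assumed file, stopping once 1,000
# rows have been collected. The S3 key and the sampling parameters are illustrative;
# config and table_spec are the same objects used throughout the tap.
samples = sample_file(config, table_spec, 'exports/orders.csv',
                      sample_rate=5, max_records=1000)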
def sync_table(config, state, table_spec):
    table_name = table_spec['name']
    modified_since = dateutil.parser.parse(
        state.get(table_name, {}).get('modified_since') or config['start_date'])

    logger.info('Syncing table "{}".'.format(table_name))
    logger.info('Getting files modified since {}.'.format(modified_since))

    s3_files = s3.get_input_files_for_table(config, table_spec, modified_since)

    logger.info('Found {} files to be synced.'.format(len(s3_files)))

    if not s3_files:
        return state

    # Combine the sampled schema with any per-table 'schema_overrides' from the config.
    inferred_schema = get_sampled_schema_for_table(config, table_spec)
    override_schema = {'properties': table_spec.get('schema_overrides', {})}
    schema = merge_dicts(inferred_schema, override_schema)

    singer.write_schema(
        table_name,
        schema,
        key_properties=table_spec['key_properties'])

    records_streamed = 0

    # Advance the bookmark after each file so an interrupted sync can resume
    # from the last fully processed file.
    for s3_file in s3_files:
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, schema)

        state[table_name] = {
            'modified_since': s3_file['last_modified'].isoformat()
        }

        singer.write_state(state)

    logger.info('Wrote {} records for table "{}".'.format(
        records_streamed, table_name))

    return state
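# Illustrative shapes of the config and state that sync_table reads and writes,
# built only from keys referenced above. All concrete values are assumptions.
example_config = {
    'bucket': 'my-source-bucket',           # assumed bucket name
    'start_date': '2021-01-01T00:00:00Z',   # fallback bookmark when state has none
    'tables': [
        {
            'name': 'orders',
            'key_properties': ['id'],
            'schema_overrides': {'id': {'type': 'integer'}},
        },
    ],
}

# State after syncing: one bookmark per table, advanced file by file.
example_state = {
    'orders': {'modified_since': '2021-06-01T12:34:56+00:00'},
}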
def retry_handler(details):
    logger.info("Received retryable error -- Retry %s/%s",
                details['tries'], MAX_RETRIES)
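# Hedged sketch of how a handler like this is typically wired up with the backoff
# library: on_backoff callbacks receive a details dict containing 'tries'. The
# decorated helper and the choice of ClientError as the retryable exception are
# assumptions, not taken from the tap itself.
import backoff
from botocore.exceptions import ClientError

@backoff.on_exception(backoff.expo,
                      ClientError,
                      max_tries=MAX_RETRIES,
                      on_backoff=retry_handler)
def _list_with_retries(s3_client, **kwargs):
    return s3_client.list_objects_v2(**kwargs)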