def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than raising this error, but we also think the
    # chances of that are very small, and at any rate the source data would
    # need to be fixed. The other consequence could be larger memory
    # consumption, but that is acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = csv_iterator.get_row_iterator(
        s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        with Transformer() as transformer:
            to_write = transformer.transform(
                row, stream['schema'], metadata.to_map(stream['metadata']))
            singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
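

# A minimal usage sketch for sync_table_file (illustration only, not part of
# the tap): iterate over the keys discovered for a table and accumulate the
# record count. The discovery helper `s3.get_input_files_for_table` and the
# shape of its results are assumptions here, not guaranteed by this module.
def _sync_table_example(config, table_spec, stream):
    total_records = 0
    # Hypothetical discovery call; substitute however this tap lists keys.
    for s3_file in s3.get_input_files_for_table(config, table_spec):
        total_records += sync_table_file(
            config, s3_file['key'], table_spec, stream)
    LOGGER.info('Synced %s records for table "%s".',
                total_records, table_spec['table_name'])
    return total_records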


def sample_file(table_spec, s3_path, file_handle, sample_rate, extension):
    global skipped_files_count

    # Skip files that have no extension at all.
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be sampled.', s3_path)
        skipped_files_count += 1
        return []

    if extension in ["csv", "txt"]:
        # Use the raw stream when the file object was read directly from the
        # S3 bucket; otherwise use the file object extracted from a zip or gz.
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle  # pylint:disable=protected-access
        iterator = csv_iterator.get_row_iterator(file_handle, table_spec)
        csv_records = []
        if iterator:
            csv_records = get_records_for_csv(s3_path, sample_rate, iterator)
        else:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count += 1
        return csv_records

    if extension == "gz":
        return sampling_gz_file(table_spec, s3_path, file_handle, sample_rate)

    if extension == "jsonl":
        # Use the raw stream when the file object was read directly from the
        # S3 bucket; otherwise use the file object extracted from a zip or gz.
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle  # pylint:disable=protected-access
        records = get_records_for_jsonl(s3_path, sample_rate, file_handle)
        check_jsonl_sample_records, records = itertools.tee(records)
        jsonl_sample_records = list(check_jsonl_sample_records)
        if len(jsonl_sample_records) == 0:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count += 1
        check_key_properties_and_date_overrides_for_jsonl_file(
            table_spec, jsonl_sample_records, s3_path)
        return records

    if extension == "zip":
        LOGGER.warning(
            'Skipping "%s" file as it contains nested compression.', s3_path)
        skipped_files_count += 1
        return []

    LOGGER.warning(
        '"%s" having the ".%s" extension will not be sampled.',
        s3_path, extension)
    skipped_files_count += 1
    return []
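

# A self-contained smoke-test sketch for sample_file (illustration only). It
# assumes csv_iterator.get_row_iterator accepts any binary file-like object,
# which is what the hasattr fallback above implies; the one-key table_spec is
# a placeholder, real specs carry more keys.
def _sample_file_smoke_test():
    import io
    raw = io.BytesIO(b"id,name\n1,alpha\n2,beta\n")
    table_spec = {'table_name': 'example'}
    # Extension "csv" exercises the first branch; sample_rate=1 keeps every row.
    return sample_file(table_spec, 'example.csv', raw, 1, 'csv')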


def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)
    iterator = csv_iterator.get_row_iterator(
        file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    current_row = 0
    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            sampled_row_count += 1
            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s",
                            sampled_row_count, s3_path)
            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
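

# The sampling rule above keeps rows 0, sample_rate, 2*sample_rate, and so
# on. A self-contained sketch of the same arithmetic (hypothetical helper,
# for illustration only):
def _every_nth(rows, sample_rate):
    for current_row, row in enumerate(rows):
        if current_row % sample_rate == 0:
            yield row

# For example, list(_every_nth(range(10), 5)) == [0, 5].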


def sync_csv_file(config, file_handle, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than raising this error, but we also think the
    # chances of that are very small, and at any rate the source data would
    # need to be fixed. The other consequence could be larger memory
    # consumption, but that is acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = csv_iterator.get_row_iterator(file_handle, table_spec)

    records_synced = 0

    if iterator:
        for row in iterator:
            # Skip empty lines in the CSV.
            if len(row) == 0:
                continue
            with Transformer() as transformer:
                to_write = transformer.transform(
                    row, stream['schema'], metadata.to_map(stream['metadata']))
                singer.write_record(table_name, to_write)
            records_synced += 1
    else:
        LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        s3.skipped_files_count += 1

    return records_synced
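

# A minimal caller sketch for sync_csv_file (illustration only). The stream
# dict below is a bare-bones stand-in for a real Singer catalog entry: an
# empty metadata list means metadata.to_map([]) yields an empty mapping, so
# the Transformer applies the schema with no metadata overrides. The config
# and schema values are hypothetical placeholders.
def _sync_csv_example():
    import io
    handle = io.BytesIO(b"id,name\n1,alpha\n\n2,beta\n")  # note the empty line
    stream = {
        'schema': {'type': 'object', 'properties': {}},
        'metadata': [],
    }
    table_spec = {'table_name': 'example'}
    config = {'bucket': 'example-bucket'}
    return sync_csv_file(config, handle, 'example.csv', table_spec, stream)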