def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = csv_iterator.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        with Transformer() as transformer:
            to_write = transformer.transform(row, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sync_gz_file(config, s3_path, table_spec, stream, file_handler):
    if s3_path.endswith(".tar.gz"):
        LOGGER.warning('Skipping "%s" file as .tar.gz extension is not supported', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    # If the file was extracted from a zip, use that file object; otherwise get the file object from the S3 bucket
    file_object = file_handler if file_handler else s3.get_file_handle(config, s3_path)
    file_bytes = file_object.read()

    gz_file_obj = gzip.GzipFile(fileobj=io.BytesIO(file_bytes))

    # pylint: disable=duplicate-code
    try:
        gz_file_name = utils.get_file_name_from_gzfile(fileobj=io.BytesIO(file_bytes))
    except AttributeError:
        # If a file is compressed with the gzip command using the --no-name option,
        # it does not carry the original file name and timestamp, so we skip such files.
        # We have also seen this issue occur when tar is used to compress the file.
        LOGGER.warning('Skipping "%s" file as we did not get the original file name', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    if gz_file_name:
        if gz_file_name.endswith(".gz"):
            LOGGER.warning('Skipping "%s" file as it contains nested compression.', s3_path)
            s3.skipped_files_count = s3.skipped_files_count + 1
            return 0

        gz_file_extension = gz_file_name.split(".")[-1].lower()
        return handle_file(config, s3_path + "/" + gz_file_name, table_spec, stream,
                           gz_file_extension, io.BytesIO(gz_file_obj.read()))

    raise Exception('"{}" file has some error(s)'.format(s3_path))
def sync_gz_file(config, s3_path, table_spec, stream, file_handler):
    if s3_path.endswith(".tar.gz"):
        LOGGER.warning('Skipping "%s" file as .tar.gz extension is not supported', s3_path)
        return 0

    # If the file was extracted from a zip, use that file object; otherwise get the file object from the S3 bucket
    file_object = file_handler if file_handler else s3.get_file_handle(config, s3_path)
    file_bytes = file_object.read()

    gz_file_obj = gzip.GzipFile(fileobj=io.BytesIO(file_bytes))
    gz_file_name = utils.get_file_name_from_gzfile(fileobj=io.BytesIO(file_bytes))

    if gz_file_name:
        if gz_file_name.endswith(".gz"):
            LOGGER.warning('Skipping "%s" file as it contains nested compression.', s3_path)
            return 0

        gz_file_extension = gz_file_name.split(".")[-1].lower()
        return handle_file(config, s3_path + "/" + gz_file_name, table_spec, stream,
                           gz_file_extension, io.BytesIO(gz_file_obj.read()))

    raise Exception('"{}" file has some error(s)'.format(s3_path))
def handle_file(config, s3_path, table_spec, stream, extension, file_handler=None):
    """Used to sync normal supported files."""

    # Check whether the file has an extension or not
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be synced.', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    if extension == "gz":
        return sync_gz_file(config, s3_path, table_spec, stream, file_handler)

    if extension in ["csv", "txt"]:
        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(config, s3_path)._raw_stream  # pylint:disable=protected-access
        return sync_csv_file(config, file_handle, s3_path, table_spec, stream)

    if extension == "jsonl":
        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(config, s3_path)._raw_stream
        records = sync_jsonl_file(config, file_handle, s3_path, table_spec, stream)
        if records == 0:
            # A file containing only whitespace is not valid JSON (although it would be a valid
            # CSV header), so skip JSONL files that yield no records.
            s3.skipped_files_count = s3.skipped_files_count + 1
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        return records

    if extension == "zip":
        LOGGER.warning('Skipping "%s" file as it contains nested compression.', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    LOGGER.warning('"%s" having the ".%s" extension will not be synced.', s3_path, extension)
    s3.skipped_files_count = s3.skipped_files_count + 1
    return 0
def sync_table_file(config, s3_path, table_spec, stream, last_modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config["bucket"]
    table_name = table_spec["table_name"]

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    encoding_module = singer_encodings_csv
    if 'encoding_module' in config:
        try:
            encoding_module = importlib.import_module(config['encoding_module'])
        except ModuleNotFoundError:
            LOGGER.warning(
                f'Failed to load encoding module [{config["encoding_module"]}]. Defaulting to [singer_encodings.csv]')

    iterator = encoding_module.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2,
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream["schema"], metadata.to_map(stream["metadata"]))

        to_write_with_sequence = RecordMessageWithSequence(
            singer.RecordMessage(stream=table_name, record=to_write), last_modified)
        singer.write_message(to_write_with_sequence)

        records_synced += 1

    return records_synced
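# Illustrative note, not part of the tap: the 'encoding_module' option above only has to
# name an importable module that exposes get_row_iterator(stream, table_spec), mirroring
# singer_encodings.csv. A minimal custom module might look like the sketch below; the
# module name my_encodings.latin1_csv and the 'delimiter' default are assumptions.
#
#     # my_encodings/latin1_csv.py
#     import csv
#     import io
#
#     def get_row_iterator(stream, table_spec):
#         # Wrap the raw S3 byte stream as latin-1 text and yield dict rows,
#         # matching the interface the tap expects from an encoding module.
#         text_stream = io.TextIOWrapper(stream, encoding='latin-1')
#         return csv.DictReader(text_stream, delimiter=table_spec.get('delimiter', ','))
#
# It would then be selected with a config entry such as
# {"encoding_module": "my_encodings.latin1_csv"}.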
def handle_file(config, s3_path, table_spec, stream, extension, file_handler=None):
    """Used to sync normal supported files."""

    # Check whether the file has an extension or not
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be synced.', s3_path)
        return 0

    if extension == "gz":
        return sync_gz_file(config, s3_path, table_spec, stream, file_handler)

    if extension in ["csv", "txt"]:
        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(config, s3_path)._raw_stream  # pylint:disable=protected-access
        return sync_csv_file(config, file_handle, s3_path, table_spec, stream)

    if extension == "jsonl":
        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(config, s3_path)._raw_stream
        return sync_jsonl_file(config, file_handle, s3_path, table_spec, stream)

    if extension == "zip":
        LOGGER.warning('Skipping "%s" file as it contains nested compression.', s3_path)
        return 0

    LOGGER.warning('"%s" having the ".%s" extension will not be synced.', s3_path, extension)
    return 0
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3.

    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        time_extracted = utils.now()

        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write, time_extracted=time_extracted)
        records_synced += 1

    return records_synced
def sync_compressed_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing Compressed file "%s".', s3_path)

    records_streamed = 0
    s3_file_handle = s3.get_file_handle(config, s3_path)

    decompressed_files = compression.infer(io.BytesIO(s3_file_handle.read()), s3_path)

    for decompressed_file in decompressed_files:
        extension = decompressed_file.name.split(".")[-1].lower()

        if extension in ["csv", "jsonl", "gz", "txt"]:
            # Append the extracted file name to the zip file path.
            s3_file_path = s3_path + "/" + decompressed_file.name
            records_streamed += handle_file(config, s3_file_path, table_spec, stream,
                                            extension, file_handler=decompressed_file)

    return records_streamed
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3.

    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = singer_encodings_csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            # In this variant `stream` is a catalog entry object (with schema and
            # metadata attributes) rather than a plain dict.
            to_write = transformer.transform(rec, stream.schema.to_dict(), metadata.to_map(stream.metadata))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sync_table_file(config, s3_path, table_spec, stream, modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    if s3_path.endswith('zip'):
        with io.BytesIO(s3_file_handle.read()) as tf:
            tf.seek(0)
            # Read the file as a zipfile and process the members.
            # Note: only the rows of the last member end up in `rows`.
            with zipfile.ZipFile(tf, mode='r') as zipf:
                for subfile in zipf.namelist():
                    with zipf.open(subfile) as myfile:
                        iterator = singer_encodings_csv.get_row_iterator(myfile, table_spec)
                        rows = list(iterator)
    else:
        iterator = singer_encodings_csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access
        rows = list(iterator)

    records_synced = 0

    for row in rows:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        # Optional column clean-up driven by the 'preprocess' config option.
        if "preprocess" in config and config['preprocess'] != '':
            preprocess_items = json.loads(config['preprocess'])
            for preprocess in preprocess_items:
                if table_name == preprocess['table_name']:
                    for value in preprocess['values']:
                        to_get = value.split("|")[0]
                        to_del = value.split("|")[1]
                        if to_get in rec:
                            if to_del in rec:
                                if rec[to_get] == rec[to_del]:
                                    if to_del in to_write:
                                        del to_write[to_del]
                                else:
                                    LOGGER.warning('removing record: ' + json.dumps(rec) + ' ' +
                                                   to_get + ' and ' + to_del + ' are not equal')
                        elif to_del in rec:
                            to_write[to_get] = rec[to_del]
                            if to_del in to_write:
                                del to_write[to_del]
                        else:
                            to_write[to_get] = ""

        to_write['last_modified'] = str(modified)
        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
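# Illustrative note, not part of the tap: the 'preprocess' option consumed above is a JSON
# string holding a list of per-table rules. Each "keep|drop" pair collapses the 'drop'
# column into the 'keep' column: when both columns are present and equal, 'drop' is removed
# from the output (a warning is logged if they differ); when only 'drop' is present, its
# value is copied into 'keep' and 'drop' is removed; when neither is present, 'keep' is
# emitted as an empty string. The table and column names below are hypothetical:
#
#     {
#       "preprocess": "[{\"table_name\": \"orders\", \"values\": [\"order_id|orderId\", \"amount|amount_usd\"]}]"
#     }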