Example #1
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = csv_iterator.get_row_iterator(s3_file_handle._raw_stream,
                                             table_spec)  #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:

        with Transformer() as transformer:
            to_write = transformer.transform(
                row, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
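The csv.field_size_limit(sys.maxsize) call above exists because the stdlib csv module rejects any field longer than 131072 characters by default. A minimal, self-contained sketch (standard library only, not part of the tap) that reproduces the error and shows how raising the limit avoids it:

import csv
import io
import sys

# A single CSV field larger than the default 131072-character limit.
oversized = "x" * 200_000
buf = io.StringIO(f'id,payload\n1,"{oversized}"\n')

try:
    list(csv.reader(buf))
except csv.Error as exc:
    # Raised as "field larger than field limit (131072)".
    print("default limit hit:", exc)

# Raising the limit, as sync_table_file does, lets the row parse.
csv.field_size_limit(sys.maxsize)
buf.seek(0)
rows = list(csv.reader(buf))
print(len(rows[1][1]))  # 200000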
Example #2
def sample_file(table_spec, s3_path, file_handle, sample_rate, extension):
    global skipped_files_count

    # Skip files that have no extension at all
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be sampled.', s3_path)
        skipped_files_count = skipped_files_count + 1
        return []
    if extension in ["csv", "txt"]:
        # Use the raw stream for objects read directly from S3; file objects extracted from zip or gz are used as-is
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle  # pylint:disable=protected-access
        iterator = csv_iterator.get_row_iterator(file_handle, table_spec)
        csv_records = []
        if iterator:
            csv_records = get_records_for_csv(s3_path, sample_rate, iterator)
        else:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        return csv_records
    if extension == "gz":
        return sampling_gz_file(table_spec, s3_path, file_handle, sample_rate)
    if extension == "jsonl":
        # Use the raw stream for objects read directly from S3; file objects extracted from zip or gz are used as-is
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle  # pylint:disable=protected-access
        records = get_records_for_jsonl(s3_path, sample_rate, file_handle)
        check_jsonl_sample_records, records = itertools.tee(records)
        jsonl_sample_records = list(check_jsonl_sample_records)
        if len(jsonl_sample_records) == 0:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        check_key_properties_and_date_overrides_for_jsonl_file(
            table_spec, jsonl_sample_records, s3_path)

        return records
    if extension == "zip":
        LOGGER.warning(
            'Skipping "%s" file as it contains nested compression.', s3_path)
        skipped_files_count = skipped_files_count + 1
        return []
    LOGGER.warning(
        '"%s" having the ".%s" extension will not be sampled.', s3_path, extension)
    skipped_files_count = skipped_files_count + 1
    return []
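Example #2 never shows how the extension argument is produced, but the s3_path.lower() == extension guard only makes sense if the caller derives it with something like s3_path.split(".")[-1].lower(): for a key with no dot, the "extension" is the whole lowercased key, so the comparison flags the file as extensionless. A hypothetical helper illustrating that assumption (not part of the tap):

def infer_extension(s3_path):
    # Hypothetical: mirrors the guard above. With no dot in the key,
    # split(".")[-1] returns the whole key, so the result equals
    # s3_path.lower() and sample_file skips the file.
    return s3_path.split(".")[-1].lower()

print(infer_extension("exports/users.CSV"))     # "csv"
print(infer_extension("exports/events.jsonl"))  # "jsonl"
print(infer_extension("exports/README"))        # "exports/readme" -> treated as extensionless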
Example #3
def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)
    iterator = csv_iterator.get_row_iterator(file_handle._raw_stream,
                                             table_spec)  #pylint:disable=protected-access

    current_row = 0
    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            sampled_row_count += 1
            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count,
                            s3_path)
            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
Example #4
def sync_csv_file(config, file_handle, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = csv_iterator.get_row_iterator(file_handle, table_spec)

    records_synced = 0

    if iterator:
        for row in iterator:

            # Skip empty CSV rows
            if len(row) == 0:
                continue

            with Transformer() as transformer:
                to_write = transformer.transform(
                    row, stream['schema'], metadata.to_map(stream['metadata']))

            singer.write_record(table_name, to_write)
            records_synced += 1
    else:
        LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1

    return records_synced
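The len(row) == 0 guard mirrors how the stdlib csv reader represents blank input lines: they come back as empty rows rather than being dropped. A small standard-library illustration (the tap's own csv_iterator.get_row_iterator may behave differently, so this is only an analogy):

import csv
import io

buf = io.StringIO("id,name\n1,alice\n\n2,bob\n")
rows = list(csv.reader(buf))
print(rows)  # [['id', 'name'], ['1', 'alice'], [], ['2', 'bob']]
# The blank input line parses as an empty list; sync_csv_file skips such
# rows via the len(row) == 0 check before transforming and writing them.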