import fastavro
from fastavro.write import Writer


def test_writer_class_sync_interval_automatic_flush(tmpdir):
    """
    Create an Avro file using the Writer class with sync_interval set to 0.
    Verify that data does not accumulate in memory but is automatically flushed
    to the file object as each record is added.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            {"name": "field1", "type": {"type": "string"}},
            {"name": "field2", "type": {"type": "int"}},
        ],
    }
    records = [
        {"field1": "test1", "field2": -1},
        {"field1": "test2", "field2": 5},
    ]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate', sync_interval=0)

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        file_size_history = [fo.tell()]
        for record in records:
            assert w.block_count == 0
            w.write(record)

            # Verify each record is flushed to the file immediately:
            # 1. Block count stays at 0
            # 2. File size increases
            assert w.block_count == 0
            file_size_history.append(fo.tell())
            assert file_size_history[-1] > file_size_history[-2]

        # Everything was already flushed automatically, so calling flush()
        # should not change the file size.
        w.flush()
        assert fo.tell() == file_size_history[-1]

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
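
# For comparison (not part of the test above), the same records can be written
# in a single call with the module-level fastavro.writer helper, which writes
# the header, buffers blocks, and flushes for you. A minimal sketch: the helper
# name and the output path "test_one_shot.avro" are made up for illustration,
# and the schema/records mirror the literals used in the test.
def _one_shot_writer_sketch():
    import fastavro

    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            {"name": "field1", "type": {"type": "string"}},
            {"name": "field2", "type": {"type": "int"}},
        ],
    }
    records = [
        {"field1": "test1", "field2": -1},
        {"field1": "test2", "field2": 5},
    ]
    with open("test_one_shot.avro", "wb") as fo:
        # sync_interval=0 flushes a block per record, just as in the
        # Writer-class test above.
        fastavro.writer(fo, schema, records, codec="deflate", sync_interval=0)

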
def test_writer_class_flush_end(tmpdir):
    """
    Create an Avro file using the Writer class. Verify that data accumulates in
    memory and is written when flush() is called.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            {"name": "field1", "type": {"type": "string"}},
            {"name": "field2", "type": {"type": "int"}},
        ],
    }
    records = [
        {"field1": "test1", "field2": -1},
        {"field1": "test2", "field2": 5},
    ]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        size_with_header_only = fo.tell()
        for i, record in enumerate(records):
            assert w.block_count == i
            w.write(record)

            # Verify records are being stored *in memory*:
            # 1. Block count increases
            # 2. File size does not increase
            assert w.block_count == i + 1
            assert fo.tell() == size_with_header_only

        # Flushing the file writes the data. File size should increase now.
        w.flush()
        assert fo.tell() > size_with_header_only

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
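
# Follow-up sketch (not part of the test above): the codec passed to the
# Writer is recorded in the Avro file header and is exposed through the
# reader's metadata mapping. The helper name and the default path are made-up
# stand-ins for a file produced by a test like the one above.
def _inspect_codec_sketch(path="test_writer_class.avro"):
    with open(path, "rb") as fo:
        rdr = fastavro.reader(fo)
        print(rdr.metadata.get("avro.codec"))  # e.g. 'deflate'
        print(len(list(rdr)))                  # number of records in the file
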
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None:
        return
    # build a schema when it is not defined by the user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)
    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    target2 = write_source_from_arg(target, mode=mode)
    with target2.open(mode) as target_file:
        # delay the import of fastavro so it is not required unless avro is used
        from fastavro import parse_schema
        from fastavro.write import Writer

        parsed_schema = parse_schema(schema)
        writer = Writer(fo=target_file,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)
        num = 1
        for record in rows:
            try:
                writer.write(record)
                num = num + 1
            except ValueError as verr:
                vmsg = _get_error_details(target, num, verr, record, schema)
                _raise_error(ValueError, vmsg)
            except TypeError as terr:
                tmsg = _get_error_details(target, num, terr, record, schema)
                _raise_error(TypeError, tmsg)
        # finish writing
        writer.flush()
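
# The core fastavro calls used by _write_toavro, distilled into a standalone
# sketch without the petl plumbing (write_source_from_arg, the error helpers,
# and the row iterators are omitted). The helper name, file name, schema, and
# rows below are made-up examples.
def _parse_schema_and_write_sketch():
    from fastavro import parse_schema
    from fastavro.write import Writer

    example_schema = {
        "type": "record",
        "name": "Example",
        "fields": [{"name": "name", "type": "string"}],
    }
    example_rows = [{"name": "a"}, {"name": "b"}]

    with open("example.avro", "wb") as target_file:
        writer = Writer(fo=target_file,
                        schema=parse_schema(example_schema),
                        codec="deflate")
        for record in example_rows:
            writer.write(record)
        # flush() writes any records still buffered in memory
        writer.flush()
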
def write_avro_file(f, results_iter, fields, table, max_size=100 * 1024 ** 2):
    """Takes a database result set (list of dicts) and writes an avro file
    up to a particular size. If the schema 'name' is the same as an Avro data
    type (WRITERS.keys()) everything will break for no apparent reason. 'name'
    isn't even really used.

    Returns complete, row_count

    complete is true if the entire results_iter has been drained -- false if
    there are more records to be processed.

    row_count is the number of items written

    max_size is limit at which we should start writing another file.
    """

    if table in WRITERS:
        table += "zzz"

    schema = {"type": "record", "name": table, "fields": fields}

    writer = Writer(f, schema)

    row_count = 0
    complete = False

    try:
        # writer.io holds records buffered in memory but not yet flushed to
        # f, so count both when checking the size limit.
        while f.tell() + writer.io.tell() < max_size:
            writer.write(_format_row(next(results_iter)))
            row_count += 1
    except StopIteration:
        complete = True
    finally:
        writer.flush()

    return complete, row_count
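
# Hypothetical driver (write_split_avro and its naming scheme are made up for
# illustration) showing how the (complete, row_count) return value of
# write_avro_file supports splitting one result set across several files: keep
# opening new files until the iterator reports that it has been drained.
def write_split_avro(base_path, results_iter, fields, table):
    part = 0
    total_rows = 0
    complete = False
    while not complete:
        path = "{}.{:04d}.avro".format(base_path, part)
        with open(path, "wb") as f:
            complete, row_count = write_avro_file(f, results_iter, fields,
                                                  table)
        total_rows += row_count
        part += 1
    return total_rows
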
def test_writer_class_split_files(tmpdir):
    """
    Create 2 Avro files using the Writer class and the default sync_interval
    setting. Write to the first file until the Writer has automatically
    flushed twice, then write 100 more records to the second file. Verify that
    the two files together contain all the records that were written.

    This simulates a real-world use case where a large Avro data set is split
    into files of approximately the same size.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field",
            "type": {
                "type": "string"
            }
        }]
    }
    records = []

    def _append_record(writer_):
        record = {"field": "test{}".format(len(records))}
        records.append(record)
        writer_.write(record)

    temp_paths = [
        tmpdir.join('test_writer_class1.avro'),
        tmpdir.join('test_writer_class2.avro')
    ]
    interim_record_counts = []

    # First file: Write records until block_count goes back to 0 for the second
    # time.
    with temp_paths[0].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    # Second file: 100 records
    with temp_paths[1].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        for i in range(100):
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    assert interim_record_counts[1] == interim_record_counts[0] + 100

    # Read the records to verify they were written correctly.
    new_records = []
    new_interim_record_counts = []
    for temp_path in temp_paths:
        new_reader = fastavro.reader(temp_path.open('rb'))
        new_records += list(new_reader)
        new_interim_record_counts.append(len(new_records))
    assert new_records == records
    assert interim_record_counts == new_interim_record_counts
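
# Follow-up sketch (not part of the test above): fastavro.block_reader shows
# how the automatic flushes translated into Avro blocks in each split file.
# The helper name and paths are made-up stand-ins for the two files written by
# the test; Block.num_records gives the record count per block.
def _count_blocks_sketch():
    for path in ("test_writer_class1.avro", "test_writer_class2.avro"):
        with open(path, "rb") as fo:
            blocks = list(fastavro.block_reader(fo))
            print(path, [block.num_records for block in blocks])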