Example #1
def test_writer_class_sync_interval_automatic_flush(tmpdir):
    """
    Create an Avro file using the Writer class with sync_interval set to 0.
    Verify that data does not accumulate in memory but is automatically flushed
    to the file object as each record is added.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            {"name": "field1", "type": {"type": "string"}},
            {"name": "field2", "type": {"type": "int"}},
        ],
    }
    records = [
        {"field1": "test1", "field2": -1},
        {"field1": "test2", "field2": 5},
    ]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate', sync_interval=0)

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        file_size_history = [fo.tell()]
        for record in records:
            assert w.block_count == 0
            w.write(record)

            # Verify each record is flushed to the file immediately:
            # 1. Block count stays at 0 (each write flushes its block)
            # 2. File size increases
            assert w.block_count == 0
            file_size_history.append(fo.tell())
            assert file_size_history[-1] > file_size_history[-2]

        # All data was already flushed on write, so a final flush() is a no-op
        # and the file size does not change.
        w.flush()
        assert fo.tell() == file_size_history[-1]

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
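
For comparison, the same round trip can be done without managing a Writer instance at all, using fastavro's module-level writer() function, which handles blocking and flushing internally. A minimal sketch (the file name is illustrative):

import fastavro

schema = {
    "type": "record",
    "name": "Test",
    "namespace": "test",
    "fields": [
        {"name": "field1", "type": "string"},
        {"name": "field2", "type": "int"},
    ],
}
records = [
    {"field1": "test1", "field2": -1},
    {"field1": "test2", "field2": 5},
]

# Write all records in one call; fastavro decides when to flush blocks.
with open("test_writer_function.avro", "wb") as fo:
    fastavro.writer(fo, schema, records, codec="deflate")

# Read them back to confirm the round trip.
with open("test_writer_function.avro", "rb") as fo:
    assert list(fastavro.reader(fo)) == records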
Example #2
def test_writer_class_flush_end(tmpdir):
    """
    Create an Avro file using the Writer class. Verify that data accumulates in
    memory and is written when flush() is called.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            {"name": "field1", "type": {"type": "string"}},
            {"name": "field2", "type": {"type": "int"}},
        ],
    }
    records = [
        {"field1": "test1", "field2": -1},
        {"field1": "test2", "field2": 5},
    ]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        size_with_header_only = fo.tell()
        for i, record in enumerate(records):
            assert w.block_count == i
            w.write(record)

            # Verify records are being stored *in memory*:
            # 1. Block count increases
            # 2. File size does not increase
            assert w.block_count == i + 1
            assert fo.tell() == size_with_header_only

        # Flushing the file writes the data. File size should increase now.
        w.flush()
        assert fo.tell() > size_with_header_only

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
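
The Writer class is most useful when records arrive incrementally and you want to decide yourself when blocks are flushed. A minimal sketch of that streaming pattern, assuming an arbitrary record source and batch size:

from fastavro.write import Writer

schema = {
    "type": "record",
    "name": "Event",
    "namespace": "test",
    "fields": [{"name": "value", "type": "int"}],
}

def event_stream():
    # Stand-in for records that arrive over time.
    for value in range(10):
        yield {"value": value}

with open("events.avro", "wb") as fo:
    w = Writer(fo, schema, codec="deflate")
    for i, event in enumerate(event_stream(), start=1):
        w.write(event)
        # Flush a block to disk every 4 records instead of buffering everything.
        if i % 4 == 0:
            w.flush()
    # Write out whatever is left in the final, partial block.
    w.flush()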
Example #3
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None:
        return
    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)
    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    target2 = write_source_from_arg(target, mode=mode)
    with target2.open(mode) as target_file:
        # delay the import of fastavro so it is only required when actually used
        from fastavro import parse_schema
        from fastavro.write import Writer

        parsed_schema = parse_schema(schema)
        writer = Writer(fo=target_file,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)
        num = 1
        for record in rows:
            try:
                writer.write(record)
                num = num + 1
            except ValueError as verr:
                vmsg = _get_error_details(target, num, verr, record, schema)
                _raise_error(ValueError, vmsg)
            except TypeError as terr:
                tmsg = _get_error_details(target, num, terr, record, schema)
                _raise_error(TypeError, tmsg)
        # finish writing
        writer.flush()
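
The helpers used above (_build_schema_from_values, _fix_missing_headers, _get_error_details, _raise_error, write_source_from_arg) belong to the surrounding module and are not shown here. A simplified, self-contained sketch of the same "report which row failed" pattern, using only fastavro (the function name and message format are made up):

from fastavro import parse_schema
from fastavro.write import Writer

def write_records(path, schema, rows, codec="deflate"):
    """Write an iterable of dicts to an Avro file, naming the failing row on error."""
    parsed_schema = parse_schema(schema)
    with open(path, "wb") as fo:
        writer = Writer(fo, schema=parsed_schema, codec=codec)
        for num, record in enumerate(rows, start=1):
            try:
                writer.write(record)
            except (ValueError, TypeError) as err:
                # Re-raise with enough context to locate the offending row.
                raise type(err)("row %d (%r): %s" % (num, record, err)) from err
        # finish writing the buffered block
        writer.flush()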
Example #4
def write_avro_file(f, results_iter, fields, table, max_size=100 * 1024 ** 2):
    """Takes a database result set (list of dicts) and writes an avro file
    up to a particular size. If the schema 'name' is the same as an Avro data
    type (WRITERS.keys()) everything will break for no apparent reason. 'name'
    isn't even really used.

    Returns complete, row_count

    complete is true if the entire results_iter has been drained -- false if
    there are more records to be processed.

    row_count is the number of items written

    max_size is the size limit at which we should start writing another file.
    """

    if table in WRITERS:
        table += "zzz"

    schema = {"type": "record", "name": table, "fields": fields}

    writer = Writer(f, schema)

    row_count = 0
    complete = False

    try:
        # writer.io buffers before writing
        while f.tell() + writer.io.tell() < max_size:
            writer.write(_format_row(next(results_iter)))
            row_count += 1
    except StopIteration:
        complete = True
    finally:
        writer.flush()

    return complete, row_count
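
A caller is expected to keep invoking write_avro_file with fresh file objects until complete comes back True. A hedged sketch of such a driver loop (the naming scheme and the dump_table wrapper are illustrative, not part of the original module):

def dump_table(results_iter, fields, table, path_template="part-{:04d}.avro"):
    """Drain results_iter into one or more size-limited Avro files."""
    part = 0
    total_rows = 0
    complete = False
    while not complete:
        with open(path_template.format(part), "wb") as f:
            complete, row_count = write_avro_file(f, results_iter, fields, table)
        total_rows += row_count
        part += 1
    return total_rows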