import fastavro
from fastavro.write import Writer


def test_writer_class_split_files(tmpdir):
    """
    Create 2 Avro files using the Writer class and the default sync_interval
    setting. We write to one file until the Writer automatically flushes, then
    write more records to the other file. Verify that the two files together
    contain all the records that were written.

    This simulates a real-world use case where a large Avro data set is split
    into files of approximately the same size.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field",
            "type": {
                "type": "string"
            }
        }]
    }
    records = []

    def _append_record(writer_):
        record = {"field": "test{}".format(len(records))}
        records.append(record)
        writer_.write(record)

    temp_paths = [
        tmpdir.join('test_writer_class1.avro'),
        tmpdir.join('test_writer_class2.avro')
    ]
    interim_record_counts = []

    # First file: Write records until block_count goes back to 0 for the second
    # time.
    with temp_paths[0].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    # Second file: 100 records
    with temp_paths[1].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        for i in range(100):
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    assert interim_record_counts[1] == interim_record_counts[0] + 100

    # Read the records to verify they were written correctly.
    new_records = []
    new_interim_record_counts = []
    for temp_path in temp_paths:
        with temp_path.open('rb') as fo:
            new_records += list(fastavro.reader(fo))
        new_interim_record_counts.append(len(new_records))
    assert new_records == records
    assert interim_record_counts == new_interim_record_counts
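
For comparison, here is a minimal sketch of the auto-flush behaviour the test above exercises: Writer buffers records and writes out a complete block once the buffer reaches sync_interval bytes, at which point block_count drops back to 0. The output path and the deliberately tiny sync_interval are illustrative only (the default interval is much larger).

from fastavro.write import Writer

schema = {
    "type": "record",
    "name": "Mini",
    "namespace": "test",
    "fields": [{"name": "field", "type": "string"}]
}

with open('mini.avro', 'wb') as fo:  # illustrative path
    w = Writer(fo, schema, sync_interval=64)  # tiny interval, for demonstration only
    for i in range(10):
        w.write({"field": "value{}".format(i)})
        # block_count grows until the buffer exceeds sync_interval,
        # then the Writer writes a block and block_count resets to 0
    w.flush()  # write any still-buffered records as a final block

Splitting on block boundaries (block_count returning to 0) keeps each output file self-contained, since no block is ever split across files.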
Example #2
File: avroio.py  Project: mahak/beam
def open(self, temp_path):
    self.file_handle = super().open(temp_path)
    return Writer(self.file_handle, self._schema, self._codec)
Example #3
def open(self, temp_path):
    file_handle = super(_AvroSink, self).open(temp_path)
    return Writer(file_handle, self._schema.to_json(), self._codec)
Example #4
def open(self, temp_path):
    file_handle = super(_FastAvroSink, self).open(temp_path)
    return Writer(file_handle, self._schema, self._codec)
Example #5
def open(self, temp_path):
  # TODO(BEAM-4749): fastavro fails to install in MacOS.
  from fastavro.write import Writer  # pylint: disable=wrong-import-position

  file_handle = super(_AvroSink, self).open(temp_path)
  return Writer(file_handle, self._schema.to_json(), self._codec)
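
The Beam snippets above show only open(); for context, here is a hedged sketch of the companion methods such a sink pairs with it under Beam's FileBasedSink contract (open / write_record / close). The class name _SketchAvroSink, the constructor-set _schema and _codec attributes, and the method bodies are assumptions for illustration, not Beam's exact code.

from apache_beam.io.filebasedsink import FileBasedSink
from fastavro.write import Writer


class _SketchAvroSink(FileBasedSink):  # hypothetical subclass for illustration
  # Assumes a constructor (omitted) that sets self._schema and self._codec.

  def open(self, temp_path):
    file_handle = super().open(temp_path)
    return Writer(file_handle, self._schema, self._codec)

  def write_record(self, writer, value):
    writer.write(value)  # append one record to the current block

  def close(self, writer):
    writer.flush()  # write any buffered records as a final block
    writer.fo.close()  # fastavro's Writer keeps the raw file object as .fo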