Example #1
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #2
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "boolean"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #3
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "string"
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "long"
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #4
File: __init__.py  Project: mtth/hdfs
 def write(records):
   fastavro.writer(
     fo=self._fo.__enter__(),
     schema=self._schema,
     records=records,
     **self._writer_kwargs
   )
Example #5
def roundtrip(record, writer_schema, reader_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, writer_schema, [record])
    new_file.seek(0)

    new_records = list(fastavro.reader(new_file, reader_schema))
    return new_records[0]
Example #6
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'

        if basename(filename) in NO_DATA:
            return

        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)
    new_file_bytes = new_file.getvalue()

    new_file = NoSeekMemoryIO(new_file_bytes)
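    # NoSeekMemoryIO exercises reading from a stream that cannot seek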
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)

    assert new_records == records

    # Test schema migration with the same schema
    new_file = NoSeekMemoryIO(new_file_bytes)
    schema_migration_reader = fastavro.reader(new_file, reader.schema)
    assert schema_migration_reader.reader_schema == reader.schema
    new_records = list(schema_migration_reader)

    assert new_records == records
Example #7
def serialize_avro_to_string(schema, content):
    bytes_writer = BytesIO()
    fastavro.writer(bytes_writer, schema, content)
    #encoder = avro.io.BinaryEncoder(bytes_writer)
    #datum_writer.write(content, encoder)

    return bytes_writer.getvalue()
Example #8
def test_schema_migration_maps_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": ["string", "long"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": 1}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #9
def test_schema_migration_array_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["boolean", "long"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "float"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #10
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "boolean"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #11
def test_str_py3():
    letters = ascii_uppercase + digits
    id_size = 100

    seed('str_py3')  # Repeatable results

    def gen_id():
        return ''.join(choice(letters) for _ in range(id_size))

    keys = ['first', 'second', 'third', 'fourth']

    testdata = [{key: gen_id() for key in keys} for _ in range(50)]

    schema = {
        "fields": [{'name': key, 'type': 'string'} for key in keys],
        "namespace": "namespace",
        "name": "zerobyte",
        "type": "record"
    }

    buf = BytesIO()
    fastavro.writer(buf, schema, testdata)

    buf.seek(0, SEEK_SET)
    for i, rec in enumerate(fastavro.reader(buf), 1):
        pass

    size = len(testdata)

    assert i == size, 'bad number of records'
    assert rec == testdata[-1], 'bad last record'
Example #12
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "string"
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "long"
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #13
def send_file_avro():
    schema, records = get_data()
    buf = io.BytesIO()
    fastavro.writer(buf, schema, records)
    buf.seek(0)
    return send_file(buf,
                     attachment_filename='ccc.avro',
                     mimetype='application/octet-stream')
Example #14
def roundtrip(schema, records, new_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)

    reader = fastavro.reader(new_file, new_schema)
    new_records = list(reader)
    return new_records
Example #15
def test_repo_caching_issue():
    schema = {
        "type": "record",
        "name": "B",
        "fields": [{
            "name": "b",
            "type": {
                "type": "record",
                "name": "C",
                "fields": [{
                    "name": "c",
                    "type": "string"
                }]
            }
        }]
    }

    new_file = MemoryIO()
    records = [{"b": {"c": "test"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records

    other_schema = {
        "name": "A",
        "type": "record",
        "fields": [{
            "name": "a",
            "type": {
                "type": "record",
                "name": "B",
                "fields": [{
                    "name": "b",
                    "type": {
                        "type": "record",
                        "name": "C",
                        "fields": [{
                            "name": "c",
                            "type": "int"
                        }]
                    }
                }]
            }
        }, {
            "name": "aa",
            "type": "B"
        }]
    }

    new_file = MemoryIO()
    records = [{"a": {"b": {"c": 1}}, "aa": {"b": {"c": 2}}}]
    fastavro.writer(new_file, other_schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records
Example #16
def test_default_values():
    schema = {"type": "record", "fields": [{"name": "default_field", "type": "string", "default": "default_value"}]}
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == [{"default_field": "default_value"}]
Example #17
def test_metadata():
    schema = {"type": "record", "fields": []}

    new_file = MemoryIO()
    records = [{}]
    metadata = {"key": "value"}
    fastavro.writer(new_file, schema, records, metadata=metadata)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert new_reader.metadata["key"] == metadata["key"]
Example #18
def test_no_default():
    io = MemoryIO()
    schema = {
        'type': 'record',
        'name': 'test',
        'fields': [
            {'type': 'boolean', 'name': 'a'}
        ],
    }
    fastavro.writer(io, schema, [{}])
Example #19
def write(schema, records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        iostream = BytesIO()
        start = time.time()
        writer(iostream, schema, records)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return iostream
Example #20
def test_schema_migration_add_default_field():
    schema = {"type": "record", "fields": []}

    new_schema = {"type": "record", "fields": [{"name": "test", "type": "string", "default": "default"}]}

    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{"test": "default"}]
Example #21
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}

    new_schema = {"type": "record", "fields": [{"name": "test", "type": ["string", "int"]}]}

    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #22
def test_str_py3():
    buf = BytesIO()
    fastavro.writer(buf, schema, testdata)

    buf.seek(0, SEEK_SET)
    for i, rec in enumerate(fastavro.iter_avro(buf), 1):
        pass

    size = len(testdata)

    assert i == size, 'bad number of records'
    assert rec == testdata[-1], 'bad last record'
Example #23
def make_blocks(num_records=2000, codec='null'):
    records = make_records(num_records)

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)

    new_file.seek(0)
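    # block_reader yields whole data blocks rather than individual records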
    block_reader = fastavro.block_reader(new_file, schema)

    blocks = list(block_reader)

    new_file.close()

    return blocks, records
Example #24
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    records = make_records(num_records)

    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
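    # the file position right after writing equals the number of bytes written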
    bytes = new_file.tell()

    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)

    blocks = list(block_reader)

    new_file.close()

    return blocks, records, bytes
Example #25
def test_py3_union_string_and_bytes():
    schema = {
        "fields": [{'name': 'field', 'type': ['string', 'bytes']}],
        "namespace": "namespace",
        "name": "union_string_bytes",
        "type": "record"
    }

    records = [
        {'field': u'string'},
        {'field': b'bytes'}
    ]

    buf = BytesIO()
    fastavro.writer(buf, schema, records)
Example #26
def test_schema_migration_enum_failure():
    schema = {"type": "enum", "name": "test", "symbols": ["FOO", "BAR"]}

    new_schema = {"type": "enum", "name": "test", "symbols": ["BAZ", "BAR"]}

    new_file = MemoryIO()
    records = ["FOO"]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #27
def test_schema_migration_enum_failure():
    schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["BAZ", "BAR"],
    }

    new_file = MemoryIO()
    records = ["FOO"]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #28
def fastavro_avro(N):
    from fastavro import writer
    import numpy as np

    INTERVAL=1

    t_start = time.time()
    t0 = time.time()
    nums = np.random.random_integers(0, 100, (N, 4))
    print("Generated data ({:.2f})".format(time.time() - t0))

    t0 = time.time()
    data = [dict(zip((col1, col2, col3, col4), item)) for item in nums]
    print("Transformed data ({:.2f})".format(time.time() - t0))

    with open("fast_avro_{}_ints.avro".format(N), "wb") as out:
        writer(out, python_schema, data)

    print("Finished ({:.2f})".format(time.time() - t_start))
Example #29
def test_fastavro_complex_nested():
    fo = MemoryIO()
    with open(join(data_dir, 'complex-nested.avsc')) as f:
        schema = json.load(f)

    records = [{
        "test_boolean": True,
        "test_int": 10,
        "test_long": 20,
        "test_float": 2.0,
        "test_double": 2.0,
        "test_bytes": b'asdf',
        "test_string": 'qwerty',
        "second_level": {
            "test_int2": 100,
            "test_string2": "asdf",
            "default_level": {
                "test_int_def": 1,
                "test_string_def": "nope",
            }
        },
        "fixed_int8": 1,
        "fixed_int16": 2,
        "fixed_int32": 3,
        "fixed_int64": 4,
        "fixed_uint8": 1,
        "fixed_uint16": 2,
        "fixed_uint32": 3,
        "fixed_uint64": 4,
        "fixed_int8_2": 12,
    }]

    fastavro.writer(fo, schema, records, enable_extensions=True)

    fo.seek(0)
    new_reader = fastavro.reader(fo, enable_extensions=True)

    assert new_reader.schema == schema

    new_records = list(new_reader)
    assert new_records == records
Example #30
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #31
 def write_avro():
     event_generator = (event for _ in range(n))
     fastavro.writer(avro_f, avro_schema, event_generator)
Example #32
schema = json.load(open("plat.avsc"))
records = [{
    "nom": "饺子",
    "origine": "北京",
    "ingredients": ["chou", "porc", "farine"],
    "prix": 4,
    "type": "plat"
}, {
    "nom": "方便面",
    "ingredients": ["piment", "nouilles"],
    "prix": 1.5,
    "type": "plat",
}, {
    "nom": "宫保鸡丁",
    "origine": "四川",
    "ingredients": ["poulet", "cacahuetes"],
    "prix": 8,
    "type": "plat"
}, {
    "nom": "米饭",
    "ingredients": ["riz"],
    "prix": 1,
    "type": "accompagnement"
}, {
    "nom": "冰水",
    "prix": 0.5,
    "type": "accompagnement"
}]

fastavro.writer(open("plats.avro", "wb"), schema, records)
Example #33
def test_repo_caching_issue():
    schema = {
        "type":
        "record",
        "name":
        "B",
        "fields": [{
            "name": "b",
            "type": {
                "type": "record",
                "name": "C",
                "fields": [{
                    "name": "c",
                    "type": "string"
                }]
            }
        }]
    }

    new_file = MemoryIO()
    records = [{"b": {"c": "test"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records

    other_schema = {
        "name":
        "A",
        "type":
        "record",
        "fields": [{
            "name": "a",
            "type": {
                "type":
                "record",
                "name":
                "B",
                "fields": [{
                    "name": "b",
                    "type": {
                        "type": "record",
                        "name": "C",
                        "fields": [{
                            "name": "c",
                            "type": "int"
                        }]
                    }
                }]
            }
        }, {
            "name": "aa",
            "type": "B"
        }]
    }

    new_file = MemoryIO()
    records = [{"a": {"b": {"c": 1}}, "aa": {"b": {"c": 2}}}]
    fastavro.writer(new_file, other_schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records
Example #34
    ]
}

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })

# Dump nodes dictionary in an avro file
with open(AVRO_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes)

# Dump nodes dictionary in an avro file and use snappy compression algorithm
with open(AVRO_SNAPPY_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes, codec='snappy')

# Dump nodes dictionary in an avro file and use Bzip2 compression algorithm
with open(AVRO_BZIP2_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes, codec='bzip2')

# do the same with JSON format (for comparison)
with open(JSON_FILE, 'w') as json_file:
    json.dump([schema, nodes], json_file)


# Compare the size of the file formats
Example #35
 def _create_avro_example(test_schema, test_table):
     parsed_schema = fastavro.parse_schema(test_schema)
     rows = dicts(test_table)
     with NamedTemporaryFile(delete=False, mode='wb') as fo:
         fastavro.writer(fo, parsed_schema, rows)
         return fo.name
Example #36
def main():
    parser = argparse.ArgumentParser(
        description='load IgBLAST annotations into an Avro sequence record',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('parse_label',
                        metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('repertoire_filenames',
                        metavar='repertoire-file',
                        nargs=3,
                        help='the V(D)J repertoire file used in IgBLAST')
    parser.add_argument('seq_record_filename',
                        metavar='seq_record.avro',
                        help='the Avro file with the sequence records')
    parser.add_argument(
        'igblast_output_filenames',
        metavar='parse.igblast',
        nargs='+',
        help='the output of IgBLAST to parse and attach to the sequence record'
    )
    # options
    parser.add_argument('--min-v-score',
                        metavar='S',
                        type=float,
                        default=70.0,
                        help='the minimum score for the V-segment')
    parser.add_argument('--min-j-score',
                        metavar='S',
                        type=float,
                        default=26.0,
                        help='the minimum score for the J-segment')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    logging.info('calculating V(D)J repertoire lengths')
    germline_lengths = {}
    for rep_filename in args.repertoire_filenames:
        with open(rep_filename, 'rt') as rep_handle:
            for record in SeqIO.parse(rep_handle, 'fasta'):
                germline_lengths[record.id] = len(record)

    logging.info('adding parses to sequence records')

    with open_compressed(args.seq_record_filename, 'rb') as seq_record_handle:
        seq_record_reader = fastavro.reader(seq_record_handle)
        igblast_parse_reader = igblast_chain(args.igblast_output_filenames)

        annotator = igblast_annotator(germline_lengths, seq_record_reader,
                                      igblast_parse_reader, args.parse_label,
                                      args.min_v_score, args.min_j_score)

        fastavro.writer(sys.stdout.buffer,
                        seq_record_reader.writer_schema,
                        annotator,
                        codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #37
def test_unsupported_codec():
    schema = {
        "doc":
        "A weather reading.",
        "name":
        "Weather",
        "namespace":
        "test",
        "type":
        "record",
        "fields": [
            {
                "name": "station",
                "type": "string"
            },
            {
                "name": "time",
                "type": "long"
            },
            {
                "name": "temp",
                "type": "int"
            },
        ],
    }

    records = [
        {
            "station": "011990-99999",
            "temp": 0,
            "time": 1433269388
        },
        {
            "station": "011990-99999",
            "temp": 22,
            "time": 1433270389
        },
        {
            "station": "011990-99999",
            "temp": -11,
            "time": 1433273379
        },
        {
            "station": "012650-99999",
            "temp": 111,
            "time": 1433275478
        },
    ]

    file = MemoryIO()
    with pytest.raises(ValueError, match="unrecognized codec"):
        fastavro.writer(file, schema, records, codec="unsupported")

    file = MemoryIO()
    fastavro.writer(file, schema, records, codec="deflate")

    # Change the avro binary to act as if it were written with a codec called
    # `unsupported`
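    # (the codec name lives in the file header as a length-prefixed string:
    # 0x0e is the zigzag varint for length 7, 0x16 for length 11)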
    modified_avro = file.getvalue().replace(b"\x0edeflate", b"\x16unsupported")
    modified_file = MemoryIO(modified_avro)

    with pytest.raises(ValueError, match="Unrecognized codec"):
        list(fastavro.reader(modified_file))
Example #38
def create_cqi_output(filename):

    lst = list()
    # read in one avro file
    with open(const.get_cqi_input_file_path() + filename, 'rb') as fo:
        reader = fastavro.reader(fo)

        for record in reader:
            lst.append([
                record['itemId'], record['productId'], record['categoryCode'],
                record['originalAttr'], record['normalizedAttr'],
                record['excludeType'], record['categoryCodeLv1'],
                record['categoryNameLv1']
            ])

    # noinspection PyUnresolvedReferences
    df = pd.DataFrame(lst,
                      columns=[
                          'itemId', 'productId', 'categoryCode',
                          'originalAttr', 'normalizedAttr', 'excludeType',
                          'categoryCodeLv1', 'categoryNameLv1'
                      ])
    lst = None

    df['originCateCode'] = df['categoryCode']
    df['originString'] = df['originalAttr']
    df['cleanseString'] = ''
    df['predCateCode'] = ''
    df['predCateCode1'] = ''
    df['predCateCode2'] = ''
    df['predCateCode3'] = ''
    df['predCateCode4'] = ''
    df['predCateCode5'] = ''
    df['predCateCode6'] = ''
    df['scoreCateCode1'] = 0.0
    df['scoreCateCode2'] = 0.0
    df['scoreCateCode3_6'] = 0.0
    df['scoreFinal'] = 0.0
    df['success'] = 0

    # noinspection PyUnresolvedReferences
    cleansed_prod_df = pd.read_csv(
        const.get_cleansed_prod_dictionary_file_name(),
        names=['productId', 'isCleansed'],
        sep='\t',
        dtype=[('productId', 'long'), ('isCleansed', 'str')])

    # df = pd.merge(df, book_cate_df, on='originCateCode', how='left')
    # df = pd.merge(df, jikgu_prod_df, on='productId', how='left')
    # noinspection PyUnresolvedReferences
    df = pd.merge(df, cleansed_prod_df, on='productId', how='left')

    for i, row in df.iterrows():
        if not df.at[i, 'originString'] or len(df.at[i, 'originString']) == 0:
            continue

        pred = predict.predict(
            model_lv1,
            model_lv2s,
            model_lv3s,
            df.at[i,
                  'normalizedAttr'],  # input already garbage filtered string
            product_id=df.at[i, 'productId'],
            item_id=df.at[i, 'itemId'],
            garbage_filter=False)[0]

        df.at[i, 'cleanseString'] = pred.get_normalized_input_string()

        if "OLD" not in str(df.at[i, 'categoryNameLv1']).upper():

            if "JIKGU" in df.at[i, 'excludeType']:
                continue

            if "BOOK" in df.at[i, 'excludeType']:
                continue

            if "DVD" in df.at[i, 'excludeType']:
                continue

            if df.at[i, 'isCleansed'] == '1':
                if len(str(df.at[i, 'excludeType'])) == 0:
                    df.at[i, 'excludeType'] = 'OPERATOR_MODEL'
                else:
                    df.at[i, 'excludeType'] = str(
                        df.at[i, 'excludeType']) + ',OPERATOR_MODEL'
                continue

            if pred.get_predict_error() is True:
                continue

            if pred.get_final_score() < 0.25:
                df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
                df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
                df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
                df.at[i, 'scoreFinal'] = pred.get_final_score()
                continue

        df.at[i, 'predCateCode'] = pred.get_catecode()
        df.at[i, 'predCateCode1'] = pred.get_lv1_catecode()
        df.at[i, 'predCateCode2'] = pred.get_lv2_catecode()
        df.at[i, 'predCateCode3'] = pred.get_lv3_catecode()
        df.at[i, 'predCateCode4'] = pred.get_lv4_catecode()
        df.at[i, 'predCateCode5'] = pred.get_lv5_catecode()
        df.at[i, 'predCateCode6'] = pred.get_lv6_catecode()
        df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
        df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
        df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
        df.at[i, 'scoreFinal'] = pred.get_final_score()
        if pred.get_predict_error() is True:
            df.at[i, 'success'] = 0
        else:
            df.at[i, 'success'] = 1

    # write result out to avro file
    schema = {
        'name':
        'topLevelRecord',
        'type':
        'record',
        'fields': [{
            'name': 'itemId',
            'type': ['long', 'null']
        }, {
            'name': 'productId',
            'type': ['long', 'null']
        }, {
            'name': 'originCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'originString',
            'type': 'string'
        }, {
            'name': 'cleanseString',
            'type': 'string'
        }, {
            'name': 'predCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode1',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode2',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode3',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode4',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode5',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode6',
            'type': ['string', 'null']
        }, {
            'name': 'scoreCateCode1',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode2',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode3_6',
            'type': ['float', 'null']
        }, {
            'name': 'scoreFinal',
            'type': ['float', 'null']
        }, {
            'name': 'excludeType',
            'type': 'string'
        }]
    }

    output = df[[
        'itemId', 'productId', 'originCateCode', 'originString',
        'cleanseString', 'predCateCode', 'predCateCode1', 'predCateCode2',
        'predCateCode3', 'predCateCode4', 'predCateCode5', 'predCateCode6',
        'scoreCateCode1', 'scoreCateCode2', 'scoreCateCode3_6', 'scoreFinal',
        'excludeType'
    ]]

    records = output.to_json(orient='records')
    records = json.loads(records)
    with open(const.get_cqi_output_file_path() + filename, 'wb') as out:
        fastavro.writer(out, schema, records)

    logger.info("Successfully wrote " + filename)
Example #39
schema = {
    # the 'key' field is assumed to be a string, matching the "key%s" values built below
    'fields': [{
        'name': 'key',
        'type': 'string'
    }, {
        'name': 'value',
        'type': 'long'
    }],
    'name': 'AutoGen',
    'namespace': 'autogenerated',
    'type': 'record'
}
keys = ("key%s" % s for s in range(10000))
vals = range(10000)
data = [{'key': key, 'value': val} for key, val in zip(keys, vals)]
f = BytesIO()
fastavro.writer(f, schema, data)
f.seek(0)
avro_bytes = f.read()

f.seek(0)
av = fastavro.reader(f)
header = av._header


def test_avro_body():
    sync = header['sync']
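    # the sync value is the 16-byte marker written between data blocks, so
    # splitting the raw bytes on it isolates individual blocks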
    subset = sync.join(avro_bytes.split(sync)[2:4])
    assert subset

    for b in (avro_bytes, subset):
        b = b.split(sync, 1)[1]
Example #40
 def write(records):
     fastavro.writer(fo=self._fo.__enter__(),
                     schema=self._schema,
                     records=records,
                     **self._writer_kwargs)
Example #41
schema = {
    'name': 'TestRecord',
    'type': 'record',
    'fields': [
        { 'name': 'D0', 'type': 'string', 'pinotType': 'DIMENSION' },
        { 'name': 'D1', 'type': 'string', 'pinotType': 'DIMENSION' },
        { 'name': 'D2', 'type': 'string', 'pinotType': 'DIMENSION' },
        { 'name': 'daysSinceEpoch', 'type': 'long', 'pinotType': 'TIME' },
        { 'name': 'M0', 'type': 'long', 'pinotType': 'METRIC' },
        { 'name': 'M1', 'type': 'double', 'pinotType': 'METRIC' }
    ]
}

records = []

for i in range(args.num_records):
    record = {
        'D0': str(i % 2),
        'D1': str(i % 4),
        'D2': str(i % 8),
        'daysSinceEpoch': int(i % args.num_time_buckets),
        'M0': 1,
        'M1': 1.0
    }
    records.append(record)

print('Writing {}'.format(sys.argv[1]))

with open(sys.argv[1], 'wb') as out:
    writer(out, schema, records)
Example #42
characters = [{
    "id": 1,
    "name": "Martin Riggs"
}, {
    "id": 2,
    "name": "John Wick"
}, {
    "id": 3,
    "name": "Ripley"
}]

# Define the data schema
schema = {
    "type": "record",
    "namespace": "com.badassmoviecharacters",
    "name": "Character",
    "doc": "Seriously badass characters",
    "fields": [{
        "name": "name",
        "type": "string"
    }, {
        "name": "id",
        "type": "int"
    }]
}

# Open a binary file in write mode
with open("characters.avro", 'wb') as avro_file:
    # Write the data
    fastavro.writer(avro_file, schema, characters, codec="deflate")
Example #43
from fastavro import reader, writer, parse_schema
import json

inpFile = str(input("Enter the avro data file name:  "))
inpSchemaFile = str(input("Enter the avro schema file name:  "))
outFile = str(input("Enter the avro output file:   ")) or "outdata.avro"

with open(inpSchemaFile, 'rb') as sc:
    schema = sc.read()

parsed = parse_schema(json.loads(schema))

with open(inpFile, 'rb') as inp:
    records = [r for r in reader(inp)]
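    # duplicate the last record so it can be edited below and appended as a new one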
    records.append(records[-1])
    flag = 1
    while flag:
        field = str(input("Which field you want to edit:    "))
        if '.' in field:
            pass
        else:
            records[-1][field] = int(
                input("Enter the value for " + field + ":   "))
        flag = int(input("Press 1 to continue or 0 to halt: "))
    print(records[-1])

with open(outFile, 'wb') as out:
    writer(out, parsed, records)
Example #44
import sys
from xml.etree import ElementTree as ET

import fastavro

osm_file = sys.argv[1]
schema = fastavro.schema.load_schema(sys.argv[2])
output_folder = sys.argv[3]
compression_codec = 'null'
if len(sys.argv) > 4:
    compression_codec = sys.argv[4]

nodes = []

tree = ET.parse(open(osm_file))

for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })

# Dump nodes dictionary in an avro file
osm_file_name = osm_file.split('/')[-1]
avro_file = output_folder + osm_file_name[:-3] + 'avro'
with open(avro_file, 'wb') as af:
    fastavro.writer(af, schema, nodes, codec=compression_codec)
Example #45
characters = [{
    "id": 1,
    "name": "Martin Riggs"
}, {
    "id": 2,
    "name": "John Wick"
}, {
    "id": 3,
    "name": "Ripley"
}]

# Define the data schema
schema = {
    "type": "record",
    "namespace": "com.badassmoviecharacters",
    "name": "Character",
    "doc": "Seriously badass characters",
    "fields": [{
        "name": "name",
        "type": "string"
    }, {
        "name": "id",
        "type": "int"
    }]
}

# Open a binary file in write mode
with open("characters.avro", 'wb') as avro_file:
    # Write the data
    fastavro.writer(avro_file, schema, characters)
Example #46
 def serialise_event_message(event_type, name):
     buffer = io.BytesIO()
     writer(buffer, event_schema,
            [{'event_type': event_type, 'name': name}])
     return buffer.getvalue()
Example #47
def write_avro_records(id, records):
    fpath = 'app-{}.avro'.format(id)
    with open(fpath, 'wb') as out:
        writer(out, PARSED_SCHEMA, records)
    return fpath
Example #48
def write_avro_data_to_file_with_schema(filename, json_schema, records):
    """Write out large numbers of records with the schema.
       This makes for easier reading and does not significantly affect space.
    """
    with open(filename, 'wb') as out:
        fastavro.writer(out, json_schema, records)
Example #49
import fastavro
import json
from io import BytesIO

schema = fastavro.parse_schema({
    "type": "record",
    "name": "testing",
    "fields": [{
        "name": "email",
        "type": "string"
    }]
})

data_dict = {}
data_dict['email'] = '*****@*****.**'

with BytesIO() as buf:
    fastavro.writer(buf, schema, [data_dict])
    buf.seek(0)

    result = fastavro.reader(buf)

    print([email for email in result])
#

#bio.seek(0)

#result = list(fastavro.reader(bio))
#for r in result:
#    print(r)
Example #50
def test_xz_works_by_default_on_windows_python3():
    schema = {
        "doc":
        "A weather reading.",
        "name":
        "Weather",
        "namespace":
        "test",
        "type":
        "record",
        "fields": [
            {
                "name": "station",
                "type": "string"
            },
            {
                "name": "time",
                "type": "long"
            },
            {
                "name": "temp",
                "type": "int"
            },
        ],
    }

    records = [
        {
            "station": "011990-99999",
            "temp": 0,
            "time": 1433269388
        },
        {
            "station": "011990-99999",
            "temp": 22,
            "time": 1433270389
        },
        {
            "station": "011990-99999",
            "temp": -11,
            "time": 1433273379
        },
        {
            "station": "012650-99999",
            "temp": 111,
            "time": 1433275478
        },
    ]

    file = MemoryIO()

    if sys.version_info >= (3, 0):
        fastavro.writer(file, schema, records, codec="xz")

        file.seek(0)
        out_records = list(fastavro.reader(file))
        assert records == out_records
    else:
        with pytest.raises(
                ValueError,
                match="xz codec is supported but you need to install"):
            fastavro.writer(file, schema, records, codec="xz")
Example #51
def test_union_records():
    #
    schema = {
        'name':
        'test_name',
        'namespace':
        'test',
        'type':
        'record',
        'fields': [{
            'name':
            'val',
            'type': [{
                'name':
                'a',
                'namespace':
                'common',
                'type':
                'record',
                'fields': [
                    {
                        'name': 'x',
                        'type': 'int'
                    },
                    {
                        'name': 'y',
                        'type': 'int'
                    },
                ],
            }, {
                'name':
                'b',
                'namespace':
                'common',
                'type':
                'record',
                'fields': [
                    {
                        'name': 'x',
                        'type': 'int'
                    },
                    {
                        'name': 'y',
                        'type': 'int'
                    },
                    {
                        'name': 'z',
                        'type': ['null', 'int']
                    },
                ],
            }]
        }]
    }

    data = [{
        'val': {
            'x': 3,
            'y': 4,
            'z': 5,
        }
    }]
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, data)
    new_file.seek(0)

    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == data
Example #52
def serialize(schema, *records):
    buffer = BytesIO()
    fastavro.writer(buffer, schema, records)
    serialized = buffer.getvalue()
    return serialized
Example #53
cohort_data_entity = {
    "name": "icdc.cohort",
    "id": "n201",
    "object": cohort_data,
    "relations": []
}

assert validate(("pfb.Entity", cohort_data_entity), pfb_schema)

payload = [{
    "name": "Metadata",
    "object": {
        "name": "pfb.Metadata",
        "misc": {},
        "nodes": [icdc_cohort_meta, icdc_case_meta]
    }
}, cohort_data_entity, case_data_entity]

# Create PFB message

with open("worked-example.avro", "wb") as out:
    fastavro.writer(out, pfb_schema, payload)

# Read records from message

with open("worked-example.avro", "rb") as inf:
    rdr = fastavro.reader(inf)
    for rec in rdr:
        print(rec)
Example #54
def _write_avro_part(part, f, schema, codec, sync_interval, metadata):
    """Create single avro file from list of dictionaries"""
    import fastavro

    with f as f:
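        # f is an already-opened file-like context manager; the with-block ensures
        # it is closed once the records have been written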
        fastavro.writer(f, schema, part, codec, sync_interval, metadata)
Example #55
 def serialize(self):
     """Serializes the ClickEvent for sending to Kafka"""
     out = BytesIO()
     writer(out, ClickEvent.schema, [asdict(self)])
     return out.getvalue()
Example #56
    "type": "array",
    "items": "float"
})


def generate():
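    # yield records lazily; fastavro.writer accepts any iterable, so the whole
    # dataset never has to be materialised as Python objects at once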
    start = stop = 0
    while start < len(array):
        stop = min(stop + events_per_basket, len(array))
        chunk = json.loads(ak.to_json(array[start:stop]))
        for x in chunk:
            yield x
        print(int(round(100 * stop / len(array))), "percent",
              time.asctime(time.localtime()))
        start = stop


for level in [9, 1]:  # 9, 1, 0:
    print("level", level)
    with open(
            "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour"
            + str(level) + "-jagged1.avro", "wb") as out:
        fastavro.writer(
            out,
            schema,
            generate(),
            codec="lz4",  # "deflate",
            codec_compression_level=level,
            sync_interval=45633959,
        )
Example #57
 def put_avro(self, schema, records, blob_name, codec='snappy'):
     path = self._get_path_and_create_dir(blob_name)
     with open(path, "wb") as f:
         fastavro.writer(f, schema, records, codec)
     size = os.path.getsize(path)
     return Blob(blob_name, size)