import io
import os
from pathlib import Path

import numba
import numpy as np
import pandas
import pyarrow
import pyarrow.parquet as pyarrow_parquet

import awkward as ak


def test_with_fields(tmp_path):
    one_list = [[{"x": 1}, {"x": 2}, {"x": 3}], [], [{"x": 4}, {"x": 5}]]
    two_list = [[{"x": 6}], [{"x": 7}, {"x": 8}, {"x": 9}, {"x": 10}]]
    one = ak.Array(one_list)
    two = ak.Array(two_list)

    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")
    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == one_list + two_list

    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == one_list + two_list

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == one_list + two_list

    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == one_list + two_list
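
# ak.to_parquet.dataset(directory) writes the Parquet "_common_metadata" and
# "_metadata" sidecar files, letting ak.from_parquet treat the directory as a
# single dataset; the tests above and below read the directory both without
# and with those files.
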
def test_no_fields(tmp_path):
    one = ak.Array([[1, 2, 3], [], [4, 5]])
    two = ak.Array([[6], [7, 8, 9, 10]])

    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")
    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]

    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                         [7, 8, 9, 10]]

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                      [7, 8, 9, 10]]

    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                           [7, 8, 9, 10]]
def test_6(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test6.parquet")
    data = [
        {"x": {"y": [], "z": 1.1}},
        {"x": {"y": [one], "z": 2.2}},
        {"x": {"y": [one, two, three], "z": 3.3}},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("z").array
    assert set(array.caches[0].keys()) == {"tmp:col:x.z[0]"}
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:col:x.z[0]", "tmp:lst:x.y[0]"}
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x.y[0]"}
    array.layout.field("x").array.field("z").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x.y[0]", "tmp:col:x.z[0]"}
    assert array.tolist() == data
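
# In awkward v1, a lazy from_parquet returns VirtualArray nodes: accessing a
# node's .array property forces the read, and the cache keys record what has
# been materialized so far. "tmp:col:..." entries are plain columns,
# "tmp:lst:..." entries are list columns (values plus offsets), and the
# trailing "[0]" indexes the (single) row-group partition. The second pass in
# test_6 repeats the reads in the opposite order to show that the cache fills
# in access order.
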
def test_8(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test8.parquet")
    data = [
        {"x": []},
        {"x": [{"y": one, "z": 1.1}]},
        {"x": [{"y": one, "z": 1.1}, {"y": two, "z": 2.2}, {"y": three, "z": 3.3}]},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.y[0]",
    }
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.y[0]",
        "tmp:col:x.list.item.z[0]",
    }
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.z[0]",
    }
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.z[0]",
        "tmp:col:x.list.item.y[0]",
    }
    assert array.tolist() == data
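
# "tmp:off:..." keys cache list offsets that were reconstructed from one of
# the leaf columns (x.list.item.y here): Parquet stores nesting as repetition
# levels on the leaves, so materializing the structure of "x" reads offsets
# from that column before either "y" or "z" is loaded.
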
def test_issue2(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")

    null_table = pyarrow.Table.from_pydict({"null_col": pyarrow.array([None])})
    pyarrow_parquet.write_table(null_table, filename)

    assert ak.from_parquet(filename).type == ak.from_parquet(filename, lazy=True).type
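
# A column containing only pyarrow nulls has no concrete value type; the check
# above ensures the eager and lazy readers agree on the Awkward type they
# assign to it.
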
def test_to_parquet_2(tmp_path):
    array = ak.Array([
        [{"x": 0.0, "y": []}, {"x": 1.1, "y": [1]}, {"x": 2.2, "y": None}],
        [],
        [{"x": 3.3, "y": [1, 2, 3]}, None, {"x": 4.4, "y": [1, 2, 3, 4]}],
    ])
    assert str(array.type) == '3 * var * ?{"x": float64, "y": option[var * int64]}'
    ak.to_parquet(array, os.path.join(tmp_path, "complicated-example.parquet"))
    array2 = ak.from_parquet(
        os.path.join(tmp_path, "complicated-example.parquet"))
    assert str(array2.type) == str(array.type)
    assert array2.tolist() == array.tolist()
def test_parquet2b(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array(
        [
            {"x": [{"y": 0.0, "z": 0}]},
            {"x": [{"y": 1.1, "z": 1}]},
            {"x": [{"y": 2.2, "z": 2}]},
        ]
    )
    ak.to_parquet(array, filename)

    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.y
                i += 1
        return out

    @numba.njit
    def f2(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.z
                i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
    assert f2(lazy).tolist() == [0, 1, 2]
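
# Lazy arrays (here read with lazy_cache=None) can be iterated inside
# numba.njit-compiled functions; f1 and f2 each pull a single leaf field
# ("y" or "z") out of the nested records.
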
def test(tmp_path):
    filename = os.path.join(tmp_path, "what-ever.parquet")
    fish = ak.Array([True, True])[np.newaxis]
    clob = ak.Array([2, 3, 7])[np.newaxis]
    frog = ak.zip({"c": clob, "f": fish}, depth_limit=1)
    ak.to_parquet(frog, filename)
    assert ak.from_parquet(filename).tolist() == frog.tolist()
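
# depth_limit=1 makes ak.zip stop at the outer (length-1) dimension, so the
# record can pair fields whose inner lists have different lengths (2 booleans
# vs. 3 integers).
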
def test_12(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test12.parquet")
    data = [
        {"x": {"y": []}},
        {"x": {"y": [[one]]}},
        {"x": {"y": [[one, two], [], [three]]}},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x.y[0]"}
    assert array.tolist() == data
def test_parsing(header_version: int, events_per_chunk: int) -> None:
    here = Path(__file__).parent
    input_filename = here / "parsing" / f"final_state_hadrons_header_v{header_version}.dat"

    for i, arrays in enumerate(
            parse_ascii.read(filename=input_filename,
                             events_per_chunk=events_per_chunk,
                             parser="pandas")):
        # Get the reference array
        # Create the reference arrays by checking out the parser v1 (e477e0277fa560f9aba82310c02da8177e61c9e4), setting
        # the chunk size in skim_ascii, and then calling:
        # $ python jetscape_analysis/analysis/reader/skim_ascii.py -i tests/parsing/final_state_hadrons_header_v1.dat -o tests/parsing/events_per_chunk_50/parser_v1_header_v1/test.parquet
        # NOTE: The final state hadron files won't exist when you check out that branch, so
        #       it's best to copy them for your existing branch.
        reference_arrays = ak.from_parquet(
            Path(
                f"{here}/parsing/events_per_chunk_{events_per_chunk}/parser_v1_header_v1/test_{i:02}.parquet"
            ))
        # There are more fields in v2 than in the reference arrays (v1), so only take those
        # that are present in reference for comparison.
        # NOTE: We have to compare the fields one-by-one because the shapes of the fields
        #       are different, and apparently don't broadcast nicely with `__eq__`
        for field in ak.fields(reference_arrays):
            new_field = _rename_columns.get(field, field)
            assert ak.all(reference_arrays[field] == arrays[new_field])

        # Check for cross section if header v2
        if header_version == 2:
            assert "cross_section" in ak.fields(arrays)
            assert "cross_section_error" in ak.fields(arrays)
def test():
    array = ak.Array([1, 2, 3])
    file_ = io.BytesIO()
    ak.to_parquet(array, file_)
    file_.seek(0)

    array_from_file = ak.from_parquet(file_)
    assert ak.to_list(array) == ak.to_list(array_from_file)
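
# Both ak.to_parquet and ak.from_parquet accept file-like objects as well as
# paths, as this io.BytesIO round trip shows.
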
def test_16(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test16.parquet")
    data = [[one, two], [], [three]]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == {"tmp:lst:[0]"}
    assert array.tolist() == data
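
# Excerpt from a loader class method: dispatch on file extension, using
# ak.from_parquet for ".parquet" files and falling back to a pickle loader
# otherwise.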
    def load(cls, path, *args, **kwargs):
        path = get_path(path)

        if path.endswith(".parquet"):
            import awkward as ak
            return ak.from_parquet(path, *args, **kwargs)

        # .pickle, .pkl
        return PickleFormatter.load(path, *args, **kwargs)
def test_1(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test1.parquet")
    data = [{"x": one}, {"x": two}, {"x": three}]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:col:x[0]"}
    assert array.tolist() == data
def test_4(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test4.parquet")
    data = [{"x": []}, {"x": [one]}, {"x": [one, two, three]}]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x[0]"}
    assert array.tolist() == data
def test_15(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test15.parquet")
    data = [one, two, three]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.array
    assert set(array.caches[0].keys()) == {"tmp:col:[0]"}
    assert array.tolist() == data
def test(tmp_path):
    filename = os.path.join(tmp_path, "test.parquet")
    dog = ak.from_iter([1, 2, 5])
    cat = ak.from_iter([4])
    pets = ak.zip({"dog": dog[np.newaxis], "cat": cat[np.newaxis]}, depth_limit=1)
    ak.to_parquet(pets, filename)
    assert ak.from_parquet(filename).tolist() == pets.tolist()
def test_17(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test17.parquet")
    data = [[{"x": one}, {"x": two}], [], [{"x": three}]]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == {"tmp:off:.list.item.x:[0]"}
    array.layout.array.content.field("x").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:.list.item.x:[0]",
        "tmp:col:.list.item.x[0]",
    }
    assert array.tolist() == data
def test_pandas(tmp_path):
    df = pandas.DataFrame(
        {"x": np.arange(10), "y": np.arange(10) % 5, "z": ["low"] * 5 + ["high"] * 5}
    )
    df.to_parquet(tmp_path, partition_cols=["z", "y"])

    a = ak.from_parquet(tmp_path)
    assert a.z.tolist() == ["high"] * 5 + ["low"] * 5  # alphabetical
    assert a.y.tolist() == ["0", "1", "2", "3", "4", "0", "1", "2", "3", "4"]
    assert a.x.tolist() == [5, 6, 7, 8, 9, 0, 1, 2, 3, 4]

    b = ak.from_parquet(tmp_path, lazy=True)
    assert b.z.tolist() == ["high"] * 5 + ["low"] * 5
    assert b.y.tolist() == ["0", "1", "2", "3", "4", "0", "1", "2", "3", "4"]
    assert b.x.tolist() == [5, 6, 7, 8, 9, 0, 1, 2, 3, 4]

    c = ak.from_parquet(tmp_path, include_partition_columns=False)
    assert ak.fields(c) == ["x"]

    d = ak.from_parquet(tmp_path, lazy=True, include_partition_columns=False)
    assert ak.fields(d) == ["x"]
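
# Roughly, pandas writes a hive-style layout here; directory names carry the
# partition values, which is why they read back as strings and in alphabetical
# order:
#
#   tmp_path/z=high/y=0/<part>.parquet
#   tmp_path/z=high/y=1/<part>.parquet
#   ...
#   tmp_path/z=low/y=4/<part>.parquet
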
def test_11(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test11.parquet")
    data = [
        {"x": []},
        {"x": [{"z": 1.1, "y": {"q": one}}]},
        {
            "x": [
                {"z": 1.1, "y": {"q": one}},
                {"z": 2.2, "y": {"q": two}},
                {"z": 3.3, "y": {"q": three}},
            ]
        },
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data
def test(tmp_path):
    filename = os.path.join(tmp_path, "test.parquet")
    ak.to_parquet(ak.repartition(range(8), 2), filename)

    assert ak.from_parquet(filename, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(filename, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]

    assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]

    ak.to_parquet.dataset(tmp_path)

    assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]
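
# ak.repartition(..., 2) writes the eight values as four two-element row
# groups, so row_groups=[1, 3] picks out [2, 3] and [6, 7] whether reading a
# single file, a bare directory, or a directory with _metadata. A minimal
# sketch of the same selection done with pyarrow directly:
#
#   import pyarrow.parquet as pq
#   table = pq.ParquetFile(filename).read_row_groups([1, 3])
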
def test_parsing_with_parquet(header_version: int, events_per_chunk: int,
                              tmp_path: Path) -> None:
    """Parse to parquet, read back, and compare."""
    here = Path(__file__).parent
    input_filename = here / "parsing" / f"final_state_hadrons_header_v{header_version}.dat"

    # Convert to chunks in a temp directory.
    base_output_filename = tmp_path / "test.parquet"
    parse_ascii.parse_to_parquet(base_output_filename=base_output_filename,
                                 store_only_necessary_columns=True,
                                 input_filename=input_filename,
                                 events_per_chunk=events_per_chunk)

    output_filenames = tmp_path.glob("*.parquet")

    for i, output_filename in enumerate(sorted(output_filenames)):
        arrays = ak.from_parquet(output_filename)

        # Create the reference arrays by checking out the parser v1 (e477e0277fa560f9aba82310c02da8177e61c9e4), setting
        # the chunk size in skim_ascii, and then calling:
        # $ python jetscape_analysis/analysis/reader/skim_ascii.py -i tests/parsing/final_state_hadrons_header_v1.dat -o tests/parsing/events_per_chunk_50/parser_v1_header_v1/test.parquet
        # NOTE: The final state hadron files won't exist when you check out that branch, so
        #       it's best to copy them for your existing branch.
        reference_arrays = ak.from_parquet(
            Path(
                f"{here}/parsing/events_per_chunk_{events_per_chunk}/parser_v1_header_v1/test_{i:02}.parquet"
            ))
        # There are more fields in v2 than in the reference arrays (v1), so only take those
        # that are present in reference for comparison.
        # NOTE: We have to compare the fields one-by-one because the shapes of the fields
        #       are different, and apparently don't broadcast nicely with `__eq__`
        for field in ak.fields(reference_arrays):
            new_field = _rename_columns.get(field, field)
            assert ak.all(reference_arrays[field] == arrays[new_field])

        # Check for cross section if header v2
        if header_version == 2:
            assert "cross_section" in ak.fields(arrays)
            assert "cross_section_error" in ak.fields(arrays)
def test(tmp_path):
    one = ak.Array([[], [{"x": [{"y": 1}]}]])
    two = ak.Array([[{"x": []}, {"x": [{"y": 1}]}]])
    three = ak.Array([[{"x": [{"y": 1}]}], [], [{"x": [{"y": 2}]}]])

    ak.to_parquet(one, tmp_path / "one.parquet")
    ak.to_parquet(two, tmp_path / "two.parquet")
    ak.to_parquet(three, tmp_path / "three.parquet")

    lazy_one = ak.from_parquet(tmp_path / "one.parquet", lazy=True)
    lazy_two = ak.from_parquet(tmp_path / "two.parquet", lazy=True)
    lazy_three = ak.from_parquet(tmp_path / "three.parquet", lazy=True)

    assert lazy_one.tolist() == [[], [{"x": [{"y": 1}]}]]
    assert lazy_two.tolist() == [[{"x": []}, {"x": [{"y": 1}]}]]
    assert lazy_three.tolist() == [
        [{"x": [{"y": 1}]}],
        [],
        [{"x": [{"y": 2}]}],
    ]
def test_parquet1(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array([{"x": {"y": 0.0}}, {"x": {"y": 1.1}}, {"x": {"y": 2.2}}])
    ak.to_parquet(array, filename)

    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            out[i] = obj.x.y
            i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
def test_9(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test9.parquet")
    data = [
        {"x": []},
        {"x": [{"y": {"q": one}}]},
        {"x": [{"y": {"q": one}}, {"y": {"q": two}}, {"y": {"q": three}}]},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.q:x[0]",
        "tmp:col:x.list.item.y.q[0]",
    }
    assert array.tolist() == data
def test_to_parquet(tmp_path):
    original = ak.Array([
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ])

    ak.to_parquet(original, os.path.join(tmp_path, "data.parquet"))
    reconstituted = ak.from_parquet(os.path.join(tmp_path, "data.parquet"))
    assert reconstituted.tolist() == [
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ]
    assert str(reconstituted.type) == '6 * var * {"x": int64, "y": float64}'
def test_parquet():
    empty = ak.from_parquet("tests/samples/zero-record-batches.parquet")
    assert isinstance(empty, ak.Array)
    assert len(empty) == 0
    assert str(empty.type) == "0 * {}"
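
# Benchmark script: time how long it takes to materialize every partition of a
# lazily read Parquet file. vmtouch -t first loads the file into the OS page
# cache so the measurement is not dominated by cold disk reads.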
import subprocess
import sys
import time

import awkward as ak

compress = sys.argv[1]
N = int(sys.argv[2])
is_split = sys.argv[3] == "split"

s = "-split" if is_split else ""
filename = f"/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/{compress}{s}-jagged{N}.parquet"

subprocess.call(f"vmtouch -t {filename} > /dev/null", shell=True)
subprocess.call(f"vmtouch {filename} | fgrep Pages", shell=True)

array = ak.from_parquet(filename, lazy=True)

begintime = time.time()
for partition in array.layout.partitions:
    tmp = partition.array

endtime = time.time()

print(f"pyarrow {compress}{s}-jagged{N}", endtime - begintime, "seconds")

array = ak.from_parquet(filename, lazy=True)

begintime = time.time()
for partition in array.layout.partitions:
    tmp = partition.array
def load_df():
    return ak.from_parquet(good_uproot_file_path)  # type: ignore
ew = Components.EventWise.from_file("../megaIgnore/IRCchecks_noPTcut1/iridis_pp_to_jjj_lo1_fragment/iridis_pp_to_jjj_lo1_fragment0.parquet")
spectral_jets = list({name.split('_')[0] for name in ew.columns if name.startswith("Spect") and "IRC" not in name})

# files that contain kinematic info
file_name = "../megaIgnore/IRCchecks_noPTcut{}/iridis_pp_to_jjj_{}{}_fragment/kinematics.parquet"
end_time = time.time() + 60*60*36
ew_shapes = Components.EventWise.from_file("../megaIgnore/IRC_shapes2.parquet")


for n in range(1, 5):
    spectral_shapes = [[[[] for _ in spectral_jets] for _ in ew_shapes.shape_names] for _ in ew_shapes.orders]

    for order in ["nlo", "lo"]:
        o_idx = list(ew_shapes.orders).index(order)
        name = file_name.format(n, order)
        kinematics = ak.from_parquet(name)
        print(name)
        
        print("Getting jet shapes")
        for j_idx, jname in enumerate(spectral_jets):
            print('.', end='', flush=True)
            ew.selected_event = None
            for event_n in range(len(kinematics[o_idx, 0, 0])):
                ew.selected_event = event_n
                event_kinematics = ak.to_numpy(kinematics[o_idx, 1:, j_idx, event_n, :])
                if np.any(np.isnan(event_kinematics)) or len(event_kinematics) != 4:
                    shapes = [np.nan for _ in ew_shapes.shape_names]
                else:
                    shape_dict = ShapeVariables.shape(*event_kinematics)[1]
                    shapes = [shape_dict[name] for name in ew_shapes.shape_names]
                for i, val in enumerate(shapes):