Example No. 1
def test_orc_single(orc_files):
    fn = orc_files[0]
    d = read_orc(fn)
    assert len(d) == 70000
    assert d.npartitions == 8
    d2 = read_orc(fn, columns=["time", "date"])
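    # `columns` is a module-level constant in the original test file
    # (["time", "date"], the same columns selected just above)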
    assert_eq(d[columns], d2[columns])
    with pytest.raises(ValueError, match="nonexist"):
        read_orc(fn, columns=["time", "nonexist"])
Example No. 2
def test_orc_single(orc_files):
    fn = orc_files[0]
    d = read_orc(fn)
    assert len(d) == 70000
    assert d.npartitions == 8
    d2 = read_orc(fn, columns=['time', 'date'])
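    # 'columns' is a module-level constant in the original test file
    # (['time', 'date'], the same columns selected just above)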
    assert_eq(d[columns], d2[columns])
    with pytest.raises(ValueError) as e:
        read_orc(fn, columns=['time', 'nonexist'])
    assert 'nonexist' in str(e.value)
Example No. 3
def test_orc_aggregate_files_offset(orc_files):
    # Default read should give back 16 partitions. Therefore,
    # specifying split_stripes=11 & aggregate_files=True should
    # produce 2 partitions (with the first being larger than
    # the second)
    df2 = dd.read_orc(orc_files[:2], split_stripes=11, aggregate_files=True)
    assert df2.npartitions == 2
    assert len(df2.partitions[0].index) > len(df2.index) // 2
Example No. 4
def test_orc_single(orc_files):
    fn = orc_files[0]
    d = read_orc(fn)
    assert len(d) == 70000
    assert d.npartitions == 8
    d2 = read_orc(fn, columns=["time", "date"])
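    # `columns` is a module-level constant in the original test file
    # (["time", "date"], the same columns selected just above)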
    assert_eq(d[columns], d2[columns])
    with pytest.raises(ValueError, match="nonexist"):
        read_orc(fn, columns=["time", "nonexist"])

    # Check that `optimize_dataframe_getitem` changes the
    # `columns` attribute of the "read-orc" layer
    d3 = d[columns]
    keys = [(d3._name, i) for i in range(d3.npartitions)]
    graph = optimize_dataframe_getitem(d3.__dask_graph__(), keys)
    key = [k for k in graph.layers.keys() if k.startswith("read-orc-")][0]
    assert set(graph.layers[key].columns) == set(columns)
Example No. 5
    def read_using_dask(self):
        """Read the ORC file at self.path into a dask DataFrame."""
        t1 = timeit.default_timer()
        ipdf = dd.read_orc(self.path, columns=self.columns)
        print("Time taken : {} seconds for reading ORC file '{}'".format(
            timeit.default_timer() - t1, self.path))

        return ipdf
Example No. 6
def test_orc_roundtrip_aggregate_files(tmpdir, split_stripes):
    tmp = str(tmpdir)
    data = pd.DataFrame(
        {
            "a": np.arange(100, dtype=np.float64),
            "b": np.random.choice(["cat", "dog", "mouse"], size=100),
        }
    )
    df = dd.from_pandas(data, npartitions=8)
    df.to_orc(tmp, write_index=False)
    df2 = dd.read_orc(tmp, split_stripes=split_stripes, aggregate_files=True)

    # Check that we have the expected partition count
    # and that the data is correct
    if split_stripes:
        assert df2.npartitions == df.npartitions / int(split_stripes)
    else:
        assert df2.npartitions == df.npartitions
    assert_eq(data, df2, check_index=False)
Example No. 7
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Example No. 8
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        return dd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return dd.read_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        return dd.read_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        return dd.read_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql_table':
        return dd.read_sql_table(**file_options)
    elif file_type == 'table':
        return dd.read_table(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'fwf':
        return dd.read_fwf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'orc':
        return dd.read_orc(path, **dict_without_keys(file_options, 'path'))
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
Example No. 9
def test_orc_roundtrip(tmpdir, index, columns):
    tmp = str(tmpdir)
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice(["hello", "yo", "people"], size=1000).astype(
                "O"
            ),
        }
    )
    if index:
        data.set_index(index, inplace=True)
    df = dd.from_pandas(data, chunksize=500)
    if columns:
        data = data[[c for c in columns if c != index]]

    # Write
    df.to_orc(tmp, write_index=bool(index))

    # Read
    df2 = dd.read_orc(tmp, index=index, columns=columns)
    assert_eq(data, df2, check_index=bool(index))
Example No. 10
def test_orc_multiple(orc_files):
    d = read_orc(orc_files[0])
    d2 = read_orc(orc_files)
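    # 'columns' is presumably the module-level ['time', 'date'] list shared by
    # the other tests in this file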
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
    d2 = read_orc(os.path.dirname(orc_files[0]) + '/*.orc')
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
Example No. 11
def test_orc_names(orc_files, tmp_path):
    df = dd.read_orc(orc_files)
    assert df._name.startswith("read-orc")
    out = df.to_orc(tmp_path, compute=False)
    assert out._name.startswith("to-orc")
Example No. 12
def test_orc_with_backend():
    pytest.importorskip('requests')
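    # 'url' is a module-level constant: the HTTP(S) address of an example ORC file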
    d = read_orc(url)
    assert set(d.columns) == {'time', 'date'}  # order is not guaranteed
    assert len(d) == 70000
Example No. 13
def test_orc_multiple(orc_files):
    d = read_orc(orc_files[0])
    d2 = read_orc(orc_files)
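    # "columns" is presumably the module-level ["time", "date"] list shared by
    # the other tests in this file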
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
    d2 = read_orc(os.path.dirname(orc_files[0]) + "/*.orc")
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
Example No. 14
def test_orc_with_backend():
    pytest.importorskip("requests")
    d = read_orc(url)
    assert set(d.columns) == {"time", "date"}  # order is not guaranteed
    assert len(d) == 70000
Example No. 15
def test_orc_with_backend():
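    # 'url' is a module-level constant: the HTTP(S) address of an example ORC file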
    d = read_orc(url)
    assert set(d.columns) == {'time', 'date'}  # order is not guaranteed
    assert len(d) == 70000