def test_orc_single(orc_files):
    """Read one ORC file; validate size, partitioning, and column selection."""
    path = orc_files[0]
    full = read_orc(path)
    assert len(full) == 70000
    assert full.npartitions == 8

    projected = read_orc(path, columns=["time", "date"])
    assert_eq(full[columns], projected[columns])

    # Requesting an unknown column must raise a descriptive ValueError.
    with pytest.raises(ValueError, match="nonexist"):
        read_orc(path, columns=["time", "nonexist"])
def test_orc_single(orc_files):
    """Read one ORC file; validate size, partitioning, and column selection."""
    fn = orc_files[0]
    d = read_orc(fn)
    assert len(d) == 70000
    assert d.npartitions == 8

    d2 = read_orc(fn, columns=['time', 'date'])
    assert_eq(d[columns], d2[columns])

    # FIX: the old `assert 'nonexist' in str(e)` inspected str(ExceptionInfo),
    # which is not the exception message; assert on the message via `match=`.
    with pytest.raises(ValueError, match='nonexist'):
        read_orc(fn, columns=['time', 'nonexist'])
def test_orc_single(orc_files):
    """Single-file ORC read: row count, partition count, column projection."""
    fn = orc_files[0]
    d = read_orc(fn)
    assert len(d) == 70000
    assert d.npartitions == 8

    d2 = read_orc(fn, columns=['time', 'date'])
    assert_eq(d[columns], d2[columns])

    # FIX: `str(e)` on pytest's ExceptionInfo is not the exception text
    # (use e.value or match=); check the message with `match=` instead.
    with pytest.raises(ValueError, match='nonexist'):
        read_orc(fn, columns=['time', 'nonexist'])
def test_orc_aggregate_files_offset(orc_files):
    """Stripe aggregation across two files should collapse 16 partitions to 2.

    The default read yields 16 partitions, so split_stripes=11 with
    aggregate_files=True produces two partitions, the first holding
    more than half of the rows.
    """
    aggregated = dd.read_orc(orc_files[:2], split_stripes=11, aggregate_files=True)
    assert aggregated.npartitions == 2
    assert len(aggregated.partitions[0].index) > len(aggregated.index) // 2
def test_orc_single(orc_files):
    """Single-file read plus a check that getitem optimization projects columns."""
    path = orc_files[0]
    frame = read_orc(path)
    assert len(frame) == 70000
    assert frame.npartitions == 8

    projected = read_orc(path, columns=["time", "date"])
    assert_eq(frame[columns], projected[columns])

    with pytest.raises(ValueError, match="nonexist"):
        read_orc(path, columns=["time", "nonexist"])

    # `optimize_dataframe_getitem` should rewrite the `columns` attribute
    # of the "read-orc" layer to the selected subset.
    selected = frame[columns]
    keys = [(selected._name, part) for part in range(selected.npartitions)]
    graph = optimize_dataframe_getitem(selected.__dask_graph__(), keys)
    layer_name = next(k for k in graph.layers if k.startswith("read-orc-"))
    assert set(graph.layers[layer_name].columns) == set(columns)
def read_using_dask(self):
    """Read the ORC dataset at ``self.path`` into a dask DataFrame.

    Only ``self.columns`` are loaded; the elapsed wall-clock time is
    printed for rough benchmarking.

    Returns:
        dask.dataframe.DataFrame: the lazily-loaded frame.
    """
    t1 = timeit.default_timer()
    ipdf = dd.read_orc(self.path, columns=self.columns)
    # FIX: the old message (and a stray misplaced string literal) claimed
    # this dealt with a *parquet* file; the code reads ORC.
    print("Time taken : {} seconds for reading orc file '{}'".format(
        timeit.default_timer() - t1, self.path))
    return ipdf
def test_orc_roundtrip_aggregate_files(tmpdir, split_stripes):
    """Round-trip with aggregate_files=True yields the expected partitioning."""
    out_dir = str(tmpdir)
    expected = pd.DataFrame(
        {
            "a": np.arange(100, dtype=np.float64),
            "b": np.random.choice(["cat", "dog", "mouse"], size=100),
        }
    )
    ddf = dd.from_pandas(expected, npartitions=8)
    ddf.to_orc(out_dir, write_index=False)

    result = dd.read_orc(out_dir, split_stripes=split_stripes, aggregate_files=True)

    # Aggregation merges every `split_stripes` stripes into one partition;
    # a falsy split_stripes leaves the original partitioning intact.
    if split_stripes:
        assert result.npartitions == ddf.npartitions / int(split_stripes)
    else:
        assert result.npartitions == ddf.npartitions
    assert_eq(expected, result, check_index=False)
def dataframe_loader(_context, config):
    """Load a dask DataFrame as described by ``config``.

    ``config`` maps a single file_type key (csv, parquet, hdf, json,
    sql_table, table, fwf, orc) to reader options. ``path`` is split out
    and passed positionally; remaining options are forwarded as kwargs.

    Raises:
        DagsterInvariantViolationError: if file_type is unsupported.
    """
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    # sql_table is the odd one out: it takes no positional path argument.
    if file_type == "sql_table":
        return dd.read_sql_table(**file_options)

    # Dispatch table replaces the long if/elif chain; each reader has the
    # same (path, **options) calling convention.
    readers = {
        "csv": dd.read_csv,
        "parquet": dd.read_parquet,
        "hdf": dd.read_hdf,
        "json": dd.read_json,
        "table": dd.read_table,
        "fwf": dd.read_fwf,
        "orc": dd.read_orc,
    }
    reader = readers.get(file_type)
    if reader is None:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
    return reader(path, **dict_without_keys(file_options, "path"))
def dataframe_loader(_context, config):
    """Load a dask DataFrame according to the single file_type entry in config."""
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    # read_sql_table takes its options directly, with no positional path.
    if file_type == 'sql_table':
        return dd.read_sql_table(**file_options)

    # All remaining readers share the (path, **options) calling convention,
    # so dispatch through a table instead of an if/elif ladder.
    readers = {
        'csv': dd.read_csv,
        'parquet': dd.read_parquet,
        'hdf': dd.read_hdf,
        'json': dd.read_json,
        'table': dd.read_table,
        'fwf': dd.read_fwf,
        'orc': dd.read_orc,
    }
    if file_type not in readers:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
    return readers[file_type](path, **dict_without_keys(file_options, 'path'))
def test_orc_roundtrip(tmpdir, index, columns):
    """Write then read an ORC dataset, optionally with an index and column subset."""
    out_dir = str(tmpdir)
    expected = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice(
                ["hello", "yo", "people"], size=1000
            ).astype("O"),
        }
    )
    if index:
        expected.set_index(index, inplace=True)
    ddf = dd.from_pandas(expected, chunksize=500)
    # Trim the comparison frame to the requested columns (index excluded).
    if columns:
        expected = expected[[c for c in columns if c != index]]

    # Write
    ddf.to_orc(out_dir, write_index=bool(index))
    # Read back and compare
    result = dd.read_orc(out_dir, index=index, columns=columns)
    assert_eq(expected, result, check_index=bool(index))
def test_orc_multiple(orc_files):
    """Reading a path list (or glob) concatenates the per-file frames."""
    single = read_orc(orc_files[0])

    listed = read_orc(orc_files)
    assert_eq(listed[columns], dd.concat([single, single])[columns], check_index=False)

    # A glob pattern must behave exactly like the explicit path list.
    globbed = read_orc(os.path.dirname(orc_files[0]) + '/*.orc')
    assert_eq(globbed[columns], dd.concat([single, single])[columns], check_index=False)
def test_orc_names(orc_files, tmp_path):
    """ORC read/write graph keys carry recognizable name prefixes."""
    frame = dd.read_orc(orc_files)
    assert frame._name.startswith("read-orc")
    # compute=False returns the delayed write task rather than executing it.
    pending_write = frame.to_orc(tmp_path, compute=False)
    assert pending_write._name.startswith("to-orc")
def test_orc_with_backend():
    """Fetch an ORC file over HTTP and sanity-check columns and row count."""
    pytest.importorskip('requests')
    frame = read_orc(url)
    # Column order is not guaranteed, so compare as a set.
    assert set(frame.columns) == {'time', 'date'}
    assert len(frame) == 70000
def test_orc_multiple(orc_files):
    """Multi-file reads (list or glob) equal the concatenation of single reads."""
    base = read_orc(orc_files[0])
    doubled = dd.concat([base, base])

    from_list = read_orc(orc_files)
    assert_eq(from_list[columns], doubled[columns], check_index=False)

    # The same files addressed via a glob give the same result.
    pattern = os.path.dirname(orc_files[0]) + "/*.orc"
    from_glob = read_orc(pattern)
    assert_eq(from_glob[columns], doubled[columns], check_index=False)
def test_orc_with_backend():
    """Read an ORC file through the HTTP backend and sanity-check it."""
    pytest.importorskip("requests")
    remote = read_orc(url)
    # Order of columns is not guaranteed; compare membership only.
    assert set(remote.columns) == {"time", "date"}
    assert len(remote) == 70000
def test_orc_with_backend():
    """Fetch an ORC file over HTTP and sanity-check its contents.

    FIX: skip (rather than error) when the optional ``requests`` HTTP
    backend is not installed, matching the other backend tests here.
    """
    pytest.importorskip('requests')
    d = read_orc(url)
    assert set(d.columns) == {'time', 'date'}  # order is not guaranteed
    assert len(d) == 70000
def test_orc_with_backend():
    """HTTP-backend ORC read: check column set and row count.

    FIX: this variant did not guard on the optional ``requests``
    dependency; add ``pytest.importorskip`` so the test is skipped,
    not errored, when the HTTP backend is unavailable.
    """
    pytest.importorskip('requests')
    d = read_orc(url)
    assert set(d.columns) == {'time', 'date'}  # order is not guaranteed
    assert len(d) == 70000
def test_orc_with_backend():
    """Sanity-check an ORC file served over HTTP (requires `requests`)."""
    pytest.importorskip('requests')
    result = read_orc(url)
    expected_columns = {'time', 'date'}
    # Column ordering is not guaranteed by the backend; compare as sets.
    assert set(result.columns) == expected_columns
    assert len(result) == 70000