Example #1
def test_categorical(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
Example #2
def test_categorical(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
Example #3
def test_filters(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)

    df = pd.DataFrame({'at': ['ab', 'aa', 'ba', 'da', 'bb']})
    ddf = dd.from_pandas(df, npartitions=1)

    # Ok with 1 partition and filters
    ddf.repartition(npartitions=1, force=True).to_parquet(fn, write_index=False,
                                                          engine=write_engine)
    ddf2 = dd.read_parquet(fn, index=False, engine=read_engine,
                           filters=[('at', '==', 'aa')]).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and no filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and filters using base fastparquet
    if read_engine == 'fastparquet':
        ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
        df2 = fastparquet.ParquetFile(fn).to_pandas(filters=[('at', '==', 'aa')])
        assert len(df2) > 0

    # with >1 partition and filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine, filters=[('at', '==', 'aa')]).compute()
    assert len(ddf2) > 0
Example #4
def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')
    pytest.importorskip("snappy")

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1,  engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine, infer_divisions=should_check_divs(engine))
    assert_eq(out, ddf, check_index=(engine != 'fastparquet'), check_divisions=should_check_divs(engine))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.config.set(scheduler='sync'):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
Example #5
def test_read_from_fastparquet_parquetfile(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)

    df = pd.DataFrame({
        'a': np.random.choice(['A', 'B', 'C'], size=100),
        'b': np.random.random(size=100),
        'c': np.random.randint(1, 5, size=100)
    })
    d = dd.from_pandas(df, npartitions=2)
    d.to_parquet(fn, partition_on=['a'], engine='fastparquet')

    pq_f = fastparquet.ParquetFile(fn)

    # OK with no filters
    out = dd.read_parquet(pq_f).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])

    # OK with  filters
    out = dd.read_parquet(pq_f, filters=[('a', '==', 'B')]).compute()
    assert set(df.b[df.a == 'B']) == set(out.b)

    # Engine should not be set to 'pyarrow'
    with pytest.raises(AssertionError):
        out = dd.read_parquet(pq_f, engine='pyarrow')
Example #6
def test_informative_error_messages():
    with pytest.raises(ValueError) as info:
        dd.read_parquet('foo', engine='foo')

    assert 'foo' in str(info.value)
    assert 'arrow' in str(info.value)
    assert 'fastparquet' in str(info.value)
Example #7
def test_read_parquet_custom_columns(tmpdir, engine):
    import glob
    tmp = str(tmpdir)
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'f': np.arange(1000, dtype=np.float64)})
    df = dd.from_pandas(data, chunksize=50)
    df.to_parquet(tmp)

    df2 = dd.read_parquet(tmp,
                          columns=['i32', 'f'],
                          engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['i32', 'f']], df2,
              check_index=False, check_divisions=should_check_divs(engine))

    fns = glob.glob(os.path.join(tmp, '*.parquet'))
    df2 = dd.read_parquet(fns,
                          columns=['i32'],
                          engine=engine).compute()
    df2.sort_values('i32', inplace=True)
    assert_eq(df[['i32']], df2,
              check_index=False, check_divisions=False)

    df3 = dd.read_parquet(tmp,
                          columns=['f', 'i32'],
                          engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['f', 'i32']], df3,
              check_index=False, check_divisions=should_check_divs(engine))
Example #8
def test_read_series(tmpdir, engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, columns=['x'], engine=engine, infer_divisions=should_check_divs(engine))
    assert_eq(ddf[['x']], ddf2, check_divisions=should_check_divs(engine))

    ddf2 = dd.read_parquet(fn, columns='x', index='myindex', engine=engine, infer_divisions=should_check_divs(engine))
    assert_eq(ddf.x, ddf2, check_divisions=should_check_divs(engine))
Example #9
def test_read_series(tmpdir, engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, columns=['x'], engine=engine)
    assert_eq(df[['x']], ddf2)

    ddf2 = dd.read_parquet(fn, columns='x', index='myindex', engine=engine)
    assert_eq(df.x, ddf2)
Example #10
def test_pyarrow_raises_filters_categoricals(tmpdir):
    check_pyarrow()
    tmp = str(tmpdir)
    data = pd.DataFrame({"A": [1, 2]})
    df = dd.from_pandas(data, npartitions=2)

    df.to_parquet(tmp, write_index=False, engine="pyarrow")

    with pytest.raises(NotImplementedError):
        dd.read_parquet(tmp, engine="pyarrow", filters=["A>1"])
Example #11
def test_nonsense_column(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    with pytest.raises((ValueError, KeyError)):
        # fastparquet fails early, pyarrow only on compute
        dd.read_parquet(fn, columns=['nonesense'],
                        engine=read_engine).compute()
    with pytest.raises((Exception, KeyError)):
        # fastparquet fails early, pyarrow only on compute
        dd.read_parquet(fn, columns=['nonesense'] + list(ddf.columns),
                        engine=read_engine).compute()
Example #12
def test_read_parquet_custom_columns(tmpdir, engine):
    tmp = str(tmpdir)
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'f': np.arange(1000, dtype=np.float64)})
    df = dd.from_pandas(data, chunksize=50)
    df.to_parquet(tmp)

    df2 = dd.read_parquet(tmp, columns=['i32', 'f'], engine=engine)
    assert_eq(df[['i32', 'f']], df2, check_index=False)

    df3 = dd.read_parquet(tmp, columns=['f', 'i32'], engine=engine)
    assert_eq(df[['f', 'i32']], df3, check_index=False)
Example #13
def test_infer_divisions_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine, write_index=False)

    if read_engine == 'pyarrow' and not check_pa_divs:
        match = 'requires pyarrow >=0.9.0'
        ex = NotImplementedError
    else:
        match = 'no index column was discovered'
        ex = ValueError

    with pytest.raises(ex, match=match):
        dd.read_parquet(fn, engine=read_engine, infer_divisions=True)
Example #14
def test_infer_divisions_not_sorted(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)

    if read_engine == 'pyarrow' and not check_pa_divs:
        match = 'requires pyarrow >=0.9.0'
        ex = NotImplementedError
    else:
        match = 'not known to be sorted across partitions'
        ex = ValueError

    with pytest.raises(ex, match=match):
        dd.read_parquet(fn, index='x', engine=read_engine, infer_divisions=True)
Example #15
def test_passing_parquetfile(tmpdir):
    import shutil
    fp = pytest.importorskip('fastparquet')
    path = str(tmpdir)
    df = pd.DataFrame({"x": [1, 3, 2, 4]})
    ddf = dd.from_pandas(df, npartitions=1)

    dd.to_parquet(ddf, path)
    pf = fp.ParquetFile(path)
    shutil.rmtree(path)

    # should pass, because no need to re-read metadata
    dd.read_parquet(pf)
Example #16
def test_parquet_select_cats(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)
    df = pd.DataFrame({
        'categories': pd.Series(
            np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], size=100),
            dtype='category'),
        'ints': pd.Series(list(range(0, 100)), dtype='int'),
        'floats': pd.Series(list(range(0, 100)), dtype='float')})

    ddf = dd.from_pandas(df, 1)
    ddf.to_parquet(fn)
    rddf = dd.read_parquet(fn, columns=['ints'])
    assert list(rddf.columns) == ['ints']
    rddf = dd.read_parquet(fn)
    assert list(rddf.columns) == list(df)
Example #17
def test_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf.to_parquet(fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf2, check_index=False)
Example #18
def test_categories(fn):
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
                       'y': list('caaab')})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf['y'] = ddf.y.astype('category')
    ddf.to_parquet(fn)
    ddf2 = dd.read_parquet(fn, categories=['y'])
    with pytest.raises(NotImplementedError):
        ddf2.y.cat.categories
    assert set(ddf2.y.compute().cat.categories) == {'a', 'b', 'c'}
    cats_set = ddf2.map_partitions(lambda x: x.y.cat.categories).compute()
    assert cats_set.tolist() == ['a', 'c', 'a', 'b']
    assert_eq(ddf.y, ddf2.y, check_names=False)
    with pytest.raises(dask.async.RemoteException):
        # attempt to load as category that which is not so encoded
        ddf2 = dd.read_parquet(fn, categories=['x']).compute()
Example #19
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted([os.path.join(tmpdir, f)
                   for f in os.listdir(tmpdir)
                   if not f.endswith('_metadata')],
                   key=natural_sort_key)

    # Infer divisions for engines/versions that support it
    ddf2 = dd.read_parquet(files, engine=read_engine,
                           infer_divisions=should_check_divs(write_engine) and should_check_divs(read_engine))
    assert_eq(ddf, ddf2, check_divisions=should_check_divs(write_engine) and should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(files, engine=read_engine, infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
Example #20
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted(os.path.join(tmpdir, f)
                   for f in os.listdir(tmpdir)
                   if not f.endswith('_metadata'))
    ddf2 = dd.read_parquet(files, engine=read_engine)
    assert_eq(df, ddf2)
Example #21
def test_timestamp_index(tmpdir, engine):
    fn = str(tmpdir)
    df = tm.makeTimeDataFrame()
    df.index.name = 'foo'
    ddf = dd.from_pandas(df, npartitions=5)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, engine=engine)
    assert_eq(df, ddf2)
Example #22
def test_timestamp_index(tmpdir, engine):
    fn = str(tmpdir)
    df = tm.makeTimeDataFrame()
    df.index.name = 'foo'
    ddf = dd.from_pandas(df, npartitions=5)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, engine=engine, infer_divisions=should_check_divs(engine))
    assert_eq(ddf, ddf2, check_divisions=should_check_divs(engine))
Example #23
def test_read_glob(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    if os.path.exists(os.path.join(fn, '_metadata')):
        os.unlink(os.path.join(fn, '_metadata'))

    files = os.listdir(fn)
    assert '_metadata' not in files

    # Infer divisions for engines/versions that support it

    ddf2 = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine,
                           infer_divisions=should_check_divs(write_engine) and should_check_divs(read_engine))
    assert_eq(ddf, ddf2, check_divisions=should_check_divs(write_engine) and should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine, infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
Example #24
def test_partition_on(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    d = dd.from_pandas(df, npartitions=2)
    d.to_parquet(tmpdir, partition_on=['a'], engine=write_engine)
    out = dd.read_parquet(tmpdir, engine=read_engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
Example #25
def test_empty(tmpdir, write_engine, read_engine, index):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': ['a', 'b', 'b'], 'b': [4, 5, 6]})[:0]
    if index:
        df.set_index('a', inplace=True, drop=True)
    ddf = dd.from_pandas(df, npartitions=2)

    ddf.to_parquet(fn, write_index=index, engine=write_engine)
    read_df = dd.read_parquet(fn, engine=read_engine)
    assert_eq(ddf, read_df)
Example #26
def test_read_glob(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    os.unlink(os.path.join(fn, '_metadata'))

    files = os.listdir(fn)
    assert '_metadata' not in files

    ddf2 = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine)
    assert_eq(df, ddf2)
Example #27
def test_timestamp96(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)
    df = pd.DataFrame({'a': ['now']}, dtype='M8[ns]')
    ddf = dd.from_pandas(df, 1)
    ddf.to_parquet(fn, write_index=False, times='int96')
    pf = fastparquet.ParquetFile(fn)
    assert pf._schema[1].type == fastparquet.parquet_thrift.Type.INT96
    out = dd.read_parquet(fn).compute()
    assert_eq(out, df)
Example #28
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    check_fastparquet()
    tmp = str(tmpdir)
    if df.index.name is None:
        df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)

    dd.to_parquet(ddf, tmp, **write_kwargs)
    ddf2 = dd.read_parquet(tmp, index=df.index.name, **read_kwargs)
    assert_eq(ddf, ddf2)
Example #29
def test_empty_partition(tmpdir, engine):
    fn = str(tmpdir)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    ddf = dd.from_pandas(df, npartitions=5)

    ddf2 = ddf[ddf.a <= 5]
    ddf2.to_parquet(fn, engine=engine)

    ddf3 = dd.read_parquet(fn, engine=engine)
    sol = ddf2.compute()
    assert_eq(sol, ddf3, check_names=False, check_index=False)
Example #30
def test_columns_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    ddf2 = ddf.reset_index()

    # No Index
    # --------
    # All columns, none as index
    assert_eq(dd.read_parquet(fn, index=False, engine=read_engine, infer_divisions=False),
              ddf2, check_index=False, check_divisions=True)

    # Two columns, none as index
    assert_eq(dd.read_parquet(fn, index=False, columns=['x', 'y'], engine=read_engine,
                              infer_divisions=False),
              ddf2[['x', 'y']], check_index=False, check_divisions=True)

    # One column and one index, all as columns
    assert_eq(dd.read_parquet(fn, index=False, columns=['myindex', 'x'], engine=read_engine,
                              infer_divisions=False),
              ddf2[['myindex', 'x']], check_index=False, check_divisions=True)
Example #31
    def extend_meta(self, df):
        ''' Add data to the metadata by passing a dataframe with htid and the 
        new columns.'''
        with ProgressBar():
            new_ddf = self.ddf.join(df, on='htid')
            new_ddf.to_parquet(self.data_path + '.new')

        print('Extended files created. Deleting old files')
        for file in os.listdir(self.data_path):
            fname = os.path.join(self.data_path, file)
            os.remove(fname)
        os.removedirs(self.data_path)
        os.rename(self.data_path + '.new', self.data_path)

        self.ddf = dd.read_parquet(self.data_path, compression='snappy')
Example #32
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted([
        os.path.join(tmpdir, f)
        for f in os.listdir(tmpdir) if not f.endswith('_metadata')
    ],
                   key=natural_sort_key)

    # Infer divisions for engines/versions that support it
    ddf2 = dd.read_parquet(files,
                           engine=read_engine,
                           infer_divisions=should_check_divs(write_engine)
                           and should_check_divs(read_engine))
    assert_eq(ddf,
              ddf2,
              check_divisions=should_check_divs(write_engine)
              and should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(files,
                                   engine=read_engine,
                                   infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
Example #33
 def run(self):
     dsk = dd.read_parquet(
         os.getenv('local_location') + 'trading_history/*.parquet')
     #dsk['time'] = dd.to_datetime(dsk['time'])
     dsk['time'] = dsk['time'].astype("M8[D]")
     dsk = dsk.set_index('time')
     dsk = dsk[dsk['type'].isin(['DAILY_FINANCING'])]
     dsk = dsk[['amount', 'accountBalance', 'financing']]
     dsk['financing'] = dsk['financing'].fillna(0.0)
     dsk['financing'] = dsk['financing'].astype('float64')
     dsk['accountBalance'] = dsk['accountBalance'].astype('float64')
     df = dsk.compute()
     df['financing'] = df['financing'].cumsum(axis=0)
     #print(df.head())
     self.create_graph(df)
Example #34
async def fetch_ship_data(data_type: ShipDataTypes):
    valid_types = ['profile', 'discrete']
    if data_type.value in valid_types:
        df = dd.read_parquet(SHIP_S3_MAP[data_type]).compute()
        df_json = json.loads(df.to_json(orient='records'))
        return {"status": "success", "result": df_json, "msg": ""}
    else:
        return {
            "status":
            "error",
            "result":
            None,
            "msg":
            f"{data_type} is invalid. Valid values: {', '.join(valid_types)}",
        }
Example #35
def test_empty_partition(fn):
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    ddf = dd.from_pandas(df, npartitions=5)

    # fails as there are empty partitions
    ddf2 = ddf[ddf.a <= 5]
    ddf2.to_parquet(fn)

    ddf3 = dd.read_parquet(fn)
    assert_eq(ddf2.compute(), ddf3.compute(), check_names=False,
              check_index=False)

    ddf2 = ddf[ddf.a <= -5]
    with pytest.raises(ValueError):
        ddf2.to_parquet(fn)
Example #36
def test_to_parquet_lazy(tmpdir, scheduler, engine):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3, 4],
                       'b': [1., 2., 3., 4.]})
    df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)
    value = ddf.to_parquet(tmpdir, compute=False, engine=engine)

    assert hasattr(value, 'dask')
    value.compute(scheduler=scheduler)
    assert os.path.exists(tmpdir)

    ddf2 = dd.read_parquet(tmpdir, engine=engine, infer_divisions=should_check_divs(engine))

    assert_eq(ddf, ddf2, check_divisions=should_check_divs(engine))
Example #37
def test_to_parquet_lazy(tmpdir, get):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1., 2., 3., 4.]})
    df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)
    value = ddf.to_parquet(tmpdir, compute=False)

    assert hasattr(value, 'dask')
    # assert not os.path.exists(tmpdir)
    value.compute(get=get)
    assert os.path.exists(tmpdir)

    ddf2 = dd.read_parquet(tmpdir)

    assert_eq(ddf, ddf2)
Example #38
def _read_numeric_file(fname):
    try:
        return dd.read_parquet(fname)
    except:
        pass

    try:
        return dd.read_csv(fname)
    except:
        pass

    try:
        return np.load(fname)
    except:
        pass
Example #39
 def post(self, request):
     data = request.data
     project_slug = data['project']
     pipeline_slug = data['pipeline']
     fns = get_protein_quant_fn(project_slug, pipeline_slug)
     if len(fns) == 0: return JsonResponse({})
     cols = ['Majority protein IDs', 'Score', 'Intensity']
     ddf = dd.read_parquet(fns, engine="pyarrow")[cols]
      res = (ddf.groupby(['Majority protein IDs'])
             .mean().sort_values('Score').compute())
     response = {}
     response['protein_names'] = list(res.index)
     for col in res.columns:
         response[col] = res[col].to_list()
     return JsonResponse(response)
Example #40
def opendamir_reduce_by_month(path2file):
    """
    Function to reduce one month of opendamir into a database indexed by PRS_NAT_REF
    and with expenditure indicators as variables
    :param path2file: String, Monthly compressed file
    :return: pandas DataFrame, Monthly expenditure database with summed columns by act
    """
    df = dd.read_parquet(path2file)
    # Filters : this can be adapted to specific problem
    df = df.loc[df["ETE_IND_TAA"] != 1, :]
    df = df.groupby(["PRS_NAT", "PSE_ACT_CAT"]).agg({
        "PRS_PAI_MNT": "sum",
        "PRS_REM_MNT": "sum"
    }).compute()
    return df
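
A minimal usage sketch for the function above, assuming a hypothetical monthly opendamir parquet file that carries the columns the function filters and aggregates on (ETE_IND_TAA, PRS_NAT, PSE_ACT_CAT, PRS_PAI_MNT, PRS_REM_MNT):

# Hypothetical path to one month of opendamir data
monthly = opendamir_reduce_by_month("data/opendamir/A_2019_01.parquet")
# Summed PRS_PAI_MNT / PRS_REM_MNT per (PRS_NAT, PSE_ACT_CAT) group
print(monthly.head())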
Example #41
 def _export(self, name):
     # Read the data
     feature_path = self._full_feature_path(name)
     try:
         ddf = dd.read_parquet(
             feature_path,
             engine="pyarrow",
             storage_options=self._clean_dict(self.storage_options),
         )
         # Repartition to optimise files on exported dataset
         ddf = ddf.repartition(partition_size="25MB")
         return ddf
     except Exception as e:
         # No data available
         return None
Example #42
def test_partition_on_cats(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    d = pd.DataFrame({'a': np.random.rand(50),
                      'b': np.random.choice(['x', 'y', 'z'], size=50),
                      'c': np.random.choice(['x', 'y', 'z'], size=50)})
    d = dd.from_pandas(d, 2)
    d.to_parquet(tmp, partition_on=['b'], engine='fastparquet')
    df = dd.read_parquet(tmp, engine='fastparquet')
    assert set(df.b.cat.categories) == {'x', 'y', 'z'}

    d.to_parquet(tmp, partition_on=['b', 'c'], engine='fastparquet')
    df = dd.read_parquet(tmp, engine='fastparquet')
    assert set(df.b.cat.categories) == {'x', 'y', 'z'}
    assert set(df.c.cat.categories) == {'x', 'y', 'z'}
    df = dd.read_parquet(tmp, columns=['a', 'c'], engine='fastparquet')
    assert set(df.c.cat.categories) == {'x', 'y', 'z'}
    assert 'b' not in df.columns
    df = dd.read_parquet(tmp, index='c', engine='fastparquet')
    assert set(df.index.categories) == {'x', 'y', 'z'}
    assert 'c' not in df.columns
    # series
    df = dd.read_parquet(tmp, columns='b', engine='fastparquet')
    assert set(df.cat.categories) == {'x', 'y', 'z'}
Example #43
    def download_rtd(self):
        """
        Pull the Rtd.__tablename__ table from db, parse it and save it on disk.
        """
        with ProgressBar():
            rtd = dd.read_sql_table(self.__tablename__, DB_CONNECT_STRING,
                                    index_col='hash_id', meta=self.meta, npartitions=200)
            rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer') # write_metadata_file=False)
            rtd = dd.read_parquet(self.DATA_CACHE_PATH, engine='pyarrow')

            rtd = self._parse(rtd)
            self._save_encoders(rtd)

            # Save data to parquet. We have to use pyarrow as fastparquet does not support pd.Int64
            rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')
Example #44
 def _read(
     self, name, from_date=None, to_date=None, freq=None, time_travel=None, **kwargs
 ):
     # Identify which partitions to read
     filters = []
     if from_date:
         filters.append(("time", ">=", pd.Timestamp(from_date)))
     if to_date:
         filters.append(("time", "<=", pd.Timestamp(to_date)))
     if kwargs.get("partitions"):
         for p in kwargs.get("partitions"):
             filters.append(("partition", "==", p))
     filters = [filters] if filters else None
     # Read the data
     feature_path = self._full_feature_path(name)
     try:
         ddf = dd.read_parquet(
             feature_path,
             engine="pyarrow",
             filters=filters,
             storage_options=self._clean_dict(self.storage_options),
         )
         ddf = ddf.repartition(partition_size="25MB")
     except PermissionError as e:
         raise e
     except Exception as e:
         # No data available
         empty_df = pd.DataFrame(
             columns=["time", "created_time", "value", "partition"]
         ).set_index("time")
         ddf = dd.from_pandas(empty_df, chunksize=1)
     if "partition" in ddf.columns:
         ddf = ddf.drop(columns="partition")
     # Apply time-travel
     if time_travel:
         ddf = ddf.reset_index()
         ddf = ddf[ddf.created_time <= ddf.time + pd.Timedelta(time_travel)]
         ddf = ddf.set_index("time")
     # De-serialize from JSON if required
     if kwargs.get("serialized"):
         ddf = ddf.map_partitions(
             lambda df: df.assign(value=df.value.apply(pd.io.json.loads)),
             meta={
                 "value": "object",
                 "created_time": "datetime64[ns]",
             },
         )
     return ddf
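
The filter construction in _read above follows dask's read_parquet filters convention: a list of lists of (column, op, value) tuples, interpreted as an OR of AND-ed predicates and used to prune row groups by their statistics. A minimal standalone sketch with a hypothetical dataset path:

import pandas as pd
import dask.dataframe as dd

# Keep only row groups whose statistics can satisfy both time bounds
filters = [[("time", ">=", pd.Timestamp("2021-01-01")),
            ("time", "<=", pd.Timestamp("2021-02-01"))]]
ddf = dd.read_parquet("features/my_feature",  # hypothetical path
                      engine="pyarrow",
                      filters=filters)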
Example #45
def read_df(pattern, dbsystem='dask', sqlContext=None):
    """
    Reads a set of data contained in a folder as a spark or dask DataFrame
    
    Parameters
    ----------
    pattern : str
        Unix style wildcard pattern pointing to the files, for example
        /store/msrad/folder/*.csv will read all csv files in that folder
    dbsystem : str
        Either "dask" if you want a Dask DataFrame or "spark" if you want a 
        spark dataframe
    sqlContext : sqlContext instance
        sqlContext to use, required only if dbsystem = 'spark'
        
    Returns
    -------
    A spark or dask DataFrame instance
    """

    if dbsystem not in ['spark', 'dask']:
        raise NotImplementedError(
            'Only dbsystem = "spark" or "dask" are supported!')
    if dbsystem == 'spark' and sqlContext is None:
        raise ValueError('sqlContext must be provided if dbsystem = "spark"!')

    files = glob.glob(pattern)
    df = None
    if '.parq' in files[0] or '.parquet' in files[0]:
        # For some reason wildcards are not accepted with parquet
        if dbsystem == 'spark':
            df = sqlContext.read.parquet(*files)
        else:
            df = dd.read_parquet(pattern)
    elif '.csv' in files[0]:
        if dbsystem == 'spark':
            df = sqlContext.read.csv(pattern, header=True, inferSchema=True)
        else:
            if '.gz' in files[0]:
                df = dd.read_csv(pattern, compression='gzip')
            else:
                df = dd.read_csv(pattern)
    else:
        logging.error("""Invalid data, only csv and parquet files are accepted.
        Make sure that they have a valid suffix (.csv, .csv.gz, .parquet,
        .parq)""")

    return df
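
A short usage sketch for read_df above; the wildcard path is hypothetical and must match existing .parquet or .csv files:

# Lazy Dask DataFrame built from every parquet file in the (hypothetical) folder
ddf = read_df('/store/msrad/folder/*.parquet', dbsystem='dask')

# A Spark DataFrame instead requires an explicit sqlContext,
# otherwise read_df raises ValueError:
# sdf = read_df('/store/msrad/folder/*.csv', dbsystem='spark', sqlContext=sqlContext)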
Example #46
def _import_dask(storage,
                 merge_records=False,
                 dm_name=None,
                 import_type='staging',
                 return_dask_graph=False,
                 connector_id=None,
                 staging_name=None,
                 view_name=None,
                 columns=None,
                 max_hits=None,
                 mapping_columns=None):
    if columns:
        columns = list(set(columns))
        columns += __STAGING_FIELDS
        columns = list(set(columns))

    if import_type == 'golden':
        url = [storage.build_url_parquet_golden(dm_name=dm_name)]
    elif import_type == 'staging':
        url = []
        url1 = storage.build_url_parquet_staging(staging_name=staging_name,
                                                 connector_id=connector_id)
        if url1 is not None:
            url.append(url1)

        url2 = storage.build_url_parquet_staging_master(
            staging_name=staging_name, connector_id=connector_id)
        if url2 is not None:
            url.append(url2)

        url3 = storage.build_url_parquet_staging_rejected(
            staging_name=staging_name, connector_id=connector_id)
        if url3 is not None:
            url.append(url3)
    elif import_type == 'view':
        url = [storage.build_url_parquet_view(view_name=view_name)]
    else:
        raise KeyError('import_type should be `golden`,`staging` or `view`')

    d = dd.read_parquet(url,
                        storage_options=storage.get_dask_options(),
                        columns=columns)

    d = d.rename(columns=mapping_columns)
    if return_dask_graph:
        return d
    else:
        return d.compute()
Example #47
def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)
Example #48
 def to_dask(self):
     """
     Create a lazy dask-dataframe from the parquet data
     """
     # More efficient to call dask function directly.
     self._load_metadata()
     columns = self._kwargs.get('columns', None)
     index = self._kwargs.get('index', None)
     filters = self._kwargs.get('filters', [])
     self._df = dd.read_parquet(self._urlpath,
                                columns=columns,
                                index=index,
                                filters=filters)
     self._schema = None
     self.discover()  # resets schema to dask's better version
     return self._df
Example #49
 def _to_dask(self):
     """
     Create a lazy dask-dataframe from the parquet data
     """
     import dask.dataframe as dd
     urlpath = self._get_cache(self._urlpath)[0]
     kw = dict(columns=self._kwargs.get('columns', None),
               index=self._kwargs.get('index', None),
               engine=self._kwargs.get('engine', 'auto'))
     if 'filters' in self._kwargs:
         kw['filters'] = self._kwargs['filters']
     self._df = dd.read_parquet(urlpath,
                                storage_options=self._storage_options,
                                **kw)
     self._load_metadata()
     return self._df
Example #50
def _persist_to_file(dataset: Union[str, dd.DataFrame], stage_i, stage_name, cache_dir):
    assert cache_dir is not None, "When using dask dataframe, cache dir must be provided"
    assert '/' not in stage_name \
           and '\\' not in stage_name \
           and ':' not in stage_name \
           and '..' not in stage_name, "Unsafe stage symbols"
    cache_path = _get_cache_path(cache_dir, stage_i, stage_name)
    if isinstance(dataset, dd.DataFrame):
        dataset.to_parquet(cache_path, engine='fastparquet')
    elif isinstance(dataset, str):
        logger.debug("Moving {} to {}".format(dataset, cache_path))
        shutil.move(dataset, cache_path)
    else:
        raise NotImplementedError()

    return dd.read_parquet(cache_path)
Example #51
def combine_from_input(input_file, extractor, field, simulations_desc,
                       num_procs=1, sample_size=1000, noise=True,
                       error=True):

    temp_dir = tempfile.mkdtemp(prefix='/dev/shm/')
    try:
        levels_names = exec_in_subprocess(load_normalize_and_dump_data,
                                          input_file, extractor,
                                          simulations_desc,
                                          num_procs, temp_dir)
        df_dask = dd.read_parquet(temp_dir)
        return combine_sorted(df_dask, field, simulations_desc, levels_names,
                              num_procs=num_procs, sample_size=sample_size,
                              noise=noise, error=True)
    finally:
        shutil.rmtree(temp_dir)
Example #52
def test_columns_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)

    # With Index
    # ----------
    # ### Empty columns ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, columns=[], engine=read_engine, infer_divisions=should_check_divs(read_engine)),
              ddf[[]], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, columns=[], engine=read_engine, infer_divisions=False),
              ddf[[]].clear_divisions(), check_divisions=True)

    # ### Single column, auto select index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, columns=['x'], engine=read_engine, infer_divisions=should_check_divs(read_engine)),
              ddf[['x']], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, columns=['x'], engine=read_engine, infer_divisions=False),
              ddf[['x']].clear_divisions(), check_divisions=True)

    # ### Single column, specify index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x'], engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf[['x']], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x'], engine=read_engine,
                              infer_divisions=False),
              ddf[['x']].clear_divisions(), check_divisions=True)

    # ### Two columns, specify index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x', 'y'], engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf, check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x', 'y'], engine=read_engine,
                              infer_divisions=False),
              ddf.clear_divisions(), check_divisions=True)
Example #53
def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)
Example #54
def test_copy_dask_to_gcs_dir(
    tmp_path,
    tmp_gcs_url_prefix,
    expected_dask_df,
    dask_flow,
    override_gcs_for_copy_if_fake_gcp,
    gcs_fs,
):
    cloud_url = tmp_gcs_url_prefix + "output"
    local_path = tmp_path / "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    gcs_fs.get(cloud_url, str(local_path), recursive=True)
    actual = dd.read_parquet(local_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
Example #55
def clean_cabs_at_path(special: bool, s3_in_url: str, s3_out_url: str,
                       s3_options: Dict) -> bool:

    try:
        df = dd.read_parquet(path=s3_in_url,
                             storage_options=s3_options,
                             engine='fastparquet')

        # add cab zones
        if not special:
            print('In data clean tasks for cabs. Field dolocationid not found')
            # fetch cab zones
            taxi_zones_df: GeoDataFrame = fetch_cab_zones()
            df['dolocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='dolongitude',
                        lat_var='dolatitude',
                        locid_var='dolocationid'),
                meta=('dolocationid', int64))
            df['pulocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='pulongitude',
                        lat_var='pulatitude',
                        locid_var='pulocationid'),
                meta=('pulocationid', int64))

            del taxi_zones_df
        df = df[[
            'pudatetime', 'dodatetime', 'passengers', 'distance',
            'dolocationid', 'pulocationid'
        ]]
        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=True,
                      compression='GZIP',
                      storage_options=s3_options)
        del df

    except Exception as err:
        print('error in clean_cabs_at_path %s' % str(err))
        raise err

    else:
        return True
Example #56
def test_hive_partitioned_data(tmpdir, cpu):

    # Initial timeseries dataset (in cpu memory).
    # Round the full "timestamp" to the day for partitioning.
    ddf = dask.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-03",
        freq="600s",
        partition_freq="6h",
        seed=42,
    ).reset_index()
    ddf["timestamp"] = ddf["timestamp"].dt.round("D").dt.day
    ds = nvt.Dataset(ddf, engine="parquet")

    # Write the dataset to disk
    path = str(tmpdir)
    partition_keys = ["timestamp", "name"]
    ds.to_parquet(path, partition_on=partition_keys)

    # Make sure the directory structure is hive-like
    df_expect = ddf.compute()
    df_expect = df_expect.sort_values(["id", "x", "y"]).reset_index(drop=True)
    timestamp_check = df_expect["timestamp"].iloc[0]
    name_check = df_expect["name"].iloc[0]
    assert glob.glob(
        os.path.join(
            path,
            f"timestamp={timestamp_check}/name={name_check}/*",
        ))

    # Read back with dask.dataframe and check the data
    df_check = dd.read_parquet(path).compute()
    df_check["name"] = df_check["name"].astype("object")
    df_check["timestamp"] = df_check["timestamp"].astype("int64")
    df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
    for col in df_expect:
        # Order of columns can change after round-trip partitioning
        assert_eq(df_expect[col], df_check[col], check_index=False)

    # Read back with NVT and check the data
    df_check = nvt.Dataset(path, engine="parquet").to_ddf().compute()
    df_check["name"] = df_check["name"].astype("object")
    df_check["timestamp"] = df_check["timestamp"].astype("int64")
    df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
    for col in df_expect:
        # Order of columns can change after round-trip partitioning
        assert_eq(df_expect[col], df_check[col], check_index=False)
Example #57
def to_zarr(input_path: str, output_path: str, dictionary_path: str):
    import dask.dataframe as dd
    import fsspec
    import xarray as xr
    from dask.diagnostics import ProgressBar

    logger.info(f"Converting parquet at {input_path} to {output_path}")
    df = dd.read_parquet(input_path)

    trait_columns = df.columns[df.columns.to_series().str.match(r"^\d+")]
    # 41210_Z942 -> 41210 (UKB field id)
    trait_group_ids = [c.split("_")[0] for c in trait_columns]
    # 41210_Z942 -> Z942 (Data coding value as one-hot encoding in phenotype, e.g.)
    trait_code_ids = ["_".join(c.split("_")[1:]) for c in trait_columns]
    trait_values = df[trait_columns].astype("float").to_dask_array()
    trait_values.compute_chunk_sizes()

    trait_id_to_name = (
        pd.read_csv(
            dictionary_path,
            sep=",",
            usecols=["FieldID", "Field"],
            dtype={"FieldID": str, "Field": str},
        )
        .set_index("FieldID")["Field"]
        .to_dict()
    )
    trait_name = [trait_id_to_name.get(v) for v in trait_group_ids]

    ds = xr.Dataset(
        dict(
            id=("samples", np.asarray(df["userId"], dtype=int)),
            trait=(("samples", "traits"), trait_values),
            trait_id=("traits", np.asarray(trait_columns.values, dtype=str)),
            trait_group_id=("traits", np.array(trait_group_ids, dtype=int)),
            trait_code_id=("traits", np.array(trait_code_ids, dtype=str)),
            trait_name=("traits", np.array(trait_name, dtype=str)),
        )
    )
    # Keep chunks small in trait dimension for faster per-trait processing
    ds["trait"] = ds["trait"].chunk(dict(samples="auto", traits=100))
    ds = ds.rename_vars({v: f"sample_{v}" for v in ds})

    logger.info(f"Saving dataset to {output_path}:\n{ds}")
    with ProgressBar():
        ds.to_zarr(fsspec.get_mapper(output_path), consolidated=True, mode="w")
    logger.info("Done")
Example #58
    def _parquet(self):
        """
        Import parquet file

        :return dask DataFrame
        """
        return dd.read_parquet(
            path=self.full_path,
            columns=None,
            filters=self.kwargs.get('filters'),
            categories=self.kwargs.get('categories'),
            index=self.kwargs.get('index'),
            storage_options=self.kwargs.get('storage_options'),
            engine='pyarrow',
            gather_statistics=self.kwargs.get('gather_statistics'),
            split_row_groups=self.kwargs.get('split_row_groups'),
            chunksize=self.kwargs.get('chunksize'))
Example #59
    def run(self) -> None:
        self.init()
        if self.dpath.suffix == ".parquet":
            df = dd.read_parquet(self.dpath)
        elif self.dpath.suffix == ".csv":
            df = dd.read_csv(self.dpath)

        times = []
        cols = []
        for col in df.columns:
            then = time()
            self.bench(col)
            times.append(time() - then)
            cols.append(col)

        result = {"name": self.__class__.__name__, "times": times, "columns": cols}
        print(jdumps(result))
Example #60
def test_arrow_partitioning(tmpdir):
    # Issue #3518
    pytest.importorskip('pyarrow')
    path = str(tmpdir)
    data = {
        'p': np.repeat(np.arange(3), 2).astype(np.int8),
        'b': np.repeat(-1, 6).astype(np.int16),
        'c': np.repeat(-2, 6).astype(np.float32),
        'd': np.repeat(-3, 6).astype(np.float64),
    }
    pdf = pd.DataFrame(data)
    ddf = dd.from_pandas(pdf, npartitions=2)
    ddf.to_parquet(path, engine='pyarrow', partition_on='p')

    ddf = dd.read_parquet(path, engine='pyarrow')

    ddf.astype({'b': np.float32}).compute()