def to_dask(self, columns=None):
    """Convert this Castra into a dask.dataframe collection.

    Parameters
    ----------
    columns : list, str, or None
        Columns to load.  ``None`` means all of ``self.columns``.  A list
        yields a ``dd.DataFrame``; a single column name yields a
        ``dd.Series``.

    Returns
    -------
    dask.dataframe.DataFrame or dask.dataframe.Series
    """
    import dask.dataframe as dd
    if columns is None:
        columns = self.columns
    # Key on (path, mtime) so the graph name is stable across calls but
    # changes whenever the file on disk is rewritten.
    token = md5(str(
        (self.path, os.path.getmtime(self.path))).encode()).hexdigest()
    name = 'from-castra-' + token
    divisions = [self.minimum] + self.partitions.index.tolist()
    if '.index' in self.categories:
        # The index is categorical: map the integer division codes back to
        # their category values, keeping the true min/max at the ends.
        divisions = (
            [self.categories['.index'][0]] +
            [self.categories['.index'][d + 1] for d in divisions[1:-1]] +
            [self.categories['.index'][-1]])
    # Dict comprehension instead of dict(generator); the intermediate
    # list(enumerate(...)) in the original was an unnecessary
    # materialization.
    dsk = {(name, i): (Castra.load_partition, self, part, columns)
           for i, part in enumerate(self.partitions.values)}
    if isinstance(columns, list):
        return dd.DataFrame(dsk, name, columns, divisions)
    else:
        return dd.Series(dsk, name, columns, divisions)
def test_loc_with_text_dates():
    """String-date slicing on a datetime-indexed dask Series."""
    left = tm.makeTimeSeries(10).iloc[:5]
    right = tm.makeTimeSeries(10).iloc[5:]
    graph = {('df', 0): left, ('df', 1): right}
    boundaries = [left.index.min(), right.index.min(), right.index.max()]
    s = dd.Series(graph, 'df', left, boundaries)

    # A slice covering everything keeps the divisions and the data intact.
    assert s.loc['2000': '2010'].divisions == s.divisions
    assert_eq(s.loc['2000': '2010'], s)
    # A narrow three-day window selects exactly three rows.
    assert len(s.loc['2000-01-03': '2000-01-05'].compute()) == 3
def _construct_dask_df_with_divisions(df):
    """Construct the new task graph and make a new dask.dataframe around it.

    Wraps each partition of ``df`` in an ``_add_to_index`` task that offsets
    its index by the partition's starting division, then rebuilds the
    collection on the merged graph.

    Parameters
    ----------
    df : dd.DataFrame or dd.Series

    Returns
    -------
    dd.DataFrame or dd.Series
        Same type as the input, with the new divisions attached.

    Raises
    ------
    TypeError
        If ``df`` is neither a dask DataFrame nor a dask Series.
    """
    divisions = _get_divisions(df)
    name = 'csv-index' + df._name
    dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
           for i in range(df.npartitions)}
    from toolz import merge
    if isinstance(df, dd.DataFrame):
        return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
    elif isinstance(df, dd.Series):
        return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
    # The original fell through and silently returned None here; fail loudly
    # for unsupported inputs instead.
    raise TypeError('expected a dask DataFrame or Series, got %r'
                    % type(df).__name__)
def to_dask(self, columns=None):
    """Convert this Castra into a dask.dataframe collection.

    Parameters
    ----------
    columns : list, str, or None
        Columns to load.  ``None`` means all of ``self.columns``.  A list
        yields a ``dd.DataFrame``; a single column name yields a
        ``dd.Series``.

    Returns
    -------
    dask.dataframe.DataFrame or dask.dataframe.Series
    """
    if columns is None:
        columns = self.columns
    import dask.dataframe as dd
    # NOTE(review): the name comes from a fresh token, so repeated calls get
    # distinct graph keys — confirm whether a deterministic name was intended.
    name = 'from-castra' + next(dd.core.tokens)
    # Dict comprehension instead of the original dict(generator) form.
    dsk = {(name, i): (Castra.load_partition, self, part, columns)
           for i, part in enumerate(self.partitions.values)}
    divisions = [self.minimum] + list(self.partitions.index)
    if isinstance(columns, list):
        return dd.DataFrame(dsk, name, columns, divisions)
    else:
        return dd.Series(dsk, name, columns, divisions)
def _construct_dask_df_with_divisions(df):
    """Construct the new task graph and make a new dask.dataframe around it.

    Each partition of ``df`` is wrapped in an ``_add_to_index`` task that
    shifts its index by the partition's starting division; the rebuilt
    collection carries the new divisions.

    Raises:
      TypeError: if ``df`` is neither a dask DataFrame nor a dask Series.
    """
    divisions = _get_divisions(df)
    # pylint: disable=protected-access
    name = 'csv-index' + df._name
    dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
           for i in range(df.npartitions)}
    # pylint: enable=protected-access
    from toolz import merge  # pylint: disable=g-import-not-at-top
    if isinstance(df, dd.DataFrame):
        return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
    elif isinstance(df, dd.Series):
        return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
    # The original fell through and silently returned None here; fail loudly
    # for unsupported inputs instead.
    raise TypeError('expected a dask DataFrame or Series, got %r'
                    % type(df).__name__)
def test_loc_with_text_dates():
    """String-date slicing on a datetime-indexed dask Series."""
    left = dd._compat.makeTimeSeries().iloc[:5]
    right = dd._compat.makeTimeSeries().iloc[5:]
    graph = {("df", 0): left, ("df", 1): right}
    boundaries = [left.index.min(), right.index.min(), right.index.max()]
    s = dd.Series(graph, "df", left, boundaries)

    # A slice covering everything keeps the divisions and the data intact.
    assert s.loc["2000":"2010"].divisions == s.divisions
    assert_eq(s.loc["2000":"2010"], s)
    # A narrow three-day window selects exactly three rows.
    assert len(s.loc["2000-01-03":"2000-01-05"].compute()) == 3
def to_dask(self, columns=None):
    """Convert this Castra into a dask.dataframe collection.

    Parameters
    ----------
    columns : list, str, or None
        Columns to load.  ``None`` means all of ``self.columns``.  A list
        yields a ``dd.DataFrame``; a single column name yields a
        ``dd.Series``.

    Returns
    -------
    dask.dataframe.DataFrame or dask.dataframe.Series
    """
    import dask.dataframe as dd
    if columns is None:
        columns = self.columns
    # Key on (path, mtime) so the graph name is stable across calls but
    # changes whenever the file on disk is rewritten.
    token = md5(str(
        (self.path, os.path.getmtime(self.path))).encode()).hexdigest()
    name = 'from-castra-' + token
    # Dict comprehension instead of the original dict(generator) form.
    dsk = {(name, i): (Castra.load_partition, self, part, columns)
           for i, part in enumerate(self.partitions.values)}
    divisions = [self.minimum] + list(self.partitions.index)
    if isinstance(columns, list):
        return dd.DataFrame(dsk, name, columns, divisions)
    else:
        return dd.Series(dsk, name, columns, divisions)