def get_patch_count(self, filters, tracts, coadd_version="unforced"):
    predicates = []
    if filters:
        predicates.append(("filter", "in", filters))
    if tracts:
        predicates.append(("tract", "in", tracts))
    dataset = "analysisCoaddTable_{}".format(coadd_version)
    columns = ["patch"]
    store = partial(get_store_from_url, "hfs://" + str(self.path))
    if predicates:
        coadd_df = read_dataset_as_ddf(
            predicates=[predicates],
            dataset_uuid=dataset,
            columns=columns,
            store=store,
            table="table",
        )
    else:
        coadd_df = read_dataset_as_ddf(
            dataset_uuid=dataset, columns=columns, store=store, table="table"
        )
    return coadd_df.drop_duplicates().count().compute()["patch"]
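# Hedged aside (not part of the original code): kartothek predicates are a list
# of OR-ed groups, each group being a list of AND-ed (column, op, value) tuples,
# which is why get_patch_count wraps its accumulated conditions as [predicates].
# The filter and tract values below are made up purely for illustration.
example_predicates = [
    [("filter", "in", ["HSC-R", "HSC-I"]), ("tract", "in", [9813])],  # group 1: AND-ed conditions
    [("filter", "==", "HSC-G")],  # OR-ed with group 1
]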
def get_visits_by_metric_filter(self, filt, metric):
    store = partial(get_store_from_url, "hfs://" + str(self.path))
    columns = [
        "filter",
        "tract",
        "visit",
        "calib_psf_used",
        "calib_psf_candidate",
        "calib_photometry_reserved",
        "qaBad_flag",
        "ra",
        "dec",
        "psfMag",
    ] + [metric]
    visits_ddf = read_dataset_as_ddf(
        dataset_uuid="analysisVisitTable",
        predicates=[[("filter", "==", filt)]],
        store=store,
        columns=columns,
        table="table",
    )
    return visits_ddf[visits_ddf[metric].notnull()]
def test_reconstruct_dask_index_sorting(store_factory, monkeypatch):
    # Make sure we're not shuffling anything
    monkeypatch.delattr(
        dask.dataframe.shuffle, dask.dataframe.shuffle.set_index.__name__
    )

    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    colB = "ColumnB"
    df = pd.DataFrame(
        {colA: np.random.randint(high=100000, low=-100000, size=(50,)), colB: 0}
    )
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df], partition_on=colA
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        table="table",
        dask_index_on=colA,
    )

    assert all(
        ddf.map_partitions(lambda df: df.index.min()).compute().values
        == ddf.divisions[:-1]
    )
def test_reconstruct_dask_index_raise_no_index(store_factory):
    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    df1 = pd.DataFrame({colA: [1, 2]})
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df1]
    )
    with pytest.raises(
        RuntimeError,
        match=r"Requested index: \['ColumnA'\] but available index columns: \[\]",
    ):
        read_dataset_as_ddf(
            dataset_uuid=dataset_uuid,
            store=store_factory,
            table="table",
            dask_index_on=colA,
        )
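# Hedged sketch (assumes the same store_factory fixture and imports as the test
# above; the dataset_uuid is made up): the RuntimeError goes away when the
# column is indexed at write time, e.g. via secondary_indices or partition_on.
def _sketch_read_with_secondary_index(store_factory):
    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid="dataset_uuid_indexed",
        dfs=[pd.DataFrame({"ColumnA": [1, 2]})],
        secondary_indices="ColumnA",
    )
    return read_dataset_as_ddf(
        dataset_uuid="dataset_uuid_indexed",
        store=store_factory,
        table="table",
        dask_index_on="ColumnA",
    )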
def test_read_ddf_from_categorical_partition(store_factory):
    df = pd.DataFrame({"x": ["a"]}).astype({"x": "category"})
    store_dataframes_as_dataset(
        dfs=[df], dataset_uuid="dataset_uuid", store=store_factory
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid="dataset_uuid", store=store_factory, table="table"
    )
    df_expected = pd.DataFrame({"x": ["a"]})
    df_actual = ddf.compute(scheduler="sync")
    pdt.assert_frame_equal(df_expected, df_actual)

    ddf = read_dataset_as_ddf(
        dataset_uuid="dataset_uuid",
        store=store_factory,
        categoricals=["x"],
        table="table",
    )
    df_actual = ddf.compute(scheduler="sync")
    pdt.assert_frame_equal(df, df_actual)
def test_reconstruct_dask_index_types(
    store_factory, setup_reconstruct_dask_index_types, col
):
    if col == "null":
        pytest.xfail(reason="Cannot index null column")
    ddf = read_dataset_as_ddf(
        dataset_uuid=setup_reconstruct_dask_index_types.uuid,
        store=store_factory,
        table="table",
        dask_index_on=col,
    )
    assert ddf.known_divisions
    assert ddf.index.name == col
def test_reconstruct_dask_index(store_factory, index_type, monkeypatch):
    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    colB = "ColumnB"
    df1 = pd.DataFrame({colA: [1, 2], colB: ["x", "y"]})
    df2 = pd.DataFrame({colA: [3, 4], colB: ["x", "y"]})
    df_chunks = np.array_split(pd.concat([df1, df2]).reset_index(drop=True), 4)
    df_delayed = [dask.delayed(c) for c in df_chunks]
    ddf_expected = dd.from_delayed(df_delayed).set_index(
        colA, divisions=[1, 2, 3, 4, 4]
    )
    ddf_expected_simple = dd.from_pandas(
        pd.concat([df1, df2]), npartitions=2
    ).set_index(colA)

    if index_type == "secondary":
        secondary_indices = colA
        partition_on = None
    else:
        secondary_indices = None
        partition_on = colA

    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df1, df2],
        secondary_indices=secondary_indices,
        partition_on=partition_on,
    )

    # Make sure we're not shuffling anything
    monkeypatch.delattr(
        dask.dataframe.shuffle, dask.dataframe.shuffle.set_index.__name__
    )

    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        table="table",
        dask_index_on=colA,
    )

    assert ddf_expected.npartitions == 4
    assert len(ddf_expected.divisions) == 5
    assert ddf_expected.divisions == (1, 2, 3, 4, 4)

    assert ddf.index.name == colA
    assert ddf.npartitions == 4
    assert len(ddf.divisions) == 5
    assert ddf.divisions == (1, 2, 3, 4, 4)

    assert_dask_eq(ddf_expected, ddf)
    assert_dask_eq(ddf_expected_simple, ddf, check_divisions=False)

    assert_frame_equal(ddf_expected.compute(), ddf.compute())
    assert_frame_equal(ddf_expected_simple.compute(), ddf.compute())
def fetch_coadd_table(self, coadd_version="unforced"):
    table = "qaDashboardCoaddTable"
    store = partial(get_store_from_url, "hfs://" + str(self.path))
    print(str(self.path))
    predicates = [[("tract", "in", self.tracts)]]
    dataset = "analysisCoaddTable_{}".format(coadd_version)
    coadd_df = read_dataset_as_ddf(
        predicates=predicates, dataset_uuid=dataset, store=store, table="table"
    )
    self.coadd[table] = coadd_df
def get_coadd_ddf_by_filter_metric(
    self, filter_name, metrics, tracts, coadd_version="unforced", warnings=[]
):
    for t in tracts:
        if t not in self.tracts:
            msg = "Selected tract {} missing in data".format(t)
            print("WARNING: {}".format(msg))
            warnings.append(msg)

    # filter out any tracts not in data
    valid_tracts = list(set(self.tracts).intersection(set(tracts)))

    if not valid_tracts:
        msg = "No valid tracts selected...using all tracts"
        print("WARNING: {}".format(msg))
        valid_tracts = self.tracts
        warnings.append(msg)

    predicates = [[("tract", "in", valid_tracts), ("filter", "==", filter_name)]]
    dataset = "analysisCoaddTable_{}".format(coadd_version)
    columns = metrics + self.flags + ["ra", "dec", "filter", "psfMag", "patch"]
    store = partial(get_store_from_url, "hfs://" + str(self.path))
    karto_kwargs = dict(
        predicates=predicates,
        dataset_uuid=dataset,
        columns=columns,
        store=store,
        table="table",
    )
    print(f"...loading dataset ({filter_name}, {metrics})...")
    coadd_df = (
        read_dataset_as_ddf(**karto_kwargs).dropna(how="any")
        # .set_index('filter')
        .compute()
    )
    print("loaded.")
    # coadd_df = dd.from_pandas(coadd_df, chunksize=100000)
    return coadd_df
def load_from_ktk(self, predicates, columns=None, dask=True):
    ktk_kwargs = dict(
        dataset_uuid=self.dataset,
        predicates=predicates,
        store=self.store,
        columns={"table": columns},
    )
    # print(ktk_kwargs)
    if dask:
        ktk_kwargs["table"] = "table"
        return read_dataset_as_ddf(**ktk_kwargs)
    else:
        ktk_kwargs["tables"] = ["table"]
        datalist = read_dataset_as_dataframes(**ktk_kwargs)
        if datalist:
            return datalist[0]["table"]
        else:
            raise IOError(f"No data returned for {ktk_kwargs}.")
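# Hedged usage sketch (`loader` is a hypothetical instance of the class that
# defines load_from_ktk above; the tract value is illustrative): dask=True
# returns a lazy dask DataFrame, dask=False an eager pandas DataFrame.
# ddf = loader.load_from_ktk(predicates=[[("tract", "in", [9813])]], columns=["ra", "dec"], dask=True)
# pdf = loader.load_from_ktk(predicates=[[("tract", "in", [9813])]], columns=["ra", "dec"], dask=False)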
def get_visits_by_metric_filter(self, filt, metrics, visit=None):
    store = partial(get_store_from_url, "hfs://" + str(self.path))
    columns = [
        "filter",
        "tract",
        "visit",
        "calib_psf_used",
        "calib_psf_candidate",
        "calib_photometry_reserved",
        "qaBad_flag",
        "ra",
        "dec",
        "psfMag",
    ] + metrics

    if visit:
        predicates = [[("filter", "==", filt), ("visit", "==", visit)]]
    else:
        predicates = [[("filter", "==", filt)]]

    visits_ddf = read_dataset_as_ddf(
        dataset_uuid="analysisVisitTable",
        predicates=predicates,
        store=store,
        columns=columns,
        table="table",
    )
    # keep only rows where every requested metric is non-null
    filters = reduce(operator.and_, (visits_ddf[m].notnull() for m in metrics))
    return visits_ddf[filters]
def read_dataset_as_ddf(self, **kwargs):
    """
    Examples of predicates accepted by kartothek:

    # when setting dates_as_object=False
    import pandas as pd
    predicates = [[("c_date", "==", pd.to_datetime('2020-01-01'))]]

    # with dates_as_object=True or if querying a partition key
    from datetime import date
    predicates = [[("c_date", "==", date(2020, 1, 1))]]

    columns=['col1', 'col2'],
    predicates=predicates,
    dates_as_object=True,
    """
    return read_dataset_as_ddf(
        dataset_uuid=self.dataset_uuid,
        store=lambda: self.store,
        table=self.table,
        **kwargs,
    )
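# Hedged usage sketch (`reader` is a hypothetical instance of the wrapper class
# above): querying a date column as described in its docstring.
# from datetime import date
# ddf = reader.read_dataset_as_ddf(
#     columns=["col1", "col2"],
#     predicates=[[("c_date", "==", date(2020, 1, 1))]],
#     dates_as_object=True,
# )
# df = ddf.compute()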
def _read_as_ddf(
    dataset_uuid,
    store,
    factory=None,
    categoricals=None,
    tables=None,
    dataset_has_index=False,
    **kwargs,
):
    table = tables or SINGLE_TABLE
    if categoricals:
        categoricals = categoricals[table]

    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        categoricals=categoricals,
        table=table,
        **kwargs,
    )
    if categoricals:
        assert ddf._meta.dtypes["P"] == pd.api.types.CategoricalDtype(
            categories=["__UNKNOWN_CATEGORIES__"], ordered=False
        )
        if dataset_has_index:
            assert ddf._meta.dtypes["L"] == pd.api.types.CategoricalDtype(
                categories=[1, 2], ordered=False
            )
        else:
            assert ddf._meta.dtypes["L"] == pd.api.types.CategoricalDtype(
                categories=["__UNKNOWN_CATEGORIES__"], ordered=False
            )

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    ddf = ddf.compute().reset_index(drop=True)

    def extract_dataframe(ix):
        df = ddf.iloc[[ix]].copy()
        for col in df.columns:
            if pd.api.types.is_categorical(df[col]):
                df[col] = df[col].cat.remove_unused_categories()
        return df.reset_index(drop=True)

    return [extract_dataframe(ix) for ix in ddf.index]