def compute_container_hash(data_container: Union[QFSeries, QFDataFrame, QFDataArray]) -> str:
    """
    For the given data container returns the hexadecimal digest of the data.

    Parameters
    ----------
    data_container: QFSeries, QFDataFrame, QFDataArray
        container, which digest should be computed

    Returns
    -------
    str
        hexadecimal digest of data in the passed data container

    Raises
    ------
    ValueError
        if the container is none of the three supported types
    """
    # Fix: the QFSeries and QFDataFrame branches were identical duplicates;
    # they are merged into a single tuple isinstance check.
    if isinstance(data_container, (QFSeries, QFDataFrame)):
        hashed_container = hash_pandas_object(data_container)
    elif isinstance(data_container, QFDataArray):
        # A 3-D container cannot be hashed directly: hash each field's 2-D
        # slice separately, then hash the frame built from those row hashes.
        hash_data_frame = QFDataFrame([
            hash_pandas_object(data_container.loc[:, :, field].to_pandas())
            for field in data_container.fields
        ])
        hashed_container = hash_pandas_object(hash_data_frame)
    else:
        raise ValueError("Unsupported type of data container")
    return hashlib.sha1(hashed_container.values).hexdigest()
def _get_rel_trends(data):
    """
    Compute weekly per-ZIP case trends from *data*, memoized at module level.

    Expects columns 'date', 'zip' and 'cases_per_100k'. Returns a DataFrame
    with one row per (zip, week) where missing weeks are filled with 0.
    The result is cached until the input's pandas hash changes.
    """
    global CACHED_REL_TRENDS, CACHED_REL_TRENDS_HASH
    # Fix: local was named `hash`, shadowing the builtin.
    data_hash = hash_pandas_object(data)
    if CACHED_REL_TRENDS_HASH is not None and CACHED_REL_TRENDS_HASH.equals(data_hash):
        return CACHED_REL_TRENDS
    end_date = data['date'].max()

    def rel_trend(group):
        # Reads end_date via closure only; the original `nonlocal end_date`
        # declaration was unnecessary (no rebinding) and is dropped.
        # Build the complete weekly grid up to the global end date so every
        # ZIP covers the same range, then left-align fills gaps with 0.
        all_keys = pd.date_range(start=group['date'].min(), end=end_date, freq='7d')
        group = pd.Series(group['cases_per_100k'].values, index=group['date'])
        zip_data = pd.Series(index=all_keys, dtype='float')
        group = zip_data.align(group, join='left', fill_value=0)[1]
        return group

    rel_trends = data.groupby('zip').apply(rel_trend).reset_index()
    rel_trends = rel_trends.rename(columns={0: 'cases_per_100k'})
    CACHED_REL_TRENDS_HASH = data_hash
    CACHED_REL_TRENDS = rel_trends
    return rel_trends
def get_value_meta(self, value, meta_conf):
    # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
    """
    Build a ValueMeta describing *value*, honoring the toggles in *meta_conf*:
    schema, size, and preview/hash are each collected only when enabled.
    """
    data_schema = {}
    if meta_conf.log_schema:
        # Key insertion order matches the original so serialized comparisons
        # of the schema stay stable.
        data_schema["type"] = self.type_str
        data_schema["columns"] = list(value.columns)
        data_schema["shape"] = value.shape
        data_schema["dtypes"] = {name: str(dtype) for name, dtype in value.dtypes.items()}
    if meta_conf.log_size:
        data_schema["size"] = int(value.size)

    value_preview = None
    data_hash = None
    if meta_conf.log_preview:
        value_preview = self.to_preview(value, preview_size=meta_conf.get_preview_size())
        # Hash includes the index so reordered rows produce a different digest.
        data_hash = fast_hasher.hash(hash_pandas_object(value, index=True).values)

    return ValueMeta(
        value_preview=value_preview,
        data_dimensions=value.shape,
        data_schema=data_schema,
        data_hash=data_hash,
    )
def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta on a DataFrame must match an independently built ValueMeta."""
    df = pandas_data_frame
    meta_conf = ValueMetaConf.enabled()
    # Schema keys are inserted in the same order the production code uses,
    # since data_schema is compared through json_utils.dumps below.
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(df.columns),
        "size": int(df.size),
        "shape": df.shape,
        "dtypes": {name: str(dtype) for name, dtype in df.dtypes.items()},
    }
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            df, preview_size=meta_conf.get_preview_size()),
        data_dimensions=df.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
    )

    df_value_meta = DataFrameValueType().get_value_meta(df, meta_conf=meta_conf)

    # Field-by-field checks first for clearer failure messages, then the
    # whole-object equality check.
    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema)
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def test_df_value_meta(
    self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
):
    # Verify get_value_meta output field by field against an independently
    # constructed ValueMeta, with special handling for floats and
    # run-to-run-unstable histogram data.
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
        descriptive_stats=pandas_data_frame_stats,
        histograms=pandas_data_frame_histograms,
    )
    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )
    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # Schemas are compared via their serialized form.
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    # "std" is floating point: pop it from both sides and compare rounded,
    # so the dict equality below is exact. NOTE(review): this assumes the
    # expected fixture value is already rounded to 2 places — confirm.
    std = df_value_meta.descriptive_stats["Births"].pop("std")
    expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
    assert round(std, 2) == expected_std
    # "top" is removed (only from the actual side) before comparing stats.
    df_value_meta.descriptive_stats["Names"].pop("top")
    assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats
    counts, values = df_value_meta.histograms.pop("Names")
    expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
    assert counts == expected_counts
    assert set(values) == set(expected_values)  # order changes in each run
    # histograms are tested in histogram tests and they change a lot, no need to test also here
    df_value_meta.histograms = expected_value_meta.histograms = None
    # Timing metrics vary per run, so copy them across before full equality.
    expected_value_meta.histogram_system_metrics = (
        df_value_meta.histogram_system_metrics
    )
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
def get_value_meta(self, value, meta_conf):
    # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
    """
    Build a ValueMeta for *value* per the *meta_conf* toggles: schema, size,
    preview + hash (hash failures are logged, not raised), and histograms
    with their calculation-time metric.
    """
    data_schema = {}
    if meta_conf.log_schema:
        # Insertion order kept identical to the original version.
        data_schema["type"] = self.type_str
        data_schema["columns"] = list(value.columns)
        data_schema["shape"] = value.shape
        data_schema["dtypes"] = {name: str(dtype) for name, dtype in value.dtypes.items()}
    if meta_conf.log_size:
        data_schema["size.bytes"] = int(value.size)

    value_preview = None
    data_hash = None
    if meta_conf.log_preview:
        value_preview = self.to_preview(value, preview_size=meta_conf.get_preview_size())
        try:
            data_hash = fast_hasher.hash(hash_pandas_object(value, index=True).values)
        except Exception as e:
            # Best effort: an unhashable frame should not break metadata collection.
            logger.warning("Could not hash dataframe object %s! Exception: %s", value, e)

    stats, histograms = {}, {}
    hist_sys_metrics = None
    if meta_conf.log_histograms:
        start_time = time.time()
        stats, histograms = PandasHistograms(value, meta_conf).get_histograms_and_stats()
        hist_sys_metrics = {
            "histograms_and_stats_calc_time": time.time() - start_time
        }

    return ValueMeta(
        value_preview=value_preview,
        data_dimensions=value.shape,
        data_schema=data_schema,
        data_hash=data_hash,
        descriptive_stats=stats,
        histogram_system_metrics=hist_sys_metrics,
        histograms=histograms,
    )
def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta output must match an independently built ValueMeta;
    histogram/stats contents are only spot-checked by column name."""
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size.bytes": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values),
    )
    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf)
    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # Fix: this assertion was duplicated (it appeared both before and after
    # the data_dimensions check); the redundant copy is removed.
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    # histograms and stats are tested in histogram tests and they change a lot, no need to test also here
    assert set([
        col_stats.column_name for col_stats in df_value_meta.columns_stats
    ]) == {"Names", "Births"}
    assert set(df_value_meta.histograms.keys()) == {"Names", "Births"}
def get_data_hash(self, value):
    """Return a digest of *value* (a pandas object), index included."""
    row_hashes = hash_pandas_object(value, index=True)
    return fast_hasher.hash(row_hashes.values)