Пример #1
0
def compute_container_hash(
        data_container: Union[QFSeries, QFDataFrame, QFDataArray]) -> str:
    """
    Compute the hexadecimal SHA-1 digest of the data held in a container.

    Series and data frames are hashed directly with ``hash_pandas_object``.
    For a data array, every field slice is converted to pandas and hashed
    separately; the per-field hashes are then collected in a QFDataFrame,
    which is hashed once more.

    Parameters
    ----------
    data_container: QFSeries, QFDataFrame, QFDataArray
        container whose digest should be computed

    Returns
    -------
    str
        hexadecimal digest of the data in the passed container

    Raises
    ------
    ValueError
        if the container is none of the supported types
    """
    if isinstance(data_container, (QFSeries, QFDataFrame)):
        element_hashes = hash_pandas_object(data_container)
    elif isinstance(data_container, QFDataArray):
        per_field_hashes = []
        for field in data_container.fields:
            field_frame = data_container.loc[:, :, field].to_pandas()
            per_field_hashes.append(hash_pandas_object(field_frame))
        element_hashes = hash_pandas_object(QFDataFrame(per_field_hashes))
    else:
        raise ValueError("Unsupported type of data container")

    # The raw buffer of the hash values is what gets fed into SHA-1.
    digest = hashlib.sha1(element_hashes.values)
    return digest.hexdigest()
Пример #2
0
def _get_rel_trends(data):
    """Return weekly ``cases_per_100k`` values per zip, cached by data hash.

    The result of the previous call is kept in the module-level
    ``CACHED_REL_TRENDS`` / ``CACHED_REL_TRENDS_HASH`` pair and returned
    as-is when ``data`` hashes to the same values as before.

    Otherwise, for every ``zip`` group a 7-day date range is built from the
    group's earliest ``date`` up to the latest ``date`` in the whole input;
    the group's ``cases_per_100k`` values are aligned onto that range and
    dates without a matching observation get 0. The grouped result is
    flattened with ``reset_index`` and its ``0`` column is renamed to
    ``cases_per_100k``.
    """
    global CACHED_REL_TRENDS, CACHED_REL_TRENDS_HASH

    data_hash = hash_pandas_object(data)
    cache_hit = (CACHED_REL_TRENDS_HASH is not None
                 and CACHED_REL_TRENDS_HASH.equals(data_hash))
    if cache_hit:
        return CACHED_REL_TRENDS

    last_date = data['date'].max()

    def _weekly_cases(zip_rows):
        # Every zip gets a weekly grid ending on the same global last date.
        weekly_index = pd.date_range(start=zip_rows['date'].min(),
                                     end=last_date,
                                     freq='7d')
        observed = pd.Series(zip_rows['cases_per_100k'].values,
                             index=zip_rows['date'])
        grid = pd.Series(index=weekly_index, dtype='float')
        # Left join keeps the grid's index; only the aligned observations
        # (second element) are of interest.
        _, aligned = grid.align(observed, join='left', fill_value=0)
        return aligned

    trends = data.groupby('zip').apply(_weekly_cases).reset_index()
    trends = trends.rename(columns={0: 'cases_per_100k'})

    CACHED_REL_TRENDS_HASH = data_hash
    CACHED_REL_TRENDS = trends
    return trends
Пример #3
0
    def get_value_meta(self, value, meta_conf):
        # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
        """Build the ``ValueMeta`` for the data frame ``value``.

        ``meta_conf`` decides what is filled in:

        * ``log_schema`` adds ``type``, ``columns``, ``shape`` and ``dtypes``
          to the schema;
        * ``log_size`` adds ``size`` (``value.size`` as an int);
        * ``log_preview`` produces both the preview and the data hash;
          when it is off both are ``None``.

        ``data_dimensions`` is always ``value.shape``.
        """
        schema = {}
        if meta_conf.log_schema:
            schema["type"] = self.type_str
            schema["columns"] = list(value.columns)
            schema["shape"] = value.shape
            schema["dtypes"] = {
                name: str(dtype) for name, dtype in value.dtypes.items()
            }

        if meta_conf.log_size:
            schema["size"] = int(value.size)

        preview = None
        digest = None
        if meta_conf.log_preview:
            preview = self.to_preview(
                value, preview_size=meta_conf.get_preview_size())
            # Hash the pandas hash values (index included) of the frame.
            pandas_hashes = hash_pandas_object(value, index=True)
            digest = fast_hasher.hash(pandas_hashes.values)

        return ValueMeta(
            value_preview=preview,
            data_dimensions=value.shape,
            data_schema=schema,
            data_hash=digest,
        )
Пример #4
0
    def test_df_value_meta(self, pandas_data_frame):
        """Check ``DataFrameValueType.get_value_meta`` with everything enabled.

        The expected ``ValueMeta`` is assembled by hand from the fixture frame
        and compared with the produced one field by field, then as a whole.
        """
        frame = pandas_data_frame
        dtype_names = {name: str(dtype) for name, dtype in frame.dtypes.items()}
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(frame.columns),
            "size": int(frame.size),
            "shape": frame.shape,
            "dtypes": dtype_names,
        }

        meta_conf = ValueMetaConf.enabled()
        expected_preview = DataFrameValueType().to_preview(
            frame, preview_size=meta_conf.get_preview_size())
        expected_hash = fast_hasher.hash(
            hash_pandas_object(frame, index=True).values)
        expected = ValueMeta(
            value_preview=expected_preview,
            data_dimensions=frame.shape,
            data_schema=expected_data_schema,
            data_hash=expected_hash,
        )

        actual = DataFrameValueType().get_value_meta(frame, meta_conf=meta_conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        # Schemas are compared through their JSON serialisation.
        actual_schema_json = json_utils.dumps(actual.data_schema)
        expected_schema_json = json_utils.dumps(expected.data_schema)
        assert actual_schema_json == expected_schema_json
        assert actual.data_dimensions == expected.data_dimensions
        assert actual == expected
    def test_df_value_meta(
        self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
    ):
        """Check ``DataFrameValueType.get_value_meta`` including stats/histograms.

        The expected ``ValueMeta`` is built from the fixtures. Fields that are
        unstable between runs (``std`` precision, the ``top`` name, the order
        of histogram values, system metrics) are compared loosely or
        neutralised before the final whole-object comparison.
        """
        frame = pandas_data_frame
        dtype_names = {name: str(dtype) for name, dtype in frame.dtypes.items()}
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(frame.columns),
            "size": int(frame.size),
            "shape": frame.shape,
            "dtypes": dtype_names,
        }

        meta_conf = ValueMetaConf.enabled()
        expected_preview = DataFrameValueType().to_preview(
            frame, preview_size=meta_conf.get_preview_size()
        )
        expected_hash = fast_hasher.hash(
            hash_pandas_object(frame, index=True).values
        )
        expected = ValueMeta(
            value_preview=expected_preview,
            data_dimensions=frame.shape,
            data_schema=expected_data_schema,
            data_hash=expected_hash,
            descriptive_stats=pandas_data_frame_stats,
            histograms=pandas_data_frame_histograms,
        )

        actual = DataFrameValueType().get_value_meta(frame, meta_conf=meta_conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        actual_schema_json = json_utils.dumps(actual.data_schema)
        expected_schema_json = json_utils.dumps(expected.data_schema)
        assert actual_schema_json == expected_schema_json
        assert actual.data_dimensions == expected.data_dimensions

        # "std" is compared after rounding and removed from both sides;
        # "top" is dropped from the produced stats before the dict comparison.
        actual_std = actual.descriptive_stats["Births"].pop("std")
        expected_std = expected.descriptive_stats["Births"].pop("std")
        assert round(actual_std, 2) == expected_std
        actual.descriptive_stats["Names"].pop("top")
        assert actual.descriptive_stats == expected.descriptive_stats

        actual_counts, actual_values = actual.histograms.pop("Names")
        expected_counts, expected_values = expected.histograms.pop("Names")
        assert actual_counts == expected_counts
        # The order of the values changes on every run, so compare as sets.
        assert set(actual_values) == set(expected_values)
        # Histograms are covered by the histogram tests and change a lot,
        # so they are not compared here.
        actual.histograms = None
        expected.histograms = None

        # System metrics are copied over so they cannot affect the comparison.
        expected.histogram_system_metrics = actual.histogram_system_metrics
        assert actual.data_schema == expected.data_schema
        assert attr.asdict(actual) == attr.asdict(expected)
Пример #6
0
    def get_value_meta(self, value, meta_conf):
        # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
        """Build the ``ValueMeta`` for the data frame ``value``.

        ``meta_conf`` decides what is filled in:

        * ``log_schema`` adds ``type``, ``columns``, ``shape`` and ``dtypes``
          to the schema;
        * ``log_size`` adds ``size.bytes`` (set to ``int(value.size)``);
        * ``log_preview`` produces the preview and attempts to hash the data;
          a hashing failure is logged as a warning and leaves the hash
          ``None``;
        * ``log_histograms`` computes stats and histograms through
          ``PandasHistograms`` and records how long that took; when it is
          off, stats and histograms are empty dicts and the metrics ``None``.

        ``data_dimensions`` is always ``value.shape``.
        """
        schema = {}
        if meta_conf.log_schema:
            schema["type"] = self.type_str
            schema["columns"] = list(value.columns)
            schema["shape"] = value.shape
            schema["dtypes"] = {
                name: str(dtype) for name, dtype in value.dtypes.items()
            }

        if meta_conf.log_size:
            schema["size.bytes"] = int(value.size)

        preview = None
        digest = None
        if meta_conf.log_preview:
            preview = self.to_preview(
                value, preview_size=meta_conf.get_preview_size())
            # Hashing is best effort: a failure must not prevent the rest of
            # the metadata from being returned.
            try:
                pandas_hashes = hash_pandas_object(value, index=True)
                digest = fast_hasher.hash(pandas_hashes.values)
            except Exception as err:
                logger.warning(
                    "Could not hash dataframe object %s! Exception: %s",
                    value, err)

        stats, histograms = {}, {}
        hist_sys_metrics = None
        if meta_conf.log_histograms:
            started_at = time.time()
            calculator = PandasHistograms(value, meta_conf)
            stats, histograms = calculator.get_histograms_and_stats()
            elapsed = time.time() - started_at
            hist_sys_metrics = {"histograms_and_stats_calc_time": elapsed}

        return ValueMeta(
            value_preview=preview,
            data_dimensions=value.shape,
            data_schema=schema,
            data_hash=digest,
            descriptive_stats=stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histograms,
        )
Пример #7
0
    def test_df_value_meta(self, pandas_data_frame):
        """Check ``DataFrameValueType.get_value_meta`` with everything enabled.

        Preview, hash, schema and dimensions are compared with hand-built
        expectations; for stats and histograms only the covered column names
        are checked.
        """
        frame = pandas_data_frame
        dtype_names = {name: str(dtype) for name, dtype in frame.dtypes.items()}
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(frame.columns),
            "size.bytes": int(frame.size),
            "shape": frame.shape,
            "dtypes": dtype_names,
        }

        meta_conf = ValueMetaConf.enabled()
        expected_preview = DataFrameValueType().to_preview(
            frame, preview_size=meta_conf.get_preview_size())
        expected_hash = fast_hasher.hash(
            hash_pandas_object(frame, index=True).values)
        expected = ValueMeta(
            value_preview=expected_preview,
            data_dimensions=frame.shape,
            data_schema=expected_data_schema,
            data_hash=expected_hash,
        )

        actual = DataFrameValueType().get_value_meta(frame, meta_conf=meta_conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        assert actual.data_schema == expected.data_schema

        assert actual.data_dimensions == expected.data_dimensions
        # NOTE(review): repeats the schema assertion above - candidate for removal.
        assert actual.data_schema == expected.data_schema

        # Histograms and stats are covered by the histogram tests and change
        # a lot, so only the column names are verified here.
        stats_columns = {
            column_stats.column_name for column_stats in actual.columns_stats
        }
        assert stats_columns == {"Names", "Births"}
        histogram_columns = set(actual.histograms.keys())
        assert histogram_columns == {"Names", "Births"}
Пример #8
0
 def get_data_hash(self, value):
     """Return ``fast_hasher``'s hash of the pandas hash values of ``value``.

     The pandas hashes are computed with the index included.
     """
     pandas_hashes = hash_pandas_object(value, index=True)
     return fast_hasher.hash(pandas_hashes.values)