Example #1
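These snippets exercise SynapseML's cyber-security indexers and the AccessAnomaly estimator, and all of them assume a live Spark session. A minimal setup sketch follows; the import paths are assumptions based on SynapseML's package layout, so verify them against your installed version:

    from typing import Type

    import pyspark.sql.functions as f
    from pyspark.sql import DataFrame, SparkSession

    # Assumed module paths -- adjust to the installed SynapseML version.
    from synapse.ml.cyber.feature import indexers
    from synapse.ml.cyber.anomaly.collaborative_filtering import (
        AccessAnomaly,
        AccessAnomalyConfig,
        AccessAnomalyModel,
        ConnectedComponents,
        ModelNormalizeTransformer,
    )

    spark = SparkSession.builder.getOrCreate()

With that in place, the first example checks that undo_transform recovers the original user and resource names after indexing: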
    def test_multi_indexer_undo_transform(self):
        multi_indexer = indexers.MultiIndexer([
            indexers.IdIndexer('user', 'tenant', 'actual_uid', True),
            indexers.IdIndexer('res', 'tenant', 'actual_rid', True),
        ])

        df = self.create_sample_dataframe()
        model = multi_indexer.fit(df)
        new_df = model.transform(df)

        assert new_df.filter(f.col('actual_uid') <= 0).count() == 0
        assert new_df.filter(f.col('actual_rid') <= 0).count() == 0

        orig_df = model.undo_transform(new_df.select(
            'tenant', 'actual_uid', 'actual_rid'
        ))

        assert orig_df.select(
            'tenant', 'user'
        ).distinct().orderBy('tenant', 'user').collect() == df.select(
            'tenant', 'user'
        ).distinct().orderBy('tenant', 'user').collect()

        assert orig_df.select(
            'tenant', 'res'
        ).distinct().orderBy('tenant', 'res').collect() == df.select(
            'tenant', 'res'
        ).distinct().orderBy('tenant', 'res').collect()
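Each test builds its input with a create_sample_dataframe helper that is not reproduced here. Judging from the assertions, it yields tenant/user/resource triples plus expected_uid and expected_rid columns holding the 1-based indices the indexer should assign. A hypothetical stand-in for the test class (the rows and expected ids are illustrative only, and assume ids are assigned in first-seen order):

    def create_sample_dataframe(self) -> DataFrame:
        # Hypothetical fixture: access triples plus the per-tenant,
        # 1-based indices we expect the IdIndexer to produce.
        return spark.createDataFrame(
            [
                ('tenant1', 'alice', 'res_a', 1, 1),
                ('tenant1', 'bob',   'res_b', 2, 2),
                ('tenant2', 'carol', 'res_a', 1, 1),
            ],
            ['tenant', 'user', 'res', 'expected_uid', 'expected_rid'],
        )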
Example #2
    def test_multi_indexer_non_per_tenant(self):
        multi_indexer = indexers.MultiIndexer([
            indexers.IdIndexer('user', 'tenant', 'actual_uid', False),
            indexers.IdIndexer('res', 'tenant', 'actual_rid', False)
        ])

        df = self.create_sample_dataframe()
        model = multi_indexer.fit(df)
        new_df = model.transform(df)

        assert new_df.count() == df.count()
        assert new_df.filter(f.col('actual_uid') <= 0).count() == 0
        assert new_df.filter(f.col('actual_rid') <= 0).count() == 0

        user_count = df.select('tenant', 'user').distinct().count()
        res_count = df.select('tenant', 'res').distinct().count()

        assert new_df.select('actual_uid').distinct().count() == user_count
        assert new_df.select('actual_rid').distinct().count() == res_count

        stats_row = new_df.select(
            f.min('actual_uid').alias('min_uid'),
            f.max('actual_uid').alias('max_uid'),
            f.min('actual_rid').alias('min_rid'),
            f.max('actual_rid').alias('max_rid')
        ).first()

        assert (stats_row['min_uid'] == 1) and (stats_row['max_uid'] == user_count)
        assert (stats_row['min_rid'] == 1) and (stats_row['max_rid'] == res_count)

        orig_df = model.undo_transform(new_df).select(
            'tenant', 'user', 'res'
        ).orderBy('tenant', 'user', 'res')

        assert df.select('tenant', 'user', 'res').orderBy(
            'tenant', 'user', 'res'
        ).collect() == orig_df.collect()
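Taken together with Example #1, this pins down the reset_per_partition flag: True restarts ids at 1 within every tenant, while False assigns one contiguous 1..N range across all tenants. A quick sketch showing the difference on a toy DataFrame:

    df = spark.createDataFrame(
        [('t1', 'alice'), ('t1', 'bob'), ('t2', 'alice')],
        ['tenant', 'user'],
    )

    per_tenant = indexers.IdIndexer('user', 'tenant', 'uid', True).fit(df).transform(df)
    global_ids = indexers.IdIndexer('user', 'tenant', 'uid', False).fit(df).transform(df)

    # reset_per_partition=True: each tenant's ids restart at 1, so both
    # tenants contain uid == 1. reset_per_partition=False: one contiguous
    # range 1..3 over all distinct (tenant, user) pairs.
    per_tenant.groupBy('tenant').agg(f.max('uid').alias('max_uid')).show()
    global_ids.agg(f.max('uid').alias('max_uid')).show()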
Example #3
    def test_explain(self):
        types = [str, bool]

        def counts(c: int, tt: Type):
            # Param types outside `types` pass unconditionally; the
            # tracked types (str, bool) must occur at least once.
            return tt not in types or c > 0

        params = ['inputCol', 'partitionKey', 'outputCol', 'resetPerPartition']
        self.check_explain(indexers.IdIndexer('input', 'tenant', 'output', True), params, counts)
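check_explain is a shared test helper (not shown) that verifies every listed parameter is documented. Outside the harness, the same information is available through the standard pyspark.ml Params API, which IdIndexer evidently implements:

    indexer = indexers.IdIndexer('input', 'tenant', 'output', True)
    # Prints one documented line per Param: inputCol, partitionKey,
    # outputCol and resetPerPartition.
    print(indexer.explainParams())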
Example #4
    def test_multi_indexer(self):
        multi_indexer = indexers.MultiIndexer([
            indexers.IdIndexer('user', 'tenant', 'actual_uid', True),
            indexers.IdIndexer('res', 'tenant', 'actual_rid', True),
        ])

        df = self.create_sample_dataframe()
        model = multi_indexer.fit(df)
        new_df = model.transform(df)

        assert new_df.count() == df.count()
        assert new_df.filter(f.col('actual_uid') <= 0).count() == 0
        assert new_df.filter(f.col('actual_rid') <= 0).count() == 0

        assert 0 == new_df.filter(
            f.col('expected_uid') != f.col('actual_uid')
        ).count()

        assert 0 == new_df.filter(
            f.col('expected_rid') != f.col('actual_rid')
        ).count()
Example #5
    def test_id_indexer(self):
        indexer = indexers.IdIndexer('user', 'tenant', 'actual_uid', True)

        df = self.create_sample_dataframe()
        model = indexer.fit(df)
        new_df = model.transform(df)

        assert new_df.count() == df.count()

        assert 0 == new_df.filter(
            f.col('expected_uid') != f.col('actual_uid')
        ).count()
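MultiIndexer (Examples #1, #2, #4) simply fans a DataFrame out over a list of IdIndexers, and the individual fitted models can be pulled back out with get_model_by_input_col, exactly as _fit does in Example #6. A small sketch of that retrieval, on a toy DataFrame:

    df = spark.createDataFrame(
        [('t1', 'alice', 'res_a'), ('t1', 'bob', 'res_b')],
        ['tenant', 'user', 'res'],
    )

    multi_model = indexers.MultiIndexer([
        indexers.IdIndexer('user', 'tenant', 'uid', True),
        indexers.IdIndexer('res', 'tenant', 'rid', True),
    ]).fit(df)

    # Retrieve the fitted per-column model to undo a single mapping.
    user_model = multi_model.get_model_by_input_col('user')
    restored_df = user_model.undo_transform(multi_model.transform(df))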
Example #6
    def _fit(self, df: DataFrame) -> AccessAnomalyModel:
        # index the user and resource columns to allow running the spark ALS algorithm
        the_indexer = indexers.MultiIndexer(indexers=[
            indexers.IdIndexer(input_col=self.user_col,
                               partition_key=self.tenant_col,
                               output_col=self.indexed_user_col,
                               reset_per_partition=self.separate_tenants),
            indexers.IdIndexer(input_col=self.res_col,
                               partition_key=self.tenant_col,
                               output_col=self.indexed_res_col,
                               reset_per_partition=self.separate_tenants)
        ])

        the_indexer_model = the_indexer.fit(df)

        # indexed_df is the dataframe with the indices for user and resource
        indexed_df = the_indexer_model.transform(df)
        enriched_df = self._enrich_and_normalize(indexed_df).cache()

        user_res_feature_vector_mapping_df = self.create_spark_model_vectors_df(
            enriched_df)
        user_res_norm_cf_df_model = ModelNormalizeTransformer(
            enriched_df,
            self.rank_param).transform(user_res_feature_vector_mapping_df)

        # convert user and resource indices back to names
        user_index_model = the_indexer_model.get_model_by_input_col(
            self.user_col)
        res_index_model = the_indexer_model.get_model_by_input_col(
            self.res_col)
        assert user_index_model is not None and res_index_model is not None

        norm_user_mapping_df = user_res_norm_cf_df_model.user_feature_vector_mapping_df
        norm_res_mapping_df = user_res_norm_cf_df_model.res_feature_vector_mapping_df

        indexed_user_col = self.indexed_user_col
        indexed_res_col = self.indexed_res_col

        # do the actual index to name mapping (using undo_transform)
        final_user_mapping_df = user_index_model.undo_transform(
            norm_user_mapping_df).drop(indexed_user_col)
        final_res_mapping_df = res_index_model.undo_transform(
            norm_res_mapping_df).drop(indexed_res_col)

        tenant_col, user_col, res_col = self.tenant_col, self.user_col, self.res_col

        history_access_df = self.history_access_df
        access_df = (
            history_access_df if history_access_df is not None
            else df.select(tenant_col, user_col, res_col).cache()
        )

        user2component_mappings_df, res2component_mappings_df = ConnectedComponents(
            tenant_col, user_col, res_col).transform(access_df)

        return AccessAnomalyModel(
            _UserResourceFeatureVectorMapping(
                tenant_col=self.tenant_col,
                user_col=self.user_col,
                user_vec_col=self.user_vec_col,
                res_col=self.res_col,
                res_vec_col=self.res_vec_col,
                history_access_df=history_access_df,
                user2component_mappings_df=user2component_mappings_df,
                res2component_mappings_df=res2component_mappings_df,
                user_feature_vector_mapping_df=final_user_mapping_df.cache(),
                res_feature_vector_mapping_df=final_res_mapping_df.cache()),
            self.output_col)
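_fit above is the internal implementation of the AccessAnomaly estimator; user code only touches the public Estimator API. A rough usage sketch, where only the tenantCol, maxIter and applyImplicitCf parameter names are confirmed by the test below and the tiny input is illustrative (real training data would be much larger):

    access_df = spark.createDataFrame(
        [('t1', 'alice', 'res_a'), ('t1', 'alice', 'res_b'),
         ('t1', 'bob', 'res_b'), ('t1', 'carol', 'res_c')],
        ['tenant', 'user', 'res'],
    )

    access_anomaly = AccessAnomaly(tenantCol='tenant', maxIter=10)
    model = access_anomaly.fit(access_df)    # returns an AccessAnomalyModel
    scored_df = model.transform(access_df)   # appends the anomaly-score output column

The test below drills into _enrich_and_normalize, the enrichment step called midway through _fit: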
    def test_enrich_and_normalize(self):
        training = Dataset.create_new_training(1.0).cache()

        access_anomaly = AccessAnomaly(
            tenantCol=AccessAnomalyConfig.default_tenant_col,
            maxIter=10,
            applyImplicitCf=False)

        tenant_col = access_anomaly.tenant_col
        user_col = access_anomaly.user_col
        indexed_user_col = access_anomaly.indexed_user_col
        res_col = access_anomaly.res_col
        indexed_res_col = access_anomaly.indexed_res_col
        scaled_likelihood_col = access_anomaly.scaled_likelihood_col

        assert training.filter(f.col(user_col).isNull()).count() == 0
        assert training.filter(f.col(res_col).isNull()).count() == 0

        the_indexer = indexers.MultiIndexer(indexers=[
            indexers.IdIndexer(input_col=user_col,
                               partition_key=tenant_col,
                               output_col=indexed_user_col,
                               reset_per_partition=False),
            indexers.IdIndexer(input_col=res_col,
                               partition_key=tenant_col,
                               output_col=indexed_res_col,
                               reset_per_partition=False)
        ])

        the_indexer_model = the_indexer.fit(training)
        indexed_df = materialized_cache(the_indexer_model.transform(training))

        assert indexed_df.filter(f.col(indexed_user_col).isNull()).count() == 0
        assert indexed_df.filter(f.col(indexed_res_col).isNull()).count() == 0
        assert indexed_df.filter(f.col(indexed_user_col) <= 0).count() == 0
        assert indexed_df.filter(f.col(indexed_res_col) <= 0).count() == 0

        unindexed_df = materialized_cache(
            the_indexer_model.undo_transform(indexed_df))
        assert unindexed_df.filter(f.col(user_col).isNull()).count() == 0
        assert unindexed_df.filter(f.col(res_col).isNull()).count() == 0

        enriched_indexed_df = materialized_cache(
            access_anomaly._enrich_and_normalize(indexed_df))
        enriched_df = materialized_cache(
            without_ffa(the_indexer_model.undo_transform(enriched_indexed_df)))

        assert enriched_df.filter(f.col(user_col).isNull()).count() == 0
        assert enriched_df.filter(f.col(res_col).isNull()).count() == 0

        assert enriched_df.filter(
            (get_department(user_col) == get_department(res_col))
            & (f.col(scaled_likelihood_col) == 1.0)).count() == 0

        assert enriched_df.filter(
            (get_department(user_col) != get_department(res_col))
            & (f.col(scaled_likelihood_col) != 1.0)).count() == 0

        assert enriched_df.filter(
            get_department(user_col) != get_department(res_col)
        ).count() == enriched_df.filter(
            f.col(scaled_likelihood_col) == 1.0
        ).count()

        assert enriched_df.filter(
            get_department(user_col) == get_department(res_col)
        ).count() == enriched_df.filter(
            f.col(scaled_likelihood_col) != 1.0
        ).count()

        low_value = access_anomaly.low_value
        high_value = access_anomaly.high_value

        assert enriched_df.count() > training.count()
        assert enriched_df.filter(
            ((f.col(scaled_likelihood_col) >= low_value)
             & (f.col(scaled_likelihood_col) <= high_value))
            | (f.col(scaled_likelihood_col) == 1.0)
        ).count() == enriched_df.count()
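materialized_cache, without_ffa, get_department and Dataset.create_new_training are further test-suite helpers that are not reproduced here. materialized_cache is the easiest to approximate; a sketch, assuming it merely forces evaluation of a cached DataFrame:

    def materialized_cache(df: DataFrame) -> DataFrame:
        # Cache the DataFrame and force materialization so that the
        # assertions that follow do not recompute the whole lineage.
        cached = df.cache()
        cached.count()
        return cached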