def run(self, x, y, ds_init=None, *args, **kargs):
    """Run the attack on every sample belonging to the attack class.

    Parameters
    ----------
    x : CArray or array-like
        Input samples, one per row (promoted to 2-D).
    y : CArray or array-like
        Labels of the samples.
    ds_init : CDataset or None, optional
        Dataset whose ``X`` rows are used as starting points.
        If None, each sample is used as its own starting point.
    *args, **kargs
        Forwarded unchanged to ``self._run``.

    Returns
    -------
    y_pred : CArray
        Label returned by ``self.problem.model_wrapper.predict`` for each
        optimized point (one entry per attacked sample).
    scores : CArray
        First row of the decision-function output for each optimized
        point, stored in an array with two columns.
    adv_ds : CDataset
        Copy of the input data in which the attacked rows are overwritten
        with the optimized points.
    f_obj : float
        Mean of the objective values returned by ``self._run``.

    """
    x = CArray(x).atleast_2d()
    y = CArray(y).atleast_2d()
    if ds_init is None:
        x_init = None
    else:
        x_init = CArray(ds_init.X).atleast_2d()

    # Positions of the samples the attack is allowed to manipulate
    attackable = self.is_attack_class(y)
    idx = CArray(attackable.find(attackable)).ravel()
    n_mod_samples = idx.size

    adv_ds = CDataset(x.deepcopy(), y.deepcopy())

    # Remember that we are working with sparse data
    if x.issparse is True:
        self._issparse = True

    # Per-sample objective values, predicted labels and scores
    fs_opt = CArray.zeros(n_mod_samples, )
    y_pred = CArray.zeros(n_mod_samples, )
    scores = CArray.zeros((n_mod_samples, 2))

    for i in range(n_mod_samples):
        k = idx[i].item()  # row of the sample being attacked

        if x_init is None:
            start = x[k, :]
        else:
            start = x_init[k, :]

        x_opt, f_opt = self._run(x[k, :], y[k], x_init=start, *args, **kargs)

        self.logger.info(
            "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}".format(
                k, x.shape[0], self._dmax, f_opt,
                self.f_eval, self.grad_eval))

        if x_opt.shape[-1] > adv_ds.X.shape[-1]:
            # The optimized point has more features than the stored rows:
            # the whole matrix is resized, since CDataset can't deal with
            # rows of different length.
            # NOTE(review): 256 is presumably the fill value - confirm.
            new_length = x_opt.shape[-1]
            adv_ds.X = adv_ds.X.resize(
                (adv_ds.X.shape[0], new_length), 256)

        n_written = min(adv_ds.X.shape[-1], x_opt.shape[-1])
        adv_ds.X[k, :n_written] = x_opt
        fs_opt[i] = f_opt

        y_p, score = self.problem.model_wrapper.predict(
            x_opt, return_decision_function=True)
        scores[i, :] = score[0, :]
        y_pred[i] = y_p

    # Mean objective function value over the attacked points
    f_obj = fs_opt.mean()

    return y_pred, scores, adv_ds, f_obj
def _run(self, xc, yc, idx=0):
    """Optimize a single poisoning point.

    Parameters
    ----------
    xc : CArray
        Poisoning point, or a *set* of points (one per row).
    yc : CArray
        Label(s) associated with `xc`.
    idx : int, optional
        Row of `xc` that is manipulated by the attack. Default 0.

    Returns
    -------
    The value returned by the solver: ``maximize`` is used when
    ``self.y_target`` is None (indiscriminate attack), ``minimize``
    otherwise (targeted attack).

    """
    # Work on a private 2-D copy of the input points
    points = CArray(xc.deepcopy()).atleast_2d()

    self._yc = yc
    self._xc = points
    self._idx = idx  # point to be optimized within xc

    # Flat starting point for the solver
    self._x0 = self._xc[idx, :].ravel()

    self._init_solver()

    if self.y_target is not None:
        # targeted attack
        solution = self._solver.minimize(self._x0)
    else:
        # indiscriminate attack
        solution = self._solver.maximize(self._x0)

    self._solution_from_solver()

    return solution
def _forward(self, x):
    """Apply the TF-IDF transform, followed by unit-norm scaling if set.

    Parameters
    ----------
    x : CArray
        Array with features to be transformed.

    Returns
    -------
    CArray
        Output of the wrapped TF-IDF transformer, additionally passed
        through ``self._unitnorm`` when ``self.norm`` is not None.

    """
    tfidf = CArray(self._sklearn_tfidf.transform(x.get_data()))

    if self.norm is None:
        # no normalization requested
        return tfidf

    # Keep a copy of the un-normalized tf-idf output
    # (needed for the gradient computation)
    self._cached_x_tfidf = tfidf.deepcopy()

    return self._unitnorm.transform(tfidf)
class CModuleTestCases(CUnitTest):
    """Shared fixtures and helpers for unit tests of CModule chains."""

    def setUp(self):
        """Create the dense/sparse arrays and labels used by the tests."""
        dense = CArray([[1, 0, 0, 5],
                        [2, 4, 0, 0],
                        [3, 6, 0, 0]])
        self.array_dense = dense
        self.array_sparse = CArray(dense.deepcopy(), tosparse=True)

        self.labels = CArray([0, 1, 0])

        # found bug in sklearn normalizer, see:
        # https://github.com/scikit-learn/scikit-learn/issues/16632
        # self.row_dense = CArray([-4, 0, 6])
        self.row_dense = CArray([4, 0, 6])
        self.column_dense = self.row_dense.deepcopy().T

        self.row_sparse = CArray(self.row_dense.deepcopy(), tosparse=True)
        self.column_sparse = self.row_sparse.deepcopy().T

    @staticmethod
    def _create_chain(class_type_list, kwargs_list):
        """Build a chained module and the same modules left unchained.

        Parameters
        ----------
        class_type_list : list
            Identifiers passed to ``CModule.create``, in chain order.
        kwargs_list : list of dict
            Keyword arguments for each module, indexed like
            `class_type_list`.

        Returns
        -------
        chain
            Last created module, having the previous ones linked through
            the ``preprocess`` argument.
        modules : list
            Independent instances of the same modules (not connected).

        """
        chained = None  # module with preprocessing chain
        standalone = []  # list of modules (not connected)
        for position, class_type in enumerate(class_type_list):
            params = kwargs_list[position]
            chained = CModule.create(
                class_type, preprocess=chained, **params)
            standalone.append(CModule.create(class_type, **params))
        return chained, standalone

    def _test_chain(self, x, class_type_list, kwargs_list, y=None):
        """Check that the chain and manual chaining give the same output.

        Returns the output of the chain forward pass.
        """
        chain, modules = self._create_chain(class_type_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        x_chain = chain.forward(x)
        self.logger.info("Trasformed X (chain):\n{:}".format(x_chain))

        # Fit each standalone module on the output of the previous one
        x_manual = x
        for step in modules:
            step.fit(x_manual, y=y)
            x_manual = step.forward(x_manual)
        self.logger.info("Trasformed X (manual):\n{:}".format(x_manual))

        self.assert_allclose(x_chain, x_manual)

        return x_chain

    def _test_chain_gradient(self, x, class_type_list, kwargs_list, y=None):
        """Check that chain gradient and manual backward pass agree.

        The gradient is computed on the second row of `x`, with respect
        to the last output of the chain. Returns the manually computed
        gradient.
        """
        chain, modules = self._create_chain(class_type_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("module chain:\n{:}".format(chain))

        point = x[1, :]
        fwd_chain = chain.forward(point)  # this has size equal to n_outputs

        # One-hot weights selecting the last output
        n_outputs = fwd_chain.size
        w = CArray.zeros(shape=(n_outputs, ))
        w[-1] = 1

        grad_chain = chain.gradient(point, w=w)
        self.logger.info(
            "chain.forward({:}):\n{:}".format(point, fwd_chain))
        self.logger.info(
            "chain.gradient({:}):\n{:}".format(point, grad_chain))

        # Manually train the standalone modules, one after the other
        for step in modules:
            step.fit(x, y=y)
            x = step.forward(x)

        # Inputs seen by each module when forwarding the single point
        inputs = [point]
        for step in modules[:-1]:
            inputs.append(step.forward(inputs[-1]))

        # Backward pass: from the last module to the first one
        grad = w
        for step, step_input in zip(reversed(modules), reversed(inputs)):
            grad = step.gradient(step_input, w=grad)

        # `step_input` is now the input of the first module
        self.logger.info(
            "chain.gradient({:}):\n{:}".format(step_input, grad))

        self.assert_allclose(grad_chain, grad)

        return grad
def test_input_shape(self):
    """Check when CArray.input_shape is kept and when it is reset.

    Operations returning new objects are expected to expose an
    `input_shape` equal to their own `shape`, while copies, item
    assignment and dense/sparse conversions must keep the shape of the
    original input data.
    """
    from copy import copy, deepcopy

    array = CArray([[[2, 3], [22, 33]], [[4, 5], [44, 55]]])
    array_s = CArray(
        [[[2, 3], [22, 33]], [[4, 5], [44, 55]]], tosparse=True)

    ref_shape = (2, 2, 2)

    # Operations for which input_shape is NOT propagated. Each entry is
    # evaluated lazily so that build and check stay interleaved.
    not_propagating = (
        # getitem (returns new objects)
        lambda: array[0:2, 0:2],
        lambda: array_s[0:2, 0:2],
        # other generic methods (return new objects)
        lambda: array.astype(float),
        lambda: array.unique(),
        lambda: array.all(axis=0),
        # classmethods (es. concatenate/append)
        lambda: CArray.concatenate(array, array, axis=0),
        lambda: CArray.concatenate(array, array, axis=None),
    )
    for build in not_propagating:
        out = build()
        self.assertEqual(out.input_shape, out.shape)

    # Operations for which input_shape must be propagated: copy/deepcopy
    propagating = (
        lambda: copy(array),
        lambda: copy(array_s),
        lambda: deepcopy(array),
        lambda: deepcopy(array_s),
        lambda: array.deepcopy(),
        lambda: array_s.deepcopy(),
    )
    for build in propagating:
        array_c = build()
        self.assertEqual(array_c.input_shape, ref_shape)

    def assigned(source, value):
        """Return a copy of `source` after setting a 2x2 block."""
        target = source.deepcopy()
        target[0:2, 0:2] = value
        return target

    # should propagate on setitem
    self.assertEqual(assigned(array, 200).input_shape, ref_shape)
    self.assertEqual(
        assigned(array, CArray([[100, 200], [300, 400]])).input_shape,
        ref_shape)
    self.assertEqual(
        assigned(array_s, CArray([[100, 200], [300, 400]])).input_shape,
        ref_shape)

    # should propagate on todense/tosparse
    for source in (array, array_s):
        self.assertEqual(source.tosparse().input_shape, ref_shape)
        self.assertEqual(source.todense().input_shape, ref_shape)
class CPreProcessTestCases(CUnitTest):
    """Shared fixtures and helpers for unit tests of CPreProcess chains."""

    def setUp(self):
        """Create the dense/sparse arrays used by the tests."""
        dense = CArray([[1, 0, 0, 5],
                        [2, 4, 0, 0],
                        [3, 6, 0, 0]])
        self.array_dense = dense
        self.array_sparse = CArray(dense.deepcopy(), tosparse=True)

        # found bug in sklearn normalizer, see:
        # https://github.com/scikit-learn/scikit-learn/issues/16632
        # self.row_dense = CArray([-4, 0, 6])
        self.row_dense = CArray([4, 0, 6])
        self.column_dense = self.row_dense.deepcopy().T

        self.row_sparse = CArray(self.row_dense.deepcopy(), tosparse=True)
        self.column_sparse = self.row_sparse.deepcopy().T

    @staticmethod
    def _create_chain(pre_id_list, kwargs_list):
        """Build a chained preprocessor and the same ones left unchained.

        Parameters
        ----------
        pre_id_list : list
            Identifiers passed to ``CPreProcess.create``, in chain order.
        kwargs_list : list of dict
            Keyword arguments for each preprocessor, indexed like
            `pre_id_list`.

        Returns
        -------
        chain
            Last created preprocessor, having the previous ones linked
            through the ``preprocess`` argument.
        pre_list : list
            Independent instances of the same preprocessors.

        """
        chained = None
        standalone = []
        for position, pre_id in enumerate(pre_id_list):
            params = kwargs_list[position]
            chained = CPreProcess.create(
                pre_id, preprocess=chained, **params)
            standalone.append(CPreProcess.create(pre_id, **params))
        return chained, standalone

    def _test_chain(self, x, pre_id_list, kwargs_list, y=None):
        """Check that the chain and manual chaining give the same output.

        When the chain implements `inverse_transform`, the reverted data
        is also compared with the original input. Returns the output of
        the chain.
        """
        chain, pre_list = self._create_chain(pre_id_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        x_chain = chain.transform(x)
        self.logger.info("Trasformed X (chain):\n{:}".format(x_chain))

        # Fit and apply each standalone preprocessor in sequence
        x_manual = x
        for step in pre_list:
            x_manual = step.fit_transform(x_manual, y=y)
        self.logger.info("Trasformed X (manual):\n{:}".format(x_manual))

        self.assert_allclose(x_chain, x_manual)

        # Reverting array (if available)
        try:
            x_chain_revert = chain.inverse_transform(x_chain)
            self.logger.info(
                "Reverted X (chain):\n{:}".format(x_chain_revert))
            self.logger.info("Original X:\n{:}".format(x))
            self.assert_array_almost_equal(x_chain_revert, x)
        except NotImplementedError:
            self.logger.info("inverse_transform not available")

        return x_chain

    def _test_chain_gradient(self, x, pre_id_list, kwargs_list, y=None):
        """Check that chain gradient and manual backward pass agree.

        The gradient is computed on the second row of `x`. Returns the
        gradient computed by the chain.
        """
        chain, pre_list = self._create_chain(pre_id_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        point = x[1, :]
        grad_chain = chain.gradient(point)
        self.logger.info(
            "gradient({:}) (chain):\n{:}".format(point, grad_chain))

        # Manually fit the standalone preprocessors, one after the other
        for step in pre_list:
            x = step.fit_transform(x, y=y)

        # Inputs seen by each preprocessor when transforming the point
        inputs = [point]
        for step in pre_list[:-1]:
            inputs.append(step.transform(inputs[-1]))

        # Backward pass: from the last preprocessor to the first one
        grad = None
        for step, step_input in zip(reversed(pre_list), reversed(inputs)):
            grad = step.gradient(step_input, w=grad)

        # `step_input` is now the input of the first preprocessor
        self.logger.info(
            "gradient({:}) (manual):\n{:}".format(step_input, grad))

        self.assert_allclose(grad_chain, grad)

        return grad_chain
def run(self, x, y, ds_init=None, *args, **kargs):
    """Run the evasion attack on a set of samples.

    Only the samples selected by ``self.is_attack_class`` are
    manipulated; the others are left untouched in the output dataset.

    Parameters
    ----------
    x : CArray
        Data points.
    y : CArray
        True labels.
    ds_init : CDataset or None, optional
        Dataset for warm starts. If None, each sample is used as its
        own starting point.
    *args, **kargs
        Forwarded unchanged to ``self._run``.

    Returns
    -------
    y_pred : CArray
        Labels predicted by ``self.classifier`` for all the samples of
        the manipulated dataset.
    scores : CArray
        Scores returned by ``self.classifier`` for all the samples of
        the manipulated dataset.
    adv_ds : CDataset
        Dataset of manipulated samples.
    f_obj : float
        Mean value of the objective function over the attacked points.

    """
    x = CArray(x).atleast_2d()
    y = CArray(y).atleast_2d()
    if ds_init is None:
        x_init = None
    else:
        x_init = CArray(ds_init.X).atleast_2d()

    # Positions of the samples the attack is allowed to manipulate
    attackable = self.is_attack_class(y)
    idx = CArray(attackable.find(attackable)).ravel()
    n_mod_samples = idx.size

    adv_ds = CDataset(x.deepcopy(), y.deepcopy())

    # Remember that we are working with sparse data
    if x.issparse is True:
        self._issparse = True

    # Objective function value reached on each attacked sample
    fs_opt = CArray.zeros(n_mod_samples, )

    for i in range(n_mod_samples):
        k = idx[i].item()  # row of the sample being attacked

        if x_init is None:
            start = x[k, :]
        else:
            start = x_init[k, :]

        x_opt, f_opt = self._run(x[k, :], y[k], x_init=start, *args, **kargs)

        self.logger.info(
            "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}".format(
                k, x.shape[0], self._dmax, f_opt,
                self.f_eval, self.grad_eval))

        adv_ds.X[k, :] = x_opt
        fs_opt[i] = f_opt

    # Classify the whole manipulated dataset in a single call
    labels, scores = self.classifier.predict(
        adv_ds.X, return_decision_function=True)
    y_pred = CArray(labels)

    # Mean objective function value over the attacked points
    f_obj = fs_opt.mean()

    return y_pred, scores, adv_ds, f_obj
def run(self, x, y, ds_init=None) -> Tuple[CArray, CArray, CDataset, Any]:
    """Run the attack on every sample belonging to the attack class.

    Parameters
    ----------
    x : CArray
        Input samples to perturb, one per row.
    y : CArray
        Original classes of the samples.
    ds_init : CDataset, optional, default None
        Dataset whose ``X`` rows are used as initialization points.
        If None, each sample is used as its own starting point.

    Returns
    -------
    CArray
        y_pred : label returned by ``self.problem.model_wrapper.predict``
        for each optimized point (one entry per attacked sample).
    CArray
        scores : first row of the decision-function output for each
        optimized point, stored in an array with two columns.
    CDataset
        adv_ds : copy of the input data in which the attacked rows are
        overwritten with the adversarial points.
    Any
        f_obj : mean value of the objective function over the attacked
        points.

    """
    x = CArray(x).atleast_2d()
    y = CArray(y).atleast_2d()
    if ds_init is None:
        x_init = None
    else:
        x_init = CArray(ds_init.X).atleast_2d()

    # Positions of the samples the attack is allowed to manipulate
    attackable = self.is_attack_class(y)
    idx = CArray(attackable.find(attackable)).ravel()
    n_mod_samples = idx.size

    adv_ds = CDataset(x.deepcopy(), y.deepcopy())

    # Remember that we are working with sparse data
    if x.issparse is True:
        self._issparse = True

    # Per-sample objective values, predicted labels and scores
    fs_opt = CArray.zeros(n_mod_samples, )
    y_pred = CArray.zeros(n_mod_samples, )
    scores = CArray.zeros((n_mod_samples, 2))

    for i in range(n_mod_samples):
        k = idx[i].item()  # row of the sample being attacked

        if x_init is None:
            start = x[k, :]
        else:
            start = x_init[k, :]

        x_opt, f_opt = self._run(x[k, :], y[k], x_init=start)

        self.logger.info("Point: {:}/{:}, f(x):{:}, eval:{:}/{:}".format(
            k, x.shape[0], f_opt, self.f_eval, self.grad_eval))

        if x_opt.shape[-1] > adv_ds.X.shape[-1]:
            # The optimized point has more features than the stored rows:
            # the whole matrix is resized, since CDataset can't deal with
            # rows of different length.
            # NOTE(review): 256 is presumably the fill value - confirm.
            new_length = x_opt.shape[-1]
            adv_ds.X = adv_ds.X.resize(
                (adv_ds.X.shape[0], new_length), 256)

        n_written = min(adv_ds.X.shape[-1], x_opt.shape[-1])
        adv_ds.X[k, :n_written] = x_opt
        fs_opt[i] = f_opt

        y_p, score = self.problem.model_wrapper.predict(
            x_opt, return_decision_function=True)
        scores[i, :] = score[0, :]
        y_pred[i] = y_p

    # Mean objective function value over the attacked points
    f_obj = fs_opt.mean()

    return y_pred, scores, adv_ds, f_obj
def _euclidean_proj_simplex(self, v, s=1):
    """Compute the Euclidean projection on a positive simplex.

    Solves the optimisation problem (using the algorithm from [1]):

        min_w 0.5 * || w - v ||_2^2 , s.t. \\sum_i w_i = s, w_i >= 0

    Parameters
    ----------
    v : CArray
        1-Dimensional vector
    s : int, optional
        Radius of the simplex. Default 1.

    Returns
    -------
    w : CArray
        Euclidean projection of v on the simplex. If `v` already lies
        on the simplex, the flattened input itself is returned;
        otherwise the result is a new array and `v` is left unchanged.

    Notes
    -----
    The complexity of this algorithm is in O(n log(n)) as it involves
    sorting v. Better alternatives exist for high-dimensional sparse
    vectors (cf. [1]). However, this implementation still easily
    scales to millions of dimensions.

    References
    ----------
    [1] Efficient Projections onto the l1-Ball for
        Learning in High Dimensions
        John Duchi, Shai Shalev-Shwartz, Yoram Singer,
        and Tushar Chandra.
        International Conference on Machine Learning (ICML 2008)
        http://www.cs.berkeley.edu/~jduchi/projects/DuchiSiShCh08.pdf

    """
    v = CArray(v).ravel()
    d = v.size

    # check if we are already on the simplex
    if v.sum() == s and (v >= 0).sum() == d:
        return v  # best projection: itself!

    # get the array of cumulative sums of a sorted (decreasing) copy of v
    u = v.deepcopy()
    u.sort(inplace=True)
    u = u[::-1]

    # For sparse data only the stored (non-null) elements are considered
    u_vals = CArray(u.nnz_data).todense() if u.issparse else u
    cssv = u_vals.cumsum()

    # get the number of > 0 components of the optimal solution
    # (only considering non-null elements in v)
    j = CArray.arange(1, cssv.size + 1)
    rho = (j * u_vals > (cssv - s)).sum() - 1

    # compute the Lagrange multiplier associated to the simplex constraint
    theta = (cssv[rho] - s) / (rho + 1.0)

    # compute the projection by thresholding v using theta.
    # The thresholding is done in-place, so it is applied to a copy:
    # `v` may share its memory with the array passed by the caller,
    # which must not be modified by a projection.
    w = v.deepcopy()
    if w.issparse:
        p = CArray(w.nnz_data)
        p -= theta
        w[w.nnz_indices] = p
    else:
        w -= theta
    w[w < 0] = 0

    return w
class CArrayTestCases(CUnitTest):
    """Shared fixtures and helpers for the CArray unit tests."""

    def setUp(self):
        """Create the dense and sparse arrays used by the tests."""

        def sparse_copy(dense):
            """Return a sparse CArray built from a copy of `dense`."""
            return CArray(dense.deepcopy(), tosparse=True)

        # Generic 2-D arrays
        self.array_dense = CArray([[1, 0, 0, 5],
                                   [2, 4, 0, 0],
                                   [3, 6, 0, 0]])
        self.array_sparse = sparse_copy(self.array_dense)

        # Symmetric
        self.array_dense_sym = CArray([[1, 2, 0],
                                       [2, 4, 6],
                                       [0, 6, 0]])
        self.array_sparse_sym = sparse_copy(self.array_dense_sym)

        # Without zeros
        self.array_dense_nozero = CArray([[1, 2, 3, 4],
                                          [5, 6, 7, 8],
                                          [9, 10, 11, 12]])
        self.array_sparse_nozero = sparse_copy(self.array_dense_nozero)

        # Only zeros
        self.array_dense_allzero = CArray([[0, 0, 0, 0],
                                           [0, 0, 0, 0],
                                           [0, 0, 0, 0]])
        self.array_sparse_allzero = sparse_copy(self.array_dense_allzero)

        # Boolean arrays: mixed, all True, all False
        self.array_dense_bool = CArray([[True, False, True, True],
                                        [False, False, False, False],
                                        [True, True, True, True]])
        self.array_sparse_bool = sparse_copy(self.array_dense_bool)

        self.array_dense_bool_true = CArray([[True, True, True, True],
                                             [True, True, True, True],
                                             [True, True, True, True]])
        self.array_sparse_bool_true = sparse_copy(
            self.array_dense_bool_true)

        self.array_dense_bool_false = CArray(
            [[False, False, False, False],
             [False, False, False, False],
             [False, False, False, False]])
        self.array_sparse_bool_false = sparse_copy(
            self.array_dense_bool_false)

        # Row and column vectors
        self.row_flat_dense = CArray([4, 0, 6])
        self.row_dense = self.row_flat_dense.atleast_2d()
        self.column_dense = self.row_dense.deepcopy().T

        self.row_sparse = sparse_copy(self.row_dense)
        self.column_sparse = self.row_sparse.deepcopy().T

        # Single-element arrays (non-zero and zero)
        self.single_flat_dense = CArray([4])
        self.single_dense = self.single_flat_dense.atleast_2d()
        self.single_sparse = sparse_copy(self.single_dense)

        self.single_flat_dense_zero = CArray([0])
        self.single_dense_zero = self.single_flat_dense_zero.atleast_2d()
        self.single_sparse_zero = sparse_copy(self.single_dense_zero)

        # Single-element boolean arrays (True and False)
        self.single_bool_flat_dense = CArray([True])
        self.single_bool_dense = self.single_bool_flat_dense.atleast_2d()
        self.single_bool_sparse = sparse_copy(self.single_bool_dense)

        self.single_bool_flat_dense_false = CArray([False])
        self.single_bool_dense_false = \
            self.single_bool_flat_dense_false.atleast_2d()
        self.single_bool_sparse_false = sparse_copy(
            self.single_bool_dense_false)

        # Empty arrays
        self.empty_flat_dense = CArray([], tosparse=False)
        self.empty_dense = CArray([[]], tosparse=False)
        self.empty_sparse = CArray([], tosparse=True)

    def _test_multiple_eq(self, items_list):
        """Assert that all the given items are equal; return True.

        Each element is compared with the following one (first with
        second, second with third, etc.), so a failure is raised by
        `assert_array_equal` on the first pair that differs.
        """
        for current, following in zip(items_list, items_list[1:]):
            self.assert_array_equal(current, following)

        # Every item is equal to each other
        return True

    def _test_operator_cycle(self, totest_op, totest_items, totest_result):
        """Check if operator return the expected result on given items.

        totest_op: list of operators
        totest_items: list of items PAIR to test
        totest_result: list of expected result (class name) for each PAIR

        For each operator, the results obtained on all the pairs must
        also be equal to each other.
        """

        def payload_type(item):
            """Type of the wrapped data for objects having `isdense`,
            type of the object itself otherwise."""
            if hasattr(item, 'isdense'):
                return type(item._data)
            return type(item)

        for operator in totest_op:
            results = []
            for pair_idx, pair in enumerate(totest_items):
                first, second = pair[0], pair[1]

                class0 = payload_type(first)
                class1 = payload_type(second)
                self.logger.info(
                    "Operator {:} between {:} and {:}".format(
                        operator.__name__, class0, class1))

                outcome = operator(first, second)

                self.assertIsInstance(
                    outcome._data, totest_result[pair_idx])
                self.logger.info("Result: {:}".format(
                    outcome._data.__class__.__name__))

                results.append(outcome)

            self.assertTrue(self._test_multiple_eq(results))

    def _test_operator_notimplemented(self, totest_op, totest_items):
        """Check if operator is not implemented for given items.

        totest_op: list of operators
        totest_items: list of items PAIR to test

        Every operator must raise NotImplementedError on every pair.
        """
        for operator in totest_op:
            for pair in totest_items:
                first, second = pair[0], pair[1]
                with self.assertRaises(NotImplementedError):
                    operator(first, second)