def Y(self, value):
    """Set Dataset Labels.

    Parameters
    ----------
    value : `array_like` or CArray
        Array containing labels. Array is converted to dense format
        and flattened before storing.

    """
    y = CArray(value).todense().ravel()
    if self._X is not None:  # Checking number of samples/labels equality
        self._check_samples_labels(y=y)
    self._Y = y
def apply_feasible_manipulations(self, t, x: CArray) -> CArray:
    """Apply the format-exploit practical manipulation to the input sample.

    Parameters
    ----------
    t : CArray
        the vector of manipulations in [0,1]
    x : CArray
        the input space sample to perturb

    Returns
    -------
    CArray
        the adversarial malware
    """
    # Map the manipulations from [0, 1] to integer byte values in [0, 255]
    byte_values = (t * 255).astype(int)
    x_adv, _ = self._craft_perturbed_c_array(x)
    for i, index in enumerate(self.indexes_to_perturb):
        x_adv[index] = byte_values[i]
    x_adv = CArray(x_adv)
    x_adv = x_adv.reshape((1, x_adv.shape[-1]))
    return CArray(x_adv)
def setUpClass(cls):
    CAttackEvasionCleverhansTestCases.setUpClass()

    cls.seed = 0
    cls.y_target = None

    cls.clf = CClassifierMulticlassOVA(
        CClassifierSVM, kernel=CKernelRBF(gamma=10), C=0.1,
        preprocess=CNormalizerMinMax())

    cls.ds = CDLRandomBlobs(n_features=0,
                            centers=[[0.1, 0.1], [0.5, 0], [0.8, 0.8]],
                            cluster_std=0.01,
                            n_samples=100,
                            random_state=cls.seed).load()

    cls.clf.fit(cls.ds.X, cls.ds.Y)

    cls.x0 = CArray([0.6, 0.2])
    cls.y0 = CArray(cls.clf.predict(cls.x0))
def X(self, value):
    """Set Dataset Patterns.

    Parameters
    ----------
    value : `array_like` or CArray
        Array containing patterns. Data is converted to 2-Dimensions
        before storing.

    """
    x = CArray(value).atleast_2d()
    if self.Y is not None:  # Checking number of samples/labels equality
        self._check_samples_labels(x=x)
    self._X = x
def test_constraint(self):
    """Test for CConstraint.constraint()."""
    self._test_constraint(
        self.c, self.p1_inside, self.p2_outside, self.p3_on)

    # Test for sparse arrays
    self._test_constraint(self.c,
                          self.p1_inside.tosparse(),
                          self.p2_outside.tosparse(),
                          self.p3_on.tosparse())

    # Constraint with one or more inf, error should be raised
    c = CConstraintBox(lb=CArray([0, -inf]), ub=1.5)
    with self.assertRaises(ValueError):
        c.constraint(self.p1_inside)
def test_save_load(self):

    a = CArray([1, 2, 3])  # Dummy test array

    # Generate a temp file path to test
    # (a distinct variable name avoids shadowing the tempfile module)
    import tempfile
    tempfile_path = fm.join(tempfile.gettempdir(), 'secml_testpickle')

    tempfile_path = pickle_utils.save(tempfile_path, a)

    a_loaded = pickle_utils.load(tempfile_path)

    self.assert_array_equal(a_loaded, a)
def test_draw(self):
    """Drawing the loss functions.

    Inspired by: https://en.wikipedia.org/wiki/Loss_functions_for_classification

    """
    fig = CFigure()
    x = CArray.arange(-1, 3.01, 0.01)

    fig.sp.plot(x, CArray([1 if i <= 0 else 0 for i in x]),
                label='0-1 indicator')

    for loss_id in ('hinge', 'hinge-squared', 'square', 'log'):
        self.logger.info("Creating loss: {:}".format(loss_id))
        loss_class = CLoss.create(loss_id)
        fig.sp.plot(x, loss_class.loss(CArray([1]), x), label=loss_id)

    fig.sp.grid()
    fig.sp.legend()

    fig.show()
def test_nonlinear_l1(self):
    """Test evasion of a nonlinear classifier using L1 distance."""

    discrete = False
    eta = 0.1
    sparse = False
    seed = 87985889

    ds, clf = self._prepare_nonlinear_svm(sparse, seed)

    evasion_params = {
        "classifier": clf,
        "surrogate_classifier": clf,
        "surrogate_data": ds,
        "distance": 'l1',
        "dmax": 1.0,
        "lb": -1.0,
        "ub": 1.0,
        "discrete": discrete,
        "attack_classes": CArray([1]),
        "y_target": 0,
        "solver_params": {
            "eta": eta,
            "eta_min": 0.1,
            "eta_max": None
        }
    }

    evas, x0, y0 = self._set_evasion(ds, evasion_params)

    # Expected final optimal point
    expected_x = CArray([-0.19, -0.7967])
    expected_y = 0

    self._run_evasion(evas, x0, y0, expected_x, expected_y)

    self._plot_2d_evasion(evas, ds, x0, 'pgd_ls_nonlinear_L1.pdf')
def test_linear_l2(self):
    """Test evasion of a linear classifier using L2 distance."""

    discrete = False
    eta = 0.5
    sparse = True
    seed = 48574308

    ds, clf = self._prepare_linear_svm(sparse, seed)

    evasion_params = {
        "classifier": clf,
        "surrogate_classifier": clf,
        "surrogate_data": ds,
        "distance": 'l2',
        "dmax": 1.05,
        "lb": -0.67,
        "ub": 0.67,
        "discrete": discrete,
        "attack_classes": CArray([1]),
        "y_target": 0,
        "solver_params": {
            "eta": eta,
            "eta_min": None,
            "eta_max": None
        }
    }

    evas, x0, y0 = self._set_evasion(ds, evasion_params)

    # Expected final optimal point
    expected_x = CArray([0.4463, 0.67])
    expected_y = 0

    self._run_evasion(evas, x0, y0, expected_x, expected_y)

    self._plot_2d_evasion(evas, ds, x0, 'pgd_ls_linear_L2.pdf')
def _performance_score(self, y_true, score):
    """Computes the Area Under the ROC Curve (AUC) using the
    Wilcoxon-Mann-Whitney statistic.

    Parameters
    ----------
    y_true : CArray
        Flat array with true binary labels in range {0, 1}.
    score : CArray
        Flat array with target scores for each pattern, can either be
        probability estimates of the positive class or confidence values.

    Returns
    -------
    metric : float
        Returns metric value as float.

    Notes
    -----
    This implementation is restricted to the binary classification task
    with labels in range {0, 1}.

    """
    if CArray(CArray(y_true != 0).logical_and(y_true != 1)).any():
        raise ValueError("input labels should be binary in 0/1 interval.")

    idxp = y_true.find(y_true == 1)  # indices of positive samples
    idxn = y_true.find(y_true == 0)  # indices of negative samples

    # Count the (positive, negative) pairs ranked correctly by the scores;
    # ties contribute 0.5 each
    auc = 0.0
    for i in idxp:
        for j in idxn:
            if score[i] > score[j]:
                auc += 1.0
            elif score[i] == score[j]:
                auc += 0.5

    return auc / (len(idxp) * len(idxn))
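# Illustrative sketch (not part of the library): the pairwise statistic above
# is the Wilcoxon-Mann-Whitney estimate of the AUC, so on binary labels it
# should match sklearn's roc_auc_score. The labels/scores below are made up.
import numpy as np
from sklearn.metrics import roc_auc_score

labels = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])

pos, neg = scores[labels == 1], scores[labels == 0]
# fraction of (positive, negative) pairs ranked correctly; ties count 0.5
auc_wmw = sum(1.0 if p > n else 0.5 if p == n else 0.0
              for p in pos for n in neg) / (pos.size * neg.size)

assert np.isclose(auc_wmw, roc_auc_score(labels, scores))  # both equal 0.75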
def create_real_sample_from_adv(self, original_file_path: str,
                                x_adv: CArray,
                                new_file_path: str = None) -> bytearray:
    """Create a real adversarial example.

    Parameters
    ----------
    original_file_path : str
        the original malware sample
    x_adv : CArray
        the perturbed malware sample, as created by the optimizer
    new_file_path : str, optional, default None
        the path where to save the adversarial malware. Leave None to
        avoid saving the result to disk

    Returns
    -------
    bytearray
        the adversarial malware, as a string of bytes
    """
    with open(original_file_path, 'rb') as f:
        code = bytearray(f.read())
    original_x = CArray(
        [numpy.frombuffer(code, dtype=numpy.uint8).astype(numpy.uint16)])
    if self.classifier.get_is_shifting_values():
        original_x += self.classifier.get_is_shifting_values()
    x_init, index_to_perturb = self._generate_list_adv_example(original_x)
    x_init = CArray([x_init]).astype(numpy.uint8)
    x_init[0, index_to_perturb] = x_adv[
        0, index_to_perturb] - self.classifier.get_is_shifting_values()
    x_real = x_init[0, :].tolist()[0]
    x_real_adv = b''.join([bytes([i]) for i in x_real])
    if new_file_path:
        with open(new_file_path, 'wb') as f:
            f.write(x_real_adv)
    return x_real_adv
def _sv_margin(self, tol=1e-6):
    """Return the margin support vectors."""
    if self.n_classes > 2:
        raise ValueError("SVM is not binary!")

    assert self.kernel.rv.shape[0] == self.alpha.shape[1]

    alpha = self.alpha.todense()
    s = alpha.find(
        (abs(alpha) >= tol) *
        (abs(alpha) <= self.C - tol))
    if len(s) > 0:
        return self.kernel.rv[s, :], CArray(s)
    else:  # no margin SVs
        return None, None
def _forward(self, x):
    """Compute the linear kernel between x and cached rv.

    Parameters
    ----------
    x : CArray or array_like
        Array of shape (n_x, n_features).

    Returns
    -------
    kernel : CArray
        Kernel between x and cached rv. Array of shape (n_x, n_rv).

    """
    return CArray(x.dot(self.rv.T))
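# Illustrative sketch (not library code): the linear kernel is just the dot
# product between the input samples and the cached reference vectors,
# i.e. K[i, j] = <x_i, rv_j>. A plain NumPy equivalent, with made-up data:
import numpy as np

x = np.array([[1., 2.], [3., 4.]])    # (n_x, n_features)
rv = np.array([[0., 1.], [1., 1.]])   # (n_rv, n_features)
K = x.dot(rv.T)                       # (n_x, n_rv)
assert K.shape == (2, 2) and K[0, 1] == 3.0  # <[1, 2], [1, 1]> = 3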
def test_linear_l1_discrete(self):
    """Test evasion of a linear classifier using L1 distance (discrete)."""

    eta = 0.5
    sparse = True
    seed = 10

    ds, clf = self._prepare_linear_svm(sparse, seed)

    ds = self._discretize_data(ds, eta)

    evasion_params = {
        "classifier": clf,
        "double_init_ds": ds,
        "distance": 'l1',
        "dmax": 2,
        "lb": -1,
        "ub": 1,
        "attack_classes": CArray([1]),
        "y_target": 0,
        "solver_params": {
            "eta": eta,
            "eta_min": None,
            "eta_max": None
        }
    }

    evas, x0, y0 = self._set_evasion(ds, evasion_params)

    # Expected final optimal point
    expected_x = CArray([0.5, -1])
    expected_y = 0

    self._run_evasion(evas, x0, y0, expected_x, expected_y)

    self._plot_2d_evasion(evas, ds, x0, 'pgd_exp_linear_L1_discrete.pdf')
def explain(self, x, y, return_grad=False):
    """Compute influence of test sample x against all training samples.

    Parameters
    ----------
    x : CArray
        Input sample.
    y : int
        Class wrt which the classifier gradient is computed.
    return_grad : bool, optional
        If True, also return the clf gradient computed on x. Default False.

    """
    H = self.hessian(x, y)
    p = H.shape[0]
    H += 1e-9 * CArray.eye(p)  # small ridge to keep the Hessian invertible

    if self._inv_H is None:
        # compute hessian inverse
        det = linalg.det(H.tondarray())
        if abs(det) < 1e-6:
            # fall back to the pseudo-inverse for (near-)singular Hessians
            self._inv_H = CArray(linalg.pinv(H.tondarray()))
        else:
            self._inv_H = CArray(linalg.inv(H.tondarray()))

    x = x.atleast_2d()

    if self._grad_inner_loss_params is None:
        self._grad_inner_loss_params = self.grad_inner_loss_params(
            self.tr_ds.X, self.tr_ds.Y)

    v = self.grad_outer_loss_params(x, y).T.dot(self._inv_H).dot(
        self._grad_inner_loss_params)

    return (v, H) if return_grad is True else v
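# Note (added for clarity): up to sign convention, the returned quantity is
# the influence-function score in the sense of Koh & Liang (2017),
#     v_i = grad_theta L(x, y)^T  H^{-1}  grad_theta L(x_i, y_i)
# for each training sample i, where H is the Hessian of the training
# objective; the 1e-9 * I term above is only a numerical regularizer.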
def test_labelkfold(self):

    ds = CDLRandom(
        n_classes=3, n_samples=10, n_informative=3, random_state=0).load()

    self.logger.info("Testing Label K-Fold")

    kf = CDataSplitterLabelKFold(num_folds=2).compute_indices(ds)

    tr_idx_expected = [CArray([1, 2, 6, 7, 8, 9]), CArray([0, 3, 4, 5])]
    ts_idx_expected = [CArray([0, 3, 4, 5]), CArray([1, 2, 6, 7, 8, 9])]

    self.assertEqual(len(kf.tr_idx), 2)
    self.assertEqual(len(kf.ts_idx), 2)

    for fold_idx in range(kf.num_folds):
        self.logger.info("{:} fold: \nTR {:} {:} \nTS {:} {:}"
                         "".format(fold_idx,
                                   kf.tr_idx[fold_idx],
                                   ds.Y[kf.tr_idx[fold_idx]],
                                   kf.ts_idx[fold_idx],
                                   ds.Y[kf.ts_idx[fold_idx]]))
        self.assert_array_equal(
            tr_idx_expected[fold_idx], kf.tr_idx[fold_idx])
        self.assert_array_equal(
            ts_idx_expected[fold_idx], kf.ts_idx[fold_idx])
def _check_adv_example(self, secml_attack, fb_attack):
    x0_tensor = as_tensor(self.x0.atleast_2d())
    y0_tensor = as_tensor(self.y0.ravel())

    y_target = secml_attack.y_target
    if y_target is None:
        criterion = fb.criteria.Misclassification(y0_tensor)
    else:
        criterion = fb.criteria.TargetedMisclassification(
            torch.tensor([y_target]))

    y_pred, scores, adv_ds, f_obj = secml_attack.run(self.x0, self.y0)
    _, adv_fb, _ = fb_attack(secml_attack.f_model, x0_tensor, criterion,
                             epsilons=secml_attack.epsilon)
    adv_fb = CArray(adv_fb.numpy())
    return adv_ds, adv_fb
def __init__(self, end2end_model: CClassifierEnd2EndMalware,
             indexes_to_perturb: list,
             iterations: int = 100,
             is_debug: bool = False,
             random_init: bool = False,
             threshold: float = 0.5,
             penalty_regularizer: float = 0,
             store_checkpoints: int = None):
    CAttackEvasion.__init__(
        self, end2end_model, end2end_model,
        surrogate_data=CDataset(CArray([[0], [1]]), CArray([0, 1])),
        y_target=None,
    )
    self.iterations = iterations
    self.is_debug = is_debug
    self.indexes_to_perturb = indexes_to_perturb
    self.confidences_ = []
    self.changes_per_iterations_ = []
    self.random_init = random_init
    self.embedding_size = end2end_model.get_embedding_size()
    self.max_input_length = end2end_model.get_input_max_length()
    self.invalid_pos = end2end_model.get_embedding_value()
    self.embedding_value = end2end_model.get_embedding_value()
    self.shift_values = end2end_model.get_is_shifting_values()
    self._invalid_value = torch.tensor([np.inf])
    self.threshold = threshold
    self.penalty_regularizer = penalty_regularizer
    self.store_checkpoints = store_checkpoints
def test_lda(self):
    """Test for LDA. Compares our method against the sklearn equivalent."""

    def sklearn_comp(array, y):
        self.logger.info("Original array is:\n{:}".format(array))

        # Sklearn normalizer
        sklearn_lda = LinearDiscriminantAnalysis().fit(
            array.tondarray(), y.tondarray())
        target = CArray(sklearn_lda.transform(array.tondarray()))

        # Our normalizer
        lda = CLDA().fit(array, y)
        result = lda.forward(array)

        self.logger.info("Sklearn result is:\n{:}".format(target))
        self.logger.info("Result is:\n{:}".format(result))

        self.assert_array_almost_equal(result, target)

    # A min of 2 samples is required by LDA so we cannot test single rows
    sklearn_comp(self.array_dense, CArray([0, 1, 0]))
    sklearn_comp(self.array_sparse, CArray([0, 1, 0]))
    sklearn_comp(self.column_dense, CArray([0, 1, 0]))
    sklearn_comp(self.column_sparse, CArray([0, 1, 0]))
def refine_roc(fpr, tpr, th):
    """Function to ensure the bounds of a ROC.

    The first and last points should be (0,0) and (1,1) respectively.

    Parameters
    ----------
    fpr : CArray
        False Positive Rates, as returned by `.BaseRoc.compute()`.
    tpr : CArray
        True Positive Rates, as returned by `.BaseRoc.compute()`.
    th : CArray
        Thresholds, as returned by `.BaseRoc.compute()`.

    """
    if tpr[0] != fpr[0] or tpr[0] != 0 or fpr[0] != 0:
        fpr = CArray(0).append(fpr)
        tpr = CArray(0).append(tpr)
        th = CArray(th[0] + 1e-3).append(th)
    if tpr[-1] != fpr[-1] or tpr[-1] != 1 or fpr[-1] != 1:
        fpr = fpr.append(1)
        tpr = tpr.append(1)
        th = th.append(th[-1] - 1e-3)
    return fpr, tpr, th
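# Illustrative usage (made-up values, assuming refine_roc above is in scope):
# a ROC that does not start at (0,0) or end at (1,1) gets the missing
# endpoints added, with thresholds extended slightly beyond the original range.
from secml.array import CArray

fpr = CArray([0.2, 0.6])
tpr = CArray([0.5, 0.9])
th = CArray([0.7, 0.3])
fpr, tpr, th = refine_roc(fpr, tpr, th)
# fpr -> [0, 0.2, 0.6, 1], tpr -> [0, 0.5, 0.9, 1], th -> [0.701, 0.7, 0.3, 0.299]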
def _compare_scalers(self, scaler, scaler_sklearn,
                     array, convert_to_dense=False):
    """Compare wrapped scikit-learn scaler to the unwrapped scaler.

    Parameters
    ----------
    array : CArray
    scaler : A wrapped CScaler
    scaler_sklearn
        Scikit-learn normalizer (from `sklearn.preprocessing`).
    convert_to_dense : bool, optional
        If True the data used by the SkLearn scaler will be converted to dense.

    Returns
    -------
    scaler_sklearn
        Trained Scikit-learn normalizer (from `sklearn.preprocessing`).
    scaler : CScaler
        Trained normalizer.

    """
    scaler, scaler_sklearn = \
        super(TestCScalerMinMax, self)._compare_scalers(scaler,
                                                        scaler_sklearn,
                                                        array,
                                                        convert_to_dense)

    self.logger.info("Testing out of range normalization")

    array_sk = array.get_data() if convert_to_dense is False \
        else array.tondarray()

    # Sklearn normalizer (requires float dtype input)
    transform_sklearn = CArray(scaler_sklearn.transform(array_sk * 2))

    # Our normalizer
    transform = scaler.forward(array * 2)

    self.logger.info("Correct result is:\n{:}".format(transform_sklearn))
    self.logger.info("Our result is:\n{:}".format(transform))

    self.assert_array_almost_equal(transform_sklearn, transform)

    return scaler, scaler_sklearn
def sklearn_comp(array, norm_type):

    self.logger.info("Norm type: {:}".format(norm_type))
    self.logger.info("Original array is: {:}".format(array))

    # Sklearn normalizer (requires float dtype input)
    target = CArray(
        Normalizer(norm=norm_type).fit_transform(
            array.astype(float).get_data()))

    # Create our normalizer
    result = CNormalizerUnitNorm(norm=norm_type).fit_transform(array)

    self.logger.info("Correct result is:\n{:}".format(target))
    self.logger.info("Our result is:\n{:}".format(result))

    self.assert_array_almost_equal(target, result)
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_blobs
    patterns = make_blobs(n_samples=self.n_samples,
                          n_features=2,
                          centers=self.centers,
                          cluster_std=self.cluster_std,
                          random_state=self.random_state)[0]
    return CDataset(patterns, self._dts_function(CArray(patterns)))
def _expand_std(self, n_feats):
    """Expand std value to all dimensions."""
    if self.with_std is False:  # set std to 1.
        self._std = CArray(1.0)  # we just need a scalar value.
    else:
        n_channels = len(self._in_std)
        if not n_feats % n_channels == 0:
            raise ValueError("input number of features must be "
                             "divisible by {:}".format(n_channels))
        channel_size = int(n_feats / n_channels)
        self._std = CArray.ones(shape=(n_feats,))
        for i in range(n_channels):
            self._std[i * channel_size:
                      i * channel_size + channel_size] *= self._in_std[i]
    return self._std
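# Illustrative sketch (made-up values): a per-channel std such as
# (0.229, 0.224, 0.225) is tiled so that each contiguous block of
# n_feats / n_channels features shares its channel's std.
import numpy as np

in_std = (0.229, 0.224, 0.225)
n_feats = 6  # must be divisible by the number of channels
channel_size = n_feats // len(in_std)
std = np.ones(n_feats)
for i, s in enumerate(in_std):
    std[i * channel_size:(i + 1) * channel_size] *= s
# std -> [0.229, 0.229, 0.224, 0.224, 0.225, 0.225]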
def _rnd_init_poisoning_points(
        self, n_points=None, init_from_val=False, val=None):
    """Returns a random set of poisoning points with flipped labels."""
    if init_from_val:
        if val:
            init_dataset = val
        else:
            init_dataset = self.val
    else:
        init_dataset = self.surrogate_data

    if init_dataset is None:
        raise ValueError("Surrogate data not set!")

    if (self._n_points is None or self._n_points == 0) and (
            n_points is None or n_points == 0):
        raise ValueError("Number of poisoning points (n_points) not set!")

    if n_points is None:
        n_points = self.n_points

    idx = CArray.randsample(init_dataset.num_samples, n_points,
                            random_state=self.random_seed)
    xc = init_dataset.X[idx, :].deepcopy()

    if not self.discrete:
        # if the attack is in a continuous space we add a
        # little perturbation to the initial poisoning point
        random_noise = CArray.rand(shape=xc.shape,
                                   random_state=self.random_seed)
        xc += 1e-3 * (2 * random_noise - 1)
    else:
        xc = self.add_discrete_perturbation(xc)

    yc = CArray(init_dataset.Y[idx]).deepcopy()  # true labels

    # randomly pick yc from a different class
    for i in range(yc.size):
        labels = CArray.randsample(init_dataset.num_classes, 2,
                                   random_state=self.random_seed)
        if yc[i] == labels[0]:
            yc[i] = labels[1]
        else:
            yc[i] = labels[0]

    return xc, yc
def compute(self, y_true, score, positive_label=None):
    """Compute TPR/FPR for classifier output.

    Parameters
    ----------
    y_true : CArray
        Flat array with true binary labels in range {0, 1}
        for each pattern or a single array.
        If labels are not binary, positive_label should be explicitly given.
    score : CArray
        Flat array with target scores for each pattern, can either be
        probability estimates of the positive class or confidence values.
    positive_label : int, optional
        Label to consider as positive (others are considered negative).

    Returns
    -------
    single_roc : CBaseRoc
        Instance of the roc curve (tpr, fpr, th).

    """
    th = score.unique()  # unique also sorts the values

    n = CArray(score[y_true == 0])
    p = CArray(score[y_true == 1])

    # Counting the fpr and the tpr
    fp_list = []
    tp_list = []
    for i in range(th.size):
        fp_i = (n >= th[i]).sum() if n.size != 0 else 0
        tp_i = (p >= th[i]).sum() if p.size != 0 else 0
        fp_list.append(fp_i)
        tp_list.append(tp_i)

    # Returning increasing fpr, tpr...
    fp_list.reverse()
    tp_list.reverse()
    # ...and th accordingly (decreasing)
    th = CArray(th[::-1])

    # Normalizing in 0-1
    fpr = CArray(fp_list) / n.size if n.size != 0 else CArray([0])
    tpr = CArray(tp_list) / p.size if p.size != 0 else CArray([0])

    # Ensure first and last points are (0,0) and (1,1) respectively
    self._fpr, self._tpr, self._th = refine_roc(fpr, tpr, th)

    return self
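# Worked example (made-up values, added for clarity): with labels
# [0, 0, 1, 1] and scores [0.1, 0.4, 0.35, 0.8], the sorted unique
# thresholds are [0.1, 0.35, 0.4, 0.8]. Counting, at each threshold,
# the negatives/positives with score >= threshold and then reversing
# and normalizing gives
#   th  = [0.8, 0.4, 0.35, 0.1]
#   fpr = [0.0, 0.5, 0.5, 1.0]
#   tpr = [0.5, 0.5, 1.0, 1.0]
# refine_roc() then prepends the missing (0, 0) point (the last point
# is already (1, 1)), so the final curve starts at fpr=0, tpr=0.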
def _backward(self, w=None):
    """Calculate Polynomial kernel gradient wrt cached vector 'x'.

    The gradient of Polynomial kernel is given by::

        dK(rv,x)/dx = rv * gamma * degree * k(rv,x, degree-1)

    Parameters
    ----------
    w : CArray of shape (1, n_rv) or None
        if CArray, it is pre-multiplied to the gradient of the module,
        as in standard reverse-mode autodiff.

    Returns
    -------
    kernel_gradient : CArray
        Kernel gradient of rv with respect to vector x,
        shape (n_rv, n_features) if n_rv > 1 and w is None,
        else (1, n_features).

    """
    # Checking if cached x is a vector
    if not self._cached_x.is_vector_like:
        raise ValueError(
            "kernel gradient can be computed only wrt vector-like arrays.")

    if self._rv is None:
        raise ValueError("Please run forward with caching=True or set "
                         "`rv` first.")

    k = CArray(metrics.pairwise.polynomial_kernel(
        self._rv.get_data(), self._cached_x.get_data(),
        self.degree - 1, self.gamma, self.coef0))

    # Format of output array should be the same as cached x
    if self._cached_x.issparse:
        rv = self._rv.tosparse()
        # Casting the kernel to sparse for efficient broadcasting
        k = k.tosparse()
    else:
        rv = self._rv.todense()

    grad = rv * k * self.gamma * self.degree

    return grad if w is None else w.dot(grad)
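# Illustrative finite-difference check (not library code): for the polynomial
# kernel K(v, x) = (gamma * <v, x> + coef0)^degree, the gradient w.r.t. x is
# degree * gamma * v * (gamma * <v, x> + coef0)^(degree - 1), which is the
# quantity assembled above. The vectors and hyperparameters below are made up.
import numpy as np
from sklearn.metrics.pairwise import polynomial_kernel

gamma, coef0, degree = 0.5, 1.0, 3
v = np.array([[1.0, -2.0, 0.5]])
x = np.array([[0.3, 0.7, -1.2]])

analytic = degree * gamma * v * polynomial_kernel(v, x, degree - 1, gamma, coef0)

eps = 1e-6
numeric = np.zeros_like(x)
for j in range(x.shape[1]):
    dx = np.zeros_like(x)
    dx[0, j] = eps
    kp = polynomial_kernel(v, x + dx, degree, gamma, coef0)[0, 0]
    km = polynomial_kernel(v, x - dx, degree, gamma, coef0)[0, 0]
    numeric[0, j] = (kp - km) / (2 * eps)  # central difference

assert np.allclose(analytic, numeric, atol=1e-5)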
def gradient_ndarray(self, x, *args, **kwargs):
    """Evaluates gradient of function at point x (ndarray).

    Parameters
    ----------
    x : ndarray
        Argument of gradient.
    args, kwargs
        Other optional parameters of the function.

    Returns
    -------
    out_grad : ndarray
        Array with gradient output.

    """
    return self.gradient(CArray(x), *args, **kwargs).tondarray()
def compute_black_box_optimization(self, x: CArray) -> CArray:
    self.problem.init_starting_point(x)
    seed = self.problem.seed if self.problem.seed is not None else 0
    algorithm = pygmo.algorithm(
        pygmo.sga(gen=self.problem.iterations, seed=seed))
    pygmo_problem = pygmo.problem(self.problem)
    # reuse the sanitized seed so that pygmo never receives None
    pygmo_population = pygmo.population(pygmo_problem,
                                        size=self.problem.population_size,
                                        seed=seed)
    minimization_results = algorithm.evolve(pygmo_population)
    best_x = minimization_results.get_x()[minimization_results.best_idx()]
    evolved_problem = minimization_results.problem.extract(type(self.problem))
    confidences, fitness = evolved_problem.export_internal_results()
    self.confidences_ = confidences
    self.fitness = fitness
    return CArray(best_x)
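# Illustrative sketch (not library code): the same pygmo pattern on a
# built-in benchmark problem, to show how the pieces fit together.
# Requires pygmo; the generation count, population size and seed are made up.
import pygmo

algo = pygmo.algorithm(pygmo.sga(gen=50, seed=0))   # simple genetic algorithm
prob = pygmo.problem(pygmo.rosenbrock(dim=5))       # user-defined problems work too
pop = pygmo.population(prob, size=20, seed=0)
pop = algo.evolve(pop)
best_x = pop.get_x()[pop.best_idx()]                # best decision vector found
best_f = pop.get_f()[pop.best_idx()]                # and its fitness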