def _test_rule(self, rule, n_prototypes=20, random_state=None):
    """Generic test case for prototype selectors."""
    self.logger.info("Testing: " + rule + " selector.")
    ps = CPrototypesSelector.create(rule)
    ps.verbose = 2
    if random_state is None:
        ds_reduced = ps.select(self.dataset, n_prototypes=n_prototypes)
    else:
        ds_reduced = ps.select(self.dataset, n_prototypes=n_prototypes,
                               random_state=random_state)
    if self.plots is True:
        self.draw_selection(ds_reduced, rule)
    idx_path = fm.join(fm.abspath(__file__), "idx_{:}.gz".format(rule))
    self.assert_array_equal(
        ps.sel_idx, CArray.load(idx_path, dtype=int).ravel())
def _compare_scalers(self, scaler, scaler_sklearn,
                     array, convert_to_dense=False):
    """Compare a wrapped scikit-learn scaler to the unwrapped one.

    Parameters
    ----------
    scaler : CScaler
        The wrapped scaler to test.
    scaler_sklearn
        Scikit-learn scaler (from `sklearn.preprocessing`).
    array : CArray
        Data used to fit and transform.
    convert_to_dense : bool, optional
        If True, the data used by the scikit-learn scaler
        will be converted to dense.

    Returns
    -------
    scaler : CScaler
        Trained scaler.
    scaler_sklearn
        Trained scikit-learn scaler.

    """
    self.logger.info("Original array is:\n{:}".format(array))
    array_sk = array.get_data() if convert_to_dense is False \
        else array.tondarray()

    # Sklearn scaler
    scaler_sklearn.fit(array_sk, None)
    transform_sklearn = CArray(scaler_sklearn.transform(array_sk))

    # Our scaler
    scaler._fit(array)
    transform = scaler.forward(array)

    self.logger.info("sklearn result is:\n{:}".format(transform_sklearn))
    self.logger.info("Our result is:\n{:}".format(transform))

    self.assert_array_almost_equal(transform_sklearn, transform)

    return scaler, scaler_sklearn
def _backward(self, w=None): """Compute the gradient w.r.t. the input cached during the forward pass. Parameters ---------- w : CArray or None, optional If CArray, will be left-multiplied to the gradient of the preprocessor. Returns ------- gradient : CArray Gradient of the normalizer wrt input data. it will have dimensionality shape (w.shape[0], x.shape[1]) if `w` is passed as input (x.shape[1], x.shape[1]) otherwise. """ x = self._cached_x if x.shape[0] > 1: raise ValueError("Parameter 'x' passed to the forward() method " "needs to be a one dimensional vector " "(passed a {:} dimensional vector)".format( x.ndim)) d = self._cached_x.size # get the number of features if w is not None: if (w.ndim != 1) or (w.size != d): raise ValueError("Parameter 'w' needs to be a one dimensional " "vector with the same number of elements " "of parameter 'x' of the forward method " "(passed a {:} dimensional vector with {:} " "elements)".format(w.ndim, w.size)) # compute the norm of x: ||x|| x_norm = self._compute_x_norm(x) # compute the gradient of the given norm: d||x||/dx grad_norm_x = self._compute_norm_gradient(x, x_norm) # this is the derivative of the ratio x/||x|| grad = CArray.eye(d, d) * x_norm.item() - grad_norm_x.T.dot(x) grad /= (x_norm**2) return grad if w is None else w.dot(grad)
def _backward(self, w=None): """Calculate Histogram Intersection kernel gradient wrt cached vector 'x'. The kernel is computed between each row of rv (denoted with rk) and x, as:: sum_i ( min(rk[i], x[i]) ) The gradient computed w.r.t. x is thus 1 if x[i] < rk[i], and 0 elsewhere. Parameters ---------- w : CArray of shape (1, n_rv) or None if CArray, it is pre-multiplied to the gradient of the module, as in standard reverse-mode autodiff. Returns ------- kernel_gradient : CArray Kernel gradient of rv with respect to vector x, shape (n_rv, n_features) if n_rv > 1 and w is None, else (1, n_features). """ # Checking if cached x is a vector if not self._cached_x.is_vector_like: raise ValueError( "kernel gradient can be computed only wrt vector-like arrays.") if self._rv is None: raise ValueError("Please run forward with caching=True or set" "`rv` first.") if self._cached_x.issparse is True: # Broadcasting not supported for sparse arrays x_broadcast = self._cached_x.repmat(self._rv.shape[0], 1) else: # Broadcasting is supported by design for dense arrays x_broadcast = self._cached_x grad = CArray.zeros(shape=self._rv.shape, sparse=self._cached_x.issparse) grad[x_broadcast < self._rv] = 1 # TODO support from CArray still missing return grad if w is None else w.dot(grad)
def setUp(self):
    self.classifier = CClassifierSVM(kernel='linear', C=1.0)

    self.lb = -2
    self.ub = +2

    n_tr = 20
    n_ts = 10
    n_features = 2

    n_reps = 1

    self.sec_eval = []
    self.attack_ds = []
    for rep_i in range(n_reps):
        self.logger.info(
            "Loading `random_blobs` with seed: {:}".format(rep_i))
        loader = CDLRandomBlobs(
            n_samples=n_tr + n_ts,
            n_features=n_features,
            centers=[(-0.5, -0.5), (+0.5, +0.5)],
            center_box=(-0.5, 0.5),
            cluster_std=0.5,
            random_state=rep_i * 100 + 10)
        ds = loader.load()

        self.tr = ds[:n_tr, :]
        self.ts = ds[n_tr:, :]

        self.classifier.fit(self.tr.X, self.tr.Y)

        # only manipulate positive samples, targeting negative ones
        self.y_target = None
        self.attack_classes = CArray([1])

        for create_fn in (self._attack_pgd_ls, self._attack_cleverhans):
            self.attack_ds.append(self.ts)
            attack, param_name, param_values = create_fn()
            # set sec eval object
            self.sec_eval.append(
                CSecEval(
                    attack=attack,
                    param_name=param_name,
                    param_values=param_values,
                ))
def setUp(self):
    self.max_length = 2 ** 20
    self.padding_value = 256
    self.root_module_path = os.path.dirname(
        os.path.dirname(os.path.dirname(__file__)))
    self.classifier = CClassifierEnd2EndMalware(MalConv())
    self.malconv_plus = CClassifierEnd2EndMalware(MalConv(),
                                                  plus_version=True)
    self.surrogate_classifier = CClassifierEnd2EndMalware(MalConv())
    self.ember_path = os.path.join(
        self.root_module_path, "../data/trained/pretrained_malconv.pth")
    self.surrogate_path = os.path.join(
        self.root_module_path, "../data/trained/pretrained_malconv.pth")
    self.malware_folder = os.path.join(
        self.root_module_path, "../data/malware_samples/test_folder")
    self.goodware_folder = os.path.join(
        self.root_module_path, "../data/goodware_samples/")
    self.single_malware_path = os.path.join(
        self.root_module_path, "../data/malware_samples/test_malware")
    self.baseline = np.array(
        [np.zeros(self.max_length) + self.padding_value])
    X = []
    y = []
    for f in listdir(self.malware_folder):
        complete_path = os.path.join(self.malware_folder, f)
        if not os.path.isfile(complete_path):
            continue
        if "PE32" not in magic.from_file(complete_path):
            continue
        with open(complete_path, "rb") as malware:
            print(f'>Using {f}')
            code = MalConv.bytes_to_numpy(
                malware.read(), self.max_length, 256, False)
            X.append(code)
            y.append(1)
    X.append(self.baseline[0])
    y.append(0)
    self.X = CArray(X)
    self.Y = y
    with open(self.single_malware_path, "rb") as f:
        self.byte_malware = bytearray(f.read())
    self.malware = np.array([
        MalConv.bytes_to_numpy(
            self.byte_malware, self.max_length, 256, False)
    ])
def compute_performance(self, estimator, dataset):
    """Split data in folds and return the mean estimator performance.

    Parameters
    ----------
    estimator : CClassifier
        The classifier to evaluate.
    dataset : CDataset
        Dataset used to evaluate the classifier.

    Returns
    -------
    score : float
        Mean performance score of the estimator computed on the K folds.

    """
    # Placeholder for the folds' scores
    fold_number = len(self.splitter.tr_idx)
    splits_score = CArray.zeros(fold_number)

    # estimate the performance of the estimator on each fold
    for split_idx in range(fold_number):
        train_dataset = dataset[self.splitter.tr_idx[split_idx], :]
        test_dataset = dataset[self.splitter.ts_idx[split_idx], :]

        # Train the estimator
        estimator.fit(train_dataset)

        pred_label, pred_score = estimator.predict(
            test_dataset.X, return_decision_function=True)

        if dataset.num_classes > 2:
            pred_score = None  # Score cannot be used in multiclass case
        else:
            # Extracting the score of the positive class
            pred_score = pred_score[:, 1].ravel()

        this_test_score = self.metric.performance_score(
            test_dataset.Y, y_pred=pred_label, score=pred_score)
        splits_score[split_idx] = this_test_score

    return splits_score.mean()
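# A minimal standalone usage sketch of the same K-fold loop, assuming
# SecML's CDataSplitterKFold / CMetricAccuracy API (class names and
# signatures as I recall them; verify against your SecML version).
from secml.array import CArray
from secml.data.loader import CDLRandom
from secml.data.splitter import CDataSplitterKFold
from secml.ml.classifiers import CClassifierSVM
from secml.ml.peval.metrics import CMetricAccuracy

ds = CDLRandom(n_samples=100, n_features=2, random_state=0).load()
splitter = CDataSplitterKFold(num_folds=3, random_state=0)
splitter.compute_indices(ds)
metric = CMetricAccuracy()

scores = CArray.zeros(3)
for k in range(3):
    tr, ts = ds[splitter.tr_idx[k], :], ds[splitter.ts_idx[k], :]
    clf = CClassifierSVM().fit(tr.X, tr.Y)
    scores[k] = metric.performance_score(ts.Y, clf.predict(ts.X))
print(scores.mean())  # mean accuracy over the 3 folds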
def test_minimize_beale(self):
    """Test for COptimizer.minimize() method on the beale function.

    This test optimizes in discrete space, with a floating-point eta
    (l1 constraint) and an integer starting point.
    The solution expected by this test is a float vector.

    """
    opt_params = {
        'eta': 1e-6, 'eta_min': 1e-4, 'eps': 1e-12,
        'constr': CConstraintL1(center=CArray([2, 0]), radius=2),
        'bounds': CConstraintBox(lb=0, ub=4)
    }

    self._test_minimize(
        COptimizerPGDLS, 'beale',
        opt_params=opt_params,
        label='discrete-l1')
def sv_margin_idx(self, tol=1e-6):
    """Indices of Margin Support Vectors.

    Parameters
    ----------
    tol : float
        Alpha value threshold for considering a Support Vector
        on the margin.

    Returns
    -------
    indices : CArray
        Flat array with the indices of the Margin Support Vectors.

    """
    s = self.alpha.find(
        (abs(self.alpha) >= tol) * (abs(self.alpha) <= self.C - tol))
    return CArray(s)
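# Usage sketch (illustrative): margin SVs are those with tol <= |alpha_i|
# <= C - tol. Assumes a SecML CClassifierSVM exposing `alpha` and `C` as
# above; the fit signature may differ across SecML versions.
from secml.data.loader import CDLRandom
from secml.ml.classifiers import CClassifierSVM

ds = CDLRandom(n_samples=50, n_features=2, random_state=0).load()
svm = CClassifierSVM(C=1.0)
svm.fit(ds.X, ds.Y)
print(svm.sv_margin_idx())  # flat CArray of on-margin SV indices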
def apply_feasible_manipulations(self, t, x: CArray) -> CArray:
    """Apply the padding practical manipulation on the input sample.

    Parameters
    ----------
    t : CArray
        The vector of manipulations in [0, 1].
    x : CArray
        The input space sample to perturb.

    Returns
    -------
    CArray
        The adversarial malware.

    """
    byte_values = (t * 255).astype(int)  # np.int is removed in NumPy >= 1.24
    x_adv = x.append(byte_values)
    return x_adv
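# Side note (illustrative, plain numpy): `astype(int)` truncates, so only
# t == 1.0 maps to byte 255; rounding first distributes values evenly
# across [0, 255]. Which behavior is intended is a design choice.
import numpy as np

t = np.array([0.0, 0.5, 0.999, 1.0])
print((t * 255).astype(int))         # [  0 127 254 255]  (truncation)
print(np.rint(t * 255).astype(int))  # [  0 128 255 255]  (rounding)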
def test_save_load_sparse_conversion(self):
    """Test save/load of CArray"""
    # Array should be stored and loaded correctly whatever
    # the internal sparse format
    self.array_sparse._data._data = self.array_sparse._data.todok()
    self.array_sparse.save(self.test_file)

    # Saving to a file handle is not supported for sparse arrays
    with self.assertRaises(NotImplementedError):
        with open(self.test_file_2, 'w') as f:
            self.array_sparse.save(f)

    loaded_array_sparse = CArray.load(
        self.test_file, arrayformat='sparse', dtype=int)

    self.assertFalse((loaded_array_sparse != self.array_sparse).any(),
                     "Saved and loaded arrays (sparse) are not equal!")
def sklearn_comp(array):
    self.logger.info("Original array is:\n{:}".format(array))

    # Sklearn normalizer
    sklearn_pca = PCA().fit(array.tondarray())
    target = CArray(sklearn_pca.transform(array.tondarray()))

    # Our normalizer
    pca = CPCA().fit(array)
    result = pca.transform(array)

    self.logger.info("Sklearn result is:\n{:}".format(target))
    self.logger.info("Result is:\n{:}".format(result))

    self.assert_array_almost_equal(result, target)

    original = pca.inverse_transform(result)
    self.assert_array_almost_equal(original, array)
def _grad(self, x): """McCormick function gradient wrt. point x.""" x = x.atleast_2d() if x.shape[1] != 2: raise ValueError("Gradient of McCormick function " "only available for 2 dimensions") # Computing gradient of each dimension grad1_1 = (x[0] + x[1]).cos() grad1_2 = 2 * (x[0] - x[1]) grad1_3 = -1.5 grad2_1 = (x[0] + x[1]).cos() grad2_2 = -2 * (x[0] - x[1]) grad2_3 = 2.5 grad1 = grad1_1 + grad1_2 + grad1_3 grad2 = grad2_1 + grad2_2 + grad2_3 return CArray.concatenate(grad1, grad2, axis=1).ravel()
def test_save_load_sparse(self):
    """Test save/load of CArray"""
    self.logger.info(
        "UNITTEST - CArray - Testing save/load for sparse matrix")

    self.array_sparse.save(self.test_file)

    # Saving to a file handle is not supported for sparse arrays
    with self.assertRaises(NotImplementedError):
        with open(self.test_file_2, 'w') as f:
            self.array_sparse.save(f)

    loaded_array_sparse = CArray.load(
        self.test_file, arrayformat='sparse', dtype=int)

    self.assertFalse((loaded_array_sparse != self.array_sparse).any(),
                     "Saved and loaded arrays (sparse) are not equal!")
def fun_ndarray(self, x, *args, **kwargs):
    """Evaluates function on x (ndarray).

    Parameters
    ----------
    x : np.ndarray
        Argument of fun as ndarray.
    args, kwargs
        Other optional parameters of the function.

    Returns
    -------
    out_fun : scalar or CArray
        Function output, scalar or CArray depending
        on the inner function.

    """
    return self.fun(CArray(x), *args, **kwargs)
def predict(args):
    if global_state.target is None:
        error_prompt('First you need to set a target.')
        return
    if args.path is None:
        if global_state.data_paths is None:
            error_prompt('You have to give an input path.')
            return
        paths = global_state.data_paths
    elif not os.path.isfile(args.path):
        error_prompt(f'{args.path} does not exist.')
        return
    else:
        paths = [args.path]
    net = create_wrapper_for_global_target()
    stats = {
        'detected': 0,
        'total': 0,
        'confidence': 0,
    }
    for p in paths:
        with open(p, 'rb') as handle:
            code = handle.read()
        info_prompt(f'Computing prediction for {p}')
        code = CArray(np.frombuffer(code, dtype=np.uint8)).atleast_2d()
        y_pred, confidence = net.predict(code, return_decision_function=True)
        y_pred = y_pred.item()
        score = confidence[0, 1].item()
        stats['detected'] += int(y_pred != 0)
        stats['total'] += 1
        stats['confidence'] += score
        info_prompt(f'predicted label: {y_pred}')
        info_prompt(f'confidence: {score}')
        print('-' * 20)
    if stats['total'] >= 1:
        separator_prompt()
        success_prompt('Prediction stats:')
        success_prompt(f'Detected: {stats["detected"]} / {stats["total"]}')
        success_prompt(
            f'Detection rate: {stats["detected"] / stats["total"] * 100} %')
        success_prompt(
            f'Mean confidence: {stats["confidence"] / stats["total"]}')
def refine_roc(fpr, tpr, th):
    """Function to ensure the bounds of a ROC.

    The first and last points should be (0, 0) and (1, 1) respectively.

    Parameters
    ----------
    fpr : CArray
        False Positive Rates, as returned by `.BaseRoc.compute()`.
    tpr : CArray
        True Positive Rates, as returned by `.BaseRoc.compute()`.
    th : CArray
        Thresholds, as returned by `.BaseRoc.compute()`.

    """
    if tpr[0] != fpr[0] or tpr[0] != 0 or fpr[0] != 0:
        fpr = CArray(0).append(fpr)
        tpr = CArray(0).append(tpr)
        th = CArray(th[0] + 1e-3).append(th)
    if tpr[-1] != fpr[-1] or tpr[-1] != 1 or fpr[-1] != 1:
        fpr = fpr.append(1)
        tpr = tpr.append(1)
        th = th.append(th[-1] - 1e-3)
    return fpr, tpr, th
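# Usage sketch (illustrative): a ROC missing its endpoints gets (0, 0)
# prepended and (1, 1) appended, with thresholds extrapolated by 1e-3.
from secml.array import CArray

fpr = CArray([0.1, 0.4, 0.9])
tpr = CArray([0.2, 0.6, 0.95])
th = CArray([0.8, 0.5, 0.1])
fpr, tpr, th = refine_roc(fpr, tpr, th)
print(fpr)  # 0, 0.1, 0.4, 0.9, 1
print(th)   # 0.801, 0.8, 0.5, 0.1, 0.099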
def extract_features(self, x):
    """Crop and pad the input sample for being passed to the network.

    Parameters
    ----------
    x : CArray
        The sample in the input space.

    Returns
    -------
    CArray
        The feature space representation of the input sample.

    """
    clf: CClassifierEnd2EndMalware = self.classifier
    padded_x = CArray.zeros(
        (1, clf.get_input_max_length())) + clf.get_embedding_value()
    length = min(x.shape[-1], clf.get_input_max_length())
    padded_x[0, :length] = x[0, :length] + clf.get_is_shifting_values()
    return padded_x
def logpdf(self, data):
    """Log of the probability density function.

    Parameters
    ----------
    data : CArray
        Quantiles, with the last axis denoting the components.

    Returns
    -------
    logpdf : CArray
        Log of the probability density function
        computed at the input data.

    """
    cov = self.cov
    if isinstance(cov, CArray):
        cov = cov.tondarray()
    return CArray(multivariate_normal.logpdf(
        data.tondarray(), self.mean, cov))
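# Quick cross-check sketch (illustrative): the wrapper above should agree
# with calling scipy directly. For the standard bivariate normal, the log
# pdf at the origin is -log(2*pi) ~= -1.8379.
import numpy as np
from scipy.stats import multivariate_normal

mean, cov = np.zeros(2), np.eye(2)
x = np.array([[0.0, 0.0], [1.0, 1.0]])
print(multivariate_normal.logpdf(x, mean, cov))
# [-1.83787707 -2.83787707]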
def objective_function_gradient(self, x):
    """Compute the gradient of the evasion objective function.

    Parameters
    ----------
    x : CArray
        A single point.

    """
    y_pred, scores = self.classifier.predict(
        x, return_decision_function=True)
    k, c = self._find_k_c(y_pred, scores)
    w = CArray.zeros(shape=(self.classifier.n_classes,))
    w[k.item()] = 1
    w[c.item()] = -1
    grad = self.classifier.gradient(x, w)
    return grad if self.y_target is None else -grad
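# The `w` built above encodes a vector-Jacobian product: with w[k] = +1
# and w[c] = -1, w.dot(J) is the gradient of the score difference
# score_k - score_c, obtained in a single backward pass. A toy numpy
# illustration of the identity (not library code):
import numpy as np

jacobian = np.array([[1.0, 2.0],   # d score_0 / dx
                     [3.0, 5.0],   # d score_1 / dx
                     [0.5, 1.0]])  # d score_2 / dx
k, c = 1, 2
w = np.zeros(3)
w[k], w[c] = 1, -1
assert np.allclose(w.dot(jacobian), jacobian[k] - jacobian[c])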
def _grad(self, x): """Beale function gradient wrt. point x.""" x = x.atleast_2d() if x.shape[1] != 2: raise ValueError("Gradient of Beale function " "only available for 2 dimensions") # Computing gradient of each dimension grad1_1 = 2 * (1.5 - x[0] + x[0] * x[1]) * (-1 + x[1]) grad1_2 = 2 * (2.25 - x[0] + x[0] * x[1]**2) * (-1 + x[1]**2) grad1_3 = 2 * (2.625 - x[0] + x[0] * x[1]**3) * (-1 + x[1]**3) grad2_1 = 2 * (1.5 - x[0] + x[0] * x[1]) * x[0] grad2_2 = 2 * (2.25 - x[0] + x[0] * x[1]**2) * (2 * x[0] * x[1]) grad2_3 = 2 * (2.625 - x[0] + x[0] * x[1] ** 3) * \ (3 * x[0] * x[1] ** 2) grad1 = grad1_1 + grad1_2 + grad1_3 grad2 = grad2_1 + grad2_2 + grad2_3 return CArray.concatenate(grad1, grad2, axis=1).ravel()
def _fprop_fn(self, x_np):
    """Numpy function that computes and returns the output of the model.

    Parameters
    ----------
    x_np : np.ndarray
        The input that should be classified by the model.

    Returns
    -------
    scores : np.ndarray
        The scores given as output by the classifier.

    """
    # compute the scores (the model output)
    f_x = self.fun.fun
    scores = f_x(CArray(x_np)).atleast_2d().tondarray().astype(np.float32)
    return scores
def _xk(self, x, fx, *args):
    """Returns a new point after gradient descent."""

    # compute gradient
    grad = self._fun.gradient(x, *args)
    self._grad = grad  # only used for visualization/convergence

    norm = grad.norm()
    if norm < 1e-20:
        return x, fx  # return same point (and exit optimization)

    grad = grad / norm

    # filter modifications that would violate bounds (to sparsify gradient)
    grad = self._box_projected_gradient(x, grad)

    if self.discrete or (
            self.constr is not None and self.constr.class_type == 'l1'):
        # project z onto l1 constraint (via dual norm)
        grad = self._l1_projected_gradient(grad)

    next_point = x - grad * self._line_search.eta

    if self.constr is not None and self.constr.is_violated(next_point):
        self.logger.debug("Line-search on distance constraint.")
        grad = CArray(x - self.constr.projection(next_point))
        grad_norm = grad.norm(order=2)
        if grad_norm > 1e-20:
            grad /= grad_norm
        if self.constr.class_type == 'l1':
            grad = grad.sign()  # to move along the l1 ball surface
        z, fz = self._line_search.minimize(x, -grad, fx)
        return z, fz

    if self.bounds is not None and self.bounds.is_violated(next_point):
        self.logger.debug("Line-search on box constraint.")
        grad = CArray(x - self.bounds.projection(next_point))
        grad_norm = grad.norm(order=2)
        if grad_norm > 1e-20:
            grad /= grad_norm
        z, fz = self._line_search.minimize(x, -grad, fx)
        return z, fz

    z, fz = self._line_search.minimize(x, -grad, fx)
    return z, fz
def _get_tr_without_point(self, p_idx):
    """Return a copy of the training dataset without the given point.

    Parameters
    ----------
    p_idx : int
        Index of the point to exclude from the training dataset.

    Returns
    -------
    new_tr : CDataset
        Training dataset without the point with the given index.

    """
    all_idx = CArray.arange(self._tr.num_samples)
    not_p_idx = all_idx.find(all_idx != p_idx)
    new_tr = self._tr[not_p_idx, :]
    return new_tr
def _choose_x0_2c(self, x0_img_class):
    """Find a sample that belongs to the required class.

    Parameters
    ----------
    x0_img_class : int

    Returns
    -------
    x0 : CArray
    y0 : CArray

    """
    adv_img_idx = \
        CArray(self.ts.Y.find(self.ts.Y == x0_img_class))[0]

    x0 = self.ts.X[adv_img_idx, :]
    y0 = self.ts.Y[adv_img_idx]

    return x0, y0
def compute_black_box_optimization(self, x: CArray) -> CArray:
    self.problem.init_starting_point(x)
    seed = self.problem.seed if self.problem.seed is not None else 0
    # algorithm = pygmo.algorithm(pygmo.sga(
    #     gen=self.problem.iterations, seed=seed,
    #     crossover='exponential', mutation='gaussian'))
    algorithm = pygmo.algorithm(
        pygmo.sga(gen=self.problem.iterations, seed=seed))
    algorithm.set_verbosity(1)
    start_t = time.time()
    pygmo_problem = pygmo.problem(self.problem)
    pygmo_population = pygmo.population(
        pygmo_problem, size=self.problem.population_size,
        seed=self.problem.seed)
    minimization_results = algorithm.evolve(pygmo_population)
    end_t = time.time()
    evolved_problem = minimization_results.problem.extract(
        type(self.problem))
    confidences, fitness, sizes = \
        evolved_problem.export_internal_results()
    self.confidences_ = confidences
    self.fitness_ = fitness
    self.sizes_ = sizes
    self.evolved_problem_ = evolved_problem
    self.elapsed_time_ = end_t - start_t
    best_t = minimization_results.champion_x
    return CArray(best_t)
def create_real_sample_from_adv(self, original_file_path: str,
                                x_adv: CArray, new_file_path: str = None):
    with open(original_file_path, 'rb') as f:
        code = bytearray(f.read())
    padding_index = x_adv.find(
        x_adv == self.classifier.get_embedding_value())
    padded_x_adv = copy.copy(x_adv)
    if padding_index:
        padded_x_adv = padded_x_adv[0, :padding_index[0]]
    if self.shift_values:
        padded_x_adv = padded_x_adv - 1
    padded_x_adv = padded_x_adv.astype(np.uint8).flatten().tolist()
    padded_x_adv = b''.join([bytes([i]) for i in padded_x_adv])
    code[:len(padded_x_adv)] = padded_x_adv
    if new_file_path:
        with open(new_file_path, 'wb') as f:
            f.write(code)
    return code
def objective_function(self, xc, acc=False):
    """Compute the value of the attacker objective at the poisoning point.

    Parameters
    ----------
    xc : CArray
        Poisoning point.
    acc : bool, optional
        If True, return the test error instead of the attacker loss.

    Returns
    -------
    f_obj : float
        Value of the objective function (average hinge loss) at xc.

    """
    # index of the poisoning point within self._xc;
    # this point will be replaced by the input parameter xc
    if self._idx is None:
        idx = 0
    else:
        idx = self._idx

    xc = CArray(xc).atleast_2d()

    n_samples = xc.shape[0]
    if n_samples > 1:
        raise TypeError("xc is not a single sample!")

    self._xc[idx, :] = xc
    clf, tr = self._update_poisoned_clf()

    # targeted attacks
    y_ts = self._y_target if self._y_target is not None else self.val.Y

    y_pred, score = clf.predict(self.val.X, return_decision_function=True)

    # TODO: binary loss check
    if self._attacker_loss.class_type != 'softmax':
        score = CArray(score[:, 1].ravel())

    if acc is True:
        error = CArray(y_ts != y_pred).ravel()  # compute test error
    else:
        error = self._attacker_loss.loss(y_ts, score)

    obj = error.mean()

    lid_cost = self.fast_lid_cost_calculation(xc.tondarray(), self.lid_k)
    obj = obj - lid_cost

    return obj
def test_load_paths(self):
    """Testing img dataset path loading."""
    dl = CDataLoaderImgFolders()

    self.logger.info("Testing loading paths of rgb dataset...")

    ds_rgb_path = fm.join(fm.abspath(__file__), "ds_rgb")

    ds = dl.load(ds_path=ds_rgb_path, img_format='jpeg', load_data=False)

    self.logger.info(
        "Loaded {:} images of {:} features, {:} classes".format(
            ds.num_samples, ds.num_features, ds.num_classes))

    # TODO: USE 'U' AFTER TRANSITION TO PYTHON 3
    self.assertIn(ds.X.dtype.char, ('S', 'U'))

    # Checking behavior of `get_labels_ovr`
    ovr = ds.get_labels_ovr(pos_label='tiger')  # Y: ['coyote', 'tiger']
    self.assert_array_equal(ovr, CArray([0, 1]))
def test_approx_fprime_check_grad(self):
    """Test if the gradient check made up with COptimizer.approx_fprime()
    and .check_grad() methods is correct."""
    self.logger.info(
        "Test for COptimizer.approx_fprime() and .check_grad() methods.")

    x0 = CArray([1., 0.])  # Starting point for minimization

    for fun_id in self.funcs:
        fun = self.funcs[fun_id]

        self.logger.info("Testing grad approx of {:}".format(
            fun.__class__.__name__))

        grad_err = fun.check_grad(x0, epsilon=1e-8)

        self.logger.info(
            "(Real grad - approx).norm(): {:}".format(grad_err))

        self.assertLess(grad_err, 1e-3)