def run(self, x, y, ds_init=None, *args, **kargs):
    x = CArray(x).atleast_2d()
    y = CArray(y).atleast_2d()
    x_init = None if ds_init is None else CArray(ds_init.X).atleast_2d()

    # only consider samples that can be manipulated
    v = self.is_attack_class(y)
    idx = CArray(v.find(v)).ravel()

    # number of modifiable samples
    n_mod_samples = idx.size

    adv_ds = CDataset(x.deepcopy(), y.deepcopy())

    # If dataset is sparse, set the proper attribute
    if x.issparse is True:
        self._issparse = True

    # arrays in which the objective values, predictions and scores are stored
    fs_opt = CArray.zeros(n_mod_samples, )
    y_pred = CArray.zeros(n_mod_samples, )
    scores = CArray.zeros((n_mod_samples, 2))

    for i in range(n_mod_samples):
        k = idx[i].item()  # idx of sample that can be modified

        xi = x[k, :] if x_init is None else x_init[k, :]
        x_opt, f_opt = self._run(x[k, :], y[k], x_init=xi, *args, **kargs)

        self.logger.info(
            "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}".format(
                k, x.shape[0], self._dmax, f_opt,
                self.f_eval, self.grad_eval))

        if x_opt.shape[-1] > adv_ds.X.shape[-1]:
            # Need to resize the whole adversarial dataset,
            # since CDataset can't deal with varying vector sizes
            new_length = x_opt.shape[-1]
            adv_ds.X = adv_ds.X.resize(
                (adv_ds.X.shape[0], new_length), 256)

        adv_ds.X[k, :min(adv_ds.X.shape[-1], x_opt.shape[-1])] = x_opt
        fs_opt[i] = f_opt
        y_p, score = self.problem.model_wrapper.predict(
            x_opt, return_decision_function=True)
        scores[i, :] = score[0, :]
        y_pred[i] = y_p

    # Return the mean objective function value on the evasion points
    # (computed from the outputs of the surrogate classifier)
    f_obj = fs_opt.mean()

    return y_pred, scores, adv_ds, f_obj
def fit(self, dataset, n_jobs=1):
    """Trains the classifier.

    If a preprocess has been specified,
    input is normalized before training.

    Parameters
    ----------
    dataset : CDataset
        Training set. Must be a :class:`.CDataset` instance with
        patterns data and corresponding labels.
    n_jobs : int, optional
        Number of parallel workers to use for training the classifier.
        Default 1. Cannot be higher than processor's number of cores.

    Returns
    -------
    trained_cls : CClassifier
        Instance of the classifier trained using input dataset.

    """
    self._n_features = dataset.num_features

    data_x = dataset.X
    # Transform data if a preprocess is defined
    if self.preprocess is not None:
        data_x = self.preprocess.fit_transform(dataset.X)

    return self._fit(CDataset(data_x, dataset.Y), n_jobs=n_jobs)
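# Usage sketch for the `fit` method above -- a minimal example assuming
# secml's CDLRandom loader, CClassifierSVM and CNormalizerMinMax (none of
# these names appear in the snippet above, and the `fit(dataset)` signature
# may take (x, y) separately in other versions):
def _example_fit_with_preprocess():
    from secml.data.loader import CDLRandom
    from secml.ml.classifiers import CClassifierSVM
    from secml.ml.features.normalization import CNormalizerMinMax

    ds = CDLRandom(n_samples=50, random_state=0).load()
    # the preprocess is fitted on the training data and applied
    # before the classifier itself is trained
    clf = CClassifierSVM(preprocess=CNormalizerMinMax())
    return clf.fit(ds)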
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The loaded toy dataset.

    """
    with CDLRandomToy.__lock:
        if self.toy == 'iris':
            from sklearn.datasets import load_iris
            toy_data = load_iris()
        elif self.toy == 'digits':
            from sklearn.datasets import load_digits
            toy_data = load_digits()
        elif self.toy == 'boston':
            from sklearn.datasets import load_boston
            toy_data = load_boston()
        elif self.toy == 'diabetes':
            from sklearn.datasets import load_diabetes
            toy_data = load_diabetes()
        else:
            raise ValueError("toy dataset {:} is not available.".format(
                self.toy))

    # Returning a CDataset
    if self.class_list is None:
        return CDataset(CArray(toy_data.data), CArray(toy_data.target))
    else:
        return self._select_classes(self.class_list,
                                    CArray(toy_data.data),
                                    CArray(toy_data.target))
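# Usage sketch for the toy loader above. The constructor call is an
# assumption based on the attributes used in `load` (self.toy,
# self.class_list, self.zero_one); the actual instantiation may differ:
def _example_load_toy():
    ds = CDLRandomToy('iris').load()  # full iris dataset, 3 classes
    ds_bin = CDLRandomToy('iris', class_list=[1, 2],
                          zero_one=True).load()  # classes 1 vs 2, labels 0/1
    return ds, ds_bin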
def _select_classes(self, class_list, patterns, labels):
    sel_patterns = None
    sel_labels = None

    for single_class in class_list:
        this_class_pat_idx = labels.find(labels == single_class)

        if sel_patterns is None:
            sel_patterns = patterns[this_class_pat_idx, :]
            sel_labels = labels[this_class_pat_idx]
        else:
            sel_patterns = sel_patterns.append(
                patterns[this_class_pat_idx, :], axis=0)
            sel_labels = sel_labels.append(labels[this_class_pat_idx])

    if self.zero_one is True:
        if len(class_list) > 2:
            raise ValueError("you are trying to convert to 0/1 labels a "
                             "dataset with more than 2 classes")
        else:
            class_list.sort()
            sel_labels[sel_labels == class_list[0]] = 0
            sel_labels[sel_labels == class_list[1]] = 1

    return CDataset(sel_patterns, sel_labels)
def _fit(self, x, y):
    """Trains the classifier.

    A One-Vs-All classifier is trained for each dataset class.

    Parameters
    ----------
    x : CArray
        Array to be used for training with shape (n_samples, n_features).
    y : CArray
        Array of shape (n_samples,) containing the class labels.

    Returns
    -------
    trained_cls : CClassifierMulticlassOVA
        Instance of the classifier trained using input dataset.

    """
    # Preparing the binary classifiers
    self.prepare(y.unique().size)

    # Fit a one-vs-all classifier for each class
    # Use the specified number of workers
    self._binary_classifiers = parfor2(_fit_one_ova,
                                       self.classes.size,
                                       self.n_jobs, self,
                                       CDataset(x, y),
                                       self.verbose)

    return self
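# Usage sketch for the One-Vs-All wrapper above. It assumes secml's
# CClassifierMulticlassOVA takes the binary classifier class as first
# argument and that fit follows the (x, y) convention of `_fit` above;
# both details may differ across versions:
def _example_ova():
    from secml.data.loader import CDLRandom
    from secml.ml.classifiers import CClassifierSVM
    from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA

    ds = CDLRandom(n_classes=3, n_informative=3, random_state=0).load()
    ova = CClassifierMulticlassOVA(CClassifierSVM)  # one binary SVM per class
    return ova.fit(ds.X, ds.Y)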
def _fit(self, x, y):
    """Trains the classifier.

    A One-Vs-One classifier is trained for each pair of dataset classes.

    Parameters
    ----------
    x : CArray
        Array to be used for training with shape (n_samples, n_features).
    y : CArray
        Array of shape (n_samples,) containing the class labels.

    Returns
    -------
    trained_cls : CClassifierMulticlassOVO
        Instance of the classifier trained using input dataset.

    """
    # Number of unique classes
    n_classes = y.unique().size

    # Number of classifiers to be trained
    ovo_clf_number = int((n_classes * (n_classes - 1)) / 2)

    # Preparing the binary classifiers
    self.prepare(ovo_clf_number)

    # Preparing the list of binary classifiers indices
    self._clf_pair_idx = list(combinations(range(n_classes), 2))

    # Fit a one-vs-one classifier for each pair of classes
    # Use the specified number of workers
    self._binary_classifiers = parfor2(_fit_one_ovo,
                                       self.num_classifiers,
                                       self.n_jobs, self,
                                       CDataset(x, y),
                                       self.verbose)

    return self
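# Worked example for the pair bookkeeping above: with n_classes = 4 the
# One-Vs-One scheme trains int(4 * 3 / 2) = 6 binary classifiers, and
#
#   >>> from itertools import combinations
#   >>> list(combinations(range(4), 2))
#   [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
#
# gives the (class_i, class_j) pair handled by each of them.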
def binarize_subset(tr_class_idx, vs_class_idx, dataset):
    """Returns the binary dataset tr_class_idx vs vs_class_idx.

    Parameters
    ----------
    tr_class_idx : int
        Index of the target class.
    vs_class_idx : int
        Index of the opposing class.
    dataset : CDataset
        Dataset from which the subset should be extracted.

    Returns
    -------
    bin_subset : CDataset
        Binarized subset.

    """
    tr_class = dataset.classes[tr_class_idx]
    vs_class = dataset.classes[vs_class_idx]

    tr_idx = dataset.Y.find(dataset.Y == tr_class)
    vs_idx = dataset.Y.find(dataset.Y == vs_class)

    subset = dataset[tr_idx + vs_idx, :]

    # Using get_labels_ovr to avoid redundant functions
    return CDataset(subset.X,
                    subset.get_labels_ovr(tr_class),
                    header=dataset.header)
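# Example sketch for binarize_subset above, on a small hypothetical 3-class
# dataset (assuming the function is callable as shown; it may be a static
# method of the multiclass wrapper). Note that the two indices refer to
# positions in dataset.classes, not to the label values themselves:
def _example_binarize_subset():
    ds = CDataset([[1, 2], [3, 4], [5, 6], [7, 8]], [0, 1, 2, 0])
    bin_ds = binarize_subset(0, 2, ds)  # class 0 (target) vs class 2
    # bin_ds.Y is 1 for samples of class 0 and 0 for samples of class 2
    return bin_ds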
def _clf_poisoning(self):
    """Computes a poisoning point considering as source the
    sample {xc, yc}.

    """
    xc = self.poisoning._run(self.xc, self.yc)

    self.logger.info("Starting score: " + str(self.poisoning.f_seq[0]))
    self.logger.info("Final score: " + str(self.poisoning.f_seq[-1]))
    self.logger.info("x*: " + str(xc))
    self.logger.info("Point sequence: " + str(self.poisoning.x_seq))
    self.logger.info("Score sequence: " + str(self.poisoning.f_seq))
    self.logger.info("Fun Eval: " + str(self.poisoning.f_eval))
    self.logger.info("Grad Eval: " + str(self.poisoning.grad_eval))

    metric = CMetric.create('accuracy')
    y_pred, scores = self.classifier.predict(
        self.ts.X, return_decision_function=True)
    orig_acc = metric.performance_score(y_true=self.ts.Y, y_pred=y_pred)
    self.logger.info("Error on testing data: " + str(1 - orig_acc))

    tr = self.tr.append(CDataset(xc, self.yc))

    pois_clf = self.classifier.deepcopy()
    pois_clf.fit(tr.X, tr.Y)
    y_pred, scores = pois_clf.predict(
        self.ts.X, return_decision_function=True)
    pois_acc = metric.performance_score(y_true=self.ts.Y, y_pred=y_pred)
    self.logger.info(
        "Error on testing data (poisoned): " + str(1 - pois_acc))

    return pois_clf, xc
def test_pretrained(self):
    """Test wrapping of pretrained models."""
    from sklearn import datasets, svm

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    clf = svm.SVC(kernel='linear')

    from secml.core.exceptions import NotFittedError
    with self.assertRaises(NotFittedError):
        secmlclf = CClassifierSkLearn(clf)
        secmlclf.predict(CArray(X))

    clf.fit(X, y)
    y_pred = clf.predict(X)

    clf = svm.SVC(kernel='linear')
    secmlclf = CClassifierSkLearn(clf)
    secmlclf.fit(CDataset(X, y))
    y_pred_secml = secmlclf.predict(CArray(X))

    self.logger.info(
        "Predicted labels by pretrained model:\n{:}".format(y_pred))
    self.logger.info(
        "Predicted labels by our fit:\n{:}".format(y_pred_secml))

    self.assert_array_equal(y_pred, y_pred_secml)
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_classification

    patterns, labels = make_classification(
        n_samples=self.n_samples,
        n_features=self.n_features,
        n_informative=self.n_informative,
        n_redundant=self.n_redundant,
        n_repeated=self.n_repeated,
        n_classes=self.n_classes,
        n_clusters_per_class=self.n_clusters_per_class,
        weights=self.weights,
        flip_y=self.flip_y,
        class_sep=self.class_sep,
        hypercube=self.hypercube,
        shift=self.shift,
        scale=self.scale,
        random_state=self.random_state)

    return CDataset(patterns, labels)
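# Usage sketch for the random-classification loader above (CDLRandom is the
# assumed secml class name; the parameters mirror the attributes used in
# `load` and are forwarded to sklearn's make_classification):
def _example_load_random_classification():
    loader = CDLRandom(n_samples=100, n_features=2, n_redundant=0,
                       n_clusters_per_class=1, random_state=0)
    ds = loader.load()  # CDataset with 100 samples and 2 features
    return ds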
def objective_function(self, xc, yc):
    # retrain the classifier on the training data plus the poisoning points
    clf = self.clf.deepcopy()
    tr = self.tr.append(CDataset(xc, yc))
    clf.fit(tr)

    y_pred = clf.predict(self.ts.X)
    unpriv = self.unprivileged()

    # ratio of mean predicted labels between the unprivileged
    # and the privileged group
    return y_pred[unpriv == 1].mean() / y_pred[unpriv == 0].mean()
def filter_transform(ds, labels, n_ds=None,
                     transform=img_to_tensor, bin_label=False):
    # keep only the samples whose label is in the given list
    valid = [i for i, y in enumerate(ds.Y) if y in labels]
    if n_ds is not None:
        # randomly subsample n_ds of the valid samples
        valid = CArray(np.random.choice(a=valid, size=n_ds, replace=False))
    x = ds.X[valid, :]
    y = ds.Y[valid]
    if bin_label:
        # binary task: label 1 for the first class in `labels`, 0 otherwise
        y = y == labels[0]
    return CDataset(x=transform(x), y=y.astype(int))
def fit(self, dataset, n_jobs=1):
    """Trains the classifier.

    If a preprocess has been specified,
    input is normalized before training.

    For multiclass case see `.CClassifierMulticlass`.

    Parameters
    ----------
    dataset : CDataset
        Training set. Must be a :class:`.CDataset` instance with
        patterns data and corresponding labels.
    n_jobs : int
        Number of parallel workers to use for training the classifier.
        Default 1. Cannot be higher than processor's number of cores.

    Returns
    -------
    trained_cls : CClassifier
        Instance of the classifier trained using input dataset.

    """
    if not isinstance(dataset, CDataset):
        raise TypeError(
            "training set should be provided as a CDataset object.")

    # Storing dataset classes
    self._classes = dataset.classes
    self._n_features = dataset.num_features

    data_x = dataset.X
    # Transform data if a preprocess is defined
    if self.preprocess is not None:
        data_x = self.preprocess.fit_transform(dataset.X)

    # Data is ready: fit the classifier
    try:  # Try to use parallelization
        self._fit(CDataset(data_x, dataset.Y), n_jobs=n_jobs)
    except TypeError:  # Parallelization is probably not supported
        self._fit(CDataset(data_x, dataset.Y))

    return self
def test_poison(clf, tr, val, ts, x_poison, y_poison):
    # append the poisoning points to the training set and retrain
    # a fresh copy of the classifier
    poison = CDataset(x_poison, y_poison)
    clf_p = clf.deepcopy()
    clf_p.init()
    tr_p = tr.append(poison)
    clf_p.fit(tr_p)

    # evaluate the poisoned classifier on test and validation data
    test_acc = test_clf(clf_p, ts)
    val_acc = test_clf(clf_p, val)
    return clf_p, test_acc, val_acc
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    patterns = CArray.randint(2, shape=(self.n_samples, self.n_features))
    labels = CArray.randint(2, shape=(1, self.n_samples))
    return CDataset(patterns, labels)
def test_save_and_load_svmlight_file(self):
    """Testing libsvm dataset loading and saving."""
    self.logger.info("Testing libsvm dataset loading and saving...")

    test_file = fm.join(fm.abspath(__file__), "myfile.libsvm")

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:
            raise e

    self.logger.info("Patterns saved:\n{:}".format(self.patterns))
    self.logger.info("Labels saved:\n{:}".format(self.labels))

    CDataLoaderSvmLight.dump(
        CDataset(self.patterns, self.labels), test_file)

    new_dataset = CDataLoaderSvmLight().load(test_file)

    self.assertFalse((new_dataset.X != self.patterns).any())
    self.assertFalse((new_dataset.Y != self.labels).any())

    # load data but now remove all zero features (columns)
    new_dataset = CDataLoaderSvmLight().load(
        test_file, remove_all_zero=True)

    self.logger.info("Patterns loaded:\n{:}".format(new_dataset.X))
    self.logger.info("Labels loaded:\n{:}".format(new_dataset.Y))
    self.logger.info("Mapping back:\n{:}".format(
        new_dataset.header.idx_mapping))

    self.assertTrue(new_dataset.X.issparse)
    self.assertTrue(new_dataset.Y.isdense)
    self.assertTrue(new_dataset.header.idx_mapping.isdense)

    # non-zero elements should be unchanged
    self.assertEqual(self.patterns.nnz, new_dataset.X.nnz)
    new_nnz_data = new_dataset.X.nnz_data
    self.assertFalse(
        (self.patterns.nnz_data != new_nnz_data.sort()).any())

    # With idx_mapping we should be able to reconstruct original data
    original = CArray.zeros(self.patterns.shape, sparse=True)
    original[:, new_dataset.header.idx_mapping] = new_dataset.X
    self.assertFalse((self.patterns != original).any())

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:
            raise e
def fit_forward(self, x, y=None, caching=False):
    """Fit estimator using data and then execute forward on the data.

    To avoid returning over-fitted scores on the training set, this
    method runs a 5-fold cross validation on training data and
    returns the validation scores.

    Parameters
    ----------
    x : CArray
        Array with shape (n_samples, n_features) to be transformed and
        to be used for training.
    y : CArray or None, optional
        Array of shape (n_samples,) containing the class labels.
        Can be None if not required by the algorithm.
    caching : bool
        True if preprocessed x should be cached for backward pass.

    Returns
    -------
    CArray
        Transformed input data.

    See Also
    --------
    fit : fit the preprocessor.
    forward : run forward function on input data.

    """
    kfold = CDataSplitterKFold(
        num_folds=5, random_state=0).compute_indices(CDataset(x, y))

    scores = CArray.zeros(shape=(x.shape[0], self.classes.size))

    # TODO: samples can be first preprocessed and cached, if required.
    #  Then we can use _fit and _forward to work on the preprocessed data.
    for k in range(kfold.num_folds):
        tr_idx = kfold.tr_idx[k]
        ts_idx = kfold.ts_idx[k]
        self.fit(x[tr_idx, :], y[tr_idx])
        scores[ts_idx, :] = self.forward(x[ts_idx, :], caching=False)

    # train on the full training set after computing the xval scores
    self.fit(x, y)

    # cache x if required
    if caching is True:
        self._forward_preprocess(x, caching=True)

    return scores
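# Usage sketch for fit_forward above (illustrative variable names; `clf` is
# any module exposing the fit_forward method defined here):
def _example_fit_forward(clf, x_tr, y_tr):
    # returns 5-fold cross-validated scores instead of the (over-fitted)
    # scores that a plain fit + forward on the same data would give
    scores = clf.fit_forward(x_tr, y_tr)
    assert scores.shape == (x_tr.shape[0], clf.classes.size)
    return scores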
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_moons

    patterns, labels = make_moons(n_samples=self.n_samples,
                                  noise=self.noise,
                                  random_state=self.random_state)

    return CDataset(patterns, labels)
def __init__(self, problem: CBlackBoxProblem, is_debug: bool = False):
    CAttackEvasion.__init__(
        self,
        problem.model_wrapper.classifier,
        problem.model_wrapper.classifier,
        # minimal two-sample placeholder passed as surrogate data
        surrogate_data=CDataset(CArray([[0], [1]]), CArray([0, 1])),
        y_target=None,
    )
    self.problem = problem
    self.confidences_ = []
    self.changes_per_iterations_ = []
    self.model_wrapper = problem.model_wrapper
    self.is_debug = is_debug
    self._original_x = None
    self.minimization_result_ = []
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_circles

    patterns = make_circles(n_samples=self.n_samples,
                            noise=self.noise,
                            factor=self.factor,
                            random_state=self.random_state)[0]

    return CDataset(patterns, self._dts_function(patterns))
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_blobs

    patterns = make_blobs(n_samples=self.n_samples, n_features=2,
                          centers=self.centers,
                          cluster_std=self.cluster_std,
                          random_state=self.random_state)[0]

    return CDataset(patterns, self._dts_function(CArray(patterns)))
def _update_poisoned_clf(self, clf=None, tr=None, train_normalizer=False):
    """Trains the classifier on D (the original training data) plus the
    current poisoning points {xc, yc}.

    Parameters
    ----------
    clf : CClassifier or None, optional
        Classifier to be trained. If None, the solver classifier is used.
    tr : CDataset or None, optional
        Training dataset. If None, the surrogate data is used.
    train_normalizer : bool, optional
        If True, the normalizer is retrained on the dataset that
        includes the poisoning points. Default False.

    Returns
    -------
    clf : classifier trained on D and {xc, yc}
    tr : the poisoned training set

    """
    # xc hashing is only valid if clf and tr do not change
    # (when calling update_poisoned_clf() without parameters)
    xc_hash_is_valid = False
    if clf is None and tr is None:
        xc_hash_is_valid = True

    if clf is None:
        clf = self._solver_clf

    if tr is None:
        tr = self.surrogate_data

    tr = tr.append(CDataset(self._xc, self._yc))

    xc_hash = self._xc.sha1()

    if self._xc_hash is None or self._xc_hash != xc_hash:
        # xc set has changed, retrain clf
        # hash is stored only if update_poisoned_clf() is called w/out pars
        self._xc_hash = xc_hash if xc_hash_is_valid else None
        self._poisoned_clf = clf.deepcopy()

        # we assume that the normalizer is not changing w.r.t. xc!
        # so we avoid re-training the normalizer on the dataset including xc
        if self.classifier.preprocess is not None:
            self._poisoned_clf.retrain_normalizer = train_normalizer

        self._poisoned_clf.fit(tr)

    return self._poisoned_clf, tr
def test_openworldkfold_tr_class_skip(self):
    ds = CDataset([[1, 2], [3, 4], [5, 6],
                   [10, 20], [30, 40], [50, 60],
                   [100, 200], [300, 400], [500, 600]],
                  [1, 2, 1, 2, 2, 0, 1, 0, 2])  # class 0 has 2 samples

    # create 25 folds to increase the chance of getting the warning message
    kf = CDataSplitterOpenWorldKFold(
        num_folds=25, n_train_samples=2,
        random_state=5000).compute_indices(ds)

    self.assertEqual(len(kf.tr_idx), 25)
    self.assertEqual(len(kf.ts_idx), 25)

    for fold_tr_idx, fold_ts_idx in kf:
        self.assertTrue((ds.Y[fold_tr_idx] != 0).all())
        self.assertTrue((ds.Y[fold_ts_idx] == 0).any())
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_blobs

    patterns, labels = make_blobs(n_samples=self.n_samples,
                                  n_features=self.n_features,
                                  centers=self.centers,
                                  cluster_std=self.cluster_std,
                                  center_box=self.center_box,
                                  random_state=self.random_state)

    return CDataset(patterns, labels)
def binarize_dataset(class_idx, dataset):
    """Returns the dataset needed by the class_idx binary classifier.

    Parameters
    ----------
    class_idx : int
        Index of the target class.
    dataset : CDataset
        Dataset to binarize.

    Returns
    -------
    bin_dataset : CDataset
        Binarized dataset.

    """
    return CDataset(
        dataset.X,
        dataset.get_labels_ovr(dataset.classes[class_idx]),
        header=dataset.header)
def load(self, min_faces_per_person=None, funneled=True, color=False):
    """Load LFW dataset.

    Extra dataset attributes:
     - 'img_w', 'img_h': size of the images in pixels.
     - 'y_names': tuple with the name string for each class.

    Parameters
    ----------
    min_faces_per_person : int or None, optional
        The extracted dataset will only retain pictures of people
        that have at least min_faces_per_person different pictures.
        Default None, so all db images are returned.
    funneled : bool, optional
        Download and use the images aligned with deep funneling.
        Default True.
    color : bool, optional
        Keep the 3 RGB channels instead of averaging them to a
        single gray level channel. Default False.

    Returns
    -------
    dataset : CDataset
        The loaded LFW dataset.

    """
    with CDataLoaderLFW.__lock:
        lfw_people = fetch_lfw_people(
            data_home=SECML_DS_DIR, funneled=funneled, resize=1,
            min_faces_per_person=min_faces_per_person, color=color,
            slice_=None, download_if_missing=True)

    x = CArray(lfw_people.data)
    y = CArray(lfw_people.target)

    img_w = lfw_people.images.shape[2]
    img_h = lfw_people.images.shape[1]

    y_names = tuple(lfw_people.target_names.tolist())

    header = CDatasetHeader(img_w=img_w, img_h=img_h, y_names=y_names)

    return CDataset(x, y, header=header)
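# Usage sketch for the LFW loader above (assuming CDataLoaderLFW, the class
# the lock refers to, can be instantiated without arguments):
def _example_load_lfw():
    ds = CDataLoaderLFW().load(min_faces_per_person=70, color=False)
    # image size and class names are stored in the dataset header
    print(ds.header.img_w, ds.header.img_h, ds.header.y_names[:3])
    return ds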
def load(self):
    """Loads the dataset.

    Returns
    -------
    dataset : CDataset
        The randomly generated dataset.

    """
    from sklearn.datasets import make_regression

    patterns, labels = make_regression(
        n_samples=self.n_samples,
        n_features=self.n_features,
        n_informative=self.n_informative,
        n_targets=self.n_targets,
        bias=self.bias,
        effective_rank=self.effective_rank,
        tail_strength=self.tail_strength,
        noise=self.noise,
        random_state=self.random_state)

    return CDataset(patterns, labels)
def _fit(self, x, y):
    """Trains the KNeighbors classifier.

    Training dataset is stored to use in kneighbors() method.

    Parameters
    ----------
    x : CArray
        Array to be used for training with shape (n_samples, n_features).
    y : CArray
        Array of shape (n_samples,) containing the class labels.

    Returns
    -------
    CClassifierKNN
        Trained classifier.

    """
    self._tr = CDataset(x, y)
    return CClassifierSkLearn._fit(self, x, y)
def test_custom_attr(self):
    """Testing for custom attributes."""
    header = CDatasetHeader(
        id='mydataset', age=34, colors=CArray([1, 2, 3]))
    ds = CDataset(self.X, self.Y, header=header)

    ds_params = ds.header.get_params()
    self.assertEqual(ds_params['id'], 'mydataset')
    self.assertEqual(ds_params['age'], 34)
    self.assert_array_equal(ds_params['colors'], CArray([1, 2, 3]))

    # Testing getitem. Immutable objects should be copied as they are.
    # Arrays should be indexed.
    ds_get = ds[[0, 2], :]
    ds_params = ds_get.header.get_params()
    self.assert_array_equal(ds_get.X, CArray([[1, 2, 3], [7, 8, 9]]))
    self.assert_array_equal(ds_get.Y, CArray([1, 2]))
    self.assertEqual(ds_params['id'], 'mydataset')
    self.assertEqual(ds_params['age'], 34)
    self.assert_array_equal(ds_params['colors'], CArray([1, 3]))
def test_margin(self):
    self.logger.info("Testing margin separation of SVM...")

    import numpy as np

    # we create two clusters of separable points with unbalanced class sizes
    rng = np.random.RandomState(0)
    n_samples_1 = 1000
    n_samples_2 = 100
    X = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
    y = [0] * (n_samples_1) + [1] * (n_samples_2)

    dataset = CDataset(X, y)

    # fit the model
    clf = CClassifierSVM()
    clf.fit(dataset.X, dataset.Y)

    w = clf.w
    a = -w[0] / w[1]

    xx = CArray.linspace(-5, 5)
    yy = a * xx - clf.b / w[1]

    wclf = CClassifierSVM(class_weight={0: 1, 1: 10})
    wclf.fit(dataset.X, dataset.Y)

    ww = wclf.w
    wa = -ww[0] / ww[1]
    wyy = wa * xx - wclf.b / ww[1]

    fig = CFigure(linewidth=1)
    fig.sp.plot(xx, yy.ravel(), 'k-', label='no weights')
    fig.sp.plot(xx, wyy.ravel(), 'k--', label='with weights')
    fig.sp.scatter(X[:, 0].ravel(), X[:, 1].ravel(), c=y)
    fig.sp.legend()

    fig.savefig(fm.join(fm.abspath(__file__), 'figs',
                        'test_c_classifier_svm.pdf'))