def run_linear_open_experiment(self, iterations=10, save=False):
    """ Train a linear SVM, select the best cost parameter via grid-search
    cross-validation and evaluate it on an open-world split of the dataset.

    Samples whose decision scores are not confidently dominated by the top
    class are labeled with the sentinel class 99 ("unknown").

    NOTE(review): this method is defined twice in this file with identical
    bodies; the later definition shadows this one at class-creation time.

    :param iterations: number of runs (training/testing)
    :param save: save predictions and labels as pz files if True
    """
    self.true_labels = np.array([])
    self.predictions = np.array([])
    for i in xrange(iterations):
        self.randomize_dataset_open_world()
        clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        clf.fit(self.X_train, self.Y_train)
        # one row of decision scores per test sample, one column per class
        out = clf.best_estimator_.decision_function(self.X_test)
        classes = clf.best_estimator_.classes_
        for scores in out:
            m = np.max(scores)
            # FIX: original tested abs(m / scores[:][:]) — the double slice
            # is a no-op copy; plain `scores` is equivalent.
            # If any |m / score| ratio falls below 0.5, the top class does
            # not dominate strongly enough: reject as unknown (class 99).
            if (np.abs(m / scores) < 0.5).any():
                self.predictions = np.append(self.predictions, 99)
            else:
                # FIX: classes[np.where(scores == m)] appended *all* classes
                # tying at the max, desynchronizing predictions from
                # true_labels; argmax always yields exactly one label.
                self.predictions = np.append(self.predictions,
                                             classes[np.argmax(scores)])
        self.true_labels = np.append(self.true_labels, self.Y_test)
    if save:
        pz.save(self.predictions, "mca_predictions_open.pz")
        pz.save(self.true_labels, "mca_true_labels_open.pz")
def run_linear_open_experiment(self, iterations=10, save=False):
    """ Train a linear SVM, select the best cost parameter via grid-search
    cross-validation and evaluate it on an open-world split of the dataset.

    Samples whose decision scores are not confidently dominated by the top
    class are labeled with the sentinel class 99 ("unknown").

    NOTE(review): this method is a byte-identical duplicate of an earlier
    definition in this file; one of the two should be removed.

    :param iterations: number of runs (training/testing)
    :param save: save predictions and labels as pz files if True
    """
    self.true_labels = np.array([])
    self.predictions = np.array([])
    for i in xrange(iterations):
        self.randomize_dataset_open_world()
        clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        clf.fit(self.X_train, self.Y_train)
        # one row of decision scores per test sample, one column per class
        out = clf.best_estimator_.decision_function(self.X_test)
        classes = clf.best_estimator_.classes_
        for scores in out:
            m = np.max(scores)
            # FIX: original tested abs(m/scores[:][:]) — the double slice
            # is a no-op copy; plain `scores` is equivalent.
            # If any |m / score| ratio falls below 0.5, the top class does
            # not dominate strongly enough: reject as unknown (class 99).
            if (np.abs(m / scores) < 0.5).any():
                self.predictions = np.append(self.predictions, 99)
            else:
                # FIX: classes[np.where(scores == m)] appended *all* classes
                # tying at the max, desynchronizing predictions from
                # true_labels; argmax always yields exactly one label.
                self.predictions = np.append(self.predictions,
                                             classes[np.argmax(scores)])
        self.true_labels = np.append(self.true_labels, self.Y_test)
    if save:
        pz.save(self.predictions, "mca_predictions_open.pz")
        pz.save(self.true_labels, "mca_true_labels_open.pz")
def run_linear_experiment(self, rocs_filename, iterations=10): """ Run a classification experiment by running several iterations. In each iteration data is randomized, a linear svm classifier is trained and evaluated using cross-validation over a the cost parameter in the range np.logspace(-3, 3, 7). The best classifier is used for testing and a ROC curve is computed and saved as property and locally. :param rocs_filename: the file to save all rocs computed :param iterations: number of runs (training/testing) """ for i in xrange(iterations): print "[*] Iteration {0}".format(i) print "[*] Randomizing dataset..." self.randomize_dataset() clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)}) print "[*] Training..." clf.fit(self.X_train, self.Y_train) out = clf.best_estimator_.decision_function(self.X_test) print "[*] Testing..." roc = eval.compute_roc(np.float32(out.flatten()), np.float32(self.Y_test)) self.rocs.append(roc) print "[*] ROC saved." pz.save(self.rocs, rocs_filename)
def process_dir(read_dir, out_dir, mode='FCG'): """ Convert a series of APK into graph objects. Load all APKs in a dir subtree and create graph objects that are pickled for later processing and learning. """ sys.setrecursionlimit(100000) files = [] # check if pdg doesnt exist yet and mark the file to be processed for dirName, subdirList, fileList in os.walk(read_dir): for f in fileList: files.append(os.path.join(dirName, f)) # set up progress bar print "\nProcessing {} APK files in dir {}".format(len(files), read_dir) widgets = ['Building graphs: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), ' '] pbar = ProgressBar(widgets=widgets, maxval=len(files)) pbar.start() progress = 0 # loop through .apk files and save them in .pdg.pz format for f in files: f = os.path.realpath(f) print '[] Loading {0}'.format(f) try: if mode is 'FCG': graph = FCG(f) elif mode is 'PDG': graph = PDG(f) # if an exception happens, save the .apk in the corresponding dir except Exception as e: err = e.__class__.__name__ err_dir = err + "/" d = os.path.join(read_dir, err_dir) if not os.path.exists(d): os.makedirs(d) cmd = "cp {} {}".format(f, d) os.system(cmd) print "[*] {} error loading {}".format(err, f) continue h = get_sha256(f) if out_dir: out = out_dir else: out = read_dir fnx = os.path.join(out, "{}.pz".format(h)) pz.save(graph.g, fnx) print "[*] Saved {}\n".format(fnx) progress += 1 pbar.update(progress) pbar.finish() print "Done."
def run_linear_closed_experiment(self, iterations=10, save=False):
    """ Evaluate a linear SVM on closed-world splits of the dataset.

    For each iteration the dataset is re-randomized, the SVM cost parameter
    is selected by grid-search cross-validation, and the best estimator's
    predictions on the test split are accumulated alongside the ground
    truth.

    :param iterations: number of runs (training/testing)
    :param save: save predictions and labels as pz files if True
    """
    self.true_labels = np.array([])
    self.predictions = np.array([])
    cost_grid = {'C': np.logspace(-3, 3, 7)}
    for run in xrange(iterations):
        self.randomize_dataset_closed_world()
        search = GridSearchCV(svm.LinearSVC(), cost_grid)
        search.fit(self.X_train, self.Y_train)
        predicted = search.best_estimator_.predict(self.X_test)
        self.predictions = np.append(self.predictions, predicted)
        self.true_labels = np.append(self.true_labels, self.Y_test)
    if save:
        pz.save(self.predictions, "mca_predictions_closed.pz")
        pz.save(self.true_labels, "mca_true_labels_closed.pz")
def save_data(self): """ Store pz objects for the data matrix, the labels and the name of the original samples so that they can be used in a new experiment without the need to extract all features again """ print "[*] Saving labels, data matrix and file names..." pz.save(self.X, "X.pz") pz.save(self.Y, "Y.pz") pz.save(self.fnames, "fnames.pz")