def cmd_store_outliers():
    """Store misclassified rows ("outliers") of the selected test set to CSV.

    With 'UseCustomTestSet' enabled, a custom test set is built first and
    a threshold of 0.0 is used; otherwise the configured dataset's testing
    set is scored at the default 0.5 threshold.
    """
    print_banner("Store outliers")
    global model
    if config.get_boolean('analysis', 'UseCustomTestSet'):
        print_notice("Creating a custom test set")
        sel_ds, threshold = 'Custom', 0.0
        custom_sets = dataset_factory.get_dataset(sel_ds).get_sets()
        transform.transform_sets(sel_ds, custom_sets, language)
        # TODO Delete transforms and data set
        # dataset_factory.get_dataset(sel_ds).delete_sets()
    else:
        sel_ds, threshold = config.get_str('dataset', 'SelectedDataset'), 0.5
    orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set', language,
                                            vuln_type, selected_features)
    X = sync_features(X)
    data.store_data(model, orig, X, Y, just_outliers=True, threshold=threshold)
def find_best_threshold(model, orig, X):
    """Find the classification threshold maximizing the file-level weighted F1.

    A file counts as predicted-vulnerable at threshold c iff any of its lines
    has predicted probability > c, i.e. iff the file's maximum probability
    exceeds c.  The expensive per-file aggregation is therefore done ONCE
    instead of once per candidate threshold (the old code re-ran the groupby
    101 times and also mutated the caller's DataFrame by writing a
    'predicted' column into it; this version leaves `orig` untouched).

    :param model: fitted classifier exposing predict_proba
    :param orig: DataFrame with 'file_name' and 'vulnerable' columns, aligned
                 row-for-row with X
    :param X: feature matrix for model.predict_proba
    :return: threshold in [0.0, 1.0] with the highest weighted F1 score
    """
    probas = model.predict_proba(X)
    print_notice("Finding the best threshold value for F1 score")
    # Work on a copy so the caller's DataFrame is not mutated.
    scored = orig.copy()
    scored['proba'] = probas[:, 1]
    per_file = scored.groupby(['file_name']).agg({'vulnerable': np.sum,
                                                  'proba': np.max})
    y_true = (per_file['vulnerable'] != 0).tolist()
    max_probas = per_file['proba']
    pref_thr = max_f1 = -1
    for c in np.arange(0.0, 1.01, 0.01):
        predicted = (max_probas > c).tolist()
        # Due to the class imbalance, we use a weighted F1 score
        f1_score = metrics.f1_score(y_true, predicted, average='weighted')
        if f1_score > max_f1:
            max_f1 = f1_score
            pref_thr = c
    return pref_thr
def print_help():
    """List every available command (module-level 'cmd_*' function) and exit."""
    print_notice("Valid commands are:")
    commands = sorted(name[4:] for name in globals() if name.startswith('cmd_'))
    print_notice(', '.join(commands))
    exit(0)
def clean_up(self):
    """Remove the temporary ARFF train/test files and stop the JVM.

    Called when this classifier wrapper is finished; `silent_remove`
    presumably ignores missing files, so this is safe even if the ARFF
    files were never written — TODO confirm.
    """
    print_notice("Removing temporary files")
    silent_remove(self.train_fn)
    silent_remove(self.test_fn)
    print_notice("Stopping JVM")
    jvm.stop()
def compare_results(file_name, orig, sel_vt):
    """Compare our file-level ground truth against an external tool's findings.

    :param file_name: CSV produced by the tool; each row is
                      (vuln_type, path, extra) with vuln_type 'SQLi' or 'XSS'
    :param orig: DataFrame with 'file_name' and 'vulnerable' columns
    :param sel_vt: vulnerability type to compare ('SQLi' or 'XSS')
    """
    compare_set = {'SQLi': [], 'XSS': []}
    with open(file_name, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            compare_set[row[0]].append((os.path.realpath(row[1]), row[2]))
    # Build the set of flagged paths once: O(1) membership per file instead of
    # re-scanning the tool's whole list (the old any([...]) also materialized
    # a throwaway list on every iteration).
    flagged_files = {path for path, _ in compare_set[sel_vt]}
    predicted = []
    y_only_filename = []
    for f, row in orig.groupby(['file_name']).agg({'vulnerable': np.sum}).iterrows():
        vuln_file = os.path.realpath(f)
        y_only_filename.append(0 if row['vulnerable'] == 0 else 1)
        predicted.append(1 if vuln_file in flagged_files else 0)
    print_notice(metrics.classification_report(
        y_only_filename, predicted,
        target_names=['not vulnerable', 'vulnerable']))
def create_sets(self):
    """Build and pickle training/tuning/testing splits for the SAMATE dataset.

    Does nothing if the pickle file already exists.
    """
    language = 'PHP'  # TODO: What are we going to do with Python?
    samate_pickle = config.get_str('SAMATE', 'SamatePickle')
    if os.path.isfile(samate_pickle):
        print_notice("Pickle file already created")
        return
    train_pct = config.get_int('dataset', 'TrainingPercentage')
    tune_pct = config.get_int('dataset', 'TuningPercentage')
    training_set = {language: {}}
    tuning_set = {language: {}}
    testing_set = {language: {}}
    flaw_dict = {language: {}}
    for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
        flaws, files = self.get_file_list(vuln_type)
        flaw_dict[language][vuln_type] = flaws
        shuffle(files)
        training_set[language][vuln_type] = slice_perc(files, 0, train_pct)
        tuning_set[language][vuln_type] = slice_perc(files, train_pct,
                                                     train_pct + tune_pct)
        testing_set[language][vuln_type] = slice_perc(files,
                                                      train_pct + tune_pct, 100)
    dataset = {'training_set': training_set,
               'tuning_set': tuning_set,
               'testing_set': testing_set,
               'flaw_dict': flaw_dict}
    # Save to pickle file for future use
    with open(samate_pickle, 'wb') as pickle_file:
        pickle.dump(dataset, pickle_file)
def select_features(X, Y):
    """Select the k best features by chi-squared score and print them ranked.

    k comes from config ('model', 'kFeatures'); exits with an error if it is
    out of range [0, n_features].

    :param X: feature DataFrame
    :param Y: target labels
    :return: array of the selected column names
    """
    k = config.get_int('model', 'kFeatures')
    print_notice("Sorting features based on chi^2 (k=%d):" % k)
    if k < 0 or k > len(X.columns):
        print_error("k should be >= 0 and <= %d (n_features). Got %d." %
                    (len(X.columns), k))
        exit(-1)
    skb = SelectKBest(chi2, k=k)
    # fit() is sufficient: the old fit_transform() computed the reduced
    # matrix only to throw it away.
    skb.fit(X, Y)
    support = skb.get_support()
    selected = X.columns.values[support]
    # Rank selected features by descending chi^2 score.
    ranked = sorted(zip(selected, skb.scores_[support]),
                    key=operator.itemgetter(1), reverse=True)
    for n, (feature, score) in enumerate(ranked, start=1):
        print_notice("%d. %s %.2f" % (n, feature, score))
    return selected
def delete_sets(self):
    """Delete this dataset's pickle file, warning if it does not exist."""
    pkl = self.pickle_path
    if not os.path.isfile(pkl):
        print_warning("Unable to remove %s. File does not exist." % pkl)
        return
    print_notice("Removing %s" % pkl)
    os.remove(pkl)
def print_metrics(model, X, Y):
    """Print Brier scores, accuracy and a classification report for `model` on (X, Y)."""
    predicted = model.predict(X)
    probas = model.predict_proba(X)
    brier_not_vuln = calculate_brier_score(probas, Y, cls=0)
    brier_vuln = calculate_brier_score(probas, Y, cls=1)
    brier_overall = _brier_score_loss(Y, probas[:, 1])
    print_notice("Brier score (class: not vulnerable) %.4f" % brier_not_vuln)
    print_notice("Brier score (class: vulnerable) %.4f" % brier_vuln)
    print_notice("Brier score %.4f" % brier_overall)
    print_notice("Accuracy %.2f" % metrics.accuracy_score(Y, predicted))
    print_notice(metrics.classification_report(
        Y, predicted, target_names=['not vulnerable', 'vulnerable']))
def to_arff(self, df, test):
    """Write `df` to this wrapper's train or test ARFF file and return its name.

    :param df: DataFrame to serialize
    :param test: True -> write to self.test_fn, False -> self.train_fn
    """
    filename = self.test_fn if test else self.train_fn
    print_notice("Writing ARFF data to filename %s" % filename)
    pandas2arff(df, filename)
    return filename
def create_popular_features(dataset, sets, language):
    """Fit a PopularFeatures filter on the training set and pickle it per vuln type.

    Files are transformed in parallel in batches of `num_processes`; the filter
    is updated incrementally with partial_fit so the whole training set never
    has to be in memory at once.  Skips any vulnerability type whose features
    pickle already exists.
    """
    from itertools import islice  # local import so this fix is self-contained

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    num_processes = 100
    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_features_filename(dataset, language, vuln_type)
            if os.path.isfile(filename):
                print_notice("Pickle file %s already created" % filename)
                continue
            f = transform_file(flaw_dict[vuln_type], mark_whole_path)
            set_type = 'training_set'
            counter = 0
            total = len(sets[set_type][language][vuln_type])
            generator = iter(sets[set_type][language][vuln_type])
            pf = PopularFeatures(num_features=200)
            while True:
                # BUG FIX: list(next(g) for _ in range(n)) raises RuntimeError
                # on exhaustion under PEP 479 (Python 3.7+); islice truncates
                # the final batch cleanly instead.
                next_elements = list(islice(generator, num_processes))
                counter += len(next_elements)
                if not next_elements:
                    break
                start = timeit.default_timer()
                res = pool.map(f, next_elements)
                for df in res:
                    if df is None:
                        continue
                    if not all(x in df.columns.values
                               for x in ['file_name', 'line', 'vulnerable', 'tainted']):
                        print_warning("Could not find the right columns in data frame. Ignoring.")
                        continue
                    # We drop these columns so our feature filter can ignore them
                    df.drop(['file_name', 'line', 'vulnerable', 'tainted'],
                            axis=1, inplace=True)
                    pf.partial_fit(df)
                print_notice(
                    "%s %s %s: %d/%d (run took %.2f secs)" %
                    (language, vuln_type, set_type, counter, total,
                     timeit.default_timer() - start))
            with open(filename, 'wb') as pickle_file:
                # Protocol version 4 supports large objects (> 4GB)
                pickle.dump(pf, pickle_file, protocol=4)
def get_sets(self):
    """Load the pickled sets, sub-sampling when a sampling percentage < 1.0 is set."""
    print_notice("Loading pickle file")
    with open(self.pickle_path, 'rb') as pickle_file:
        sets = pickle.load(pickle_file)
    needs_sampling = (self.sampling_perc['SQLi'] < 1.0
                      or self.sampling_perc['XSS'] < 1.0)
    return self.sample_set(sets) if needs_sampling else sets
def print_model_results(model, orig, X, c):
    """Print a file-level classification report at threshold `c`.

    NOTE: writes/overwrites a 'predicted' column on the caller's `orig`
    DataFrame (kept for compatibility with existing callers).
    """
    probas = model.predict_proba(X)
    orig['predicted'] = probas[:, 1]
    orig['predicted'] = orig['predicted'].apply(lambda p: int(p > c))
    per_file = orig.groupby(['file_name']).agg({'vulnerable': np.sum,
                                                'predicted': np.sum})
    y_only_filename = []
    predicted = []
    for _, row in per_file.iterrows():
        y_only_filename.append(row['vulnerable'] != 0)
        predicted.append(row['predicted'] != 0)
    print_notice(metrics.classification_report(
        y_only_filename, predicted,
        target_names=['not vulnerable', 'vulnerable']))
def store_data(model, orig, X, Y, just_outliers, threshold=0.5):
    """Write model predictions (optionally only the misclassified rows) to CSV.

    Builds a copy of X augmented with file name, line number, ground truth,
    predicted probability and the thresholded prediction, then writes either
    the whole table or just the rows where prediction != ground truth to the
    configured outliers file.  For decision trees the decision path of each
    row is added as well.

    :param model: fitted classifier exposing predict_proba (and decision_path
                  when it is a DecisionTreeClassifier)
    :param orig: DataFrame carrying 'file_name' and 'line', row-aligned with X
    :param X: feature matrix
    :param Y: ground-truth labels, row-aligned with X
    :param just_outliers: True -> store only misclassified rows
    :param threshold: probability cut-off for the positive class
    """
    outliers_file = os.path.join(config.get_str('analysis', 'OutliersPath'), 'outliers.csv')
    model_type = config.get_str('model', 'Model')
    probas = model.predict_proba(X)
    df = X.copy()
    df[['file_name', 'line']] = orig[['file_name', 'line']]
    df['actual'] = Y
    df['predict_proba'] = probas[:, 1]
    df['predicted'] = (df['predict_proba'] > threshold)
    if model_type == 'DecisionTreeClassifier':
        print_notice("Adding decision paths to the data as model is a DT")
        node_indicator = model.decision_path(X)
        # indptr is indexed by row POSITION; using df.index as the position
        # assumes the index is a 0..n-1 RangeIndex — TODO confirm upstream.
        for i in df.index:
            df.loc[i, 'path'] = str(
                node_indicator.indices[node_indicator.indptr[i]:node_indicator.
                indptr[i + 1]])
    print_notice("Storing in file %s" % outliers_file)
    if just_outliers:
        # predicted (bool) minus actual is nonzero exactly on misclassified rows.
        indices = np.flatnonzero(df['predicted'] - Y)
        print_notice("Number of outliers %d" % indices.size)
        df.iloc[indices].to_csv(outliers_file)
    else:
        print_notice("Number of records %d" % len(df.index))
        df.to_csv(outliers_file)
def create_sets(self):
    """Create and pickle the custom test set from the configured source directory.

    Does nothing if the pickle file already exists.
    """
    source_dir = config.get_str('analysis', 'CustomTestSet')
    custom_pickle = config.get_str('analysis', 'CustomPickle')
    languages = config.get_list('dataset', 'Languages')
    vulnerabilities = config.get_list('dataset', 'Vulnerabilities')
    if os.path.isfile(custom_pickle):
        print_notice("Pickle file already created")
        return
    dataset = self.create_list(source_dir, languages, vulnerabilities)
    # Save to pickle file for future use
    with open(custom_pickle, 'wb') as pickle_file:
        pickle.dump(dataset, pickle_file)
def cmd_store_custom():
    """Build the custom test set and store its outliers at threshold 0.0."""
    print_banner("Store custom test set results")
    global model
    print_notice("Creating a custom test set")
    sel_ds = 'Custom'
    custom_sets = dataset_factory.get_dataset(sel_ds).get_sets()
    transform.transform_sets(sel_ds, custom_sets, language)
    orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set', language,
                                            vuln_type, selected_features)
    X = sync_features(X)
    data.store_data(model, orig, X, Y, just_outliers=True, threshold=0.0)
def create_dt_graph(title, model, features):
    """Export a fitted decision tree as DOT and render it to PNG with Graphviz.

    :param title: base name for the .dot/.png files (also used in paths)
    :param model: fitted sklearn DecisionTreeClassifier
    :param features: feature names for the graph labels
    """
    import subprocess  # local import: replaces the old shell pipeline

    graph_dir = config.get_str('model', 'DecisionTreeGraphDirectory')
    dot_file = os.path.join(graph_dir, '%s.dot' % title)
    png_file = os.path.join(graph_dir, '%s.png' % title)
    print_notice("Creating Decision Tree graph in %s" % png_file)
    # Write DOT file
    tree.export_graphviz(model, out_file=dot_file, feature_names=features,
                         filled=True, rounded=True, proportion=True,
                         node_ids=True)
    # Convert DOT to PNG.  An argument list with shell=False replaces the old
    # os.system("dot ... >%s" % ...), which broke on spaces in paths and was
    # open to shell injection through `title`.
    with open(png_file, 'wb') as png:
        subprocess.run(['dot', '-Tpng', dot_file], stdout=png)
def delete_transforms():
    """Remove every cached transform/feature pickle; warn when none were found."""
    removed_any = False
    for dataset in ['NVD', 'SAMATE']:
        for language in config.get_list('dataset', 'Languages'):
            for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
                candidates = [
                    get_transform_filename(dataset, language, vuln_type),
                    get_features_filename(dataset, language, vuln_type),
                ]
                for path in candidates:
                    if os.path.isfile(path):
                        print_notice("Removing %s" % path)
                        os.remove(path)
                        removed_any = True
    if not removed_any:
        print_warning("Could not find any transform files to remove.")
def get_xy_with_orig(dataset, set_name, language, vuln_type, features=None):
    """Load a transformed set and return (orig, X, Y).

    :param dataset: dataset name used to locate the transform pickle
    :param set_name: 'training_set', 'tuning_set' or 'testing_set'
    :param features: optional iterable of feature names to restrict X to
    :return: (original DataFrame, feature matrix X, label Series Y)
    """
    filename = get_transform_filename(dataset, language, vuln_type)
    with open(filename, 'rb') as pickle_file:
        set_dfs = pickle.load(pickle_file)
    orig = set_dfs[set_name][language][vuln_type]
    X = orig.drop(['file_name', 'line', 'vulnerable'], axis=1)
    Y = orig['vulnerable']
    if features is None:
        print_notice("Using set '%s' with %d features" % (set_name, len(X.columns)))
        return orig, X, Y
    print_notice("Using set '%s' with %d features" % (set_name, len(features)))
    # BUG FIX: the old list(set(features).intersection(X.columns)) made the
    # column order depend on set iteration order, i.e. it could differ from
    # run to run.  Preserve X's own (deterministic) column order instead.
    wanted = set(features)
    cols = [c for c in X.columns if c in wanted]
    return orig, X[cols], Y
def display_pr_curve(title, model, X, Y): probas = model.predict_proba(X) # Compute Precision-Recall and plot curve precision, recall, area = get_auc_score(Y, probas) roc_score = roc_auc_score(Y, probas[:, 1]) print_notice("AUC-PR: %0.2f" % area) print_notice("AUC-ROC: %0.2f" % roc_score) # Plot Precision-Recall curve plt.clf() plt.plot(recall, precision, lw=2, color='navy', label='PR curve') plt.xlabel('Recall') plt.ylabel('Precision') plt.ylim([0.0, 1.0]) plt.xlim([0.0, 1.0]) plt.title('%s AUC-PR=%.2f' % (title, area)) plt.tight_layout() plt.legend(loc="lower left") plt.show()
def select_best_model(X, Y, X_tuning, Y_tuning):
    """Grid-search hyperparameters and return the model with the best tuning AUC-PR.

    BUG FIX: the best model used to be found, printed, and then discarded
    (the function implicitly returned None).  It is now kept and returned;
    callers that ignored the return value are unaffected.

    :param X, Y: training data
    :param X_tuning, Y_tuning: held-out tuning data used for model selection
    :return: the fitted model with the highest tuning-set AUC-PR
    """
    model_type = config.get_str('model', 'Model')
    best_model = None
    best_model_i = -1
    best_auc_pr = -1
    combinations = get_hyperparameter_combinations(model_type)
    for i, params in enumerate(combinations):
        print_notice("Generating model %d / %d with parameters: %s" %
                     (1 + i, len(combinations), str(params)))
        model = create_model(model_type, params)
        model.fit(X, Y)
        probas = model.predict_proba(X_tuning)
        _, _, auc_pr = metrics.get_auc_score(Y_tuning, probas)
        print_notice("Model %d has AUC-PR %.2f" % (1 + i, auc_pr))
        if auc_pr > best_auc_pr:
            best_model_i = i
            best_auc_pr = auc_pr
            best_model = model
    print_notice(
        "Model %d generated best AUC-PR (%.2f) with parameters: %s" %
        (1 + best_model_i, best_auc_pr, str(combinations[best_model_i])))
    return best_model
def cmd_filter_features():
    """Drop features whose name starts with the configured filter prefix.

    Initializes `selected_features` from the training set's columns on the
    first call, then filters it in place (module-level global).
    """
    print_banner("Filtering features")
    global selected_features
    start_string = config.get_str('model', 'FeatureFilterStartString')
    if selected_features is None:
        sel_ds = config.get_str('dataset', 'SelectedDataset')
        X, Y = transform.get_xy(sel_ds, 'training_set', language, vuln_type)
        selected_features = X.columns.values
    selected_features = [feature for feature in selected_features
                         if not feature.startswith(start_string)]
    for n, feature in enumerate(selected_features, start=1):
        print_notice("%d. %s" % (n, feature))
def cmd_count_sets():
    """Print vulnerable / non-vulnerable line counts per set and in total."""
    sel_ds = config.get_str('dataset', 'SelectedDataset')
    _, Y_training = transform.get_xy(sel_ds, 'training_set', language, vuln_type, None)
    _, Y_tuning = transform.get_xy(sel_ds, 'tuning_set', language, vuln_type, None)
    _, Y_testing = transform.get_xy(sel_ds, 'testing_set', language, vuln_type, None)
    non_vuln = 0
    vuln = 0
    for setname, labels in zip(['training', 'tuning', 'testing'],
                               [Y_training, Y_tuning, Y_testing]):
        # Count matching labels directly; the old len(df.loc[df[0:] == 0])
        # sliced the entire Series for no reason before comparing.
        nv = int((labels == 0).sum())
        v = int((labels == 1).sum())
        non_vuln += nv
        vuln += v
        print_notice("%s set: non-vulnerable lines %d, vulnerable lines %d" %
                     (setname, nv, v))
    print_notice("total: non-vulnerable lines %d, vulnerable lines %d" %
                 (non_vuln, vuln))
def cmd_compare_tools():
    """Compare the model's file-level results against external tools' CSV reports.

    Picks the threshold that maximizes F1 on the tuning set, prints our own
    results at that threshold, then one comparison section per configured tool.
    """
    global train_features
    print_banner("Comparing results")
    sel_ds = config.get_str('dataset', 'SelectedDataset')
    sel_vt = config.get_str('dataset', 'SelectedVulnerabilityType')
    if train_features is None:
        X, _ = transform.get_xy(sel_ds, 'training_set', language, vuln_type,
                                selected_features)
        X.sort_index(axis=1, inplace=True)
        train_features = X.columns
    orig_tuning, X_tuning, _ = transform.get_xy_with_orig(
        sel_ds, 'tuning_set', language, vuln_type, selected_features)
    X_tuning = sync_features(X_tuning)
    c = find_best_threshold(model, orig_tuning, X_tuning)
    print_notice("Preferred threshold (Y > c): %.2f" % c)
    orig, X, _ = transform.get_xy_with_orig(sel_ds, 'testing_set', language,
                                            vuln_type, selected_features)
    separator = '-' * 55
    print_notice(separator)
    print_notice("Our results")
    print_model_results(model, orig, X, c)
    for tool, file_name in config.get_items('tools'):
        print_notice(separator)
        print_notice('Comparing against tool: %s' % tool)
        compare_results(file_name, orig, sel_vt)
def fit(self, X, Y):
    """Train the Weka BayesNet (TAN) classifier on (X, Y) via a temporary ARFF file.

    NOTE: temporarily adds a 'class' column to the caller's X; it is removed
    again before this method returns.

    :param X: feature DataFrame
    :param Y: label Series
    """
    # Create combined dataframe of X and Y.
    # FIX: Series.as_matrix() was removed in pandas 1.0; .values behaves the
    # same on every pandas version.
    X['class'] = Y.values
    filename = self.to_arff(X, False)
    # Remove class column
    del X['class']
    if not jvm.started:
        print_notice("Starting JVM")
        jvm.start()
    loader = Loader("weka.core.converters.ArffLoader")
    self.train_data = loader.load_file(filename)
    self.train_data.class_is_last()
    self.classifier = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=["-Q", "weka.classifiers.bayes.net.search.local.TAN", "--",
                 "-S", self.score_type, self.mbc,
                 "-E", "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                 "--", "-A", "0.9"])
    self.classifier.build_classifier(self.train_data)
def transform_sets(dataset, sets, language):
    """Transform each set (training/tuning/testing) into feature DataFrames and pickle them.

    Files are transformed in parallel in batches of `num_processes`; batch
    results are accumulated as sparse chunks and concatenated once per set to
    limit peak memory.  One pickle is written per vulnerability type; existing
    pickles are skipped.
    """
    from itertools import islice  # local import so this fix is self-contained

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    num_processes = 100
    set_dfs = {'training_set': {language: dict()},
               'tuning_set': {language: dict()},
               'testing_set': {language: dict()}}
    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_transform_filename(dataset, language, vuln_type)
            if os.path.isfile(filename):
                print_notice("Pickle file %s already created" % filename)
                continue
            f = transform_file(flaw_dict[vuln_type], mark_whole_path)
            for set_type in ['training_set', 'tuning_set', 'testing_set']:
                counter = 0
                total = len(sets[set_type][language][vuln_type])
                generator = iter(sets[set_type][language][vuln_type])
                chunks = []
                while True:
                    # BUG FIX: list(next(g) for _ in range(n)) raises
                    # RuntimeError on exhaustion under PEP 479 (Python 3.7+);
                    # islice truncates the final batch cleanly instead.
                    next_elements = list(islice(generator, num_processes))
                    counter += len(next_elements)
                    if not next_elements:
                        break
                    start = timeit.default_timer()
                    res = pool.map(f, next_elements)
                    chunk = pd.concat([df.to_sparse(fill_value=0)
                                       for df in res if df is not None],
                                      ignore_index=True)
                    chunk.fillna(0, inplace=True)
                    print_notice("Chunk columns: %d memory usage: %d" %
                                 (len(chunk.columns), chunk.memory_usage().sum()))
                    chunks.append(chunk)
                    print_notice(
                        "%s %s %s: %d/%d (run took %.2f secs)" %
                        (language, vuln_type, set_type, counter, total,
                         timeit.default_timer() - start))
                print_notice("Concatenating %d data frames, this will take a while"
                             % len(chunks))
                if len(chunks) > 0:
                    set_dfs[set_type][language][vuln_type] = pd.concat(chunks, ignore_index=True)
                    set_dfs[set_type][language][vuln_type].fillna(0, inplace=True)
                    set_dfs[set_type][language][vuln_type] = \
                        set_dfs[set_type][language][vuln_type].to_dense()
            with open(filename, 'wb') as pickle_file:
                # Protocol version 4 supports large objects (> 4GB)
                pickle.dump(set_dfs, pickle_file, protocol=4)
            # Reset so the next vulnerability type's pickle doesn't carry
            # this one's data along.
            set_dfs = {'training_set': {language: dict()},
                       'tuning_set': {language: dict()},
                       'testing_set': {language: dict()}}