Example #1
def cmd_store_outliers():
    print_banner("Store outliers")

    global model

    threshold = 0.5

    if config.get_boolean('analysis', 'UseCustomTestSet'):
        print_notice("Creating a custom test set")
        sel_ds = 'Custom'
        threshold = 0.0

        my_sets = dataset_factory.get_dataset(sel_ds).get_sets()

        transform.transform_sets(sel_ds, my_sets, language)

        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set',
                                                language, vuln_type,
                                                selected_features)

        # TODO Delete transforms and data set
        #dataset_factory.get_dataset(sel_ds).delete_sets()

    else:
        sel_ds = config.get_str('dataset', 'SelectedDataset')

        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set',
                                                language, vuln_type,
                                                selected_features)

    X = sync_features(X)

    data.store_data(model, orig, X, Y, just_outliers=True, threshold=threshold)
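
The command above (like cmd_store_custom and cmd_compare_tools further down) passes the test features through sync_features before scoring, but the helper itself is not listed on this page. A minimal sketch, assuming it simply aligns the columns of X with the train_features index saved in cmd_compare_tools, adding missing columns as zeros and dropping columns the model never saw:

def sync_features(X):
    # Assumed behaviour: reindex the test DataFrame to the training columns.
    # train_features is the pandas Index captured from the training set
    # (see cmd_compare_tools below); missing columns become all-zero.
    return X.reindex(columns=train_features, fill_value=0)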
Example #2
def find_best_threshold(model, orig, X):
    pref_thr = max_f1 = -1

    probas = model.predict_proba(X)

    print_notice("Finding the best threshold value for F1 score")

    for c in frange(0.0, 1.01, 0.01):
        predicted = []
        y_only_filename = []

        orig['predicted'] = probas[:, 1]
        orig['predicted'] = orig['predicted'].apply(lambda x: int(x > c))

        for _, row in orig.groupby(['file_name']).agg({'vulnerable': np.sum, 'predicted': np.sum}).iterrows():
            y_only_filename.append(row['vulnerable'] != 0)
            predicted.append(row['predicted'] != 0)

        # Due to the class imbalance, we use a weighted F1 score
        f1_score = metrics.f1_score(y_only_filename, predicted, average='weighted')

        if f1_score > max_f1:
            max_f1 = f1_score
            pref_thr = c

    return pref_thr
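
The loop above steps through candidate thresholds with frange, a float analogue of range that is not part of the standard library and is not shown here. A minimal sketch, assuming plain additive stepping:

def frange(start, stop, step):
    # Yield start, start + step, ... while the running value stays below stop.
    value = start
    while value < stop:
        yield value
        value += step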
Example #3
def print_help():
    print_notice("Valid commands are:")
    valid_commands = sorted(
        [k[4:] for k in globals().keys() if k[:4] == 'cmd_'])

    print_notice(', '.join(valid_commands))
    exit(0)
Example #4
    def clean_up(self):
        print_notice("Removing temporary files")
        silent_remove(self.train_fn)
        silent_remove(self.test_fn)

        print_notice("Stopping JVM")
        jvm.stop()
Example #5
def compare_results(file_name, orig, sel_vt):
    compare_set = {'SQLi': [], 'XSS': []}

    with open(file_name, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            compare_set[row[0]].append((os.path.realpath(row[1]), row[2]))

    predicted = []
    y_only_filename = []

    for f, row in orig.groupby(['file_name']).agg({'vulnerable': np.sum}).iterrows():
        vuln_file = os.path.realpath(f)

        if row['vulnerable'] == 0:
            y_only_filename.append(0)
        else:
            y_only_filename.append(1)

        if any([el for el in compare_set[sel_vt] if el[0] == vuln_file]):
            predicted.append(1)
        else:
            predicted.append(0)

    print_notice(metrics.classification_report(y_only_filename, predicted, target_names=['not vulnerable', 'vulnerable']))
Example #6
    def create_sets(self):
        language = 'PHP'  # TODO: What are we going to do with Python?
        samate_pickle = config.get_str('SAMATE', 'SamatePickle')

        if not os.path.isfile(samate_pickle):
            training_perc = config.get_int('dataset', 'TrainingPercentage')
            tuning_perc = config.get_int('dataset', 'TuningPercentage')

            training_set = {language: {}}
            tuning_set = {language: {}}
            testing_set = {language: {}}
            flaw_dict = {language: {}}

            for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
                flaws, lst = self.get_file_list(vuln_type)

                flaw_dict[language][vuln_type] = flaws
                shuffle(lst)

                training_set[language][vuln_type] = slice_perc(lst, 0, training_perc)
                tuning_set[language][vuln_type] = slice_perc(lst, training_perc, training_perc + tuning_perc)
                testing_set[language][vuln_type] = slice_perc(lst, training_perc + tuning_perc, 100)

            dataset = {'training_set': training_set, 'tuning_set': tuning_set, 'testing_set': testing_set,
                       'flaw_dict': flaw_dict}

            # Save to pickle file for future use
            with open(samate_pickle, 'wb') as pickle_file:
                pickle.dump(dataset, pickle_file)
        else:
            print_notice("Pickle file already created")
Example #7
def select_features(X, Y):
    k = config.get_int('model', 'kFeatures')

    print_notice("Sorting features based on chi^2 (k=%d):" % k)

    if k < 0 or k > len(X.columns):
        print_error("k should be >= 0 and <= %d (n_features). Got %d." %
                    (len(X.columns), k))
        exit(-1)

    skb = SelectKBest(chi2, k=k)
    # fit() is enough here; the transformed matrix itself is never used
    skb.fit(X, Y)

    support = skb.get_support()

    n = 1
    features = dict()

    for col_name, score in zip(X.columns.values[support],
                               skb.scores_[support]):
        features[col_name] = score

    for feature, score in sorted(features.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True):
        print_notice("%d. %s %.2f" % (n, feature, score))
        n += 1

    return X.columns.values[support]
Example #8
    def delete_sets(self):
        pkl = self.pickle_path

        if os.path.isfile(pkl):
            print_notice("Removing %s" % pkl)
            os.remove(pkl)
        else:
            print_warning("Unable to remove %s. File does not exist." % pkl)
Example #9
def print_metrics(model, X, Y):
    predicted = model.predict(X)
    probas = model.predict_proba(X)

    print_notice("Brier score (class: not vulnerable) %.4f" % calculate_brier_score(probas, Y, cls=0))
    print_notice("Brier score (class: vulnerable) %.4f" % calculate_brier_score(probas, Y, cls=1))
    print_notice("Brier score %.4f" % _brier_score_loss(Y, probas[:, 1]))
    print_notice("Accuracy %.2f" % metrics.accuracy_score(Y, predicted))
    print_notice(metrics.classification_report(Y, predicted, target_names=['not vulnerable', 'vulnerable']))
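
print_metrics reports a per-class Brier score through calculate_brier_score, whose implementation is not part of this listing. A hedged sketch of one common definition, the mean squared difference between the predicted probability of a class and its 0/1 indicator:

import numpy as np

def calculate_brier_score(probas, Y, cls):
    # probas: (n_samples, n_classes) array from predict_proba
    # Y: true labels; cls: the class the score is computed for (assumed semantics)
    indicator = (np.asarray(Y) == cls).astype(float)
    return np.mean((probas[:, cls] - indicator) ** 2)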
Example #10
    def to_arff(self, df, test):
        if test:
            filename = self.test_fn
        else:
            filename = self.train_fn

        print_notice("Writing ARFF data to filename %s" % filename)

        pandas2arff(df, filename)

        return filename
Example #11
def create_popular_features(dataset, sets, language):

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    num_processes = 100

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_features_filename(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)
                set_type = 'training_set'

                counter = 0

                l = len(sets[set_type][language][vuln_type])
                generator = iter(sets[set_type][language][vuln_type])

                pf = PopularFeatures(num_features=200)

                while True:
                    # islice avoids the StopIteration that the original generator
                    # expression leaked (a RuntimeError on Python 3.7+).
                    # Requires: from itertools import islice
                    next_elements = list(islice(generator, num_processes))
                    counter += len(next_elements)

                    if not next_elements:
                        break

                    start = timeit.default_timer()
                    res = pool.map(f, next_elements)

                    for df in res:
                        if df is None:
                            continue

                        if not all(x in df.columns.values for x in ['file_name', 'line', 'vulnerable', 'tainted']):
                            print_warning("Could not find the right columns in data frame. Ignoring.")
                            continue

                        # We drop these columns so our feature filter can ignore them
                        df.drop(['file_name', 'line', 'vulnerable', 'tainted'], axis=1, inplace=True)

                        pf.partial_fit(df)

                    print_notice(
                        "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                                                                  timeit.default_timer() - start))

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(pf, pickle_file, protocol=4)

            else:
                print_notice("Pickle file %s already created" % filename)
Example #12
    def get_sets(self):
        pkl = self.pickle_path

        # Load the pickle file
        print_notice("Loading pickle file")

        with open(pkl, 'rb') as pickle_file:
            sets = pickle.load(pickle_file)

        if self.sampling_perc['SQLi'] < 1.0 or self.sampling_perc['XSS'] < 1.0:
            return self.sample_set(sets)

        return sets
Example #13
def print_model_results(model, orig, X, c):
    probas = model.predict_proba(X)

    predicted = []
    y_only_filename = []

    orig['predicted'] = probas[:, 1]
    orig['predicted'] = orig['predicted'].apply(lambda x: int(x > c))

    for _, row in orig.groupby(['file_name']).agg({'vulnerable': np.sum, 'predicted': np.sum}).iterrows():
        y_only_filename.append(row['vulnerable'] != 0)
        predicted.append(row['predicted'] != 0)

    print_notice(metrics.classification_report(y_only_filename, predicted, target_names=['not vulnerable', 'vulnerable']))
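
The per-file reduction used here (and in find_best_threshold and compare_results) sums the line-level labels and predictions per file_name and treats any non-zero sum as "file is vulnerable". A toy illustration of that reduction, with made-up file names:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'file_name': ['a.php', 'a.php', 'b.php'],
    'vulnerable': [0, 1, 0],
    'predicted': [0, 0, 1],
})
agg = toy.groupby(['file_name']).agg({'vulnerable': np.sum, 'predicted': np.sum})
# a.php is truly vulnerable but predicted clean; b.php is the reverse
print((agg != 0).astype(int))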
Example #14
def store_data(model, orig, X, Y, just_outliers, threshold=0.5):
    outliers_file = os.path.join(config.get_str('analysis', 'OutliersPath'),
                                 'outliers.csv')
    model_type = config.get_str('model', 'Model')

    probas = model.predict_proba(X)

    df = X.copy()
    df[['file_name', 'line']] = orig[['file_name', 'line']]
    df['actual'] = Y
    df['predict_proba'] = probas[:, 1]
    df['predicted'] = (df['predict_proba'] > threshold)

    if model_type == 'DecisionTreeClassifier':
        print_notice("Adding decision paths to the data as model is a DT")
        node_indicator = model.decision_path(X)

        for i in df.index:
            df.loc[i, 'path'] = str(
                node_indicator.indices[node_indicator.indptr[i]:node_indicator.
                                       indptr[i + 1]])

    print_notice("Storing in file %s" % outliers_file)

    if just_outliers:
        # Positions where the thresholded prediction disagrees with the actual label
        indices = np.flatnonzero(df['predicted'] - Y)
        print_notice("Number of outliers %d" % indices.size)
        df.iloc[indices].to_csv(outliers_file)
    else:
        print_notice("Number of records %d" % len(df.index))
        df.to_csv(outliers_file)
Example #15
    def create_sets(self):
        source_dir = config.get_str('analysis', 'CustomTestSet')
        custom_pickle = config.get_str('analysis', 'CustomPickle')
        languages = config.get_list('dataset', 'Languages')
        vulnerabilities = config.get_list('dataset', 'Vulnerabilities')

        if not os.path.isfile(custom_pickle):

            dataset = self.create_list(source_dir, languages, vulnerabilities)

            # Save to pickle file for future use
            with open(custom_pickle, 'wb') as pickle_file:
                pickle.dump(dataset, pickle_file)
        else:
            print_notice("Pickle file already created")
Example #16
def cmd_store_custom():
    print_banner("Store custom test set results")

    global model

    print_notice("Creating a custom test set")
    sel_ds = 'Custom'

    my_sets = dataset_factory.get_dataset(sel_ds).get_sets()

    transform.transform_sets(sel_ds, my_sets, language)

    orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set', language,
                                            vuln_type, selected_features)

    X = sync_features(X)

    data.store_data(model, orig, X, Y, just_outliers=True, threshold=0.0)
Example #17
def create_dt_graph(title, model, features):
    graph_dir = config.get_str('model', 'DecisionTreeGraphDirectory')

    dot_file = os.path.join(graph_dir, '%s.dot' % title)
    png_file = os.path.join(graph_dir, '%s.png' % title)

    print_notice("Creating Decision Tree graph in %s" % png_file)

    # Write DOT file
    tree.export_graphviz(model,
                         out_file=dot_file,
                         feature_names=features,
                         filled=True,
                         rounded=True,
                         proportion=True,
                         node_ids=True)

    # Convert DOT to PNG
    os.system("dot -Tpng %s >%s" % (dot_file, png_file))
Example #18
def delete_transforms():
    remove = False

    for dataset in ['NVD', 'SAMATE']:
        for language in config.get_list('dataset', 'Languages'):
            for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
                transform_filename = get_transform_filename(
                    dataset, language, vuln_type)
                features_filename = get_features_filename(
                    dataset, language, vuln_type)

                for f in [transform_filename, features_filename]:
                    if os.path.isfile(f):
                        print_notice("Removing %s" % f)
                        os.remove(f)
                        remove = True

    if not remove:
        print_warning("Could not find any transform files to remove.")
Example #19
def get_xy_with_orig(dataset, set_name, language, vuln_type, features=None):
    filename = get_transform_filename(dataset, language, vuln_type)

    with open(filename, 'rb') as pickle_file:
        set_dfs = pickle.load(pickle_file)

        orig = set_dfs[set_name][language][vuln_type]
        X = set_dfs[set_name][language][vuln_type].drop(['file_name', 'line', 'vulnerable'], axis=1)
        Y = set_dfs[set_name][language][vuln_type]['vulnerable']

    if features is not None:
        # Keep only the requested features that actually exist in this set
        X = X[list(set(features).intersection(X.columns))]

    print_notice("Using set '%s' with %d features" % (set_name, len(X.columns)))

    return orig, X, Y
Example #20
def display_pr_curve(title, model, X, Y):
    probas = model.predict_proba(X)

    # Compute Precision-Recall and plot curve
    precision, recall, area = get_auc_score(Y, probas)

    roc_score = roc_auc_score(Y, probas[:, 1])
    print_notice("AUC-PR: %0.2f" % area)
    print_notice("AUC-ROC: %0.2f" % roc_score)

    # Plot Precision-Recall curve
    plt.clf()
    plt.plot(recall, precision, lw=2, color='navy', label='PR curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('%s AUC-PR=%.2f' % (title, area))
    plt.tight_layout()
    plt.legend(loc="lower left")
    plt.show()
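
display_pr_curve (and select_best_model in the next example) obtain the precision/recall arrays and the PR-curve area from get_auc_score, which is not listed here. A minimal sketch built on scikit-learn, assuming the probabilities come straight from predict_proba:

from sklearn.metrics import auc, precision_recall_curve

def get_auc_score(Y, probas):
    # Precision-recall curve for the positive ("vulnerable") class and its area
    precision, recall, _ = precision_recall_curve(Y, probas[:, 1])
    return precision, recall, auc(recall, precision)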
Example #21
def select_best_model(X, Y, X_tuning, Y_tuning):
    model_type = config.get_str('model', 'Model')
    best_model_i = -1
    best_auc_pr = -1

    combinations = get_hyperparameter_combinations(model_type)

    for i in range(len(combinations)):

        print_notice("Generating model %d / %d with parameters: %s" %
                     (1 + i, len(combinations), str(combinations[i])))

        model = create_model(model_type, combinations[i])

        model.fit(X, Y)

        probas = model.predict_proba(X_tuning)

        _, _, auc_pr = metrics.get_auc_score(Y_tuning, probas)

        print_notice("Model %d has AUC-PR %.2f" % (1 + i, auc_pr))

        if auc_pr > best_auc_pr:
            best_model_i = i
            best_auc_pr = auc_pr

    print_notice(
        "Model %d generated best AUC-PR (%.2f) with parameters: %s" %
        (1 + best_model_i, best_auc_pr, str(combinations[best_model_i])))
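
select_best_model iterates over whatever get_hyperparameter_combinations returns for the configured model type. One simple way to produce such a list is scikit-learn's ParameterGrid; the sketch below assumes hard-coded grids purely for illustration (the real project presumably reads them from its configuration):

from sklearn.model_selection import ParameterGrid

def get_hyperparameter_combinations(model_type):
    # Hypothetical per-model grids; each entry expands to a list of parameter dicts
    grids = {
        'DecisionTreeClassifier': {'max_depth': [5, 10, None], 'min_samples_leaf': [1, 5]},
        'RandomForestClassifier': {'n_estimators': [100, 200], 'max_depth': [10, None]},
    }
    return list(ParameterGrid(grids.get(model_type, {})))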
Example #22
def cmd_filter_features():
    print_banner("Filtering features")

    global selected_features

    start_string = config.get_str('model', 'FeatureFilterStartString')

    if selected_features is None:
        sel_ds = config.get_str('dataset', 'SelectedDataset')

        X, Y = transform.get_xy(sel_ds, 'training_set', language, vuln_type)

        selected_features = X.columns.values

    selected_features = [
        feature for feature in selected_features
        if not feature.startswith(start_string)
    ]
    n = 1

    for feature in selected_features:
        print_notice("%d. %s" % (n, feature))
        n += 1
Example #23
def cmd_count_sets():
    sel_ds = config.get_str('dataset', 'SelectedDataset')
    _, Y_training = transform.get_xy(sel_ds, 'training_set', language,
                                     vuln_type, None)
    _, Y_tuning = transform.get_xy(sel_ds, 'tuning_set', language, vuln_type,
                                   None)
    _, Y_testing = transform.get_xy(sel_ds, 'testing_set', language, vuln_type,
                                    None)

    non_vuln = 0
    vuln = 0

    for setname, df in zip(['training', 'tuning', 'testing'],
                           [Y_training, Y_tuning, Y_testing]):
        nv = int((df == 0).sum())
        v = int((df == 1).sum())
        non_vuln += nv
        vuln += v
        print_notice("%s set: non-vulnerable lines %d, vulnerable lines %d" %
                     (setname, nv, v))

    print_notice("total: non-vulnerable lines %d, vulnerable lines %d" %
                 (non_vuln, vuln))
Example #24
def cmd_compare_tools():
    global train_features

    print_banner("Comparing results")

    sel_ds = config.get_str('dataset', 'SelectedDataset')
    sel_vt = config.get_str('dataset', 'SelectedVulnerabilityType')

    if train_features is None:
        X, _ = transform.get_xy(sel_ds, 'training_set', language, vuln_type,
                                selected_features)
        X.sort_index(axis=1, inplace=True)

        train_features = X.columns

    orig_tuning, X_tuning, _ = transform.get_xy_with_orig(
        sel_ds, 'tuning_set', language, vuln_type, selected_features)

    X_tuning = sync_features(X_tuning)

    c = find_best_threshold(model, orig_tuning, X_tuning)

    print_notice("Preferred threshold (Y > c): %.2f" % c)

    orig, X, _ = transform.get_xy_with_orig(sel_ds, 'testing_set', language,
                                            vuln_type, selected_features)

    print_notice('-' * 55)
    print_notice("Our results")

    print_model_results(model, orig, X, c)

    for (tool, file_name) in config.get_items('tools'):
        print_notice('-' * 55)
        print_notice('Comparing against tool: %s' % tool)
        compare_results(file_name, orig, sel_vt)
Example #25
    def fit(self, X, Y):
        # Create combined dataframe of X and Y
        # Y.as_matrix() was removed in pandas 1.0; .values is the equivalent call
        X['class'] = Y.values

        filename = self.to_arff(X, False)

        # Remove class column
        del X['class']

        if not jvm.started:
            print_notice("Starting JVM")
            jvm.start()

        loader = Loader("weka.core.converters.ArffLoader")
        self.train_data = loader.load_file(filename)
        self.train_data.class_is_last()

        self.classifier = Classifier(classname="weka.classifiers.bayes.BayesNet",
                                     options=["-Q", "weka.classifiers.bayes.net.search.local.TAN",
                                              "--", "-S", self.score_type, self.mbc,
                                              "-E", "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                                              "--", "-A", "0.9"])

        self.classifier.build_classifier(self.train_data)
Example #26
def transform_sets(dataset, sets, language):

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    num_processes = 100

    set_dfs = {'training_set': {language: dict()}, 'tuning_set': {language: dict()}, 'testing_set': {language: dict()}}

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_transform_filename(dataset, language, vuln_type)
            # pf = get_popular_features(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)

                for set_type in ['training_set', 'tuning_set', 'testing_set']:
                    # (A disabled variant of this loop, kept as a long commented-out block
                    # in the original source, first streamed the training chunks through a
                    # BatchedPCA feature filter built from pf.get_all_features() and then
                    # re-created the transform function as
                    # transform_file(flaw_dict[vuln_type], mark_whole_path, feature_filter=ff).)

                    counter = 0

                    l = len(sets[set_type][language][vuln_type])
                    generator = iter(sets[set_type][language][vuln_type])

                    chunks = []

                    while True:
                        # islice avoids the StopIteration that the original generator
                        # expression leaked (a RuntimeError on Python 3.7+).
                        # Requires: from itertools import islice
                        next_elements = list(islice(generator, num_processes))
                        counter += len(next_elements)

                        if not next_elements:
                            break

                        start = timeit.default_timer()
                        res = pool.map(f, next_elements)

                        # Note: DataFrame.to_sparse()/to_dense() were removed in pandas 1.0;
                        # this code assumes an older pandas release.
                        chunk = pd.concat([df.to_sparse(fill_value=0) for df in res if df is not None],
                                          ignore_index=True)
                        chunk.fillna(0, inplace=True)
                        print_notice("Chunk columns: %d memory usage: %d" % (len(chunk.columns),
                                                                             chunk.memory_usage().sum()))
                        chunks.append(chunk)

                        print_notice(
                            "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                                                                      timeit.default_timer() - start))

                    print_notice("Concatenating %d data frames, this will take a while" % len(chunks))

                    if len(chunks) > 0:
                        set_dfs[set_type][language][vuln_type] = pd.concat(chunks, ignore_index=True)
                        set_dfs[set_type][language][vuln_type].fillna(0, inplace=True)
                        set_dfs[set_type][language][vuln_type] = set_dfs[set_type][language][vuln_type].to_dense()

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(set_dfs, pickle_file, protocol=4)

                set_dfs = {'training_set': {language: dict()}, 'tuning_set': {language: dict()},
                           'testing_set': {language: dict()}}
            else:
                print_notice("Pickle file %s already created" % filename)