Example #1
def cmd_store_outliers():
    print_banner("Store outliers")

    global model

    # Outlier threshold handed to data.store_data(); lowered to 0.0 below when
    # a custom test set is used
    threshold = 0.5

    if config.get_boolean('analysis', 'UseCustomTestSet'):
        print_notice("Creating a custom test set")
        sel_ds = 'Custom'
        threshold = 0.0

        my_sets = dataset_factory.get_dataset(sel_ds).get_sets()

        transform.transform_sets(sel_ds, my_sets, language)

        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set',
                                                language, vuln_type,
                                                selected_features)

        # TODO Delete transforms and data set
        #dataset_factory.get_dataset(sel_ds).delete_sets()

    else:
        sel_ds = config.get_str('dataset', 'SelectedDataset')

        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set',
                                                language, vuln_type,
                                                selected_features)

    X = sync_features(X)

    data.store_data(model, orig, X, Y, just_outliers=True, threshold=threshold)
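
The sync_features() call above implies that the freshly transformed test set has to be made column-compatible with the features the stored model was trained on. A minimal sketch of such an alignment step, assuming the training-time feature names are available as a hypothetical model_features list (this is not the project's actual helper), could look like this:

import pandas as pd


def sync_features_sketch(X, model_features):
    """Align X's columns with the training-time feature list: columns the model
    never saw are dropped and missing features are added as zero-filled
    columns, so the column set and order match what the estimator expects."""
    # reindex() drops extra columns and adds missing ones (filled with 0) in one call
    return X.reindex(columns=model_features, fill_value=0)


X = pd.DataFrame({'a': [1, 2], 'c': [3, 4]})
print(sync_features_sketch(X, model_features=['a', 'b']))  # keeps 'a', adds zero 'b', drops 'c'
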
Example #2
def create_popular_features(dataset, sets, language):

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    # Used both as the Pool size and as the batch size per map() call
    num_processes = 100

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_features_filename(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)
                set_type = 'training_set'

                counter = 0

                total = len(sets[set_type][language][vuln_type])
                generator = iter(sets[set_type][language][vuln_type])

                # Fitted incrementally, one batch at a time, via partial_fit() below
                pf = PopularFeatures(num_features=200)

                while True:
                    # Take up to num_processes items; putting range() first in the
                    # zip() means the generator is never over-consumed, and the
                    # StopIteration/RuntimeError leak of next() inside a generator
                    # expression (PEP 479, Python 3.7+) is avoided.
                    next_elements = [item for _, item in zip(range(num_processes), generator)]
                    counter += len(next_elements)

                    if not next_elements:
                        break

                    start = timeit.default_timer()
                    res = pool.map(f, next_elements)

                    for df in res:
                        if df is None:
                            continue

                        if not all(x in df.columns.values for x in ['file_name', 'line', 'vulnerable', 'tainted']):
                            print_warning("Could not find the right columns in data frame. Ignoring.")
                            continue

                        # We drop these columns so our feature filter can ignore them
                        df.drop(['file_name', 'line', 'vulnerable', 'tainted'], axis=1, inplace=True)

                        pf.partial_fit(df)

                    print_notice(
                        "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, total,
                                                                  timeit.default_timer() - start))

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(pf, pickle_file, protocol=4)

            else:
                print_notice("Pickle file %s already created" % filename)
Example #3
def select_model(language, vuln_type, X, Y):
    model_type = config.get_str('model', 'Model')
    # Optional per-model, per-vulnerability hyperparameter dictionary
    params = config.get_dict('model',
                             model_type + vuln_type + 'Params',
                             optional=True)
    model = create_model(model_type, params)

    model.fit(X, Y)

    if model_type == "DecisionTreeClassifier" and config.get_boolean(
            'model', 'GenerateDecisionTreeGraph'):
        create_dt_graph("%s_%s" % (language, vuln_type), model,
                        X.columns.values)

    return model
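
create_model() is not part of this listing. Given that model_type names a scikit-learn estimator such as DecisionTreeClassifier and params is an optional keyword dictionary, a plausible factory could be sketched as follows (the estimator table and the None handling are assumptions, not the project's code):

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


def create_model_sketch(model_type, params=None):
    """Instantiate an estimator by name, forwarding optional keyword params."""
    estimators = {
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'RandomForestClassifier': RandomForestClassifier,
    }
    if model_type not in estimators:
        raise ValueError("Unknown model type: %s" % model_type)
    # params is None when the optional config entry is absent
    return estimators[model_type](**(params or {}))


model = create_model_sketch('DecisionTreeClassifier', {'max_depth': 5})
print(model)
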
Example #4
def transform_sets(dataset, sets, language):

    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]
    # Used both as the Pool size and as the batch size per map() call
    num_processes = 100

    set_dfs = {'training_set': {language: dict()}, 'tuning_set': {language: dict()},
               'testing_set': {language: dict()}}

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_transform_filename(dataset, language, vuln_type)
            # pf = get_popular_features(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)

                for set_type in ['training_set', 'tuning_set', 'testing_set']:
                    # counter = 0
                    #
                    # l = len(sets[set_type][language][vuln_type])
                    # generator = iter(sets[set_type][language][vuln_type])
                    #
                    # ff = BatchedPCA(all_features=pf.get_all_features(), n_components=30)
                    #
                    # # First we determine popular columns
                    # if set_type == 'training_set':
                    #     while True:
                    #         next_elements = list(next(generator) for _ in range(num_processes))
                    #         counter += len(next_elements)
                    #
                    #         if not next_elements:
                    #             break
                    #
                    #         start = timeit.default_timer()
                    #         res = pool.map(f, next_elements)
                    #
                    #         chunk = pd.concat([df.to_sparse(fill_value=0) for df in res if df is not None],
                    #                           ignore_index=True)
                    #         chunk.fillna(0, inplace=True)
                    #         print_notice("Chunk columns: %d memory usage: %d" % (len(chunk.columns),
                    #                                                              chunk.memory_usage().sum()))
                    #
                    #         # We drop these columns so our feature filter can ignore them
                    #         chunk.drop(['file_name', 'line', 'vulnerable', 'tainted'], axis=1, inplace=True)
                    #
                    #         ff.partial_fit(chunk)
                    #
                    #         print_notice(
                    #             "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                    #                                                       timeit.default_timer() - start))
                    #
                    # # Create a new transform function with our feature filter
                    # f = transform_file(flaw_dict[vuln_type], mark_whole_path, feature_filter=ff)

                    counter = 0

                    total = len(sets[set_type][language][vuln_type])
                    generator = iter(sets[set_type][language][vuln_type])

                    chunks = []

                    while True:
                        # Take up to num_processes items; putting range() first in the
                        # zip() means the generator is never over-consumed, and the
                        # StopIteration/RuntimeError leak of next() inside a generator
                        # expression (PEP 479, Python 3.7+) is avoided.
                        next_elements = [item for _, item in zip(range(num_processes), generator)]
                        counter += len(next_elements)

                        if not next_elements:
                            break

                        start = timeit.default_timer()
                        res = pool.map(f, next_elements)

                        # DataFrame.to_sparse()/to_dense() were removed in pandas 1.0,
                        # so this code needs pandas < 1.0 (or a port to the
                        # pd.SparseDtype / .sparse accessor API).
                        chunk = pd.concat([df.to_sparse(fill_value=0) for df in res if df is not None],
                                          ignore_index=True)
                        chunk.fillna(0, inplace=True)
                        print_notice("Chunk columns: %d memory usage: %d" % (len(chunk.columns),
                                                                             chunk.memory_usage().sum()))
                        chunks.append(chunk)

                        print_notice(
                            "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, total,
                                                                      timeit.default_timer() - start))

                    print_notice("Concatenating %d data frames, this will take a while" % len(chunks))

                    if len(chunks) > 0:
                        set_dfs[set_type][language][vuln_type] = pd.concat(chunks, ignore_index=True)
                        set_dfs[set_type][language][vuln_type].fillna(0, inplace=True)
                        set_dfs[set_type][language][vuln_type] = set_dfs[set_type][language][vuln_type].to_dense()

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(set_dfs, pickle_file, protocol=4)

                set_dfs = {'training_set': {language: dict()}, 'tuning_set': {language: dict()},
                           'testing_set': {language: dict()}}
            else:
                print_notice("Pickle file %s already created" % filename)