def cmd_store_outliers():
    print_banner("Store outliers")

    global model

    threshold = 0.5

    if config.get_boolean('analysis', 'UseCustomTestSet'):
        print_notice("Creating a custom test set")

        sel_ds = 'Custom'
        threshold = 0.0

        my_sets = dataset_factory.get_dataset(sel_ds).get_sets()
        transform.transform_sets(sel_ds, my_sets, language)

        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set', language, vuln_type, selected_features)

        # TODO Delete transforms and data set
        # dataset_factory.get_dataset(sel_ds).delete_sets()
    else:
        sel_ds = config.get_str('dataset', 'SelectedDataset')
        orig, X, Y = transform.get_xy_with_orig(sel_ds, 'testing_set', language, vuln_type, selected_features)

    X = sync_features(X)

    data.store_data(model, orig, X, Y, just_outliers=True, threshold=threshold)
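# `sync_features` is not shown in this excerpt. cmd_store_outliers() relies on it to make
# the test-set feature matrix line up with the columns the model was fitted on. The helper
# below is a hypothetical sketch of that step, assuming the training-time column order is
# available as a plain list; it is not the project's actual implementation.

def sync_features_sketch(X, training_columns):
    """Align X to the training-time feature columns (hypothetical helper)."""
    # reindex() drops columns the model never saw, adds missing ones as all-zero
    # columns, and restores the training column order in one call.
    return X.reindex(columns=training_columns, fill_value=0)

# Example usage (assumes pandas is imported as pd, as elsewhere in this module):
#   X_aligned = sync_features_sketch(X_test, list(model_feature_names))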
def create_popular_features(dataset, sets, language):
    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]

    num_processes = 100

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_features_filename(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)

                set_type = 'training_set'

                counter = 0
                l = len(sets[set_type][language][vuln_type])
                generator = iter(sets[set_type][language][vuln_type])

                pf = PopularFeatures(num_features=200)

                while True:
                    # Pull the next batch of up to num_processes files; zip() stops
                    # cleanly once the training set is exhausted.
                    next_elements = [e for _, e in zip(range(num_processes), generator)]
                    counter += len(next_elements)

                    if not next_elements:
                        break

                    start = timeit.default_timer()
                    res = pool.map(f, next_elements)

                    for df in res:
                        if df is None:
                            continue

                        if not all(x in df.columns.values for x in ['file_name', 'line', 'vulnerable', 'tainted']):
                            print_warning("Could not find the right columns in data frame. Ignoring.")
                            continue

                        # We drop these columns so our feature filter can ignore them
                        df.drop(['file_name', 'line', 'vulnerable', 'tainted'], axis=1, inplace=True)

                        pf.partial_fit(df)

                    print_notice(
                        "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                                                                  timeit.default_timer() - start))

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(pf, pickle_file, protocol=4)
            else:
                print_notice("Pickle file %s already created" % filename)
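# The PopularFeatures class that create_popular_features() streams batches into is not part
# of this excerpt; only partial_fit(df) and (in the commented-out code in transform_sets()
# further down) get_all_features() are visible. The class below is a rough sketch under the
# assumption that it keeps the num_features columns seen in the most transformed files; the
# real selector may use a different criterion.
from collections import Counter

class PopularFeaturesSketch:
    def __init__(self, num_features=200):
        self.num_features = num_features
        self._counts = Counter()

    def partial_fit(self, df):
        # Count each feature once per transformed file, regardless of row count.
        self._counts.update(df.columns.values.tolist())

    def get_all_features(self):
        # The num_features most frequently occurring columns seen so far.
        return [name for name, _ in self._counts.most_common(self.num_features)]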
def select_model(language, vuln_type, X, Y):
    model_type = config.get_str('model', 'Model')
    params = config.get_dict('model', model_type + vuln_type + 'Params', optional=True)

    model = create_model(model_type, params)
    model.fit(X, Y)

    if model_type == "DecisionTreeClassifier" and config.get_boolean('model', 'GenerateDecisionTreeGraph'):
        create_dt_graph("%s_%s" % (language, vuln_type), model, X.columns.values)

    return model
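# create_model(model_type, params) is defined elsewhere; select_model() only needs it to map
# the configured model name plus an optional parameter dict onto an estimator instance. A
# minimal sketch of such a factory, assuming scikit-learn estimators; DecisionTreeClassifier
# is the only type referenced above, the other two entries are purely illustrative and may
# not be supported by the real project.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

_MODEL_CLASSES = {
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'LogisticRegression': LogisticRegression,
}

def create_model_sketch(model_type, params=None):
    """Instantiate the configured estimator with optional per-vulnerability parameters."""
    return _MODEL_CLASSES[model_type](**(params or {}))

# Example: create_model_sketch('DecisionTreeClassifier', {'max_depth': 10})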
def transform_sets(dataset, sets, language):
    mark_whole_path = config.get_boolean('dataset', 'MarkWholePathVulnerable')
    flaw_dict = sets['flaw_dict'][language]

    num_processes = 100

    set_dfs = {'training_set': {language: dict()},
               'tuning_set': {language: dict()},
               'testing_set': {language: dict()}}

    with Pool(processes=num_processes) as pool:
        for vuln_type in config.get_list('dataset', 'Vulnerabilities'):
            filename = get_transform_filename(dataset, language, vuln_type)

            # pf = get_popular_features(dataset, language, vuln_type)

            if not os.path.isfile(filename):
                f = transform_file(flaw_dict[vuln_type], mark_whole_path)

                for set_type in ['training_set', 'tuning_set', 'testing_set']:
                    # counter = 0
                    #
                    # l = len(sets[set_type][language][vuln_type])
                    # generator = iter(sets[set_type][language][vuln_type])
                    #
                    # ff = BatchedPCA(all_features=pf.get_all_features(), n_components=30)
                    #
                    # # First we determine popular columns
                    # if set_type == 'training_set':
                    #     while True:
                    #         next_elements = list(next(generator) for _ in range(num_processes))
                    #         counter += len(next_elements)
                    #
                    #         if not next_elements:
                    #             break
                    #
                    #         start = timeit.default_timer()
                    #         res = pool.map(f, next_elements)
                    #
                    #         chunk = pd.concat([df.to_sparse(fill_value=0) for df in res if df is not None],
                    #                           ignore_index=True)
                    #         chunk.fillna(0, inplace=True)
                    #         print_notice("Chunk columns: %d memory usage: %d" % (len(chunk.columns),
                    #                                                              chunk.memory_usage().sum()))
                    #
                    #         # We drop these columns so our feature filter can ignore them
                    #         chunk.drop(['file_name', 'line', 'vulnerable', 'tainted'], axis=1, inplace=True)
                    #
                    #         ff.partial_fit(chunk)
                    #
                    #         print_notice(
                    #             "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                    #                                                       timeit.default_timer() - start))
                    #
                    #     # Create a new transform function with our feature filter
                    #     f = transform_file(flaw_dict[vuln_type], mark_whole_path, feature_filter=ff)

                    counter = 0

                    l = len(sets[set_type][language][vuln_type])
                    generator = iter(sets[set_type][language][vuln_type])

                    chunks = []

                    while True:
                        # Pull the next batch of up to num_processes files; zip() stops
                        # cleanly once the set is exhausted.
                        next_elements = [e for _, e in zip(range(num_processes), generator)]
                        counter += len(next_elements)

                        if not next_elements:
                            break

                        start = timeit.default_timer()
                        res = pool.map(f, next_elements)

                        # to_sparse() (pandas < 1.0) keeps memory usage down while chunks accumulate
                        chunk = pd.concat([df.to_sparse(fill_value=0) for df in res if df is not None],
                                          ignore_index=True)
                        chunk.fillna(0, inplace=True)
                        print_notice("Chunk columns: %d memory usage: %d" % (len(chunk.columns),
                                                                             chunk.memory_usage().sum()))

                        chunks.append(chunk)

                        print_notice(
                            "%s %s %s: %d/%d (run took %.2f secs)" % (language, vuln_type, set_type, counter, l,
                                                                      timeit.default_timer() - start))

                    print_notice("Concatenating %d data frames, this will take a while" % len(chunks))

                    if len(chunks) > 0:
                        set_dfs[set_type][language][vuln_type] = pd.concat(chunks, ignore_index=True)
                        set_dfs[set_type][language][vuln_type].fillna(0, inplace=True)
                        set_dfs[set_type][language][vuln_type] = set_dfs[set_type][language][vuln_type].to_dense()

                with open(filename, 'wb') as pickle_file:
                    # Protocol version 4 supports large objects (> 4GB)
                    pickle.dump(set_dfs, pickle_file, protocol=4)

                set_dfs = {'training_set': {language: dict()},
                           'tuning_set': {language: dict()},
                           'testing_set': {language: dict()}}
            else:
                print_notice("Pickle file %s already created" % filename)
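# transform_file(flaw_dict, mark_whole_path, feature_filter=None) is the per-file worker
# factory used with pool.map() in both create_popular_features() and transform_sets(); its
# definition is not in this excerpt. The sketch below is an assumption about its shape only:
# it presumes flaw_dict maps a file path to a set of vulnerable line numbers, and it returns
# a picklable callable (functools.partial over a module-level function, since plain closures
# do not survive multiprocessing) producing a DataFrame with the bookkeeping columns
# file_name, line, vulnerable and tainted plus placeholder features.
import functools

import pandas as pd

def _transform_one_file_sketch(path, flaw_dict, mark_whole_path):
    """Turn one source file into a per-line feature frame (placeholder features only)."""
    with open(path, encoding='utf-8', errors='ignore') as fh:
        lines = fh.read().splitlines()

    flawed_lines = flaw_dict.get(path, set())
    rows = []
    for lineno, text in enumerate(lines, start=1):
        vulnerable = lineno in flawed_lines or (mark_whole_path and bool(flawed_lines))
        rows.append({
            'file_name': path,
            'line': lineno,
            'vulnerable': int(vulnerable),
            'tainted': 0,                       # placeholder taint flag
            'token_count': len(text.split()),   # stand-in for the real feature extraction
        })

    return pd.DataFrame(rows) if rows else None

def transform_file_sketch(flaw_dict, mark_whole_path, feature_filter=None):
    # feature_filter mirrors the commented-out BatchedPCA path above but is ignored here.
    return functools.partial(_transform_one_file_sketch, flaw_dict=flaw_dict,
                             mark_whole_path=mark_whole_path)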