def handle_app(app_id, ids_entries, experiment):
    """ Full flow for one classifier. """

    verify_ids_entries(ids_entries, app_id, experiment.storer_printer)

    training, scoring = ids_tools.ids_entries_to_train_test(ids_entries)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training)
    X_test, y_true = IdsConverter().ids_entries_to_X_y(scoring)

    classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
    for classifier in classifiers:
        classifier.fit(X_train)
        y_pred = classifier.predict(X_test)
        experiment.visualise_store("SPEC", app_id, classifier, y_true, y_pred)
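# --- Illustration (not part of the experiment code) ---------------------------
# A minimal, self-contained sketch of the fit/predict pattern used in
# handle_app above: both OneClassSVM and IsolationForest are fitted on
# (assumed benign) training vectors only, and predict() then labels test
# vectors +1 (inlier) or -1 (outlier). The toy data below is made up for
# illustration; only the scikit-learn calls mirror the code above.
def _demo_one_class_pattern():
    import numpy as np
    from sklearn import svm as sk_svm
    from sklearn import ensemble as sk_ens

    X_train = np.random.RandomState(0).normal(size=(100, 3))  # "benign" vectors
    X_test = np.vstack([X_train[:5], np.full((2, 3), 10.0)])   # plus two obvious outliers

    for classifier in [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]:
        classifier.fit(X_train)
        y_pred = classifier.predict(X_test)  # array of +1 / -1 labels
        print(type(classifier).__name__, y_pred)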
def _read_log_lines_then_yield(yielder, first_line):
    """ Read all provided log lines from the given yielder. """

    first_entry = LogEntry.from_log_string(first_line)
    log_entries = [first_entry]

    for line in yielder:
        log_entry = LogEntry.from_log_string(line)
        log_entries.append(log_entry)

    ids_entry_dict = IdsConverter().log_entries_to_ids_entries_dict(log_entries)

    for _, app_entries in ids_entry_dict.items():
        for ids_entry in app_entries:
            yield ids_entry
def run(experiment):
    """ Benchmark the mapping-based converter against the one-hot converter. """

    log_entries = []
    for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
        log_entry = LogEntry.from_log_string(line)
        log_entries.append(log_entry)

    experiment.entries = log_entries

    # Exp 1: map
    time_before_map = time.time()
    OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(), log_entries, experiment)

    # Exp 2: one-hot
    time_after_map_before_one_hot = time.time()
    OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries, experiment)

    time_after_all = time.time()
    time_for_map = time_after_map_before_one_hot - time_before_map
    time_for_one_hot = time_after_all - time_after_map_before_one_hot

    timing_lines = [
        "Benchmark result | %s entries processed | OneClassSVM classifier" % len(log_entries),
        "",
        "Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
        "One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
    ]

    experiment.add_result_file("time_map_vs_onehot", timing_lines)
def _yield_idse_lines(yielder):
    """ Yield (and verify) IDSE lines one by one from the given yielder. """

    converter = IdsConverter()

    for line in yielder:
        app_id, vector, vclass = _process_idse_line(line, converter)
        yield IdsEntry(app_id, vector, vclass)
def read_convert(self, file_path):
    """ Read IDS entries from the given file and convert the result. """

    converter = IdsConverter()

    self.entries = []
    for entry in idse_dao.yield_entries(file_path):
        self.entries.append(entry)
        if len(self.entries) >= 5000000:
            warnings.warn("Skipping remaining entries - limit of 5000000 reached!")
            break

    ids_entries_dict = converter.ids_entries_to_dict(self.entries)
    return ids_entries_dict
def run_cycle_for_app(ids_entries, app_id, percentage_intruded_training, experiment):
    """ Run one train/score cycle for a single app with the given percentage
    of intruded entries in the training data. """

    verify_ids_entries(ids_entries, app_id, experiment.storer_printer)

    training, scoring = CleanTrainingVsDistorted.custom_train_test_split(
        ids_entries, percentage_intruded_training)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training)
    X_test, y_true = IdsConverter().ids_entries_to_X_y(scoring)

    classifier = sk_svm.OneClassSVM()
    name = CleanTrainingVsDistorted.get_name(percentage_intruded_training)

    classifier.fit(X_train)
    y_pred = classifier.predict(X_test)
    experiment.visualise_store(name, app_id, classifier, y_true, y_pred)
def handle_all(experiment):
    """ Full flow for a one-fits-all classifier. """

    from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
    converter = TEMPCONVERTER()

    log_entries = []
    for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
        log_entry = LogEntry.from_log_string(line)
        log_entries.append(log_entry)

    all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries, binary=True)

    training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(all_entries)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

    # Group the scoring entries per app so results can be reported per app id.
    scoring_dict = {}
    for ids_entry in scoring_entries:
        if ids_entry.app_id not in scoring_dict:
            scoring_dict[ids_entry.app_id] = []
        scoring_dict[ids_entry.app_id].append(ids_entry)

    # Train on all entries combined (training_entries)
    classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
    for classifier in classifiers:
        classifier.fit(X_train)

    # Score separately for each app (scoring_dict)
    for app_id, app_entries in util.seqr.yield_items_in_key_order(scoring_dict):
        X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
        y_preds = [clf.predict(X_test) for clf in classifiers]
        for clf, y_pred in zip(classifiers, y_preds):
            experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
def _score_pr(file_path):
    """ Train a one-class SVM per app and print accuracy, precision, recall
    and a confusion matrix. """

    printer = util.prtr.Printer()
    squelcher = util.prtr.Printer(squelch=True)
    converter = IdsConverter()

    log_entries = _read_file_flow(file_path)

    scores_acc = _empty_app_id_dict()
    scores_prec = _empty_app_id_dict()
    scores_rec = _empty_app_id_dict()

    printer.prt("Preparing... ", newline=False)

    # converted_entries: [(app_id, vector, class)]
    converted_entries = []
    for log_entry in log_entries:
        converted_entries.append(
            converter.log_entry_to_prepared_tuple(log_entry, binary=True))

    printer.prt("Filtering... ", newline=False)
    train_entries, test_entries = ids_tools.converted_entries_to_train_test(converted_entries)

    printer.prt("Splitting... ", newline=False)
    train_dict = converter.prepared_tuples_to_train_dict(train_entries, squelcher)
    test_dict = converter.prepared_tuples_to_train_dict(test_entries, squelcher)

    result_table = []
    result_table.append(["App id", "Actual (+)", "Actual (-)"])

    printer.prt("Scoring... ")

    for app_id in converter.app_ids:
        X_train, y_train = train_dict[app_id]
        X_test, y_test = test_dict[app_id]

        clf = sklearn.svm.OneClassSVM(random_state=0)
        clf.fit(X_train)
        result = clf.predict(X_test)

        # TODO MOAR
        warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

        scores_acc[app_id].append(sk_met.accuracy_score(y_test, result))
        scores_prec[app_id].append(sk_met.precision_score(y_test, result))
        scores_rec[app_id].append(sk_met.recall_score(y_test, result))

        tn, fp, fn, tp = sk_met.confusion_matrix(y_test, result).ravel()
        result_table.append(["{} (+)".format(app_id), tp, fp])
        result_table.append(["{} (-)".format(" " * len(app_id)), fn, tn])

        dash = "-" * 10
        result_table.append([dash, dash, dash])

    _print_scores(scores_acc, printer, headline="Accuracy")
    _print_scores(scores_prec, printer, headline="Precision")
    _print_scores(scores_rec, printer, headline="Recall")
    util.outp.print_table(result_table, headline="Confusion matrix", printer=printer)
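# --- Illustration (not part of the experiment code) ---------------------------
# A minimal sketch, with made-up labels, of the metric calls used in _score_pr.
# It assumes the +1 / -1 label convention that OneClassSVM's predict() returns;
# with that convention, confusion_matrix(...).ravel() yields (tn, fp, fn, tp)
# where +1 is treated as the positive (benign) class.
def _demo_metrics():
    from sklearn import metrics as sk_met

    y_test = [1, 1, 1, -1, -1]   # hypothetical ground truth: three benign, two intruded
    result = [1, 1, -1, -1, 1]   # hypothetical predictions

    acc = sk_met.accuracy_score(y_test, result)
    prec = sk_met.precision_score(y_test, result)
    rec = sk_met.recall_score(y_test, result)
    tn, fp, fn, tp = sk_met.confusion_matrix(y_test, result).ravel()

    print("acc=%.2f prec=%.2f rec=%.2f | tn=%d fp=%d fn=%d tp=%d"
        % (acc, prec, rec, tn, fp, fn, tp))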