def run(experiment):
	""" Benchmark the mapping-based converter against the one-hot converter on the experiment's log file. """

	log_entries = []
	for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
		log_entry = LogEntry.from_log_string(line)
		log_entries.append(log_entry)

	experiment.entries = log_entries

	# Exp 1: map
	time_before_map = time.time()
	OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(), log_entries, experiment)

	# Exp 2: one-hot
	time_after_map_before_one_hot = time.time()
	OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries, experiment)

	time_after_all = time.time()

	time_for_map = time_after_map_before_one_hot - time_before_map
	time_for_one_hot = time_after_all - time_after_map_before_one_hot

	timing_lines = [
		"Benchmark result | %s entries processed | OneClassSVM classifier" % len(log_entries),
		"",
		"Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
		"One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
	]

	experiment.add_result_file("time_map_vs_onehot", timing_lines)
def _get_log_entries_from_file(file_path, limit):
	""" Read up to <limit> log entries from the given file. """

	log_entries = []
	for line in Dir.yield_lines(file_path, limit):
		log_entries.append(LogEntry.from_log_string(line))

	return log_entries
def yield_entries(file_path, limit=None):
	"""
	Yield IdsEntry objects from the given file. First access on log files is costly!
	*limit: Optional maximum number of entries to retrieve.
	"""

	if not os.path.lexists(file_path):
		_raise_file_doesnt_exist(file_path)

	yielder = Dir.yield_lines(file_path, limit)

	first_line = next(yielder)
	file_type = _detect_type(first_line)

	if file_type == FileType.IDSE_FILE:
		return _yield_idse_lines(yielder)
	elif file_type == FileType.LOG_FILE:
		return _read_log_lines_then_yield(yielder, first_line)
	else:
		raise NotImplementedError("File type not implemented: %s" % file_type)
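# Note: a minimal, hedged usage sketch for yield_entries(). The file path and the
# entry limit below are illustrative assumptions, not values taken from this project;
# the sketch simply counts the yielded IdsEntry objects.
def _example_yield_entries_usage():
	entry_count = 0
	for _ids_entry in yield_entries("data/example.idse", limit=1000):
		entry_count += 1
	print("Read %s IDS entries" % entry_count)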
def _sample(file_path, number_of_elements, limit_to):
	""" Sample <number_of_elements> lines from the given file. """

	print("Sampling...")

	target_file_path = "%s_%s-sample" % (file_path, number_of_elements)

	if not os.path.lexists(file_path):
		raise IOError("Input file doesn't exist")

	target_file_path = Dir.uniquify(target_file_path)
	line_generator = Dir.yield_lines(file_path)

	if limit_to is None:
		log_lines = ids_tools.reservoir_sample(line_generator, number_of_elements)
	else:
		log_lines = ids_tools.reservoir_sample_limit(line_generator, number_of_elements, limit_to)

	Dir.write_lines(target_file_path, log_lines)

	print("Done. Wrote to file:\n%s" % target_file_path)
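# Note: a minimal sketch of unweighted reservoir sampling (Algorithm R), illustrating
# what ids_tools.reservoir_sample is assumed to do above: pick <sample_size> lines
# uniformly at random from a generator of unknown length in a single pass. The helper
# name and signature are taken from the call site; this sketch is an assumption, not
# the project's actual implementation.
import random

def _reservoir_sample_sketch(line_generator, sample_size):
	reservoir = []
	for index, line in enumerate(line_generator):
		if index < sample_size:
			# Fill the reservoir with the first <sample_size> lines.
			reservoir.append(line)
		else:
			# Replace a random slot with probability sample_size / (index + 1).
			slot = random.randint(0, index)
			if slot < sample_size:
				reservoir[slot] = line
	return reservoir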
def handle_all(experiment):
	""" Full flow for a one-fits-all classifier. """

	from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
	converter = TEMPCONVERTER()

	log_entries = []
	for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
		log_entry = LogEntry.from_log_string(line)
		log_entries.append(log_entry)

	all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries, binary=True)

	training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(all_entries)
	X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

	scoring_dict = {}
	for ids_entry in scoring_entries:
		if ids_entry.app_id not in scoring_dict:
			scoring_dict[ids_entry.app_id] = []
		scoring_dict[ids_entry.app_id].append(ids_entry)

	# Classify with all entries: training_entries
	classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
	for classifier in classifiers:
		classifier.fit(X_train)

	# Score for each app: scoring_dict
	for app_id, app_entries in util.seqr.yield_items_in_key_order(scoring_dict):
		X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
		y_preds = [clf.predict(X_test) for clf in classifiers]
		for clf, y_pred in zip(classifiers, y_preds):
			experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
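# Note: both detectors used above follow scikit-learn's outlier detection convention:
# fit() is given (mostly) benign feature vectors only, and predict() returns +1 for
# inliers and -1 for outliers. The sketch below is a self-contained illustration with
# random data; it is not part of the project's flow.
def _example_outlier_detector_convention():
	import numpy as np
	import sklearn.ensemble as sk_ens
	import sklearn.svm as sk_svm

	rng = np.random.RandomState(0)
	x_train = rng.normal(size=(100, 3))  # stand-in for benign feature vectors
	x_test = rng.normal(size=(10, 3))

	for clf in [sk_svm.OneClassSVM(), sk_ens.IsolationForest(random_state=0)]:
		clf.fit(x_train)
		print(clf.__class__.__name__, clf.predict(x_test))  # array of +1 / -1 labels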
def analyse(file_path, to_file, output_printer):
	""" Analyse the given log file. """

	# Check output file if requested #

	output_path = file_path + ".analysis"

	if to_file and os.path.lexists(output_path):
		raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

	output_printer.prt("Analysing...")

	# Get file access #

	file_type = idse_dao.detect_type(file_path)
	if file_type == idse_dao.FileType.IDSE_FILE:
		print("Can't analyse IDSE files!")
		return
	elif file_type != idse_dao.FileType.LOG_FILE:
		raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

	log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

	# Analysis #

	all_app_ids = ids_data.get_app_ids()
	all_classes = ids_data.get_labels()

	(total_entries, found_app_ids, entry_count_per_app_id, elements_per_class_per_app_id,
		found_classes, entry_count_per_class, app_ids_per_class, duplicate_elements_per_app_id,
		scorable_app_ids, dispersion_index, duplicate_index) = analyse_entries(log_entry_generator)

	# Output #

	printer = output_printer
	if to_file:
		printer = util.prtr.Storer()

	get_pl = lambda s, obj: s if len(obj) > 1 else ""
	total_line_name = "<total>"

	if not to_file:
		printer.prt("")

	printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
		VERSION, total_entries,
		len(found_app_ids), len(all_app_ids), get_pl("s", found_app_ids),
		len(found_classes), len(all_classes), get_pl("es", found_classes))
	)

	# "Elements and classes per app ID" table
	per_app_id = []
	per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)

	total_entries_assertion = 0
	for app_id in all_app_ids:
		total_entries_assertion += entry_count_per_app_id[app_id]

		line = [
			app_id,
			"{:,}".format(entry_count_per_app_id[app_id]),
			util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
		]

		for a_class in all_classes:
			class_count_str = ""
			if a_class in elements_per_class_per_app_id[app_id]:
				class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])
			line.append(class_count_str)

		per_app_id.append(line)

	assert(total_entries == total_entries_assertion)

	empty_line = [""] * (3 + len(all_classes))
	per_app_id.append(empty_line)

	total_line = [
		total_line_name,
		"{:,}".format(total_entries),
		util.fmtr.format_percentage(1, True, 2)
	]
	for a_class in all_classes:
		total_line.append("{:,}".format(entry_count_per_class[a_class]))
	per_app_id.append(total_line)

	util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

	# "per class" table
	per_class = []
	per_class.append([""] + all_classes)

	app_ids_line = ["App IDs"]
	percent_line = ["Percentage"]
	for a_class in all_classes:
		app_ids_line.append(len(app_ids_per_class[a_class]))
		percent_line.append(
			util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

	per_class.append(app_ids_line)
	per_class.append(percent_line)

	util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

	# "Duplicates per app ID" table
	duplicates = []
	duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])

	total_number_of_duplicates = 0
	total_entries_assertion = 0
	for app_id in all_app_ids:
		result = duplicate_elements_per_app_id[app_id]
		unique_count = result["uniq"]
		duplicate_count = result["dupe"]
		all_count = unique_count + duplicate_count

		total_number_of_duplicates += duplicate_count
		total_entries_assertion += all_count

		duplicate_percent = 0
		if all_count > 0:
			duplicate_percent = float(duplicate_count) / all_count
		duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

		new_line = [app_id]
		new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
		new_line.append(duplicate_percent_str)
		duplicates.append(new_line)

	assert(total_entries == total_entries_assertion)

	# Don't output table if there are no duplicates
	if total_number_of_duplicates == 0:
		printer.prt("\nDuplicate analysis: No duplicates found!")
	else:
		empty_line = [""] * 5
		duplicates.append(empty_line)

		total_duplicate_percent = float(total_number_of_duplicates) / total_entries

		total_line = [total_line_name]
		total_line.extend([
			"{:,}".format(x)
			for x in [total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
		])
		total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
		duplicates.append(total_line)

		util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

	printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
		% (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3)))
	printer.prt("Scorable app ids: %s" % scorable_app_ids)

	if to_file:
		with open(output_path, "w") as output_file:
			for line in printer.get_messages():
				output_file.write(line + "\n")

		output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

	# harmonious? all labelled / some / none?
	# for each app id: are there roughly the same number of entries per class?
	return
def detect_type(file_path):
	""" Detect the file type of the file. """

	first_line = next(Dir.yield_lines(file_path))
	return _detect_type(first_line)
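# Note: a hedged usage sketch for detect_type(); the path below is a hypothetical
# example. FileType.IDSE_FILE and FileType.LOG_FILE are the two types handled by
# yield_entries() above.
def _example_detect_type_usage():
	file_type = detect_type("data/example.log")
	if file_type == FileType.LOG_FILE:
		print("Detected a plain log file.")
	elif file_type == FileType.IDSE_FILE:
		print("Detected an IDSE file.")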
def _yield_log_entries_from_file(file_path):
	""" Yield LogEntry objects read from the given log file. """

	for line in Dir.yield_lines(file_path):
		yield LogEntry.from_log_string(line)
	print(msg)
	exit()


### Main program ###

p = argparse.ArgumentParser()
p.add_argument("file_path", metavar="PATH/FILE", help="Log file")
args = p.parse_args()

orig_path = os.path.expanduser(args.file_path)
if not os.path.lexists(orig_path):
	prexit("File doesn't exist")

tmp_path = orig_path + "_bak"
if os.path.lexists(tmp_path):
	prexit("%s exists" % tmp_path)

os.rename(orig_path, tmp_path)

with open(orig_path, "w") as output_file:
	for line in Dir.yield_lines(tmp_path):
		processed_string = proc_log_string(line)
		output_file.write("%s\n" % processed_string)

print("Done.")
print("Wrote to: %s" % orig_path)
print("Old file: %s" % tmp_path)