def run(experiment):
	""" Benchmark the mapping converter against one-hot encoding. """

	log_entries = []

	for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
		log_entry = LogEntry.from_log_string(line)
		log_entries.append(log_entry)

	experiment.entries = log_entries

	# Exp 1: map
	time_before_map = time.time()
	OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(), log_entries, experiment)

	# Exp 2: one-hot
	time_after_map_before_one_hot = time.time()
	OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries, experiment)

	time_after_all = time.time()

	time_for_map = time_after_map_before_one_hot - time_before_map
	time_for_one_hot = time_after_all - time_after_map_before_one_hot

	timing_lines = [
		"Benchmark result | %s entries processed | OneClassSVM classifier" % len(log_entries),
		"",
		"Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
		"One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
	]

	experiment.add_result_file("time_map_vs_onehot", timing_lines)

def _get_log_entries_from_file(file_path, limit):
	""" Read up to <limit> log entries from the given file. """

	log_entries = []

	for line in Dir.yield_lines(file_path, limit):
		log_entries.append(LogEntry.from_log_string(line))

	return log_entries

def __init__(self, file_path, store_title):
	""" Ctor """

	object.__init__(self)

	# State
	self.title = None

	# Time
	self.start_time = time.time()
	self.end_time = None

	# Loaded entries
	self.entries = []

	# ClassifierResultGroup objects (name, classifier, result)
	self.classifier_results = []

	# OtherResult objects (file_name, lines)
	self.other_result_files = []

	# StorerAndPrinter - stores and prints ;)
	time_printer = util.prtr.TimePrinter(name="exp")
	self.storer_printer = util.prtr.StorerAndPrinter(printer=time_printer)

	# Paths and name
	self.file_path = os.path.expanduser(file_path)
	self.input_file_name = os.path.basename(self.file_path)
	experiment_dir_name = None

	if not os.path.lexists(self.file_path):
		util.outp.exit_on_error("Input file not found: %s" % self.file_path)

	self.title = store_title
	if self.title is None:
		random_num_str = "".join(str(x) for x in random.sample(range(0, 15), 5))
		self.title = "Experiment %s" % random_num_str

	experiment_dir_name = Dir.remove_disallowed_characters(self.title.lower())
	experiment_dir_name += time.strftime("_%m-%d_%H-%M")
	self.experiment_dir_path = self.get_experiment_folder(experiment_dir_name)

	if os.path.lexists(self.experiment_dir_path):
		self.experiment_dir_path = Dir.uniquify(self.experiment_dir_path)

def save_entries(file_path, entry_generator):
	"""
	Store the entries as a file: IDS entries in IDSE files, log entries as log files.
	returns: The file path under which the file was saved.
	"""

	entries_list = list(entry_generator)
	first_entry = entries_list[0]

	# Where to store?
	file_path_full = None
	# What to store?
	lines = None
	# How to convert?
	to_line = None

	# LogEntry objects: no extension, no header; call entry.get_log_string()
	if isinstance(first_entry, LogEntry):
		file_path_full = file_path
		lines = []
		to_line = lambda entry: entry.get_log_string()
	# IdsEntry objects: IDSE extension, IDSE header; run _ids_entry_to_idse_string(entry)
	elif isinstance(first_entry, IdsEntry):
		file_path_full = add_idse_extension(file_path)
		lines = [HEADER]
		to_line = _ids_entry_to_idse_string
	else:
		raise TypeError("[IDSE DAO] Given elements are neither LogEntry nor IdsEntry objects!")

	if os.path.lexists(file_path_full):
		_raise_file_exists(file_path_full)

	# Actual entry -> string conversion
	lines.extend([to_line(entry) for entry in entries_list])
	Dir.write_lines(file_path_full, lines)

	return file_path_full

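# Usage sketch for save_entries, with hypothetical paths (the target path
# must not exist yet, otherwise _raise_file_exists aborts):
#
#   entries = [LogEntry.from_log_string(line) for line in Dir.yield_lines("input.log")]
#   saved_path = save_entries("output/used_entries", entries)
#   # LogEntry input is written as-is; IdsEntry input gets the IDSE extension and HEADER.
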
def _sample(file_path, number_of_elements, limit_to):
	""" Sample <number_of_elements> entries from the given file. """

	print("Sampling...")

	target_file_path = "%s_%s-sample" % (file_path, number_of_elements)

	if not os.path.lexists(file_path):
		raise IOError("Input file doesn't exist")

	target_file_path = Dir.uniquify(target_file_path)
	line_generator = Dir.yield_lines(file_path)

	log_lines = None
	if limit_to is None:
		log_lines = ids_tools.reservoir_sample(line_generator, number_of_elements)
	else:
		log_lines = ids_tools.reservoir_sample_limit(line_generator, number_of_elements, limit_to)

	Dir.write_lines(target_file_path, log_lines)

	print("Done. Wrote to file:\n%s" % target_file_path)

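# ids_tools.reservoir_sample is defined elsewhere in this project. As a point
# of reference, the sketch below shows classic reservoir sampling (Algorithm R),
# which a helper like it could implement: keep the first k lines, then replace
# a random slot with probability k/(i+1) for the i-th line, giving a uniform
# sample in O(k) memory. _reservoir_sample_sketch is illustrative only and not
# part of this module.
def _reservoir_sample_sketch(line_generator, k):
	import random

	reservoir = []
	for index, line in enumerate(line_generator):
		if index < k:
			# Fill the reservoir with the first k lines
			reservoir.append(line)
		else:
			# Replace a random slot with probability k / (index + 1)
			slot = random.randint(0, index)
			if slot < k:
				reservoir[slot] = line
	return reservoir
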
def yield_entries(file_path, limit=None):
	"""
	Yield IdsEntry objects from the given file. First access on log files is costly!
	*limit: Optional maximum number of entries to retrieve.
	"""

	if not os.path.lexists(file_path):
		_raise_file_doesnt_exist(file_path)

	yielder = Dir.yield_lines(file_path, limit)

	first_line = next(yielder)
	file_type = _detect_type(first_line)
	if file_type == FileType.IDSE_FILE:
		return _yield_idse_lines(yielder)
	elif file_type == FileType.LOG_FILE:
		return _read_log_lines_then_yield(yielder, first_line)
	else:
		raise NotImplementedError("File type not implemented: %s" % file_type)

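# Usage sketch, with a hypothetical file path; yield_entries dispatches on the
# detected file type, so IDSE files and log files can be read the same way:
#
#   for ids_entry in yield_entries("data/example.idse", limit=1000):
#       handle(ids_entry)  # handle() is a placeholder
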
def _train(file_path):
	""" Train the classifier with the given file. """

	print("Using file \"{}\"".format(os.path.join(os.getcwd(), file_path)))

	saved_so_far = []

	if os.path.lexists(_HISTORY_FILE):
		saved_so_far = Dir.read_lines(_HISTORY_FILE)

	if file_path in saved_so_far:
		print("This file has already been used for training."
			+ " If you think this is a mistake, rename it and run again.")
		return

	log_entry_generator = _yield_log_entries_from_file(file_path)
	_train_entries(log_entry_generator)

	with open(_HISTORY_FILE, 'a') as hist_file:
		hist_file.write(file_path + "\n")

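# Usage sketch (hypothetical file name; _HISTORY_FILE and _train_entries are
# defined elsewhere in this module):
#
#   _train("data/week_1.log")  # trains and appends the path to _HISTORY_FILE
#   _train("data/week_1.log")  # skipped: the history check catches the repeat
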
def handle_all(experiment):
	""" Full flow for a one-fits-all classifier. """

	from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
	converter = TEMPCONVERTER()

	log_entries = []
	for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
		log_entry = LogEntry.from_log_string(line)
		log_entries.append(log_entry)

	all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries, binary=True)
	training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(all_entries)
	X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

	# Group the scoring entries per app id
	scoring_dict = {}
	for ids_entry in scoring_entries:
		if ids_entry.app_id not in scoring_dict:
			scoring_dict[ids_entry.app_id] = []
		scoring_dict[ids_entry.app_id].append(ids_entry)

	# Train on all training_entries
	classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
	for classifier in classifiers:
		classifier.fit(X_train)

	# Score each app id from scoring_dict individually
	for app_id, app_entries in util.seqr.yield_items_in_key_order(scoring_dict):
		X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
		y_preds = [clf.predict(X_test) for clf in classifiers]
		for clf, y_pred in zip(classifiers, y_preds):
			experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)

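# Design note: the scoring_dict grouping in handle_all could equivalently be
# written with collections.defaultdict, which drops the membership check. A
# minimal sketch; group_by_app_id is a hypothetical helper, not part of this
# module:
def group_by_app_id(ids_entries):
	from collections import defaultdict

	grouped = defaultdict(list)
	for ids_entry in ids_entries:
		grouped[ids_entry.app_id].append(ids_entry)
	return dict(grouped)
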
def analyse(file_path, to_file, output_printer):
	""" Analyse the given log file. """

	# Check output file if requested #
	output_path = file_path + ".analysis"

	if to_file and os.path.lexists(output_path):
		raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

	output_printer.prt("Analysing...")

	# Get file access #
	file_type = idse_dao.detect_type(file_path)
	if file_type == idse_dao.FileType.IDSE_FILE:
		print("Can't analyse IDSE files!")
		return
	elif file_type != idse_dao.FileType.LOG_FILE:
		raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

	log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

	# Analysis #
	all_app_ids = ids_data.get_app_ids()
	all_classes = ids_data.get_labels()

	(total_entries, found_app_ids, entry_count_per_app_id,
		elements_per_class_per_app_id, found_classes, entry_count_per_class,
		app_ids_per_class, duplicate_elements_per_app_id,
		scorable_app_ids, dispersion_index, duplicate_index) = analyse_entries(log_entry_generator)

	# Output #
	printer = output_printer
	if to_file:
		printer = util.prtr.Storer()

	get_pl = lambda s, obj: s if len(obj) > 1 else ""
	total_line_name = "<total>"

	if not to_file:
		printer.prt("")

	printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
		VERSION, total_entries,
		len(found_app_ids), len(all_app_ids), get_pl("s", found_app_ids),
		len(found_classes), len(all_classes), get_pl("es", found_classes))
	)

	# "Elements and classes per app ID" table
	per_app_id = []
	per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)

	total_entries_assertion = 0
	for app_id in all_app_ids:
		total_entries_assertion += entry_count_per_app_id[app_id]

		line = [
			app_id,
			"{:,}".format(entry_count_per_app_id[app_id]),
			util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
		]

		for a_class in all_classes:
			class_count_str = ""
			if a_class in elements_per_class_per_app_id[app_id]:
				class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])
			line.append(class_count_str)

		per_app_id.append(line)

	assert(total_entries == total_entries_assertion)

	empty_line = [""] * (3 + len(all_classes))
	per_app_id.append(empty_line)

	total_line = [
		total_line_name,
		"{:,}".format(total_entries),
		util.fmtr.format_percentage(1, True, 2)
	]
	for a_class in all_classes:
		total_line.append("{:,}".format(entry_count_per_class[a_class]))

	per_app_id.append(total_line)

	util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

	# "Metrics per class" table
	per_class = []
	per_class.append([""] + all_classes)

	app_ids_line = ["App IDs"]
	percent_line = ["Percentage"]
	for a_class in all_classes:
		app_ids_line.append(len(app_ids_per_class[a_class]))
		percent_line.append(
			util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

	per_class.append(app_ids_line)
	per_class.append(percent_line)

	util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

	# "Duplicates per app ID" table
	duplicates = []
	duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])

	total_number_of_duplicates = 0
	total_entries_assertion = 0
	for app_id in all_app_ids:
		result = duplicate_elements_per_app_id[app_id]
		unique_count = result["uniq"]
		duplicate_count = result["dupe"]
		all_count = unique_count + duplicate_count

		total_number_of_duplicates += duplicate_count
		total_entries_assertion += all_count

		duplicate_percent = 0
		if all_count > 0:
			duplicate_percent = float(duplicate_count) / all_count
		duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

		new_line = [app_id]
		new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
		new_line.append(duplicate_percent_str)
		duplicates.append(new_line)

	assert(total_entries == total_entries_assertion)

	# Don't output table if there are no duplicates
	if total_number_of_duplicates == 0:
		printer.prt("\nDuplicate analysis: No duplicates found!")
	else:
		empty_line = [""] * 5
		duplicates.append(empty_line)

		total_duplicate_percent = float(total_number_of_duplicates) / total_entries

		total_line = [total_line_name]
		total_line.extend([
			"{:,}".format(x)
			for x in [total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
		])
		total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
		duplicates.append(total_line)

		util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

	printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
		% (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3)))
	printer.prt("Scorable app ids: %s" % scorable_app_ids)

	if to_file:
		with open(output_path, "w") as output_file:
			for line in printer.get_messages():
				output_file.write(line + "\n")

		output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

	# TODO: harmonious? all labelled / some / none?
	# TODO: for each app id: are there roughly the same number of entries per class?
	return

def store_experiment(self):
	""" Store the results saved in this class in our experiment directory. """

	self.end_time = time.time()
	self.storer_printer.prt("Storing experiment results...")

	Dir.ensure_folder_exists(self.experiment_dir_path)

	entry_file_path = os.path.join(self.experiment_dir_path, "used_entries")
	result_file_path = os.path.join(self.experiment_dir_path, "result")
	stdout_file_path = os.path.join(self.experiment_dir_path, "stdout")
	classifiers_file_path = os.path.join(self.experiment_dir_path, "classifiers")
	file_paths = [entry_file_path, result_file_path, stdout_file_path, classifiers_file_path]

	other_result_files_paths = []
	for file_name, _ in self.other_result_files:
		oth_res_path_creation = os.path.join(self.experiment_dir_path, file_name)
		oth_res_path_creation = Dir.uniquify(oth_res_path_creation)
		other_result_files_paths.append(oth_res_path_creation)

	if any([os.path.lexists(x) for x in file_paths + other_result_files_paths]):
		raise IOError("One of the files exists: %s" % (file_paths + other_result_files_paths))

	self.storer_printer.prt("Data verified. Storing utilised entries...")

	# Create new file with my entries
	saved_path = idse_dao.save_entries(entry_file_path, self.entries)
	self.storer_printer.prt("Done. Analysing file...")

	# Analyse that file
	log_file_analysis.analyse(saved_path, to_file=True, output_printer=util.prtr.Storer())
	self.storer_printer.prt("Done. Saving classifiers...")

	# Save trained classifiers
	classifier_lines = self.create_classifier_lines()
	Dir.write_lines(classifiers_file_path, classifier_lines)
	self.storer_printer.prt("Done. Saving result digest...")

	# Save the result
	result_lines = self.create_result_lines()
	Dir.write_lines(result_file_path, result_lines)

	if self.other_result_files:
		for oth_res_path, (oth_res_name, oth_res_lines) in zip(other_result_files_paths, self.other_result_files):
			self.storer_printer.prt("Saving others: %s..." % oth_res_name)
			Dir.write_lines(oth_res_path, oth_res_lines)

	self.storer_printer.prt("Done!")
	self.storer_printer.prt("Experiment stored in: %s" % self.experiment_dir_path)

	# Save the stdout (tee replacement)
	stdout_lines = self.storer_printer.get_messages()
	Dir.write_lines(stdout_file_path, stdout_lines)

def detect_type(file_path):
	""" Detect the file type of the given file. """
	first_line = next(Dir.yield_lines(file_path))
	return _detect_type(first_line)

def _yield_log_entries_from_file(file_path):
	""" Yield LogEntry objects from the given file. """
	for line in Dir.yield_lines(file_path):
		yield LogEntry.from_log_string(line)

def prexit(msg):
	""" Print the given message, then exit. """
	print(msg)
	exit()


### Main program ###

p = argparse.ArgumentParser()
p.add_argument("file_path", metavar="PATH/FILE", help="Log file")
args = p.parse_args()

orig_path = os.path.expanduser(args.file_path)
if not os.path.lexists(orig_path):
	prexit("File doesn't exist")

tmp_path = orig_path + "_bak"
if os.path.lexists(tmp_path):
	prexit("%s exists" % tmp_path)

os.rename(orig_path, tmp_path)

with open(orig_path, "w") as output_file:
	for line in Dir.yield_lines(tmp_path):
		processed_string = proc_log_string(line)
		output_file.write("%s\n" % processed_string)

print("Done.")
print("Wrote to: %s" % orig_path)
print("Old file: %s" % tmp_path)