def process_incident(self, incident): """ get the incident time from the db and gathers all features INPUT: log_files: the logs that we went through it. """ if(incident is None): return ip_sieve = IPSieve() ip_records = {} banned_ips = [] if(incident["file_name"] is None) or (len(incident["file_name"]) == 0): # get the logs from ES # get the logs from ES banned_ips = self.es_handler.get_banjax(incident['start'], incident['stop'], incident['target']) ats_records = self.es_handler.get(incident['start'], incident['stop'], incident['target']) # calculate IP dictionary with ATS records ip_records = ip_sieve.process_ats_records(ats_records) else: # read the sessions from the log file ip_sieve.add_log_file(incident["file_name"]) ip_records = ip_sieve.parse_log("nginx") # calculate features ip_feature_db = {} #At this stage it is only a peliminary list we might lose features #due to 0 variance self._active_feature_list = [] #do a dry run on all features just to gather the indeces of all available #features for CurentFeature in Learn2BanFeature.__subclasses__(): f = CurentFeature(ip_records, ip_feature_db) self._active_feature_list.append(f._FEATURE_INDEX) for CurentFeature in Learn2BanFeature.__subclasses__(): f = CurentFeature(ip_records, ip_feature_db) #logging.info("Computing feature %i..."% f._FEATURE_INDEX) print "Computing feature %i..."% f._FEATURE_INDEX f.compute() # post process the features ip_feature_db = self.bothound_tools.post_process(ip_feature_db) # delete the old sessions for thie incidend self.bothound_tools.delete_sessions(incident['id']) #print ip_feature_db self.bothound_tools.add_sessions(incident['id'], ip_feature_db, banned_ips) self.bothound_tools.set_incident_process(incident['id'], False) print "Incident {} processed.".format(incident['id']) return ip_feature_db
class Learn2BanTools(): ip_sieve = IPSieve() ip_feature_db = {} log_files = list() def load_data_logs(self): """ Retrieve all training logs from testing log directory """ data_log_list = glob.glob(self.data_dir + '*') self.log_files = data_log_list return self.log_files def connect_to_db(self): """ This connetcion to the db will live for the live time of the learn2bantools instance and will be used to save data back to the db """ self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password) #Create cursor object to allow query execution self.cur = self.db.cursor(MySQLdb.cursors.DictCursor) sql = 'CREATE DATABASE IF NOT EXISTS learn2ban' self.cur.execute(sql) #Connect directly to DB self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password, self.db_name) self.cur = self.db.cursor(MySQLdb.cursors.DictCursor) def disconnect_from_db(self): """ Close connection to the database """ self.cur.close() self.db.close() # def save_experiment(self, experiment_result): # """ # Save the results of an experimental run (old version) # """ # add_experiment_result = ("INSERT INTO experiment_result( experiment_id, result_file) VALUES (%(experiment_id)s,%(result_file)s)") # self.cur.execute(add_experiment_result, experiment_result) # self.db.commit() def save_experiment_result(self, experiment_result): """ Saves the result of an experimental run, including testing proportion used and score """ add_experiment_result = ( "INSERT INTO experiment_results( experiment_id, result_file, proportion, score, active_features, pca_ratios, mrmr_score) VALUES (%(experiment_id)s,%(result_file)s,%(proportion)s,%(score)s,%(active_features)s,%(pca_ratios)s,%(mrmr_score)s)" ) self.cur.execute(add_experiment_result, experiment_result) self.db.commit() def retrieve_experiments_results(self): """ Retrieve the results of the experiments already run """ self.cur.execute("SELECT * FROM experiment_results") experiment_results = self.cur.fetchall() return experiment_results def 
delete_all_experiments_results(self): """ Drops the entire experiment_results table """ self.cur.execute("TRUNCATE TABLE experiment_results") def retrieve_experiments(self): """ Retrieve the set of experiments to run from the database """ self.cur.execute("SELECT * FROM experiments where enabled=TRUE") self.experiment_set = self.cur.fetchall() return self.experiment_set def retrieve_experiment_logs(self, experiment_id): """ Read the experiment_logs table and retrieve the name of logs associated to the experiment id INPUT: experiment_id: the id of the experiment whose logs are sought """ self.cur.execute( "SELECT experiment_logs.log_id, logs.file_name FROM experiment_logs, logs WHERE experiment_logs.log_id = logs.id AND experiment_logs.experiment_id =" + str(experiment_id) + ";") log_set = self.cur.fetchall() #add the full path to log files for cur_log in log_set: cur_log['file_name'] = self.data_dir + cur_log['file_name'] return log_set def load_database_config(self): """ Get configuration parameters from the learn2ban config file and from the lern2ban database """ config = ConfigParser.ConfigParser() config.readfp(open(src_dir + '/config/train2ban.cfg')) self.db_user = config.get('db_params', 'db_user') self.db_password = config.get('db_params', 'db_password') self.db_host = config.get('db_params', 'db_host') self.db_name = config.get('db_params', 'db_name') self.config_profile = config.get('db_params', 'config_profile') def load_train2ban_config(self): #Get database connection params self.load_database_config() #Establish database connection object self.connect_to_db() #Get basic config parameters #first try to see if there is a config specific to this host if (not self.config_profile): self.config_profile = "default" self.cur.execute("SELECT * from config where profile_name='" + self.config_profile + "';") config_row = self.cur.fetchone() #otherwise we read first row of the database (the one with minimum id) if (not config_row): self.cur.execute("SELECT * from 
config ORDER BY id ASC") config_row = self.cur.fetchone() if (not config_row): raise IOError, "No configuration record in the database" if not config_row["absolute_paths"]: cur_dir = src_dir else: cur_dir = "" try: self.data_dir = cur_dir + config_row["training_directory"] + ( config_row["training_directory"][-1] != "/" and "/" or "") self.analyser_results_dir = cur_dir + config_row[ "analyser_results_directory"] + ( config_row["analyser_results_directory"][-1] != "/" and "/" or "") except IndexError: raise ValueError, "Data and Result directory can not be left blank" #depricated for now, we are entering the regexes directly into the db if config_row["regex_filter_directory"]: self.filter_dir = cur_dir + config_row["regex_filter_directory"] + ( config_row["regex_filter_directory"][-1] != "/" and "/" or "") if config_row["default_filter_file"]: self.filter_file = self.filter_dir + config_row[ "default_filter_file"] + ( config_row["default_filter_file"][-1] != "/" and "/" or "") def add_data_log(self, log): self.log_files = list() self.log_files.append(self.train_dir + log) def load_bad_filters_from_db(self, experiment_id): #TODO: ensure cur is live self.cur.execute( "SELECT regex_assignment.log_id, regex_filters.regex from regex_assignment, regex_filters, experiment_logs WHERE regex_assignment.regex_filter_id = regex_filters.id AND regex_assignment.log_id = experiment_logs.log_id AND experiment_logs.experiment_id = " + str(experiment_id)) return self.cur.fetchall() def load_bad_filters(self): """ Load set of regex filters from the default filter file. This is to allow expression of an individual filter file, rather than a set or by experiment. 
""" tree = ET.parse(self.filter_file) root = tree.getroot() filters = list() for child in root: filters.append(child.text) return filters def sieve_the_ip(self): """ This was used when all experiment were using all of log files but in new model each experiment has its own file """ for cur_log_file in self.log_files: self.ip_sieve.add_log_file(cur_log_file) self.ip_sieve.parse_log_file() # def magnitude(self,v): # return math.sqrt(sum(v[i]*v[i] for i in v)) def clear_data(self): self.ip_sieve = IPSieve() self.ip_feature_db = {} def gather_all_features(self, log_files): """ gathers all features INPUT: log_files: the logs that we went through it. """ for cur_log_file in log_files: self.ip_sieve.add_log_file(cur_log_file) self.ip_sieve.parse_log() for CurrentFeatureType in Learn2BanFeature.__subclasses__(): cur_feature_tester = CurrentFeatureType( self.ip_sieve, self.ip_feature_db) cur_feature_tester.compute() return self.ip_feature_db def construct_svm_classifier(self, kernel_mode='linear'): """ Creates an instance of the SVM classifier with a given mode """ return svm.SVC(kernel=kernel_mode) def random_slicer(self, data_size, train_portion=0.5): """ Return two arrays with random true and false and complement of each other, used for slicing a set into trainig and testing INPUT: data_size: size of the array to return train_portion: between 0,1 indicate the portion for the True entry """ from random import random random_selector = [ random() < train_portion for i in range(0, data_size) ] complement_selector = np.logical_not(random_selector) return random_selector, complement_selector def __init__(self): #we would like people to able to use the tool object even #if they don't have a db so we have no reason to load this #config in the constructor #self.load_database_config() pass
class Learn2BanTools():
    """
    Toolbox shared by the learn2ban scripts: configuration loading,
    MySQL access, experiment/log retrieval and per-IP feature
    gathering from training logs.
    """
    # NOTE(review): class-level attributes are shared by all instances
    # until rebound (clear_data()/load_data_logs() rebind them).
    ip_sieve = IPSieve()
    ip_feature_db = {}
    log_files = list()

    def load_data_logs(self):
        """
        Retrieve all training logs from testing log directory
        """
        data_log_list = glob.glob(self.data_dir + '*')
        self.log_files = data_log_list
        return self.log_files

    def connect_to_db(self):
        """
        This connection to the db will live for the lifetime of the
        learn2bantools instance and will be used to save data back to
        the db
        """
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password)
        #Create cursor object to allow query execution
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)
        # make sure the learn2ban database exists before reconnecting to it
        sql = 'CREATE DATABASE IF NOT EXISTS learn2ban'
        self.cur.execute(sql)
        #Connect directly to DB
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password, self.db_name)
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)

    def disconnect_from_db(self):
        """
        Close connection to the database
        """
        self.cur.close()
        self.db.close()

    # def save_experiment(self, experiment_result):
    #     """
    #     Save the results of an experimental run (old version)
    #     """
    #     add_experiment_result = ("INSERT INTO experiment_result( experiment_id, result_file) VALUES (%(experiment_id)s,%(result_file)s)")
    #     self.cur.execute(add_experiment_result, experiment_result)
    #     self.db.commit()

    def save_experiment_result(self, experiment_result):
        """
        Saves the result of an experimental run, including testing
        proportion used and score
        """
        add_experiment_result = ("INSERT INTO experiment_results( experiment_id, result_file, proportion, score, active_features, pca_ratios, mrmr_score) VALUES (%(experiment_id)s,%(result_file)s,%(proportion)s,%(score)s,%(active_features)s,%(pca_ratios)s,%(mrmr_score)s)")
        self.cur.execute(add_experiment_result, experiment_result)
        self.db.commit()

    def retrieve_experiments_results(self):
        """
        Retrieve the results of the experiments already run
        """
        self.cur.execute("SELECT * FROM experiment_results")
        experiment_results = self.cur.fetchall()
        return experiment_results

    def delete_all_experiments_results(self):
        """
        Drops the entire experiment_results table
        """
        self.cur.execute("TRUNCATE TABLE experiment_results")

    def retrieve_experiments(self):
        """
        Retrieve the set of experiments to run from the database
        """
        self.cur.execute("SELECT * FROM experiments where enabled=TRUE")
        self.experiment_set = self.cur.fetchall()
        return self.experiment_set

    def retrieve_experiment_logs(self, experiment_id):
        """
        Read the experiment_logs table and retrieve the name of logs
        associated to the experiment id

        INPUT:
            experiment_id: the id of the experiment whose logs are sought
        """
        # NOTE(review): query built by string concatenation — safe only
        # while experiment_id is an int; a parameterised query would be
        # preferable.
        self.cur.execute("SELECT experiment_logs.log_id, logs.file_name FROM experiment_logs, logs WHERE experiment_logs.log_id = logs.id AND experiment_logs.experiment_id =" + str(experiment_id) + ";")
        log_set = self.cur.fetchall()
        #add the full path to log files
        for cur_log in log_set:
            cur_log['file_name'] = self.data_dir + cur_log['file_name']
        return log_set

    def load_database_config(self):
        """
        Get configuration parameters from the learn2ban config file and
        from the learn2ban database
        """
        config = ConfigParser.ConfigParser()
        config.readfp(open(src_dir+'/config/train2ban.cfg'))
        self.db_user = config.get('db_params', 'db_user')
        self.db_password = config.get('db_params', 'db_password')
        self.db_host = config.get('db_params', 'db_host')
        self.db_name = config.get('db_params', 'db_name')
        self.config_profile = config.get('db_params', 'config_profile')

    def load_train2ban_config(self):
        """
        Load the runtime configuration from the database: first the
        host-specific profile, falling back to "default" and then to
        the record with the smallest id.
        """
        #Get database connection params
        self.load_database_config()
        #Establish database connection object
        self.connect_to_db()
        #Get basic config parameters
        #first try to see if there is a config specific to this host
        if (not self.config_profile):
            self.config_profile = "default"
        # NOTE(review): profile name concatenated into SQL — confirm it
        # never comes from untrusted input.
        self.cur.execute("SELECT * from config where profile_name='"+ self.config_profile+"';")
        config_row = self.cur.fetchone()
        #otherwise we read first row of the database (the one with minimum id)
        if (not config_row):
            self.cur.execute("SELECT * from config ORDER BY id ASC")
            config_row = self.cur.fetchone()
            if (not config_row):
                raise IOError, "No configuration record in the database"
        if not config_row["absolute_paths"]:
            cur_dir = src_dir
        else:
            cur_dir = ""
        try:
            # append a trailing slash when missing; an empty directory
            # raises IndexError on [-1], converted to ValueError below
            self.data_dir = cur_dir + config_row["training_directory"] + (config_row["training_directory"][-1] != "/" and "/" or "")
            self.analyser_results_dir = cur_dir + config_row["analyser_results_directory"] + (config_row["analyser_results_directory"][-1] != "/" and "/" or "")
        except IndexError:
            raise ValueError, "Data and Result directory can not be left blank"
        #depricated for now, we are entering the regexes directly into the db
        if config_row["regex_filter_directory"]:
            self.filter_dir = cur_dir + config_row["regex_filter_directory"] + (config_row["regex_filter_directory"][-1] != "/" and "/" or "")
        if config_row["default_filter_file"]:
            self.filter_file = self.filter_dir + config_row["default_filter_file"] + (config_row["default_filter_file"][-1] != "/" and "/" or "")

    def add_data_log(self, log):
        # replaces the current log list with a single log file name
        self.log_files = list()
        self.log_files.append(self.train_dir + log)

    def load_bad_filters_from_db(self, experiment_id):
        """Load the regex filters assigned to the experiment's logs."""
        #TODO: ensure cur is live
        # NOTE(review): same string-concatenated SQL pattern as
        # retrieve_experiment_logs.
        self.cur.execute("SELECT regex_assignment.log_id, regex_filters.regex from regex_assignment, regex_filters, experiment_logs WHERE regex_assignment.regex_filter_id = regex_filters.id AND regex_assignment.log_id = experiment_logs.log_id AND experiment_logs.experiment_id = " +str(experiment_id))
        return self.cur.fetchall()

    def load_bad_filters(self):
        """
        Load set of regex filters from the default filter file. This is
        to allow expression of an individual filter file, rather than a
        set or by experiment.
        """
        tree = ET.parse(self.filter_file)
        root = tree.getroot()
        filters = list()
        for child in root:
            filters.append(child.text)
        return filters

    def sieve_the_ip(self):
        """
        This was used when all experiments were using all of the log
        files but in the new model each experiment has its own file
        """
        for cur_log_file in self.log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log_file()

    # def magnitude(self,v):
    #     return math.sqrt(sum(v[i]*v[i] for i in v))

    def clear_data(self):
        # reset the sieve and the feature table for a fresh run
        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

    def gather_all_features(self, log_files):
        """
        gathers all features

        INPUT:
            log_files: the logs that we went through it.
        """
        # parse every log first so the sieve accumulates all records,
        # then compute each registered feature once over the whole set
        for cur_log_file in log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            cur_feature_tester.compute()
        return self.ip_feature_db

    def construct_svm_classifier(self, kernel_mode='linear'):
        """
        Creates an instance of the SVM classifier with a given mode
        """
        return svm.SVC(kernel=kernel_mode)

    def random_slicer(self, data_size, train_portion=0.5):
        """
        Return two arrays with random true and false and complement of
        each other, used for slicing a set into training and testing

        INPUT:
            data_size: size of the array to return
            train_portion: between 0,1 indicate the portion for the
                           True entry
        """
        from random import random
        random_selector = [random() < train_portion for i in range(0, data_size)]
        complement_selector = np.logical_not(random_selector)
        return random_selector, complement_selector

    def __init__(self):
        #we would like people to able to use the tool object even
        #if they don't have a db so we have no reason to load this
        #config in the constructor
        #self.load_database_config()
        pass
class Experimentor():
    """
    There is a need for two types of Experiment objects: one that
    corresponds to each experiment record in the experiment table and
    one that corresponds to each result record in experiment_result.
    That is because from one experiment you can run many other
    experiments with little change in parameters, and we don't want to
    store all of these in the DB at design time (train/test portion for
    example).  Hence InseminatorExperiment reads the experiment from
    the db (Experiment type 1) and generates the L2BExperiment
    (Experiment type 2).
    """
    # #user will send one of these values to make tweak the analyser behavoir
    # #train begin will take the begining portion of the data for training
    # #train random will choose random rows of the sample set
    # TRAIN_BEGIN = 0
    # TRAIN_RANDOM = 1
    # def __init__(self, where_to_train = TRAIN_BEGIN, training_portion = 1):
    #     """
    #     Intitiate the behavoir of the analyzer. These parametrs should be
    #     also tweakable from database
    #     INPUT:
    #         - where_to_train: which part of the sample should be used for
    #                           training
    #         - training_protion: Between 0 - 1, tells the analyser how much
    #                             of the sample is for training and how much
    #                             for testing.
    #     """
    #     self._where_to_train = where_to_train
    #     self._training_portion = training_portion

    def __init__(self, exp, l2btools):
        """
        Store the exp config in self's attributes, build the classifier
        and trainer, then immediately process the logs and mark bots.

        INPUT:
            exp: dict-like experiment record (needs 'id', 'kernel_type',
                 'norm_mode')
            l2btools: a Learn2BanTools instance with a live db
                      connection and loaded config
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools
        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}
        #Create classifier, currently only SVM supported
        #but trainer is agnostic of classifier used provided it supports fit and predict
        self.experiment_classifier = self.l2btools.construct_svm_classifier(self.expr_dict['kernel_type'])
        #Create classifier
        self.trainer = Train2Ban(self.experiment_classifier)
        #Setup base data set
        #the base filename we are going to associate to the result of this experiment
        utc_datetime = datetime.datetime.utcnow()
        # NOTE(review): strftime return value is discarded; the filename
        # below uses str(utc_datetime) instead — confirm this is intended.
        utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + str(utc_datetime)
        #this make more sense to happens in the constructor however,
        self._process_logs()
        self._mark_bots()

    def param_stochastifier(self):
        """
        Here we return a randomised set of parameters for the
        experiments.  At present we choose between normalisation
        (sparse, individual), dimension reduction (PCA, ISOMap, MD5)
        and training portion (scale from 0-1).
        """
        # NOTE(review): stub — always returns an empty parameter set.
        param_set = []
        return param_set

    def _process_logs(self):
        """
        Get the log names from the db, parse them through the IP sieve
        and compute all features into self.ip_feature_db, then hand the
        sample to the trainer and normalise it.  Also dumps the
        non-normalised feature vectors to a JSON file next to the base
        analyse log file.
        """
        #this is not a oop way of retrieving the logs but I think we are
        #avoiding db access in other classes beside l2btools
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)
        #if there is no log associated to this experiment then there is nothing
        #to do
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log" % self.expr_dict['id'])
            return
        #log id is needed to be send to the trainer so the the trainer
        #knows which regex is detecting the bots for which log
        self.trainer.add_malicious_history_log_files([(cur_log_info['log_id'], cur_log_info['file_name']) for cur_log_info in cur_experiment_logs])
        #extracitng the filenames
        #Get IP Features
        log_filenames = tuple(cur_log['file_name'] for cur_log in cur_experiment_logs)
        #At this stage it is only a peliminary list we might lose features
        #due to 0 variance
        self._active_feature_list = []
        #do a dry run on all features just to gather the indeces of all available
        #features
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)
        for cur_log_file in log_filenames:
            #in theory it might be more memory efficient
            #to crunch the logs one by one but python is quite disappointing in memory
            #management
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."
        # compute every feature over the accumulated sieve records
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            logging.info("Computing feature %i..." % cur_feature_tester._FEATURE_INDEX)
            cur_feature_tester.compute()
        # we have memory problem here :(
        # import objgraph
        # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')
        # free the parsed records explicitly to keep peak memory down;
        # self.ip_sieve is unusable after this point
        del self.ip_sieve._ordered_records
        del self.ip_sieve
        # import gc
        # gc.collect()
        # print gc.garbage()
        self.trainer.add_to_sample(self.ip_feature_db)
        #we store the non-normailized vectors in a json file
        jsonized_ip_feature_db = {}
        for k, v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file + ".prenormal_ip_feature_db.json", "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)
        del self.ip_feature_db
        del jsonized_ip_feature_db
        #Normalise training set, normalisation should happen after all
        #sample is gathered
        self.trainer.normalise(self.expr_dict['norm_mode'])

    def _mark_bots(self):
        """
        Read the regexes corresponding to this experiment's logs and
        apply them to the trainer.  This should be called after the
        logs have been processed.  Also writes the detected malicious
        IPs next to the base analyse log file.
        """
        #Add Faill2Ban filters
        filters_for_experiment = self.l2btools.load_bad_filters_from_db(self.id)
        for cur_filter in filters_for_experiment:
            self.trainer.add_bad_regexes(cur_filter['log_id'], (cur_filter['regex'],))
        #Use Fail2ban filters to identify and mark DDOS IPs in data set
        malicious_ips = self.trainer.mark_bad_target()
        with open(self.base_analyse_log_file + ".malicious_ip_list", "w") as malicious_ip_file:
            malicious_ip_file.write(str(malicious_ips).strip('[]'))

    def _pca_importance_ananlysis(self, pca_model):
        """
        Retrieve the pca transformation and use the following formula
        to determine the importance of each feature:

            length(variance*|c_1j|/sqrt(sum(c1i_2^2)))

        INPUT:
            pca_model: (the transformation matrix as np array,
                       importance of each component) — the output of
                       L2BExperiment.PCA_transform_detail

        OUTPUT:
            an array containing the importance ratio of features based
            on the above formula
        """
        pca_transform_matrix = pca_model[0]
        pca_var_ratio = pca_model[1]
        #row_sums = pca_transform_matrix.sum(axis=1)
        #apparently pca transfomation is normalised along both access
        #anyway for some reason reshape(-1) doesn't work as transpose
        scaled_coeffs = pca_var_ratio.reshape(len(pca_var_ratio), 1) * pca_transform_matrix
        return np.apply_along_axis(np.linalg.norm, 0, scaled_coeffs)

    def run_l2b_experiment(self, train_portion, stochastic_params):
        """
        Run an individual instance of the given experiment: slice the
        sample, train, cross-validate and store the results.

        INPUT:
            train_portion: between 0 and 1, proportion of the sample
                           used for training
            stochastic_params: currently unused (see param_stochastifier)
        """
        utc_datetime = datetime.datetime.utcnow()
        utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        analyse_log_file = self.l2btools.analyser_results_dir + 'analyse_' + str(utc_datetime)
        logging.basicConfig(filename=analyse_log_file, level=logging.INFO)
        logging.info('Begin learn 2 ban analysis for Experiment Id: ' + str(self.expr_dict['id']))
        #Divide up data set into training and testing portions based on initial given value
        marked_training_set = self.trainer.get_training_set()
        #if no body is a bot then this is not a fruitful experiment
        if marked_training_set.no_culprit():
            logging.info("No bot detected, Giving up on experiment " + str(self.expr_dict['id']))
            return
        #here we need to check if we lost features or not due to normalisation
        #sparse normaliastion doesn't cut off feature
        if self.expr_dict['norm_mode'] == 'individual':
            # drop features whose std was 0 (they were cut during
            # individual normalisation) from the active feature list
            dimension_reducer = [cur_feature_std != 0 for cur_feature_std in marked_training_set._normalisation_data[marked_training_set.SAMPLE_STD]]
            self._active_feature_list = [self._active_feature_list[red_plc[0]] for red_plc in enumerate(dimension_reducer) if red_plc[1]]
        active_features = str(self._active_feature_list).strip('[]')
        #TODO: Iterate with different slicing to get reliable result
        train_selector, test_selector = self.l2btools.random_slicer(len(marked_training_set), train_portion)
        train_set = marked_training_set.get_training_subset(case_selector=train_selector)
        test_set = marked_training_set.get_training_subset(case_selector=test_selector)
        #initializes L2BEXperiment
        cur_experiment = L2BExperiment(train_set, test_set, self.trainer)
        #TODO: mRMR and PCA are independent of slicing and should be
        #computed over the whole dataset
        # Get the mRMR
        mrmr = cur_experiment.get_mrmr()
        logging.info('mRMR score: ' + str(mrmr))
        # Get the PCA ratios as a string
        pca_ratios = str(self._pca_importance_ananlysis(cur_experiment.pca_transform_detail())).strip('[]')
        logging.info('PCA ratios: ' + pca_ratios)
        #Train model against training set
        cur_experiment.train()
        #Predict for training data using constructed model
        score = cur_experiment.cross_validate_test()
        logging.info('Crossvalidation score: ' + str(score))
        self.store_results(analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr)

    def store_results(self, analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr):
        """
        Persist one experiment run: insert a row into the db and pickle
        the trained model(s) next to the analyse log file.
        """
        # Add the result to the database
        experiment_result = {}
        experiment_result['experiment_id'] = self.expr_dict['id']
        experiment_result['result_file'] = analyse_log_file
        experiment_result['proportion'] = train_portion
        experiment_result['score'] = score
        experiment_result['active_features'] = active_features
        experiment_result['pca_ratios'] = pca_ratios
        experiment_result['mrmr_score'] = str(mrmr).strip('[]')
        #while the pickle model is always created the result file only
        #get stored in the case there are an error
        self.l2btools.save_experiment_result(experiment_result)
        self.trainer.save_model(analyse_log_file + ".l2b_pickle_model")
        #also try to store in recontsructable libsvm format if the function
        #if the save_svm_model function is implmented
        try:
            self.trainer.save_model(analyse_log_file + ".normal_svm_model", "normal_svm")
        except NotImplementedError:
            print "save_svm_model is not implmeneted in your scikit-learn, skipping storing the model in libsvm format"
        print "Experiment", self.expr_dict['id'], ": train portion = ", train_portion, ", score = ", score, ", mRMR = ", mrmr, ", PCA ratios = ", pca_ratios
        print experiment_result
def process_incident(self, incident):
    """
    Gather features for a single incident and store the resulting
    sessions in the database.

    The raw records come either from Elasticsearch (when the incident
    has no associated log file) or from the incident's nginx log file.

    INPUT:
        incident: dict-like incident record providing 'id', 'start',
                  'stop', 'target' and 'file_name'.  None is a no-op.

    OUTPUT:
        the post-processed ip_feature_db dictionary, or None when
        incident is None.
    """
    if (incident is None):
        return
    ip_sieve = IPSieve()
    ip_records = {}
    banned_ips = []
    if (incident["file_name"] is None) or (len(incident["file_name"]) == 0):
        # get the banjax bans and the raw ATS records from ES for the
        # incident time window
        banned_ips = self.es_handler.get_banjax(incident['start'], incident['stop'], incident['target'])
        ats_records = self.es_handler.get(incident['start'], incident['stop'], incident['target'])
        # calculate IP dictionary with ATS records
        ip_records = ip_sieve.process_ats_records(ats_records)
    else:
        # read the sessions from the log file
        ip_sieve.add_log_file(incident["file_name"])
        ip_records = ip_sieve.parse_log("nginx")
    # calculate features
    ip_feature_db = {}
    # At this stage it is only a preliminary list; we might lose
    # features later due to 0 variance
    self._active_feature_list = []
    # do a dry run on all features just to gather the indices of all
    # available features
    for CurentFeature in Learn2BanFeature.__subclasses__():
        f = CurentFeature(ip_records, ip_feature_db)
        self._active_feature_list.append(f._FEATURE_INDEX)
    # second pass actually computes every feature into ip_feature_db
    for CurentFeature in Learn2BanFeature.__subclasses__():
        f = CurentFeature(ip_records, ip_feature_db)
        #logging.info("Computing feature %i..."% f._FEATURE_INDEX)
        print "Computing feature %i..." % f._FEATURE_INDEX
        f.compute()
    # post process the features
    ip_feature_db = self.bothound_tools.post_process(ip_feature_db)
    # delete the old sessions for this incident before adding new ones
    self.bothound_tools.delete_sessions(incident['id'])
    #print ip_feature_db
    self.bothound_tools.add_sessions(incident['id'], ip_feature_db, banned_ips)
    self.bothound_tools.set_incident_process(incident['id'], False)
    print "Incident {} processed.".format(incident['id'])
    return ip_feature_db
class Experimentor():
    """
    Runs the experiments stored in the `experiment` table.

    There is a need for two types of experiment objects: one corresponding
    to each record in the `experiment` table and one corresponding to each
    record in `experiment_results`. That is because from one stored
    experiment you can run many derived experiments with little change in
    parameters (the train/test portion for example) and we do not want to
    store all of those in the DB. Hence this class reads the experiment
    from the db (experiment type 1) and generates the L2BExperiment
    objects (experiment type 2).
    """
    # #user will send one of these values to tweak the analyser behaviour
    # #train begin will take the beginning portion of the data for training
    # #train random will choose random rows of the sample set
    # TRAIN_BEGIN = 0
    # TRAIN_RANDOM = 1
    # def __init__(self, where_to_train = TRAIN_BEGIN, training_portion = 1):
    #     """
    #     Initiate the behaviour of the analyzer. These parameters should
    #     also be tweakable from the database.
    #
    #     INPUT:
    #         - where_to_train: which part of the sample should be used for
    #                           training
    #         - training_portion: between 0 - 1, tells the analyser how much
    #                             of the sample is for training and how much
    #                             for testing
    #     """
    #     self._where_to_train = where_to_train
    #     self._training_portion = training_portion

    def __init__(self, exp, l2btools):
        """
        Store the exp config in self's attributes, build the classifier and
        trainer, then immediately process the logs and mark the bots.

        INPUT:
            exp: dict-like experiment record; at least the 'id',
                 'kernel_type' and 'norm_mode' keys are read.
            l2btools: Learn2BanTools-like helper used for db access,
                      classifier construction and result paths.
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools
        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

        # Create the classifier; currently only SVM is supported, but the
        # trainer is agnostic of the classifier used provided it supports
        # fit and predict.
        self.experiment_classifier = self.l2btools.construct_svm_classifier(self.expr_dict['kernel_type'])
        self.trainer = Train2Ban(self.experiment_classifier)

        # The base filename we are going to associate to the results of
        # this experiment.
        utc_datetime = datetime.datetime.utcnow()
        # NOTE(review): strftime's return value is discarded here, so the
        # filename below embeds str(utc_datetime), not the formatted stamp.
        utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + str(utc_datetime)

        # The heavy lifting happens right in the constructor: parse the
        # logs, compute the features and mark the malicious IPs.
        self._process_logs()
        self._mark_bots()

    def param_stochastifier(self):
        """
        Return a randomised set of parameters for the experiments.

        At present the intent is to choose between normalisation (sparse,
        individual), dimension reduction (PCA, ISOMap, MD5) and training
        portion (scale from 0-1), but the current implementation always
        returns an empty list.
        """
        param_set = []
        return param_set

    def _process_logs(self):
        """
        Get the log names for this experiment from the db and compute all
        features over them, feeding the result into the trainer.

        Side effects: fills self._active_feature_list, adds the feature db
        to the trainer's sample, dumps the pre-normalisation vectors to a
        json file, deletes self.ip_sieve and self.ip_feature_db to release
        memory, and normalises the trainer's sample.
        """
        # This is not an OOP way of retrieving the logs, but we are
        # avoiding db access in classes other than l2btools.
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

        # If there is no log associated to this experiment then there is
        # nothing to do.
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log"%self.expr_dict['id'])
            return

        # The log id needs to be sent to the trainer so the trainer knows
        # which regex is detecting the bots for which log.
        self.trainer.add_malicious_history_log_files([(cur_log_info['log_id'], cur_log_info['file_name']) for cur_log_info in cur_experiment_logs])

        # Extract the filenames to gather the IP features from.
        log_filenames = tuple(cur_log['file_name'] for cur_log in cur_experiment_logs)

        # At this stage this is only a preliminary list: we might lose
        # features later due to 0 variance.
        self._active_feature_list = []
        # Do a dry run on all features just to gather the indices of all
        # available features.
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

        for cur_log_file in log_filenames:
            # In theory it might be more memory efficient to crunch the
            # logs one by one, but python is quite disappointing in memory
            # management.
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."

            # NOTE(review): when the read fails above, this loop still runs
            # over whatever state the sieve was left in — confirm intended.
            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
                logging.info("Computing feature %i..."%cur_feature_tester._FEATURE_INDEX)
                cur_feature_tester.compute()

        # We have a memory problem here :( — drop the parsed records as
        # soon as the features are computed.
        # import objgraph
        # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')
        del self.ip_sieve._ordered_records
        del self.ip_sieve
        # python is reluctant to release the memory otherwise
        # import gc
        # gc.collect()
        # print gc.garbage()

        self.trainer.add_to_sample(self.ip_feature_db)

        # We store the non-normalised vectors in a json file (dict keys
        # are stringified so they are json-serialisable).
        jsonized_ip_feature_db = {}
        for k,v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file+".prenormal_ip_feature_db.json", "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)
        del self.ip_feature_db
        del jsonized_ip_feature_db

        # Normalise the training set; normalisation should happen after
        # the whole sample is gathered.
        self.trainer.normalise(self.expr_dict['norm_mode'])

    def _mark_bots(self):
        """
        Read the regexes corresponding to this experiment's logs and apply
        them to the trainer; also write the marked malicious IPs to a side
        file. This should be called after the logs have been processed.
        """
        # Add the Fail2Ban filters for this experiment.
        filters_for_experiment = self.l2btools.load_bad_filters_from_db(self.id)
        for cur_filter in filters_for_experiment:
            self.trainer.add_bad_regexes(cur_filter['log_id'], (cur_filter['regex'],))
        # Use the Fail2ban filters to identify and mark DDOS IPs in the
        # data set.
        malicious_ips = self.trainer.mark_bad_target()
        with open(self.base_analyse_log_file+".malicious_ip_list", "w") as malicious_ip_file:
            malicious_ip_file.write(str(malicious_ips).strip('[]'))

    def _pca_importance_ananlysis(self, pca_model):
        """
        Retrieve the pca transformation and use the following formula to
        determine the importance of each feature:

            length(variance*|c_1j|/sqrt(sum(c1i_2^2)))

        INPUT:
            pca_model: (the transformation matrix as an np array,
                       importance of each component) — the output of
                       L2BExperiment.pca_transform_detail

        OUTPUT:
            an array containing the importance ratio of the features based
            on the above formula
        """
        pca_transform_matrix = pca_model[0]
        pca_var_ratio = pca_model[1]
        #row_sums = pca_transform_matrix.sum(axis=1)
        # Apparently the pca transformation is normalised along both axes;
        # for some reason reshape(-1) doesn't work as transpose, hence the
        # explicit (n, 1) reshape before broadcasting.
        scaled_coeffs = pca_var_ratio.reshape(len(pca_var_ratio),1) * pca_transform_matrix
        # Column-wise norms: one importance value per original feature.
        return np.apply_along_axis(np.linalg.norm, 0 , scaled_coeffs)

    def run_l2b_experiment(self, train_portion, stochastic_params):
        """
        Run an individual instance of the given experiment.

        INPUT:
            train_portion: fraction (0-1) of the marked sample used for
                           training; the remainder is used for testing.
            stochastic_params: not read in this method; presumably meant
                               for param_stochastifier output — TODO confirm.
        """
        utc_datetime = datetime.datetime.utcnow()
        # NOTE(review): as in __init__, the strftime result is discarded.
        utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        analyse_log_file = self.l2btools.analyser_results_dir + 'analyse_' + str(utc_datetime)
        logging.basicConfig(filename=analyse_log_file, level=logging.INFO)
        logging.info('Begin learn 2 ban analysis for Experiment Id: ' + str(self.expr_dict['id']))

        # Divide the data set into training and testing portions based on
        # the given train_portion value.
        marked_training_set = self.trainer.get_training_set()
        # If nobody is a bot then this is not a fruitful experiment.
        if marked_training_set.no_culprit():
            logging.info("No bot detected, Giving up on experiment " + str(self.expr_dict['id']))
            return

        # Here we need to check whether we lost features due to
        # normalisation; sparse normalisation doesn't cut off features.
        if self.expr_dict['norm_mode']=='individual':
            # Keep only the features whose sample std deviation is nonzero.
            dimension_reducer = [cur_feature_std != 0 for cur_feature_std in marked_training_set._normalisation_data[marked_training_set.SAMPLE_STD]]
            self._active_feature_list = [self._active_feature_list[red_plc[0]] for red_plc in enumerate(dimension_reducer) if red_plc[1]]
        active_features = str(self._active_feature_list).strip('[]')

        #TODO: Iterate with different slicing to get a reliable result
        train_selector, test_selector = self.l2btools.random_slicer(len(marked_training_set), train_portion)
        train_set = marked_training_set.get_training_subset(case_selector=train_selector)
        test_set = marked_training_set.get_training_subset(case_selector=test_selector)

        # Initialise the L2BExperiment for this train/test split.
        cur_experiment = L2BExperiment(train_set, test_set, self.trainer)

        #TODO: mRMR and PCA are independent of slicing and should be
        #      computed over the whole dataset
        # Get the mRMR.
        mrmr = cur_experiment.get_mrmr()
        logging.info('mRMR score: ' + str(mrmr))

        # Get the PCA ratios as a string.
        pca_ratios = str(self._pca_importance_ananlysis(cur_experiment.pca_transform_detail())).strip('[]')
        logging.info('PCA ratios: ' + pca_ratios)

        # Train the model against the training set.
        cur_experiment.train()

        # Score the constructed model by cross validation.
        score = cur_experiment.cross_validate_test()
        logging.info('Crossvalidation score: ' + str(score))

        self.store_results(analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr)

    def store_results(self, analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr):
        """
        Persist one experiment run: save the result row in the db, pickle
        the trained model, and (best effort) save it in libsvm format.

        INPUT:
            analyse_log_file: base path for the model/result files.
            train_portion: fraction of the sample used for training.
            score: cross-validation score of the run.
            active_features: stringified list of active feature indices.
            pca_ratios: stringified PCA importance ratios.
            mrmr: mRMR score object/array; stored stringified.
        """
        # Add the result to the database.
        experiment_result = {}
        experiment_result['experiment_id'] = self.expr_dict['id']
        experiment_result['result_file'] = analyse_log_file
        experiment_result['proportion'] = train_portion
        experiment_result['score'] = score
        experiment_result['active_features'] = active_features
        experiment_result['pca_ratios'] = pca_ratios
        experiment_result['mrmr_score'] = str(mrmr).strip('[]')
        # NOTE(review): the original comment here claimed the result file
        # is only stored when there is an error, while the pickle model is
        # always created — cannot confirm from this code; verify upstream.
        self.l2btools.save_experiment_result(experiment_result)
        self.trainer.save_model(analyse_log_file+".l2b_pickle_model")
        # Also try to store in a reconstructable libsvm format, if the
        # save_svm_model function is implemented in this scikit-learn.
        try:
            self.trainer.save_model(analyse_log_file+".normal_svm_model", "normal_svm")
        except NotImplementedError:
            print "save_svm_model is not implmeneted in your scikit-learn, skipping storing the model in libsvm format"
        print "Experiment", self.expr_dict['id'], ": train portion = ", train_portion, ", score = ", score, ", mRMR = ", mrmr, ", PCA ratios = ", pca_ratios
        print experiment_result