Example #1
    def __init__(self, exp, l2btools):
        """
        Store the experiment config in the instance's attributes.
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools

        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

        #Create classifier, currently only SVM supported
        #but trainer is agnostic of classifier used provided it supports fit and predict
        self.experiment_classifier = self.l2btools.construct_svm_classifier(
            self.expr_dict['kernel_type'])
        #Create trainer
        self.trainer = Train2Ban(self.experiment_classifier)
        #Set up base data set
        #the base filename we are going to associate with the result of this experiment
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + utc_timestamp
        #this makes more sense to happen in the constructor, however
        self._process_logs()
        self._mark_bots()
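
The comment above stresses that the trainer only needs a classifier exposing fit and predict. A hedged sketch of swapping in a different scikit-learn estimator under that assumption; RandomForestClassifier is purely illustrative and not part of the original code:

# Sketch only: any estimator with fit/predict should satisfy Train2Ban,
# per the comment above. RandomForestClassifier is an illustrative substitute.
from sklearn.ensemble import RandomForestClassifier

alternative_classifier = RandomForestClassifier(n_estimators=100)
trainer = Train2Ban(alternative_classifier)
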
Example #2
	def process_incident(self, incident):
		"""
		get the incident time from the db and gathers all features

		INPUT:
			log_files: the logs that we went through it.
		"""
		if(incident is None):
			return 

		ip_sieve = IPSieve()
		ip_records = {}		
		banned_ips = []

		if(incident["file_name"] is None) or (len(incident["file_name"]) == 0):
			# get the logs from ES
			banned_ips = self.es_handler.get_banjax(incident['start'], incident['stop'], incident['target'])
			ats_records = self.es_handler.get(incident['start'], incident['stop'], incident['target'])

			# calculate IP dictionary with ATS records
			ip_records = ip_sieve.process_ats_records(ats_records)
		else:
			# read the sessions from the log file
			ip_sieve.add_log_file(incident["file_name"])
			ip_records = ip_sieve.parse_log("nginx")
	
		# calculate features
		ip_feature_db = {}

		#At this stage it is only a preliminary list; we might lose features
		#due to 0 variance
		self._active_feature_list = []
		#do a dry run on all features just to gather the indices of all available
		#features
		for CurentFeature in Learn2BanFeature.__subclasses__():
			f = CurentFeature(ip_records, ip_feature_db)
			self._active_feature_list.append(f._FEATURE_INDEX)

		for CurentFeature in Learn2BanFeature.__subclasses__():
			f = CurentFeature(ip_records, ip_feature_db)
			#logging.info("Computing feature %i..."% f._FEATURE_INDEX)
			print "Computing feature %i..."% f._FEATURE_INDEX
			f.compute()

		# post process the features
		ip_feature_db = self.bothound_tools.post_process(ip_feature_db)

		# delete the old sessions for this incident
		self.bothound_tools.delete_sessions(incident['id'])

		#print ip_feature_db
		self.bothound_tools.add_sessions(incident['id'], ip_feature_db, banned_ips)
		self.bothound_tools.set_incident_process(incident['id'], False)
		print "Incident {} processed.".format(incident['id'])
		return ip_feature_db
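
A hedged usage sketch for process_incident, assuming an incident dict with the 'id', 'start', 'stop', 'target' and 'file_name' fields referenced above; the handler instance and the concrete values are hypothetical:

# Illustrative only: the keys mirror those read inside process_incident.
incident = {
    "id": 42,                        # hypothetical incident id
    "start": "2016-01-01 00:00:00",
    "stop": "2016-01-01 06:00:00",
    "target": "example.com",
    "file_name": None,               # None/empty -> fall back to the ES handler
}
features = handler.process_incident(incident)  # handler: an instance of the class above
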
Example #3
class BasicTests(unittest.TestCase):
    log_files = (
        src_dir + "/test/deflect_test.log",
        src_dir + "/test/deflect.log_cool1.20120810_five_percent.log"
    )  #src_dir+"/test/deflect.log_cool1.20120810.23h59m50s-20120812.00h00m00s.old" )
    #log_files = (src_dir+"/tests/deflect_test.log", src_dir+"/tests/deflect_test.log")
    test_ip_sieve = IPSieve()
    test_ip_feature_db = {}

    def __init__(self):
        pass

    def test_ip_sieve_parse(self):
        for cur_log_file in self.log_files:
            self.test_ip_sieve.add_log_file(cur_log_file)
            self.test_ip_sieve.parse_log()

    def test_all_features(self):
        for cur_log_file in self.log_files:
            self.test_ip_sieve.add_log_file(cur_log_file)
            self.test_ip_sieve.parse_log()

            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(
                    self.test_ip_sieve, self.test_ip_feature_db)
                cur_feature_tester.compute()

        print self.test_ip_feature_db

    def run_tests(self):
        """
        run all the tests from a single function that can be fed to the profiler
        """
        self.test_ip_sieve_parse()
        self.test_all_features()
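
Since run_tests exists to hand the profiler a single callable, a minimal profiling sketch under that assumption (BasicTests is instantiated directly, as its bare __init__ suggests):

# Sketch: profile one full parse-and-feature pass with cProfile.
import cProfile
import pstats

tests = BasicTests()
profiler = cProfile.Profile()
profiler.runcall(tests.run_tests)
pstats.Stats(profiler).sort_stats("cumulative").print_stats(20)
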
Example #4
    def __init__(self, exp, l2btools):
        """
        Store the experiment config in the instance's attributes.
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools

        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

        #Create classifier, currently only SVM supported
        #but trainer is agnostic of classifier used provided it supports fit and predict
        self.experiment_classifier = self.l2btools.construct_svm_classifier(self.expr_dict['kernel_type'])
        #Create trainer
        self.trainer = Train2Ban(self.experiment_classifier)
        #Set up base data set
        #the base filename we are going to associate with the result of this experiment
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + utc_timestamp
        #this makes more sense to happen in the constructor, however
        self._process_logs()
        self._mark_bots()
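
The timestamp formatted above is meant to end up in the result filename; a small sketch of the intended formatting (the directory and the example output are illustrative):

# Illustrative: build a timestamped result filename.
import datetime

utc_timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H%MZ")  # e.g. "2013-05-17-1430Z"
base_analyse_log_file = "/tmp/l2b_results/" + "base_analyse_" + utc_timestamp
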
Example #5
class Learn2BanTools():
    ip_sieve = IPSieve()
    ip_feature_db = {}
    log_files = list()

    def load_data_logs(self):
        """
        Retrieve all training logs from the training log directory
        """
        data_log_list = glob.glob(self.data_dir + '*')
        self.log_files = data_log_list
        return self.log_files

    def connect_to_db(self):
        """
        This connection to the db will live for the lifetime of the
        learn2bantools instance and will be used to save data back to the db
        """
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password)

        #Create cursor object to allow query execution
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)
        sql = 'CREATE DATABASE IF NOT EXISTS learn2ban'
        self.cur.execute(sql)

        #Connect directly to DB
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password,
                                  self.db_name)
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)

    def disconnect_from_db(self):
        """
        Close connection to the database
        """
        self.cur.close()
        self.db.close()

    # def save_experiment(self, experiment_result):
    #     """
    #     Save the results of an experimental run (old version)
    #     """
    #     add_experiment_result = ("INSERT INTO experiment_result( experiment_id, result_file) VALUES (%(experiment_id)s,%(result_file)s)")
    #     self.cur.execute(add_experiment_result, experiment_result)
    #     self.db.commit()

    def save_experiment_result(self, experiment_result):
        """
        Saves the result of an experimental run, including testing proportion used and score
        """
        add_experiment_result = (
            "INSERT INTO experiment_results( experiment_id, result_file, proportion, score, active_features, pca_ratios, mrmr_score) VALUES (%(experiment_id)s,%(result_file)s,%(proportion)s,%(score)s,%(active_features)s,%(pca_ratios)s,%(mrmr_score)s)"
        )
        self.cur.execute(add_experiment_result, experiment_result)
        self.db.commit()

    def retrieve_experiments_results(self):
        """
        Retrieve the results of the experiments already run
        """
        self.cur.execute("SELECT * FROM experiment_results")
        experiment_results = self.cur.fetchall()
        return experiment_results

    def delete_all_experiments_results(self):
        """
        Empties (truncates) the experiment_results table
        """
        self.cur.execute("TRUNCATE TABLE experiment_results")

    def retrieve_experiments(self):
        """
        Retrieve the set of experiments to run from the database
        """
        self.cur.execute("SELECT * FROM experiments where enabled=TRUE")
        self.experiment_set = self.cur.fetchall()
        return self.experiment_set

    def retrieve_experiment_logs(self, experiment_id):
        """
        Read the experiment_logs table and retrieve the names
        of the logs associated with the experiment id

        INPUT:
           experiment_id: the id of the experiment whose logs are sought
        """
        self.cur.execute(
            "SELECT experiment_logs.log_id, logs.file_name FROM experiment_logs, logs WHERE experiment_logs.log_id = logs.id AND experiment_logs.experiment_id ="
            + str(experiment_id) + ";")
        log_set = self.cur.fetchall()
        #add the full path to log files
        for cur_log in log_set:
            cur_log['file_name'] = self.data_dir + cur_log['file_name']
        return log_set

    def load_database_config(self):
        """
        Get configuration parameters from the learn2ban config file
        and from the learn2ban database
        """
        config = ConfigParser.ConfigParser()
        config.readfp(open(src_dir + '/config/train2ban.cfg'))
        self.db_user = config.get('db_params', 'db_user')
        self.db_password = config.get('db_params', 'db_password')
        self.db_host = config.get('db_params', 'db_host')
        self.db_name = config.get('db_params', 'db_name')
        self.config_profile = config.get('db_params', 'config_profile')

    def load_train2ban_config(self):
        #Get database connection params
        self.load_database_config()
        #Establish database connection object
        self.connect_to_db()
        #Get basic config parameters
        #first try to see if there is a config specific to this host
        if (not self.config_profile):
            self.config_profile = "default"

        self.cur.execute("SELECT * from config where profile_name='" +
                         self.config_profile + "';")
        config_row = self.cur.fetchone()

        #otherwise we read first row of the database (the one with minimum id)
        if (not config_row):
            self.cur.execute("SELECT * from config ORDER BY id ASC")
            config_row = self.cur.fetchone()

        if (not config_row):
            raise IOError, "No configuration record in the database"

        if not config_row["absolute_paths"]:
            cur_dir = src_dir
        else:
            cur_dir = ""

        try:
            self.data_dir = cur_dir + config_row["training_directory"] + (
                config_row["training_directory"][-1] != "/" and "/" or "")
            self.analyser_results_dir = cur_dir + config_row[
                "analyser_results_directory"] + (
                    config_row["analyser_results_directory"][-1] != "/" and "/"
                    or "")

        except IndexError:
            raise ValueError, "Data and Result directory can not be left blank"

        #deprecated for now, we are entering the regexes directly into the db
        if config_row["regex_filter_directory"]:
            self.filter_dir = cur_dir + config_row["regex_filter_directory"] + (
                config_row["regex_filter_directory"][-1] != "/" and "/" or "")

        if config_row["default_filter_file"]:
            self.filter_file = self.filter_dir + config_row[
                "default_filter_file"] + (
                    config_row["default_filter_file"][-1] != "/" and "/" or "")

    def add_data_log(self, log):
        self.log_files = list()
        self.log_files.append(self.train_dir + log)

    def load_bad_filters_from_db(self, experiment_id):
        #TODO: ensure cur is live
        self.cur.execute(
            "SELECT regex_assignment.log_id, regex_filters.regex from regex_assignment, regex_filters, experiment_logs WHERE regex_assignment.regex_filter_id = regex_filters.id AND regex_assignment.log_id = experiment_logs.log_id AND experiment_logs.experiment_id = "
            + str(experiment_id))
        return self.cur.fetchall()

    def load_bad_filters(self):
        """
        Load set of regex filters from the default filter file.
        This is to allow expression of an individual filter file, rather
        than a set or by experiment.
        """
        tree = ET.parse(self.filter_file)
        root = tree.getroot()
        filters = list()
        for child in root:
            filters.append(child.text)
        return filters

    def sieve_the_ip(self):
        """
        This was used when all experiments were using all of the log files,
        but in the new model each experiment has its own files
        """
        for cur_log_file in self.log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log_file()

    # def magnitude(self,v):
    #     return math.sqrt(sum(v[i]*v[i] for i in v))

    def clear_data(self):
        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

    def gather_all_features(self, log_files):
        """
        Gather all features from the given log files.

        INPUT:
            log_files: the log files to process
        """
        for cur_log_file in log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(
                    self.ip_sieve, self.ip_feature_db)
                cur_feature_tester.compute()

        return self.ip_feature_db

    def construct_svm_classifier(self, kernel_mode='linear'):
        """
        Creates an instance of the SVM classifier with a given mode
        """
        return svm.SVC(kernel=kernel_mode)

    def random_slicer(self, data_size, train_portion=0.5):
        """
        Return two random boolean arrays that are complements of each
        other, used for slicing a set into training and testing portions

        INPUT:
            data_size: size of the arrays to return
            train_portion: between 0 and 1, the expected fraction of True
                           entries
        """
        from random import random
        random_selector = [
            random() < train_portion for i in range(0, data_size)
        ]
        complement_selector = np.logical_not(random_selector)

        return random_selector, complement_selector

    def __init__(self):
        #we would like people to be able to use the tool object even
        #if they don't have a db, so we have no reason to load this
        #config in the constructor
        #self.load_database_config()
        pass
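
A hedged usage sketch for random_slicer, applying the boolean selectors to a toy numpy sample matrix (the data itself is made up):

# Illustrative only: split a toy feature matrix into training and testing rows.
import numpy as np

l2btools = Learn2BanTools()
samples = np.arange(20).reshape(10, 2)                 # 10 fake feature vectors
train_sel, test_sel = l2btools.random_slicer(len(samples), train_portion=0.7)
train_rows = samples[np.asarray(train_sel)]            # roughly 70% of the rows
test_rows = samples[test_sel]                          # the complementary rows
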
Example #6
 def clear_data(self):
     self.ip_sieve = IPSieve()
     self.ip_feature_db = {}
Example #7
class Learn2BanTools():
    ip_sieve = IPSieve()
    ip_feature_db = {}
    log_files = list()

    def load_data_logs(self):
        """
        Retrieve all training logs from the training log directory
        """
        data_log_list = glob.glob(self.data_dir + '*')
        self.log_files = data_log_list
        return self.log_files

    def connect_to_db(self):
        """
        This connection to the db will live for the lifetime of the
        learn2bantools instance and will be used to save data back to the db
        """
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password)

        #Create cursor object to allow query execution
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)
        sql = 'CREATE DATABASE IF NOT EXISTS learn2ban'
        self.cur.execute(sql)

        #Connect directly to DB
        self.db = MySQLdb.connect(self.db_host, self.db_user, self.db_password, self.db_name)
        self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)

    def disconnect_from_db(self):
        """
        Close connection to the database
        """
        self.cur.close()
        self.db.close()

    # def save_experiment(self, experiment_result):
    #     """
    #     Save the results of an experimental run (old version)
    #     """
    #     add_experiment_result = ("INSERT INTO experiment_result( experiment_id, result_file) VALUES (%(experiment_id)s,%(result_file)s)")
    #     self.cur.execute(add_experiment_result, experiment_result)
    #     self.db.commit()

    def save_experiment_result(self, experiment_result):
        """
        Saves the result of an experimental run, including testing proportion used and score
        """
        add_experiment_result = ("INSERT INTO experiment_results( experiment_id, result_file, proportion, score, active_features, pca_ratios, mrmr_score) VALUES (%(experiment_id)s,%(result_file)s,%(proportion)s,%(score)s,%(active_features)s,%(pca_ratios)s,%(mrmr_score)s)")
        self.cur.execute(add_experiment_result, experiment_result)
        self.db.commit()

    def retrieve_experiments_results(self):
        """
        Retrieve the results of the experiments already run
        """
        self.cur.execute("SELECT * FROM experiment_results")
        experiment_results = self.cur.fetchall()
        return experiment_results

    def delete_all_experiments_results(self):
        """
        Empties (truncates) the experiment_results table
        """
        self.cur.execute("TRUNCATE TABLE experiment_results")

    def retrieve_experiments(self):
        """
        Retrieve the set of experiments to run from the database
        """
        self.cur.execute("SELECT * FROM experiments where enabled=TRUE")
        self.experiment_set = self.cur.fetchall()
        return self.experiment_set

    def retrieve_experiment_logs(self, experiment_id):
        """
        Read the experiment_logs table and retrieve the names
        of the logs associated with the experiment id

        INPUT:
           experiment_id: the id of the experiment whose logs are sought
        """
        self.cur.execute("SELECT experiment_logs.log_id, logs.file_name FROM experiment_logs, logs WHERE experiment_logs.log_id = logs.id AND experiment_logs.experiment_id =" + str(experiment_id) + ";")
        log_set =  self.cur.fetchall()
        #add the full path to log files
        for cur_log in log_set:
            cur_log['file_name'] = self.data_dir + cur_log['file_name']
        return log_set

    def load_database_config(self):
        """
        Get configuration parameters from the learn2ban config file
        and from the learn2ban database
        """
        config = ConfigParser.ConfigParser()
        config.readfp(open(src_dir+'/config/train2ban.cfg'))
        self.db_user = config.get('db_params', 'db_user')
        self.db_password = config.get('db_params', 'db_password')
        self.db_host = config.get('db_params', 'db_host')
        self.db_name = config.get('db_params', 'db_name')
        self.config_profile = config.get('db_params', 'config_profile')

    def load_train2ban_config(self):
        #Get database connection params
        self.load_database_config()
        #Establish database connection object
        self.connect_to_db()
        #Get basic config parameters
        #first try to see if there is a config specific to this host
        if (not self.config_profile):
            self.config_profile = "default"

        self.cur.execute("SELECT * from config where profile_name='"+ self.config_profile+"';")
        config_row = self.cur.fetchone()

        #otherwise we read first row of the database (the one with minimum id)
        if (not config_row):
            self.cur.execute("SELECT * from config ORDER BY id ASC")
            config_row = self.cur.fetchone()

        if (not config_row):
            raise IOError, "No configuration record in the database"

        if not config_row["absolute_paths"]:
            cur_dir = src_dir
        else:
            cur_dir = ""

        try:
            self.data_dir = cur_dir + config_row["training_directory"] + (config_row["training_directory"][-1] != "/" and "/" or "")
            self.analyser_results_dir = cur_dir + config_row["analyser_results_directory"] +  (config_row["analyser_results_directory"][-1] != "/" and "/" or "")

        except IndexError:
            raise ValueError, "Data and Result directory can not be left blank"

        #deprecated for now, we are entering the regexes directly into the db
        if config_row["regex_filter_directory"]:
            self.filter_dir = cur_dir + config_row["regex_filter_directory"]  + (config_row["regex_filter_directory"][-1] != "/" and "/" or "")
            
        if config_row["default_filter_file"]:
            self.filter_file = self.filter_dir + config_row["default_filter_file"] +  (config_row["default_filter_file"][-1] != "/" and "/" or "")

    def add_data_log(self, log):
        self.log_files = list()
        self.log_files.append(self.train_dir + log)

    def load_bad_filters_from_db(self, experiment_id):
        #TODO: ensure cur is live
        self.cur.execute("SELECT regex_assignment.log_id, regex_filters.regex from regex_assignment, regex_filters, experiment_logs WHERE regex_assignment.regex_filter_id = regex_filters.id AND regex_assignment.log_id = experiment_logs.log_id AND experiment_logs.experiment_id = " +str(experiment_id))
        return self.cur.fetchall()

    def load_bad_filters(self):
        """
        Load set of regex filters from the default filter file.
        This is to allow expression of an individual filter file, rather
        than a set or by experiment.
        """
        tree = ET.parse(self.filter_file)
        root = tree.getroot()
        filters = list()
        for child in root:
            filters.append(child.text)
        return filters

    def sieve_the_ip(self):
        """
        This was used when all experiments were using all of the log files,
        but in the new model each experiment has its own files
        """
        for cur_log_file in self.log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log_file()

    # def magnitude(self,v):
    #     return math.sqrt(sum(v[i]*v[i] for i in v))

    def clear_data(self):
        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

    def gather_all_features(self, log_files):
        """
        Gather all features from the given log files.

        INPUT:
            log_files: the log files to process
        """
        for cur_log_file in log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
                cur_feature_tester.compute()

        return self.ip_feature_db

    def construct_svm_classifier(self, kernel_mode='linear'):
        """
        Creates an instance of the SVM classifier with a given mode
        """
        return svm.SVC(kernel=kernel_mode)

    def random_slicer(self, data_size, train_portion=0.5):
        """
        Return two random boolean arrays that are complements of each
        other, used for slicing a set into training and testing portions

        INPUT:
            data_size: size of the arrays to return
            train_portion: between 0 and 1, the expected fraction of True
                           entries
        """
        from random import random
        random_selector = [random() < train_portion for i in range(0, data_size)]
        complement_selector = np.logical_not(random_selector)

        return random_selector, complement_selector

    def __init__(self):
        #we would like people to be able to use the tool object even
        #if they don't have a db, so we have no reason to load this
        #config in the constructor
        #self.load_database_config()
        pass
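
The repeated `x[-1] != "/" and "/" or ""` expression in load_train2ban_config only guarantees a trailing slash on each configured directory; a clearer equivalent, shown here as a hypothetical helper rather than a change to the original code:

# Hypothetical helper: make sure a directory path ends with a single "/".
def with_trailing_slash(path):
    return path if path.endswith("/") else path + "/"

# e.g. with_trailing_slash("/var/log/learn2ban") -> "/var/log/learn2ban/"
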
Example #8
 def clear_data(self):
     self.ip_sieve = IPSieve()
     self.ip_feature_db = {}
Example #9
class Experimentor():
    """
    There is a need for two types of Experiment objects: one that corresponds
    to each experiment record in the experiment table and one that corresponds
    to each result record in experiment_result.

    That is because from one experiment you can run many other experiments
    with little change in parameters, and by design we don't want to store all
    of these in the DB (the train/test portion, for example).

    Hence InseminatorExperiment reads the experiment from the db (Experiment type 1)
    and generates the L2BExperiment (Experiment type 2)
    """

    # #user will send one of these values to tweak the analyser behavior
    # #train begin will take the beginning portion of the data for training
    # #train random will choose random rows of the sample set
    # TRAIN_BEGIN = 0
    # TRAIN_RANDOM = 1
    # def __init__(self, where_to_train = TRAIN_BEGIN, training_portion = 1):
    #     """
    #     Initiate the behavior of the analyzer. These parameters should be
    #     also tweakable from the database

    #     INPUT:

    #       - where_to_train: which part of the sample should be used for
    #                         training
    #       - training_protion: Between 0 - 1, tells the analyser how much
    #                           of the sample is for training and how much
    #                           for testing.
    #     """
    #     self._where_to_train = where_to_train
    #     self._training_portion = training_portion
    def __init__(self, exp, l2btools):
        """
        Store the experiment config in the instance's attributes.
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools

        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

        #Create classifier, currently only SVM supported
        #but trainer is agnostic of classifier used provided it supports fit and predict
        self.experiment_classifier = self.l2btools.construct_svm_classifier(
            self.expr_dict['kernel_type'])
        #Create trainer
        self.trainer = Train2Ban(self.experiment_classifier)
        #Set up base data set
        #the base filename we are going to associate with the result of this experiment
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + utc_timestamp
        #this makes more sense to happen in the constructor, however
        self._process_logs()
        self._mark_bots()

    def param_stochastifier(self):
        """
        Return a randomised set of parameters for the experiments.
        At present we choose among normalisation (sparse, individual), dimension reduction (PCA, ISOMap, MD5) and training portion (scale from 0-1)
        """
        param_set = []
        return param_set

    def _process_logs(self):
        """
        Get the log names from the db and gather all features from them.
        """
        #this is not an OOP way of retrieving the logs but I think we are
        #avoiding db access in classes other than l2btools
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

        #if there is no log associated to this experiment then there is nothing
        #to do
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log" %
                         self.expr_dict['id'])
            return

        #the log id needs to be sent to the trainer so that the trainer
        #knows which regex is detecting the bots for which log
        self.trainer.add_malicious_history_log_files([
            (cur_log_info['log_id'], cur_log_info['file_name'])
            for cur_log_info in cur_experiment_logs
        ])

        #extracting the filenames
        #Get IP Features
        log_filenames = tuple(cur_log['file_name']
                              for cur_log in cur_experiment_logs)
        #At this stage it is only a preliminary list; we might lose features
        #due to 0 variance
        self._active_feature_list = []
        #do a dry run on all features just to gather the indices of all available
        #features
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve,
                                                    self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

        for cur_log_file in log_filenames:  #in theory it might be more memory efficient
            #to crunch the logs one by one but python is quite disappointing in memory
            #management
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."

        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve,
                                                    self.ip_feature_db)
            logging.info("Computing feature %i..." %
                         cur_feature_tester._FEATURE_INDEX)
            cur_feature_tester.compute()

            # we have memory problem here :(
            # import objgraph
            # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')

        del self.ip_sieve._ordered_records
        del self.ip_sieve

        #python is not releasing the memory here
        # import gc
        # gc.collect()
        # print gc.garbage()

        self.trainer.add_to_sample(self.ip_feature_db)

        #we store the non-normalized vectors in a json file
        jsonized_ip_feature_db = {}
        for k, v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file + ".prenormal_ip_feature_db.json",
                  "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)

        del self.ip_feature_db
        del jsonized_ip_feature_db

        #Normalise training set; normalisation should happen after the whole
        #sample is gathered
        self.trainer.normalise(self.expr_dict['norm_mode'])

    def _mark_bots(self):
        """
        Read the regexes corresponding to this experiment's logs and apply them to
        the trainer. This should be called after the logs have been processed.
        """
        #Add Fail2Ban filters
        filters_for_experiment = self.l2btools.load_bad_filters_from_db(
            self.id)
        for cur_filter in filters_for_experiment:
            self.trainer.add_bad_regexes(cur_filter['log_id'],
                                         (cur_filter['regex'], ))
        #Use Fail2ban filters to identify and mark DDOS IPs in data set
        malicious_ips = self.trainer.mark_bad_target()

        with open(self.base_analyse_log_file + ".malicious_ip_list",
                  "w") as malicious_ip_file:
            malicious_ip_file.write(str(malicious_ips).strip('[]'))

    def _pca_importance_ananlysis(self, pca_model):
        """
        Retrieve the pca transformation and use the following formula to
        determine the importance of each feature j:

        importance_j = sqrt(sum_k((var_k * |c_kj| / sqrt(sum_i(c_ki^2)))^2))

        where c_kj is the loading of feature j on component k and var_k is the
        explained variance ratio of component k.

        INPUT:
           pca_model: (the transformation matrix as an np array, importance of each
                      component), the output of L2BExperiment.pca_transform_detail
        OUTPUT: an array containing the importance ratio of the features based
                on the above formula
        """
        pca_transform_matrix = pca_model[0]
        pca_var_ratio = pca_model[1]

        #row_sums = pca_transform_matrix.sum(axis=1)
        #apparently the pca transformation is normalised along both axes
        #anyway for some reason reshape(-1) doesn't work as transpose
        scaled_coeffs = pca_var_ratio.reshape(len(pca_var_ratio),
                                              1) * pca_transform_matrix

        return np.apply_along_axis(np.linalg.norm, 0, scaled_coeffs)

    def run_l2b_experiment(self, train_portion, stochastic_params):
        """
        Run individual instance of given experiment
        """
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        analyse_log_file = self.l2btools.analyser_results_dir + 'analyse_' + utc_timestamp
        logging.basicConfig(filename=analyse_log_file, level=logging.INFO)
        logging.info('Begin learn 2 ban analysis for Experiment Id: ' +
                     str(self.expr_dict['id']))

        #Divide up data set into training and testing portions based on initial given value
        marked_training_set = self.trainer.get_training_set()

        #if nobody is a bot then this is not a fruitful experiment
        if marked_training_set.no_culprit():
            logging.info("No bot detected, Giving up on experiment " +
                         str(self.expr_dict['id']))
            return

        #here we need to check whether we lost features due to normalisation
        #(sparse normalisation doesn't cut off features)
        if self.expr_dict['norm_mode'] == 'individual':
            dimension_reducer = [
                cur_feature_std != 0
                for cur_feature_std in marked_training_set._normalisation_data[
                    marked_training_set.SAMPLE_STD]
            ]
            self._active_feature_list = [
                self._active_feature_list[red_plc[0]]
                for red_plc in enumerate(dimension_reducer) if red_plc[1]
            ]

        active_features = str(self._active_feature_list).strip('[]')
        #TODO: Iterate with different slicing to get reliable result
        train_selector, test_selector = self.l2btools.random_slicer(
            len(marked_training_set), train_portion)
        train_set = marked_training_set.get_training_subset(
            case_selector=train_selector)
        test_set = marked_training_set.get_training_subset(
            case_selector=test_selector)
        #initializes L2BExperiment
        cur_experiment = L2BExperiment(train_set, test_set, self.trainer)

        #TODO: mRMR and PCA are independent of slicing and should be
        #      computed over the whole dataset
        # Get the mRMR
        mrmr = cur_experiment.get_mrmr()
        logging.info('mRMR score: ' + str(mrmr))

        # Get the PCA ratios as a string
        pca_ratios = str(
            self._pca_importance_ananlysis(
                cur_experiment.pca_transform_detail())).strip('[]')
        logging.info('PCA ratios: ' + pca_ratios)

        #Train model against training set
        cur_experiment.train()

        #Predict for training data using constructed model
        score = cur_experiment.cross_validate_test()
        logging.info('Crossvalidation score: ' + str(score))

        self.store_results(analyse_log_file, train_portion, score,
                           active_features, pca_ratios, mrmr)

    def store_results(self, analyse_log_file, train_portion, score,
                      active_features, pca_ratios, mrmr):
        # Add the result to the database
        experiment_result = {}
        experiment_result['experiment_id'] = self.expr_dict['id']
        experiment_result['result_file'] = analyse_log_file
        experiment_result['proportion'] = train_portion
        experiment_result['score'] = score
        experiment_result['active_features'] = active_features
        experiment_result['pca_ratios'] = pca_ratios
        experiment_result['mrmr_score'] = str(mrmr).strip('[]')

        #while the pickle model is always created, the result file only
        #gets stored in case there is an error
        self.l2btools.save_experiment_result(experiment_result)

        self.trainer.save_model(analyse_log_file + ".l2b_pickle_model")
        #also try to store in reconstructable libsvm format
        #if the save_svm_model function is implemented
        try:
            self.trainer.save_model(analyse_log_file + ".normal_svm_model",
                                    "normal_svm")
        except NotImplementedError:
            print "save_svm_model is not implmeneted in your scikit-learn, skipping storing the model in libsvm format"

        print "Experiment", self.expr_dict[
            'id'], ": train portion = ", train_portion, ", score = ", score, ", mRMR = ", mrmr, ", PCA ratios = ", pca_ratios
        print experiment_result
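
A toy numerical sketch of the _pca_importance_ananlysis computation, using a made-up 2-component, 3-feature transformation matrix and variance ratios:

# Illustrative only: reproduce the scaled-coefficient norm used above.
import numpy as np

pca_transform_matrix = np.array([[0.8, 0.6, 0.0],      # component 1 loadings (made up)
                                 [0.0, 0.6, 0.8]])     # component 2 loadings (made up)
pca_var_ratio = np.array([0.7, 0.3])                   # explained-variance ratios (made up)

scaled_coeffs = pca_var_ratio.reshape(len(pca_var_ratio), 1) * pca_transform_matrix
feature_importance = np.apply_along_axis(np.linalg.norm, 0, scaled_coeffs)
print feature_importance                               # one importance value per feature
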
Example #10
    def process_incident(self, incident):
        """
		get the incident time from the db and gathers all features

		INPUT:
			log_files: the logs that we went through it.
		"""
        if (incident is None):
            return

        ip_sieve = IPSieve()
        ip_records = {}
        banned_ips = []

        if (incident["file_name"] is None) or (len(incident["file_name"])
                                               == 0):
            # get the logs from ES
            banned_ips = self.es_handler.get_banjax(incident['start'],
                                                    incident['stop'],
                                                    incident['target'])
            ats_records = self.es_handler.get(incident['start'],
                                              incident['stop'],
                                              incident['target'])

            # calculate IP dictionary with ATS records
            ip_records = ip_sieve.process_ats_records(ats_records)
        else:
            # read the sessions from the log file
            ip_sieve.add_log_file(incident["file_name"])
            ip_records = ip_sieve.parse_log("nginx")

        # calculate features
        ip_feature_db = {}

        #At this stage it is only a preliminary list; we might lose features
        #due to 0 variance
        self._active_feature_list = []
        #do a dry run on all features just to gather the indices of all available
        #features
        for CurentFeature in Learn2BanFeature.__subclasses__():
            f = CurentFeature(ip_records, ip_feature_db)
            self._active_feature_list.append(f._FEATURE_INDEX)

        for CurentFeature in Learn2BanFeature.__subclasses__():
            f = CurentFeature(ip_records, ip_feature_db)
            #logging.info("Computing feature %i..."% f._FEATURE_INDEX)
            print "Computing feature %i..." % f._FEATURE_INDEX
            f.compute()

        # post process the features
        ip_feature_db = self.bothound_tools.post_process(ip_feature_db)

        # delete the old sessions for this incident
        self.bothound_tools.delete_sessions(incident['id'])

        #print ip_feature_db
        self.bothound_tools.add_sessions(incident['id'], ip_feature_db,
                                         banned_ips)
        self.bothound_tools.set_incident_process(incident['id'], False)
        print "Incident {} processed.".format(incident['id'])
        return ip_feature_db
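
The loops above discover features through Learn2BanFeature.__subclasses__() and call compute() on each; a heavily hedged sketch of what such a subclass might look like, with the contract (constructor arguments, _FEATURE_INDEX, writing into ip_feature_db) inferred from the calling code rather than taken from the real base class:

# Hypothetical feature, modelled only on how features are invoked above.
class FeatureRequestCount(Learn2BanFeature):
    _FEATURE_INDEX = 99                        # made-up index; real features define their own

    def __init__(self, ip_records, ip_feature_db):
        self._ip_records = ip_records          # per-IP session records (assumed shape)
        self._ip_feature_db = ip_feature_db    # shared {ip: {feature_index: value}} dict (assumed shape)

    def compute(self):
        for ip, record in self._ip_records.items():
            self._ip_feature_db.setdefault(ip, {})[self._FEATURE_INDEX] = len(record)
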
Example #11
class Experimentor():
    """
    There is a need for two types of Experiment objects: one that corresponds
    to each experiment record in the experiment table and one that corresponds
    to each result record in experiment_result.

    That is because from one experiment you can run many other experiments
    with little change in parameters, and by design we don't want to store all
    of these in the DB (the train/test portion, for example).

    Hence InseminatorExperiment reads the experiment from the db (Experiment type 1)
    and generates the L2BExperiment (Experiment type 2)
    """
    # #user will send one of these values to tweak the analyser behavior
    # #train begin will take the beginning portion of the data for training
    # #train random will choose random rows of the sample set
    # TRAIN_BEGIN = 0
    # TRAIN_RANDOM = 1
    # def __init__(self, where_to_train = TRAIN_BEGIN, training_portion = 1):
    #     """
    #     Initiate the behavior of the analyzer. These parameters should be
    #     also tweakable from the database

    #     INPUT:

    #       - where_to_train: which part of the sample should be used for
    #                         training
    #       - training_protion: Between 0 - 1, tells the analyser how much
    #                           of the sample is for training and how much
    #                           for testing.
    #     """
    #     self._where_to_train = where_to_train
    #     self._training_portion = training_portion
    def __init__(self, exp, l2btools):
        """
        Store the experiment config in the instance's attributes.
        """
        self.expr_dict = exp
        self.id = self.expr_dict['id']
        self.l2btools = l2btools

        self.ip_sieve = IPSieve()
        self.ip_feature_db = {}

        #Create classifier, currently only SVM supported
        #but trainer is agnostic of classifier used provided it supports fit and predict
        self.experiment_classifier = self.l2btools.construct_svm_classifier(self.expr_dict['kernel_type'])
        #Create trainer
        self.trainer = Train2Ban(self.experiment_classifier)
        #Set up base data set
        #the base filename we are going to associate with the result of this experiment
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        self.base_analyse_log_file = self.l2btools.analyser_results_dir + 'base_analyse_' + utc_timestamp
        #this makes more sense to happen in the constructor, however
        self._process_logs()
        self._mark_bots()

    def param_stochastifier(self):
        """
        Return a randomised set of parameters for the experiments.
        At present we choose among normalisation (sparse, individual), dimension reduction (PCA, ISOMap, MD5) and training portion (scale from 0-1)
        """
        param_set = []
        return param_set

    def _process_logs(self):
        """
        Get the log names from the db and gather all features from them.
        """
        #this is not an OOP way of retrieving the logs but I think we are
        #avoiding db access in classes other than l2btools
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

        #if there is no log associated to this experiment then there is nothing
        #to do
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log"%self.expr_dict['id'])
            return

        #the log id needs to be sent to the trainer so that the trainer
        #knows which regex is detecting the bots for which log
        self.trainer.add_malicious_history_log_files([(cur_log_info['log_id'], cur_log_info['file_name']) for cur_log_info in cur_experiment_logs])

        #extracting the filenames
        #Get IP Features
        log_filenames = tuple(cur_log['file_name'] for cur_log in cur_experiment_logs)
        #At this stage it is only a preliminary list; we might lose features
        #due to 0 variance
        self._active_feature_list = []
        #do a dry run on all features just to gather the indices of all available
        #features
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

        for cur_log_file in log_filenames: #in theory it might be more memory efficient
            #to crunch the logs one by one but python is quite disappointing in memory
            #management
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."

        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            logging.info("Computing feature %i..."%cur_feature_tester._FEATURE_INDEX)
            cur_feature_tester.compute()

            # we have memory problem here :(
            # import objgraph
            # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')

        del self.ip_sieve._ordered_records
        del self.ip_sieve

        #python is not releasing the memory here
        # import gc
        # gc.collect()
        # print gc.garbage()

        self.trainer.add_to_sample(self.ip_feature_db)

        #we store the non-normalized vectors in a json file
        jsonized_ip_feature_db = {}
        for k,v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file+".prenormal_ip_feature_db.json", "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)

        del self.ip_feature_db
        del jsonized_ip_feature_db

        #Normalise training set; normalisation should happen after the whole
        #sample is gathered
        self.trainer.normalise(self.expr_dict['norm_mode'])

    def _mark_bots(self):
        """
        Read the regexes corresponding to this experiment's logs and apply them to
        the trainer. This should be called after the logs have been processed.
        """
        #Add Fail2Ban filters
        filters_for_experiment = self.l2btools.load_bad_filters_from_db(self.id)
        for cur_filter in filters_for_experiment:
            self.trainer.add_bad_regexes(cur_filter['log_id'], (cur_filter['regex'],))
        #Use Fail2ban filters to identify and mark DDOS IPs in data set
        malicious_ips = self.trainer.mark_bad_target()

        with open(self.base_analyse_log_file+".malicious_ip_list", "w") as malicious_ip_file:
            malicious_ip_file.write(str(malicious_ips).strip('[]'))

    def _pca_importance_ananlysis(self, pca_model):
        """
        Retrieve the pca transformation and use the following formula to
        determine the importance of each feature j:

        importance_j = sqrt(sum_k((var_k * |c_kj| / sqrt(sum_i(c_ki^2)))^2))

        where c_kj is the loading of feature j on component k and var_k is the
        explained variance ratio of component k.

        INPUT:
           pca_model: (the transformation matrix as an np array, importance of each
                      component), the output of L2BExperiment.pca_transform_detail
        OUTPUT: an array containing the importance ratio of the features based
                on the above formula
        """
        pca_transform_matrix = pca_model[0]
        pca_var_ratio = pca_model[1]

        #row_sums = pca_transform_matrix.sum(axis=1)
        #apparently the pca transformation is normalised along both axes
        #anyway for some reason reshape(-1) doesn't work as transpose
        scaled_coeffs = pca_var_ratio.reshape(len(pca_var_ratio),1) * pca_transform_matrix

        return np.apply_along_axis(np.linalg.norm, 0 , scaled_coeffs)

    def run_l2b_experiment(self, train_portion, stochastic_params):
        """
        Run individual instance of given experiment
        """
        utc_datetime = datetime.datetime.utcnow()
        utc_timestamp = utc_datetime.strftime("%Y-%m-%d-%H%MZ")
        analyse_log_file = self.l2btools.analyser_results_dir + 'analyse_' + utc_timestamp
        logging.basicConfig(filename=analyse_log_file, level=logging.INFO)
        logging.info('Begin learn 2 ban analysis for Experiment Id: ' + str(self.expr_dict['id']))

        #Divide up data set into training and testing portions based on initial given value
        marked_training_set = self.trainer.get_training_set()

        #if nobody is a bot then this is not a fruitful experiment
        if marked_training_set.no_culprit():
            logging.info("No bot detected, Giving up on experiment " + str(self.expr_dict['id']))
            return

        #here we need to check whether we lost features due to normalisation
        #(sparse normalisation doesn't cut off features)
        if self.expr_dict['norm_mode']=='individual':
            dimension_reducer = [cur_feature_std != 0 for cur_feature_std in marked_training_set._normalisation_data[marked_training_set.SAMPLE_STD]]
            self._active_feature_list = [self._active_feature_list[red_plc[0]] for red_plc in enumerate(dimension_reducer) if red_plc[1]]

        active_features = str(self._active_feature_list).strip('[]')
        #TODO: Iterate with different slicing to get reliable result
        train_selector, test_selector = self.l2btools.random_slicer(len(marked_training_set), train_portion)
        train_set = marked_training_set.get_training_subset(case_selector=train_selector)
        test_set = marked_training_set.get_training_subset(case_selector=test_selector)
        #initializes L2BExperiment
        cur_experiment = L2BExperiment(train_set, test_set, self.trainer)

        #TODO: mRMR and PCA are independent of slicing and should be
        #      computed over the whole dataset
        # Get the mRMR
        mrmr = cur_experiment.get_mrmr()
        logging.info('mRMR score: ' + str(mrmr))

        # Get the PCA ratios as a string
        pca_ratios = str(self._pca_importance_ananlysis(cur_experiment.pca_transform_detail())).strip('[]')
        logging.info('PCA ratios: ' + pca_ratios)

        #Train model against training set
        cur_experiment.train()

        #Predict for training data using constructed model
        score = cur_experiment.cross_validate_test()
        logging.info('Crossvalidation score: ' + str(score))

        self.store_results(analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr)

    def store_results(self, analyse_log_file, train_portion, score, active_features, pca_ratios, mrmr):
        # Add the result to the database
        experiment_result = {}
        experiment_result['experiment_id'] = self.expr_dict['id']
        experiment_result['result_file'] = analyse_log_file
        experiment_result['proportion'] = train_portion
        experiment_result['score'] = score
        experiment_result['active_features'] = active_features
        experiment_result['pca_ratios'] = pca_ratios
        experiment_result['mrmr_score'] = str(mrmr).strip('[]')

        #while the pickle model is always created, the result file only
        #gets stored in case there is an error
        self.l2btools.save_experiment_result(experiment_result)

        self.trainer.save_model(analyse_log_file+".l2b_pickle_model")
        #also try to store in reconstructable libsvm format
        #if the save_svm_model function is implemented
        try:
            self.trainer.save_model(analyse_log_file+".normal_svm_model", "normal_svm")
        except NotImplementedError:
            print "save_svm_model is not implmeneted in your scikit-learn, skipping storing the model in libsvm format"

        print "Experiment", self.expr_dict['id'], ": train portion = ", train_portion, ", score = ", score, ", mRMR = ", mrmr, ", PCA ratios = ", pca_ratios
        print experiment_result
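
A hedged end-to-end driver, assembled only from calls that appear in these examples (config loading, experiment retrieval, and run_l2b_experiment); the import paths, the 0.8 train portion and the None stochastic parameters are assumptions:

# Hypothetical driver script, stitched together from the calls shown above.
from tools.learn2bantools import Learn2BanTools   # assumed module path
from analysis.experimentor import Experimentor    # assumed module path

l2btools = Learn2BanTools()
l2btools.load_train2ban_config()                  # read the config file and connect to the db

for exp in l2btools.retrieve_experiments():       # enabled experiment rows
    experimentor = Experimentor(exp, l2btools)    # parses logs and marks bots in __init__
    experimentor.run_l2b_experiment(0.8, None)    # train portion and params are arbitrary here
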