def __init__(self, files=None, config=None, model=None, model_hash=None,
             model_path=None):
    self.logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('pika').setLevel(logging.WARNING)
    self.logger = Common().setup_logger(self.logger)
    self.common = Common(config=config)
    if self.common.use_rabbit:
        self.common.connect_rabbit()
    self.r = self.common.r
    if config:
        try:
            self.time_const = config['time constant']
            self.state_size = config['state size']
            self.look_time = config['look time']
            self.threshold = config['threshold']
            self.rnn_size = config['rnn size']
            self.conf_labels = config['conf labels']
            self.duration = config['duration']
        except Exception as e:  # pragma: no cover
            self.logger.error(
                'Unable to read config properly because: %s', str(e))
    self.files = files if files else []
    self.model = model
    self.model_hash = model_hash
    self.model_path = model_path
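# Illustrative sketch (not part of the module): the shape of the config dict
# that the __init__ above expects. The keys match the reads above; the values
# below are placeholders, not the project's real defaults, which live in
# networkml/configs/config.json.
example_config = {
    'time constant': 60,         # used to compute a moving average
    'state size': 100,           # number of nodes in the neural network
    'look time': 3600,           # seconds between investigations of a device
    'threshold': 0.5,            # confidence needed to flag abnormal behavior
    'rnn size': 100,             # parameter for the SOS model
    'conf labels': ['Unknown'],  # device type labels, ending with 'Unknown'
    'duration': 900,             # length of each traffic time window
}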
class NetworkML():
    """
    Main class that instantiates prediction models of the types of devices
    found in computer network traffic and whether a device is acting
    normally given its type (also based on network traffic). The three
    model types built in to this class are random forests, neural networks,
    and stochastic outlier selection (SOS).
    """

    def __init__(self):
        ## Set logging information for instance
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        ## Take arguments from command line
        self.args = None
        self.read_args()

        ## Take input from configuration file
        self.get_config()
        self.common = Common(config=self.config)

        ## Instantiate a logger to log messages to aid debugging
        self.logger = Common().setup_logger(self.logger)

        ## Add network traffic files for parsing
        self.get_files()

        self.model_hash = None
        self.model = Model(duration=self.duration, hidden_size=None,
                           model_type=self.args.algorithm)

        def create_base_alg():
            return BaseAlgorithm(files=self.files, config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model,
                                 sos_model=self.args.sos_model)

        ## Check whether operation is evaluation, train, or test.
        ## Evaluation returns predictions that are useful for the deployment
        ## of networkml in an operational environment.
        if self.args.operation == 'eval':
            self.load_model()
            if (self.args.algorithm == 'onelayer'
                    or self.args.algorithm == 'randomforest'):
                base_alg = create_base_alg()
                base_alg.eval(self.args.algorithm)
            ## SOS refers to the statistical outlier selection model
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.args.sos_model,
                          self.conf_labels, self.time_const)

        ## Train entails training a new model on specific packet captures
        elif self.args.operation == 'train':
            ## Check for the model type specified:
            ## onelayer refers to a one-layer neural network
            if self.args.algorithm == 'onelayer':
                m = MLPClassifier((self.state_size),
                                  alpha=0.1,
                                  activation='relu',
                                  max_iter=1000)
                base_alg = create_base_alg()
                base_alg.train(self.args.path, self.args.save, m,
                               self.args.algorithm)
            ## Random forest refers to a decision tree-based model
            elif self.args.algorithm == 'randomforest':
                m = RandomForestClassifier(n_estimators=100,
                                           min_samples_split=5,
                                           class_weight='balanced')
                base_alg = create_base_alg()
                base_alg.train(self.args.path, self.args.save, m,
                               self.args.algorithm)
            ## SOS refers to the statistical outlier selection model
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.args.sos_model, self.time_const,
                      self.rnn_size, self.conf_labels, self.args.save)

        ## Test checks the overall performance of networkML models on the
        ## device classification task. It is a benchmarking operation.
        elif self.args.operation == 'test':
            self.load_model()
            ## Check for the model type specified:
            ## onelayer refers to a one-layer neural network;
            ## random forest refers to a decision tree-based model
            if (self.args.algorithm == 'onelayer'
                    or self.args.algorithm == 'randomforest'):
                base_alg = create_base_alg()
                base_alg.test(self.args.path, self.args.save)
            ## SOS refers to the statistical outlier selection model
            elif self.args.algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        """
        Read arguments from the command line to determine what operations
        to perform.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--algorithm', '-a', default='onelayer',
                            choices=['onelayer', 'randomforest', 'sos'],
                            help='which algorithm to run')
        parser.add_argument('--format', '-f', default='pcap',
                            choices=['netflow', 'pcap'],
                            help='which format the files to process are in')
        parser.add_argument('--operation', '-o', default='eval',
                            choices=['eval', 'train', 'test'],
                            help='which operation to run')
        parser.add_argument('--sos_model', '-s',
                            default='networkml/trained_models/sos/SoSmodel',
                            help='path to SoSmodel')
        parser.add_argument(
            '--trained_model', '-m',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to the trained model file')
        parser.add_argument(
            '--path', '-p', default='/pcaps',
            help='path to file or directory of files to process')
        parser.add_argument(
            '--save', '-w',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to save the trained model, if training')
        self.args = parser.parse_args()
        return

    def get_files(self):
        """
        Add a directory of files, or a single file, for parsing.
        """
        # TODO: checking extensions here should be moved to the parsers, and
        # it should probably use 'magic' rather than extensions. See the
        # Python magic library.
        self.files = []
        if Path(self.args.path).is_dir():
            for root, dirnames, filenames in os.walk(self.args.path):
                for extension in ['pcap', 'dump', 'cap']:
                    for filename in fnmatch.filter(filenames,
                                                   '*.' + extension):
                        self.files.append(os.path.join(root, filename))
        elif Path(self.args.path).is_file() and \
                os.path.split(str(self.args.path))[-1].split('.')[-1] in {'pcap', 'dump', 'cap'}:
            self.files.append(str(self.args.path))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))
        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))
        return

    def get_config(self, cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        """
        Load values from the configuration file.

        Args:
            cfg_file: path to configuration file
            labels_file: path to labels (the types of devices predicted)
        """
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
                ## Time constant is used for creating a moving average
                self.time_const = self.config['time constant']
                ## State size sets the number of nodes in the neural network
                self.state_size = self.config['state size']
                ## Amount of time to wait between investigations of a
                ## potentially suspicious device, in seconds
                self.look_time = self.config['look time']
                ## Threshold sets the confidence needed to identify abnormal
                ## behavior
                self.threshold = self.config['threshold']
                ## Parameter for the SOS model
                self.rnn_size = self.config['rnn size']
                ## Duration of the time window of network traffic over which
                ## to compute information on features
                self.duration = self.config['duration']
                #self.batch_size = self.config['batch size']
            ## Import device label typology
            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
                self.conf_labels = []
                for label in labels:
                    self.conf_labels.append(labels[label])
                self.conf_labels.append('Unknown')
                self.config['conf labels'] = self.conf_labels
        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))
        return

    def load_model(self):
        """
        Load a trained machine learning model.
        """
        with open(self.args.trained_model, 'rb') as handle:
            self.model_hash = hashlib.sha224(handle.read()).hexdigest()
        self.model.load(self.args.trained_model)
        self.logger.debug('Loaded model from %s', self.args.trained_model)
        return
class BaseAlgorithm:
    """
    Base algorithm class that reads a PCAP (packet capture file) and updates
    the stored representation of the source. The class can then be used by
    more specific algorithms.
    """

    def __init__(self, files=None, config=None, model=None, model_hash=None,
                 model_path=None, sos_model=None):
        ## Initiate logging information on this instance
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('pika').setLevel(logging.WARNING)
        self.logger = Common().setup_logger(self.logger)
        self.common = Common(config=config)
        self.sos_model = sos_model

        ## RabbitMQ acts as a message broker
        if self.common.use_rabbit:
            self.common.connect_rabbit(
                host=self.common.rabbit_host,
                port=self.common.rabbit_port,
                exchange=self.common.rabbit_exchange,
                routing_key=self.common.rabbit_routing_key,
                queue=self.common.rabbit_queue,
                queue_name=self.common.rabbit_queue_name)

        ## Redis provides a storage capability
        if self.common.use_redis:
            self.common.connect_redis(host=self.common.redis_host)

        if config:
            try:
                ## For a description of these configuration values, see the
                ## README.md file in the networkml/configs folder
                self.time_const = config['time constant']
                self.state_size = config['state size']
                self.look_time = config['look time']
                self.threshold = config['threshold']
                self.rnn_size = config['rnn size']
                self.conf_labels = config['conf labels']
                self.duration = config['duration']
            except Exception as e:  # pragma: no cover
                self.logger.error(
                    'Unable to read config properly because: %s', str(e))

        ## Store network capture files
        self.files = files if files else []
        self.model = model
        self.model_hash = model_hash
        self.model_path = model_path
        self.has_avx = self.detect_avx()

    @staticmethod
    def detect_avx():
        ## Check whether the CPU supports AVX (advanced vector extensions),
        ## which speed up certain calculations
        info = get_cpu_info()
        if 'flags' in info and ('avx' in info['flags']
                                or 'avx2' in info['flags']):
            return True
        return False

    @staticmethod
    def parse_pcap_name(base_pcap):
        ## The parsing operation below assumes a specific file naming
        ## convention: trace_DeviceName-deviceID-time-duration-flags.pcap
        ## Explanation: all files coming from Poseidon start with trace_.
        ## The device name and device ID columns are self-explanatory. Time
        ## refers to the day of the week and time of day. Duration refers to
        ## the length of the network traffic capture. The flags aspect
        ## refers to an unknown characteristic.
        # TODO: tolerate tshark labels in the trace name, but do not parse
        # them for now.
        pcap_key = None
        pcap_labels = None
        if base_pcap.startswith('trace_'):
            pcap_re = re.compile(
                r'^trace_([\da-f]+)_.+(client|server)-(.+).pcap$')
            pcap_match = pcap_re.match(base_pcap)
            if pcap_match:
                pcap_key = pcap_match.group(1)
                pcap_labels = pcap_match.group(3)
        else:
            # Not a Poseidon trace file; return the basename as the key.
            pcap_key = base_pcap.split('.')[0]
        return (pcap_key, pcap_labels)

    @staticmethod
    def parse_pcap_labels(pcap_labels_str):
        pcap_labels = {
            'ip_lowest_port': None,
            'ip_proto': None,
            'ip_version': None,
            'ip_app': None,
        }
        pcap_label_res = {
            'ip_lowest_port': re.compile(r'.*port-(\d+).*'),
            'ip_proto': re.compile(r'.+\-(arp|icmp|icmp6|udp|tcp)\-.+'),
            'ip_app': re.compile(r'.+\-(bootp|dns|esp|ftp|http|ssl|ntp)\-.+'),
        }
        for field, label_re in pcap_label_res.items():
            match = label_re.match(pcap_labels_str)
            if match:
                pcap_labels[field] = match.group(1)
        if 'ipv6' in pcap_labels_str:
            pcap_labels['ip_version'] = 6
        elif pcap_labels['ip_proto'] is not None:
            pcap_labels['ip_version'] = 4
        return pcap_labels

    def publish_message(self, message, close=False):
        if self.common.use_rabbit:
            uid = os.getenv('id', 'None')
            file_path = os.getenv('file_path', 'None')
            message.update({
                'id': uid,
                'file_path': file_path,
                'type': 'metadata',
                'results': {
                    'tool': 'networkml',
                    'version': networkml.__version__
                }
            })
            message = json.dumps(message)
            self.logger.info('Message: ' + message)
            self.common.channel.basic_publish(
                exchange=self.common.exchange,
                routing_key=self.common.routing_key,
                body=message,
                properties=pika.BasicProperties(delivery_mode=2,))
            if close:
                try:
                    self.common.connection.close()
                except Exception as e:  # pragma: no cover
                    self.logger.error(
                        'Unable to close rabbit connection because: {0}'.format(str(e)))

    def eval(self, algorithm):
        """
        Use the specified algorithm to predict, for particular network
        traffic, what device types are present and whether each device is
        acting normally or abnormally. This is the function that should be
        used in production when a user wants to employ networkML to
        classify and assess traffic.

        Args:
            algorithm: type of algorithm (random forest, neural network, or
                stochastic outlier selection (SOS))
        """
        if self.files:
            self.model.sessionize_pcaps(self.files)
        for fi in self.files:
            self.logger.info('Processing {0}...'.format(fi))
            base_pcap = os.path.basename(fi)
            pcap_key, pcap_labels = self.parse_pcap_name(base_pcap)
            if pcap_key is None:
                self.logger.debug('Ignoring unknown pcap name %s', base_pcap)
                continue

            ## Get representations from the model
            reps, source_mac, timestamps, preds, others, capture_ip_source = self.model.get_representation(
                str(fi), source_ip=None, mean=False)

            ## If no predictions are made, send a message with an explanation
            if preds is None:
                message = {}
                message[pcap_key] = {'valid': False, 'pcap': base_pcap}
                message = {'data': message}
                self.logger.info('Not enough sessions in file \'%s\'',
                                 str(fi))
                self.publish_message(message)
                continue
            else:
                ## If a prediction is made, send a message with the
                ## prediction
                self.logger.debug('Generating predictions')

            ## Update the stored representation
            if reps is not None:
                self.logger.debug('Updating stored data')
                r_key = self.common.update_data(source_mac, reps, timestamps,
                                                preds, others,
                                                self.model_hash)

            ## Get the sessions that the model looked at
            sessions = self.model.sessions
            ## Clean the sessions
            clean_sessions = []
            inferred_mac = None
            for session_dict in sessions:
                cleaned_sessions, inferred_mac = \
                    clean_session_dict(
                        session_dict,
                        source_address=source_mac
                    )
                clean_sessions.append(cleaned_sessions)

            if source_mac is None:
                source_mac = inferred_mac

            ## Make simple decisions based on vector differences and update
            ## times
            timestamp = timestamps[0].timestamp()
            labels, confs = zip(*preds)
            abnormality = 0.0
            if self.has_avx:
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                try:
                    abnormality = eval_pcap(
                        str(fi), self.sos_model, self.conf_labels,
                        self.time_const, label=labels[0],
                        rnn_size=self.rnn_size, model_path=self.model_path,
                        model_type=algorithm)
                except ValueError:
                    self.logger.warning(
                        "Can't run abnormality detection because the sample size is not big enough")
            else:
                self.logger.warning(
                    "Can't run abnormality detection because this CPU doesn't support AVX")

            prev_s = self.common.get_address_info(source_mac, timestamp)
            decision = self.common.basic_decision(pcap_key, source_mac,
                                                  prev_s, timestamp, labels,
                                                  confs, abnormality)
            sources = {
                'source_ip': capture_ip_source,
                'source_mac': source_mac,
                'pcap_labels': pcap_labels,
            }
            if pcap_key in decision:
                decision[pcap_key].update(sources)
            elif source_mac in decision:
                decision[source_mac].update(sources)
            self.logger.debug('Created message')
            for i in range(3):
                self.logger.info(labels[i] + ' : ' + str(round(confs[i], 3)))

            # Update Redis with the decision
            if self.common.use_redis:
                redis_decision = {}
                for k in decision:
                    redis_decision[k] = str(decision[k])
                try:
                    self.common.r.hmset(r_key, redis_decision)
                except Exception as e:  # pragma: no cover
                    self.logger.error(
                        'Failed to update keys in Redis because: {0}'.format(str(e)))

            message = {'data': decision}
            message['data']['pcap'] = base_pcap
            self.publish_message(message)

        message = {'data': ''}
        self.publish_message(message, close=True)

    def train(self, data_dir, save_path, m, algorithm):
        # Initialize the model
        model = Model(duration=self.duration, hidden_size=self.state_size,
                      labels=self.conf_labels, model=m, model_type=algorithm,
                      threshold_time=self.threshold)
        # Train the model
        model.train(data_dir)
        # Save the model to the specified path
        model.save(save_path)

    def test(self, data_dir, save_path):
        # Initialize results dictionary
        results = {}
        results['labels'] = self.conf_labels

        # Get the true label assignments
        self.logger.info('Getting label assignments')
        label_assignments = get_labels(
            'networkml/configs/label_assignments.json',
            model_labels=self.model.labels)
        if not label_assignments:
            self.logger.warning(
                'Could not read label assignments; continuing anyway.')

        # Walk through the testing directory and get all the pcaps
        self.logger.info('Getting pcaps')
        pcaps = get_pcap_paths(data_dir)
        if not pcaps:
            self.logger.error(
                'No pcaps were found in data directory; exiting.')
            return

        # Evaluate the model on each pcap
        file_size = 0
        file_num = 0
        time_slices = 0
        self.logger.info('Processing pcaps')
        tick = time.perf_counter()
        self.model.sessionize_pcaps(pcaps)
        for pcap in pcaps:
            # Get the true label
            name, label = get_true_label(pcap, label_assignments)
            single_result = {}
            single_result['label'] = label
            self.logger.info('Reading ' + name + ' as ' + label)
            # Get the internal representations
            representations, _, _, p, _, _ = self.model.get_representation(
                pcap, mean=False)
            if representations is not None:
                file_size += os.path.getsize(pcap)
                file_num += 1
                length = representations.shape[0]
                time_slices += length
                single_result['aggregate'] = p
                individual_dict = {}
                # Classify each slice
                self.logger.info('Computing classifications by slice')
                for i in range(length):
                    p_r = self.model.classify_representation(
                        representations[i])
                    individual_dict[i] = p_r
                single_result['individual'] = individual_dict
                results[pcap] = single_result
        tock = time.perf_counter()

        # Save results to the path specified by the third argument
        with open(save_path, 'w') as output_file:
            json.dump(results, output_file)

        self.logger.info('-' * 80)
        self.logger.info('Results with unknowns')
        self.logger.info('-' * 80)
        self.model.calc_f1(results)
        self.logger.info('-' * 80)
        self.logger.info('Results forcing decisions')
        self.logger.info('-' * 80)
        self.model.calc_f1(results, ignore_unknown=True)
        self.logger.info('-' * 80)
        self.logger.info('Analysis statistics')
        self.logger.info('-' * 80)
        elapsed_time = tock - tick
        rate = file_size / (pow(10, 6) * elapsed_time)
        self.logger.info('Evaluated {0} pcaps in {1} seconds'.format(
            file_num, round(elapsed_time, 3)))
        self.logger.info('Total data: {0} MB'.format(file_size / pow(10, 6)))
        self.logger.info('Total capture time: {0} hours'.format(
            time_slices / 4))
        self.logger.info(
            'Data processing rate: {0} MB per second'.format(rate))
        self.logger.info('Time per 15 minute capture: {0} seconds'.format(
            (elapsed_time) / (time_slices + 0.01)))
        self.logger.info('-' * 80)
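# For reference, a message published by publish_message() has roughly this
# shape once serialized (values are illustrative; 'id' and 'file_path' are
# read from environment variables of the same names):
#
#   {
#       "data": {"<pcap_key or source_mac>": {... decision fields ...},
#                "pcap": "<base pcap file name>"},
#       "id": "None",
#       "file_path": "None",
#       "type": "metadata",
#       "results": {"tool": "networkml", "version": "<networkml.__version__>"}
#   }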
def test_setup_logger():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    os.environ['LOG_LEVEL'] = 'DEBUG'
    Common.setup_logger(logger)


def test_get_address_info():
    common = Common()
    last_update = common.get_address_info('foo', 'bar')
    assert last_update is None


def test_connect_rabbit():
    common = Common()
    common.connect_rabbit()


def test_connect_redis():
    common = Common()
    common.connect_redis()


def test_setup_env():
    common = Common()
    common.setup_env()
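def test_parse_pcap_name():
    # Sketch of a test for the BaseAlgorithm static parsers, using made-up
    # file names that follow the trace_ naming convention documented in
    # parse_pcap_name. The import path is an assumption.
    from networkml.algorithms.base import BaseAlgorithm

    # A non-Poseidon capture: the basename (sans extension) becomes the key
    key, labels = BaseAlgorithm.parse_pcap_name('capture.pcap')
    assert key == 'capture'
    assert labels is None

    # A Poseidon-style trace: the hex key and label string are extracted
    key, labels = BaseAlgorithm.parse_pcap_name(
        'trace_deadbeef_monday-client-port-443-tcp-http-eth.pcap')
    assert key == 'deadbeef'
    assert labels == 'port-443-tcp-http-eth'

    # The label string then decomposes into individual fields
    assert BaseAlgorithm.parse_pcap_labels(labels) == {
        'ip_lowest_port': '443',
        'ip_proto': 'tcp',
        'ip_version': 4,
        'ip_app': 'http',
    }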
class NetworkML():
    """
    Main class to run different algorithms against different network
    traffic data sources
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)
        self.args = None
        self.read_args()
        self.get_config()
        self.common = Common(config=self.config)
        self.logger = Common().setup_logger(self.logger)
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration, hidden_size=None,
                           model_type=self.args.algorithm)
        if self.args.operation == 'eval':
            self.load_model()
            if self.args.algorithm == 'onelayer':
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).eval(
                                  self.args.algorithm)
            elif self.args.algorithm == 'randomforest':
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).eval(
                                  self.args.algorithm)
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.conf_labels, self.time_const)
        elif self.args.operation == 'train':
            if self.args.algorithm == 'onelayer':
                m = MLPClassifier((self.state_size),
                                  alpha=0.1,
                                  activation='relu',
                                  max_iter=1000)
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).train(
                                  self.args.path, self.args.save, m,
                                  self.args.algorithm)
            elif self.args.algorithm == 'randomforest':
                m = RandomForestClassifier(n_estimators=100,
                                           min_samples_split=5,
                                           class_weight='balanced')
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).train(
                                  self.args.path, self.args.save, m,
                                  self.args.algorithm)
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.time_const, self.rnn_size,
                      self.conf_labels, self.args.save)
        elif self.args.operation == 'test':
            self.load_model()
            if self.args.algorithm == 'onelayer':
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).test(
                                  self.args.path, self.args.save)
            elif self.args.algorithm == 'randomforest':
                BaseAlgorithm(files=self.files, config=self.config,
                              model=self.model, model_hash=self.model_hash,
                              model_path=self.args.trained_model).test(
                                  self.args.path, self.args.save)
            elif self.args.algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('--algorithm', '-a', default='onelayer',
                            choices=['onelayer', 'randomforest', 'sos'],
                            help='which algorithm to run')
        parser.add_argument('--format', '-f', default='pcap',
                            choices=['netflow', 'pcap'],
                            help='which format the files to process are in')
        parser.add_argument('--operation', '-o', default='eval',
                            choices=['eval', 'train', 'test'],
                            help='which operation to run')
        parser.add_argument(
            '--trained_model', '-m',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to the trained model file')
        parser.add_argument(
            '--path', '-p', default='/pcaps',
            help='path to file or directory of files to process')
        parser.add_argument(
            '--save', '-w',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to save the trained model, if training')
        self.args = parser.parse_args()
        return

    def get_files(self):
        # TODO: checking extensions here should be moved to the parsers, and
        # it should probably use 'magic' rather than extensions
        self.files = []
        if Path(self.args.path).is_dir():
            for root, dirnames, filenames in os.walk(self.args.path):
                for extension in ['pcap', 'dump', 'cap']:
                    for filename in fnmatch.filter(filenames,
                                                   '*.' + extension):
                        self.files.append(os.path.join(root, filename))
        elif Path(self.args.path).is_file() and \
                os.path.split(str(self.args.path))[-1].split('.')[-1] in {'pcap', 'dump', 'cap'}:
            self.files.append(str(self.args.path))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))
        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))
        return

    def get_config(self, cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
                self.time_const = self.config['time constant']
                self.state_size = self.config['state size']
                self.look_time = self.config['look time']
                self.threshold = self.config['threshold']
                self.rnn_size = self.config['rnn size']
                self.duration = self.config['duration']
                #self.batch_size = self.config['batch size']
            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
                self.conf_labels = []
                for label in labels:
                    self.conf_labels.append(labels[label])
                self.conf_labels.append('Unknown')
                self.config['conf labels'] = self.conf_labels
        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))
        return

    def load_model(self):
        # Compute the model hash
        with open(self.args.trained_model, 'rb') as handle:
            self.model_hash = hashlib.sha224(handle.read()).hexdigest()
        self.model.load(self.args.trained_model)
        self.logger.debug('Loaded model from %s', self.args.trained_model)
        return
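# Standalone sketch of the hashing step in load_model above: the model hash
# is a SHA-224 digest of the raw bytes of the trained model file. The helper
# name below is hypothetical.
import hashlib

def model_file_hash(path):
    # Digest the model file the same way load_model does
    with open(path, 'rb') as handle:
        return hashlib.sha224(handle.read()).hexdigest()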
class BaseAlgorithm:
    """
    Base algorithm that reads a pcap and updates the stored representation
    of the source, to be used by more specific algorithms.
    """

    def __init__(self, files=None, config=None, model=None, model_hash=None,
                 model_path=None):
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('pika').setLevel(logging.WARNING)
        self.logger = Common().setup_logger(self.logger)
        self.common = Common(config=config)
        if self.common.use_rabbit:
            self.common.connect_rabbit(
                host=self.common.rabbit_host,
                port=self.common.rabbit_port,
                exchange=self.common.rabbit_exchange,
                routing_key=self.common.rabbit_routing_key,
                queue=self.common.rabbit_queue,
                queue_name=self.common.rabbit_queue_name)
        if self.common.use_redis:
            self.common.connect_redis(host=self.common.redis_host)
        if config:
            try:
                self.time_const = config['time constant']
                self.state_size = config['state size']
                self.look_time = config['look time']
                self.threshold = config['threshold']
                self.rnn_size = config['rnn size']
                self.conf_labels = config['conf labels']
                self.duration = config['duration']
            except Exception as e:  # pragma: no cover
                self.logger.error(
                    'Unable to read config properly because: %s', str(e))
        self.files = files if files else []
        self.model = model
        self.model_hash = model_hash
        self.model_path = model_path

    def eval(self, algorithm):
        for fi in self.files:
            self.logger.info('Processing {0}...'.format(fi))
            source_mac = None
            key = None
            split_path = 'None'
            try:
                split_path = os.path.split(fi)[-1]
                split_path = split_path.split('.')
                split_path = split_path[0].split('-')
                key = split_path[0].split('_')[1]
            except Exception as e:  # pragma: no cover
                self.logger.debug('Could not get key because %s', str(e))

            # Ignore misc files
            if (split_path[-1] == 'miscellaneous'):
                continue

            # Get representations from the model
            reps, source_mac, timestamps, preds, others, capture_ip_source = self.model.get_representation(
                str(fi), source_ip=source_mac, mean=False)

            if preds is None:
                message = {}
                message[key] = {'valid': False,
                                'pcap': os.path.split(fi)[-1]}
                uid = os.getenv('id', 'None')
                file_path = os.getenv('file_path', 'None')
                message = {
                    'id': uid,
                    'type': 'metadata',
                    'file_path': file_path,
                    'data': message,
                    'results': {
                        'tool': 'networkml',
                        'version': networkml.__version__
                    }
                }
                message = json.dumps(message)
                self.logger.info('Not enough sessions in file \'%s\'',
                                 str(fi))
                if self.common.use_rabbit:
                    self.common.channel.basic_publish(
                        exchange=self.common.exchange,
                        routing_key=self.common.routing_key,
                        body=message,
                        properties=pika.BasicProperties(delivery_mode=2,))
                continue
            else:
                self.logger.debug('Generating predictions')
                last_update, prev_rep = self.common.get_previous_state(
                    source_mac, timestamps[0])
                # TODO: are these calls actually needed???
                _, mean_rep = self.common.average_representation(
                    reps, timestamps, prev_representation=prev_rep,
                    last_update=last_update)
                mean_preds = self.model.classify_representation(mean_rep)

            # Update the stored representation
            if reps is not None:
                self.logger.debug('Updating stored data')
                r_key = self.common.update_data(source_mac, reps, timestamps,
                                                preds, others,
                                                self.model_hash)

            # Get the sessions that the model looked at
            sessions = self.model.sessions
            # Clean the sessions
            clean_sessions = []
            inferred_mac = None
            for session_dict in sessions:
                cleaned_sessions, inferred_mac = \
                    clean_session_dict(
                        session_dict,
                        source_address=source_mac
                    )
                clean_sessions.append(cleaned_sessions)

            if source_mac is None:
                source_mac = inferred_mac

            # Make simple decisions based on vector differences and update
            # times
            timestamp = timestamps[0].timestamp()
            labels, confs = zip(*preds)
            abnormality = 0.0
            has_avx = False
            if 'flags' in get_cpu_info() and (
                    'avx' in get_cpu_info()['flags']
                    or 'avx2' in get_cpu_info()['flags']):
                has_avx = True
            if has_avx:
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                abnormality = eval_pcap(str(fi), self.conf_labels,
                                        self.time_const, label=labels[0],
                                        rnn_size=self.rnn_size,
                                        model_path=self.model_path,
                                        model_type=algorithm)
            else:
                self.logger.warning(
                    "Can't run abnormality detection because this CPU doesn't support AVX")

            prev_s = self.common.get_address_info(source_mac, timestamp)
            decision = self.common.basic_decision(key, source_mac, prev_s,
                                                  timestamp, labels, confs,
                                                  abnormality)
            if key in decision:
                decision[key]['source_ip'] = capture_ip_source
                decision[key]['source_mac'] = source_mac
            elif source_mac in decision:
                decision[source_mac]['source_ip'] = capture_ip_source
                decision[source_mac]['source_mac'] = source_mac
            self.logger.debug('Created message')
            for i in range(3):
                self.logger.info(labels[i] + ' : ' + str(round(confs[i], 3)))

            # Update Redis with the decision
            if self.common.use_redis:
                redis_decision = {}
                for k in decision:
                    redis_decision[k] = str(decision[k])
                try:
                    self.common.r.hmset(r_key, redis_decision)
                except Exception as e:  # pragma: no cover
                    self.logger.error(
                        'Failed to update keys in Redis because: {0}'.format(str(e)))

            # Get the json message
            uid = os.getenv('id', 'None')
            file_path = os.getenv('file_path', 'None')
            message = {
                'id': uid,
                'type': 'metadata',
                'file_path': file_path,
                'data': decision,
                'results': {
                    'tool': 'networkml',
                    'version': networkml.__version__
                }
            }
            message['data']['pcap'] = os.path.split(fi)[-1]
            message = json.dumps(message)
            self.logger.info('Message: ' + message)
            if self.common.use_rabbit:
                self.common.channel.basic_publish(
                    exchange=self.common.exchange,
                    routing_key=self.common.routing_key,
                    body=message,
                    properties=pika.BasicProperties(delivery_mode=2,))

        uid = os.getenv('id', 'None')
        file_path = os.getenv('file_path', 'None')
        message = {
            'id': uid,
            'type': 'metadata',
            'file_path': file_path,
            'data': '',
            'results': {
                'tool': 'networkml',
                'version': networkml.__version__
            }
        }
        message = json.dumps(message)
        if self.common.use_rabbit:
            self.common.channel.basic_publish(
                exchange=self.common.exchange,
                routing_key=self.common.routing_key,
                body=message,
                properties=pika.BasicProperties(delivery_mode=2,))
            try:
                self.common.connection.close()
            except Exception as e:  # pragma: no cover
                self.logger.error(
                    'Unable to close rabbit connection because: {0}'.format(str(e)))
        return

    def train(self, data_dir, save_path, m, algorithm):
        # Initialize the model
        model = Model(duration=self.duration, hidden_size=self.state_size,
                      labels=self.conf_labels, model=m, model_type=algorithm,
                      threshold_time=self.threshold)
        # Train the model
        model.train(data_dir)
        # Save the model to the specified path
        model.save(save_path)

    def test(self, data_dir, save_path):
        # Initialize results dictionary
        results = {}
        results['labels'] = self.conf_labels

        # Get the true label assignments
        self.logger.info('Getting label assignments')
        label_assignments = get_labels(
            'networkml/configs/label_assignments.json',
            model_labels=self.model.labels)
        if not label_assignments:
            self.logger.warning(
                'Could not read label assignments; continuing anyway.')

        # Walk through the testing directory and get all the pcaps
        self.logger.info('Getting pcaps')
        pcaps = get_pcap_paths(data_dir)
        if not pcaps:
            self.logger.error(
                'No pcaps were found in data directory; exiting.')
            return

        # Evaluate the model on each pcap
        file_size = 0
        file_num = 0
        time_slices = 0
        self.logger.info('Processing pcaps')
        tick = time.perf_counter()
        for pcap in pcaps:
            # Get the true label
            name, label = get_true_label(pcap, label_assignments)
            single_result = {}
            single_result['label'] = label
            self.logger.info('Reading ' + name + ' as ' + label)
            # Get the internal representations
            representations, _, _, p, _, _ = self.model.get_representation(
                pcap, mean=False)
            if representations is not None:
                file_size += os.path.getsize(pcap)
                file_num += 1
                length = representations.shape[0]
                time_slices += length
                single_result['aggregate'] = p
                individual_dict = {}
                # Classify each slice
                self.logger.info('Computing classifications by slice')
                for i in range(length):
                    p_r = self.model.classify_representation(
                        representations[i])
                    individual_dict[i] = p_r
                single_result['individual'] = individual_dict
                results[pcap] = single_result
        tock = time.perf_counter()

        # Save results to the path specified by the third argument
        with open(save_path, 'w') as output_file:
            json.dump(results, output_file)

        self.logger.info('-' * 80)
        self.logger.info('Results with unknowns')
        self.logger.info('-' * 80)
        self.model.calc_f1(results)
        self.logger.info('-' * 80)
        self.logger.info('Results forcing decisions')
        self.logger.info('-' * 80)
        self.model.calc_f1(results, ignore_unknown=True)
        self.logger.info('-' * 80)
        self.logger.info('Analysis statistics')
        self.logger.info('-' * 80)
        elapsed_time = tock - tick
        rate = file_size / (pow(10, 6) * elapsed_time)
        self.logger.info('Evaluated {0} pcaps in {1} seconds'.format(
            file_num, round(elapsed_time, 3)))
        self.logger.info('Total data: {0} MB'.format(file_size / pow(10, 6)))
        self.logger.info('Total capture time: {0} hours'.format(
            time_slices / 4))
        self.logger.info(
            'Data processing rate: {0} MB per second'.format(rate))
        self.logger.info('Time per 15 minute capture: {0} seconds'.format(
            (elapsed_time) / (time_slices + 0.01)))
        self.logger.info('-' * 80)
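# Worked example of the statistics logged above, with illustrative numbers:
# if 100 pcaps totalling 500 MB yield 2,000 time slices in 250 seconds, then
#   rate                  = 5e8 / (1e6 * 250)    = 2 MB per second
#   total capture time    = 2000 / 4             = 500 hours
#                           (the divisor 4 reflects 15-minute slices)
#   time per 15-min slice = 250 / (2000 + 0.01)  ≈ 0.125 seconds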