def test_train(self):
    """Smoke-test training on the template log file.

    Loads and formats the template records, builds an ``ApacheLogModel``
    with the ``feat_extraction`` section of the template config, trains
    it, and prints the trained object and its ``show_topics()`` output.
    There are no assertions: the test fails only if a step raises.
    """
    raw_records = utils.dataloader(
        'data/wslog.dat.template', ApacheLogSchemaRaw, ' ')
    records = list(map(ApacheLog.format, raw_records))

    settings = utils.configloader('config.cfg.template')
    model = ApacheLogModel(records, settings['feat_extraction'])

    tm = model.train()
    print(tm)
    print(tm.show_topics())
def test_extract_bows(self):
    """Check the first bag of words returned by ``extract_bows``.

    The template log is loaded, formatted, and passed both to the model
    constructor and to ``extract_bows``; the first element of the result
    must equal the expected token list exactly (same tokens, same order).
    """
    raw_records = utils.dataloader(
        'data/wslog.dat.template', ApacheLogSchemaRaw, ' ')
    records = [ApacheLog.format(record) for record in raw_records]

    settings = utils.configloader('config.cfg.template')
    model = ApacheLogModel(records, settings['feat_extraction'])

    first_bow = model.extract_bows(records)[0]

    expected_tokens = [
        '_bytes_out_val_low',
        '_referrer_ent_low',
        '_user_agent_ent_veryhigh',
        '_request_resource_ent_med',
        '_referrer_len_low',
        '_request_resource_len_low',
        '_301',
        'mozilla',
        'macintosh',
        'intel mac os x',
        'applewebkit',
        'khtml',
        'like gecko',
        'chrome',
        '... safari',
        'svds.com',
        'rockandroll',
    ]
    self.assertEqual(first_bow, expected_tokens)
def test_dataloader(self):
    """Check ``utils.dataloader`` against the template log file.

    The first loaded record must equal the expected field mapping, and
    the loader must return 20 records in total.
    """
    records = utils.dataloader(
        'data/wslog.dat.template', ApacheLogSchemaRaw, ' ')

    expected_first = {
        'ip': '198.0.200.105',
        'user_ident': '-',
        'user_http': '-',
        'ts': '[14/Jan/2014:09:36:50',
        'ms': '-0800]',
        'request': 'GET /svds.com/rockandroll HTTP/1.1',
        'response_code': '301',
        'bytes_out': '241',
        'referrer': '-',
        'user_agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    }
    self.assertEqual(records[0], expected_first)
    self.assertEqual(len(records), 20)
def test_derive_ranges(self):
    """Check the per-feature thresholds returned by ``derive_stats``.

    The model is built from the formatted template log and the
    ``feat_extraction`` section of the template config.  The result must
    contain exactly the expected features, each with exactly the expected
    levels.

    Values are compared with ``assertAlmostEqual`` rather than exact
    equality: the ``high`` thresholds are computed floats, and an exact
    match on every last digit can break on floating-point rounding
    differences alone.
    """
    data = utils.dataloader(
        'data/wslog.dat.template', ApacheLogSchemaRaw, ' ')
    data = [ApacheLog.format(l) for l in data]
    featext_config = utils.configloader(
        'config.cfg.template')['feat_extraction']
    model = ApacheLogModel(data, featext_config)

    actual = model.derive_stats()
    expected = {
        'bytes_out_val': {
            'high': 14308.57885358002,
            'med': 5101,
            'low': 3518
        },
        'referrer_len': {
            'high': 52.121734779245941,
            'med': 30,
            'low': 28
        },
        'request_resource_len': {
            'high': 64.657561374216158,
            'med': 43,
            'low': 37
        },
        'request_resource_ent': {
            'high': 4.4115352724758283,
            'med': 4,
            'low': 3
        },
        'user_agent_ent': {
            'high': 5.0920380749115814,
            'med': 5,
            'low': 5
        },
        'referrer_ent': {
            'high': 6.4544661178668585,
            'med': 3,
            'low': 3
        }
    }

    # The structure must still match exactly: no missing or extra keys.
    self.assertEqual(set(actual), set(expected))
    for feature in sorted(expected):
        levels = expected[feature]
        self.assertEqual(set(actual[feature]), set(levels))
        for level in sorted(levels):
            # Tolerant numeric comparison (7 decimal places by default).
            self.assertAlmostEqual(
                actual[feature][level], levels[level],
                msg='{}[{}]'.format(feature, level))
# Pull the run parameters off ``args`` (presumably the parsed command-line
# namespace -- it is defined outside this fragment).
datapath = args.datapath
num_topics = args.num_topics
alpha = args.alpha
iterations = args.iterations
modelname = args.modelname

# init logging: file handler configured from the 'logging' config section
# (the logfile name is formatted with today's date), plus an extra stream
# handler on the root logger.
log_settings = config['logging']
logging.basicConfig(
    filename=log_settings['logfile'].format(dt.now().date()),
    level=log_settings['level'],
    format=log_settings['format'])
root_logger = logging.getLogger()
root_logger.addHandler(logging.StreamHandler())
logger = logging.getLogger(__name__)

# load data and convert every raw record with ApacheLog.format
data = utils.dataloader(datapath, ApacheLogSchemaRaw, ' ')
data = list(map(ApacheLog.format, data))
logger.info('completed loading data from {}'.format(datapath))

# load feature extraction configs
template_config = utils.configloader('config.cfg.template')
featextraction_config = template_config['feat_extraction']

# init model
model = ApacheLogModel(
    data,
    featextraction_config,
    num_topics=num_topics,
    alpha=alpha,
    iterations=iterations,
    modelname=modelname)
def test_to_dataframe(self):
    """Check that ``utils.to_dataframe`` keeps all 20 template records.

    The raw (unformatted) loader output is converted and only the length
    of the result is asserted.
    """
    records = utils.dataloader(
        'data/wslog.dat.template', ApacheLogSchemaRaw, ' ')
    frame = utils.to_dataframe(records)
    self.assertEqual(len(frame), 20)