class CTUGet(Command):
    """Get CTU dataset components."""

    log = logging.getLogger(__name__)

    def get_parser(self, prog_name):
        """Return the argument parser for this command.

        Adds the caching options, the download-related options, and the
        positional ``scenario`` and ``data`` arguments, and sets a raw
        (pre-formatted) epilog that ends with the CTU disclaimer.
        """
        parser = super().get_parser(prog_name)
        parser.formatter_class = argparse.RawDescriptionHelpFormatter
        parser.add_argument(
            '--force',
            action='store_true',
            dest='force',
            default=False,
            help="Force over-writing files if they exist (default: ``False``)")
        parser.add_argument(
            '--no-subdir',
            action='store_true',
            dest='no_subdir',
            default=False,
            help=('Do not maintain scenario name subdirectory '
                  '(default: ``False``)'))
        _default_protocols = ",".join(DEFAULT_PROTOCOLS)
        parser.add_argument(
            '-P', '--protocols',
            metavar='<protocol-list>',
            dest='protocols',
            # argparse also applies ``type`` to string defaults, so the
            # parsed value is a list whether or not the option is given.
            type=lambda s: s.split(','),
            default=_default_protocols,
            help=("Protocols to include, or 'any' "
                  f'(default: ``{_default_protocols}``)'))
        parser.add_argument(
            '-L', '--maxlines',
            metavar='<lines>',
            dest='maxlines',
            default=None,
            help="Maximum number of lines to get (default: ``None``)")
        cache_file = CTU_Dataset.get_cache_file()
        parser.add_argument(
            '--cache-file',
            action='store',
            dest='cache_file',
            default=cache_file,
            help=('Cache file path for CTU metadata '
                  '(Env: ``LIM_CTU_CACHE``; '
                  f'default: ``{cache_file}``)'))
        parser.add_argument(
            '--ignore-cache',
            action='store_true',
            dest='ignore_cache',
            default=False,
            help="Ignore any cached results (default: ``False``)")
        parser.add_argument(
            'scenario',
            nargs=1,
            type=normalize_ctu_name,
            default=None)
        data_columns = CTU_Dataset.get_data_columns()
        data_types = ", ".join(i.lower() for i in data_columns)
        parser.add_argument(
            'data',
            nargs='+',
            type=str.lower,
            choices=[c.lower() for c in data_columns + ['all']],
            default=None)
        parser.epilog = textwrap.dedent(f"""\
            Get one or more data components from a scenario. These components
            are the raw PCAP file, Netflow file, and other analytic products
            from intrusion detection system processing, etc.

            See ``lim ctu list --help`` for more on the ``scenario`` argument.

            For the ``data`` argument, you can use ``all`` to recursively
            download all scenario data, or one or more of the data files by
            type: ``{data_types}``

            By default, or when using the ``all`` attribute identifier, the
            file(s) are placed in a subdirectory with the full name of the
            scenario to better organize data across multiple scenarios. You
            can override this when getting specific files (i.e., not using
            ``all``) with the ``--no-subdir`` option.
            \n""") + CTU_Dataset.get_disclaimer()  # noqa
        return parser

    def take_action(self, parsed_args):
        """Download the requested data components for one scenario.

        Raises:
            RuntimeError: if the expanded scenario name is not a valid
                scenario according to the loaded CTU metadata.
        """
        self.log.debug('[+] getting CTU data')
        if not hasattr(self, 'ctu_metadata'):
            self.ctu_metadata = CTU_Dataset(
                cache_file=parsed_args.cache_file,
                ignore_cache=parsed_args.ignore_cache,
                debug=self.app_args.debug)
        # TODO(dittrich): Work this back into init() method.
        self.ctu_metadata.load_ctu_metadata()
        scenario = self.ctu_metadata.get_fullname(name=parsed_args.scenario[0])
        if not self.ctu_metadata.is_valid_scenario(scenario):
            raise RuntimeError(f"[-] scenario '{scenario}' does not exist")
        if parsed_args.no_subdir:
            data_dir = self.app_args.data_dir
        else:
            data_dir = os.path.join(self.app_args.data_dir, scenario)
        if 'all' in parsed_args.data:
            self.recursive_get_all(name=scenario, data_dir=data_dir)
            return
        for attribute in parsed_args.data:
            self.log.debug(f'[+] downloading {attribute} data '
                           f"for scenario '{scenario}' to {data_dir}")
            self.ctu_metadata.fetch_scenario_content_byattribute(
                data_dir=data_dir,
                name=scenario,
                attribute=attribute)

    def recursive_get_all(self, name, data_dir=None,
                          stderr=subprocess.STDOUT, shell=False):
        """Use wget to recursively get all scenario data.

        Args:
            name: Scenario name used to look up its ``Capture_URL``.
            data_dir: Directory to mirror into; created if missing.
                Defaults to the current working directory *at call time*.
            stderr: Passed to ``subprocess.check_output`` for both wget runs.
            shell: Passed to ``subprocess.check_output`` for both wget runs.

        Raises:
            RuntimeError: if ``wget`` cannot be run, does not identify
                itself as Wget, or the scenario has no ``Capture_URL``.
            SystemExit: with wget's return code if the mirror command fails.
        """
        if data_dir is None:
            data_dir = os.getcwd()
        # Ensure data directory exists
        os.makedirs(os.path.abspath(data_dir), exist_ok=True)

        # Probe for a usable wget before attempting the (long) mirror run.
        try:
            banner = subprocess.check_output(  # nosec
                ['wget', '-h'],
                stderr=stderr,
                shell=shell).decode('UTF-8').splitlines()
        except Exception as err:
            # Previously this message was built but never raised, so a
            # missing wget only surfaced later as an unrelated traceback.
            raise RuntimeError(f'[-] cannot run "wget": { err }') from err
        if len(banner) > 1 and banner[0].find(' Wget ') < 0:
            raise RuntimeError('[-] cannot run "wget"')

        url = self.ctu_metadata.get_scenario_data(name=name,
                                                  attribute='Capture_URL')
        if url is None:
            raise RuntimeError(
                f"[-] no 'Capture_URL' found for scenario '{name}'")
        # Strip both ends so a trailing '/' does not count as an extra
        # (empty) path component and over-cut the mirrored directories.
        url_path = urlparse(url).path.strip('/')
        cut_dirs = len(url_path.split('/'))
        if not url.endswith('/'):
            # Required by wget --no-parent to work right
            url = f"{url}/"
        cmd = [
            'wget',
            '--mirror',
            '-l3',
            '--no-parent',
            '--no-host-directories',
            f'--cut-dirs={cut_dirs}',
            '--reject=index.html?*',
            '-P', data_dir,
            '--no-check-certificate',
            url,
        ]
        # Use subprocess.check_output to run subcommand
        self.log.debug(f'[+] cmd: {" ".join(cmd)}')
        self.log.info('[+] recursively getting all data '
                      f"from {url} to '{data_dir}'")
        try:
            subprocess.check_output(  # nosec
                cmd,
                cwd=data_dir,
                stderr=stderr,
                shell=shell)
        except subprocess.CalledProcessError as err:
            # Report what the failed command itself printed, as text.
            output = err.output or ''
            if isinstance(output, bytes):
                output = output.decode('UTF-8', errors='replace')
            sys.stderr.write(f'{output}\n')
            sys.exit(err.returncode)
class Test_CTU_Dataset(unittest.TestCase):
    """Tests for ``CTU_Dataset`` and related helpers, using ``TEST_CACHE``."""

    def setUp(self):
        self.ctu_dataset = CTU_Dataset(cache_file=TEST_CACHE)
        self.ctu_dataset.load_ctu_metadata()

    def test_cache_exists(self):
        self.assertTrue(os.path.exists(TEST_CACHE))

    def test_get_file_last_mtime_exists(self):
        self.assertNotEqual(
            get_file_last_mtime(file_path=TEST_CACHE), 0)

    def test_get_file_last_mtime_notexists(self):
        self.assertEqual(
            get_file_last_mtime(file_path=TEST_EMPTY_CACHE), 0)

    def test_get_file_last_mtime_nopath(self):
        self.assertRaises(RuntimeError, get_file_last_mtime)

    def test_get_file_last_mtime_relative_path(self):
        self.assertRaises(RuntimeError,
                          get_file_last_mtime,
                          file_path='../../../etc/passwd')

    def test_get_file_last_mtime_clean_empty(self):
        os.makedirs(os.path.dirname(TEST_EMPTY_CACHE), exist_ok=True)
        # Create (or truncate to) an empty cache file.
        with open(TEST_EMPTY_CACHE, 'w'):
            pass
        self.assertEqual(
            get_file_last_mtime(file_path=TEST_EMPTY_CACHE, clean=True), 0)
        # With ``clean=True`` the empty file is expected to be gone.
        self.assertRaises(FileNotFoundError, open, TEST_EMPTY_CACHE, 'r')

    def test_get_data_columns(self):
        columns = CTU_Dataset.get_data_columns()
        self.assertIs(type(columns), list)
        self.assertGreater(len(columns), 0)

    def test_get_index_columns(self):
        columns = CTU_Dataset.get_index_columns()
        self.assertIs(type(columns), list)
        self.assertGreater(len(columns), 0)

    def test_get_all_columns(self):
        columns = CTU_Dataset.get_all_columns()
        self.assertIs(type(columns), list)
        self.assertGreater(len(columns), 0)

    def test_get_disclaimer(self):
        disclaimer = CTU_Dataset.get_disclaimer()
        self.assertIn("http://dx.doi.org/10.1016/j.cose.2014.05.011",
                      disclaimer)

    def test_get_scenarios(self):
        scenarios = self.ctu_dataset.get_scenarios()
        self.assertIs(type(scenarios), dict)
        self.assertIn('CTU-Malware-Capture-Botnet-48', scenarios)

    def test_get_scenario_names(self):
        scenario_names = self.ctu_dataset.get_scenario_names()
        self.assertIs(type(scenario_names), list)
        self.assertGreater(len(scenario_names), 0)
        self.assertEqual(
            scenario_names[0],
            'CTU-Malware-Capture-Botnet-90',
            msg=f'scenario_names[0]={scenario_names[0]:40}...')

    def test_is_valid_scenario_short_MATCH(self):
        # Only full scenario names are considered valid.
        self.assertFalse(self.ctu_dataset.is_valid_scenario('Botnet-48'))

    def test_is_valid_scenario_long_MATCH(self):
        self.assertTrue(
            self.ctu_dataset.is_valid_scenario(
                'CTU-Malware-Capture-Botnet-48'))

    def test_is_valid_scenario_FAIL(self):
        self.assertFalse(
            self.ctu_dataset.is_valid_scenario(
                'CTU-Milware-Copture-Botnet-48'))

    def test_get_scenario_data_url_SUCCESS(self):
        self.assertEqual(
            self.ctu_dataset.get_scenario_data(
                'CTU-Malware-Capture-Botnet-48', 'Capture_URL'),
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-48')  # noqa

    def test_get_data_columns_matches_constant(self):
        # Renamed: this used to share the name ``test_get_data_columns``
        # with the test above, which silently replaced (disabled) it.
        items = list(CTU_Dataset.__DATA_COLUMNS__)
        self.assertListEqual(items, self.ctu_dataset.get_data_columns())

    def test_get_scenario_data_url_FAIL(self):
        # assertRaises makes the test fail if no exception is raised at all;
        # a bare try/except would pass silently in that case.
        with self.assertRaises(RuntimeError) as caught:
            self.ctu_dataset.get_scenario_data(
                'CTU-Malware-Capture-Botnet-48', 'Capture_ORL')
        self.assertIn('is not supported', str(caught.exception))

    def test_get_scenario_data_pcap(self):
        url = self.ctu_dataset.get_scenario_data(
            'CTU-Malware-Capture-Botnet-113-1', 'PCAP')
        self.assertEqual(
            url,
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-113-1/2015-03-12_capture-win6.pcap',  # noqa
            msg=f'url={url}')

    def test_get_scenario_page_short(self):
        self.assertIn(
            'DOCTYPE HTML PUBLIC',
            self.ctu_dataset.get_scenario_page('Malware-Botnet-42'))

    def test_get_scenario_page_full(self):
        self.assertIn(
            'DOCTYPE HTML PUBLIC',
            self.ctu_dataset.get_scenario_page(
                'CTU-Malware-Capture-Botnet-42'))

    def test_filename_from_url(self):
        filename = self.ctu_dataset.filename_from_url(
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Mixed-Capture-1/2015-07-28_mixed.pcap')  # noqa
        self.assertEqual(filename,
                         '2015-07-28_mixed.pcap',
                         msg=f'filename={filename}')

    def test_get_fullname_short_5parts(self):
        fullname = self.ctu_dataset.get_fullname(
            name='CTU-Malware-Capture-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_4parts(self):
        fullname = self.ctu_dataset.get_fullname(
            'Malware-Capture-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_3parts1(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_3parts2(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-Capture-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_2parts1(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_2parts2(self):
        fullname = self.ctu_dataset.get_fullname(name='Capture-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_1part_number(self):
        fullname = self.ctu_dataset.get_fullname(name='42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_1part_name(self):
        self.assertRaises(SystemExit,
                          self.ctu_dataset.get_fullname,
                          name='IoT')

    def test_get_fullname_short_fail(self):
        fullname = self.ctu_dataset.get_fullname(name='Botnet-1')
        self.assertEqual(fullname, None)

    def test_get_fullname_typo(self):
        fullname = self.ctu_dataset.get_fullname(
            name='CTU_Malware_Capture-Botnet-42')
        self.assertEqual(fullname, None)

    def test_get_shortname_match(self):
        shortname = self.ctu_dataset.get_shortname(
            name='CTU-Malware-Capture-Botnet-42')
        self.assertEqual(shortname, 'Malware-Botnet-42')

    def test_normalize_ctu_name_lower(self):
        self.assertEqual(normalize_ctu_name('ctu-malware-botnet-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('iot-malware-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_upper(self):
        self.assertEqual(normalize_ctu_name('CTU-MALWARE-BOTNET-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('IOT-MALWARE-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_mixed(self):
        self.assertEqual(normalize_ctu_name('Ctu-Malware-Botnet-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('Iot-Malware-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_random(self):
        self.assertEqual(normalize_ctu_name('CTU-MALWARE-BOTNET-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('IoT-MaLwArE-33-1'),
                         'IoT-Malware-33-1')
class CTUOverview(Command):
    """Get CTU dataset overview."""

    log = logging.getLogger(__name__)

    def get_parser(self, prog_name):
        """Return the argument parser for this command.

        Adds the browser options, the metadata caching options, and zero
        or more positional ``scenario`` names.
        """
        parser = super().get_parser(prog_name)
        parser.formatter_class = argparse.RawDescriptionHelpFormatter
        parser = add_browser_options(parser)
        cache_file = CTU_Dataset.get_cache_file()
        parser.add_argument(
            '--cache-file',
            action='store',
            dest='cache_file',
            default=cache_file,
            help=('Cache file path for CTU metadata '
                  '(Env: ``LIM_CTU_CACHE``; '
                  f'default: ``{cache_file}``)')
        )
        parser.add_argument(
            '--ignore-cache',
            action='store_true',
            dest='ignore_cache',
            default=False,
            help="Ignore any cached results (default: ``False``)"
        )
        parser.add_argument(
            'scenario',
            nargs='*',
            type=normalize_ctu_name,
            default=None)
        parser.epilog = textwrap.dedent("""\
            Opens a browser for the web page containing the scenario descriptions
            and data links.

            Arguments are scenario names using either the full name form (e.g.,
            ``CTU-Malware-Capture-Botnet-123-1``) or an abbreviated form (e.g.,
            ``Botnet-123-1``).

            The URL to use is the one seen in the ``SCENARIO_URL`` column of
            the output of the ``lim ctu list`` command.

            To see help information about how the browser option works and how
            you can configure it, see ``lim about --help``.
            """)
        return parser

    def take_action(self, parsed_args):
        """Open a browser on the overview page or on each scenario's page.

        With no scenario arguments, prints the CTU disclaimer and opens
        the datasets overview URL. Otherwise opens the ``Capture_URL`` of
        every named scenario for which one is returned.
        """
        self.log.debug('[+] showing overview of CTU datasets')
        # TODO(dittrich): Getting really not DRY: Move this into class.
        if not hasattr(self, 'ctu_metadata'):
            self.ctu_metadata = CTU_Dataset(
                cache_file=parsed_args.cache_file,
                ignore_cache=parsed_args.ignore_cache,
                debug=self.app_args.debug)
        self.ctu_metadata.load_ctu_metadata()
        # Expand scenario names if abbreviated. This is done on the loaded
        # metadata instance (as CTUGet does) rather than on the class.
        scenarios = [
            self.ctu_metadata.get_fullname(name=s)
            for s in parsed_args.scenario
        ]
        pages = []
        if not scenarios:
            print(CTU_Dataset.get_disclaimer())
            pages.append(CTU_Dataset.get_ctu_datasets_overview_url())
        else:
            for scenario in scenarios:
                page = self.ctu_metadata.get_scenario_data(scenario,
                                                           'Capture_URL')
                if page is not None:
                    pages.append(page)
        for page in pages:
            open_browser(page=page,
                         browser=parsed_args.browser,
                         force=parsed_args.force)