Пример #1
0
class CTUGet(Command):
    """Get CTU dataset components."""

    # NOTE: The class docstring above is left as a single line on purpose;
    # command frameworks commonly render it as user-visible help text.

    log = logging.getLogger(__name__)

    def get_parser(self, prog_name):
        """Return the argument parser for this command.

        Extends the parser produced by the parent class with the download
        options, the positional ``scenario`` and ``data`` arguments, and an
        epilog that lists the valid data types followed by the CTU
        disclaimer text.
        """
        parser = super().get_parser(prog_name)
        parser.formatter_class = argparse.RawDescriptionHelpFormatter
        # NOTE(review): ``--force``, ``--protocols`` and ``--maxlines`` are
        # parsed here but are not read by take_action() in this class.
        parser.add_argument(
            '--force',
            action='store_true',
            dest='force',
            default=False,
            help="Force over-writing files if they exist (default: ``False``)")
        parser.add_argument('--no-subdir',
                            action='store_true',
                            dest='no_subdir',
                            default=False,
                            help=('Do not maintain scenario name subdirectory '
                                  '(default: ``False``)'))
        _default_protocols = ",".join(DEFAULT_PROTOCOLS)
        parser.add_argument('-P',
                            '--protocols',
                            metavar='<protocol-list>',
                            dest='protocols',
                            # argparse also applies ``type`` to a string
                            # default, so the default ends up as a list too.
                            type=lambda s: s.split(','),
                            default=_default_protocols,
                            help=("Protocols to include, or 'any' "
                                  f'(default: ``{_default_protocols}``)'))
        parser.add_argument(
            '-L',
            '--maxlines',
            metavar='<lines>',
            dest='maxlines',
            default=None,
            help="Maximum number of lines to get (default: ``None``)")
        cache_file = CTU_Dataset.get_cache_file()
        parser.add_argument('--cache-file',
                            action='store',
                            dest='cache_file',
                            default=cache_file,
                            help=('Cache file path for CTU metadata '
                                  '(Env: ``LIM_CTU_CACHE``; '
                                  f'default: ``{cache_file}``)'))
        parser.add_argument(
            '--ignore-cache',
            action='store_true',
            dest='ignore_cache',
            default=False,
            help="Ignore any cached results (default: ``False``)")
        parser.add_argument('scenario',
                            nargs=1,
                            type=normalize_ctu_name,
                            default=None)
        # Lower-cased column names serve both as the accepted ``data``
        # choices and as the list shown in the epilog.
        data_columns = [c.lower() for c in CTU_Dataset.get_data_columns()]
        data_types = ", ".join(data_columns)
        parser.add_argument(
            'data',
            nargs='+',
            type=str.lower,
            choices=data_columns + ['all'],
            default=None)
        parser.epilog = textwrap.dedent(f"""\
            Get one or more data components from a scenario. These
            components are the raw PCAP file, Netflow file, and
            other analytic products from intrusion detection system
            processing, etc.

            See ``lim ctu list --help`` for more on the ``scenario`` argument.

            For the ``data`` argument, you can use ``all`` to recursively
            download all scenario data, or one or more of the data
            files by type: ``{data_types}``

            By default, or when using the ``all`` attribute identifier,
            the file(s) are placed in a subdirectory with the full name
            of the scenario to better organize data across multiple
            scenarios. You can override this when getting specific files
            (i.e., not using ``all``) with the ``--no-subdir`` option.
           \n""") + CTU_Dataset.get_disclaimer()  # noqa
        return parser

    def take_action(self, parsed_args):
        """Download the requested data for a single scenario.

        Loads the CTU metadata, expands the scenario name given on the
        command line to its full form, and then either mirrors the whole
        scenario directory (``all``) or fetches each requested attribute.

        Raises:
            RuntimeError: If the scenario is not a valid scenario name.
        """
        self.log.debug('[+] getting CTU data')
        if not hasattr(self, 'ctu_metadata'):
            self.ctu_metadata = CTU_Dataset(
                cache_file=parsed_args.cache_file,
                ignore_cache=parsed_args.ignore_cache,
                debug=self.app_args.debug)
            # TODO(dittrich): Work this back into init() method.
        self.ctu_metadata.load_ctu_metadata()

        requested = parsed_args.scenario[0]
        scenario = self.ctu_metadata.get_fullname(name=requested)
        if not self.ctu_metadata.is_valid_scenario(scenario):
            # If name expansion produced nothing, report what the user
            # typed rather than the literal text 'None'.
            raise RuntimeError(
                f"[-] scenario '{scenario or requested}' does not exist")
        if parsed_args.no_subdir:
            data_dir = self.app_args.data_dir
        else:
            data_dir = os.path.join(self.app_args.data_dir, scenario)
        if 'all' in parsed_args.data:
            self.recursive_get_all(name=scenario, data_dir=data_dir)
        else:
            for attribute in parsed_args.data:
                self.log.debug(f'[+] downloading {attribute} data '
                               f"for scenario '{scenario}' to {data_dir}")
                self.ctu_metadata.fetch_scenario_content_byattribute(
                    data_dir=data_dir, name=scenario, attribute=attribute)

    def recursive_get_all(self,
                          name,
                          data_dir=None,
                          stderr=subprocess.STDOUT,
                          shell=False):
        """Use wget to recursively get all scenario data.

        Args:
            name: Scenario name whose ``Capture_URL`` is mirrored.
            data_dir: Directory to download into. Defaults to the current
                working directory at call time; created if missing.
            stderr: Passed through to ``subprocess.check_output``.
            shell: Passed through to ``subprocess.check_output``.

        Raises:
            RuntimeError: If ``wget`` cannot be run, or its help output
                does not look like that of GNU Wget.
            SystemExit: With wget's return code if the mirror run fails.
        """
        # Resolve the default at call time: a ``os.getcwd()`` default in the
        # signature would be frozen when the class is defined.
        if data_dir is None:
            data_dir = os.getcwd()
        # Use one absolute path for both ``cwd`` and ``-P``; a relative
        # path would be applied twice and nest the download directory.
        target_dir = os.path.abspath(data_dir)
        os.makedirs(target_dir, exist_ok=True)

        # Probe for a usable wget before starting the real download.
        try:
            banner = subprocess.check_output(  # nosec
                ['wget', '-h'], stderr=stderr, shell=shell
            ).decode('UTF-8', errors='replace').splitlines()
        except (OSError, subprocess.SubprocessError) as err:
            raise RuntimeError(f'[-] cannot run "wget": { err }') from err
        if len(banner) > 1 and banner[0].find(' Wget ') < 0:
            raise RuntimeError('[-] cannot run "wget"')

        url = self.ctu_metadata.get_scenario_data(name=name,
                                                  attribute='Capture_URL')
        # Count only non-empty path components so that a trailing slash on
        # the URL does not make wget strip one directory level too many.
        cut_dirs = len(
            [part for part in urlparse(url).path.split('/') if part])
        # NOTE(review): '--no-check-certificate' disables TLS certificate
        # verification; kept as-is to preserve existing behavior.
        cmd = [
            'wget', '--mirror', '-l3', '--no-parent', '--no-host-directories',
            f'--cut-dirs={cut_dirs}', '--reject=index.html?*', '-P',
            target_dir, '--no-check-certificate'
        ]
        if not url.endswith('/'):
            # Required by wget --no-parent to work right
            url = f"{url}/"
        cmd.append(url)
        # Use subprocess.check_output to run the subcommand.
        self.log.debug(f'[+] cmd: {" ".join(cmd)}')
        self.log.info('[+] recursively getting all data '
                      f"from {url} to '{data_dir}'")
        try:
            subprocess.check_output(  # nosec
                cmd, cwd=target_dir, stderr=stderr, shell=shell)
        except subprocess.CalledProcessError as err:
            # Show what the failed wget run itself printed, then exit
            # with its return code.
            output = err.output
            if isinstance(output, bytes):
                output = output.decode('UTF-8', errors='replace')
            if output:
                sys.stderr.write(output.rstrip('\n') + '\n')
            sys.exit(err.returncode)
Пример #2
0
class Test_CTU_Dataset(unittest.TestCase):
    """Tests for ``CTU_Dataset`` and the related helper functions.

    Every test gets a fresh ``CTU_Dataset`` built from the ``TEST_CACHE``
    cache file with its metadata already loaded.
    """

    # NOTE: Test methods are documented with comments rather than
    # docstrings because unittest shows method docstrings in verbose output.

    def setUp(self):
        self.ctu_dataset = CTU_Dataset(cache_file=TEST_CACHE)
        self.ctu_dataset.load_ctu_metadata()

    def tearDown(self):
        pass

    def test_cache_exists(self):
        self.assertTrue(os.path.exists(TEST_CACHE))

    def test_get_file_last_mtime_exists(self):
        self.assertNotEqual(
            get_file_last_mtime(file_path=TEST_CACHE), 0)

    def test_get_file_last_mtime_notexists(self):
        self.assertEqual(
            get_file_last_mtime(file_path=TEST_EMPTY_CACHE), 0)

    def test_get_file_last_mtime_nopath(self):
        self.assertRaises(RuntimeError,
                          get_file_last_mtime)

    def test_get_file_last_mtime_relative_path(self):
        self.assertRaises(RuntimeError,
                          get_file_last_mtime,
                          file_path='../../../etc/passwd')

    def test_get_file_last_mtime_clean_empty(self):
        os.makedirs(os.path.dirname(TEST_EMPTY_CACHE), exist_ok=True)
        # Create an empty file; the context manager closes the handle even
        # if something goes wrong.
        with open(TEST_EMPTY_CACHE, 'w'):
            pass
        self.assertEqual(
            get_file_last_mtime(file_path=TEST_EMPTY_CACHE, clean=True), 0)
        # With clean=True the empty file is expected to be gone afterwards.
        self.assertRaises(FileNotFoundError,
                          open,
                          TEST_EMPTY_CACHE,
                          'r')

    # Renamed from ``test_get_data_columns``: a second method of that name
    # further down silently replaced this one, so it never ran.
    def test_get_data_columns_type(self):
        columns = CTU_Dataset.get_data_columns()
        self.assertIsInstance(columns, list)
        self.assertGreater(len(columns), 0)

    def test_get_index_columns(self):
        columns = CTU_Dataset.get_index_columns()
        self.assertIsInstance(columns, list)
        self.assertGreater(len(columns), 0)

    def test_get_all_columns(self):
        columns = CTU_Dataset.get_all_columns()
        self.assertIsInstance(columns, list)
        self.assertGreater(len(columns), 0)

    def test_get_disclaimer(self):
        disclaimer = CTU_Dataset.get_disclaimer()
        self.assertIn("http://dx.doi.org/10.1016/j.cose.2014.05.011",
                      disclaimer)

    def test_get_scenarios(self):
        scenarios = self.ctu_dataset.get_scenarios()
        self.assertIsInstance(scenarios, dict)
        self.assertIn('CTU-Malware-Capture-Botnet-48', scenarios)

    def test_get_scenario_names(self):
        scenario_names = self.ctu_dataset.get_scenario_names()
        self.assertIsInstance(scenario_names, list)
        self.assertGreater(len(scenario_names), 0)
        self.assertEqual(
            scenario_names[0], 'CTU-Malware-Capture-Botnet-90',
            msg=f'scenario_names[0]={scenario_names[0]:40}...')

    def test_is_valid_scenario_short_MATCH(self):
        self.assertFalse(self.ctu_dataset.is_valid_scenario('Botnet-48'))

    def test_is_valid_scenario_long_MATCH(self):
        self.assertTrue(
            self.ctu_dataset.is_valid_scenario(
                'CTU-Malware-Capture-Botnet-48'))

    def test_is_valid_scenario_FAIL(self):
        self.assertFalse(
            self.ctu_dataset.is_valid_scenario(
                'CTU-Milware-Copture-Botnet-48'))

    def test_get_scenario_data_url_SUCCESS(self):
        self.assertEqual(
            self.ctu_dataset.get_scenario_data(
                'CTU-Malware-Capture-Botnet-48', 'Capture_URL'),
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-48')  # noqa

    def test_get_data_columns(self):
        items = list(CTU_Dataset.__DATA_COLUMNS__)
        self.assertListEqual(items, self.ctu_dataset.get_data_columns())

    def test_get_scenario_data_url_FAIL(self):
        # assertRaises makes the test fail when no exception is raised; a
        # bare try/except would pass vacuously in that case.
        with self.assertRaises(RuntimeError) as caught:
            self.ctu_dataset.get_scenario_data(
                'CTU-Malware-Capture-Botnet-48', 'Capture_ORL')
        self.assertIn('is not supported', str(caught.exception))

    def test_get_scenario_data_pcap(self):
        url = self.ctu_dataset.get_scenario_data(
            'CTU-Malware-Capture-Botnet-113-1', 'PCAP')
        self.assertEqual(
            url,
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-113-1/2015-03-12_capture-win6.pcap',  # noqa
            msg=f'url={url}')

    def test_get_scenario_page_short(self):
        self.assertIn('DOCTYPE HTML PUBLIC',
                      self.ctu_dataset.get_scenario_page('Malware-Botnet-42'))

    def test_get_scenario_page_full(self):
        self.assertIn(
            'DOCTYPE HTML PUBLIC',
            self.ctu_dataset.get_scenario_page(
                'CTU-Malware-Capture-Botnet-42'))

    def test_filename_from_url(self):
        filename = self.ctu_dataset.filename_from_url(
            'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Mixed-Capture-1/2015-07-28_mixed.pcap')  # noqa
        self.assertEqual(filename, '2015-07-28_mixed.pcap',
                         msg='filename={}'.format(filename))

    def test_get_fullname_short_5parts(self):
        fullname = self.ctu_dataset.get_fullname(
            name='CTU-Malware-Capture-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_4parts(self):
        fullname = self.ctu_dataset.get_fullname(
            'Malware-Capture-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_3parts1(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-Botnet-116-1')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-116-1')

    def test_get_fullname_short_3parts2(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-Capture-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_2parts1(self):
        fullname = self.ctu_dataset.get_fullname(name='Malware-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_2parts2(self):
        fullname = self.ctu_dataset.get_fullname(name='Capture-42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_1part_number(self):
        fullname = self.ctu_dataset.get_fullname(name='42')
        self.assertEqual(fullname, 'CTU-Malware-Capture-Botnet-42')

    def test_get_fullname_short_1part_name(self):
        self.assertRaises(SystemExit,
                          self.ctu_dataset.get_fullname,
                          name='IoT')

    def test_get_fullname_short_fail(self):
        fullname = self.ctu_dataset.get_fullname(name='Botnet-1')
        self.assertIsNone(fullname)

    def test_get_fullname_typo(self):
        fullname = self.ctu_dataset.get_fullname(
            name='CTU_Malware_Capture-Botnet-42')
        self.assertIsNone(fullname)

    def test_get_shortname_match(self):
        shortname = self.ctu_dataset.get_shortname(
            name='CTU-Malware-Capture-Botnet-42')
        self.assertEqual(shortname, 'Malware-Botnet-42')

    def test_normalize_ctu_name_lower(self):
        self.assertEqual(normalize_ctu_name('ctu-malware-botnet-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('iot-malware-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_upper(self):
        self.assertEqual(normalize_ctu_name('CTU-MALWARE-BOTNET-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('IOT-MALWARE-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_mixed(self):
        self.assertEqual(normalize_ctu_name('Ctu-Malware-Botnet-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('Iot-Malware-33-1'),
                         'IoT-Malware-33-1')

    def test_normalize_ctu_name_random(self):
        self.assertEqual(normalize_ctu_name('CTU-MALWARE-BOTNET-42'),
                         'CTU-Malware-Botnet-42')
        self.assertEqual(normalize_ctu_name('IoT-MaLwArE-33-1'),
                         'IoT-Malware-33-1')
Пример #3
0
class CTUOverview(Command):
    """Get CTU dataset overview."""

    # NOTE: The class docstring above is left as a single line on purpose;
    # command frameworks commonly render it as user-visible help text.

    log = logging.getLogger(__name__)

    def get_parser(self, prog_name):
        """Return the argument parser for this command.

        Extends the parser produced by the parent class with the browser
        options, the metadata cache options, and zero or more positional
        ``scenario`` names.
        """
        parser = super().get_parser(prog_name)
        parser.formatter_class = argparse.RawDescriptionHelpFormatter
        parser = add_browser_options(parser)
        cache_file = CTU_Dataset.get_cache_file()
        parser.add_argument(
            '--cache-file',
            action='store',
            dest='cache_file',
            default=cache_file,
            help=('Cache file path for CTU metadata '
                  '(Env: ``LIM_CTU_CACHE``; '
                  f'default: ``{cache_file}``)')
        )
        parser.add_argument(
            '--ignore-cache',
            action='store_true',
            dest='ignore_cache',
            default=False,
            help="Ignore any cached results (default: ``False``)"
        )
        parser.add_argument(
            'scenario',
            nargs='*',
            type=normalize_ctu_name,
            default=None)
        parser.epilog = textwrap.dedent("""\
            Opens a browser for the web page containing the scenario
            descriptions and data links.

            Arguments are scenario names using either the full name
            form (e.g., ``CTU-Malware-Capture-Botnet-123-1``) or an
            abbreviated form (e.g., ``Botnet-123-1``).

            The URL to use is the one seen in the ``SCENARIO_URL`` column
            of the output of the ``lim ctu list`` command.

            To see help information about how the browser option works and
            how you can configure it, see ``lim about --help``.
            """)
        return parser

    def take_action(self, parsed_args):
        """Open a browser page for each requested scenario.

        With no scenario arguments, prints the CTU disclaimer and opens
        the datasets overview page. Otherwise opens the ``Capture_URL``
        page of every named scenario for which a page is returned.
        """
        self.log.debug('[+] showing overview of CTU datasets')
        # TODO(dittrich): Getting really not DRY: Move this into class.
        if not hasattr(self, 'ctu_metadata'):
            self.ctu_metadata = CTU_Dataset(
                cache_file=parsed_args.cache_file,
                ignore_cache=parsed_args.ignore_cache,
                debug=self.app_args.debug)
        self.ctu_metadata.load_ctu_metadata()
        # Expand scenario names if abbreviated. This is done on the
        # metadata object, after it has been loaded, rather than on the
        # bare CTU_Dataset class.
        scenarios = [self.ctu_metadata.get_fullname(name=s)
                     for s in parsed_args.scenario]
        pages = []
        if not scenarios:
            print(CTU_Dataset.get_disclaimer())
            pages.append(CTU_Dataset.get_ctu_datasets_overview_url())
        else:
            for scenario in scenarios:
                page = self.ctu_metadata.get_scenario_data(scenario,
                                                           'Capture_URL')
                if page is not None:
                    pages.append(page)
        for page in pages:
            open_browser(page=page,
                         browser=parsed_args.browser,
                         force=parsed_args.force)