def _get_preprocessed_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    __records = 500

    # only first n records to speed up tests
    data = [next(__cve_iter) for _ in range(__records)]

    pipeline = get_preprocessing_pipeline()
    steps, preps = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=data, **fit_params)

    return prep_data
def _get_preprocessed_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    __records = 500

    data = list()
    for i, cve in enumerate(__cve_iter):
        if i >= __records:
            break
        data.append(cve)

    pipeline = get_preprocessing_pipeline(
        nvd_attributes=['project', 'description'])
    steps, preps = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=data, **fit_params)

    return prep_data
@classmethod
def setUpClass(cls):
    """Set up preprocessed data for the test class."""
    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    __records = 500

    # only first n records to speed up tests
    data = [next(__cve_iter) for _ in range(__records)]

    pipeline = get_preprocessing_pipeline(
        nvd_attributes=['project', 'description'],
        share_hooks=True
    )
    steps, preps = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(
        X=data,
        **fit_params
    )

    cls.test_data = prep_data
def test_vendor_product_match_hook(self):
    """Test vendor_product_match_hook."""
    hook = feature_hooks.vendor_product_match_hook

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(['recent'])
    feed.update()

    cve_list = list(feed.cves())

    # find application CPE
    cpe = cve = None
    for cve in cve_list:
        try:
            cpe = cve.configurations[0].cpe[0]
        except IndexError:
            continue
        if cpe.is_application():
            break

    assert all([cve, cpe]), "Failed to gather test data."

    vendor, product = cpe.vendor[0], cpe.product[0]

    # mock CVE with empty configurations instead of searching for one
    empty_cve = type('emptyCVE', (), {})
    empty_cve.configurations = []

    cve_dict = {cve.cve_id: cve, 'empty': empty_cve}

    # empty configurations
    features = [(product, 'NUM')]
    result = hook.__call__(features, 0, cve_dict, 'empty')

    self.assertFalse(result)

    # non-existing ID
    result = hook.__call__(features, 0, cve_dict, 'non-existing-id')

    self.assertFalse(result)

    # matching product
    result = hook.__call__(features, 0, cve_dict, cve.cve_id)

    self.assertTrue(result)

    # matching vendor
    features = [(vendor, 'NUM')]
    result = hook.__call__(features, 0, cve_dict, cve.cve_id)

    self.assertTrue(result)

    # neither vendor nor product matches
    features = [('mismatch', 'NUM')]
    result = hook.__call__(features, 0, cve_dict, cve.cve_id)

    self.assertFalse(result)
def main(argv):
    """Run."""
    args = parse_args(argv=argv)

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = list(feed.cves())  # materialize the CVE generator

        cve_dict = {cve.cve_id: cve for cve in data}

        # set up default argument for vendor-product feature hook
        feature_hooks.vendor_product_match_hook.default_kwargs = {
            'cve_dict': cve_dict
        }

        training_pipeline = Pipeline(
            steps=[
                ('nvd_feed_preprocessor',
                 preprocessing.NVDFeedPreprocessor(
                     attributes=['cve_id', 'description'])),
                ('label_preprocessor',
                 preprocessing.LabelPreprocessor(
                     feed_attributes=['project', 'description'],
                     output_attributes=['cve_id', 'description'],
                     hook=transformers.Hook(
                         key='label_hook',
                         reuse=True,
                         func=utils.find_))),
                ('nltk_preprocessor',
                 preprocessing.NLTKPreprocessor(
                     feed_attributes=['description'],
                     output_attributes=['cve_id', 'label'])),
                ('feature_extractor',
                 transformers.FeatureExtractor(
                     feature_hooks=FEATURE_HOOKS,
                     share_hooks=True)),
                ('classifier', transformers.NBClassifier())
            ])

        start_time = time()
        print("Training started")
        try:
            classifier = training_pipeline.fit_transform(X=data)
        finally:
            print(f"Training finished in {time() - start_time} seconds")

        if args.export:
            classifier.export(args.export_dir)
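# Hedged follow-up sketch (illustration only, not part of the original sources):
# once the classifier trained above has been exported, it can later be restored for
# prediction or evaluation, mirroring classifiers.NBClassifier.restore() as used in
# the evaluation entry point further below. The 'export_dir' argument name and the
# availability of the classifiers module at module level are assumptions.
def _example_restore_exported_classifier(export_dir):
    """Restore a previously exported NBClassifier from a checkpoint directory."""
    import os

    # resolve the checkpoint path relative to the working directory,
    # the same way the evaluation script below does
    path_to_classifier = os.path.join(os.getcwd(), export_dir)

    classifier = classifiers.NBClassifier.restore(path_to_classifier)

    return classifier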
def test_nvd_to_dataframe(self):
    """Test NVD feed transformation to pandas.DataFrame object."""
    from pandas import DataFrame

    # test without handler
    cves = list(NVD.from_feeds(['recent']).cves())
    df = utils.nvd_to_dataframe(cves)

    self.assertIsNotNone(df)
    self.assertIsInstance(df, DataFrame)

    # test with handler - should raise because of a missing GitHub token
    with self.assertRaises(StatusError):
        _ = utils.nvd_to_dataframe(cves, handler=GitHubHandler)
def main():
    """Run."""
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

        # fit and transform the data with the pre-processing pipeline
        print("Preprocessing...")

        prep_pipeline = pipelines.get_preprocessing_pipeline()
        steps, preps = list(zip(*prep_pipeline.steps))
        fit_params = {
            "%s__feed_attributes" % steps[2]: ['description'],
            "%s__output_attributes" % steps[2]: ['label']
        }

        prep_data = prep_pipeline.fit_transform(X=data, **fit_params)
        print("Preprocessing done.")

        prep_data = np.array(prep_data)

        if not prep_data.size > 0:
            print("No data left after preprocessing. Check the data provided"
                  " or modify preprocessing pipeline.", file=sys.stderr)
            exit(1)

        # split the data to features and labels
        features, labels = prep_data[:, 0], prep_data[:, 1]

        print("Training...")

        # fit and transform the data with the training pipeline
        train_pipeline = pipelines.get_training_pipeline(
            feature_hooks=FEATURE_HOOKS)

        classifier = train_pipeline.fit_transform(X=features, y=labels)
        print("Training done.")

        if args.export:
            classifier.export()
def _get_test_data(n_records=500):
    """Return sample CVE data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()

    # only first n records to speed up tests
    data = [next(__cve_iter) for _ in range(n_records)]

    return data
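# Hedged usage sketch (illustration only, not part of the original tests): shows how
# the _get_test_data() helper above can feed the preprocessing pipeline, mirroring
# the fit_params pattern from _get_preprocessed_test_data(). The availability of
# get_preprocessing_pipeline at module level is an assumption, as in the tests above.
def _example_preprocess_sample(n_records=100):
    """Preprocess a small CVE sample with the shared preprocessing pipeline."""
    data = _get_test_data(n_records=n_records)

    pipeline = get_preprocessing_pipeline()
    steps, _ = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    return pipeline.fit_transform(X=data, **fit_params)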
@classmethod
def setUpClass(cls):
    """Set up preprocessed, extracted labeled features for the test class."""
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    data = list(__cve_iter)

    data, labels = extract_labeled_features(
        data=data,
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'])

    cls.data, cls.labels = data, labels
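# Hedged sketch (illustration only, not part of the original test class): a test
# method on the class above could split cls.data and cls.labels into train and test
# sets the same way the evaluation entry point below does. sklearn's
# train_test_split is the only dependency; the 80/20 split ratio is an assumption.
def _example_split_labeled_features(data, labels):
    """Split extracted labeled features into train and test sets."""
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        data, labels,
        test_size=0.2,
        shuffle=True
    )

    return X_train, X_test, y_train, y_test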
def _get_extracted_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    __records = 500

    data = [next(__cve_iter) for _ in range(__records)]

    data, labels = extract_labeled_features(data=data,
                                            attributes=['description'])

    return data, labels
def _get_extracted_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    __cve_iter = feed.cves()
    data = list(__cve_iter)

    data, labels = extract_labeled_features(
        data=data,
        nvd_attributes=['project', 'description'])

    return data, labels
def _get_test_data(n_records=500):
    """Return sample CVE data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    cve_iter = feed.cves()

    data = list()
    for i, cve in enumerate(cve_iter):
        if i >= n_records:
            break
        data.append(cve)

    return data
def main():
    """Run."""
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

        # fit and transform the data with the pre-processing pipeline
        print("Preprocessing...")

        features, labels = pipelines.extract_labeled_features(
            data=data,
            feature_hooks=FEATURE_HOOKS,
            attributes=['description'],
        )

        print("Preprocessing done.")

        if not len(features):
            print("No data left after preprocessing. Check the data provided"
                  " or modify preprocessing pipeline.", file=sys.stderr)
            exit(1)

        path_to_classifier = os.path.join(os.getcwd(), args.path_to_classifier)
        classifier = classifiers.NBClassifier.restore(path_to_classifier)

        # noinspection PyPep8Naming
        X_train, X_test, y_train, y_test = train_test_split(  # pylint: disable=invalid-name
            features, labels,
            test_size=0.2,
            random_state=np.random.randint(0, 100),
            shuffle=True
        )

        if args.eval:
            score = classifier.evaluate(X_test, y_test, sample=True,
                                        n=args.num_candidates)

            print("Evaluation accuracy:", score)

        if args.cross_validate:
            score = classifiers.cross_validate(
                classifier,
                X_train, y_train,
                sample=True,
                n=args.num_candidates,
                folds=args.cross_validation_folds,
                shuffle=True
            )

            print("Cross-validation results:")
            print("-------------------------")
            print("\tIntermediate results:\n")
            print(
                "\n".join("\t\tFold {}: {}".format(fold, np.round(value, 2))
                          for fold, value in enumerate(score.values))
            )
            print("\tAccuracy: %.2f (+/- %.4f)"
                  % (np.round(score.mean, 2), np.round(score.std * 2, 4)))