def test_preprocessing_pipeline(self):
    """Test preprocessing pipeline."""
    # default preprocessing pipeline
    pipeline = pipelines.get_preprocessing_pipeline(
        nvd_attributes=['project', 'description'],
        share_hooks=True)

    # check that the pipeline contains the correct steps
    steps, preps = list(zip(*pipeline.steps))

    self.assertIsInstance(preps[0], preprocessors.NVDFeedPreprocessor)
    self.assertIsInstance(preps[1], preprocessors.LabelPreprocessor)
    self.assertIsInstance(preps[2], preprocessors.NLTKPreprocessor)

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=self.test_data, **fit_params)

    # sanity check
    self.assertLessEqual(len(prep_data), len(self.test_data))

    # check that prep_data is not empty
    # NOTE: this is a bit risky since there is no assurance that there
    # are suitable CVEs in the first n records
    self.assertTrue(any(prep_data))

    self.assertTrue(hasattr(prep_data[0], 'features'))  # default output
    self.assertTrue(hasattr(prep_data[0], 'label'))  # custom attribute

    # ---
    # custom attributes

    pipeline = pipelines.get_preprocessing_pipeline(
        nvd_attributes=['cve_id', 'project', 'description'],
        share_hooks=True  # reuse the already existing hooks
    )
    # re-read the step names from the new pipeline instead of reusing
    # the (potentially stale) names of the previous one
    steps, _ = list(zip(*pipeline.steps))

    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['cve_id', 'label']
    }

    prep_data = pipeline.fit_transform(X=self.test_data, **fit_params)

    # sanity check
    self.assertLessEqual(len(prep_data), len(self.test_data))
    self.assertTrue(any(prep_data))

    self.assertTrue(hasattr(prep_data[0], 'features'))  # default output
    self.assertTrue(hasattr(prep_data[0], 'label'))  # custom attribute
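
# The "%s__<param>" % step_name keys built above follow scikit-learn's
# fit-params convention: a key of the form "<step>__<param>" is routed to the
# ``fit`` method of the pipeline step named <step>. The sketch below
# demonstrates just that mechanism in isolation; ``EchoTransformer`` is a toy
# class invented for illustration (not part of this project) and the sketch
# assumes scikit-learn's classic, non-routed fit parameter handling.
def _sketch_fit_params_routing():
    """Illustrative sketch of scikit-learn fit_params routing (not a test)."""
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import Pipeline

    class EchoTransformer(BaseEstimator, TransformerMixin):
        """Toy transformer that records the fit parameter it receives."""

        def fit(self, X, y=None, tag=None):
            self.tag_ = tag
            return self

        def transform(self, X):
            return X

    pipe = Pipeline([('echo', EchoTransformer())])
    # the 'echo__' prefix routes ``tag='demo'`` to EchoTransformer.fit
    pipe.fit([[0]], echo__tag='demo')
    assert pipe.named_steps['echo'].tag_ == 'demo'
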
def _get_preprocessed_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update the feed
    feed.update()

    # sample CVEs; only the first n records to speed up the tests
    cve_iter = feed.cves()
    records = 500
    data = [next(cve_iter) for _ in range(records)]

    pipeline = get_preprocessing_pipeline()
    steps, _ = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=data, **fit_params)

    return prep_data
def test_training_pipeline(self):
    """Test training pipeline."""
    prep_pipeline = pipelines.get_preprocessing_pipeline(
        nvd_attributes=['project', 'description'],
        share_hooks=True)
    steps, preps = list(zip(*prep_pipeline.steps))

    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = prep_pipeline.fit_transform(
        X=self.test_data, **fit_params)

    # split the data into features and labels
    prep_data = np.array(prep_data)
    features, labels = prep_data[:, 0], prep_data[:, 1]

    train_pipeline = pipelines.get_training_pipeline()
    _, trains = list(zip(*train_pipeline.steps))

    self.assertIsInstance(trains[0], extractors.FeatureExtractor)
    self.assertIsInstance(trains[1], classifiers.NBClassifier)

    clf = train_pipeline.fit_transform(X=features, y=labels)

    self.assertIsNotNone(clf)
    self.assertIsNotNone(clf.features)
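
# ``np.array(prep_data)`` above relies on each preprocessed record behaving
# like a (features, label) pair, which numpy turns into a 2-D object array
# that can be column-sliced. A minimal sketch of that assumption (the literal
# pairs below are made up for illustration):
def _sketch_feature_label_split():
    """Illustrative sketch of the column-slicing split (not a test)."""
    import numpy as np

    pairs = np.array([({'token': 1}, 'pos'), ({'token': 2}, 'neg')])
    # column 0 holds the features, column 1 the labels
    features, labels = pairs[:, 0], pairs[:, 1]

    assert list(labels) == ['pos', 'neg']
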
def main():
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

    # fit and transform the data with the preprocessing pipeline
    print("Preprocessing...")

    prep_pipeline = pipelines.get_preprocessing_pipeline()
    steps, _ = list(zip(*prep_pipeline.steps))

    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = prep_pipeline.fit_transform(X=data, **fit_params)
    print("Preprocessing done.")

    prep_data = np.array(prep_data)

    if prep_data.size == 0:
        print(
            "No data left after preprocessing. Check the data provided"
            " or modify the preprocessing pipeline.",
            file=sys.stderr)
        sys.exit(1)

    # split the data into features and labels
    features, labels = prep_data[:, 0], prep_data[:, 1]

    print("Training...")

    # fit and transform the data with the training pipeline
    train_pipeline = pipelines.get_training_pipeline(
        feature_hooks=FEATURE_HOOKS)

    classifier = train_pipeline.fit_transform(X=features, y=labels)
    print("Training done.")

    if args.export:
        classifier.export()
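
# Example invocation (hypothetical flag spellings -- the actual names come
# from the ``__parser`` argparse definition elsewhere in this module, which is
# only assumed here to expose ``nvd_feeds``, ``csv`` and ``export`` the way
# ``main()`` consumes them):
#
#     $ python <this_script>.py --nvd-feeds recent --export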