def test_prediction_pipeline(self):
    """Check the shape of predictions produced by the prediction pipeline.

    An ``NBClassifier`` is fitted on features extracted from
    ``self.test_data`` and wrapped by ``pipelines.get_prediction_pipeline``.
    The pipeline is then asked for ``n_candidates`` candidates per sample.
    """
    n_candidates = 3
    samples = ['Sample project name prediction'] * 3

    features, _ = pipelines.extract_labeled_features(
        self.test_data,
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'],
    )
    classifier = classifiers.NBClassifier().fit(features)

    pipeline = pipelines.get_prediction_pipeline(classifier)
    predictions = pipeline.fit_predict(
        samples,
        classifier__n=n_candidates,
        classifier__sample=True,
    )

    self.assertIsNotNone(predictions)
    self.assertEqual(predictions.shape[1], n_candidates)
    # the last axis is expected to hold two entries: candidate - proba
    self.assertEqual(predictions.shape[-1], 2)
def test_evaluation(self):
    """Check evaluation and cross-validation scores against ``None`` labels.

    The classifier is fitted on features extracted from the test data and
    then scored against a label list consisting solely of ``None``; both
    the plain evaluation and the cross-validation mean must equal 0.0.
    """
    raw_data = _get_test_data()
    features, _ = pipelines.extract_labeled_features(
        data=raw_data,
        attributes=['description'],
    )

    classifier = classifiers.NBClassifier().fit(features)
    self.assertIsNotNone(classifier)

    # one None label per feature set -- evaluation == 0.0
    none_labels = [None] * len(features)

    eval_score = classifier.evaluate(features, none_labels, sample=True)
    self.assertIsNotNone(eval_score)
    self.assertEqual(eval_score, 0.0)

    cv_score = classifiers.cross_validate(
        classifier,
        features,
        none_labels,
        sample=True,
    )
    self.assertIsNotNone(cv_score)
    self.assertEqual(cv_score.mean, 0.0)
def test_extract_labeled_features(self):
    """Check that labeled feature extraction yields non-empty results.

    Features are extracted from ``self.test_data``; both the feature sets
    and the labels must contain at least one truthy element.
    """
    extracted = pipelines.extract_labeled_features(
        data=self.test_data,
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'],
    )
    featuresets, labels = extracted

    # np.any is used for the feature sets, the builtin any for the labels
    self.assertTrue(np.any(featuresets))
    self.assertTrue(any(labels))
def test_extract_labeled_features(self):
    """Check that labeled feature extraction yields non-empty results.

    Features are extracted from the ``description`` attribute of the data
    returned by ``_get_test_data``; both the feature sets and the labels
    must contain at least one truthy element.
    """
    raw_data = _get_test_data()

    extracted = pipelines.extract_labeled_features(
        data=raw_data,
        attributes=['description'],
    )
    featuresets, labels = extracted

    self.assertTrue(any(featuresets))
    self.assertTrue(any(labels))
def _export_classifier():
    """Fit an ``NBClassifier`` on the test data and export it for unit tests.

    The classifier is exported into a fresh temporary directory whose name
    starts with ``test_export_``.

    Returns:
        Whatever ``NBClassifier.export`` returns (the original code names it
        ``pickle_path``).
    """
    features, _ = pipelines.extract_labeled_features(
        data=_get_test_data(),
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'],
    )

    fitted = classifiers.NBClassifier().fit(features)

    # the directory is created only after fitting succeeded
    export_dir = tempfile.mkdtemp(prefix='test_export_')

    return fitted.export(export_dir=export_dir)
def setUpClass(cls):
    """Extract labeled features from the 'recent' NVD feed for the tests.

    The extracted features and labels are stored on the class as
    ``cls.data`` and ``cls.labels``; nothing is returned.

    NOTE(review): no ``@classmethod`` decorator is visible on this block --
    confirm it is applied where the method is defined.
    """
    from nvdlib.nvd import NVD

    recent_feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    recent_feed.update()

    # get the sample cves
    cves = list(recent_feed.cves())

    cls.data, cls.labels = extract_labeled_features(
        data=cves,
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'],
    )
def _get_extracted_test_data():
    """Return preprocessed data.

    At most 500 CVE records are taken from the 'recent' NVD feed and run
    through ``extract_labeled_features`` on their ``description`` attribute.

    Note: used for tests only.

    Returns:
        A ``(data, labels)`` tuple as produced by
        ``extract_labeled_features``.
    """
    from itertools import islice

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves
    max_records = 500
    # islice simply stops when the feed holds fewer than ``max_records``
    # entries, whereas calling next() that many times would raise
    # StopIteration on a short feed.
    data = list(islice(feed.cves(), max_records))

    data, labels = extract_labeled_features(
        data=data,
        attributes=['description'],
    )

    return data, labels
def _get_extracted_test_data():
    """Return labeled features extracted from the 'recent' NVD feed.

    Note: used for tests only.

    Returns:
        A ``(data, labels)`` tuple as produced by
        ``extract_labeled_features``.
    """
    from nvdlib.nvd import NVD

    recent_feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    recent_feed.update()

    # get the sample cves
    cves = list(recent_feed.cves())

    features, labels = extract_labeled_features(
        data=cves,
        nvd_attributes=['project', 'description'],
    )

    return features, labels
def main():
    """Evaluate and/or cross-validate a stored classifier on NVD feed data.

    Command line options are read from the module-level argument parser.
    The requested NVD feeds are downloaded, labeled features are extracted
    from the ``description`` attribute, and the classifier is restored from
    ``args.path_to_classifier`` (joined onto the current working directory).
    The data is split 80/20; the test part is used when ``args.eval`` is
    set and the train part when ``args.cross_validate`` is set.

    Raises:
        NotImplementedError: if ``args.csv`` is set.
        SystemExit: with status 1 when preprocessing leaves no features.
    """
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")

    print("Getting NVD Feed...")
    feed = NVD.from_feeds(feed_names=args.nvd_feeds)
    feed.update()
    data = feed.cves()  # generator

    # transform the data with the pre-processing pipeline
    print("Preprocessing...")
    features, labels = pipelines.extract_labeled_features(
        data=data,
        feature_hooks=FEATURE_HOOKS,
        attributes=['description'],
    )
    print("Preprocessing done.")

    # Check the extracted features rather than ``data``: ``data`` is a
    # generator object, which is always truthy, so testing it could never
    # detect an empty result.
    if len(features) == 0:
        print("No data left after preprocessing. Check the data provided"
              " or modify preprocessing pipeline.", file=sys.stderr)
        # sys.exit instead of the ``exit`` helper, which is only installed
        # by the ``site`` module and is meant for interactive use.
        sys.exit(1)

    path_to_classifier = os.path.join(os.getcwd(), args.path_to_classifier)
    classifier = classifiers.NBClassifier.restore(path_to_classifier)

    # noinspection PyPep8Naming
    X_train, X_test, y_train, y_test = train_test_split(  # pylint: disable=invalid-name
        features, labels,
        test_size=0.2,
        random_state=np.random.randint(0, 100),
        shuffle=True
    )

    if args.eval:
        score = classifier.evaluate(X_test, y_test,
                                    sample=True,
                                    n=args.num_candidates)

        print("Evaluation accuracy:", score)

    if args.cross_validate:
        score = classifiers.cross_validate(
            classifier,
            X_train, y_train,
            sample=True,
            n=args.num_candidates,
            folds=args.cross_validation_folds,
            shuffle=True
        )

        print("Cross-validation results:")
        print("-------------------------")
        print("\tIntermediate results:\n")
        print(
            "\n".join("\t\tFold {}: {}".format(fold, np.round(value, 2))
                      for fold, value in enumerate(score.values))
        )
        print("\tAccuracy: %.2f (+/- %.4f)"
              % (np.round(score.mean, 2), np.round(score.std * 2, 4)))