Пример #1
0
    def test_preprocessing_pipeline(self):
        """Test preprocessing pipeline.

        Two scenarios are exercised on ``self.test_data``:

        1. a pipeline built with the ``project`` and ``description``
           attributes, for which the types of the first three steps are
           verified as well;
        2. a pipeline built with the additional ``cve_id`` attribute, which
           is also requested as an output attribute of the third step.

        In both scenarios the transformed data must not be longer than the
        input, must not be empty and its first record must expose the
        ``features`` and ``label`` attributes.
        """
        # pipeline with the basic set of attributes
        pipeline = pipelines.get_preprocessing_pipeline(
            nvd_attributes=['project', 'description'], share_hooks=True)

        # check that the pipeline contains correct steps
        steps, preps = list(zip(*pipeline.steps))
        self.assertIsInstance(preps[0], preprocessors.NVDFeedPreprocessor)
        self.assertIsInstance(preps[1], preprocessors.LabelPreprocessor)
        self.assertIsInstance(preps[2], preprocessors.NLTKPreprocessor)

        # set up fit parameters (see sklearn fit_params notation):
        # keys have the form `<step name>__<parameter name>`
        fit_params = {
            "%s__feed_attributes" % steps[2]: ['description'],
            "%s__output_attributes" % steps[2]: ['label']
        }

        prep_data = pipeline.fit_transform(X=self.test_data, **fit_params)

        # sanity check
        self.assertLessEqual(len(prep_data), len(self.test_data))

        # check that prep_data is not empty
        # NOTE: this is a bit risky since there is no assurance that there
        # are suitable cves in the first n records
        self.assertTrue(any(prep_data))
        self.assertTrue(hasattr(prep_data[0], 'features'))  # default output
        self.assertTrue(hasattr(prep_data[0], 'label'))  # custom attribute

        # ---

        # custom attributes
        pipeline = pipelines.get_preprocessing_pipeline(
            nvd_attributes=['cve_id', 'project', 'description'],
            share_hooks=True  # reuse already existing hook
        )

        # take the step names from the pipeline under test rather than
        # reusing the names of the pipeline created above
        steps, _ = list(zip(*pipeline.steps))

        fit_params = {
            "%s__feed_attributes" % steps[2]: ['description'],
            "%s__output_attributes" % steps[2]: ['cve_id', 'label']
        }

        prep_data = pipeline.fit_transform(X=self.test_data, **fit_params)

        # sanity check
        self.assertLessEqual(len(prep_data), len(self.test_data))

        self.assertTrue(any(prep_data))
        self.assertTrue(hasattr(prep_data[0], 'features'))  # default output
        self.assertTrue(hasattr(prep_data[0], 'label'))  # custom attribute
        # NOTE(review): 'cve_id' is requested as an output attribute above but
        # is never asserted on the result -- presumably it should be exposed
        # the same way as 'label'; confirm and add the assertion
def _get_preprocessed_test_data():
    """Return preprocessed data.

    Updates the 'recent' NVD feed, takes at most the first 500 CVE records
    from it and runs them through the pipeline returned by
    `get_preprocessing_pipeline()`. The third pipeline step is fitted with
    'description' as its feed attribute and 'label' as its output attribute.

    Note: used for tests only.

    :returns: result of `fit_transform` of the preprocessing pipeline
    """
    from itertools import islice

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # get the sample cves -- only the first n to speed up tests
    # islice simply stops early when the feed yields fewer than n records,
    # whereas calling next() n times would raise StopIteration in that case
    n_records = 500
    data = list(islice(feed.cves(), n_records))

    pipeline = get_preprocessing_pipeline()
    steps, _ = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation):
    # keys have the form `<step name>__<parameter name>`
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=data, **fit_params)

    return prep_data
Пример #3
0
    def test_training_pipeline(self):
        """Check the training pipeline on freshly preprocessed test data.

        The test data is first run through the preprocessing pipeline, the
        result is split column-wise into features and labels, and those are
        fed into the training pipeline. The training pipeline must consist of
        a feature extractor followed by the NB classifier, and fitting it has
        to yield a classifier object exposing `features`.
        """
        preprocessing = pipelines.get_preprocessing_pipeline(
            nvd_attributes=['project', 'description'],
            share_hooks=True,
        )

        # sklearn routes fit parameters by `<step name>__<parameter name>`
        step_names, _ = zip(*preprocessing.steps)
        third_step = step_names[2]
        prep_fit_params = {
            "%s__feed_attributes" % third_step: ['description'],
            "%s__output_attributes" % third_step: ['label'],
        }

        preprocessed = np.array(
            preprocessing.fit_transform(X=self.test_data, **prep_fit_params)
        )

        # first column holds the features, second one the labels
        features = preprocessed[:, 0]
        labels = preprocessed[:, 1]

        training = pipelines.get_training_pipeline()
        _, estimators = zip(*training.steps)

        self.assertIsInstance(estimators[0], extractors.FeatureExtractor)
        self.assertIsInstance(estimators[1], classifiers.NBClassifier)

        classifier = training.fit_transform(X=features, y=labels)

        self.assertIsNotNone(classifier)
        self.assertIsNotNone(classifier.features)
Пример #4
0
def main():
    """Train a classifier on NVD feed data and optionally export it.

    Command line arguments are parsed with the module-level `__parser`.
    The CVEs of the requested NVD feeds are run through the preprocessing
    pipeline, split into features and labels and passed to the training
    pipeline built with `FEATURE_HOOKS`. When `args.export` is set, the
    resulting classifier is exported.

    :raises NotImplementedError: if `args.csv` is set (not supported yet)
    :raises SystemExit: with exit code 1 if no data is left after
        preprocessing
    """
    args = __parser.parse_args()

    if args.csv:
        # TODO: support loading the data from a csv file
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")

    print("Getting NVD Feed...")
    feed = NVD.from_feeds(feed_names=args.nvd_feeds)
    feed.update()
    data = feed.cves()  # generator

    # fit and transform the data with the pre-processing pipeline
    print("Preprocessing...")
    prep_pipeline = pipelines.get_preprocessing_pipeline()
    steps, _ = list(zip(*prep_pipeline.steps))

    # sklearn fit_params notation: `<step name>__<parameter name>`
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = prep_pipeline.fit_transform(X=data, **fit_params)
    print("Preprocessing done.")

    prep_data = np.array(prep_data)
    if prep_data.size == 0:
        print(
            "No data left after preprocessing. Check the data provided"
            " or modify preprocessing pipeline.",
            file=sys.stderr)
        # sys.exit instead of the builtin exit(): the latter is injected by
        # the `site` module for interactive use and may not be available
        sys.exit(1)

    # split the data into features (first column) and labels (second column)
    features, labels = prep_data[:, 0], prep_data[:, 1]

    print("Training...")
    # fit and transform the data with the training pipeline
    train_pipeline = pipelines.get_training_pipeline(
        feature_hooks=FEATURE_HOOKS)

    classifier = train_pipeline.fit_transform(X=features, y=labels)
    print("Training done.")

    if args.export:
        classifier.export()