def get_training_pipeline(feature_hooks=None) -> Pipeline:
    """Construct the two-stage training pipeline.

    The pipeline consumes data that have already been preprocessed and
    fits an NBClassifier on them.

    *must be fit using `fit_transform` method.*

    :param feature_hooks: dict, {feature_key: Hook} to be used as an argument
        to `FeatureExtractor`

        Specify features which should be extracted from the given set.
        The hooks are called for each element of the set and return
        corresponding features.
    """
    extractor = transformers.FeatureExtractor(
        feature_hooks=feature_hooks,
        # make hooks sharable (useful if training pipeline was used before)
        share_hooks=True,
    )

    return Pipeline(steps=[
        ('feature_extractor', extractor),
        ('classifier', transformers.NBClassifier()),
    ])
def main(argv):
    """Run."""
    args = parse_args(argv=argv)

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")

    print("Getting NVD Feed...")
    feed = NVD.from_feeds(feed_names=args.nvd_feeds)
    feed.update()

    # materialize the CVE generator so it can be iterated more than once
    cves = list(feed.cves())
    cve_lookup = {cve.cve_id: cve for cve in cves}

    # set up default argument for vendor-product feature hook
    feature_hooks.vendor_product_match_hook.default_kwargs = {
        'cve_dict': cve_lookup
    }

    steps = [
        ('nvd_feed_preprocessor',
         preprocessing.NVDFeedPreprocessor(
             attributes=['cve_id', 'description'])),
        ('label_preprocessor',
         preprocessing.LabelPreprocessor(
             feed_attributes=['project', 'description'],
             output_attributes=['cve_id', 'description'],
             hook=transformers.Hook(key='label_hook',
                                    reuse=True,
                                    func=utils.find_))),
        ('nltk_preprocessor',
         preprocessing.NLTKPreprocessor(
             feed_attributes=['description'],
             output_attributes=['cve_id', 'label'])),
        ('feature_extractor',
         transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS,
                                       share_hooks=True)),
        ('classifier', transformers.NBClassifier()),
    ]
    training_pipeline = Pipeline(steps=steps)

    start_time = time()
    print("Training started")
    try:
        classifier = training_pipeline.fit_transform(X=cves)
    finally:
        # report elapsed time even if training raised
        print(f"Training finished in {time() - start_time} seconds")

    if args.export:
        classifier.export(args.export_dir)
def get_full_training_pipeline(
        labeling_func: typing.Optional[typing.Callable] = None,
        feature_hooks=None,
        share_hooks=False) -> Pipeline:
    """Build the full training pipeline with no predefined attributes.

    The pipeline accepts raw data, performs preprocessing and feature
    extraction and trains NBClassifier on that data.

    The customization of feed and output attributes is fully left to user.
    It is necessary to provide `fit_params` when fitting, as this pipeline
    does not contain any predefined arguments.

    *must be fit using `fit_transform` method with `fit_params`*

    :param feature_hooks: dict, {feature_key: Hook} to be used as an argument
        to `FeatureExtractor`

        Specify features which should be extracted from the given set.
        The hooks are called for each element of the set and return
        corresponding features.

    :param labeling_func: callable object to be used for labeling

        The `labeling_func` is used to create a hook for `LabelPreprocessor`
        (see `LabelPreprocessor` documentation for more info).

        By default `toolkit.utils.find_` function is used for that purpose.

    :param share_hooks: boolean, whether to reuse hooks

    :returns: Pipeline
    """
    if labeling_func is None:
        labeling_func = utils.find_

    return Pipeline(
        steps=[('nvd_feed_preprocessor',
                preprocessing.NVDFeedPreprocessor()),
               ('label_preprocessor',
                preprocessing.LabelPreprocessor(hook=transformers.Hook(
                    key='label_hook',
                    reuse=share_hooks,
                    func=labeling_func))
                ),
               ('nltk_preprocessor',
                preprocessing.NLTKPreprocessor()),
               ('feature_extractor',
                # BUG FIX: `share_hooks` was previously hard-coded to True
                # here, silently ignoring the caller's `share_hooks` argument
                # which the docstring promises controls hook reuse.
                transformers.FeatureExtractor(feature_hooks=feature_hooks,
                                              share_hooks=share_hooks)
                ),
               ('classifier',
                transformers.NBClassifier())])