def test_model(names, labels, attributes, model, print_res=True, print_score=True):
    """
        Use an existing model to classify new JS inputs.

        -------
        Parameters:
        - names: list
            Names of the data files to be tested using the model.
        - labels: list
            Labels (i.e. 'benign', 'malicious', or '?') of the test data.
        - attributes: csr_matrix
            Features of the data to be tested using the model.
        - model
            Model to be used to classify new observations. If a str is given, it is
            treated as the path of a pickled model, which is loaded first.
            Beware: the model must have been constructed using the same parameters
            as for the current classification process.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions.
        - print_score: bool
            Indicates whether to print or not the classifier's performance.

        -------
        Returns:
        - list:
            Labels predicted, as returned by model.predict(attributes).
    """

    if isinstance(model, str):
        # NOTE(security): unpickling can execute arbitrary code; only load model
        # files coming from a trusted source.
        # The context manager closes the file handle even if unpickling fails.
        with open(model, 'rb') as model_file:
            model = pickle.load(model_file)

    labels_predicted_test = model.predict(attributes)

    if print_res:
        machine_learning.get_classification_results(names, labels_predicted_test)

    if print_score:
        machine_learning.get_score(labels, labels_predicted_test)

    return labels_predicted_test
def classify(names, labels, attributes, model_dir, model_name, clf_choice, estimators,
             print_score=False, print_res=False):
    """
        Train a classifier and store the resulting model on disk.

        -------
        Parameters:
        - names: list
            Names of the data files used to build a model from.
        - labels: list
            Labels (i.e. 'benign', 'malicious') of the data used to build a model from.
        - attributes: np.array
            Features of the data used to build a model from.
        - model_dir: str
            Path of the directory to store the model in; created if it does not exist.
        - model_name: str
            File name of the model that will be produced inside model_dir.
        - clf_choice: str
            Classifier choice. Either RF, BNB, or MNB.
        - estimators: int
            Number of trees in the forest.
        - print_score: bool
            Indicates whether to print or not the classifier's performance. Default: False.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions. Default: False.

        -------
        Returns:
        - The object returned by fitting the selected classifier on the training
          attributes and labels. The same object is pickled to model_dir/model_name.
        - If specified, can also:
            * Print the detection rate and the TP, FP, FN and TN rates of the training
              names tested with the model built from the training attributes.
              It will only work for these two classes: 'benign' and 'malicious'.
            * Print the classifier's predictions.
              Beware, the predictions made using the same data to build and test the
              model will give hopelessly optimistic results.
    """

    # Directory to store the classification related files
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    clf = machine_learning.classifier_choice(clf_choice=clf_choice, estimators=estimators)
    trained = clf.fit(attributes, labels)  # Model
    # Predictions are made on the training data itself (see the warning above).
    labels_predicted = clf.predict(attributes)

    if print_score:
        machine_learning.get_score(labels, labels_predicted)

    if print_res:
        machine_learning.get_classification_results(names, labels_predicted)

    model_path = os.path.join(model_dir, model_name)
    # Close (and thereby flush) the file before reporting success, so the log
    # message is only emitted once the model has actually been written.
    with open(model_path, 'wb') as model_file:
        pickle.dump(trained, model_file)
    logging.info('The model has been successfully stored in %s', model_path)

    return trained
def test_model(names, labels, attributes, model, print_res=False, print_res_verbose=True,
               print_score=True, threshold=0.50):
    """
        Use an existing model to classify new JS inputs.

        -------
        Parameters:
        - names: list
            Names of the data files to be tested using the model.
        - labels: list
            Labels (i.e. 'benign', 'malicious', or '?') of the test data.
        - attributes: csr_matrix
            Features of the data to be tested using the model.
        - model
            Model to be used to classify new observations. If a str is given, it is
            treated as the path of a pickled model, which is loaded first.
            Beware: the model must have been constructed using data of the same
            format (i.e. same attributes) as the data being tested.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions.
        - print_res_verbose: bool
            Indicates whether to print or not the classifier's predictions, including
            the probability of membership for each class.
        - print_score: bool
            Indicates whether to print or not the classifier's performance.
        - threshold: float
            Probability of a sample being malicious over which the sample will be
            classified as malicious.

        -------
        Returns:
        - list:
            Labels predicted, as returned by
            machine_learning.predict_labels_using_threshold.
    """

    if isinstance(model, str):
        # NOTE(security): unpickling can execute arbitrary code; only load model
        # files coming from a trusted source.
        # The context manager closes the file handle even if unpickling fails.
        with open(model, 'rb') as model_file:
            model = pickle.load(model_file)

    # Probability of the samples for each class in the model.
    # Per the original author's note: first column = benign, second = malicious.
    labels_predicted_proba_test = model.predict_proba(attributes)

    # Perform classification using a threshold (probability of the sample being
    # malicious) to predict the target values.
    labels_predicted_test = machine_learning.predict_labels_using_threshold(
        len(names), labels_predicted_proba_test, threshold)

    if print_res:
        machine_learning.get_classification_results(names, labels_predicted_test)

    if print_res_verbose:
        machine_learning.get_classification_results_verbose(
            names, labels, labels_predicted_test, labels_predicted_proba_test,
            model, attributes, threshold)

    if print_score:
        machine_learning.get_score(labels, labels_predicted_test)

    return labels_predicted_test