def test_model(names,
               labels,
               attributes,
               model,
               print_res=True,
               print_score=True):
    """
        Use an existing model to classify new JS inputs.

        -------
        Parameters:
        - names: list
            Names of the data files to classify. Only used when print_res is set.
        - labels: list
            Labels (i.e. 'benign', 'malicious', or '?') of the test data.
            Only used when print_score is set.
        - attributes: csr_matrix
            Features of the data to classify; passed as is to model.predict.
        - model: str or model object
            Either the path to a pickled model, or an already loaded object exposing
            a predict method. Beware: the model must have been constructed using the
            same parameters as for the current classification process.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions.
        - print_score: bool
            Indicates whether to print or not the classifier's performance.

        -------
        Returns:
        - The labels predicted, exactly as returned by model.predict.
    """

    if isinstance(model, str):
        # SECURITY: unpickling runs arbitrary code embedded in the file, so only
        # load model files coming from a trusted source.
        # The context manager guarantees the handle is closed, even if loading fails.
        with open(model, 'rb') as model_file:
            model = pickle.load(model_file)

    labels_predicted_test = model.predict(attributes)

    if print_res:
        machine_learning.get_classification_results(names,
                                                    labels_predicted_test)

    if print_score:
        machine_learning.get_score(labels, labels_predicted_test)

    return labels_predicted_test


# Library helper, not a unit test: stops pytest from collecting it (and failing on
# unknown fixtures) when it is imported into a test module.
test_model.__test__ = False
# Example no. 2
# 0
def classify(names,
             labels,
             attributes,
             model_dir,
             model_name,
             clf_choice,
             estimators,
             print_score=False,
             print_res=False):
    """
        Train a classifier and store the resulting model on disk.

        -------
        Parameters:
        - names: list
            Names of the data files used to build the model. Only used when print_res is set.
        - labels: list
            Labels (i.e. 'benign', 'malicious') of the data used to build the model.
        - attributes: np.array
            Features of the data used to build the model.
        - model_dir: str
            Directory to store the model in. Created if it does not exist yet.
        - model_name: str
            File name of the model inside model_dir.
        - clf_choice: str
            Classifier choice. Either RF, BNB, or MNB.
        - estimators: int
            Number of trees in the forest.
        - print_score: bool
            Indicates whether to print or not the classifier's performance. Default: False.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions. Default: False.

        -------
        Returns:
        - The object returned by the classifier's fit method; this is also what gets pickled.
        - If specified, can also:
            * Print the detection rate and the TP, FP, FN and TN rates of
            the training names tested with the model built from the training attributes, in stdout.
            It will only work for these two classes: 'benign' and 'malicious'.
            * Print the classifier's predictions.
            Beware, the predictions made using the same file to build and test
            the model will give hopelessly optimistic results.
    """

    # Directory to store the classification related files.
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(model_dir, exist_ok=True)

    clf = machine_learning.classifier_choice(clf_choice=clf_choice,
                                             estimators=estimators)
    trained = clf.fit(attributes, labels)  # Model

    # Predicting on the training set only serves the optional reports below,
    # so it is skipped when none of them is requested.
    if print_score or print_res:
        labels_predicted = clf.predict(attributes)

        if print_score:
            machine_learning.get_score(labels, labels_predicted)

        if print_res:
            machine_learning.get_classification_results(names, labels_predicted)

    model_path = os.path.join(model_dir, model_name)
    # The context manager flushes and closes the file before success is logged.
    with open(model_path, 'wb') as model_file:
        pickle.dump(trained, model_file)
    logging.info('The model has been successfully stored in %s', model_path)

    return trained
# Example no. 3
# 0
def test_model(names,
               labels,
               attributes,
               model,
               print_res=False,
               print_res_verbose=True,
               print_score=True,
               threshold=0.50):
    """
        Use an existing model to classify new JS inputs, based on a probability threshold.

        -------
        Parameters:
        - names: list
            Names of the data files to classify. Its length is passed to the
            threshold-based prediction helper.
        - labels: list
            Labels (i.e. 'benign', 'malicious', or '?') of the test data.
        - attributes: csr_matrix
            Features of the data to classify; passed as is to model.predict_proba.
        - model: str or model object
            Either the path to a pickled model, or an already loaded object exposing
            a predict_proba method.
            Beware: the model must have been constructed using files of the same format
            (i.e. same attributes) as the format of the test data.
        - print_res: bool
            Indicates whether to print or not the classifier's predictions.
        - print_res_verbose: bool
            Indicates whether to print or not the classifier's predictions, including the
            probability of membership for each class.
        - print_score: bool
            Indicates whether to print or not the classifier's performance.
        - threshold: float
            Probability of a sample being malicious over which the sample will be classified
            as malicious. Forwarded to machine_learning.predict_labels_using_threshold.

        -------
        Returns:
        - The labels predicted, as returned by
            machine_learning.predict_labels_using_threshold.
    """

    if isinstance(model, str):
        # SECURITY: unpickling runs arbitrary code embedded in the file, so only
        # load model files coming from a trusted source.
        # The context manager guarantees the handle is closed, even if loading fails.
        with open(model, 'rb') as model_file:
            model = pickle.load(model_file)

    # Probability of the samples for each class in the model.
    # Original author's note: first column = benign, second = malicious.
    # NOTE(review): the column order depends on how the model was trained - confirm.
    labels_predicted_proba_test = model.predict_proba(attributes)

    # Perform classification using a threshold (probability of the sample being malicious)
    # to predict the target values.
    labels_predicted_test = machine_learning.predict_labels_using_threshold(
        len(names), labels_predicted_proba_test, threshold)

    if print_res:
        machine_learning.get_classification_results(names,
                                                    labels_predicted_test)

    if print_res_verbose:
        machine_learning.get_classification_results_verbose(
            names, labels, labels_predicted_test, labels_predicted_proba_test,
            model, attributes, threshold)

    if print_score:
        machine_learning.get_score(labels, labels_predicted_test)

    return labels_predicted_test


# Library helper, not a unit test: stops pytest from collecting it (and failing on
# unknown fixtures) when it is imported into a test module.
test_model.__test__ = False