예제 #1
0
def classify_email_with_enron(email):
    """
    Classify spam possibility of given email with enron dataset.

    The e-mail is turned into a feature vector using the Enron vocabulary.
    Training data is loaded from the ``.npy`` cache files in the current
    working directory when BOTH files exist; otherwise it is extracted
    again and both cache files are (re)written. ``linear_svc`` is then
    fitted on the training part of a ``train_test_split`` (test_size=0.40)
    and used to predict the class of the e-mail.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      Spam or not, as returned by ``linear_svc.predict`` for the e-mail.
    """

    features_path = 'enron_features_matrix.npy'
    labels_path = 'enron_labels.npy'

    vocablary_dict = email_processor.create_enron_dictionary()
    feature_vector = email_processor.feature_vector_from_email(
        email, vocablary_dict)
    # predict() is given a 2-D array with 3000 columns.
    double_dimesion_email = np.reshape(feature_vector, (-1, 3000))

    # The cache is usable only if both files are present. The previous
    # check (`a == False & b == False`) effectively tested the features
    # file alone, because `&` binds tighter than `==`, so a missing labels
    # file made np.load fail in the else branch.
    cache_complete = (os.path.exists(features_path)
                      and os.path.exists(labels_path))
    if cache_complete:
        features_matrix = np.load(features_path)
        labels = np.load(labels_path)
    else:
        features_matrix, labels = email_processor.extract_enron_features()
        np.save(features_path, features_matrix)
        np.save(labels_path, labels)

    # Only the training portion of the split is used; the rest is dropped.
    X_train, _, y_train, _ = train_test_split(features_matrix,
                                              labels,
                                              test_size=0.40)
    linear_svc.fit(X_train, y_train)
    return linear_svc.predict(double_dimesion_email)
예제 #2
0
 def test_feature_vector_from_email(self):
     """Check that the feature vector built for a sample e-mail has 1899 entries."""
     sample = "<*****@*****.**> Do You Want To Make $1000 Or More Per Week? https://github.com"
     vocabulary = email_processor.get_vocablary_dict()
     vector = email_processor.feature_vector_from_email(sample, vocabulary)
     expected_length = 1899
     self.assertEqual(len(vector), expected_length)
예제 #3
0
def classify_email(email):
    """
    Predict whether the given email is spam.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      Spam or not, as returned by ``linear_svm.predict``.
    """

    # train_svm() runs on every call, before predicting; presumably it
    # fits ``linear_svm`` (its definition is not visible here).
    train_svm()

    vocabulary = email_processor.get_vocablary_dict()
    features = email_processor.feature_vector_from_email(email, vocabulary)

    # predict() is given a 2-D array with 1899 columns.
    sample_matrix = np.reshape(features, (-1, 1899))
    return linear_svm.predict(sample_matrix)