Exemplo n.º 1
0
def main():
    """
    Top-level driver: load the corpus, split it, train every registered
    classifier, and report classification results.
    """
    labels_dict = extract_labels()

    # The unlabeled test set has no labels, so preprocess gets None for them.
    unlabeled_raw = read_unlabeled_test_set()
    processed_unlabeled_test_set = preprocess(None, unlabeled_raw)

    training_raw = read_training_set()
    corpus = preprocess(labels_dict, training_raw)

    # Hold out the first 1/CORPUS_SPLIT of the corpus for testing.
    cutoff = len(corpus) // CORPUS_SPLIT
    corpus_items = list(corpus.items())

    testing = dict(corpus_items[:cutoff])  # 1/3
    training = dict(corpus_items[cutoff:])  # 2/3

    d_print('training and testing set sizes',
            str(len(training)),
            str(len(testing)),
            source='main')

    # TODO: INSTANTIATE YOUR CLASSIFIER AND ADD IT TO THE DICT
    classifiers = {
        'Naive Bayesian': naive_bayesian.NaiveBayesianClassifier(),
        'SVM': svm.SVMClassifier(),
        'Decision Tree': decision_tree.DecisionTreeClassifier(),
    }

    train(classifiers, training)
    classify(classifiers, testing, processed_unlabeled_test_set)
Exemplo n.º 2
0
    def classify_all(self, emails):
        """
        Classify every email in *emails* and return a dict mapping each
        key to the first element of the per-email classification result.
        """
        start = timer()
        # self.classify returns a sequence; only its first element is kept.
        predictions = {key: self.classify(message)[0]
                       for key, message in emails.items()}
        end = timer()
        d_print("Classification done, t = " + str(end - start), source="SVM")
        return predictions
Exemplo n.º 3
0
def extract_labels():
    """
    Extract labels.txt to build a dictionary and save the result to JSON file.
    If there's already a labels.json available, just read from this file.

    Returns:
        dict: {filename: label} — each line of labels.txt is
        "<label> <filename>", stored keyed by filename.
    """
    if os.path.isfile('./labels.json'):
        d_print('Reading labels from the local JSON cache',
                source='extract_labels')
        with open('labels.json') as labels_json:
            return json.load(labels_json)
    else:
        d_print('Generating a local JSON cache of labels',
                source='extract_labels')
        # BUG FIX: the labels.txt handle was opened but never closed;
        # a context manager guarantees it is released.
        with open('labels.txt') as labels_txt:
            labels_txt_lines = labels_txt.readlines()
        labels_dict = dict(
            (label.split()[1], label.split()[0]) for label in labels_txt_lines)
        with open('labels.json', 'w') as labels_json:
            json.dump(labels_dict, labels_json)
        return labels_dict
Exemplo n.º 4
0
def read_unlabeled_test_set():
    """
    Read all raw test EML files from the directory ./TESTING into a
    dictionary, { filename: content ... }, caching the result in
    unlabeled_test_set.json.

    (Docstring fixed: it previously claimed ./TRAINING, but the code
    reads from ./TESTING.)
    """
    if os.path.isfile('./unlabeled_test_set.json'):
        d_print('Reading unlabeled test set from the local JSON cache',
                source='read_unlabeled_test_set')
        with open('unlabeled_test_set.json') as unlabeled_test_set:
            return json.load(unlabeled_test_set)
    else:
        d_print('Generating unlabeled test set from EML files',
                source='read_unlabeled_test_set')
        test_files = os.listdir('TESTING')
        test_files_dict = {}
        for file_name in test_files:
            # BUG FIX: each file handle was left open (one leak per EML
            # file); 'with' closes each one as soon as it is read.
            with open('TESTING/' + file_name,
                      'r',
                      encoding='utf-8',
                      errors='ignore') as eml_file:
                test_files_dict[file_name] = eml_file.read()
        with open('unlabeled_test_set.json', 'w') as unlabeled_test_set:
            json.dump(test_files_dict, unlabeled_test_set)
        return test_files_dict
Exemplo n.º 5
0
def train(classifiers, training_set):
    """
    Calls training routines of the classifiers.
    """
    for cls_name, clf in classifiers.items():
        tag = cls_name + ' (main)'

        # Some classifiers are skipped entirely during the live demo.
        if cls_name in EXCLUSION_LIST_FOR_LIVE_DEMO:
            d_print(cls_name, 'skipped for Live Demo', source=tag)
            continue

        d_print('Starting additional pre-processing and training', source=tag)

        start = timer()
        clf.train(training_set)
        end = timer()

        d_print('Training complete, t =', str(end - start), source=tag)
Exemplo n.º 6
0
    def train(self, training_set):
        """
        Build feature vectors for all training emails, keep only the
        `self.number_of_features` best features, and fit an SVC on them.

        Args:
            training_set: dict of {key: email_data}; each email_data has a
                "label" entry plus whatever fields get_feature_vector reads.
        """
        self.all_features = self.all_words(training_set)
        features = []
        labels = []

        start = timer()
        # BUG FIX: the original kept a manual counter with an early-abort
        # check (`if i == n_laps: break`) that could never fire — the
        # counter only reached len(training_set) after the final item had
        # already been processed. The dead code is removed; behavior is
        # unchanged: every email is processed.
        for _, email_data in training_set.items():
            labels.append(email_data["label"])
            features.append(self.get_feature_vector(email_data))
        end = timer()
        d_print("Pre-processing done, t = " + str(end - start), source="SVM")

        # Reduce the number of features via univariate ANOVA F-scores.
        start = timer()
        self.feature_selection = SelectKBest(f_classif,
                                             k=self.number_of_features)
        important_features = self.feature_selection.fit_transform(
            features, labels)
        end = timer()
        d_print("Feature selection done, t = " + str(end - start), source="SVM")

        # Train the classifier on the reduced feature matrix.
        start = timer()
        self.classifier = SVC()
        self.classifier.fit(important_features, labels)
        end = timer()
        d_print("Classifier training done, t = " + str(end - start), source="SVM")
Exemplo n.º 7
0
def classify(classifiers, testing_set, unlabeled_testing_set):
    """
    Calls classify routines of the classifiers using classify_all (in meta.py), and reports accuracy.
    """
    for cls_name, clf in classifiers.items():
        print()
        if cls_name in EXCLUSION_LIST_FOR_LIVE_DEMO:
            d_print(cls_name, 'skipped for Live Demo', source='classify')
            continue

        tag = cls_name + ' (main)'

        # --- Labeled set: measure accuracy against the known labels. ---
        d_print('Starting classification of labeled testing set', source=tag)

        start = timer()
        result = clf.classify_all(testing_set)
        end = timer()

        d_print('Classification of labeled testing set done, t =',
                str(end - start), source=tag)
        assert len(result) == len(testing_set)

        correct_result_count = sum(
            1 for eml_filename in result
            if str(result[eml_filename]) == str(
                testing_set[eml_filename]['label']))
        print(
            '\n', correct_result_count, 'out of', len(result),
            'cases were correct.\n', cls_name,
            'is {:6.4f} % accurate.\n'.format(correct_result_count /
                                              len(result) * 100))

        # --- Unlabeled set: report how much of it is flagged as spam. ---
        d_print('Starting classification of unlabeled testing set', source=tag)

        start = timer()
        result = clf.classify_all(unlabeled_testing_set)
        end = timer()

        d_print('Classification of unlabeled testing set done, t =',
                str(end - start), source=tag)
        assert len(result) == len(unlabeled_testing_set)

        # Label "0" denotes spam here.
        spam_result_count = sum(
            1 for eml_filename in result
            if str(result[eml_filename]) == str(0))
        print(
            '\n', spam_result_count, 'out of', len(result),
            'unlabeled cases were reported as spam.\n', cls_name,
            'claims {:6.4f} % of unlabeled test set is spam.\n'.format(
                spam_result_count / len(result) * 100))

        d_print('Finished classification', source=tag)