def test_load_dataset_from_csv():

    classifier = NaiveBayesClassifier()

    csv_filename = 'datasets/iris.csv'

    data_0 = ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
    data_2 = ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa']
    data_39 = ['5.1', '3.4', '1.5', '0.2', 'Iris-setosa']
    data_60 = ['5.0', '2.0', '3.5', '1.0', 'Iris-versicolor']
    data_81 = ['5.5', '2.4', '3.7', '1.0', 'Iris-versicolor']
    data_89 = ['5.5', '2.5', '4.0', '1.3', 'Iris-versicolor']
    data_104 = ['6.5', '3.0', '5.8', '2.2', 'Iris-virginica']
    data_110 = ['6.5', '3.2', '5.1', '2.0', 'Iris-virginica']
    data_125 = ['7.2', '3.2', '6.0', '1.8', 'Iris-virginica']
    data_143 = ['6.8', '3.2', '5.9', '2.3', 'Iris-virginica']

    loaded_dataset = classifier.load_dataset_from_csv(csv_filename)

    assert loaded_dataset[0] == data_0
    assert loaded_dataset[2] == data_2
    assert loaded_dataset[39] == data_39
    assert loaded_dataset[60] == data_60
    assert loaded_dataset[81] == data_81
    assert loaded_dataset[89] == data_89
    assert loaded_dataset[104] == data_104
    assert loaded_dataset[110] == data_110
    assert loaded_dataset[125] == data_125
    assert loaded_dataset[143] == data_143

    csv_filename_2 = 'tests/unit_tests/resources/load_test.csv'

    loaded_dataset_2 = classifier.load_dataset_from_csv(csv_filename_2)

    assert len(loaded_dataset_2) == 3
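A minimal sketch of a load_dataset_from_csv consistent with these asserts (an assumption; the real method belongs to the NaiveBayesClassifier under test and is not shown on this page):

import csv

def load_dataset_from_csv(filename):
    # Read every non-empty row of the CSV file as a list of strings.
    with open(filename, 'r') as f:
        return [row for row in csv.reader(f) if row]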
Example #2
    def __init__(self):

        self.dataset_filename = 'datasets/iris.csv'
        self.description_filename = 'datasets/iris.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
        self.class_map = dict()
Example #3
def test_std_deviation():

    classifier = NaiveBayesClassifier()
    numbers = [0.5, 1, 4.56, 3]

    assert np.around(classifier.std_deviation(numbers), 13) == 1.8728498783049
    assert classifier.std_deviation(numbers) == np.std(numbers, ddof=1)
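The second assert pins the contract down to the sample standard deviation (ddof=1). A stand-alone sketch of such a std_deviation, assuming a plain function rather than the classifier method:

from math import sqrt

def std_deviation(numbers):
    # Sample standard deviation (ddof=1), matching np.std(numbers, ddof=1).
    mean = sum(numbers) / len(numbers)
    variance = sum((x - mean) ** 2 for x in numbers) / (len(numbers) - 1)
    return sqrt(variance)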
Example #4
def test_gaussian_probability():

    classifier = NaiveBayesClassifier()

    numbers = [[1.0, 1.0, 1.0], [2.0, 1.0, 1.0], [0.0, 1.0, 1.0]]
    results = [0.3989422804014327, 0.24197072451914337, 0.24197072451914337]

    for (x, mean, stdev), expected in zip(numbers, results):
        assert classifier.gaussian_probability(x, mean, stdev) == expected
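The expected values are normal-distribution densities (0.3989... is 1/sqrt(2*pi)). A minimal sketch, assuming gaussian_probability(x, mean, stdev) computes the Gaussian PDF:

from math import exp, pi, sqrt

def gaussian_probability(x, mean, stdev):
    # Density of x under a normal distribution N(mean, stdev**2).
    exponent = exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return exponent / (sqrt(2 * pi) * stdev)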
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to training data')
    args = parser.parse_args()

    training_data_path = args.path

    _, labels, sentences = load_training_set(path=training_data_path)
    nb = NB(sentences=sentences, labels=labels)
    nb.learn()
    nb.save_model('./nbmodel.txt')
Example #6
def __init__(self):
    NaiveBayesClassifier.__init__(
        self, 5, classes=["ORGANIZATION", "LOCATION", "PERSON"])
    names = []
    # Collect known names from the three gazetteer files, closing each
    # file handle after reading.
    for gazetteer in ('./gazeteers/names.male', './gazeteers/names.female',
                      './gazeteers/names.family'):
        with open(gazetteer, 'r') as f:
            names.extend(f.read().splitlines())
    self.names = names
Example #7
def test_evaluate_algorithm():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    n_folds = 5
    results_data = classifier.evaluate_algorithm(dataset, n_folds)

    assert len(results_data) == n_folds
    assert all(0 <= data <= 100 for data in results_data)
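evaluate_algorithm evidently runs k-fold cross-validation and returns one accuracy score per fold, each in [0, 100]. A per-fold score of that kind could come from a helper like the following (a sketch; the helper name is an assumption):

def accuracy_percent(actual, predicted):
    # Share of matching labels, as a percentage in [0, 100].
    correct = sum(1 for a, p in zip(actual, predicted) if a == p)
    return correct / len(actual) * 100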
Example #8
def test_gather_data_params():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = [(5.178333386499999, 2.7665845055177263, 10),
                       (2.9984683241, 1.218556343617447, 10)]
    test_results = classifier.gather_data_params(dataset)

    assert test_results == results_dataset
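The expected tuples read as (mean, sample standard deviation, count) per attribute column, computed over all rows with the trailing label column dropped. A stand-alone sketch consistent with those numbers:

from math import sqrt

def gather_data_params(dataset):
    # Summarize each attribute column as (mean, sample stdev, count);
    # the trailing class-label column is dropped.
    params = []
    for col in list(zip(*dataset))[:-1]:
        mean = sum(col) / len(col)
        stdev = sqrt(sum((x - mean) ** 2 for x in col) / (len(col) - 1))
        params.append((mean, stdev, len(col)))
    return params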
Example #9
def test_k_fold_cross_validation_split():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    folds_num = 5

    results_dataset = classifier.k_fold_cross_validation_split(
        dataset, folds_num)

    assert len(results_dataset) == folds_num
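Only the fold count is asserted. A minimal sketch of a random k-fold split meeting that contract (folds of size len(dataset) // folds_num, drawn without replacement):

from random import randrange

def k_fold_cross_validation_split(dataset, folds_num):
    # Randomly partition the rows into folds_num equally sized folds.
    rows = list(dataset)
    fold_size = len(dataset) // folds_num
    folds = []
    for _ in range(folds_num):
        fold = []
        while len(fold) < fold_size:
            fold.append(rows.pop(randrange(len(rows))))
        folds.append(fold)
    return folds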
Example #10
def test_predict():

    classifier = NaiveBayesClassifier()

    dataset = {
        1: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        0: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }

    row = [3.7, 2.9, 0]

    results_predict = classifier.predict(dataset, row)

    assert results_predict == 1
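The model here maps each class label to per-attribute (mean, stdev, count) tuples. A plausible predict scores every class by its prior times the product of Gaussian densities and returns the argmax; a self-contained sketch that reproduces the asserted result:

from math import exp, pi, sqrt

def predict(class_params, row):
    # Score each class as prior * product of per-attribute Gaussian
    # densities, then return the best-scoring label.
    total_rows = sum(params[0][2] for params in class_params.values())
    best_label, best_score = None, -1.0
    for label, params in class_params.items():
        score = params[0][2] / total_rows  # class prior
        for i, (mean, stdev, _) in enumerate(params):
            exponent = exp(-((row[i] - mean) ** 2) / (2 * stdev ** 2))
            score *= exponent / (sqrt(2 * pi) * stdev)
        if score > best_score:
            best_label, best_score = label, score
    return best_label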
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to training data')
    args = parser.parse_args()

    test_data_path = args.path
    ids, sentences = load_dev_set(test_data_path)

    nb = NB()
    nb.load_model('./nbmodel.txt')

    results = list()

    for s in sentences:
        re = nb.classify(s)
        results.append(re)

    save_results('./nboutput.txt', results, ids)
Example #12
def test_calculate_class_parameters():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = {
        0: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        1: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }

    assert classifier.calculate_class_parameters(dataset) == results_dataset
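calculate_class_parameters evidently groups rows by their trailing class label and summarizes each attribute column as (mean, sample standard deviation, count). A stand-alone sketch reproducing the expected dict:

from math import sqrt

def column_params(column):
    # (mean, sample stdev, count) for a single attribute column.
    mean = sum(column) / len(column)
    stdev = sqrt(sum((x - mean) ** 2 for x in column) / (len(column) - 1))
    return (mean, stdev, len(column))

def calculate_class_parameters(dataset):
    # Group rows by class label, then summarize every attribute column
    # (the label column itself is dropped).
    separated = {}
    for row in dataset:
        separated.setdefault(row[-1], []).append(row)
    return {label: [column_params(col) for col in list(zip(*rows))[:-1]]
            for label, rows in separated.items()}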
Example #13
def main():
    '''
    Main function.

    :return: None
    '''

    # Load Data
    x_train, y_train, x_test, y_test, label_dict = load_mnist(
        which_type='fashion', threshold=0.5)

    # Get the Model
    nbc = NaiveBayesClassifier()

    # Train
    nbc.fit(x_train, y_train)

    # Test
    predictions = nbc.predict(x_test)

    # Evaluate accuracy
    accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test)
    print("Accuracy: ", accuracy)

    # Show Confusion Matrix
    plot_confusion_matrix(targets=y_test,
                          predictions=predictions,
                          classes=[label_dict[l] for l in label_dict])

    # Plot predictions
    plt.figure()
    while True:
        idx = np.random.randint(0, x_test.shape[0])
        x = x_test[idx]
        p = predictions[idx]
        y = y_test[idx]

        plt.imshow(x, cmap='gray')
        plt.title('Target: {}, Prediction: {}'.format(label_dict[int(y)],
                                                      label_dict[int(p)]))
        plt.waitforbuttonpress()
Example #14
def test_divide_data_by_class():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = {
        0: [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
            [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
            [2.280362439, 2.866990263, 0]],
        1: [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1],
            [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1],
            [7.939820817, 0.791637231, 1]]
    }

    assert classifier.divide_data_by_class(dataset) == results_dataset
Example #15
def test_map_class_names_to_ints():

    classifier = NaiveBayesClassifier()

    dataset = [['3.393533211', '2.331273381', '0'],
               ['3.110073483', '1.781539638', '0'],
               ['1.343808831', '3.368360954', '0'],
               ['3.582294042', '4.67917911', '0'],
               ['2.280362439', '2.866990263', '0'],
               ['7.423436942', '4.696522875', '1'],
               ['5.745051997', '3.533989803', '1'],
               ['9.172168622', '2.511101045', '1'],
               ['7.792783481', '3.424088941', '1'],
               ['7.939820817', '0.791637231', '1']]

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, True)

    for i in range(0, len(dataset)):
        tested_row = random.randint(0, len(dataset) - 1)
        assert isinstance(dataset[tested_row][len(dataset[0]) - 1], int)

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, False)

    for i in range(0, len(dataset)):
        tested_row = random.randint(0, len(dataset) - 1)
        assert isinstance(dataset[tested_row][len(dataset[0]) - 1], int)
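Both calls are only checked for leaving ints in the label column. One plausible map_class_names_to_ints is sketched below; reading numbers_already as "labels are numeric strings, keep their values" is an assumption:

def map_class_names_to_ints(dataset, column, numbers_already=False):
    # Replace the class names in `column` with ints, in place, and
    # return the name -> int mapping that was applied.
    class_names = sorted({row[column] for row in dataset})
    mapping = {name: (int(float(name)) if numbers_already else code)
               for code, name in enumerate(class_names)}
    for row in dataset:
        row[column] = mapping[row[column]]
    return mapping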
Example #16
def main():
    """ Main function """

    # load data
    x_train, y_train, x_test, y_test, label_dict = load_mnist(
        which_type='digits', threshold=0.5)

    # get the model
    nbc = NaiveBayesClassifier()

    # train
    nbc.fit(x_train, y_train)

    # test
    predictions = nbc.predict(x_test)

    # evaluate performances
    accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test)
    print('Accuracy: {}'.format(accuracy))

    # show confusion matrix
    plot_confusion_matrix(targets=y_test,
                          predictions=predictions,
                          classes=[label_dict[l] for l in label_dict])

    # plot predictions
    plt.figure()
    while True:
        idx = np.random.randint(0, x_test.shape[0])

        x = x_test[idx]
        p = predictions[idx]
        y = y_test[idx]

        plt.imshow(x, cmap='gray')
        plt.title('Target: {}, Prediction: {}'.format(label_dict[int(y)],
                                                      label_dict[int(p)]))
        plt.waitforbuttonpress()
Example #17
import sys

# The opening of this snippet was cut off; a sys.path.append() call is
# reconstructed around the surviving path literal.
sys.path.append(
    '/Users/rileylittlefield/Desktop/notes/readingnotes/python-ml/data-science-from-scratch/12-exercises'
)
from data_split_for_model_training import split_data

from naive_bayes import NaiveBayesClassifier
from data_harvester import data
import random
import pdb
from collections import defaultdict

random.seed(0)
train_data, test_data = split_data(data, 0.75)
print("train_data_length = %s" % len(train_data))
print("test_data_length = %s" % len(test_data))

classifier = NaiveBayesClassifier()
# pdb.set_trace()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
subject, classification, predicted_prob = 0, 1, 2
for my_tuple in classified:
    is_spam = my_tuple[classification]
    predict_is_spam = (my_tuple[predicted_prob] > 0.5)
    # if predict_is_spam:
    #     print('hey ho!')
Example #18
File: run.py  Project: shrut1996/MiniML
X2 = titanic_data.iloc[:, 1:]
y2 = titanic_data.iloc[:, 0]

########################## Classification ##################################

X_train, X_test, y_train, y_test = train_test_split(X2,
                                                    y2,
                                                    test_size=0.2,
                                                    random_state=42)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

clf = NaiveBayesClassifier(type='Gaussian')

####### Convert X_train and X_test into an np array for Logistic Regression ######
# clf = LogisticRegression(num_steps=5000, regularisation='L2')

# clf1 = DecisionTree(max_depth=5, split_val_metric='mean', split_node_criterion='gini')
# clf = RandomForest(n_trees=10, sample_size=0.8, max_features=6,
#                    max_depth=5, split_val_metric='mean', split_node_criterion='gini')

##### Uses two decision trees and a single naive bayes here, while logistic regression is the meta-learner by default
# clf = Stacking([(clf, 1), (clf1, 2)])

# clf1 = BoostingDecisionTree(max_depth=5, split_val_metric='mean', split_node_criterion='gini')
# clf = AdaBoostClassifier(n_trees=100, learning_rate=1)

#### For Logistic Regression
Example #19
# get in/out file information
train_labels_file = input('Enter training file: ')
test_file = input('Enter test file: ')
out_file = input('Enter out file: ')

# corpus files are relative to these files
path_to_train_labels_file = '/'.join(train_labels_file.split('/')[0:-1])
path_to_test_file = '/'.join(test_file.split('/')[0:-1])

# helper to get doc tokens
get_doc_tokens = lambda handle: \
    list(map(stemmer.stem, nltk.word_tokenize(''.join(handle.readlines()))))

# training stage
print('Beginning training stage...')
classifier = NaiveBayesClassifier()
with open(train_labels_file) as train_handle:
    for line in train_handle:
        doc_file, cls = line.rstrip('\n').split(' ')

        with open(f'{path_to_train_labels_file}/{doc_file}',
                  'r') as doc_handle:
            classifier.train(get_doc_tokens(doc_handle), cls)

classifier.compile()

# validation stage
print('Beginning validation stage...')
with open(test_file, 'r') as test_handle:
    doc_filenames = test_handle.read().splitlines()
Example #20
File: main.py  Project: fsammart/ML2021
from naive_bayes import NaiveBayesClassifier
import pandas as pd

train_data = pd.read_excel("data/PreferenciasBritanicos.xlsx")
nb = NaiveBayesClassifier()

test_data = [[1, 0, 1, 1, 0], [0, 1, 1, 0, 1]]
test_data = pd.DataFrame(
    test_data, columns=["scones", "cerveza", "wiskey", "avena", "futbol"])

print("*** Train data ***\n\n", train_data, "\n")
print("*** Test data ***\n\n", test_data, "\n")

nb.train(train_data, "Nacionalidad")
prediction = nb.predict(test_data)

print("*** Prediction ***\n\n", prediction)
Example #21
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from naive_bayes import NaiveBayesClassifier


def compute_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


if __name__ == '__main__':
    X, y = make_classification(n_samples=1000, n_features=10, n_classes=2)
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    clf = NaiveBayesClassifier()
    clf.fit(X_train, Y_train)

    predictions = clf.predict(X_test)

    accuracy = compute_accuracy(Y_test, predictions)
    print("The accuracy of the model is: {}".format(accuracy))
Example #22
def test_arithmetic_mean():

    classifier = NaiveBayesClassifier()
    assert classifier.arithmetic_mean(numbers=[1, 2, 3, 4, 5, 6, 7]) == 4
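For completeness, the contract this test pins down is just the arithmetic mean; a stand-alone equivalent:

def arithmetic_mean(numbers):
    # Sum of the values divided by their count.
    return sum(numbers) / len(numbers)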
Example #23
print()
print(classification_report(y_test, predictions))
'''

### K-NEAREST NEIGHBORS ###
'''
from sklearn.datasets import load_iris
from knn import KNN

X, y = load_iris(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = KNN()

model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
'''

### NAIVE BAYES CLASSIFIER ###
from sklearn.datasets import load_wine
from naive_bayes import NaiveBayesClassifier

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = NaiveBayesClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
Example #24
def train(nb, file):
    correct = 0
    count = 0
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            count += 1
            lang, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted {0}, actual {1}".format(pred_lang, lang))
            correct += (pred_lang == lang)
    print("Accuracy: {0}".format(correct / count))

def test(nb, file):
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            _, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted '{0}' for '{1}'".format(pred_lang, sent))

if __name__ == "__main__":
    test_pass = True

    dir = "/opt/dropbox/17-18/473/project5/language-models"
    files = [os.path.abspath(os.path.join(dir, file)) for file in os.listdir(dir)]
    lang_file_pairs = { file.split(".")[0][-3:]: ModelReader(file).get() for file in files }
    nb = NaiveBayesClassifier(lang_file_pairs, verbose=True)

    if test_pass:
        test(nb, "/opt/dropbox/17-18/473/project5/test.txt")
    else:
        train(nb, "/opt/dropbox/17-18/473/project5/train.txt")
Example #25
class PimaIndiansDiabetes:
    """

    Works on the pima-indians-diabetes.csv dataset and interactively performs the following actions:\n
    1. Classify new data entered by user.\n
    2. Calculate the algorithm implementation accuracy.\n
    3. Show dataset description (pima-indians-diabetes.names file).\n
    4. Show dataset rows.

    """
    def __init__(self):

        self.dataset_filename = 'datasets/pima-indians-diabetes.csv'
        self.description_filename = 'datasets/pima-indians-diabetes.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)

    def data_preprocessing(self):
        """

        Converts class names (strings) to ints and class values to floats.

        Args:
            None.

        Returns:
            Nothing.

        """

        for i in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, i)

        self.nbc.map_class_names_to_ints(self.dataset,
                                         len(self.dataset[0]) - 1,
                                         numbers_already=True)

    def classify_data(self):
        """

        Creates a new row with values entered by the user, then classifies it into the proper
        class using the Naive Bayes Classifier algorithm.

        Args:
            None.

        Returns:
            Nothing.

        """

        print('\nEnter the data to be classified.\n')

        attributes = {
            'Number of times pregnant: ':
            0.0,
            'Plasma glucose concentration a 2 hours in an oral glucose tolerance test: ':
            0.0,
            'Diastolic blood pressure (mm Hg): ':
            0.0,
            'Triceps skin fold thickness (mm): ':
            0.0,
            '2-Hour serum insulin (mu U/ml): ':
            0.0,
            'Body mass index (weight in kg/(height in m)^2): ':
            0.0,
            'Diabetes pedigree function: ':
            0.0,
            'Age (years): ':
            0.0
        }

        for attr in attributes:

            correct_input = False

            while not correct_input:

                try:
                    attr_value = float(input(attr))
                    correct_input = True
                except ValueError:
                    print(
                        'Incorrect value! Please enter an integer or a floating point number.'
                    )

            attributes[attr] = attr_value

        print('\nEntered attributes:\n')

        for attr in attributes:
            print(f'{attr}{attributes[attr]}')

        print()

        confirm_sign = ''

        while confirm_sign not in ['y', 'Y', 'n', 'N']:
            confirm_sign = input('Confirm (y/n): ')

        if confirm_sign in ['n', 'N']:
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))

        # The original dataset represents class names as numbers,
        # so the labels need to be printed explicitly
        if label == 0:
            print('\nThe entered entity was classified as: Negative')
        elif label == 1:
            print('\nThe entered entity was classified as: Positive')
        else:
            raise ValueError(f'Unexpected class label: {label}')

    def calculate_accuracy(self, n_folds=5):
        """

        Calculates the algorithm's accuracy using the evaluate_algorithm() function.

        Args:
            n_folds (int)
                Number of folds used in the k-fold cross validation split algorithm.

        Returns:
            accuracy
                Calculated classifier accuracy in percent.

        """

        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)

        print(
            '\n\nCalculating the accuracy of the classifier using the pima-indians-diabetes.csv dataset...'
        )
        print('\nResampling: k-fold cross validation split')

        accuracy = (sum(scores) / float(len(scores)))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')

        return accuracy

    def show_dataset_description(self):
        """

        Prints the 'pima-indians-diabetes.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.description_filename, 'r') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            for row in csv_reader:
                print(''.join(row))

    def show_dataset_rows(self):
        """

        Prints the 'pima-indians-diabetes.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.dataset_filename, 'r') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """

        Creates the interactive menu from which the user can execute the actions handled
        by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.

        """

        seed(1)

        print('\n=================================')
        print('   Pima Indians Diabetes dataset')
        print('=================================')

        self.data_preprocessing()

        returned_from_function = True

        while True:

            if returned_from_function:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')

            returned_from_function = False

            choice = input('Choice: ')

            if choice not in ['1', '2', '3', '4', '5']:
                print('Wrong choice! Please choose option 1-5.')

            elif choice == '1':

                try:
                    self.classify_data()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '2':

                try:
                    self.calculate_accuracy()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '3':

                try:
                    self.show_dataset_description()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '4':

                try:
                    self.show_dataset_rows()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '5':
                break

            else:
                raise RuntimeError(f'Unhandled choice: {choice}')
Example #26
    def __init__(self):

        self.dataset_filename = 'datasets/pima-indians-diabetes.csv'
        self.description_filename = 'datasets/pima-indians-diabetes.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
Example #27
from naive_bayes import NaiveBayesClassifier
import csv

classifier = NaiveBayesClassifier(range(5))

with open("train 2.tsv", newline='') as phrasedata:
    tsvin = csv.reader(phrasedata, delimiter='\t')
    headers = next(tsvin)
    curriter = 1
    for (_, _, phrase, sentiment) in tsvin:
        classifier.add_example(phrase, int(sentiment))
        if curriter % 1000 == 0:
            print("another 1000")
        curriter += 1

classifier.sanitize_features()

print("done training")
with open("test.tsv", newline='') as phrasedata:
    tsvin = csv.reader(phrasedata, delimiter='\t')
    headers = next(tsvin)
    print("PhraseId,Sentiment")
    for (phraseid, _, phrase) in tsvin:
        label = classifier.predict(phrase)
        print("{},{}".format(phraseid, label))
Example #28
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from naive_bayes import NaiveBayesClassifier

# Upload Dataset
spams = pd.read_csv("spam.csv", engine="python")

# Clean the DataFrame
spams = spams.dropna(axis=1)
spams.columns = ["spam", "body"]
spams = spams[["body", "spam"]]

# Encode the label
spams["spam"] = LabelEncoder().fit_transform(spams["spam"])

emails = spams["body"]
labels = spams["spam"]

X_train, X_test, y_train, y_test = train_test_split(emails,
                                                    labels,
                                                    test_size=0.3)

train_data = pd.concat([X_train, y_train], axis=1)

# Train and classify
nc = NaiveBayesClassifier()
nc.train(train_data)
print(nc)
print(nc.classify("sign up today and win a prize"))
print(nc.classify("At what time would you like to meet"))

# Note: this type of model works better on small datasets
Example #29
class Iris:
    """

    Works on the iris.csv dataset and interactively performs the following actions:\n
    1. Classify new data entered by user.\n
    2. Calculate the algorithm implementation accuracy.\n
    3. Show dataset description (iris.names file).\n
    4. Show dataset rows.

    """
    def __init__(self):

        self.dataset_filename = 'datasets/iris.csv'
        self.description_filename = 'datasets/iris.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
        self.class_map = dict()

    def data_preprocessing(self):
        """

        Converts class names (strings) to ints and class values to floats.

        Args:
            None.

        Returns:
            Nothing.

        """

        seed(1)

        for i in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, i)

        self.class_map = self.nbc.map_class_names_to_ints(
            self.dataset,
            len(self.dataset[0]) - 1)

    def classify_data(self):
        """

        Creates a new row with values entered by the user, then classifies it into the proper
        class using the Naive Bayes Classifier algorithm.

        Args:
            None.

        Returns:
            Nothing.

        """

        print('\nEnter the data to be classified.\n')

        attributes = {
            'Sepal length [cm]: ': 0.0,
            'Sepal width [cm]: ': 0.0,
            'Petal length [cm]: ': 0.0,
            'Petal width [cm]: ': 0.0
        }

        for attr in attributes:

            correct_input = False

            while not correct_input:

                try:
                    attr_value = float(input(attr))
                    correct_input = True
                except ValueError:
                    print(
                        'Incorrect value! Please enter an integer or a floating point number.'
                    )

            attributes[attr] = attr_value

        print('\nEntered attributes:\n')

        for attr in attributes:
            print(f'{attr}{attributes[attr]}')

        print()

        confirm_sign = ''

        while confirm_sign not in ['y', 'Y', 'n', 'N']:
            confirm_sign = input('Confirm (y/n): ')

        if confirm_sign in ['n', 'N']:
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))

        for key, value in self.class_map.items():
            if value == label:
                print(f'\nThe entered entity was classified as: {key}')
                break

    def calculate_accuracy(self, n_folds=5):
        """

        Calculates the algorithm's accuracy using the evaluate_algorithm() function.

        Args:
            n_folds (int)
                Number of folds used in the k-fold cross validation split algorithm.

        Returns:
            accuracy
                Calculated classifier accuracy in percent.

        """

        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)

        print(
            '\n\nCalculating the accuracy of the classifier using the iris.csv dataset...'
        )
        print('\nResampling: k-fold cross validation split')

        accuracy = (sum(scores) / float(len(scores)))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')

        return accuracy

    def show_dataset_description(self):
        """

        Prints the 'iris.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.description_filename, 'r') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            for row in csv_reader:
                print(''.join(row))

    def show_dataset_rows(self):
        """

        Prints the 'iris.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.dataset_filename, 'r') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """

        Creates the interactive menu from which the user can execute the actions handled
        by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.

        """

        print('\n=================================')
        print('          Iris dataset')
        print('=================================\n')

        self.data_preprocessing()

        returned_from_function = True

        while True:

            if returned_from_function:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')

            returned_from_function = False

            choice = input('Choice: ')

            if choice not in ['1', '2', '3', '4', '5']:
                print('Wrong choice! Please choose option 1-5.')

            elif choice == '1':

                try:
                    self.classify_data()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '2':

                try:
                    self.calculate_accuracy()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '3':

                try:
                    self.show_dataset_description()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '4':

                try:
                    self.show_dataset_rows()
                    returned_from_function = True
                    continue
                except KeyboardInterrupt:
                    returned_from_function = True
                    continue

            elif choice == '5':
                break

            else:
                raise RuntimeError(f'Unhandled choice: {choice}')
Example #30
word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])

vector = []
labels = []
for example in word_vector:
    vector.append(example[0])
    labels.append(example[1])
print "Stage 1: Word Polarity"
print "training bayesian network"

words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
# print(bayes_vector)
NaiveBayesClassifier.train(bayes_vector)

#gnb =  BernoulliNB()
#gnb.fit(vector,labels)
#with open('classifier_bayes.pkl', 'wb') as fid:
#    cPickle.dump(gnb, fid)

print "training svm"

svc = SVM()
svm_vector = []
for v in vector:
    for i in range(0, len(v)):
        if v[i]:
            v[i] = 1.0
        else:
            v[i] = 0.0  # else branch truncated in the original; 0.0 is the obvious complement
Example #31
def find_features(document):
    # The function header was cut off; it is reconstructed here from the
    # fragment below, which references `words` and `features`.
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Making sure there is no bias
random.shuffle(featuresets)
print(len(featuresets))

training_set = featuresets[:3000]
testing_set = featuresets[3000:4000]

classifier = NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

save_classifier = open("LIMITED_PICKLES/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

# MNB_classifier = SklearnClassifier(MultinomialNB())
# MNB_classifier.train(training_set)
# print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

# ###############
# save_classifier = open("LIMITED_PICKLES/MNB_classifier5k.pickle","wb")
# pickle.dump(MNB_classifier, save_classifier)
Example #32
def main():
    os.system("clear")
    print "Sentiment Analysis by Luca Giacomel. Disclaimer: this very simple algorithm wont probably work, but it might be worth a try."
    
    def update_progress(progress,current_operation_message,p):
        df=2 #dimension factor, len of the graph = 100/df
        sys.stdout.write('\r[{0}{1}] {2}% (Page: {4}) Current operation: {3}\r\r'.format('#'*(progress/df)," "*(100/df-(progress/df)), progress,current_operation_message,p))
        sys.stdout.flush()
    
    load_from_hd="n"
    
    if os.path.exists("/tmp/db.bin") and os.path.exists("/tmp/neg.tweets") and os.path.exists("/tmp/pos.tweets"):
        proceed=raw_input("I found some tweets already stored, do you want me to use them [y=Yes | n=No | a=Append]? [y/N/a] ").lower()
        while proceed not in ["","y","n","a"]:
            proceed=raw_input("I found some tweets already stored, do you want me to use them? [y/N] ").lower()
        load_from_hd=proceed.lower()
            
    if load_from_hd=="y" or load_from_hd=="":
        test_tweets=[]
        nb=NaiveBayesClassifier(db_path="/tmp/db.bin",categories=['positive','negative'])
        print "Done. Read a db of %s words" % len(nb.db)
        search_value=raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keywords %s.." % search_value
        z=json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])
        
                    
    elif load_from_hd=="n" or load_from_hd=="a":
        pages_to_load=raw_input("How many pages should I load? [default=20] ")
        while 1:
            try:
                if pages_to_load=="":
                    pages_to_load=20
                    break
                pages_to_load=int(pages_to_load)
                break
            except:
                pages_to_load=raw_input("How many pages should I load? [default=20] ")
        
        if load_from_hd=="a":
            pos_tweets=json.load(open("/tmp/neg.tweets"))
            neg_tweets=json.load(open("/tmp/pos.tweets"))
        else:
            pos_tweets,neg_tweets=[],[]
        
        for p in range(1,pages_to_load+1):
            perc=int(float(p*100)/pages_to_load)
            isleep=0
            cycle=True
            while 1:
                try:
                    if cycle:
                        raw_pos_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":)"))).read())
                        raw_neg_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":("))).read()) 
                        if len(neg_tweets)<len(pos_tweets):
                            cycle=False
                    else:
                        raw_neg_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":("))).read()) 
                        raw_pos_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":)"))).read())
                        if len(neg_tweets)>len(pos_tweets):
                            cycle=True
                    raw_pos_tweets['results'],raw_neg_tweets['results']
                    time.sleep(1)
                    for i in raw_pos_tweets['results']:
                        if pos_tweets.count((i['text'],'positive'))==0:
                            pos_tweets.append((i['text'],'positive'))
                    for i in raw_neg_tweets['results']:
                        if neg_tweets.count((i['text'],'negative'))==0:
                            neg_tweets.append((i['text'],'negative'))
                    update_progress(perc, "Elements: %s positive, %s negative." % (len(pos_tweets),len(neg_tweets)),p)
                    break
                except:
                    update_progress(perc, "Failed to fetch the json, trying again in %s seconds" % 2**isleep ,p)
                    time.sleep(2**isleep)
                    isleep+=1
                    if 2**isleep>64:
                        update_progress(perc, "Load time >64sec. Skipping page.. "+str(p),p)
                        break                    
        update_progress(perc, "\n",p)
        open("/tmp/pos.tweets","w").write(json.dumps(pos_tweets))
        open("/tmp/neg.tweets","w").write(json.dumps(neg_tweets))

        training_start=time.time()
        
        index=min(len(pos_tweets),len(neg_tweets))
        test_tweets=[]
        search_value=raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keywords %s.." % search_value
        z=json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])
        print "Training the classifier. This might take a while, grab a coffe while I work."

        nb=NaiveBayesClassifier(db={},categories=['negative','positive'])
        nb.train(pos_tweets[:index]+neg_tweets[:index])        
        
        print "Done. Training based on a set of %s elements took %s seconds." % (index*2,time.time()-training_start)
    
    for tx in test_tweets:
        print "Tweet: "+OKBLUE+tx+ENDC
        r=nb.classify(tx.lower())
        if r=="positive":
            print "Result: "+OKGREEN+r+ENDC
        elif r=="negative":
            print "Result: "+FAIL+r+ENDC
        #else:
        #print "Result: "+WARNING+"neutral (was %s with accuracy %s)" % (r[0],r[1]) +ENDC
            
    nb.save_to_hard_disk()
    
    nb.show_most_informative()