Example #1
def main():
    import glob, re

    # modify the path to wherever you've put the files
    path = 'spam_data/*/*'

    data: List[Message] = []

    # glob.glob returns every filename that matches the wildcarded path
    for filename in glob.glob(path):
        is_spam = "ham" not in filename

        # There are some garbage characters in the emails, the errors='ignore'
        # skips them instead of raising an exception.
        with open(filename, errors='ignore') as email_file:
            for line in email_file:
                if line.startswith("Subject:"):
                    # slice off the "Subject:" prefix (lstrip strips characters, not a prefix)
                    subject = line[len("Subject:"):].strip()
                    data.append(Message(subject, is_spam))
                    break  # done with this file

    import random
    from scratch.machine_learning import split_data

    random.seed(0)  # just so you get the same answers as me
    train_messages, test_messages = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_messages)

    from collections import Counter

    predictions = [(message, model.predict(message.text))
                   for message in test_messages]

    # Assume that spam_probability > 0.5 corresponds to spam prediction
    # and count the combinations of (actual is_spam, predicted is_spam)
    confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                               for message, spam_probability in predictions)

    print(confusion_matrix)

    # compute P(spam | token) under the trained model
    def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
        # We probably shouldn't call private methods, but it's for a good cause.
        prob_if_spam, prob_if_ham = model._probabilities(token)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

    words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

    print("spammiest_words", words[-10:])
    print("hammiest_words", words[:10])
Example #2
def main():
    import glob, re

    # set the path to your own folder
    path = 'spam_data/*/*'

    data: List[Message] = []

    # glob.glob returns every filename that matches the given path
    for filename in glob.glob(path):
        is_spam = "ham" not in filename

        # There are some unusual characters in the emails; errors='ignore'
        # skips them instead of raising an exception.
        with open(filename, errors='ignore') as email_file:
            for line in email_file:
                if line.startswith("Subject:"):
                    # slice off the "Subject:" prefix (lstrip strips characters, not a prefix)
                    subject = line[len("Subject:"):].strip()
                    data.append(Message(subject, is_spam))
                    break  # done with this file

    import random
    from scratch.machine_learning import split_data

    random.seed(0)  # so you get reproducible results
    train_messages, test_messages = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_messages)

    from collections import Counter

    predictions = [(message, model.predict(message.text))
                   for message in test_messages]

    # Assume that spam_probability > 0.5 corresponds to a spam prediction
    # and count the combinations of (actual is_spam, predicted is_spam).
    confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                               for message, spam_probability in predictions)

    print(confusion_matrix)

    def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
        # We shouldn't really call a private method, but it's for a good cause.
        prob_if_spam, prob_if_ham = model._probabilities(token)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

    words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

    print("spammiest_words", words[-10:])
    print("hammiest_words", words[:10])
Example #4
def main():
    # RETRIEVE DATA

    data = requests.get(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    )

    with open('iris.dat', 'w') as f:
        f.write(data.text)

    def parse_iris_row(row: List[str]) -> LabeledPoint:
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        label = row[-1].split("-")[-1]

        return LabeledPoint(measurements, label)

    with open('iris.dat', 'r') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader if row != []]

    # K NEAREST PREDICTION
    random.seed(12)
    iris_train, iris_test = split_data(iris_data, 0.70)

    confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
    num_correct = 0

    for iris in iris_test:
        predicted = knn_classify(5, iris_train, iris.point)
        actual = iris.label

        if predicted == actual:
            num_correct += 1

        confusion_matrix[(predicted, actual)] += 1

    pct_correct = num_correct / len(iris_test)
    print(pct_correct, confusion_matrix)

    # PLOT

    points_by_species: Dict[str, List[Vector]] = defaultdict(list)
    for iris in iris_data:
        points_by_species[iris.label].append(iris.point)

    plot(["sepal_length", "sepal_width", "petal_lenght", "petal_width"], 3, points_by_species)
Example #5
def main():
    from typing import Dict
    import csv
    from collections import defaultdict

    def parse_iris_row(row: List[str]) -> LabeledPoint:
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        # class is e.g. "Iris-virginica"; we just want "virginica"
        label = row[-1].split("-")[-1]

        return LabeledPoint(measurements, label)

    with open('iris.data') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader if row]  # skip any blank lines

    # We'll also group just the points by species/label so we can plot them.
    points_by_species: Dict[str, List[Vector]] = defaultdict(list)
    for iris in iris_data:
        points_by_species[iris.label].append(iris.point)

    from matplotlib import pyplot as plt
    metrics = ['sepal length', 'sepal width', 'petal length', 'petal width']
    pairs = [(i, j) for i in range(4) for j in range(4) if i < j]
    marks = ['+', '.', 'x']  # we have 3 classes, so 3 markers

    fig, ax = plt.subplots(2, 3)

    for row in range(2):
        for col in range(3):
            i, j = pairs[3 * row + col]
            ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8)
            ax[row][col].set_xticks([])
            ax[row][col].set_yticks([])

            for mark, (species, points) in zip(marks,
                                               points_by_species.items()):
                xs = [point[i] for point in points]
                ys = [point[j] for point in points]
                ax[row][col].scatter(xs, ys, marker=mark, label=species)

    ax[-1][-1].legend(loc='lower right', prop={'size': 6})
    # plt.show()

    plt.savefig('im/iris_scatter.png')
    plt.gca().clear()

    import random
    from scratch.machine_learning import split_data

    random.seed(12)
    iris_train, iris_test = split_data(iris_data, 0.70)
    assert len(iris_train) == 0.7 * 150
    assert len(iris_test) == 0.3 * 150

    from typing import Tuple

    # track how many times we see (predicted, actual)
    confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
    num_correct = 0

    for iris in iris_test:
        predicted = knn_classify(5, iris_train, iris.point)
        actual = iris.label

        if predicted == actual:
            num_correct += 1

        confusion_matrix[(predicted, actual)] += 1

    pct_correct = num_correct / len(iris_test)
    print(pct_correct, confusion_matrix)

    import tqdm
    dimensions = range(1, 101)

    avg_distances = []
    min_distances = []

    random.seed(0)
    for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"):
        distances = random_distances(dim, 10000)  # 10,000 random pairs
        avg_distances.append(sum(distances) / 10000)  # track the average
        min_distances.append(min(distances))  # track the minimum

    min_avg_ratio = [
        min_dist / avg_dist
        for min_dist, avg_dist in zip(min_distances, avg_distances)
    ]
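
The curse-of-dimensionality loop at the end calls random_distances, which isn't defined in the snippet. One plausible stand-in, assuming it measures Euclidean distances between pairs of points drawn uniformly from the unit cube of the given dimension:

import math
import random
from typing import List

Vector = List[float]

def random_point(dim: int) -> Vector:
    """A point drawn uniformly from the dim-dimensional unit cube."""
    return [random.random() for _ in range(dim)]

def random_distances(dim: int, num_pairs: int) -> List[float]:
    """Euclidean distances between num_pairs random pairs of points."""
    def dist(v: Vector, w: Vector) -> float:
        return math.sqrt(sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w)))
    return [dist(random_point(dim), random_point(dim))
            for _ in range(num_pairs)]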
Example #7
def knn_classify(k: int,
                 labeled_points: List[LabeledPoint],
                 new_point: Vector) -> str:
    # order the labeled points from nearest to farthest
    by_distance = sorted(labeled_points,
                         key=lambda lp: distance(lp.point, new_point))

    # look at the labels of the k closest points
    k_nearest_labels = [lp.label for lp in by_distance[:k]]

    # and let them vote
    return majority_vote(k_nearest_labels)


import random
from scratch.machine_learning import split_data

random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)
assert len(iris_train) == 0.7 * 150
assert len(iris_test) == 0.3 * 150

from typing import Tuple

confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
num_correct = 0

for iris in iris_test:
    predicted = knn_classify(5, iris_train, iris.point)
    actual = iris.label

    if predicted == actual:
        num_correct += 1
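
Example #7 delegates to majority_vote, whose definition the excerpt does not include. Here is a sketch of a tie-aware version consistent with that usage, assuming the labels come in nearest-first order and ties are broken by dropping the farthest neighbor and revoting:

from collections import Counter
from typing import List

def majority_vote(labels: List[str]) -> str:
    """Assumes labels are ordered from nearest to farthest."""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count
                       for count in vote_counts.values()
                       if count == winner_count])
    if num_winners == 1:
        return winner                       # unique winner, done
    else:
        return majority_vote(labels[:-1])   # drop the farthest point and try again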
Example #8
def main():
    from typing import Dict
    import csv
    from collections import defaultdict

    def parse_iris_row(row: List[str]) -> LabeledPoint:
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        # class is e.g. "Iris-virginica"; we only want "virginica"
        label = row[-1].split("-")[-1]

        return LabeledPoint(measurements, label)

    with open('iris.data') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader if row]  # skip any blank lines

    # Group the points by species so we can plot them.
    points_by_species: Dict[str, List[Vector]] = defaultdict(list)
    for iris in iris_data:
        points_by_species[iris.label].append(iris.point)

    from matplotlib import pyplot as plt
    metrics = ['sepal length', 'sepal width', 'petal length', 'petal width']
    pairs = [(i, j) for i in range(4) for j in range(4) if i < j]
    marks = ['+', '.', 'x']  # 3 markers for the 3 classes

    fig, ax = plt.subplots(2, 3)

    for row in range(2):
        for col in range(3):
            i, j = pairs[3 * row + col]
            ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8)
            ax[row][col].set_xticks([])
            ax[row][col].set_yticks([])

            for mark, (species, points) in zip(marks,
                                               points_by_species.items()):
                xs = [point[i] for point in points]
                ys = [point[j] for point in points]
                ax[row][col].scatter(xs, ys, marker=mark, label=species)

    ax[-1][-1].legend(loc='lower right', prop={'size': 6})
    # plt.show()  # showing first would leave savefig below with an empty figure

    plt.savefig('im/iris_scatter.png')
    plt.gca().clear()

    import random
    from scratch.machine_learning import split_data

    random.seed(12)
    iris_train, iris_test = split_data(iris_data, 0.70)
    assert len(iris_train) == 0.7 * 150
    assert len(iris_test) == 0.3 * 150

    from typing import Tuple

    # track how many times we see each (predicted, actual) pair
    confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
    num_correct = 0

    for iris in iris_test:
        predicted = knn_classify(5, iris_train, iris.point)
        actual = iris.label

        if predicted == actual:
            num_correct += 1

        confusion_matrix[(predicted, actual)] += 1

    pct_correct = num_correct / len(iris_test)
    print(pct_correct, confusion_matrix)

    import tqdm
    dimensions = range(1, 101)

    avg_distances = []
    min_distances = []

    random.seed(0)
    for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"):
        distances = random_distances(dim, 10000)      # 10,000 random pairs
        avg_distances.append(sum(distances) / 10000)  # track the average
        min_distances.append(min(distances))          # track the minimum

    min_avg_ratio = [
        min_dist / avg_dist
        for min_dist, avg_dist in zip(min_distances, avg_distances)
    ]
Example #9
path = "/Users/dag/github/learning/python_da/dsfs/mycode/*/*"

data: List[Message] = []
# glob.glob returns every filename that matches the wildcarded path
for filename in glob.glob(path):
    is_spam = "ham" not in filename

    with open(filename, errors="ignore") as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                # slice off the "Subject:" prefix (lstrip strips characters, not a prefix)
                subject = line[len("Subject:"):].strip()
                data.append(Message(subject, is_spam))
                break

import random
from scratch.machine_learning import split_data

random.seed(0)
train_messages, test_messages = split_data(data, 0.75)

model = NaiveBayesClassifier(k=2)
model.train(train_messages)

from collections import Counter

predictions = [(message, model.predict(message.text))
               for message in test_messages]
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                            for message, spam_probability in predictions)

print(confusion_matrix)
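
Since the Counter above is keyed by (actual is_spam, predicted is_spam), precision and recall can be read straight off it; the tp/fp/fn names below are just illustrative:

tp = confusion_matrix[(True, True)]     # spam correctly flagged as spam
fp = confusion_matrix[(False, True)]    # ham wrongly flagged as spam
fn = confusion_matrix[(True, False)]    # spam that slipped through as ham

precision = tp / (tp + fp)   # of the messages flagged as spam, how many really were
recall = tp / (tp + fn)      # of the actual spam, how much was caught
print("precision", precision, "recall", recall)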