def main():
    import glob, re

    # modify the path to wherever you've put the files
    path = 'spam_data/*/*'

    data: List[Message] = []

    # glob.glob returns every filename that matches the wildcarded path
    for filename in glob.glob(path):
        is_spam = "ham" not in filename

        # There are some garbage characters in the emails; the errors='ignore'
        # skips them instead of raising an exception.
        with open(filename, errors='ignore') as email_file:
            for line in email_file:
                if line.startswith("Subject:"):
                    # drop the "Subject:" prefix (lstrip would strip matching
                    # characters, not the literal prefix)
                    subject = line[len("Subject:"):].strip()
                    data.append(Message(subject, is_spam))
                    break  # done with this file

    import random
    from scratch.machine_learning import split_data

    random.seed(0)      # just so you get the same answers as me
    train_messages, test_messages = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_messages)

    from collections import Counter

    predictions = [(message, model.predict(message.text))
                   for message in test_messages]

    # Assume that spam_probability > 0.5 corresponds to a spam prediction
    # and count the combinations of (actual is_spam, predicted is_spam)
    confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                               for message, spam_probability in predictions)

    print(confusion_matrix)

    # Compute P(spam | token) under the trained model
    def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
        # We probably shouldn't call private methods, but it's for a good cause.
        prob_if_spam, prob_if_ham = model._probabilities(token)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

    words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

    print("spammiest_words", words[-10:])
    print("hammiest_words", words[:10])
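# --- Added example (not part of the original source) ---
# The Counter above is keyed by (actual is_spam, predicted is_spam), so its
# four entries are exactly the cells of a binary confusion matrix.  As a
# minimal sketch assuming that key layout, precision and recall fall out
# directly; the helper name and the sample counts below are made up for
# illustration.
from collections import Counter
from typing import Tuple

def precision_recall(confusion: Counter) -> Tuple[float, float]:
    """Return (precision, recall) for a Counter keyed by (actual, predicted)."""
    tp = confusion[(True, True)]    # spam correctly flagged as spam
    fp = confusion[(False, True)]   # ham incorrectly flagged as spam
    fn = confusion[(True, False)]   # spam that slipped through as ham
    return tp / (tp + fp), tp / (tp + fn)

# Hypothetical counts: precision = 80 / (80 + 20) = 0.8,
#                      recall    = 80 / (80 + 30), about 0.73
example = Counter({(False, False): 600, (True, True): 80,
                   (True, False): 30, (False, True): 20})
assert precision_recall(example) == (0.8, 80 / 110)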
def main():
    import csv
    import random
    from collections import defaultdict
    from typing import Dict, List, Tuple

    import requests

    # LabeledPoint, Vector, split_data, knn_classify, and plot are assumed to
    # be defined elsewhere in this module.

    # RETRIEVE DATA
    data = requests.get(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    )
    with open('iris.dat', 'w') as f:
        f.write(data.text)

    def parse_iris_row(row: List[str]) -> LabeledPoint:
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        label = row[-1].split("-")[-1]
        return LabeledPoint(measurements, label)

    with open('iris.dat', 'r') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader if row != []]

    # K NEAREST PREDICTION
    random.seed(12)
    iris_train, iris_test = split_data(iris_data, 0.70)

    confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
    num_correct = 0

    for iris in iris_test:
        predicted = knn_classify(5, iris_train, iris.point)
        actual = iris.label
        if predicted == actual:
            num_correct += 1
        confusion_matrix[(predicted, actual)] += 1

    pct_correct = num_correct / len(iris_test)
    print(pct_correct, confusion_matrix)

    # PLOT
    points_by_species: Dict[str, List[Vector]] = defaultdict(list)
    for iris in iris_data:
        points_by_species[iris.label].append(iris.point)

    plot(["sepal_length", "sepal_width", "petal_length", "petal_width"],
         3, points_by_species)
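# --- Added sketch (not from the original source) ---
# parse_iris_row above builds LabeledPoint values, and knn_classify reads
# .point and .label from them.  A minimal definition consistent with that
# usage (the actual one in the project may differ) is a NamedTuple:
from typing import List, NamedTuple

Vector = List[float]

class LabeledPoint(NamedTuple):
    point: Vector   # the four measurements
    label: str      # e.g. "virginica"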
def main():
    from typing import Dict
    import csv
    from collections import defaultdict

    def parse_iris_row(row: List[str]) -> LabeledPoint:
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        # class is e.g. "Iris-virginica"; we just want "virginica"
        label = row[-1].split("-")[-1]

        return LabeledPoint(measurements, label)

    with open('iris.data') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader if row]  # skip any blank rows

    # We'll also group just the points by species/label so we can plot them.
    points_by_species: Dict[str, List[Vector]] = defaultdict(list)
    for iris in iris_data:
        points_by_species[iris.label].append(iris.point)

    from matplotlib import pyplot as plt
    metrics = ['sepal length', 'sepal width', 'petal length', 'petal width']
    pairs = [(i, j) for i in range(4) for j in range(4) if i < j]
    marks = ['+', '.', 'x']  # we have 3 classes, so 3 markers

    fig, ax = plt.subplots(2, 3)

    for row in range(2):
        for col in range(3):
            i, j = pairs[3 * row + col]
            ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8)
            ax[row][col].set_xticks([])
            ax[row][col].set_yticks([])

            for mark, (species, points) in zip(marks, points_by_species.items()):
                xs = [point[i] for point in points]
                ys = [point[j] for point in points]
                ax[row][col].scatter(xs, ys, marker=mark, label=species)

    ax[-1][-1].legend(loc='lower right', prop={'size': 6})
    # plt.show()

    plt.savefig('im/iris_scatter.png')
    plt.gca().clear()

    import random
    from scratch.machine_learning import split_data

    random.seed(12)
    iris_train, iris_test = split_data(iris_data, 0.70)
    assert len(iris_train) == 0.7 * 150
    assert len(iris_test) == 0.3 * 150

    from typing import Tuple

    # track how many times we see (predicted, actual)
    confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
    num_correct = 0

    for iris in iris_test:
        predicted = knn_classify(5, iris_train, iris.point)
        actual = iris.label

        if predicted == actual:
            num_correct += 1

        confusion_matrix[(predicted, actual)] += 1

    pct_correct = num_correct / len(iris_test)
    print(pct_correct, confusion_matrix)

    import tqdm
    dimensions = range(1, 101)

    avg_distances = []
    min_distances = []

    random.seed(0)
    for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"):
        distances = random_distances(dim, 10000)      # 10,000 random pairs
        avg_distances.append(sum(distances) / 10000)  # track the average
        min_distances.append(min(distances))          # track the minimum

    min_avg_ratio = [min_dist / avg_dist
                     for min_dist, avg_dist in zip(min_distances, avg_distances)]
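# --- Added sketch (not from the original source) ---
# The curse-of-dimensionality loop above relies on random_distances, which is
# defined elsewhere.  One plausible definition, assuming Euclidean distance
# between pairs of random points in the dim-dimensional unit cube:
import math
import random
from typing import List

def random_point(dim: int) -> List[float]:
    """A point with dim coordinates drawn uniformly from [0, 1)."""
    return [random.random() for _ in range(dim)]

def random_distances(dim: int, num_pairs: int) -> List[float]:
    """Euclidean distances between num_pairs pairs of random points."""
    return [math.dist(random_point(dim), random_point(dim))
            for _ in range(num_pairs)]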
def knn_classify(k: int,
                 labeled_points: List[LabeledPoint],
                 new_point: Vector) -> str:
    # Order the labeled points from nearest to farthest
    by_distance = sorted(labeled_points,
                         key=lambda lp: distance(lp.point, new_point))

    # Look at the labels of the k closest points
    k_nearest_labels = [lp.label for lp in by_distance[:k]]

    # and let them vote.
    return majority_vote(k_nearest_labels)

import random
from scratch.machine_learning import split_data

random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)
assert len(iris_train) == 0.7 * 150
assert len(iris_test) == 0.3 * 150

from typing import Dict, Tuple
from collections import defaultdict

confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
num_correct = 0

for iris in iris_test:
    predicted = knn_classify(5, iris_train, iris.point)
    actual = iris.label

    if predicted == actual:
        num_correct += 1

    confusion_matrix[(predicted, actual)] += 1
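# --- Added sketch (not from the original source) ---
# knn_classify above delegates to majority_vote, which is not shown here.
# A common implementation (and the one this fragment appears to assume)
# takes the most frequent label and, on a tie, retries without the label
# of the farthest remaining point:
from collections import Counter
from typing import List

def majority_vote(labels: List[str]) -> str:
    """Assumes labels are ordered from nearest to farthest."""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count
                       for count in vote_counts.values()
                       if count == winner_count])

    if num_winners == 1:
        return winner                      # unique winner, so return it
    else:
        return majority_vote(labels[:-1])  # try again without the farthest

# The 'a' vs 'b' tie is broken by dropping the farthest label (the last 'a'):
assert majority_vote(['a', 'b', 'c', 'b', 'a']) == 'b'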
path = "/Users/dag/github/learning/python_da/dsfs/mycode/*/*" data: List[Message] = [] # glob.glob returns every filname that matches the wildcarded path for filename in glob.glob(path): is_spam = "ham" not in filename with open(filename, errors="ignore") as email_file: for line in email_file: if line.startswith("Subject:"): subject = line.lstrip("Subject:") data.append(Message(subject, is_spam)) break import random from scratch.machine_learning import split_data random.seed(0) train_messages, test_messages = split_data(data, 0.75) model = NaiveBayesClassifier(k = 2) model.train(train_messages) from collections import Counter predictions = [(message, model.predict(message.text)) for message in test_messages] confusion_matrix = Counter((message.is_spam, spam_probability > 0.5) for message, spam_probability in predictions) print(confusion_matrix)