def cluster_means(k: int, imputs: List[Vector], assignments: List[int]) -> List[Vector]:
    """Return one representative vector for each of the k clusters.

    Args:
        k: number of clusters; every assignment is used as an index into a
            list of k buckets.
        imputs: the data points. The name is a misspelling of "inputs" that
            is kept as-is so keyword callers are not broken.
        assignments: assignments[j] is the cluster index of imputs[j]. The
            two sequences are paired with zip, so extra items in the longer
            one are ignored.

    Returns:
        A list of k vectors. Entry i is vector_mean of the points assigned to
        cluster i, or a randomly chosen input point if cluster i is empty.

    Raises:
        IndexError: if an assignment is not a valid index into the k buckets,
            or if a cluster is empty and imputs is empty as well.
    """
    # cluster i contains the inputs whose assignment is i
    clusters = [[] for _ in range(k)]
    for point, assignment in zip(imputs, assignments):
        clusters[assignment].append(point)

    # if a cluster is empty then just use a random point
    return [vector_mean(cluster) if cluster else random.choice(imputs)
            for cluster in clusters]
def least_squares_fit(xs: List[Vector], ys: Vector, alpha: float, learning_rate: float = 0.001, num_steps: int = 1000, batch_size: int = 1) -> Vector:
    """Search for the beta of the model dot(x, beta) by minibatch gradient descent.

    Args:
        xs: input vectors; the first one determines the length of beta.
        ys: target values, paired with xs by position.
        alpha: forwarded unchanged to sqerror_ridge_gradient (presumably the
            weight of the ridge penalty; confirm against that helper).
        learning_rate: step size; it is negated before being handed to
            gradient_step.
        num_steps: number of full passes over the data.
        batch_size: number of consecutive points averaged into each update.

    Returns:
        The coefficient vector after the last update.
    """
    # random starting point, one coefficient per feature of the first input
    beta = [random.random() for _ in xs[0]]
    step_size = -learning_rate

    progress = tqdm.trange(num_steps, desc = "least squares fit")
    for _ in progress:
        for lo in range(0, len(xs), batch_size):
            window = slice(lo, lo + batch_size)
            per_point_gradients = [
                sqerror_ridge_gradient(x_i, y_i, beta, alpha)
                for x_i, y_i in zip(xs[window], ys[window])
            ]
            # one update per batch, using the mean of the batch's gradients
            beta = gradient_step(beta, vector_mean(per_point_gradients), step_size)

    return beta
def train(self, inputs):
    """Fit self.means to inputs, repeating assign/update until assignments settle.

    Args:
        inputs: the points to cluster; must contain at least self.k items
            because the initial means are drawn with random.sample.

    Returns:
        None. The result is left in self.means.

    Reads self.k and self.classify (presumably the index of the closest
    mean; confirm against its definition) and overwrites self.means.
    """
    # choose k random points as the initial means
    self.means = random.sample(inputs, self.k)
    assignments = None

    while True:
        # Find new assignments. This must be a real list: in Python 3 map()
        # returns a one-shot iterator, which never compares equal to the
        # previous round's result (so the loop would never terminate) and is
        # exhausted after the first pass over it below.
        new_assignments = [self.classify(point) for point in inputs]

        # if no assignments have changed, we're done
        if assignments == new_assignments:
            return

        # otherwise keep the new assignments
        assignments = new_assignments

        # and compute new means based on the new assignments
        for i in range(self.k):
            # find all the points assigned to cluster i
            i_points = [p for p, a in zip(inputs, assignments) if a == i]

            # an empty cluster keeps its previous mean (avoids dividing by 0)
            if i_points:
                self.means[i] = vector_mean(i_points)
predicted = slope * x + intercept # prediction of a linear model error = (predicted - y) squared_error = error**2 # minimize squared error grad = [2 * error * x, 2 * error] # using its gradient return grad from vector_operations import vector_mean # start with a random slope and intercept theta = [random.uniform(-1, 1), random.uniform(-1, 1)] learning_rate = 0.001 for epoch in range(5000): # compute mean of the gradients grad = vector_mean([linear_gradient(x, y, theta) for x, y in input]) # Take a step in that direction theta = gradient_step(theta, grad, -learning_rate) print(epoch, theta) slope, intercept = theta assert 19.9 < slope < 20.1 # slope should be close to 20 assert 4.9 < intercept < 5.1 # intercept should be close to 5 """Let's solve the above problem in minibatches""" from typing import TypeVar, List, Iterator T = TypeVar('T') # this allows us to type generic functions def minibatches(dataset: List[T], batch_size=int,
# choose the last merged of our clusters next_cluster = min(clusters, key = get_merge_order) clusters = [c for c in clusters if c != next_cluster] # and add its children to the list (i.e. unmerge it) clusters.extend(get_children(next_cluster)) # once we have enough clusters, return those return clusters three_clusters = [get_values(cluster) for cluster in generate_clusters(base_cluster, 3)] from matplotlib import pyplot as plt for i, cluster, marker, color in zip([1,2,3], three_clusters, ['D','o', '*'], ['r','g','b']): xs, ys = zip(*cluster) # magic unzipping trick plt.scatter(xs, ys, color = color, marker = marker) # put a number at the mean of a cluster x,y = vector_mean(cluster) plt.plot(x,y, marker = '$' + str(i) + '$', color = 'black') plt.title("User Locations -- 3 Bottom-up Clusters, Min") plt.xlabel("blocks east of city center") plt.ylabel("blocks north of city center") plt.show()
guess = [random.random() for _ in xs[0]] from linear_regression import total_sum_of_squares; def multiple_r_squared(xs: List[Vector], ys: Vector, beta: Vector) -> float: sum_of_squared_errors = sum(error(x,y,beta) ** 2 for x,y in zip(xs,ys)) return 1.0 - sum_of_squared_errors/total_sum_of_squares(ys) print(multiple_r_squared(inputs,daily_minutes_good,beta)) for _ in tqdm.trange(num_steps, desc = "least squares fit"): for start in range(0,len(xs), batch_size): batch_xs = xs[start:start+batch_size] batch_ys = ys[start:start+batch_size] gradient = vector_mean([sqerror_gradient(x,y,guess) for x,y in zip(batch_xs, batch_ys)]) guess = gradient_step(guess, gradient, -learning_rate) return guess num_friends = [100.0,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] daily_minutes = 
[1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] daily_hours = [dm / 60 for dm in daily_minutes] outlier = num_friends.index(100) # index of outlier num_friends_good = [x for i, x in enumerate(num_friends) if i != outlier] daily_minutes_good = [x