def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    """Returns the mean and standard deviation of each feature"""
    dim = len(data[0])

    means = vector_mean(data)
    stdevs = [standard_deviation([vector[i] for vector in data])
              for i in range(dim)]

    return means, stdevs
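# A quick sanity check of scale on a tiny dataset -- a sketch, assuming the
# vector_mean and standard_deviation helpers from earlier are in scope and
# that standard_deviation uses the sample (n - 1) convention:
vectors = [[-3, -1, 1], [-1, 0, 1], [1, 1, 1]]
means, stdevs = scale(vectors)
assert means == [-1, 0, 1]     # feature-wise means
assert stdevs == [2, 1, 0]     # feature-wise standard deviations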
def cluster_means(k: int,
                  inputs: List[Vector],
                  assignments: List[int]) -> List[Vector]:
    # clusters[i] contains the inputs whose assignment is i
    clusters = [[] for i in range(k)]
    for input, assignment in zip(inputs, assignments):
        clusters[assignment].append(input)

    # if a cluster is empty, just use a random point
    return [vector_mean(cluster) if cluster else random.choice(inputs)
            for cluster in clusters]
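# A minimal sketch of one k-means "update" step using cluster_means; the
# points and assignments below are made up for illustration:
points = [[0.0, 0.0], [0.0, 2.0], [10.0, 10.0], [10.0, 12.0]]
assignments = [0, 0, 1, 1]     # which cluster each point belongs to
means = cluster_means(2, points, assignments)
assert means == [[0.0, 1.0], [10.0, 11.0]]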
def least_squares_fit(xs: List[Vector],
                      ys: Vector,
                      alpha: float,
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """Finds the beta that minimizes the sum of squared errors,
    assuming the model dot(x, beta)"""
    # start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([sqerror_ridge_gradient(x, y, guess, alpha)
                                    for x, y in zip(batch_xs, batch_ys)])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
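# A self-contained sketch of calling least_squares_fit -- assumes random,
# tqdm, and the gradient helpers above are importable. Each x is [1, feature]
# so that beta = [intercept, slope]; the data follows y = 5 + 2 * feature
# exactly, so with alpha = 0 (no ridge penalty) SGD can fit it closely.
random.seed(0)
xs = [[1.0, float(i)] for i in range(10)]
ys = [5 + 2 * x[1] for x in xs]

beta = least_squares_fit(xs, ys, alpha=0.0, num_steps=2000)
assert 4.9 < beta[0] < 5.1     # intercept should be about 5
assert 1.9 < beta[1] < 2.1     # slope should be about 2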
        # choose the last-merged of our clusters
        next_cluster = min(clusters, key=get_merge_order)
        # remove it from the list of clusters
        clusters = [c for c in clusters if c != next_cluster]
        # and add its children to the list (i.e., unmerge it)
        clusters.extend(get_children(next_cluster))

    # once we have enough clusters, return those
    return clusters

three_clusters = [get_values(cluster)
                  for cluster in generate_clusters(base_cluster, 3)]

from matplotlib import pyplot as plt

for i, cluster, marker, color in zip([1, 2, 3],
                                     three_clusters,
                                     ['D', 'o', '*'],
                                     ['r', 'g', 'b']):
    xs, ys = zip(*cluster)    # magic unzipping trick
    plt.scatter(xs, ys, color=color, marker=marker)

    # put a number at the mean of the cluster
    x, y = vector_mean(cluster)
    plt.plot(x, y, marker='$' + str(i) + '$', color='black')

plt.title("User Locations -- 3 Bottom-Up Clusters, Min")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()
def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    slope, intercept = theta
    predicted = slope * x + intercept    # the prediction of the model
    error = (predicted - y)              # error is predicted - actual
    squared_error = error ** 2           # we want to minimize the squared error
    grad = [2 * error * x, 2 * error]    # using its gradient
    return grad

# Start with random values for slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(5000):
    # Compute the mean of the gradients over all the (x, y) pairs
    # (inputs is assumed to be defined earlier, following y = 20x + 5)
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    # Take a step in that direction
    theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)

slope, intercept = theta
assert 19.9 < slope < 20.1      # slope should be about 20
assert 4.9 < intercept < 5.1    # intercept should be about 5

"""
##############################################################################################################
################# Split data into mini-batches and use gradient descent to fit models ########################
##############################################################################################################
"""

from typing import TypeVar, List, Iterator

T = TypeVar('T')
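# One way such a mini-batch helper might look, using the TypeVar above -- a
# sketch, not a fixed API; the name minibatches and the shuffle flag are this
# sketch's choices:
def minibatches(dataset: List[T],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[T]]:
    """Generates batch_size-sized minibatches from the dataset"""
    # slice boundaries 0, batch_size, 2 * batch_size, ...
    batch_starts = [start for start in range(0, len(dataset), batch_size)]
    if shuffle: random.shuffle(batch_starts)    # visit the batches in random order
    for start in batch_starts:
        yield dataset[start:start + batch_size]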
def de_mean(data: List[Vector]) -> List[Vector]:
    """Recenters the data to have mean 0 in every dimension"""
    mean = vector_mean(data)
    return [subtract(vector, mean) for vector in data]
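# A quick check of de_mean -- a sketch, assuming vector_mean and subtract
# from the linear-algebra helpers are in scope:
data = [[1.0, 2.0], [3.0, 4.0]]    # feature means are [2.0, 3.0]
recentered = de_mean(data)
assert recentered == [[-1.0, -1.0], [1.0, 1.0]]
assert vector_mean(recentered) == [0.0, 0.0]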