def test_dist_all(self):
    """Exercise ``dist.dist_all`` on three small hand-computed inputs.

    The result of ``dist.dist_all`` is indexed with ``[0]`` and ``[1]``
    here, i.e. it is treated as a pair of matrices.
    """
    # Case 1: all-zero inputs; both elements of the result must be 2x2 zeros.
    zeros_a = np.array([[0.0], [0.0]])
    zeros_b = np.array([[0.0], [0.0]])
    all_zero = np.array([[0.0, 0.0], [0.0, 0.0]])
    for part in (0, 1):
        assert_array_equal(all_zero, dist.dist_all(zeros_a, zeros_b)[part])

    # Case 2: constant ones against zeros; every pairwise entry is 1.
    ones_a = np.array([[1.0], [1.0]])
    zeros_c = np.array([[0.0], [0.0]])
    all_one = np.array([[1.0, 1.0], [1.0, 1.0]])
    assert_array_equal(all_one, dist.dist_all(ones_a, zeros_c)[0])

    # Case 3: a matrix against an equal copy of itself, with the third
    # positional argument set to True (presumably the `rolling` flag that
    # other callers pass by keyword -- TODO confirm).
    left = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
    right = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
    off_diag = 2 / sqrt(29)
    wanted = np.array([[0.0, off_diag], [off_diag, 0.0]])
    assert_array_almost_equal(wanted, dist.dist_all(left, right, True)[0])
def main(tseries_fpath, test_fpath, cents_fpath):
    """Print the index of the closest centroid for each loaded series.

    Arguments
    ---------
    tseries_fpath, test_fpath:
        Paths forwarded unchanged to ``ioutil.load_series``.
    cents_fpath:
        Path of a text file readable by ``np.loadtxt`` holding the centroids.

    One index is printed per line, in the order of the columns of the
    centroid-to-series distance matrix.
    """
    X = ioutil.load_series(tseries_fpath, test_fpath)
    C = np.loadtxt(cents_fpath)

    # First element of the dist_all result is the distance matrix, indexed
    # as [centroid, series]; argmin over axis 0 picks the nearest centroid.
    dist_cents = dist.dist_all(C, X, rolling=True)[0]
    y_true = dist_cents.argmin(axis=0)

    for t in y_true:
        # Parenthesised form prints the same text on Python 2 and Python 3
        # (the bare `print t` statement is a SyntaxError on Python 3).
        print(t)
def cost(tseries, assign, centroids, dist_centroids=None):
    """Return the mean squared distance of each series to its centroid.

    Arguments
    ---------
    tseries: matrix whose first dimension is the number of series
    assign: sequence with the cluster index assigned to each series
    centroids: matrix of centroids; only used when `dist_centroids` is None
    dist_centroids: optional precomputed distances, indexed as
        [cluster, series]. A tuple (such as the pair that other callers of
        `dist_all` in this file unpack with ``[0]``) is also accepted; its
        first element is used.

    Returns
    -------
    The sum over all series i of ``dist_centroids[assign[i], i] ** 2``
    divided by the number of series.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # NOTE(review): other callers pass rolling=True; confirm whether the
        # non-rolling distance is really intended here.
        dist_centroids = dist_all(centroids, tseries)

    # Every other use of dist_all in this file indexes its result with [0],
    # so unwrap a (distances, shifts) pair instead of failing on tuple
    # indexing below.
    if isinstance(dist_centroids, tuple):
        dist_centroids = dist_centroids[0]

    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    cost_f = 0.0
    for i in range(num_series):
        cost_f += dist_centroids[assign[i], i] ** 2

    return cost_f / num_series
def cost(tseries, assign, centroids, dist_centroids=None):
    """Compute the clustering cost: mean squared series-to-centroid distance.

    Arguments
    ---------
    tseries: matrix; ``tseries.shape[0]`` is taken as the number of series
    assign: sequence giving, for each series, the index of its cluster
    centroids: centroid matrix, consulted only if `dist_centroids` is None
    dist_centroids: optional distances indexed as [cluster, series]; may
        also be a tuple whose first element is that matrix (the form other
        callers of `dist_all` in this file unpack with ``[0]``)

    Returns
    -------
    ``sum_i dist_centroids[assign[i], i] ** 2 / number of series``
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # NOTE(review): no rolling=True here, unlike the other dist_all
        # callers -- confirm this is intentional.
        dist_centroids = dist_all(centroids, tseries)

    # dist_all results are indexed with [0] everywhere else in this file;
    # accept that pair form rather than raising on `tuple[k, i]`.
    if isinstance(dist_centroids, tuple):
        dist_centroids = dist_centroids[0]

    squared = (dist_centroids[assign[i], i] ** 2 for i in range(num_series))
    # Start from 0.0 so the accumulator is a float, as in the plain loop.
    return sum(squared, 0.0) / num_series
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Return mean and standard deviation of between-cluster distances.

    For every series i, the distances from i to all series assigned to a
    different cluster are collected; the mean and standard deviation of the
    pooled values are returned.

    Arguments
    ---------
    tseries: matrix; ``tseries.shape[0]`` is the number of series
    assign: array with the cluster id of each series
    dists_all_pairs: optional precomputed pairwise distance matrix; when
        None it is taken from ``dist_all(tseries, tseries, rolling=True)[0]``

    Returns
    -------
    (mean, std) of the pooled distances (NaN when no pair of series lies in
    different clusters).
    """
    num_series = tseries.shape[0]
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    labels = np.asarray(assign)
    pairs = np.asarray(dists_all_pairs)[:num_series]

    # different[i, j] is True when series i and j are in different clusters.
    # Boolean-mask selection walks the matrix row by row, so the selected
    # values come out in the same order as a per-row loop would give.
    different = labels[:num_series, np.newaxis] != labels[np.newaxis, :]
    dists = pairs[different]

    return np.mean(dists), np.std(dists)
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Mean and standard deviation of distances across cluster borders.

    Pools, for each series, its distances to every series that carries a
    different cluster id, then summarises the pool.

    Arguments
    ---------
    tseries: matrix; its first dimension is the number of series
    assign: array of cluster ids, one per series
    dists_all_pairs: optional pairwise distance matrix; computed with
        ``dist_all(tseries, tseries, rolling=True)[0]`` when omitted

    Returns
    -------
    A ``(mean, std)`` tuple over the pooled distances.
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    pooled = []
    for row_idx in range(tseries.shape[0]):
        # Boolean mask of the series that are NOT in row_idx's cluster.
        outsiders = assign != assign[row_idx]
        pooled.extend(dists_all_pairs[row_idx][outsiders])

    average = np.mean(pooled)
    spread = np.std(pooled)
    return average, spread
def main(tseries_fpath, in_folder):
    """Measure how consistently cross-validation folds cluster the same ids.

    For every pair of folds (i, j) with j >= i, each cluster k of fold i is
    compared against all clusters of fold j, ordered by centroid distance.
    The number of ids shared with the closest cluster of fold j is added to
    the "agree" count; the ids shared with every other cluster are added to
    the "diff" count. Both are printed as fractions of their sum.

    Arguments
    ---------
    tseries_fpath: text file whose first whitespace-separated token on each
        line is the id of a series
    in_folder: folder containing ``fold-*/ksc`` sub-folders with
        ``cents.dat`` and ``assign.dat``, and ``fold-*/train.dat`` (read as
        a boolean mask over the ids)

    Raises
    ------
    ZeroDivisionError if no overlap at all was counted (e.g. no folds).
    """
    with open(tseries_fpath) as tseries_file:
        ids = np.array([line.split()[0] for line in tseries_file])

    folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc'))
    num_folders = len(folders)

    # Read every fold once up front instead of re-reading fold j's files for
    # each (i, j) pair.
    fold_data = []
    for folder in folders:
        base = os.path.dirname(folder)
        cents = np.loadtxt(os.path.join(folder, 'cents.dat'))
        train = np.loadtxt(os.path.join(base, 'train.dat'), dtype='bool')
        assign = np.loadtxt(os.path.join(folder, 'assign.dat'))
        fold_data.append((cents, train, assign))

    member_cache = {}

    def members(fold, cluster):
        # Set of ids that fold `fold` assigned to `cluster`, built on first
        # use and then reused across all comparisons.
        key = (fold, int(cluster))
        if key not in member_cache:
            _, train, assign = fold_data[fold]
            member_cache[key] = set(ids[train][assign == cluster])
        return member_cache[key]

    agree = 0
    diff = 0
    # NOTE(review): j starts at i, so each fold is also compared with
    # itself -- confirm that this is intended.
    for i in range(num_folders):
        Ci = fold_data[i][0]
        for j in range(i, num_folders):
            Cj = fold_data[j][0]
            dists = dist.dist_all(Ci, Cj, rolling=True)[0]
            argsrt = dists.argsort(axis=1)

            for k in range(argsrt.shape[0]):
                ids_k = members(i, k)
                overlaps = [len(ids_k.intersection(members(j, o)))
                            for o in argsrt[k]]
                # The first entry belongs to the closest centroid of fold j.
                agree += sum(overlaps[:1])
                diff += sum(overlaps[1:])

    print('AgreedProb = ', agree / (agree + diff))
    print('DisagreeProb = ', diff / (agree + diff))
def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all folds.

    Cluster ids differ between folds, so the centroids of every fold are
    mapped to the closest centroid of the first folder returned by the glob
    (the base). True and predicted labels of each fold are translated
    through that mapping, pooled, and reported. Predictions equal to -1 are
    treated as "not classified": they are left untranslated, counted, and
    excluded from the report.

    Arguments
    ---------
    tseries_fpath: unused by this function
    base_folder: folder containing ``fold-*`` sub-folders, each with
        ``ksc/cents.dat``, ``ksc/test_assign.dat`` and
        ``cls-res-fitted-50/pred.dat``

    Raises
    ------
    IndexError if `base_folder` holds no ``fold-*`` entry.
    """
    # NOTE(review): glob order is not sorted, so which fold acts as the
    # base may vary between runs/platforms.
    folders = glob.glob(os.path.join(base_folder, 'fold-*'))
    num_folders = len(folders)

    C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat'))

    # cluster_mapping[i][k] -> index of the base centroid closest to
    # centroid k of fold i.
    cluster_mapping = []
    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], 'ksc/cents.dat'))
        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)
        cluster_mapping.append({k: closest[k] for k in range(Ci.shape[0])})

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], 'ksc/test_assign.dat'))
        y_pred = np.loadtxt(os.path.join(folders[i],
                                         'cls-res-fitted-50/pred.dat'))

        mapping = cluster_mapping[i]
        for j in range(y_true.shape[0]):
            y_true[j] = mapping[y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = mapping[y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # The report over the unfiltered labels used to be computed here and
    # thrown away; only the filtered report below is ever shown.
    valid = y_pred_all != -1

    print()
    print('Using the centroids from folder: ', folders[0])
    print('Micro Aggregation of Folds:')
    print('%.3f fract of videos were not classified' %
          (sum(~valid) / y_pred_all.shape[0]))
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
def main(tseries_fpath, base_folder):
    """Aggregate per-fold predictions and print one classification report.

    Each fold has its own cluster numbering. Every centroid of every fold
    is therefore matched to the nearest centroid of the first folder found
    by the glob, and the fold's true/predicted labels are rewritten with
    that match before being pooled. A prediction of -1 means "not
    classified": it is kept as is, counted, and left out of the report.

    Arguments
    ---------
    tseries_fpath: unused by this function
    base_folder: folder containing ``fold-*`` sub-folders, each with
        ``ksc/cents.dat``, ``ksc/test_assign.dat`` and
        ``cls-res-fitted-50/pred.dat``

    Raises
    ------
    IndexError if `base_folder` holds no ``fold-*`` entry.
    """
    # NOTE(review): glob order is not sorted, so the reference fold may
    # differ between runs/platforms.
    folders = glob.glob(os.path.join(base_folder, "fold-*"))

    C_base = np.loadtxt(os.path.join(folders[0], "ksc/cents.dat"))

    # One dict per fold: fold-local cluster index -> base cluster index.
    # `range` replaces the Python-2-only `xrange`.
    cluster_mapping = []
    for folder in folders:
        Ci = np.loadtxt(os.path.join(folder, "ksc/cents.dat"))
        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)
        cluster_mapping.append({k: closest[k] for k in range(Ci.shape[0])})

    y_true_all = []
    y_pred_all = []
    for folder, mapping in zip(folders, cluster_mapping):
        y_true = np.loadtxt(os.path.join(folder, "ksc/test_assign.dat"))
        y_pred = np.loadtxt(os.path.join(folder, "cls-res-fitted-50/pred.dat"))

        for j in range(y_true.shape[0]):
            y_true[j] = mapping[y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = mapping[y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # A report over the unfiltered labels was previously computed here and
    # never used; it has been dropped.
    valid = y_pred_all != -1

    print()
    print("Using the centroids from folder: ", folders[0])
    print("Micro Aggregation of Folds:")
    print("%.3f fract of videos were not classified" %
          (sum(~valid) / y_pred_all.shape[0]))
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
def silhouette(tseries, assign, dists_all_pairs=None):
    """Return the mean silhouette-style score of a clustering.

    For every series i the score is ``(b - a) / max(a, b)`` where ``a`` is
    the mean distance from i to the series of its own cluster (i itself
    included, as its row entry is part of the selection) and ``b`` is the
    smallest mean distance from i to the series of any other cluster.

    Arguments
    ---------
    tseries: matrix; ``tseries.shape[0]`` is the number of series
    assign: array with the cluster id of each series
    dists_all_pairs: optional pairwise distance matrix; when None it is
        taken from ``dist_all(tseries, tseries, rolling=True)[0]``

    Returns
    -------
    The mean of the per-series scores (stored in single precision).
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    num_series = tseries.shape[0]
    sils = np.zeros(num_series, dtype='f')
    labels = set(assign)

    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for i in range(num_series):
        k = assign[i]
        dists_i = dists_all_pairs[i]
        intra = np.mean(dists_i[assign == k])

        # Stays infinite when there is no other cluster.
        min_inter = float('inf')
        for o in labels:
            if o != k:
                inter = np.mean(dists_i[assign == o])
                if inter < min_inter:
                    min_inter = inter

        sils[i] = (min_inter - intra) / max(intra, min_inter)

    return np.mean(sils)
def silhouette(tseries, assign, dists_all_pairs=None):
    """Average silhouette-style coefficient over all series.

    Per series: cohesion is the mean distance to the members of its own
    cluster (its own row entry is part of that selection); separation is
    the lowest mean distance to the members of any other cluster. The
    coefficient is ``(separation - cohesion) / max(cohesion, separation)``.

    Arguments
    ---------
    tseries: matrix; its first dimension is the number of series
    assign: array of cluster ids, one per series
    dists_all_pairs: optional pairwise distance matrix; computed with
        ``dist_all(tseries, tseries, rolling=True)[0]`` when omitted

    Returns
    -------
    Mean of the per-series coefficients (held in a single precision array).
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    n_rows = tseries.shape[0]
    scores = np.zeros(n_rows, dtype='f')
    cluster_ids = set(assign)

    for idx in range(n_rows):
        own = assign[idx]
        row = dists_all_pairs[idx]
        cohesion = np.mean(row[assign == own])

        separation = float('inf')
        for other in cluster_ids:
            if other == own:
                continue
            candidate = np.mean(row[assign == other])
            if candidate < separation:
                separation = candidate

        scores[idx] = (separation - cohesion) / max(cohesion, separation)

    return np.mean(scores)
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    This is the base of the KSC algorithm. It follows the same idea of a
    K-Means algorithm. Firstly, we assign time series to a new cluster based
    on the distance to the centroids. For each time series, the best shift
    that minimizes the distance to the closest centroid is computed. The
    assignment step is followed by an update step where new centroids are
    computed based on the new clustering (produced by the assignment step).

    Both steps above are repeated `n_iters` times. If this parameter is
    negative then the steps are repeated until convergence, that is, until
    no time series changes cluster between consecutive steps. With
    `n_iters` equal to zero no step is run and `assign`, `best_shift` and
    `cent_dists` are returned as None.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia, "K-means clustering"
       http://en.wikipedia.org/wiki/K-means_clustering
    '''
    num_clusters = initial_centroids.shape[0]
    centroids = initial_centroids

    # KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    # A negative `iters` never reaches zero, so only convergence stops it.
    while iters != 0 and not converged:
        # assign elements to new clusters
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)
        assign = cent_dists.argmin(axis=0)

        # For every series pick the shift that goes with its chosen
        # centroid; one vectorized lookup replaces the per-series loop
        # (which relied on the Python-2-only `xrange`).
        series_idx = np.arange(shifts.shape[1])
        best_shift = shifts[assign, series_idx].astype('i')

        # check if converged, if not compute new centroids
        if prev_assign is not None and np.array_equal(prev_assign, assign):
            converged = True
        else:
            centroids = _compute_centroids(tseries, assign, num_clusters,
                                           best_shift)

        prev_assign = assign
        iters -= 1

    return centroids, assign, best_shift, cent_dists
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures against the number of clusters.

    The series are sub-sampled 5 times (200 rows each); on every sub-sample
    the clustering is run 5 times for each number of clusters from 2 to 15.
    The mean of each measure over the 25 runs is plotted with an error bar
    obtained from ``hci(values, 0.95)``.

    Arguments
    ---------
    tseries_fpath: text file readable by ``np.genfromtxt``; the first
        column is dropped
    plot_foldpath: existing folder that receives ``bcv.pdf`` and
        ``cost.pdf``
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    sample_rows = np.arange(X.shape[0])

    n_subsamples = 5
    n_repeats = 5
    sample_size = 200
    first_k = 2
    clust_range = range(first_k, 16)
    n_clustering_vals = len(clust_range)
    n_runs = n_subsamples * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for _ in range(n_subsamples):
        np.random.shuffle(sample_rows)
        X_new = X[sample_rows[:sample_size]]
        # Pairwise distances are shared by all runs on this sub-sample.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            for k in clust_range:
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)
                col = k - first_k
                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            # One row per run; the row counter doubles as progress output.
            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)
    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD",
                 label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo",
                 label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs",
                 label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo",
                 label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()
def main(tseries_fpath, plot_foldpath):
    """Produce the ``bcv.pdf`` and ``cost.pdf`` cluster-quality plots.

    Five random sub-samples of 200 series are drawn; on each of them the
    clustering is run five times for every number of clusters in 2..15.
    For each measure returned by ``run_clustering`` the mean over the 25
    runs is plotted, with ``hci(values, .95)`` as the error bar.

    Arguments
    ---------
    tseries_fpath: text file readable by ``np.genfromtxt``; its first
        column is discarded
    plot_foldpath: existing folder in which the two PDF files are written
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    n_subsamples = 5
    n_repeats = 5
    sample_size = 200
    first_k = 2
    clust_range = range(first_k, 16)
    n_clustering_vals = len(clust_range)
    n_runs = n_subsamples * n_repeats

    # One row per run, one column per number of clusters.
    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` replaces the Python-2-only `xrange`.
    for i in range(n_subsamples):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:sample_size]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for j in range(n_repeats):
            for k in clust_range:
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, k - first_k] = intra
                inter_array[r, k - first_k] = inter
                bcvs_array[r, k - first_k] = bcv
                costs_array[r, k - first_k] = cost

            # Advance to the next result row and report progress.
            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for k in clust_range:
        j = k - first_k
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt='gD',
                 label='Inter Cluster', yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt='bo',
                 label='BetaCV', yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt='rs',
                 label='Intra Cluster', yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt='bo',
                 label='Cost', yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    This is the base of the KSC algorithm. It follows the same idea of a
    K-Means algorithm. Firstly, we assign time series to a new cluster based
    on the distance to the centroids. For each time series, the best shift
    that minimizes the distance to the closest centroid is computed. The
    assignment step is followed by an update step where new centroids are
    computed based on the new clustering (produced by the assignment step).

    Both steps above are repeated `n_iters` times. If this parameter is
    negative then the steps are repeated until convergence, that is, until
    no time series changes cluster between consecutive steps. If it is zero
    the loop is skipped and `assign`, `best_shift` and `cent_dists` come
    back as None.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia, "K-means clustering"
       http://en.wikipedia.org/wiki/K-means_clustering
    '''
    num_clusters = initial_centroids.shape[0]
    centroids = initial_centroids

    # KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    # With a negative `iters` the counter never hits zero, so the loop only
    # ends once the assignment stops changing.
    while iters != 0 and not converged:
        # assign elements to new clusters
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)
        assign = cent_dists.argmin(axis=0)

        # Shift of each series relative to the centroid it was assigned to,
        # looked up in one indexing operation instead of filling an
        # uninitialised array element by element.
        columns = np.arange(shifts.shape[1])
        best_shift = shifts[assign, columns].astype('i')

        # check if converged, if not compute new centroids
        unchanged = (prev_assign is not None and
                     np.array_equal(prev_assign, assign))
        if unchanged:
            converged = True
        else:
            centroids = _compute_centroids(tseries, assign, num_clusters,
                                           best_shift)

        prev_assign = assign
        iters -= 1

    return centroids, assign, best_shift, cent_dists
Z = preprocessing.StandardScaler().fit_transform(T) km = cluster.MiniBatchKMeans(n_clusters=num_clusters) km = km.fit(Z) D = km.transform(Z) return D if __name__ == '__main__': X_train, T12_train, hosts_train = myio.read_features(test=False) Y_train = myio.read_response_train() k = 50 print('K-means') D = transform_km(T12_train, k) X_train_new = np.hstack((D, X_train)) model = OLS() model.fit(X_train_new, Y_train) print(k, np.sqrt(model.G.mean(axis=0))) print('KSC') C = np.genfromtxt('ksc-results/cents_visits_%d.dat' % k, dtype='d') T_nolog = np.asarray(np.exp(T12_train) - 1, order='C') D = dist_all(C, T_nolog, rolling=True)[0].T X_train_new = np.hstack((D, X_train)) model = OLS() model.fit(X_train_new, Y_train) print(k, np.sqrt(model.G.mean(axis=0)))