def cluster_data():
    """Plot k-means loss against the number of clusters.

    Reads the module-level ``pages`` mapping, taking element ``[2]`` of
    every value as one example, and hands the examples to
    ``kmeans.runkmeans_sklearn`` for each cluster count from 1 to 15.
    The ``inertia_`` of each result is plotted against its cluster
    count; the figure is written to ``kmeans_cluster_num_graph.png``
    and then displayed.

    Returns nothing.
    """
    # Element [2] of each page record is used as that page's example.
    # NOTE(review): presumably the feature mapping stored by
    # set_up_links_and_features -- confirm that function has run first.
    feature_rows = [record[2] for record in pages.values()]

    # One result per candidate cluster count, keyed by that count.
    fits_by_count = kmeans.runkmeans_sklearn(feature_rows, range(1, 16))

    cluster_counts = sorted(fits_by_count.keys())
    losses = [fits_by_count[count].inertia_ for count in cluster_counts]

    plt.plot(cluster_counts, losses)
    plt.title('K-means Clustering\nLoss vs. Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Loss')

    # Save before show(): showing first can leave an empty figure behind.
    plt.savefig('kmeans_cluster_num_graph.png')
    plt.show()
def set_up_links_and_features(pages): print 'total articles:', len(pages) ref = {} examples = [] for i, (page, value) in enumerate(pages.iteritems()): val = value[0] links = get_links_from_text(pages, val) if not isinstance(val, basestring): features = {} else: features = extract_features(val, links) if i % 1000 == 0: print 'i = ', i pages[page] = (val, links, features) ref[page] = i examples.append(features) kmeans_results = kmeans.runkmeans_sklearn(examples, [NUM_CLUSTERS]) labs = kmeans_results[NUM_CLUSTERS].labels_ for page, value in pages.iteritems(): features = value[2] cluster = labs[ref[page]] for i in range(NUM_CLUSTERS): features['IN_CLUSTER_'+str(i)] = 1 if cluster == i else 0