def cluster_data():
    """Plot k-means loss against the number of clusters, for 1 through 15.

    Collects the third element (index 2) of every value in the module-level
    ``pages`` mapping, hands those to ``kmeans.runkmeans_sklearn`` together
    with the candidate cluster counts, and plots the ``inertia_`` of each
    result against its cluster count.  The figure is written to
    ``kmeans_cluster_num_graph.png`` and then displayed.
    """
    feature_sets = []
    for name in pages:
        feature_sets.append(pages[name][2])

    fits = kmeans.runkmeans_sklearn(feature_sets, range(1, 16))

    # Plot in ascending cluster-count order regardless of mapping order.
    cluster_counts = list(fits.keys())
    cluster_counts.sort()
    losses = [fits[count].inertia_ for count in cluster_counts]
    plt.plot(cluster_counts, losses)

    plt.xlabel('Number of Clusters')
    plt.ylabel('Loss')
    plt.title('K-means Clustering\nLoss vs. Number of Clusters')

    # Save before show(): the figure may be gone once the window is closed.
    plt.savefig('kmeans_cluster_num_graph.png')
    plt.show()
def set_up_links_and_features(pages):
    print 'total articles:', len(pages)
    ref = {}
    examples = []
    for i, (page, value) in enumerate(pages.iteritems()):
        val = value[0]
        links = get_links_from_text(pages, val)
        if not isinstance(val, basestring):
            features = {}
        else:
            features = extract_features(val, links)
            if i % 1000 == 0:
                print 'i = ', i
        pages[page] = (val, links, features)
        ref[page] = i
        examples.append(features)

    kmeans_results = kmeans.runkmeans_sklearn(examples, [NUM_CLUSTERS])
    labs = kmeans_results[NUM_CLUSTERS].labels_
    for page, value in pages.iteritems():
        features = value[2]
        cluster = labs[ref[page]]
        for i in range(NUM_CLUSTERS):
            features['IN_CLUSTER_'+str(i)] = 1 if cluster == i else 0