예제 #1
0
    # data_set, n_clusters  = 'math_garden-multiplication', 3
    # data_set, n_clusters  = 'math_garden-addition', 3
    # data_set, n_clusters  = 'math_garden-subtraction', 3
    # data_set, n_clusters  = 'math_garden-all', 3
    # Load the answers and items tables from pickles whose file names are
    # derived from ``data_set``.
    answers = pd.read_pickle('data/{}-answers.pd'.format(data_set))
    items = pd.read_pickle('data/{}-items.pd'.format(data_set))
    # Distinct values of the 'concept' column; a concept's position in this
    # list is used below as its numeric ground-truth label.
    true_cluster_names = list(items['concept'].unique())

    print(len(answers), len(items))

    # Choice of projection and similarity measure used for this plot.
    projection = tsne
    similarity, euclid = similarity_pearson, True

    # Similarity structure computed from the answers, then projected to the
    # two coordinate sequences that get plotted.
    X = similarity(answers)
    xs, ys = projection(X, euclid=euclid, perplexity=10)

    # The index of X supplies the item ids used to look up item metadata.
    items_ids = X.index
    # ``DataFrame.get_value`` was deprecated in pandas 0.21 and removed in
    # 1.0; ``.at`` is the supported label-based scalar accessor.
    ground_truth = np.array([
        true_cluster_names.index(items.at[item, 'concept'])
        for item in items_ids
    ])

    # The true concept drives both the label and the marker shape here.
    plot_clustering(
        items_ids, xs, ys,
        labels=ground_truth,
        texts=[items.at[item, 'name'] for item in items_ids],
        shapes=ground_truth,
    )


# Disabled scratch block: the ``if False:`` guard means this never runs.
# When enabled it would read a semicolon-separated CSV into ``df`` and
# print the resulting DataFrame.
if False:
    df = pd.read_csv('tsne-prid-jmena-data.csv', sep=';')
    print(df)
예제 #2
0
            similarities_names.append("{} -> {}".format(f.__name__.replace('similarity_', ''), g.__name__.replace('similarity_', '')))
        else:
            similarities.append(lambda x, f=f: f(x, cache=data_set + str(modificator)))
            similarities_names.append(f.__name__.replace('similarity_', ''))

# For every similarity measure: cluster the items with k-means, project them
# to two coordinates, and draw one subplot comparing the predicted clusters
# (passed as ``labels``) with the true concepts (passed as ``shapes``).
if True:
    euclid = True
    clusters = []
    # Column count of the 2-row subplot grid. Floor division keeps it an
    # int: under Python 3, ``/`` yields a float, which matplotlib's
    # ``subplot`` does not accept as a grid dimension.
    n_cols = len(similarities) // 2 + 1
    for i, similarity in enumerate(similarities):
        print(similarities_names[i])
        X = similarity(answers)
        labels = kmeans(X, concepts=n_clusters, euclid=euclid)
        clusters.append(labels)

        # The index of X supplies the item ids used to look up item metadata.
        items_ids = X.index
        xs, ys = projection(X, euclid=euclid)
        # ``DataFrame.get_value`` was deprecated in pandas 0.21 and removed
        # in 1.0; ``.at`` is the supported label-based scalar accessor.
        ground_truth = [
            true_cluster_names.index(items.at[item, 'concept'])
            for item in items_ids
        ]

        plt.subplot(2, n_cols, i + 1)
        plt.title(similarities_names[i])
        plot_clustering(
            items_ids, xs, ys,
            labels=labels,
            texts=[items.at[item, 'name'] for item in items_ids],
            shapes=ground_truth,
        )

    # Legend maps each marker shape to its true concept name. The call comes
    # after the loop, so it is issued once, with the last subplot current.
    plt.legend(handles=[
        mlines.Line2D([], [], color='black', linewidth=0, marker=markers[i], label=v)
        for i, v in enumerate(true_cluster_names)
        ])