# Data-set selection. Every alternative below is commented out in this part of
# the file, so `data_set` (and `n_clusters`) must already be defined before
# this point.
# data_set, n_clusters = 'math_garden-multiplication', 3
# data_set, n_clusters = 'math_garden-addition', 3
# data_set, n_clusters = 'math_garden-subtraction', 3
# data_set, n_clusters = 'math_garden-all', 3

# Load the pickled answers and items tables for the selected data set.
answers = pd.read_pickle('data/{}-answers.pd'.format(data_set))
items = pd.read_pickle('data/{}-items.pd'.format(data_set))

# Distinct values of the 'concept' column; a concept's position in this list
# is used as its integer cluster id.
true_cluster_names = list(items['concept'].unique())
print(len(answers), len(items))

# Compute the item similarity and project it to plot coordinates.
projection = tsne
similarity, euclid = similarity_pearson, True
X = similarity(answers)
xs, ys = projection(X, euclid=euclid, perplexity=10)
items_ids = X.index

# Integer concept id of every item, in the order of `items_ids`.
# `.at[row, col]` replaces `DataFrame.get_value`, which was deprecated in
# pandas 0.21 and removed in pandas 1.0.
ground_truth = np.array([
    true_cluster_names.index(items.at[item, 'concept'])
    for item in items_ids
])

# Plot the projection with the true concept used for both label and shape.
plot_clustering(
    items_ids, xs, ys,
    labels=ground_truth,
    texts=[items.at[item, 'name'] for item in items_ids],
    shapes=ground_truth,
)

# Disabled branch: reads a ';'-separated CSV and prints it.
if False:
    df = pd.read_csv('tsne-prid-jmena-data.csv', sep=';')
    print(df)
similarities_names.append("{} -> {}".format(f.__name__.replace('similarity_', ''), g.__name__.replace('similarity_', ''))) else: similarities.append(lambda x, f=f: f(x, cache=data_set + str(modificator))) similarities_names.append(f.__name__.replace('similarity_', '')) if True: euclid = True clusters = [] for i, similarity in enumerate(similarities): print(similarities_names[i]) X = similarity(answers) labels = kmeans(X, concepts=n_clusters, euclid=euclid) clusters.append(labels) items_ids = X.index xs, ys = projection(X, euclid=euclid) ground_truth = [true_cluster_names.index(items.get_value(item, 'concept')) for item in items_ids] plt.subplot(2, len(similarities) / 2 + 1, i + 1) plt.title(similarities_names[i]) plot_clustering( items_ids, xs, ys, labels=labels, texts=[items.get_value(item, 'name') for item in items_ids], shapes=ground_truth, ) plt.legend(handles=[ mlines.Line2D([], [], color='black', linewidth=0, marker=markers[i], label=v) for i, v in enumerate(true_cluster_names) ])