Exemplo n.º 1
0
def skill_correlations(runs=50, n_clusters=5):
    """Find how many students each clustering needs per skill correlation.

    For every run, skill correlation and clustering function, the number of
    students is increased step by step until the clustering reproduces the
    true item concepts with a Rand index of at least 0.9. The first student
    count that reaches the threshold is recorded; combinations that never
    reach it contribute no row. The collected rows are printed and drawn as
    a point plot with a logarithmic y axis.

    The function reads several names from the enclosing module that are not
    defined here: ``data``, ``similarity``, ``euclid``, ``clusterings`` and
    ``rand_index``.

    Args:
        runs: Number of repetitions of the whole sweep.
        n_clusters: Number of concepts passed to ``data`` and number of
            clusters requested from each clustering function.
    """
    # Both sweeps are loop-invariant, so build them once.
    correlations = list(np.arange(0, 0.9, 0.1)) + [0.85]
    student_counts = [
        10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000, 5000,
        10000, 20000, 50000, 100000, 200000, 500000, 1000000,
    ]

    results = []
    for run in range(runs):
        for skill_correlation in correlations:
            for clustering in clusterings:
                for students in student_counts:
                    answers, items = data(
                        n_students=students,
                        n_items=20,
                        n_concepts=n_clusters,
                        skill_correlation=skill_correlation,
                    )
                    true_cluster_names = list(items['concept'].unique())
                    X = similarity(answers)
                    items_ids = X.index
                    # DataFrame.get_value was removed in pandas 1.0; .at is
                    # the documented scalar accessor that replaces it.
                    ground_truth = np.array([
                        true_cluster_names.index(items.at[item, 'concept'])
                        for item in items_ids
                    ])

                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)

                    print(run, skill_correlation, clustering.__name__, students, '===', rand)
                    if rand >= 0.9:
                        # Smallest tried student count that is good enough:
                        # record it and stop growing the sample.
                        results.append([students, clustering.__name__, rand, skill_correlation])
                        break

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation'])

    print(results)
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
Exemplo n.º 2
0
def students(runs=15):
    """Score every similarity measure across a sweep of difficulty shifts.

    For each run and each difficulty shift in ``np.arange(-1, 1.1, 0.2)``,
    a data set is generated, every entry of ``similarities`` is applied to
    the answers, the result is clustered, and the Rand index against the
    true item concepts is recorded. The collected rows are printed and drawn
    as a point plot of Rand index over difficulty shift, one hue per
    similarity measure.

    The function reads several names from the enclosing module that are not
    defined here: ``data``, ``similarities``, ``clustering``, ``rand_index``,
    ``n_students``, ``n_items``, ``n_clusters``, ``skill_correlation`` and
    ``missing``.

    Args:
        runs: Number of repetitions of the whole sweep.
    """
    results = []
    for run in range(runs):
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(
                n_students=n_students,
                n_items=n_items,
                n_concepts=n_clusters,
                skill_correlation=skill_correlation,
                difficulty_shift=difficulty_shift,
                missing=missing,
            )
            true_cluster_names = list(items['concept'].unique())
            for similarity_fn, euclid, similarity_name in similarities:
                X = similarity_fn(answers)
                items_ids = X.index
                # The ground truth follows the row order of X, so it is
                # rebuilt for every similarity measure.
                # DataFrame.get_value was removed in pandas 1.0; .at is the
                # documented scalar accessor that replaces it.
                ground_truth = np.array([
                    true_cluster_names.index(items.at[item, 'concept'])
                    for item in items_ids
                ])

                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                results.append([n_students, clustering.__name__, rand, skill_correlation, difficulty_shift, similarity_name])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation', 'difficulty_shift', 'similarity'])
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')
Exemplo n.º 3
0
            texts=[items.get_value(item, 'name') for item in items_ids],
            shapes=ground_truth,
        )

    # One legend entry per true cluster name, drawn with the marker that
    # sits at the same position in `markers`.
    plt.legend(handles=[
        mlines.Line2D([], [], color='black', linewidth=0, marker=markers[i], label=v)
        for i, v in enumerate(true_cluster_names)
    ])

    # Subplot grid positions must be integers; floor division keeps the
    # column count integral under Python 3, where `/` yields a float.
    plt.subplot(2, len(similarities) // 2 + 1, len(similarities) + 1)

    # Pairwise Rand index between the ground truth and every clustering in
    # `clusters`; row and column 0 correspond to the ground truth.
    rands = [
        [rand_index(c1, c2) for c2 in [ground_truth] + clusters]
        for c1 in [ground_truth] + clusters
    ]

    sns.heatmap(rands, xticklabels=['truth'] + similarities_names, yticklabels=['truth'] + similarities_names, annot=True)
    plt.title(data_set)
    # sns.clustermap(rands, xticklabels=['truth'] + similarities_names, yticklabels=['truth'] + similarities_names, annot=True)


if False:
    for i, (similarity, similarities_name) in enumerate(zip(similarities, similarities_names)):
        print(similarities_name)
        X = similarity(answers)
        ground_truth =np.array([true_cluster_names.index(items.get_value(item, 'concept')) for item in X.index])
        same, different = [], []
        for concept1 in set(ground_truth):
            for concept2 in set(ground_truth):
Exemplo n.º 4
0
# Repeatedly draw a random 50% sample of `answers` and score every
# combination of similarity measure and clustering function against the
# true item concepts, then plot the Rand indices as grouped bars.
runs = 30
results = []
for run in range(runs):
    A = answers.sample(frac=0.5)
    for similarity, euclid, similarity_name in similarities:
        print(similarity_name)
        X = similarity(A)
        items_ids = X.index
        if dimensions:
            # Optionally replace the similarity matrix with its projection
            # onto `dimensions` PCA components, keeping the item index.
            model = PCA(n_components=dimensions)
            X = pd.DataFrame(data=model.fit_transform(X), index=X.index)

        # DataFrame.get_value was removed in pandas 1.0; .at is the
        # documented scalar accessor that replaces it.
        ground_truth = np.array([
            true_cluster_names.index(items.at[item, "concept"])
            for item in items_ids
        ])

        for clustering in clusterings:
            labels = clustering(X, n_clusters, euclid=euclid)
            rand = rand_index(ground_truth, labels)
            print("  - ", clustering.__name__, rand)
            results.append([similarity_name, clustering.__name__, rand])

results = pd.DataFrame(results, columns=["similarity", "clustering", "rand_index"])
print(results)

plt.figure(figsize=(16, 24))
plt.title(data_set)
sns.barplot(data=results, x="similarity", y="rand_index", hue="clustering")

plt.show()