def hierarchical_clustering(data, skill, method='single', metric='euclidean',
                            dendrogram=True, concepts=False, cluster_number=3,
                            corr_as_vectors=False):
    """Cluster the items (or concepts) of one skill with agglomerative linkage.

    The correlation matrix returned by ``compute_corr`` is restricted to the
    rows/columns that belong to ``skill``, missing correlations are replaced
    by 0, and the result is passed to ``hr.linkage``.

    Args:
        data: Data source providing ``get_skill_id``, ``get_items_df`` and
            ``get_skills_df``.
        skill: Skill identifier resolved through ``data.get_skill_id``.
        method: Linkage method forwarded to ``hr.linkage``.
        metric: Metric forwarded to ``hr.linkage``; only used when
            ``corr_as_vectors`` is true.
        dendrogram: When true, draw a dendrogram with ``plt``.
        concepts: When true, cluster the distinct ``skill_lvl_3`` values of
            the skill's items instead of the individual items.
        cluster_number: Upper bound on the number of flat clusters
            (``"maxclust"`` criterion of ``hr.fcluster``).
        corr_as_vectors: When true, the rows of the correlation matrix are
            passed to ``hr.linkage`` as observation vectors; otherwise
            ``1 - corr`` is condensed with ``dst.squareform`` and used as the
            distance matrix.

    Returns:
        The flat cluster assignment produced by ``hr.fcluster``.
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    skills = data.get_skills_df()
    corr = compute_corr(data, merge_skills=concepts)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))

    # Both modes only look at the items that belong to the requested skill.
    items = items[items["skill_lvl_" + str(level)] == pk]

    if concepts:
        concept_ids = items["skill_lvl_3"].dropna().unique()
        corr = pd.DataFrame(corr, index=concept_ids, columns=concept_ids)
        labels = list(skills.loc[corr.index]["name"])
    else:
        items = items[items["visualization"] != "pairing"]
        corr = pd.DataFrame(corr, index=items.index, columns=items.index)
        # Label = first character of the visualization followed by the name.
        labels = ["{1} - {0}".format(item["name"], item["visualization"][0]) for _, item in items.iterrows()]

    # Fill missing correlations only AFTER reindexing: labels absent from the
    # computed matrix come back as NaN from the reindex above and would
    # otherwise reach the linkage step. ``fillna`` also leaves the frame
    # returned by ``compute_corr`` unmodified.
    corr = corr.fillna(0)

    if corr_as_vectors:
        Z = hr.linkage(corr, method=method, metric=metric)
    else:
        Z = hr.linkage(dst.squareform(1 - corr), method=method)
    # Clamp any negative entries of the linkage matrix to zero.
    Z[Z < 0] = 0

    if dendrogram:
        plt.title('{}: method: {}, metric: {}, as vectors: {}'.format(skill, method, metric, corr_as_vectors))
        plt.xlabel("concepts" if concepts else 'items')
        plt.ylabel('distance')
        hr.dendrogram(Z, leaf_rotation=90., leaf_font_size=10., labels=labels)

    return hr.fcluster(Z, cluster_number, "maxclust")
def difficulty_vs_time(data, the_skill, concepts=False):
    """Scatter-plot estimated difficulty against mean log response time.

    One point is drawn per item of ``the_skill`` (coloured by visualization),
    or, when ``concepts`` is true, one point per ``skill_lvl_3`` concept using
    the mean of its items' values.

    Note: this calls ``data.filter_data(0, 100)``, ``data.trim_times()`` and
    ``data.add_log_response_times()`` on the passed object before plotting.

    Args:
        data: Data source providing the item/skill/answer data frames.
        the_skill: Skill identifier resolved through ``data.get_skill_id``;
            also used as the plot title.
        concepts: When true, aggregate the items by ``skill_lvl_3`` and plot
            concepts instead of individual items.
    """
    data.filter_data(0, 100)
    pk, level = data.get_skill_id(the_skill)
    data.trim_times()
    data.add_log_response_times()
    m = EloPriorCurrentModel(KC=2, KI=0.5)

    items = data.get_items_df()
    items = items[items["visualization"] != "pairing"]
    items = items.join(get_difficulty(data, m))
    mean_log_time = data.get_dataframe_all().groupby(["item"])["log_response_time"].mean()
    items = items.join(pd.Series(mean_log_time, name="log_response_time_mean"))
    items = items[items["skill_lvl_" + str(level)] == pk]

    if concepts:
        skills = data.get_skills_df()
        by_concept = items.groupby("skill_lvl_3")
        skills = skills.join(by_concept["difficulty"].mean())
        skills = skills.join(by_concept["log_response_time_mean"].mean())
        skills = skills[skills.index.isin(items["skill_lvl_3"].unique())]
        for _, skill in skills.iterrows():
            plt.plot(skill["difficulty"], skill["log_response_time_mean"], "ok")
            plt.text(skill["difficulty"], skill["log_response_time_mean"], skill["name"])
    else:
        colors = "rgbyk"
        visualizations = list(items["visualization"].unique())
        for _, item in items.iterrows():
            # Cycle through the palette so that more than len(colors)
            # visualization types do not raise IndexError.
            color = colors[visualizations.index(item["visualization"]) % len(colors)]
            plt.plot(item["difficulty"], item["log_response_time_mean"], "o", color=color)
            plt.text(item["difficulty"], item["log_response_time_mean"], item["name"])
        # One extra labelled point per visualization so the legend gets an
        # entry for each colour.
        for i, vis in enumerate(visualizations):
            plt.plot(-1, 2, "o", color=colors[i % len(colors)], label=vis)

    plt.xlabel("difficulty according to " + str(m))
    plt.ylabel("mean of log time")
    plt.legend(loc=0)
    plt.title(the_skill)
def concept_clustering(data, skill, cluster_number=3, plot=True):
    """Cluster the concepts (``skill_lvl_3`` values) of a skill spectrally.

    The merged-skill correlation matrix from ``compute_corr`` is restricted to
    the concepts of ``skill``, missing correlations are replaced by 0, and the
    result is clustered with ``SpectralClusterer``. If the first attempt
    (``mutual=True``) raises ``LinAlgError`` it is retried with
    ``mutual=False``.

    Args:
        data: Data source providing ``get_skill_id``, ``get_items_df`` and
            ``get_skills_df``; also passed to ``plt.title`` when plotting.
        skill: Skill identifier resolved through ``data.get_skill_id``.
        cluster_number: Number of clusters requested from the clusterer.
        plot: When true, plot every concept at columns 1 and 2 of
            ``sc.eig_vect``, coloured by its cluster label.

    Returns:
        The labels returned by ``SpectralClusterer.run``.
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    items = items[items["skill_lvl_" + str(level)] == pk]
    skills = data.get_skills_df()
    skill_ids = items["skill_lvl_3"].dropna().unique()

    corr = compute_corr(data, merge_skills=True)
    corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))
    corr[corr.isnull()] = 0

    def _cluster(mutual):
        # Build a clusterer and run it; returns (clusterer, labels).
        clusterer = SpectralClusterer(corr, kcut=corr.shape[0] * 0.5, mutual=mutual)
        return clusterer, clusterer.run(cluster_number=cluster_number, KMiter=50, sc_type=2)

    try:
        sc, labels = _cluster(True)
    except np.linalg.LinAlgError:
        # Public alias of the same exception class; the former
        # ``np.linalg.linalg`` path is a private module in NumPy >= 2.0.
        sc, labels = _cluster(False)

    if plot:
        colors = "rgbyk"
        for i, concept_id in enumerate(corr.columns):
            concept = skills.loc[int(concept_id)]
            # Cycle through the palette so that cluster_number > len(colors)
            # does not raise IndexError.
            color = colors[labels[i] % len(colors)]
            plt.plot(sc.eig_vect[i, 1], sc.eig_vect[i, 2], "o", color=color)
            plt.text(sc.eig_vect[i, 1], sc.eig_vect[i, 2], concept["name"])
        plt.title(data)

    return labels
def plot_skill_values(data, values):
    """Plot one labelled point per skill at a random x and its value as y.

    Points are coloured by the skill's ``skill_lvl_1`` group; skills without
    a group are drawn in black. Skills with no entry in ``values`` are
    skipped.

    Args:
        data: Data source providing ``get_skills_df`` and ``get_items_df``.
        values: Per-skill values; wrapped in a ``pd.Series`` and joined onto
            the skills frame by index.
    """
    # "purple" replaces the former "p", which is not a valid matplotlib
    # colour and raised ValueError as soon as it was used.
    palette = ["purple", "b", "y", "k", "g", "r"]
    skills = data.get_skills_df()
    items = data.get_items_df()
    skills = skills.join(items.groupby("skill")["skill_lvl_1"].first())
    # Cycle through the palette so that every group gets a colour even when
    # there are more groups than palette entries.
    colors = {
        group: palette[i % len(palette)]
        for i, group in enumerate(skills["skill_lvl_1"].unique())
    }

    skills = skills.join(pd.Series(values, name="value"))
    skills = skills[~skills["value"].isnull()]
    for _, skill in skills.iterrows():
        # Random horizontal position, used only to spread the points out.
        x = random.random()
        group = skill["skill_lvl_1"]
        # pd.isnull also copes with non-numeric group ids, unlike np.isnan.
        color = "k" if pd.isnull(group) else colors[group]
        plt.plot(x, skill["value"], "o", color=color)
        plt.text(x, skill["value"], skill["name"])