def difficulty_vs_time(data, the_skill, concepts=False):
    """Scatter-plot estimated difficulty against mean log response time.

    The plot is drawn on the current matplotlib figure; nothing is returned.
    Note that ``data`` is modified: ``filter_data(0, 100)``, ``trim_times()``
    and ``add_log_response_times()`` are called on it.

    Args:
        data: project data object providing ``get_skill_id``,
            ``get_items_df``, ``get_skills_df`` and ``get_dataframe_all``.
        the_skill: skill identifier passed to ``data.get_skill_id``; also
            used as the plot title.
        concepts: if true, plot one point per ``skill_lvl_3`` value
            (difficulty and time averaged over its items); otherwise plot one
            point per item, coloured by its ``visualization``.
    """
    data.filter_data(0, 100)
    pk, level = data.get_skill_id(the_skill)
    data.trim_times()
    data.add_log_response_times()
    m = EloPriorCurrentModel(KC=2, KI=0.5)

    items = data.get_items_df()
    items = items[items["visualization"] != "pairing"]
    items = items.join(get_difficulty(data, m))
    mean_log_times = data.get_dataframe_all().groupby(["item"])["log_response_time"].mean()
    items = items.join(pd.Series(mean_log_times, name="log_response_time_mean"))
    items = items[items["skill_lvl_" + str(level)] == pk]

    if concepts:
        skills = data.get_skills_df()
        skills = skills.join(items.groupby("skill_lvl_3")["difficulty"].mean())
        skills = skills.join(items.groupby("skill_lvl_3")["log_response_time_mean"].mean())
        skills = skills[skills.index.isin(items["skill_lvl_3"].unique())]
        for _, skill in skills.iterrows():
            plt.plot(skill["difficulty"], skill["log_response_time_mean"], "ok")
            plt.text(skill["difficulty"], skill["log_response_time_mean"], skill["name"])
    else:
        colors = "rgbyk"
        visualizations = list(items["visualization"].unique())
        # Cycle through the palette so that more than len(colors)
        # visualization types do not raise IndexError.
        color_of = {
            vis: colors[i % len(colors)] for i, vis in enumerate(visualizations)
        }
        for _, item in items.iterrows():
            plt.plot(item["difficulty"], item["log_response_time_mean"], "o",
                     color=color_of[item["visualization"]])
            plt.text(item["difficulty"], item["log_response_time_mean"], item["name"])
        # Legend entries only: empty data creates a handle without drawing a
        # spurious marker among the real points.
        for vis in visualizations:
            plt.plot([], [], "o", color=color_of[vis], label=vis)

    plt.xlabel("difficulty according to " + str(m))
    plt.ylabel("mean of log time")
    plt.legend(loc=0)
    plt.title(the_skill)
def hierarchical_clustering(data, skill, method='single', metric='euclidean', dendrogram=True, concepts=False, cluster_number=3, corr_as_vectors=False):
    """Hierarchically cluster the items (or concepts) of a skill by correlation.

    Args:
        data: project data object providing ``get_skill_id``,
            ``get_items_df`` and ``get_skills_df``.
        skill: skill identifier passed to ``data.get_skill_id``.
        method: linkage method forwarded to ``hr.linkage``.
        metric: distance metric forwarded to ``hr.linkage``; only used when
            ``corr_as_vectors`` is true.
        dendrogram: if true, draw a dendrogram on the current figure.
        concepts: if true, cluster ``skill_lvl_3`` values (using
            ``compute_corr(..., merge_skills=True)``) instead of items.
        cluster_number: passed to ``hr.fcluster`` with the ``"maxclust"``
            criterion.
        corr_as_vectors: if true, rows of the correlation matrix are treated
            as observation vectors; otherwise ``1 - corr`` is used as a
            precomputed distance matrix.

    Returns:
        The result of ``hr.fcluster`` (a flat cluster assignment, one entry
        per row of the clustered matrix).
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    skills = data.get_skills_df()
    corr = compute_corr(data, merge_skills=concepts)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))

    items = items[items["skill_lvl_" + str(level)] == pk]
    if concepts:
        skill_ids = items[~items["skill_lvl_3"].isnull()]["skill_lvl_3"].unique()
        corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
        labels = list(skills.loc[corr.index]["name"])
    else:
        items = items[items["visualization"] != "pairing"]
        corr = pd.DataFrame(corr, index=items.index, columns=items.index)
        labels = [
            "{1} - {0}".format(item["name"], item["visualization"][0])
            for _, item in items.iterrows()
        ]

    # Zero the NaNs only after restricting the matrix to the selected ids:
    # re-indexing introduces new NaNs for ids missing from the computed
    # matrix, and those must not reach linkage/squareform.
    corr[corr.isnull()] = 0

    if corr_as_vectors:
        Z = hr.linkage(corr, method=method, metric=metric)
    else:
        Z = hr.linkage(dst.squareform(1 - corr), method=method)
    # Clip negative entries (1 - corr can go below zero).
    Z[Z < 0] = 0

    if dendrogram:
        plt.title('{}: method: {}, metric: {}, as vectors: {}'.format(skill, method, metric, corr_as_vectors))
        plt.xlabel('items' if not concepts else "concepts")
        plt.ylabel('distance')
        hr.dendrogram(Z, leaf_rotation=90., leaf_font_size=10., labels=labels)

    return hr.fcluster(Z, cluster_number, "maxclust")
def concept_clustering(data, skill, cluster_number=3, plot=True):
    """Spectrally cluster the concepts (``skill_lvl_3`` values) of a skill.

    The merged-skill correlation matrix is restricted to the concepts found
    among the skill's items, NaNs are replaced by 0 and the matrix is passed
    to ``SpectralClusterer``. If the run with ``mutual=True`` raises
    ``LinAlgError``, it is retried with ``mutual=False``.

    Args:
        data: project data object providing ``get_skill_id``,
            ``get_items_df`` and ``get_skills_df``.
        skill: skill identifier passed to ``data.get_skill_id``.
        cluster_number: number of clusters requested from the clusterer.
        plot: if true, plot each concept at columns 1 and 2 of
            ``sc.eig_vect``, coloured by cluster label and annotated with the
            concept name.

    Returns:
        The labels returned by ``SpectralClusterer.run``.
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    items = items[items["skill_lvl_" + str(level)] == pk]
    skills = data.get_skills_df()
    skill_ids = items[~items["skill_lvl_3"].isnull()]["skill_lvl_3"].unique()

    corr = compute_corr(data, merge_skills=True)
    corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))
    corr[corr.isnull()] = 0

    try:
        sc = SpectralClusterer(corr, kcut=corr.shape[0] * 0.5, mutual=True)
        labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)
    # Use the public exception name; ``np.linalg.linalg`` is a private
    # module path that is deprecated in NumPy 2.0.
    except np.linalg.LinAlgError:
        sc = SpectralClusterer(corr, kcut=corr.shape[0] * 0.5, mutual=False)
        labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)

    if plot:
        colors = "rgbyk"
        for i, concept_id in enumerate(corr.columns):
            concept = skills.loc[int(concept_id)]
            x, y = sc.eig_vect[i, 1], sc.eig_vect[i, 2]
            # Cycle through the palette so that cluster labels >= len(colors)
            # do not raise IndexError.
            plt.plot(x, y, "o", color=colors[labels[i] % len(colors)])
            plt.text(x, y, concept["name"])
        plt.title(data)

    return labels
def item_clustering(data, skill, cluster_number=3, plot=True):
    """Spectrally cluster the items of a skill by their correlation.

    Items with the ``"pairing"`` visualization are excluded. The correlation
    matrix is restricted to the remaining items, NaNs are replaced by 0 and
    the matrix is passed to ``SpectralClusterer`` (``mutual=True``).

    Args:
        data: project data object providing ``get_skill_id`` and
            ``get_items_df``.
        skill: skill identifier passed to ``data.get_skill_id``.
        cluster_number: number of clusters requested from the clusterer.
        plot: if true, plot each item at columns 1 and 2 of ``sc.eig_vect``.
            Points are coloured by the item's visualization type (not by the
            cluster label) and annotated with the item name.

    Returns:
        The labels returned by ``SpectralClusterer.run``.
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    items = items[items["skill_lvl_" + str(level)] == pk]
    items = items[items["visualization"] != "pairing"]

    corr = compute_corr(data)
    corr = pd.DataFrame(corr, index=items.index, columns=items.index)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))
    corr[corr.isnull()] = 0

    sc = SpectralClusterer(corr, kcut=corr.shape[0] / 2, mutual=True)
    labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)

    if plot:
        colors = "rgbyk"
        visualizations = list(items["visualization"].unique())
        # Cycle through the palette so that more than len(colors)
        # visualization types do not raise IndexError.
        color_of = {
            vis: colors[i % len(colors)] for i, vis in enumerate(visualizations)
        }
        for i, item_id in enumerate(corr.columns):
            item = items.loc[item_id]
            x, y = sc.eig_vect[i, 1], sc.eig_vect[i, 2]
            plt.plot(x, y, "o", color=color_of[item["visualization"]])
            plt.text(x, y, item["name"])
        # Legend entries only: empty data creates a handle without drawing a
        # fake point at the origin of the eigenvector plot.
        for vis in visualizations:
            plt.plot([], [], "o", color=color_of[vis], label=vis)
        plt.title(data)
        plt.legend(loc=3)

    return labels
def get_difficulty(data, model, normalize=False):
    """Run ``model`` on ``data`` and return item difficulties as a Series.

    The model is always (re)run via ``runner.Runner(...).run(force=True)``.

    Args:
        data: project data object; ``get_items_df`` is used when normalizing.
        model: model object exposing ``difficulty`` after the run.
        normalize: if true, subtract the values returned by
            ``get_mean_skill`` from the difficulty of the items belonging to
            each skill (matched on ``skill_lvl_1``/``2``/``3``). The value for
            skill ``1`` is subtracted from every item (presumably the root
            skill -- TODO confirm).

    Returns:
        ``pandas.Series`` named ``"difficulty"``. Without normalization it is
        built directly from ``model.difficulty``; with normalization it is
        the ``"difficulty"`` column of the items frame.
    """
    runner.Runner(data, model).run(force=True)

    if not normalize:
        return pd.Series(model.difficulty, name="difficulty")

    items = data.get_items_df()
    raw_difficulty = pd.Series(model.difficulty, name="difficulty")
    items = items.join(raw_difficulty)

    mean_skills = get_mean_skill(data, model)
    for skill_id, mean_value in mean_skills.items():
        if skill_id == 1:
            items.loc[:, "difficulty"] -= mean_value
            continue
        in_lvl_1 = items["skill_lvl_1"] == skill_id
        in_lvl_2 = items["skill_lvl_2"] == skill_id
        in_lvl_3 = items["skill_lvl_3"] == skill_id
        items.loc[in_lvl_1 | in_lvl_2 | in_lvl_3, "difficulty"] -= mean_value

    return items["difficulty"]
def plot_skill_values(data, values):
    """Plot one labelled point per skill that has a value.

    Each skill is drawn at a random x position with its value on the y axis,
    coloured by the ``skill_lvl_1`` of its items and annotated with the skill
    name. Skills without a value are skipped. The plot is drawn on the
    current matplotlib figure; nothing is returned.

    Args:
        data: project data object providing ``get_skills_df`` and
            ``get_items_df``.
        values: mapping or Series of values, joined to the skills frame on
            its index.
    """
    # "p" is not a valid matplotlib colour code and raised ValueError when
    # used; presumably purple was intended.
    palette = ["purple", "b", "y", "k", "g", "r"]

    skills = data.get_skills_df()
    items = data.get_items_df()
    skills = skills.join(items.groupby("skill")["skill_lvl_1"].first())
    color_by_parent = dict(zip(skills["skill_lvl_1"].unique(), palette))

    skills = skills.join(pd.Series(values, name="value"))
    skills = skills[~skills["value"].isnull()]

    for _, skill in skills.iterrows():
        x = random.random()
        parent = skill["skill_lvl_1"]
        if np.isnan(parent):
            color = "k"
        else:
            # Fall back to black when there are more first-level skills than
            # palette entries instead of raising KeyError.
            color = color_by_parent.get(parent, "k")
        plt.plot(x, skill["value"], "o", color=color)
        plt.text(x, skill["value"], skill["name"])
def mean_by_skill(data, values, filter_skills=None):
    """Average per-item values over each item's ``skill``.

    Args:
        data: object whose ``get_items_df()`` returns the items frame.
        values: mapping or Series of values, joined to the items frame on
            its index.
        filter_skills: optional collection of skill ids; when given, only
            items whose ``skill_lvl_1`` or ``skill_lvl_2`` is in it are kept.

    Returns:
        ``pandas.Series`` of mean values indexed by ``skill``.
    """
    items = data.get_items_df()

    if filter_skills is not None:
        in_lvl_1 = items["skill_lvl_1"].isin(filter_skills)
        in_lvl_2 = items["skill_lvl_2"].isin(filter_skills)
        items = items[in_lvl_1 | in_lvl_2]

    value_column = pd.Series(values, name="value")
    with_values = items.join(value_column)
    return with_values.groupby("skill")["value"].mean()