def get_clusters():
    """Cluster the similarity scores for several settings and save the labels.

    For every cluster count in 32..36 and every kernel in
    ('linear', 'cosine', 'sigmoid', 'polynomial'), fits ``kPCA`` on the
    module-level ``simscore``, runs ``kkMeans`` on the transposed components,
    and writes the resulting ``clusters`` attribute to
    ``<path>/labels/labels_<k>_<kernel>.csv``.

    Reads the module-level names ``simscore`` and ``path``. Returns ``None``.
    """
    kernels = ['linear', 'cosine', 'sigmoid', 'polynomial']

    # Create the output directory once, up front. ``exist_ok=True`` replaces
    # the exists()/makedirs() pair, which duplicated the CSV write in both
    # branches and could race with another process creating the directory.
    labels_dir = os.path.join(path, 'labels')
    os.makedirs(labels_dir, exist_ok=True)

    for n_clusters in np.arange(32, 37, 1):
        for kernel in kernels:
            fitted_pca = kPCA(k=n_clusters, kernel=kernel).fit(np.array(simscore))
            components = fitted_pca.components_.T
            km = kkMeans(k=n_clusters, kernel=kernel, gamma=1).fit_predict(components)
            out_file = os.path.join(labels_dir, f'labels_{n_clusters}_{kernel}.csv')
            pd.DataFrame(km.clusters).to_csv(out_file)
def update_figure(make_selection, g_m, knl, drop, yaxis, clust):
    """Build the figure dict (``{'data': ..., 'layout': ...}``) for the plot.

    Args:
        make_selection: Two-item sequence ``(first_year, last_year)``; rows
            whose ``year`` lies outside this inclusive range are dropped.
        g_m: Display mode. ``'Cluster'`` draws the t-SNE scatter; any other
            value draws the similarity network.
        knl: Kernel name forwarded to ``kPCA`` and ``kkMeans``.
        drop: Iterable of years to plot as one trace each. When empty or
            ``None``, a single scatter coloured by cluster label is drawn.
        yaxis: ``'Linear'`` selects a linear y axis; anything else selects log.
        clust: Cluster count; also selects ``tsne/tsne_<clust>.csv``.

    Returns:
        dict with a ``'data'`` list of traces and a ``'layout'`` object.
    """
    if g_m != 'Cluster':
        # The network view never uses the t-SNE CSV, so it is not read here.
        return {
            'data': _similarity_network_traces(np.array(simscore)),
            'layout': _figure_layout(yaxis, 'year', 'Similarity score', height=600),
        }

    first_year, last_year = make_selection[0], make_selection[1]
    embedding = pd.read_csv(
        os.path.join(path, f'tsne/tsne_{int(clust)}.csv')).iloc[:, 1:]

    # Truthiness instead of ``drop != []`` so that ``None`` (nothing chosen)
    # falls through to the cluster-coloured view instead of raising.
    if drop:
        by_year = embedding.sort_values(by=['year'])
        visible = by_year[(by_year.year >= first_year) & (by_year.year <= last_year)]
        traces = []
        for val in drop:
            # Filter once per year instead of once per plotted column.
            rows = visible.loc[visible.year == int(val)]
            hover = _hover_rows(rows)
            traces.append(go.Scattergl(
                x=np.array(rows['0']),
                y=np.array(rows['1']),
                text=hover,
                customdata=hover,
                mode='markers',
                opacity=0.6,
                marker={'size': 15, 'line': {'width': 0.5, 'color': 'white'}},
                name=val,
            ))
        return {
            'data': traces,
            'layout': _figure_layout(yaxis, 'tsne-2', 'tsne-1'),
        }

    pca = kPCA(k=int(clust), kernel=knl).fit(np.array(simscore))
    km = kkMeans(k=int(clust), kernel=knl, gamma=1).fit_predict(pca.components_.T)
    cluster_labels = km.clusters

    in_range = np.asarray(
        (embedding.year >= first_year) & (embedding.year <= last_year))
    visible = embedding[in_range]

    # The labels are computed for every row, but only the rows inside the year
    # range are plotted. Apply the same mask to the labels so each point keeps
    # its own colour. This relies on CSV row i matching label i, which the
    # unfiltered view already assumed.
    # NOTE(review): if the lengths differ the labels are passed through
    # unchanged, as before - confirm how the CSV rows map to simscore rows.
    label_array = np.asarray(cluster_labels)
    if label_array.ndim == 1 and len(label_array) == len(embedding):
        colours = label_array[in_range]
    else:
        colours = cluster_labels

    coords = np.array(visible)
    hover = _hover_rows(visible)
    scatter = go.Scattergl(
        x=coords[:, 0],
        y=coords[:, 1],
        text=hover,
        customdata=hover,
        mode='markers',
        opacity=0.7,
        marker={'size': 15,
                'color': colours,
                'colorscale': 'Viridis',
                'line': {'width': .5, 'color': 'white'}},
    )
    return {
        'data': [scatter],
        'layout': _figure_layout(yaxis, 'tsne-2', 'tsne-1', height=600),
    }


def _hover_rows(frame):
    """Return one (name, year, language, authors, title) tuple per row.

    The name is the ``pdf_names`` value truncated at its first ``'.'``.
    """
    names = frame['pdf_names'].apply(lambda name: name.split('.')[0])
    return list(zip(names, frame['year'], frame['language'],
                    frame['authors'], frame['title']))


def _figure_layout(yaxis, x_title, y_title, height=None):
    """Return the shared ``go.Layout``; ``height`` is set only when given."""
    layout_kwargs = {
        'xaxis': {'title': x_title},
        'yaxis': {
            'type': 'linear' if yaxis == 'Linear' else 'log',
            'title': y_title,
        },
        'margin': {'l': 40, 'b': 40, 't': 10, 'r': 10},
        'legend': {'x': 1, 'y': 1},
        'hovermode': 'closest',
    }
    if height is not None:
        layout_kwargs['height'] = height
    return go.Layout(**layout_kwargs)


def _similarity_network_traces(ss):
    """Return ``[edge_trace, node_trace]`` for the non-zero entries of ``ss``.

    One node is created per row of ``ss``; nodes ``i`` and ``j`` are linked
    when ``ss[i, j]`` is non-zero and ``i != j``.
    """
    n_rows, _ = ss.shape  # unpacking also rejects anything that is not 2-D

    G = nx.Graph()
    # The previous loop reused the column-count variable as its loop index,
    # which shrank the edge scan and the node list by one. Using the row
    # count directly keeps every node and every column.
    G.add_nodes_from(range(n_rows))
    for i, j in zip(*np.nonzero(ss)):
        if i != j:
            G.add_edge(int(i), int(j))

    pos = nx.fruchterman_reingold_layout(G)

    # Plot every node that is in the graph, not a hand-computed index range.
    nodes = list(G.nodes)
    Xv = [pos[k][0] for k in nodes]
    Yv = [pos[k][1] for k in nodes]

    # ``None`` after each pair breaks the line so edges are drawn separately.
    Xed = []
    Yed = []
    for start, end in G.edges:
        Xed += [pos[start][0], pos[end][0], None]
        Yed += [pos[start][1], pos[end][1], None]

    etrace = go.Scattergl(x=Xed,
                          y=Yed,
                          mode='lines',
                          line=dict(color='rgb(210,210,210)', width=.5),
                          hoverinfo='none')
    vtrace = go.Scattergl(x=Xv,
                          y=Yv,
                          mode='markers',
                          name='net',
                          marker=dict(symbol='circle-dot',
                                      size=5,
                                      color='#6959CD',
                                      line=dict(color='rgb(50,50,50)', width=0.5)),
                          hoverinfo='text')
    return [etrace, vtrace]
'time': [], 'acc': [], 'prec': [], 'rec': [], 'f1': [], 'randind': [] } } for p, q in data_name.items(): for ii in kernels: start = time.time() if p == 'moon': gamma = 10 d = 3 kmeans = kkMeans(k=2, kernel=ii, gamma=gamma).fit_predict(q[0]) kernel_outcome[ii]['acc'].append( kmeans.accuracy(q[1], kmeans.clusters)) kernel_outcome[ii]['prec'].append( kmeans.precision(q[1], kmeans.clusters)) kernel_outcome[ii]['rec'].append( kmeans.recall(q[1], kmeans.clusters)) kernel_outcome[ii]['f1'].append(kmeans.f1(q[1], kmeans.clusters)) kernel_outcome[ii]['randind'].append( kmeans.rand_index_score(kmeans.clusters, q[1])) elif p == 'circle': gamma = 10 d = 3 kmeans = kkMeans(k=2, kernel=ii, gamma=gamma).fit_predict(q[0]) kernel_outcome[ii]['acc'].append( kmeans.accuracy(q[1], kmeans.clusters))