# Exploratory analysis script: clustering views of the LIWC features,
# followed by per-genre classification of the 'deprivation' label.
# NOTE(review): dendrogram, knn, kcluster, plot_embedding and p.plot_roc are
# project helpers defined outside this section; their behaviour is assumed
# from their names only -- confirm against their definitions.

dendrogram(links)
# First over the transposed LIWC columns (labelled by the column names in
# `liwc`), then over the rows (labelled by each row's 'Name of Work').
knn(meta[liwc].T, labels=liwc)
knn(meta[liwc], labels=meta['Name of Work'].values)

'''K-means'''
k = KMeans(n_clusters=5)  # 5 is at an elbow for sse in 2-d
km = k.fit_transform(truncatedFeatures)
# NOTE(review): `k` and `km` are rebound by the kcluster() call just below
# before anything in this section reads them.

'''PCA'''
pca, X_pca, k, km = kcluster(justDFeatures, n_clusters=8)
# Column names at the 100 lowest-valued positions of pca.components_[0]
# (np.argsort sorts ascending). Written as a single parenthesised argument
# so the statement prints the same text on Python 2 and parses on Python 3.
print(features.columns[np.argsort(pca.components_[0])[:100]])
# plt.savefig("scree.png", dpi= 100)

pca = decomposition.PCA(n_components=2)
X_pca = pca.fit_transform(X_centered)
plot_embedding(X_pca, y)
k.plot_k_sse(X_pca)  # for 2 components 5 clusters

''' Supervised Learning'''
# Logistic Regression and Random Forest seem to perform the best
# Nonfiction seems unpredictable, while fiction, letters and poetry
# are somewhat predictable

# Genres are visited in order of first appearance so the sequence of printed
# labels and plots is the same on every run. A missing (NaN) genre never
# matches `meta.Genre == genre`, so dropping it up front skips nothing that
# was previously processed.
for genre in meta.Genre.dropna().unique():
    df = meta[meta.Genre == genre].reset_index()
    # Only genres with more than 20 rows are modelled.
    if len(df) <= 20:
        continue
    # pop() removes the target column from df and returns it.
    # NOTE(review): this rebinds the module-level `y` that plot_embedding()
    # used above.
    y = df.pop('deprivation')
    # '%s %s' formatting reproduces the output of the Python 2 statement
    # `print genre, 'Logit'` while remaining valid Python 3.
    print('%s %s' % (genre, 'Logit'))
    p.plot_roc(df[liwc].fillna(0), y, LogisticRegression)
    print('%s %s' % (genre, 'Random Forest'))
    p.plot_roc(df[liwc].fillna(0), y, RandomForestClassifier)