def grid_search_knn():
    """Grid-search a KNN text classifier over the preprocessed training corpus.

    Fits a TextPipeline wrapping KNeighborsClassifier across the parameter
    grid, using binarized labels from the corpus metadata, and writes the
    search results to ``results/knn_results.csv``.
    """
    param_grid = {
        "n_neighbors": [1, 2, 3, 4],
        "weights": ["uniform", "distance"],
    }
    search = TextPipeline(
        KNeighborsClassifier(), param_grid, n_jobs=-1, verbose=1
    )

    # Load documents and their metadata from the preprocessed training set.
    documents, meta = create_corpus("train_pages_preprocessed/", ["bad", "good"])

    # Map raw labels to binary targets element-wise.
    binarize_all = np.vectorize(binarize)
    search.fit(documents, binarize_all(meta['label']))
    search.save_results("results/knn_results.csv")
from collections import defaultdict

from features import create_corpus

if __name__ == "__main__":
    # Build one LDA topic model over features collected from a range of daily
    # indices, reduce to 2-D with PCA, cluster with k-means, plot the clusters,
    # and print up to five document positions per cluster.
    s_index = "20150701"
    e_index = "20150702"
    features_as_list, ids = indexed_features(s_index, e_index)

    # Pool all features and size the topic count at roughly one topic per
    # 25 features contributed by each index.
    n_topics = 0
    all_features = []
    for index, feature in features_as_list:
        all_features += feature
        n_topics += int(len(feature) / 25) + 1

    # NOTE(review): `index` below is the leftover loop variable from the last
    # iteration above — presumably a specific index was intended; confirm.
    dictionary, corpus = create_corpus(index, all_features)
    x, lda_model, lda_corpus = single_lda("20150703", corpus, dictionary, n_topics)
    x = pca(x, 2)
    km, (centroids, c, k) = kmeans(x, n_topics)
    cluster_plot_2d(x, centroids, c, k)

    # Group document positions by their assigned cluster label.
    clusters = defaultdict(list)
    for idx, el in enumerate(c):
        clusters[el].append(idx)

    # Show a small sample (first five positions) from each cluster.
    for doc in clusters.values():
        for _id in doc[:5]:
            print(_id)