def grid_search_knn():
    """Grid-search a KNN text classifier over neighbor count and weighting
    scheme, then persist the cross-validation results to disk.
    """
    search_space = {
        "n_neighbors": [1, 2, 3, 4],
        "weights": ["uniform", "distance"],
    }

    # Load the preprocessed training pages, labelled "bad"/"good".
    corpus, metadata = create_corpus("train_pages_preprocessed/", ["bad", "good"])

    searcher = TextPipeline(
        KNeighborsClassifier(),
        search_space,
        n_jobs=-1,
        verbose=1,
    )

    # Map the raw labels to binary targets element-wise before fitting.
    targets = np.vectorize(binarize)(metadata['label'])
    searcher.fit(corpus, targets)
    searcher.save_results("results/knn_results.csv")
from features import create_corpus


if __name__ == "__main__":
    # Pull per-index feature lists for the date-stamped index range.
    s_index = "20150701"
    e_index = "20150702"
    features_as_list, ids = indexed_features(s_index, e_index)

    # Flatten every per-index feature list into one corpus, sizing the
    # topic count at roughly one topic per 25 features (at least 1 each).
    n_topics = 0
    all_features = []
    for index, feature in features_as_list:
        all_features += feature
        n_topics += len(feature) // 25 + 1

    # NOTE(review): `index` here is the leftover loop variable from the
    # last iteration above — presumably intentional, but it raises
    # NameError when `features_as_list` is empty. Confirm against
    # `create_corpus`'s expected first argument.
    dictionary, corpus = create_corpus(index, all_features)
    x, lda_model, lda_corpus = single_lda("20150703", corpus, dictionary, n_topics)

    # Project the LDA document-topic matrix to 2-D, cluster, and plot.
    x = pca(x, 2)
    km, (centroids, c, k) = kmeans(x, n_topics)
    cluster_plot_2d(x, centroids, c, k)

    # Group document positions by their assigned cluster label.
    clusters = {}
    for idx, label in enumerate(c):
        clusters.setdefault(label, []).append(idx)

    # Print up to five sample document indices from each cluster.
    for doc in clusters.values():
        for _id in doc[:5]:
            print(_id)