from optparse import OptionParser
import os

from sklearn import metrics
# NOTE(review): this import path was removed in scikit-learn >= 0.24; modern
# releases expose ENGLISH_STOP_WORDS from sklearn.feature_extraction.text.
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from WikiKmeans import WikiKmeans
from WikiSampleLoader import WikiSampleLoader
from WikiTfIdfVectorizer import WikiTfIdfVectorizer

# ---------------------------------------------------------------------------
# Default run: load the sample dataset, vectorize it, cluster with k-means,
# and report clustering quality against the known labels.
# ---------------------------------------------------------------------------

# Load dataset from the default sample file.
wsl = WikiSampleLoader()

# Enrich the standard English stop words with wiki markup/technical tokens
# (url, ref, jpg, ...) that would otherwise dominate the tf-idf weights
# without carrying topical meaning.
stop_w = set(ENGLISH_STOP_WORDS)
stop_w.update(['url', 'http', 'www', 'ref', 'jpg', 'file', 'com',
               'web', 'category', 'reference', 'title', 'org', 'br'])

w_tf_idf = WikiTfIdfVectorizer(stop_words=stop_w)
w_tf_idf.vectorize(wsl)

# Use the public accessors instead of reaching into private attributes
# (_X, _cluster_list, _labels) — consistent with the option-driven run below.
X = w_tf_idf.get_vectorized_dataset()
k = len(w_tf_idf.get_cluster_list())   # one cluster per known category
labels = w_tf_idf.get_label_vector()

wkm = WikiKmeans(k)
# Apply k-means and keep the fitted estimator for its labels_.
km = wkm.apply_K_means(X)

print(labels)
print(km.labels_)
# Report supervised clustering-quality metrics (ground truth vs. predicted).
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# ---------------------------------------------------------------------------
# Option-driven run: parse command-line options, then load, vectorize,
# cluster, and report clustering quality.
# ---------------------------------------------------------------------------

# Build the command-line parser.  BUG FIX: `op` was referenced below but never
# defined, so the script died with a NameError before reading any option.
# Defaults are review assumptions — confirm against project docs:
#   * dataset-file default mirrors the path seen in an earlier comment
#     (~/scikit_learn_data/20news-bydate.pkz).
op = OptionParser()
op.add_option("--dataset-file", dest="dataset_file",
              default=os.environ['HOME'] + "/scikit_learn_data/20news-bydate.pkz",
              help="Path of the .pkz dataset file to load.")
op.add_option("--no-idf", action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--n-features", type=int, dest="n_features", default=10000,
              help="Maximum number of features (dimensions) to extract.")
op.add_option("--use-hashing", action="store_true", dest="use_hashing",
              default=False,
              help="Use a hashing feature vectorizer.")
op.add_option("--minibatch", action="store_true", dest="minibatch",
              default=False,
              help="Use mini-batch k-means instead of plain k-means.")
op.add_option("--init", dest="init", default="k-means++",
              help="K-means initialization method (e.g. 'k-means++', 'random').")
op.add_option("--verbose", action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means.")

op.print_help()

(opts, args) = op.parse_args()

# Initialize WikiSampleLoader with the requested dataset file.
wsl = WikiSampleLoader(file_name=opts.dataset_file)

# Enrich the standard English stop words with wiki frequent technical tags
# to avoid overfitting on non-relevant terms.
stop_w = set(ENGLISH_STOP_WORDS)
stop_w.update(['url', 'http', 'www', 'ref', 'jpg', 'file', 'com',
               'web', 'category', 'reference', 'title', 'org', 'br'])

w_tf_idf = WikiTfIdfVectorizer(stop_words=stop_w,
                               use_idf=opts.use_idf,
                               n_features=opts.n_features,
                               use_hashing=opts.use_hashing)
w_tf_idf.vectorize(wsl)

# Get the vectorized dataset and ground-truth labels via public accessors.
X = w_tf_idf.get_vectorized_dataset()
k = len(w_tf_idf.get_cluster_list())   # one cluster per known category
labels = w_tf_idf.get_label_vector()

wkm = WikiKmeans(k, verbose=opts.verbose, mini_batch=opts.minibatch,
                 init=opts.init)
# Apply k-means and keep the fitted estimator for its labels_.
km = wkm.apply_K_means(X)

# Report supervised clustering-quality metric (ground truth vs. predicted).
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))