from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans

# Cluster zbMath documents: project the raw term-document matrix into a
# 250-dimensional LSI space and fit a 63-cluster KMeans model on it.

# Path to the raw term-document vectors (ArffJson format).
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)

# Sparse term-document matrix; 54334 columns = vocabulary size, and must
# match the dimensionality the persisted LSI model below was trained with.
TDM = corpus.toCsrMatrix(shapeCols=54334)

# One-time training step -- already done, model persisted as "lsi250-model":
# svd = TruncatedSVD(n_components=250)
# svd.fit(TDM)
# joblib.dump(svd, "lsi250-model")

# Reduce the corpus to the 250-dimensional LSI representation.
svd2 = joblib.load("lsi250-model")
LSI_TDM = svd2.transform(TDM)

# Cluster the LSI-reduced documents into 63 clusters and persist the model.
km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10)
km.fit(LSI_TDM)
joblib.dump(km, "km63-sklean_lsi250")

# Disabled labelling step, kept for reference (note: references `doc`,
# `sparseData2Matrix`, and `index2chiIndex`, which are not defined here):
# clModel = joblib.load("km63-sklean_lsi250")
# # log = open("clusters-km63-sklearn_lsi250", "w")
# log = open("foo", "w")
# count = 0
# for arr in LSI_TDM:
#     # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex)
#     log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n")
#     count += 1
# log.close()
if __name__ == "__main__":
    # Assign a GMM cluster label to every document in the raw corpus and
    # write one "<doc-id>;<cluster>" line per document to the results file.
    corpusFilepath = "/home/simon/Projekte/MIRS/testing_java_ml_libraries/raw_vector.json"

    # One-time training steps -- already done, models persisted under models/:
    # TDM_full_text = load_csr_matrix("derived_data/zb_math_full_text_tdm.npz")
    # tfidf_trans = TfidfTransformer()
    # tfidf_trans.fit(TDM_full_text)
    # joblib.dump(tfidf_trans, "models/tfidf_full_text_model")
    # TDM_full_text_reweighted = tfidf_trans.transform(TDM_full_text)
    # km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10)
    # km.fit(TDM_full_text_reweighted)
    # joblib.dump(km, "models/km63-full_text_tfidf")
    # g = fitGmmModel(getTDM())
    # joblib.dump(g, gmmModelFile)

    clModel = joblib.load("models/gmm-sklean_lsi250")
    corpus = ArffJsonCorpus("raw_data/raw_vector.json")
    lsi_model = joblib.load("models/lsi250-model")

    # Context manager guarantees the log file is closed even if transform()
    # or predict() raises mid-loop (the original opened/closed it manually,
    # leaking the handle on error).
    with open("results/clusters-gmm-sklean_lsi250", "w") as log:
        for doc in corpus:
            # 54334 = vocabulary size the LSI model was trained with; the
            # sparse document vector must have that many columns.
            sparseDoc = sparseData2Matrix(doc, 54334)
            arr = lsi_model.transform(sparseDoc)
            log.write(doc.id + ";" + str(clModel.predict(arr)[0]) + "\n")
            # Flush per document so progress is observable while running.
            log.flush()