# Script: project the full term-document matrix into the space of a previously
# saved 250-component SVD model, cluster the result with k-means (63 clusters),
# and persist the fitted clustering model.

# Third-party imports.
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

# Project-local imports.
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument

corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)

# Build the sparse matrix with a fixed number of columns (54334).
TDM = corpus.toCsrMatrix(shapeCols=54334)

# Disabled one-off step (kept for reference): fit the SVD model and save it
# under the name that is loaded below.
#
# svd = TruncatedSVD(n_components=250)
# svd.fit(TDM)
# joblib.dump(svd, "lsi250-model")

# Load the saved SVD model and transform the matrix with it.
svd2 = joblib.load("lsi250-model")
LSI_TDM = svd2.transform(TDM)

# Cluster the transformed documents.
km = KMeans(
    n_clusters=63,
    init='k-means++',
    max_iter=100,
    n_init=10,
)
km.fit(LSI_TDM)

# NOTE: the spelling "sklean" in the file name is kept on purpose; anything
# that loads this model by name depends on the exact string.
joblib.dump(km, "km63-sklean_lsi250")

# Disabled follow-up step (kept for reference): reload the saved clustering
# model and write one "<doc id>;<cluster>" line per document.  As written it
# would not run: `doc` and `npArray` are never assigned inside the loop.
#
# clModel = joblib.load("km63-sklean_lsi250")
# # log = open("clusters-km63-sklearn_lsi250", "w")
# log = open("foo", "w")
# count = 0
# for arr in LSI_TDM:
#     # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex)
#     log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n")
#     count += 1
# log.close()
# Script: draw a seeded random sample (each document kept with probability
# 0.1) from the corpus and transform it with a previously saved SVD model.

from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import AffinityPropagation, MeanShift
# NOTE(review): sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and
# removed in 0.20 (replaced by GaussianMixture).  The import is kept because
# this script targets the older release; it is not used in the code below.
from sklearn.mixture import GMM
import random

# Seed the module-level generator so the random draws made by the selection
# callback repeat from run to run.
random.seed(0)

corpusFilepath = "raw_data/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)

# The comparison already yields a bool, so no "True if ... else False" wrapper
# is needed.
TDM = corpus.toCsrMatrix(
    shapeCols=54334,
    selection=lambda doc: random.random() < 0.1,
)

# Single-argument parenthesised form prints the same text on Python 2 and 3.
print("TDM shape: " + str(TDM.shape))

# Load the saved SVD model and transform the sampled matrix with it.
svd2 = joblib.load("models/lsi250-model")
LSI_TDM = svd2.transform(TDM)

# Disabled experiment (kept for reference): affinity propagation clustering.
#
# ap = AffinityPropagation(
#     damping=0.95,
#     max_iter=200,
#     convergence_iter=15,
#     copy=True,
#     preference=None,
#     affinity='euclidean',
#     verbose=False
# )
# ap.fit(LSI_TDM)
from sklearn.decomposition import TruncatedSVD from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument import joblib import numpy as np from sklearn.cluster import AffinityPropagation, MeanShift from sklearn.mixture import GMM import random random.seed(0) corpusFilepath = "raw_data/raw_vector.json" corpus = ArffJsonCorpus(corpusFilepath) TDM = corpus.toCsrMatrix(shapeCols = 54334, selection = lambda doc: True if random.random() < 0.1 else False) print "TDM shape: " + str(TDM.shape) svd2 = joblib.load("models/lsi250-model") LSI_TDM = svd2.transform(TDM) #ap = AffinityPropagation( # damping=0.95, # max_iter=200, # convergence_iter=15, # copy=True, # preference=None, # affinity='euclidean', # verbose=False #) # ap.fit(LSI_TDM) """ms = MeanShift(
# Script: load the corpus as a sparse term-document matrix, transform it with
# a saved 250-component SVD model, fit a 63-cluster k-means model on the
# result and dump that model to disk.

# Third-party imports.
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

# Project-local imports.
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument

# --- Input -----------------------------------------------------------------
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
TDM = corpus.toCsrMatrix(shapeCols=54334)

# --- Dimensionality reduction ------------------------------------------------
# The SVD model is loaded from disk rather than refitted.  The disabled lines
# below show how the file that is loaded here was produced.
#
# svd = TruncatedSVD(n_components=250)
# svd.fit(TDM)
# joblib.dump(svd, "lsi250-model")
svd2 = joblib.load("lsi250-model")
LSI_TDM = svd2.transform(TDM)

# --- Clustering ----------------------------------------------------------------
km = KMeans(
    n_clusters=63,
    init="k-means++",
    max_iter=100,
    n_init=10,
)
km.fit(LSI_TDM)

# The output name (including the "sklean" spelling) is part of the script's
# observable behaviour and is left untouched.
joblib.dump(km, "km63-sklean_lsi250")

# --- Disabled: per-document cluster log ------------------------------------------
# Kept for reference only.  The loop refers to `doc` and `npArray`, neither of
# which is assigned in it, so it cannot be re-enabled without changes.
#
# clModel = joblib.load("km63-sklean_lsi250")
# # log = open("clusters-km63-sklearn_lsi250", "w")
# log = open("foo", "w")
# count = 0
# for arr in LSI_TDM:
#     # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex)
#     log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n")
#     count += 1
# log.close()