def run(self):
    """Drive the full spectral clustering pipeline.

    Follows the notation of Ulrike von Luxburg's "A Tutorial on Spectral
    Clustering": degree matrix -> unnormalized Laplacian -> normalized
    Laplacian -> eigenvector matrix -> row-normalized embedding, then
    k-means on the embedded points.

    Returns:
        dict: cluster ID -> list of vector IDs.
    """
    self.logger.debug('Generating Degree Matrix')
    self.getD()
    self.logger.debug('Generating Unnormalized Laplacian Matrix')
    self.getL()
    self.logger.debug('Generating Normalized Laplacian Matrix')
    self.getLsym()
    self.logger.debug('Generating Eigenvectors Matrix')
    self.getU()
    self.logger.debug('Generating Normalized Eigenvectors Matrix')
    self.getT()
    self.logger.debug('Doing KMeans')
    # Rows of T become the spectral embedding of the original vectors.
    embedding = (self.T.T).tocsc()
    if self.k is None:
        # No cluster count supplied: estimate one from a sample.
        self.k = corpusutil.find_no_clusters(
            X=embedding,
            samplesize=self.SAMPLE_SIZE_PERCENT,
            mink=self.MIN_K,
            maxk=self.MAX_K,
            classical=self.classical,
            verbose=self.verbose,
        )
        self.logger.debug('k found to be %d', self.k)
    clusterer = corpusutil.KMeans(
        data=embedding,
        k=self.k,
        n=self.n,
        delta=self.delta,
        randomcentroids=self.randomcentroids,
        verbose=self.verbose,
        classical=self.classical,
    )
    return clusterer.run()['clusters']
def main():
    """Command-line driver: vectorize a corpus, cluster with KMeans, emit HTML.

    Builds (or loads) the document-term matrix, optionally estimates k,
    runs KMeans, optionally checkpoints intermediate matrices to disk, and
    writes concept-cloud and feature-cloud visualizations named by the
    session id from the command line.
    """
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    # "classical" weighting implies normalized vectors; otherwise raw counts.
    normalize = True if args.classical else False
    if args.stopwords is None:
        stopwords = None
    else:
        stopwords = args.stopwords.read().split()
    if args.opinion or args.corpus:
        # Build vectors directly from a corpus (fresh or pickled).
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d ", len(corpus))
        datacreator = corpusutil.GenerateVectors(
            corpus=corpus,
            mindfpercent=args.mindfpercent,
            maxdfpercent=args.maxdfpercent,
            minfrequency=args.minfrequency,
            verbose=args.verbose,
            usebigrams=args.usebigrams,
            normalize=normalize,
            tf=args.tf,
            stopwords=stopwords,
        )
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        # Rebuild vectors from previously saved index / featuredict / docids.
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # BUG FIX: ndocs_content was an undefined name on this path
        # (NameError). Take it as the number of loaded doc ids —
        # TODO(review): confirm against GenerateVectors' expectations.
        ndocs_content = len(docids)
        # BUG FIX: corpus was also undefined on this path but is passed to
        # genconceptclouds below. NOTE(review): verify genconceptclouds
        # tolerates corpus=None.
        corpus = None
        datacreator = corpusutil.GenerateVectors(
            index=index,
            featuredict=featuredict,
            docids=docids,
            ndocs_content=ndocs_content,
            normalize=normalize,
            tf=args.tf,
        )
        result = datacreator.create()
    data = result['data']
    if args.k is None:
        SAMPLE_SIZE_PERCENT = 50
        MIN_K = 2
        MAX_K = 50
        # BUG FIX: message previously contained a literal backslash from a
        # line continuation placed inside the string.
        logger.debug('k not set, finding k using sample size: %f',
                     SAMPLE_SIZE_PERCENT)
        k = corpusutil.find_no_clusters(X=data,
                                        samplesize=SAMPLE_SIZE_PERCENT,
                                        mink=MIN_K,
                                        maxk=MAX_K,
                                        verbose=args.verbose,
                                        classical=args.classical)
    else:
        k = args.k
    if args.saveint:
        # Checkpoint the docid key and the raw data matrix (handles closed
        # deterministically via context managers).
        with open("data_key_" + sessionid + '.pck', 'w') as keyfile:
            cPickle.dump(docids, keyfile)
        with open("data_" + sessionid + ".mtx", 'w') as mtxfile:
            spio.mmwrite(mtxfile, data, comment="CSC Matrix", field='real')
    kmeans = corpusutil.KMeans(data=data,
                               k=k,
                               n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        with open("data_clusters_" + sessionid + '.pck', 'w') as clustfile:
            cPickle.dump(clusters, clustfile)
        with open("data_centroids_" + sessionid + '.mtx', 'w') as centfile:
            spio.mmwrite(centfile, centroids, comment="CSC Matrix",
                         field='real')
    logger.info(" %d Clusters Generated ", len(clusters))
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    with open("kmeans-concept_clouds_" + str(sessionid) + '.html',
              'w') as kmeansvis:
        kmeansvis.write(vis_output)
    vis_output = corpusutil.genfeatureclouds(centroids.todense(),
                                             centroiddict,
                                             featuredict,
                                             sessionid)
    with open("kmeans-feature_clusters_" + str(sessionid) + '.html',
              'w') as kmeansvis:
        kmeansvis.write(vis_output)