import cPickle
import logging
import math

import numpy as np
import scipy.io as spio
import scipy.sparse as ssp
from sparsesvd import sparsesvd

import corpusutil

# gen_args() (argument-parser construction) is assumed to be defined
# elsewhere in this module.


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    # Classical k-means works on unit-length columns, so normalize whenever
    # the classical flag is set.
    normalize = bool(args.classical)
    stopwords = args.stopwords.read().split() if args.stopwords else None
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE: ndocs_content was undefined on this path in the original;
        # len(docids) is assumed here so the call can run. The corpus
        # variable used by genconceptclouds below is likewise only bound on
        # the opinion/corpus paths.
        ndocs_content = len(docids)
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()
    data = result['data']
    p = data.shape[0]
    n = data.shape[1]
    logger.debug("Vectors are of dimensions: (%d,%d)", p, n)
    if args.saveint:
        cPickle.dump(docids, open("tfidfvectors_key_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("tfidfvectors_" + sessionid + ".mtx", 'w'),
                     data, comment="CSC Matrix", field='real')
    # DEFAULT_RANK chosen because it works well in practice.
    DEFAULT_RANK = 250
    r = args.r
    maxr = min(p, n)
    logger.debug("Data can have rank no greater than: %d", maxr)
    if maxr >= DEFAULT_RANK:
        # Fall back to DEFAULT_RANK when the requested rank lies outside
        # [DEFAULT_RANK, maxr].
        if DEFAULT_RANK > r or r > maxr:
            r = DEFAULT_RANK
    else:
        r = int(maxr / 2)
    logger.debug("Going to generate rank %d approximation", r)
    ut, s, vt = sparsesvd(data, r)
    red_data = ssp.csc_matrix(np.dot(ut.T, np.dot(np.diag(s), vt)))
    logger.debug("Generated rank %d approximation", r)
    if normalize:
        logger.debug("Normalizing columns of reduced rank matrix...")
        invnorms = np.zeros(n)
        normsii = np.arange(0, n, 1)
        normsjj = np.arange(0, n, 1)
        for col in range(n):
            invnorms[col] = math.sqrt((red_data[:, col].T * red_data[:, col]).todense())
            if invnorms[col] != 0:
                invnorms[col] = 1 / invnorms[col]
        diag = ssp.coo_matrix((invnorms, (normsii, normsjj)), shape=(n, n)).tocsc()
        red_data = red_data * diag
    logger.debug("Doing KMeans on reduced rank matrix...")
    kmeans = corpusutil.KMeans(data=red_data, k=args.k, n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        cPickle.dump(clusters, open("redrank_clusters_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("redrank_centroids_" + sessionid + '.mtx', 'w'),
                     centroids, comment="CSC Matrix", field='real')
    logger.info("%d clusters generated", len(clusters))
    result = corpusutil.getcentroids(data, clusters)
    originalmat_centroids = result['centroids']
    originalmat_centroiddict = result['centroiddict']
    if args.saveint:
        spio.mmwrite(open("originalmat_centroids_" + sessionid + '.mtx', 'w'),
                     originalmat_centroids, comment="CSC Matrix", field='real')
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    svdkmeansvis = open("svdkmeans-concept_clouds_" + str(sessionid) + '.html', 'w')
    svdkmeansvis.write(vis_output)
    svdkmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(originalmat_centroids.todense(),
                                             originalmat_centroiddict,
                                             featuredict, sessionid)
    svdkmeansvis = open("svdkmeans-feature_clusters_" + str(sessionid) + '.html', 'w')
    svdkmeansvis.write(vis_output)
    svdkmeansvis.close()
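# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the rank-reduction and column-
# normalization step performed in main() above. It is illustrative only: it
# uses scipy.sparse.linalg.svds as a stand-in for sparsesvd (sparsesvd
# returns ut with shape (r, p), while svds returns u with shape (p, r)), and
# the random matrix, rank, and function name are assumptions, not part of
# the original script.
def _lowrank_normalize_sketch():
    import numpy as np
    import scipy.sparse as ssp
    from scipy.sparse.linalg import svds

    rng = np.random.RandomState(0)
    data = ssp.csc_matrix(rng.rand(40, 30))  # p x n feature-document matrix
    r = 5                                    # target rank, r < min(p, n)

    # Rank-r approximation: data ~= u * diag(s) * vt.
    u, s, vt = svds(data, k=r)
    red_data = ssp.csc_matrix(np.dot(u, np.dot(np.diag(s), vt)))

    # Rescale every column to unit Euclidean norm by right-multiplying with
    # a sparse diagonal matrix of inverse norms; zero columns get a zero
    # scale factor and stay zero, mirroring the loop in main().
    norms = np.sqrt(red_data.multiply(red_data).sum(axis=0)).A1
    invnorms = np.where(norms != 0, 1.0 / norms, 0.0)
    n = len(invnorms)
    return red_data * ssp.spdiags(invnorms, 0, n, n).tocsc()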
# Spectral clustering driver (companion script to the SVD k-means driver
# above); gen_args() and SpectralClusterer are assumed to be defined
# elsewhere in its module.
import cPickle
import logging

import corpusutil


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    stopwords = args.stopwords.read().split() if args.stopwords else None
    normalize = bool(args.classical)
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE: ndocs_content was undefined on this path in the original;
        # len(docids) is assumed here so the call can run. The corpus
        # variable used by genconceptclouds below is likewise only bound on
        # the opinion/corpus paths.
        ndocs_content = len(docids)
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()
    X = result['data']
    if args.k is None:
        # No k supplied: let the clusterer search over a range of cluster
        # counts.
        MIN_K = 2
        MAX_K = 50
        SAMPLE_SIZE_PERCENT = 100
        spectral = SpectralClusterer(X=X, usecosine=args.usecosine,
                                     sigma=args.sigma, n=args.n,
                                     delta=args.delta, MIN_K=MIN_K,
                                     MAX_K=MAX_K,
                                     SAMPLE_SIZE_PERCENT=SAMPLE_SIZE_PERCENT,
                                     randomcentroids=args.randomcentroids,
                                     classical=args.classical,
                                     verbose=args.verbose)
    else:
        spectral = SpectralClusterer(X=X, usecosine=args.usecosine,
                                     sigma=args.sigma, k=args.k, n=args.n,
                                     delta=args.delta,
                                     randomcentroids=args.randomcentroids,
                                     classical=args.classical,
                                     verbose=args.verbose)
    clusters = spectral.run()
    result = corpusutil.getcentroids(X, clusters, normalize)
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    logger.info("%d clusters generated", len(clusters))
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    kmeansvis = open("Spectral-concept_clouds_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(centroids.todense(), centroiddict,
                                             featuredict, sessionid)
    kmeansvis = open("Spectral-feature_clusters_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
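# ---------------------------------------------------------------------------
# The internals of SpectralClusterer are not shown in this script. The sketch
# below is a generic normalized spectral clustering routine in the
# Ng-Jordan-Weiss style, included only to illustrate what the usecosine,
# sigma, and k parameters above typically control; the function name and all
# details are assumptions, not the actual SpectralClusterer implementation.
def _spectral_sketch(X, k, sigma=1.0, usecosine=True):
    """Cluster the n columns of a dense p x n matrix X into k groups."""
    import numpy as np
    from scipy.cluster.vq import kmeans2

    X = np.asarray(X, dtype=float)
    if usecosine:
        # Cosine affinity between document columns.
        norms = np.linalg.norm(X, axis=0)
        norms[norms == 0] = 1.0
        Xn = X / norms
        A = np.dot(Xn.T, Xn)
    else:
        # Gaussian affinity on squared Euclidean distances, scaled by sigma.
        sq = np.sum(X ** 2, axis=0)
        d2 = sq[:, None] + sq[None, :] - 2.0 * np.dot(X.T, X)
        A = np.exp(-d2 / (2.0 * sigma ** 2))
    np.fill_diagonal(A, 0.0)

    # Embed documents as rows of the top-k eigenvectors of the symmetrically
    # normalized affinity D^{-1/2} A D^{-1/2}, renormalized to unit length.
    d = A.sum(axis=1)
    d[d == 0] = 1.0
    dinv = 1.0 / np.sqrt(d)
    L = dinv[:, None] * A * dinv[None, :]
    _, V = np.linalg.eigh(L)  # eigenvalues in ascending order
    U = V[:, -k:]
    rownorms = np.maximum(np.linalg.norm(U, axis=1), 1e-12)
    U = U / rownorms[:, None]

    # k-means on the spectral embedding; labels[i] is document i's cluster.
    _, labels = kmeans2(U, k, minit='points')
    return labels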