def test():
    """Cluster sample documents from five categories and return the result.

    Reads 50 documents for each category id, tokenizes them with stopword
    removal, builds a TF-IDF weight matrix via scikit-learn, and runs the
    project's KMeans with one cluster per category.

    Returns:
        tuple: (cluster result from tc.KMeans, list of category id labels).
    """
    # NOTE(review): 'C31-Enviornment' is misspelled but presumably matches
    # the on-disk category id — confirm before correcting the string.
    kind_list = ['C3-Art', 'C19-Computer', 'C7-History',
                 'C32-Agriculture', 'C31-Enviornment']
    stopwords = tl.read_stopwords()

    # Tokenize every document, dropping stopwords.
    doc = []
    for kind in kind_list:
        for raw_doc in tl.read_kind(kind, 50):
            doc.append(tl.cut_without_stopwords(raw_doc, stopwords))
    print('分词完成')

    # Join token lists with spaces so CountVectorizer can consume them.
    pdoc = [' '.join(tokens) for tokens in doc]

    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(pdoc))
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API, falling back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out()  # vocabulary terms
    else:
        word = vectorizer.get_feature_names()
    weight = tfidf.toarray()  # dense TF-IDF matrix; one row per document

    cluster = tc.KMeans(weight, len(kind_list))
    print('获得分类结果')
    return cluster, kind_list
def runClustering(weights, k, featureFunctionMapping, businesses, truthPairs, truthIds, cache):
    """Cluster the businesses with the given feature weights and score the result.

    Results are memoized in |cache| keyed by (weights, k); a failed run is
    cached as -1 so the same parameters are not retried.

    Args:
        weights: per-feature weight list fed to FeatureDistanceMap.
        k: number of clusters.
        featureFunctionMapping: feature -> distance-function mapping.
        businesses: items to cluster.
        truthPairs, truthIds: ground truth for metrics.randIndex.
        cache: dict used for memoization; mutated in place.

    Returns:
        The Rand index of the clustering, or -1 on error.
    """
    cacheId = paramId(weights, k)
    if cacheId in cache:
        return cache[cacheId]

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        featureFunctionMapping, weights)
    kMeans = clustering.KMeans(k, featureDistMap)

    # Renamed from `id`, which shadowed the builtin.
    runLabel = "\t".join([str(weight) for weight in weights])
    randIndex = -1
    try:
        clusters = kMeans.cluster(businesses)
        randIndex = metrics.randIndex(clusters, businesses, truthPairs, truthIds)
        print("%s\t%f" % (runLabel, randIndex), file=sys.stderr)
    except Exception as ex:
        # Best-effort sweep: log the failure and record -1 rather than abort.
        print(ex)
        print("%s\tERROR" % (runLabel), file=sys.stderr)

    cache[cacheId] = randIndex
    return randIndex
def run():
    """Cluster the human-eval businesses with the learned feature weights,
    print each cluster's member names, and report the Rand index against
    the human truth pairs."""
    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)
    distMap = featureDistanceMap.FeatureDistanceMap(
        learnWeights.getFeatureMapping(), WEIGHTS)
    clusters = clustering.KMeans(K, distMap).cluster(businesses)

    for clusterIndex, members in enumerate(clusters):
        print("Cluster: %02d, Size: %02d" % (clusterIndex, len(members)))
        names = sorted(businesses[m].otherInfo['name'] for m in members)
        print("   %s" % (", ".join(str(name) for name in names)))

    # Metrics
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    print("Rand Index: %f" % (metrics.randIndex(clusters, businesses, truthPairs, truthIds)))
def run(businessType):
    """Cluster businesses of |businessType| with default feature distances,
    print each cluster's member names, and report both Rand index variants
    against the ground-truth file."""
    businesses = features.getBusinesses(businessType)

    # Arbitrary K
    clusterer = clustering.KMeans(K, featureDistanceMap.FeatureDistanceMap())
    clusters = clusterer.cluster(businesses)

    for position, members in enumerate(clusters):
        print("Cluster: %02d, Size: %02d" % (position, len(members)))
        sortedNames = sorted(businesses[m].otherInfo['name'] for m in members)
        print("   %s" % (", ".join(str(name) for name in sortedNames)))

    # Metrics
    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    randIndex = metrics.oldRandIndex(b_cluster, goldLabel)
    print("Old Rand Index: " + str(randIndex))
    print("New Rand Index: %f" % (metrics.randIndex(clusters, businesses)))
def run(weights, k, scalarNorm, setDistance):
    """Cluster the full ground-truth businesses using the supplied weights
    and distance configuration; print cluster membership by yelpId and
    return the old-style Rand index against the ground-truth file."""
    businesses = features.getBusinesses(data.DATA_SOURCE_GROUNDTRUTH_ALL)
    mapping = buildFeatureMapping(scalarNorm, setDistance)
    distMap = featureDistanceMap.FeatureDistanceMap(mapping, weights)
    clusters = clustering.KMeans(k, distMap).cluster(businesses)

    for clusterNum, members in enumerate(clusters):
        print("Cluster: %02d, Size: %02d" % (clusterNum, len(members)))
        yelpIds = sorted(businesses[m].otherInfo['yelpId'] for m in members)
        print("   %s" % (", ".join(str(yelpId) for yelpId in yelpIds)))

    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    return metrics.oldRandIndex(b_cluster, goldLabel)
def test_kmeansBase(self):
    """Three well-separated groups of three points must each become a cluster."""
    points = [
        business.Business(10, [0, 0, 0]),
        business.Business(20, [1, 1, 1]),
        business.Business(30, [2, 2, 2]),
        business.Business(411, [10, 10, 10]),
        business.Business(511, [11, 11, 11]),
        business.Business(611, [12, 12, 12]),
        business.Business(7123, [110, 110, 110]),
        business.Business(8123, [111, 111, 111]),
        business.Business(9123, [112, 112, 112]),
    ]
    expectedClusters = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]

    def scalarManhattan(a, b):
        # Adapt the list-based distance to single scalar features.
        return distance.manhattan([a], [b])

    distMap = featureDistanceMap.FeatureDistanceMap([scalarManhattan] * 3)
    kMeans = clustering.KMeans(3, distMap)
    self.assertEqual(sorted(kMeans.cluster(points)), sorted(expectedClusters))