def main():
    """Search for feature weights by probing them in a random order.

    Every feature starts at START_WEIGHT.  Up to MAX_ITERATIONS passes are
    run; each pass shuffles the feature indices and hands them to
    probeWeight().  The search stops early as soon as a pass leaves the
    weight list unchanged.

    The number of passes is printed to stdout, and the final weights
    (tab-separated) followed by the last rand index are printed to stderr.
    """
    businesses = features.getBusinesses(DATA)
    truthPairs, truthIds = metrics.getGoldTruthPairs()

    # Alternative starting point tried previously:
    # featureDistanceMap.DEFAULT_WEIGHTS
    weights = [START_WEIGHT] * featureDistanceMap.NUM_FEATURES
    featureFunctionMapping = getFeatureMapping()

    # The same dict is passed to every probeWeight() call, so whatever it
    # stores is shared across passes.
    cache = {}

    # Defaults so the report below still works when MAX_ITERATIONS <= 0 and
    # the loop body never runs (previously a NameError).
    iteration = -1
    randIndex = float("nan")

    oldWeights = list(weights)
    for iteration in range(MAX_ITERATIONS):
        weightsOrder = list(range(len(weights)))
        random.shuffle(weightsOrder)

        # NOTE(review): probeWeight is presumably mutating `weights` in
        # place -- the convergence check below compares against a snapshot
        # taken before the call.  Confirm against its definition.
        randIndex = probeWeight(weights, weightsOrder, 0, K,
                                featureFunctionMapping, businesses,
                                truthPairs, truthIds, cache)

        if oldWeights == weights:
            break
        oldWeights = list(weights)

    # Named weightsText rather than `id` to avoid shadowing the builtin.
    weightsText = "\t".join([str(weight) for weight in weights])

    # NOTE(review): this message is printed even when the loop used up all
    # MAX_ITERATIONS passes without the weights settling.
    print("Converged in %d / %d iterations" % (iteration + 1, MAX_ITERATIONS))
    print("Final Weights: %s\t%f" % (weightsText, randIndex), file=sys.stderr)
def run():
    """Cluster the human-evaluation businesses and print the outcome.

    Loads businesses from data.DATA_SOURCE_HUMAN_EVAL, builds a
    FeatureDistanceMap from learnWeights.getFeatureMapping() and WEIGHTS,
    and clusters with clustering.KMeans using K.  For each cluster the
    index, size and sorted member names are printed, followed by the rand
    index computed against the human truth pairs.
    """
    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)

    # Alternative kept from the original: a FeatureDistanceMap() built with
    # no arguments instead of the learned mapping and WEIGHTS.
    distanceMap = featureDistanceMap.FeatureDistanceMap(
        learnWeights.getFeatureMapping(), WEIGHTS)
    clusterer = clustering.KMeans(K, distanceMap)
    clusters = clusterer.cluster(businesses)

    for clusterId in range(len(clusters)):
        members = clusters[clusterId]
        print("Cluster: %02d, Size: %02d" % (clusterId, len(members)))

        # Sort the raw names first, then stringify each for display.
        names = sorted([businesses[member].otherInfo['name']
                        for member in members])
        print(" %s" % ", ".join(map(str, names)))

    # Metrics
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    score = metrics.randIndex(clusters, businesses, truthPairs, truthIds)
    print("Rand Index: %f" % (score))
def run(businessType):
    """Cluster businesses of the given type and print two rand indexes.

    Args:
        businessType: passed straight to features.getBusinesses().

    Prints each cluster's index, size and sorted member names, then the
    "old" rand index (metrics.oldRandIndex against the gold labels read
    from "../data/groundtruth") and the "new" one (metrics.randIndex).
    """
    businesses = features.getBusinesses(businessType)

    # K is an arbitrary choice here.
    distanceMap = featureDistanceMap.FeatureDistanceMap()
    clusterer = clustering.KMeans(K, distanceMap)
    clusters = clusterer.cluster(businesses)

    for clusterId in range(len(clusters)):
        members = clusters[clusterId]
        print("Cluster: %02d, Size: %02d" % (clusterId, len(members)))

        # Sort the raw names first, then stringify each for display.
        names = sorted([businesses[member].otherInfo['name']
                        for member in members])
        print(" %s" % ", ".join(map(str, names)))

    # Metrics
    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    clusterBusinessIds = metrics.getClusterBusinessID(businesses, clusters)
    oldScore = metrics.oldRandIndex(clusterBusinessIds, goldLabel)
    print("Old Rand Index: " + str(oldScore))

    newScore = metrics.randIndex(clusters, businesses)
    print("New Rand Index: %f" % (newScore))
def run(weights, k, scalarNorm, setDistance):
    """Cluster the ground-truth businesses and return the old rand index.

    Args:
        weights: passed to FeatureDistanceMap alongside the feature mapping.
        k: passed to clustering.KMeans as its first argument.
        scalarNorm: forwarded to buildFeatureMapping().
        setDistance: forwarded to buildFeatureMapping().

    Returns:
        The value of metrics.oldRandIndex() for the produced clusters
        against the gold labels read from "../data/groundtruth".

    Each cluster's index, size and sorted member yelpIds are printed along
    the way.
    """
    businesses = features.getBusinesses(data.DATA_SOURCE_GROUNDTRUTH_ALL)

    mapping = buildFeatureMapping(scalarNorm, setDistance)
    distanceMap = featureDistanceMap.FeatureDistanceMap(mapping, weights)
    clusterer = clustering.KMeans(k, distanceMap)
    clusters = clusterer.cluster(businesses)

    for clusterId in range(len(clusters)):
        members = clusters[clusterId]
        print("Cluster: %02d, Size: %02d" % (clusterId, len(members)))

        # Sort the raw ids first, then stringify each for display.
        yelpIds = sorted([businesses[member].otherInfo['yelpId']
                          for member in members])
        print(" %s" % ", ".join(map(str, yelpIds)))

    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    clusterBusinessIds = metrics.getClusterBusinessID(businesses, clusters)
    return metrics.oldRandIndex(clusterBusinessIds, goldLabel)