# Note: this snippet omits its imports (sys, clustering, featureDistanceMap,
# metrics) and the paramId() helper used to build the cache key.
def runClustering(weights, k, featureFunctionMapping, businesses, truthPairs,
                  truthIds, cache):
    cacheId = paramId(weights, k)
    if (cacheId in cache):
        return cache[cacheId]

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        featureFunctionMapping, weights)
    kMeans = clustering.KMeans(k, featureDistMap)

    id = "\t".join([str(weight) for weight in weights])
    randIndex = -1

    try:
        clusters = kMeans.cluster(businesses)
        randIndex = metrics.randIndex(clusters, businesses, truthPairs,
                                      truthIds)
        print("%s\t%f" % (id, randIndex), file=sys.stderr)
        '''
        for i in range(len(clusters)):
            print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
            print("         %s" % (", ".join([str(x) for x in sorted([businesses[index].otherInfo['yelpId'] for index in clusters[i]])])))
        '''
    except Exception as ex:
        print(ex)
        print("%s\tERROR" % (id), file=sys.stderr)

    cache[cacheId] = randIndex

    return randIndex
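# Hypothetical driver (not part of the original source): a tiny grid search
# over candidate weight vectors using runClustering() above. It assumes the
# features / data / metrics / learnWeights helpers that appear in the other
# snippets here, that the feature mapping is a sequence (so len() gives the
# number of features), and that K is the module-level cluster count. The
# candidate weight values are illustrative only.
def sweepWeights():
    import itertools

    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    featureFunctionMapping = learnWeights.getFeatureMapping()

    cache = {}
    best = (None, -1)

    for weights in itertools.product([0.0, 0.5, 1.0],
                                     repeat=len(featureFunctionMapping)):
        randIndex = runClustering(list(weights), K, featureFunctionMapping,
                                  businesses, truthPairs, truthIds, cache)
        if (randIndex > best[1]):
            best = (list(weights), randIndex)

    # Returns (bestWeights, bestRandIndex).
    return best
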
def run():
    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        learnWeights.getFeatureMapping(), WEIGHTS)
    kMeans = clustering.KMeans(K, featureDistMap)

    # kMeans = clustering.KMeans(K, featureDistanceMap.FeatureDistanceMap())

    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([str(x) for x in sorted([businesses[index].otherInfo['name'] for index in clusters[i]])])))

    # Metrics
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    print("Rand Index: %f" % (metrics.randIndex(clusters, businesses, truthPairs, truthIds)))
def run(businessType):
    businesses = features.getBusinesses(businessType)

    # Arbitrary K
    kMeans = clustering.KMeans(K, featureDistanceMap.FeatureDistanceMap())
    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([
            str(x) for x in sorted(
                [businesses[index].otherInfo['name'] for index in clusters[i]])
        ])))

    # Metrics
    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    randIndex = metrics.oldRandIndex(b_cluster, goldLabel)
    print("Old Rand Index: " + str(randIndex))

    print("New Rand Index: %f" % (metrics.randIndex(clusters, businesses)))
def run(weights, k, scalarNorm, setDistance):
    businesses = features.getBusinesses(data.DATA_SOURCE_GROUNDTRUTH_ALL)

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        buildFeatureMapping(scalarNorm, setDistance), weights)
    kMeans = clustering.KMeans(k, featureDistMap)
    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([
            str(x) for x in sorted([
                businesses[index].otherInfo['yelpId'] for index in clusters[i]
            ])
        ])))

    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    randIndex = metrics.oldRandIndex(b_cluster, goldLabel)

    return randIndex
    # Fragment of a unittest.TestCase subclass; the enclosing class definition
    # and its imports (unittest, business, clustering, distance,
    # featureDistanceMap) are not included in this snippet.
    def test_kmeansBase(self):
        data = [
            business.Business(10, [0, 0, 0]),
            business.Business(20, [1, 1, 1]),
            business.Business(30, [2, 2, 2]),

            business.Business(411, [10, 10, 10]),
            business.Business(511, [11, 11, 11]),
            business.Business(611, [12, 12, 12]),

            business.Business(7123, [110, 110, 110]),
            business.Business(8123, [111, 111, 111]),
            business.Business(9123, [112, 112, 112])
        ]

        expected = [
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8]
        ]

        manhattan = lambda a, b: distance.manhattan([a], [b])
        kMeans = clustering.KMeans(3, featureDistanceMap.FeatureDistanceMap([manhattan, manhattan, manhattan]))
        self.assertEqual(sorted(kMeans.cluster(data)), sorted(expected))
    # Fragment of a pairwise-distance cache class: each unordered pair's
    # distance is stored once in self._distances, and _calcIndex() maps the
    # (small, big) pair to its slot in that flat list.
    def get(self, i, j):
        if (i == j):
            # A point is always at distance zero from itself.
            return 0

        small = min(i, j)
        big = max(i, j)

        return self._distances[self._calcIndex(small, big)]
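# The _calcIndex() helper referenced above is not part of this snippet. One
# common scheme for indexing a flattened upper triangle (the layout SciPy uses
# for condensed distance matrices) is sketched below, assuming n points and
# 0 <= i < j < n; the repo's actual helper may differ.
def condensedIndex(i, j, n):
    return (i * n) - (i * (i + 1) // 2) + (j - i - 1)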

if __name__ == '__main__':
    data = [
        business.Business(1, [0, 0, 0]),
        business.Business(2, [1, 1, 1]),
        business.Business(3, [2, 2, 2]),

        business.Business(4, [10, 10, 10]),
        business.Business(5, [11, 11, 11]),
        business.Business(6, [12, 12, 12]),

        business.Business(7, [110, 110, 110]),
        business.Business(8, [111, 111, 111]),
        business.Business(9, [112, 112, 112])
    ]

    manhattan = lambda a, b: distance.manhattan([a], [b])
    kMeans = KMeans(3, featureDistanceMap.FeatureDistanceMap([manhattan, manhattan, manhattan]))
    clusters = kMeans.cluster(data)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d - %s" % (i, len(clusters[i]), [str(x) for x in sorted(clusters[i])]))