def main():
    """Search for feature weights by probing them until they stop changing.

    Loads the businesses for ``DATA`` and the gold-truth pairs, starts every
    feature weight at ``START_WEIGHT`` and calls ``probeWeight`` at most
    ``MAX_ITERATIONS`` times, each time with a freshly shuffled ordering of
    the weight indices.  The loop stops early as soon as one call leaves
    ``weights`` equal to what it was before that call.

    The number of iterations actually run is printed to stdout; the
    tab-separated final weights and the last value returned by
    ``probeWeight`` are printed to stderr.
    """
    businesses = features.getBusinesses(DATA)
    truthPairs, truthIds = metrics.getGoldTruthPairs()

    # featureDistanceMap.DEFAULT_WEIGHTS was used as an alternative starting
    # point at some stage; the uniform START_WEIGHT vector is what runs now.
    weights = [START_WEIGHT] * featureDistanceMap.NUM_FEATURES
    featureFunctionMapping = getFeatureMapping()

    # Handed to every probeWeight call so it can be shared across iterations.
    cache = {}

    # Both values are reported after the loop, so they must exist even when
    # MAX_ITERATIONS is 0 and the loop body never runs.  NaN formats cleanly
    # with "%f" and makes "no score was computed" visible in the output.
    iterationsRun = 0
    randIndex = float("nan")

    for _ in range(MAX_ITERATIONS):
        # Snapshot taken before probing, to detect an iteration without change.
        previousWeights = list(weights)

        weightsOrder = list(range(len(weights)))
        random.shuffle(weightsOrder)

        # NOTE(review): convergence detection relies on probeWeight updating
        # `weights` in place - confirm against its definition.
        randIndex = probeWeight(weights, weightsOrder, 0, K,
                                featureFunctionMapping, businesses, truthPairs,
                                truthIds, cache)
        iterationsRun += 1

        if weights == previousWeights:
            break

    weightsText = "\t".join(str(weight) for weight in weights)
    print("Converged in %d / %d iterations" % (iterationsRun, MAX_ITERATIONS))
    print("Final Weights: %s\t%f" % (weightsText, randIndex), file=sys.stderr)
# Example #2
# 0
def run():
    """Cluster the human-eval businesses and print the clusters and Rand index.

    Builds a ``FeatureDistanceMap`` from ``learnWeights.getFeatureMapping()``
    and ``WEIGHTS``, clusters the businesses loaded from
    ``data.DATA_SOURCE_HUMAN_EVAL`` with ``clustering.KMeans`` using ``K``,
    prints each cluster's index, size and sorted member names, and finally
    prints the Rand index computed against the human truth pairs.
    """
    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)

    # Weighted distance map; calling FeatureDistanceMap() with no arguments
    # was the earlier alternative.
    featureMapping = learnWeights.getFeatureMapping()
    distanceMap = featureDistanceMap.FeatureDistanceMap(featureMapping, WEIGHTS)
    clusters = clustering.KMeans(K, distanceMap).cluster(businesses)

    for clusterIndex in range(len(clusters)):
        members = clusters[clusterIndex]
        print("Cluster: %02d, Size: %02d" % (clusterIndex, len(members)))

        # Sort the raw names first, then stringify them for display.
        names = sorted(businesses[index].otherInfo['name'] for index in members)
        print("         %s" % (", ".join(str(name) for name in names)))

    # Metrics
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    score = metrics.randIndex(clusters, businesses, truthPairs, truthIds)
    print("Rand Index: %f" % score)
# Example #3
# 0
def run(businessType):
    """Cluster businesses of one type and print the clusters and both indices.

    Args:
        businessType: Passed straight to ``features.getBusinesses`` to select
            which businesses are loaded.

    Prints each cluster's index, size and sorted member names, then the "old"
    Rand index (computed from the gold labels read from ``../data/groundtruth``)
    and the "new" Rand index from ``metrics.randIndex``.
    """
    businesses = features.getBusinesses(businessType)

    # K is arbitrary here; the distance map is built with no arguments.
    distanceMap = featureDistanceMap.FeatureDistanceMap()
    clusters = clustering.KMeans(K, distanceMap).cluster(businesses)

    for clusterIndex in range(len(clusters)):
        members = clusters[clusterIndex]
        print("Cluster: %02d, Size: %02d" % (clusterIndex, len(members)))

        # Sort the raw names first, then stringify them for display.
        names = sorted(businesses[index].otherInfo['name'] for index in members)
        print("         %s" % (", ".join(str(name) for name in names)))

    # Metrics
    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    clusterBusinessIds = metrics.getClusterBusinessID(businesses, clusters)
    oldScore = metrics.oldRandIndex(clusterBusinessIds, goldLabel)
    print("Old Rand Index: " + str(oldScore))

    newScore = metrics.randIndex(clusters, businesses)
    print("New Rand Index: %f" % newScore)
def run(weights, k, scalarNorm, setDistance):
    """Cluster the ground-truth businesses and return the old Rand index.

    Args:
        weights: Weights handed to ``FeatureDistanceMap`` together with the
            feature mapping.
        k: First argument to ``clustering.KMeans``.
        scalarNorm: Forwarded to ``buildFeatureMapping``.
        setDistance: Forwarded to ``buildFeatureMapping``.

    Returns:
        The value of ``metrics.oldRandIndex`` for the resulting clusters
        against the gold labels read from ``../data/groundtruth``.

    Each cluster's index, size and sorted member ``yelpId`` values are
    printed along the way.
    """
    businesses = features.getBusinesses(data.DATA_SOURCE_GROUNDTRUTH_ALL)

    featureMapping = buildFeatureMapping(scalarNorm, setDistance)
    distanceMap = featureDistanceMap.FeatureDistanceMap(featureMapping, weights)
    clusters = clustering.KMeans(k, distanceMap).cluster(businesses)

    for clusterIndex in range(len(clusters)):
        members = clusters[clusterIndex]
        print("Cluster: %02d, Size: %02d" % (clusterIndex, len(members)))

        # Sort the raw ids first, then stringify them for display.
        yelpIds = sorted(
            businesses[index].otherInfo['yelpId'] for index in members)
        print("         %s" % (", ".join(str(yelpId) for yelpId in yelpIds)))

    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    clusterBusinessIds = metrics.getClusterBusinessID(businesses, clusters)
    return metrics.oldRandIndex(clusterBusinessIds, goldLabel)