def makeContours(xcoordinates, ycoordinates, width, height, binsize):
    """Histogram labeled pixels and describe the density contours at three levels.

    The points are binned into a 2-D histogram covering (0, width) x (0, height)
    with int(width/binsize) x int(height/binsize) bins.  Contours are extracted
    at a fixed level of 0.5 and at the levels returned by utilities.cutLevel
    for 50.0 and 95.0.

    Returns a dict with the keys "contoursMin", "contours50" and "contours95"
    (each a list with one summary dict per contour polygon) together with
    "numberOfLabeledPixels", "cutLevel50" and "cutLevel95".
    """
    # NOTE(review): `metadata` is not a parameter of this function; it has to be
    # resolvable from an enclosing/module scope at call time -- confirm.
    toLngLat = utilities.makeGetLngLat(metadata)
    toMeters = utilities.makeGetMeters(metadata)

    # make the 2d histogram
    gridShape = (int(width/binsize), int(height/binsize))
    histogram, xedges, yedges = numpy.histogram2d(
        xcoordinates, ycoordinates, bins=gridShape, range=((0, width), (0, height)))
    numberOfPoints = len(xcoordinates)
    if numberOfPoints == 0:
        # with no points at all, fall back to an explicit all-zero float grid
        histogram = numpy.zeros(gridShape, dtype=numpy.dtype(float))

    # make contours for the three levels; the contour polygons are expressed
    # in pixel-index coordinates (not lng/lat or meters)
    contoursMin = utilities.contours(histogram, xedges, yedges, 0.5, interpolate=True, smooth=True)
    cutLevel50 = utilities.cutLevel(histogram, 50.0)
    contours50 = utilities.contours(histogram, xedges, yedges, cutLevel50, interpolate=True, smooth=True)
    cutLevel95 = utilities.cutLevel(histogram, 95.0)
    contours95 = utilities.contours(histogram, xedges, yedges, cutLevel95, interpolate=True, smooth=True)

    def describe(polygon):
        # One output record: the polygon in both coordinate systems plus its
        # absolute area/circumference and its centroid.
        lnglat = utilities.convert(polygon, toLngLat)
        meters = utilities.convert(lnglat, toMeters)
        record = {"rowcolpolygon": polygon, "lnglatpolygon": lnglat}
        record["areaInPixels"] = numpy.abs(utilities.area(polygon))
        record["circumferenceInMeters"] = numpy.abs(utilities.circumference(meters))
        record["areaInMeters"] = numpy.abs(utilities.area(meters))
        record["centroidInLngLat"] = utilities.centroid(lnglat)
        return record

    # construct output data that includes the polygons in lng,lat coordinates,
    # circumferences in meters, and areas in meters^2
    clusterData = {
        "contoursMin": [],
        "contours50": [],
        "contours95": [],
        "numberOfLabeledPixels": numberOfPoints,
        "cutLevel50": cutLevel50,
        "cutLevel95": cutLevel95,
    }
    levels = (("contoursMin", contoursMin),
              ("contours50", contours50),
              ("contours95", contours95))
    for key, polygons in levels:
        for polygon in polygons:
            clusterData[key].append(describe(polygon))

    return clusterData
def doEverything(metadata, mask, originalBlock, numSetColors, numBands, bandToIndexLookup):
    """Run the whole per-image pipeline and emit the selected clusters.

    Stages, in order: optional band reduction, mask refinement, log transform,
    projection onto adjacent-band differences, edge detection, Gaussian-mixture
    fit and scoring, score blurring, KD-tree construction, bucket-fill growth
    of "clumps" from the most anomalous pixels, per-clump attribute
    calculation, and a final emit of the clumps that pass the selection cuts.

    Parameters:
        metadata: mapping; the keys read here are "bandNames",
            "bandWavelength" and "originalDirName".  It is also handed to
            utilities.makeGetLngLat / makeGetMeters and to Clump.
        mask: array compared with ``> 0`` to obtain the initial valid-pixel mask.
        originalBlock: 3-D array indexed as [band, i, j].
        numSetColors, numBands: if numSetColors < numBands the function logs
            a message and returns without doing any work.
        bandToIndexLookup: not used inside this function.

    Returns:
        None.  The result dictionary is written with binaryhadoop.emit to
        sys.stdout; progress is reported through heartbeat().  The function
        also returns early (emitting nothing) when there are too few pixels
        for the mixture fit or when the fit keeps raising LinAlgError.

    Notes:
        Relies on Python 2 semantics (xrange, list-returning range/zip,
        integer ``/``) and on names defined outside this function
        (cameraName, heartbeat, convolve, MoG, logsumexp, KDTree, Clump,
        utilities, binaryhadoop).
    """
    globalStart = time.time()

    if numSetColors < numBands:
        heartbeat("Image should have {} bands, but only {} were set\n".format(numBands, numSetColors))
        return

    if cameraName != "ALI":
        heartbeat("Reducing its dimensionality to 26\n")
        # 27 + 3 + 9 + 15 + 24 = 78 selected band indexes (Python 2 list concatenation)
        windowsOfBandsToTake = range(0, 27) + range(43, 46) + range(58, 67) + range(77, 92) + range(115, 139)
        reducedBlock = originalBlock[windowsOfBandsToTake,:,:]
        bandsToTake = []
        # Python 2 integer division: 78 selected bands / 3 = 26 output bands
        reducedBlock2 = numpy.zeros((reducedBlock.shape[0]/3, reducedBlock.shape[1], reducedBlock.shape[2]), dtype=numpy.double)
        for i in xrange(reducedBlock2.shape[0]):
            # each output band is the mean of three consecutive selected bands;
            # the middle one of the three is recorded as its representative index
            bandsToTake.append(windowsOfBandsToTake[3*i + 1])
            reducedBlock2[i] = reducedBlock[3*i:3*(i+1),:,:].mean(axis=0)
        del reducedBlock
    else:
        bandsToTake = numpy.argsort(metadata["bandNames"])
        reducedBlock2 = originalBlock

    heartbeat("Improving the mask\n")
    betterMask = mask > 0
    for i in xrange(reducedBlock2.shape[0]):
        # in-place AND (third argument is the output array): keep only pixels
        # that are strictly positive in every band
        numpy.logical_and(betterMask, reducedBlock2[i,:,:] > 0.0, betterMask)
    del mask

    # shrinkmask: pixels of betterMask whose neighbours 1 and 2 steps away along
    # each axis are also in betterMask (numpy.roll wraps around at the array edges)
    shrinkmask = betterMask > 0
    for roll in -2, -1, 1, 2:
        for axis in 0, 1:
            numpy.logical_and(shrinkmask, numpy.roll(betterMask, roll, axis=axis) > 0, shrinkmask)

    heartbeat("Taking logarithm\n")
    # silence divide/invalid warnings from log of non-positive values, then
    # restore the previous numpy error settings
    oldsettings = numpy.seterr(divide="ignore", invalid="ignore")
    block = numpy.log(reducedBlock2)
    numpy.seterr(**oldsettings)

    heartbeat("Reducing image to a bag of pixels\n")
    # flatten the two spatial axes: (bands, rows*cols)
    bag = block.view()
    bag.shape = (block.shape[0], block.shape[1] * block.shape[2])
    del block
    bagMask = betterMask.view()
    bagMask.shape = (originalBlock.shape[1] * originalBlock.shape[2])
    # keep only the columns (pixels) selected by betterMask
    bag = bag[:, bagMask]
    del bagMask

    # (bands-1) x bands matrix with +1 on the diagonal and -1 just right of it:
    # row i maps a log-spectrum to log(band i) - log(band i+1)
    projectionMatrix = numpy.matrix([[1 if j == i else -1 if j == i + 1 else 0 for j in xrange(reducedBlock2.shape[0])] for i in xrange(reducedBlock2.shape[0] - 1)])
    # the matrix is not square; presumably numpy.matrix.I yields the pseudo-inverse here -- TODO confirm
    projectionInverse = projectionMatrix.I

    heartbeat("Projecting the bag onto the color-only basis\n")
    projected = numpy.array(numpy.dot(projectionMatrix, bag))
    del bag

    heartbeat("Casting the projected bag onto the image shape\n")
    # numpy.empty: pixels outside betterMask are left uninitialised
    projectedBlock = numpy.empty((reducedBlock2.shape[0] - 1, reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    for i in xrange(reducedBlock2.shape[0] - 1):
        projectedBlock[i,betterMask] = projected[i,:]

    heartbeat("Detecting edges\n")
    # 5x5 Kroon without integer-rounding: http://www.k-zone.nl/Kroon_DerivativePaper.pdf
    Gx = numpy.array([[ 0.0007, 0.0052, 0.0370, 0.0052, 0.0007],
                      [ 0.0037, 0.1187, 0.2589, 0.1187, 0.0037],
                      [ 0.0, 0.0, 0.0, 0.0, 0.0],
                      [-0.0037, -0.1187, -0.2589, -0.1187, -0.0037],
                      [-0.0007, -0.0052, -0.0370, -0.0052, -0.0007]])
    Gy = Gx.T

    startTime = time.time()
    # accumulate the squared responses to both kernels over all projected bands,
    # then take the square root after the loop
    convBlock = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), numpy.double)
    for index in xrange(projectedBlock.shape[0]):
        heartbeat(" {} {} {}\n".format(index, time.time() - startTime, convBlock.max()))
        # the [2:n+2] slices crop a 2-pixel margin; presumably `convolve` returns
        # a "full" 2-D convolution for the 5x5 kernel -- TODO confirm
        convGx2 = numpy.power(convolve(projectedBlock[index,:,:], Gx)[2:projectedBlock.shape[1]+2, 2:projectedBlock.shape[2]+2], 2)
        convGy2 = numpy.power(convolve(projectedBlock[index,:,:], Gy)[2:projectedBlock.shape[1]+2, 2:projectedBlock.shape[2]+2], 2)
        convBlock = convBlock + convGx2
        convBlock = convBlock + convGy2
    convBlock = numpy.sqrt(convBlock)
    heartbeat("Edges took {} seconds to detect\n".format(time.time() - startTime))

    heartbeat("Optimizing GMM\n")
    numGMMcomponents = 20
    startTime = time.time()
    # require at least 10 points per mixture component and per dimension
    if projected.shape[1] < 10 * numGMMcomponents or projected.shape[1] < 10 * projected.shape[0]:
        heartbeat("There are only {} points; skipping (number of GMM components is {} and number of dimensions in the space is {})\n".format(projected.shape[1], numGMMcomponents, projected.shape[0]))
        return

    attempts = 0
    done = False
    # retry the whole fit from scratch whenever it raises LinAlgError
    while not done:
        try:
            if 10000 < projected.shape[1]:
                # large dataset: three warm-up fits on fresh random samples of
                # 10000 pixels, each seeded with the previous model's
                # parameters, followed by a final fit on all pixels
                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents)
                model.em(10)
                heartbeat(" time for first pass: {} seconds\n".format(time.time() - startTime))

                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat(" time for second pass: {} seconds\n".format(time.time() - startTime))

                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat(" time for third pass: {} seconds\n".format(time.time() - startTime))

                model = MoG(projected, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(5)
                done = True
            else:
                heartbeat(" skipping three-pass subfit because the dataset is small\n")
                model = MoG(projected, numGMMcomponents)
                model.em(5)
                done = True
        except numpy.linalg.linalg.LinAlgError:
            attempts += 1
            # NOTE(review): `attempts > 4` first becomes true on the fifth
            # failure, although the message says "4 attempts"
            if attempts > 4:
                heartbeat(" could not fit in 4 attempts; giving up\n")
                return
    heartbeat("GMM took {} seconds to optimize\n".format(time.time() - startTime))

    heartbeat("Scoring all pixels with GMM\n")
    startTime = time.time()
    # one score per pixel: logsumexp over axis 0 of the unnormalised log posteriors
    scores = logsumexp(model.compute_posteriors(projected, reinit=True, normalize=False, logscale=True), 0)
    scoresBlock = numpy.zeros((reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    scoresBlock[betterMask] = scores
    del scores

    # rescale edge strength so that its 0.5th..99.5th percentile range (taken
    # inside shrinkmask) maps onto 0..1
    themin, themax = numpy.percentile(convBlock[shrinkmask], [0.5, 99.5])
    # NOTE(review): convNorm is never read afterwards
    convNorm = (convBlock[shrinkmask] - themin)/(themax - themin)
    convBlockNorm = (convBlock - themin)/(themax - themin)

    # same rescaling for the scores, but inverted: values above 1.0 correspond
    # to raw scores below the 0.5th percentile
    themin, themax = numpy.percentile(scoresBlock[shrinkmask], [0.5, 99.5])
    scoresNorm = 1.0 - (scoresBlock[shrinkmask] - themin)/(themax - themin)
    scoresBlockNorm = 1.0 - (scoresBlock - themin)/(themax - themin)
    rawscoresmin, rawscoresmax = themin, themax
    del scoresBlock
    del scoresNorm

    # seed pixels for the bucket-fill: inside shrinkmask with normalised score > 1.0
    selection = numpy.logical_and(scoresBlockNorm > 1.0, shrinkmask)
    indexes = zip(*numpy.nonzero(selection))

    # NOTE(review): scoredBag is never read afterwards; the call is kept because
    # compute_posteriors(..., reinit=True) may change the model's state -- confirm
    scoredBag = numpy.argmax(model.compute_posteriors(projected, reinit=True), axis=0)
    heartbeat("GMM took {} seconds to score all pixels (a few times in different ways)\n".format(time.time() - startTime))

    heartbeat("Blurring scores for bucket-fill to spread better\n")
    startTime = time.time()
    # 5x5 Gaussian kernel (sigma = 1.0) centred on element (2, 2), normalised to unit sum
    spot = numpy.array([[math.exp(-((i - 2)**2 + (j - 2)**2)/2.0/1.0**2) for i in xrange(5)] for j in xrange(5)])
    spot = spot / spot.sum()
    blurScores = convolve(scoresBlockNorm[:,:], spot)[2:scoresBlockNorm.shape[0]+2, 2:scoresBlockNorm.shape[1]+2]
    heartbeat("Time to blur image: {} seconds\n".format(time.time() - startTime))

    heartbeat("Building the KD-tree\n")
    startTime = time.time()
    kdtree = KDTree(projected)
    # spread between the 1st and 99th percentile of all projected values
    dynamicRange = (lambda x: x[1] - x[0])(numpy.percentile(projected, [1, 99]))
    heartbeat("KD-tree took {} seconds to build\n".format(time.time() - startTime))

    heartbeat("Performing bucket-fill searches\n")
    startTime = time.time()
    clumps = set()
    # assigned[i, j]: the Clump that owns pixel (i, j), or None
    assigned = numpy.empty((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(object))
    # considered[i, j]: True once the pixel has been visited by any fill
    considered = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(bool))

    # shared state is published as class attributes on Clump
    Clump.assigned = assigned
    Clump.projectedBlock = projectedBlock
    Clump.projectionInverse = projectionInverse
    Clump.metadata = metadata
    Clump.kdtree = kdtree
    Clump.bandsToTake = bandsToTake
    Clump.projected = projected
    Clump.convBlockNorm = convBlockNorm
    Clump.model = model
    Clump.rawscoresmin = rawscoresmin
    Clump.rawscoresmax = rawscoresmax
    Clump.dynamicRange = dynamicRange
    Clump.blurBlock = None
    Clump.scoresBlockNorm = scoresBlockNorm
    Clump.blurScores = blurScores

    for index, (x, y) in enumerate(indexes):
        if index % 100 == 0:
            heartbeat(" {} {}\n".format(float(index)/len(indexes), time.time() - startTime))

        queue = [(x, y)]
        clump = Clump((x, y))
        while len(queue) > 0:
            i, j = queue.pop()
            # ignore positions outside the image or outside shrinkmask
            if i >= 0 and j >= 0 and i < shrinkmask.shape[0] and j < shrinkmask.shape[1] and shrinkmask[i, j]:
                if not considered[i, j]:
                    if blurScores[i, j] > 1.0:
                        clump.add((i, j))
                        assigned[i, j] = clump
                        if clump.size() > Clump.maxSize:
                            heartbeat(" clump is too big!\n")
                            break
                        newQueue = []
                        for ii in -1, 0, 1:
                            for jj in -1, 0, 1:
                                # NOTE(review): `ii != jj` skips (0, 0) but also
                                # (-1, -1) and (1, 1), while keeping (-1, 1) and
                                # (1, -1) -- confirm this neighbourhood is intended
                                if ii != jj:
                                    newQueue.append((i + ii, j + jj))
                        queue = newQueue + queue
                elif assigned[i, j] is not None and assigned[i, j] != clump:
                    # reached a pixel already owned by a different clump
                    heartbeat(" merging clumps\n")
                    clump = assigned[i, j].mergeIn(clump)
                    if clump.size() > Clump.maxSize:
                        heartbeat(" ... into one that was too big!\n")
                        break
                considered[i, j] = True

        if not clump.isEmpty() and clump.size() <= Clump.maxSize:
            heartbeat(" new clump with size {}\n".format(clump.size()))
            clumps.add(clump)

    heartbeat("bucket-fill search took {} seconds\n".format(time.time() - startTime))

    heartbeat("Calculating attributes of the image and clumps\n")
    startTime = time.time()

    # NOTE(review): gmmSpectrum is defined here but not called within this function
    def gmmSpectrum(index):
        """Map mixture mean `index` back through projectionInverse, exponentiate,
        and return it as a {band name: fraction} dict normalised to unit sum."""
        unnormalized = numpy.exp(numpy.array(numpy.dot(projectionInverse, model.means[:,index].T))[0])
        return dict(zip(list(numpy.array(metadata["bandNames"])[bandsToTake]), unnormalized / unnormalized.sum()))

    def fullSpectrumAt(indexes):
        """Return {band name: mean originalBlock value} over the given (i, j) pixels."""
        unnormalized = numpy.zeros(originalBlock.shape[0], dtype=numpy.double)
        for i, j in indexes:
            unnormalized += originalBlock[:,i,j]
        return dict(zip(metadata["bandNames"], unnormalized / len(indexes)))

    output = {"metadata": metadata}
    getLngLat = utilities.makeGetLngLat(metadata)
    getMeters = utilities.makeGetMeters(metadata)

    # NOTE(review): wavelengths is never read afterwards
    wavelengths = [metadata["bandWavelength"][x] for x in numpy.array(metadata["bandNames"])[bandsToTake]]

    clusterNumber = 0
    for clump in clumps:
        heartbeat(" do clump {}\n".format(clump.indexes))

        border1 = clump.border()
        borderPoint1 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        # NOTE(review): the sum runs over clump.indexes but is divided by
        # len(border1); possibly meant to iterate over border1 -- confirm
        for i, j in clump.indexes:
            borderPoint1 = borderPoint1 + projectedBlock[:, i, j]
        borderPoint1 = borderPoint1 / len(border1)

        border2 = clump.border(list(clump.indexes) + list(border1))
        borderPoint2 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        # NOTE(review): same pattern as above, with border2
        for i, j in clump.indexes:
            borderPoint2 = borderPoint2 + projectedBlock[:, i, j]
        borderPoint2 = borderPoint2 / len(border2)

        numSeeds = len(clump.seeds)
        numPixels = clump.size()
        # convert coordinates to plain-int tuples in sorted order; note that this
        # rebinds `indexes`, `border1` and `border2`
        seeds = sorted((int(i), int(j)) for i, j in clump.seeds)
        indexes = sorted((int(i), int(j)) for i, j in clump.indexes)
        border1 = sorted((int(i), int(j)) for i, j in border1)
        border2 = sorted((int(i), int(j)) for i, j in border2)
        mean = list(clump.mean())
        meanSeeds = list(clump.meanSeeds())
        stdev = clump.stdev()
        specMean = clump.spectrumOf(clump.mean())
        specMeanSeeds = clump.spectrumOf(clump.meanSeeds())
        borderSpec1 = clump.spectrumOf(borderPoint1)
        borderSpec2 = clump.spectrumOf(borderPoint2)
        edgeScore1 = clump.edginess(border1)
        edgeScore2 = clump.edginess(border2)
        gmmScoreMean = clump.gmmscoreOf(clump.mean())
        gmmScoreMeanSeeds = clump.gmmscoreOf(clump.meanSeeds())
        fullSpectrum = fullSpectrumAt(clump.indexes)
        fullSpectrumSeeds = fullSpectrumAt(clump.seeds)
        fullSpectrumBorder1 = fullSpectrumAt(border1)
        fullSpectrumBorder2 = fullSpectrumAt(border2)
        r200 = float(clump.density(clump.meanSeeds(), 0.200))
        r500 = float(clump.density(clump.meanSeeds(), 0.500))

        # hard-coded selection cuts; the origin of the thresholds is not visible
        # here.  r200/r500 are checked to be positive before taking log10.
        if r200 > 0.0 and r500 > 0.0 and gmmScoreMeanSeeds > 1.42 and gmmScoreMeanSeeds - gmmScoreMean > 0.12 and math.log10(r200) < -2.78 and math.log10(r500) < -1.30 and stdev > 0.067 and edgeScore2 < 0.61:
            clusterName = "cluster_{}".format(clusterNumber)
            output[clusterName] = {}
            clusterNumber += 1

            output[clusterName]["contours95"] = [{}]
            c95 = output[clusterName]["contours95"][0]

            # order the border2 pixels by angle around their mean position so
            # that they can be written out as a polygon
            x0 = numpy.mean([x for x, y in border2])
            y0 = numpy.mean([y for x, y in border2])
            order = numpy.argsort([math.atan2(y - y0, x - x0) for x, y in border2])

            c95["rowcolpolygon"] = [(int(x), int(y)) for x, y in numpy.array(border2)[order]]
            c95["lnglatpolygon"] = [getLngLat(x, y) for x, y in numpy.array(border2)[order]]
            c95["centroidInLngLat"] = getLngLat(x0, y0)
            c95["areaInPixels"] = numPixels

            # size of one pixel in meters, measured around (x0, y0)
            pixelLength = abs(getMeters(*getLngLat(x0 + 0.5, y0))[0] - getMeters(*getLngLat(x0 - 0.5, y0))[0])
            pixelHeight = abs(getMeters(*getLngLat(x0, y0 + 0.5))[1] - getMeters(*getLngLat(x0, y0 - 0.5))[1])

            c95["areaInMeters"] = numPixels * pixelLength * pixelHeight
            # circumference estimate: number of border1 pixels times the mean pixel side
            c95["circumferenceInMeters"] = 0.5*(pixelLength + pixelHeight) * len(border1)

            rawGMMscore = float(logsumexp(model.compute_posteriors(numpy.array([clump.meanSeeds()]).T, reinit=True, normalize=False, logscale=True), 0)[0])
            rawKNNscore = float(r500)
            c95["score"] = rawGMMscore, rawKNNscore

            c95["other"] = {
                "numSeeds": numSeeds,
                "numPixels": numPixels,
                "seeds": seeds,
                "indexes": indexes,
                "border1": border1,
                "border2": border2,
                "mean": mean,
                "meanSeeds": meanSeeds,
                "stdev": stdev,
                "specMean": specMean,
                "specMeanSeeds": specMeanSeeds,
                "borderSpec1": borderSpec1,
                "borderSpec2": borderSpec2,
                "edgeScore1": edgeScore1,
                "edgeScore2": edgeScore2,
                "gmmScoreMean": gmmScoreMean,
                "gmmScoreMeanSeeds": gmmScoreMeanSeeds,
                "fullSpectrum": fullSpectrum,
                "fullSpectrumSeeds": fullSpectrumSeeds,
                "fullSpectrumBorder1": fullSpectrumBorder1,
                "fullSpectrumBorder2": fullSpectrumBorder2,
                "r200": r200,
                "r500": r500}

    # NOTE(review): emitted once, after all clumps, since `output` accumulates
    # every cluster under its own key -- confirm this placement against the
    # original indentation
    binaryhadoop.emit(sys.stdout, metadata["originalDirName"], output, encoding=binaryhadoop.TYPEDBYTES_JSON)

    heartbeat("Calculating attributes took {} seconds\n".format(time.time() - startTime))

    totalTime = time.time() - globalStart
    heartbeat("Time to do everything: {} sec, which is {} min\n".format(totalTime, totalTime/60.0))