def main(key, stack):
    sys.stderr.write('Aggregate for value %s\n'%repr(key))
    counts = numpy.zeros(4)
    for rec in stack:
      counts += rec[:4]
      sys.stderr.write('%s\n'%repr(rec))
    if counts[0] + counts[2] > 0:
      eff = float(counts[0])/float(counts[0] + counts[2])
    else:
      eff = 0.0
    if counts[0] + counts[1] > 0:
      fpr = float(counts[1])/ (counts[0] + counts[1])
    else:
      fpr = 0.0
    outrec = '%f, %5.3f, %5.3f, %i, %i, %i, %i\n'%( float(key), eff, fpr, counts[0], counts[1], counts[2], counts[3])
    binaryhadoop.emit(sys.stdout, key, outrec, encoding = binaryhadoop.TYPEDBYTES_JSON)
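
# A minimal, self-contained version of the rate computation above, with the
# same guards against empty denominators; the meaning of the four columns is
# whatever the upstream mapper emitted.
def rates(counts):
    c0, c1, c2, c3 = counts
    eff = c0 / float(c0 + c2) if c0 + c2 > 0 else 0.0
    fpr = c1 / float(c0 + c1) if c0 + c1 > 0 else 0.0
    return eff, fpr
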
        for i,lbl in enumerate(set(clusterList[:round(anomalyPercentage*nx.size),0])):
            xedges = clusterList[clusterList[:,0]==lbl,5]
            yedges = clusterList[clusterList[:,0]==lbl,4]
            cluster["cluster_" + str(i)] = makeContours(xedges,yedges,width,height,binSize)
            cluster["cluster_" + str(i)].update({"clusterCenter": clusterCenters[lbl,:].tolist()})


        sys.stderr.write("ready to emit cluster information\n")
        cluster["metadata"] = metadata


        #Assign contour scores
        for clusterKey, clusterValue in cluster.iteritems():
            if "metadata" not in clusterKey:
                for i in range(len(clusterValue['contours95'])):
                    sys.stderr.write('reporter:status:still computing score\n')
                    sys.stderr.flush()
                    polygons = numpy.array([[round(x[0]),round(x[1])] for x in clusterValue['contours95'][i]['rowcolpolygon']])
                    scoreIndex = []
                    for p in polygons:
                        scoreIndex.append(numpy.argmin(numpy.sum(numpy.square(numpy.fliplr(clusterList[:,4:])-p),axis=1)))
                    scoremdist = mDist(numpy.array(cluster[clusterKey]["clusterCenter"]).reshape(1,-1),pcaEigenvalues.reshape(1,-1))[0] 
                    try:
                        scores = [scoreDict[x][1] for x in map(tuple,[map(int,map(round,y)) for y in clusterValue['contours95'][i]['rowcolpolygon']])]
                        scoreedist = sum(scores)/len(scores)
                    except KeyError:
                        scoreedist = numpy.mean(clusterList[numpy.array(scoreIndex),3])
                    cluster[clusterKey]['contours95'][i]['score'] = [scoremdist,scoreedist]
 
        binaryhadoop.emit(sys.stdout,regionKey,cluster,encoding = binaryhadoop.TYPEDBYTES_JSON)
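
# The nearest-row lookup used to build scoreIndex above, in isolation: pick the
# index of the row of "table" closest to point p in squared Euclidean distance.
import numpy
table = numpy.array([[0.0, 0.0], [3.0, 4.0], [1.0, 1.0]])
p = numpy.array([0.9, 1.2])
nearest = numpy.argmin(numpy.sum(numpy.square(table - p), axis=1))   # -> 2
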
Example #3
            bands[key] = numpy.array(value[mask],dtype=numpy.float64)

    if imageData["metadata"] is not None:
        if 'HSI' in imageData["metadata"].keys():
            wavelengths = {}
            multipliers = {}
            for w,wave in enumerate(imageData["metadata"][unicode("HSI")][unicode("wavelength")]):
                wavelengths["B" + "%03d" % w] = float(wave)
                multipliers["B" + "%03d" % w] = 1
        else:
            wavelengths = imageData["metadata"]["bandWavelength"]
            multipliers = imageData["metadata"]["bandMultiplier"]        
        
        imageList = utilities.preprocessImage(bands, multipliers, wavelengths, imageData)
        sys.stderr.write("This is the number of bands: %r\n" % len(imageList))

        iDot,iPartial = main(imageList)
        imageData["imageDot"] = iDot
        imageData["imagePartial"] = iPartial

        if noiseFlag.upper() == "TRUE":
            noiseList = calcNoise(imageList,mask)
            nDot,nPartial = main(noiseList)
            imageData["noiseDot"] = nDot
            imageData["noisePartial"] = nPartial
        try:
            regionKey = imageData["metadata"]["originalDirName"]
        except KeyError:
            regionKey = imageData["metadata"]["outputFile"]
        binaryhadoop.emit(sys.stdout,regionKey,imageData,encoding=binaryhadoop.TYPEDBYTES_JSON)
Example #4

        if 'HSI' in imageData["metadata"].keys():
            wavelengths = {}
            multipliers = {}
            for w,wave in enumerate(imageData["metadata"][unicode("HSI")][unicode("wavelength")]):
                wavelengths["B" + "%03d" % w] = float(wave)
                multipliers["B" + "%03d" % w] = 1
        else:
            wavelengths = imageData["metadata"]["bandWavelength"]
            multipliers = imageData["metadata"]["bandMultiplier"]        
        
        imageList = utilities.preprocessImage(bands, multipliers, wavelengths, imageData, selectBands=selectbands)
        sys.stderr.write("This is the number of bands: %r\n" % len(imageList))

        iDot,iPartial = main(imageList)
        imageData["imageDot"] = iDot
        imageData["imagePartial"] = iPartial

        if noiseFlag.upper() == "TRUE":
            noiseList = calcNoise(imageList,mask)
            nDot,nPartial = main(noiseList)
            imageData["noiseDot"] = nDot
            imageData["noisePartial"] = nPartial
        try:
            regionKey = imageData["metadata"]["originalDirName"]
        except KeyError:
            regionKey = imageData["metadata"]["outputFile"]

        binaryhadoop.emit(sys.stdout,regionKey,imageData,encoding=binaryhadoop.TYPEDBYTES_JSON)
Example #5
#!/usr/bin/env python
import binaryhadoop
import sys
import augustus
import report_unknowntruth
import json

if __name__ == "__main__":
    cmrecs = []
    for key, value in binaryhadoop.mapperInput(sys.stdin, typeMap={None: binaryhadoop.TYPEDBYTES_JSON, "KEY": binaryhadoop.TYPEDBYTES_JSON}):      
      cmrec = value.split(',')
      cmrecs.append('%d, %d'%(float(cmrec[0]), float(cmrec[1])))
    if len(cmrecs) > 0:
      html, img1, img2, img3 = report_unknowntruth.main(cmrecs, 'SummaryStatistics.json')
      record = "%s"%(json.dumps([html,  img1, img2, img3]))
      binaryhadoop.emit(sys.stdout, "KEY", record, encoding = binaryhadoop.TYPEDBYTES_JSON)
def main(filename, metadata, mask, bands, start, end, iscan, trueRowsStart, nTrueRows, trueColsStart, nTrueCols, FCUT = .08, PCUTBLOB = .001, PCUTCHI = .01, MINCATALOGUEDSIZE = 0):
    START = start; END = end
    tstart = datetime.datetime.now()
    imageBands = sorted(bands.keys())
    windowRadius = 1
    truePositive = 0
    falsePositive = 0
    taggedPixels = 0
    imageArray = numpy.zeros((bands[imageBands[0]].shape + (len(bands),)  ))
    maxScore = len(imageBands)/PCUTBLOB

    # Attempt to remove some problematic bands, in particular, ones with 'stripes'.
    bandToRemove = []
    sys.stderr.write('Number of Bands: %i\n'%len(imageBands))
    sys.stderr.write('Evaluate Whether Columns are reasonable\n')
    for i, band in enumerate(imageBands):
        imageArray[:,:,i] = bands[band]             
        if config.blobspectra.VALIDATECOLUMNS:
          ok = True
          bandrange = numpy.ma.max(bands[band].data) - numpy.ma.min(bands[band].data)
          sys.stderr.write('For band %i bandrange is %f\n'%(i,bandrange))
          sys.stderr.write('At %s\n'%(repr(numpy.unravel_index(numpy.ma.argmax(bands[band].data), bands[band].data.shape))))
          maxpix = numpy.unravel_index(numpy.ma.argmax(bands[band].data), bands[band].data.shape)
          sys.stderr.write('Value at max %d\n'%bands[band][maxpix])
          for j in range(1, bands[band].shape[1] - 1):
            if (numpy.ma.max(bands[band].data[:, j]) - numpy.ma.min(bands[band].data[:, j])) / bandrange < .01:
              sys.stderr.write('%i %f %f\n'%(j, numpy.ma.max(bands[band].data[:,j]), numpy.ma.min(bands[band].data[:,j])))
              ok = False
          if not ok:
            sys.stderr.write('Remove band %i because of unnatural radiance distribution in Cols\n' % i)
            bandToRemove.append(i)

    if config.blobspectra.VALIDATEROWS:
      sys.stderr.write('Evaluate Whether Rows are reasonable\n')
      for i, band in enumerate(imageBands):
          ok = True
          bandrange = numpy.ma.max(bands[band].data) - numpy.ma.min(bands[band].data)
          for j in range(1, bands[band].shape[0] - 1 ):
            if (numpy.ma.max(bands[band].data[j,:]) -  numpy.ma.min(bands[band].data[j,:])) /  bandrange < .01:
              ok = False
          if not ok:
            sys.stderr.write('Remove band %i because of unnatural radiance distribution in Rows\n' % i)
            bandToRemove.append(i)
           
    imageArray = numpy.delete(imageArray, bandToRemove, 2)
    sys.stderr.write('Shape %s\n'%repr(imageArray.shape))
    try:
      sys.stderr.write('Sample %s\n'%repr(imageArray[400,400,:]))
    except IndexError:
      pass
    nPixels = imageArray.shape[0] * imageArray.shape[1]
    sys.stderr.write("this is the size of the image: " + str(imageArray.shape) + '\n')
    NimageRows = imageArray.shape[0]
    sys.stderr.write("This is the number of pixels per band in the image: " + str(imageArray.shape[0] * imageArray.shape[1]) + '\n')
    sys.stderr.write("This is the total number of pixels in the image: " + str(imageArray.size) + '\n')
    # This is the number of pixels that are non-zero. Here, a pixel is a 3-index object: x,y,wavelength. Dividing
    # this number by the number of bands ought to be pretty close to the number of unmasked geospatial pixels in this portion.
    sys.stderr.write("This is the total number of non-zero pixels: " + str((imageArray.reshape(-1, ) > 0).sum()) + '\n')
    # numpy.prod(imageArray,2) multiplies the radiances across all wavelengths together. This construct will add up
    # number of geospatial pixels for which at least one band is zero.
    sys.stderr.write("This is the total number of zero pixels: " + str((numpy.prod(imageArray, 2) == 0).sum()) + '\n')
    imageIndices = numpy.arange(imageArray.shape[0] * imageArray.shape[1]).reshape((imageArray.shape[0], imageArray.shape[1]))

    nbands = imageArray.shape[2]
    # (ij)th element of ny yields i. (ij)th element of nx yields j
    ny, nx = numpy.mgrid[0 : mask.shape[0], 0 : mask.shape[1]]
    globalMean = numpy.mean(imageArray.reshape(-1, ))
    sys.stderr.write('global mean: %f\n'%globalMean)
    sys.stderr.write('global var : %f\n'%numpy.var(imageArray.reshape(-1,)))
    if globalMean <= 5:
      sys.stderr.write('=====> Insufficient radiance in image. Returning!\n')
      return
    if numpy.var(imageArray.reshape(-1,)) < 200**2:
      sys.stderr.write('=====> Insufficient variability in image. Returning!\n')
      return
    # Rescale: the original mean was computed over all pixels, not just the
    # unmasked (non-zero) ones.
    globalMean = globalMean * mask.sum() / (mask.shape[0] * mask.shape[1])

    #################################
    #####STANDARD DEVIATION WINDOW###
    #################################

    #Calculate the mean and standard deviation of 3-by-3 pixel windows
    boxStandardDev = numpy.zeros(imageArray.shape)
    boxMean = numpy.zeros(imageArray.shape)
    nBoxPixels = (1 + 2 * windowRadius) ** 2
    for i in xrange(nbands):
        box = rolling_window(imageArray[:, :, i], windowRadius)
        boxsq = rolling_window(imageArray[:, :, i] ** 2, windowRadius)
        boxStandardDev[:, :, i] = numpy.sqrt(boxsq / nBoxPixels - (box / nBoxPixels) ** 2)
        boxMean[:, :, i] = box / nBoxPixels
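
    # --- illustrative sketch (not called below): the same windowed mean/std
    # via E[x^2] - E[x]^2, assuming scipy is available and that rolling_window
    # above returns windowed sums over a (2*radius+1)^2 box.
    def _window_stats_sketch(img, radius=1):
        from scipy.ndimage import uniform_filter
        size = 2 * radius + 1
        mean = uniform_filter(img, size=size)                # windowed mean
        var = uniform_filter(img ** 2, size=size) - mean ** 2
        return mean, numpy.sqrt(numpy.maximum(var, 0.0))     # clamp rounding error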

    #################################
    ####CREATE STANDARD DEV MASK#####
    #################################

    #Create a boolean array (mask) with TRUE entries where standard deviation is below FCUT * max standard dev of each band
    zfilter = numpy.zeros(imageArray.shape)
    for i in xrange(nbands):
        #sys.stderr.write('%s\n'%repr(numpy.histogram((numpy.ma.masked_array(boxStandardDev[:,:,i], mask=mask)).compressed(), 100)))
        zfilter[:, :, i] = boxStandardDev[:, :, i] < FCUT * numpy.max(boxStandardDev[:, :, i])
    zfilterAll = numpy.ma.all(zfilter, 2)
    zfilterAll = numpy.ma.masked_array(zfilterAll, mask=mask)
    sys.stderr.write('Unmasked pixels %i\n'%zfilterAll.count())
    sys.stderr.write('Pixels in chromatically homogeneous region %s\n'%repr(zfilterAll.sum()))
    #Combine this mask with the original image shape mask
    #sdMask = numpy.ma.masked_array(zfilterAll.astype('uint8'), mask | ~zfilterAll)
    sdMask = zfilterAll

    ######################################
    #####CONNECTED COMPONENTS LABELING####
    ######################################
 
    #This step uses a connected components labeling algorithm.
    #It is applied to the binary mask generated in the previous step,
    #and labels each connected group of pixels with a separate integer.
    #Note that the background pixels are also labeled. If, as is most often the case,
    #the background is connected, it takes on the value 0.      
    connected_regions, num_features = find_regions(numpy.ma.masked_array(sdMask, mask=mask))
    sys.stderr.write('Using find_regions \n')
    #connected_regions, num_features = ndimage.label(sdMask)
    #sys.stderr.write('Using scipy %i\n'%num_features)
    connected_regions[numpy.prod(imageArray, 2) == 0] = sorted(numpy.unique(connected_regions))[-1] + 1
    regions_labels = sorted(numpy.unique(connected_regions))
    #
    # background (i.e. region 0) will be green, masked out (radiance=0) areas will be red per
    # the rgb convention used here.
    #connectedPlot.save("connectedPlot.png", "PNG", options="optimize")
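
    # --- illustrative sketch (not called): the same labeling with scipy, as in
    # the commented-out alternative above; connected groups of True pixels each
    # get their own integer and the background stays 0.
    def _label_sketch(binary):
        from scipy import ndimage
        labels, num = ndimage.label(binary)
        return labels, num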
    ######################################
    #####BLOB STATISTICS##################
    ######################################
    regions = {}
    regions["bandNames"] = imageBands

    
    if num_features == 0:
      sys.stderr.write('=====> No features identified in this image. Returning!\n')
      return
    #Calculate the mean and standard deviation of each connected region, across each band.
    #The arrays are placed in a dictionary which contains the row-column coordinates,
    #along with the mean and standard deviation for each band.

    #In most cases, the background of the image will be simply connected. In rare cases, a feature from the initial sdMask
    #may cut the image, in which case there will be more than one background. Here, the x-y coordinates of each region are
    #rigorously checked against the background (which is False in the sdMask) x-y coordinates.

    nyx_background = set(map(tuple,numpy.hstack((ny[sdMask == False].reshape(-1, 1),nx[sdMask == False].reshape(-1, 1)))))
    sys.stderr.write('Number of background pixels %i\n'%len(nyx_background))
    regions = defaultdict(dict)
    background_regions = defaultdict(dict)   

    #Bad pixels (zero radiance where data was missing) carry the last label, which is skipped here
    for i,label in enumerate(regions_labels[:-1]):
        #First assign pixels which are "background", i.e., did not pass the FCUT, to a separate dictionary
        regionindices = connected_regions == label
        regionrows = ny[regionindices].reshape(-1,1)
        regioncols = nx[regionindices].reshape(-1,1)
        regionpixels = set(map(tuple, numpy.hstack((regionrows, regioncols))))        
        if regionpixels <= nyx_background:
            background_regions[label]["yxCoordinates"] = numpy.hstack((regionrows , regioncols))
            background_regions[label]["mean"] = numpy.mean(imageArray[regionindices , :] , axis=0)
            background_regions[label]["standard_deviation"] = numpy.std(imageArray[regionindices , :] , axis=0)            
            background_regions[label]["mySubBlobs"] = [[label , regionindices.sum()]] #a record is kept of the background region
            background_regions[label]["radiances"] = imageArray[regionindices , :]
        else:
            regions[label]["yxCoordinates"] = numpy.hstack((regionrows , regioncols))
            regions[label]["mean"] = numpy.mean(imageArray[regionindices , :] , axis = 0)
            regions[label]["background"] = []
            try:
              regions[label]["mean"] = regions[label]["mean"] * globalMean / numpy.mean(regions[label]["mean"])
            except Exception:
              sys.stderr.write('Failed to normalize the mean for region %i\n'%label)
              sys.stderr.write('%s\n'%repr(regions[label]["mean"]))
            regions[label]["standard_deviation"] = numpy.std(imageArray[regionindices , :] , axis=0)
            regions[label]["mySubBlobs"] = [[label , regionindices.sum()]] #a record is kept of all blobs and their respective sizes
            if regions[label]["mySubBlobs"][0][1] > 1:
              regions[label]["standard_deviation"] = numpy.mean(boxStandardDev[regionindices,:], axis=0)
            else:
              regions[label]["standard_deviation"] = boxStandardDev[regionindices,:].reshape(-1,)
    regionlabels = regions.keys()
    sys.stderr.write('non background region labels %i\n'%len(regionlabels))

    singletonRad = None
    for key in background_regions.keys():
        if singletonRad is None:
            singletonRad = background_regions[key]["radiances"]
            singletonYX = background_regions[key]["yxCoordinates"]
        else:
            singletonRad = numpy.vstack((singletonRad,background_regions[key]["radiances"]))
            singletonYX = numpy.vstack((singletonYX,background_regions[key]["yxCoordinates"]))


    #This is where the merging of blobs (hierarchical clustering) takes place.
    #First test whether regions which made the initial cut (where sdMask=True)
    #can be grouped together.
    #Start out by using the mean and standard dev of blobs to construct a t-test statistic.  
    #If this statistic is above a threshold PCUT, the blobs will be merged. The second blob
    #is always appended to the first. 

    #For regions below 3 pixels, use a chi-squared test in place of the t-test to determine
    #whether pixels should be merged to a blob.
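
    # --- illustrative sketch (not called): a stand-in for calc_tdist built on
    # Welch's t statistic per band (assumes n1, n2 > 1); the project's
    # calc_tdist may differ in detail.
    def _welch_pvalues_sketch(mean1, mean2, sd1, sd2, n1, n2):
        from scipy import stats
        se2 = sd1 ** 2 / n1 + sd2 ** 2 / n2
        t = (mean1 - mean2) / numpy.sqrt(se2)
        df = se2 ** 2 / ((sd1 ** 2 / n1) ** 2 / (n1 - 1) + (sd2 ** 2 / n2) ** 2 / (n2 - 1))
        return 2.0 * stats.t.sf(numpy.abs(t), df)   # blobs merge when min(p) is large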
    regionsBlobbed = defaultdict(dict)                         #regions will be appended to this dictionary as they are merged with other regions 
    regionsUnblobbed = copy.deepcopy(regions)   #regions will be removed from this dictionary when they have been merged to another region

    merge_keys = copy.copy(regions.keys())

    unmerged_regions = sorted([x for x in merge_keys if regions[x]["yxCoordinates"].shape[0] > MINCATALOGUEDSIZE]) 
    sys.stderr.write('Merging %i valid among a total of %i multi-cell blobs first\n'%(len(unmerged_regions), len(merge_keys)))
    merged_regions = []
    merge_count = 0
    for i, band1 in enumerate(sorted([x for x in merge_keys if regions[x]["yxCoordinates"].shape[0] > MINCATALOGUEDSIZE])):
        if band1 in unmerged_regions:
            unmerged_regions.remove(band1)
            regionsBlobbed[band1] = copy.deepcopy(regions[band1])
            del regionsUnblobbed[band1] 
            n1 = sum([subblob[1] for subblob in regions[band1]['mySubBlobs']])
            for j, band2 in enumerate(unmerged_regions):
                n2 = sum([subblob[1] for subblob in regions[band2]['mySubBlobs']])
                blobbed_test_stat, test_stat, n1n2 = calc_tdist(regions[band1]["mean"], regions[band2]["mean"], regions[band1]["standard_deviation"],regions[band2]["standard_deviation"], n1, n2)
                blobbed_test_stat = numpy.min(test_stat)
                if blobbed_test_stat > PCUTBLOB / len(regions[band1]["mean"]):
                    #sys.stderr.write('%s, %s, %s \n'%(repr(blobbed_test_stat), repr(test_stat), repr(n1n2)))
                    merged_regions.append(band2)
                    merge_count += 1
                    regionsBlobbed[band1]["yxCoordinates"] = numpy.vstack((regionsBlobbed[band1]["yxCoordinates"], regions[band2]["yxCoordinates"]))
                    regionsBlobbed[band1]["mySubBlobs"].append([band2,regions[band2]["yxCoordinates"].shape[0]]) 
            for merged in merged_regions:
                unmerged_regions.remove(merged)
                merge_count -= 1
                del regionsUnblobbed[merged]
            merged_regions = []
    
    sys.stderr.write('Number of Blobs after blob-to-blob merging %i Number of SubBlobs %i\n'%(len(regionsBlobbed), numpy.sum([len(regionsBlobbed[b]["mySubBlobs"]) for b in regionsBlobbed.keys()])))
    sys.stderr.write('Spread in pixels associated with blobs:\n')
    blobsizes = []
    for k in regionsBlobbed.keys():
      npixels = regionsBlobbed[k]["yxCoordinates"].size/2
      if npixels != 1:
        sys.stderr.write('largish blob %i %s\n'%(k, repr(regionsBlobbed[k]["mySubBlobs"])))
        sys.stderr.write('%d\n'%npixels)
      blobsizes.append(npixels)
    blobdist = numpy.histogram(blobsizes)
    sys.stderr.write('%s\n'%repr(blobdist))
    sys.stderr.write('Number of Pixels in Merged blobs: %i\n'%numpy.array(blobsizes).sum())
    regions = []

    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        rows = regionsBlobbed[band]["yxCoordinates"][:,0]
        cols = regionsBlobbed[band]["yxCoordinates"][:,1]
        merged_blobs[rows, cols] = band
    ############################
    #Re-group blob statistics###
    ############################

    blobMeans = []
    blobStandardDevs = []
    blobbedRegionKeys = sorted(regionsBlobbed.keys())
    for blob in blobbedRegionKeys:
        blobMeans.append(regionsBlobbed[blob]["mean"])
        blobStandardDevs.append(regionsBlobbed[blob]["standard_deviation"])
        try:
          if blobStandardDevs[-1][-1] == 0.0:
             row = regionsBlobbed[blob]['yxCoordinates'][0][0]
             col = regionsBlobbed[blob]['yxCoordinates'][0][1]
        except Exception:
           sys.stderr.write('############\n')
           sys.stderr.write('Shape of blobStandardDevs %s\n'%repr((numpy.array(blobStandardDevs)).shape))
           sys.stderr.write('%s\n'%repr(regionsBlobbed[blob]))
           sys.stderr.write('%s\n'%repr(blobStandardDevs[-1][-1]))
    blobMeans = numpy.array(blobMeans)
    try:
      blobMeansMean = numpy.mean(blobMeans,axis=1)
    except Exception:
      sys.stderr.write('blobmeans: %s\n'%repr(blobMeans))
    if len(blobMeans)==0:
      return

    blobStandardDevs = numpy.array(blobStandardDevs)

    sys.stderr.write('Done blob-blob merging\n')
    ###################################################################
    ######MERGE UNBLOBBED, PASSED FCUT REGIONS-Chi Squared Test#######
    ###################################################################

    if len(regionsUnblobbed.keys()) > 0: 
      #Merge any blobs (pixels that passed initial FCUT) which are smaller than MINCATALOGUEDSIZE
      #based on result of a chi-squared test
      unBlobbedMeans = []
      unBlobbedRegionKeys = sorted(regionsUnblobbed.keys())       #Avoid the first key, which holds the pixels that did not pass FCUT
      for unblobbed in unBlobbedRegionKeys:
          unBlobbedMeans.append(regionsUnblobbed[unblobbed]["mean"])  #Table the mean of the unblobbed regions
      unBlobbedMeans = numpy.array(unBlobbedMeans)

      unBlobbedChiSq = numpy.zeros((unBlobbedMeans.shape[0],blobMeans.shape[0]))
      for i in numpy.arange(blobMeans.shape[0]):
          #This is the vectorized argument to be used in the chi-squared test
          try:
            a=(unBlobbedMeans.T/numpy.mean(unBlobbedMeans, axis=1)).T
            b=blobMeans[i,:]/blobMeansMean[i]
            unBlobbedChiSq[:,i] = numpy.sum(numpy.square( ((a-b)*globalMean)/blobStandardDevs[i,:]), axis=1)
          except Exception:
            sys.stderr.write('Unable to perform this chi-squared test ' + repr(unBlobbedMeans) + '\n')
            
      unblobbed_test_stat = calc_chisq(unBlobbedChiSq, unBlobbedMeans.shape[1])  
      unblobbed_test_stat_max = numpy.max(unblobbed_test_stat, axis=1) #Take max

      #Table the label of the unblobbed region with the label of the blobbed region where the unblobbed-blobbed chi-squared test has its max value
      unblobbed_to_merge_with = numpy.hstack((numpy.array(unBlobbedRegionKeys).reshape(-1,1),numpy.array(blobbedRegionKeys)[numpy.argmax(unblobbed_test_stat,axis=1)].reshape(-1,1)))

      #Find out which unblobbed regions pass the chi-squared test
      unblobbed_to_merge_with = unblobbed_to_merge_with[unblobbed_test_stat_max > PCUTCHI,:] 

      #Do the merging of the unblobbed regions that passed chi-squared test with appropriate blobbed region
      for unblobbed in unblobbed_to_merge_with:
          regionsBlobbed[unblobbed[1]]["yxCoordinates"] = numpy.vstack((regionsBlobbed[unblobbed[1]]["yxCoordinates"],regionsUnblobbed[unblobbed[0]]["yxCoordinates"]))    
          regionsBlobbed[unblobbed[1]]["mySubBlobs"].append([unblobbed[0],regionsUnblobbed[unblobbed[0]]["yxCoordinates"].shape[0]])
          del regionsUnblobbed[unblobbed[0]]         
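
    # --- illustrative sketch (not called): a stand-in for calc_chisq, read as
    # the chi-squared survival probability with one degree of freedom per band;
    # large values mean the small region is spectrally consistent with the blob.
    def _chisq_pvalue_sketch(chisq_stat, nbands):
        from scipy import stats
        return stats.chi2.sf(chisq_stat, nbands)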
       

    #################################################################
    ##     MERGE UNBLOBBED, DID NOT PASS FCUT                      ##
    #Do the chi-squared test again, but this time with the pixels  ##
    #which did not pass the initial FCUT                           ##
    #################################################################
    sys.stderr.write('Merge Single Pixels with high local SD to known blobs\n')
    sdNotMask = sdMask==False
    #
    #sdPlot = Image.fromarray(numpy.array(255*sdNotMask,dtype=numpy.uint8))
    #sdPlot.save("sdNotMask.png","PNG",options="optimize")
    #

    unBlobbedMeans = [] 
    for key in background_regions.keys():
        unBlobbedMeans = background_regions[key]["radiances"]
        unBlobbedChiSq = numpy.zeros((unBlobbedMeans.shape[0], blobMeans.shape[0]))
        # this construct normalizes each bg pixel by its mean over all bands.
        a = (unBlobbedMeans.T/numpy.mean(unBlobbedMeans, axis=1)).T
        b = blobMeans / blobMeansMean[:, None]
        unBlobbedChiSq = numpy.apply_along_axis(chisq, 1, a, b, blobStandardDevs, globalMean)
        best_chisquared = numpy.min(unBlobbedChiSq, axis = 1)
        unblobbed_test_stat = calc_chisq(best_chisquared, unBlobbedMeans.shape[1])
        #unblobbed_test_stat_max = numpy.max(unblobbed_test_stat, axis = 1)
        bgcoord = background_regions[key]["yxCoordinates"]
        background_coords = imageIndices[bgcoord[:, 0], bgcoord[:, 1]]
        # this variable has two rows, the first is a flattened index into the image raster,
        #                             the second is the best chi-squared match to a blob.
        unblobbed_to_merge_with = numpy.hstack((background_coords.reshape(-1, 1), \
                                  numpy.array(blobbedRegionKeys)[numpy.argmin(unBlobbedChiSq, axis = 1)].reshape(-1, 1)))
        # Background pixels will be merged if they have a good enough match.
        unblobbed_to_merge_with = unblobbed_to_merge_with[unblobbed_test_stat > PCUTCHI, :]
        # this construct yields an N x 2 array  where N is the number of pixels in the image.
        # the array is a list of the row, column index of the ith pixel (counting across and then down).
        singletonCoords = numpy.hstack((ny.reshape(-1, 1), nx.reshape(-1, 1)))
        blobsToUpdate = numpy.unique(unblobbed_to_merge_with[:, 1])
        testregionsBlobbed = {}
        for blob in blobsToUpdate:
          testregionsBlobbed[blob] = {}
          pixelsToAdd = unblobbed_to_merge_with[unblobbed_to_merge_with[:, 1] == blob][:,0]
          try:
            regionsBlobbed[blob]["yxCoordinates"] = numpy.vstack((regionsBlobbed[blob]["yxCoordinates"], singletonCoords[pixelsToAdd, : ]))          
            #testregionsBlobbed[blob]["yxCoordinates"] = numpy.vstack((regionsBlobbed[blob]["yxCoordinates"], singletonCoords[pixelsToAdd, : ]))          
          except Exception:
            sys.stderr.write('%s\n'%repr(blob))
            sys.stderr.write('pixelstoadd %s\n'%repr(pixelsToAdd))
            sys.stderr.write('shapes: %s %s \n'%(regionsBlobbed[blob]["yxCoordinates"].shape, singletonCoords[pixelsToAdd,:].shape))
            sys.stderr.write('singleton coordinates %s\n'%repr(singletonCoords[pixelsToAdd,:]))
        for unblobbed in unblobbed_to_merge_with:
            regionsBlobbed[unblobbed[1]]["mySubBlobs"].append(["background pixel " + str(unblobbed[0]),1])
            regionsBlobbed[unblobbed[1]]["background"].append(1)

        background_regions[key]["yxCoordinates"] = numpy.delete(background_regions[key]["yxCoordinates"], numpy.arange(unblobbed_test_stat.size)[unblobbed_test_stat > PCUTCHI],0)
        background_regions[key]["radiances"] = numpy.delete(background_regions[key]["radiances"], numpy.arange(unblobbed_test_stat.size)[unblobbed_test_stat > PCUTCHI],0)
        background_regions[key]["mySubBlobs"][0][1] -= (unblobbed_test_stat > PCUTCHI).sum()


    sys.stderr.write('Done Merging Single High SD Pixels\n')   

    ##################################
    #   UNMATCHED BACKGROUND PIXELS  #
    ##################################

    singletonRad = None
    for key in background_regions.keys():
        if singletonRad is None:
            singletonRad = background_regions[key]["radiances"]
            singletonYX = background_regions[key]["yxCoordinates"]
        else:
            singletonRad = numpy.vstack((singletonRad,background_regions[key]["radiances"]))
            singletonYX = numpy.vstack((singletonYX,background_regions[key]["yxCoordinates"]))

    if singletonRad is not None:
      removeZeroSingletons = numpy.where( numpy.prod(singletonRad, 1) == 0 )
      if len(removeZeroSingletons[0]) > 0:
        singletonRad = numpy.delete(singletonRad, removeZeroSingletons, 0)
        singletonYX = numpy.delete(singletonYX, removeZeroSingletons, 0)
      singletonRadMeans = numpy.mean(singletonRad, axis = 0)
      singletonRad = singletonRad * globalMean / singletonRadMeans


    ##################################
    # TRY TO FIND BINARY BLOB MIXES ##    
    ##################################
    #for key in background_regions.keys():
    #  singleton

    sys.stderr.write('Cluster locally high SD pixels that differ from blobs with one another\n')
    ##################################
    #   CLUSTER UNMERGED PIXELS      #
    ##################################
    #binlabelStart = int(sorted(regionsBlobbed.keys())[-1])
    binlabelStart = num_features + 1
    if singletonRad is not None and len(singletonRad) > 0:
      start = 0
      chunk = 10000
      mergeSingletons = {}
      total = 0
      while total < len(singletonRad):
        end = min(len(singletonRad), start + chunk)
        mergeSingletons.update(mergevecs(singletonRad[start:end,:]))
        total +=  (end - start)
        start = start + chunk
      for i, tlabel in mergeSingletons.items():
          binlabel = binlabelStart + i
          #tlabel = mergeSingletons[i]
          regionsBlobbed[binlabel]["radiances"] = singletonRad[tlabel,:]
          regionsBlobbed[binlabel]["yxCoordinates"] = singletonYX[tlabel,:]
          regionsBlobbed[binlabel]["mySubBlobs"] = [["background pixels " + str(binlabel), len(tlabel)]]
          try:
            regionsBlobbed[binlabel]["background"].append(len(tlabel))
          except:
            regionsBlobbed[binlabel]["background"] = [len(tlabel)]
          regionsBlobbed[binlabel]["mean"] = numpy.mean(singletonRad[tlabel,:], axis=0)
          regionsBlobbed[binlabel]["standard_deviation"] = numpy.std(singletonRad[tlabel,:], axis=0)

   

    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        coords = regionsBlobbed[band]["yxCoordinates"]
        merged_blobs[coords[:, 0], coords[:, 1]] = band

    sys.stderr.write('Done Categorizing Pixels. Fill out catalog\n')
    imageArray[:, :, 4] = numpy.array(255. * imageArray[:, :, 4] / imageArray[:, :, 4].max(), dtype=numpy.uint8)
    catalog = {}
    candregions = []
    # no need to distinguish between the two categories at this point.
    regionsBlobbed.update(regionsUnblobbed)
    finalregions = regionsBlobbed.keys()
    finalregions.sort()
    ncand = 0
    select = selectioncriteria.selectioncriteria[config.blobspectra.SELECTIONCRITERIA]
    paramval = selectioncriteria.scanvals[iscan]
    select = tuple([x.replace('SCAN', str(paramval)) for x in select])
    if len(select) == 1:
      select = (select[0], "True")
    nLevel0 = 0
    sys.stderr.write('Evaluating %i Regions\n'%len(finalregions))
    nEvaluated = 0
    for iregion in range(len(finalregions)):
        nEvaluated += 1
        select = selectioncriteria.selectioncriteria[config.blobspectra.SELECTIONCRITERIA]    
        select = tuple([x.replace('SCAN', str(paramval)) for x in select])
        if len(select) == 1:
          select = (select[0], "True")
        region = finalregions[iregion]
        spectraldistances = []
        regionsBlobbed[region]["mean"] = regionsBlobbed[region]["mean"] * globalMean / numpy.mean(regionsBlobbed[region]["mean"])
        catalog[region] = {'color' : region, 'nsubBlobs' : len(regionsBlobbed[region]["mySubBlobs"])}

        blobsizes = [sblob[1] for sblob in regionsBlobbed[region]["mySubBlobs"]]
        nsingleton = 0
        nblobs = 0       
        nsingleton = str(regionsBlobbed[region]["mySubBlobs"]).count('background')
        if 'background' in regionsBlobbed[region].keys():
          singletonpixels = sum(regionsBlobbed[region]['background'])
          nbgadded = len(regionsBlobbed[region]['background'])
        else:
          singletonpixels = 0
          nbgadded = 0
        nblobs = len(regionsBlobbed[region]["mySubBlobs"]) + singletonpixels - nbgadded
        catalog[region]['nsubBlobs'] = nblobs
        try:
          if catalog[region]['nsubBlobs']==nsingleton:
            catalog[region]['meanNonSingletonSize'] = 0.0
          else:
            catalog[region]['meanNonSingletonSize'] = (numpy.sum(blobsizes) - nsingleton)/ float(catalog[region]['nsubBlobs'] - nsingleton)
        except Exception:
          catalog[region]['meanNonSingletonSize'] = 0.0
        catalog[region]['singletons'] = nsingleton
        catalog[region]['meanBlobSize'] = numpy.mean(blobsizes)
        catalog[region]['sdBlobSize' ] = numpy.std(blobsizes)
        catalog[region]['nNonSingletonBlobs'] = catalog[region]['nsubBlobs'] - catalog[region]['singletons']
        saveSelect0 = select[0]
        #sys.stderr.write('Evaluating Region %i %d %d %d\n'%(nEvaluated, catalog[region]['meanNonSingletonSize'], catalog[region]['meanBlobSize'], catalog[region]['nNonSingletonBlobs']))
        if eval(select[0]):
          nLevel0 += 1         
          n1 = max(2, sum(blobsizes))
          for j in finalregions:
                if 'mean' in regionsBlobbed[j].keys():                  
                  regionsBlobbed[j]["mean"] = regionsBlobbed[j]["mean"] * globalMean / numpy.mean(regionsBlobbed[j]["mean"])
                  n2 = max(2, sum([subblob[1] for subblob in regionsBlobbed[j]['mySubBlobs']]))
                  blobbed_test_stat, test_stat, n1n2 = calc_tdist(regionsBlobbed[region]["mean"],regionsBlobbed[j]["mean"],regionsBlobbed[region]["standard_deviation"],regionsBlobbed[j]["standard_deviation"], n1, n2)
                  blobbed_test_stat = numpy.min(test_stat)
                  if numpy.isnan(blobbed_test_stat):
                    blobbed_test_stat = -numpy.inf
                elif 'radiances' in regionsBlobbed[j].keys():
                  if (regionsBlobbed[j]['radiances']==0).all():                
                    blobbed_test_stat = -numpy.inf
                  else:
                    regionindices = connected_regions == j
                    jmean = numpy.mean(imageArray[regionindices , :] , axis=0)
                    regionsBlobbed[j]["mean"] = jmean * globalMean / numpy.mean(jmean)
                    regionsBlobbed[j]["standard_deviation"] = numpy.std(imageArray[regionindices , :] , axis=0)
                    n2 = sum([subblob[1] for subblob in regionsBlobbed[j]['mySubBlobs']])
                    blobbed_test_stat, test_stat, n1n2 = calc_tdist(regionsBlobbed[region]["mean"],regionsBlobbed[j]["mean"],regionsBlobbed[region]["standard_deviation"],regionsBlobbed[j]["standard_deviation"], n1, n2)
                    blobbed_test_stat = numpy.min(test_stat)
                else:
                   blobbed_test_stat = -numpy.inf
                if j==region:
                  spectraldistances.append( -numpy.inf)
                else:
                  spectraldistances.append(blobbed_test_stat)
          spectralneighborind = numpy.argmax(spectraldistances)
          spectralneighbor = finalregions[spectralneighborind]
          catalog[region]['ClosestSpectralAlternate'] = (spectralneighbor, spectraldistances[spectralneighborind] * nbands / PCUTBLOB)
        else:
          # Failing the Level 0 cut forces Level 1 to fail
          select = ("True", "False")
          catalog[region]['ClosestSpectralAlternate'] = (0, numpy.inf)
        if eval(select[1]):
            candregions.append(region)
            nyx = regionsBlobbed[region]["yxCoordinates"]
            overlaprows = numpy.in1d(nyx[:,0], range(trueRowsStart, trueRowsStart + nTrueRows))
            overlapcols = numpy.in1d(nyx[:,1], range(trueColsStart, trueColsStart + nTrueCols))          
            Noverlap = sum(overlaprows & overlapcols)
            truePositive += Noverlap
            falsePositive += (numpy.sum(blobsizes) - Noverlap)
            taggedPixels += numpy.sum(blobsizes)
            key = filename + '-' + str(region)
            if not config.blobspectra.KNOWNTRUTH:
              key = 'KEY'
              score = '%5.6f'%((maxScore - catalog[region]['ClosestSpectralAlternate'][1])/maxScore)
              sys.stderr.write('%d %s\n'%(maxScore, repr(catalog[region]['ClosestSpectralAlternate'])))
              idcand = '%s'%repr(catalog[region]['color'])
              image = '%s'%os.path.basename(os.environ["map_input_file"])
              #sys.stderr.write('%s\n'%os.path.basename(os.environ["map_input_file"]))
              #for key in catalog[region].keys():
              #  sys.stderr.write('%s %s\n'%(repr(key), repr(catalog[region][key])))
              conv = utilities.makeGetLngLat(metadata)
              for pixel in nyx:
                latit, longit = conv(pixel[1], START + pixel[0])
                sys.stderr.write('%s %d %d %d %d %s\n'%(mask[pixel[0], pixel[1]], START + pixel[0], pixel[1], conv(0, 0)[0], conv(0, 0)[1], score))
                #sys.stderr.write('%6.3f %6.3f\n'%(latit, longit))
                #sys.stderr.write('%d %i %d %d\n'%(catalog[region]['meanNonSingletonSize'], catalog[region]['singletons'], catalog[region]['sdBlobSize'], catalog[region]['nNonSingletonBlobs']))
                outrec = ','.join((image, idcand, '%6.3f'%longit, '%6.3f'%latit, '%i'%(START + pixel[0]), '%i'%(pixel[1]), score, '%10f'%float(catalog[region]['meanBlobSize'])))
                binaryhadoop.emit(sys.stdout, key, outrec, encoding = binaryhadoop.TYPEDBYTES_JSON)
            if sum(overlaprows & overlapcols) > 0:
              sys.stderr.write('%s\n'%repr(saveSelect0))
              sys.stderr.write('%s\n'%repr(eval(saveSelect0)))
              sys.stderr.write('%s\n'%repr(catalog[region]))
              sys.stderr.write('tested with %i %i %i %i\n'%(trueRowsStart, nTrueRows, trueColsStart, nTrueCols))
              sys.stderr.write('%s\n'%repr(nyx[:,1]))
              sys.stderr.write('%s\n'%repr(range(trueRowsStart, trueRowsStart + nTrueRows)))
        else:
          catalog.pop(region)
            
    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        rows = regionsBlobbed[band]["yxCoordinates"][:,0]
        cols = regionsBlobbed[band]["yxCoordinates"][:,1]
        if band in candregions:
          merged_blobs[rows, cols] = band
        else:
          merged_blobs[rows, cols] = 0
    falseNegative = (nTrueRows * nTrueCols) - truePositive   # truth pixels that were missed
    trueNegative = nPixels - taggedPixels - falseNegative    # untagged pixels outside the truth region
    key = str(paramval)
    if taggedPixels > 0:
      fp = float(falsePositive)/taggedPixels
    else:
      fp = 0.0
    if (truePositive + falseNegative) > 0:
      eff = float(truePositive)/(truePositive + falseNegative)
    else:
      eff = 0.0
    sys.stderr.write('Confusion Matrix - TP: %d FP: %d TN: %d FN: %d Eff: %5.3f FPfrac:%5.3f \n'%(truePositive, falsePositive, trueNegative, falseNegative, eff, fp))
    val = '%d, %d, %d, %d, %5.3f, %5.3f'%(truePositive, falsePositive, trueNegative, falseNegative, eff, fp)
    if config.blobspectra.KNOWNTRUTH:
      outrec = int(truePositive), int(falsePositive), int(trueNegative), int(falseNegative), float(eff), float(fp)
      binaryhadoop.emit(sys.stdout, key, outrec, encoding = binaryhadoop.TYPEDBYTES_JSON)
        try:
            regionKey = metadata["originalDirName"]
        except KeyError:
            regionKey = metadata["outputFile"]

        pca_data = pcaData[regionKey]
        for i in xrange(numberOfPcaComponents):
            metadata["principal_component_" + str(i + 1)] = pca_data["principal_component_" + str(i + 1)]

        imageList = utilities.preprocessImage(bands, multipliers, wavelengths, {})
        sys.stderr.write("This is the number of bands after pre-processing: %r\n" % len(imageList))
        virtualBands, pcaComponents, imageMean = pcaProcessImage(imageList, pca_data, sorted(bands.keys()))
        rogueBands = checkPCA(pcaComponents, numberOfPcaComponents)
        metadata["image mean"] = imageMean.tolist()

        if len(rogueBands) > 0:
            sys.stderr.write("These are the dropped bands that have high pca loading variance: \n")
            for rogue in rogueBands:
                sys.stderr.write("[PCA component, Load index, Leave-one-out variance] " + str(rogue) + "\n")
        else:
            sys.stderr.write("There were no dropped bands that have high pca loading variance\n")

        projectedImage = {}
        projectedImage["metadata"] = metadata
        projectedImage["mask"] = mask
        for i in xrange(numberOfPcaComponents):
            projectedImage["band_" + str(i + 1)] = virtualBands[i, :]

        binaryhadoop.emit(sys.stdout, regionKey, projectedImage, encoding=binaryhadoop.TYPEDBYTES_PICKLE)
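
# A hedged, numpy-only sketch of the projection this example delegates to
# pcaProcessImage (project code, which also tracks loadings and the image mean):
import numpy
bandsMatrix = numpy.random.rand(5, 1000)                        # (nbands, npixels) stand-in
centered = bandsMatrix - bandsMatrix.mean(axis=1).reshape(-1, 1)
eigvals, eigvecs = numpy.linalg.eigh(numpy.cov(centered))       # ascending eigenvalues
components = eigvecs[:, ::-1][:, :3]                            # top 3 principal directions
virtualBands = numpy.dot(components.T, centered)                # (3, npixels) projected image
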
Example #8
                maskLeft = numpy.roll(mask, 1, axis=0)
                maskRight = numpy.roll(mask, -1, axis=0)
                maskUp = numpy.roll(mask, 1, axis=1)
                maskDown = numpy.roll(mask, -1, axis=1)

                numpy.logical_and(mask, maskLeft, mask)
                numpy.logical_and(mask, maskRight, mask)
                numpy.logical_and(mask, maskUp, mask)
                numpy.logical_and(mask, maskDown, mask)

            imageData["numPixels"] = numpy.nonzero(mask)[0].size
        else:
            bands[key] = numpy.array(value[mask],dtype=numpy.float64)


    if imageData["metadata"] is not None:
        region_key = "REGION_1"  #this is a placeholder...still need to figure out best way to get this
        imageList = preprocessImage(bands, multipliers, wavelengths, imageData)

        iDot,iPartial = main(imageList)
        imageData["imageDot"] = iDot
        imageData["imagePartial"] = iPartial

        if len(sys.argv) > 1 and sys.argv[1].upper() == "TRUE":
            noiseList = calcNoise(imageList,mask)
            nDot,nPartial = main(noiseList)
            imageData["noiseDot"] = nDot
            imageData["noisePartial"] = nPartial

        binaryhadoop.emit(sys.stdout,region_key,json.dumps(imageData),encoding=binaryhadoop.TYPEDBYTES_JSON)
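
# The roll-and-AND near the top of this example is a 4-neighbour (plus-shaped)
# erosion of the mask, except that numpy.roll wraps around at the edges; a
# minimal scipy equivalent without the wrap-around:
import numpy
from scipy import ndimage
maskExample = numpy.ones((5, 5), dtype=bool)
cross = numpy.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=bool)
eroded = ndimage.binary_erosion(maskExample, structure=cross)
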
Example #9
def doEverything(metadata, mask, originalBlock, numSetColors, numBands, bandToIndexLookup):
    globalStart = time.time()

    if numSetColors < numBands:
        heartbeat("Image should have {} bands, but only {} were set\n".format(numBands, numSetColors))
        return

    if cameraName != "ALI":
        heartbeat("Reducing its dimensionality to 26\n")
        windowsOfBandsToTake = range(0, 27) + range(43, 46) + range(58, 67) + range(77, 92) + range(115, 139)
        reducedBlock = originalBlock[windowsOfBandsToTake,:,:]
        bandsToTake = []
        reducedBlock2 = numpy.zeros((reducedBlock.shape[0]/3, reducedBlock.shape[1], reducedBlock.shape[2]), dtype=numpy.double)
        for i in xrange(reducedBlock2.shape[0]):
            bandsToTake.append(windowsOfBandsToTake[3*i + 1])
            reducedBlock2[i] = reducedBlock[3*i:3*(i+1),:,:].mean(axis=0)
        del reducedBlock
    else:
        bandsToTake = numpy.argsort(metadata["bandNames"])
        reducedBlock2 = originalBlock

    heartbeat("Improving the mask\n")
    betterMask = mask > 0
    for i in xrange(reducedBlock2.shape[0]):
        numpy.logical_and(betterMask, reducedBlock2[i,:,:] > 0.0, betterMask)
    del mask

    shrinkmask = betterMask > 0
    for roll in -2, -1, 1, 2:
        for axis in 0, 1:
            numpy.logical_and(shrinkmask, numpy.roll(betterMask, roll, axis=axis) > 0, shrinkmask)

    heartbeat("Taking logarithm\n")
    oldsettings = numpy.seterr(divide="ignore", invalid="ignore")
    block = numpy.log(reducedBlock2)
    numpy.seterr(**oldsettings)

    heartbeat("Reducing image to a bag of pixels\n")
    bag = block.view()
    bag.shape = (block.shape[0], block.shape[1] * block.shape[2])
    del block

    bagMask = betterMask.view()
    bagMask.shape = (originalBlock.shape[1] * originalBlock.shape[2])
    bag = bag[:, bagMask]
    del bagMask

    projectionMatrix = numpy.matrix([[1 if j == i else -1 if j == i + 1 else 0 for j in xrange(reducedBlock2.shape[0])] for i in xrange(reducedBlock2.shape[0] - 1)])
    projectionInverse = projectionMatrix.I

    heartbeat("Projecting the bag onto the color-only basis\n")
    projected = numpy.array(numpy.dot(projectionMatrix, bag))
    del bag

    heartbeat("Casting the projected bag onto the image shape\n")
    projectedBlock = numpy.empty((reducedBlock2.shape[0] - 1, reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    for i in xrange(reducedBlock2.shape[0] - 1):
        projectedBlock[i,betterMask] = projected[i,:]

    heartbeat("Detecting edges\n")
    # 5x5 Kroon without integer-rounding: http://www.k-zone.nl/Kroon_DerivativePaper.pdf
    Gx = numpy.array([[ 0.0007,  0.0052,  0.0370,  0.0052,  0.0007],
                      [ 0.0037,  0.1187,  0.2589,  0.1187,  0.0037],
                      [ 0.0,     0.0,     0.0,     0.0,     0.0],
                      [-0.0037, -0.1187, -0.2589, -0.1187, -0.0037],
                      [-0.0007, -0.0052, -0.0370, -0.0052, -0.0007]])
    Gy = Gx.T

    startTime = time.time()
    convBlock = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), numpy.double)
    for index in xrange(projectedBlock.shape[0]):
        heartbeat("    {} {} {}\n".format(index, time.time() - startTime, convBlock.max()))
        convGx2 = numpy.power(convolve(projectedBlock[index,:,:], Gx)[2:projectedBlock.shape[1]+2, 2:projectedBlock.shape[2]+2], 2)
        convGy2 = numpy.power(convolve(projectedBlock[index,:,:], Gy)[2:projectedBlock.shape[1]+2, 2:projectedBlock.shape[2]+2], 2)
        convBlock = convBlock + convGx2
        convBlock = convBlock + convGy2
    convBlock = numpy.sqrt(convBlock)
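
    # --- illustrative sketch (not called): the gradient-magnitude step on one
    # band, assuming convolve above behaves like scipy.signal.convolve2d in
    # "full" mode (hence the [2:n+2] crop for a 5x5 kernel).
    def _edge_magnitude_sketch(band):
        from scipy.signal import convolve2d
        gx = convolve2d(band, Gx, mode="same")
        gy = convolve2d(band, Gy, mode="same")
        return numpy.sqrt(gx ** 2 + gy ** 2)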

    heartbeat("Edges took {} seconds to detect\n".format(time.time() - startTime))

    heartbeat("Optimizing GMM\n")
    numGMMcomponents = 20
    startTime = time.time()

    if projected.shape[1] < 10 * numGMMcomponents or projected.shape[1] < 10 * projected.shape[0]:
        heartbeat("There are only {} points; skipping (number of GMM components is {} and number of dimensions in the space is {})\n".format(projected.shape[1], numGMMcomponents, projected.shape[0]))
        return

    attempts = 0
    done = False
    while not done:
        try:
            if 10000 < projected.shape[1]:
                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents)
                model.em(10)
                heartbeat("     time for first pass: {} seconds\n".format(time.time() - startTime))

                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat("     time for second pass: {} seconds\n".format(time.time() - startTime))

                randomSelection = projected[:,random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat("     time for third pass: {} seconds\n".format(time.time() - startTime))

                model = MoG(projected, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(5)
                done = True

            else:
                heartbeat("     skipping three-pass subfit because the dataset is small\n")

                model = MoG(projected, numGMMcomponents)
                model.em(5)
                done = True

        except numpy.linalg.linalg.LinAlgError:
            attempts += 1
            if attempts > 4:
                heartbeat("    could not fit in 5 attempts; giving up\n")
                return

    heartbeat("GMM took {} seconds to optimize\n".format(time.time() - startTime))

    heartbeat("Scoring all pixels with GMM\n")
    startTime = time.time()
    scores = logsumexp(model.compute_posteriors(projected, reinit=True, normalize=False, logscale=True), 0)

    scoresBlock = numpy.zeros((reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    scoresBlock[betterMask] = scores
    del scores

    themin, themax = numpy.percentile(convBlock[shrinkmask], [0.5, 99.5])
    convNorm = (convBlock[shrinkmask] - themin)/(themax - themin)
    convBlockNorm = (convBlock - themin)/(themax - themin)

    themin, themax = numpy.percentile(scoresBlock[shrinkmask], [0.5, 99.5])
    scoresNorm = 1.0 - (scoresBlock[shrinkmask] - themin)/(themax - themin)
    scoresBlockNorm = 1.0 - (scoresBlock - themin)/(themax - themin)
    rawscoresmin, rawscoresmax = themin, themax
    del scoresBlock
    del scoresNorm
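
    # --- illustrative sketch (not called): the robust normalization above in
    # isolation; values outside the central 99% range land outside [0, 1],
    # which is what the selection below exploits.
    def _robust_norm_sketch(values, lo=0.5, hi=99.5):
        vmin, vmax = numpy.percentile(values, [lo, hi])
        return (values - vmin) / (vmax - vmin)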

    selection = numpy.logical_and(scoresBlockNorm > 1.0, shrinkmask)
    indexes = zip(*numpy.nonzero(selection))

    scoredBag = numpy.argmax(model.compute_posteriors(projected, reinit=True), axis=0)
    heartbeat("GMM took {} seconds to score all pixels (a few times in different ways)\n".format(time.time() - startTime))

    heartbeat("Blurring scores for bucket-fill to spread better\n")
    startTime = time.time()
    spot = numpy.array([[math.exp(-((i - 2)**2 + (j - 2)**2)/2.0/1.0**2) for i in xrange(5)] for j in xrange(5)])
    spot = spot / spot.sum()
    blurScores = convolve(scoresBlockNorm[:,:], spot)[2:scoresBlockNorm.shape[0]+2, 2:scoresBlockNorm.shape[1]+2]
    heartbeat("Time to blur image: {} seconds\n".format(time.time() - startTime))

    heartbeat("Building the KD-tree\n")
    startTime = time.time()
    kdtree = KDTree(projected)
    dynamicRange = (lambda x: x[1] - x[0])(numpy.percentile(projected, [1, 99]))
    heartbeat("KD-tree took {} seconds to build\n".format(time.time() - startTime))

    heartbeat("Performing bucket-fill searches\n")
    startTime = time.time()

    clumps = set()
    assigned = numpy.empty((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(object))
    considered = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(bool))

    Clump.assigned = assigned
    Clump.projectedBlock = projectedBlock
    Clump.projectionInverse = projectionInverse
    Clump.metadata = metadata
    Clump.kdtree = kdtree
    Clump.bandsToTake = bandsToTake
    Clump.projected = projected
    Clump.convBlockNorm = convBlockNorm
    Clump.model = model
    Clump.rawscoresmin = rawscoresmin
    Clump.rawscoresmax = rawscoresmax
    Clump.dynamicRange = dynamicRange
    Clump.blurBlock = None
    Clump.scoresBlockNorm = scoresBlockNorm
    Clump.blurScores = blurScores

    for index, (x, y) in enumerate(indexes):
        if index % 100 == 0:
            heartbeat("    {} {}\n".format(float(index)/len(indexes), time.time() - startTime))
        queue = [(x, y)]
        clump = Clump((x, y))
        while len(queue) > 0:
            i, j = queue.pop()

            if i >= 0 and j >= 0 and i < shrinkmask.shape[0] and j < shrinkmask.shape[1] and shrinkmask[i, j]:
                if not considered[i, j]:
                    if blurScores[i, j] > 1.0:
                        clump.add((i, j))
                        assigned[i, j] = clump

                        if clump.size() > Clump.maxSize:
                            heartbeat("         clump is too big!\n")
                            break

                        newQueue = []
                        for ii in -1, 0, 1:
                            for jj in -1, 0, 1:
                                if (ii, jj) != (0, 0):   # all 8 neighbours; skip the pixel itself
                                    newQueue.append((i + ii, j + jj))

                        queue = newQueue + queue

                elif assigned[i, j] is not None and assigned[i, j] != clump:
                    heartbeat("         merging clumps\n")
                    clump = assigned[i, j].mergeIn(clump)
                    if clump.size() > Clump.maxSize:
                        heartbeat("             ... into one that was too big!\n")
                        break

                considered[i, j] = True

        if not clump.isEmpty() and clump.size() <= Clump.maxSize:
            heartbeat("     new clump with size {}\n".format(clump.size()))
            clumps.add(clump)

    heartbeat("bucket-fill search took {} seconds\n".format(time.time() - startTime))

    heartbeat("Calculating attributes of the image and clumps\n")
    startTime = time.time()

    def gmmSpectrum(index):
        unnormalized = numpy.exp(numpy.array(numpy.dot(projectionInverse, model.means[:,index].T))[0])
        return dict(zip(list(numpy.array(metadata["bandNames"])[bandsToTake]), unnormalized / unnormalized.sum()))

    def fullSpectrumAt(indexes):
        unnormalized = numpy.zeros(originalBlock.shape[0], dtype=numpy.double)
        for i, j in indexes:
            unnormalized += originalBlock[:,i,j]
        return dict(zip(metadata["bandNames"], unnormalized / len(indexes)))

    output = {"metadata": metadata}
    getLngLat = utilities.makeGetLngLat(metadata)
    getMeters = utilities.makeGetMeters(metadata)

    wavelengths = [metadata["bandWavelength"][x] for x in numpy.array(metadata["bandNames"])[bandsToTake]]
    clusterNumber = 0
    for clump in clumps:
        heartbeat("     do clump {}\n".format(clump.indexes))

        border1 = clump.border()
        borderPoint1 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        for i, j in border1:            # average over the border pixels themselves
            borderPoint1 = borderPoint1 + projectedBlock[:, i, j]
        borderPoint1 = borderPoint1 / len(border1)

        border2 = clump.border(list(clump.indexes) + list(border1))
        borderPoint2 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        for i, j in border2:
            borderPoint2 = borderPoint2 + projectedBlock[:, i, j]
        borderPoint2 = borderPoint2 / len(border2)

        numSeeds = len(clump.seeds)
        numPixels = clump.size()
        seeds = sorted((int(i), int(j)) for i, j in clump.seeds)
        indexes = sorted((int(i), int(j)) for i, j in clump.indexes)
        border1 = sorted((int(i), int(j)) for i, j in border1)
        border2 = sorted((int(i), int(j)) for i, j in border2)
        mean = list(clump.mean())
        meanSeeds = list(clump.meanSeeds())
        stdev = clump.stdev()
        specMean = clump.spectrumOf(clump.mean())
        specMeanSeeds = clump.spectrumOf(clump.meanSeeds())
        borderSpec1 = clump.spectrumOf(borderPoint1)
        borderSpec2 = clump.spectrumOf(borderPoint2)
        edgeScore1 = clump.edginess(border1)
        edgeScore2 = clump.edginess(border2)
        gmmScoreMean = clump.gmmscoreOf(clump.mean())
        gmmScoreMeanSeeds = clump.gmmscoreOf(clump.meanSeeds())
        fullSpectrum = fullSpectrumAt(clump.indexes)
        fullSpectrumSeeds = fullSpectrumAt(clump.seeds)
        fullSpectrumBorder1 = fullSpectrumAt(border1)
        fullSpectrumBorder2 = fullSpectrumAt(border2)

        r200 = float(clump.density(clump.meanSeeds(), 0.200))
        r500 = float(clump.density(clump.meanSeeds(), 0.500))

        if r200 > 0.0 and r500 > 0.0 and gmmScoreMeanSeeds > 1.42 and gmmScoreMeanSeeds - gmmScoreMean > 0.12 and math.log10(r200) < -2.78 and math.log10(r500) < -1.30 and stdev > 0.067 and edgeScore2 < 0.61:
            clusterName = "cluster_{}".format(clusterNumber)
            output[clusterName] = {}
            clusterNumber += 1

            output[clusterName]["contours95"] = [{}]
            c95 = output[clusterName]["contours95"][0]

            x0 = numpy.mean([x for x, y in border2])
            y0 = numpy.mean([y for x, y in border2])
            order = numpy.argsort([math.atan2(y - y0, x - x0) for x, y in border2])
            c95["rowcolpolygon"] = [(int(x), int(y)) for x, y in numpy.array(border2)[order]]
            c95["lnglatpolygon"] = [getLngLat(x, y) for x, y in numpy.array(border2)[order]]

            c95["centroidInLngLat"] = getLngLat(x0, y0)
            c95["areaInPixels"] = numPixels

            pixelLength = abs(getMeters(*getLngLat(x0 + 0.5, y0))[0] - getMeters(*getLngLat(x0 - 0.5, y0))[0])
            pixelHeight = abs(getMeters(*getLngLat(x0, y0 + 0.5))[1] - getMeters(*getLngLat(x0, y0 - 0.5))[1])
            c95["areaInMeters"] = numPixels * pixelLength * pixelHeight
            c95["circumferenceInMeters"] = 0.5*(pixelLength + pixelHeight) * len(border1)

            rawGMMscore = float(logsumexp(model.compute_posteriors(numpy.array([clump.meanSeeds()]).T, reinit=True, normalize=False, logscale=True), 0)[0])
            rawKNNscore = float(r500)

            c95["score"] = rawGMMscore, rawKNNscore

            c95["other"] = {
                "numSeeds": numSeeds,
                "numPixels": numPixels,
                "seeds": seeds,
                "indexes": indexes,
                "border1": border1,
                "border2": border2,
                "mean": mean,
                "meanSeeds": meanSeeds,
                "stdev": stdev,
                "specMean": specMean,
                "specMeanSeeds": specMeanSeeds,
                "borderSpec1": borderSpec1,
                "borderSpec2": borderSpec2,
                "edgeScore1": edgeScore1,
                "edgeScore2": edgeScore2,
                "gmmScoreMean": gmmScoreMean,
                "gmmScoreMeanSeeds": gmmScoreMeanSeeds,
                "fullSpectrum": fullSpectrum,
                "fullSpectrumSeeds": fullSpectrumSeeds,
                "fullSpectrumBorder1": fullSpectrumBorder1,
                "fullSpectrumBorder2": fullSpectrumBorder2,
                "r200": r200,
                "r500": r500}

    binaryhadoop.emit(sys.stdout, metadata["originalDirName"], output, encoding=binaryhadoop.TYPEDBYTES_JSON)

    heartbeat("Calculating attributes took {} seconds\n".format(time.time() - startTime))

    totalTime = time.time() - globalStart
    heartbeat("Time to do everything: {} sec, which is {} min\n".format(totalTime, totalTime/60.0))