def main(key, stack):
    sys.stderr.write('Aggregate for value %s\n' % repr(key))
    counts = numpy.zeros(4)
    for rec in stack:
        counts += rec[:4]
        sys.stderr.write('%s\n' % repr(rec))
    if counts[0] + counts[2] > 0:
        eff = float(counts[0]) / float(counts[0] + counts[2])
    else:
        eff = 0.0
    if counts[0] + counts[1] > 0:
        fpr = float(counts[1]) / (counts[0] + counts[1])
    else:
        fpr = 0.0
    outrec = '%f, %5.3f, %5.3f, %i, %i, %i, %i\n' % (
        float(key), eff, fpr, counts[0], counts[1], counts[2], counts[3])
    binaryhadoop.emit(sys.stdout, key, outrec, encoding=binaryhadoop.TYPEDBYTES_JSON)
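# A worked example of the arithmetic above, with purely illustrative numbers.
# The first four fields of each stacked record appear to be (TP, FP, TN, FN),
# matching the emitter in the blob-spectra job:
#
#   counts = [TP=40, FP=5, TN=60, FN=10]
#   eff = 40 / (40 + 60) = 0.4
#   fpr = 5 / (40 + 5)  ~= 0.111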
for i, lbl in enumerate(set(clusterList[:int(round(anomalyPercentage * nx.size)), 0])):
    xedges = clusterList[clusterList[:, 0] == lbl, 5]
    yedges = clusterList[clusterList[:, 0] == lbl, 4]
    cluster["cluster_" + str(i)] = makeContours(xedges, yedges, width, height, binSize)
    cluster["cluster_" + str(i)].update({"clusterCenter": clusterCenters[lbl, :].tolist()})
sys.stderr.write("ready to emit cluster information\n")
cluster["metadata"] = metadata
# Assign contour scores
for clusterKey, clusterValue in cluster.iteritems():
    if "metadata" not in clusterKey:
        for i in range(len(clusterValue['contours95'])):
            sys.stderr.write('reporter:status:still computing score\n')
            sys.stderr.flush()
            polygons = numpy.array([[round(x[0]), round(x[1])]
                                    for x in clusterValue['contours95'][i]['rowcolpolygon']])
            scoreIndex = []
            for p in polygons:
                scoreIndex.append(numpy.argmin(numpy.sum(numpy.square(numpy.fliplr(clusterList[:, 4:]) - p), axis=1)))
            scoremdist = mDist(numpy.array(cluster[clusterKey]["clusterCenter"]).reshape(1, -1),
                               pcaEigenvalues.reshape(1, -1))[0]
            try:
                scores = [scoreDict[x][1] for x in map(tuple, [map(int, map(round, y))
                          for y in clusterValue['contours95'][i]['rowcolpolygon']])]
                scoreedist = sum(scores) / len(scores)
            except KeyError:
                scoreedist = numpy.mean(clusterList[numpy.array(scoreIndex), 3])
            cluster[clusterKey]['contours95'][i]['score'] = [scoremdist, scoreedist]
binaryhadoop.emit(sys.stdout, regionKey, cluster, encoding=binaryhadoop.TYPEDBYTES_JSON)
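# mDist is defined elsewhere; given that its second argument is the vector of PCA
# eigenvalues, it is presumably a Mahalanobis-style distance in the PCA basis, with
# the eigenvalues playing the role of per-component variances. A minimal sketch of
# that idea (an assumption, not the project's actual helper):
def mdist_sketch(points, eigenvalues):
    import numpy
    # points: (n, k) coordinates in PCA space; eigenvalues: (1, k) variances
    return numpy.sqrt(numpy.sum(numpy.square(points) / eigenvalues, axis=1))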
    bands[key] = numpy.array(value[mask], dtype=numpy.float64)

if imageData["metadata"] is not None:
    if 'HSI' in imageData["metadata"].keys():
        wavelengths = {}
        multipliers = {}
        for w, wave in enumerate(imageData["metadata"][unicode("HSI")][unicode("wavelength")]):
            wavelengths["B" + "%03d" % w] = float(wave)
            multipliers["B" + "%03d" % w] = 1
    else:
        wavelengths = imageData["metadata"]["bandWavelength"]
        multipliers = imageData["metadata"]["bandMultiplier"]
    imageList = utilities.preprocessImage(bands, multipliers, wavelengths, imageData)
    sys.stderr.write("This is the number of bands: %r\n" % len(imageList))
    iDot, iPartial = main(imageList)
    imageData["imageDot"] = iDot
    imageData["imagePartial"] = iPartial
    if noiseFlag.upper() == "TRUE":
        noiseList = calcNoise(imageList, mask)
        nDot, nPartial = main(noiseList)
        imageData["noiseDot"] = nDot
        imageData["noisePartial"] = nPartial
    try:
        regionKey = imageData["metadata"]["originalDirName"]
    except KeyError:
        regionKey = imageData["metadata"]["outputFile"]
    binaryhadoop.emit(sys.stdout, regionKey, imageData, encoding=binaryhadoop.TYPEDBYTES_JSON)
if 'HSI' in imageData["metadata"].keys():
    wavelengths = {}
    multipliers = {}
    for w, wave in enumerate(imageData["metadata"][unicode("HSI")][unicode("wavelength")]):
        wavelengths["B" + "%03d" % w] = float(wave)
        multipliers["B" + "%03d" % w] = 1
else:
    wavelengths = imageData["metadata"]["bandWavelength"]
    multipliers = imageData["metadata"]["bandMultiplier"]
imageList = utilities.preprocessImage(bands, multipliers, wavelengths, imageData, selectBands=selectbands)
sys.stderr.write("This is the number of bands: %r\n" % len(imageList))
iDot, iPartial = main(imageList)
imageData["imageDot"] = iDot
imageData["imagePartial"] = iPartial
if noiseFlag.upper() == "TRUE":
    noiseList = calcNoise(imageList, mask)
    nDot, nPartial = main(noiseList)
    imageData["noiseDot"] = nDot
    imageData["noisePartial"] = nPartial
try:
    regionKey = imageData["metadata"]["originalDirName"]
except KeyError:
    regionKey = imageData["metadata"]["outputFile"]
binaryhadoop.emit(sys.stdout, regionKey, imageData, encoding=binaryhadoop.TYPEDBYTES_JSON)
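# utilities.preprocessImage is defined elsewhere; the core per-band scaling it
# presumably performs looks something like this sketch (names hypothetical, not
# the project's actual implementation):
def apply_multipliers_sketch(bands, multipliers):
    import numpy
    # scale each band's radiances by its calibration multiplier, ordered by band name
    return [numpy.asarray(bands[name], dtype=numpy.float64) * multipliers[name]
            for name in sorted(bands.keys())]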
#!/usr/bin/env python

import binaryhadoop
import sys
import augustus
import report_unknowntruth
import json

if __name__ == "__main__":
    cmrecs = []
    for key, value in binaryhadoop.mapperInput(
            sys.stdin,
            typeMap={None: binaryhadoop.TYPEDBYTES_JSON,
                     "KEY": binaryhadoop.TYPEDBYTES_JSON}):
        cmrec = value.split(',')
        cmrecs.append('%d, %d' % (float(cmrec[0]), float(cmrec[1])))
    if len(cmrecs) > 0:
        html, img1, img2, img3 = report_unknowntruth.main(cmrecs, 'SummaryStatistics.json')
        record = json.dumps([html, img1, img2, img3])
        binaryhadoop.emit(sys.stdout, "KEY", record, encoding=binaryhadoop.TYPEDBYTES_JSON)
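# The record emitted above is a JSON-encoded four-element list; a downstream
# consumer would unpack it with something like (sketch):
#
#     html, img1, img2, img3 = json.loads(record)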
def main(filename, metadata, mask, bands, start, end, iscan,
         trueRowsStart, nTrueRows, trueColsStart, nTrueCols,
         FCUT=.08, PCUTBLOB=.001, PCUTCHI=.01, MINCATALOGUEDSIZE=0):
    START = start; END = end
    tstart = datetime.datetime.now()
    imageBands = sorted(bands.keys())
    windowRadius = 1
    truePositive = 0
    falsePositive = 0
    taggedPixels = 0
    imageArray = numpy.zeros((bands[imageBands[0]].shape + (len(bands),)))
    maxScore = len(imageBands) / PCUTBLOB
    # Attempt to remove some problematic bands, in particular, ones with 'stripes'.
    bandToRemove = []
    sys.stderr.write('Number of Bands: %i\n' % len(imageBands))
    sys.stderr.write('Evaluate Whether Columns are reasonable\n')
    for i, band in enumerate(imageBands):
        imageArray[:, :, i] = bands[band]
        if config.blobspectra.VALIDATECOLUMNS:
            ok = True
            bandrange = numpy.ma.max(bands[band].data) - numpy.ma.min(bands[band].data)
            sys.stderr.write('For band %i bandrange is %f\n' % (i, bandrange))
            sys.stderr.write('At %s\n' % (repr(numpy.unravel_index(numpy.ma.argmax(bands[band].data), bands[band].data.shape))))
            maxpix = numpy.unravel_index(numpy.ma.argmax(bands[band].data), bands[band].data.shape)
            sys.stderr.write('Value at max %d\n' % bands[band][maxpix])
            for j in range(1, bands[band].shape[1] - 1):
                # a column whose dynamic range is under 1% of the whole band's range is a stripe
                if (numpy.ma.max(bands[band].data[:, j]) - numpy.ma.min(bands[band].data[:, j])) / bandrange < .01:
                    sys.stderr.write('%i %f %f\n' % (j, numpy.ma.max(bands[band].data[:, j]), numpy.ma.min(bands[band].data[:, j])))
                    ok = False
            if not ok:
                sys.stderr.write('Remove band %i because of unnatural radiance distribution in Cols\n' % i)
                bandToRemove.append(i)
    if config.blobspectra.VALIDATEROWS:
        sys.stderr.write('Evaluate Whether Rows are reasonable\n')
        for i, band in enumerate(imageBands):
            ok = True
            bandrange = numpy.ma.max(bands[band].data) - numpy.ma.min(bands[band].data)
            for j in range(1, bands[band].shape[0] - 1):
                if (numpy.ma.max(bands[band].data[j, :]) - numpy.ma.min(bands[band].data[j, :])) / bandrange < .01:
                    ok = False
            if not ok:
                sys.stderr.write('Remove band %i because of unnatural radiance distribution in Rows\n' % i)
                bandToRemove.append(i)
    imageArray = numpy.delete(imageArray, bandToRemove, 2)
    sys.stderr.write('Shape %s\n' % repr(imageArray.shape))
    try:
        sys.stderr.write('Sample %s\n' % repr(imageArray[400, 400, :]))
    except:
        pass
    nPixels = imageArray.shape[0] * imageArray.shape[1]
    sys.stderr.write("this is the size of the image: " + str(imageArray.shape) + '\n')
    NimageRows = imageArray.shape[0]
    sys.stderr.write("This is the number of pixels per band in the image: " + str(imageArray.shape[0] * imageArray.shape[1]) + '\n')
    sys.stderr.write("This is the total number of pixels in the image: " + str(imageArray.size) + '\n')
    # This is the number of pixels that are non-zero. Here, a pixel is a 3-index object: x, y, wavelength.
    # Dividing this number by the number of bands ought to be pretty close to the number of unmasked
    # geospatial pixels in this portion.
    sys.stderr.write("This is the total number of non-zero pixels: " + str((imageArray.reshape(-1,) > 0).sum()) + '\n')
    # numpy.prod(imageArray, 2) multiplies the radiances across all wavelengths together. This construct
    # adds up the number of geospatial pixels for which at least one band is zero.
    sys.stderr.write("This is the total number of zero pixels: " + str((numpy.prod(imageArray, 2) == 0).sum()) + '\n')
    imageIndices = numpy.arange(imageArray.shape[0] * imageArray.shape[1]).reshape((imageArray.shape[0], imageArray.shape[1]))
    nbands = imageArray.shape[2]
    # The (ij)th element of ny yields i; the (ij)th element of nx yields j.
    ny, nx = numpy.mgrid[0:mask.shape[0], 0:mask.shape[1]]
    globalMean = numpy.mean(imageArray.reshape(-1,))
    sys.stderr.write('global mean: %f\n' % globalMean)
    sys.stderr.write('global var : %f\n' % numpy.var(imageArray.reshape(-1,)))
    if globalMean <= 5:
        sys.stderr.write('=====> Insufficient radiance in image. Returning!\n')
        return
    if numpy.var(imageArray.reshape(-1,)) < 200 ** 2:
        sys.stderr.write('=====> Insufficient variability in image. Returning!\n')
        return
    # Rescale, because the original mean was taken over all pixels,
    # not just the non-zero pixels in the mask.
    globalMean = globalMean * mask.sum() / (mask.shape[0] * mask.shape[1])

    #################################
    #####STANDARD DEVIATION WINDOW###
    #################################
    # Calculate the mean and standard deviation of 3-by-3 pixel windows
    boxStandardDev = numpy.zeros(imageArray.shape)
    boxMean = numpy.zeros(imageArray.shape)
    nBoxPixels = (1 + 2 * windowRadius) ** 2
    for i in xrange(nbands):
        box = rolling_window(imageArray[:, :, i], 1)
        boxsq = rolling_window(imageArray[:, :, i] ** 2, 1)
        boxStandardDev[:, :, i] = numpy.sqrt(boxsq / nBoxPixels - (box / nBoxPixels) ** 2)
        boxMean[:, :, i] = box / nBoxPixels
        boxStdZero = numpy.where(boxStandardDev[:, :, i] == 0)

    #################################
    ####CREATE STANDARD DEV MASK#####
    #################################
    # Create a boolean array (mask) with TRUE entries where the standard deviation
    # is below FCUT * max standard dev of each band
    zfilter = numpy.zeros(imageArray.shape)
    for i in xrange(nbands):
        #sys.stderr.write('%s\n'%repr(numpy.histogram((numpy.ma.masked_array(boxStandardDev[:,:,i], mask=mask)).compressed(), 100)))
        zfilter[:, :, i] = boxStandardDev[:, :, i] < FCUT * numpy.max(boxStandardDev[:, :, i])
    zfilterAll = numpy.ma.all(zfilter, 2)
    zfilterAll = numpy.ma.masked_array(zfilterAll, mask=mask)
    sys.stderr.write('Unmasked pixels %i\n' % zfilterAll.count())
    sys.stderr.write('Pixels in chromatically homogeneous region %s\n' % repr(zfilterAll.sum()))
    # Combine this mask with the original image shape mask
    #sdMask = numpy.ma.masked_array(zfilterAll.astype('uint8'), mask | ~zfilterAll)
    sdMask = zfilterAll

    ######################################
    #####CONNECTED COMPONENTS LABELING####
    ######################################
    # This step uses a connected-components labeling algorithm. It is applied to the
    # binary mask generated in the previous step, and labels each connected group of
    # pixels with a separate integer. Note that the background pixels are also labeled.
    # If, as is most often the case, the background is connected, it takes on the value 0.
    connected_regions, num_features = find_regions(numpy.ma.masked_array(sdMask, mask=mask))
    sys.stderr.write('Using find_regions\n')
    #connected_regions, num_features = ndimage.label(sdMask)
    #sys.stderr.write('Using scipy %i\n'%num_features)
    connected_regions[numpy.prod(imageArray, 2) == 0] = sorted(numpy.unique(connected_regions))[-1] + 1
    regions_labels = sorted(numpy.unique(connected_regions))
    # Background (i.e. region 0) will be green, masked out (radiance=0) areas will be red
    # per the rgb convention used here.
    #connectedPlot.save("connectedPlot.png", "PNG", options="optimize")

    ######################################
    #####BLOB STATISTICS##################
    ######################################
    regions = {}
    regions["bandNames"] = imageBands
    if num_features == 0:
        sys.stderr.write('=====> No features identified in this image. Returning!\n')
        return
    # Calculate the mean and standard deviation of each connected region, across each band.
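    # rolling_window is defined elsewhere; judging from its use above, it returns the
    # 3x3 box *sum* around each pixel. A minimal sketch of the same windowed mean and
    # standard deviation, assuming scipy is available (an illustration, not the
    # project's actual implementation):
    def window_stats_sketch(band, radius=1):
        from scipy import ndimage
        size = 2 * radius + 1
        boxmean = ndimage.uniform_filter(band, size=size)
        boxmeansq = ndimage.uniform_filter(band ** 2, size=size)
        # clip tiny negative values that arise from floating-point roundoff
        return boxmean, numpy.sqrt(numpy.maximum(boxmeansq - boxmean ** 2, 0.0))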
    # The arrays are placed in a dictionary which contains the row-column coordinates,
    # along with the mean and standard deviation for each band.
    # In most cases, the background of the image will be simply connected. In rare cases,
    # a feature from the initial sdMask may cut the image, in which case there will be
    # more than one background. Here, the x-y coordinates of each region are rigorously
    # checked against the background (which is False in the sdMask) x-y coordinates.
    nyx_background = set(map(tuple, numpy.hstack((ny[sdMask == False].reshape(-1, 1),
                                                  nx[sdMask == False].reshape(-1, 1)))))
    sys.stderr.write('Number of background pixels %i\n' % len(nyx_background))
    # note: this rebinding discards the "bandNames" entry set above
    regions = defaultdict(dict)
    background_regions = defaultdict(dict)
    # The bad pixels (i.e., zero radiance because information was missing) have the
    # last label, which gets avoided here.
    for i, label in enumerate(regions_labels[:-1]):
        # First assign pixels which are "background", i.e., did not pass the FCUT,
        # to a separate dictionary.
        regionindices = connected_regions == label
        regionrows = ny[regionindices].reshape(-1, 1)
        regioncols = nx[regionindices].reshape(-1, 1)
        regionpixels = set(map(tuple, numpy.hstack((regionrows, regioncols))))
        if regionpixels <= nyx_background:
            background_regions[label]["yxCoordinates"] = numpy.hstack((regionrows, regioncols))
            background_regions[label]["mean"] = numpy.mean(imageArray[regionindices, :], axis=0)
            background_regions[label]["standard_deviation"] = numpy.std(imageArray[regionindices, :], axis=0)
            # a record is kept of the background region
            background_regions[label]["mySubBlobs"] = [[label, regionindices.sum()]]
            background_regions[label]["radiances"] = imageArray[regionindices, :]
        else:
            regions[label]["yxCoordinates"] = numpy.hstack((regionrows, regioncols))
            regions[label]["mean"] = numpy.mean(imageArray[regionindices, :], axis=0)
            regions[label]["background"] = []
            try:
                regions[label]["mean"] = regions[label]["mean"] * globalMean / numpy.mean(regions[label]["mean"])
            except:
                sys.stderr.write('Fail to normalize this mean for region %i\n' % label)
                sys.stderr.write(repr(regions[label]["mean"]))
            regions[label]["standard_deviation"] = numpy.std(imageArray[regionindices, :], axis=0)
            # a record is kept of all blobs and their respective sizes
            regions[label]["mySubBlobs"] = [[label, regionindices.sum()]]
            if regions[label]["mySubBlobs"][0][1] > 1:
                regions[label]["standard_deviation"] = numpy.mean(boxStandardDev[regionindices, :], axis=0)
            else:
                regions[label]["standard_deviation"] = boxStandardDev[regionindices, :].reshape(-1,)
    regionlabels = regions.keys()
    sys.stderr.write('non background region labels %i\n' % len(regionlabels))
    singletonRad = None
    for key in background_regions.keys():
        if singletonRad is None:
            singletonRad = background_regions[key]["radiances"]
            singletonYX = background_regions[key]["yxCoordinates"]
        else:
            singletonRad = numpy.vstack((singletonRad, background_regions[key]["radiances"]))
            singletonYX = numpy.vstack((singletonYX, background_regions[key]["yxCoordinates"]))
    # This is where the merging of blobs (hierarchical clustering) takes place.
    # First test whether regions which made the initial cut (where sdMask=True)
    # can be grouped together. Start out by using the mean and standard dev of blobs
    # to construct a t-test statistic. If this statistic is above a threshold PCUT,
    # the blobs will be merged. The second blob is always appended to the first.
    # For regions below 3 pixels, use a chi-squared test in place of the t-test to
    # determine whether pixels should be merged to a blob.
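    # calc_tdist is defined elsewhere; since its output is compared against the
    # p-value threshold PCUTBLOB, it presumably returns per-band p-value-like
    # quantities for a two-sample t test. A rough sketch of that idea, assuming
    # scipy (hypothetical, not the actual helper):
    def tdist_pvalues_sketch(mean1, mean2, sd1, sd2, n1, n2):
        from scipy import stats
        # Welch-style per-band t statistic and its two-sided tail probability
        t = numpy.abs(mean1 - mean2) / numpy.sqrt(sd1 ** 2 / n1 + sd2 ** 2 / n2)
        return 2.0 * stats.t.sf(t, n1 + n2 - 2)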
    # regions will be appended to this dictionary as they are merged with other regions
    regionsBlobbed = defaultdict(dict)
    # regions will be removed from this dictionary when they have been merged to another region
    regionsUnblobbed = copy.deepcopy(regions)
    merge_keys = copy.copy(regions.keys())
    unmerged_regions = sorted([x for x in merge_keys if regions[x]["yxCoordinates"].shape[0] > MINCATALOGUEDSIZE])
    sys.stderr.write('Merging %i valid among a total of %i multi-cell blobs first\n' % (len(unmerged_regions), len(merge_keys)))
    merged_regions = []
    merge_count = 0
    for i, band1 in enumerate(sorted([x for x in merge_keys if regions[x]["yxCoordinates"].shape[0] > MINCATALOGUEDSIZE])):
        if band1 in unmerged_regions:
            unmerged_regions.remove(band1)
            regionsBlobbed[band1] = copy.deepcopy(regions[band1])
            del regionsUnblobbed[band1]
            n1 = sum([subblob[1] for subblob in regions[band1]['mySubBlobs']])
            for j, band2 in enumerate(unmerged_regions):
                n2 = sum([subblob[1] for subblob in regions[band2]['mySubBlobs']])
                blobbed_test_stat, test_stat, n1n2 = calc_tdist(regions[band1]["mean"], regions[band2]["mean"],
                                                                regions[band1]["standard_deviation"],
                                                                regions[band2]["standard_deviation"], n1, n2)
                blobbed_test_stat = numpy.min(test_stat)
                if blobbed_test_stat > PCUTBLOB / len(regions[band1]["mean"]):
                    #sys.stderr.write('%s, %s, %s \n'%(repr(blobbed_test_stat), repr(test_stat), repr(n1n2)))
                    merged_regions.append(band2)
                    merge_count += 1
                    regionsBlobbed[band1]["yxCoordinates"] = numpy.vstack((regionsBlobbed[band1]["yxCoordinates"],
                                                                           regions[band2]["yxCoordinates"]))
                    regionsBlobbed[band1]["mySubBlobs"].append([band2, regions[band2]["yxCoordinates"].shape[0]])
            for merged in merged_regions:
                unmerged_regions.remove(merged)
                merge_count -= 1
                del regionsUnblobbed[merged]
            merged_regions = []
    sys.stderr.write('Number of Blobs after blob-to-blob merging %i Number of SubBlobs %i\n'
                     % (len(regionsBlobbed), numpy.sum([len(regionsBlobbed[b]["mySubBlobs"]) for b in regionsBlobbed.keys()])))
    sys.stderr.write('Spread in pixels associated with blobs:\n')
    blobsizes = []
    for k in regionsBlobbed.keys():
        npixels = regionsBlobbed[k]["yxCoordinates"].size / 2
        if npixels != 1:
            sys.stderr.write('largish blob %i %s\n' % (k, repr(regionsBlobbed[k]["mySubBlobs"])))
            sys.stderr.write('%d\n' % (regionsBlobbed[k]["yxCoordinates"].size / 2.0))
        blobsizes.append(regionsBlobbed[k]["yxCoordinates"].size / 2)
    blobdist = numpy.histogram(blobsizes)
    sys.stderr.write('%s\n' % repr(blobdist))
    sys.stderr.write('Number of Pixels in Merged blobs: %i\n' % numpy.array(blobsizes).sum())
    regions = []
    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        rows = regionsBlobbed[band]["yxCoordinates"][:, 0]
        cols = regionsBlobbed[band]["yxCoordinates"][:, 1]
        merged_blobs[rows, cols] = band

    ############################
    #Re-group blob statistics###
    ############################
    blobMeans = []
    blobStandardDevs = []
    blobbedRegionKeys = sorted(regionsBlobbed.keys())
    for blob in blobbedRegionKeys:
        blobMeans.append(regionsBlobbed[blob]["mean"])
        blobStandardDevs.append(regionsBlobbed[blob]["standard_deviation"])
        try:
            if blobStandardDevs[-1][-1] == 0.0:
                row = regionsBlobbed[blob]['yxCoordinates'][0][0]
                col = regionsBlobbed[blob]['yxCoordinates'][0][1]
        except:
            sys.stderr.write('############\n')
            sys.stderr.write('Shape of blobStandardDevs %s\n' % repr((numpy.array(blobStandardDevs)).shape))
            sys.stderr.write('%s\n' % repr(regionsBlobbed[blob]))
            sys.stderr.write('%s\n' % repr(blobStandardDevs[-1][-1]))
    blobMeans = numpy.array(blobMeans)
    try:
        blobMeansMean = numpy.mean(blobMeans, axis=1)
    except:
        sys.stderr.write('blobmeans: %s\n' % repr(blobMeans))
        if len(blobMeans) == 0:
            return
    blobStandardDevs = numpy.array(blobStandardDevs)
    sys.stderr.write('Done blob-blob merging\n')

    ###################################################################
    ######MERGE UNBLOBBED, PASSED FCUT REGIONS - Chi-Squared Test#####
    ###################################################################
    if len(regionsUnblobbed.keys()) > 0:
        # Merge any blobs (pixels that passed the initial FCUT) which are smaller
        # than MINCATALOGUEDSIZE, based on the result of a chi-squared test.
        unBlobbedMeans = []
        unBlobbedRegionKeys = sorted(regionsUnblobbed.keys())
        # Avoid the first key, which holds the pixels that do not pass FCUT
        for unblobbed in unBlobbedRegionKeys:
            # Table the mean of the unblobbed regions
            unBlobbedMeans.append(regionsUnblobbed[unblobbed]["mean"])
        unBlobbedMeans = numpy.array(unBlobbedMeans)
        unBlobbedChiSq = numpy.zeros((unBlobbedMeans.shape[0], blobMeans.shape[0]))
        for i in numpy.arange(blobMeans.shape[0]):
            # This is the vectorized argument to be used in the chi-squared test
            try:
                a = (unBlobbedMeans.T / numpy.mean(unBlobbedMeans, axis=1)).T
                b = blobMeans[i, :] / blobMeansMean[i]
                unBlobbedChiSq[:, i] = numpy.sum(numpy.square(((a - b) * globalMean) / blobStandardDevs[i, :]), axis=1)
            except:
                sys.stderr.write('Unable to perform this chi-squared test ' + repr(unBlobbedMeans) + '\n')
        unblobbed_test_stat = calc_chisq(unBlobbedChiSq, unBlobbedMeans.shape[1])
        unblobbed_test_stat_max = numpy.max(unblobbed_test_stat, axis=1)  # Take max
        # Table the label of the unblobbed region with the label of the blobbed region
        # where the unblobbed-blobbed chi-squared test has its max value.
        unblobbed_to_merge_with = numpy.hstack((numpy.array(unBlobbedRegionKeys).reshape(-1, 1),
                                                numpy.array(blobbedRegionKeys)[numpy.argmax(unblobbed_test_stat, axis=1)].reshape(-1, 1)))
        # Find out which unblobbed regions pass the chi-squared test
        unblobbed_to_merge_with = unblobbed_to_merge_with[unblobbed_test_stat_max > PCUTCHI, :]
        # Merge the unblobbed regions that passed the chi-squared test with the appropriate blobbed region
        for unblobbed in unblobbed_to_merge_with:
            regionsBlobbed[unblobbed[1]]["yxCoordinates"] = numpy.vstack((regionsBlobbed[unblobbed[1]]["yxCoordinates"],
                                                                          regionsUnblobbed[unblobbed[0]]["yxCoordinates"]))
            regionsBlobbed[unblobbed[1]]["mySubBlobs"].append([unblobbed[0], regionsUnblobbed[unblobbed[0]]["yxCoordinates"].shape[0]])
            del regionsUnblobbed[unblobbed[0]]

    #################################################################
    ## MERGE UNBLOBBED, DID NOT PASS FCUT                          ##
    ## Do the chi-squared test again, but this time with the       ##
    ## pixels which did not pass the initial FCUT.                 ##
    #################################################################
    sys.stderr.write('Merge Single Pixels with high local SD to known blobs\n')
    sdNotMask = sdMask == False
    #sdPlot = Image.fromarray(numpy.array(255*sdNotMask,dtype=numpy.uint8))
    #sdPlot.save("sdNotMask.png","PNG",options="optimize")
    unBlobbedMeans = []
    for key in background_regions.keys():
        unBlobbedMeans = background_regions[key]["radiances"]
        unBlobbedChiSq = numpy.zeros((unBlobbedMeans.shape[0], blobMeans.shape[0]))
        # this construct normalizes each bg pixel by its mean over all bands.
        a = (unBlobbedMeans.T / numpy.mean(unBlobbedMeans, axis=1)).T
        b = blobMeans / blobMeansMean[:, None]
        unBlobbedChiSq = numpy.apply_along_axis(chisq, 1, a, b, blobStandardDevs, globalMean)
        best_chisquared = numpy.min(unBlobbedChiSq, axis=1)
        unblobbed_test_stat = calc_chisq(best_chisquared, unBlobbedMeans.shape[1])
        #unblobbed_test_stat_max = numpy.max(unblobbed_test_stat, axis = 1)
        bgcoord = background_regions[key]["yxCoordinates"]
        background_coords = imageIndices[bgcoord[:, 0], bgcoord[:, 1]]
        # this variable has two columns, the first is a flattened index into the image
        # raster, the second is the best chi-squared match to a blob.
        unblobbed_to_merge_with = numpy.hstack((background_coords.reshape(-1, 1),
                                                numpy.array(blobbedRegionKeys)[numpy.argmin(unBlobbedChiSq, axis=1)].reshape(-1, 1)))
        # Background pixels will be merged if they have a good enough match.
        unblobbed_to_merge_with = unblobbed_to_merge_with[unblobbed_test_stat > PCUTCHI, :]
        # this construct yields an N x 2 array where N is the number of pixels in the image.
        # the array is a list of the row, column index of the ith pixel (counting across and then down).
        singletonCoords = numpy.hstack((ny.reshape(-1, 1), nx.reshape(-1, 1)))
        blobsToUpdate = numpy.unique(unblobbed_to_merge_with[:, 1])
        testregionsBlobbed = {}
        for blob in blobsToUpdate:
            testregionsBlobbed[blob] = {}
            pixelsToAdd = unblobbed_to_merge_with[unblobbed_to_merge_with[:, 1] == blob][:, 0]
            try:
                regionsBlobbed[blob]["yxCoordinates"] = numpy.vstack((regionsBlobbed[blob]["yxCoordinates"],
                                                                      singletonCoords[pixelsToAdd, :]))
                #testregionsBlobbed[blob]["yxCoordinates"] = numpy.vstack((regionsBlobbed[blob]["yxCoordinates"], singletonCoords[pixelsToAdd, : ]))
            except:
                sys.stderr.write('%s\n' % repr(blob))
                sys.stderr.write('pixelstoadd %s\n' % repr(pixelsToAdd))
                sys.stderr.write('shapes: %s %s \n' % (regionsBlobbed[blob]["yxCoordinates"].shape,
                                                       singletonCoords[pixelsToAdd, :].shape))
                sys.stderr.write('singleton coordinates %s\n' % repr(singletonCoords[pixelsToAdd, :]))
        for unblobbed in unblobbed_to_merge_with:
            regionsBlobbed[unblobbed[1]]["mySubBlobs"].append(["background pixel " + str(unblobbed[0]), 1])
            regionsBlobbed[unblobbed[1]]["background"].append(1)
        background_regions[key]["yxCoordinates"] = numpy.delete(background_regions[key]["yxCoordinates"],
                                                                numpy.arange(unblobbed_test_stat.size)[unblobbed_test_stat > PCUTCHI], 0)
        background_regions[key]["radiances"] = numpy.delete(background_regions[key]["radiances"],
                                                            numpy.arange(unblobbed_test_stat.size)[unblobbed_test_stat > PCUTCHI], 0)
        background_regions[key]["mySubBlobs"][0][1] -= (unblobbed_test_stat > PCUTCHI).sum()
    sys.stderr.write('Done Merging Single High SD Pixels\n')

    ##################################
    #  UNMATCHED BACKGROUND PIXELS   #
    ##################################
    singletonRad = None
    for key in background_regions.keys():
        if singletonRad is None:
            singletonRad = background_regions[key]["radiances"]
            singletonYX = background_regions[key]["yxCoordinates"]
        else:
            singletonRad = numpy.vstack((singletonRad, background_regions[key]["radiances"]))
            singletonYX = numpy.vstack((singletonYX, background_regions[key]["yxCoordinates"]))
    removeZeroSingletons = numpy.where(numpy.prod(singletonRad, 1) == 0)
    if len(removeZeroSingletons[0]) > 0:
        singletonRad = numpy.delete(singletonRad, removeZeroSingletons, 0)
        singletonYX = numpy.delete(singletonYX, removeZeroSingletons, 0)
    singletonRadMeans = numpy.mean(singletonRad, axis=0)
    singletonRad = singletonRad * globalMean / singletonRadMeans

    ##################################
    # TRY TO FIND BINARY BLOB MIXES ##
    ##################################
    #for key in background_regions.keys():
    #    singleton

    sys.stderr.write('Cluster locally high SD pixels that differ from blobs with one another\n')

    ##################################
    #   CLUSTER UNMERGED PIXELS      #
    ##################################
    #binlabelStart = int(sorted(regionsBlobbed.keys())[-1])
    binlabelStart = num_features + 1
    if len(singletonRad) > 0:
        # cluster the leftover singleton pixels in fixed-size chunks
        start = 0
        chunk = 10000
        mergeSingletons = {}
        total = 0
        while total < len(singletonRad):
            end = min(len(singletonRad), start + chunk)
            mergeSingletons.update(mergevecs(singletonRad[start:end, :]))
            total += (end - start)
            start = start + chunk
        for i, tlabel in mergeSingletons.items():
            binlabel = binlabelStart + i
            #tlabel = mergeSingletons[i]
            regionsBlobbed[binlabel]["radiances"] = singletonRad[tlabel, :]
            regionsBlobbed[binlabel]["yxCoordinates"] = singletonYX[tlabel, :]
            regionsBlobbed[binlabel]["mySubBlobs"] = [["background pixels " + str(binlabel), len(tlabel)]]
            try:
                regionsBlobbed[binlabel]["background"].append(len(tlabel))
            except KeyError:
                regionsBlobbed[binlabel]["background"] = [len(tlabel)]
            regionsBlobbed[binlabel]["mean"] = numpy.mean(singletonRad[tlabel, :], axis=0)
            regionsBlobbed[binlabel]["standard_deviation"] = numpy.std(singletonRad[tlabel, :], axis=0)
    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        for i in xrange(regionsBlobbed[band]["yxCoordinates"].shape[0]):
            coord = regionsBlobbed[band]["yxCoordinates"]
            merged_blobs[coord[i, 0], coord[i, 1]] = band
    sys.stderr.write('Done Categorizing Pixels. Fill out catalog\n')
    imageArray[:, :, 4] = numpy.array(255. * imageArray[:, :, 4] / imageArray[:, :, 4].max(), dtype=numpy.uint8)
    catalog = {}
    candregions = []
    # no need to distinguish between the two categories at this point.
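    # calc_chisq is defined elsewhere; since its output is compared against the
    # p-value threshold PCUTCHI, it presumably converts a chi-squared statistic
    # and a number of degrees of freedom into a tail probability, as in this
    # sketch (assuming scipy; hypothetical, not the actual helper):
    def calc_chisq_sketch(chisq, dof):
        from scipy import stats
        # survival function: P(X >= chisq) for a chi-squared variable with dof degrees of freedom
        return stats.chi2.sf(chisq, dof)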
    regionsBlobbed.update(regionsUnblobbed)
    finalregions = regionsBlobbed.keys()
    finalregions.sort()
    ncand = 0
    select = selectioncriteria.selectioncriteria[config.blobspectra.SELECTIONCRITERIA]
    paramval = selectioncriteria.scanvals[iscan]
    select = tuple([x.replace('SCAN', str(paramval)) for x in select])
    if len(select) == 1:
        select = (select[0], "True")
    nLevel0 = 0
    sys.stderr.write('Evaluating %i Regions\n' % len(finalregions))
    nEvaluated = 0
    for iregion in range(len(finalregions)):
        nEvaluated += 1
        # re-fetch the selection criteria, since a failed Level 0 cut rewrites `select` below
        select = selectioncriteria.selectioncriteria[config.blobspectra.SELECTIONCRITERIA]
        select = tuple([x.replace('SCAN', str(paramval)) for x in select])
        if len(select) == 1:
            select = (select[0], "True")
        region = finalregions[iregion]
        spectraldistances = []
        regionsBlobbed[region]["mean"] = regionsBlobbed[region]["mean"] * globalMean / numpy.mean(regionsBlobbed[region]["mean"])
        catalog[region] = {'color': region, 'nsubBlobs': len(regionsBlobbed[region]["mySubBlobs"])}
        blobsizes = [sblob[1] for sblob in regionsBlobbed[region]["mySubBlobs"]]
        nsingleton = 0
        nblobs = 0
        nsingleton = str(regionsBlobbed[region]["mySubBlobs"]).count('background')
        if 'background' in regionsBlobbed[region].keys():
            singletonpixels = sum(regionsBlobbed[region]['background'])
            nbgadded = len(regionsBlobbed[region]['background'])
        else:
            singletonpixels = 0
            nbgadded = 0
        nblobs = len(regionsBlobbed[region]["mySubBlobs"]) + singletonpixels - nbgadded
        catalog[region]['nsubBlobs'] = nblobs
        try:
            if catalog[region]['nsubBlobs'] == nsingleton:
                catalog[region]['meanNonSingletonSize'] = 0.0
            else:
                catalog[region]['meanNonSingletonSize'] = (numpy.sum(blobsizes) - nsingleton) / float(catalog[region]['nsubBlobs'] - nsingleton)
        except:
            catalog[region]['meanNonSingletonSize'] = 0.0
        catalog[region]['singletons'] = nsingleton
        catalog[region]['meanBlobSize'] = numpy.mean(blobsizes)
        catalog[region]['sdBlobSize'] = numpy.std(blobsizes)
        catalog[region]['nNonSingletonBlobs'] = catalog[region]['nsubBlobs'] - catalog[region]['singletons']
        saveSelect0 = select[0]
        #sys.stderr.write('Evaluating Region %i %d %d %d\n'%(nEvaluated, catalog[region]['meanNonSingletonSize'], catalog[region]['meanBlobSize'], catalog[region]['nNonSingletonBlobs']))
        if eval(select[0]):
            nLevel0 += 1
            n1 = max(2, sum(blobsizes))
            for j in finalregions:
                if 'mean' in regionsBlobbed[j].keys():
                    regionsBlobbed[j]["mean"] = regionsBlobbed[j]["mean"] * globalMean / numpy.mean(regionsBlobbed[j]["mean"])
                    n2 = max(2, sum([subblob[1] for subblob in regionsBlobbed[j]['mySubBlobs']]))
                    blobbed_test_stat, test_stat, n1n2 = calc_tdist(regionsBlobbed[region]["mean"], regionsBlobbed[j]["mean"],
                                                                    regionsBlobbed[region]["standard_deviation"],
                                                                    regionsBlobbed[j]["standard_deviation"], n1, n2)
                    blobbed_test_stat = numpy.min(test_stat)
                    if numpy.isnan(blobbed_test_stat):
                        blobbed_test_stat = -numpy.inf
                elif 'radiances' in regionsBlobbed[j].keys():
                    if (regionsBlobbed[j]['radiances'] == 0).all():
                        blobbed_test_stat = -numpy.inf
                    else:
                        regionindices = connected_regions == j
                        jmean = numpy.mean(imageArray[regionindices, :], axis=0)
                        regionsBlobbed[j]["mean"] = jmean * globalMean / numpy.mean(jmean)
                        regionsBlobbed[j]["standard_deviation"] = numpy.std(imageArray[regionindices, :], axis=0)
                        n2 = sum([subblob[1] for subblob in regionsBlobbed[j]['mySubBlobs']])
                        blobbed_test_stat, test_stat, n1n2 = calc_tdist(regionsBlobbed[region]["mean"], regionsBlobbed[j]["mean"],
                                                                        regionsBlobbed[region]["standard_deviation"],
                                                                        regionsBlobbed[j]["standard_deviation"], n1, n2)
                        blobbed_test_stat = numpy.min(test_stat)
                else:
                    blobbed_test_stat = -numpy.inf
                if j == region:
                    spectraldistances.append(-numpy.inf)
                else:
                    spectraldistances.append(blobbed_test_stat)
            spectralneighborind = numpy.argmax(spectraldistances)
            spectralneighbor = finalregions[spectralneighborind]
            catalog[region]['ClosestSpectralAlternate'] = (spectralneighbor, spectraldistances[spectralneighborind] * nbands / PCUTBLOB)
        else:
            # Failing the Level 0 cut forces Level 1 to fail
            select = ("True", "False")
            catalog[region]['ClosestSpectralAlternate'] = (0, numpy.inf)
        if eval(select[1]):
            candregions.append(region)
            nyx = regionsBlobbed[region]["yxCoordinates"]
            overlaprows = numpy.in1d(nyx[:, 0], range(trueRowsStart, trueRowsStart + nTrueRows))
            overlapcols = numpy.in1d(nyx[:, 1], range(trueColsStart, trueColsStart + nTrueCols))
            Noverlap = sum(overlaprows & overlapcols)
            truePositive += Noverlap
            falsePositive += (numpy.sum(blobsizes) - Noverlap)
            taggedPixels += numpy.sum(blobsizes)
            key = filename + '-' + str(region)
            if not config.blobspectra.KNOWNTRUTH:
                key = 'KEY'
            score = '%5.6f' % ((maxScore - catalog[region]['ClosestSpectralAlternate'][1]) / maxScore)
            sys.stderr.write('%d %s\n' % (maxScore, repr(catalog[region]['ClosestSpectralAlternate'])))
            idcand = '%s' % repr(catalog[region]['color'])
            image = '%s' % os.path.basename(os.environ["map_input_file"])
            #sys.stderr.write('%s\n'%os.path.basename(os.environ["map_input_file"]))
            #for key in catalog[region].keys():
            #    sys.stderr.write('%s %s\n'%(repr(key), repr(catalog[region][key])))
            conv = utilities.makeGetLngLat(metadata)
            for pixel in nyx:
                latit, longit = conv(pixel[1], START + pixel[0])
                sys.stderr.write('%s %d %d %d %d %s\n' % (mask[pixel[0], pixel[1]], START + pixel[0], pixel[1],
                                                          conv(0, 0)[0], conv(0, 0)[1], score))
                #sys.stderr.write('%6.3f %6.3f\n'%(latit, longit))
                #sys.stderr.write('%d %i %d %d\n'%(catalog[region]['meanNonSingletonSize'], catalog[region]['singletons'], catalog[region]['sdBlobSize'], catalog[region]['nNonSingletonBlobs']))
                outrec = ','.join((image, idcand, '%6.3f' % longit, '%6.3f' % latit,
                                   '%i' % (START + pixel[0]), '%i' % (pixel[1]), score,
                                   '%10f' % float(catalog[region]['meanBlobSize'])))
                binaryhadoop.emit(sys.stdout, key, outrec, encoding=binaryhadoop.TYPEDBYTES_JSON)
            if sum(overlaprows & overlapcols) > 0:
                sys.stderr.write('%s\n' % repr(saveSelect0))
                sys.stderr.write('%s\n' % repr(eval(saveSelect0)))
                sys.stderr.write('%s\n' % repr(catalog[region]))
                sys.stderr.write('tested with %i %i %i %i\n' % (trueRowsStart, nTrueRows, trueColsStart, nTrueCols))
                sys.stderr.write('%s\n' % repr(nyx[:, 1]))
                sys.stderr.write('%s\n' % repr(range(trueRowsStart, trueRowsStart + nTrueRows)))
        else:
            catalog.pop(region)
    merged_blobs = numpy.zeros(sdMask.shape)
    for band in regionsBlobbed.keys():
        rows = regionsBlobbed[band]["yxCoordinates"][:, 0]
        cols = regionsBlobbed[band]["yxCoordinates"][:, 1]
        if band in candregions:
            merged_blobs[rows, cols] = band
        else:
            merged_blobs[rows, cols] = 0
    trueNegative = (nTrueRows * nTrueCols) - truePositive
    falseNegative = nPixels - taggedPixels - trueNegative
    key = str(paramval)
    if taggedPixels > 0:
        fp = float(falsePositive) / taggedPixels
    else:
        fp = 0.0
    if (truePositive + trueNegative) > 0:
        eff = float(truePositive) / (truePositive + trueNegative)
    else:
        eff = 0.0
    sys.stderr.write('Confusion Matrix - TP: %d FP: %d TN: %d FN: %d Eff: %5.3f FPfrac:%5.3f \n'
                     % (truePositive, falsePositive, trueNegative, falseNegative, eff, fp))
    val = '%d, %d, %d, %d, %5.3f, %5.3f' % (truePositive, falsePositive, trueNegative, falseNegative, eff, fp)
    if config.blobspectra.KNOWNTRUTH:
        outrec = (int(truePositive), int(falsePositive), int(trueNegative),
                  int(falseNegative), float(eff), float(fp))
        binaryhadoop.emit(sys.stdout, key, outrec, encoding=binaryhadoop.TYPEDBYTES_JSON)
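# A worked example of the bookkeeping above, with purely illustrative numbers.
# Suppose the truth window is 20x30 pixels (nTrueRows*nTrueCols = 600), the tile
# has nPixels = 10000, and the candidate regions tag 700 pixels of which 450
# overlap the truth window:
#
#   truePositive  = 450
#   falsePositive = 700 - 450 = 250
#   trueNegative  = 600 - 450 = 150
#   falseNegative = 10000 - 700 - 150 = 9150
#   eff = 450 / (450 + 150) = 0.75
#   fp  = 250 / 700        ~= 0.357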
try:
    regionKey = metadata["originalDirName"]
except KeyError:
    regionKey = metadata["outputFile"]
pca_data = pcaData[regionKey]
for i in xrange(numberOfPcaComponents):
    metadata["principal_component_" + str(i + 1)] = pca_data["principal_component_" + str(i + 1)]
imageList = utilities.preprocessImage(bands, multipliers, wavelengths, {})
sys.stderr.write("This is the number of bands after pre-processing: %r\n" % len(imageList))
virtualBands, pcaComponents, imageMean = pcaProcessImage(imageList, pca_data, sorted(bands.keys()))
rogueBands = checkPCA(pcaComponents, numberOfPcaComponents)
metadata["image mean"] = imageMean.tolist()
if len(rogueBands) > 0:
    sys.stderr.write("These are the dropped bands that have high pca loading variance:\n")
    for rogue in rogueBands:
        sys.stderr.write("[PCA component, Load index, Leave-one-out variance] " + str(rogue) + "\n")
else:
    sys.stderr.write("There were no dropped bands that have high pca loading variance\n")
projectedImage = {}
projectedImage["metadata"] = metadata
projectedImage["mask"] = mask
for i in xrange(numberOfPcaComponents):
    projectedImage["band_" + str(i + 1)] = virtualBands[i, :]
binaryhadoop.emit(sys.stdout, regionKey, projectedImage, encoding=binaryhadoop.TYPEDBYTES_PICKLE)
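# pcaProcessImage is defined elsewhere; the "virtual bands" it returns are
# presumably the image pixels projected onto the stored principal components.
# A minimal sketch of that projection (names and shapes are assumptions):
def pca_project_sketch(pixels, components, mean):
    import numpy
    # pixels: (nbands, npixels); components: (ncomponents, nbands); mean: (nbands,)
    # returns (ncomponents, npixels) of PCA scores
    return numpy.dot(components, pixels - mean.reshape(-1, 1))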
    maskLeft = numpy.roll(mask, 1, axis=0)
    maskRight = numpy.roll(mask, -1, axis=0)
    maskUp = numpy.roll(mask, 1, axis=1)
    maskDown = numpy.roll(mask, -1, axis=1)
    # shrink the mask: a pixel survives only if all four of its neighbors are also set
    numpy.logical_and(mask, maskLeft, mask)
    numpy.logical_and(mask, maskRight, mask)
    numpy.logical_and(mask, maskUp, mask)
    numpy.logical_and(mask, maskDown, mask)
    imageData["numPixels"] = numpy.nonzero(mask)[0].size
else:
    bands[key] = numpy.array(value[mask], dtype=numpy.float64)

if imageData["metadata"] is not None:
    region_key = "REGION_1"  # this is a placeholder...still need to figure out the best way to get this
    imageList = preprocessImage(bands, multipliers, wavelengths, imageData)
    iDot, iPartial = main(imageList)
    imageData["imageDot"] = iDot
    imageData["imagePartial"] = iPartial
    if len(sys.argv) > 1 and sys.argv[1].upper() == "TRUE":
        noiseList = calcNoise(imageList, mask)
        nDot, nPartial = main(noiseList)
        imageData["noiseDot"] = nDot
        imageData["noisePartial"] = nPartial
    binaryhadoop.emit(sys.stdout, region_key, json.dumps(imageData), encoding=binaryhadoop.TYPEDBYTES_JSON)
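# The four rolls + logical_ands above implement a plus-shaped (4-neighbor) mask
# erosion, apart from the wraparound that numpy.roll introduces at the image
# edges. An equivalent sketch using scipy (an alternative, not the code above):
def erode_mask_sketch(mask):
    from scipy import ndimage
    # 4-connected structuring element; border pixels are treated as outside the mask
    structure = ndimage.generate_binary_structure(2, 1)
    return ndimage.binary_erosion(mask, structure=structure)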
def doEverything(metadata, mask, originalBlock, numSetColors, numBands, bandToIndexLookup):
    globalStart = time.time()
    if numSetColors < numBands:
        heartbeat("Image should have {} bands, but only {} were set\n".format(numBands, numSetColors))
        return
    if cameraName != "ALI":
        heartbeat("Reducing its dimensionality to 26\n")
        windowsOfBandsToTake = range(0, 27) + range(43, 46) + range(58, 67) + range(77, 92) + range(115, 139)
        reducedBlock = originalBlock[windowsOfBandsToTake, :, :]
        bandsToTake = []
        reducedBlock2 = numpy.zeros((reducedBlock.shape[0] / 3, reducedBlock.shape[1], reducedBlock.shape[2]), dtype=numpy.double)
        for i in xrange(reducedBlock2.shape[0]):
            # average each window of three adjacent bands into one representative band
            bandsToTake.append(windowsOfBandsToTake[3 * i + 1])
            reducedBlock2[i] = reducedBlock[3 * i : 3 * (i + 1), :, :].mean(axis=0)
        del reducedBlock
    else:
        bandsToTake = numpy.argsort(metadata["bandNames"])
        reducedBlock2 = originalBlock
    heartbeat("Improving the mask\n")
    betterMask = mask > 0
    for i in xrange(reducedBlock2.shape[0]):
        numpy.logical_and(betterMask, reducedBlock2[i, :, :] > 0.0, betterMask)
    del mask
    shrinkmask = betterMask > 0
    for roll in -2, -1, 1, 2:
        for axis in 0, 1:
            numpy.logical_and(shrinkmask, numpy.roll(betterMask, roll, axis=axis) > 0, shrinkmask)
    heartbeat("Taking logarithm\n")
    oldsettings = numpy.seterr(divide="ignore", invalid="ignore")
    block = numpy.log(reducedBlock2)
    numpy.seterr(**oldsettings)
    heartbeat("Reducing image to a bag of pixels\n")
    bag = block.view()
    bag.shape = (block.shape[0], block.shape[1] * block.shape[2])
    del block
    bagMask = betterMask.view()
    bagMask.shape = (originalBlock.shape[1] * originalBlock.shape[2])
    bag = bag[:, bagMask]
    del bagMask
    # difference basis: row i is +1 at band i and -1 at band i+1, which cancels
    # overall brightness and leaves only "color" information
    projectionMatrix = numpy.matrix([[1 if j == i else -1 if j == i + 1 else 0
                                      for j in xrange(reducedBlock2.shape[0])]
                                     for i in xrange(reducedBlock2.shape[0] - 1)])
    projectionInverse = projectionMatrix.I
    heartbeat("Projecting the bag onto the color-only basis\n")
    projected = numpy.array(numpy.dot(projectionMatrix, bag))
    del bag
    heartbeat("Casting the projected bag onto the image shape\n")
    projectedBlock = numpy.empty((reducedBlock2.shape[0] - 1, reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    for i in xrange(reducedBlock2.shape[0] - 1):
        projectedBlock[i, betterMask] = projected[i, :]
    heartbeat("Detecting edges\n")
    # 5x5 Kroon without integer-rounding: http://www.k-zone.nl/Kroon_DerivativePaper.pdf
    Gx = numpy.array([[ 0.0007,  0.0052,  0.0370,  0.0052,  0.0007],
                      [ 0.0037,  0.1187,  0.2589,  0.1187,  0.0037],
                      [ 0.0,     0.0,     0.0,     0.0,     0.0   ],
                      [-0.0037, -0.1187, -0.2589, -0.1187, -0.0037],
                      [-0.0007, -0.0052, -0.0370, -0.0052, -0.0007]])
    Gy = Gx.T
    startTime = time.time()
    convBlock = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), numpy.double)
    for index in xrange(projectedBlock.shape[0]):
        heartbeat("    {} {} {}\n".format(index, time.time() - startTime, convBlock.max()))
        convGx2 = numpy.power(convolve(projectedBlock[index, :, :], Gx)[2:projectedBlock.shape[1] + 2, 2:projectedBlock.shape[2] + 2], 2)
        convGy2 = numpy.power(convolve(projectedBlock[index, :, :], Gy)[2:projectedBlock.shape[1] + 2, 2:projectedBlock.shape[2] + 2], 2)
        convBlock = convBlock + convGx2
        convBlock = convBlock + convGy2
    convBlock = numpy.sqrt(convBlock)
    heartbeat("Edges took {} seconds to detect\n".format(time.time() - startTime))
    heartbeat("Optimizing GMM\n")
    numGMMcomponents = 20
    startTime = time.time()
    if projected.shape[1] < 10 * numGMMcomponents or projected.shape[1] < 10 * projected.shape[0]:
        heartbeat("There are only {} points; skipping (number of GMM components is {} and number of dimensions in the space is {})\n".format(projected.shape[1], numGMMcomponents, projected.shape[0]))
        return
    attempts = 0
    done = False
    while not done:
        try:
            if 10000 < projected.shape[1]:
                # three quick EM passes on random 10k-pixel subsets, then a short full-data pass
                randomSelection = projected[:, random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents)
                model.em(10)
                heartbeat("    time for first pass: {} seconds\n".format(time.time() - startTime))
                randomSelection = projected[:, random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat("    time for second pass: {} seconds\n".format(time.time() - startTime))
                randomSelection = projected[:, random.sample(xrange(projected.shape[1]), 10000)]
                model = MoG(randomSelection, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(10)
                heartbeat("    time for third pass: {} seconds\n".format(time.time() - startTime))
                model = MoG(projected, numGMMcomponents, means=model.means, covs=model.covs, mixprops=model.mixprops)
                model.em(5)
                done = True
            else:
                heartbeat("    skipping three-pass subfit because the dataset is small\n")
                model = MoG(projected, numGMMcomponents)
                model.em(5)
                done = True
        except numpy.linalg.linalg.LinAlgError:
            attempts += 1
            if attempts > 4:
                heartbeat("    could not fit in 4 attempts; giving up\n")
                return
    heartbeat("GMM took {} seconds to optimize\n".format(time.time() - startTime))
    heartbeat("Scoring all pixels with GMM\n")
    startTime = time.time()
    scores = logsumexp(model.compute_posteriors(projected, reinit=True, normalize=False, logscale=True), 0)
    scoresBlock = numpy.zeros((reducedBlock2.shape[1], reducedBlock2.shape[2]), dtype=numpy.double)
    scoresBlock[betterMask] = scores
    del scores
    themin, themax = numpy.percentile(convBlock[shrinkmask], [0.5, 99.5])
    convNorm = (convBlock[shrinkmask] - themin) / (themax - themin)
    convBlockNorm = (convBlock - themin) / (themax - themin)
    themin, themax = numpy.percentile(scoresBlock[shrinkmask], [0.5, 99.5])
    scoresNorm = 1.0 - (scoresBlock[shrinkmask] - themin) / (themax - themin)
    scoresBlockNorm = 1.0 - (scoresBlock - themin) / (themax - themin)
    rawscoresmin, rawscoresmax = themin, themax
    del scoresBlock
    del scoresNorm
    selection = numpy.logical_and(scoresBlockNorm > 1.0, shrinkmask)
    indexes = zip(*numpy.nonzero(selection))
    scoredBag = numpy.argmax(model.compute_posteriors(projected, reinit=True), axis=0)
    heartbeat("GMM took {} seconds to score all pixels (a few times in different ways)\n".format(time.time() - startTime))
    heartbeat("Blurring scores for bucket-fill to spread better\n")
    startTime = time.time()
    spot = numpy.array([[math.exp(-((i - 2) ** 2 + (j - 2) ** 2) / 2.0 / 1.0 ** 2) for i in xrange(5)] for j in xrange(5)])
    spot = spot / spot.sum()
    blurScores = convolve(scoresBlockNorm[:, :], spot)[2:scoresBlockNorm.shape[0] + 2, 2:scoresBlockNorm.shape[1] + 2]
    heartbeat("Time to blur image: {} seconds\n".format(time.time() - startTime))
    heartbeat("Building the KD-tree\n")
    startTime = time.time()
    kdtree = KDTree(projected)
    dynamicRange = (lambda x: x[1] - x[0])(numpy.percentile(projected, [1, 99]))
    heartbeat("KD-tree took {} seconds to build\n".format(time.time() - startTime))
    heartbeat("Performing bucket-fill searches\n")
    startTime = time.time()
    clumps = set()
    assigned = numpy.empty((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(object))
    considered = numpy.zeros((projectedBlock.shape[1], projectedBlock.shape[2]), dtype=numpy.dtype(bool))
    Clump.assigned = assigned
    Clump.projectedBlock = projectedBlock
    Clump.projectionInverse = projectionInverse
    Clump.metadata = metadata
    Clump.kdtree = kdtree
    Clump.bandsToTake = bandsToTake
    Clump.projected = projected
    Clump.convBlockNorm = convBlockNorm
    Clump.model = model
    Clump.rawscoresmin = rawscoresmin
    Clump.rawscoresmax = rawscoresmax
    Clump.dynamicRange = dynamicRange
    Clump.blurBlock = None
    Clump.scoresBlockNorm = scoresBlockNorm
    Clump.blurScores = blurScores
    for index, (x, y) in enumerate(indexes):
        if index % 100 == 0:
            heartbeat("    {} {}\n".format(float(index) / len(indexes), time.time() - startTime))
        queue = [(x, y)]
        clump = Clump((x, y))
        while len(queue) > 0:
            i, j = queue.pop()
            if i >= 0 and j >= 0 and i < shrinkmask.shape[0] and j < shrinkmask.shape[1] and shrinkmask[i, j]:
                if not considered[i, j]:
                    if blurScores[i, j] > 1.0:
                        clump.add((i, j))
                        assigned[i, j] = clump
                        if clump.size() > Clump.maxSize:
                            heartbeat("    clump is too big!\n")
                            break
                        newQueue = []
                        for ii in -1, 0, 1:
                            for jj in -1, 0, 1:
                                if ii != jj:
                                    newQueue.append((i + ii, j + jj))
                        queue = newQueue + queue
                elif assigned[i, j] is not None and assigned[i, j] != clump:
                    heartbeat("    merging clumps\n")
                    clump = assigned[i, j].mergeIn(clump)
                    if clump.size() > Clump.maxSize:
                        heartbeat("    ... into one that was too big!\n")
                        break
                considered[i, j] = True
        if not clump.isEmpty() and clump.size() <= Clump.maxSize:
            heartbeat("    new clump with size {}\n".format(clump.size()))
            clumps.add(clump)
    heartbeat("bucket-fill search took {} seconds\n".format(time.time() - startTime))
    heartbeat("Calculating attributes of the image and clumps\n")
    startTime = time.time()

    def gmmSpectrum(index):
        unnormalized = numpy.exp(numpy.array(numpy.dot(projectionInverse, model.means[:, index].T))[0])
        return dict(zip(list(numpy.array(metadata["bandNames"])[bandsToTake]), unnormalized / unnormalized.sum()))

    def fullSpectrumAt(indexes):
        unnormalized = numpy.zeros(originalBlock.shape[0], dtype=numpy.double)
        for i, j in indexes:
            unnormalized += originalBlock[:, i, j]
        return dict(zip(metadata["bandNames"], unnormalized / len(indexes)))

    output = {"metadata": metadata}
    getLngLat = utilities.makeGetLngLat(metadata)
    getMeters = utilities.makeGetMeters(metadata)
    wavelengths = [metadata["bandWavelength"][x] for x in numpy.array(metadata["bandNames"])[bandsToTake]]
    clusterNumber = 0
    for clump in clumps:
        heartbeat("    do clump {}\n".format(clump.indexes))
        border1 = clump.border()
        borderPoint1 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        for i, j in border1:
            borderPoint1 = borderPoint1 + projectedBlock[:, i, j]
        borderPoint1 = borderPoint1 / len(border1)
        border2 = clump.border(list(clump.indexes) + list(border1))
        borderPoint2 = numpy.zeros(projectedBlock.shape[0], dtype=numpy.double)
        for i, j in border2:
            borderPoint2 = borderPoint2 + projectedBlock[:, i, j]
        borderPoint2 = borderPoint2 / len(border2)
        numSeeds = len(clump.seeds)
        numPixels = clump.size()
        seeds = sorted((int(i), int(j)) for i, j in clump.seeds)
        indexes = sorted((int(i), int(j)) for i, j in clump.indexes)
        border1 = sorted((int(i), int(j)) for i, j in border1)
        border2 = sorted((int(i), int(j)) for i, j in border2)
        mean = list(clump.mean())
        meanSeeds = list(clump.meanSeeds())
        stdev = clump.stdev()
        specMean = clump.spectrumOf(clump.mean())
        specMeanSeeds = clump.spectrumOf(clump.meanSeeds())
        borderSpec1 = clump.spectrumOf(borderPoint1)
        borderSpec2 = clump.spectrumOf(borderPoint2)
        edgeScore1 = clump.edginess(border1)
        edgeScore2 = clump.edginess(border2)
        gmmScoreMean = clump.gmmscoreOf(clump.mean())
        gmmScoreMeanSeeds = clump.gmmscoreOf(clump.meanSeeds())
        fullSpectrum = fullSpectrumAt(clump.indexes)
        fullSpectrumSeeds = fullSpectrumAt(clump.seeds)
        fullSpectrumBorder1 = fullSpectrumAt(border1)
        fullSpectrumBorder2 = fullSpectrumAt(border2)
        r200 = float(clump.density(clump.meanSeeds(), 0.200))
        r500 = float(clump.density(clump.meanSeeds(), 0.500))
        if (r200 > 0.0 and r500 > 0.0 and gmmScoreMeanSeeds > 1.42 and
                gmmScoreMeanSeeds - gmmScoreMean > 0.12 and
                math.log10(r200) < -2.78 and math.log10(r500) < -1.30 and
                stdev > 0.067 and edgeScore2 < 0.61):
            clusterName = "cluster_{}".format(clusterNumber)
            output[clusterName] = {}
            clusterNumber += 1
            output[clusterName]["contours95"] = [{}]
            c95 = output[clusterName]["contours95"][0]
            # order the border pixels by angle around their centroid to form a polygon
            x0 = numpy.mean([x for x, y in border2])
            y0 = numpy.mean([y for x, y in border2])
            order = numpy.argsort([math.atan2(y - y0, x - x0) for x, y in border2])
            c95["rowcolpolygon"] = [(int(x), int(y)) for x, y in numpy.array(border2)[order]]
            c95["lnglatpolygon"] = [getLngLat(x, y) for x, y in numpy.array(border2)[order]]
            c95["centroidInLngLat"] = getLngLat(x0, y0)
            c95["areaInPixels"] = numPixels
            # central differences of the meter coordinates give the pixel's physical size
            pixelLength = abs(getMeters(*getLngLat(x0 + 0.5, y0))[0] - getMeters(*getLngLat(x0 - 0.5, y0))[0])
            pixelHeight = abs(getMeters(*getLngLat(x0, y0 + 0.5))[1] - getMeters(*getLngLat(x0, y0 - 0.5))[1])
            c95["areaInMeters"] = numPixels * pixelLength * pixelHeight
            c95["circumferenceInMeters"] = 0.5 * (pixelLength + pixelHeight) * len(border1)
            rawGMMscore = float(logsumexp(model.compute_posteriors(numpy.array([clump.meanSeeds()]).T,
                                                                   reinit=True, normalize=False, logscale=True), 0)[0])
            rawKNNscore = float(r500)
            c95["score"] = rawGMMscore, rawKNNscore
            c95["other"] = {"numSeeds": numSeeds, "numPixels": numPixels, "seeds": seeds,
                            "indexes": indexes, "border1": border1, "border2": border2,
                            "mean": mean, "meanSeeds": meanSeeds, "stdev": stdev,
                            "specMean": specMean, "specMeanSeeds": specMeanSeeds,
                            "borderSpec1": borderSpec1, "borderSpec2": borderSpec2,
                            "edgeScore1": edgeScore1, "edgeScore2": edgeScore2,
                            "gmmScoreMean": gmmScoreMean, "gmmScoreMeanSeeds": gmmScoreMeanSeeds,
                            "fullSpectrum": fullSpectrum, "fullSpectrumSeeds": fullSpectrumSeeds,
                            "fullSpectrumBorder1": fullSpectrumBorder1, "fullSpectrumBorder2": fullSpectrumBorder2,
                            "r200": r200, "r500": r500}
    binaryhadoop.emit(sys.stdout, metadata["originalDirName"], output, encoding=binaryhadoop.TYPEDBYTES_JSON)
    heartbeat("Calculating attributes took {} seconds\n".format(time.time() - startTime))
    totalTime = time.time() - globalStart
    heartbeat("Time to do everything: {} sec, which is {} min\n".format(totalTime, totalTime / 60.0))
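# logsumexp is imported from elsewhere; it presumably computes log(sum(exp(a)))
# along an axis in a numerically stable way, as in this sketch (an illustration,
# not the project's actual implementation):
def logsumexp_sketch(a, axis=0):
    import numpy
    # subtract the per-slice maximum before exponentiating to avoid overflow
    amax = numpy.max(a, axis=axis, keepdims=True)
    return numpy.squeeze(amax, axis=axis) + numpy.log(numpy.sum(numpy.exp(a - amax), axis=axis))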