def compare_and_rm(t_names, m_names):
    """
    Drop test entries whose file name has no counterpart among the masters

    Both lists are passed through rm_fn to obtain the names that are
    compared; a name produced for the test list but not for the master list
    marks the test entry at the same position for removal.

    :param t_names: test file names
    :param m_names: master file names
    :return: t_names itself when nothing has to be removed, otherwise a new
        list holding only the entries of t_names that were kept
    """
    unmatched = sorted(set(rm_fn(t_names)) - set(rm_fn(m_names)))

    # nothing to drop: hand the input back untouched
    if not unmatched:
        return t_names

    logger.warning("Files to be removed: {0}".format(unmatched))

    # get only file name
    test_fn = rm_fn(t_names)

    # True keeps the entry at the same position in t_names, False drops it
    keep = [name not in unmatched for name in test_fn]

    logger.debug("remove boolean: {0}".format(keep))
    logger.debug("test_fn: {0}".format(test_fn))

    kept = list(itertools.compress(t_names, keep))
    logger.debug("final list: {0}".format(kept))

    return kept
def check_text_files(test, mast, ext):
    """Check master and test text-based files (headers, XML, etc.)
    line-by-line for differences.

    Each test/master pair is compared as a set of lines, so only lines that
    occur in the test file but not in the master file are reported; line
    order and repeated lines are ignored.

    Args:
        test <list>: paths to test text files
        mast <list>: paths to master text files
        ext <str>: file extension (should be .txt, .xml or .gtf)
    """
    logger.info("Checking {0} files...".format(ext))

    test, mast = Cleanup.remove_nonmatching_files(test, mast)

    # Do some checks to make sure files are worth testing
    if mast is None or test is None:
        logger.warning("No {0} files to check in test and/or mast "
                       "directories.".format(ext))
        return

    if len(mast) != len(test):
        logger.error("{0} file lengths differ. Master: {1} | Test:"
                     " {2}".format(ext, len(mast), len(test)))
        return

    for i, j in zip(test, mast):
        # Check file names for name differences before reading anything,
        # so mismatched pairs cost no file I/O.
        i_fn = i.split(os.sep)[-1]
        j_fn = j.split(os.sep)[-1]

        if i_fn != j_fn:
            logger.error(
                "{0} file names differ. Master: {1} | Test: {2}".format(
                    ext, j, i))
            return

        logger.info("{0} file names equivalent. Master: {1} | Test: "
                    "{2}".format(ext, j, i))

        # Context managers close both handles even if reading fails
        with open(i) as topen, open(j) as mopen:
            file_topen = topen.readlines()
            file_mopen = mopen.readlines()

        # Lines present in the test file only. Sorting makes the order of
        # the logged lines reproducible between runs (set order is not).
        txt_diffs = sorted(set(file_topen).difference(file_mopen))

        if txt_diffs:
            for k in txt_diffs:
                logger.error("{0} changes: {1}".format(ext, k))
        else:
            logger.info("No differences between {0} and {1}.".format(i, j))
def pct_diff_raster(ds_tband: np.ndarray,
                    ds_mband: np.ndarray,
                    diff_rast: np.ndarray,
                    nodata: int = -9999) -> np.ndarray:
    """
    Calculate percent difference raster

    The absolute difference is divided by the combined value range (largest
    value minus smallest value found in either raster, nodata pixels
    excluded) and multiplied by 100.

    :param ds_tband: array of test raster
    :param ds_mband: array of master raster
    :param diff_rast: array of difference raster
    :param nodata: int representing no data value
    :return: array of absolute differences as a percentage of the value range
    """
    # hide nodata pixels so they cannot influence the value range
    valid_test = np.ma.masked_where(ds_tband == nodata, ds_tband)
    valid_mast = np.ma.masked_where(ds_mband == nodata, ds_mband)

    # smallest and largest value across both rasters
    rmin = np.min([np.min(valid_test), np.min(valid_mast)])
    rmax = np.max([np.max(valid_test), np.max(valid_mast)])

    value_range = np.abs(float(rmax - rmin))

    # make a pct diff raster
    pct_diff = (np.abs(diff_rast) / value_range) * 100

    logger.warning("Percent difference raster created.")

    return pct_diff
def cancel_orders(txt_in: str, username: str, espa_env: str):
    """
    Cancel every ESPA order listed in an order file, then delete that file

    The file is read as JSON and iterated; each entry must provide an
    "orderid" key. One PUT request with status 'cancelled' is sent per entry.

    :param txt_in: The full path and filename of the input txt file
        containing the ESPA orders
    :param username: ESPA user name
    :param espa_env: ESPA environment
    :return:
    """
    # Read through a context manager so the handle is closed right away;
    # the same file is unlinked at the end of this function.
    with open(txt_in, "r") as order_file:
        order_list = json.load(order_file)

    t0 = get_time()

    espa_url = get_espa_env(espa_env)
    passwd = espa_login()

    for order in order_list:
        url = espa_url + api_config.api_urls["order"] + order["orderid"]

        logger.warning('Cancelling order: %s', url)

        cancellation = {'orderid': order['orderid'], 'status': 'cancelled'}
        result = requests.put(url, auth=(username, passwd),
                              json=cancellation).json()

        logger.info('Result: %s', result)

    logger.warning('Removing cancelled file: %s', txt_in)
    os.unlink(txt_in)

    logger.info("Processing time: {}".format(get_time() - t0))
def read_band_as_array(rast, n_bands=1):
    """Read gdal object as an array. Mask out nodata.

    Args:
        rast <osgeo.gdal.Dataset>: open raster
        n_bands <int>: band index handed to GetRasterBand (default=1)

    Returns:
        tuple: (array, nodata). The array is masked wherever it equals the
            value reported by GetNoDataValue. When the band object has no
            GetNoDataValue method, nodata is False and the array is
            returned unmasked.
    """
    # get nodata value
    band = rast.GetRasterBand(n_bands)

    r_nd = False
    try:
        r_nd = band.GetNoDataValue()
    except AttributeError:
        logger.warning(
            "Variable {0} does not have NoData value.".format(band))

    # read raster as array
    rast_arr = np.array(band.ReadAsArray())

    # mask nodata value, if it exists
    if r_nd is not False:
        rast_arr = np.ma.masked_where(rast_arr == r_nd, rast_arr)
        logger.info("NoData value: {0}".format(r_nd))
    else:
        # Keep the plain array. Returning the band object here (as the code
        # used to) hands callers something that is not array data.
        logger.info("NoData value could not be determined.")

    return (rast_arr, r_nd)
def check_images(test, mast):
    """Read in a generic (non-geographic) image, like JPEG, and do a diff

    Return diff raster if actually different

    Args:
        test <str>: path to test image
        mast <str>: path to master image

    Returns:
        The array returned by do_diff when at least one value differs;
        None when the images are equivalent, cannot be read, or cannot be
        differenced.
    """
    # Locate an image reader. Recent SciPy releases no longer provide imread
    # in either location, so a failed import has to fall through to
    # scikit-image instead of escaping as an uncaught ImportError.
    try:
        from scipy.misc import imread
    except ImportError:
        try:
            from scipy.ndimage import imread
        except ImportError:
            imread = None

    # read images
    if imread is not None:
        try:
            test_im = imread(test)
            mast_im = imread(mast)
        except ImportError:
            logger.warning("Likely missing Python Image Library (PIL).")
            imread = None

    if imread is None:
        # try Scikit Image
        try:
            from skimage.io import imread
            mast_im = imread(mast)
            test_im = imread(test)
        except (ValueError, TypeError, ImportError):
            logger.warning("Not able to open image with skimag.io. Likely missing image library.")
            return None

    # check diff
    try:
        diff_im = do_diff(test_im, mast_im)
    except ValueError:
        logger.error("Image {0} and {1} are not the same dimensions.".format(test, mast))
        return None

    if diff_im is False:
        # A False result is treated as "the subtraction failed", so the
        # images are not reported as equivalent.
        # NOTE(review): assumes do_diff uses False as its failure marker
        # and logs the underlying error itself -- confirm.
        logger.error("Image {0} and {1} could not be compared.".format(test, mast))
        return None

    # Any non-zero element means the images differ. The previous check,
    # len(np.nonzero(diff_im)) > 3, measured the number of array dimensions
    # rather than the number of differing values.
    # NOTE(review): if the literal 3 was meant as a pixel tolerance, use
    # np.count_nonzero(diff_im) > 3 here instead.
    if np.any(diff_im != 0):
        logger.error("Values differ between {0} and {1}.".format(test, mast))
        return diff_im

    logger.info("Values equivalent between {0} and {1}.".format(test, mast))
    return None
def place_order(espa_env: str, username: str, outdir: str = None, order_key: str = None):
    """
    Place the order with the appropriate ESPA environment

    One POST request is sent per order returned by load_order(order_key).
    Replies that contain an 'orderid' are collected and appended as JSON to
    the file named by order_text(outdir).

    :param order_key: Optionally specify a keyword pointing to a specific order
    :param outdir: Optionally specify full path to the output directory, otherwise os.getcwd() is used
    :param espa_env: The name of the ESPA environment
    :param username: ESPA username
    :return:
    """
    espa_url = espa_orders_api.get_espa_env(espa_env)

    orders = load_order(order_key)

    passwd = espa_orders_api.espa_login()

    order_length = len(orders)

    response = list()

    for i, order in enumerate(orders, start=1):
        logger.info("Requesting order %d of %d", i, order_length)

        r = requests.post(espa_url + api_config.api_urls["order"],
                          auth=(username, passwd),
                          json=order)

        try:
            result = r.json()
        except ValueError:
            # ValueError is the common base of the JSON decode errors raised
            # by the json and simplejson backends, so this handler works no
            # matter which one decoded the reply.
            # This error seems to occur when trying to decode a None-type object
            logger.error(
                "There was likely a problem connecting with the host.  "
                "Check to see if ESPA is down for maintenance.")
            continue

        if 'orderid' not in result:
            logger.error('Order "%s" %d/%d failed: %s',
                         order_key, i, order_length, result)
            continue

        response.append(result)

    filename = order_text(outdir)

    logger.warning('Store ordering results: %s', filename)

    with open(filename, "a") as f:
        f.write(json.dumps(response, indent=4))
def cleanup_files(indir: str):
    """
    Clean up all unpacked files, leaving alone the .tar.gz archives

    Every file below indir whose name does not end in .tar.gz or .tar is
    deleted, and every directory named gap_mask or stats is removed together
    with its contents. Files that cannot be deleted are skipped.

    :param indir: Full path to the target directory
    :return:
    """
    print("Cleaning up files...")

    archive_suffixes = (".tar.gz", ".tar")
    doomed_dir_patterns = ("gap_mask", "stats")

    # Walk the tree once and collect everything up front, so nothing is
    # deleted while os.walk is still iterating.
    data_files = []
    doomed_dirs = []
    for dirpath, dirnames, files in os.walk(indir):
        data_files.extend(os.path.join(dirpath, f)
                          for f in files
                          if not f.endswith(archive_suffixes))

        for pattern in doomed_dir_patterns:
            doomed_dirs.extend(os.path.join(dirpath, d)
                               for d in fnmatch.filter(dirnames, pattern))

    for path in data_files:
        try:
            os.remove(path)
        except OSError:
            # Best effort: skip files that cannot be removed. Only OSError
            # is caught, so KeyboardInterrupt and SystemExit still propagate.
            continue

    logger.warning("Cleaned up all data files.")

    # Clean up gap mask and stats directories
    for path in doomed_dirs:
        shutil.rmtree(path, ignore_errors=True)

    logger.warning("Removed all non-archive files.")

    return None
def plot_diff_image(test, mast, diff_raster, fn_out, fn_type, dir_out,
                    do_abs=False):
    """Render a difference array as a PNG image.

    Pixels whose difference is zero are masked out before plotting. The
    image is saved as <dir_out>/<fn_out>_<fn_type>.png.

    Args:
        test <str>: name of test file
        mast <str>: name of mast file
        diff_raster <numpy.ndarray>: numpy array of values
        fn_out <str>: basename for file, also used as the plot title
        fn_type <str>: suffix of the output file name, e.g. "diff" or
            "pct_diff"
        dir_out <str>: directory where output data are being stored
        do_abs <bool>: plot absolute differences in grayscale (default=False)
    """
    import matplotlib.pyplot as plt
    import numpy as np

    # mask pixels that did not differ
    changed = np.ma.masked_where(diff_raster == 0, diff_raster)

    # make output file
    im_out = os.sep.join([dir_out, fn_out + "_" + fn_type + ".png"])

    # choose what to draw and how to label it
    if do_abs:
        pixels, colormap, bar_label = np.abs(changed), 'gist_gray', "Abs. Difference"
    else:
        pixels, colormap, bar_label = changed, 'PuOr', "Difference"

    # plot diff figure
    plt.imshow(pixels, cmap=colormap)
    plt.colorbar(label=bar_label)

    # annotate plot with file names
    caption = "\n".join((str(mast), str(test), ""))
    plt.annotate(caption,
                 fontsize=5,
                 xy=(0.01, 0.94),
                 xycoords='axes fraction')

    plt.title(fn_out, y=1.05)

    plt.savefig(im_out, dpi=250)
    plt.close()

    logger.warning("{0} raster written to {1}.".format(fn_type, im_out))
def check_xml_schema(test, schema):
    """Validate an XML file against an XML schema and log the outcome.

    A valid file is reported at warning level, an invalid one at critical
    level.

    :param test: <str> XML metadata file to compare with schema.
    :param schema: <str> Path to XML schema file.
    :return: None
    """
    # read schema, then the XML document
    validator = etree.XMLSchema(etree.parse(schema))
    document = etree.parse(test)

    # do validation and pick the matching log call and message
    if validator.validate(document):
        report = logger.warning
        verdict = 'XML file {0} is valid with XML schema {1}.'
    else:
        report = logger.critical
        verdict = 'XML file {0} is NOT valid with XML schema {1}.'

    report(verdict.format(test, schema))
def do_diff(test, mast, nodata=False):
    """Do image diff, break if the grids are not the same size.

    Args:
        test <numpy.ndarray>: array of test raster
        mast <numpy.ndarray>: array of master raster
        nodata: value masked out of both arrays before differencing;
            False (default) disables masking

    Returns:
        Float array holding test - mast, or False when the subtraction
        fails (for example because the array shapes differ).
    """
    # If a NoData value is present, or the "--include-nodata" flag was used:
    if nodata is not False:
        test = np.ma.masked_where(test == nodata, test)
        mast = np.ma.masked_where(mast == nodata, mast)

        logger.info("Making nodata value {0} from diff calc.".format(nodata))

    try:
        # TODO: Figure out why some bands cannot be compared correctly.
        # np.float64 replaces the np.float alias, which current NumPy no
        # longer provides. With the alias missing, the AttributeError was
        # swallowed by the handler below and every call returned False.
        diff = test.astype(np.float64) - mast.astype(np.float64)

        return diff

    except (ValueError, AttributeError, TypeError) as e:
        logger.warning("Error: {0}".format(e))

        return False
def call_stats(test, mast, rast_arr, fn_out, dir_out, rast_num=0):
    """Run statistics and plotting for a difference raster, if it is valid

    Nothing is produced when rast_arr is not a numpy (masked) array or when
    it holds no non-zero value.

    Args:
        test <str>: name of test file
        mast <str>: name of master file
        rast_arr <numpy.ndarray>: array of target raster
        fn_out <str>: file path of image
        dir_out <str>: path to output directory
        rast_num <int>: individual number of image (default=0)
    """
    # guard: only numpy arrays / masked arrays can be analysed
    if not isinstance(rast_arr, (np.ndarray, np.ma.core.MaskedArray)):
        logger.warning("Target raster is not a valid numpy array or numpy "
                       "masked array. Cannot run statistics!")
        return

    # guard: no non-zero difference means there is nothing to report
    if not np.any(rast_arr != 0):
        logger.info("Binary data match.")
        return

    logger.warning("Image difference found!")
    logger.warning("Test: {0} | Master: {1}".format(test, mast))

    # find file name (for saving plot)
    fout = fn_out.split(os.sep)[-1]
    tag = "diff_" + str(rast_num)

    # do stats of difference
    stats.img_stats(test, mast, rast_arr, os.path.dirname(fn_out), fout,
                    dir_out, rast_num)

    # plot diff image
    ImWrite.plot_diff_image(test, mast, rast_arr, fout, tag, dir_out)

    # plot abs diff image
    ImWrite.plot_diff_image(test, mast, rast_arr, fout, "abs_" + tag,
                            dir_out, do_abs=True)

    # plot diff histograms
    ImWrite.plot_hist(test, mast, rast_arr, fout, tag, dir_out)
def check_jpeg_files(test: list, mast: list, dir_out: str) -> None:
    """
    Check JPEG files (i.e., Gverify or preview images) for diffs in file size
    or file contents.  Plot difference image if applicable

    :param test: List of paths to test jpg files
    :param mast: List of paths to master jpg files
    :param dir_out: Full path to output directory
    :return:
    """
    test, mast = Cleanup.remove_nonmatching_files(test, mast)
    logger.info("Checking JPEG preview/gverify files...")

    if mast is None or test is None:
        logger.error("No JPEG files to check in test and/or mast "
                     "directories.")
        return

    for i, j in zip(test, mast):
        test_size = os.path.getsize(i)
        mast_size = os.path.getsize(j)

        # Compare file sizes
        if test_size != mast_size:
            logger.warning("JPEG file sizes do not match for "
                           "Master {0} and Test {1}...\n".format(j, i))
            logger.warning("{0} size: {1}".format(i, test_size))
            logger.warning("{0} size: {1}".format(j, mast_size))
        else:
            logger.info("JPEG files {0} and {1} are the same "
                        "size".format(j, i))

        # diff images
        result = ArrayImage.check_images(i, j)

        # Compare against None explicitly: taking the truth value of a
        # multi-element numpy array raises ValueError.
        # NOTE(review): assumes check_images returns None when there is
        # nothing to plot -- confirm against ArrayImage.check_images.
        if result is not None:
            ImWrite.plot_diff_image(test=i, mast=j, diff_raster=result,
                                    fn_out=i.split(os.sep)[-1],
                                    fn_type="diff_",
                                    dir_out=dir_out)
def check_images(test, mast, dir_out, ext, include_nd=False):
    """Compare the test and master images, both for their raw contents and
    geographic parameters. If differences exist, produce diff plot + CSV
    stats file.

    Args:
        test <list>: paths to test images
        mast <list>: paths to master images
        dir_out <str>: path to output directory
        ext <str>: file extension
        include_nd <bool>: incl. nodata values in file cmp (default=False)

    Returns:
        False when there are no files to check, otherwise None.
    """
    def diff_bands(t_arr, m_arr, t_nd):
        """Difference two bands, masking NoData unless it must be kept."""
        # No usable NoData value, or the caller wants NoData compared too.
        # False is handled like None because int(False) is 0, which would
        # otherwise mask every zero-valued pixel.
        if t_nd is None or t_nd is False or include_nd:
            return do_diff(t_arr, m_arr)

        return do_diff(t_arr, m_arr, nodata=int(t_nd))

    logger.warning("Checking {0} files...".format(ext))

    # clean up non-matching files
    test, mast = Cleanup.remove_nonmatching_files(test, mast)

    # make sure there are actually files to check
    if mast is None or test is None:
        logger.error("No {0} files to check in test and/or mast directories.".format(ext))
        return False

    # do other comparison checks, return stats + plots if diffs exist
    for i, j in zip(test, mast):
        logger.info("Checking Test {0} against Master {1}".format(i, j))

        # Open each raster
        ds_test = RasterIO.open_raster(i)
        ds_mast = RasterIO.open_raster(j)

        # Compare various raster parameters
        status = [
            RasterCmp.compare_proj_ref(ds_test, ds_mast),
            RasterCmp.compare_geo_trans(ds_test, ds_mast),
            RasterCmp.extent_diff_cols(ds_test, ds_mast),
            RasterCmp.extent_diff_rows(ds_test, ds_mast),
        ]

        # If any above tests fail, go to next iteration
        if any(stat is False for stat in status):
            continue

        # Count number of sub-bands in the files
        d_range = Find.count(i, ds_test, j, ds_mast, ext)

        if d_range is None:
            logger.critical("Number of files different; data cannot be tested successfully.")
            continue

        # if sub-bands exist, read them one-by-one and do diffs + stats
        if d_range > 1:
            for ii in range(0, d_range):
                if ext == ".img":
                    logger.info("Reading sub-band {0} from .img {1}...".format(ii, i))

                    # The reader returns (array, nodata); unpack both so
                    # t_nd is defined for the diff below.
                    # NOTE(review): ii + 1 assumes the index is forwarded to
                    # GDAL's GetRasterBand, whose band numbers start at 1.
                    ds_tband, t_nd = RasterIO.read_band_as_array(ds_test, ii + 1)
                    ds_mband, _ = RasterIO.read_band_as_array(ds_mast, ii + 1)

                else:
                    logger.info("Reading .hdf/.nc SDS {0} from file {1}...".format(ii, i))

                    sds_tband = RasterIO.open_raster(RasterIO.get_sds(ds_test)[ii][0])
                    sds_mband = RasterIO.open_raster(RasterIO.get_sds(ds_mast)[ii][0])

                    ds_tband, t_nd = RasterIO.read_band_as_array(sds_tband)
                    ds_mband, _ = RasterIO.read_band_as_array(sds_mband)

                # do image differencing, with or without NoData masked
                diff = diff_bands(ds_tband, ds_mband, t_nd)

                # call stats functions to write out results/plots/etc.
                call_stats(i, j, diff, i, dir_out, rast_num=ii)

        else:  # else it's a singleband raster
            logger.info("Reading {0}...".format(i))

            # read in bands as array
            ds_tband, t_nd = RasterIO.read_band_as_array(ds_test)
            ds_mband, _ = RasterIO.read_band_as_array(ds_mast)

            # do diff
            diff = diff_bands(ds_tband, ds_mband, t_nd)

            # call stats functions to write out results/plots/etc.
            call_stats(i, j, diff, i, dir_out)
def qa_data(dir_mast: str, dir_test: str, dir_out: str, archive: bool = True,
            xml_schema: str = None, incl_nd: bool = False) -> None:
    """
    Function to check files and call appropriate QA module(s)

    The deepest directories below dir_test and dir_mast are paired in sorted
    order; for each pair the files are grouped by extension and handed to
    the text, JPEG or geo-image checks. The process exits with status 1 when
    the two trees do not hold the same number of deepest directories.

    :param dir_mast: Full path to the master directory
    :param dir_test: Full path to the test directory
    :param dir_out: Full path to the QA output directory
    :param archive: If True, will clean up existing files and extract from archives
    :param xml_schema: Full path to the XML schema used to validate .xml
        files, default is None (no schema validation)
    :param incl_nd: If True, include NoData in comparisons
    :return:
    """
    text_exts = (".txt", ".xml", ".gtf", ".hdr", ".stats")

    def as_paths(found):
        """Normalise a file-search result to a list of paths."""
        if not found:
            return []
        if isinstance(found, str):
            return [found]
        return list(found)

    # start timing code
    t0 = time.time()

    # create output dir if it doesn't exist (exist_ok avoids the
    # check-then-create race)
    os.makedirs(dir_out, exist_ok=True)

    if archive:
        # do initial cleanup of input directories
        Cleanup.cleanup_files(dir_mast)
        Cleanup.cleanup_files(dir_test)

        # create output directory again in case the cleanup removed it
        os.makedirs(dir_out, exist_ok=True)

        # read in .tar.gz files
        test_files = Find.find_files(dir_test, ".tar*")
        mast_files = Find.find_files(dir_mast, ".tar*")

        # Extract files from archive
        Extract.unzip_files(test_files, mast_files)

    # find only the deepest dirs
    test_dirs = sorted([r for r, d, f in os.walk(dir_test) if not d])
    mast_dirs = sorted([r for r, d, f in os.walk(dir_mast) if not d])

    if len(test_dirs) != len(mast_dirs):
        logger.critical(
            "Directory structure of Master differs from Test., MASTER: %s, TEST: %s",
            mast_dirs, test_dirs)
        sys.exit(1)

    for test_dir, mast_dir in zip(test_dirs, mast_dirs):
        # Find extracted files
        all_test = sorted(Find.find_files(test_dir, ".*"))
        all_mast = sorted(Find.find_files(mast_dir, ".*"))

        # Find unique file extensions
        exts = Find.get_ext(all_test, all_mast)

        for ext in exts:
            logger.info("Finding {0} files...".format(ext))
            test_f = Find.find_files(test_dir, ext)
            mast_f = Find.find_files(mast_dir, ext)

            logger.info("Performing QA on {0} files located in {1}".format(
                ext, dir_test))
            logger.info("Test files: {0}".format(test_f))
            logger.info("Mast files: {0}".format(mast_f))

            # remove any _hdf.img files found with .img files
            if ext == ".img":
                test_f = Cleanup.rm_files(test_f, "_hdf.img")
                mast_f = Cleanup.rm_files(mast_f, "_hdf.img")

            ext_lower = ext.lower()

            # if a text-based file
            if ext_lower in text_exts:
                MetadataQA.check_text_files(test_f, mast_f, ext)

                # if text-based file is xml
                if ext_lower == ".xml" and xml_schema:
                    # Validate each file on its own: check_xml_schema
                    # parses a single XML path, not a list of them.
                    for xml_file in as_paths(test_f) + as_paths(mast_f):
                        MetadataQA.check_xml_schema(xml_file, xml_schema)

            # if non-geo image
            elif ext_lower == ".jpg":
                MetadataQA.check_jpeg_files(test_f, mast_f, dir_out)

            # if no extension
            elif len(ext) == 0:
                continue

            # else, it's probably a geo-based image
            else:
                GeoImage.check_images(test_f, mast_f, dir_out, ext,
                                      include_nd=incl_nd)

    if archive:
        # Clean up files
        Cleanup.cleanup_files(dir_mast)
        Cleanup.cleanup_files(dir_test)

    # end timing
    t1 = time.time()
    m, s = divmod(t1 - t0, 60)
    h, m = divmod(m, 60)
    logger.warning("Total runtime: {0}h, {1}m, {2}s.".format(
        h, round(m, 3), round(s, 3)))

    logger.warning("Done.")

    return None
def plot_hist(test, mast, diff_raster, fn_out, fn_type, dir_out, bins=False):
    """Take difference array and plot as histogram.

    Only pixels with a non-zero difference enter the histogram. The figure
    is saved as <dir_out>/<fn_out>_<fn_type>_hist.png.

    Args:
        test <str>: name of test file
        mast <str>: name of master file
        diff_raster <numpy.ndarray>: numpy array of values
        fn_out <str>: basename for file
        fn_type <str>: suffix of the output file name, e.g. "diff" or
            "pct_diff"
        dir_out <str>: directory where output data are being stored
        bins <int>: number of bins for histogram (default=False, which
            derives the number from the dtype of diff_raster)
    """
    import matplotlib.pyplot as plt
    import numpy as np

    def bin_size(rast):
        """Determine bin size based upon data type.

        Args:
            rast <numpy.ndarray>: numpy array of values
        """
        name = rast.dtype.name

        # Each width is tested on its own. The former
        # "'64' or '32' in dt.name" was always true, because the non-empty
        # string '64' is truthy by itself.
        if '64' in name or '32' in name:
            return 2000
        elif '16' in name:
            return 1000
        elif '8' in name:
            return 256
        else:
            return 50

    # mask pixels that did not differ
    diff_raster = np.ma.masked_where(diff_raster == 0, diff_raster)

    # make output file
    im_out = dir_out + os.sep + fn_out + "_" + fn_type + "_hist.png"

    # get array of values that are actually different
    diff_valid = diff_raster.compressed()

    # determine bin size
    if not bins:
        bins = bin_size(diff_raster)

    # do histogram
    try:
        plt.hist(diff_valid, bins)
    except AttributeError:
        logger.warning("Difference values from diff_valid variable could"
                       " not be plotted.")
        return

    # do basic stats
    diff_mean = np.mean(diff_raster)
    diff_sd = np.std(diff_raster)
    diff_abs_mean = np.mean(np.abs(diff_raster))
    diff_pix = len(diff_valid)

    # Built-in float and ndarray.size stand in for np.float and np.product,
    # which current NumPy no longer provides. An empty raster yields 0 %
    # rather than a division by zero.
    total_pix = diff_raster.size
    diff_pct = (float(diff_pix) / total_pix) * 100.0 if total_pix else 0.0

    # annotate plot with file names
    plt.annotate(str(mast) + "\n" +
                 str(test) + "\n",
                 fontsize=5,
                 xy=(0.01, 0.94),
                 xycoords='axes fraction')

    # annotate plot with basic stats
    plt.annotate("mean diff: " + str(round(diff_mean, 3)) + "\n" +
                 "std. dev.: " + str(round(diff_sd, 3)) + "\n" +
                 "abs. mean diff: " + str(round(diff_abs_mean, 3)) + "\n" +
                 "# diff pixels: " + str(diff_pix) + "\n" +
                 "% diff: " + str(round(diff_pct, 3)) + "\n" +
                 "# bins: " + str(bins) + "\n",
                 xy=(0.68, 0.72),
                 xycoords='axes fraction')

    # write figure out to PNG
    plt.savefig(im_out, bbox_inches="tight", dpi=350)
    plt.close()

    logger.warning("Difference histogram written to {0}.".format(im_out))