def test_get_line_cache_pattern_glob(cachedir, cachefile, n_images, n_row,
                                     n_bands):
    """The glob-style line cache pattern should locate the cache file.

    Asks ``cache.get_line_cache_pattern`` for the non-regex pattern for
    ``n_row`` / ``n_bands``, globs for it inside ``cachedir`` and expects
    the first hit to be ``cachefile``.

    ``n_images`` is accepted but not used in the body (presumably requested
    as a test fixture -- confirm against the fixture definitions).
    """
    import glob

    glob_pattern = cache.get_line_cache_pattern(n_row, n_bands, regex=False)
    matches = glob.glob('%s/%s' % (cachedir, glob_pattern))

    # An IndexError here means the pattern matched nothing at all
    first_match = matches[0]

    assert first_match == cachefile
def test_get_line_cache_pattern_regex(cachedir, cachefile, n_images, n_row,
                                      n_bands):
    """The regex line cache pattern should locate the cache file.

    Asks ``cache.get_line_cache_pattern`` for the regex pattern for
    ``n_row`` / ``n_bands``, matches it against every entry name in
    ``cachedir`` and expects the first matching entry, joined onto
    ``cachedir``, to be ``cachefile``.

    ``n_images`` is accepted but not used in the body (presumably requested
    as a test fixture -- confirm against the fixture definitions).
    """
    import re

    regex_pattern = cache.get_line_cache_pattern(n_row, n_bands, regex=True)

    matching_names = []
    for name in os.listdir(cachedir):
        if re.match(regex_pattern, name):
            matching_names.append(name)

    # An IndexError here means the pattern matched nothing at all
    first_match = os.path.join(cachedir, matching_names[0])

    assert first_match == cachefile
def test_get_line_cache_pattern_glob(self):
    """The glob-style line cache pattern should locate ``self.test_file``.

    Builds the non-regex pattern for ``self.n_row`` / ``self.n_bands``,
    globs for it inside ``self.config['cache_line_dir']`` and checks that
    the first hit equals ``self.test_file``.
    """
    import glob

    glob_pattern = cache.get_line_cache_pattern(self.n_row, self.n_bands,
                                                regex=False)
    cache_dir = self.config['cache_line_dir']
    matches = glob.glob('{d}/{p}'.format(d=cache_dir, p=glob_pattern))

    # An IndexError here means the pattern matched nothing at all
    first_match = matches[0]

    self.assertEqual(first_match, self.test_file)
def test_get_line_cache_pattern_glob(self):
    """Check that globbing with the line cache pattern finds the test file.

    The pattern comes from ``cache.get_line_cache_pattern`` with
    ``regex=False``; it is searched for under
    ``self.config['cache_line_dir']`` and the first result is compared
    against ``self.test_file``.
    """
    import glob

    glob_pattern = cache.get_line_cache_pattern(
        self.n_row, self.n_bands, regex=False)
    search_path = '{d}/{p}'.format(d=self.config['cache_line_dir'],
                                   p=glob_pattern)

    # Indexing raises IndexError when the glob comes back empty
    self.assertEqual(glob.glob(search_path)[0], self.test_file)
def test_get_line_cache_pattern_regex(self):
    """The regex line cache pattern should locate ``self.test_file``.

    Builds the regex pattern for ``self.n_row`` / ``self.n_bands``, matches
    it against every entry name in ``self.config['cache_line_dir']`` and
    checks that the first match, joined onto that directory, equals
    ``self.test_file``.
    """
    import re

    regex_pattern = cache.get_line_cache_pattern(self.n_row, self.n_bands,
                                                 regex=True)
    cache_dir = self.config['cache_line_dir']

    matching_names = []
    for name in os.listdir(cache_dir):
        if re.match(regex_pattern, name):
            matching_names.append(name)

    # An IndexError here means the pattern matched nothing at all
    first_match = os.path.join(cache_dir, matching_names[0])

    self.assertEqual(first_match, self.test_file)
def test_get_line_cache_pattern_regex(self):
    """Check that the regex line cache pattern matches the test file.

    The pattern comes from ``cache.get_line_cache_pattern`` with
    ``regex=True``. Entry names in ``self.config['cache_line_dir']`` are
    tested with ``re.match`` semantics (anchored at the start of the name)
    and the first matching entry is compared, as a full path, against
    ``self.test_file``.
    """
    import re

    matcher = re.compile(
        cache.get_line_cache_pattern(self.n_row, self.n_bands, regex=True))
    cache_dir = self.config['cache_line_dir']

    matching_names = [
        name for name in os.listdir(cache_dir) if matcher.match(name)
    ]

    # Indexing raises IndexError when no entry matched
    self.assertEqual(os.path.join(cache_dir, matching_names[0]),
                     self.test_file)
def main(args):
    """Write (or update) line cache files for the lines assigned to a job.

    Args:
        args: mapping of parsed options. The keys read here are
            ``config_file``, ``job_number``, ``total_jobs``, ``interlace``
            and ``update_pattern``.

    For every line returned by ``utils.calculate_lines`` the function
    either updates a previously written cache file (when
    ``update_pattern`` is given and a matching file exists for that line)
    or reads the line from the images and writes a fresh cache file.

    Raises:
        OSError: if the cache directory does not exist and cannot be
            created.
    """
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])
    cache_dir = dataset_config['cache_line_dir']

    if not os.path.isdir(cache_dir):
        # Several jobs may start at once; another job can create the
        # directory between the check above and the call below.
        try:
            os.makedirs(cache_dir)
        except OSError:
            if not os.path.isdir(cache_dir):
                raise

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(os.listdir(cache_dir),
                                        args['update_pattern'])

        if not previous_cache:
            # Trailing space keeps the pattern from being glued to the text
            logger.warning('Could not find cache files to update with '
                           'pattern {p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)
            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cache_dir, potential[0])
                # Only announce an update when a file was actually selected
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line, image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
def main(args):
    """Write (or update) line cache files for the lines assigned to a job.

    Args:
        args: mapping of parsed options. The keys read here are
            ``config_file``, ``job_number``, ``total_jobs``, ``interlace``
            and ``update_pattern``.

    For every line returned by ``utils.calculate_lines`` the function
    either updates a previously written cache file (when
    ``update_pattern`` is given and a matching file exists for that line)
    or reads the line from the images and writes a fresh cache file.

    Raises:
        OSError: if the cache directory does not exist and cannot be
            created.
    """
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])
    cache_dir = dataset_config['cache_line_dir']

    if not os.path.isdir(cache_dir):
        # Several jobs may start at once; another job can create the
        # directory between the check above and the call below.
        try:
            os.makedirs(cache_dir)
        except OSError:
            if not os.path.isdir(cache_dir):
                raise

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format'])

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(os.listdir(cache_dir),
                                        args['update_pattern'])

        if not previous_cache:
            # Trailing space keeps the pattern from being glued to the text
            logger.warning('Could not find cache files to update with '
                           'pattern {p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(dataset_config,
                                                   len(images), job_line,
                                                   nband)
        logger.debug('Caching line {l} to {f}'.format(l=job_line,
                                                      f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(job_line, nband,
                                                   regex=False)
            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cache_dir, potential[0])
                # Only announce an update when a file was actually selected
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(images, image_IDs, update, cache_filename,
                                    job_line, image_reader,
                                    image_reader_kwargs)
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    """Write (or update) line cache files for the lines assigned to a job.

    Args:
        ctx: not used in the body; kept for the caller's calling convention.
        config: passed to ``parse_config_file`` to obtain the configuration.
        job_number: forwarded to ``distribute_jobs`` to select this job's
            lines.
        total_jobs: forwarded to ``distribute_jobs``.
        update_pattern: if truthy, an ``fnmatch`` pattern used to find
            previously written cache files in the cache directory that
            should be updated rather than rebuilt.
        interlace: forwarded to ``distribute_jobs`` as ``interlaced``.

    For every line returned by ``distribute_jobs`` the function either
    updates a matching previous cache file or reads the line from the
    images and writes a fresh cache file.

    Raises:
        OSError: if the cache directory does not exist and cannot be
            created.
    """
    cfg = parse_config_file(config)
    cache_dir = cfg['dataset']['cache_line_dir']

    if not os.path.isdir(cache_dir):
        # Several jobs may start at once; another job can create the
        # directory between the check above and the call below.
        try:
            os.makedirs(cache_dir)
        except OSError:
            if not os.path.isdir(cache_dir):
                raise

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(os.listdir(cache_dir), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '%s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)
            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cache_dir, potential[0])
                # Only announce an update when a file was actually selected
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))