def classify(ctx, config, algo, job_number, total_jobs, resume):
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    nrow = get_image_attribute(df['filename'][0])[0]

    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            # Nothing to classify for this line -- skip it
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            continue

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2)))
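# A hedged sketch of what ``classify_line`` might do, for context only: it
# assumes the saved result at ``filename`` is an ``.npz`` with a structured
# ``record`` array exposing ``coef`` and ``rmse`` fields that feed a
# scikit-learn style classifier loaded via ``joblib``. The real YATSM helper
# may use a different file layout and feature construction.
import numpy as np


def classify_line_sketch(filename, classifier):
    """Illustrative only: append class predictions to a saved line result."""
    # allow_pickle in case the file carries pickled metadata alongside arrays
    with np.load(filename, allow_pickle=True) as z:
        arrays = {k: z[k] for k in z.files}

    rec = arrays['record']
    if rec.shape[0] == 0:
        return

    # One feature vector per time series segment: flattened coefficients
    # followed by per-band RMSE (an assumed feature layout)
    X = np.hstack((rec['coef'].reshape(rec.shape[0], -1), rec['rmse']))
    arrays['classes'] = classifier.predict(X)

    # Re-save the original arrays alongside the new predictions
    np.savez(filename, **arrays)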
def test_distribute_jobs_interlaced(nrow, njob):
    # Every row should be assigned to exactly one job across all jobs
    assigned = []
    for i in range(njob):
        assigned.extend(utils.distribute_jobs(i, njob, nrow, interlaced=True))

    assigned = np.sort(np.asarray(assigned))
    all_rows = np.arange(0, nrow)

    np.testing.assert_equal(assigned, all_rows)
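# The interlaced behavior exercised above can be pictured with a small,
# self-contained sketch; ``distribute_jobs_sketch`` is hypothetical and only
# mirrors the contract the tests rely on (0-based job numbers, every row
# assigned exactly once), not the actual ``yatsm.utils.distribute_jobs``.
import numpy as np


def distribute_jobs_sketch(job_number, total_jobs, nrow, interlaced=True):
    """Illustrative only: assign rows ``0..nrow-1`` to ``total_jobs`` jobs."""
    if interlaced:
        # Job i takes rows i, i + total_jobs, i + 2 * total_jobs, ...
        rows = np.arange(job_number, nrow, total_jobs)
    else:
        # Contiguous chunks of roughly equal size
        chunk = int(np.ceil(nrow / float(total_jobs)))
        rows = np.arange(chunk * job_number,
                         min(chunk * (job_number + 1), nrow))
    if rows.size == 0:
        raise ValueError('Job {0} was assigned no rows'.format(job_number))
    return rows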
def batch(ctx, job_number, total_jobs, executor, block_size, force_overwrite):
    """ Run a YATSM pipeline on a dataset in batch mode

    The dataset is split into a number of subsets based on the structure of
    the files in the dataset. The internal structure is determined by the
    block sizes, or internal tile sizes, retrieved by GDAL. If the dataset is
    not tiled, GDAL defaults to 256 pixels in the X dimension and a Y
    dimension sized so that each block fits in 8KB or less.

    TODO: Users may override the size of the subsets using command line
          options.
    """
    config = options.fetch_config(ctx)

    # Imports inside CLI for speed
    from yatsm.io.utils import block_windows
    from yatsm.utils import distribute_jobs

    # TODO: remove when not debugging
    import dask
    from dask import local
    # ``dask.async`` was renamed ``dask.local``; force the single-threaded
    # synchronous scheduler while debugging (newer dask releases use
    # ``dask.config.set(scheduler='synchronous')`` instead)
    dask.set_options(get=local.get_sync)

    # TODO: Better define which reader is authoritative when using multiple
    #       datasets and how the block shape is chosen (in config?)
    # TODO: Allow user to specify block shape in config (?)
    if block_size[0] and block_size[1]:
        windows = list(block_windows(block_size, config.primary_reader.shape))
    else:
        windows = config.primary_reader.block_windows

    job_idx = distribute_jobs(job_number, total_jobs, len(windows))
    job_windows = [windows[i] for i in job_idx]

    logger.debug('Working on {0} of {1} block windows'.format(
        len(job_idx), len(windows)))

    force_overwrite = (force_overwrite or
                       config['pipeline'].get('overwrite', False))

    # Submit a ``batch_block`` task for each window assigned to this job
    futures = {}
    for idx, window in job_windows:
        future = executor.submit(batch_block,
                                 config=config,
                                 readers=config.readers,
                                 window=window,
                                 overwrite=force_overwrite)
        futures[future] = window

    n_good, n_skip, n_fail = 0, 0, 0
    for future in executor.as_completed(futures):
        window = futures[future]
        try:
            result = future.result()
            if isinstance(result, str):
                logger.info("Wrote to: %s" % result)
                n_good += 1
            else:
                n_skip += 1
            time.sleep(1)
        except KeyboardInterrupt:
            logger.critical('Interrupting and shutting down')
            executor.shutdown()
            raise click.Abort()
        except Exception:
            logger.exception("Exception for window: {}".format(window))
            n_fail += 1
            raise  # TODO: remove and log?

    logger.info('Complete: %s' % n_good)
    logger.info('Skipped: %s' % n_skip)
    logger.info('Failed: %s' % n_fail)
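# For reference, a minimal sketch of the window generation the batch loop
# consumes; the real ``yatsm.io.utils.block_windows`` may differ. It assumes
# ``block_size`` is (ysize, xsize), ``shape`` is (nrow, ncol), and a window is
# a rasterio-style ((row_start, row_stop), (col_start, col_stop)) tuple paired
# with a block index, which matches the ``for idx, window in job_windows``
# unpacking above.
def block_windows_sketch(block_size, shape):
    """Illustrative only: yield ``(index, window)`` tuples covering ``shape``."""
    ysize, xsize = block_size
    nrow, ncol = shape
    for i, row in enumerate(range(0, nrow, ysize)):
        for j, col in enumerate(range(0, ncol, xsize)):
            # Clip the last block in each dimension at the raster edge
            window = ((row, min(row + ysize, nrow)),
                      (col, min(col + xsize, ncol)))
            yield (i, j), window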
def line(ctx, config, job_number, total_jobs, resume, check_cache,
         do_not_run, verbose_yatsm):
    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    if ('phenology' in cfg and cfg['phenology'].get('enable')) and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno == 17:
            # Directory already exists
            pass
        elif e.errno == 13:
            # Permission denied
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()

    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # Form YATSM class arguments
    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if cfg['phenology']['enable']:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except (IOError, OSError, ValueError):
                # No usable result yet -- (re)process this line
                pass
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)
            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(bool)

            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if cfg['phenology']['enable']:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug(' Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time_all) / 60.0, 2)))
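# To make the design-matrix step in ``line`` concrete, a small standalone
# example of how ``patsy.dmatrix`` turns a design string and a dict of
# predictors into ``X`` and its ``design_info``; the '1 + x' design here is a
# placeholder, not YATSM's configured design matrix.
import numpy as np
import patsy

# Ordinal dates standing in for ``df['date']``
dates = np.array([730000, 730016, 730032, 730048], dtype=float)
kws = {'x': dates}

X = patsy.dmatrix('1 + x', kws)
print(X.design_info.column_names)         # ['Intercept', 'x']
print(X.design_info.column_name_indexes)  # OrderedDict of name -> column index
print(np.asarray(X).shape)                # (4, 2)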
def test_distribute_jobs_sequential_onejob(nrow, njob):
    # Asking for more jobs than there are rows leaves this job with nothing
    # to do, which should raise
    with pytest.raises(ValueError):
        utils.distribute_jobs(nrow, nrow, njob, interlaced=False)
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern %s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            if update:
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
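# A hedged sketch of the cache read/write helpers the loop above relies on,
# assuming each line cache is an ``.npz`` holding the line's data array and
# the image IDs it was built from; the real ``write_cache_file`` and update
# logic in YATSM may store additional metadata or use different keys.
import numpy as np


def write_cache_file_sketch(cache_filename, Y, image_IDs):
    """Illustrative only: store a line of data plus the IDs it came from."""
    np.savez_compressed(cache_filename, Y=Y, image_IDs=np.asarray(image_IDs))


def cache_is_current_sketch(cache_filename, image_IDs):
    """Illustrative only: a cache is reusable if its image IDs still match."""
    try:
        with np.load(cache_filename) as z:
            cached_ids = z['image_IDs']
    except (IOError, OSError, KeyError):
        return False
    return np.array_equal(cached_ids, np.asarray(image_IDs))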