def train(ctx, config, classifier_config, model, n_fold, seed,
          plot, diagnostics, overwrite):
    """ Train a classifier from `scikit-learn` on YATSM output and save result
    to file <model>. Dataset configuration is specified by <yatsm_config> and
    classifier and classifier parameters are specified by <classifier_config>.
    """
    if not model.endswith('.pkl'):
        model += '.pkl'
    if os.path.isfile(model) and not overwrite:
        logger.error('<model> exists and --overwrite was not specified')
        raise click.Abort()

    if seed:
        np.random.seed(seed)

    # Parse YATSM config
    dataset_config, yatsm_config = parse_config_file(config)

    if not dataset_config['training_image'] or \
            not os.path.isfile(dataset_config['training_image']):
        logger.error('Training data image {f} does not exist'.format(
            f=dataset_config['training_image']))
        raise click.Abort()

    # Parse classifier config
    algorithm_helper = classifiers.ini_to_algorthm(classifier_config)

    main(dataset_config, yatsm_config, algorithm_helper, model,
         diagnostics, n_fold, plot)
def classify(ctx, config, algo, job_number, total_jobs, resume):
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    nrow = get_image_attribute(df['filename'][0])[0]

    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            # Skip lines that have no model result to classify
            continue

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
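# A note on job partitioning: `distribute_jobs` (used above) decides which
# image rows this job number is responsible for. The sketch below is only an
# illustration of the idea (contiguous chunks, or every `total_jobs`-th row
# when interlaced), assumes a 0-based `job_number`, and is not the actual
# yatsm implementation.
import numpy as np

def distribute_jobs_sketch(job_number, total_jobs, nrow, interlaced=False):
    """ Hypothetical example: split `nrow` rows among `total_jobs` workers """
    if interlaced:
        # Job 0 gets rows 0, total_jobs, 2 * total_jobs, ...
        return np.arange(job_number, nrow, total_jobs)
    # Otherwise hand out roughly equal contiguous blocks of rows
    return np.array_split(np.arange(nrow), total_jobs)[job_number]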
def main(args): """ Classify dataset """ # Parse config and file data dataset_config, yatsm_config = parse_config_file(args['config_file']) # Get some attributes about the dataset dates, sensors, images = csvfile_to_dataset( dataset_config['input_file'], date_format=dataset_config['date_format'] ) nrow, _, _, _ = get_image_attribute(images[0]) # Read in the saved classification result try: _ = open(args['algo']) except: logger.error('Could not open pickled classifier') sys.exit(1) classifier = joblib.load(args['algo']) # Split into lines and classify job_lines = calculate_lines(args['job_number'] - 1, args['total_jobs'], nrow) logger.debug('Responsible for lines: {l}'.format(l=job_lines)) start_time = time.time() logger.info('Starting to run lines') for job_line in job_lines: filename = get_output_name(dataset_config, job_line) if not os.path.exists(filename): logger.warning('No model result found for line {l} ' '(file {f})'.format(l=job_line, f=filename)) pass if args['resume'] and try_resume(filename): logger.debug('Already processed line {l}'.format(l=job_line)) continue logger.debug('Classifying line {l}'.format(l=job_line)) classify_line(filename, classifier) logger.debug('Completed {n} lines in {m} minutes'.format( n=len(job_lines), m=round((time.time() - start_time) / 60.0, 2)) )
def train(ctx, config, classifier_config, model, n_fold, seed,
          plot, diagnostics, overwrite):
    """ Train a classifier from `scikit-learn` on YATSM output and save result
    to file <model>. Dataset configuration is specified by <yatsm_config> and
    classifier and classifier parameters are specified by <classifier_config>.
    """
    # Setup
    if not model.endswith(".pkl"):
        model += ".pkl"
    if os.path.isfile(model) and not overwrite:
        logger.error("<model> exists and --overwrite was not specified")
        raise click.Abort()

    if seed:
        np.random.seed(seed)

    # Parse config & algorithm config
    cfg = parse_config_file(config)
    algo, algo_cfg = classifiers.cfg_to_algorithm(classifier_config)

    training_image = cfg["classification"]["training_image"]
    if not training_image or not os.path.isfile(training_image):
        logger.error("Training data image %s does not exist" % training_image)
        raise click.Abort()

    # Find information from results -- e.g., design info
    attrs = find_result_attributes(cfg)
    cfg["YATSM"].update(attrs)

    # Cache file for training data
    has_cache = False
    training_cache = cfg["classification"]["cache_training"]
    if training_cache:
        # If the cache file doesn't exist yet, it is created further below
        if not os.path.isfile(training_cache):
            logger.info("Could not retrieve cache file for Xy")
            logger.info("    file: %s" % training_cache)
        else:
            logger.info("Restoring X/y from cache file")
            has_cache = True

    training_image = cfg["classification"]["training_image"]
    # Check if we need to regenerate the cache file because training data is
    # newer than the cache
    regenerate_cache = is_cache_old(training_cache, training_image)
    if regenerate_cache:
        logger.warning("Existing cache file older than training data ROI")
        logger.warning("Regenerating cache file")

    if not has_cache or regenerate_cache:
        logger.debug("Reading in X/y")
        X, y, row, col, labels = get_training_inputs(cfg)
        logger.debug("Done reading in X/y")
    else:
        logger.debug("Reading in X/y from cache file %s" % training_cache)
        with np.load(training_cache) as f:
            X = f["X"]
            y = f["y"]
            row = f["row"]
            col = f["col"]
            labels = f["labels"]
        logger.debug("Read in X/y from cache file %s" % training_cache)

    # If cache didn't exist but is specified, create it for first time
    if not has_cache and training_cache:
        logger.info("Saving X/y to cache file %s" % training_cache)
        try:
            np.savez(training_cache,
                     X=X, y=y, row=row, col=col, labels=labels)
        except:
            logger.error("Could not save X/y to cache file")
            raise

    # Do modeling
    logger.info("Training classifier")
    algo.fit(X, y, **algo_cfg.get("fit", {}))

    # Serialize algorithm to file
    logger.info("Pickling classifier with sklearn.externals.joblib")
    joblib.dump(algo, model, compress=3)

    # Diagnostics
    if diagnostics:
        algo_diagnostics(cfg, X, y, row, col, algo, n_fold, plot)
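# The cache handling above hinges on `is_cache_old`, which decides whether the
# cached X/y needs regenerating. A minimal sketch under the assumption that it
# simply compares file modification times; the real yatsm helper may differ.
import os

def is_cache_old_sketch(cache_file, training_file):
    """ Hypothetical: True if the training ROI is newer than the cache file """
    if not cache_file or not os.path.isfile(cache_file):
        return False
    return os.stat(cache_file).st_mtime < os.stat(training_file).st_mtime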
def line(ctx, config, job_number, total_jobs, resume, check_cache,
         do_not_run, verbose_yatsm):
    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    if ('phenology' in cfg and cfg['phenology'].get('enable')) and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        # File exists
        if e.errno == 17:
            pass
        elif e.errno == 13:
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()
    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # Form YATSM class arguments
    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if cfg['phenology']['enable']:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except:
                pass
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)
            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(np.bool)

            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if cfg['phenology']['enable']:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug('    Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time_all) / 60.0, 2)))
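# The per-line results written above with np.savez (keys `record`, `version`,
# and `metadata`) can be reloaded with np.load. A small usage sketch; the
# filename is a placeholder, and `allow_pickle=True` is needed on newer NumPy
# releases because `metadata` is a pickled dict.
import numpy as np

with np.load('yatsm_r0.npz', allow_pickle=True) as f:  # hypothetical filename
    record = f['record']             # structured array of fitted segments
    metadata = f['metadata'].item()  # the `md` dict saved alongside the records
    print(f['version'], record.dtype.names)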
do_not_run = args['--do-not-run']

# Setup logger
if args['--verbose']:
    logger.setLevel(logging.DEBUG)

if args['--verbose-yatsm']:
    loglevel_YATSM = logging.DEBUG

if args['--quiet']:
    loglevel_YATSM = logging.WARNING
    logger.setLevel(logging.WARNING)

# Parse and validate configuration file
dataset_config, yatsm_config = parse_config_file(config_file)

# Import phenology stuff only if necessary since it relies on rpy2 / R
if yatsm_config['calc_pheno'] and not do_not_run:
    import yatsm.phenology as pheno

# Make output directory
try:
    os.makedirs(dataset_config['output'])
except OSError as e:
    # File exists
    if e.errno == 17:
        pass
    elif e.errno == 13:
        print('Error - cannot create output directory {d}'.format(
            d=dataset_config['output']))
def pixel(ctx, config, px, py, band, plot, ylim, style, cmap,
          embed, seed, algo_kw):
    # Set seed
    np.random.seed(seed)
    # Convert band to index
    band -= 1

    # Get colormap
    if hasattr(palettable.colorbrewer, cmap):
        mpl_cmap = getattr(palettable.colorbrewer, cmap).mpl_colormap
    elif hasattr(palettable.cubehelix, cmap):
        mpl_cmap = getattr(palettable.cubehelix, cmap).mpl_colormap
    elif hasattr(palettable.wesanderson, cmap):
        mpl_cmap = getattr(palettable.wesanderson, cmap).mpl_colormap
    else:
        # Echo the reason before aborting since click.Abort does not print
        # a message of its own
        click.secho('Cannot find specified colormap in `palettable`', fg='red')
        raise click.Abort()

    # Parse config
    cfg = parse_config_file(config)

    # Apply algorithm overrides
    revalidate = False
    for kw in algo_kw:
        for cfg_key in cfg:
            if kw in cfg[cfg_key]:
                # Parse as YAML for type conversions used in config parser
                value = yaml.load(algo_kw[kw])
                print('Overriding cfg[%s][%s]=%s with %s' %
                      (cfg_key, kw, cfg[cfg_key][kw], value))
                cfg[cfg_key][kw] = value
                revalidate = True

    if revalidate:
        cfg = convert_config(cfg)

    # Locate and fetch attributes from data
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              date_format=cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Setup X/Y
    kws = {'x': df['date']}
    kws.update(df.to_dict())

    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    design_info = X.design_info

    Y = read_pixel_timeseries(df['filename'], px, py)

    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    # Mask out of range data
    idx_mask = cfg['dataset']['mask_band'] - 1
    valid = cyprep.get_valid_mask(Y,
                                  cfg['dataset']['min_values'],
                                  cfg['dataset']['max_values']).astype(np.bool)
    valid *= np.in1d(Y[idx_mask, :], cfg['dataset']['mask_values'],
                     invert=True).astype(np.bool)

    # Apply mask
    Y = np.delete(Y, idx_mask, axis=0)[:, valid]
    X = X[valid, :]
    dates = np.array([dt.datetime.fromordinal(d) for d in df['date'][valid]])

    # Plot before fitting
    with plt.xkcd() if style == 'xkcd' else mpl.style.context(style):
        for _plot in plot:
            if _plot == 'TS':
                plot_TS(dates, Y[band, :])
            elif _plot == 'DOY':
                plot_DOY(dates, Y[band, :], mpl_cmap)
            elif _plot == 'VAL':
                plot_VAL(dates, Y[band, :], mpl_cmap)

            if ylim:
                plt.ylim(ylim)
            plt.title('Timeseries: px={px} py={py}'.format(px=px, py=py))
            plt.ylabel('Band {b}'.format(b=band + 1))

            if embed and has_embed:
                IPython_embed()

            plt.tight_layout()
            plt.show()

    # Eliminate config parameters that are not algorithm arguments, then fit
    yatsm = cfg['YATSM']['algorithm_cls'](lm=cfg['YATSM']['prediction_object'],
                                          **cfg[cfg['YATSM']['algorithm']])
    yatsm.px = px
    yatsm.py = py
    yatsm.fit(X, Y, np.asarray(df['date'][valid]))

    # Plot after predictions
    with plt.xkcd() if style == 'xkcd' else mpl.style.context(style):
        for _plot in plot:
            if _plot == 'TS':
                plot_TS(dates, Y[band, :])
            elif _plot == 'DOY':
                plot_DOY(dates, Y[band, :], mpl_cmap)
            elif _plot == 'VAL':
                plot_VAL(dates, Y[band, :], mpl_cmap)

            if ylim:
                plt.ylim(ylim)
            plt.title('Timeseries: px={px} py={py}'.format(px=px, py=py))
            plt.ylabel('Band {b}'.format(b=band + 1))

            plot_results(band, cfg['YATSM'], yatsm, plot_type=_plot)

            if embed and has_embed:
                IPython_embed()

            plt.tight_layout()
            plt.show()
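# The design matrix used above is built by evaluating the `design_matrix`
# string from the config with patsy against the acquisition dates. A minimal,
# self-contained illustration; the formula and dates below are made up, not
# the project's defaults.
import numpy as np
import patsy

dates = np.arange(726000, 726000 + 16 * 23, 16)  # fake ordinal dates
X = patsy.dmatrix('1 + x + np.sin(2 * np.pi * x / 365.25)', {'x': dates})
print(X.design_info.column_names, X.shape)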
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'], args['total_jobs'],
                                      nrow, interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line,
                image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '%s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line,
                              image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
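# Finding previously cached lines via `update_pattern` above is plain
# fnmatch-style globbing over the cache directory. A standalone illustration;
# the directory and pattern here are placeholders, not the actual cache file
# naming scheme.
import fnmatch
import os

cache_dir = '/tmp/yatsm_cache'  # hypothetical cache directory
pattern = 'yatsm_r*.npz'        # hypothetical cache file pattern
if os.path.isdir(cache_dir):
    previous = fnmatch.filter(os.listdir(cache_dir), pattern)
    print('Found %i cache files to update' % len(previous))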