Exemplo n.º 1
0
def test_refit_none():
    """ ``refit_record`` must hand back None when the model is None or []
    """
    # Both "no model" spellings are exercised with the same prefix/estimator
    for empty_model in (None, []):
        result = refit_record(empty_model, 'ols', None)
        assert result is None
Exemplo n.º 2
0
def test_refit_none():
    """ Refitting with a missing model (None or an empty list) yields None
    """
    no_models = [None, []]
    for candidate in no_models:
        # No estimator is needed: the call is expected to bail out with None
        assert refit_record(candidate, 'ols', None) is None
Exemplo n.º 3
0
def test_refit_nochange_rlm(sim_nochange):
    """ Refit a single record with a robust linear model and check the result

    The refit output must expose ``rlm_coef`` / ``rlm_rmse`` fields whose
    values for the first record match the stored reference numbers.
    """
    from yatsm.regression import RLM

    refit = refit_record(sim_nochange, 'rlm', RLM(maxiter=10),
                         keep_regularized=True)

    # New fields are named after the prefix passed to ``refit_record``
    for field in ('rlm_coef', 'rlm_rmse'):
        assert field in refit.dtype.names

    # Reference values for the first (only) record
    expected_coef = np.array([[-3.84164779e+03, -3.84164779e+03],
                              [5.26200993e-03, 5.26200993e-03]])
    expected_rmse = np.array([0.96866816, 0.96866816])

    first = refit[0]
    np.testing.assert_allclose(first['rlm_coef'], expected_coef)
    np.testing.assert_allclose(first['rlm_rmse'], expected_rmse)
Exemplo n.º 4
0
def test_refit_nochange_rlm(sim_nochange):
    """ Robust linear model refit of one record reproduces reference values
    """
    from yatsm.regression import RLM

    rlm = RLM(maxiter=10)
    options = {'keep_regularized': True}
    refit = refit_record(sim_nochange, 'rlm', rlm, **options)

    field_names = refit.dtype.names
    assert 'rlm_coef' in field_names
    assert 'rlm_rmse' in field_names

    # (field, reference value) pairs checked against the first record
    references = (
        ('rlm_coef', np.array([[-3.84164779e+03, -3.84164779e+03],
                               [5.26200993e-03, 5.26200993e-03]])),
        ('rlm_rmse', np.array([0.96866816, 0.96866816])),
    )
    for field, reference in references:
        np.testing.assert_allclose(refit[0][field], reference)
Exemplo n.º 5
0
def test_refit_nochange_reg(sim_nochange):
    """ Refit with ``keep_regularized=False``

    With the flag off, coefficients equal to zero are not ignored during the
    refit; the OLS results must match the stored reference numbers.
    """
    from sklearn.linear_model import LinearRegression as OLS

    refit = refit_record(sim_nochange, 'ols', OLS(), keep_regularized=False)

    # New fields are named after the prefix passed to ``refit_record``
    for field in ('ols_coef', 'ols_rmse'):
        assert field in refit.dtype.names

    expected_coef = np.array([[-3.83016528e+03, -3.83016528e+03],
                              [5.24635240e-03, 5.24635240e-03]])
    expected_rmse = np.array([0.96794599, 0.96794599])

    first = refit[0]
    np.testing.assert_allclose(first['ols_coef'], expected_coef)
    np.testing.assert_allclose(first['ols_rmse'], expected_rmse)
Exemplo n.º 6
0
def test_refit_nochange_reg(sim_nochange):
    """ OLS refit with ``keep_regularized=False`` (coef == 0 is not ignored)
    """
    from sklearn.linear_model import LinearRegression as OLS

    ols = OLS()
    options = {'keep_regularized': False}
    refit = refit_record(sim_nochange, 'ols', ols, **options)

    field_names = refit.dtype.names
    assert 'ols_coef' in field_names
    assert 'ols_rmse' in field_names

    # (field, reference value) pairs checked against the first record
    references = (
        ('ols_coef', np.array([[-3.83016528e+03, -3.83016528e+03],
                               [5.24635240e-03, 5.24635240e-03]])),
        ('ols_rmse', np.array([0.96794599, 0.96794599])),
    )
    for field, reference in references:
        np.testing.assert_allclose(refit[0][field], reference)
Exemplo n.º 7
0
def test_refit_issue_79(sim_nochange):
    """ Regression test for issue 79: every ``record['coef']`` is zero

    This case previously had no coverage. The fix was to index the record
    with the ``refit_[(coef|rmse)]`` prefixed names.
    """
    from yatsm.regression import RLM

    # Overwrite the fitted coefficients with zeros before refitting
    zeroed = np.zeros_like(sim_nochange.record['coef'])
    sim_nochange.record['coef'] = zeroed

    refit = refit_record(sim_nochange, 'rlm', RLM(maxiter=10),
                         keep_regularized=True)
    for field in ('rlm_coef', 'rlm_rmse'):
        assert field in refit.dtype.names

    # Refit coefficients are expected to stay all-zero
    expected_coef = np.zeros_like(sim_nochange.record[0]['coef'])
    expected_rmse = np.array([0.97117668, 0.97117668])

    first = refit[0]
    np.testing.assert_allclose(first['rlm_coef'], expected_coef)
    np.testing.assert_allclose(first['rlm_rmse'], expected_rmse)
Exemplo n.º 8
0
def test_refit_issue_79(sim_nochange):
    """ Issue 79: cover the refit path where ``record['coef']`` is all zeros

    The fix for the issue was to use the ``refit_[(coef|rmse)]`` prefix
    variable when indexing the record by name.
    """
    from yatsm.regression import RLM

    rlm = RLM(maxiter=10)

    # Force all coefficients in the record to zero
    record = sim_nochange.record
    sim_nochange.record['coef'] = np.zeros_like(record['coef'])

    options = {'keep_regularized': True}
    refit = refit_record(sim_nochange, 'rlm', rlm, **options)

    field_names = refit.dtype.names
    assert 'rlm_coef' in field_names
    assert 'rlm_rmse' in field_names

    # (field, reference value) pairs checked against the first record
    references = (
        ('rlm_coef', np.zeros_like(sim_nochange.record[0]['coef'])),
        ('rlm_rmse', np.array([0.97117668, 0.97117668])),
    )
    for field, reference in references:
        np.testing.assert_allclose(refit[0][field], reference)
Exemplo n.º 9
0
def line(ctx, config, job_number, total_jobs,
         resume, check_cache, do_not_run, verbose_yatsm):
    """ Run the configured YATSM algorithm on this job's share of image lines

    For every line assigned to this job, the time series is read, each column
    is masked and fit with the configured algorithm, optional postprocessing
    (commission test, refits, phenology) is applied, and the records for the
    line are saved with ``np.savez``.

    Args:
        ctx: context object; not used in the body of this function
        config: configuration file location passed to ``parse_config_file``
        job_number: this job's ID, passed to ``distribute_jobs``
        total_jobs: total number of jobs, passed to ``distribute_jobs``
        resume: if true, skip any line whose output file can already be
            loaded with ``np.load``
        check_cache: not used in the body of this function
        do_not_run: if true, read each line's data (the cache flags are
            still passed to ``read_line``) but skip fitting and saving
        verbose_yatsm: if true, set the algorithm logger to DEBUG

    Raises:
        click.Abort: if phenology is requested but ``yatsm.phenology`` could
            not be imported, or if the output directory cannot be created
            or written to
    """
    import errno

    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    # The 'phenology' section is optional: resolve the switch once so every
    # later check tolerates a config that omits it
    phenology_enabled = bool('phenology' in cfg and
                             cfg['phenology'].get('enable'))

    if phenology_enabled and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno == errno.EACCES:
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()
        # errno.EEXIST (directory already there) is the expected case; any
        # other failure is left to the writability check just below

    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # When running in reverse, X is flipped here and Y is flipped per line
    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if phenology_enabled:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except Exception:
                # Missing or unreadable output: fall through and (re)process.
                # ``Exception`` rather than a bare ``except`` so Ctrl-C and
                # interpreter exit are not swallowed here.
                pass
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)

            # Builtin ``bool``: the ``np.bool`` alias no longer exists in
            # current NumPy releases
            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(bool)

            # Drop the mask band and keep only the valid observations
            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if phenology_enabled:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug('    Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
                n=len(job_lines),
                m=round((time.time() - start_time_all) / 60.0, 2)))