def targets_entrypoint(
    batchMB: float,
    shapefile: str,
    records: List[str],
    name: str,
    every: int,
    categorical: bool,
    normalise: bool,
    random_seed: int,
) -> None:
    """Targets entrypoint without click cruft.

    Reads target data from a shapefile and writes it to
    ``targets_<name>.hdf5`` in the current working directory: first the
    point coordinates, then either the categorical or the continuous
    records named in ``records``.

    Args:
        batchMB: Approximate batch size in megabytes for chunked reads.
        shapefile: Path to the source shapefile.
        records: Shapefile record (column) names to import.
        name: Label used to build the output filename.
        every: Unused here; kept for interface compatibility with the
            click entrypoint.  TODO confirm whether subsampling belongs
            in the sources.
        categorical: If True treat ``records`` as categorical, otherwise
            as continuous.
        normalise: If True (continuous only), compute and store the
            per-column mean and standard deviation.
        random_seed: Seed forwarded to the shapefile array sources.
    """
    log.info("Loading shapefile targets")
    out_filename = os.path.join(os.getcwd(), "targets_{}.hdf5".format(name))
    nworkers = 0  # shapefile reading breaks with concurrency

    with tables.open_file(out_filename, mode="w", title=name) as h5file:
        log.info("Reading shapefile point coordinates")
        cocon_src = CoordinateShpArraySource(shapefile, random_seed)
        cocon_batchsize = mb_to_points(
            batchMB, ndim_con=0, ndim_cat=0, ndim_coord=2)
        write_coordinates(cocon_src, h5file, cocon_batchsize)

        if categorical:
            log.info("Reading shapefile categorical records")
            cat_source = CategoricalShpArraySource(
                shapefile, records, random_seed)
            cat_batchsize = mb_to_points(
                batchMB, ndim_con=0, ndim_cat=cat_source.shape[-1])
            catdata = get_maps(cat_source, cat_batchsize)
            mappings, counts = catdata.mappings, catdata.counts
            ncats = np.array([len(m) for m in mappings])
            write_categorical(cat_source, h5file, nworkers, cat_batchsize,
                              mappings)
            cat_meta = meta.CategoricalTarget(
                N=cat_source.shape[0],
                labels=cat_source.columns,
                nvalues=ncats,
                mappings=mappings,
                counts=counts,
            )
            write_target_metadata(cat_meta, h5file)
        else:
            log.info("Reading shapefile continuous records")
            con_source = ContinuousShpArraySource(
                shapefile, records, random_seed)
            con_batchsize = mb_to_points(
                batchMB, ndim_con=con_source.shape[-1], ndim_cat=0)
            # BUG FIX: the original
            #   mean, sd = get_stats(...) if normalise else None, None
            # parsed as `mean, sd = (get_stats(...) if normalise else None), None`
            # so `mean` got the whole (mean, sd) tuple and `sd` was always
            # None.  Parenthesize the conditional so unpacking is correct.
            mean, sd = (get_stats(con_source, con_batchsize)
                        if normalise else (None, None))
            write_continuous(con_source, h5file, nworkers, con_batchsize)
            con_meta = meta.ContinuousTarget(
                N=con_source.shape[0],
                labels=con_source.columns,
                means=mean,
                sds=sd,
            )
            write_target_metadata(con_meta, h5file)
    log.info("Target import complete")
def test_get_categories(mocker):
    """get_maps must recover each column's category values and counts.

    Builds a small random integer array, wraps it in an in-memory
    categorical source, and checks that the discovered mappings match
    the exact value sets per column and that the counts match the true
    occurrence frequencies.
    """
    rnd = np.random.RandomState(seed=666)
    x = rnd.randint(0, 10, size=(20, 3), dtype=CategoricalType)
    missing_in = -1
    columns = ["1", "2", "3"]
    source = NPCatArraySource(x, missing_in, columns)
    batchsize = 3
    res = category.get_maps(source, batchsize)
    mappings, counts = res.mappings, res.counts
    # FIX: the original loop variable shadowed `x` itself
    # (`for m, c, x in zip(..., x.T)`), which only worked because x.T is
    # evaluated before rebinding.  Use a distinct name per column.
    for m, c, col in zip(mappings, counts, x.T):
        assert set(col) == set(m)
        for m_i, c_i in zip(m, c):
            assert c_i == np.sum(col == m_i)
def tifs_entrypoint(nworkers: int,
                    batchMB: float,
                    categorical: List[str],
                    continuous: List[str],
                    normalise: bool,
                    name: str,
                    ignore_crs: bool) -> None:
    """Entrypoint for tifs without click cruft.

    Stacks the continuous and categorical GeoTIFF files into a single
    ``features_<name>.hdf5`` in the current working directory, writing
    feature data plus metadata (stats / category mappings).

    Args:
        nworkers: Number of worker processes for the batched writes.
        batchMB: Approximate batch size in megabytes for chunked reads.
        categorical: Directories/paths searched for categorical TIFs.
        continuous: Directories/paths searched for continuous TIFs.
        normalise: If True, compute per-band mean/sd for the continuous
            data (raising on zero deviation).
        name: Label used to build the output filename.
        ignore_crs: Forwarded to ``shared_image_spec``; presumably skips
            the CRS-consistency check — confirm against its definition.

    Raises:
        errors.NoTifFilesFound: If no TIF files were found at all.
        errors.ZeroDeviation: If a continuous band has zero deviation.
        errors.ConCatNMismatch: If the continuous and categorical stacks
            disagree on the number of pixels.
    """
    out_filename = os.path.join(os.getcwd(), "features_{}.hdf5".format(name))
    con_filenames = tifnames(continuous)
    cat_filenames = tifnames(categorical)
    log.info("Found {} continuous TIF files".format(len(con_filenames)))
    log.info("Found {} categorical TIF files".format(len(cat_filenames)))
    has_con = len(con_filenames) > 0
    has_cat = len(cat_filenames) > 0
    all_filenames = con_filenames + cat_filenames
    # Idiomatic emptiness test (was `if not len(all_filenames) > 0:`).
    if not all_filenames:
        raise errors.NoTifFilesFound()

    N_con, N_cat = None, None
    con_meta, cat_meta = None, None
    spec = shared_image_spec(all_filenames, ignore_crs)

    with tables.open_file(out_filename, mode="w", title=name) as outfile:
        if has_con:
            con_source = ContinuousStackSource(spec, con_filenames)
            ndims_con = con_source.shape[-1]
            con_rows_per_batch = mb_to_rows(batchMB, spec.width, ndims_con, 0)
            # Total pixel count = rows * columns of the image stack.
            N_con = con_source.shape[0] * con_source.shape[1]
            N = N_con
            log.info("Continuous missing value set to {}".format(
                con_source.missing))
            stats = None
            if normalise:
                stats = get_stats(con_source, con_rows_per_batch)
                sd = stats[1]
                # A zero deviation band would produce NaNs on normalise.
                if any(sd == 0.0):
                    raise errors.ZeroDeviation(sd, con_source.columns)
                log.info("Writing normalised continuous data to output file")
            else:
                log.info("Writing unnormalised continuous data to output file")
            con_meta = meta.ContinuousFeatureSet(labels=con_source.columns,
                                                missing=con_source.missing,
                                                stats=stats)
            write_continuous(con_source, outfile, nworkers,
                             con_rows_per_batch, stats)

        if has_cat:
            cat_source = CategoricalStackSource(spec, cat_filenames)
            N_cat = cat_source.shape[0] * cat_source.shape[1]
            N = N_cat
            # Both stacks must describe the same pixels.
            if N_con and N_cat != N_con:
                raise errors.ConCatNMismatch(N_con, N_cat)
            ndims_cat = cat_source.shape[-1]
            cat_rows_per_batch = mb_to_rows(batchMB, spec.width, 0, ndims_cat)
            log.info("Categorical missing value set to {}".format(
                cat_source.missing))
            catdata = get_maps(cat_source, cat_rows_per_batch)
            maps, counts = catdata.mappings, catdata.counts
            ncats = np.array([len(m) for m in maps])
            log.info("Writing mapped categorical data to output file")
            cat_meta = meta.CategoricalFeatureSet(labels=cat_source.columns,
                                                 missing=cat_source.missing,
                                                 nvalues=ncats,
                                                 mappings=maps,
                                                 counts=counts)
            write_categorical(cat_source, outfile, nworkers,
                              cat_rows_per_batch, maps)

        m = meta.FeatureSet(continuous=con_meta,
                            categorical=cat_meta,
                            image=spec,
                            N=N,
                            halfwidth=0)
        write_feature_metadata(m, outfile)
    log.info("Tif import complete")