def load_config_from_file(path):
    config_file = Path(path)
    _, config = next(read_documents(config_file))
    IngestorConfig.validate(config)
    config['filename'] = str(normalise_path(config_file))
    return config
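# A hypothetical usage sketch, assuming read_documents/normalise_path/IngestorConfig
# are the datacube helpers imported by this module; the config path below is made up.
# The loaded dict gains a 'filename' key holding the config file's absolute path.
def _example_load_ingest_config():
    config = load_config_from_file('configs/ls8_ingest.yaml')  # illustrative path
    assert Path(config['filename']).is_absolute()
    return config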
def resolve_location(path: Location) -> str:
    """
    Make sure a dataset location is a URL, suitable to be the dataset_location
    in datacube indexing.

    Users may specify a pathlib.Path(), and we'll convert it as needed.
    """
    if isinstance(path, str):
        if not dc_uris.is_url(path) and not dc_uris.is_vsipath(path):
            raise ValueError(
                "A string location is expected to be a URL or VSI path. "
                "Perhaps you want to give it as a local pathlib.Path()?"
            )
        return path

    path = dc_uris.normalise_path(path)
    if ".tar" in path.suffixes:
        return f"tar:{path}!/"
    elif ".zip" in path.suffixes:
        return f"zip:{path}!/"
    else:
        uri = path.as_uri()
        # Base paths specified as directories must end in a slash,
        # so they will be url joined as subfolders. (pathlib strips them)
        if path.is_dir():
            return f"{uri}/"
        return uri
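# Illustrative calls following the branches above (paths are made up; the
# directory branch additionally requires the path to exist, since it checks
# path.is_dir()):
def _example_resolve_location():
    # Strings must already be URLs or VSI paths; they are returned unchanged.
    assert resolve_location("https://example.com/scene.zip") == "https://example.com/scene.zip"
    # pathlib Paths are normalised and converted to indexable locations.
    assert resolve_location(Path("/data/scene.tar")) == "tar:/data/scene.tar!/"
    assert resolve_location(Path("/data/scene.zip")) == "zip:/data/scene.zip!/"
    assert resolve_location(Path("/data/scene.txt")) == "file:///data/scene.txt"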
def test_normalise_path():
    cwd = Path('.').resolve()
    assert normalise_path('.').resolve() == cwd

    p = Path('/a/b/c/d.txt')
    assert normalise_path(p) == Path(p)
    assert normalise_path(str(p)) == Path(p)

    base = Path('/a/b/')
    p = Path('c/d.txt')
    assert normalise_path(p, base) == (base / p)
    assert normalise_path(str(p), str(base)) == (base / p)
    assert normalise_path(p) == (cwd / p)

    with pytest.raises(ValueError):
        normalise_path(p, 'not/absolute/path')
def mk_uri(self, file_path, storage_config):
    """
    Constructs a URI from the file_path and storage config.

    A typical implementation should return f'{scheme}://{file_path}'

    Example:
        file_path = '/path/to/my_file.nc'
        storage_config = {'driver': 'NetCDF CF'}

        mk_uri(file_path, storage_config) should return
        'file:///path/to/my_file.nc'

    :param Path file_path: The file path of the file to be converted into a URI.
    :param dict storage_config: The dict holding the storage config found in the
        ingest definition.
    :return: file_path as a URI that the Driver understands.
    :rtype: str
    """
    return normalise_path(file_path).as_uri()
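# A minimal sketch of how a driver might override mk_uri for a non-file scheme.
# The class name and the 's3' handling are illustrative assumptions, not an
# existing datacube driver.
class ExampleS3WriterDriver:
    def mk_uri(self, file_path, storage_config):
        # Hypothetical: address outputs with an s3 scheme when the storage
        # config asks for it, otherwise fall back to the default file:// form.
        if storage_config.get('driver') == 's3':
            return f"s3://{file_path}"
        return normalise_path(file_path).as_uri()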
def mk_uri(file_path):
    if driver.uri_scheme == "file":
        return normalise_path(file_path).as_uri()
    return '{}://{}'.format(driver.uri_scheme, file_path)
def main(
    local_config: LocalConfig,
    output_base: Optional[Path],
    input_relative_to: Optional[Path],
    datasets: Tuple[Path],
    datasets_path: Optional[Path],
    provider: Optional[str],
    overwrite_existing: bool,
    verbose: bool,
    workers: int,
    thoroughly_check_existing: bool,
    embed_location: Optional[bool],
    only_regions_in_file: Optional[Path],
    before_month: Optional[Tuple[int, int]],
    after_month: Optional[Tuple[int, int]],
    dry_run: bool,
    always_granule_id: Optional[bool],
    index_to_odc: bool,
):
    if sys.argv[1] == "sentinel-l1c":
        warnings.warn(
            "Command name 'sentinel-l1c-prepare' is deprecated: "
            "remove the 'c', and use `sentinel-l1-prepare`"
        )

    included_regions = None
    if only_regions_in_file:
        included_regions = set(only_regions_in_file.read_text().splitlines())

    if datasets_path:
        datasets = [
            *datasets,
            *(
                normalise_path(p.strip())
                for p in datasets_path.read_text().splitlines()
            ),
        ]

    _LOG.info("kickoff", path_count=len(datasets), worker_count=workers)

    # Are we indexing on success?
    index = None
    if index_to_odc:
        _LOG.info("Indexing new datasets", local_config=local_config)
        index = index_connect(local_config, application_name="s2-prepare")
        products = {}

        def on_success(dataset: DatasetDoc, dataset_path: Path):
            """
            Index the dataset
            """
            product_name = dataset.product.name
            product = products.get(product_name)
            if not product:
                product = index.products.get_by_name(product_name)
                if not product:
                    raise ValueError(
                        f"Product {product_name} not found in ODC index"
                    )
                products[product_name] = product

            index.datasets.add(
                Dataset(product, serialise.to_doc(dataset), uris=dataset.locations)
            )
            _LOG.debug(
                "Indexed dataset", dataset_id=dataset.id, dataset_path=dataset_path
            )

    else:

        def on_success(dataset: DatasetDoc, dataset_path: Path):
            """Nothing extra"""

    def find_inputs_in_path(input_path: Path) -> Iterable[InputDataset]:
        """
        Scan the input path for our key identifying files of a package.
        """
        found_something = False
        if provider == "sinergise.com" or not provider:
            for p in _rglob_with_self(input_path, "tileInfo.json"):
                found_something = True
                yield InputDataset(
                    producer="sinergise.com",
                    # Dataset location is the metadata file itself.
                    path=p,
                    # Output is a sibling metadata file, with the same name
                    # as the folder (usually S2A....).
                    base_folder=p.parent.parent,
                    name=p.parent.stem,
                )
        if provider == "esa.int" or not provider:
            for p in _rglob_with_self(input_path, "*.zip"):
                found_something = True
                yield InputDataset(
                    producer="esa.int",
                    # Dataset location is the zip file
                    path=p,
                    # Metadata is a sibling file with a metadata suffix.
                    base_folder=p.parent,
                    name=p.stem,
                )
        if not found_something:
            raise ValueError(
                f"No S2 datasets found in given path {input_path}. "
                f"Expected either Sinergise (tileInfo.json) files or ESA zip files "
                f"to be contained in it."
            )

    def find_jobs() -> Iterable[Job]:
        region_lookup = RegionLookup()
        nonlocal input_relative_to, embed_location

        for input_path in datasets:
            first = True
            for found_dataset in find_inputs_in_path(input_path):
                _LOG.debug("found_dataset", name=found_dataset.name)
                # Make sure we tick progress on extra datasets that were found.
                if not first:
                    first = False

                # Filter based on metadata
                info = found_dataset.metadata

                # Skip regions that are not in the limit?
                if included_regions or before_month or after_month:
                    if info is None:
                        raise ValueError(
                            f"Cannot filter from non-standard folder layout: {found_dataset.path} "
                            f" expected of form L1C/yyyy/yyyy-mm/area/S2_.."
                        )

                    if included_regions:
                        # If it's an older dataset without a region, try to map its area to a known region.
                        if info.region_code is None:
                            for region in region_lookup.get(info.area):
                                if region in included_regions:
                                    _LOG.debug(
                                        "mapped_area_match",
                                        input_area=info.area,
                                        region_match=region,
                                    )
                                    break
                            else:
                                _LOG.debug(
                                    "skipping.mapped_area_not_in_regions",
                                    input_area=info.area,
                                )
                                continue
                        elif info.region_code not in included_regions:
                            _LOG.debug(
                                "skipping.region_not_in_region_list",
                                region_code=info.region_code,
                            )
                            continue

                    if after_month is not None:
                        year, month = after_month
                        if info.year < year or (
                            info.year == year and info.month < month
                        ):
                            _LOG.debug(
                                "skipping.too_old",
                                dataset_year_month=(info.year, info.month),
                                max_year_month=(year, month),
                            )
                            continue

                    if before_month is not None:
                        year, month = before_month
                        if info.year > year or (
                            info.year == year and info.month > month
                        ):
                            _LOG.debug(
                                "skipping.too_young",
                                dataset_year_month=(info.year, info.month),
                                min_year_month=(year, month),
                            )
                            continue

                # Put outputs in a different folder?
                if output_base:
                    # What base folder should we choose for creating subfolders in the output?
                    if input_relative_to is None:
                        input_relative_to = _get_default_relative_folder_base(
                            found_dataset.base_folder
                        )
                    output_folder = output_base / found_dataset.base_folder.relative_to(
                        input_relative_to
                    )
                    # Default to true.
                    if embed_location is None:
                        embed_location = True
                else:
                    output_folder = found_dataset.base_folder
                    # Default to false
                    if embed_location is None:
                        embed_location = False

                # It's very slow to read the list of inner granules.
                #
                # So, if we're not thoroughly checking for missing outputs...
                if (
                    (not thoroughly_check_existing)
                    # ... and any outputs exist at all
                    and list(
                        output_folder.glob(f"{found_dataset.name}*.odc-metadata.yaml")
                    )
                    # ... and we're not overwriting our outputs
                    and not overwrite_existing
                ):
                    # Skip it!
                    _LOG.debug(
                        "At least one output exists: skipping.",
                        dataset_name=found_dataset.name,
                    )
                    continue

                # This has to read the files, so can be slow. That's why we
                # try to skip above if possible.
                granule_ids = found_dataset.granule_ids

                # When granule_id is None, it means process all without filtering.
                if not granule_ids:
                    granule_ids = [None]
                else:
                    _LOG.debug("found_granules", granule_count=len(granule_ids))

                for granule_id in granule_ids:
                    if always_granule_id or (
                        # None means 'auto': ie. automatically include granule id
                        # when there are multiple granules
                        always_granule_id is None
                        and len(granule_ids) > 1
                    ):
                        yaml_filename = (
                            f"{found_dataset.name}.{granule_id}.odc-metadata.yaml"
                        )
                    else:
                        yaml_filename = f"{found_dataset.name}.odc-metadata.yaml"

                    output_yaml = output_folder / yaml_filename

                    if output_yaml.exists():
                        if not overwrite_existing:
                            _LOG.debug(
                                "Output exists: skipping.", output_yaml=output_yaml
                            )
                            continue
                        _LOG.debug(
                            "Output exists: overwriting.", output_yaml=output_yaml
                        )

                    _LOG.info(
                        "queued",
                        dataset_name=found_dataset.name,
                        granule=granule_id or "any",
                    )
                    yield Job(
                        dataset_path=found_dataset.path,
                        output_yaml_path=output_yaml,
                        producer=found_dataset.producer,
                        granule_id=granule_id,
                        embed_location=embed_location,
                    )

    errors = 0

    if dry_run:
        _LOG.info("Dry run: not writing any files.")

    # If only one process, call it directly.
    # (Multiprocessing makes debugging harder, so we prefer to make it optional)
    successes = 0
    try:
        if workers == 1 or dry_run:
            for job in find_jobs():
                try:
                    if dry_run:
                        _LOG.info(
                            "Would write dataset",
                            dataset_path=job.dataset_path,
                            output_yaml_path=job.output_yaml_path,
                        )
                    else:
                        dataset, path = prepare_and_write(
                            job.dataset_path,
                            job.output_yaml_path,
                            job.producer,
                            granule_id=job.granule_id,
                            embed_location=job.embed_location,
                        )
                        _LOG.info(
                            "Wrote dataset", dataset_id=dataset.id, dataset_path=path
                        )
                        on_success(dataset, path)
                    successes += 1
                except Exception:
                    _LOG.exception("failed_job", job=job)
                    errors += 1
        else:
            with Pool(processes=workers) as pool:
                for res in pool.imap_unordered(_write_dataset_safe, find_jobs()):
                    if isinstance(res, str):
                        _LOG.error(res)
                        errors += 1
                    else:
                        dataset, path = res
                        _LOG.info(
                            "Wrote dataset", dataset_id=dataset.id, dataset_path=path
                        )
                        on_success(dataset, path)
                        successes += 1
                pool.close()
                pool.join()
    finally:
        if index is not None:
            index.close()

    _LOG.info("completed", success_count=successes, failure_count=errors)
    sys.exit(errors)
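# _write_dataset_safe is only referenced above via pool.imap_unordered: the loop
# treats a returned str as an error message and anything else as a (dataset, path)
# pair. A minimal sketch consistent with that contract; this body is an assumption,
# not the project's actual implementation.
import traceback
from typing import Tuple, Union


def _write_dataset_safe_sketch(job: Job) -> Union[Tuple[DatasetDoc, Path], str]:
    """Never raise across the process boundary: return an error string instead."""
    try:
        return prepare_and_write(
            job.dataset_path,
            job.output_yaml_path,
            job.producer,
            granule_id=job.granule_id,
            embed_location=job.embed_location,
        )
    except Exception:
        return f"Failed to prepare {job.dataset_path}:\n{traceback.format_exc()}"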