Example #1
def cli(index, all_, collections_):
    """
    Find duplicate datasets for a collection.

    (e.g. if a dataset has been reprocessed but both versions are indexed and active)

    This uses the unique fields defined in a collection to try to group them.

    Note that this is still a prototype: it won't report all duplicates, because the defined unique fields aren't precise enough:

      - Scenes are grouped by "day", not "solar day".

      - Tiled products should be grouped by tile_index, but it isn't in the metadata.

    """
    collections.init_nci_collections(index)

    if all_:
        collection_names = collections.registered_collection_names()
    else:
        collection_names = collections_

    write_duplicates_csv(
        index, [collections.get_collection(name) for name in collection_names],
        sys.stdout)
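
The grouping described in the docstring is essentially a dictionary keyed on the collection's unique fields. A minimal, self-contained sketch of that idea (the find_duplicate_groups name, the record layout, and the field names are assumptions; this is not the project's write_duplicates_csv):

# Standard-library defaultdict; unrelated to the project's `collections` module used above.
from collections import defaultdict


def find_duplicate_groups(datasets, unique_fields):
    """Group dataset records by their unique-field values; keep groups with more than one member."""
    groups = defaultdict(list)
    for dataset in datasets:
        key = tuple(dataset[field] for field in unique_fields)
        groups[key].append(dataset)
    return {key: group for key, group in groups.items() if len(group) > 1}


# Two records sharing the same (platform, day) key are reported as duplicates.
records = [
    {'id': 'a', 'platform': 'LANDSAT_8', 'day': '2018-01-01'},
    {'id': 'b', 'platform': 'LANDSAT_8', 'day': '2018-01-01'},
    {'id': 'c', 'platform': 'LANDSAT_8', 'day': '2018-01-02'},
]
print(find_duplicate_groups(records, ('platform', 'day')))
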
Example #2
def cli(index: Index, collection_specifiers: Iterable[str], cache_folder: str,
        format_: str, output_file: str, min_trash_age_hours: int, jobs: int,
        **fix_settings):
    """
    Update a datacube index to the state of the filesystem.

    This will update locations, trash or index new datasets, depending on the chosen options.
    """
    uiutil.init_logging()

    if fix_settings['index_missing'] and fix_settings['trash_missing']:
        click.echo(
            'Can either index missing datasets (--index-missing) or trash them (--trash-missing), '
            'but not both at the same time.',
            err=True)
        sys.exit(1)

    cs.init_nci_collections(index)

    mismatches = get_mismatches(cache_folder, collection_specifiers, format_,
                                jobs)

    out_f = sys.stdout
    try:
        if output_file:
            out_f = open(output_file, 'w')

        fixes.fix_mismatches(mismatches,
                             index,
                             min_trash_age_hours=min_trash_age_hours,
                             **fix_settings)
    finally:
        if output_file:
            out_f.close()
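
The try/finally block above is the classic way to write to either a named file or stdout. As an alternative sketch (not the project's code), contextlib can manage the optional file so the close happens automatically:

import contextlib
import sys


def open_output(output_file):
    """Return a context manager yielding the named file, or stdout if none was given."""
    if output_file:
        return open(output_file, 'w')
    # nullcontext (Python 3.7+) leaves stdout open when the block exits.
    return contextlib.nullcontext(sys.stdout)


with open_output(None) as out_f:
    print('mismatch report goes here', file=out_f)
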
Example #3
def main(folders: Iterable[str],
         dry_run: bool,
         queue: str,
         project: str,
         work_folder: str,
         cache_folder: str,
         max_jobs: int,
         concurrent_jobs: int,
         submit_limit: int):
    """
    Submit PBS jobs to run dea-sync.

    Note that this is currently specific to tiled products, as it expects their folder naming conventions
    when splitting up jobs. TODO: generalise the task_split() function.

    Example usage: dea-submit-sync 5fc /g/data/fk4/datacube/002/LS5_TM_FC

    5fc is just the name for the job: subsequent resubmissions will not rerun jobs with the same name
    if output files exist.

    A run folder (defaulting to `runs` in the current directory) is used for storing output status.
    """
    input_paths = [Path(folder).absolute() for folder in folders]

    with index_connect(application_name='sync-submit') as index:
        collections.init_nci_collections(index)
        submitter = SyncSubmission(cache_folder, project, queue, dry_run, verbose=True, workers=4)
        click.echo(
            "{} input path(s)".format(len(input_paths))
        )
        tasks = _paths_to_tasks(input_paths)
        click.echo(
            "Found {} tasks across collection(s): {}".format(
                len(tasks),
                ', '.join(set(t.collection.name for t in tasks))
            )
        )

        if len(tasks) > max_jobs:
            click.echo(
                "Grouping (max_jobs={})".format(max_jobs)
            )
        tasks = group_tasks(tasks, maximum=max_jobs)

        total_datasets = sum(t.dataset_count for t in tasks)
        click.secho(
            "Submitting {} total jobs with {} datasets (avg {:.2f} each)...".format(
                len(tasks),
                total_datasets,
                total_datasets / len(tasks)
            ),
            bold=True
        )

        _find_and_submit(tasks, work_folder, concurrent_jobs, submit_limit, submitter)
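
The docstring notes that job splitting relies on the tiled products' folder naming conventions; the grouping step itself only has to cap the number of submitted jobs at max_jobs. A rough, hypothetical illustration of that capping alone (chunk_tasks is an invented name, not the real group_tasks):

def chunk_tasks(tasks, maximum):
    """Split a task list into at most `maximum` roughly even chunks."""
    if not tasks:
        return []
    size = -(-len(tasks) // maximum)  # ceiling division
    return [tasks[i:i + size] for i in range(0, len(tasks), size)]


print(chunk_tasks(list(range(10)), maximum=3))
# -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
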
Example #4
def cli(index, dry_run, paths, destination, checksum):
    """
    Move the given folder of datasets into the given destination folder.

    This will checksum the data, copy it to the destination, and mark the original as archived in the DEA index.


    Notes:

    * An operator can later run dea-clean to trash the archived original locations.

    * Source datasets with failing checksums will be left as-is, with a warning logged.

    * Both the source path(s) and the destination are expected to contain existing DEA collections.
    (See collections.py and paths.py)
    """
    init_logging()
    init_nci_collections(index)

    if not is_base_directory(destination):
        raise click.BadArgumentUsage(
            'Not a known DEA base directory: {}\nExpected one of:\n\t{}'.
            format(destination, '\n\t'.join(BASE_DIRECTORIES)))

    # We want to iterate all datasets in the given input folder, so we find collections that exist in
    # that folder and then iterate through all the collection datasets within that folder. Simple :)

    # We do this aggressively to find errors in arguments immediately. (with the downside of `paths` memory usage)
    resulting_paths = []
    for input_path in map(Path, paths):
        collections = list(get_collections_in_path(input_path))
        if not collections:
            raise click.BadArgumentUsage(
                f"Directory doesn't match any known collections: {input_path}")

        for collection in collections:
            resulting_paths.extend(
                list(collection.iter_fs_paths_within(input_path)))

    _LOG.info("dataset.count",
              input_count=len(paths),
              dataset_count=len(resulting_paths))

    # TODO: @ui.executor_cli_options
    move_all(index,
             resulting_paths,
             Path(destination),
             dry_run=dry_run,
             checksum=checksum)
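
For context, a hedged sketch of the base-directory guard used above (both BASE_DIRECTORIES and the exact rule, exact match versus contained-within, are assumptions here; only the /g/data/fk4 path appears elsewhere in these examples):

from pathlib import Path

# Hypothetical stand-in for the project's known DEA base directories.
BASE_DIRECTORIES = [Path('/g/data/fk4/datacube/002')]


def is_base_directory(destination):
    """True if `destination` is, or sits inside, one of the known base directories."""
    dest = Path(destination).absolute()
    return any(dest == base or base in dest.parents for base in BASE_DIRECTORIES)


print(is_base_directory('/g/data/fk4/datacube/002/LS5_TM_FC'))  # True
print(is_base_directory('/tmp/somewhere-else'))                 # False
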
Example #5
    return ' '.join(printable(v) for v in val)


@printable.register(UUID)
def printable_uuid(val):
    return str(val)


def _write_csv(unique_fields, dicts, stream, append=False):
    writer = csv.DictWriter(stream, _get_headers(unique_fields))
    if not append:
        writer.writeheader()
    writer.writerows(({k: printable(v) for k, v in d.items()} for d in dicts))
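# Usage sketch (added for illustration; not part of the original module): any
# file-like stream can be passed in, e.g.
#
#     with open('duplicates.csv', 'w') as out:
#         _write_csv(unique_fields, duplicate_rows, out)
#
# where `unique_fields` and `duplicate_rows` would come from the collection's
# duplicate query (both names here are assumptions).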


collections.init_nci_collections(None)


@click.command('duplicates')
@global_cli_options
@click.option('-a', '--all_', is_flag=True)
@click.argument('collections_',
                type=click.Choice(collections.registered_collection_names()),
                nargs=-1)
@pass_index(app_name="find-duplicates")
def cli(index, all_, collections_):
    """
    Find duplicate datasets for a collection.

    (e.g. if a dataset has been reprocessed but both versions are indexed and active)