def test_create_derived_proto_dataset(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "derived-data-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    with dtoolcore.DataSetCreator("raw-data-ds", base_uri) as dataset_creator:
        source_dataset_uri = dataset_creator.uri
    source_dataset = dtoolcore.DataSet.from_uri(source_dataset_uri)

    proto_dataset = dtoolcore.create_derived_proto_dataset(
        name=name,
        base_uri=base_uri,
        source_dataset=source_dataset,
        readme_content=readme_content,
        creator_username=creator_username
    )

    assert isinstance(proto_dataset, dtoolcore.ProtoDataSet)
    assert proto_dataset._admin_metadata["creator_username"] == creator_username  # NOQA
    assert proto_dataset.name == name

    # Test the annotations.
    assert proto_dataset.get_annotation("source_dataset_name") == source_dataset.name  # NOQA
    assert proto_dataset.get_annotation("source_dataset_uri") == source_dataset.uri  # NOQA
    assert proto_dataset.get_annotation("source_dataset_uuid") == source_dataset.uuid  # NOQA
def test_DataSetCreator(tmp_dir_fixture):  # NOQA
    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"
    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        readme_content=readme_content,
        creator_username=creator_username
    ) as dataset_creator:
        assert dataset_creator.name == name
        uri = dataset_creator.uri
        handle = dataset_creator.put_item(local_file_path, "subdir/tiny.png")
        dataset_creator.add_item_metadata(handle, "ext", ".png")

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier("subdir/tiny.png")
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Check the item metadata.
    expected_ext_overlay = {expected_identifier: ".png"}
    assert dataset.get_overlay("ext") == expected_ext_overlay
def test_DataSetCreator_staging_api(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        readme_content=readme_content,
        creator_username=creator_username
    ) as dataset_creator:
        # Ensure that the staging directory exists.
        assert os.path.isdir(dataset_creator.staging_directory)
        uri = dataset_creator.uri

    # Ensure that the staging directory has been removed.
    assert not os.path.isdir(dataset_creator.staging_directory)

    # The below would raise if the dataset was not frozen.
    dtoolcore.DataSet.from_uri(uri)
def main(config_fpath):
    logging.basicConfig(level=logging.INFO)

    config = Config(config_fpath)
    params = SimpleNamespace(**config.params)
    dl = DataLoader(config.raw_config)
    specs = get_specs(config)

    readme_str = config.as_readme_format()

    dfs = []
    with dtoolcore.DataSetCreator(
        config.output_name, config.output_base_uri
    ) as output_ds:
        for spec in specs:
            # FIXME
            logger.info("Processing n={expid}".format(**spec))
            try:
                dataitem = dl.load_by_specifier(**spec)
                df = process_dataitem(dataitem, spec, params, config, output_ds)
                df['expid'] = spec['expid']
                dfs.append(df)
            except FileNotFoundError as err:
                logger.warning(f"Couldn't load: {err}")

        summary_output_abspath = output_ds.prepare_staging_abspath_promise("summary.csv")
        pd.concat(dfs).to_csv(summary_output_abspath, index=False)

        output_ds.put_readme(readme_str)
def main(config_fpath):
    logging.basicConfig(level=logging.INFO)

    config = Config(config_fpath)
    ids = ImageDataSet(config.ids_uri)
    specs = [
        stack_tuple_to_spec(tp)
        for tp in ids.all_possible_stack_tuples()
    ]

    with dtoolcore.DataSetCreator(
        config.output_name, config.output_base_uri
    ) as output_ds:
        logging.info(f"Monitor at: {output_ds.staging_directory}")
        for spec in specs:
            try:
                logging.info(f"Processing {spec}")
                process_image_and_series(config, spec, output_ds)
            except KeyError:
                logging.error(f"Failed on {spec}")

        readme_str = config.as_readme_format()
        output_ds.put_readme(readme_str)
def convert_image_data(data_root_fpath, output_base_uri, output_name):
    # TODO - bfconvert preflight
    glob = "*.czi"
    ids = IndexedDirtree(data_root_fpath, glob=glob)

    with dtoolcore.DataSetCreator(output_name, output_base_uri) as output_ds:
        raw_image_dataset_to_image_dataset(ids, output_ds)
def create_tensor_dataset_from_arrays(
    output_base_uri, output_name, data_array, label_array, image_dim,
    readme_content
):
    """Create a dtool DataSet with the necessary annotations to be used
    as a TensorDataSet.

    Args:
        output_base_uri: The base URI where the dataset will be created.
        output_name: The name for the output dataset.
        data_array (ndarray): The numpy array holding data.
        label_array (ndarray): The numpy array holding labels.
        image_dim (tuple): Dimensions to which input images should be
            reshaped.
        readme_content (string): Content that will be used to create
            README.yml in the created dataset.

    Returns:
        URI: The URI of the created dataset
    """

    with dtoolcore.DataSetCreator(output_name, output_base_uri) as qds:
        data_fpath = qds.prepare_staging_abspath_promise('data.npy')
        np.save(data_fpath, data_array)
        labels_fpath = qds.prepare_staging_abspath_promise('labels.npy')
        np.save(labels_fpath, label_array)

        data_idn = dtoolcore.utils.generate_identifier('data.npy')
        qds.put_annotation("tensor_file_idn", data_idn)
        qds.put_annotation("image_dimensions", image_dim)
        qds.put_annotation("dtoolAI.inputtype", "TensorDataSet")

        qds.put_readme(readme_content)

    return qds.uri
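# A minimal usage sketch for create_tensor_dataset_from_arrays above.
# The base URI, dataset name, array shapes, and README text are
# illustrative assumptions, not taken from the original code; numpy is
# assumed to be installed and /tmp/datasets to exist and be writable.
import numpy as np

data = np.random.rand(10, 64 * 64).astype(np.float32)  # 10 flattened 64x64 images
labels = np.random.randint(0, 2, size=10)               # binary labels, one per image

uri = create_tensor_dataset_from_arrays(
    output_base_uri="/tmp/datasets",      # hypothetical writable base URI
    output_name="example-tensor-ds",      # hypothetical dataset name
    data_array=data,
    label_array=labels,
    image_dim=(64, 64),
    readme_content="---\ndescription: example tensor dataset",
)
print(uri)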
def image_mask_dataset_from_im_mask_iter_only(output_base_uri, output_name, im_mask_iter):
    with dtoolcore.DataSetCreator(
        output_name, output_base_uri
    ) as output_ds:
        for n, (image, mask) in enumerate(im_mask_iter):
            metadata_appends = add_image_mask_pair(output_ds, image, mask, n)
            for relpath, key, value in metadata_appends:
                output_ds.add_item_metadata(relpath, key, value)
def main(source_dirpath, output_base_uri, output_name):
    dirpath = pathlib.Path(source_dirpath)
    diter = dirpath.glob("*.dbim")

    with dtoolcore.DataSetCreator(output_name, output_base_uri) as output_ds:
        for fpath in diter:
            name = fpath_to_name(fpath)
            files = load_file_data(dirpath / f"{name}.csv")
            handle = output_ds.put_item(fpath, fpath.name)
            output_ds.add_item_metadata(handle, "files", files)
def test_DataSetCreator_put_annotation(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    creator_username = "******"

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        creator_username=creator_username
    ) as dataset_creator:
        uri = dataset_creator.uri
        dataset_creator.put_annotation("key", "value")

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    assert dataset.get_annotation("key") == "value"
def test_frozen_at_value_when_copying_dataset(tmp_dir_fixture):  # NOQA
    with dc.DataSetCreator("delete-me", tmp_dir_fixture) as ds_creator:
        src_uri = ds_creator.uri

    dest_base_uri = os.path.join(tmp_dir_fixture, "dest")
    os.mkdir(dest_base_uri)

    src_dataset = dc.DataSet.from_uri(src_uri)
    src_frozen_at = src_dataset._admin_metadata["frozen_at"]

    time.sleep(0.1)

    dest_uri = dc.copy(src_uri, dest_base_uri)
    dest_dataset = dc.DataSet.from_uri(dest_uri)
    dest_frozen_at = dest_dataset._admin_metadata["frozen_at"]

    assert src_frozen_at == dest_frozen_at
def main(root_dirpath, output_base_uri, output_name):
    logging.basicConfig(level=logging.INFO)

    dirpaths_fns = []
    for dirpath, dirnames, filenames in os.walk(root_dirpath):
        for fn in filenames:
            if is_image(fn):
                dirpaths_fns.append((dirpath, fn))

    with dtoolcore.DataSetCreator(output_name, output_base_uri) as output_ds:
        for dirpath, filename in dirpaths_fns:
            basedir = os.path.basename(dirpath)
            relpath = f"{basedir}/{filename}"
            src_abspath = os.path.join(dirpath, filename)
            dst_abspath = output_ds.prepare_staging_abspath_promise(relpath)
            shutil.copy(src_abspath, dst_abspath)
def test_DataSetCreator_put_readme(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        creator_username=creator_username
    ) as dataset_creator:
        uri = dataset_creator.uri
        assert dataset_creator.proto_dataset.get_readme_content() == ""
        dataset_creator.put_readme(readme_content)

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    assert readme_content == dataset.get_readme_content()
def test_DataSetCreator_staging_api_stage_item(tmp_dir_fixture):  # NOQA
    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"
    handle = "subdir/test.txt"

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        readme_content=readme_content,
        creator_username=creator_username
    ) as dataset_creator:
        # Ensure that the staging directory exists.
        assert os.path.isdir(dataset_creator.staging_directory)

        # Add an item more programmatically.
        staging_abspath = dataset_creator.prepare_staging_abspath_promise(handle)  # NOQA
        with open(staging_abspath, "w") as fh:
            fh.write("Hello world!")

        uri = dataset_creator.uri

    # Ensure that the staging directory has been removed.
    assert not os.path.isdir(dataset_creator.staging_directory)

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers

    manual_item_props = dataset.item_properties(expected_identifier)
    assert manual_item_props["size_in_bytes"] == 12

    assert len(dataset.identifiers) == 1
def test_DataSetCreator_does_not_freeze_if_raises(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    try:
        with dtoolcore.DataSetCreator(
            name=name,
            base_uri=base_uri,
            readme_content=readme_content,
            creator_username=creator_username
        ) as dataset_creator:
            uri = dataset_creator.uri
            raise RuntimeError("Something went wrong")
    except RuntimeError:
        # The below would raise if the dataset was frozen.
        dtoolcore.ProtoDataSet.from_uri(uri)
def test_DerivedDataSetCreator(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "derived-data-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    creator_username = "******"

    with dtoolcore.DataSetCreator("raw-data-ds", base_uri) as dataset_creator:
        source_dataset_uri = dataset_creator.uri
    source_dataset = dtoolcore.DataSet.from_uri(source_dataset_uri)

    with dtoolcore.DerivedDataSetCreator(
        name=name,
        base_uri=base_uri,
        source_dataset=source_dataset,
        creator_username=creator_username
    ) as derived_dataset_creator:
        derived_dataset_uri = derived_dataset_creator.uri

    # The below would raise if the dataset was not frozen.
    dtoolcore.DataSet.from_uri(derived_dataset_uri)
def test_promised_abspath_missing_raises(tmp_dir_fixture):  # NOQA
    import dtoolcore

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"
    handle = "test.txt"

    with pytest.raises(dtoolcore.DtoolCoreBrokenStagingPromise):
        with dtoolcore.DataSetCreator(
            name=name,
            base_uri=base_uri,
            readme_content=readme_content,
            creator_username=creator_username
        ) as dataset_creator:
            # Promise to stage an item, but never create the file.
            staging_abspath = dataset_creator.prepare_staging_abspath_promise(handle)  # NOQA
def process_from_config(config_fpath):
    logging.basicConfig(level=logging.INFO)

    config = Config(config_fpath)
    params = SimpleNamespace(**config.params)
    dl = DataLoader(config.raw_config)
    specs = get_specs(config)

    from fishtools.data import load_multiannotation_di

    readme_str = config.as_readme_format()

    dfs = []
    with dtoolcore.DataSetCreator(
        config.output_name, config.output_base_uri
    ) as output_ds:
        for spec in specs:
            logger.info("Processing n={expid}".format(**spec))
            try:
                # FIXME - naming!
                dataitem = load_multiannotation_di(config, spec, config.use_deconv)
                df = process_dataitem(dataitem, spec, params, config, output_ds)
                df['expid'] = spec['expid']
                dfs.append(df)
            except (FileNotFoundError, IndexError) as err:
                logger.warning(f"Couldn't load: {err}")

        summary_output_abspath = output_ds.prepare_staging_abspath_promise("summary.csv")
        pd.concat(dfs).to_csv(summary_output_abspath, index=False)

        output_ds.put_readme(readme_str)
def image_dataset_from_dirtree(dirtree_dirpath, output_base_uri, output_name):
    """Create an ImageDataSet from a root path containing directories of
    images.

    Each named subdirectory is assumed to name a category of images.

    Args:
        dirtree_dirpath: Path to the root of a directory tree containing
            named subdirectories.
        output_base_uri: Base URI at which the DataSet will be created.
        output_name: Name of the created DataSet.

    Returns:
        URI of the created ImageDataSet.
    """

    categories = os.listdir(dirtree_dirpath)

    def relpath_from_srcpath(srcpath, cat):
        return cat + '/' + os.path.basename(srcpath)

    items_to_include = {}
    for cat in categories:
        srcpath_iter = (Path(dirtree_dirpath) / cat).iterdir()
        items_to_include.update({
            srcpath: (relpath_from_srcpath(srcpath, cat), cat)
            for srcpath in srcpath_iter
        })

    category_encoding = {c: n for n, c in enumerate(categories)}

    abs_dirpath = os.path.abspath(dirtree_dirpath)
    readme_content = IMAGEDS_README_TEMPLATE.format(dirpath=abs_dirpath)
    with dtoolcore.DataSetCreator(
        output_name, output_base_uri, readme_content=readme_content
    ) as output_ds:
        for srcpath, (relpath, cat) in items_to_include.items():
            handle = output_ds.put_item(srcpath, relpath)
            output_ds.add_item_metadata(handle, 'category', cat)
        output_ds.proto_dataset.put_annotation('category_encoding', category_encoding)
        output_ds.proto_dataset.put_annotation("dtoolAI.inputtype", "ImageDataSet")

    return output_ds.uri
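# A minimal usage sketch for image_dataset_from_dirtree above. The
# directory layout, base URI, and dataset name are illustrative
# assumptions: a hypothetical /tmp/images root with one subdirectory per
# category (e.g. /tmp/images/cats, /tmp/images/dogs), each already
# populated with image files, and a writable /tmp/datasets base URI.
uri = image_dataset_from_dirtree(
    dirtree_dirpath="/tmp/images",      # hypothetical category-per-subdirectory root
    output_base_uri="/tmp/datasets",    # hypothetical writable base URI
    output_name="example-image-ds",     # hypothetical dataset name
)
print(uri)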
def main(config_fpath):
    logging.basicConfig(level=logging.INFO)

    config = Config(config_fpath)
    params = SimpleNamespace(**config.params)
    dl = DataLoader(config.raw_config)
    specs = get_specs(config)

    from fishtools.data import load_multiannotation_di

    readme_str = config.as_readme_format()

    dfs = []
    with dtoolcore.DataSetCreator(
        config.output_name, config.output_base_uri
    ) as output_ds:
        for spec in specs:
            # FIXME
            logger.info("Processing n={expid}".format(**spec))
            try:
                # FIXME - naming!
                dataitem = load_multiannotation_di(config, spec)
                df = process_dataitem(dataitem, spec, params, config, output_ds)
                df['expid'] = spec['expid']
                dfs.append(df)
            except FileNotFoundError as err:
                logger.warning(f"Couldn't load: {err}")

        summary_output_abspath = output_ds.prepare_staging_abspath_promise("summary.csv")
        pd.concat(dfs).to_csv(summary_output_abspath, index=False)

        output_ds.put_readme(readme_str)