def collection_fixture(request):
    """Build a throwaway collection of three crop datasets for tests.

    Creates a temporary directory holding a dtoolcore Collection with
    datasets "rice", "wheat" and "barley"; each dataset contains three
    small text files and an up-to-date manifest.  The whole tree is
    removed again by a pytest finalizer.

    Returns the path to the collection root.
    """
    root = tempfile.mkdtemp()

    coll = dtoolcore.Collection()
    coll.persist_to_path(root)

    for crop in ["rice", "wheat", "barley"]:
        crop_dir = os.path.join(root, crop)
        os.mkdir(crop_dir)

        ds = dtoolcore.DataSet(crop, "data")
        ds.persist_to_path(crop_dir)

        for stage in ["sow", "grow", "harvest"]:
            target = os.path.join(crop_dir, "data", stage + ".txt")
            with open(target, "w") as fh:
                fh.write("{} {}\n".format(stage, crop))

        # Manifest must be refreshed after the data files are written.
        ds.update_manifest()

    @request.addfinalizer
    def teardown():
        shutil.rmtree(root)

    return root
def dataset_fixture(request):
    """Build a throwaway dtoolcore DataSet named "test" for tests.

    The dataset lives in a temporary directory, holds two small text
    files ("hello.txt", "world.txt") and has an up-to-date manifest.
    A pytest finalizer removes the directory afterwards.

    Returns the path to the dataset root.
    """
    root = tempfile.mkdtemp()

    ds = dtoolcore.DataSet("test", "data")
    ds.persist_to_path(root)

    for word in ["hello", "world"]:
        target = os.path.join(root, "data", word + ".txt")
        with open(target, "w") as fh:
            fh.write(word)

    # Manifest must be refreshed after the data files are written.
    ds.update_manifest()

    @request.addfinalizer
    def teardown():
        shutil.rmtree(root)

    return root
def template(dataset_path, new_dataset_path):
    """Create new empty dataset with metadata from existing dataset.

    :param dataset_path: path to an existing dtoolcore dataset whose
        descriptive metadata (README) is used as the template
    :param new_dataset_path: path at which the new, empty dataset is
        created; must not exist, but its parent directory must
    :raises click.BadParameter: if ``new_dataset_path`` already exists
        or its parent directory is missing
    """
    parent_dataset = dtoolcore.DataSet.from_path(dataset_path)

    new_dataset_path = os.path.abspath(new_dataset_path)
    output_dir, dataset_name = os.path.split(new_dataset_path)

    # There are ways of doing this that result in error messages where
    # the specific offending argument is highlighted.
    # http://click.pocoo.org/5/options/#callbacks-for-validation
    if os.path.exists(new_dataset_path):
        raise click.BadParameter(
            "Path already exists: {}".format(new_dataset_path)
        )
    if not os.path.isdir(output_dir):
        raise click.BadParameter(
            "Output directory does not exist: {}".format(output_dir)
        )

    # Create empty dataset
    new_dataset = dtoolcore.DataSet(dataset_name, data_directory="data")
    os.mkdir(new_dataset_path)
    new_dataset.persist_to_path(new_dataset_path)

    # Template the descriptive metadata.
    with open(parent_dataset.abs_readme_path) as fh:
        # SafeLoader: plain yaml.load() is deprecated and can construct
        # arbitrary Python objects from tagged input; the README only
        # needs plain mappings/scalars.  An empty README parses to None,
        # hence the "or {}" fallback.
        descriptive_metadata = yaml.load(fh, Loader=yaml.SafeLoader) or {}

    # Need explicit call to str() to ensure pyyaml does not mark up data with
    # Python types.
    descriptive_metadata["dataset_name"] = str(dataset_name)
    descriptive_metadata["creation_date"] = str(datetime.date.today())
    descriptive_metadata["parent_dataset"] = dict(
        path=parent_dataset._abs_path,
        uuid=str(parent_dataset.uuid))

    with open(new_dataset.abs_readme_path, "w") as fh:
        yaml.dump(
            descriptive_metadata,
            fh,
            explicit_start=True,
            default_flow_style=False)
def dminify(dataset_path, new_dataset_path, n):
    """Create a new dataset containing minified copies of the fastq files.

    Every ``.fq`` / ``.fq.gz`` file in the parent dataset is passed
    through ``minify`` (keeping ``n`` reads) into a freshly created
    dataset at ``new_dataset_path``; the parent README is copied across
    with a ``minified_from`` line appended.

    :param dataset_path: path to the existing parent dataset
    :param new_dataset_path: path at which the minified dataset is
        created; must not exist, but its parent directory must
    :param n: number of reads passed through to ``minify``
    :raises click.BadParameter: if ``new_dataset_path`` already exists
        or its parent directory is missing
    """
    parent_dataset = dtoolcore.DataSet.from_path(dataset_path)

    # abspath first (as in template()) so a trailing slash cannot yield
    # an empty dataset_name from os.path.split.
    new_dataset_path = os.path.abspath(new_dataset_path)
    output_dir, dataset_name = os.path.split(new_dataset_path)

    # There are ways of doing this that result in error messages where
    # the specific offending argument is highlighted.
    # http://click.pocoo.org/5/options/#callbacks-for-validation
    if os.path.exists(new_dataset_path):
        raise click.BadParameter(
            "Path already exists: {}".format(new_dataset_path)
        )
    if not os.path.isdir(output_dir):
        raise click.BadParameter(
            "Output directory does not exist: {}".format(output_dir)
        )

    # Create and persist the output dataset BEFORE minifying: the
    # original wrote minified files into <new_dataset_path>/data before
    # either directory existed, which fails unless minify() creates
    # them itself.  Mirrors the mkdir-then-persist order in template().
    output_dataset = dtoolcore.DataSet(dataset_name, 'data')
    os.mkdir(new_dataset_path)
    output_dataset.persist_to_path(new_dataset_path)

    output_dataset_data_dir = os.path.join(new_dataset_path, 'data')
    for entry in parent_dataset.manifest['file_list']:
        if is_file_extension_in_list(entry['path'], ['.fq', '.fq.gz']):
            output_file_path = os.path.join(
                output_dataset_data_dir,
                entry['path']
            )
            identifier = entry['hash']
            input_file_path = parent_dataset.abspath_from_identifier(
                identifier
            )
            minify(input_file_path, output_file_path, n)

    # Manifest is built only after all minified files are in place.
    output_dataset.update_manifest()

    with open(parent_dataset.abs_readme_path, 'r') as ifh:
        with open(output_dataset.abs_readme_path, 'w') as ofh:
            ofh.write(ifh.read())
            ofh.write("minified_from: {}\n".format(parent_dataset.uuid))