def test_get_tree_size(tmpdir):
    """Tree size is positive and grows when symlinked content is followed."""
    tmpdir.mkdir("l1").mkdir("l2").join("test").write("foo")
    tmpdir.mkdir("l3").mkdir("l2").join("test").write("foo")

    # measure l1 before linking l3 into it, ignoring symlinks
    size_without_links = utils.get_tree_size(tmpdir.join("l1"), follow_symlinks=False)

    # link l3 inside l1, then measure again following symlinks
    os.symlink(tmpdir.join("l3"), tmpdir.join("l1").join("l3"))
    size_with_links = utils.get_tree_size(tmpdir.join("l1"), follow_symlinks=True)

    assert size_without_links
    assert size_with_links
    assert size_with_links > size_without_links
def import_bedfiles(
    cls, technique, targets_path, baits_path, assembly, species, description=None
):
    """
    Register input_bed_path in technique's storage dir and update `data`.

    Arguments:
        technique (str): technique slug.
        targets_path (str): path to targets bedfile.
        baits_path (str): path to baits bedfile.
        assembly (str): name of reference genome for bedfile.
        species (str): name of genome species.
        description (str): a description of the BED files.

    Returns:
        dict: updated technique instance as retrieved from API.
    """
    utils.check_admin()
    technique = api.get_instance("techniques", technique)
    targets_key = f"{assembly}_targets_bedfile"
    baits_key = f"{assembly}_baits_bedfile"

    # refuse to overwrite BED files already registered for this assembly
    if targets_key in technique["reference_data"]:
        raise click.UsageError(
            f"Technique '{technique['slug']}' "
            f"has registered BED files for '{assembly}':\n"
            f'\n\t{technique["reference_data"][targets_key]}'
            f'\n\t{technique["reference_data"][baits_key]}'
        )

    # lazily assign a storage directory on first use
    if not technique["storage_url"]:
        technique = update_storage_url("techniques", technique["pk"])

    api.create_instance("assemblies", name=assembly, species=species)
    beds_dir = join(technique["storage_url"], "bed_files", assembly)
    base_name = slugify(f'{technique["slug"]}.{assembly}')
    targets_dst = join(beds_dir, f"{base_name}.targets.bed")
    baits_dst = join(beds_dir, f"{base_name}.baits.bed")
    os.makedirs(beds_dir, exist_ok=True)

    # copy both BED files into storage and post-process each one
    for src, dst in [(targets_path, targets_dst), (baits_path, baits_dst)]:
        cls.echo_src_dst("Copying", src, dst)
        shutil.copy(src, dst)
        click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
        cls.process_bedfile(dst)

    click.secho(f'\nSuccess! patching {technique["slug"]}...', fg="green")

    for i, j in [(targets_key, targets_dst), (baits_key, baits_dst)]:
        # registered URL is the .gz path — presumably process_bedfile
        # bgzips the copied file; TODO confirm against process_bedfile
        technique["reference_data"][i] = {
            "url": j + ".gz",
            "description": description,
        }

    # persist updated reference data and recomputed storage usage
    return api.patch_instance(
        endpoint="techniques",
        instance_id=technique["pk"],
        storage_usage=utils.get_tree_size(technique["storage_url"]),
        reference_data=technique["reference_data"],
    )
def import_data(
    cls,
    identifier,
    data_src,
    data_id,
    symlink,
    description,
    sub_dir=None,
    model="assemblies",
):
    """
    Register reference resources for a given assembly.

    Arguments:
        identifier (str): name of assembly or technique.
        model (str): either `techniques` or `assemblies`.
        data_src (str): path to reference data.
        data_id (str): identifier that will be used for reference data.
        symlink (str): symlink instead of move.
        description (str): reference data description.
        sub_dir (str): target sub dir for the resource, default is data_id.

    Returns:
        dict: updated assembly instance as retrieved from API.
    """
    utils.check_admin()
    data_id = slugify(data_id, separator="_")
    click.echo(f'`data_id` set to: {click.style(data_id, fg="green")}')
    instance = api.get_instance(model, identifier)

    # refuse duplicate registrations under the same identifier
    if data_id in instance["reference_data"]:
        raise click.UsageError(
            f"{instance['name']} has already reference data registered with id "
            f'"{data_id}":\n\n\t{instance["reference_data"][data_id]}'
        )

    if not instance["storage_url"]:
        instance = update_storage_url(model, instance["name"])

    resource_dir = join(instance["storage_url"], sub_dir or data_id)
    resource_dst = join(resource_dir, basename(data_src))
    os.makedirs(resource_dir, exist_ok=True)

    # either link or physically relocate the resource into storage
    if symlink:
        cls.echo_src_dst("Linking", data_src, resource_dst)
        cls.symlink(data_src, resource_dst)
    else:
        cls.echo_src_dst("Moving", data_src, resource_dst)
        cls.move(data_src, resource_dst)

    click.secho(f'\nSuccess! patching {instance["name"]}...', fg="green")
    instance["reference_data"][data_id] = {
        "url": resource_dst,
        "description": description,
    }

    # persist the new reference entry and recomputed storage usage
    return api.patch_instance(
        endpoint=model,
        instance_id=instance["pk"],
        storage_usage=utils.get_tree_size(instance["storage_url"]),
        reference_data=instance["reference_data"],
    )
def patch_analysis_status(analysis, status):
    """
    Patch a successful analysis.

    Make sure analysis is owned by admin user and that results field is
    updated.

    Arguments:
        analysis (dict): analysis instance.
        status (dict): analysis status.

    Returns:
        dict: patched analysis instance.
    """
    storage_url = analysis["storage_url"]
    analysis["status"] = status  # keep the in-memory instance in sync
    payload = {"status": status}
    _set_analysis_permissions(analysis)

    # recompute disk usage whenever the run reached a sizeable state
    if status in {"FAILED", "SUCCEEDED", "IN_PROGRESS"}:
        payload["storage_usage"] = utils.get_tree_size(storage_url)

    if status == "STARTED":
        payload["ran_by"] = system_settings.api_username

    if status in {"SUCCEEDED", "IN_PROGRESS"}:
        try:
            payload["results"] = _get_analysis_results(analysis, raise_error=True)
        except Exception as error:  # pragma: no cover
            # results collection failed: record the analysis as FAILED,
            # then propagate the original error to the caller
            payload["status"] = "FAILED"
            patch_instance("analyses", analysis["pk"], **payload)
            raise error

    return patch_instance("analyses", analysis["pk"], **payload)
def import_files(self, instance, files, files_data, symlink):
    """
    Move/link files into instance's `storage_url` and update database.

    Arguments:
        instance (dict): experiment instance.
        files (dict): list of files to be imported.
        symlink (dict): whether to symlink or move the data.
        files_data (dict): keys are files basenames and values are dicts with
            extra annotations such as PL, LB, or any other.

    Raises:
        click.UsageError: if multiple data formats are found.

    Returns:
        dict: patched experiment instance.
    """
    # local import: stable digest for generated file names
    import hashlib

    raw_data = []
    src_dst = []

    if not instance["storage_url"]:
        instance = update_storage_url(
            endpoint="experiments", identifier=instance["pk"], use_hash=True
        )

    data_dir = join(instance["storage_url"], "data")
    os.makedirs(data_dir, exist_ok=True)

    for src, file_type in [(i["path"], i["dtype"]) for i in files]:
        file_name = basename(src)
        file_data = files_data.get(file_name, {})

        # make sure there are no duplicate file names; use a stable digest
        # of the source directory (the previous builtin hash() is salted
        # per-process since Python 3.3, so names differed across runs)
        if not file_name.startswith(instance["system_id"]):
            file_hash = hashlib.md5(dirname(src).encode()).hexdigest()[:16]
            file_name = f'{instance["system_id"]}_{file_hash}_{file_name}'

        # make sure we don't add the same file twice
        if all(i != src for i, _ in src_dst):
            dst = join(data_dir, file_name)
            src_dst.append((src, dst))
            raw_data.append(
                dict(
                    hash_value=getsize(src),
                    hash_method="os.path.getsize",
                    file_url=dst,
                    file_type=file_type,
                    file_data=self.annotate_file_data(
                        experiment=instance,
                        file_type=file_type,
                        file_data=file_data,
                        src=src,
                        dst=dst,
                    ),
                )
            )

    # transfer files only after all destinations were validated
    for src, dst in src_dst:
        if symlink:
            self.symlink(src, dst)
        else:
            self.move(src, dst)

    return api.patch_instance(
        endpoint="experiments",
        instance_id=instance["pk"],
        storage_url=instance["storage_url"],
        storage_usage=utils.get_tree_size(instance["storage_url"]),
        raw_data=sorted(raw_data, key=lambda i: i["file_url"]),
    )
def cmd(assembly, symlink, genome_path, dont_index):
    """
    Register an assembly reference genome.

    By default, an attempt to create indexes will be perfomed.
    """
    assembly = LocalReferenceDataImporter.import_data(
        data_id="genome_fasta",
        symlink=symlink,
        data_src=genome_path,
        identifier=assembly,
        model="assemblies",
        description="Reference Genome Fasta File.",
    )

    genome_fasta = assembly["reference_data"]["genome_fasta"]["url"]
    genome_dir = dirname(genome_fasta)
    commands = [
        ["bwa", "index", genome_fasta],
        ["samtools", "faidx", genome_fasta],
        [
            "samtools",
            "dict",
            genome_fasta,
            "-a",
            assembly["name"],
            "-s",
            assembly["species"],
            "-o",
            # single-argument join() was a no-op; plain concatenation
            genome_fasta + ".dict",
        ],
    ]

    for command in commands:
        if dont_index:
            click.secho(f"Skipping indexing:\n\n\t{' '.join(command)}", fg="yellow")
            continue

        try:  # pragma: no cover
            subprocess.check_call(command)
        except subprocess.CalledProcessError:  # pragma: no cover
            click.secho(
                f"INDEX FAILED, MUST BE FIXED:\n\n\t{' '.join(command)}", fg="red"
            )

    # map each indexing tool to the file suffixes it is expected to
    # produce; distinct loop variables avoid rebinding the mapping while
    # iterating it (was: `for i, indexes in indexes.items()`)
    indexes = {
        "bwa index": ["amb", "ann", "bwt", "pac", "sa"],
        "samtools faidx": ["fai"],
        "samtools dict": ["dict"],
    }

    for tool, suffixes in indexes.items():
        for suffix in suffixes:
            assembly["reference_data"][f"genome_fasta_{suffix}"] = {
                "url": genome_fasta + f".{suffix}",
                "description": f"Index generated by: {tool}",
            }

    # expose every genome file under a `<assembly name>.<extension>` alias
    for path in glob(genome_fasta.split(".", 1)[0] + "*"):
        dst = join(genome_dir, assembly["name"] + "." + path.split(".", 1)[-1])

        if path != dst:
            utils.force_symlink(path, dst)

    api.patch_instance(
        endpoint="assemblies",
        instance_id=assembly["pk"],
        storage_usage=utils.get_tree_size(assembly["storage_url"]),
        reference_data=assembly["reference_data"],
    )