Example #1
0
def test_get_tree_size(tmpdir):
    """Tree size should grow when symlinked content is followed."""
    tmpdir.mkdir("l1").mkdir("l2").join("test").write("foo")
    tmpdir.mkdir("l3").mkdir("l2").join("test").write("foo")
    root = tmpdir.join("l1")

    # measure before the symlink exists, then add a link to l3 and re-measure
    size_without_links = utils.get_tree_size(root, follow_symlinks=False)
    os.symlink(tmpdir.join("l3"), root.join("l3"))
    size_with_links = utils.get_tree_size(root, follow_symlinks=True)

    assert size_without_links
    assert size_with_links
    assert size_with_links > size_without_links
Example #2
0
    def import_bedfiles(
        cls, technique, targets_path, baits_path, assembly, species, description=None
    ):
        """
        Register targets and baits BED files in a technique's storage directory.

        Arguments:
            technique (str): technique slug.
            targets_path (str): path to targets bedfile.
            baits_path (str): path to baits bedfile.
            assembly (str): name of reference genome for bedfile.
            species (str): name of genome species.
            description (str): a description of the BED files.

        Returns:
            dict: updated technique instance as retrieved from API.
        """
        utils.check_admin()
        instance = api.get_instance("techniques", technique)
        targets_key = f"{assembly}_targets_bedfile"
        baits_key = f"{assembly}_baits_bedfile"

        # refuse to overwrite BED files already registered for this assembly
        if targets_key in instance["reference_data"]:
            raise click.UsageError(
                f"Technique '{instance['slug']}' "
                f"has registered BED files for '{assembly}':\n"
                f"\n\t{instance['reference_data'][targets_key]}"
                f"\n\t{instance['reference_data'][baits_key]}"
            )

        if not instance["storage_url"]:
            instance = update_storage_url("techniques", instance["pk"])

        api.create_instance("assemblies", name=assembly, species=species)
        beds_dir = join(instance["storage_url"], "bed_files", assembly)
        base_name = slugify(f"{instance['slug']}.{assembly}")
        os.makedirs(beds_dir, exist_ok=True)

        # keyed by reference_data key so registration below can reuse the map;
        # insertion order (targets first) matches the original processing order
        bed_map = {
            targets_key: (targets_path, join(beds_dir, f"{base_name}.targets.bed")),
            baits_key: (baits_path, join(beds_dir, f"{base_name}.baits.bed")),
        }

        for src, dst in bed_map.values():
            cls.echo_src_dst("Copying", src, dst)
            shutil.copy(src, dst)
            click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
            cls.process_bedfile(dst)

        click.secho(f"\nSuccess! patching {instance['slug']}...", fg="green")

        # process_bedfile is expected to produce the gzipped version registered here
        for key, (_, dst) in bed_map.items():
            instance["reference_data"][key] = {
                "url": dst + ".gz",
                "description": description,
            }

        return api.patch_instance(
            endpoint="techniques",
            instance_id=instance["pk"],
            storage_usage=utils.get_tree_size(instance["storage_url"]),
            reference_data=instance["reference_data"],
        )
Example #3
0
    def import_data(
        cls,
        identifier,
        data_src,
        data_id,
        symlink,
        description,
        sub_dir=None,
        model="assemblies",
    ):
        """
        Register a reference resource for a given assembly or technique.

        Arguments:
            identifier (str): name of assembly or technique.
            model (str): either `techniques` or `assemblies`.
            data_src (str): path to reference data.
            data_id (str): identifier that will be used for reference data.
            symlink (str): symlink instead of move.
            description (str): reference data description.
            sub_dir (str): target sub dir for the resource, default is data_id.

        Returns:
            dict: updated assembly instance as retrieved from API.
        """
        utils.check_admin()
        data_id = slugify(data_id, separator="_")
        click.echo(f"`data_id` set to: {click.style(data_id, fg='green')}")
        record = api.get_instance(model, identifier)

        # a given data_id may only be registered once per instance
        if data_id in record["reference_data"]:
            raise click.UsageError(
                f"{record['name']} has already reference data registered with id "
                f'"{data_id}":\n\n\t{record["reference_data"][data_id]}'
            )

        if not record["storage_url"]:
            record = update_storage_url(model, record["name"])

        data_dir = join(record["storage_url"], sub_dir or data_id)
        data_dst = join(data_dir, basename(data_src))
        os.makedirs(data_dir, exist_ok=True)

        # either link or move the resource into the instance's storage directory
        transfer, verb = (cls.symlink, "Linking") if symlink else (cls.move, "Moving")
        cls.echo_src_dst(verb, data_src, data_dst)
        transfer(data_src, data_dst)

        click.secho(f"\nSuccess! patching {record['name']}...", fg="green")
        record["reference_data"][data_id] = {
            "url": data_dst,
            "description": description,
        }

        return api.patch_instance(
            endpoint=model,
            instance_id=record["pk"],
            storage_usage=utils.get_tree_size(record["storage_url"]),
            reference_data=record["reference_data"],
        )
Example #4
0
def patch_analysis_status(analysis, status):
    """
    Patch an analysis' status and its status-dependent fields.

    Make sure analysis is owned by admin user and that results field is updated.

    Arguments:
        analysis (dict): analysis instance.
        status (dict): analysis status.

    Returns:
        dict: patched analysis instance.
    """
    storage_url = analysis["storage_url"]
    analysis["status"] = status  # keep the in-memory instance in sync
    _set_analysis_permissions(analysis)
    payload = {"status": status}

    # storage usage only changes once an analysis has actually run
    if status in {"FAILED", "SUCCEEDED", "IN_PROGRESS"}:
        payload["storage_usage"] = utils.get_tree_size(storage_url)

    if status == "STARTED":
        payload["ran_by"] = system_settings.api_username

    if status in {"SUCCEEDED", "IN_PROGRESS"}:
        try:
            payload["results"] = _get_analysis_results(analysis, raise_error=True)
        except Exception as error:  # pragma: no cover
            # record the failure in the API before propagating the error
            payload["status"] = "FAILED"
            patch_instance("analyses", analysis["pk"], **payload)
            raise error

    return patch_instance("analyses", analysis["pk"], **payload)
Example #5
0
    def import_files(self, instance, files, files_data, symlink):
        """
        Move/link files into instance's `storage_url` and update database.

        Arguments:
            instance (dict): experiment instance.
            files (list): list of dicts with `path` and `dtype` keys for
                each file to be imported.
            symlink (bool): whether to symlink or move the data.
            files_data (dict): keys are files basenames and values are
                dicts with extra annotations such as PL, LB, or any other.

        Raises:
            click.UsageError: if multiple data formats are found.

        Returns:
            dict: patched experiment instance.
        """
        import hashlib  # local import: only needed for stable file naming

        raw_data = []
        src_dst = []

        if not instance["storage_url"]:
            instance = update_storage_url(
                endpoint="experiments", identifier=instance["pk"], use_hash=True
            )

        data_dir = join(instance["storage_url"], "data")
        os.makedirs(data_dir, exist_ok=True)

        for src, file_type in [(i["path"], i["dtype"]) for i in files]:
            file_name = basename(src)
            file_data = files_data.get(file_name, {})

            # make sure there are no duplicate file names; use a stable
            # digest of the source directory (the previous builtin hash()
            # is salted per process via PYTHONHASHSEED, which produced a
            # different name for the same file on every run)
            if not file_name.startswith(instance["system_id"]):
                file_hash = hashlib.sha256(dirname(src).encode()).hexdigest()[:16]
                file_name = f'{instance["system_id"]}_{file_hash}_{file_name}'

            # make sure we don't add the same file twice
            if all(i != src for i, _ in src_dst):
                dst = join(data_dir, file_name)
                src_dst.append((src, dst))
                raw_data.append(
                    dict(
                        hash_value=getsize(src),
                        hash_method="os.path.getsize",
                        file_url=dst,
                        file_type=file_type,
                        file_data=self.annotate_file_data(
                            experiment=instance,
                            file_type=file_type,
                            file_data=file_data,
                            src=src,
                            dst=dst,
                        ),
                    )
                )

        # transfer only after all destinations are known
        for src, dst in src_dst:
            if symlink:
                self.symlink(src, dst)
            else:
                self.move(src, dst)

        return api.patch_instance(
            endpoint="experiments",
            instance_id=instance["pk"],
            storage_url=instance["storage_url"],
            storage_usage=utils.get_tree_size(instance["storage_url"]),
            raw_data=sorted(raw_data, key=lambda i: i["file_url"]),
        )
Example #6
0
        def cmd(assembly, symlink, genome_path, dont_index):
            """
            Register an assembly reference genome.

            By default, an attempt to create indexes will be performed.
            """
            assembly = LocalReferenceDataImporter.import_data(
                data_id="genome_fasta",
                symlink=symlink,
                data_src=genome_path,
                identifier=assembly,
                model="assemblies",
                description="Reference Genome Fasta File.",
            )

            genome_fasta = assembly["reference_data"]["genome_fasta"]["url"]
            genome_dir = dirname(genome_fasta)
            commands = [
                ["bwa", "index", genome_fasta],
                ["samtools", "faidx", genome_fasta],
                [
                    "samtools",
                    "dict",
                    genome_fasta,
                    "-a",
                    assembly["name"],
                    "-s",
                    assembly["species"],
                    "-o",
                    genome_fasta + ".dict",  # single-argument join() was a no-op
                ],
            ]

            for command in commands:
                if dont_index:
                    click.secho(
                        f"Skipping indexing:\n\n\t{' '.join(command)}", fg="yellow"
                    )
                    continue

                try:  # pragma: no cover
                    subprocess.check_call(command)
                except subprocess.CalledProcessError:  # pragma: no cover
                    click.secho(
                        f"INDEX FAILED, MUST BE FIXED:\n\n\t{' '.join(command)}",
                        fg="red",
                    )

            # map each indexing tool to the file suffixes it produces;
            # the loop below previously shadowed this dict with its own
            # loop variable (`for i, indexes in indexes.items()`), which
            # only worked because the items view was built before rebinding
            suffixes_by_tool = {
                "bwa index": ["amb", "ann", "bwt", "pac", "sa"],
                "samtools faidx": ["fai"],
                "samtools dict": ["dict"],
            }

            for tool, suffixes in suffixes_by_tool.items():
                for suffix in suffixes:
                    assembly["reference_data"][f"genome_fasta_{suffix}"] = {
                        "url": genome_fasta + f".{suffix}",
                        "description": f"Index generated by: {tool}",
                    }

            # also expose the genome and its indexes as `<assembly name>.<ext>`
            for path in glob(genome_fasta.split(".", 1)[0] + "*"):
                dst = join(genome_dir, assembly["name"] + "." + path.split(".", 1)[-1])
                if path != dst:
                    utils.force_symlink(path, dst)

            api.patch_instance(
                endpoint="assemblies",
                instance_id=assembly["pk"],
                storage_usage=utils.get_tree_size(assembly["storage_url"]),
                reference_data=assembly["reference_data"],
            )