Example No. 1
def test_failed_signal():
    analysis = api.create_instance("analyses", **factories.AnalysisFactory())
    get_kwargs = dict(
        target_endpoint="analyses", endpoint="signals", target_id=analysis.pk
    )

    # check signals work and nothing is created
    api._run_signals("analyses", analysis, [besuhof_signal])
    assert len(api.get_instances(**get_kwargs)) == 0

    # check a failed signal is recorded
    analysis = api.patch_instance("analyses", analysis.pk, notes="please fail")
    api._run_signals("analyses", analysis, [besuhof_signal])
    instances = api.get_instances(**get_kwargs)
    assert len(instances) == 1
    assert _FAILED_SIGNAL_MESSAGE in instances[0].data["failure_traceback"]

    # assert that error traceback is updated
    runner = CliRunner()
    args = f"-fi target_endpoint analyses -fi target_id {analysis.pk}".split()
    api.patch_instance("analyses", analysis.pk, notes="fail with different msg")
    runner.invoke(commands.rerun_signals, args, catch_exceptions=False)
    instances = api.get_instances(**get_kwargs)
    assert len(instances) == 1
    assert "but with a different msg..." in instances[0].data["failure_traceback"]

    # assert that signal is deleted after no failure is detected
    api.patch_instance("analyses", analysis.pk, notes="")
    runner.invoke(commands.rerun_signals, args, catch_exceptions=False)
    assert len(api.get_instances(**get_kwargs)) == 0
Example No. 2
def test_get_bed():
    runner = CliRunner()
    technique = api.create_instance("techniques",
                                    **factories.TechniqueFactory())
    args = [str(technique.pk)]
    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "No BED files" in result.output

    api.patch_instance(
        "techniques",
        technique.pk,
        reference_data={"test_targets_bedfile": {
            "url": "/hello/world"
        }},
    )

    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "/hello/world" in result.output

    api.patch_instance(
        "techniques",
        technique.pk,
        reference_data={
            "test_targets_bedfile": {
                "url": "/hello/world"
            },
            "another_targets_bedfile": {
                "url": "/hello/world"
            },
        },
    )

    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "Multiple BEDs" in result.output
Example No. 3
def test_get_experiments_from_default_cli_options(tmpdir):
    app = ExperimentsFromDefaulCLIApplication()
    experiments = [
        api.create_instance("experiments", **factories.ExperimentFactory())
        for i in range(4)
    ]
    analysis = api.create_instance(
        "analyses",
        **{
            **factories.AnalysisFactory(),
            "targets": experiments,
            "references": experiments,
        },
    )

    pairs_file = tmpdir.join("pairs.txt")
    pairs_file.write(experiments[1].system_id + "\t" +
                     experiments[0].system_id + "\n")

    # get coverage for invalid experiments
    api.patch_instance("experiments",
                       experiments[0].system_id,
                       notes="raise validation error")

    command = ExperimentsFromDefaulCLIApplication.as_cli_command()
    runner = CliRunner()
    result = runner.invoke(
        command,
        [
            "--pair",
            experiments[0].system_id,
            experiments[1].system_id,
            "--pairs",
            experiments[2].system_id,
            experiments[3].system_id,
            "--targets-filters",
            "pk",
            experiments[3].pk,
            "--references-filters",
            "pk",
            experiments[2].pk,
            "--analyses-filters",
            "pk",
            analysis.pk,
            "--pairs-from-file",
            str(pairs_file),
        ],
        catch_exceptions=False,
    )
    assert experiments[0].system_id in result.output
    assert "INVALID" in result.output

    # just get coverage for get_job_name
    assert ExperimentsFromDefaulCLIApplication.get_job_name(analysis)
Example No. 4
def test_get_bams():
    runner = CliRunner()
    experiment = api.create_instance("experiments",
                                     **factories.ExperimentFactory())
    args = [str(experiment.pk)]
    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "No bams for" in result.output

    result = runner.invoke(commands.get_bams,
                           args + ["--verbose"],
                           catch_exceptions=False)
    assert experiment.system_id in result.output
    assert "None" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        bam_files={"grch": {
            "url": "/hello/world",
            "analysis": 1
        }},
    )

    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "/hello/world" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        bam_files={
            "a1": {
                "url": "/hello/world",
                "analysis": 1
            },
            "a2": {
                "url": "/hello/mars",
                "analysis": 2
            },
        },
    )

    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "Multiple bams" in result.output

    result = runner.invoke(commands.get_bams,
                           args + ["--assembly", "a2"],
                           catch_exceptions=False)
    assert "/hello/mars" in result.output
Example No. 5
    def import_bedfiles(
        cls, technique, targets_path, baits_path, assembly, species, description=None
    ):
        """
        Register targets and baits bedfiles in technique's storage dir
        and update `reference_data`.

        Arguments:
            technique (str): technique slug.
            targets_path (str): path to targets bedfile.
            baits_path (str): path to baits bedfile.
            assembly (str): name of reference genome for bedfile.
            species (str): name of genome species.
            description (str): a description of the BED files.

        Returns:
            dict: updated technique instance as retrieved from API.
        """
        utils.check_admin()
        technique = api.get_instance("techniques", technique)
        targets_key = f"{assembly}_targets_bedfile"
        baits_key = f"{assembly}_baits_bedfile"

        if targets_key in technique["reference_data"]:
            raise click.UsageError(
                f"Technique '{technique['slug']}' "
                f"has registered BED files for '{assembly}':\n"
                f'\n\t{technique["reference_data"][targets_key]}'
                f'\n\t{technique["reference_data"][baits_key]}'
            )

        if not technique["storage_url"]:
            technique = update_storage_url("techniques", technique["pk"])

        api.create_instance("assemblies", name=assembly, species=species)
        beds_dir = join(technique["storage_url"], "bed_files", assembly)
        base_name = slugify(f'{technique["slug"]}.{assembly}')
        targets_dst = join(beds_dir, f"{base_name}.targets.bed")
        baits_dst = join(beds_dir, f"{base_name}.baits.bed")
        os.makedirs(beds_dir, exist_ok=True)

        for src, dst in [(targets_path, targets_dst), (baits_path, baits_dst)]:
            cls.echo_src_dst("Copying", src, dst)
            shutil.copy(src, dst)
            click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
            cls.process_bedfile(dst)

        click.secho(f'\nSuccess! patching {technique["slug"]}...', fg="green")

        for i, j in [(targets_key, targets_dst), (baits_key, baits_dst)]:
            technique["reference_data"][i] = {
                "url": j + ".gz",
                "description": description,
            }

        return api.patch_instance(
            endpoint="techniques",
            instance_id=technique["pk"],
            storage_usage=utils.get_tree_size(technique["storage_url"]),
            reference_data=technique["reference_data"],
        )
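
A minimal usage sketch for this classmethod; the enclosing importer class name (`BedImporter`), the technique slug, and the paths are hypothetical:

updated = BedImporter.import_bedfiles(
    technique="my-panel",                    # hypothetical technique slug
    targets_path="/data/beds/targets.bed",   # hypothetical path
    baits_path="/data/beds/baits.bed",       # hypothetical path
    assembly="GRCh37",
    species="HUMAN",
    description="Custom capture panel bedfiles.",
)
assert "GRCh37_targets_bedfile" in updated["reference_data"]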
Example No. 6
    def import_data(
        cls,
        identifier,
        data_src,
        data_id,
        symlink,
        description,
        sub_dir=None,
        model="assemblies",
    ):
        """
        Register reference resources for a given assembly or technique.

        Arguments:
            identifier (str): name of assembly or technique.
            model (str): either `techniques` or `assemblies`.
            data_src (str): path to reference data.
            data_id (str): identifier that will be used for reference data.
            symlink (bool): whether to symlink instead of move.
            description (str): reference data description.
            sub_dir (str): target sub dir for the resource, default is data_id.

        Returns:
            dict: updated instance as retrieved from API.
        """
        utils.check_admin()
        data_id = slugify(data_id, separator="_")
        click.echo(f'`data_id` set to: {click.style(data_id, fg="green")}')
        instance = api.get_instance(model, identifier)

        if data_id in instance["reference_data"]:
            raise click.UsageError(
                f"{instance['name']} has already reference data registered with id "
                f'"{data_id}":\n\n\t{instance["reference_data"][data_id]}'
            )

        if not instance["storage_url"]:
            instance = update_storage_url(model, instance["name"])

        data_dir = join(instance["storage_url"], sub_dir or data_id)
        data_dst = join(data_dir, basename(data_src))
        os.makedirs(data_dir, exist_ok=True)

        if symlink:
            cls.echo_src_dst("Linking", data_src, data_dst)
            cls.symlink(data_src, data_dst)
        else:
            cls.echo_src_dst("Moving", data_src, data_dst)
            cls.move(data_src, data_dst)

        click.secho(f'\nSuccess! patching {instance["name"]}...', fg="green")
        instance["reference_data"][data_id] = {}
        instance["reference_data"][data_id]["url"] = data_dst
        instance["reference_data"][data_id]["description"] = description
        return api.patch_instance(
            endpoint=model,
            instance_id=instance["pk"],
            storage_usage=utils.get_tree_size(instance["storage_url"]),
            reference_data=instance["reference_data"],
        )
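
Example No. 14 below drives this method from a CLI command; a direct call might look like this sketch (the data source path and data_id are made up):

assembly = LocalReferenceDataImporter.import_data(
    identifier="GRCh37",
    model="assemblies",
    data_src="/ref/downloads/gc_content.txt",  # hypothetical path
    data_id="gc_content",
    symlink=True,                              # link rather than move
    description="GC content reference track.",
)
assert "gc_content" in assembly["reference_data"]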
Example No. 7
def patch_results(filters, force):
    """Update the results field of many analyses."""
    utils.check_admin()
    skipped = []

    with click.progressbar(
            api.get_instances("analyses", verbose=True, **filters),
            label="Patching analyses...",
    ) as bar:
        for i in bar:
            if force or not i.results:
                results = api._get_analysis_results(i, raise_error=False)
                api.patch_instance("analyses", i.pk, results=results)
            else:  # pragma: no cover
                skipped.append(i)

    if skipped:  # pragma: no cover
        click.echo(
            f"{len(skipped)} analyses had results, use --force to update...")
Example No. 8
    def get_analysis_results(self, analysis):
        target = analysis["targets"][0]
        outdir = analysis["storage_url"]
        multiqc = join(outdir, "multiqc")
        multiqc_data = join(multiqc, "multiqc_data")

        results = {
            "multiqc_html": join(multiqc, "multiqc_report.html"),
            "multiqc_data": join(multiqc_data, "multiqc_data.json"),
            "multiqc_stats": join(multiqc_data, "multiqc_general_stats.txt"),
            "read_length": None,
        }

        for key, i in results.items():
            if key == "multiqc_data":
                continue
            assert i is None or isfile(i), f"Missing result {i}"

        if target["technique"]["category"] == "DNA":
            read_length_column = "MEAN_READ_LENGTH"
            read_length_path = "multiqc_picard_AlignmentSummaryMetrics.txt"
            read_length_path = join(multiqc_data, read_length_path)
        else:
            read_length_column = "Read Length"
            read_length_path = join(multiqc_data, "multiqc_rna_seqc.txt")

        with open(read_length_path) as f:
            row = next(csv.DictReader(f, delimiter="\t"))
            results["read_length"] = float(row[read_length_column])

            if "read_length" in target:
                api.patch_instance(
                    endpoint="experiments",
                    instance_id=target["pk"],
                    read_length=results["read_length"],
                )

        return results
Example No. 9
def test_get_data(tmpdir):
    runner = CliRunner()
    experiment = api.create_instance("experiments",
                                     **factories.ExperimentFactory())
    experiment = data.update_storage_url("experiments", experiment.pk)
    args = [str(experiment.pk)]
    result = runner.invoke(commands.get_data, args, catch_exceptions=False)
    assert "No data for" in result.output

    result = runner.invoke(commands.get_bams,
                           args + ["--verbose"],
                           catch_exceptions=False)
    assert experiment.system_id in result.output
    assert "None" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        raw_data=[
            {
                "file_url": "/hello/world",
                "file_type": "TXT"
            },
            {
                "file_url": "/hello/mars",
                "file_type": "PNG"
            },
        ],
    )

    result = runner.invoke(commands.get_data, args, catch_exceptions=False)
    assert "/hello/world" in result.output
    assert "/hello/mars" in result.output

    result = runner.invoke(commands.get_data,
                           args + ["--dtypes", "TXT"],
                           catch_exceptions=False)
    assert "/hello/mars" not in result.output
Example No. 10
def test_system_id():
    data_a = factories.ExperimentFactory()
    data_b = factories.ExperimentFactory(sample=data_a["sample"])
    instance_a = api.create_instance("experiments", **data_a)
    instance_b = api.create_instance("experiments", **data_b)
    system_ids = [instance_a["system_id"], instance_b["system_id"]]
    assert instance_a["sample"]["pk"] == instance_b["sample"]["pk"]
    assert api.get_instance("experiments",
                            system_ids[0])["pk"] == instance_a["pk"]
    assert len(api.get_instances("experiments", system_ids)) == 2

    instance_a["sample"]["data"]["key"] = "value"
    instance_a["sample"]["notes"] = "a note"
    patched = api.patch_instance("experiments",
                                 instance_a["pk"],
                                 sample=instance_a["sample"])
    assert patched["sample"]["data"]["key"] == "value"
    assert patched["sample"]["notes"] == "a note"
Example No. 11
def test_api_methods():
    endpoint = "diseases"
    diseases = [factories.DiseaseFactory() for _ in range(3)]
    created = [api.create_instance(endpoint, **i) for i in diseases]
    pk = created[0]["pk"]
    pks = [i["pk"] for i in created[:2]]
    patched = api.patch_instance(endpoint, pk, data={"one": 1})

    assert patched["data"]["one"] == 1
    assert api.get_instance(endpoint, pk)["pk"] == pk
    assert api.get_instances(endpoint, pk=pk)[0]["pk"] == pk
    assert api.get_instances_count(endpoint, pk=pk) == 1
    assert len(
        api.get_instances(endpoint)) == api.get_instances_count(endpoint)
    assert len(api.get_instances(endpoint, pks)) == 2
    assert len(api.get_instances(endpoint, pks, pk__in=pks)) == 2
    assert len(api.get_instances(endpoint, pks, pk__in=pks[0])) == 1

    for i in created:
        assert api.delete_instance(endpoint, i["pk"]) is None

    assert api.get_token_headers()["Authorization"]
Example No. 12
def update_experiment_bam_file(experiment, assembly_name, analysis_pk, bam_url):
    """
    Update the default bam for an experiment given the assembly.

    Arguments:
        experiment (dict): experiment dict.
        assembly_name (str): assembly name.
        analysis_pk (int): analysis primary key.
        bam_url (str): bam url.

    Returns:
        dict: patched experiment instance
    """
    utils.check_admin()
    pk = experiment["pk"]
    bam_files = experiment["bam_files"]

    if bam_files.get(assembly_name, None):  # pragma: no cover
        raise click.UsageError(f"Experiment {pk} already has {assembly_name} bam")

    bam_files[assembly_name] = {"url": bam_url, "analysis": analysis_pk}
    return api.patch_instance("experiments", pk, bam_files=bam_files)
Example No. 13
    def import_files(self, instance, files, files_data, symlink):
        """
        Move/link files into instance's `storage_url` and update database.

        Arguments:
            instance (dict): experiment instance.
            files (list): list of dicts with `path` and `dtype` keys for
                the files to be imported.
            symlink (bool): whether to symlink or move the data.
            files_data (dict): keys are file basenames and values are
                dicts with extra annotations such as PL, LB, or any other.

        Raises:
            click.UsageError: if multiple data formats are found.

        Returns:
            dict: patched experiment instance.
        """
        raw_data = []
        src_dst = []

        if not instance["storage_url"]:
            instance = update_storage_url(
                endpoint="experiments", identifier=instance["pk"], use_hash=True
            )

        data_dir = join(instance["storage_url"], "data")
        os.makedirs(data_dir, exist_ok=True)

        for src, file_type in [(i["path"], i["dtype"]) for i in files]:
            file_name = basename(src)
            file_data = files_data.get(file_name, {})

            # make sure there are no duplicate file names
            if not file_name.startswith(instance["system_id"]):
                file_hash = hex(abs(hash(dirname(src))))[2:]
                file_name = f'{instance["system_id"]}_{file_hash}_{file_name}'

            # make sure we don't add the same file twice
            if all(i != src for i, _ in src_dst):
                dst = join(data_dir, file_name)
                src_dst.append((src, dst))
                raw_data.append(
                    dict(
                        hash_value=getsize(src),
                        hash_method="os.path.getsize",
                        file_url=dst,
                        file_type=file_type,
                        file_data=self.annotate_file_data(
                            experiment=instance,
                            file_type=file_type,
                            file_data=file_data,
                            src=src,
                            dst=dst,
                        ),
                    )
                )

        for src, dst in src_dst:
            if symlink:
                self.symlink(src, dst)
            else:
                self.move(src, dst)

        return api.patch_instance(
            endpoint="experiments",
            instance_id=instance["pk"],
            storage_url=instance["storage_url"],
            storage_usage=utils.get_tree_size(instance["storage_url"]),
            raw_data=sorted(raw_data, key=lambda i: i["file_url"]),
        )
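
A minimal sketch, assuming a `LocalDataImporter` instance (see Example No. 16) and made-up paths; each `files` entry carries `path` and `dtype` keys, and `files_data` is keyed by file basename:

importer = data.LocalDataImporter()
experiment = api.get_instance("experiments", "SYSTEM_ID_1")  # hypothetical id
patched = importer.import_files(
    instance=experiment,
    files=[{"path": "/staging/run1/sample_R1.fastq", "dtype": "FASTQ_R1"}],
    files_data={"sample_R1.fastq": {"PU": "TEST_PU"}},  # extra annotations
    symlink=False,  # move instead of link
)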
Example No. 14
        def cmd(assembly, symlink, genome_path, dont_index):
            """
            Register an assembly reference genome.

            By default, an attempt to create indexes will be performed.
            """
            assembly = LocalReferenceDataImporter.import_data(
                data_id="genome_fasta",
                symlink=symlink,
                data_src=genome_path,
                identifier=assembly,
                model="assemblies",
                description="Reference Genome Fasta File.",
            )

            genome_fasta = assembly["reference_data"]["genome_fasta"]["url"]
            genome_dir = dirname(genome_fasta)
            commands = [
                ["bwa", "index", genome_fasta],
                ["samtools", "faidx", genome_fasta],
                [
                    "samtools",
                    "dict",
                    genome_fasta,
                    "-a",
                    assembly["name"],
                    "-s",
                    assembly["species"],
                    "-o",
                    join(genome_fasta + ".dict"),
                ],
            ]
            for i in commands:
                if dont_index:
                    click.secho(f"Skipping indexing:\n\n\t{' '.join(i)}", fg="yellow")
                    continue

                try:  # pragma: no cover
                    subprocess.check_call(i)
                except subprocess.CalledProcessError:  # pragma: no cover
                    click.secho(
                        f"INDEX FAILED, MUST BE FIXED:\n\n\t{' '.join(i)}", fg="red"
                    )

            indexes = {
                "bwa index": ["amb", "ann", "bwt", "pac", "sa"],
                "samtools faidx": ["fai"],
                "samtools dict": ["dict"],
            }

            for tool, extensions in indexes.items():
                for ext in extensions:
                    assembly["reference_data"][f"genome_fasta_{ext}"] = {
                        "url": genome_fasta + f".{ext}",
                        "description": f"Index generated by: {tool}",
                    }

            for i in glob(genome_fasta.split(".", 1)[0] + "*"):
                dst = join(genome_dir, assembly["name"] + "." + i.split(".", 1)[-1])
                if i != dst:
                    utils.force_symlink(i, dst)

            api.patch_instance(
                endpoint="assemblies",
                instance_id=assembly["pk"],
                storage_usage=utils.get_tree_size(assembly["storage_url"]),
                reference_data=assembly["reference_data"],
            )
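
After a successful run, the assembly's `reference_data` is expected to hold the fasta plus one entry per index extension, roughly like this sketch (paths are illustrative):

# {
#     "genome_fasta":      {"url": ".../genome.fasta", "description": "Reference Genome Fasta File."},
#     "genome_fasta_fai":  {"url": ".../genome.fasta.fai", "description": "Index generated by: samtools faidx"},
#     "genome_fasta_bwt":  {"url": ".../genome.fasta.bwt", "description": "Index generated by: bwa index"},
#     "genome_fasta_dict": {"url": ".../genome.fasta.dict", "description": "Index generated by: samtools dict"},
#     ...
# }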
Example No. 15
def update_storage_url(endpoint, identifier, use_hash=False, **data):
    """Make storage directory and return patched instance."""
    data["storage_url"] = get_storage_url(endpoint, identifier, use_hash)
    return api.patch_instance(endpoint, identifier, **data)
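
Example No. 13 above relies on this helper to assign a hashed storage directory to an experiment; a direct call looks like this (pk is made up):

experiment = update_storage_url("experiments", 12345, use_hash=True)
assert experiment["storage_url"]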
Example No. 16
def test_local_data_import(tmpdir):
    dirs = [tmpdir.strpath]
    projects = [api.create_instance("projects", **factories.ProjectFactory())]
    experiments = [
        factories.ExperimentFactory(projects=projects) for i in range(4)
    ]
    experiments = [
        api.create_instance("experiments", **i) for i in experiments
    ]
    keys = [i["pk"] for i in experiments]

    importer = data.LocalDataImporter()
    _, summary = importer.import_data(directories=dirs, pk__in=keys)
    obtained = summary.count("no files matched")
    assert obtained == 4

    # test can't determine type of fastq
    with pytest.raises(click.UsageError) as error:
        path_1 = tmpdir.join(f'{experiments[0]["system_id"]}.fastq')
        path_1.write("foo")
        importer.import_data(directories=dirs, pk__in=keys)

    path_1.remove()
    assert "cant determine fastq type from" in str(error.value)

    # test imports fastq
    path_1 = tmpdir.join(f'{experiments[0]["system_id"]}_R1_foo.fastq')
    path_2 = tmpdir.join(f'{experiments[0]["system_id"]}_R2_foo.fastq')
    path_1.write("foo")
    path_2.write("foo")
    _, summary = importer.import_data(directories=dirs,
                                      pk__in=keys,
                                      commit=True)
    assert "samples matched: 1" in summary
    assert api.Experiment(experiments[0].pk).get_fastq()

    # test can exclude formats
    path_1 = tmpdir.join(f'{experiments[1]["system_id"]}_1.fastq')
    path_2 = tmpdir.join(f'{experiments[1]["system_id"]}.bam')
    path_1.write("foo")
    path_2.write("foo")
    _, summary = importer.import_data(directories=dirs,
                                      pk__in=keys,
                                      dtypes=["BAM"])
    assert "FASTQ_R1" not in str(summary)
    assert "BAM" in str(summary)

    # test can import multiple formats
    _, summary = importer.import_data(directories=dirs,
                                      pk__in=keys,
                                      commit=True)
    assert "FASTQ_R1" in str(summary)
    assert "BAM" in str(summary)

    # test raise error if duplicated ids
    with pytest.raises(click.UsageError) as error:
        api.patch_instance("experiments",
                           experiments[2]["pk"],
                           identifier="dup_id")
        api.patch_instance("experiments",
                           experiments[3]["pk"],
                           identifier="dup_id")
        importer.import_data(key=lambda x: x["identifier"],
                             directories=dirs,
                             pk__in=keys)

    assert "same identifier for" in str(error.value)

    # test summary
    path_1 = tmpdir.join(f'_{experiments[2]["system_id"]}_cram1_.cram')
    path_2 = tmpdir.join(f'_{experiments[2]["system_id"]}_cram2_.cram')
    path_3 = tmpdir.join(f'_{experiments[3]["system_id"]}_bam1_.bam')
    path_4 = tmpdir.join(f'_{experiments[3]["system_id"]}_bam2_.bam')
    path_1.write("foo")
    path_2.write("foo")
    path_3.write("foo")
    path_4.write("foo")
    imported, summary = importer.import_data(directories=dirs,
                                             commit=True,
                                             symlink=True,
                                             pk__in=keys)

    project = api.get_instance("projects", projects[0]["pk"])
    assert project["storage_url"]
    assert imported[0]["storage_usage"] > 0
    assert imported[0]["raw_data"]
    assert imported[1]["raw_data"]
    assert "experiments" in imported[1]["storage_url"]
    assert len(os.listdir(os.path.join(imported[1]["storage_url"],
                                       "data"))) == 2
    assert "samples matched: 2" in summary
    assert "samples skipped: 2" in summary

    # test import data from command line and files_data functionality
    path_1 = tmpdir.join(f'{experiments[1]["system_id"]}_1.fastq')
    path_2 = tmpdir.join(f'{experiments[1]["system_id"]}_2.fastq')
    path_1.write("foo")
    path_2.write("foo")
    api.patch_instance("experiments", experiments[1]["pk"], raw_data=None)
    file_data = tmpdir.join("file_data.yaml")

    with open(file_data.strpath, "w") as f:
        yaml.dump(
            {
                os.path.basename(path_1.strpath): {
                    "PU": "TEST_PU"
                },
                os.path.basename(path_2.strpath): {
                    "PU": "TEST_PU"
                },
            },
            f,
            default_flow_style=False,
        )

    command = data.LocalDataImporter.as_cli_command()
    runner = CliRunner()
    args = [
        "-di",
        tmpdir.strpath,
        "-id",
        "system_id",
        "-fi",
        "pk__in",
        keys,
        "--files-data",
        file_data.strpath,
        "--commit",
    ]

    result = runner.invoke(command, args, catch_exceptions=False)
    assert "samples matched: 1" in result.output
    experiments[1] = api.get_instance("experiments", experiments[1]["pk"])
    assert experiments[1]["raw_data"][0]["file_data"]["PU"] == "TEST_PU"
    assert experiments[1]["raw_data"][1]["file_data"]["PU"] == "TEST_PU"

    # test import using invalid identifier
    args = ["-di", tmpdir.strpath, "-id", "sample", "-fi", "pk__in", keys]
    result = runner.invoke(command, args)
    assert "invalid type for identifier" in result.output