Example #1
from pathlib import Path

import subby


def compare_gzip(file1: Path, file2: Path):
    # `gzip -lv` reports one line per file; `tail -1` grabs it and awk
    # prints the CRC ($2) and uncompressed size ($7), which together
    # fingerprint the contents regardless of compression level.
    crc_size1 = subby.sub(f"gzip -lv {file1} | tail -1 | awk '{{print $2\":\"$7}}'")
    crc_size2 = subby.sub(f"gzip -lv {file2} | tail -1 | awk '{{print $2\":\"$7}}'")
    if crc_size1 != crc_size2:  # TODO: test this
        raise AssertionError(
            f"CRCs and/or uncompressed sizes differ between expected identical "
            f"gzip files {file1}, {file2}"
        )
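
The same pipeline can be expressed without a "|"-delimited string: as Examples #2 and #3 below show, subby also accepts a list of commands and chains them via pipes. A minimal sketch (the file name is hypothetical):

import subby

# Equivalent to the shell-style string above, with the pipeline given
# as a list of commands ("archive.gz" is a made-up path).
crc_size = subby.sub([
    "gzip -lv archive.gz",
    "tail -1",
    "awk '{print $2\":\"$7}'",
])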
Example #2
from pathlib import Path

import subby

# Note: `tempdir` is assumed to be a context manager that yields a
# temporary directory as a Path; a sketch follows this example.


def diff_default(file1: Path, file2: Path) -> int:
    """
    Default diff command.

    Args:
        file1: First file to compare
        file2: Second file to compare

    Returns:
        Number of different lines.
    """
    with tempdir() as temp:
        # Remove trailing whitespace, and ensure a newline at the end of the file
        cmp_file1 = temp / "file1"
        cmp_file2 = temp / "file2"
        subby.run("sed 's/[[:space:]]*$//; $a\\'", stdin=file1, stdout=cmp_file1)
        subby.run("sed 's/[[:space:]]*$//; $a\\'", stdin=file2, stdout=cmp_file2)

        # diff - it would be possible to do this without sed using GNU diff with the
        # `--ignore-trailing-space` option, but unfortunately that option is not
        # available in macOS diff, which provides BSD versions of the tools by default.
        cmds = [
            f"diff -y --suppress-common-lines {cmp_file1} {cmp_file2}",
            "grep -c '^'"
        ]

        # It's a valid result to have no lines match, so allow a grep returncode of 1
        return int(subby.sub(cmds, allowed_return_codes=(0, 1)))
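
Both this example and Example #4 rely on a tempdir helper that is not shown. A minimal sketch of what it is assumed to do, wrapping tempfile.TemporaryDirectory and yielding a Path:

from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory


@contextmanager
def tempdir():
    # Create a temporary directory, yield it as a Path, and clean it up
    # when the with-block exits.
    with TemporaryDirectory() as d:
        yield Path(d)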
Example #3
import re

import subby

# Assumptions: GENO_RE splits a genotype on the phased ("|") or unphased
# ("/") allele separator, and `compare_phase` was captured from the
# enclosing scope in the original code; it is made a parameter here.
GENO_RE = re.compile(r"[/|]")


def make_comparable(infile, outfile, compare_phase: bool):
    # Drop header lines, keep VCF columns 1-5 (CHROM, POS, ID, REF, ALT),
    # 7 (FILTER), and 10 (first sample), then keep only the GT subfield
    # (everything before the first ':')
    cmd = ["grep -vE '^#'", "cut -f 1-5,7,10", "cut -d ':' -f 1"]
    output = subby.sub(cmd, stdin=infile)
    with open(outfile, "wt") as out:
        if compare_phase:
            out.write(output)
        else:
            # Normalize the allele separator and sort the alleles
            for row in output.splitlines(keepends=True):
                r, g = row.rsplit("\t", 1)
                g_strip = g.rstrip()
                g_norm = "/".join(sorted(GENO_RE.split(g_strip)))
                out.write(f"{r}\t{g_norm}")
                # Restore the newline if one was stripped with the genotype
                if len(g) != len(g_strip):
                    out.write("\n")
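
A quick sanity check of the normalization under the assumed GENO_RE above: phased ("|") and unphased ("/") encodings of the same genotype collapse to a single form.

assert "/".join(sorted(GENO_RE.split("T|C"))) == "C/T"
assert "/".join(sorted(GENO_RE.split("C/T"))) == "C/T"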
Example #4
import re
from enum import Enum, auto
from pathlib import Path
from typing import Iterable, Optional

import pysam
import subby

# `tempdir` is the helper sketched after Example #2.


class Sorting(Enum):
    # Assumed definition; the original imports this from elsewhere.
    NONE = auto()
    COORDINATE = auto()
    QUERYNAME = auto()


def bam_to_sam(input_bam: Path,
               output_sam: Path,
               headers: Optional[Iterable[str]] = ("HD", "SQ", "RG"),
               min_mapq: Optional[int] = None,
               sorting: Sorting = Sorting.NONE):
    """
    Use PySAM to convert bam to sam.
    """
    opts = []
    if headers:
        opts.append("-h")  # include header lines in the output
        headers = set(headers)
    if min_mapq:
        opts.extend(["-q", str(min_mapq)])  # drop reads below this MAPQ
    sam = pysam.view(*opts, str(input_bam)).rstrip()
    # Replace any randomly assigned readgroups with a common placeholder
    sam = re.sub(r"UNSET-\w*\b", "UNSET-placeholder", sam)

    lines = sam.splitlines(keepends=True)
    header_lines = []
    start = 0
    if headers:
        # Headers precede the body; keep only the requested record types
        # and note where the body starts. Default to len(lines) so that a
        # header-only file doesn't duplicate the headers into the body.
        start = len(lines)
        for i, line in enumerate(lines):
            if not line.startswith("@"):
                start = i
                break
            elif line[1:3] in headers:
                header_lines.append(line)

    body_lines = lines[start:]
    if sorting is not Sorting.NONE:
        with tempdir() as temp:
            temp_sam = temp / f"output_{output_sam.stem}.sam"
            with open(temp_sam, "w") as out:
                out.write("".join(body_lines))
            if sorting is Sorting.COORDINATE:
                # Reference name, position, then flag (SAM columns 3, 4, 2)
                sort_cols = "-k3,3 -k4,4n -k2,2n"
            else:
                # Read name, then flag (SAM columns 1, 2)
                sort_cols = "-k1,1 -k2,2n"
            sorted_sam = subby.sub(f"sort {sort_cols} {temp_sam}")
            body_lines = [sorted_sam]

    with open(output_sam, "w") as out:
        out.write("".join(header_lines + body_lines))
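
A hypothetical invocation under the Sorting enum assumed above: write a coordinate-sorted SAM keeping only the @HD/@SQ/@RG headers and reads with MAPQ of at least 30.

bam_to_sam(
    Path("sample.bam"),  # made-up input path
    Path("sample.sam"),  # made-up output path
    min_mapq=30,
    sorting=Sorting.COORDINATE,
)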
Example #5
import pytest
import subby


def test_sub():
    # sub() returns the output as text and must block to collect it,
    # so bytes mode and block=False are both rejected with ValueError.
    with pytest.raises(ValueError):
        subby.sub("grep foo | wc -l", stdin=b"foo\nbar", mode=bytes)
    with pytest.raises(ValueError):
        subby.sub("grep foo | wc -l", stdin="foo\nbar", block=False)
    assert subby.sub("grep foo | wc -l", stdin="foo\nbar") == "1"
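
The block=False case that sub rejects is the domain of subby.run; a minimal sketch, assuming the returned Processes handle exposes block() and output as described in subby's README:

import subby

# Start the pipeline without waiting for it to finish.
p = subby.run("echo foo | wc -c", block=False)
# ... do other work, then wait and read the captured output (bytes).
p.block()
assert p.output == b"4"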
Example #6
    def _resolve_workflow(
        self,
        wdl_path: Path,
        workflow_name: str,
        kwargs: dict,
    ) -> dxpy.DXWorkflow:
        if wdl_path in DxWdlExecutor._workflow_cache:
            return DxWdlExecutor._workflow_cache[wdl_path]

        project_id = (
            kwargs.get("workflow_project_id") or
            kwargs.get("project_id", dxpy.PROJECT_CONTEXT_ID)
        )
        folder = kwargs.get("workflow_folder") or kwargs.get("folder", "/")

        # # This probably isn't necessary, since (I think) dxWDL creates the folder
        # # if it doesn't exist
        # if not folder:
        #     folder = "/"
        # else:
        #     # Check that the project exists and create the folder (and any missing
        #     # parents) if it doesn't exist. May also fail if the user does not have
        #     # write access to the project.
        #     project = dxpy.DXProject(project_id)
        #     project.new_folder(folder, parents=True)

        build_workflow = kwargs.get("force", False)
        workflow_id = None

        if not build_workflow:
            existing_workflow = list(dxpy.find_data_objects(
                classname="workflow",
                name=workflow_name,
                project=project_id,
                folder=folder,
                describe={
                    "created": True
                }
            ))

            if not existing_workflow:
                build_workflow = True
            else:
                created = existing_workflow[0]["describe"]["created"]
                if wdl_path.stat().st_mtime > created:
                    build_workflow = True
                else:
                    # Rebuild if any imported WDL is newer than the existing
                    # workflow. The inner `break` only exits the inner loop,
                    # so propagate it; the for/else reuses the existing
                    # workflow only when nothing needs rebuilding.
                    for import_dir in self._import_dirs:
                        for imp in import_dir.glob("*.wdl"):
                            if imp.stat().st_mtime > created:
                                build_workflow = True
                                break
                        if build_workflow:
                            break
                    else:
                        workflow_id = existing_workflow[0]["id"]

        if build_workflow:
            java_args = kwargs.get("java_args", self.java_args) or ""
            imports_args = " ".join(f"-imports {d}" for d in self._import_dirs)
            extras = kwargs.get("extras")
            extras_arg = f"-extras {extras}" if extras else ""
            archive = kwargs.get("archive")
            archive_arg = "-a" if archive else "-f"

            cmd = (
                f"{self.java_bin} {java_args} -jar {self._dxwdl_jar_file} compile "
                f"{wdl_path} -destination {project_id}:{folder} {imports_args} "
                f"{extras_arg} {archive_arg}"
            )

            LOG.info(f"Building workflow with command '{cmd}'")

            try:
                # dxWDL prints the ID of the compiled workflow on the
                # last line of its output
                workflow_id = subby.sub(cmd).splitlines(False)[-1]
            except subby.core.CalledProcessError as perr:
                raise ExecutorError(
                    "dxwdl",
                    f"Error building DNAnexus workflow with dxWDL; "
                    f"stdout={perr.stdout}; stderr={perr.stderr}"
                ) from perr

        workflow = dxpy.DXWorkflow(workflow_id)
        DxWdlExecutor._workflow_cache[wdl_path] = workflow
        return workflow
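
The staleness check above, distilled into a standalone sketch (a hypothetical helper; created must be a timestamp in the same units as st_mtime):

from pathlib import Path
from typing import Iterable


def needs_rebuild(wdl_path: Path, import_dirs: Iterable[Path], created: float) -> bool:
    # Rebuild when the WDL itself, or any .wdl file under an import
    # directory, was modified after the existing workflow was created.
    if wdl_path.stat().st_mtime > created:
        return True
    return any(
        imp.stat().st_mtime > created
        for d in import_dirs
        for imp in d.glob("*.wdl")
    )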