def compare_gzip(file1: Path, file2: Path) -> None:
    """
    Assert that two gzip files have identical content.

    Compares the CRC and uncompressed size reported by `gzip -lv` rather than
    the raw bytes, so differing compression levels/timestamps don't matter.

    Args:
        file1: First gzip file to compare.
        file2: Second gzip file to compare.

    Raises:
        AssertionError: If the CRCs and/or uncompressed sizes differ.
    """
    # Local import: this chunk does not show the file's import block.
    import shlex

    def _crc_and_size(path: Path) -> str:
        # Quote the path so spaces/shell metacharacters can't break the
        # pipeline or inject commands. Field 2 is the CRC, field 7 the
        # uncompressed size in `gzip -lv` output.
        quoted = shlex.quote(str(path))
        return subby.sub(
            f"gzip -lv {quoted} | tail -1 | awk '{{print $2\":\"$7}}'"
        )

    if _crc_and_size(file1) != _crc_and_size(file2):
        raise AssertionError(
            f"CRCs and/or uncompressed sizes differ between expected identical "
            f"gzip files {file1}, {file2}"
        )
def diff_default(file1: Path, file2: Path) -> int:
    """
    Default diff command.

    Args:
        file1: First file to compare
        file2: Second file to compare

    Returns:
        Number of different lines.
    """
    # Strip trailing whitespace from every line and guarantee a final newline,
    # so cosmetic differences don't count as diffs.
    normalize = "sed 's/[[:space:]]*$//; $a\\'"
    with tempdir() as temp:
        normalized = []
        for idx, src in enumerate((file1, file2), start=1):
            dest = temp / f"file{idx}"
            subby.run(normalize, stdin=src, stdout=dest)
            normalized.append(dest)
        # GNU diff has `--ignore-trailing-space`, which would make the sed
        # pass unnecessary, but macOS ships BSD diff without that option -
        # hence the normalization above.
        pipeline = [
            f"diff -y --suppress-common-lines {normalized[0]} {normalized[1]}",
            "grep -c '^'",
        ]
        # grep exits 1 when nothing matches, which here just means zero
        # differing lines - treat it as a valid result.
        return int(subby.sub(pipeline, allowed_return_codes=(0, 1)))
def make_comparable(infile, outfile):
    """
    Filter and normalize a VCF-like file so two outputs can be diffed.

    Drops header lines, keeps columns 1-5, 7 and 10, and truncates the
    genotype column at the first ':'. Unless phase is being compared,
    also normalizes the allele separator to '/' and sorts the alleles.

    NOTE(review): relies on module-level `compare_phase` and `GENO_RE`
    defined elsewhere in this file - not visible in this chunk.
    """
    pipeline = ["grep -vE '^#'", "cut -f 1-5,7,10", "cut -d ':' -f 1"]
    text = subby.sub(pipeline, stdin=infile)
    with open(outfile, "wt") as dest:
        if compare_phase:
            dest.write(text)
            return
        # Normalize the allele separator and sort the alleles
        for line in text.splitlines(keepends=True):
            prefix, geno = line.rsplit("\t", 1)
            geno_stripped = geno.rstrip()
            normalized = "/".join(sorted(GENO_RE.split(geno_stripped)))
            dest.write(f"{prefix}\t{normalized}")
            # rstrip removed a trailing newline iff the lengths differ;
            # restore it so line structure is preserved.
            if len(geno) != len(geno_stripped):
                dest.write("\n")
def bam_to_sam(input_bam: Path, output_sam: Path,
               headers: Optional[Iterable[str]] = ("HD", "SQ", "RG"),
               min_mapq: Optional[int] = None,
               sorting: Sorting = Sorting.NONE):
    """
    Use PySAM to convert bam to sam.

    Args:
        input_bam: BAM file to convert.
        output_sam: Path where the SAM file is written.
        headers: Header record types (e.g. "HD", "SQ", "RG") to retain in the
            output; falsy to drop all headers.
        min_mapq: If given, exclude reads with mapping quality below this value.
        sorting: How to sort the body records (NONE, COORDINATE, or by name).
    """
    opts = []
    if headers:
        opts.append("-h")
        headers = set(headers)
    # `is not None` so an explicit 0 is still passed through (a truthiness
    # check would silently drop it).
    if min_mapq is not None:
        opts.extend(["-q", str(min_mapq)])
    sam = pysam.view(*opts, str(input_bam)).rstrip()
    # Replace any randomly assigned readgroups with a common placeholder
    sam = re.sub(r"UNSET-\w*\b", "UNSET-placeholder", sam)
    lines = sam.splitlines(keepends=True)
    header_lines = []
    start = 0
    if headers:
        for i, line in enumerate(lines):
            if not line.startswith("@"):
                start = i
                break
            elif line[1:3] in headers:
                header_lines.append(line)
        else:
            # Every line is a header record; without this the body would
            # re-include the header lines (start would remain 0).
            start = len(lines)
    body_lines = lines[start:]
    if sorting is not Sorting.NONE:
        with tempdir() as temp:
            temp_sam = temp / f"output_{str(output_sam.stem)}.sam"
            with open(temp_sam, "w") as out:
                out.write("".join(body_lines))
            if sorting is Sorting.COORDINATE:
                # Sort by reference name, position, then flag.
                sort_cols = "-k3,3 -k4,4n -k2,2n"
            else:
                # Sort by read name, then flag.
                sort_cols = "-k1,1 -k2,2n"
            # sort reads the file directly - no need to pipe through cat.
            sorted_sam = subby.sub(f"sort {sort_cols} {str(temp_sam)}")
            body_lines = [sorted_sam]
    with open(output_sam, "w") as out:
        out.write("".join(header_lines + body_lines))
def test_sub():
    """Verify subby.sub argument validation and basic pipeline execution."""
    # Invalid argument combinations must raise ValueError:
    # bytes stdin with mode=bytes, and stdin with non-blocking mode.
    invalid_kwargs = (
        {"stdin": b"foo\nbar", "mode": bytes},
        {"stdin": "foo\nbar", "block": False},
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            subby.sub("grep foo | wc -l", **kwargs)
    # A valid pipeline returns its stdout as a stripped string.
    assert subby.sub("grep foo | wc -l", stdin="foo\nbar") == "1"
def _resolve_workflow(
    self,
    wdl_path: Path,
    workflow_name: str,
    kwargs: dict,
) -> dxpy.DXWorkflow:
    """
    Find or build the DNAnexus workflow compiled from a WDL file.

    Checks a class-level cache first, then searches the target project/folder
    for an existing workflow object; the workflow is (re)built with dxWDL when
    forced, when none exists, or when the WDL or any import appears newer
    than the existing workflow.

    Args:
        wdl_path: Path to the WDL source file.
        workflow_name: Name of the workflow object on DNAnexus.
        kwargs: Options - recognized keys include "workflow_project_id"/
            "project_id", "workflow_folder"/"folder", "force", "java_args",
            "extras", and "archive".

    Returns:
        The resolved dxpy.DXWorkflow.

    Raises:
        ExecutorError: If the dxWDL compile command fails.
    """
    # Class-level cache: one compiled workflow per WDL path per process.
    if wdl_path in DxWdlExecutor._workflow_cache:
        return DxWdlExecutor._workflow_cache[wdl_path]
    # Workflow-specific overrides fall back to the generic keys, then to
    # the ambient DNAnexus project context / project root.
    project_id = (
        kwargs.get("workflow_project_id")
        or kwargs.get("project_id", dxpy.PROJECT_CONTEXT_ID)
    )
    folder = kwargs.get("workflow_folder") or kwargs.get("folder", "/")
    # # This probably isn't necessary, since (I think) dxWDL creates the folder
    # # if it doesn't exist
    # if not folder:
    #     folder = "/"
    # else:
    #     # Check that the project exists and create the folder (any any missing
    #     # parents) if it doesn't exist. May also fail if the user does not have
    #     # write access to the project.
    #     project = dxpy.DXProject(project_id)
    #     project.new_folder(folder, parents=True)
    build_workflow = kwargs.get("force", False)
    workflow_id: Optional[str] = None
    if not build_workflow:
        existing_workflow = list(dxpy.find_data_objects(
            classname="workflow",
            name=workflow_name,
            project=project_id,
            folder=folder,
            describe={
                "created": True
            }
        ))
        if not existing_workflow:
            build_workflow = True
        else:
            created = existing_workflow[0]["describe"]["created"]
            # NOTE(review): dxpy "created" timestamps are presumably
            # milliseconds since epoch while st_mtime is seconds - TODO
            # confirm; if so, these staleness comparisons rarely trigger.
            if wdl_path.stat().st_mtime > created:
                build_workflow = True
            elif self._import_dirs:
                # Rebuild if any imported WDL is newer than the workflow.
                # NOTE(review): `break` only exits the inner loop, and if no
                # import is newer, workflow_id is left as None here - verify
                # against the intended control flow (source formatting was
                # collapsed; indentation reconstructed during review).
                for import_dir in self._import_dirs:
                    for imp in import_dir.glob("*.wdl"):
                        if imp.stat().st_mtime > created:
                            build_workflow = True
                            break
            else:
                # Existing workflow is up to date - reuse it.
                workflow_id = existing_workflow[0]["id"]
    if build_workflow:
        java_args = kwargs.get("java_args", self.java_args) or ""
        imports_args = " ".join(f"-imports {d}" for d in self._import_dirs)
        extras = kwargs.get("extras")
        extras_arg = f"-extras {extras}" if extras else ""
        # -a archives any existing workflow of the same name; -f overwrites.
        archive = kwargs.get("archive")
        archive_arg = "-a" if archive else "-f"
        cmd = (
            f"{self.java_bin} {java_args} -jar {self._dxwdl_jar_file} compile "
            f"{wdl_path} -destination {project_id}:{folder} {imports_args} "
            f"{extras_arg} {archive_arg}"
        )
        LOG.info(f"Building workflow with command '{cmd}'")
        try:
            # dxWDL prints the new workflow ID as the last line of stdout.
            workflow_id = subby.sub(cmd).splitlines(False)[-1]
        except subby.core.CalledProcessError as perr:
            raise ExecutorError(
                "dxwdl",
                f"Error building DNAnexus workflow with dxWDL; "
                f"stdout={perr.stdout}; stderr={perr.stderr}"
            ) from perr
    workflow = dxpy.DXWorkflow(workflow_id)
    DxWdlExecutor._workflow_cache[wdl_path] = workflow
    return workflow