Example #1
def _metadata_to_dataset(metadata_xml):
    output = tempfile.NamedTemporaryFile(suffix=".hdfsubreadset.xml").name
    log.debug("Generating temporary dataset: {x}".format(x=output))

    cmd = "{m} {p} {o}".format(m=Constants.RS_MOVIE_TO_DS, p=metadata_xml, o=output)

    # the output from movie-metadata-to-dataset is not properly wrapped in pbds namespace,
    # but the tempfile indicated in the stdout is. Not sure why there are two
    # outputs
    stderr_path = tempfile.NamedTemporaryFile(suffix=".stderr").name
    stderr_fh = open(stderr_path, "w")

    run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=stderr_fh)
    stderr_fh.close()

    with open(stderr_path, "r") as f:
        stderr = f.readlines()

    def _get_tmpfile(stderr):
        for line in stderr:
            path = line.split(" ")[-1].rstrip()
            if os.path.exists(path):
                if is_dataset(path):
                    return path

    tmp_dataset_xml = _get_tmpfile(stderr)
    return tmp_dataset_xml
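The helper is_dataset() used above is not defined in this snippet. A minimal sketch, assuming it only needs to decide whether a path parses as a dataset XML (the real implementation may differ), could look like this:

import xml.etree.ElementTree as ET

def is_dataset(path):
    # Hypothetical check: treat any parseable XML whose root tag ends in "Set"
    # (e.g. HdfSubreadSet, SubreadSet) as a PacBio dataset file.
    if not path.endswith(".xml"):
        return False
    try:
        root = ET.parse(path).getroot()
    except ET.ParseError:
        return False
    return root.tag.split("}")[-1].endswith("Set")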
Example #2
def archive_files(input_file_names, output_file_name, remove_path=True):
    """
    Create a gzipped tarball from a list of input files.

    :param remove_path: if True, the directory will be removed from the input
                        file names before archiving.  All inputs and the output
                        file must be in the same directory for this to work.
    """
    if remove_path:
        input_file_names = [op.basename(fn) for fn in input_file_names]
    args = ["tar", "-czf", output_file_name] + input_file_names
    log.info("Running '{a}'".format(a=" ".join(args)))
    _cwd = os.getcwd()
    try:
        # we want the files to have no leading path
        os.chdir(op.dirname(output_file_name))
        result = run_cmd(" ".join(args),
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
    except Exception:
        raise
    else:
        if result.exit_code != 0:
            return result.exit_code
    finally:
        os.chdir(_cwd)
    assert op.isfile(output_file_name)
    return 0
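Because archive_files() strips leading paths by default and chdirs into the output directory, every input must sit in the same directory as the tarball. A short usage sketch with hypothetical file names:

# Hypothetical paths; all inputs share a directory with the output tarball.
inputs = ["/tmp/job_output/report.json", "/tmp/job_output/job.log"]
exit_code = archive_files(inputs, "/tmp/job_output/results.tar.gz")
assert exit_code == 0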
Example #3
def _run_cmd(cmd):
    print(cmd)
    result = run_cmd(cmd, sys.stdout, sys.stderr)
    if result.exit_code != 0:
        print(result)
        raise ValueError("Failed to generate TC from {c}".format(c=cmd))
    return result
Example #4
def run_fasta_to_reference(input_file_name,
                           output_file_name,
                           organism=None,
                           reference_name=None,
                           ploidy="haploid"):
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference", "--organism",
        str(organism) if organism else "unknown", "--ploidy",
        str(ploidy) if ploidy else "unknown", "--debug", fasta_file_name,
        output_dir_name, reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
Example #5
def run_fasta_to_referenceset(input_file_name, output_file_name):
    args = ["dataset create", "--type ReferenceSet", "--generateIndices",
            output_file_name, input_file_name]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args), stdout_fh = sys.stdout,
                     stderr_fh=sys.stderr)
    # the '.py' name difference will be resolved in pbdataset/pbcoretools, but
    # for now, work with either
    if result.exit_code == 127:
        args = ["dataset.py create", "--type ReferenceSet",
                "--generateIndices",
                output_file_name, input_file_name]
        log.info(" ".join(args))
        result = run_cmd(" ".join(args), stdout_fh = sys.stdout,
                         stderr_fh=sys.stderr)
    return result.exit_code
Example #6
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(".subreadset.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
Example #7
def _run_bax_to_bam(input_file_name, output_file_name):
    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        input_file_name_tmp = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml").name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(input_file_name_tmp)
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    with SubreadSet(output_file_name) as ds:
        ds.assertIndexed()
    return 0
Example #8
def run_bam_to_fastx(program_name, input_file_name, output_file_name):
    def _splitext(path):
        base, ext = os.path.splitext(path)
        if ext == ".gz":
            base, ext2 = os.path.splitext(base)
            ext = ext2 + ext
        return base, ext
    args = [
        program_name,
        "-o", _splitext(output_file_name)[0],
        input_file_name,
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        if not output_file_name.endswith(".gz"):
            output_file_name_ = output_file_name + ".gz"
            with gzip.open(output_file_name_) as f_in:
                with open(output_file_name, "w") as f_out:
                    f_out.write(f_in.read())
    return 0
Example #9
def __run_fasta_to_reference(program_name, dataset_class,
                             input_file_name, output_file_name,
                             organism=None, reference_name=None,
                             ploidy="haploid"):
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        program_name,
        "--organism", str(organism) if organism != "" else "unknown",
        "--ploidy", str(ploidy) if ploidy != "" else "unknown",
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args), stdout_fh=sys.stdout, stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name,
                       "{t}.xml".format(t=dataset_class.__name__.lower()))
    assert op.isfile(ref_file), ref_file
    with dataset_class(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final {t} to {f}".format(
                 f=output_file_name, t=dataset_class.__name__))
        ds_ref.write(output_file_name)
    return 0
Example #10
def archive_files(input_file_names, output_file_name, remove_path=True):
    """
    Create a gzipped tarball from a list of input files.

    :param remove_path: if True, the directory will be removed from the input
                        file names before archiving.  All inputs and the output
                        file must be in the same directory for this to work.
    """
    if remove_path:
        input_file_names = [op.basename(fn) for fn in input_file_names]
    args = ["tar", "-czf", output_file_name] + input_file_names
    log.info("Running '{a}'".format(a=" ".join(args)))
    _cwd = os.getcwd()
    try:
        # we want the files to have no leading path
        os.chdir(op.dirname(output_file_name))
        result = run_cmd(" ".join(args),
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
    except Exception:
        raise
    else:
        if result.exit_code != 0:
            return result.exit_code
    finally:
        os.chdir(_cwd)
    assert op.isfile(output_file_name)
    return 0
Example #11
def _run_bax_to_bam(input_file_name, output_file_name):
    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        input_file_name_tmp = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml").name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(input_file_name_tmp)
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    with SubreadSet(output_file_name) as ds:
        ds.assertIndexed()
    return 0
Example #12
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(".subreadset.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
Example #13
def _run_cmd(cmd):
    print(cmd)
    result = run_cmd(cmd, sys.stdout, sys.stderr)
    if result.exit_code != 0:
        print(result)
        raise ValueError("Failed to generate TC from {c}".format(c=cmd))
    return result
Example #14
def _run_bax_to_bam(input_file_name, output_file_name):
    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        input_file_name_tmp = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml").name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(input_file_name_tmp)
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    shutil.move(output_file_name, tmp)
    # FIXME it would be better to leave this to bax2bam
    with SubreadSet(tmp) as ds:
        if not ds.isIndexed:
            ds.induceIndices()
        ds.write(output_file_name)
    return 0
Example #15
def run_fasta_to_fofn(input_file_name, output_file_name):
    args = ["echo", input_file_name, ">", output_file_name]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    return result.exit_code
Example #16
def run_bam_to_fastx(program_name, input_file_name, output_file_name):
    def _splitext(path):
        base, ext = os.path.splitext(path)
        if ext == ".gz":
            base, ext2 = os.path.splitext(base)
            ext = ext2 + ext
        return base, ext

    args = [
        program_name,
        "-o",
        _splitext(output_file_name)[0],
        input_file_name,
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        if not output_file_name.endswith(".gz"):
            output_file_name_ = output_file_name + ".gz"
            with gzip.open(output_file_name_) as f_in:
                with open(output_file_name, "w") as f_out:
                    f_out.write(f_in.read())
    return 0
Example #17
def run_fasta_to_referenceset(input_file_name, output_file_name):
    args = ["dataset create", "--type ReferenceSet", "--generateIndices",
            output_file_name, input_file_name]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args), stdout_fh = sys.stdout,
                     stderr_fh=sys.stderr)
    # the '.py' name difference will be resolved in pbdataset/pbcoretools, but
    # for now, work with either
    if result.exit_code == 127:
        args = ["dataset.py create", "--type ReferenceSet",
                "--generateIndices",
                output_file_name, input_file_name]
        log.info(" ".join(args))
        result = run_cmd(" ".join(args), stdout_fh = sys.stdout,
                         stderr_fh=sys.stderr)
    return result.exit_code
Example #18
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            print(args)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
Example #19
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j",
                str(nproc), "-b",
                str(nproc), "-o", new_prefix, "--barcodes", barcode_fasta,
                "--scoreMode", score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example #20
def run_rtc(rtc):
    """Dev task that calls a subprocess exe; in this case it's a Python script."""

    nrecords = rtc.task.options[_to_opt_id("nrecords")]
    _d = dict(i=rtc.task.input_files[0], o=rtc.task.output_files[0], r=nrecords)
    exe = "hello-world.py {i} {p} --nrecords {r}".format(**_d)
    result = run_cmd(exe, sys.stdout, sys.stderr)
    log.info("Completed running {e} Result {r}".format(e=exe, r=result))
    return result
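The _to_opt_id() helper is not shown here. One plausible sketch, assuming the dev task registers its options under the pbcommand task-option namespace (an assumption, not necessarily the real helper):

def _to_opt_id(name):
    # Hypothetical: build a fully qualified option id such as
    # "pbcommand.task_options.nrecords" from a short option name.
    return "pbcommand.task_options.{n}".format(n=name)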
Example #21
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                subreads=subreads_bam,
                scraps=scraps_bam,
                barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example #22
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name, tmp_dir=None):
    assert isinstance(program_name, basestring)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name) 
        if not barcode_mode:
            tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            assert os.path.isfile(tmp_out), tmp_out
            if output_file_name.endswith(".gz"):
                log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
                shutil.copyfile(tmp_out, output_file_name)
            else:
                _unzip_fastx(tmp_out, output_file_name)
            os.remove(tmp_out)
        else:
            suffix = "{f}.gz".format(f=base_ext)
            tmp_out_dir = op.dirname(tmp_out_prefix)
            tc_out_dir = op.dirname(output_file_name)
            barcoded_file_names = []
            # find the barcoded FASTX files and unzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in os.listdir(tmp_out_dir):
                fn = op.join(tmp_out_dir, fn)
                if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    suffix2 = ".{f}_{r}.{t}".format(
                        f=bc_fwd_rev[0], r=bc_fwd_rev[1], t=base_ext)
                    assert fn == tmp_out_prefix + suffix2 + ".gz"
                    fn_out = re.sub(".gz$", suffix2, output_file_name)
                    fastx_out = op.join(tc_out_dir, fn_out)
                    _unzip_fastx(fn, fastx_out)
                    barcoded_file_names.append(fn_out)
                    os.remove(fn)
            assert len(barcoded_file_names) > 0
            return archive_files(barcoded_file_names, output_file_name)
    return 0
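_unzip_fastx() is referenced but not defined in this snippet. A minimal sketch, assuming it simply inflates a gzipped FASTA/FASTQ file into a plain-text file:

import gzip
import shutil

def _unzip_fastx(gzip_file_name, fastx_file_name):
    # Hypothetical helper: decompress a .fasta.gz/.fastq.gz into a plain file.
    with gzip.open(gzip_file_name, "rb") as f_in:
        with open(fastx_file_name, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)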
Example #23
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name, tmp_dir=None):
    assert isinstance(program_name, basestring)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name) 
        if not barcode_mode:
            tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            assert os.path.isfile(tmp_out), tmp_out
            if output_file_name.endswith(".gz"):
                log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
                shutil.copyfile(tmp_out, output_file_name)
            else:
                _unzip_fastx(tmp_out, output_file_name)
            os.remove(tmp_out)
        else:
            suffix = "{f}.gz".format(f=base_ext)
            tmp_out_dir = op.dirname(tmp_out_prefix)
            tc_out_dir = op.dirname(output_file_name)
            barcoded_file_names = []
            # find the barcoded FASTX files and unzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in os.listdir(tmp_out_dir):
                fn = op.join(tmp_out_dir, fn)
                if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    suffix2 = ".{f}_{r}.{t}".format(
                        f=bc_fwd_rev[0], r=bc_fwd_rev[1], t=base_ext)
                    assert fn == tmp_out_prefix + suffix2 + ".gz"
                    fn_out = re.sub(".gz$", suffix2, output_file_name)
                    fastx_out = op.join(tc_out_dir, fn_out)
                    _unzip_fastx(fn, fastx_out)
                    barcoded_file_names.append(fn_out)
                    os.remove(fn)
            assert len(barcoded_file_names) > 0
            return archive_files(barcoded_file_names, output_file_name)
    return 0
Example #24
def run_bax_to_bam(input_file_name, output_file_name):
    base_name = os.path.splitext(output_file_name)[0]
    args = [
        "bax2bam", "--subread", "-o", base_name, "--output-xml",
        output_file_name, "--xml", input_file_name
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    return result.exit_code
Example #25
def run_bax_to_bam(input_file_name, output_file_name):
    base_name = os.path.splitext(output_file_name)[0]
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    return result.exit_code
Example #26
    def test_simple_run_cmd(self):
        d = get_temp_dir("simple-cmd")
        txt_in = get_temp_file(".txt", d)
        txt_out = get_temp_file("*.txt", d)
        exe = "cat {i} > {o}".format(i=txt_in, o=txt_out)

        # this could all be bundled into a context manager
        # with RunCommand('/path/stdout', '/path/to/stderr') as r:
        #   r.exe("echo 'exe1')
        #   r.exe("echo 'exe2')
        #   result = r.get_result() # close the file handles
        stdout = get_temp_file("-stdout", d)
        stderr = get_temp_file("-stderr", d)
        with open(stdout, 'w') as fo:
            with open(stderr, 'w') as fe:
                result = run_cmd(exe, fo, fe)

        emgs = "Command {e} failed".format(e=exe)
        self.assertEquals(result.exit_code, 0, emgs)
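The comment above sketches a RunCommand context manager that would own the stdout/stderr handles across several calls. One possible shape for that idea, assuming run_cmd keeps the (cmd, stdout_fh, stderr_fh) call signature used throughout these examples (RunCommand itself is hypothetical, not an existing class):

class RunCommand(object):
    # Hypothetical helper: open the log files once, run several commands
    # against the same handles, then close them on exit.

    def __init__(self, stdout_path, stderr_path):
        self.stdout_path = stdout_path
        self.stderr_path = stderr_path
        self._results = []

    def __enter__(self):
        self._stdout = open(self.stdout_path, "w")
        self._stderr = open(self.stderr_path, "w")
        return self

    def exe(self, cmd):
        # delegate to run_cmd, reusing the already-open file handles
        result = run_cmd(cmd, self._stdout, self._stderr)
        self._results.append(result)
        return result

    def get_result(self):
        # last result, or None if nothing was run
        return self._results[-1] if self._results else None

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.close()
        self._stderr.close()
        return False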
Example #27
def run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name,
                     min_subread_length=0):
    assert isinstance(program_name, basestring)
    # XXX this is really annoying; bam2fastx needs a --no-gzip feature
    tmp_out_prefix = tempfile.NamedTemporaryFile().name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name)
        tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
        assert os.path.isfile(tmp_out), tmp_out
        logging.info("raw output in {f}".format(f=tmp_out))
        def _open_file(file_name):
            if file_name.endswith(".gz"):
                return gzip.open(file_name)
            else:
                return open(file_name)
        if min_subread_length > 0:
            logging.info("Filtering subreads by minimum length = {l}".format(
                l=min_subread_length))
        elif min_subread_length < 0:
            logging.warn("min_subread_length = {l}, ignoring".format(
                l=min_subread_length))
        with _open_file(tmp_out) as raw_in:
            with fastx_reader(raw_in) as fastx_in:
                with fastx_writer(output_file_name) as fastx_out:
                    for rec in fastx_in:
                        if (min_subread_length < 1 or
                            min_subread_length < len(rec.sequence)):
                            fastx_out.writeRecord(rec)
        os.remove(tmp_out)
    return 0
Example #28
def run_bax_to_bam(input_file_name, output_file_name):
    base_name = os.path.splitext(output_file_name)[0]
    args = [
        "bax2bam", "--subread", "-o", base_name, "--output-xml",
        output_file_name, "--xml", input_file_name
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    shutil.move(output_file_name, tmp)
    # FIXME it would be better to leave this to bax2bam
    with openDataSet(tmp) as ds:
        if not ds.isIndexed:
            ds.induceIndices()
        ds.write(output_file_name, validate=False)  # FIXME bad XML!
    return 0
Example #29
def run_basecaller(trc_file, baz_file,
                   nproc=1,
                   stdout=sys.stdout,
                   stderr=sys.stderr,
                   basecaller_exe=Constants.BASECALLER_EXE,
                   basecaller_options=Constants.BASECALLER_OPTIONS,
                   basecaller_module=Constants.BASECALLER_MODULE):
    """
    Run the offline basecaller on a trace file.
    """

    exe = " && ".join([Constants.GNU_MODULE_INIT,
                       ' '.join([Constants.GNU_MODULE_LOAD,
                                 basecaller_module]),
                       basecaller_exe])

    args = [
        # Wrap with a bash invocation (instead of sh)
        "/bin/bash",
        "-c",
        "'",
        exe,
        "--inputfile={i}".format(i=trc_file),
        "--outputbazfile={o}".format(o=baz_file),
        "--numthreads={n}".format(n=nproc),
    ]
    if basecaller_options != "":
        args.extend(basecaller_options.split(' '))

    # finish bash invocation wrap
    args.append("'")

    logging.info("Command " + ' '.join(args))

    result = run_cmd(' '.join(args), stdout_fh=stdout, stderr_fh=stderr)

    if not op.isfile(baz_file):
        stderr.write("Result {}".format(result))
        stderr.write("Unable to produce Baz file from command: {a}\n".format(a=args))

    return result.exit_code
Example #30
def run(args):
    output_dir = os.getcwd()
    if len(args) == 1:
        output_dir = args[0]
        assert os.path.isdir(output_dir), "Not a directory: %s" % output_dir
    module_dir = os.path.join(os.path.dirname(__file__), "pbcoretools", "tasks")
    for file_name in os.listdir(module_dir):
        if file_name.endswith(".py") and not file_name.startswith("_"):
            if file_name in ["converters.py", "filters.py"]:
                continue
            module_name = "pbcoretools.tasks.{m}".format(m=file_name[:-3])
            json_file = os.path.join(output_dir,
                "{m}_tool_contract.json".format(m=module_name))
            cmd = "python -m {m} --emit-tool-contract > {j}".format(
                m=module_name, j=json_file)
            run_cmd(cmd, sys.stdout, sys.stderr)
    cmd = "python -m pbcoretools.tasks.converters emit-tool-contracts -o {d}".format(d=output_dir)
    run_cmd(cmd, sys.stdout, sys.stderr)
    cmd = "python -m pbcoretools.tasks.filters emit-tool-contracts -o {d}".format(d=output_dir)
    run_cmd(cmd, sys.stdout, sys.stderr)
Example #31
def run(args):
    output_dir = os.getcwd()
    if len(args) == 1:
        output_dir = args[0]
        assert os.path.isdir(output_dir), "Not a directory: %s" % output_dir
    module_dir = os.path.join(os.path.dirname(__file__), "pbcoretools",
                              "tasks")
    for file_name in os.listdir(module_dir):
        if file_name.endswith(".py") and not file_name.startswith("_"):
            if file_name in ["converters.py", "filters.py"]:
                continue
            module_name = "pbcoretools.tasks.{m}".format(m=file_name[:-3])
            json_file = os.path.join(
                output_dir, "{m}_tool_contract.json".format(m=module_name))
            cmd = "python -m {m} --emit-tool-contract > {j}".format(
                m=module_name, j=json_file)
            run_cmd(cmd, sys.stdout, sys.stderr)
    cmd = "python -m pbcoretools.tasks.converters emit-tool-contracts -o {d}".format(
        d=output_dir)
    run_cmd(cmd, sys.stdout, sys.stderr)
    cmd = "python -m pbcoretools.tasks.filters emit-tool-contracts -o {d}".format(
        d=output_dir)
    run_cmd(cmd, sys.stdout, sys.stderr)
Example #32
def _run_cmd(cmd):
    x = run_cmd(cmd, sys.stdout, sys.stderr)
    if x.exit_code != 0:
        log.error(x)
    return x
Example #33
def _run_cmd(cmd):
    x = run_cmd(cmd, sys.stdout, sys.stderr)
    if x.exit_code != 0:
        log.error(x)
    return x
Example #34
def run_baz2bam(baz_file, adapter_fa, metadata_xml, output_file,
                nproc=1,
                min_subread_length=Constants.MIN_SUBREAD_LENGTH,
                baz2bam_exe=Constants.BAZ2BAM_EXE,
                ppa_module=Constants.PPA_MODULE,
                stdout=sys.stdout, stderr=sys.stderr,
                dataset_name_suffix=None):
    """
    Convert the .baz file from the basecaller to a SubreadSet.

    Note, the emitted SubreadSet will have a new UUID

    :param output_file: Base prefix for output files

    :param dataset_name_suffix: Will update the dataset name with the supplied suffix
    :type dataset_name_suffix: str | None
    """

    assert output_file.endswith(".subreadset.xml")
    output_base = re.sub(".subreadset.xml", "", output_file)
    output_dir = op.dirname(output_file)

    exe = " && ".join([Constants.GNU_MODULE_INIT,
                       ' '.join([Constants.GNU_MODULE_LOAD,
                                 ppa_module]),
                       baz2bam_exe])

    args = [
        "/bin/bash",
        "-c",
        "'",
        exe,
        baz_file,
        "--silent",
        "--minSubLength", str(min_subread_length),
        "--metadata={x}".format(x=metadata_xml),
        "--adapter={f}".format(f=adapter_fa),
        "-o", output_base,
        "-j", str(nproc),
        "-b", str(nproc),
        "'",
    ]
    logging.info(" ".join(args))
    result = run_cmd(' '.join(args), stdout, stderr)
    assert result.exit_code == 0, \
        "Failed with exit code {c}".format(c=result.exit_code)
    subreads_file = output_base + ".subreads.bam"
    scraps_file = output_base + ".scraps.bam"
    assert op.isfile(subreads_file), subreads_file
    assert op.isfile(scraps_file), scraps_file
    subreadset_file = output_base + ".subreadset.xml"
    assert op.isfile(subreadset_file)
    tmp_ds = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name

    # Must copy the adapters file to new SubreadSet output dir
    # otherwise, the file will be invalid
    new_adapters = op.join(output_dir, op.basename(adapter_fa))
    if not op.exists(new_adapters):
        shutil.copy(adapter_fa, new_adapters)

    # FIXME, This should really update the PA version (SigProcVer) or at a minimum,
    # augment the version

    with SubreadSet(subreadset_file) as ds:
        ds.makePathsAbsolute()
        if dataset_name_suffix is not None:
            name = ds.name
            new_ds_name = "_".join([name, dataset_name_suffix])
            ds.name = new_ds_name
        ds.newUuid(setter=True)
        ds.write(tmp_ds)
        log.info("Wrote new SubreadSet {u} to {p}".format(u=ds.uuid, p=subreads_file))

    shutil.move(tmp_ds, subreadset_file)
    return 0
Example #35
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None,
                      seqid_prefix=None, subreads_in=None):
    """
    Converts a dataset to a set of fastx file, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.
    """
    assert isinstance(program_name, str)
    barcode_mode = False
    barcode_sets = set()
    output_is_archive = (output_file_name.endswith(".zip") or
                         output_file_name.endswith(".tar.gz") or
                         output_file_name.endswith(".tgz"))
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(list(barcode_sets)):
                log.warning("  %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", pipes.quote(seqid_prefix)])
    log.info(" ".join(args))
    remove_files = []
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)

    def _is_fastx_file(fn):
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    try:
        assert result.exit_code == 0, "{p} exited with code {c}".format(
            p=program_name, c=result.exit_code)
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):
                        def _label_or_none(x):
                            try:
                                bc = int(x)
                                if bc < 0:
                                    return "none"
                                elif bc < len(barcode_labels):
                                    return barcode_labels[bc]
                            except ValueError as e:
                                pass
                            return x
                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                base = re.sub(".zip$", "",
                              re.sub(".tar.gz", "",
                                     re.sub(".tgz", "",
                                            op.basename(output_file_name))))
                fn_out = base
                if not fn_out.endswith(suffix2):
                    fn_out = re.sub(base_ext, suffix2, fn_out)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            assert len(fastx_file_names) > 0
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            _ungzip_fastx(tmp_out, output_file_name)
            remove_files = [tmp_out]
    finally:
        for fn in remove_files:
            os.remove(fn)
    return 0
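walker() is assumed here to yield the full file paths under a directory tree that satisfy a predicate, which matches how it is called above. A minimal sketch (not the library's own definition):

import os
import os.path as op

def walker(root_dir, predicate):
    # Hypothetical helper: recursively yield file paths for which
    # predicate(path) returns True.
    for dir_path, _, file_names in os.walk(root_dir):
        for name in file_names:
            path = op.join(dir_path, name)
            if predicate(path):
                yield path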
Example #36
def run_fasta_to_fofn(input_file_name, output_file_name):
    args = ["echo", input_file_name, ">", output_file_name]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args), stdout_fh = sys.stdout,
                     stderr_fh=sys.stderr)
    return result.exit_code