Example #1
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for
    chunks of ~5Gb each and capping at 100 splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.
    """
    target_size = 5  # Gb
    target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = tz.get_in(["config", "algorithm", "align_split_size"], data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                if os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set a split size if input files exist and their total size exceeds the target
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                  int(1e6 * _pick_align_split_size(total_size, target_size,
                                                   target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data
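
A note on the helper: _pick_align_split_size is referenced above but not defined in this excerpt. Below is a plausible sketch under the docstring's constraints (chunks of target_size_reads million reads by default, grown when that would exceed max_splits pieces); the actual implementation may differ:

def _pick_align_split_size(total_size, target_size, target_size_reads, max_splits):
    """Hypothetical sketch: pick a split size in millions of reads.

    Defaults to target_size_reads per chunk; if the input would produce more
    than max_splits chunks at the default size, enlarge each chunk so the
    total stays within the cap.
    """
    if total_size // target_size > max_splits:
        # Size each piece so max_splits chunks cover the input, converting
        # Gb back to millions of reads via the ~20M reads per 5Gb estimate.
        piece_size = total_size / float(max_splits)
        return int(piece_size * (float(target_size_reads) / target_size))
    else:
        return target_size_reads

The caller multiplies the result by 1e6, turning millions of reads into the absolute read count stored in align_split_size.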
Example #2
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for
    chunks of ~5Gb each and capping at 100 splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.
    """
    target_size = 5  # Gb
    target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = tz.get_in(["config", "algorithm", "align_split_size"], data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get(
                "files_orig", []) if dd.get_save_diskspace(data) else data.get(
                    "files", [])
            for fname in input_files:
                if os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 *
                                                            1024.0)
            # Only set a split size if input files exist and their total size exceeds the target
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                  int(1e6 * _pick_align_split_size(total_size, target_size,
                                                   target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data
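
For reference, tz.get_in comes from the toolz library: it walks nested dictionaries and returns a default (None) instead of raising KeyError when a key is missing, which is why the val is None branch also covers pipelines with no align_split_size configured. A minimal illustration with made-up data:

import toolz as tz

data = {"config": {"algorithm": {}}}
# Missing key: returns None rather than raising KeyError.
print(tz.get_in(["config", "algorithm", "align_split_size"], data))  # None

data["config"]["algorithm"]["align_split_size"] = 20000000
print(tz.get_in(["config", "algorithm", "align_split_size"], data))  # 20000000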
Example #3
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
                dd.get_sample_name(data))
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken",
                          "minikraken")

    if not db or not os.path.exists(db):
        logger.info("kraken: database not found at %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(
            data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin  2> {out_stats}"
                      ).format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
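
_parse_kraken_output is not shown in this excerpt. Kraken prints a classification summary such as "500 sequences classified (50.00%)" to stderr, which the command above captures in kraken_stats; below is a hypothetical sketch of recovering that percentage (the helper name, regex, and return keys are illustrative, not bcbio's actual API):

import os
import re

def _parse_kraken_stats(out_dir):
    """Hypothetical sketch: pull the classified-read percentage out of the
    kraken_stats file captured above, assuming kraken's usual stderr summary
    line, e.g. "500 sequences classified (50.00%)".
    """
    stats_file = os.path.join(out_dir, "kraken_stats")
    with open(stats_file) as in_handle:
        for line in in_handle:
            match = re.search(r"(\d+) sequences classified \(([\d.]+)%\)", line)
            if match:
                return {"classified": int(match.group(1)),
                        "pct_classified": float(match.group(2))}
    return {}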
Example #4
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    if db and isinstance(db, (list, tuple)):
        db = db[0]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")

    if not db or not os.path.exists(db):
        logger.info("kraken: database not found at %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin  2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
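
Both versions lean on the same transactional-directory idiom: do all the work inside a temporary directory, and only swap it into out_dir after the command succeeds, so a failed kraken run never leaves a half-written report behind. A minimal standalone sketch of that pattern (tmp_then_move is an illustrative name, not bcbio's tx_tmpdir):

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def tmp_then_move(final_dir):
    """Sketch of the write-then-move pattern: callers write into the yielded
    temp directory; on success it replaces final_dir, on failure it is
    discarded without touching any existing output.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        yield tmp_dir
        if os.path.exists(final_dir):
            shutil.rmtree(final_dir)
        shutil.move(tmp_dir, final_dir)
    except Exception:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

Usage mirrors the tx_tmpdir block above: with tmp_then_move(out_dir) as tmp_dir, point the command's outputs at tmp_dir and let the context manager handle the final move.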