Exemplo n.º 1
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("Adapter is set up in config file, but trim_reads is not true."
                    "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "atropos with this parameters %s for %s" %(options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Exemplo n.º 2
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("Adapter is set up in config file, but trim_reads is not true."
                    "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        # atropos = _get_atropos()
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "atropos with this parameters %s for %s" %(options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]