def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PartialChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != set(['SubstitutionQV', 'InsertionQV', 'DeletionQV']):
                log.warn("Missing QV fields from %s, will use default probabilities",
                         bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %s, nfl chunk %s/%s",
                     str(task.cluster_bin_index),
                     str(task.nfl_index), str(task.n_nfl_chunks))
            task_runner(task=task, ccs_file=ccs_file, nproc=nproc, tmp_dir=tmp_dir)
            writer.write("ice_partial of cluster bin %s, nfl chunk %s/%s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks,
                          task.cluster_out_dir, task.nfl_pickle))
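
The runner above keeps the CCS input only when every BAM resource reports exactly the three QV fields it needs; otherwise it falls back to default probabilities by passing ccs_file=None downstream. A minimal sketch of that check as a standalone helper, assuming the same pbcore.io ConsensusReadSet and pulseFeaturesAvailable() calls used above (the helper name is hypothetical):

from pbcore.io import ConsensusReadSet

REQUIRED_QVS = set(['SubstitutionQV', 'InsertionQV', 'DeletionQV'])

def ccs_has_required_qvs(ccs_file):
    """Return True only if every BAM resource carries exactly the required QV fields."""
    with ConsensusReadSet(ccs_file) as ds:
        return all(set(bam.pulseFeaturesAvailable()) == REQUIRED_QVS
                   for bam in ds.resourceReaders())
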
Example No. 2
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % tmp_dir)

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % quivered_dir)
Example No. 3
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
Example No. 4
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PartialChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != set(['SubstitutionQV', 'InsertionQV', 'DeletionQV']):
                log.warning(
                    "Missing QV fields from %s, will use default probabilities",
                    bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %s, nfl chunk %s/%s",
                     str(task.cluster_bin_index), str(task.nfl_index),
                     str(task.n_nfl_chunks))
            task_runner(task=task,
                        ccs_file=ccs_file,
                        nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write(
                "ice_partial of cluster bin %s, nfl chunk %s/%s in %s is DONE: %s\n"
                % (task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks,
                   task.cluster_out_dir, task.nfl_pickle))
Example No. 5
def resolved_tool_contract_runner(rtc):
    """run all tasks in cluster_chunks.pickle given rtc"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    ccs_file = rtc.task.input_files[1]
    assert op.exists(ccs_file)

    nproc = rtc.task.nproc
    use_finer_qv = False
    #if rtc.task.options.get(Constants.USE_FINER_QV_ID, False):
    #    use_finer_qv = True

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, task in enumerate(p):
            args = task_to_args(task=task,
                                ccs_file=ccs_file,
                                nproc=nproc,
                                use_finer_qv=use_finer_qv)
            log.info("ARGUMENTS of Task %s/%s:\n%s", str(i), str(len(p)),
                     str(args))
            log.info("Running ICE on cluster bin %s", task.cluster_bin_index)
            PBTranscript(args, subCommand="cluster").start()
            writer.write("ICE of cluster bin %s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.cluster_out_dir,
                          task.consensus_isoforms_file))
Example No. 6
def resolved_tool_contract_runner(rtc):
    """resolved tool contract runner."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]

    subread_set = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info(
                "Running ice_polish on cluster bin %s, polish chunk %s/%s",
                str(task.cluster_bin_index), str(task.polish_index),
                str(task.n_polish_chunks))
            log.debug("ice_quiver root_dir is %s", task.cluster_out_dir)
            log.debug("consensus_isoforms is %s", task.consensus_isoforms_file)

            task_runner(task=task,
                        subread_set=subread_set,
                        nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write(
                "ice_polish of cluster bin %s, polish chunk %s/%s in %s is DONE.\n"
                % (task.cluster_bin_index, task.polish_index,
                   task.n_polish_chunks, task.cluster_out_dir))
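
Several of these runners share the same pattern for picking an optional temp directory from the resolved tool contract: use rtc.task.tmpdir_resources[0].path when a tmpdir resource was provisioned, else None. A compact equivalent, with a hypothetical helper name:

def first_tmpdir(rtc):
    """Return the first tmpdir resource path of a resolved tool contract, or None."""
    resources = rtc.task.tmpdir_resources
    return resources[0].path if len(resources) > 0 else None
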
Example No. 7
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    #    n_chunks = len(p)
    #    for i in range(0, n_chunks):
    #        chunk_id = "_".join([base_name, str(i)])
    #        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
    #        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
    #             Constants.CHUNK_KEYS[1]: ccs_file}
    #        c = PipelineChunk(chunk_id, **d)
    #        chunks.append(c)
    #        spawned_pickles.append(spawned_pickle_file)
    #
    #    log.info("Spawning %s into %s files", cluster_chunks_pickle_file, str(n_chunks))
    #    p.spawn_pickles(spawned_pickles)
    #    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
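
Each PipelineChunk built above pairs a chunk id with a mapping from chunk keys to the files the spawned task will consume, and write_pipeline_chunks serializes the list to chunk.json. A minimal, self-contained sketch of that final step, assuming the pbcommand PipelineChunk and write_pipeline_chunks used by this module; the chunk keys and file names below are placeholders rather than the real Constants.CHUNK_KEYS values:

from pbcommand.models import PipelineChunk
from pbcommand.pb_io import write_pipeline_chunks

chunk = PipelineChunk("spawned_cluster_chunk_group_0",
                      **{"$chunk.pickle_id": "spawned_cluster_chunk_group_0.pickle",
                         "$chunk.ccs_id": "ccs.consensusreadset.xml"})
write_pipeline_chunks([chunk], "cluster_scatter.chunk.json",
                      "created by an example scatter task")
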
Example No. 8
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file, max_nchunks):
    """Scatter items in cluster_chunks_pickle
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file, len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

#    n_chunks = len(p)
#    for i in range(0, n_chunks):
#        chunk_id = "_".join([base_name, str(i)])
#        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
#        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
#             Constants.CHUNK_KEYS[1]: ccs_file}
#        c = PipelineChunk(chunk_id, **d)
#        chunks.append(c)
#        spawned_pickles.append(spawned_pickle_file)
#
#    log.info("Spawning %s into %s files", cluster_chunks_pickle_file, str(n_chunks))
#    p.spawn_pickles(spawned_pickles)
#    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            out_pickle = IceFiles(prog_name="", root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.", (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i, out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
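
The merge step above depends on the pickle being sorted by cluster_bin_index first, because itertools.groupby only merges adjacent items with equal keys. A minimal illustration of that behavior:

from itertools import groupby

tasks = [(0, "a"), (1, "b"), (0, "c")]
print([(k, [v for _, v in g])
       for k, g in groupby(sorted(tasks), key=lambda t: t[0])])
# [(0, ['a', 'c']), (1, ['b'])] -- without the sort, bin 0 would split into two groups
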
def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """
    Spawn partial Chunk Tasks in pickle.
    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      ccs_file -- ccs dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all([isinstance(r, PartialChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: sentinel_file,
            Constants.CHUNK_KEYS[2]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            out_pickle = IceFiles(prog_name="",
                                  root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.",
                      (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i,
                                out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
def run_main(polish_chunks_pickle_file, sentinel_file,
             subreads_file, output_json_file, max_nchunks):
    """
    Spawn polish Chunk Tasks in pickle.
    Parameters:
      polish_chunks_pickle_file -- ChunkTasksPickle of PolishChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks.
      subreads_file -- subreads dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(polish_chunks_pickle_file)
    assert all([isinstance(r, PolishChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_polish_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: subreads_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", polish_chunks_pickle_file, len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example No. 13
def resolved_tool_contract_runner(rtc):
    """run all tasks in cluster_chunks.pickle given rtc"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    ccs_file = rtc.task.input_files[1]
    assert op.exists(ccs_file)

    nproc = rtc.task.nproc
    use_finer_qv = False
    #if rtc.task.options.get(Constants.USE_FINER_QV_ID, False):
    #    use_finer_qv = True

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, task in enumerate(p):
            args = task_to_args(task=task, ccs_file=ccs_file,
                                nproc=nproc, use_finer_qv=use_finer_qv)
            log.info("ARGUMENTS of Task %s/%s:\n%s", str(i), str(len(p)), str(args))
            log.info("Running ICE on cluster bin %s", task.cluster_bin_index)
            PBTranscript(args, subCommand="cluster").start()
            writer.write("ICE of cluster bin %s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.cluster_out_dir,
                          task.consensus_isoforms_file))
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])

    with open(rtc.task.output_files[0], 'w') as writer:
        for cluster_bin_index, cluster_out_dir in p.sorted_no_redundant_cluster_bins():
            log.info("ice_quiver_postprocess of cluster bin index %s in %s.",
                     str(cluster_bin_index), str(cluster_out_dir))
            good_hq, bad_hq = ice_quiver_postprocess_a_cluster_bin(
                cluster_out_dir=cluster_out_dir, ipq_opts=ipq_opts)
            writer.write("ice_quiver_postprocess of cluster bin index %s in %s DONE:\n%s\n%s\n" %
                         (cluster_bin_index, cluster_out_dir, good_hq, bad_hq))
Example No. 15
def resolved_tool_contract_runner(rtc):
    """resolved tool contract runner."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]

    subread_set = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_polish on cluster bin %s, polish chunk %s/%s",
                     str(task.cluster_bin_index),
                     str(task.polish_index), str(task.n_polish_chunks))
            log.debug("ice_quiver root_dir is %s", task.cluster_out_dir)
            log.debug("consensus_isoforms is %s", task.consensus_isoforms_file)

            task_runner(task=task, subread_set=subread_set, nproc=nproc, tmp_dir=tmp_dir)
            writer.write("ice_polish of cluster bin %s, polish chunk %s/%s in %s is DONE.\n" %
                         (task.cluster_bin_index, task.polish_index, task.n_polish_chunks,
                          task.cluster_out_dir))
Example No. 16
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
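
The output FASTA paths above are derived from the ContigSet XML outputs by a fixed suffix swap, guarded by the .endswith(".contigset.xml") asserts. A tiny sketch of that naming convention (the helper name is hypothetical):

def contigset_to_fasta_path(cs_path):
    """Map a ContigSet XML output path to its sibling FASTA path."""
    assert cs_path.endswith(".contigset.xml"), cs_path
    return cs_path[:-len(".contigset.xml")] + ".fasta"

assert contigset_to_fasta_path("hq_isoforms.contigset.xml") == "hq_isoforms.fasta"
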
Example No. 17
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"