Example #1
def create_polish_pickle(n_polish_chunks_in_bins, flnc_files, out_pickle):
    """
    Parameters:
      n_polish_chunks_in_bins -- number of ice_polish chunks in each bin
      flnc_files -- full-length non-chimeric files in bins
      out_pickle -- output pickle for saving PolishChunkTask objects
    """
    n_bins = len(flnc_files)
    assert isinstance(n_polish_chunks_in_bins, list)
    assert len(n_polish_chunks_in_bins) == n_bins

    log.info("Writing %s ice_polish chunk tasks to %s.",
             str(sum(n_polish_chunks_in_bins)), out_pickle)
    p = ChunkTasksPickle()

    for i, flnc_file in enumerate(flnc_files):
        log.debug("Creating %s ice_polish chunks for bin index=%s.",
                  str(n_polish_chunks_in_bins[i]), str(i))
        cluster_out_dir = _get_cluster_out_dir(flnc_file)

        for j in range(0, n_polish_chunks_in_bins[i]):
            # Create Polish chunk tasks.
            task_ = PolishChunkTask(cluster_bin_index=i, flnc_file=flnc_file,
                                    cluster_out_dir=cluster_out_dir,
                                    polish_index=j,
                                    n_polish_chunks=n_polish_chunks_in_bins[i])
            p.append(task_)

    p.write(out_pickle)
    log.info("Saved %s polish chunk tasks to %s.", str(sum(n_polish_chunks_in_bins)), out_pickle)
Example #2
def create_partial_pickle(flnc_files, chunked_nfl_files, out_pickle):
    """
    Parameters:
      flnc_files -- full-length non-chimeric files in bins
      chunked_nfl_files -- chunked non-full-length (nfl) files
      out_pickle -- output pickle for saving PartialChunkTask objects
    """
    n_bins = len(flnc_files)
    n_nfl_chunks = len(chunked_nfl_files)

    log.info("Writing %s ice_partial chunk tasks to %s.", str(n_bins * n_nfl_chunks), out_pickle)
    p = ChunkTasksPickle()

    for i, flnc_file in enumerate(flnc_files):
        log.debug("Processing cluster bin index=%s.", i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)

        for j, nfl_file in enumerate(chunked_nfl_files):
            # Create Partial chunk tasks.
            task_ = PartialChunkTask(cluster_bin_index=i, flnc_file=flnc_file,
                                     cluster_out_dir=cluster_out_dir,
                                     nfl_file=nfl_file,
                                     nfl_index=j, n_nfl_chunks=n_nfl_chunks)
            p.append(task_)

    p.write(out_pickle)
    log.info("Saved %s partial chunk tasks to %s.", str(n_bins * n_nfl_chunks), out_pickle)
Example #3
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PartialChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != set(['SubstitutionQV', 'InsertionQV', 'DeletionQV']):
                log.warn("Missing QV fields from %s, will use default probabilities",
                         bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %s, nfl chunk %s/%s",
                     str(task.cluster_bin_index),
                     str(task.nfl_index), str(task.n_nfl_chunks))
            task_runner(task=task, ccs_file=ccs_file, nproc=nproc, tmp_dir=tmp_dir)
            writer.write("ice_partial of cluster bin %s, nfl chunk %s/%s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks,
                          task.cluster_out_dir, task.nfl_pickle))
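The QV probing at the top of this runner can be read on its own. Below is a minimal, self-contained sketch of the same check, assuming pbcore is installed; it reuses only the ConsensusReadSet, resourceReaders, and pulseFeaturesAvailable calls that already appear above, and the helper name is invented for illustration.

from pbcore.io import ConsensusReadSet

# QV fields the ice_partial runner expects to find in the CCS BAM resources.
REQUIRED_QVS = set(['SubstitutionQV', 'InsertionQV', 'DeletionQV'])

def ccs_has_required_qvs(ccs_file):
    # Hypothetical helper: True only if every BAM resource exposes exactly the required QVs.
    with ConsensusReadSet(ccs_file) as ds:
        return all(bam.pulseFeaturesAvailable() == REQUIRED_QVS
                   for bam in ds.resourceReaders())

When this check fails, the runner above sets ccs_file to None so that task_runner falls back to default probabilities.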
Example #4
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % tmp_dir)

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % quivered_dir)
Example #5
def resolved_tool_contract_runner(rtc):
    """resolved tool contract runner."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]

    subread_set = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info(
                "Running ice_polish on cluster bin %s, polish chunk %s/%s",
                str(task.cluster_bin_index), str(task.polish_index),
                str(task.n_polish_chunks))
            log.debug("ice_quiver root_dir is %s", task.cluster_out_dir)
            log.debug("consensus_isoforms is %s", task.consensus_isoforms_file)

            task_runner(task=task,
                        subread_set=subread_set,
                        nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write(
                "ice_polish of cluster bin %s, polish chunk %s/%s in %s is DONE.\n"
                % (task.cluster_bin_index, task.polish_index,
                   task.n_polish_chunks, task.cluster_out_dir))
Example #6
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PartialChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != set(['SubstitutionQV', 'InsertionQV', 'DeletionQV']):
                log.warn(
                    "Missing QV fields from %s, will use default probabilities",
                    bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %s, nfl chunk %s/%s",
                     str(task.cluster_bin_index), str(task.nfl_index),
                     str(task.n_nfl_chunks))
            task_runner(task=task,
                        ccs_file=ccs_file,
                        nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write(
                "ice_partial of cluster bin %s, nfl chunk %s/%s in %s is DONE: %s\n"
                % (task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks,
                   task.cluster_out_dir, task.nfl_pickle))
Example #7
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
Example #8
def resolved_tool_contract_runner(rtc):
    """run all tasks in cluster_chunks.pickle given rtc"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    ccs_file = rtc.task.input_files[1]
    assert op.exists(ccs_file)

    nproc = rtc.task.nproc
    use_finer_qv = False
    #if rtc.task.options.get(Constants.USE_FINER_QV_ID, False):
    #    use_finer_qv = True

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, task in enumerate(p):
            args = task_to_args(task=task,
                                ccs_file=ccs_file,
                                nproc=nproc,
                                use_finer_qv=use_finer_qv)
            log.info("ARGUMENTS of Task %s/%s:\n%s", str(i), str(len(p)),
                     str(args))
            log.info("Running ICE on cluster bin %s", task.cluster_bin_index)
            PBTranscript(args, subCommand="cluster").start()
            writer.write("ICE of cluster bin %s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.cluster_out_dir,
                          task.consensus_isoforms_file))
Example #9
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    #    n_chunks = len(p)
    #    for i in range(0, n_chunks):
    #        chunk_id = "_".join([base_name, str(i)])
    #        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
    #        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
    #             Constants.CHUNK_KEYS[1]: ccs_file}
    #        c = PipelineChunk(chunk_id, **d)
    #        chunks.append(c)
    #        spawned_pickles.append(spawned_pickle_file)
    #
    #    log.info("Spawning %s into %s files", cluster_chunks_pickle_file, str(n_chunks))
    #    p.spawn_pickles(spawned_pickles)
    #    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
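A hedged call sketch for this scatter entry point; every file name below is hypothetical. With, say, 24 ClusterChunkTask objects in the pickle and max_nchunks=8, sort_and_group_tasks is expected to yield at most 8 groups, each spawned into its own pickle and recorded in chunk.json next to the shared ccs dataset.

run_main(cluster_chunks_pickle_file="cluster_chunks.pickle",  # hypothetical inputs
         ccs_file="ccs.consensusreadset.xml",
         output_json_file="cluster_scatter.chunk.json",
         max_nchunks=8)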
Example #10
def create_polish_pickle(n_polish_chunks_in_bins, flnc_files, out_pickle):
    """
    Parameters:
      n_polish_chunks_in_bins -- number of ice_polish chunks in each bin
      flnc_files -- full-length non-chimeric files in bins
      out_pickle -- output pickle for saving PolishChunkTask objects
    """
    n_bins = len(flnc_files)
    assert isinstance(n_polish_chunks_in_bins, list)
    assert len(n_polish_chunks_in_bins) == n_bins

    log.info("Writing %s ice_polish chunk tasks to %s.",
             str(sum(n_polish_chunks_in_bins)), out_pickle)
    p = ChunkTasksPickle()

    for i, flnc_file in enumerate(flnc_files):
        log.debug("Creating %s ice_polish chunks for bin index=%s.",
                  str(n_polish_chunks_in_bins[i]), str(i))
        cluster_out_dir = _get_cluster_out_dir(flnc_file)

        for j in range(0, n_polish_chunks_in_bins[i]):
            # Create Polish chunk tasks.
            task_ = PolishChunkTask(cluster_bin_index=i,
                                    flnc_file=flnc_file,
                                    cluster_out_dir=cluster_out_dir,
                                    polish_index=j,
                                    n_polish_chunks=n_polish_chunks_in_bins[i])
            p.append(task_)

    p.write(out_pickle)
    log.info("Saved %s polish chunk tasks to %s.",
             str(sum(n_polish_chunks_in_bins)), out_pickle)
Example #11
def create_partial_pickle(flnc_files, chunked_nfl_files, out_pickle):
    """
    Parameters:
      flnc_files -- full-length non-chimeric files in bins
      chunked_nfl_files -- chunked non-full-length (nfl) files
      out_pickle -- output pickle for saving PartialChunkTask objects
    """
    n_bins = len(flnc_files)
    n_nfl_chunks = max(1, len(chunked_nfl_files))

    log.info("Writing %s ice_partial chunk tasks to %s.",
             str(n_bins * n_nfl_chunks), out_pickle)
    p = ChunkTasksPickle()

    for i, flnc_file in enumerate(flnc_files):
        log.debug("Processing cluster bin index=%s.", i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)

        for j, nfl_file in enumerate(chunked_nfl_files):
            # Create Partial chunk tasks.
            task_ = PartialChunkTask(cluster_bin_index=i,
                                     flnc_file=flnc_file,
                                     cluster_out_dir=cluster_out_dir,
                                     nfl_file=nfl_file,
                                     nfl_index=j,
                                     n_nfl_chunks=n_nfl_chunks)
            p.append(task_)

    p.write(out_pickle)
    log.info("Saved %s partial chunk tasks to %s.", str(n_bins * n_nfl_chunks),
             out_pickle)
Example #12
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file, max_nchunks):
    """Scatter items in cluster_chunks_pickle
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file, len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

#    n_chunks = len(p)
#    for i in range(0, n_chunks):
#        chunk_id = "_".join([base_name, str(i)])
#        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
#        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
#             Constants.CHUNK_KEYS[1]: ccs_file}
#        c = PipelineChunk(chunk_id, **d)
#        chunks.append(c)
#        spawned_pickles.append(spawned_pickle_file)
#
#    log.info("Spawning %s into %s files", cluster_chunks_pickle_file, str(n_chunks))
#    p.spawn_pickles(spawned_pickles)
#    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example #13
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            out_pickle = IceFiles(prog_name="", root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.", (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i, out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
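One detail worth noting here: itertools.groupby only groups consecutive items with equal keys, which is why the tasks are sorted by cluster_bin_index before grouping. A toy illustration with stand-in (bin_index, nfl_pickle) tuples rather than real PartialChunkTask objects:

from itertools import groupby

tasks = [(0, "nfl.0.pickle"), (0, "nfl.1.pickle"), (1, "nfl.2.pickle")]
for bin_index, group in groupby(tasks, key=lambda t: t[0]):
    print(bin_index, [pickle_fn for _, pickle_fn in group])
# prints: 0 ['nfl.0.pickle', 'nfl.1.pickle']
#         1 ['nfl.2.pickle']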
Example #14
def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """
    Spawn partial Chunk Tasks in pickle.
    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      ccs_file -- ccs dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all([isinstance(r, PartialChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: sentinel_file,
            Constants.CHUNK_KEYS[2]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example #15
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            out_pickle = IceFiles(prog_name="",
                                  root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.",
                      (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i,
                                out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
Example #16
def run_main(polish_chunks_pickle_file, sentinel_file,
             subreads_file, output_json_file, max_nchunks):
    """
    Spawn polish Chunk Tasks in pickle.
    Parameters:
      polish_chunks_pickle_file -- ChunkTasksPickle of PolishChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      subreads_file -- subreads dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(polish_chunks_pickle_file)
    assert all([isinstance(r, PolishChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_polish_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: subreads_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", polish_chunks_pickle_file, len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Example #17
def resolved_tool_contract_runner(rtc):
    """resolved tool contract runner."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    dummy_sentinel_file = rtc.task.input_files[1]

    subread_set = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
            if len(rtc.task.tmpdir_resources) > 0 else None

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_polish on cluster bin %s, polish chunk %s/%s",
                     str(task.cluster_bin_index),
                     str(task.polish_index), str(task.n_polish_chunks))
            log.debug("ice_quiver root_dir is %s", task.cluster_out_dir)
            log.debug("consensus_isoforms is %s", task.consensus_isoforms_file)

            task_runner(task=task, subread_set=subread_set, nproc=nproc, tmp_dir=tmp_dir)
            writer.write("ice_polish of cluster bin %s, polish chunk %s/%s in %s is DONE.\n" %
                         (task.cluster_bin_index, task.polish_index, task.n_polish_chunks,
                          task.cluster_out_dir))
Example #18
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, PolishChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])

    with open(rtc.task.output_files[0], 'w') as writer:
        for cluster_bin_index, cluster_out_dir in p.sorted_no_redundant_cluster_bins():
            log.info("ice_quiver_postprocess of cluster bin index %s in %s.",
                     str(cluster_bin_index), str(cluster_out_dir))
            good_hq, bad_hq = ice_quiver_postprocess_a_cluster_bin(
                cluster_out_dir=cluster_out_dir, ipq_opts=ipq_opts)
            writer.write("ice_quiver_postprocess of cluster bin index %s in %s DONE:\n%s\n%s\n" %
                         (cluster_bin_index, cluster_out_dir, good_hq, bad_hq))
Example #19
def resolved_tool_contract_runner(rtc):
    """run all tasks in cluster_chunks.pickle given rtc"""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    ccs_file = rtc.task.input_files[1]
    assert op.exists(ccs_file)

    nproc = rtc.task.nproc
    use_finer_qv = False
    #if rtc.task.options.get(Constants.USE_FINER_QV_ID, False):
    #    use_finer_qv = True

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, task in enumerate(p):
            args = task_to_args(task=task, ccs_file=ccs_file,
                                nproc=nproc, use_finer_qv=use_finer_qv)
            log.info("ARGUMENTS of Task %s/%s:\n%s", str(i), str(len(p)), str(args))
            log.info("Running ICE on cluster bin %s", task.cluster_bin_index)
            PBTranscript(args, subCommand="cluster").start()
            writer.write("ICE of cluster bin %s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.cluster_out_dir,
                          task.consensus_isoforms_file))
Example #20
def create_cluster_pickle(flnc_files, out_pickle):
    """Create cluster chunk task pickle.
    Parameters:
      flnc_files -- full-length non-chimeric files in bins
      out_pickle -- output pickle for saving ClusterChunkTask objects
    """
    n_bins = len(flnc_files)
    log.info("Writing %s cluster chunk tasks to %s.", str(n_bins), out_pickle)
    p = ChunkTasksPickle()

    for i, flnc_file in enumerate(flnc_files):
        log.debug("Processing cluster bin index=%s.", i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)

        # Create Cluster chunk tasks.
        task_ = ClusterChunkTask(cluster_bin_index=i, flnc_file=flnc_file,
                                 cluster_out_dir=cluster_out_dir)
        p.append(task_)

    p.write(out_pickle)
    log.info("Saved %s cluster chunk tasks to %s.", str(n_bins), out_pickle)
Example #21
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
Example #22
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"