def write_run_daligner_chunks_falcon(
        pread_aln, chunk_file, config_json_fn, run_jobs_fn,
        max_total_nchunks, dir_name, chunk_base_name, chunk_ext, chunk_keys):
    db_prefix = 'preads' if pread_aln else 'raw_reads'
    xform_script = get_script_xformer(pread_aln)

    def chunk():
        # cmds is actually a list of small bash scripts, including linefeeds.
        cmds = get_daligner_job_descriptions(
            open(run_jobs_fn), db_prefix).values()
        if max_total_nchunks < len(cmds):
            log.debug("max_total_nchunks < # daligner cmds: %d < %d" % (
                max_total_nchunks, len(cmds)))
            cmds = joined_strs(cmds, max_total_nchunks)
        symlink_dazzdb(os.path.dirname(run_jobs_fn), db_prefix)
        for i, script in enumerate(cmds):
            chunk_id = '_'.join([chunk_base_name, str(i)])
            chunk_name = '.'.join([chunk_id, chunk_ext])
            chunk_path = os.path.join(dir_name, chunk_name)
            script = xform_script(script)
            open(chunk_path, 'w').write(script)
            d = {}
            d[chunk_keys[1]] = os.path.abspath(chunk_path)
            d[chunk_keys[0]] = config_json_fn
            c = PipelineChunk(chunk_id, **d)
            yield c

    chunks = list(chunk())
    write_pipeline_chunks(chunks, chunk_file, comment=None)

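# The joined_strs() helper used above is not defined in this section. Purely
# as an illustration of the intended behaviour -- collapsing the per-daligner
# scripts into at most `n` combined scripts -- a minimal sketch could look
# like this (an assumption, not the actual FALCON implementation):
def joined_strs(strs, n):
    """Join a list of script strings into at most n scripts, preserving order."""
    strs = list(strs)
    if not strs:
        return []
    n = min(n, len(strs))
    size, extra = divmod(len(strs), n)  # spread items as evenly as possible
    out, start = [], 0
    for i in range(n):
        stop = start + size + (1 if i < extra else 0)
        out.append(''.join(strs[start:stop]))
        start = stop
    return out
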
def run_main(subreads_file, isoforms_file, cluster_pickle_file,
             nfl_pickle_file, output_json, max_nchunks):
    log.info("Scattering {f} into {n} chunks".format(f=cluster_pickle_file,
                                                     n=max_nchunks))
    uc = {}
    with open(cluster_pickle_file, 'rb') as f:
        a = cPickle.load(f)
        uc = a['uc']
    assert len(uc) > 0

    n_chunks = min(len(uc), max_nchunks)
    base_name = "cluster_chunk"
    dir_name = os.path.dirname(output_json)
    chunks = []
    for i in range(n_chunks):
        chunk_id = "_".join([base_name, str(i)])
        chunk_name = ".".join([chunk_id, "pickle"])
        chunk_pickle_file = os.path.join(dir_name, chunk_name)
        with open(chunk_pickle_file, 'wb') as f:
            cPickle.dump({
                '__chunk_i': i,
                '__chunk_n': n_chunks,
                'pickle_file': cluster_pickle_file,
            }, f)
        d = {
            '$chunk.subreadset_id': subreads_file,
            '$chunk.contigset_id': isoforms_file,
            '$chunk.nfl_pickle_id': nfl_pickle_file,
            '$chunk.pickle_id': chunk_pickle_file,
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
    write_pipeline_chunks(chunks, output_json,
                          "created by pbtranscript.tasks.scatter_clusters")
    return 0

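# For illustration only: each per-chunk pickle written above carries just the
# chunk index, the chunk count, and the path of the full cluster pickle. A
# hypothetical downstream consumer (not part of this section) could recover
# its share of the clusters along these lines:
def load_cluster_chunk(chunk_pickle_file):
    with open(chunk_pickle_file, 'rb') as f:
        meta = cPickle.load(f)
    with open(meta['pickle_file'], 'rb') as f:
        uc = cPickle.load(f)['uc']
    # One possible split: take every __chunk_n-th key, starting at __chunk_i.
    keys = sorted(uc.keys())
    return {k: uc[k] for k in keys[meta['__chunk_i']::meta['__chunk_n']]}
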
def test_write_chunks(self):

    def f(i):
        return {
            "{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/to_movie-{i}.fofn".format(i=i),
            "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/rgn_{i}.fofn".format(i=i)
        }

    def to_i(i):
        return "chunk-id-{i}".format(i=i)

    def to_p(i):
        return PipelineChunk(to_i(i), **f(i))

    nchunks = 5
    pipeline_chunks = [to_p(i) for i in range(nchunks)]
    log.debug(pipeline_chunks)
    tmp_dir = get_temp_dir("pipeline-chunks")
    tmp_name = get_temp_file("_chunk.json", tmp_dir)

    write_pipeline_chunks(pipeline_chunks, tmp_name, "Example chunk file")

    pchunks = load_pipeline_chunks_from_json(tmp_name)
    assert len(pchunks) == nchunks

def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
    """
    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

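# get_datum_from_chunks_by_chunk_key() is used above but not defined in this
# section. Assuming each PipelineChunk exposes its key->datum mapping (e.g.
# via a chunk_d attribute, as in pbcommand), a minimal stand-in could be:
def get_datum_from_chunks_by_chunk_key(chunks, chunk_key):
    """Collect the datum stored under chunk_key from every chunk, in order."""
    return [chunk.chunk_d[chunk_key] for chunk in chunks]
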
def run_main(json_file, output_json_file, max_nchunks):
    """
    Split a JSON of scripts into multiple JSON files, each containing a
    subset of the scripts.
    Parameters:
      json_file -- json <- dict{p_id: args}, where
                   args <- dict{'script_fn': script_fn, ...}
      output_json_file -- chunk.json
    """
    a = json.load(open(json_file, 'r'))
    if len(a) == 0:
        raise ValueError("script json %s is empty" % json_file)

    out_dir = op.dirname(output_json_file)
    num_chunks = min(max_nchunks, len(a))
    num_scripts_in_chunks = num_items_in_chunks(num_items=len(a),
                                                num_chunks=num_chunks)

    # Writing chunk.json
    base_name = "spawned_json_w_scripts_chunk"
    chunks = []
    spawned_jsons = []
    p_ids = sorted(a.keys())
    for chunk_idx in range(0, num_chunks):
        chunk_id = "_".join([base_name, str(chunk_idx)])
        spawned_json_file = op.join(out_dir, chunk_id + ".json")
        spawned_txt_file = op.join(out_dir, chunk_id + "_done.txt")
        # make a chunk
        d = {Constants.CHUNK_KEYS[0]: spawned_json_file,
             Constants.CHUNK_KEYS[1]: spawned_txt_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        # make content for the spawned json
        scripts_dict = dict()
        num_scripts = num_scripts_in_chunks[chunk_idx]
        for script_idx in range(0, num_scripts):
            p_id = p_ids[script_idx]
            scripts_dict[p_id] = a[p_id]
        # delete p_ids[0: num_scripts]
        p_ids = p_ids[num_scripts:]
        # Write scripts_dict, a dict of {p_id: args}, to the spawned json
        with open(spawned_json_file, 'w') as writer:
            writer.write(json.dumps(scripts_dict) + "\n")
        spawned_jsons.append(spawned_json_file)
        with open(spawned_txt_file, 'w') as writer:
            writer.write("%s" % spawned_json_file)

    if len(p_ids) != 0:
        raise AssertionError("Scripts of p_ids %s are not scattered." %
                             repr(p_ids))

    log.info("Spawning %s into %d files", json_file, num_chunks)
    log.debug("Spawned files: %s.", ", ".join(spawned_jsons))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def run_main(json_file, output_json_file, max_nchunks):
    """
    Split a JSON of scripts into multiple JSON files, each containing a
    subset of the scripts.
    Parameters:
      json_file -- json <- dict{p_id: args}, where
                   args <- dict{'script_fn': script_fn, ...}
      output_json_file -- chunk.json
    """
    a = json.load(open(json_file, 'r'))
    if len(a) == 0:
        raise ValueError("script json %s is empty" % json_file)

    out_dir = op.dirname(output_json_file)
    num_chunks = min(max_nchunks, len(a))
    num_scripts_in_chunks = num_items_in_chunks(num_items=len(a),
                                                num_chunks=num_chunks)

    # Writing chunk.json
    base_name = "spawned_json_w_scripts_chunk"
    chunks = []
    spawned_jsons = []
    p_ids = sorted(a.keys())
    for chunk_idx in range(0, num_chunks):
        chunk_id = "_".join([base_name, str(chunk_idx)])
        spawned_json_file = op.join(out_dir, chunk_id + ".json")
        # make a chunk
        d = {Constants.CHUNK_KEYS[0]: spawned_json_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        # make content for the spawned json
        scripts_dict = dict()
        num_scripts = num_scripts_in_chunks[chunk_idx]
        for script_idx in range(0, num_scripts):
            p_id = p_ids[script_idx]
            scripts_dict[p_id] = a[p_id]
        # delete p_ids[0: num_scripts]
        p_ids = p_ids[num_scripts:]
        # Write scripts_dict, a dict of {p_id: args}, to the spawned json
        with open(spawned_json_file, 'w') as writer:
            writer.write(json.dumps(scripts_dict) + "\n")
        spawned_jsons.append(spawned_json_file)

    if len(p_ids) != 0:
        raise AssertionError("Scripts of p_ids %s are not scattered." %
                             repr(p_ids))

    log.info("Spawning %s into %d files", json_file, num_chunks)
    log.debug("Spawned files: %s.", ", ".join(spawned_jsons))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

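# num_items_in_chunks() is used by both variants above but not defined here.
# Its expected contract is a list of per-chunk counts summing to num_items;
# a minimal sketch of one even split (assumed, not the actual helper) is:
def num_items_in_chunks(num_items, num_chunks):
    """Return a list of num_chunks counts that sum to num_items."""
    size, extra = divmod(num_items, num_chunks)
    return [size + 1 if i < extra else size for i in range(num_chunks)]
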
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle.
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def test_write_chunks(self):

    def f(i):
        return {"{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/to_movie-{i}.fofn".format(i=i),
                "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/rgn_{i}.fofn".format(i=i)}

    to_i = lambda i: "chunk-id-{i}".format(i=i)
    to_p = lambda i: PipelineChunk(to_i(i), **f(i))

    nchunks = 5
    pipeline_chunks = [to_p(i) for i in xrange(nchunks)]
    log.debug(pipeline_chunks)
    tmp_dir = get_temp_dir("pipeline-chunks")
    tmp_name = get_temp_file("_chunk.json", tmp_dir)

    write_pipeline_chunks(pipeline_chunks, tmp_name, "Example chunk file")

    pchunks = load_pipeline_chunks_from_json(tmp_name)
    self.assertEquals(len(pchunks), nchunks)

def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """
    Spawn partial chunk tasks in pickle.
    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      ccs_file -- ccs dataset
      output_json_file -- chunk.json
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all([isinstance(r, PartialChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
    """
    # Check the size of fastq_file before scattering, so that a meaningful
    # error message can be displayed instead of 'float division by zero'.
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {Constants.CHUNK_KEYS[0]: fastq_file,
             Constants.CHUNK_KEYS[1]: gmap_ref_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def run_main(polish_chunks_pickle_file, sentinel_file, subreads_file,
             output_json_file, max_nchunks):
    """
    Spawn polish chunk tasks in pickle.
    Parameters:
      polish_chunks_pickle_file -- ChunkTasksPickle of PolishChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      subreads_file -- subreads dataset
      output_json_file -- chunk.json
    """
    p = ChunkTasksPickle.read(polish_chunks_pickle_file)
    assert all([isinstance(r, PolishChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_polish_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: subreads_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", polish_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def write_chunks_to_json(chunks, chunk_file):
    log.debug("Writing {n} chunks to {f}.".format(n=len(chunks), f=chunk_file))
    write_pipeline_chunks(
        chunks, chunk_file,
        "Chunks written at {d}".format(d=datetime.datetime.now()))
    return 0