def make_graph(self, in_queue, args):
    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(args.enqueue))
    # read_files: [(file_path, (mmaped_file_handles, a gen)) x N]
    read_files = tuple((path_base,) + tuple(read_gen)
                       for path_base, read_gen in zip(
                           parallel_key_dequeue,
                           pipeline.local_read_pipeline(
                               upstream_tensors=parallel_key_dequeue,
                               columns=self.columns)))
    # need to use tf.tuple to make sure that these are both made ready at the same time
    to_central_gen = (a[1:] for a in read_files)
    pass_around_gen = ((a[0],) for a in read_files)

    aligner_results, run_first = tuple(
        self.make_central_pipeline(args=args,
                                   input_gen=to_central_gen,
                                   pass_around_gen=pass_around_gen))

    to_writer_gen = tuple(
        (buffer_list_handle, record_id, first_ordinal, num_records, file_basename)
        for buffer_list_handle, num_records, first_ordinal, record_id, file_basename
        in aligner_results)

    written_records = (tuple(a) for a in pipeline.local_write_pipeline(
        upstream_tensors=to_writer_gen,
        compressed=(args.compress_parallel > 0),
        record_types=self.write_columns))

    final_output_gen = zip(
        written_records,
        ((record_id, num_records, first_ordinal, file_basename)
         for _, num_records, first_ordinal, record_id, file_basename
         in aligner_results))
    output = (b + (a,) for a, b in final_output_gen)
    return output, run_first
def import_sga_local(in_queue, args, outdir=None, parallel_parse=1, feature="NFAT", path="."):
    """
    in_queue: queue of tensors with chunk key strings
    parallel_parse: the parallelism for processing records (decompression)
    feature: feature name passed to the import_sga op
    path: output path for the imported data
    """
    manifest = args.dataset
    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])
    result_chunk_list = [list(c) for c in result_chunks]
    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)
    parsed_result = pipeline.join(parsed_results_list, parallel=1, capacity=8, multi=True)[0]

    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]

    result = persona_ops.import_sga(results_handle=result_buf,
                                    num_records=num_results,
                                    ref_sequences=ref_seqs,
                                    ref_seq_sizes=ref_lens,
                                    feature=feature,
                                    path=path,
                                    name="importsgaop")
    return result
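# Usage sketch for import_sga_local (added illustration; the chunk key and the
# driver flow below are hypothetical assumptions, not the project's CLI):
def _example_run_import_sga(args):
    # A minimal TF 1.x driver, assuming `args.dataset` holds the parsed manifest
    # and "dataset_chunk_0" stands in for a real chunk key.
    keys = tf.train.string_input_producer(["dataset_chunk_0"], num_epochs=1, shuffle=False)
    sga = import_sga_local(in_queue=keys, args=args, parallel_parse=1)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        sess.run(sga)
        coord.request_stop()
        coord.join(threads)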
def agd_mark_duplicates_local(in_queue, outdir=None, parallel_parse=1, parallel_write=1, parallel_compress=1):
    """
    in_queue: queue of tensors with chunk key strings
    parallel_parse: the parallelism for processing records (decompression)
    parallel_write: the parallelism for writing the marked results
    parallel_compress: the parallelism for compressing the output chunks
    """
    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])
    result_chunk_list = [list(c) for c in result_chunks]
    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)
    # each parsed result is (result_buf, num_records, first_ordinal, record_id)
    parsed_result = pipeline.join(parsed_results_list, parallel=1, capacity=8, multi=True)[0]
    #parsed_results = tf.contrib.persona.persona_in_pipe(key=key, dataset_dir=local_directory,
    #                                                    columns=["results"], parse_parallel=parallel_parse,
    #                                                    process_parallel=1)
    # print(parsed_result)

    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]
    # print(result_buf)

    bpp = persona_ops.buffer_pair_pool(size=0, bound=False, name="output_buffer_pair_pool")
    result_out = persona_ops.agd_mark_duplicates(results_handle=result_buf,
                                                 num_records=num_results,
                                                 buffer_pair_pool=bpp,
                                                 name="markdupsop")

    result_to_write = pipeline.join([result_out, num_results, first_ord, record_id],
                                    parallel=parallel_write, capacity=8, multi=False)

    compressed = compress_pipeline(result_to_write, parallel_compress)

    written = _make_writers(compressed_batch=list(compressed),
                            output_dir=outdir,
                            write_parallelism=parallel_write)

    recs = list(written)
    all_written_keys = pipeline.join(recs, parallel=1, capacity=8, multi=False)
    return all_written_keys
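# Usage sketch for agd_mark_duplicates_local (added illustration; the chunk
# keys and driver loop are hypothetical assumptions):
def _example_run_mark_duplicates(chunk_keys, outdir):
    # Streams every chunk key through the duplicate-marking graph and fetches
    # the written keys until the input queue is exhausted.
    keys = tf.train.string_input_producer(chunk_keys, num_epochs=1, shuffle=False)
    written = agd_mark_duplicates_local(in_queue=keys, outdir=outdir)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            while not coord.should_stop():
                print(sess.run(written))
        except tf.errors.OutOfRangeError:
            pass
        finally:
            coord.request_stop()
            coord.join(threads)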
def make_read_stage(self, gate):
    """
    :param gate:
    :return: a generator of [ id_and_count, [ filename ], [ a list of handles in the order of the columns, NOT STACKED ] ]
    """
    # each item in dequeue_ops' components is a single filename
    dequeue_ops = tuple(gate.dequeue() for _ in range(self.read_parallel))
    filenames = (components[0] for _, components in dequeue_ops)

    # ensure the path prefix ends with a trailing slash before joining
    path_prefix = self.path_prefix
    if path_prefix != "":
        if path_prefix[-1] != "/":
            path_prefix = "{}/".format(path_prefix)
        path_prefix = tf.constant(path_prefix)
        filenames = (tf.string_join((path_prefix, fname)) for fname in filenames)

    read_file_gen = zip(dequeue_ops, pipeline.local_read_pipeline(
        upstream_tensors=filenames,
        columns=self.columns))  # a[1][0] gets the components, which is just a filename
    for a, b in read_file_gen:
        yield tuple(a) + tuple(b)
def make_read_stage(self, local_gate):
    """
    :param local_gate: components in local_gate: just the basename of intermediate files
    :return: a generator of (id_and_count, (stacked column handles, record_id, filename))
    """
    def gen_filenames():
        for i in range(self.read_parallel):
            idc, comp = local_gate.dequeue()
            assert len(comp) == 1
            filename = comp[0]
            yield idc, filename

    ids_and_counts, filenames = zip(*gen_filenames())
    assert len(filenames) > 0

    read_groups = (tuple(a) for a in pipeline.local_read_pipeline(
        delete_after_use=True,
        upstream_tensors=filenames,
        columns=self.columns,
        name="local_read_merge_pipeline"))

    pool = persona_ops.raw_file_system_column_pool(bound=False, size=0)
    convert = partial(persona_ops.raw_file_converter, column_pool=pool)

    def gen_conversion():
        for read_group in read_groups:
            values = tuple(convert(data=file_handle) for file_handle in read_group)
            handles, record_ids = zip(*values)
            assert len(record_ids) > 0
            yield tf.stack(handles, name="stack_raw_filesystem_columns"), record_ids[0]

    for idc, (handles, record_id), filename in zip(ids_and_counts, gen_conversion(), filenames):
        yield idc, (handles, record_id, filename)
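# Consumption sketch for make_read_stage (added illustration): each yielded
# item is (id_and_count, (stacked_column_handles, record_id, filename)). The
# helper below is hypothetical and simply materializes the generator so the
# pieces can be wired into a downstream merge stage.
def _example_consume_read_stage(read_stage_gen):
    staged = []
    for idc, (handles, record_id, filename) in read_stage_gen:
        staged.append((idc, handles, record_id, filename))
    return staged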
def agd_flagstat_local(in_queue, outdir=None, parallel_parse=1, parallel_write=1, parallel_compress=1):
    """
    in_queue: queue of tensors with chunk key strings
    parallel_parse: the parallelism for processing records (decompression)
    """
    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])
    result_chunk_list = [list(c) for c in result_chunks]
    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)
    parsed_result = pipeline.join(parsed_results_list, parallel=1, capacity=8, multi=True)[0]

    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]

    result_out = persona_ops.agd_flagstat(results_handle=result_buf,
                                          num_records=num_results,
                                          name="flagstat")
    return result_out
def export_bam(in_queue, args):
    manifest = args.dataset
    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    #bp_handle = persona_ops.buffer_pool(size=10, bound=False, name="buf_pool")
    #mmap_pool = persona_ops.m_map_pool(size=10, bound=False, name="file_mmap_buffer_pool")

    columns = ["base", "qual", "metadata", "results"]
    num_secondary = 0
    for column in manifest['columns']:
        if 'secondary' in column:
            columns.append(column)
            num_secondary += 1

    print("BAM output using columns: {}".format(columns))
    # TODO provide option for reading from Ceph
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=[in_queue.dequeue()], columns=columns)
    result_chunk_list = [list(c) for c in result_chunks]

    to_parse = pipeline.join(upstream_tensors=result_chunk_list,
                             parallel=args.parallel_parse, multi=True, capacity=8)
    parsed_results = pipeline.agd_reader_multi_column_pipeline(upstream_tensorz=to_parse)
    parsed_results_list = list(parsed_results)
    parsed_result = pipeline.join(parsed_results_list, parallel=1, capacity=8, multi=True)[0]

    # base, qual, meta, result, [secondary], num_recs, first_ord, record_id
    handles = parsed_result[0]
    bases = handles[0]
    quals = handles[1]
    meta = handles[2]
    # give a matrix of all the result columns
    results = tf.stack(handles[3:])

    num_recs = parsed_result[1]
    first_ord = parsed_result[2]

    if args.output_path == "":
        output_path = manifest['name'] + ".bam"
    else:
        output_path = args.output_path

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    sort = manifest['sort'] if 'sort' in manifest else 'unsorted'
    pg_id = "personaAGD"  # TODO get from manifest
    read_group = manifest['name']
    agd_to_bam = persona_ops.agd_output_bam(results_handle=results,
                                            bases_handle=bases,
                                            qualities_handle=quals,
                                            metadata_handle=meta,
                                            num_records=num_recs,
                                            path=output_path,
                                            ref_sequences=ref_seqs,
                                            ref_seq_sizes=ref_lens,
                                            pg_id=pg_id,
                                            read_group=read_group,
                                            sort_order=sort,
                                            num_threads=args.threads)
    return [agd_to_bam], []
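# Usage sketch for export_bam (added illustration; the Namespace fields mirror
# the attributes export_bam reads, but the driver itself is a hypothetical
# assumption):
def _example_export_bam(manifest):
    from argparse import Namespace
    args = Namespace(dataset=manifest, parallel_parse=1, output_path="", threads=4)
    keys = tf.train.string_input_producer([manifest['records'][0]['path']],
                                          num_epochs=1, shuffle=False)
    bam_ops, _ = export_bam(in_queue=keys, args=args)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        coord = tf.train.Coordinator()
        runners = tf.train.start_queue_runners(coord=coord, sess=sess)
        sess.run(bam_ops)
        coord.request_stop()
        coord.join(runners)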
def make_graph(self, in_queue, args):
    # TODO remove the _out when we are satisfied it works correctly
    rec_name = args.dataset['records'][0]['path'][:-1]  # assuming path name is chunk_file_{ordinal}
    #print("Sorting {} chunks".format(len(args.dataset['records'])))
    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(args.sort_read_parallel))

    # read_files: [(file_path, (mmaped_file_handles, a gen)) x N]
    mmap_pool = persona_ops.m_map_pool(name="mmap_pool", size=10, bound=False)
    read_files = list(list(a) for a in pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue,
        columns=self.inter_columns,
        mmap_pool=mmap_pool))

    # need to use tf.tuple to make sure that these are both made ready at the same time
    buf_pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")
    bpp = persona_ops.buffer_pair_pool(size=0, bound=False,
                                       name="local_read_merge_buffer_list_pool")

    sorters = self.make_sort_pipeline(args=args, input_gen=read_files,
                                      buf_pool=buf_pool, bufpair_pool=bpp)
    writers = self.make_inter_writers(sorters, args.dataset_dir, args.write_parallel)
    inter_file_paths = pipeline.join(writers, parallel=1, capacity=3, multi=True,
                                     name="writer_queue")[0]
    inter_file_name = inter_file_paths[-1]

    num_inter_files = int(math.ceil(len(args.dataset['records']) / args.column_grouping))

    # these two queues form a barrier, to force downstream to wait until all
    # intermediate superchunks are ready for merge
    # wait for num_inter_files
    f = tf.train.batch([inter_file_name], batch_size=num_inter_files,
                       name="inter_file_batcher")
    # now output them one by one
    files = tf.train.batch([f], enqueue_many=True, batch_size=1,
                           name="inter_file_output")
    full_path = tf.string_join([args.dataset_dir, "/", files])
    # needs to be a scalar, not shape [1]
    full_path_scalar = tf.reshape(full_path, [])

    # may need to add disk read parallelism here
    merge_cols = self.inter_columns
    #if args.order_by == location_value:
    #    merge_cols = self.merge_result_columns
    #else:
    #    merge_cols = self.merge_meta_columns

    merge_files = list(list(a) for a in pipeline.local_read_pipeline(
        upstream_tensors=[full_path_scalar],
        sync=False,
        columns=merge_cols,
        mmap_pool=mmap_pool))

    stacked_chunks = []
    for f in merge_files:
        stacked_chunks.append(tf.stack(f))

    # batch all the intermediate superchunks that are now mmap'd
    chunks_to_merge = tf.train.batch(stacked_chunks, batch_size=num_inter_files,
                                     name="mapped_inter_files_to_merge")
    # out_tuple = [results, base, qual, meta, record_name, first_ord, num_recs, file_name]
    merge_tuple = self.make_merge_pipeline(args=args, chunks_to_merge=chunks_to_merge,
                                           record_name=rec_name, bpp=bpp)
    compress_queue = pipeline.join(merge_tuple, capacity=4,
                                   parallel=args.compress_parallel,
                                   multi=False, name="to_compress")
    compressed_bufs = list(self.make_compressors(compress_queue, buf_pool))
    #print(compressed_bufs)
    writers = list(list(a) for a in self.make_writers(args, compressed_bufs))
    return writers, []
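# Isolated sketch of the two-queue barrier used above (added illustration with
# a hypothetical helper name): tf.train.batch with batch_size=n blocks until n
# elements have been produced, and a second batch with enqueue_many=True then
# re-emits them one at a time.
def _example_batch_barrier(tensor, n):
    # gather n elements before anything flows downstream
    gathered = tf.train.batch([tensor], batch_size=n, name="barrier_batcher")
    # re-emit the gathered elements one by one
    one_at_a_time = tf.train.batch([gathered], enqueue_many=True, batch_size=1,
                                   name="barrier_output")
    return one_at_a_time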