def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Parse the config, persist it as JSON, and launch the workflow.

    Sets the module-level ``LOG``, parses ``input_config_fn``, validates the
    'General' section via ``check_general_config``, writes the whole config
    to ``./config.json`` and hands everything to ``run``.

    Any exception raised while parsing or dumping the config is logged and
    re-raised unchanged.
    """
    global LOG
    LOG = run_support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = run_support.parse_cfg_file(input_config_fn)
        import json
        pretty_cfg = json.dumps(
            config, indent=2, separators=(',', ': '), sort_keys=True)
        LOG.info('cfg=\n{}'.format(pretty_cfg))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise

    general = config['General']
    check_general_config(general, input_config_fn)
    fofn_path = general['input_fofn']

    # The workflow is "squashed" only for genome sizes strictly between
    # 0 and 1,000,000.
    size = int(general['genome_size'])
    squash = 0 < size < 1000000

    wf = PypeProcWatcherWorkflow(
        job_defaults=config['job.defaults'],
        squash=squash,
    )
    general['ver'] = '100'

    # Store config as JSON, available to many tasks.
    config_fn = './config.json'  # must not be in a task-dir
    io.serialize(config_fn, config)

    run(
        wf,
        config,
        os.path.abspath(config_fn),
        input_fofn_fn=fofn_path,
    )
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Parse a legacy FALCON config and launch the workflow.

    Sets the module-level ``fc_run_logger``, converts the parsed config with
    ``support.get_dict_from_old_falcon_cfg``, builds a
    ``PypeProcWatcherWorkflow`` from its job/watcher settings and calls
    ``run`` with the absolute path of the config file.

    Any exception raised while parsing the config is logged and re-raised
    unchanged.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(
            support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception(
            'Failed to parse config "{}".'.format(input_config_fn))
        raise

    input_fofn_plf = makePypeLocalFile(config['input_fofn'])

    # A missing (None) genome_size must not be compared with an int: that
    # raises TypeError on Python 3. Treating it as 0 keeps the Python 2
    # outcome (squash stays False).
    genome_size = config.get('genome_size') or 0
    squash = 0 < genome_size < 1000000

    wf = PypeProcWatcherWorkflow(
        job_type=config['job_type'],
        job_queue=config['job_queue'],
        sge_option=config.get('sge_option', ''),
        watcher_type=config['pwatcher_type'],
        watcher_directory=config['pwatcher_directory'],
        use_tmpdir=config.get('use_tmpdir'),
        squash=squash,
    )
    run(
        wf,
        config,
        os.path.abspath(input_config_fn),
        input_fofn_plf=input_fofn_plf,
    )
def main(args):
    """Run the pipeline stages in order on a local, blocking workflow.

    Calls ``run_build_db``, ``run_build_idx``, ``run_overlapper`` and
    ``run_ovlp_to_ctg`` in sequence, logging after each one, and finally
    ``run_cns`` when ``args['--with-consensus']`` is truthy.
    """
    wf = PypeProcWatcherWorkflow(
        job_defaults={
            'njobs': 1,
            'NPROC': 1,
            'MB': 24000,
            'submit': submit,
            'job_type': 'local',
            'pwatcher_type': 'blocking',
        },
    )

    reads_lst = os.path.abspath(args["<reads.lst>"])

    db_prefix, read_db = run_build_db(wf, args, reads_lst)
    LOG.info('Finished: {}'.format(db_prefix))

    idx_prefix, read_idx = run_build_idx(wf, args, db_prefix)
    LOG.info('Finished: {}'.format(idx_prefix))

    # The overlapper takes the union of the DB and index outputs.
    overlap_inputs = dict(read_db)
    overlap_inputs.update(read_idx)
    ovlp_out = run_overlapper(wf, args, db_prefix, idx_prefix, overlap_inputs)
    LOG.info('Finished: {}'.format(ovlp_out))

    ctg_out = run_ovlp_to_ctg(wf, args, db_prefix, read_db, ovlp_out)
    LOG.info('Finished: {}'.format(ctg_out))

    if not args['--with-consensus']:
        return
    cns_out = run_cns(
        wf, args, db_prefix, read_db, idx_prefix, read_idx, ctg_out)
    LOG.info('Finished: {}'.format(cns_out))
def get_read_hctg_map(asm_dir, hasm_dir, read_to_contig_map_fn):
    """Run the single task that writes the read-to-haplotig map.

    Inputs are the raw-read and pread id files under ``asm_dir`` plus the
    h/p contig edge and id files under ``hasm_dir``; the only output is
    ``read_to_contig_map_fn``. Blocks until the workflow has refreshed its
    targets.
    """
    wf = PypeProcWatcherWorkflow(
        max_jobs=12,  # TODO: Why was NumThreads ever set? There is only one task!
    )

    def _asm(relpath):
        return makePypeLocalFile(os.path.join(asm_dir, relpath))

    def _hasm(name):
        return makePypeLocalFile(os.path.join(hasm_dir, name))

    task_inputs = {
        'rawread_id_file': _asm('read_maps/dump_rawread_ids/rawread_ids'),
        'pread_id_file': _asm('read_maps/dump_pread_ids/pread_ids'),
        'h_ctg_edges': _hasm('all_h_ctg_edges'),
        'p_ctg_edges': _hasm('all_p_ctg_edges'),
        'h_ctg_ids': _hasm("all_h_ctg_ids"),
    }

    # The output directory is not created here; the workflow does this.
    task_outputs = {
        'read_to_contig_map': makePypeLocalFile(read_to_contig_map_fn),
    }

    make_task = PypeTask(inputs=task_inputs, outputs=task_outputs)
    wf.addTask(make_task(generate_read_to_hctg_map))
    wf.refreshTargets()  # block
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Load the 'General' config section into ``CFG`` and run the workflow.

    Configures logging, stores the parsed 'General' section in the
    module-level ``CFG``, builds the workflow from its job/watcher settings
    (``max_jobs`` defaults to 2) and calls ``run``.
    """
    global CFG
    setup_logger(logger_config_fn)
    LOG.info('config={!r}, log={!r}'.format(input_config_fn, logger_config_fn))

    CFG = parse_config(input_config_fn)['General']
    LOG.info('CFG=\n{}'.format(pprint.pformat(CFG)))

    workflow_options = {
        'job_type': CFG['job_type'],
        'job_queue': CFG['job_queue'],
        'watcher_type': CFG['watcher_type'],
        'max_jobs': CFG.get('max_jobs', 2),
    }
    run(PypeProcWatcherWorkflow(**workflow_options), CFG)
def setup_workflow():
    """Return a fresh blocking workflow with fixed job defaults.

    Clears ``PRODUCERS`` first so previously defined PypeTasks are forgotten.
    """
    PRODUCERS.clear()  # Forget any PypeTasks already defined.

    # Alternatives that are currently disabled:
    #   'submit': 'bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}'
    #   'JOB_OPTS': '-pe smp 8 -q bigmem'
    #   'pwatcher_directory': config.get('pwatcher_directory', 'mypwatcher')
    #   'use_tmpdir': '/scratch'
    defaults = dict(
        job_type='string',
        submit='bash -C ${CMD}',
        pwatcher_type='blocking',
        njobs=4,
    )
    return PypeProcWatcherWorkflow(job_defaults=defaults)
def setup_workflow():
    """Return a blocking workflow built from a fixed, in-function config.

    ``sge_option`` and ``use_tmpdir`` are not present in the config, so they
    are passed to the workflow as ``None``.
    """
    # Alternatives that are currently disabled:
    #   'job_queue': 'bash -C ${CMD}'
    #   'sge_option': '-pe smp 8 -q bigmem'
    #   watcher_directory=config.get('pwatcher_directory', 'mypwatcher')
    #   'use_tmpdir': '/scratch'
    config = dict(
        job_type='string',
        job_queue='bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}',
        pwatcher_type='blocking',
    )

    job_type = config['job_type']
    job_queue = config.get('job_queue')
    sge_option = config.get('sge_option')
    watcher_type = config.get('pwatcher_type')
    use_tmpdir = config.get('use_tmpdir')

    return PypeProcWatcherWorkflow(
        max_jobs=4,
        job_type=job_type,
        job_queue=job_queue,
        sge_option=sge_option,
        watcher_type=watcher_type,
        use_tmpdir=use_tmpdir,
    )
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Parse the config, persist it as JSON, and run with a rule writer.

    Sets the module-level ``LOG``, parses ``input_config_fn``, requires
    ``input_fofn`` in the 'General' section, writes the whole config to
    ``./config.json`` and calls ``run`` while ``foo.snake`` is open for the
    ``SnakemakeRuleWriter``.

    Any exception raised while parsing or dumping the config is logged and
    re-raised unchanged.
    """
    global LOG
    LOG = support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.parse_cfg_file(input_config_fn)
        import json
        pretty_cfg = json.dumps(
            config, indent=2, separators=(',', ': '), sort_keys=True)
        LOG.info('cfg=\n{}'.format(pretty_cfg))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise

    general = config['General']
    assert 'input_fofn' in general, 'Missing "input_fofn" in {}.'.format(
        input_config_fn)
    fofn_plf = makePypeLocalFile(general['input_fofn'])

    # genome_size is optional here; absent means 0, which disables squash.
    size = int(general.get('genome_size', '0'))
    squash = 0 < size < 1000000

    wf = PypeProcWatcherWorkflow(
        job_defaults=config['job.defaults'],
        squash=squash,
    )
    general['ver'] = '100'

    config_fn = './config.json'  # must not be in a task-dir
    io.serialize(config_fn, config)

    # run() is called while the rule file is still open.
    with open('foo.snake', 'w') as snake_stream:
        rule_writer = snakemake.SnakemakeRuleWriter(snake_stream)
        run(
            wf,
            config,
            rule_writer,
            os.path.abspath(config_fn),
            input_fofn_plf=fofn_plf,
        )
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    """Dump raw-read and pread ids, then build the read-to-contig map.

    All outputs go under ``<asm_dir>/read_maps``. The two id-dump tasks are
    run to completion first; the map task is then added and run, blocking
    until the workflow has refreshed its targets.
    """
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    # The config-driven options (job_type, job_queue, sge_option,
    # watcher_type, watcher_directory) are disabled; only max_jobs is set.
    wf = PypeProcWatcherWorkflow(max_jobs=12)

    # --- stage 1: dump raw-read ids ---
    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))
    make_rawread_task = PypeTask(
        inputs={'rawread_db': rawread_db},
        outputs={'rawread_id_file': rawread_id_file},
        TaskType=PypeThreadTaskBase,
        URL='task://localhost/dump_rawread_ids',
    )
    wf.addTask(make_rawread_task(dump_rawread_ids))

    # --- stage 1: dump pread ids ---
    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))
    make_pread_task = PypeTask(
        inputs={'pread_db': pread_db},
        outputs={'pread_id_file': pread_id_file},
        TaskType=PypeThreadTaskBase,
        URL='task://localhost/dump_pread_ids',
    )
    wf.addTask(make_pread_task(dump_pread_ids))

    wf.refreshTargets()  # block

    # --- stage 2: map reads to contigs ---
    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))
    map_inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths,
    }
    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))
    make_map_task = PypeTask(
        inputs=map_inputs,
        outputs={'read_to_contig_map': read_to_contig_map},
        TaskType=PypeThreadTaskBase,
        URL='task://localhost/get_ctg_read_map',
    )
    wf.addTask(make_map_task(generate_read_to_ctg_map))

    wf.refreshTargets()  # block
def main(argv=sys.argv):
    """Drive the quiver stage: track reads, scatter, gather, concatenate.

    Args:
        argv: Command-line style argument list; ``argv[1]`` is the path of
            the configuration file. Defaults to ``sys.argv``.

    Exits with status 1 (after writing a message to stderr) when no
    configuration file is given. Settings missing from the config file fall
    back to the defaults hard-coded below.
    """
    global LOG
    LOG = support.setup_logger(None)

    # Use the argv that was passed in; the parameter used to be ignored in
    # favour of sys.argv.
    if len(argv) < 2:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write(
            'you need to provide a configuration file to specific a couple cluster running environment'
            '\n')
        sys.exit(1)

    config_fn = argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    parser = ConfigParser.ConfigParser()
    parser.read(config_fn)

    def _option(section, option, default):
        """Return the configured value for section/option, else default."""
        if parser.has_option(section, option):
            return parser.get(section, option)
        return default

    job_type = _option('General', 'job_type', 'SGE')
    sge_track_reads = _option('Unzip', 'sge_track_reads',
                              ' -pe smp 12 -q bigmem')
    sge_quiver = _option('Unzip', 'sge_quiver', ' -pe smp 24 -q bigmem ')
    smrt_bin = _option(
        'Unzip', 'smrt_bin',
        '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/')

    # A relative input_bam_fofn is resolved against the config file's
    # directory, not the current working directory.
    input_bam_fofn = _option('Unzip', 'input_bam_fofn', 'input_bam.fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if parser.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = parser.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin,
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs)

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config,
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    wf.addTask(make_track_reads_task(task_track_reads))

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    # The scatter output must exist before the quiver jobs can be created.
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')

    def _write_pairs(plf, pairs):
        """Write one 'fasta fastq' line per pair, in sorted order."""
        with open(fn(plf), 'w') as ofs:
            for cns_fasta_fn, cns_fastq_fn in sorted(pairs):
                ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    _write_pairs(gathered_p_ctg_plf, p_ctg_out)
    _write_pairs(gathered_h_ctg_plf, h_ctg_out)

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
def phasing(args):
    """Run the four phasing tasks for one contig on a single-job workflow.

    Reads ``bam``, ``fasta``, ``ctg_id``, ``base_dir`` and ``samtools`` from
    ``args``. The reference sequence is the upper-cased sequence of the FASTA
    record whose first name token equals ``ctg_id`` (empty string when no
    record matches). The tasks are registered in this order: make_het_call,
    generate_association_table, get_phased_blocks, get_phased_reads.
    """
    contig = args.ctg_id
    out_root = args.base_dir

    # No early exit: if several records match, the last one wins.
    ref_seq = ""
    for record in FastaReader(args.fasta):
        if record.name.split()[0] == contig:
            ref_seq = record.sequence.upper()

    wf = PypeProcWatcherWorkflow(max_jobs=1)

    def _plf(*parts):
        return makePypeLocalFile(os.path.join(out_root, contig, *parts))

    # --- het calling ---
    bam_file = makePypeLocalFile(args.bam)
    vmap_file = _plf('het_call', "variant_map")
    vpos_file = _plf('het_call', "variant_pos")
    q_id_map_file = _plf('het_call', "q_id_map")
    het_call_params = {
        "ctg_id": contig,
        "ref_seq": ref_seq,
        "base_dir": out_root,
        "samtools": args.samtools,
    }
    het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={
            "vmap_file": vmap_file,
            "vpos_file": vpos_file,
            "q_id_map_file": q_id_map_file,
        },
        parameters=het_call_params,
    )(make_het_call)
    wf.addTasks([het_call_task])

    # --- association table ---
    atable_file = _plf('g_atable', "atable")
    atable_params = {"ctg_id": contig, "base_dir": out_root}
    atable_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters=atable_params,
    )(generate_association_table)
    wf.addTasks([atable_task])

    # --- phased blocks ---
    phased_variant_file = _plf('get_phased_blocks', "phased_variants")
    blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
    )(get_phased_blocks)
    wf.addTasks([blocks_task])

    # --- phased reads ---
    phased_read_file = _plf("phased_reads")
    reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file,
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": contig},
    )(get_phased_reads)
    wf.addTasks([reads_task])

    wf.refreshTargets()
def unzip_all(config):
    """Run the unzip stages: track reads, align, phase, then haplotype asm.

    Args:
        config: Mapping with the job/watcher settings, the per-stage
            ``sge_*`` options and the ``unzip_*_concurrent_jobs`` limits
            read below.

    The workflow is refreshed (blocking) after each stage because the set of
    contigs is only known once ``./3-unzip/reads/ctg_list`` has been written
    by the track-reads task.
    """
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    # --- stage 1: track reads ---
    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={
            'job_done': job_done,
            'ctg_list_file': ctg_list_file,
        },
        parameters=parameters,
        wdir=wdir,
    )
    wf.addTask(make_track_reads_task(task_track_reads))
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    # Blank lines are skipped: an empty contig id would otherwise create
    # tasks for the non-existent files '_ref.fa' / '_reads.fa'.
    with open('./3-unzip/reads/ctg_list') as f:
        ctg_ids = [row.strip() for row in f if row.strip()]

    aln1_outs = {}
    all_ctg_out = {}

    # --- stage 2: one blasr alignment task per contig ---
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(blasr_dir,
                         'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))

        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'read_fasta': read_fasta,
            },
            outputs={
                'ctg_aln_out': ctg_aln_out,
                'job_done': job_done,
            },
            parameters=parameters,
        )
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(make_blasr_task(task_run_blasr))
    wf.refreshTargets()

    # --- stage 3: one phasing task per contig ---
    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        # NOTE(review): read_fasta is not used by the phasing task; kept in
        # case makePypeLocalFile has side effects -- confirm, then remove.
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(
            os.path.join(phasing_dir,
                         'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(
                wd, 'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        # Not declared as an output of the phasing task (implicit output?).
        all_ctg_out['r2p.{ctg_id}'.format(ctg_id=ctg_id)] = rid_to_phase_out

        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'aln_bam': ctg_aln_out,
            },
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        wf.addTask(make_phasing_task(task_phasing))
    wf.refreshTargets()

    # --- stage 4: merge rid_to_phase files and run the haplotype assembly ---
    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    # Same keys as before, but in a copy: the previous dict is still held by
    # the task it was given to and must not be modified behind its back.
    hasm_parameters = dict(parameters)
    hasm_parameters['wd'] = hasm_wd
    hasm_parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=hasm_parameters,
    )
    wf.addTask(make_hasm_task(task_hasm))

    wf.refreshTargets()
import shlex
import os


def make_dirs(d):
    """Create directory ``d`` (with parents) unless it already exists."""
    if os.path.isdir(d):
        return
    os.makedirs(d)


# Fixed stage directories, resolved against the current working directory.
rawread_dir = os.path.abspath("./0-rawreads")
pread_dir = os.path.abspath("./1-preads_ovl")
asm_dir = os.path.abspath("./3-unzip/")

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

wf = PypeProcWatcherWorkflow(max_jobs=12)

rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))


@PypeTask(
    inputs={"rawread_db": rawread_db},
    outputs={"rawread_id_file": rawread_id_file},
    TaskType=PypeThreadTaskBase,
    URL="task://localhost/dump_rawread_ids",
)
def dump_rawread_ids(self):
    """Write the first field of each `DBshow -n` line, minus '>', to the id file."""
    db_path = fn(self.rawread_db)
    ids_path = fn(self.rawread_id_file)
    # The exit status of the shell pipeline is not checked.
    command = "DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (
        db_path, ids_path)
    os.system(command)


wf.addTask(dump_rawread_ids)
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    """Dump raw-read and pread ids, then build the read-to-contig map.

    All outputs go under ``<asm_dir>/read_maps``. The two id-dump tasks are
    run to completion first; the map task is then added and run, blocking
    until the workflow has refreshed its targets. The task bodies come from
    ``pype_tasks``.
    """
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    # The config-driven options (job_type, job_queue, sge_option,
    # watcher_type, watcher_directory) are disabled; only max_jobs is set.
    wf = PypeProcWatcherWorkflow(max_jobs=12)

    # --- stage 1: dump raw-read ids ---
    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))
    make_rawread_task = PypeTask(
        inputs={'rawread_db': rawread_db},
        outputs={'rawread_id_file': rawread_id_file},
    )
    wf.addTask(make_rawread_task(pype_tasks.task_dump_rawread_ids))

    # --- stage 1: dump pread ids ---
    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))
    make_pread_task = PypeTask(
        inputs={'pread_db': pread_db},
        outputs={'pread_id_file': pread_id_file},
    )
    wf.addTask(make_pread_task(pype_tasks.task_dump_pread_ids))

    wf.refreshTargets()  # block

    # --- stage 2: map reads to contigs ---
    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))
    map_inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths,
    }
    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))
    make_map_task = PypeTask(
        inputs=map_inputs,
        outputs={'read_to_contig_map': read_to_contig_map},
    )
    wf.addTask(make_map_task(pype_tasks.task_generate_read_to_ctg_map))

    wf.refreshTargets()  # block
def main(argv=sys.argv):
    """Drive the quiver stage: track reads, scatter, gather, concatenate.

    Args:
        argv: Command-line style argument list; ``argv[1]`` is the path of
            the configuration file. Defaults to ``sys.argv``.

    Exits with status 1 (after writing a message to stderr) when no
    configuration file is given. Settings missing from the config file fall
    back to the defaults hard-coded below.
    """
    global LOG
    LOG = support.setup_logger(None)

    # Use the argv that was passed in; the parameter used to be ignored in
    # favour of sys.argv.
    if len(argv) < 2:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write(
            'you need to provide a configuration file to specific a couple cluster running environment'
            '\n')
        sys.exit(1)

    config_fn = argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    parser = ConfigParser.ConfigParser()
    parser.read(config_fn)

    def _option(section, option, default):
        """Return the configured value for section/option, else default."""
        if parser.has_option(section, option):
            return parser.get(section, option)
        return default

    job_type = _option('General', 'job_type', 'SGE')
    job_queue = _option('General', 'job_queue', 'default')
    pwatcher_type = _option('General', 'pwatcher_type', 'fs_based')
    sge_track_reads = _option('Unzip', 'sge_track_reads',
                              ' -pe smp 12 -q bigmem')
    sge_quiver = _option('Unzip', 'sge_quiver', ' -pe smp 24 -q bigmem ')
    smrt_bin = _option(
        'Unzip', 'smrt_bin',
        '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/')

    # A relative input_bam_fofn is resolved against the config file's
    # directory, not the current working directory.
    input_bam_fofn = _option('Unzip', 'input_bam_fofn', 'input_bam.fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if parser.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = parser.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'job_queue': job_queue,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'pwatcher_type': pwatcher_type,
        'smrt_bin': smrt_bin,
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    # 'sge_option' and 'use_tmpdir' are never put into config above, so the
    # corresponding .get() calls pass None to the workflow.
    wf = PypeProcWatcherWorkflow(
        max_jobs=quiver_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    parameters = {
        'sge_option': config['sge_track_reads'],
    }
    input_bam_fofn_fn = config['input_bam_fofn']
    input_bam_fofn_plf = makePypeLocalFile(input_bam_fofn_fn)
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        './4-quiver/reads/track_reads_h_done')
    make_track_reads_task = PypeTask(
        inputs={
            'input_bam_fofn': input_bam_fofn_plf,
            'hasm_done': hasm_done_plf,
        },
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    wf.addTask(make_track_reads_task(task_track_reads))

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    parameters = {
        'config': config,
    }
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters=parameters,
    )
    wf.addTask(make_task(task_scatter_quiver))
    # The scatter output must exist before the quiver jobs can be created.
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        wf, scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')

    def _write_pairs(plf, pairs):
        """Write one 'fasta fastq' line per pair, in sorted order."""
        with open(fn(plf), 'w') as ofs:
            for cns_fasta_fn, cns_fastq_fn in sorted(pairs):
                ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    _write_pairs(gathered_p_ctg_plf, p_ctg_out)
    _write_pairs(gathered_h_ctg_plf, h_ctg_out)

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
def unzip_all(config):
    """Run the unzip stages: track reads, align, phase, then haplotype asm.

    Args:
        config: Mapping with the job/watcher settings, the per-stage
            ``sge_*`` options and the ``unzip_*_concurrent_jobs`` limits
            read below.

    The workflow is refreshed (blocking) after each stage because the set of
    contigs is only known once ``./3-unzip/reads/ctg_list`` has been written
    by the track-reads task.
    """
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    # --- stage 1: track reads ---
    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={'job_done': job_done, 'ctg_list_file': ctg_list_file},
        parameters=parameters,
        wdir=wdir,
    )
    wf.addTask(make_track_reads_task(task_track_reads))
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    # Blank lines are skipped: an empty contig id would otherwise create
    # tasks for the non-existent files '_ref.fa' / '_reads.fa'.
    with open('./3-unzip/reads/ctg_list') as f:
        ctg_ids = [row.strip() for row in f if row.strip()]

    aln1_outs = {}
    all_ctg_out = {}

    # --- stage 2: one blasr alignment task per contig ---
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(
            blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(os.path.join(
            blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))

        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
            outputs={'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
            parameters=parameters,
        )
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(make_blasr_task(task_run_blasr))
    wf.refreshTargets()

    # --- stage 3: one phasing task per contig ---
    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        # NOTE(review): read_fasta is not used by the phasing task; kept in
        # case makePypeLocalFile has side effects -- confirm, then remove.
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(
            blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(os.path.join(
            phasing_dir, 'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(os.path.join(
            wd, 'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        # Not declared as an output of the phasing task (implicit output?).
        all_ctg_out['r2p.{ctg_id}'.format(ctg_id=ctg_id)] = rid_to_phase_out

        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={'ref_fasta': ref_fasta, 'aln_bam': ctg_aln_out},
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        wf.addTask(make_phasing_task(task_phasing))
    wf.refreshTargets()

    # --- stage 4: merge rid_to_phase files and run the haplotype assembly ---
    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    # Same keys as before, but in a copy: the previous dict is still held by
    # the task it was given to and must not be modified behind its back.
    hasm_parameters = dict(parameters)
    hasm_parameters['wd'] = hasm_wd
    hasm_parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=hasm_parameters,
    )
    wf.addTask(make_hasm_task(task_hasm))

    wf.refreshTargets()