def run_hgap_prepare(input_files, output_files, options):
    """Generate a config-file from options.

    Merges grid, falcon, preset-XML, JSON-override, and pbsmrtpipe options
    into one config dict, then dumps it (and a default logging config)
    as JSON to the given output filenames.
    """
    say('options to run_hgap_prepare:\n{}'.format(pprint.pformat(options)))
    i_subreadset_fn, = input_files
    o_hgap_cfg_fn, o_logging_cfg_fn, o_log_fn = output_files
    run_dir = os.path.dirname(o_hgap_cfg_fn)
    symlink(os.path.join(run_dir, 'stderr'), o_log_fn)
    # This will be the cfg we pass to hgap_run.
    all_cfg = collections.defaultdict(lambda: collections.defaultdict(str))
    # Get grid options, for job-distribution.
    update_for_grid(all_cfg, run_dir)
    # Set some other falcon options, based on hgap options.
    update_falcon(all_cfg)
    # Override from pbsmrtpipe config/preset.xml.
    all_cfg[OPTION_SECTION_FALCON]['genome_size'] = options[TASK_HGAP_GENOME_LENGTH].strip()
    all_cfg[OPTION_SECTION_FALCON]['length_cutoff'] = options[TASK_HGAP_SEED_LENGTH_CUTOFF].strip()
    all_cfg[OPTION_SECTION_FALCON]['seed_coverage'] = options[TASK_HGAP_SEED_COVERAGE].strip()
    cfg_json = options[TASK_HGAP_OPTIONS].strip()
    if not cfg_json:
        cfg_json = '{}'
    override_cfg = json.loads(stricter_json(cfg_json))
    update2(all_cfg, override_cfg)
    # Get options from pbsmrtpipe.
    pbsmrtpipe_opts = get_pbsmrtpipe_opts(run_dir)
    if OPTION_SECTION_PBSMRTPIPE not in all_cfg:
        all_cfg[OPTION_SECTION_PBSMRTPIPE] = dict()
    # Values already in all_cfg win over the freshly-discovered pbsmrtpipe opts.
    pbsmrtpipe_opts.update(all_cfg[OPTION_SECTION_PBSMRTPIPE])
    all_cfg[OPTION_SECTION_PBSMRTPIPE] = pbsmrtpipe_opts
    # Dump all_cfg. Use 'with' so the handle is flushed and closed
    # even if dump_as_json raises (the original leaked the open file).
    say('Dumping to {}'.format(repr(o_hgap_cfg_fn)))
    with open(o_hgap_cfg_fn, 'w') as ofs:
        dump_as_json(all_cfg, ofs)
    # Get logging cfg.
    logging_cfg = DEFAULT_LOGGING_CFG
    # Dump logging cfg.
    say('Dumping to {}'.format(repr(o_logging_cfg_fn)))
    with open(o_logging_cfg_fn, 'w') as ofs:
        dump_as_json(logging_cfg, ofs)
def run_hgap_prepare(input_files, output_files, options):
    """Generate a config-file from options.

    Merges grid, falcon, preset-XML, JSON-override, pwatcher, and
    pbsmrtpipe options into one config dict, then dumps it (and a default
    logging config) as JSON to the given output filenames.
    """
    say('options to run_hgap_prepare:\n{}'.format(pprint.pformat(options)))
    i_subreadset_fn, = input_files
    o_hgap_cfg_fn, o_logging_cfg_fn, o_log_fn = output_files
    run_dir = os.path.dirname(o_hgap_cfg_fn)
    symlink(os.path.join(run_dir, 'stderr'), o_log_fn)
    # This will be the cfg we pass to hgap_run.
    all_cfg = collections.defaultdict(lambda: collections.defaultdict(str))
    # Get grid options, for job-distribution.
    update_for_grid(all_cfg, run_dir)
    # Set some other falcon options, based on hgap options.
    update_falcon(all_cfg)
    # Override from pbsmrtpipe config/preset.xml.
    all_cfg[OPTION_SECTION_FALCON]['genome_size'] = options[TASK_HGAP_GENOME_LENGTH].strip()
    all_cfg[OPTION_SECTION_FALCON]['length_cutoff'] = options[TASK_HGAP_SEED_LENGTH_CUTOFF].strip()
    all_cfg[OPTION_SECTION_FALCON]['seed_coverage'] = options[TASK_HGAP_SEED_COVERAGE].strip()
    cfg_json = options[TASK_HGAP_OPTIONS].strip()
    if not cfg_json:
        cfg_json = '{}'
    override_cfg = json.loads(stricter_json(cfg_json))
    update2(all_cfg, override_cfg)
    update_pwatcher(all_cfg)
    # Get options from pbsmrtpipe.
    pbsmrtpipe_opts = get_pbsmrtpipe_opts(run_dir)
    if OPTION_SECTION_PBSMRTPIPE not in all_cfg:
        all_cfg[OPTION_SECTION_PBSMRTPIPE] = dict()
    # Values already in all_cfg win over the freshly-discovered pbsmrtpipe opts.
    pbsmrtpipe_opts.update(all_cfg[OPTION_SECTION_PBSMRTPIPE])
    all_cfg[OPTION_SECTION_PBSMRTPIPE] = pbsmrtpipe_opts
    # Dump all_cfg. Use 'with' so the handle is flushed and closed
    # even if dump_as_json raises (the original leaked the open file).
    say('Dumping to {}'.format(repr(o_hgap_cfg_fn)))
    with open(o_hgap_cfg_fn, 'w') as ofs:
        dump_as_json(all_cfg, ofs)
    # Get logging cfg.
    logging_cfg = DEFAULT_LOGGING_CFG
    # Dump logging cfg.
    say('Dumping to {}'.format(repr(o_logging_cfg_fn)))
    with open(o_logging_cfg_fn, 'w') as ofs:
        dump_as_json(logging_cfg, ofs)
def run_falcon_build_rdb(input_files, output_files):
    """Build the dazzler raw_reads DB and the daligner job script.

    input_files:  (json_config_fn, fofn_fn)
    output_files: (run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn)
    Raises if not executed in the output directory, or if the generated
    script produced no daligner jobs. Returns 0 on success.
    """
    print("output_files: %s" % (repr(output_files)))
    cwd = os.getcwd()
    odir = os.path.realpath(os.path.abspath(os.path.dirname(output_files[0])))
    if True:  # debug
        if cwd != odir:
            raise Exception("%r != %r" % (cwd, odir))
    i_json_config_fn, i_fofn_fn = input_files
    print("output_files: %s" % repr(output_files))
    run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn = output_files
    # 'with' closes the config handle promptly (the original leaked it).
    with open(i_json_config_fn) as ifs:
        config = _get_config_from_json_fileobj(ifs)
    script_fn = os.path.join(odir, "prepare_rdb.sh")  # implies run-dir too
    # job_done_fn = os.path.join(odir, 'job.done') # not needed in pbsmrtpipe today tho
    support.build_rdb(i_fofn_fn, config, job_done_fn, script_fn, run_daligner_jobs_fn)
    run_cmd("bash %s" % script_fn, sys.stdout, sys.stderr, shell=False)
    with open(run_daligner_jobs_fn) as ifs:
        job_descs = falcon_kit.functional.get_daligner_job_descriptions(ifs, "raw_reads")
    # We do not bother to calculate 'single' b/c this is only a sanity-check.
    if not job_descs:
        raise Exception("No daligner jobs generated in '%s' by '%s'." % (run_daligner_jobs_fn, script_fn))
    symlink("raw_reads.db", raw_reads_db_fn)
    return 0
def symlink_dazzdb(actualdir, db_prefix):
    """Symlink elements of dazzler db. For now, 3 files.

    Links the hidden .bps/.idx companions plus the .db file itself
    from actualdir into the current directory.
    """
    for pattern in ('.%s.bps', '.%s.idx', '%s.db'):
        symlink(os.path.join(actualdir, pattern % db_prefix))
def run_hgap(input_files, output_files, tmpdir):
    """Run the HGAP pipeline (pbfalcon.cli.hgap_run) and wire up outputs.

    Points the logging config at our log file, runs the pipeline with
    TMPDIR set, writes the pre-assembly report, and symlinks the
    conventional pipeline outputs to the expected filenames. Returns 0.
    """
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = output_files
    # Update the logging-cfg with our log-file.
    # ('with' fixes the leaked read handle in the original.)
    with open(i_logging_fn) as ifs:
        logging_cfg = json.loads(ifs.read())
    logging_cfg['handlers']['handler_file_all']['filename'] = o_log_fn
    logging_fn = 'logging.json'
    with open(logging_fn, 'w') as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = 'TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}'.format(**locals())
    system(cmd)
    # Write Reports
    with open('run-falcon/0-rawreads/pre_assembly_stats.json') as stats_ifs:  # by convention
        with open(o_preass_json_fn, 'w') as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink('run-gc-gather/contigset.xml', o_contigset_fn)
    symlink('run-polished-assembly-report/polished_assembly_report.json', o_polass_json_fn)
    return 0
def run_falcon_build_rdb(input_files, output_files):
    """Build the dazzler raw_reads DB and the daligner job script.

    input_files:  (json_config_fn, fofn_fn)
    output_files: (run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn)
    Raises if not executed in the output directory, or if the generated
    script produced no daligner jobs. Returns 0 on success.
    """
    print('output_files: %s' % (repr(output_files)))
    cwd = os.getcwd()
    odir = os.path.realpath(os.path.abspath(os.path.dirname(output_files[0])))
    if True:  # debug
        if cwd != odir:
            raise Exception('%r != %r' % (cwd, odir))
    i_json_config_fn, i_fofn_fn = input_files
    print('output_files: %s' % repr(output_files))
    run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn = output_files
    # 'with' closes the config handle promptly (the original leaked it).
    with open(i_json_config_fn) as ifs:
        config = _get_config_from_json_fileobj(ifs)
    script_fn = os.path.join(odir, 'prepare_rdb.sh')  # implies run-dir too
    # job_done_fn = os.path.join(odir, 'job.done') # not needed in pbsmrtpipe today tho
    support.build_rdb(i_fofn_fn, config, job_done_fn, script_fn, run_daligner_jobs_fn)
    run_cmd('bash %s' % script_fn, sys.stdout, sys.stderr, shell=False)
    with open(run_daligner_jobs_fn) as ifs:
        job_descs = falcon_kit.functional.get_daligner_job_descriptions(ifs, 'raw_reads')
    # We do not bother to calculate 'single' b/c this is only a sanity-check.
    if not job_descs:
        raise Exception("No daligner jobs generated in '%s' by '%s'." % (run_daligner_jobs_fn, script_fn))
    symlink('raw_reads.db', raw_reads_db_fn)
    return 0
def create_merge_tasks(i_fofn_fn, run_jobs_fn, wd, db_prefix, config):
    """Parse the HPC.daligner run_jobs script into merge/consensus task params.

    Returns dict: p_id (int block number) ->
        (merge_task, cons_task, las_bfn, fasta_bfn)
    where merge_task/cons_task are plain parameter dicts for downstream
    runners. Side effects: creates preads/, las_files/, and one m_%05d/
    dir per block under wd, and symlinks matching .las inputs into each
    merge dir.
    """
    #merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    tasks = {} # pid -> (merge_params, cons_params)
    mjob_data = {}  # p_id -> list of shell lines to run for that merge job
    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                #mjob_data[p_id].append( " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    # Level-style name, e.g. raw_reads.L2.5.x -> block id is field 2.
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
    # Could be L1.* or preads.*
    re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')
    for p_id in mjob_data:
        s_data = mjob_data[p_id]
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))
        merge_subdir = "m_%05d" % p_id
        merge_dir = os.path.join(wd, merge_subdir)
        support.make_dirs(merge_dir)
        #merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        merge_script = StringIO.StringIO()
        with cd(merge_dir):
            print("i_fofn_fn=%r" % i_fofn_fn)
            # Since we could be in the gather-task-dir, instead of globbing,
            # we will read the fofn.
            for fn in open(i_fofn_fn).read().splitlines():
                basename = os.path.basename(fn)
                mo = re_las.search(basename)
                if not mo:
                    continue
                left_block = int(mo.group(1))
                if left_block != p_id:
                    # By convention, m_00005 merges L1.5.*.las, etc.
                    continue
                symlink(fn)
            # Python 2 print-redirect: accumulate the merge commands in-memory.
            for l in s_data:
                print >> merge_script, l
        las_bfn = '%s.%d.las' % (db_prefix, p_id)
        #print >> merge_script, 'echo %s >| %s' %(las_bfn, merged_las_fofn_bfn)
        #job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id) ))
        parameters = {"script": merge_script.getvalue(),
                      "merge_subdir": merge_subdir,
                      "config": config}
        merge_task = parameters
        fasta_bfn = "out.%05d.fasta" % p_id
        out_file_fn = os.path.abspath("%s/preads/%s" % (wd, fasta_bfn))
        #out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id) ))
        parameters = {"db_fn": '{}/{}'.format(os.getcwd(), db_prefix),
                      "las_fn": '{}/{}/{}'.format(os.getcwd(), merge_subdir, las_bfn), # assuming merge ran in merge_dir
                      "out_file_fn": out_file_fn,
                      #"out_done": out_done,
                      "config": config}
        cons_task = parameters
        tasks[p_id] = (merge_task, cons_task, las_bfn, fasta_bfn)
    return tasks
def run_hgap(input_files, output_files, tmpdir):
    """Run the HGAP pipeline (pbfalcon.cli.hgap_run) and wire up outputs.

    Points the logging config at our log file, runs the pipeline with
    TMPDIR set, writes the pre-assembly report, and symlinks the
    conventional pipeline outputs to the expected filenames. Returns 0.
    """
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_preads_fasta_fn, \
    o_polished_fasta_fn, o_polished_fastq_fn, o_polished_csv_fn, \
    o_aligned_subreads_fn, o_alignment_summary_gff_fn, o_unmapped_subreads_txt_fn, \
    o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = output_files
    # Update the logging-cfg with our log-file.
    # ('with' fixes the leaked read handle in the original.)
    with open(i_logging_fn) as ifs:
        logging_cfg = json.loads(ifs.read())
    logging_cfg['handlers']['handler_file_all']['filename'] = o_log_fn
    logging_fn = 'logging.json'
    with open(logging_fn, 'w') as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = 'TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}'.format(**locals())
    system(cmd)
    # Write Reports
    with open('run-falcon/0-rawreads/report/pre_assembly_stats.json') as stats_ifs:  # by convention
        with open(o_preass_json_fn, 'w') as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink('run-falcon/1-preads_ovl/db2falcon/preads4falcon.fasta', o_preads_fasta_fn)
    symlink('run-gc-gather/contigset.fasta', o_polished_fasta_fn)
    symlink('run-gc-gather/gathered.fastq', o_polished_fastq_fn)
    symlink('run-polished-assembly-report/polished_coverage_vs_quality.csv', o_polished_csv_fn)
    symlink('run-polished-assembly-report/alignment.summary.gff', o_alignment_summary_gff_fn)
    symlink('run-pbalign_gather/aligned.subreads.alignmentset.xml', o_aligned_subreads_fn)
    symlink('run-pbalign_gather/unmapped.txt', o_unmapped_subreads_txt_fn)
    symlink('run-gc-gather/contigset.xml', o_contigset_fn)
    symlink('run-polished-assembly-report/polished_assembly_report.json', o_polass_json_fn)
    return 0
def run_hgap(input_files, output_files, tmpdir):
    """Run the HGAP pipeline (pbfalcon.cli.hgap_run) and wire up outputs.

    Points the logging config at our log file, runs the pipeline with
    TMPDIR set, writes the pre-assembly report, and symlinks the
    conventional pipeline outputs to the expected filenames. Returns 0.
    """
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_preads_fasta_fn, o_polished_fasta_fn, o_polished_fastq_fn, o_polished_csv_fn, o_aligned_subreads_fn, o_alignment_summary_gff_fn, o_unmapped_subreads_txt_fn, o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = (
        output_files
    )
    # Update the logging-cfg with our log-file.
    # ('with' fixes the leaked read handle in the original.)
    with open(i_logging_fn) as ifs:
        logging_cfg = json.loads(ifs.read())
    logging_cfg["handlers"]["handler_file_all"]["filename"] = o_log_fn
    logging_fn = "logging.json"
    with open(logging_fn, "w") as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = "TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}".format(**locals())
    system(cmd)
    # Write Reports
    with open("run-falcon/0-rawreads/report/pre_assembly_stats.json") as stats_ifs:  # by convention
        with open(o_preass_json_fn, "w") as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink("run-falcon/1-preads_ovl/db2falcon/preads4falcon.fasta", o_preads_fasta_fn)
    symlink("run-gc-gather/contigset.fasta", o_polished_fasta_fn)
    symlink("run-gc-gather/gathered.fastq", o_polished_fastq_fn)
    symlink("run-polished-assembly-report/polished_coverage_vs_quality.csv", o_polished_csv_fn)
    symlink("run-polished-assembly-report/alignment.summary.gff", o_alignment_summary_gff_fn)
    symlink("run-pbalign_gather/aligned.subreads.alignmentset.xml", o_aligned_subreads_fn)
    symlink("run-pbalign_gather/unmapped.txt", o_unmapped_subreads_txt_fn)
    symlink("run-gc-gather/contigset.xml", o_contigset_fn)
    symlink("run-polished-assembly-report/polished_assembly_report.json", o_polass_json_fn)
    return 0
def create_merge_tasks(i_fofn_fn, run_jobs_fn, wd, db_prefix, config):
    """Parse the HPC.daligner run_jobs script into merge/consensus task params.

    Returns dict: p_id (int block number) ->
        (merge_task, cons_task, las_bfn, fasta_bfn)
    where merge_task/cons_task are plain parameter dicts for downstream
    runners. Side effects: creates preads/, las_files/, and one m_%05d/
    dir per block under wd, and symlinks matching .las inputs into each
    merge dir.
    """
    # merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    tasks = {}  # pid -> (merge_params, cons_params)
    mjob_data = {}  # p_id -> list of shell lines to run for that merge job
    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                # mjob_data[p_id].append( " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    # Level-style name, e.g. raw_reads.L2.5.x -> block id is field 2.
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
    # Could be L1.* or preads.*
    re_las = re.compile(r"\.(\d*)(\.\d*)?\.las$")
    for p_id in mjob_data:
        s_data = mjob_data[p_id]
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))
        merge_subdir = "m_%05d" % p_id
        merge_dir = os.path.join(wd, merge_subdir)
        support.make_dirs(merge_dir)
        # merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        merge_script = StringIO.StringIO()
        with cd(merge_dir):
            print("i_fofn_fn=%r" % i_fofn_fn)
            # Since we could be in the gather-task-dir, instead of globbing,
            # we will read the fofn.
            for fn in open(i_fofn_fn).read().splitlines():
                basename = os.path.basename(fn)
                mo = re_las.search(basename)
                if not mo:
                    continue
                left_block = int(mo.group(1))
                if left_block != p_id:
                    # By convention, m_00005 merges L1.5.*.las, etc.
                    continue
                symlink(fn)
            # Python 2 print-redirect: accumulate the merge commands in-memory.
            for l in s_data:
                print >> merge_script, l
        las_bfn = "%s.%d.las" % (db_prefix, p_id)
        # print >> merge_script, 'echo %s >| %s' %(las_bfn, merged_las_fofn_bfn)
        # job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id) ))
        parameters = {"script": merge_script.getvalue(), "merge_subdir": merge_subdir, "config": config}
        merge_task = parameters
        fasta_bfn = "out.%05d.fasta" % p_id
        out_file_fn = os.path.abspath("%s/preads/%s" % (wd, fasta_bfn))
        # out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id) ))
        parameters = {
            "db_fn": "{}/{}".format(os.getcwd(), db_prefix),
            "las_fn": "{}/{}/{}".format(os.getcwd(), merge_subdir, las_bfn),  # assuming merge ran in merge_dir
            "out_file_fn": out_file_fn,
            # "out_done": out_done,
            "config": config,
        }
        cons_task = parameters
        tasks[p_id] = (merge_task, cons_task, las_bfn, fasta_bfn)
        # tasks <- dict{p_id: (merge_task, cons_task, las_bfn, fasta_bfn)}, where
        # p_id is an integer, e.g., 1
        # merge_task <- dict{'merge_dir': 'm_00001', 'script'='LAmerge -v raw_reads.1 L1.1.1 L1.1.2 L1.1.3', 'script_fn': x}
        # cons_task <- dict{'db_fn':x, 'las_fn':x, 'out_file_fn':x, 'config':config}
        # las_bfn, e.g. raw_reads.1.las
        # fasta_bfn, e.g., out.00001.fasta
    return tasks