def _run_db2falcon_jobs(cwd, config, dry_run, db2falcon_json_fn=None):
    """
    cwd --- current working directory
    dry_run --- if True, do not actually run the scripts
    db2falcon_json_fn --- if not None, write dict{0: dict('script_fn':script_fn, 'script_dir':script_dir)}
    """
    # Generate preads4falcon.fasta from preads.db
    script_fn = os.path.join(cwd, "run_db2falcon.sh")
    job_done = script_fn + "_done"
    args = {
        "config": config,
        "job_done": job_done,
        "script_fn": script_fn,
        "preads4falcon_fn": "preads4falcon.fasta",
        "preads_db": "preads.db",
    }
    support.run_db2falcon(**args)
    json_fn = "db2falcon.json" if db2falcon_json_fn is None else db2falcon_json_fn
    # add script_dir to args for scattered tasks to work in the correct dir
    json_data = {0: {"script_fn": os.path.basename(script_fn), "script_dir": cwd}}
    with open(json_fn, "w") as writer:
        writer.write(json.dumps(json_data) + "\n")
    mkdir(cwd)
    with cd(cwd):
        if dry_run is False:
            run_cmd("bash %s" % os.path.basename(script_fn), sys.stdout, sys.stderr, shell=False)
            assert_nonzero("preads4falcon.fasta")

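# Illustrative note (values below are made up, not from a real run): the
# db2falcon.json written above is shaped like
#   {"0": {"script_fn": "run_db2falcon.sh", "script_dir": <the cwd passed in>}}
# i.e. a single pseudo p_id of 0, so run_scripts_in_json() below can consume it
# with the same layout as the scattered merge/cons job lists.
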
def run_daligner_jobs(input_files, output_files, db_prefix='raw_reads'):
    print('run_daligner_jobs: %s %s' % (repr(input_files), repr(output_files)))
    i_json_config_fn, run_daligner_job_fn, = input_files
    o_fofn_fn, = output_files
    db_dir = os.path.dirname(run_daligner_job_fn)
    cmds = ['pwd', 'ls -al']
    fns = ['.{pre}.bps', '.{pre}.idx', '{pre}.db']
    cmds += [r'\rm -f %s' % fn for fn in fns]
    cmds += ['ln -sf {dir}/%s .' % fn for fn in fns]
    cmd = ';'.join(cmds).format(dir=os.path.relpath(db_dir), pre=db_prefix)
    run_cmd(cmd, sys.stdout, sys.stderr, shell=True)
    cwd = os.getcwd()
    config = _get_config_from_json_fileobj(open(i_json_config_fn))
    tasks = create_daligner_tasks(run_daligner_job_fn, cwd, db_prefix, db_prefix + '.db', config)
    odirs = []
    for jobd, args in tasks.items():
        with cd(jobd):
            support.run_daligner(**args)
            script_fn = args['script_fn']
            run_cmd('bash %s' % script_fn, sys.stdout, sys.stderr, shell=False)
            odirs.append(os.path.dirname(script_fn))
    write_fns(
        o_fofn_fn,
        itertools.chain.from_iterable(
            glob.glob('%s/*.las' % d) for d in odirs))
    return 0

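# For reference (expansion is illustrative, assuming db_prefix='raw_reads' and a
# hypothetical relative db_dir of '..'), the composed shell line above becomes:
#   pwd;ls -al;\rm -f .raw_reads.bps;\rm -f .raw_reads.idx;\rm -f raw_reads.db;
#   ln -sf ../.raw_reads.bps .;ln -sf ../.raw_reads.idx .;ln -sf ../raw_reads.db .
# (one ';'-joined line in practice), i.e. the Dazzler DB plus its hidden
# .bps/.idx files are symlinked into the task directory before any daligner
# scripts run.
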
def _run_consensus_jobs(tasks, dry_run=False, cons_json_fn=None):
    """dry_run --- if True, do not actually run the scripts
    cons_json_fn --- if not None, write dict{p_id: dict{'script_fn':script_fn, 'script_dir':script_dir}} to it
    """
    fns = list()
    json_data = dict()
    for p_id, (cons_args, fasta_bfn) in tasks.items():
        run_dir = "preads"
        job_done = "c_%05d_done" % p_id
        script_fn = os.path.join(run_dir, "c_%05d.sh" % (p_id))
        cons_args["job_done"] = job_done
        cons_args["script_fn"] = script_fn
        # cons_args <- dict{
        #   'out_file_fn': abspath to preads/out.00001.fasta
        #   'script_fn'  : c_00001.sh
        #   'job_done'   : c_00001_done
        #   'raw_reads'  : raw_reads
        #   'config'     : config}
        support.run_consensus(**cons_args)
        mkdir(run_dir)
        with cd(run_dir):
            if dry_run is False:
                run_cmd("bash %s" % os.path.basename(script_fn), sys.stdout, sys.stderr, shell=False)
        fns.append(os.path.join(run_dir, fasta_bfn))
        # add script_dir to args for scattered tasks to work in the correct dir
        json_data[p_id] = {
            "script_fn": os.path.basename(script_fn),  # 'c_00001.sh'
            "script_dir": os.path.join(os.getcwd(), run_dir),  # '/pbi/.../tasks/falcon_ns.tasks.task_falcon0_run_merge_jobs/preads/'
        }
    json_fn = "cons_jobs.json" if cons_json_fn is None else cons_json_fn
    with open(json_fn, "w") as writer:
        writer.write(json.dumps(json_data) + "\n")
    return fns  # *.fasta, e.g., ['preads/out.00001.fasta', 'preads/out.00002.fasta', 'preads/out.00003.fasta']

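# Illustrative scatter/gather usage (hypothetical call sites, not wired up in this
# module): a dry run only writes the c_%05d.sh scripts plus cons_jobs.json, and the
# scripts are then executed later, in their recorded dirs, by run_scripts_in_json():
#   fasta_fns = _run_consensus_jobs(cons_tasks, dry_run=True, cons_json_fn='cons_jobs.json')
#   run_scripts_in_json(['cons_jobs.json'], ['cons_done.txt'])
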
def run_falcon_config_get_fasta(input_files, output_files):
    i_config_fn, = input_files
    o_fofn_fn, = output_files
    config = _get_config(i_config_fn)
    i_fofn_fn = config['input_fofn']
    if not os.path.isabs(i_fofn_fn):
        i_fofn_fn = os.path.join(os.path.dirname(i_config_fn), i_fofn_fn)
    msg = '%r -> %r' % (i_fofn_fn, o_fofn_fn)
    say(msg)
    with cd(os.path.dirname(i_fofn_fn)):
        return support.make_fofn_abs(i_fofn_fn, o_fofn_fn)
    return 0

def run_falcon_make_fofn_abs(input_files, output_files):
    i_json_fn, = input_files
    o_fofn_fn, = output_files
    config = _get_config_from_json_fileobj(open(i_json_fn))
    i_fofn_fn = config["input_fofn"]
    if not i_fofn_fn.startswith("/"):
        # i_fofn_fn can be relative to the location of the config file.
        original_config_fn = config["original_self"]
        i_fofn_fn = os.path.join(os.path.dirname(original_config_fn), i_fofn_fn)
    msg = "run_falcon_make_fofn_abs(%r -> %r)" % (i_fofn_fn, o_fofn_fn)
    say(msg)
    with cd(os.path.dirname(i_fofn_fn)):
        return support.make_fofn_abs(i_fofn_fn, o_fofn_fn)
    return 0

def _run_merge_jobs(tasks, dry_run=False, merge_json_fn=None):
    """dry_run --- if True, do not actually run the scripts
    merge_json_fn --- if not None, write dict{p_id->merge_args} to it
    """
    fns = list()
    json_data = dict()
    for p_id, (merge_args, las_bfn) in tasks.items():
        run_dir = merge_args['merge_subdir']
        job_done = "merge_%05d_done" % p_id
        script_fn = os.path.join(run_dir, "merge_%05d.sh" % (p_id))
        merge_args['job_done'] = job_done
        merge_args['script_fn'] = script_fn
        del merge_args['merge_subdir']  # was just a temporary hack
        # merge_args <- dict{
        #   'job_done' : 'merge_00001_done',
        #   'script_fn': 'merge_00001.sh',
        #   'script'   : 'LAmerge -v ...',
        #   'config'   : config}
        support.run_las_merge(**merge_args)
        mkdir(run_dir)
        with cd(run_dir):
            if dry_run is False:
                run_cmd('bash %s' % os.path.basename(script_fn), sys.stdout, sys.stderr, shell=False)
        fns.append(os.path.join(run_dir, las_bfn))
        # add script_dir to args for scattered tasks to work in the correct dir
        json_data[p_id] = {
            'script_dir': os.path.join(os.getcwd(), run_dir),  # '/pbi/.../tasks/falcon_ns.task.task_falcon0_run_merge_consensus_jobs/m_00001'
            'script_fn': os.path.basename(script_fn),  # 'merge_00001.sh'
        }
    json_fn = 'merge_jobs.json' if merge_json_fn is None else merge_json_fn
    # Write dict{p_id: dict{'script_fn':script_fn, 'script_dir':script_dir}} to a json file
    with open(json_fn, 'w') as writer:
        writer.write(json.dumps(json_data) + "\n")
    return fns  # *.las, e.g., ['m_00001/raw_reads.1.las', 'm_00002/raw_reads.2.las', 'm_00003/raw_reads.3.las']

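# Illustrative shape of merge_jobs.json (values made up; json.dumps turns the
# integer p_id keys into strings):
#   {"1": {"script_dir": "/.../m_00001", "script_fn": "merge_00001.sh"},
#    "2": {"script_dir": "/.../m_00002", "script_fn": "merge_00002.sh"}}
# This matches the dict layout expected by run_scripts_in_json() below.
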
def run_scripts_in_json(input_files, output_files):
    """
    input_files = ['*.json'] (e.g., merge|cons|db2falcon.json), where
      *.json <- dict(p_id: dict('script_fn':script_fn, 'script_dir':script_dir))
    output_files = ['*_done.txt'] (e.g., merge_done.txt, cons_done.txt, db2falcon_done.txt)
    Execute all script files.
    """
    json_fn = input_files[0]
    txt_fn = output_files[0]
    a = json.load(open(json_fn, "r"))
    writer = open(txt_fn, "w")
    for p_id, args in a.iteritems():
        if "script_fn" not in args:
            raise ValueError("Could not find 'script_fn' in json %r key %r" % (json_fn, p_id))
        script_dir = str(args["script_dir"])
        with cd(script_dir):
            script_fn = str(args["script_fn"])
            run_cmd("bash %s" % script_fn, sys.stdout, sys.stderr, shell=False)
        writer.write(script_fn + "\n")
    writer.close()
    return 0

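# Minimal sketch, not part of the original task wiring: a hypothetical helper that
# sanity-checks a jobs JSON (merge_jobs.json, cons_jobs.json, or db2falcon.json)
# before it is handed to run_scripts_in_json(). Name and usage are assumptions.
def _example_check_jobs_json(json_fn):
    """Return the number of entries; raise ValueError if any entry is malformed."""
    data = json.load(open(json_fn, "r"))
    for p_id, args in data.iteritems():
        if "script_fn" not in args or "script_dir" not in args:
            raise ValueError("Malformed entry %r in %r: %r" % (p_id, json_fn, args))
        if not os.path.isdir(str(args["script_dir"])):
            raise ValueError("Missing script_dir %r for entry %r" % (args["script_dir"], p_id))
    return len(data)
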
def create_merge_tasks(i_fofn_fn, run_jobs_fn, wd, db_prefix, config):
    # merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    tasks = {}  # pid -> (merge_params, cons_params)
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                # mjob_data[p_id].append( " ".join(l) )  # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    # Could be L1.* or preads.*
    re_las = re.compile(r"\.(\d*)(\.\d*)?\.las$")

    for p_id in mjob_data:
        s_data = mjob_data[p_id]
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))
        merge_subdir = "m_%05d" % p_id
        merge_dir = os.path.join(wd, merge_subdir)
        support.make_dirs(merge_dir)
        # merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        merge_script = StringIO.StringIO()
        with cd(merge_dir):
            print("i_fofn_fn=%r" % i_fofn_fn)
            # Since we could be in the gather-task-dir, instead of globbing,
            # we will read the fofn.
            for fn in open(i_fofn_fn).read().splitlines():
                basename = os.path.basename(fn)
                mo = re_las.search(basename)
                if not mo:
                    continue
                left_block = int(mo.group(1))
                if left_block != p_id:
                    # By convention, m_00005 merges L1.5.*.las, etc.
                    continue
                symlink(fn)
        for l in s_data:
            print >> merge_script, l
        las_bfn = "%s.%d.las" % (db_prefix, p_id)
        # print >> merge_script, 'echo %s >| %s' % (las_bfn, merged_las_fofn_bfn)
        # job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id) ))
        parameters = {
            "script": merge_script.getvalue(),
            "merge_subdir": merge_subdir,
            "config": config,
        }
        merge_task = parameters
        fasta_bfn = "out.%05d.fasta" % p_id
        out_file_fn = os.path.abspath("%s/preads/%s" % (wd, fasta_bfn))
        # out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id) ))
        parameters = {
            "db_fn": "{}/{}".format(os.getcwd(), db_prefix),
            "las_fn": "{}/{}/{}".format(os.getcwd(), merge_subdir, las_bfn),  # assuming merge ran in merge_dir
            "out_file_fn": out_file_fn,
            # "out_done": out_done,
            "config": config,
        }
        cons_task = parameters
        tasks[p_id] = (merge_task, cons_task, las_bfn, fasta_bfn)

    # tasks <- dict{p_id: (merge_task, cons_task, las_bfn, fasta_bfn)}, where
    #   p_id is an integer, e.g., 1
    #   merge_task <- dict{'merge_subdir': 'm_00001', 'script': 'LAmerge -v raw_reads.1 L1.1.1 L1.1.2 L1.1.3', 'config': config}
    #                 ('script_fn' and 'job_done' are added later by _run_merge_jobs)
    #   cons_task  <- dict{'db_fn': x, 'las_fn': x, 'out_file_fn': x, 'config': config}
    #   las_bfn,   e.g., raw_reads.1.las
    #   fasta_bfn, e.g., out.00001.fasta
    return tasks

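# Illustrative wiring (hypothetical driver code, not part of this module), showing
# how the 4-tuples returned above map onto the runners defined earlier; the
# variable names below are assumptions:
#   all_tasks = create_merge_tasks(i_fofn_fn, run_jobs_fn, cwd, db_prefix, config)
#   merge_tasks = {p_id: (m, las_bfn) for p_id, (m, c, las_bfn, fasta_bfn) in all_tasks.items()}
#   cons_tasks = {p_id: (c, fasta_bfn) for p_id, (m, c, las_bfn, fasta_bfn) in all_tasks.items()}
#   las_fns = _run_merge_jobs(merge_tasks, dry_run=True, merge_json_fn='merge_jobs.json')
#   fasta_fns = _run_consensus_jobs(cons_tasks, dry_run=True, cons_json_fn='cons_jobs.json')
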