def merge_split(
        las_paths_fn,
        dbname="raw_reads",
):
    """Group .las files by their first block-number and prepare one
    LAmerge job-directory per block.

    Args:
        las_paths_fn: serialized list of .las paths, each named like
            '{db}.A.{db}.B.las' (A, B are block-numbers).
        dbname: Dazzler DB basename embedded in the .las filenames.

    Side effects:
        For each block A, creates './merge-scripts/m_NNNNN/' containing
        'las-paths.json' (the block's input .las paths) and 'las_fn'
        (holding the merged-output name 'L{dbname}.A.las').

    Raises:
        Exception: if any path does not match the expected .las naming.
    """
    las_paths = io_io.deserialize(las_paths_fn)
    re_las_pair = re.compile(
        r'{db}\.(\d+)\.{db}\.(\d+)\.las$'.format(db=dbname))
    # Map first block-number -> list of .las paths for that block.
    las_map = collections.defaultdict(list)
    for path in las_paths:
        mo = re_las_pair.search(path)
        if not mo:
            msg = '{!r} does not match regex {!r}'.format(
                path, re_las_pair.pattern)
            raise Exception(msg)
        # Only the first block-number matters for grouping; the second
        # was previously unpacked but never used.
        a = int(mo.group(1))
        las_map[a].append(path)
    for i, block in enumerate(las_map):
        job_id = 'm_{:05d}'.format(i)
        # Write the las files for this job.
        input_dir = os.path.join('merge-scripts', job_id)
        # Distinct name: the original shadowed the 'las_paths_fn' parameter.
        block_las_paths_fn = os.path.join('.', input_dir, 'las-paths.json')
        io_io.mkdirs(input_dir)
        io_io.serialize(block_las_paths_fn, las_map[block])
        # Record the name of the merged .las this job will produce.
        las_name = os.path.join('.', input_dir, 'las_fn')
        las_fn = '{}.{}.las'.format("L" + dbname, block)
        with open(las_name, "w") as f:
            # write(), not writelines(): las_fn is a single string, and
            # writelines() would iterate it character-by-character.
            f.write(las_fn)
def rep_split(las_paths_fn, group_size, coverage_limit):
    """For foo.db, HPC.REPmask would produce rep-jobs.05.MASK lines like this:
      # REPmask jobs (n)
      REPmask -v -c30 -nrep1 foo foo.R1.@1-3
      REPmask -v -c30 -nrep1 foo foo.R1.@4-6
      ...
    (That's for level R1.)
    We will do one block at-a-time, for simplicity.
    """
    las_paths = io_io.deserialize(las_paths_fn)
    # Build one small shell script per .las file (one block at-a-time).
    scripts = [
        ''.join([
            'set -vex\n',
            'REPmask -v -c{} -nrep{} {} {}\n'.format(
                coverage_limit, group_size, "raw_reads", las_files),
            '#rm -f {}\n'.format(las_files),
        ])
        for las_files in las_paths
    ]
    # Materialize each script under its own job directory.
    for job_index, content in enumerate(scripts):
        job_id = 'rep_{:03d}'.format(job_index)
        run_dir = os.path.join('.', 'rep-scripts', job_id)
        run_fn = os.path.join(run_dir, 'run_REPmask.sh')
        io_io.mkdirs(run_dir)
        with open(run_fn, 'w') as stream:
            stream.write('{}\n'.format(content))
def rep_daligner_split(REPmask_opt, db_fn, group_size, coverage_limit):
    """Similar to daligner_split(), but based on HPC.REPmask instead of
    HPC.daligner.
    """
    all_scripts = _get_rep_daligner_split_scripts(
        REPmask_opt, db_fn, group_size, coverage_limit)
    # One job directory per generated daligner script.
    for idx, content in enumerate(all_scripts):
        job_dir = os.path.join('.', 'rep-scripts', 'rep_{:04d}'.format(idx))
        io_io.mkdirs(job_dir)
        run_fn = os.path.join(job_dir, 'run_daligner.sh')
        with open(run_fn, 'w') as out:
            out.write('{}\n'.format(content))
def daligner_split(daligner_opt, db_fn, length_cutoff_fn):
    """Run HPC.daligner (via a generated shell script) and split its
    job-lines into one run-script per daligner job.

    Args:
        daligner_opt: options string passed to HPC.daligner.
        db_fn: path to the Dazzler DB file (e.g. 'raw_reads.db').
        length_cutoff_fn: file holding the read-length cutoff.

    Side effects:
        Writes and runs 'split_db.sh'; then parses 'daligner-jobs.01.OVL'
        and writes './daligner-scripts/j_NNNN/run_daligner.sh' per job.
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)
    tracks = get_tracks(db_fn)
    script = ''.join([
        script_HPC_daligner(daligner_opt, db, length_cutoff_fn, tracks,
                            prefix='daligner-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        # Return value (the sub-script exe) was unused; dropped.
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))
    # We now have files like daligner-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('daligner-jobs.01.OVL') as ifs:  # context manager: no fd leak
        lines = ifs.readlines()
    preads_aln = (dbname == 'preads')  # idiomatic; was 'True if ... else False'
    xformer = functional.get_script_xformer(preads_aln)
    LOG.debug('preads_aln={!r} (True => use daligner_p)'.format(preads_aln))
    scripts = list()
    for line in lines:
        # Skip comment and blank lines from the .OVL file.
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        line = xformer(line)  # Use daligner_p for preads.
        scripts.append(line)
    # Special case:
    #   # Daligner jobs (1)
    #   daligner raw_reads raw_reads && mv raw_reads.raw_reads.las raw_reads.las
    # In that case, the "block" name is empty. (See functional.py)
    # We will rename the file.
    # (LAmerge on a single input is a no-op, which is fine.)
    if len(scripts) == 1:
        script = scripts[0]
        re_script = re.compile(
            r'(mv\b.*\S+\s+)(\S+)$')  # no trailing newline, for now
        mo = re_script.search(script)
        if not mo:
            msg = 'Only 1 line in daligner-jobs.01.OVL, but\n {!r} did not match\n {!r}.'.format(
                re_script.pattern, script)
            LOG.warning(msg)
        else:
            # Rewrite the mv-target so the .las is named as if it had
            # explicit block-numbers ('{db}.1.{db}.1.las').
            new_script = re_script.sub(
                r'\1{dbname}.1.{dbname}.1.las'.format(dbname=dbname),
                script, 1)
            msg = 'Only 1 line in daligner-jobs.01.OVL:\n {!r} matches\n {!r}. Replacing with\n {!r}.'.format(
                re_script.pattern, script, new_script)
            LOG.warning(msg)
            scripts = [new_script]
    # Append an LAcheck and prepend shell safety flags to every job.
    LAcheck = 'LAcheck -vS {} *.las'.format(db)  # loop-invariant; hoisted
    for i, script in enumerate(scripts):
        scripts[i] = "set -vex\n" + script + '\n' + LAcheck + '\n'
    for i, script in enumerate(scripts):
        job_id = 'j_{:04d}'.format(i)
        script_dir = os.path.join('.', 'daligner-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_daligner.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write(script)
def tan_split(tanmask_opt, db_fn, uows_fn, bash_template_fn):
    """Run HPC.TANmask (via a generated shell script) and split its
    datander job-lines into one run-script per job.

    Args:
        tanmask_opt: options string passed to HPC.TANmask.
        db_fn: path to the Dazzler DB file.
        uows_fn: not referenced in this body -- presumably kept for
            interface compatibility; TODO confirm against callers.
        bash_template_fn: path to write the bash-template command-line.

    Side effects:
        Writes the bash-template file; symlinks the DB locally; writes
        and runs 'split_db.sh'; parses 'tan-jobs.01.OVL'; writes
        './tan-scripts/tan_NNN/run_datander.sh' per job.
    """
    with open(bash_template_fn, 'w') as stream:
        stream.write(
            "python -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} tan-split --split={output.split} --bash-template={output.bash_template}"
        )
    # TANmask would put track-files in the DB-directory, not '.',
    # so we need to symlink everything first.
    db = symlink_db(db_fn)
    script = ''.join([
        script_HPC_TANmask(tanmask_opt, db, prefix='tan-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        # Return value (the sub-script exe) was unused; dropped.
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))
    # We now have files like tan-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('tan-jobs.01.OVL') as ifs:  # context manager: no fd leak
        lines = ifs.readlines()
    re_block = re.compile(r'{}(\.\d+|)'.format(db))

    def get_blocks(line):
        """Return ['.1', '.2', ...]
        """
        return [mo.group(1) for mo in re_block.finditer(line)]

    scripts = list()
    for line in lines:
        # Skip comment and blank lines from the .OVL file.
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        blocks = get_blocks(line)
        assert blocks, 'No blocks found in {!r} from {!r}'.format(
            line, 'tan-jobs.01.OVL')
        las_files = ' '.join('TAN.{db}{block}.las'.format(db=db, block=block)
                             for block in blocks)
        script_lines = [
            line,
            'LAcheck {} {}\n'.format(db, las_files),
            'TANmask {} {}\n'.format(db, las_files),
            'rm -f {}\n'.format(las_files),
        ]
        if [''] == blocks:
            # Special case -- If we have only 1 block, then HPC.TANmask fails
            # to use the block-number. However, if there are multiple blocks,
            # it is still possible for a single line to have only 1 block. So
            # we look for a solitary block that is '', and we rename the track
            # files to pretend they were named properly in the first place.
            script_lines.append(
                'mv .{db}.tan.data .{db}.1.tan.data\n'.format(db=db))
            script_lines.append(
                'mv .{db}.tan.anno .{db}.1.tan.anno\n'.format(db=db))
        scripts.append(''.join(script_lines))
    db_dir = os.path.dirname(db_fn)
    for i, script in enumerate(scripts):
        # Each job symlinks the DB files (and dust track) into its own cwd,
        # then runs its datander/TANmask commands.
        bash_script = """
db_dir={db_dir}
ln -sf ${{db_dir}}/.{db_prefix}.bps .
ln -sf ${{db_dir}}/.{db_prefix}.idx .
ln -sf ${{db_dir}}/{db_prefix}.db .
ln -sf ${{db_dir}}/.{db_prefix}.dust.anno .
ln -sf ${{db_dir}}/.{db_prefix}.dust.data .
{script}
""".format(db_dir=db_dir, db_prefix="raw_reads", script=script)
        job_id = 'tan_{:03d}'.format(i)
        script_dir = os.path.join('.', 'tan-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_datander.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write('{}\n'.format(bash_script))