def rep_combine(db_fn, gathered_fn, group_size):
    """Symlink per-job rep track-files into '.' and combine them with Catrack.

    Args:
        db_fn: Path of the DB; passed to symlink_db(), whose return value is
            used as the DB name in the track-file globs and the Catrack call.
        gathered_fn: Text file whose first line is a bracketed,
            comma-separated list of job file paths, e.g. ``[a/done, b/done]``.
            Only the directory part of each entry is used.
        group_size: Suffix of the track name (``rep{group_size}``).

    Raises:
        IndexError: If ``gathered_fn`` is empty.
        AssertionError: If a job directory holds a different number of
            ``.anno`` and ``.data`` track-files.
    """
    db = symlink_db(db_fn)

    with open(gathered_fn, "r") as f:
        first_line = f.readlines()[0]
    # Strip surrounding whitespace *before* dropping the enclosing brackets;
    # otherwise a trailing newline is removed instead of the closing bracket
    # and the last entry keeps a stray ']'.
    listing = first_line.strip()[1:-1].replace(" ", "")
    # An empty listing ('[]') must yield no jobs rather than a single '' job,
    # which would make the globs below search from the filesystem root.
    gathered = [entry for entry in listing.split(",") if entry]
    print(len(gathered))

    # Create symlinks for all track-files.
    for job in gathered:
        job_path = os.path.dirname(job)
        if not os.path.isabs(job_path):
            LOG.info('Found relative done-file: {!r}'.format(job_path))
        annos = glob.glob('{}/.{}.*.rep{}.anno'.format(job_path, db,
                                                       group_size))
        datas = glob.glob('{}/.{}.*.rep{}.data'.format(job_path, db,
                                                       group_size))
        assert len(annos) == len(
            datas), 'Mismatched globs:\n{!r}\n{!r}'.format(annos, datas)
        for fn in annos + datas:
            symlink(fn, force=False)

    # Merge the per-block tracks that are now visible in '.'.
    cmd = 'Catrack -vdf {} rep{}'.format(db, group_size)
    io_io.syscall(cmd)
예제 #2
0
def rep_apply(db_fn, script_fn):
    """Symlink the DB and the given script into '.', then run the script.

    Args:
        db_fn: Path of the DB, handed to symlink_db().
        script_fn: Path of the bash script to execute; it is symlinked and
            then invoked by its basename with ``bash -vex``.
    """
    # Track-files would land in the DB-directory rather than '.', so the DB
    # has to be symlinked here before anything runs.
    symlink_db(db_fn)

    symlink(script_fn)
    script_name = os.path.basename(script_fn)
    command = 'bash -vex {}'.format(script_name)
    io_io.syscall(command)
예제 #3
0
def merge_apply(las_paths_fn, las_fn):
    """Combine many .las files into a single one via repeated LAmerge calls.

    The output name is read from the first line of ``las_fn``; the inputs are
    whatever ``io_io.deserialize(las_paths_fn)`` yields. Inputs are merged in
    batches, level by level, until one file remains, which is then moved to
    the output name.

    Args:
        las_paths_fn: File describing the .las paths to merge.
        las_fn: Text file whose first line holds the output .las name.
    """
    with open(las_fn, "r") as stream:
        target_las = stream.readlines()[0].strip()
    io_io.rm_force(target_las)
    print(target_las)
    las_inputs = io_io.deserialize(las_paths_fn)

    # Work on local symlinks so the command lines stay short.
    link_names = []
    for las_path in las_inputs:
        symlink(las_path)
        link_names.append(os.path.basename(las_path))
    pending = sorted(link_names)

    # LAmerge accepts at most 252 inputs, so merge in batches of 250.
    batch_size = 250
    depth = 1
    while len(pending) > 1:
        depth += 1
        merged = []
        for idx, batch in enumerate(ichunked(pending, batch_size), start=1):
            out_las = 'L{}.{}.las'.format(depth, idx)
            io_io.syscall('LAmerge -v {} {}'.format(out_las, ' '.join(batch)))
            merged.append(out_las)
        pending = merged

    # Move via an intermediate name, then to the final output name.
    survivor = pending[0]
    io_io.syscall('mv -f {} {}'.format(survivor, 'keep-this'))
    io_io.syscall('mv -f  {} {}'.format('keep-this', target_las))
예제 #4
0
def _get_rep_daligner_split_scripts(REPmask_opt, db_fn, group_size, coverage_limit):
    """Run the HPC.REPmask script and return one bash script per job line.

    Writes and executes ``split_db.sh`` in the current directory, then parses
    the resulting ``rep-jobs.01.OVL`` (other generated files are ignored).

    Args:
        REPmask_opt: Options forwarded to script_HPC_REPmask().
        db_fn: Path of the DB file; its extension is dropped to form the DB
            name used in the generated commands.
        group_size: Forwarded to script_HPC_REPmask().
        coverage_limit: Forwarded to script_HPC_REPmask().

    Returns:
        list of str: One script per non-comment, non-blank line of
        ``rep-jobs.01.OVL``, each prefixed with ``set -uex`` and followed by
        an ``LAcheck`` call.
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)
    tracks = get_tracks(db_fn)

    # First, run HPC.REPmask immediately.
    script = ''.join([
        script_HPC_REPmask(REPmask_opt, db, tracks,
            prefix='rep-jobs', group_size=group_size, coverage_limit=coverage_limit),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))

    # We now have files like rep-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    # Read it inside a context manager so the handle is closed promptly.
    with open('rep-jobs.01.OVL') as ifs:
        lines = ifs.readlines()

    # Keep only real job lines: drop '#' comments and blank lines.
    scripts = [
        line for line in lines
        if not line.startswith('#') and line.strip()
    ]

    # A lone job line gets the "moved" transformation; multiple lines get the
    # "unmoved" one.
    if len(scripts) == 1:
        scripts = [fake_rep_as_daligner_script_moved(s, dbname) for s in scripts]
    else:
        scripts = [fake_rep_as_daligner_script_unmoved(s, dbname) for s in scripts]

    # Every job checks its .las output and fails fast on errors.
    LAcheck = 'LAcheck -vS {} *.las'.format(db)
    return [
        "set -uex\n" + job_script + '\n' + LAcheck + '\n'
        for job_script in scripts
    ]
예제 #5
0
def daligner_split(daligner_opt, db_fn, length_cutoff_fn):
    """Run the HPC.daligner script and write one run script per job line.

    Writes and executes ``split_db.sh`` in the current directory, parses the
    resulting ``daligner-jobs.01.OVL`` (other generated files are ignored),
    and writes each job to ``./daligner-scripts/j_NNNN/run_daligner.sh``.

    Args:
        daligner_opt: Options forwarded to script_HPC_daligner().
        db_fn: Path of the DB file; its extension is dropped to form the DB
            name used in the generated commands.
        length_cutoff_fn: Forwarded to script_HPC_daligner().

    Returns:
        None. The results are the script files written to disk.
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)

    tracks = get_tracks(db_fn)

    script = ''.join([
        script_HPC_daligner(daligner_opt,
                            db,
                            length_cutoff_fn,
                            tracks,
                            prefix='daligner-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))

    # We now have files like daligner-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    # Read it inside a context manager so the handle is closed promptly.
    with open('daligner-jobs.01.OVL') as ifs:
        lines = ifs.readlines()

    preads_aln = (dbname == 'preads')
    xformer = functional.get_script_xformer(preads_aln)
    LOG.debug('preads_aln={!r} (True => use daligner_p)'.format(preads_aln))

    scripts = list()
    for line in lines:
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        line = xformer(line)  # Use daligner_p for preads.
        scripts.append(line)

    # Special case:
    #     # Daligner jobs (1)
    #     daligner raw_reads raw_reads && mv raw_reads.raw_reads.las raw_reads.las
    # In that case, the "block" name is empty. (See functional.py)
    # We will rename the file. (LAmerge on a single input is a no-op, which is fine.)
    if len(scripts) == 1:
        script = scripts[0]
        re_script = re.compile(
            r'(mv\b.*\S+\s+)(\S+)$')  # no trailing newline, for now
        mo = re_script.search(script)
        if not mo:
            msg = 'Only 1 line in daligner-jobs.01.OVL, but\n {!r} did not match\n {!r}.'.format(
                re_script.pattern, script)
            LOG.warning(msg)
        else:
            new_target = '{dbname}.1.{dbname}.1.las'.format(dbname=dbname)
            # Use a callable replacement: a template such as r'\1' + dbname is
            # misread when dbname starts with a digit ('\11...' is group 11)
            # or contains a backslash.
            new_script = re_script.sub(
                lambda found: found.group(1) + new_target, script, 1)
            msg = 'Only 1 line in daligner-jobs.01.OVL:\n {!r} matches\n {!r}. Replacing with\n {!r}.'.format(
                re_script.pattern, script, new_script)
            LOG.warning(msg)
            scripts = [new_script]

    # Every job checks its .las output and runs verbosely, failing fast.
    LAcheck = 'LAcheck -vS {} *.las'.format(db)
    for i, script in enumerate(scripts):
        script += '\n' + LAcheck + '\n'
        scripts[i] = "set -vex\n" + script

    # One directory per job, each holding its own run script.
    for i, script in enumerate(scripts):
        job_id = 'j_{:04d}'.format(i)
        script_dir = os.path.join('.', 'daligner-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_daligner.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write(script)
예제 #6
0
def tan_split(tanmask_opt, db_fn, uows_fn, bash_template_fn):
    """Run the HPC.TANmask script and write one run script per job line.

    Writes a fixed command line to ``bash_template_fn``, symlinks the DB into
    '.', writes and executes ``split_db.sh``, parses the resulting
    ``tan-jobs.01.OVL`` (other generated files are ignored), and writes each
    job to ``./tan-scripts/tan_NNN/run_datander.sh``.

    Args:
        tanmask_opt: Options forwarded to script_HPC_TANmask().
        db_fn: Path of the DB; passed to symlink_db(), and its directory is
            embedded in every generated script.
        uows_fn: Not used by this function.
        bash_template_fn: Output path for the bash template text.

    Returns:
        None. The results are the files written to disk.

    Raises:
        AssertionError: If a job line contains no reference to the DB name.
    """
    # The '{input...}'/'{output...}' placeholders are written literally; they
    # are not expanded here.
    with open(bash_template_fn, 'w') as stream:
        stream.write(
            "python -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  tan-split --split={output.split} --bash-template={output.bash_template}"
        )
    # TANmask would put track-files in the DB-directory, not '.',
    # so we need to symlink everything first.
    db = symlink_db(db_fn)

    script = ''.join([
        script_HPC_TANmask(tanmask_opt, db, prefix='tan-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))

    # We now have files like tan-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    # Read it inside a context manager so the handle is closed promptly.
    with open('tan-jobs.01.OVL') as ifs:
        lines = ifs.readlines()

    # Escape the DB name so regex metacharacters in it are matched literally.
    re_block = re.compile(r'{}(\.\d+|)'.format(re.escape(db)))

    def get_blocks(line):
        """Return ['.1', '.2', ...]
        """
        return [mo.group(1) for mo in re_block.finditer(line)]

    scripts = list()
    for line in lines:
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        blocks = get_blocks(line)
        assert blocks, 'No blocks found in {!r} from {!r}'.format(
            line, 'tan-jobs.01.OVL')
        las_files = ' '.join('TAN.{db}{block}.las'.format(db=db, block=block)
                             for block in blocks)
        script_lines = [
            line,
            'LAcheck {} {}\n'.format(db, las_files),
            'TANmask {} {}\n'.format(db, las_files),
            'rm -f {}\n'.format(las_files),
        ]
        if [''] == blocks:
            # special case -- If we have only 1 block, then HPC.TANmask fails to use the block-number.
            # However, if there are multiple blocks, it is still possible for a single line to have
            # only 1 block. So we look for a solitary block that is '', and we rename the tan
            # track-files to pretend that they were named with block-number 1 in the first place.
            script_lines.append(
                'mv .{db}.tan.data .{db}.1.tan.data\n'.format(db=db))
            script_lines.append(
                'mv .{db}.tan.anno .{db}.1.tan.anno\n'.format(db=db))
        scripts.append(''.join(script_lines))
    db_dir = os.path.dirname(db_fn)

    for i, script in enumerate(scripts):
        # Each job script first links the DB files from db_dir into its own
        # working directory, then runs the job commands.
        # NOTE(review): db_prefix is hard-coded to "raw_reads" while the job
        # commands use `db` -- confirm this is intended for other DB names.
        bash_script = """
db_dir={db_dir}
ln -sf ${{db_dir}}/.{db_prefix}.bps .
ln -sf ${{db_dir}}/.{db_prefix}.idx .
ln -sf ${{db_dir}}/{db_prefix}.db .
ln -sf ${{db_dir}}/.{db_prefix}.dust.anno .
ln -sf ${{db_dir}}/.{db_prefix}.dust.data .
{script}
""".format(db_dir=db_dir, db_prefix="raw_reads", script=script)
        job_id = 'tan_{:03d}'.format(i)
        script_dir = os.path.join('.', 'tan-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_datander.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write('{}\n'.format(bash_script))