Example #1
def run_hgap_prepare(input_files, output_files, options):
    """Generate a config-file from options.
    """
    say('options to run_hgap_prepare:\n{}'.format(pprint.pformat(options)))
    i_subreadset_fn, = input_files
    o_hgap_cfg_fn, o_logging_cfg_fn, o_log_fn = output_files
    run_dir = os.path.dirname(o_hgap_cfg_fn)
    symlink(os.path.join(run_dir, 'stderr'), o_log_fn)

    # This will be the cfg we pass to hgap_run.
    all_cfg = collections.defaultdict(lambda: collections.defaultdict(str))

    # Get grid options, for job-distribution.
    update_for_grid(all_cfg, run_dir)

    # Set some other falcon options, based on hgap options.
    update_falcon(all_cfg)

    # Override from pbsmrtpipe config/preset.xml.
    all_cfg[OPTION_SECTION_FALCON]['genome_size'] = options[
        TASK_HGAP_GENOME_LENGTH].strip()
    all_cfg[OPTION_SECTION_FALCON]['length_cutoff'] = options[
        TASK_HGAP_SEED_LENGTH_CUTOFF].strip()
    all_cfg[OPTION_SECTION_FALCON]['seed_coverage'] = options[
        TASK_HGAP_SEED_COVERAGE].strip()
    cfg_json = options[TASK_HGAP_OPTIONS].strip()
    if not cfg_json:
        cfg_json = '{}'
    override_cfg = json.loads(stricter_json(cfg_json))
    update2(all_cfg, override_cfg)

    # Get options from pbsmrtpipe.
    pbsmrtpipe_opts = get_pbsmrtpipe_opts(run_dir)
    if OPTION_SECTION_PBSMRTPIPE not in all_cfg:
        all_cfg[OPTION_SECTION_PBSMRTPIPE] = dict()
    pbsmrtpipe_opts.update(all_cfg[OPTION_SECTION_PBSMRTPIPE])
    all_cfg[OPTION_SECTION_PBSMRTPIPE] = pbsmrtpipe_opts

    # Dump all_cfg.
    say('Dumping to {}'.format(repr(o_hgap_cfg_fn)))
    dump_as_json(all_cfg, open(o_hgap_cfg_fn, 'w'))

    # Get logging cfg.
    logging_cfg = DEFAULT_LOGGING_CFG

    # Dump logging cfg.
    say('Dumping to {}'.format(repr(o_logging_cfg_fn)))
    dump_as_json(logging_cfg, open(o_logging_cfg_fn, 'w'))
Example #2
def run_hgap_prepare(input_files, output_files, options):
    """Generate a config-file from options.
    """
    say('options to run_hgap_prepare:\n{}'.format(pprint.pformat(options)))
    i_subreadset_fn, = input_files
    o_hgap_cfg_fn, o_logging_cfg_fn, o_log_fn = output_files
    run_dir = os.path.dirname(o_hgap_cfg_fn)
    symlink(os.path.join(run_dir, 'stderr'), o_log_fn)

    # This will be the cfg we pass to hgap_run.
    all_cfg = collections.defaultdict(lambda: collections.defaultdict(str))

    # Get grid options, for job-distribution.
    update_for_grid(all_cfg, run_dir)

    # Set some other falcon options, based on hgap options.
    update_falcon(all_cfg)

    # Override from pbsmrtpipe config/preset.xml.
    all_cfg[OPTION_SECTION_FALCON]['genome_size'] = options[TASK_HGAP_GENOME_LENGTH].strip()
    all_cfg[OPTION_SECTION_FALCON]['length_cutoff'] = options[TASK_HGAP_SEED_LENGTH_CUTOFF].strip()
    all_cfg[OPTION_SECTION_FALCON]['seed_coverage'] = options[TASK_HGAP_SEED_COVERAGE].strip()
    cfg_json = options[TASK_HGAP_OPTIONS].strip()
    if not cfg_json:
        cfg_json = '{}'
    override_cfg = json.loads(stricter_json(cfg_json))
    update2(all_cfg, override_cfg)

    update_pwatcher(all_cfg)

    # Get options from pbsmrtpipe.
    pbsmrtpipe_opts = get_pbsmrtpipe_opts(run_dir)
    if OPTION_SECTION_PBSMRTPIPE not in all_cfg:
        all_cfg[OPTION_SECTION_PBSMRTPIPE] = dict()
    pbsmrtpipe_opts.update(all_cfg[OPTION_SECTION_PBSMRTPIPE])
    all_cfg[OPTION_SECTION_PBSMRTPIPE] = pbsmrtpipe_opts

    # Dump all_cfg.
    say('Dumping to {}'.format(repr(o_hgap_cfg_fn)))
    dump_as_json(all_cfg, open(o_hgap_cfg_fn, 'w'))

    # Get logging cfg.
    logging_cfg = DEFAULT_LOGGING_CFG

    # Dump logging cfg.
    say('Dumping to {}'.format(repr(o_logging_cfg_fn)))
    dump_as_json(logging_cfg, open(o_logging_cfg_fn, 'w'))
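The override step in Examples #1 and #2 merges a nested options dict (parsed from the preset JSON) into all_cfg. The real update2 helper lives in pbfalcon and is not shown here; the following is only a minimal sketch of that kind of section-by-section recursive merge, under the assumption that both arguments are dicts of dicts.

def merge_nested(dst, src):
    """Recursively copy src into dst, section by section.
    Sketch of an update2-style helper; not the pbfalcon implementation."""
    for key, val in src.items():
        if isinstance(val, dict) and isinstance(dst.get(key), dict):
            merge_nested(dst[key], val)
        else:
            dst[key] = val
    return dst

# Example: an override from config/preset.xml wins over the default genome_size.
cfg = {'falcon': {'genome_size': '5000000', 'seed_coverage': '30'}}
merge_nested(cfg, {'falcon': {'genome_size': '4600000'}})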
Example #3
def run_falcon_build_rdb(input_files, output_files):
    print("output_files: %s" % (repr(output_files)))
    cwd = os.getcwd()
    odir = os.path.realpath(os.path.abspath(os.path.dirname(output_files[0])))
    if True:  # debug
        if cwd != odir:
            raise Exception("%r != %r" % (cwd, odir))
    i_json_config_fn, i_fofn_fn = input_files
    print("output_files: %s" % repr(output_files))
    run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn = output_files
    config = _get_config_from_json_fileobj(open(i_json_config_fn))
    script_fn = os.path.join(odir, "prepare_rdb.sh")  # implies run-dir too
    # job_done_fn = os.path.join(odir, 'job.done') # not needed in pbsmrtpipe today tho
    support.build_rdb(i_fofn_fn, config, job_done_fn, script_fn, run_daligner_jobs_fn)
    run_cmd("bash %s" % script_fn, sys.stdout, sys.stderr, shell=False)
    job_descs = falcon_kit.functional.get_daligner_job_descriptions(open(run_daligner_jobs_fn), "raw_reads")
    # We do not bother to calculate 'single' b/c this is only a sanity-check.
    if not job_descs:
        raise Exception("No daligner jobs generated in '%s' by '%s'." % (run_daligner_jobs_fn, script_fn))
    symlink("raw_reads.db", raw_reads_db_fn)
    return 0
Example #4
def symlink_dazzdb(actualdir, db_prefix):
    """Symlink elements of dazzler db.
    For now, 3 files.
    """
    symlink(os.path.join(actualdir, '.%s.bps'%db_prefix))
    symlink(os.path.join(actualdir, '.%s.idx'%db_prefix))
    symlink(os.path.join(actualdir, '%s.db'%db_prefix))
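symlink_dazzdb passes a single argument, which suggests the helper defaults the link name to the basename of the target. A minimal sketch of a helper with that behavior (an assumption; the pbfalcon version may differ, e.g. by creating relative links or logging):

import os

def symlink(actual, symbolic=None):
    """Create a symlink in the current directory, defaulting the link
    name to the basename of the target; replace any stale link first."""
    if symbolic is None:
        symbolic = os.path.basename(actual)
    if os.path.lexists(symbolic):
        os.unlink(symbolic)
    os.symlink(actual, symbolic)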
Example #5
def run_hgap(input_files, output_files, tmpdir):
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = output_files
    # Update the logging-cfg with our log-file.
    logging_cfg = json.loads(open(i_logging_fn).read())
    logging_cfg['handlers']['handler_file_all']['filename'] = o_log_fn
    logging_fn = 'logging.json'
    with open(logging_fn, 'w') as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = 'TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}'.format(
        **locals())
    system(cmd)
    # Write Reports
    with open('run-falcon/0-rawreads/pre_assembly_stats.json'
              ) as stats_ifs:  # by convention
        with open(o_preass_json_fn, 'w') as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink('run-gc-gather/contigset.xml', o_contigset_fn)
    symlink('run-polished-assembly-report/polished_assembly_report.json',
            o_polass_json_fn)
    return 0
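The logging-override pattern in Example #5 only rewrites one handler's filename in a dictConfig-style JSON document before handing it to the sub-process. A small self-contained sketch of that round trip using the standard library; the config layout below is illustrative, not the actual DEFAULT_LOGGING_CFG.

import json
import logging
import logging.config

logging_cfg = {
    'version': 1,
    'formatters': {'plain': {'format': '%(asctime)s %(levelname)s %(message)s'}},
    'handlers': {'handler_file_all': {'class': 'logging.FileHandler',
                                      'formatter': 'plain',
                                      'filename': 'placeholder.log'}},
    'root': {'handlers': ['handler_file_all'], 'level': 'INFO'},
}
logging_cfg['handlers']['handler_file_all']['filename'] = 'task.log'  # as in run_hgap
with open('logging.json', 'w') as ofs:
    ofs.write(json.dumps(logging_cfg))
# The consumer would load and apply it:
logging.config.dictConfig(json.loads(open('logging.json').read()))
logging.getLogger(__name__).info('now logging to task.log')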
Example #6
def run_falcon_build_rdb(input_files, output_files):
    print('output_files: %s' % (repr(output_files)))
    cwd = os.getcwd()
    odir = os.path.realpath(os.path.abspath(os.path.dirname(output_files[0])))
    if True:  #debug
        if cwd != odir:
            raise Exception('%r != %r' % (cwd, odir))
    i_json_config_fn, i_fofn_fn = input_files
    print('output_files: %s' % repr(output_files))
    run_daligner_jobs_fn, raw_reads_db_fn, job_done_fn = output_files
    config = _get_config_from_json_fileobj(open(i_json_config_fn))
    script_fn = os.path.join(odir, 'prepare_rdb.sh')  # implies run-dir too
    #job_done_fn = os.path.join(odir, 'job.done') # not needed in pbsmrtpipe today tho
    support.build_rdb(i_fofn_fn, config, job_done_fn, script_fn,
                      run_daligner_jobs_fn)
    run_cmd('bash %s' % script_fn, sys.stdout, sys.stderr, shell=False)
    job_descs = falcon_kit.functional.get_daligner_job_descriptions(
        open(run_daligner_jobs_fn), 'raw_reads')
    # We do not bother to calculate 'single' b/c this is only a sanity-check.
    if not job_descs:
        raise Exception("No daligner jobs generated in '%s' by '%s'." %
                        (run_daligner_jobs_fn, script_fn))
    symlink('raw_reads.db', raw_reads_db_fn)
    return 0
Example #7
def create_merge_tasks(i_fofn_fn, run_jobs_fn, wd, db_prefix, config):
    #merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    tasks = {}  # pid -> (merge_params, cons_params)
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                #mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    # Could be L1.* or preads.*
    re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))
        merge_subdir = "m_%05d" % p_id
        merge_dir = os.path.join(wd, merge_subdir)
        support.make_dirs(merge_dir)
        #merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        merge_script = StringIO.StringIO()
        with cd(merge_dir):
            print("i_fofn_fn=%r" % i_fofn_fn)
            # Since we could be in the gather-task-dir, instead of globbing,
            # we will read the fofn.
            for fn in open(i_fofn_fn).read().splitlines():
                basename = os.path.basename(fn)
                mo = re_las.search(basename)
                if not mo:
                    continue
                left_block = int(mo.group(1))
                if left_block != p_id:
                    # By convention, m_00005 merges L1.5.*.las, etc.
                    continue
                symlink(fn)

        for l in s_data:
            print >> merge_script, l
        las_bfn = '%s.%d.las' % (db_prefix, p_id)
        #print >> merge_script, 'echo %s >| %s' %(las_bfn, merged_las_fofn_bfn)

        #job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
        parameters = {
            "script": merge_script.getvalue(),
            "merge_subdir": merge_subdir,
            "config": config
        }
        merge_task = parameters

        fasta_bfn = "out.%05d.fasta" % p_id
        out_file_fn = os.path.abspath("%s/preads/%s" % (wd, fasta_bfn))
        #out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id)  ))
        parameters = {
            "db_fn": '{}/{}'.format(os.getcwd(), db_prefix),
            "las_fn":
            '{}/{}/{}'.format(os.getcwd(), merge_subdir,
                              las_bfn),  # assuming merge ran in merge_dir
            "out_file_fn": out_file_fn,
            #"out_done": out_done,
            "config": config
        }
        cons_task = parameters
        tasks[p_id] = (merge_task, cons_task, las_bfn, fasta_bfn)

    return tasks
Example #8
def run_hgap(input_files, output_files, tmpdir):
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_preads_fasta_fn, \
    o_polished_fasta_fn, o_polished_fastq_fn, o_polished_csv_fn, \
    o_aligned_subreads_fn, o_alignment_summary_gff_fn, o_unmapped_subreads_txt_fn, \
    o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = output_files
    # Update the logging-cfg with our log-file.
    logging_cfg = json.loads(open(i_logging_fn).read())
    logging_cfg['handlers']['handler_file_all']['filename'] = o_log_fn
    logging_fn = 'logging.json'
    with open(logging_fn, 'w') as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = 'TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}'.format(
        **locals())
    system(cmd)
    # Write Reports
    with open('run-falcon/0-rawreads/report/pre_assembly_stats.json'
              ) as stats_ifs:  # by convention
        with open(o_preass_json_fn, 'w') as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink('run-falcon/1-preads_ovl/db2falcon/preads4falcon.fasta',
            o_preads_fasta_fn)
    symlink('run-gc-gather/contigset.fasta', o_polished_fasta_fn)
    symlink('run-gc-gather/gathered.fastq', o_polished_fastq_fn)
    symlink('run-polished-assembly-report/polished_coverage_vs_quality.csv',
            o_polished_csv_fn)
    symlink('run-polished-assembly-report/alignment.summary.gff',
            o_alignment_summary_gff_fn)
    symlink('run-pbalign_gather/aligned.subreads.alignmentset.xml',
            o_aligned_subreads_fn)
    symlink('run-pbalign_gather/unmapped.txt', o_unmapped_subreads_txt_fn)
    symlink('run-gc-gather/contigset.xml', o_contigset_fn)
    symlink('run-polished-assembly-report/polished_assembly_report.json',
            o_polass_json_fn)
    return 0
Example #9
def run_hgap(input_files, output_files, tmpdir):
    i_cfg_fn, i_logging_fn, i_subreadset_fn = input_files
    o_preads_fasta_fn, o_polished_fasta_fn, o_polished_fastq_fn, o_polished_csv_fn, o_aligned_subreads_fn, o_alignment_summary_gff_fn, o_unmapped_subreads_txt_fn, o_contigset_fn, o_preass_json_fn, o_polass_json_fn, o_log_fn, = (
        output_files
    )
    # Update the logging-cfg with our log-file.
    logging_cfg = json.loads(open(i_logging_fn).read())
    logging_cfg["handlers"]["handler_file_all"]["filename"] = o_log_fn
    logging_fn = "logging.json"
    with open(logging_fn, "w") as ofs:
        ofs.write(json.dumps(logging_cfg))
    # Update the cfg with our subreadset. (Inside hgap_run?)
    # Run pypeflow.hgap.main.
    cmd = "TMPDIR={tmpdir} python -m pbfalcon.cli.hgap_run --logging {logging_fn} {i_cfg_fn}".format(**locals())
    system(cmd)
    # Write Reports
    with open("run-falcon/0-rawreads/report/pre_assembly_stats.json") as stats_ifs:  # by convention
        with open(o_preass_json_fn, "w") as report_ofs:
            report_preassembly.write_report_from_stats(stats_ifs, report_ofs)
    # Symlink expected outputs, by convention.
    symlink("run-falcon/1-preads_ovl/db2falcon/preads4falcon.fasta", o_preads_fasta_fn)
    symlink("run-gc-gather/contigset.fasta", o_polished_fasta_fn)
    symlink("run-gc-gather/gathered.fastq", o_polished_fastq_fn)
    symlink("run-polished-assembly-report/polished_coverage_vs_quality.csv", o_polished_csv_fn)
    symlink("run-polished-assembly-report/alignment.summary.gff", o_alignment_summary_gff_fn)
    symlink("run-pbalign_gather/aligned.subreads.alignmentset.xml", o_aligned_subreads_fn)
    symlink("run-pbalign_gather/unmapped.txt", o_unmapped_subreads_txt_fn)
    symlink("run-gc-gather/contigset.xml", o_contigset_fn)
    symlink("run-polished-assembly-report/polished_assembly_report.json", o_polass_json_fn)
    return 0
Example #10
def create_merge_tasks(i_fofn_fn, run_jobs_fn, wd, db_prefix, config):
    # merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    tasks = {}  # pid -> (merge_params, cons_params)
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                # mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    # Could be L1.* or preads.*
    re_las = re.compile(r"\.(\d*)(\.\d*)?\.las$")

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))
        merge_subdir = "m_%05d" % p_id
        merge_dir = os.path.join(wd, merge_subdir)
        support.make_dirs(merge_dir)
        # merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        merge_script = StringIO.StringIO()
        with cd(merge_dir):
            print("i_fofn_fn=%r" % i_fofn_fn)
            # Since we could be in the gather-task-dir, instead of globbing,
            # we will read the fofn.
            for fn in open(i_fofn_fn).read().splitlines():
                basename = os.path.basename(fn)
                mo = re_las.search(basename)
                if not mo:
                    continue
                left_block = int(mo.group(1))
                if left_block != p_id:
                    # By convention, m_00005 merges L1.5.*.las, etc.
                    continue
                symlink(fn)

        for l in s_data:
            print >> merge_script, l
        las_bfn = "%s.%d.las" % (db_prefix, p_id)
        # print >> merge_script, 'echo %s >| %s' %(las_bfn, merged_las_fofn_bfn)

        # job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
        parameters = {"script": merge_script.getvalue(), "merge_subdir": merge_subdir, "config": config}
        merge_task = parameters

        fasta_bfn = "out.%05d.fasta" % p_id
        out_file_fn = os.path.abspath("%s/preads/%s" % (wd, fasta_bfn))
        # out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id)  ))
        parameters = {
            "db_fn": "{}/{}".format(os.getcwd(), db_prefix),
            "las_fn": "{}/{}/{}".format(os.getcwd(), merge_subdir, las_bfn),  # assuming merge ran in merge_dir
            "out_file_fn": out_file_fn,
            # "out_done": out_done,
            "config": config,
        }
        cons_task = parameters
        tasks[p_id] = (merge_task, cons_task, las_bfn, fasta_bfn)
        # tasks <- dict{p_id: (merge_task, cons_task, las_bfn, fasta_bfn)}, where
        # p_id is an integer, e.g., 1
        # merge_task <- dict{'script': 'LAmerge -v raw_reads.1 L1.1.1 L1.1.2 L1.1.3', 'merge_subdir': 'm_00001', 'config': config}
        # cons_task  <- dict{'db_fn': x, 'las_fn': x, 'out_file_fn': x, 'config': config}
        # las_bfn, e.g., raw_reads.1.las
        # fasta_bfn, e.g., out.00001.fasta

    return tasks
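The re_las pattern in Examples #7 and #10 decides which .las files get symlinked into each m_%05d directory: it extracts the left block number from either the L1.<block>.<n>.las or <db_prefix>.<block>.las naming convention. A small self-contained check of that behavior; the filenames below are only illustrative.

import re

re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')  # same pattern as above

for basename in ('L1.5.3.las', 'raw_reads.5.las', 'preads.12.las', 'notes.txt'):
    mo = re_las.search(basename)
    print('%s -> %s' % (basename, mo.group(1) if mo else None))
# L1.5.3.las -> 5, raw_reads.5.las -> 5, preads.12.las -> 12, notes.txt -> None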